Revision 66d1f035

b/Makefile.am
471 471
	test/ganeti.backend_unittest.py \
472 472
	test/ganeti.bdev_unittest.py \
473 473
	test/ganeti.cli_unittest.py \
474
	test/ganeti.client.gnt_cluster_unittest.py \
474 475
	test/ganeti.client.gnt_instance_unittest.py \
475 476
	test/ganeti.daemon_unittest.py \
476 477
	test/ganeti.cmdlib_unittest.py \
b/lib/client/gnt_cluster.py
29 29
import os.path
30 30
import time
31 31
import OpenSSL
32
import itertools
32 33

  
33 34
from ganeti.cli import *
34 35
from ganeti import opcodes
......
40 41
from ganeti import objects
41 42
from ganeti import uidpool
42 43
from ganeti import compat
44
from ganeti import netutils
45

  
46

  
47
ON_OPT = cli_option("--on", default=False,
48
                    action="store_true", dest="on",
49
                    help="Recover from an EPO")
50

  
51
GROUPS_OPT = cli_option("--groups", default=False,
52
                    action="store_true", dest="groups",
53
                    help="Arguments are node groups instead of nodes")
54

  
55
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
56
_EPO_PING_TIMEOUT = 1 # 1 second
57
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
43 58

  
44 59

  
45 60
@UsesRPC
......
882 897
  return 0
883 898

  
884 899

  
900
def _OobPower(opts, node_list, power):
  """Requests a power state change for the given nodes.

  A single out-of-band opcode is submitted for all nodes; every node whose
  result status is not normal is reported on stderr.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on
  @param power: True if they should be powered on, False otherwise
  @return: True if no node reported a problem, False otherwise

  """
  command = constants.OOB_POWER_OFF
  if power:
    command = constants.OOB_POWER_ON

  oob_op = opcodes.OpOobCommand(node_names=node_list,
                                command=command,
                                ignore_status=True,
                                timeout=opts.oob_timeout)

  all_ok = True
  # Each entry is a pair of (status, value) tuples: one for the node name and
  # one for the command data
  for ((_, node_name), (data_status, _)) in SubmitOpCode(oob_op, opts=opts):
    if data_status == constants.RS_NORMAL:
      continue

    assert data_status != constants.RS_UNAVAIL
    all_ok = False
    ToStderr("There was a problem changing power for %s, please investigate",
             node_name)

  return all_ok
934

  
935

  
936
def _InstanceStart(opts, inst_list, start):
  """Starts or stops all instances in the list, one job per instance.

  @param opts: The command line options selected by the user
  @param inst_list: The list of instances to operate on
  @param start: True if they should be started, False for shutdown
  @return: True if every job succeeded, False otherwise

  """
  if start:
    (opcls, text_submit, text_success, text_failed) = \
      (opcodes.OpInstanceStartup, "startup", "started", "starting")
  else:
    (opcls, text_submit, text_success, text_failed) = \
      (opcodes.OpInstanceShutdown, "shutdown", "stopped", "stopping")

  executor = JobExecutor(opts=opts)

  for name in inst_list:
    ToStdout("Submit %s of instance %s", text_submit, name)
    executor.QueueJob(name, opcls(instance_name=name))

  outcomes = executor.GetResults()
  failed = sum(1 for (success, _) in outcomes if not success)

  if failed:
    ToStderr("There were errors while %s instances:\n"
             "%d error(s) out of %d instance(s)", text_failed, failed,
             len(outcomes))
    return False

  ToStdout("All instances have been %s successfully", text_success)
  return True
971

  
972

  
973
class _RunWhenNodesReachableHelper:
  """Keeps the state shared between the retry callback and its wait function.

  @ivar success: Indicates if all action_cb calls were successful
  @ivar down: Set of nodes not yet seen as reachable
  @ivar up: Set of nodes already seen as reachable

  """
  def __init__(self, node_list, action_cb, node2ip, port,
               _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
    """Init the object.

    @param node_list: The list of nodes to be reachable
    @param action_cb: Callback called when a new host is reachable
    @type node2ip: dict
    @param node2ip: Node to ip mapping
    @param port: The port to use for the TCP ping
    @param _ping_fn: Function to check reachability (for unittest use only)
    @param _sleep_fn: Function to sleep (for unittest use only)

    """
    self.action_cb = action_cb
    self.node2ip = node2ip
    self.port = port
    self._ping_fn = _ping_fn
    self._sleep_fn = _sleep_fn
    # Every node starts out as unreachable
    self.down = set(node_list)
    self.up = set()
    self.success = True

  def __call__(self):
    """Runs action_cb with the set of nodes that are currently up.

    A false result of the callback is remembered in C{success} and never
    reset.

    @raises utils.RetryAgain: When there are still down nodes
    @return: C{success}, once no node is down anymore

    """
    action_ok = self.action_cb(self.up)
    if not action_ok:
      self.success = False

    if self.down:
      raise utils.RetryAgain()

    return self.success

  def Wait(self, secs):
    """Checks if a host is up or waits remaining seconds.

    @param secs: The secs remaining

    """
    deadline = time.time() + secs

    for node in self.down:
      reachable = self._ping_fn(self.node2ip[node], self.port,
                                timeout=_EPO_PING_TIMEOUT,
                                live_port_needed=True)
      if not reachable:
        continue

      ToStdout("Node %s became available", node)
      self.up.add(node)
      self.down -= self.up
      # A newly available node may allow the action callback to make
      # progress, so return right away instead of sleeping (this also ends
      # the iteration over the set that was just modified)
      return

    # Nothing came up; sleep for whatever is left of the requested time
    self._sleep_fn(max(0.0, deadline - time.time()))
1033

  
1034

  
1035
def _RunWhenNodesReachable(node_list, action_cb, interval):
  """Run action_cb when nodes become reachable.

  The node names are resolved to IP addresses of the cluster's primary IP
  family and the node daemon port is pinged until all nodes answer or
  C{_EPO_REACHABLE_TIMEOUT} expires.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @param interval: The earliest time to retry
  @return: The result of L{utils.Retry} for the helper (presumably the
           helper's success flag), or False if the timeout was exceeded

  """
  client = GetClient()
  cluster_info = client.QueryClusterInfo()
  if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
    family = netutils.IPAddress.family
  else:
    family = netutils.IP6Address.family

  node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
                 for node in node_list)

  port = netutils.GetDaemonPort(constants.NODED)
  helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port)

  try:
    return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
                       wait_fn=helper.Wait)
  except utils.RetryTimeout:
    # One bullet per line; the separator needs the newline, otherwise all
    # nodes after the first end up on the same line
    ToStderr("Time exceeded while waiting for nodes to become reachable"
             " again:\n  - %s", "\n  - ".join(helper.down))
    return False
1063

  
1064

  
1065
def _MaybeInstanceStartup(opts, inst_map, nodes_online,
                          _instance_start_fn=_InstanceStart):
  """Starts every instance whose nodes are all online.

  Instances handed to the start function are removed from C{inst_map}, so
  repeated calls with a growing C{nodes_online} start each instance once.

  @param opts: The command line options selected by the user
  @param inst_map: A dict of inst -> nodes mapping (modified in place)
  @param nodes_online: A set of nodes online
  @param _instance_start_fn: Callback to start instances (unittest use only)
  @return: True if nothing had to be started, otherwise the result of the
           start function

  """
  # An instance is ready once none of its nodes is missing from nodes_online
  ready = [inst for (inst, nodes) in inst_map.items()
           if not (nodes - nodes_online)]

  for inst in ready:
    del inst_map[inst]

  if not ready:
    return True

  return _instance_start_fn(opts, ready, True)
1089

  
1090

  
1091
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  Powers on the nodes supporting OOB, then waits for all nodes to become
  reachable and starts the instances as soon as their nodes are back.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # The third argument is the desired power state: True powers the nodes on
  if node_list and not _OobPower(opts, node_list, True):
    # Not fatal: the nodes may still come up, so keep waiting for them below
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up; the callback works on a copy of the
  # map as it removes the instances it has started
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1115

  
1116

  
1117
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  All instances are shut down first; only if that worked are the nodes
  powered off.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  instances_stopped = _InstanceStart(opts, inst_map.keys(), False)
  if not instances_stopped:
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Without any node to power off there is nothing left that could fail
  if node_list and not _OobPower(opts, node_list, False):
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1137

  
1138

  
1139
def Epo(opts, args):
  """EPO operations.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: names of the nodes to operate on, or names of node groups if
               C{--groups} was given; must be empty if C{--all} is used
  @rtype: int
  @return: the desired exit code

  """
  if opts.groups and opts.show_all:
    ToStderr("Only one of --groups or --all are allowed")
    return constants.EXIT_FAILURE
  elif args and opts.show_all:
    ToStderr("Arguments in combination with --all are not allowed")
    return constants.EXIT_FAILURE

  client = GetClient()

  if opts.groups:
    # Materialize the chain: an itertools.chain object can be consumed only
    # once and is not a list.
    # NOTE(review): this keeps the original flattening of the QueryGroups
    # rows; confirm the resulting elements are node names and not per-group
    # lists of names.
    node_query_list = list(itertools.chain(
      *client.QueryGroups(names=args, fields=["node_list"],
                          use_locking=False)))
  else:
    node_query_list = args

  result = client.QueryNodes(names=node_query_list,
                             fields=["name", "master", "pinst_list",
                                     "sinst_list", "powered", "offline"],
                             use_locking=False)

  # Names as reported by the query. Using them instead of the user input
  # normalizes the names without modifying "args", and also covers --all,
  # where the query is issued with an empty name list.
  all_nodes = [row[0] for row in result]

  node_list = []
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    if not offline:
      for inst in (pinsts + sinsts):
        if inst in inst_map:
          if not master:
            inst_map[inst].add(node)
        elif master:
          # The master is never waited for, hence it is not recorded
          inst_map[inst] = set()
        else:
          inst_map[inst] = set([node])

    if master and opts.on:
      # We ignore the master for turning on the machines, in fact we are
      # already operating on the master at this point :)
      continue
    elif master and not opts.show_all:
      ToStderr("%s is the master node, please do a master-failover to another"
               " node not affected by the EPO or use --all if you intend to"
               " shutdown the whole cluster", node)
      return constants.EXIT_FAILURE
    elif powered is None:
      ToStdout("Node %s does not support out-of-band handling, it can not be"
               " handled in a fully automated manner", node)
    elif powered == opts.on:
      ToStdout("Node %s is already in desired power state, skipping", node)
    elif not offline or powered:
      node_list.append(node)

  if not opts.force and not ConfirmOperation(all_nodes, "nodes", "epo"):
    return constants.EXIT_FAILURE

  if opts.on:
    return _EpoOn(opts, all_nodes, node_list, inst_map)
  else:
    return _EpoOff(opts, node_list, inst_map)
1210

  
1211

  
885 1212
commands = {
886 1213
  'init': (
887 1214
    InitCluster, [ArgHost(min=1, max=1)],
......
977 1304
     NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT],
978 1305
    "[opts...]",
979 1306
    "Renews cluster certificates, keys and secrets"),
1307
  "epo": (
1308
    Epo, [ArgUnknown()],
1309
    [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT],
1310
    "[opts...] [args]",
1311
    "Performs an emergency power-off on given args"),
980 1312
  }
981 1313

  
982 1314

  
b/man/gnt-cluster.rst
93 93
Since this is a dangerous command, you are required to pass the
94 94
argument *--yes-do-it.*
95 95

  
96
EPO
97
~~~
98

  
99
**epo** [--on] [--groups|--all] *arguments*
100

  
101
Performs an emergency power-off on nodes given as arguments. If ``--groups``
102
is given, arguments are node groups. If ``--all`` is provided, the whole
103
cluster will be shut down.
104

  
105
The ``--on`` flag recovers the cluster after an emergency power-off.
106

  
107
Please note that the master node will not be turned down or up automatically.
108
It will just be left in a state, where you can manually perform the shutdown of
109
that one node. If the master is in the list of affected nodes and this is not a
110
complete cluster emergency power-off (e.g. using ``--all``), you're required to
111
do a master failover to another node not affected.
112

  
96 113
GETMASTER
97 114
~~~~~~~~~
98 115

  
b/qa/ganeti-qa.py
411 411
      instance = RunTest(qa_instance.TestInstanceAddWithPlainDisk, pnode)
412 412
      RunCommonInstanceTests(instance)
413 413
      RunGroupListTests()
414
      RunTest(qa_cluster.TestClusterEpo)
414 415
      RunExportImportTests(instance, pnode, None)
415 416
      RunDaemonTests(instance, pnode)
416 417
      RunTest(qa_instance.TestInstanceRemove, instance)
b/qa/qa_cluster.py
27 27
import os.path
28 28

  
29 29
from ganeti import constants
30
from ganeti import compat
30 31
from ganeti import utils
31 32

  
32 33
import qa_config
33 34
import qa_utils
34 35
import qa_error
35 36

  
36
from qa_utils import AssertEqual, AssertCommand
37
from qa_utils import AssertEqual, AssertCommand, GetCommandOutput
37 38

  
38 39

  
39 40
def _RemoveFileFromAllNodes(filename):
......
150 151
                 "oob_program="])
151 152

  
152 153

  
154
def TestClusterEpo():
  """gnt-cluster epo"""
  master = qa_config.GetMasterNode()
  master_name = master["primary"]

  def _AssertAllLinesEqual(cmd, expected):
    # Runs cmd on the master and checks that every output line is "expected"
    output = GetCommandOutput(master_name, cmd)
    AssertEqual(compat.all(line == expected
                           for line in output.splitlines()), True)

  # Assert that OOB is unavailable for all nodes
  _AssertAllLinesEqual("gnt-node list --verbose --no-header -o powered",
                       "(unavail)")

  # Conflicting
  AssertCommand(["gnt-cluster", "epo", "--groups", "--all"], fail=True)
  # --all doesn't expect arguments
  AssertCommand(["gnt-cluster", "epo", "--all", "some_arg"], fail=True)

  # Unless --all is given master is not allowed to be in the list
  AssertCommand(["gnt-cluster", "epo", "-f", master_name], fail=True)

  # This shouldn't fail
  AssertCommand(["gnt-cluster", "epo", "-f", "--all"])

  # All instances should have been stopped now
  _AssertAllLinesEqual("gnt-instance list --no-header -o status",
                       "ADMIN_down")

  # Now start everything again
  AssertCommand(["gnt-cluster", "epo", "--on", "-f", "--all"])

  # All instances should have been started now
  _AssertAllLinesEqual("gnt-instance list --no-header -o status", "running")
190

  
191

  
153 192
def TestClusterVerify():
154 193
  """gnt-cluster verify"""
155 194
  AssertCommand(["gnt-cluster", "verify"])
b/test/ganeti.client.gnt_cluster_unittest.py
1
#!/usr/bin/python
2
#
3

  
4
# Copyright (C) 2011 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

  
21

  
22
"""Script for testing ganeti.client.gnt_cluster"""
23

  
24
import unittest
25

  
26
from ganeti.client import gnt_cluster
27
from ganeti import utils
28
from ganeti import compat
29

  
30
import testutils
31

  
32

  
33
class TestEpo(unittest.TestCase):
  """Tests for the EPO helpers of gnt_cluster."""

  def setUp(self):
    self.nodes2ip = dict(("node%s" % i, "192.0.2.%s" % i) for i in range(1, 10))
    self.nodes = set(self.nodes2ip.keys())
    self.ips2node = dict((v, k) for (k, v) in self.nodes2ip.items())

  def _FakeAction(self, *args):
    """Action callback which always succeeds."""
    return True

  def _FakePing(self, ip, port, timeout=None, live_port_needed=False):
    """Ping function reporting every host as reachable.

    Accepts the same keyword arguments the helper's Wait method passes to
    its ping function.

    """
    self.assertTrue(live_port_needed)
    self.assertEqual(port, 0)
    return True

  def _FakeSleep(self, secs):
    """Sleep function which only verifies the requested duration."""
    self.assertTrue(secs >= 0 and secs <= 5)

  def testPingFnRemoveHostsUp(self):
    seen = set()
    def _FakeSeenPing(ip, *args, **kwargs):
      # A node which was reported as up must never be pinged again
      node = self.ips2node[ip]
      self.assertFalse(node in seen)
      seen.add(node)
      return True

    helper = gnt_cluster._RunWhenNodesReachableHelper(self.nodes,
                                                      self._FakeAction,
                                                      self.nodes2ip, port=0,
                                                      _ping_fn=_FakeSeenPing,
                                                      _sleep_fn=self._FakeSleep)

    nodes_len = len(self.nodes)
    for (num, _) in enumerate(self.nodes):
      # Every Wait call moves exactly one node from down to up
      helper.Wait(5)
      if num < nodes_len - 1:
        self.assertRaises(utils.RetryAgain, helper)
      else:
        helper()

    self.assertEqual(seen, self.nodes)
    self.assertFalse(helper.down)
    self.assertEqual(helper.up, self.nodes)

  def testActionReturnFalseSetsHelperFalse(self):
    called = False
    def _FalseAction(*args):
      # Fails on the first call only; the failure has to stick nevertheless
      return called

    helper = gnt_cluster._RunWhenNodesReachableHelper(self.nodes, _FalseAction,
                                                      self.nodes2ip, port=0,
                                                      _ping_fn=self._FakePing,
                                                      _sleep_fn=self._FakeSleep)
    for _ in self.nodes:
      try:
        helper()
      except utils.RetryAgain:
        called = True

    self.assertFalse(helper.success)

  def testMaybeInstanceStartup(self):
    instances_arg = []
    def _FakeInstanceStart(opts, instances, start):
      # Returns None so that a call of the start function can be told apart
      # from the "nothing to start" result (True)
      instances_arg.append(set(instances))
      return None

    inst_map = {
      "inst1": set(["node1", "node2"]),
      "inst2": set(["node1", "node3"]),
      "inst3": set(["node2", "node1"]),
      "inst4": set(["node2", "node1", "node3"]),
      "inst5": set(["node4"]),
      }

    fn = _FakeInstanceStart
    self.assertTrue(gnt_cluster._MaybeInstanceStartup(None, inst_map, set(),
                                                      _instance_start_fn=fn))
    self.assertFalse(instances_arg)
    result = gnt_cluster._MaybeInstanceStartup(None, inst_map, set(["node1"]),
                                               _instance_start_fn=fn)
    self.assertTrue(result)
    self.assertFalse(instances_arg)
    result = gnt_cluster._MaybeInstanceStartup(None, inst_map,
                                               set(["node1", "node3"]),
                                               _instance_start_fn=fn)
    self.assertTrue(result is None)
    self.assertEqual(instances_arg.pop(0), set(["inst2"]))
    self.assertFalse("inst2" in inst_map)
    # Same nodes again: inst2 was removed from the map, nothing to start
    result = gnt_cluster._MaybeInstanceStartup(None, inst_map,
                                               set(["node1", "node3"]),
                                               _instance_start_fn=fn)
    self.assertTrue(result)
    self.assertFalse(instances_arg)
    result = gnt_cluster._MaybeInstanceStartup(None, inst_map,
                                               set(["node1", "node3", "node2"]),
                                               _instance_start_fn=fn)
    self.assertEqual(instances_arg.pop(0), set(["inst1", "inst3", "inst4"]))
    self.assertTrue(result is None)
    result = gnt_cluster._MaybeInstanceStartup(None, inst_map,
                                               set(["node1", "node3", "node2",
                                                    "node4"]),
                                               _instance_start_fn=fn)
    self.assertTrue(result is None)
    self.assertEqual(instances_arg.pop(0), set(["inst5"]))
    self.assertFalse(inst_map)
139

  
140

  
141
if __name__ == "__main__":
142
  testutils.GanetiTestProgram()

Also available in: Unified diff