Revision 66d1f035 lib/client/gnt_cluster.py

b/lib/client/gnt_cluster.py
29 29
import os.path
30 30
import time
31 31
import OpenSSL
32
import itertools
32 33

  
33 34
from ganeti.cli import *
34 35
from ganeti import opcodes
......
40 41
from ganeti import objects
41 42
from ganeti import uidpool
42 43
from ganeti import compat
44
from ganeti import netutils
45

  
46

  
47
# Switch selecting the power-on (recovery) direction of the "epo" command
ON_OPT = cli_option(
  "--on", dest="on", action="store_true", default=False,
  help="Recover from an EPO")

# Switch making the "epo" command read its arguments as node group names
GROUPS_OPT = cli_option(
  "--groups", dest="groups", action="store_true", default=False,
  help="Arguments are node groups instead of nodes")

# Interval handed to the reachability retry loop, in seconds
_EPO_PING_INTERVAL = 30
# Timeout of a single TCP ping against a node, in seconds
_EPO_PING_TIMEOUT = 1
# Overall time to wait for nodes to become reachable again (15 minutes)
_EPO_REACHABLE_TIMEOUT = 15 * 60
43 58

  
44 59

  
45 60
@UsesRPC
......
882 897
  return 0
883 898

  
884 899

  
900
def _OobPower(opts, node_list, power):
  """Switches the given nodes on or off via an out-of-band command.

  A single L{opcodes.OpOobCommand} is submitted for all nodes; every node
  whose result status is not C{constants.RS_NORMAL} is reported on stderr.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on
  @param power: True if they should be powered on, False otherwise
  @return: The success of the operation (none failed)

  """
  command = constants.OOB_POWER_OFF
  if power:
    command = constants.OOB_POWER_ON

  oob_op = opcodes.OpOobCommand(node_names=node_list,
                                command=command,
                                ignore_status=True,
                                timeout=opts.oob_timeout)

  failed = False
  # Each entry is a pair of (status, value) tuples: one for the node name
  # and one for the command result
  for ((_, node_name), (data_status, _)) in SubmitOpCode(oob_op, opts=opts):
    if data_status == constants.RS_NORMAL:
      continue

    assert data_status != constants.RS_UNAVAIL
    failed = True
    ToStderr("There was a problem changing power for %s, please investigate",
             node_name)

  return not failed
934

  
935

  
936
def _InstanceStart(opts, inst_list, start):
  """Starts up or shuts down all instances in the list.

  One job per instance is queued on a L{JobExecutor}; the outcome is
  summarized on stdout (all fine) or stderr (at least one failure).

  @param opts: The command line options selected by the user
  @param inst_list: The list of instances to operate on
  @param start: True if they should be started, False for shutdown
  @return: The success of the operation (none failed)

  """
  if start:
    opcls = opcodes.OpInstanceStartup
    wording = ("startup", "started", "starting")
  else:
    opcls = opcodes.OpInstanceShutdown
    wording = ("shutdown", "stopped", "stopping")

  (submit_txt, done_txt, failing_txt) = wording

  executor = JobExecutor(opts=opts)
  for name in inst_list:
    ToStdout("Submit %s of instance %s", submit_txt, name)
    executor.QueueJob(name, opcls(instance_name=name))

  job_results = executor.GetResults()
  failures = sum(1 for (success, _) in job_results if not success)

  if failures:
    ToStderr("There were errors while %s instances:\n"
             "%d error(s) out of %d instance(s)", failing_txt, failures,
             len(job_results))
    return False

  ToStdout("All instances have been %s successfully", done_txt)
  return True
971

  
972

  
973
class _RunWhenNodesReachableHelper:
  """Keeps the state shared between the retry callback and its wait function.

  @ivar down: Set of nodes which did not answer a ping yet
  @ivar up: Set of nodes which answered a ping
  @ivar success: Indicates if all action_cb calls were successful

  """
  def __init__(self, node_list, action_cb, node2ip, port,
               _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
    """Init the object.

    @param node_list: The list of nodes to be reachable
    @param action_cb: Callback called when a new host is reachable
    @type node2ip: dict
    @param node2ip: Node to ip mapping
    @param port: The port to use for the TCP ping
    @param _ping_fn: Function to check reachability (for unittest use only)
    @param _sleep_fn: Function to sleep (for unittest use only)

    """
    self.node2ip = node2ip
    self.port = port
    self.action_cb = action_cb
    self._ping_fn = _ping_fn
    self._sleep_fn = _sleep_fn
    # All nodes start out as unreachable; Wait() moves them over one by one
    self.down = set(node_list)
    self.up = set()
    self.success = True

  def __call__(self):
    """Runs action_cb with the set of nodes which are up.

    @raises utils.RetryAgain: When there are still down nodes
    @return: False if any action_cb call so far failed, True otherwise

    """
    callback_ok = self.action_cb(self.up)
    if not callback_ok:
      # A failure is remembered across all further calls
      self.success = False

    if self.down:
      raise utils.RetryAgain()

    return self.success

  def Wait(self, secs):
    """Pings the down nodes; sleeps the remaining time if none answered.

    @param secs: The secs remaining

    """
    deadline = time.time() + secs

    for node in self.down:
      reachable = self._ping_fn(self.node2ip[node], self.port,
                                timeout=_EPO_PING_TIMEOUT,
                                live_port_needed=True)
      if not reachable:
        continue

      ToStdout("Node %s became available", node)
      self.up.add(node)
      self.down.difference_update(self.up)
      # With a newly available node the action callback has a chance to
      # do something useful, so return right away instead of sleeping.
      # Returning here also ends the iteration over the set just modified.
      return

    self._sleep_fn(max(0.0, deadline - time.time()))
1033

  
1034

  
1035
def _RunWhenNodesReachable(node_list, action_cb, interval):
  """Run action_cb when nodes become reachable.

  The node names are resolved to IP addresses of the cluster's primary IP
  family and then pinged on the node daemon port until all of them answer
  or C{_EPO_REACHABLE_TIMEOUT} expires.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @param interval: The earliest time to retry
  @return: The value produced by L{utils.Retry} (presumably the helper's
           success flag, to be confirmed against utils.Retry), or False if
           waiting for the nodes timed out

  """
  client = GetClient()
  cluster_info = client.QueryClusterInfo()
  if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
    family = netutils.IPAddress.family
  else:
    family = netutils.IP6Address.family

  node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
                 for node in node_list)

  port = netutils.GetDaemonPort(constants.NODED)
  helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port)

  try:
    return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
                       wait_fn=helper.Wait)
  except utils.RetryTimeout:
    # One bullet per line; sorted as helper.down is a set and would otherwise
    # be printed in arbitrary order
    ToStderr("Time exceeded while waiting for nodes to become reachable"
             " again:\n  - %s", "\n  - ".join(sorted(helper.down)))
    return False
1063

  
1064

  
1065
def _MaybeInstanceStartup(opts, inst_map, nodes_online,
                          _instance_start_fn=_InstanceStart):
  """Starts those instances whose nodes are all online.

  Instances handed to the start function are removed from C{inst_map}, i.e.
  the passed dict is modified in place.

  @param opts: The command line options selected by the user
  @param inst_map: A dict of inst -> nodes mapping
  @param nodes_online: A list of nodes online
  @param _instance_start_fn: Callback to start instances (unittest use only)
  @return: Success of the operation on all instances

  """
  # An instance is ready once no node it lives on is missing from the
  # online nodes
  ready = [inst for (inst, nodes) in inst_map.items()
           if not (nodes - nodes_online)]

  for inst in ready:
    del inst_map[inst]

  if not ready:
    return True

  return _instance_start_fn(opts, ready, True)
1089

  
1090

  
1091
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  Powers on the OOB capable nodes, then waits for all nodes to become
  reachable and starts instances as soon as their nodes are back.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # The third argument selects the power state: True requests power on.
  # A failure here is only reported; nodes may still come up (or be started
  # manually), so waiting for them below is done regardless.
  if node_list and not _OobPower(opts, node_list, True):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up; the callback works on a copy of the
  # mapping as it removes the instances it has started
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1115

  
1116

  
1117
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  All instances of the mapping are shut down first; the nodes are only
  powered off if that succeeded.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  instances_stopped = _InstanceStart(opts, inst_map.keys(), False)
  if not instances_stopped:
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Without any node to power off there is nothing left which could fail
  if node_list and not _OobPower(opts, node_list, False):
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1137

  
1138

  
1139
def Epo(opts, args):
  """EPO operations.

  Queries the selected nodes, works out which of them need a power state
  change and which instances live on them, asks for confirmation (unless
  forced) and then runs the power-on or power-off procedure.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the nodes (or node groups, with --groups) to operate on; must
      be empty when --all is used
  @rtype: int
  @return: the desired exit code

  """
  if opts.groups and opts.show_all:
    ToStderr("Only one of --groups or --all are allowed")
    return constants.EXIT_FAILURE
  elif args and opts.show_all:
    ToStderr("Arguments in combination with --all are not allowed")
    return constants.EXIT_FAILURE

  client = GetClient()

  if opts.groups:
    # Materialized into a list so the query receives a plain sequence
    # instead of a one-shot iterator.
    # NOTE(review): this flattens the result rows by one level only; confirm
    # that QueryGroups' row layout makes this a flat list of node names.
    node_query_list = list(
      itertools.chain(*client.QueryGroups(names=args,
                                          fields=["node_list"],
                                          use_locking=False)))
  else:
    node_query_list = args

  result = client.QueryNodes(names=node_query_list,
                             fields=["name", "master", "pinst_list",
                                     "sinst_list", "powered", "offline"],
                             use_locking=False)

  # Node names as reported by the query. They are collected separately
  # instead of being written back into the query list: that list belongs to
  # the caller, and is empty when --all is used (presumably the query then
  # returns every node - verify against QueryNodes).
  all_nodes = []
  # Nodes which need their power state changed through OOB
  node_list = []
  # Instance name -> set of non-master nodes the instance lives on
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    all_nodes.append(node)

    if not offline:
      for inst in (pinsts + sinsts):
        if inst in inst_map:
          if not master:
            inst_map[inst].add(node)
        elif master:
          # The master is never recorded as a node to wait for
          inst_map[inst] = set()
        else:
          inst_map[inst] = set([node])

    if master and opts.on:
      # We ignore the master for turning on the machines, in fact we are
      # already operating on the master at this point :)
      continue
    elif master and not opts.show_all:
      ToStderr("%s is the master node, please do a master-failover to another"
               " node not affected by the EPO or use --all if you intend to"
               " shutdown the whole cluster", node)
      return constants.EXIT_FAILURE
    elif powered is None:
      ToStdout("Node %s does not support out-of-band handling, it can not be"
               " handled in a fully automated manner", node)
    elif powered == opts.on:
      ToStdout("Node %s is already in desired power state, skipping", node)
    elif not offline or powered:
      node_list.append(node)

  if not opts.force and not ConfirmOperation(all_nodes, "nodes", "epo"):
    return constants.EXIT_FAILURE

  if opts.on:
    return _EpoOn(opts, all_nodes, node_list, inst_map)
  else:
    return _EpoOff(opts, node_list, inst_map)
1210

  
1211

  
885 1212
commands = {
886 1213
  'init': (
887 1214
    InitCluster, [ArgHost(min=1, max=1)],
......
977 1304
     NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT],
978 1305
    "[opts...]",
979 1306
    "Renews cluster certificates, keys and secrets"),
1307
  "epo": (
1308
    Epo, [ArgUnknown()],
1309
    [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT],
1310
    "[opts...] [args]",
1311
    "Performs an emergency power-off on given args"),
980 1312
  }
981 1313

  
982 1314

  

Also available in: Unified diff