Revision 66d1f035 lib/client/gnt_cluster.py
b/lib/client/gnt_cluster.py | ||
---|---|---|
29 | 29 |
import os.path |
30 | 30 |
import time |
31 | 31 |
import OpenSSL |
32 |
import itertools |
|
32 | 33 |
|
33 | 34 |
from ganeti.cli import * |
34 | 35 |
from ganeti import opcodes |
... | ... | |
40 | 41 |
from ganeti import objects |
41 | 42 |
from ganeti import uidpool |
42 | 43 |
from ganeti import compat |
44 |
from ganeti import netutils |
|
45 |
|
|
46 |
|
|
47 |
# Switch selecting the "power on" direction of the EPO command.
ON_OPT = cli_option("--on", dest="on", action="store_true", default=False,
                    help="Recover from an EPO")

# Switch making the positional arguments name node groups instead of nodes.
GROUPS_OPT = cli_option("--groups", dest="groups", action="store_true",
                        default=False,
                        help="Arguments are node groups instead of nodes")

# Timing parameters (all in seconds) used while waiting for nodes to return.
_EPO_PING_INTERVAL = 30  # pause between pings
_EPO_PING_TIMEOUT = 1  # timeout of a single ping
_EPO_REACHABLE_TIMEOUT = 15 * 60  # overall limit of 15 minutes
|
43 | 58 |
|
44 | 59 |
|
45 | 60 |
@UsesRPC |
... | ... | |
882 | 897 |
return 0 |
883 | 898 |
|
884 | 899 |
|
900 |
def _OobPower(opts, node_list, power):
  """Requests the desired power state for the given nodes.

  A single out-of-band opcode is submitted for all nodes; every per-node
  result that is not C{RS_NORMAL} is reported on stderr.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on
  @param power: True if they should be powered on, False otherwise
  @return: True if no node reported a problem, False otherwise

  """
  command = constants.OOB_POWER_OFF
  if power:
    command = constants.OOB_POWER_ON

  oob_op = opcodes.OpOobCommand(node_names=node_list,
                                command=command,
                                ignore_status=True,
                                timeout=opts.oob_timeout)

  all_ok = True
  # Each result entry is a pair of (status, value) tuples: one describing the
  # node name, one describing the outcome of the command on that node
  for ((_, node_name), (data_status, _)) in SubmitOpCode(oob_op, opts=opts):
    if data_status == constants.RS_NORMAL:
      continue

    assert data_status != constants.RS_UNAVAIL
    all_ok = False
    ToStderr("There was a problem changing power for %s, please investigate",
             node_name)

  return all_ok
|
934 |
|
|
935 |
|
|
936 |
def _InstanceStart(opts, inst_list, start):
  """Starts up or shuts down all given instances.

  One job per instance is queued and the outcome of all of them is
  summarised on stdout (success) or stderr (failures).

  @param opts: The command line options selected by the user
  @param inst_list: The list of instances to operate on
  @param start: True if they should be started, False for shutdown
  @return: True if every job succeeded, False otherwise

  """
  if start:
    opcls = opcodes.OpInstanceStartup
    (text_submit, text_success, text_failed) = \
      ("startup", "started", "starting")
  else:
    opcls = opcodes.OpInstanceShutdown
    (text_submit, text_success, text_failed) = \
      ("shutdown", "stopped", "stopping")

  executor = JobExecutor(opts=opts)
  for name in inst_list:
    ToStdout("Submit %s of instance %s", text_submit, name)
    executor.QueueJob(name, opcls(instance_name=name))

  results = executor.GetResults()

  # Count the jobs whose success flag is not set
  failures = 0
  for (success, _) in results:
    if not success:
      failures += 1

  if failures:
    ToStderr("There were errors while %s instances:\n"
             "%d error(s) out of %d instance(s)", text_failed, failures,
             len(results))
    return False

  ToStdout("All instances have been %s successfully", text_success)
  return True
|
971 |
|
|
972 |
|
|
973 |
class _RunWhenNodesReachableHelper:
  """Keeps the state shared between the retried call and its wait function.

  The instance itself is the callable being retried, while L{Wait} is used to
  pass the time between two attempts.

  @ivar success: Indicates if all action_cb calls were successful

  """
  def __init__(self, node_list, action_cb, node2ip, port,
               _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
    """Sets up the helper.

    @param node_list: The list of nodes to be reachable
    @param action_cb: Callback called when a new host is reachable
    @type node2ip: dict
    @param node2ip: Node to ip mapping
    @param port: The port to use for the TCP ping
    @param _ping_fn: Function to check reachability (for unittest use only)
    @param _sleep_fn: Function to sleep (for unittest use only)

    """
    self._sleep_fn = _sleep_fn
    self._ping_fn = _ping_fn
    self.port = port
    self.action_cb = action_cb
    self.node2ip = node2ip
    self.success = True
    # Every node starts out as unreachable
    self.up = set()
    self.down = set(node_list)

  def __call__(self):
    """Runs action_cb with the set of nodes seen as reachable so far.

    @raises utils.RetryAgain: When there are still down nodes
    @return: The accumulated success state once no node is down anymore

    """
    callback_ok = self.action_cb(self.up)
    if not callback_ok:
      # A failure is remembered for all later calls
      self.success = False

    if not self.down:
      return self.success

    raise utils.RetryAgain()

  def Wait(self, secs):
    """Pings the down nodes and sleeps the remaining time if none answers.

    @param secs: The secs remaining

    """
    begin = time.time()

    for name in self.down:
      reachable = self._ping_fn(self.node2ip[name], self.port,
                                timeout=_EPO_PING_TIMEOUT,
                                live_port_needed=True)
      if not reachable:
        continue

      ToStdout("Node %s became available", name)
      self.up.add(name)
      self.down.difference_update(self.up)
      # With a newly reachable node the action callback may be able to make
      # progress, hence return right away instead of waiting
      return

    # The pings may have used up (part of) the wait time already
    self._sleep_fn(max(0.0, begin + secs - time.time()))
|
1033 |
|
|
1034 |
|
|
1035 |
def _RunWhenNodesReachable(node_list, action_cb, interval):
  """Run action_cb when nodes become reachable.

  The node names are resolved using the address family matching the
  cluster's primary IP version and the node daemon port is pinged until no
  node is left unreachable or L{_EPO_REACHABLE_TIMEOUT} is exceeded.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @param interval: The earliest time to retry
  @return: Whatever L{utils.Retry} returns for the helper, or False if the
      timeout was exceeded

  """
  client = GetClient()
  cluster_info = client.QueryClusterInfo()
  if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
    family = netutils.IPAddress.family
  else:
    family = netutils.IP6Address.family

  node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
                 for node in node_list)

  port = netutils.GetDaemonPort(constants.NODED)
  helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port)

  try:
    return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
                       wait_fn=helper.Wait)
  except utils.RetryTimeout:
    # Put every unreachable node on its own bullet line; the names are sorted
    # as the iteration order of a set is arbitrary
    ToStderr("Time exceeded while waiting for nodes to become reachable"
             " again:\n - %s", "\n - ".join(sorted(helper.down)))
    return False
|
1063 |
|
|
1064 |
|
|
1065 |
def _MaybeInstanceStartup(opts, inst_map, nodes_online,
                          _instance_start_fn=_InstanceStart):
  """Starts those instances whose nodes are all online again.

  Instances handed to the start function are removed from C{inst_map}, so
  they are not considered again by a later call using the same mapping.

  @param opts: The command line options selected by the user
  @param inst_map: A dict of inst -> nodes mapping
  @param nodes_online: The set of nodes online
  @param _instance_start_fn: Callback to start instances (unittest use only)
  @return: Success of the operation on all instances

  """
  # An instance is ready once none of its nodes is missing from nodes_online
  ready = [inst for (inst, nodes) in inst_map.items()
           if not (nodes - nodes_online)]

  if not ready:
    return True

  for inst in ready:
    del inst_map[inst]

  return _instance_start_fn(opts, ready, True)
|
1089 |
|
|
1090 |
|
|
1091 |
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  The nodes supporting out-of-band management are powered on first. After
  that all nodes are waited for and instances are started as soon as the
  nodes they live on are reachable again.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # The nodes have to be powered *on* here, hence power=True; a failure is
  # only reported as the nodes might still come up
  if node_list and not _OobPower(opts, node_list, True):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up; the callback removes started instances
  # from its mapping, therefore it works on a copy of the caller's dict
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
|
1115 |
|
|
1116 |
|
|
1117 |
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  All instances are shut down first; the nodes are only powered off if that
  worked out for every instance.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  instances_stopped = _InstanceStart(opts, inst_map.keys(), False)
  if not instances_stopped:
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # Without nodes to power off there is nothing left to do
  if not node_list or _OobPower(opts, node_list, False):
    return constants.EXIT_SUCCESS

  return constants.EXIT_FAILURE
|
1137 |
|
|
1138 |
|
|
1139 |
def Epo(opts, args):
  """EPO operations.

  Depending on C{opts.on} the selected nodes are either powered off (after
  shutting down their instances) or powered on again (followed by starting
  the instances once their nodes are reachable).

  @param opts: the command line options selected by the user
  @type args: list
  @param args: the names of the nodes (or of the node groups if C{--groups}
      was given) to operate on; must be empty if C{--all} was given
  @rtype: int
  @return: the desired exit code

  """
  if opts.groups and opts.show_all:
    ToStderr("Only one of --groups or --all are allowed")
    return constants.EXIT_FAILURE
  elif args and opts.show_all:
    ToStderr("Arguments in combination with --all are not allowed")
    return constants.EXIT_FAILURE

  client = GetClient()

  if opts.groups:
    # Materialise the iterator so that a plain list is handed to the query.
    # NOTE(review): this assumes that chaining the group rows yields node
    # names; confirm against the result format of QueryGroups
    node_query_list = list(itertools.chain(*client.QueryGroups(
      names=args, fields=["node_list"], use_locking=False)))
  else:
    node_query_list = args

  result = client.QueryNodes(names=node_query_list,
                             fields=["name", "master", "pinst_list",
                                     "sinst_list", "powered", "offline"],
                             use_locking=False)

  # Take the node names from the query result instead of patching the query
  # list in place: the latter fails for --groups and leaves the list empty
  # for --all
  all_nodes = [row[0] for row in result]

  node_list = []
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    if not offline:
      # Record for every instance the non-master nodes it lives on
      for inst in (pinsts + sinsts):
        if inst in inst_map:
          if not master:
            inst_map[inst].add(node)
        elif master:
          inst_map[inst] = set()
        else:
          inst_map[inst] = set([node])

    if master and opts.on:
      # We ignore the master for turning on the machines, in fact we are
      # already operating on the master at this point :)
      continue
    elif master and not opts.show_all:
      ToStderr("%s is the master node, please do a master-failover to another"
               " node not affected by the EPO or use --all if you intend to"
               " shutdown the whole cluster", node)
      return constants.EXIT_FAILURE
    elif powered is None:
      ToStdout("Node %s does not support out-of-band handling, it can not be"
               " handled in a fully automated manner", node)
    elif powered == opts.on:
      ToStdout("Node %s is already in desired power state, skipping", node)
    elif not offline or powered:
      node_list.append(node)

  if not opts.force and not ConfirmOperation(all_nodes, "nodes", "epo"):
    return constants.EXIT_FAILURE

  if opts.on:
    return _EpoOn(opts, all_nodes, node_list, inst_map)
  else:
    return _EpoOff(opts, node_list, inst_map)
|
1210 |
|
|
1211 |
|
|
885 | 1212 |
commands = { |
886 | 1213 |
'init': ( |
887 | 1214 |
InitCluster, [ArgHost(min=1, max=1)], |
... | ... | |
977 | 1304 |
NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT], |
978 | 1305 |
"[opts...]", |
979 | 1306 |
"Renews cluster certificates, keys and secrets"), |
1307 |
"epo": ( |
|
1308 |
Epo, [ArgUnknown()], |
|
1309 |
[FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT], |
|
1310 |
"[opts...] [args]", |
|
1311 |
"Performs an emergency power-off on given args"), |
|
980 | 1312 |
} |
981 | 1313 |
|
982 | 1314 |
|
Also available in: Unified diff