X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/bee581e24b874fe2178ec82af0c72fb2dd312ef9..3f1e065d5095b2c0cda036a130575458c8f270af:/lib/cmdlib.py diff --git a/lib/cmdlib.py b/lib/cmdlib.py index d7e95b8..98a056c 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -57,6 +57,7 @@ from ganeti import netutils from ganeti import query from ganeti import qlang from ganeti import opcodes +from ganeti import ht import ganeti.masterd.instance # pylint: disable-msg=W0611 @@ -1153,7 +1154,7 @@ def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot): iallocator = getattr(lu.op, iallocator_slot, None) if node is not None and iallocator is not None: - raise errors.OpPrereqError("Do not specify both, iallocator and node.", + raise errors.OpPrereqError("Do not specify both, iallocator and node", errors.ECODE_INVAL) elif node is None and iallocator is None: default_iallocator = lu.cfg.GetDefaultIAllocator() @@ -1161,10 +1162,10 @@ def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot): setattr(lu.op, iallocator_slot, default_iallocator) else: raise errors.OpPrereqError("No iallocator or node given and no" - " cluster-wide default iallocator found." - " Please specify either an iallocator or a" + " cluster-wide default iallocator found;" + " please specify either an iallocator or a" " node, or set a cluster-wide default" - " iallocator.") + " iallocator") class LUClusterPostInit(LogicalUnit): @@ -1253,7 +1254,7 @@ class LUClusterDestroy(LogicalUnit): def _VerifyCertificate(filename): - """Verifies a certificate for LUClusterVerify. + """Verifies a certificate for LUClusterVerifyConfig. @type filename: string @param filename: Path to PEM file @@ -1263,7 +1264,7 @@ def _VerifyCertificate(filename): cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, utils.ReadFile(filename)) except Exception, err: # pylint: disable-msg=W0703 - return (LUClusterVerify.ETYPE_ERROR, + return (LUClusterVerifyConfig.ETYPE_ERROR, "Failed to load X509 certificate %s: %s" % (filename, err)) (errcode, msg) = \ @@ -1278,21 +1279,52 @@ def _VerifyCertificate(filename): if errcode is None: return (None, fnamemsg) elif errcode == utils.CERT_WARNING: - return (LUClusterVerify.ETYPE_WARNING, fnamemsg) + return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg) elif errcode == utils.CERT_ERROR: - return (LUClusterVerify.ETYPE_ERROR, fnamemsg) + return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg) raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode) -class LUClusterVerify(LogicalUnit): - """Verifies the cluster status. +def _GetAllHypervisorParameters(cluster, instances): + """Compute the set of all hypervisor parameters. 
+ + @type cluster: L{objects.Cluster} + @param cluster: the cluster object + @param instances: list of L{objects.Instance} + @param instances: additional instances from which to obtain parameters + @rtype: list of (origin, hypervisor, parameters) + @return: a list with all parameters found, indicating the hypervisor they + apply to, and the origin (can be "cluster", "os X", or "instance Y") """ - HPATH = "cluster-verify" - HTYPE = constants.HTYPE_CLUSTER - REQ_BGL = False + hvp_data = [] + + for hv_name in cluster.enabled_hypervisors: + hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name))) + + for os_name, os_hvp in cluster.os_hvp.items(): + for hv_name, hv_params in os_hvp.items(): + if hv_params: + full_params = cluster.GetHVDefaults(hv_name, os_name=os_name) + hvp_data.append(("os %s" % os_name, hv_name, full_params)) + + # TODO: collapse identical parameter values in a single one + for instance in instances: + if instance.hvparams: + hvp_data.append(("instance %s" % instance.name, instance.hypervisor, + cluster.FillHV(instance))) + + return hvp_data + +class _VerifyErrors(object): + """Mix-in for cluster/group verify LUs. + + It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects + self.op and self._feedback_fn to be available.) + + """ TCLUSTER = "cluster" TNODE = "node" TINSTANCE = "instance" @@ -1300,6 +1332,8 @@ class LUClusterVerify(LogicalUnit): ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG") ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT") ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK") + ECLUSTERDANGLINGNODES = (TNODE, "ECLUSTERDANGLINGNODES") + ECLUSTERDANGLINGINST = (TNODE, "ECLUSTERDANGLINGINST") EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE") EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN") EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT") @@ -1329,6 +1363,138 @@ class LUClusterVerify(LogicalUnit): ETYPE_ERROR = "ERROR" ETYPE_WARNING = "WARNING" + def _Error(self, ecode, item, msg, *args, **kwargs): + """Format an error message. + + Based on the opcode's error_codes parameter, either format a + parseable error code, or a simpler error string. + + This must be called only from Exec and functions called from Exec. + + """ + ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) + itype, etxt = ecode + # first complete the msg + if args: + msg = msg % args + # then format the whole message + if self.op.error_codes: # This is a mix-in. pylint: disable-msg=E1101 + msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg) + else: + if item: + item = " " + item + else: + item = "" + msg = "%s: %s%s: %s" % (ltype, itype, item, msg) + # and finally report it via the feedback_fn + self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable-msg=E1101 + + def _ErrorIf(self, cond, *args, **kwargs): + """Log an error message if the passed condition is True. + + """ + cond = (bool(cond) + or self.op.debug_simulate_errors) # pylint: disable-msg=E1101 + if cond: + self._Error(*args, **kwargs) + # do not mark the operation as failed for WARN cases only + if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR: + self.bad = self.bad or cond + + +class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors): + """Verifies the cluster config. + + """ + REQ_BGL = False + + def _VerifyHVP(self, hvp_data): + """Verifies locally the syntax of the hypervisor parameters. 
+ + """ + for item, hv_name, hv_params in hvp_data: + msg = ("hypervisor %s parameters syntax check (source %s): %%s" % + (item, hv_name)) + try: + hv_class = hypervisor.GetHypervisor(hv_name) + utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) + hv_class.CheckParameterSyntax(hv_params) + except errors.GenericError, err: + self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err)) + + def ExpandNames(self): + self.all_group_info = self.cfg.GetAllNodeGroupsInfo() + self.all_node_info = self.cfg.GetAllNodesInfo() + self.all_inst_info = self.cfg.GetAllInstancesInfo() + self.needed_locks = {} + + def Exec(self, feedback_fn): + """Verify integrity of cluster, performing various test on nodes. + + """ + self.bad = False + self._feedback_fn = feedback_fn + + feedback_fn("* Verifying cluster config") + + for msg in self.cfg.VerifyConfig(): + self._ErrorIf(True, self.ECLUSTERCFG, None, msg) + + feedback_fn("* Verifying cluster certificate files") + + for cert_filename in constants.ALL_CERT_FILES: + (errcode, msg) = _VerifyCertificate(cert_filename) + self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode) + + feedback_fn("* Verifying hypervisor parameters") + + self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(), + self.all_inst_info.values())) + + feedback_fn("* Verifying all nodes belong to an existing group") + + # We do this verification here because, should this bogus circumstance + # occur, it would never be catched by VerifyGroup, which only acts on + # nodes/instances reachable from existing node groups. + + dangling_nodes = set(node.name for node in self.all_node_info.values() + if node.group not in self.all_group_info) + + dangling_instances = {} + no_node_instances = [] + + for inst in self.all_inst_info.values(): + if inst.primary_node in dangling_nodes: + dangling_instances.setdefault(inst.primary_node, []).append(inst.name) + elif inst.primary_node not in self.all_node_info: + no_node_instances.append(inst.name) + + pretty_dangling = [ + "%s (%s)" % + (node.name, + utils.CommaJoin(dangling_instances.get(node.name, + ["no instances"]))) + for node in dangling_nodes] + + self._ErrorIf(bool(dangling_nodes), self.ECLUSTERDANGLINGNODES, None, + "the following nodes (and their instances) belong to a non" + " existing group: %s", utils.CommaJoin(pretty_dangling)) + + self._ErrorIf(bool(no_node_instances), self.ECLUSTERDANGLINGINST, None, + "the following instances have a non-existing primary-node:" + " %s", utils.CommaJoin(no_node_instances)) + + return (not self.bad, [g.name for g in self.all_group_info.values()]) + + +class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): + """Verifies the status of a node group. + + """ + HPATH = "cluster-verify" + HTYPE = constants.HTYPE_CLUSTER + REQ_BGL = False + _HOOKS_INDENT_RE = re.compile("^", re.M) class NodeImage(object): @@ -1382,48 +1548,90 @@ class LUClusterVerify(LogicalUnit): self.oslist = {} def ExpandNames(self): + # This raises errors.OpPrereqError on its own: + self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name) + + all_node_info = self.cfg.GetAllNodesInfo() + all_inst_info = self.cfg.GetAllInstancesInfo() + + node_names = set(node.name + for node in all_node_info.values() + if node.group == self.group_uuid) + + inst_names = [inst.name + for inst in all_inst_info.values() + if inst.primary_node in node_names] + + # In Exec(), we warn about mirrored instances that have primary and + # secondary living in separate node groups. 
To fully verify that + # volumes for these instances are healthy, we will need to do an + # extra call to their secondaries. We ensure here those nodes will + # be locked. + for inst in inst_names: + if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR: + node_names.update(all_inst_info[inst].secondary_nodes) + self.needed_locks = { - locking.LEVEL_NODE: locking.ALL_SET, - locking.LEVEL_INSTANCE: locking.ALL_SET, + locking.LEVEL_NODEGROUP: [self.group_uuid], + locking.LEVEL_NODE: list(node_names), + locking.LEVEL_INSTANCE: inst_names, } + self.share_locks = dict.fromkeys(locking.LEVELS, 1) - def _Error(self, ecode, item, msg, *args, **kwargs): - """Format an error message. + def CheckPrereq(self): + self.all_node_info = self.cfg.GetAllNodesInfo() + self.all_inst_info = self.cfg.GetAllInstancesInfo() - Based on the opcode's error_codes parameter, either format a - parseable error code, or a simpler error string. + group_nodes = set(node.name + for node in self.all_node_info.values() + if node.group == self.group_uuid) - This must be called only from Exec and functions called from Exec. + group_instances = set(inst.name + for inst in self.all_inst_info.values() + if inst.primary_node in group_nodes) - """ - ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) - itype, etxt = ecode - # first complete the msg - if args: - msg = msg % args - # then format the whole message - if self.op.error_codes: - msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg) - else: - if item: - item = " " + item - else: - item = "" - msg = "%s: %s%s: %s" % (ltype, itype, item, msg) - # and finally report it via the feedback_fn - self._feedback_fn(" - %s" % msg) + unlocked_nodes = \ + group_nodes.difference(self.glm.list_owned(locking.LEVEL_NODE)) - def _ErrorIf(self, cond, *args, **kwargs): - """Log an error message if the passed condition is True. + unlocked_instances = \ + group_instances.difference(self.glm.list_owned(locking.LEVEL_INSTANCE)) - """ - cond = bool(cond) or self.op.debug_simulate_errors - if cond: - self._Error(*args, **kwargs) - # do not mark the operation as failed for WARN cases only - if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR: - self.bad = self.bad or cond + if unlocked_nodes: + raise errors.OpPrereqError("missing lock for nodes: %s" % + utils.CommaJoin(unlocked_nodes)) + + if unlocked_instances: + raise errors.OpPrereqError("missing lock for instances: %s" % + utils.CommaJoin(unlocked_instances)) + + self.my_node_names = utils.NiceSort(group_nodes) + self.my_inst_names = utils.NiceSort(group_instances) + + self.my_node_info = dict((name, self.all_node_info[name]) + for name in self.my_node_names) + + self.my_inst_info = dict((name, self.all_inst_info[name]) + for name in self.my_inst_names) + + # We detect here the nodes that will need the extra RPC calls for verifying + # split LV volumes; they should be locked. 
+ extra_lv_nodes = set() + + for inst in self.my_inst_info.values(): + if inst.disk_template in constants.DTS_INT_MIRROR: + group = self.my_node_info[inst.primary_node].group + for nname in inst.secondary_nodes: + if self.all_node_info[nname].group != group: + extra_lv_nodes.add(nname) + + unlocked_lv_nodes = \ + extra_lv_nodes.difference(self.glm.list_owned(locking.LEVEL_NODE)) + + if unlocked_lv_nodes: + raise errors.OpPrereqError("these nodes could be locked: %s" % + utils.CommaJoin(unlocked_lv_nodes)) + self.extra_lv_nodes = list(extra_lv_nodes) def _VerifyNode(self, ninfo, nresult): """Perform some basic validation on data returned from a node. @@ -1663,12 +1871,6 @@ class LUClusterVerify(LogicalUnit): "instance not running on its primary node %s", node_current) - for node, n_img in node_image.items(): - if node != node_current: - test = instance in n_img.instances - _ErrorIf(test, self.EINSTANCEWRONGNODE, instance, - "instance should not run on node %s", node) - diskdata = [(nname, success, status, idx) for (nname, disks) in diskstatus.items() for idx, (success, status) in enumerate(disks)] @@ -1708,18 +1910,6 @@ class LUClusterVerify(LogicalUnit): self._ErrorIf(test, self.ENODEORPHANLV, node, "volume %s is unknown", volume) - def _VerifyOrphanInstances(self, instancelist, node_image): - """Verify the list of running instances. - - This checks what instances are running but unknown to the cluster. - - """ - for node, n_img in node_image.items(): - for o_inst in n_img.instances: - test = o_inst not in instancelist - self._ErrorIf(test, self.ENODEORPHANINSTANCE, node, - "instance %s on node %s should not exist", o_inst, node) - def _VerifyNPlusOneMemory(self, node_image, instance_cfg): """Verify N+1 Memory Resilience. @@ -2202,20 +2392,6 @@ class LUClusterVerify(LogicalUnit): return instdisk - def _VerifyHVP(self, hvp_data): - """Verifies locally the syntax of the hypervisor parameters. - - """ - for item, hv_name, hv_params in hvp_data: - msg = ("hypervisor %s parameters syntax check (source %s): %%s" % - (item, hv_name)) - try: - hv_class = hypervisor.GetHypervisor(hv_name) - utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) - hv_class.CheckParameterSyntax(hv_params) - except errors.GenericError, err: - self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err)) - def BuildHooksEnv(self): """Build hooks env. @@ -2223,14 +2399,12 @@ class LUClusterVerify(LogicalUnit): the output be logged in the verify output and the verification to fail. """ - cfg = self.cfg - env = { - "CLUSTER_TAGS": " ".join(cfg.GetClusterInfo().GetTags()) + "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()) } env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags())) - for node in cfg.GetAllNodesInfo().values()) + for node in self.my_node_info.values()) return env @@ -2238,10 +2412,12 @@ class LUClusterVerify(LogicalUnit): """Build hooks nodes. """ - return ([], self.cfg.GetNodeList()) + assert self.my_node_names, ("Node list not gathered," + " has CheckPrereq been executed?") + return ([], self.my_node_names) def Exec(self, feedback_fn): - """Verify integrity of cluster, performing various test on nodes. + """Verify integrity of the node group, performing various test on nodes. """ # This method has too many local variables. 
pylint: disable-msg=R0914 @@ -2249,25 +2425,14 @@ class LUClusterVerify(LogicalUnit): _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103 verbose = self.op.verbose self._feedback_fn = feedback_fn - feedback_fn("* Verifying global settings") - for msg in self.cfg.VerifyConfig(): - _ErrorIf(True, self.ECLUSTERCFG, None, msg) - - # Check the cluster certificates - for cert_filename in constants.ALL_CERT_FILES: - (errcode, msg) = _VerifyCertificate(cert_filename) - _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode) vg_name = self.cfg.GetVGName() drbd_helper = self.cfg.GetDRBDHelper() - hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors cluster = self.cfg.GetClusterInfo() - nodeinfo_byname = self.cfg.GetAllNodesInfo() - nodelist = utils.NiceSort(nodeinfo_byname.keys()) - nodeinfo = [nodeinfo_byname[nname] for nname in nodelist] - instanceinfo = self.cfg.GetAllInstancesInfo() - instancelist = utils.NiceSort(instanceinfo.keys()) groupinfo = self.cfg.GetAllNodeGroupsInfo() + hypervisors = cluster.enabled_hypervisors + node_data_list = [self.my_node_info[name] for name in self.my_node_names] + i_non_redundant = [] # Non redundant instances i_non_a_balanced = [] # Non auto-balanced instances n_offline = 0 # Count of offline nodes @@ -2283,37 +2448,32 @@ class LUClusterVerify(LogicalUnit): master_node = self.master_node = self.cfg.GetMasterNode() master_ip = self.cfg.GetMasterIP() - # Compute the set of hypervisor parameters - hvp_data = [] - for hv_name in hypervisors: - hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name))) - for os_name, os_hvp in cluster.os_hvp.items(): - for hv_name, hv_params in os_hvp.items(): - if not hv_params: - continue - full_params = cluster.GetHVDefaults(hv_name, os_name=os_name) - hvp_data.append(("os %s" % os_name, hv_name, full_params)) - # TODO: collapse identical parameter values in a single one - for instance in instanceinfo.values(): - if not instance.hvparams: - continue - hvp_data.append(("instance %s" % instance.name, instance.hypervisor, - cluster.FillHV(instance))) - # and verify them locally - self._VerifyHVP(hvp_data) + feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names)) + + # We will make nodes contact all nodes in their group, and one node from + # every other group. + # TODO: should it be a *random* node, different every time? 
+ online_nodes = [node.name for node in node_data_list if not node.offline] + other_group_nodes = {} + + for name in sorted(self.all_node_info): + node = self.all_node_info[name] + if (node.group not in other_group_nodes + and node.group != self.group_uuid + and not node.offline): + other_group_nodes[node.group] = node.name - feedback_fn("* Gathering data (%d nodes)" % len(nodelist)) node_verify_param = { constants.NV_FILELIST: utils.UniqueSequence(filename for files in filemap for filename in files), - constants.NV_NODELIST: [node.name for node in nodeinfo - if not node.offline], + constants.NV_NODELIST: online_nodes + other_group_nodes.values(), constants.NV_HYPERVISOR: hypervisors, - constants.NV_HVPARAMS: hvp_data, - constants.NV_NODENETTEST: [(node.name, node.primary_ip, - node.secondary_ip) for node in nodeinfo + constants.NV_HVPARAMS: + _GetAllHypervisorParameters(cluster, self.all_inst_info.values()), + constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip) + for node in node_data_list if not node.offline], constants.NV_INSTANCELIST: hypervisors, constants.NV_VERSION: None, @@ -2340,7 +2500,7 @@ class LUClusterVerify(LogicalUnit): default_nicpp = cluster.nicparams[constants.PP_DEFAULT] if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED: bridges.add(default_nicpp[constants.NIC_LINK]) - for instance in instanceinfo.values(): + for instance in self.my_inst_info.values(): for nic in instance.nics: full_nic = cluster.SimpleFillNIC(nic.nicparams) if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED: @@ -2353,11 +2513,11 @@ class LUClusterVerify(LogicalUnit): node_image = dict((node.name, self.NodeImage(offline=node.offline, name=node.name, vm_capable=node.vm_capable)) - for node in nodeinfo) + for node in node_data_list) # Gather OOB paths oob_paths = [] - for node in nodeinfo: + for node in self.all_node_info.values(): path = _SupportsOob(self.cfg, node) if path and path not in oob_paths: oob_paths.append(path) @@ -2365,14 +2525,13 @@ class LUClusterVerify(LogicalUnit): if oob_paths: node_verify_param[constants.NV_OOB_PATHS] = oob_paths - for instance in instancelist: - inst_config = instanceinfo[instance] + for instance in self.my_inst_names: + inst_config = self.my_inst_info[instance] for nname in inst_config.all_nodes: if nname not in node_image: - # ghost node gnode = self.NodeImage(name=nname) - gnode.ghost = True + gnode.ghost = (nname not in self.all_node_info) node_image[nname] = gnode inst_config.MapLVsByNode(node_vol_should) @@ -2395,23 +2554,59 @@ class LUClusterVerify(LogicalUnit): # time before and after executing the request, we can at least have a time # window. 
nvinfo_starttime = time.time() - all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param, + all_nvinfo = self.rpc.call_node_verify(self.my_node_names, + node_verify_param, self.cfg.GetClusterName()) + if self.extra_lv_nodes and vg_name is not None: + extra_lv_nvinfo = \ + self.rpc.call_node_verify(self.extra_lv_nodes, + {constants.NV_LVLIST: vg_name}, + self.cfg.GetClusterName()) + else: + extra_lv_nvinfo = {} nvinfo_endtime = time.time() all_drbd_map = self.cfg.ComputeDRBDMap() - feedback_fn("* Gathering disk information (%s nodes)" % len(nodelist)) - instdisk = self._CollectDiskInfo(nodelist, node_image, instanceinfo) + feedback_fn("* Gathering disk information (%s nodes)" % + len(self.my_node_names)) + instdisk = self._CollectDiskInfo(self.my_node_names, node_image, + self.my_inst_info) feedback_fn("* Verifying configuration file consistency") - self._VerifyFiles(_ErrorIf, nodeinfo, master_node, all_nvinfo, filemap) + + # If not all nodes are being checked, we need to make sure the master node + # and a non-checked vm_capable node are in the list. + absent_nodes = set(self.all_node_info).difference(self.my_node_info) + if absent_nodes: + vf_nvinfo = all_nvinfo.copy() + vf_node_info = list(self.my_node_info.values()) + additional_nodes = [] + if master_node not in self.my_node_info: + additional_nodes.append(master_node) + vf_node_info.append(self.all_node_info[master_node]) + # Add the first vm_capable node we find which is not included + for node in absent_nodes: + nodeinfo = self.all_node_info[node] + if nodeinfo.vm_capable and not nodeinfo.offline: + additional_nodes.append(node) + vf_node_info.append(self.all_node_info[node]) + break + key = constants.NV_FILELIST + vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes, + {key: node_verify_param[key]}, + self.cfg.GetClusterName())) + else: + vf_nvinfo = all_nvinfo + vf_node_info = self.my_node_info.values() + + self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap) feedback_fn("* Verifying node status") refos_img = None - for node_i in nodeinfo: + for node_i in node_data_list: node = node_i.name nimg = node_image[node] @@ -2448,24 +2643,41 @@ class LUClusterVerify(LogicalUnit): if nimg.vm_capable: self._VerifyNodeLVM(node_i, nresult, vg_name) - self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper, + self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper, all_drbd_map) self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name) self._UpdateNodeInstances(node_i, nresult, nimg) self._UpdateNodeInfo(node_i, nresult, nimg, vg_name) self._UpdateNodeOS(node_i, nresult, nimg) + if not nimg.os_fail: if refos_img is None: refos_img = nimg self._VerifyNodeOS(node_i, nimg, refos_img) self._VerifyNodeBridges(node_i, nresult, bridges) + # Check whether all running instancies are primary for the node. (This + # can no longer be done from _VerifyInstance below, since some of the + # wrong instances could be from other node groups.) 
+        non_primary_inst = set(nimg.instances).difference(nimg.pinst)
+
+        for inst in non_primary_inst:
+          test = inst in self.all_inst_info
+          _ErrorIf(test, self.EINSTANCEWRONGNODE, inst,
+                   "instance should not run on node %s", node_i.name)
+          _ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name,
+                   "node is running unknown instance %s", inst)
+
+    for node, result in extra_lv_nvinfo.items():
+      self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
+                              node_image[node], vg_name)
+
     feedback_fn("* Verifying instance status")
-    for instance in instancelist:
+    for instance in self.my_inst_names:
       if verbose:
         feedback_fn("* Verifying instance %s" % instance)
-      inst_config = instanceinfo[instance]
+      inst_config = self.my_inst_info[instance]
       self._VerifyInstance(instance, inst_config, node_image,
                            instdisk[instance])
       inst_nodes_offline = []
@@ -2500,7 +2712,7 @@ class LUClusterVerify(LogicalUnit):
 
         instance_groups = {}
         for node in instance_nodes:
-          instance_groups.setdefault(nodeinfo_byname[node].group,
+          instance_groups.setdefault(self.all_node_info[node].group,
                                      []).append(node)
 
         pretty_list = [
@@ -2539,14 +2751,22 @@ class LUClusterVerify(LogicalUnit):
 
     feedback_fn("* Verifying orphan volumes")
     reserved = utils.FieldSet(*cluster.reserved_lvs)
-    self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
 
-    feedback_fn("* Verifying orphan instances")
-    self._VerifyOrphanInstances(instancelist, node_image)
+    # We will get spurious "unknown volume" warnings if any node of this group
+    # is secondary for an instance whose primary is in another group. To avoid
+    # them, we find these instances and add their volumes to node_vol_should.
+    for inst in self.all_inst_info.values():
+      for secondary in inst.secondary_nodes:
+        if (secondary in self.my_node_info
+            and inst.name not in self.my_inst_info):
+          inst.MapLVsByNode(node_vol_should)
+          break
+
+    self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
 
     if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
       feedback_fn("* Verifying N+1 Memory redundancy")
-      self._VerifyNPlusOneMemory(node_image, instanceinfo)
+      self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
 
     feedback_fn("* Other Notes")
     if i_non_redundant:
@@ -5458,7 +5678,8 @@ class LUInstanceStartup(LogicalUnit):
     instance = self.instance
     force = self.op.force
 
-    self.cfg.MarkInstanceUp(instance.name)
+    if not self.op.no_remember:
+      self.cfg.MarkInstanceUp(instance.name)
 
     if self.primary_offline:
       assert self.op.ignore_offline_nodes
@@ -5623,7 +5844,8 @@ class LUInstanceShutdown(LogicalUnit):
     node_current = instance.primary_node
     timeout = self.op.timeout
 
-    self.cfg.MarkInstanceDown(instance.name)
+    if not self.op.no_remember:
+      self.cfg.MarkInstanceDown(instance.name)
 
     if self.primary_offline:
       assert self.op.ignore_offline_nodes
@@ -10083,6 +10305,7 @@ class LUInstanceSetParams(LogicalUnit):
       self.be_inst = i_bedict # the new dict (without defaults)
     else:
       self.be_new = self.be_inst = {}
+    be_old = cluster.FillBE(instance)
 
     # osparams processing
     if self.op.osparams:
@@ -10094,7 +10317,8 @@ class LUInstanceSetParams(LogicalUnit):
 
     self.warn = []
 
-    if constants.BE_MEMORY in self.op.beparams and not self.op.force:
+    if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
+        be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
       mem_check_list = [pnode]
       if be_new[constants.BE_AUTO_BALANCE]:
         # either we changed auto_balance to yes or it was from before
@@ -10135,16 +10359,17 @@ class LUInstanceSetParams(LogicalUnit):
       for node, nres in nodeinfo.items():
         if node not in instance.secondary_nodes:
           continue
-        msg = nres.fail_msg
-        if msg:
-          self.warn.append("Can't get info from secondary node %s: %s" %
-                           (node, msg))
-        elif not isinstance(nres.payload.get('memory_free', None), int):
-          self.warn.append("Secondary node %s didn't return free"
-                           " memory information" % node)
+        nres.Raise("Can't get info from secondary node %s" % node,
+                   prereq=True, ecode=errors.ECODE_STATE)
+        if not isinstance(nres.payload.get('memory_free', None), int):
+          raise errors.OpPrereqError("Secondary node %s didn't return free"
+                                     " memory information" % node,
+                                     errors.ECODE_STATE)
         elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
-          self.warn.append("Not enough memory to failover instance to"
-                           " secondary node %s" % node)
+          raise errors.OpPrereqError("This change will prevent the instance"
+                                     " from failover to its secondary node"
+                                     " %s, due to not enough memory" % node,
+                                     errors.ECODE_STATE)
 
     # NIC processing
     self.nic_pnew = {}
@@ -11798,7 +12023,7 @@ class IAllocator(object):
     self.success = self.info = self.result = None
 
     try:
-      (fn, keyset) = self._MODE_DATA[self.mode]
+      (fn, keyset, self._result_check) = self._MODE_DATA[self.mode]
     except KeyError:
       raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                    " IAllocator" % self.mode)
@@ -12103,13 +12328,22 @@ class IAllocator(object):
     constants.IALLOCATOR_MODE_ALLOC:
       (_AddNewInstance,
        ["name", "mem_size", "disks", "disk_template", "os", "tags", "nics",
-        "vcpus", "hypervisor"]),
+        "vcpus", "hypervisor"], ht.TList),
     constants.IALLOCATOR_MODE_RELOC:
-      (_AddRelocateInstance, ["name", "relocate_from"]),
+      (_AddRelocateInstance, ["name", "relocate_from"], ht.TList),
     constants.IALLOCATOR_MODE_MEVAC:
-      (_AddEvacuateNodes, ["evac_nodes"]),
+      (_AddEvacuateNodes, ["evac_nodes"],
+       ht.TListOf(ht.TAnd(ht.TIsLength(2),
+                          ht.TListOf(ht.TString)))),
     constants.IALLOCATOR_MODE_MRELOC:
-      (_AddMultiRelocate, ["instances", "reloc_mode", "target_groups"]),
+      (_AddMultiRelocate, ["instances", "reloc_mode", "target_groups"],
+       ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
+         # pylint: disable-msg=E1101
+         # Class '...' has no 'OP_ID' member
+         "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
+                              opcodes.OpInstanceMigrate.OP_ID,
+                              opcodes.OpInstanceReplaceDisks.OP_ID])
+         })))),
     }
 
   def Run(self, name, validate=True, call_fn=None):
@@ -12152,28 +12386,45 @@ class IAllocator(object):
                                " missing key '%s'" % key)
       setattr(self, key, rdict[key])
 
-    if not isinstance(rdict["result"], list):
-      raise errors.OpExecError("Can't parse iallocator results: 'result' key"
-                               " is not a list")
-
-    if self.mode == constants.IALLOCATOR_MODE_RELOC:
-      assert self.relocate_from is not None
-      assert self.required_nodes == 1
+    if not self._result_check(self.result):
+      raise errors.OpExecError("Iallocator returned invalid result,"
+                               " expected %s, got %s" %
+                               (self._result_check, self.result),
+                               errors.ECODE_INVAL)
 
+    if self.mode in (constants.IALLOCATOR_MODE_RELOC,
+                     constants.IALLOCATOR_MODE_MEVAC):
       node2group = dict((name, ndata["group"])
                         for (name, ndata) in self.in_data["nodes"].items())
 
      fn = compat.partial(self._NodesToGroups, node2group,
                          self.in_data["nodegroups"])
 
-      request_groups = fn(self.relocate_from)
-      result_groups = fn(rdict["result"])
-
-      if result_groups != request_groups:
-        raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
-                                 " differ from original groups (%s)" %
-                                 (utils.CommaJoin(result_groups),
-                                  utils.CommaJoin(request_groups)))
+      if self.mode == constants.IALLOCATOR_MODE_RELOC:
+        assert self.relocate_from is not None
+        assert self.required_nodes == 1
+
+        request_groups = fn(self.relocate_from)
+        result_groups = fn(rdict["result"])
+
+        if result_groups != request_groups:
+          raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
+                                   " differ from original groups (%s)" %
+                                   (utils.CommaJoin(result_groups),
+                                    utils.CommaJoin(request_groups)))
+      elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
+        request_groups = fn(self.evac_nodes)
+        for (instance_name, secnode) in self.result:
+          result_groups = fn([secnode])
+          if result_groups != request_groups:
+            raise errors.OpExecError("Iallocator returned new secondary node"
+                                     " '%s' (group '%s') for instance '%s'"
+                                     " which is not in original group '%s'" %
+                                     (secnode, utils.CommaJoin(result_groups),
+                                      instance_name,
+                                      utils.CommaJoin(request_groups)))
+      else:
+        raise errors.ProgrammerError("Unhandled mode '%s'" % self.mode)
 
     self.out_data = rdict
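
The _VerifyErrors mix-in factored out above is what lets both LUClusterVerifyConfig and LUClusterVerifyGroup report problems the same way: when the opcode sets error_codes, each finding is emitted as a machine-parseable "severity:code:object-type:object-name:message" string, otherwise as a plain human-readable line, and only ETYPE_ERROR findings flip self.bad. The standalone sketch below reproduces just that formatting step; MiniVerifier and its fields are illustrative stand-ins, not part of cmdlib.py.

# Illustrative stand-in for the _VerifyErrors formatting logic; the real
# mix-in reads self.op.error_codes and reports through feedback_fn.
class MiniVerifier(object):
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"
  EINSTANCEWRONGNODE = ("instance", "EINSTANCEWRONGNODE")

  def __init__(self, error_codes):
    self.error_codes = error_codes
    self.messages = []
    self.bad = False

  def Error(self, ecode, item, msg, *args, **kwargs):
    ltype = kwargs.get("code", self.ETYPE_ERROR)
    itype, etxt = ecode
    if args:
      msg = msg % args
    if self.error_codes:
      # machine-parseable form, used by "gnt-cluster verify --error-codes"
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      # human-readable form
      if item:
        item = " " + item
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    self.messages.append(" - %s" % msg)
    if ltype == self.ETYPE_ERROR:
      self.bad = True

v = MiniVerifier(error_codes=False)
v.Error(v.EINSTANCEWRONGNODE, "instance1.example.com",
        "instance should not run on node %s", "node2.example.com")
# v.messages == [" - ERROR: instance instance1.example.com:"
#                " instance should not run on node node2.example.com"]
# v.bad == True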
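
The dangling-node check in LUClusterVerifyConfig.Exec works purely on the configuration: a node is dangling when its group UUID is no longer among the known node groups, its instances are collected per node for the ECLUSTERDANGLINGNODES message, and instances whose primary node does not exist at all are reported under ECLUSTERDANGLINGINST. The sketch below shows the same bookkeeping on plain dicts; the node and instance data is made up for illustration.

# Made-up configuration data; in cmdlib.py these come from
# self.all_group_info, self.all_node_info and self.all_inst_info.
groups = set(["group-uuid-1"])
node_group = {"node1": "group-uuid-1", "node2": "removed-group-uuid"}
inst_primary = {"inst1": "node1", "inst2": "node2", "inst3": "node-gone"}

# Nodes whose group no longer exists in the configuration.
dangling_nodes = set(name for name, group in node_group.items()
                     if group not in groups)

dangling_instances = {}
no_node_instances = []
for inst, pnode in sorted(inst_primary.items()):
  if pnode in dangling_nodes:
    # instance lives on a dangling node: listed next to that node
    dangling_instances.setdefault(pnode, []).append(inst)
  elif pnode not in node_group:
    # instance whose primary node is not in the configuration at all
    no_node_instances.append(inst)

# dangling_nodes     == set(["node2"])        -> ECLUSTERDANGLINGNODES
# dangling_instances == {"node2": ["inst2"]}
# no_node_instances  == ["inst3"]             -> ECLUSTERDANGLINGINST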
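
For the inter-node connectivity test (NV_NODELIST), LUClusterVerifyGroup no longer asks every node to contact every other node of the cluster: each node of the group being verified is given the online nodes of its own group plus one online node from every other group, picked as the first suitable entry when walking the node list in name order (hence the TODO about choosing a random one). The following self-contained illustration uses made-up node data; only the selection logic mirrors the patch.

# Made-up node list; "group" stands for the node group UUID.
group_uuid = "uuid-a"
all_nodes = [
  {"name": "node1", "group": "uuid-a", "offline": False},
  {"name": "node2", "group": "uuid-a", "offline": True},
  {"name": "node3", "group": "uuid-b", "offline": False},
  {"name": "node4", "group": "uuid-b", "offline": False},
  {"name": "node5", "group": "uuid-c", "offline": False},
]

# Online nodes of the verified group (node_data_list in the patch).
online_nodes = [n["name"] for n in all_nodes
                if n["group"] == group_uuid and not n["offline"]]

# One online representative per other group, first match in name order.
other_group_nodes = {}
for n in sorted(all_nodes, key=lambda node: node["name"]):
  if (n["group"] not in other_group_nodes
      and n["group"] != group_uuid
      and not n["offline"]):
    other_group_nodes[n["group"]] = n["name"]

nv_nodelist = online_nodes + list(other_group_nodes.values())
# nv_nodelist contains "node1" plus one node from each foreign group,
# e.g. ["node1", "node3", "node5"] (dict ordering may vary).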
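
The third element now stored in each _MODE_DATA entry is a result-shape check: _ValidateResult applies it to the iallocator's answer and aborts with "Iallocator returned invalid result" when it fails, so for IALLOCATOR_MODE_MEVAC only a list of (instance name, new secondary node) string pairs is accepted. The helpers below are rough, simplified stand-ins for the ht.TListOf, ht.TAnd, ht.TIsLength and ht.TString combinators (the real ones in ganeti/ht.py are more general); they are only meant to show the shape being enforced.

# Rough stand-ins for the ganeti.ht combinators referenced above; not the
# real implementations.
def TString(val):
  return isinstance(val, basestring)  # Python 2, matching cmdlib.py

def TIsLength(size):
  return lambda val: len(val) == size

def TListOf(check):
  return lambda val: isinstance(val, list) and all(check(v) for v in val)

def TAnd(*checks):
  return lambda val: all(check(val) for check in checks)

# The MEVAC result check from _MODE_DATA: a list of two-element string lists.
mevac_check = TListOf(TAnd(TIsLength(2), TListOf(TString)))

mevac_check([["inst1.example.com", "node3.example.com"]])   # True
mevac_check([["inst1.example.com"]])                        # False, not a pair
mevac_check("inst1.example.com")                            # False, not a list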