X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/4e272d8c036bd713219c79fba9669a3d1e1e3b05..3f1e065d5095b2c0cda036a130575458c8f270af:/lib/cmdlib.py

diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index bddc32d..98a056c 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1254,7 +1254,7 @@ class LUClusterDestroy(LogicalUnit):
 
 
 def _VerifyCertificate(filename):
-  """Verifies a certificate for LUClusterVerify.
+  """Verifies a certificate for LUClusterVerifyConfig.
 
   @type filename: string
   @param filename: Path to PEM file
@@ -1264,7 +1264,7 @@ def _VerifyCertificate(filename):
     cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                            utils.ReadFile(filename))
   except Exception, err: # pylint: disable-msg=W0703
-    return (LUClusterVerify.ETYPE_ERROR,
+    return (LUClusterVerifyConfig.ETYPE_ERROR,
             "Failed to load X509 certificate %s: %s" % (filename, err))
 
   (errcode, msg) = \
@@ -1279,21 +1279,52 @@ def _VerifyCertificate(filename):
   if errcode is None:
     return (None, fnamemsg)
   elif errcode == utils.CERT_WARNING:
-    return (LUClusterVerify.ETYPE_WARNING, fnamemsg)
+    return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
   elif errcode == utils.CERT_ERROR:
-    return (LUClusterVerify.ETYPE_ERROR, fnamemsg)
+    return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
 
   raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
 
 
-class LUClusterVerify(LogicalUnit):
-  """Verifies the cluster status.
+def _GetAllHypervisorParameters(cluster, instances):
+  """Compute the set of all hypervisor parameters.
+
+  @type cluster: L{objects.Cluster}
+  @param cluster: the cluster object
+  @type instances: list of L{objects.Instance}
+  @param instances: additional instances from which to obtain parameters
+  @rtype: list of (origin, hypervisor, parameters)
+  @return: a list with all parameters found, indicating the hypervisor they
+       apply to, and the origin (can be "cluster", "os X", or "instance Y")
 
   """
-  HPATH = "cluster-verify"
-  HTYPE = constants.HTYPE_CLUSTER
-  REQ_BGL = False
+  hvp_data = []
+
+  for hv_name in cluster.enabled_hypervisors:
+    hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
+
+  for os_name, os_hvp in cluster.os_hvp.items():
+    for hv_name, hv_params in os_hvp.items():
+      if hv_params:
+        full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
+        hvp_data.append(("os %s" % os_name, hv_name, full_params))
+
+  # TODO: collapse identical parameter values in a single one
+  for instance in instances:
+    if instance.hvparams:
+      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
+                       cluster.FillHV(instance)))
+
+  return hvp_data
+
+
+class _VerifyErrors(object):
+  """Mix-in for cluster/group verify LUs.
+
+  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
+  self.op and self._feedback_fn to be available.)
+
+  """
   TCLUSTER = "cluster"
   TNODE = "node"
   TINSTANCE = "instance"
@@ -1301,6 +1332,8 @@ class LUClusterVerify(LogicalUnit):
   ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
   ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
   ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK")
+  ECLUSTERDANGLINGNODES = (TNODE, "ECLUSTERDANGLINGNODES")
+  ECLUSTERDANGLINGINST = (TNODE, "ECLUSTERDANGLINGINST")
   EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
   EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
   EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
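As a reading aid for the hunk above: _GetAllHypervisorParameters returns plain (origin, hypervisor, parameters) triples, so its logic can be exercised outside Ganeti. A minimal sketch follows; the Fake* classes are hypothetical stand-ins for objects.Cluster and objects.Instance, not Ganeti's real API:

class FakeCluster(object):
  def __init__(self, enabled_hypervisors, hv_defaults, os_hvp):
    self.enabled_hypervisors = enabled_hypervisors
    self.os_hvp = os_hvp              # {os_name: {hv_name: {param: value}}}
    self._hv_defaults = hv_defaults   # {hv_name: {param: value}}

  def GetHVDefaults(self, hv_name, os_name=None):
    params = dict(self._hv_defaults.get(hv_name, {}))
    if os_name is not None:
      params.update(self.os_hvp.get(os_name, {}).get(hv_name, {}))
    return params

  def FillHV(self, instance):
    params = self.GetHVDefaults(instance.hypervisor)
    params.update(instance.hvparams)
    return params


class FakeInstance(object):
  def __init__(self, name, hypervisor, hvparams):
    self.name = name
    self.hypervisor = hypervisor
    self.hvparams = hvparams


def get_all_hypervisor_parameters(cluster, instances):
  # same (origin, hypervisor, parameters) shape as the function above
  hvp_data = []
  for hv_name in cluster.enabled_hypervisors:
    hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
  for os_name, os_hvp in cluster.os_hvp.items():
    for hv_name, hv_params in os_hvp.items():
      if hv_params:
        hvp_data.append(("os %s" % os_name, hv_name,
                         cluster.GetHVDefaults(hv_name, os_name=os_name)))
  for instance in instances:
    if instance.hvparams:
      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
                       cluster.FillHV(instance)))
  return hvp_data


cluster = FakeCluster(["xen-pvm"],
                      {"xen-pvm": {"kernel_path": "/boot/vmlinuz"}},
                      {"lenny": {"xen-pvm": {"kernel_path": "/boot/lenny"}}})
web1 = FakeInstance("web1", "xen-pvm", {"kernel_path": "/boot/custom"})

for origin, hv_name, params in get_all_hypervisor_parameters(cluster, [web1]):
  print("%-14s %-8s %s" % (origin, hv_name, params))

Each origin string later shows up verbatim in _VerifyHVP's error messages, which is why the triples carry it along.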
@@ -1330,6 +1363,138 @@ class LUClusterVerify(LogicalUnit):
   ETYPE_ERROR = "ERROR"
   ETYPE_WARNING = "WARNING"
 
+  def _Error(self, ecode, item, msg, *args, **kwargs):
+    """Format an error message.
+
+    Based on the opcode's error_codes parameter, either format a
+    parseable error code, or a simpler error string.
+
+    This must be called only from Exec and functions called from Exec.
+
+    """
+    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
+    itype, etxt = ecode
+    # first complete the msg
+    if args:
+      msg = msg % args
+    # then format the whole message
+    if self.op.error_codes: # This is a mix-in. pylint: disable-msg=E1101
+      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
+    else:
+      if item:
+        item = " " + item
+      else:
+        item = ""
+      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
+    # and finally report it via the feedback_fn
+    self._feedback_fn("  - %s" % msg) # Mix-in. pylint: disable-msg=E1101
+
+  def _ErrorIf(self, cond, *args, **kwargs):
+    """Log an error message if the passed condition is True.
+
+    """
+    cond = (bool(cond)
+            or self.op.debug_simulate_errors) # pylint: disable-msg=E1101
+    if cond:
+      self._Error(*args, **kwargs)
+      # do not mark the operation as failed for WARN cases only
+      if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
+        self.bad = self.bad or cond
+
+
+class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
+  """Verifies the cluster config.
+
+  """
+  REQ_BGL = False
+
+  def _VerifyHVP(self, hvp_data):
+    """Verifies locally the syntax of the hypervisor parameters.
+
+    """
+    for item, hv_name, hv_params in hvp_data:
+      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
+             (item, hv_name))
+      try:
+        hv_class = hypervisor.GetHypervisor(hv_name)
+        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
+        hv_class.CheckParameterSyntax(hv_params)
+      except errors.GenericError, err:
+        self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
+
+  def ExpandNames(self):
+    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
+    self.all_node_info = self.cfg.GetAllNodesInfo()
+    self.all_inst_info = self.cfg.GetAllInstancesInfo()
+    self.needed_locks = {}
+
+  def Exec(self, feedback_fn):
+    """Verify integrity of cluster, performing various tests on nodes.
+
+    """
+    self.bad = False
+    self._feedback_fn = feedback_fn
+
+    feedback_fn("* Verifying cluster config")
+
+    for msg in self.cfg.VerifyConfig():
+      self._ErrorIf(True, self.ECLUSTERCFG, None, msg)
+
+    feedback_fn("* Verifying cluster certificate files")
+
+    for cert_filename in constants.ALL_CERT_FILES:
+      (errcode, msg) = _VerifyCertificate(cert_filename)
+      self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
+
+    feedback_fn("* Verifying hypervisor parameters")
+
+    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
+                                                self.all_inst_info.values()))
+
+    feedback_fn("* Verifying all nodes belong to an existing group")
+
+    # We do this verification here because, should this bogus circumstance
+    # occur, it would never be caught by VerifyGroup, which only acts on
+    # nodes/instances reachable from existing node groups.
+
+    dangling_nodes = set(node.name for node in self.all_node_info.values()
+                         if node.group not in self.all_group_info)
+
+    dangling_instances = {}
+    no_node_instances = []
+
+    for inst in self.all_inst_info.values():
+      if inst.primary_node in dangling_nodes:
+        dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
+      elif inst.primary_node not in self.all_node_info:
+        no_node_instances.append(inst.name)
+
+    pretty_dangling = [
+        "%s (%s)" %
+        (node_name,
+         utils.CommaJoin(dangling_instances.get(node_name,
+                                                ["no instances"])))
+        for node_name in dangling_nodes]
+
+    self._ErrorIf(bool(dangling_nodes), self.ECLUSTERDANGLINGNODES, None,
+                  "the following nodes (and their instances) belong to a"
+                  " non-existing group: %s", utils.CommaJoin(pretty_dangling))
+
+    self._ErrorIf(bool(no_node_instances), self.ECLUSTERDANGLINGINST, None,
+                  "the following instances have a non-existing primary-node:"
+                  " %s", utils.CommaJoin(no_node_instances))
+
+    return (not self.bad, [g.name for g in self.all_group_info.values()])
+
+
+class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
+  """Verifies the status of a node group.
+
+  """
+  HPATH = "cluster-verify"
+  HTYPE = constants.HTYPE_CLUSTER
+  REQ_BGL = False
+
   _HOOKS_INDENT_RE = re.compile("^", re.M)
 
   class NodeImage(object):
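The _Error mix-in above emits either a machine-parseable or a human-readable line depending on the opcode's error_codes flag. A free-standing sketch of the two formats (illustrative only; the real method also routes the result through feedback_fn):

def format_verify_error(ecode, item, msg, error_codes=False, ltype="ERROR"):
  # ecode is a (object-type, error-code) pair like the ECLUSTER* constants
  itype, etxt = ecode
  if error_codes:
    # machine-parseable: TYPE:CODE:OBJECT-TYPE:OBJECT-NAME:MESSAGE
    return "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item or "", msg)
  item = " " + item if item else ""
  return "%s: %s%s: %s" % (ltype, itype, item, msg)


ECLUSTERCFG = ("cluster", "ECLUSTERCFG")
print(format_verify_error(ECLUSTERCFG, None, "duplicate MAC address"))
# ERROR: cluster: duplicate MAC address
print(format_verify_error(ECLUSTERCFG, None, "duplicate MAC address",
                          error_codes=True))
# ERROR:ECLUSTERCFG:cluster::duplicate MAC address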
@@ -1383,56 +1548,90 @@ class LUClusterVerify(LogicalUnit):
     self.oslist = {}
 
   def ExpandNames(self):
+    # This raises errors.OpPrereqError on its own:
+    self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
+
+    all_node_info = self.cfg.GetAllNodesInfo()
+    all_inst_info = self.cfg.GetAllInstancesInfo()
+
+    node_names = set(node.name
+                     for node in all_node_info.values()
+                     if node.group == self.group_uuid)
+
+    inst_names = [inst.name
+                  for inst in all_inst_info.values()
+                  if inst.primary_node in node_names]
+
+    # In Exec(), we warn about mirrored instances that have primary and
+    # secondary living in separate node groups. To fully verify that
+    # volumes for these instances are healthy, we will need to do an
+    # extra call to their secondaries. We ensure here those nodes will
+    # be locked.
+    for inst in inst_names:
+      if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
+        node_names.update(all_inst_info[inst].secondary_nodes)
+
     self.needed_locks = {
-      locking.LEVEL_NODE: locking.ALL_SET,
-      locking.LEVEL_INSTANCE: locking.ALL_SET,
+      locking.LEVEL_NODEGROUP: [self.group_uuid],
+      locking.LEVEL_NODE: list(node_names),
+      locking.LEVEL_INSTANCE: inst_names,
     }
+
     self.share_locks = dict.fromkeys(locking.LEVELS, 1)
 
   def CheckPrereq(self):
     self.all_node_info = self.cfg.GetAllNodesInfo()
     self.all_inst_info = self.cfg.GetAllInstancesInfo()
-    self.my_node_names = utils.NiceSort(list(self.all_node_info))
-    self.my_node_info = self.all_node_info
-    self.my_inst_names = utils.NiceSort(list(self.all_inst_info))
-    self.my_inst_info = self.all_inst_info
 
-  def _Error(self, ecode, item, msg, *args, **kwargs):
-    """Format an error message.
+    group_nodes = set(node.name
+                      for node in self.all_node_info.values()
+                      if node.group == self.group_uuid)
 
-    Based on the opcode's error_codes parameter, either format a
-    parseable error code, or a simpler error string.
+    group_instances = set(inst.name
+                          for inst in self.all_inst_info.values()
+                          if inst.primary_node in group_nodes)
 
-    This must be called only from Exec and functions called from Exec.
+    unlocked_nodes = \
+      group_nodes.difference(self.glm.list_owned(locking.LEVEL_NODE))
 
-    """
-    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
-    itype, etxt = ecode
-    # first complete the msg
-    if args:
-      msg = msg % args
-    # then format the whole message
-    if self.op.error_codes:
-      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
-    else:
-      if item:
-        item = " " + item
-      else:
-        item = ""
-      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
-    # and finally report it via the feedback_fn
-    self._feedback_fn("  - %s" % msg)
+    unlocked_instances = \
+      group_instances.difference(self.glm.list_owned(locking.LEVEL_INSTANCE))
 
-  def _ErrorIf(self, cond, *args, **kwargs):
-    """Log an error message if the passed condition is True.
+    if unlocked_nodes:
+      raise errors.OpPrereqError("missing lock for nodes: %s" %
+                                 utils.CommaJoin(unlocked_nodes))
 
-    """
-    cond = bool(cond) or self.op.debug_simulate_errors
-    if cond:
-      self._Error(*args, **kwargs)
-      # do not mark the operation as failed for WARN cases only
-      if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
-        self.bad = self.bad or cond
+    if unlocked_instances:
+      raise errors.OpPrereqError("missing lock for instances: %s" %
+                                 utils.CommaJoin(unlocked_instances))
+
+    self.my_node_names = utils.NiceSort(group_nodes)
+    self.my_inst_names = utils.NiceSort(group_instances)
+
+    self.my_node_info = dict((name, self.all_node_info[name])
+                             for name in self.my_node_names)
+
+    self.my_inst_info = dict((name, self.all_inst_info[name])
+                             for name in self.my_inst_names)
+
+    # We detect here the nodes that will need the extra RPC calls for verifying
+    # split LV volumes; they should be locked.
+    extra_lv_nodes = set()
+
+    for inst in self.my_inst_info.values():
+      if inst.disk_template in constants.DTS_INT_MIRROR:
+        group = self.my_node_info[inst.primary_node].group
+        for nname in inst.secondary_nodes:
+          if self.all_node_info[nname].group != group:
+            extra_lv_nodes.add(nname)
+
+    unlocked_lv_nodes = \
+      extra_lv_nodes.difference(self.glm.list_owned(locking.LEVEL_NODE))
+
+    if unlocked_lv_nodes:
+      raise errors.OpPrereqError("these nodes should be locked: %s" %
+                                 utils.CommaJoin(unlocked_lv_nodes))
+    self.extra_lv_nodes = list(extra_lv_nodes)
 
   def _VerifyNode(self, ninfo, nresult):
     """Perform some basic validation on data returned from a node.
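The extra_lv_nodes computation in CheckPrereq above can be illustrated with plain dicts. The sample topology below is hypothetical; DTS_INT_MIRROR stands in for Ganeti's internally-mirrored disk templates:

node_group = {"node1": "g1", "node2": "g1", "node3": "g2"}

# name: (disk_template, primary node, secondary nodes)
instances = {
  "inst1": ("drbd", "node1", ["node2"]),  # mirrored inside g1
  "inst2": ("drbd", "node1", ["node3"]),  # mirror split across g1/g2
  "inst3": ("plain", "node2", []),        # not mirrored at all
}

DTS_INT_MIRROR = frozenset(["drbd"])
group_being_verified = "g1"

extra_lv_nodes = set()
for template, primary, secondaries in instances.values():
  if (template in DTS_INT_MIRROR
      and node_group[primary] == group_being_verified):
    for sec in secondaries:
      if node_group[sec] != node_group[primary]:
        extra_lv_nodes.add(sec)  # needs its own NV_LVLIST call

print(sorted(extra_lv_nodes))  # ['node3']

Only inst2 contributes: its secondary lives outside the verified group, so its volumes cannot be seen through the group's own node RPCs.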
@@ -1672,12 +1871,6 @@ class LUClusterVerify(LogicalUnit):
                "instance not running on its primary node %s",
                node_current)
 
-    for node, n_img in node_image.items():
-      if node != node_current:
-        test = instance in n_img.instances
-        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
-                 "instance should not run on node %s", node)
-
     diskdata = [(nname, success, status, idx)
                 for (nname, disks) in diskstatus.items()
                 for idx, (success, status) in enumerate(disks)]
@@ -1717,18 +1910,6 @@ class LUClusterVerify(LogicalUnit):
       self._ErrorIf(test, self.ENODEORPHANLV, node,
                     "volume %s is unknown", volume)
 
-  def _VerifyOrphanInstances(self, instancelist, node_image):
-    """Verify the list of running instances.
-
-    This checks what instances are running but unknown to the cluster.
-
-    """
-    for node, n_img in node_image.items():
-      for o_inst in n_img.instances:
-        test = o_inst not in instancelist
-        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
-                      "instance %s on node %s should not exist", o_inst, node)
-
   def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
     """Verify N+1 Memory Resilience.
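Both loops removed above reappear in Exec as a single per-node set difference (see the hunk around new line 2650 further down). A sketch of that replacement logic with made-up data:

all_inst_info = {"inst1": None, "inst2": None}  # instances in the config
nimg_instances = ["inst1", "inst3"]             # found running on this node
nimg_pinst = ["inst2"]                          # configured primaries here

for inst in sorted(set(nimg_instances).difference(nimg_pinst)):
  if inst in all_inst_info:
    print("EINSTANCEWRONGNODE: %s should not run on this node" % inst)
  else:
    print("ENODEORPHANINSTANCE: node runs unknown instance %s" % inst)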
@@ -2211,20 +2392,6 @@ class LUClusterVerify(LogicalUnit):
 
     return instdisk
 
-  def _VerifyHVP(self, hvp_data):
-    """Verifies locally the syntax of the hypervisor parameters.
-
-    """
-    for item, hv_name, hv_params in hvp_data:
-      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
-             (item, hv_name))
-      try:
-        hv_class = hypervisor.GetHypervisor(hv_name)
-        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
-        hv_class.CheckParameterSyntax(hv_params)
-      except errors.GenericError, err:
-        self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
-
   def BuildHooksEnv(self):
     """Build hooks env.
 
@@ -2250,7 +2417,7 @@ class LUClusterVerify(LogicalUnit):
     return ([], self.my_node_names)
 
   def Exec(self, feedback_fn):
-    """Verify integrity of cluster, performing various test on nodes.
+    """Verify integrity of the node group, performing various tests on nodes.
 
     """
@@ -2258,20 +2425,12 @@ class LUClusterVerify(LogicalUnit):
     # This method has too many local variables. pylint: disable-msg=R0914
     self.bad = False
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
     verbose = self.op.verbose
     self._feedback_fn = feedback_fn
-    feedback_fn("* Verifying global settings")
-    for msg in self.cfg.VerifyConfig():
-      _ErrorIf(True, self.ECLUSTERCFG, None, msg)
-
-    # Check the cluster certificates
-    for cert_filename in constants.ALL_CERT_FILES:
-      (errcode, msg) = _VerifyCertificate(cert_filename)
-      _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
 
     vg_name = self.cfg.GetVGName()
     drbd_helper = self.cfg.GetDRBDHelper()
-    hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
     cluster = self.cfg.GetClusterInfo()
     groupinfo = self.cfg.GetAllNodeGroupsInfo()
+    hypervisors = cluster.enabled_hypervisors
     node_data_list = [self.my_node_info[name] for name in self.my_node_names]
 
     i_non_redundant = [] # Non redundant instances
@@ -2289,35 +2448,30 @@ class LUClusterVerify(LogicalUnit):
     master_node = self.master_node = self.cfg.GetMasterNode()
     master_ip = self.cfg.GetMasterIP()
 
-    # Compute the set of hypervisor parameters
-    hvp_data = []
-    for hv_name in hypervisors:
-      hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
-    for os_name, os_hvp in cluster.os_hvp.items():
-      for hv_name, hv_params in os_hvp.items():
-        if not hv_params:
-          continue
-        full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
-        hvp_data.append(("os %s" % os_name, hv_name, full_params))
-    # TODO: collapse identical parameter values in a single one
-    for instance in self.all_inst_info.values():
-      if not instance.hvparams:
-        continue
-      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
-                       cluster.FillHV(instance)))
-    # and verify them locally
-    self._VerifyHVP(hvp_data)
-
     feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
+
+    # We will make nodes contact all nodes in their group, and one node from
+    # every other group.
+    # TODO: should it be a *random* node, different every time?
+    online_nodes = [node.name for node in node_data_list if not node.offline]
+    other_group_nodes = {}
+
+    for name in sorted(self.all_node_info):
+      node = self.all_node_info[name]
+      if (node.group not in other_group_nodes
+          and node.group != self.group_uuid
+          and not node.offline):
+        other_group_nodes[node.group] = node.name
+
     node_verify_param = {
       constants.NV_FILELIST:
        utils.UniqueSequence(filename
                             for files in filemap
                             for filename in files),
-      constants.NV_NODELIST: [node.name for node in self.all_node_info.values()
-                              if not node.offline],
+      constants.NV_NODELIST: online_nodes + other_group_nodes.values(),
      constants.NV_HYPERVISOR: hypervisors,
-      constants.NV_HVPARAMS: hvp_data,
+      constants.NV_HVPARAMS:
+        _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
       constants.NV_NODENETTEST: [(node.name, node.primary_ip,
                                   node.secondary_ip) for node in node_data_list
                                  if not node.offline],
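A sketch of the contact list built for NV_NODELIST above: every online node of the verified group, plus the first online node of each other group. The node table is hypothetical:

nodes = {  # name: (group, offline)
  "a1": ("g1", False), "a2": ("g1", False),
  "b1": ("g2", True),  "b2": ("g2", False),
  "c1": ("g3", False),
}
group_uuid = "g1"

online_nodes = [n for n, (g, off) in sorted(nodes.items())
                if g == group_uuid and not off]
other_group_nodes = {}
for name in sorted(nodes):
  group, offline = nodes[name]
  if group not in other_group_nodes and group != group_uuid and not offline:
    other_group_nodes[group] = name

print(online_nodes + sorted(other_group_nodes.values()))
# ['a1', 'a2', 'b2', 'c1']

Iterating names in sorted order makes the per-group pick deterministic, which is exactly what the TODO comment above questions.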
@@ -2376,9 +2530,8 @@ class LUClusterVerify(LogicalUnit):
 
     for nname in inst_config.all_nodes:
       if nname not in node_image:
-        # ghost node
         gnode = self.NodeImage(name=nname)
-        gnode.ghost = True
+        gnode.ghost = (nname not in self.all_node_info)
         node_image[nname] = gnode
 
     inst_config.MapLVsByNode(node_vol_should)
@@ -2404,6 +2557,13 @@ class LUClusterVerify(LogicalUnit):
     all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
                                            node_verify_param,
                                            self.cfg.GetClusterName())
+    if self.extra_lv_nodes and vg_name is not None:
+      extra_lv_nvinfo = \
+          self.rpc.call_node_verify(self.extra_lv_nodes,
+                                    {constants.NV_LVLIST: vg_name},
+                                    self.cfg.GetClusterName())
+    else:
+      extra_lv_nvinfo = {}
     nvinfo_endtime = time.time()
 
     all_drbd_map = self.cfg.ComputeDRBDMap()
@@ -2490,12 +2650,29 @@ class LUClusterVerify(LogicalUnit):
       self._UpdateNodeInstances(node_i, nresult, nimg)
       self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
       self._UpdateNodeOS(node_i, nresult, nimg)
+
       if not nimg.os_fail:
         if refos_img is None:
           refos_img = nimg
         self._VerifyNodeOS(node_i, nimg, refos_img)
       self._VerifyNodeBridges(node_i, nresult, bridges)
 
+      # Check whether all running instances are primary for the node. (This
+      # can no longer be done from _VerifyInstance below, since some of the
+      # wrong instances could be from other node groups.)
+      non_primary_inst = set(nimg.instances).difference(nimg.pinst)
+
+      for inst in non_primary_inst:
+        test = inst in self.all_inst_info
+        _ErrorIf(test, self.EINSTANCEWRONGNODE, inst,
+                 "instance should not run on node %s", node_i.name)
+        _ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name,
+                 "node is running unknown instance %s", inst)
+
+    for node, result in extra_lv_nvinfo.items():
+      self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
+                              node_image[node], vg_name)
+
     feedback_fn("* Verifying instance status")
     for instance in self.my_inst_names:
       if verbose:
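Regarding the ghost-node hunk above: with per-group verification a node can be absent from node_image simply because it lives in another group, so it is only a "ghost" when it is missing from the cluster configuration altogether. Hypothetical illustration:

all_node_info = {"node1": None, "node2": None}  # nodes known to the config
node_image = {"node1": object()}                # nodes covered by this run

for nname in ["node2", "node9"]:  # nodes referenced by some instance
  if nname not in node_image:
    ghost = nname not in all_node_info
    print("%s: ghost=%s" % (nname, ghost))
# node2: ghost=False  (known node, just in another group)
# node9: ghost=True   (not in the cluster at all)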
@@ -2574,10 +2751,18 @@ class LUClusterVerify(LogicalUnit):
 
     feedback_fn("* Verifying orphan volumes")
     reserved = utils.FieldSet(*cluster.reserved_lvs)
-    self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
 
-    feedback_fn("* Verifying orphan instances")
-    self._VerifyOrphanInstances(set(self.all_inst_info.keys()), node_image)
+    # We will get spurious "unknown volume" warnings if any node of this group
+    # is secondary for an instance whose primary is in another group. To avoid
+    # them, we find these instances and add their volumes to node_vol_should.
+    for inst in self.all_inst_info.values():
+      for secondary in inst.secondary_nodes:
+        if (secondary in self.my_node_info
+            and inst.name not in self.my_inst_info):
+          inst.MapLVsByNode(node_vol_should)
+          break
+
+    self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
 
     if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
       feedback_fn("* Verifying N+1 Memory redundancy")
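A sketch of the orphan-volume fix in the last hunk: LVs of instances whose primary node is in another group, but whose secondary sits in this one, must be pre-seeded into node_vol_should or they would be reported as orphans. The data below is hypothetical, with MapLVsByNode reduced to a dict merge:

my_node_info = {"node2": None}  # nodes of the group being verified
my_inst_info = {}               # instances whose primary is in this group

# instance name -> (secondary nodes, LVs per node)
all_instances = {
  "inst1": (["node2"], {"node2": ["xenvg/inst1-disk0"]}),
}

node_vol_should = {}
for name, (secondaries, lvs_by_node) in all_instances.items():
  for secondary in secondaries:
    if secondary in my_node_info and name not in my_inst_info:
      for node, lvs in lvs_by_node.items():
        node_vol_should.setdefault(node, []).extend(lvs)
      break

print(node_vol_should)  # {'node2': ['xenvg/inst1-disk0']}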