X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/97298dc9bb115abca7d1c7c72d7353212c08e5f4..5a85b99e9085e221208b957a7fc0fe87436843f5:/lib/cmdlib.py diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 62cebc2..d147c43 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -26,7 +26,7 @@ # W0201 since most LU attributes are defined in CheckPrereq or similar # functions -# C0302: since we have waaaay to many lines in this module +# C0302: since we have waaaay too many lines in this module import os import os.path @@ -59,10 +59,15 @@ from ganeti import query from ganeti import qlang from ganeti import opcodes from ganeti import ht +from ganeti import rpc import ganeti.masterd.instance # pylint: disable=W0611 +#: Size of DRBD meta block device +DRBD_META_SIZE = 128 + + class ResultWithJobs: """Data container for LU results with jobs. @@ -108,7 +113,7 @@ class LogicalUnit(object): HTYPE = None REQ_BGL = True - def __init__(self, processor, op, context, rpc): + def __init__(self, processor, op, context, rpc_runner): """Constructor for LogicalUnit. This needs to be overridden in derived classes in order to check op @@ -122,7 +127,7 @@ class LogicalUnit(object): # readability alias self.owned_locks = context.glm.list_owned self.context = context - self.rpc = rpc + self.rpc = rpc_runner # Dicts used to declare locking needs to mcpu self.needed_locks = None self.share_locks = dict.fromkeys(locking.LEVELS, 0) @@ -344,7 +349,8 @@ class LogicalUnit(object): self.op.instance_name) self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name - def _LockInstancesNodes(self, primary_only=False): + def _LockInstancesNodes(self, primary_only=False, + level=locking.LEVEL_NODE): """Helper function to declare instances' nodes for locking. This function should be called after locking one or more instances to lock @@ -365,9 +371,10 @@ class LogicalUnit(object): @type primary_only: boolean @param primary_only: only lock primary nodes of locked instances + @param level: Which lock level to use for locking nodes """ - assert locking.LEVEL_NODE in self.recalculate_locks, \ + assert level in self.recalculate_locks, \ "_LockInstancesNodes helper function called with no nodes to recalculate" # TODO: check if we're really been called with the instance locks held @@ -382,12 +389,14 @@ class LogicalUnit(object): if not primary_only: wanted_nodes.extend(instance.secondary_nodes) - if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE: - self.needed_locks[locking.LEVEL_NODE] = wanted_nodes - elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND: - self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes) + if self.recalculate_locks[level] == constants.LOCKS_REPLACE: + self.needed_locks[level] = wanted_nodes + elif self.recalculate_locks[level] == constants.LOCKS_APPEND: + self.needed_locks[level].extend(wanted_nodes) + else: + raise errors.ProgrammerError("Unknown recalculation mode") - del self.recalculate_locks[locking.LEVEL_NODE] + del self.recalculate_locks[level] class NoHooksLU(LogicalUnit): # pylint: disable=W0223 @@ -468,13 +477,13 @@ class _QueryBase: #: Attribute holding field definitions FIELDS = None - def __init__(self, filter_, fields, use_locking): + def __init__(self, qfilter, fields, use_locking): """Initializes this class. """ self.use_locking = use_locking - self.query = query.Query(self.FIELDS, fields, filter_=filter_, + self.query = query.Query(self.FIELDS, fields, qfilter=qfilter, namefield="name") self.requested_data = self.query.RequestedData() self.names = self.query.RequestedNames() @@ -753,7 +762,7 @@ def _RunPostHook(lu, node_name): """Runs the post-hook for an opcode on a single node. """ - hm = lu.proc.hmclass(lu.rpc.call_hooks_runner, lu) + hm = lu.proc.BuildHooksManager(lu) try: hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name]) except: @@ -1204,13 +1213,13 @@ def _GetStorageTypeArgs(cfg, storage_type): return [] -def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq): +def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq): faulty = [] for dev in instance.disks: cfg.SetDiskID(dev, node_name) - result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks) + result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks) result.Raise("Failed to get disk status from node %s" % node_name, prereq=prereq, ecode=errors.ECODE_ENVIRON) @@ -1350,15 +1359,16 @@ class LUClusterDestroy(LogicalUnit): """Destroys the cluster. """ - master = self.cfg.GetMasterNode() + master_params = self.cfg.GetMasterNetworkParameters() # Run post hooks on master node before it's removed - _RunPostHook(self, master) + _RunPostHook(self, master_params.name) - result = self.rpc.call_node_deactivate_master_ip(master) + result = self.rpc.call_node_deactivate_master_ip(master_params.name, + master_params) result.Raise("Could not disable the master role") - return master + return master_params.name def _VerifyCertificate(filename): @@ -1433,39 +1443,6 @@ class _VerifyErrors(object): self.op and self._feedback_fn to be available.) """ - TCLUSTER = "cluster" - TNODE = "node" - TINSTANCE = "instance" - - ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG") - ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT") - ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK") - ECLUSTERDANGLINGNODES = (TNODE, "ECLUSTERDANGLINGNODES") - ECLUSTERDANGLINGINST = (TNODE, "ECLUSTERDANGLINGINST") - EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE") - EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN") - EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT") - EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK") - EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK") - EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE") - EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS") - ENODEDRBD = (TNODE, "ENODEDRBD") - ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER") - ENODEFILECHECK = (TNODE, "ENODEFILECHECK") - ENODEHOOKS = (TNODE, "ENODEHOOKS") - ENODEHV = (TNODE, "ENODEHV") - ENODELVM = (TNODE, "ENODELVM") - ENODEN1 = (TNODE, "ENODEN1") - ENODENET = (TNODE, "ENODENET") - ENODEOS = (TNODE, "ENODEOS") - ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE") - ENODEORPHANLV = (TNODE, "ENODEORPHANLV") - ENODERPC = (TNODE, "ENODERPC") - ENODESSH = (TNODE, "ENODESSH") - ENODEVERSION = (TNODE, "ENODEVERSION") - ENODESETUP = (TNODE, "ENODESETUP") - ENODETIME = (TNODE, "ENODETIME") - ENODEOOBPATH = (TNODE, "ENODEOOBPATH") ETYPE_FIELD = "code" ETYPE_ERROR = "ERROR" @@ -1481,7 +1458,7 @@ class _VerifyErrors(object): """ ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) - itype, etxt = ecode + itype, etxt, _ = ecode # first complete the msg if args: msg = msg % args @@ -1497,14 +1474,22 @@ class _VerifyErrors(object): # and finally report it via the feedback_fn self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101 - def _ErrorIf(self, cond, *args, **kwargs): + def _ErrorIf(self, cond, ecode, *args, **kwargs): """Log an error message if the passed condition is True. """ cond = (bool(cond) or self.op.debug_simulate_errors) # pylint: disable=E1101 + + # If the error code is in the list of ignored errors, demote the error to a + # warning + (_, etxt, _) = ecode + if etxt in self.op.ignore_errors: # pylint: disable=E1101 + kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING + if cond: - self._Error(*args, **kwargs) + self._Error(ecode, *args, **kwargs) + # do not mark the operation as failed for WARN cases only if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR: self.bad = self.bad or cond @@ -1529,13 +1514,16 @@ class LUClusterVerify(NoHooksLU): groups = self.cfg.GetNodeGroupList() # Verify global configuration - jobs.append([opcodes.OpClusterVerifyConfig()]) + jobs.append([ + opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors) + ]) # Always depend on global verification depends_fn = lambda: [(-len(jobs), [])] jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group, - depends=depends_fn())] + ignore_errors=self.op.ignore_errors, + depends=depends_fn())] for group in groups) # Fix up all parameters @@ -1569,7 +1557,7 @@ class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors): utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES) hv_class.CheckParameterSyntax(hv_params) except errors.GenericError, err: - self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err)) + self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err)) def ExpandNames(self): # Information can be safely retrieved as the BGL is acquired in exclusive @@ -1590,13 +1578,13 @@ class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors): feedback_fn("* Verifying cluster config") for msg in self.cfg.VerifyConfig(): - self._ErrorIf(True, self.ECLUSTERCFG, None, msg) + self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg) feedback_fn("* Verifying cluster certificate files") for cert_filename in constants.ALL_CERT_FILES: (errcode, msg) = _VerifyCertificate(cert_filename) - self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode) + self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode) feedback_fn("* Verifying hypervisor parameters") @@ -1628,11 +1616,13 @@ class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors): ["no instances"]))) for node in dangling_nodes] - self._ErrorIf(bool(dangling_nodes), self.ECLUSTERDANGLINGNODES, None, + self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES, + None, "the following nodes (and their instances) belong to a non" " existing group: %s", utils.CommaJoin(pretty_dangling)) - self._ErrorIf(bool(no_node_instances), self.ECLUSTERDANGLINGINST, None, + self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST, + None, "the following instances have a non-existing primary-node:" " %s", utils.CommaJoin(no_node_instances)) @@ -1805,7 +1795,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): # main result, nresult should be a non-empty dict test = not nresult or not isinstance(nresult, dict) - _ErrorIf(test, self.ENODERPC, node, + _ErrorIf(test, constants.CV_ENODERPC, node, "unable to verify node: no data returned") if test: return False @@ -1816,13 +1806,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): test = not (remote_version and isinstance(remote_version, (list, tuple)) and len(remote_version) == 2) - _ErrorIf(test, self.ENODERPC, node, + _ErrorIf(test, constants.CV_ENODERPC, node, "connection to node returned invalid data") if test: return False test = local_version != remote_version[0] - _ErrorIf(test, self.ENODEVERSION, node, + _ErrorIf(test, constants.CV_ENODEVERSION, node, "incompatible protocol versions: master %s," " node %s", local_version, remote_version[0]) if test: @@ -1832,7 +1822,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): # full package version self._ErrorIf(constants.RELEASE_VERSION != remote_version[1], - self.ENODEVERSION, node, + constants.CV_ENODEVERSION, node, "software version mismatch: master %s, node %s", constants.RELEASE_VERSION, remote_version[1], code=self.ETYPE_WARNING) @@ -1841,19 +1831,19 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if ninfo.vm_capable and isinstance(hyp_result, dict): for hv_name, hv_result in hyp_result.iteritems(): test = hv_result is not None - _ErrorIf(test, self.ENODEHV, node, + _ErrorIf(test, constants.CV_ENODEHV, node, "hypervisor %s verify failure: '%s'", hv_name, hv_result) hvp_result = nresult.get(constants.NV_HVPARAMS, None) if ninfo.vm_capable and isinstance(hvp_result, list): for item, hv_name, hv_result in hvp_result: - _ErrorIf(True, self.ENODEHV, node, + _ErrorIf(True, constants.CV_ENODEHV, node, "hypervisor %s parameter verify failure (source %s): %s", hv_name, item, hv_result) test = nresult.get(constants.NV_NODESETUP, ["Missing NODESETUP results"]) - _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s", + _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s", "; ".join(test)) return True @@ -1876,7 +1866,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): try: ntime_merged = utils.MergeTime(ntime) except (ValueError, TypeError): - _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time") + _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time") return if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW): @@ -1886,7 +1876,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): else: ntime_diff = None - _ErrorIf(ntime_diff is not None, self.ENODETIME, node, + _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node, "Node time diverges by at least %s from master node time", ntime_diff) @@ -1908,24 +1898,25 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): # checks vg existence and size > 20G vglist = nresult.get(constants.NV_VGLIST, None) test = not vglist - _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups") + _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups") if not test: vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name, constants.MIN_VG_SIZE) - _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus) + _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus) # check pv names pvlist = nresult.get(constants.NV_PVLIST, None) test = pvlist is None - _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node") + _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node") if not test: # check that ':' is not present in PV names, since it's a # special character for lvcreate (denotes the range of PEs to # use on the PV) for _, pvname, owner_vg in pvlist: test = ":" in pvname - _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV" - " '%s' of VG '%s'", pvname, owner_vg) + _ErrorIf(test, constants.CV_ENODELVM, node, + "Invalid character ':' in PV '%s' of VG '%s'", + pvname, owner_vg) def _VerifyNodeBridges(self, ninfo, nresult, bridges): """Check the node bridges. @@ -1944,11 +1935,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): missing = nresult.get(constants.NV_BRIDGES, None) test = not isinstance(missing, list) - _ErrorIf(test, self.ENODENET, node, + _ErrorIf(test, constants.CV_ENODENET, node, "did not return valid bridge information") if not test: - _ErrorIf(bool(missing), self.ENODENET, node, "missing bridges: %s" % - utils.CommaJoin(sorted(missing))) + _ErrorIf(bool(missing), constants.CV_ENODENET, node, + "missing bridges: %s" % utils.CommaJoin(sorted(missing))) def _VerifyNodeNetwork(self, ninfo, nresult): """Check the node network connectivity results. @@ -1962,27 +1953,27 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): _ErrorIf = self._ErrorIf # pylint: disable=C0103 test = constants.NV_NODELIST not in nresult - _ErrorIf(test, self.ENODESSH, node, + _ErrorIf(test, constants.CV_ENODESSH, node, "node hasn't returned node ssh connectivity data") if not test: if nresult[constants.NV_NODELIST]: for a_node, a_msg in nresult[constants.NV_NODELIST].items(): - _ErrorIf(True, self.ENODESSH, node, + _ErrorIf(True, constants.CV_ENODESSH, node, "ssh communication with node '%s': %s", a_node, a_msg) test = constants.NV_NODENETTEST not in nresult - _ErrorIf(test, self.ENODENET, node, + _ErrorIf(test, constants.CV_ENODENET, node, "node hasn't returned node tcp connectivity data") if not test: if nresult[constants.NV_NODENETTEST]: nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) for anode in nlist: - _ErrorIf(True, self.ENODENET, node, + _ErrorIf(True, constants.CV_ENODENET, node, "tcp communication with node '%s': %s", anode, nresult[constants.NV_NODENETTEST][anode]) test = constants.NV_MASTERIP not in nresult - _ErrorIf(test, self.ENODENET, node, + _ErrorIf(test, constants.CV_ENODENET, node, "node hasn't returned node master IP reachability data") if not test: if not nresult[constants.NV_MASTERIP]: @@ -1990,7 +1981,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): msg = "the master node cannot reach the master IP (not configured?)" else: msg = "cannot reach the master IP" - _ErrorIf(True, self.ENODENET, node, msg) + _ErrorIf(True, constants.CV_ENODENET, node, msg) def _VerifyInstance(self, instance, instanceconfig, node_image, diskstatus): @@ -2013,13 +2004,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): continue for volume in node_vol_should[node]: test = volume not in n_img.volumes - _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance, + _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance, "volume %s missing on node %s", volume, node) if instanceconfig.admin_up: pri_img = node_image[node_current] test = instance not in pri_img.instances and not pri_img.offline - _ErrorIf(test, self.EINSTANCEDOWN, instance, + _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance, "instance not running on its primary node %s", node_current) @@ -2033,12 +2024,12 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): snode = node_image[nname] bad_snode = snode.ghost or snode.offline _ErrorIf(instanceconfig.admin_up and not success and not bad_snode, - self.EINSTANCEFAULTYDISK, instance, + constants.CV_EINSTANCEFAULTYDISK, instance, "couldn't retrieve status for disk/%s on %s: %s", idx, nname, bdev_status) _ErrorIf((instanceconfig.admin_up and success and bdev_status.ldisk_status == constants.LDS_FAULTY), - self.EINSTANCEFAULTYDISK, instance, + constants.CV_EINSTANCEFAULTYDISK, instance, "disk/%s on %s is faulty", idx, nname) def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved): @@ -2059,7 +2050,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): test = ((node not in node_vol_should or volume not in node_vol_should[node]) and not reserved.Matches(volume)) - self._ErrorIf(test, self.ENODEORPHANLV, node, + self._ErrorIf(test, constants.CV_ENODEORPHANLV, node, "volume %s is unknown", volume) def _VerifyNPlusOneMemory(self, node_image, instance_cfg): @@ -2092,7 +2083,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if bep[constants.BE_AUTO_BALANCE]: needed_mem += bep[constants.BE_MEMORY] test = n_img.mfree < needed_mem - self._ErrorIf(test, self.ENODEN1, node, + self._ErrorIf(test, constants.CV_ENODEN1, node, "not enough memory to accomodate instance failovers" " should node %s fail (%dMiB needed, %dMiB available)", prinode, needed_mem, n_img.mfree) @@ -2145,7 +2136,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): node_files = nresult.payload.get(constants.NV_FILELIST, None) test = not (node_files and isinstance(node_files, dict)) - errorif(test, cls.ENODEFILECHECK, node.name, + errorif(test, constants.CV_ENODEFILECHECK, node.name, "Node did not return file checksum data") if test: ignore_nodes.add(node.name) @@ -2172,20 +2163,19 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if filename in files_opt: # All or no nodes errorif(missing_file and missing_file != expected_nodes, - cls.ECLUSTERFILECHECK, None, + constants.CV_ECLUSTERFILECHECK, None, "File %s is optional, but it must exist on all or no" " nodes (not found on %s)", filename, utils.CommaJoin(utils.NiceSort(missing_file))) else: - # Non-optional files - errorif(missing_file, cls.ECLUSTERFILECHECK, None, + errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None, "File %s is missing from node(s) %s", filename, utils.CommaJoin(utils.NiceSort(missing_file))) # Warn if a node has a file it shouldn't unexpected = with_file - expected_nodes errorif(unexpected, - cls.ECLUSTERFILECHECK, None, + constants.CV_ECLUSTERFILECHECK, None, "File %s should not exist on node(s) %s", filename, utils.CommaJoin(utils.NiceSort(unexpected))) @@ -2199,7 +2189,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): else: variants = [] - errorif(test, cls.ECLUSTERFILECHECK, None, + errorif(test, constants.CV_ECLUSTERFILECHECK, None, "File %s found with %s different checksums (%s)", filename, len(checksums), "; ".join(variants)) @@ -2222,22 +2212,22 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if drbd_helper: helper_result = nresult.get(constants.NV_DRBDHELPER, None) test = (helper_result == None) - _ErrorIf(test, self.ENODEDRBDHELPER, node, + _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node, "no drbd usermode helper returned") if helper_result: status, payload = helper_result test = not status - _ErrorIf(test, self.ENODEDRBDHELPER, node, + _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node, "drbd usermode helper check unsuccessful: %s", payload) test = status and (payload != drbd_helper) - _ErrorIf(test, self.ENODEDRBDHELPER, node, + _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node, "wrong drbd usermode helper: %s", payload) # compute the DRBD minors node_drbd = {} for minor, instance in drbd_map[node].items(): test = instance not in instanceinfo - _ErrorIf(test, self.ECLUSTERCFG, None, + _ErrorIf(test, constants.CV_ECLUSTERCFG, None, "ghost instance '%s' in temporary DRBD map", instance) # ghost instance should not be running, but otherwise we # don't give double warnings (both ghost instance and @@ -2251,7 +2241,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): # and now check them used_minors = nresult.get(constants.NV_DRBDLIST, []) test = not isinstance(used_minors, (tuple, list)) - _ErrorIf(test, self.ENODEDRBD, node, + _ErrorIf(test, constants.CV_ENODEDRBD, node, "cannot parse drbd status file: %s", str(used_minors)) if test: # we cannot check drbd status @@ -2259,11 +2249,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): for minor, (iname, must_exist) in node_drbd.items(): test = minor not in used_minors and must_exist - _ErrorIf(test, self.ENODEDRBD, node, + _ErrorIf(test, constants.CV_ENODEDRBD, node, "drbd minor %d of instance %s is not active", minor, iname) for minor in used_minors: test = minor not in node_drbd - _ErrorIf(test, self.ENODEDRBD, node, + _ErrorIf(test, constants.CV_ENODEDRBD, node, "unallocated drbd minor %d is in use", minor) def _UpdateNodeOS(self, ninfo, nresult, nimg): @@ -2283,7 +2273,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): not compat.all(isinstance(v, list) and len(v) == 7 for v in remote_os)) - _ErrorIf(test, self.ENODEOS, node, + _ErrorIf(test, constants.CV_ENODEOS, node, "node hasn't returned valid OS data") nimg.os_fail = test @@ -2325,14 +2315,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): for os_name, os_data in nimg.oslist.items(): assert os_data, "Empty OS status for OS %s?!" % os_name f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0] - _ErrorIf(not f_status, self.ENODEOS, node, + _ErrorIf(not f_status, constants.CV_ENODEOS, node, "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag) - _ErrorIf(len(os_data) > 1, self.ENODEOS, node, + _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node, "OS '%s' has multiple entries (first one shadows the rest): %s", os_name, utils.CommaJoin([v[0] for v in os_data])) # comparisons with the 'base' image test = os_name not in base.oslist - _ErrorIf(test, self.ENODEOS, node, + _ErrorIf(test, constants.CV_ENODEOS, node, "Extra OS %s not present on reference node (%s)", os_name, base.name) if test: @@ -2346,14 +2336,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): ("variants list", f_var, b_var), ("parameters", beautify_params(f_param), beautify_params(b_param))]: - _ErrorIf(a != b, self.ENODEOS, node, + _ErrorIf(a != b, constants.CV_ENODEOS, node, "OS %s for %s differs from reference node %s: [%s] vs. [%s]", kind, os_name, base.name, utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b))) # check any missing OSes missing = set(base.oslist.keys()).difference(nimg.oslist.keys()) - _ErrorIf(missing, self.ENODEOS, node, + _ErrorIf(missing, constants.CV_ENODEOS, node, "OSes present on reference node %s but missing on this node: %s", base.name, utils.CommaJoin(missing)) @@ -2371,7 +2361,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if ((ninfo.master_candidate or ninfo.master_capable) and constants.NV_OOB_PATHS in nresult): for path_result in nresult[constants.NV_OOB_PATHS]: - self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result) + self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result) def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name): """Verifies and updates the node volume data. @@ -2394,10 +2384,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if vg_name is None: pass elif isinstance(lvdata, basestring): - _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s", + _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s", utils.SafeEncode(lvdata)) elif not isinstance(lvdata, dict): - _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)") + _ErrorIf(True, constants.CV_ENODELVM, node, + "rpc call to node failed (lvlist)") else: nimg.volumes = lvdata nimg.lvm_fail = False @@ -2417,8 +2408,9 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): """ idata = nresult.get(constants.NV_INSTANCELIST, None) test = not isinstance(idata, list) - self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed" - " (instancelist): %s", utils.SafeEncode(str(idata))) + self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name, + "rpc call to node failed (instancelist): %s", + utils.SafeEncode(str(idata))) if test: nimg.hyp_fail = True else: @@ -2440,26 +2432,27 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): # try to read free memory (from the hypervisor) hv_info = nresult.get(constants.NV_HVINFO, None) test = not isinstance(hv_info, dict) or "memory_free" not in hv_info - _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)") + _ErrorIf(test, constants.CV_ENODEHV, node, + "rpc call to node failed (hvinfo)") if not test: try: nimg.mfree = int(hv_info["memory_free"]) except (ValueError, TypeError): - _ErrorIf(True, self.ENODERPC, node, + _ErrorIf(True, constants.CV_ENODERPC, node, "node returned invalid nodeinfo, check hypervisor") # FIXME: devise a free space model for file based instances as well if vg_name is not None: test = (constants.NV_VGLIST not in nresult or vg_name not in nresult[constants.NV_VGLIST]) - _ErrorIf(test, self.ENODELVM, node, + _ErrorIf(test, constants.CV_ENODELVM, node, "node didn't return data for the volume group '%s'" " - it is either missing or broken", vg_name) if not test: try: nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name]) except (ValueError, TypeError): - _ErrorIf(True, self.ENODERPC, node, + _ErrorIf(True, constants.CV_ENODERPC, node, "node returned invalid LVM info, check LVM status") def _CollectDiskInfo(self, nodelist, node_image, instanceinfo): @@ -2526,7 +2519,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): data = len(disks) * [(False, "node offline")] else: msg = nres.fail_msg - _ErrorIf(msg, self.ENODERPC, nname, + _ErrorIf(msg, constants.CV_ENODERPC, nname, "while getting disk information: %s", msg) if msg: # No data from this node @@ -2825,7 +2818,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): feedback_fn("* Verifying node %s (%s)" % (node, ntype)) msg = all_nvinfo[node].fail_msg - _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg) + _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s", + msg) if msg: nimg.rpc_fail = True continue @@ -2860,9 +2854,9 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): for inst in non_primary_inst: test = inst in self.all_inst_info - _ErrorIf(test, self.EINSTANCEWRONGNODE, inst, + _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst, "instance should not run on node %s", node_i.name) - _ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name, + _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name, "node is running unknown instance %s", inst) for node, result in extra_lv_nvinfo.items(): @@ -2881,11 +2875,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): pnode = inst_config.primary_node pnode_img = node_image[pnode] _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline, - self.ENODERPC, pnode, "instance %s, connection to" + constants.CV_ENODERPC, pnode, "instance %s, connection to" " primary node failed", instance) _ErrorIf(inst_config.admin_up and pnode_img.offline, - self.EINSTANCEBADNODE, instance, + constants.CV_EINSTANCEBADNODE, instance, "instance is marked as running and lives on offline node %s", inst_config.primary_node) @@ -2897,7 +2891,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if not inst_config.secondary_nodes: i_non_redundant.append(instance) - _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT, + _ErrorIf(len(inst_config.secondary_nodes) > 1, + constants.CV_EINSTANCELAYOUT, instance, "instance has multiple secondary nodes: %s", utils.CommaJoin(inst_config.secondary_nodes), code=self.ETYPE_WARNING) @@ -2918,7 +2913,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): key=lambda (_, nodes): pnode in nodes, reverse=True)] - self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS, + self._ErrorIf(len(instance_groups) > 1, + constants.CV_EINSTANCESPLITGROUPS, instance, "instance has primary and secondary nodes in" " different groups: %s", utils.CommaJoin(pretty_list), code=self.ETYPE_WARNING) @@ -2928,21 +2924,22 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): for snode in inst_config.secondary_nodes: s_img = node_image[snode] - _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode, - "instance %s, connection to secondary node failed", instance) + _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC, + snode, "instance %s, connection to secondary node failed", + instance) if s_img.offline: inst_nodes_offline.append(snode) # warn that the instance lives on offline nodes - _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance, + _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance, "instance has offline secondary node(s) %s", utils.CommaJoin(inst_nodes_offline)) # ... or ghost/non-vm_capable nodes for node in inst_config.all_nodes: - _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance, - "instance lives on ghost node %s", node) - _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE, + _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE, + instance, "instance lives on ghost node %s", node) + _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE, instance, "instance lives on non-vm_capable node %s", node) feedback_fn("* Verifying orphan volumes") @@ -3010,7 +3007,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): res = hooks_results[node_name] msg = res.fail_msg test = msg and not res.offline - self._ErrorIf(test, self.ENODEHOOKS, node_name, + self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, "Communication failure in hooks execution: %s", msg) if res.offline or msg: # No need to investigate payload if node is offline or gave @@ -3018,7 +3015,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): continue for script, hkr, output in res.payload: test = hkr == constants.HKR_FAIL - self._ErrorIf(test, self.ENODEHOOKS, node_name, + self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name, "Script %s failed, output:", script) if test: output = self._HOOKS_INDENT_RE.sub(" ", output) @@ -3331,29 +3328,32 @@ class LUClusterRename(LogicalUnit): """ clustername = self.op.name - ip = self.ip + new_ip = self.ip # shutdown the master IP - master = self.cfg.GetMasterNode() - result = self.rpc.call_node_deactivate_master_ip(master) + master_params = self.cfg.GetMasterNetworkParameters() + result = self.rpc.call_node_deactivate_master_ip(master_params.name, + master_params) result.Raise("Could not disable the master role") try: cluster = self.cfg.GetClusterInfo() cluster.cluster_name = clustername - cluster.master_ip = ip + cluster.master_ip = new_ip self.cfg.Update(cluster, feedback_fn) # update the known hosts file ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE) node_list = self.cfg.GetOnlineNodeList() try: - node_list.remove(master) + node_list.remove(master_params.name) except ValueError: pass _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE) finally: - result = self.rpc.call_node_activate_master_ip(master) + master_params.ip = new_ip + result = self.rpc.call_node_activate_master_ip(master_params.name, + master_params) msg = result.fail_msg if msg: self.LogWarning("Could not re-enable the master role on" @@ -3362,6 +3362,27 @@ class LUClusterRename(LogicalUnit): return clustername +def _ValidateNetmask(cfg, netmask): + """Checks if a netmask is valid. + + @type cfg: L{config.ConfigWriter} + @param cfg: The cluster configuration + @type netmask: int + @param netmask: the netmask to be verified + @raise errors.OpPrereqError: if the validation fails + + """ + ip_family = cfg.GetPrimaryIPFamily() + try: + ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family) + except errors.ProgrammerError: + raise errors.OpPrereqError("Invalid primary ip family: %s." % + ip_family) + if not ipcls.ValidateNetmask(netmask): + raise errors.OpPrereqError("CIDR netmask (%s) not valid" % + (netmask)) + + class LUClusterSetParams(LogicalUnit): """Change the parameters of the cluster. @@ -3383,6 +3404,9 @@ class LUClusterSetParams(LogicalUnit): if self.op.remove_uids: uidpool.CheckUidPool(self.op.remove_uids) + if self.op.master_netmask is not None: + _ValidateNetmask(self.cfg, self.op.master_netmask) + def ExpandNames(self): # FIXME: in the future maybe other cluster params won't require checking on # all nodes to be modified. @@ -3683,21 +3707,39 @@ class LUClusterSetParams(LogicalUnit): helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted") if self.op.master_netdev: - master = self.cfg.GetMasterNode() + master_params = self.cfg.GetMasterNetworkParameters() feedback_fn("Shutting down master ip on the current netdev (%s)" % self.cluster.master_netdev) - result = self.rpc.call_node_deactivate_master_ip(master) + result = self.rpc.call_node_deactivate_master_ip(master_params.name, + master_params) result.Raise("Could not disable the master ip") feedback_fn("Changing master_netdev from %s to %s" % - (self.cluster.master_netdev, self.op.master_netdev)) + (master_params.netdev, self.op.master_netdev)) self.cluster.master_netdev = self.op.master_netdev + if self.op.master_netmask: + master_params = self.cfg.GetMasterNetworkParameters() + feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask) + result = self.rpc.call_node_change_master_netmask(master_params.name, + master_params.netmask, + self.op.master_netmask, + master_params.ip, + master_params.netdev) + if result.fail_msg: + msg = "Could not change the master IP netmask: %s" % result.fail_msg + self.LogWarning(msg) + feedback_fn(msg) + else: + self.cluster.master_netmask = self.op.master_netmask + self.cfg.Update(self.cluster, feedback_fn) if self.op.master_netdev: + master_params = self.cfg.GetMasterNetworkParameters() feedback_fn("Starting the master ip on the new master netdev (%s)" % self.op.master_netdev) - result = self.rpc.call_node_activate_master_ip(master) + result = self.rpc.call_node_activate_master_ip(master_params.name, + master_params) if result.fail_msg: self.LogWarning("Could not re-enable the master ip on" " the master, please restart manually: %s", @@ -3730,6 +3772,8 @@ def _ComputeAncillaryFiles(cluster, redist): constants.SSH_KNOWN_HOSTS_FILE, constants.CONFD_HMAC_KEY, constants.CLUSTER_DOMAIN_SECRET_FILE, + constants.SPICE_CERT_FILE, + constants.SPICE_CACERT_FILE, constants.RAPI_USERS_FILE, ]) @@ -3857,8 +3901,9 @@ class LUClusterActivateMasterIp(NoHooksLU): """Activate the master IP. """ - master = self.cfg.GetMasterNode() - self.rpc.call_node_activate_master_ip(master) + master_params = self.cfg.GetMasterNetworkParameters() + self.rpc.call_node_activate_master_ip(master_params.name, + master_params) class LUClusterDeactivateMasterIp(NoHooksLU): @@ -3869,8 +3914,8 @@ class LUClusterDeactivateMasterIp(NoHooksLU): """Deactivate the master IP. """ - master = self.cfg.GetMasterNode() - self.rpc.call_node_deactivate_master_ip(master) + master_params = self.cfg.GetMasterNetworkParameters() + self.rpc.call_node_deactivate_master_ip(master_params.name, master_params) def _WaitForSync(lu, instance, disks=None, oneshot=False): @@ -4803,7 +4848,7 @@ class LUQuery(NoHooksLU): def CheckArguments(self): qcls = _GetQueryImplementation(self.op.what) - self.impl = qcls(self.op.filter, self.op.fields, self.op.use_locking) + self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking) def ExpandNames(self): self.impl.ExpandNames(self) @@ -5345,7 +5390,9 @@ class LUNodeSetParams(LogicalUnit): if old_role == self._ROLE_OFFLINE and new_role != old_role: # Trying to transition out of offline status - result = self.rpc.call_version([node.name])[node.name] + # TODO: Use standard RPC runner, but make sure it works when the node is + # still marked offline + result = rpc.BootstrapRunner().call_version([node.name])[node.name] if result.fail_msg: raise errors.OpPrereqError("Node %s is being de-offlined but fails" " to report its version: %s" % @@ -5524,6 +5571,7 @@ class LUClusterQuery(NoHooksLU): "ndparams": cluster.ndparams, "candidate_pool_size": cluster.candidate_pool_size, "master_netdev": cluster.master_netdev, + "master_netmask": cluster.master_netmask, "volume_group_name": cluster.volume_group_name, "drbd_usermode_helper": cluster.drbd_usermode_helper, "file_storage_dir": cluster.file_storage_dir, @@ -5914,6 +5962,40 @@ def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested): errors.ECODE_NORES) +def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name): + """Checks if nodes have enough physical CPUs + + This function checks if all given nodes have the needed number of + physical CPUs. In case any node has less CPUs or we cannot get the + information from the node, this function raises an OpPrereqError + exception. + + @type lu: C{LogicalUnit} + @param lu: a logical unit from which we get configuration data + @type nodenames: C{list} + @param nodenames: the list of node names to check + @type requested: C{int} + @param requested: the minimum acceptable number of physical CPUs + @raise errors.OpPrereqError: if the node doesn't have enough CPUs, + or we cannot check the node + + """ + nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name) + for node in nodenames: + info = nodeinfo[node] + info.Raise("Cannot get current information from node %s" % node, + prereq=True, ecode=errors.ECODE_ENVIRON) + num_cpus = info.payload.get("cpu_total", None) + if not isinstance(num_cpus, int): + raise errors.OpPrereqError("Can't compute the number of physical CPUs" + " on node %s, result was '%s'" % + (node, num_cpus), errors.ECODE_ENVIRON) + if requested > num_cpus: + raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are " + "required" % (node, num_cpus, requested), + errors.ECODE_NORES) + + class LUInstanceStartup(LogicalUnit): """Starts an instance. @@ -6016,9 +6098,11 @@ class LUInstanceStartup(LogicalUnit): _StartInstanceDisks(self, instance, force) - result = self.rpc.call_instance_start(node_current, instance, - self.op.hvparams, self.op.beparams, - self.op.startup_paused) + result = \ + self.rpc.call_instance_start(node_current, + (instance, self.op.hvparams, + self.op.beparams), + self.op.startup_paused) msg = result.fail_msg if msg: _ShutdownInstanceDisks(self, instance) @@ -6108,8 +6192,8 @@ class LUInstanceReboot(LogicalUnit): self.LogInfo("Instance %s was already stopped, starting now", instance.name) _StartInstanceDisks(self, instance, ignore_secondaries) - result = self.rpc.call_instance_start(node_current, instance, - None, None, False) + result = self.rpc.call_instance_start(node_current, + (instance, None, None), False) msg = result.fail_msg if msg: _ShutdownInstanceDisks(self, instance) @@ -6270,9 +6354,9 @@ class LUInstanceReinstall(LogicalUnit): try: feedback_fn("Running the instance OS create scripts...") # FIXME: pass debug option from opcode to backend - result = self.rpc.call_instance_os_add(inst.primary_node, inst, True, - self.op.debug_level, - osparams=self.os_inst) + result = self.rpc.call_instance_os_add(inst.primary_node, + (inst, self.os_inst), True, + self.op.debug_level) result.Raise("Could not install OS for instance %s on node %s" % (inst.name, inst.primary_node)) finally: @@ -6973,8 +7057,8 @@ class LUInstanceMove(LogicalUnit): _ShutdownInstanceDisks(self, instance) raise errors.OpExecError("Can't activate the instance's disks") - result = self.rpc.call_instance_start(target_node, instance, - None, None, False) + result = self.rpc.call_instance_start(target_node, + (instance, None, None), False) msg = result.fail_msg if msg: _ShutdownInstanceDisks(self, instance) @@ -7067,6 +7151,11 @@ class TLMigrateInstance(Tasklet): @ivar shutdown_timeout: In case of failover timeout of the shutdown """ + + # Constants + _MIGRATION_POLL_INTERVAL = 1 # seconds + _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds + def __init__(self, lu, instance_name, cleanup=False, failover=False, fallback=False, ignore_consistency=False, @@ -7390,12 +7479,13 @@ class TLMigrateInstance(Tasklet): """ instance = self.instance target_node = self.target_node + source_node = self.source_node migration_info = self.migration_info - abort_result = self.rpc.call_finalize_migration(target_node, - instance, - migration_info, - False) + abort_result = self.rpc.call_instance_finalize_migration_dst(target_node, + instance, + migration_info, + False) abort_msg = abort_result.fail_msg if abort_msg: logging.error("Aborting migration failed on target node %s: %s", @@ -7403,6 +7493,13 @@ class TLMigrateInstance(Tasklet): # Don't raise an exception here, as we stil have to try to revert the # disk status, even if this step failed. + abort_result = self.rpc.call_instance_finalize_migration_src(source_node, + instance, False, self.live) + abort_msg = abort_result.fail_msg + if abort_msg: + logging.error("Aborting migration failed on source node %s: %s", + source_node, abort_msg) + def _ExecMigration(self): """Migrate an instance. @@ -7489,18 +7586,59 @@ class TLMigrateInstance(Tasklet): raise errors.OpExecError("Could not migrate instance %s: %s" % (instance.name, msg)) + self.feedback_fn("* starting memory transfer") + last_feedback = time.time() + while True: + result = self.rpc.call_instance_get_migration_status(source_node, + instance) + msg = result.fail_msg + ms = result.payload # MigrationStatus instance + if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES): + logging.error("Instance migration failed, trying to revert" + " disk status: %s", msg) + self.feedback_fn("Migration failed, aborting") + self._AbortMigration() + self._RevertDiskStatus() + raise errors.OpExecError("Could not migrate instance %s: %s" % + (instance.name, msg)) + + if result.payload.status != constants.HV_MIGRATION_ACTIVE: + self.feedback_fn("* memory transfer complete") + break + + if (utils.TimeoutExpired(last_feedback, + self._MIGRATION_FEEDBACK_INTERVAL) and + ms.transferred_ram is not None): + mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram) + self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress) + last_feedback = time.time() + + time.sleep(self._MIGRATION_POLL_INTERVAL) + + result = self.rpc.call_instance_finalize_migration_src(source_node, + instance, + True, + self.live) + msg = result.fail_msg + if msg: + logging.error("Instance migration succeeded, but finalization failed" + " on the source node: %s", msg) + raise errors.OpExecError("Could not finalize instance migration: %s" % + msg) + instance.primary_node = target_node + # distribute new instance config to the other nodes self.cfg.Update(instance, self.feedback_fn) - result = self.rpc.call_finalize_migration(target_node, - instance, - migration_info, - True) + result = self.rpc.call_instance_finalize_migration_dst(target_node, + instance, + migration_info, + True) msg = result.fail_msg if msg: - logging.error("Instance migration succeeded, but finalization failed:" - " %s", msg) + logging.error("Instance migration succeeded, but finalization failed" + " on the target node: %s", msg) raise errors.OpExecError("Could not finalize instance migration: %s" % msg) @@ -7583,7 +7721,7 @@ class TLMigrateInstance(Tasklet): self.feedback_fn("* starting the instance on the target node %s" % target_node) - result = self.rpc.call_instance_start(target_node, instance, None, None, + result = self.rpc.call_instance_start(target_node, (instance, None, None), False) msg = result.fail_msg if msg: @@ -7715,7 +7853,7 @@ def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names, shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId()) dev_data = objects.Disk(dev_type=constants.LD_LV, size=size, logical_id=(vgnames[0], names[0])) - dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128, + dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE, logical_id=(vgnames[1], names[1])) drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size, logical_id=(primary, secondary, port, @@ -8035,7 +8173,7 @@ def _ComputeDiskSizePerVG(disk_template, disks): constants.DT_DISKLESS: {}, constants.DT_PLAIN: _compute(disks, 0), # 128 MB are added for drbd metadata for each disk - constants.DT_DRBD8: _compute(disks, 128), + constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE), constants.DT_FILE: {}, constants.DT_SHARED_FILE: {}, } @@ -8056,7 +8194,8 @@ def _ComputeDiskSize(disk_template, disks): constants.DT_DISKLESS: None, constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks), # 128 MB are added for drbd metadata for each disk - constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks), + constants.DT_DRBD8: + sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks), constants.DT_FILE: None, constants.DT_SHARED_FILE: 0, constants.DT_BLOCK: 0, @@ -8102,9 +8241,11 @@ def _CheckHVParams(lu, nodenames, hvname, hvparams): """ nodenames = _FilterVmNodes(lu, nodenames) - hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, - hvname, - hvparams) + + cluster = lu.cfg.GetClusterInfo() + hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams) + + hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull) for node in nodenames: info = hvinfo[node] if info.offline: @@ -8130,7 +8271,7 @@ def _CheckOSParams(lu, required, nodenames, osname, osparams): """ nodenames = _FilterVmNodes(lu, nodenames) - result = lu.rpc.call_os_validate(required, nodenames, osname, + result = lu.rpc.call_os_validate(nodenames, required, osname, [constants.OS_VALIDATE_PARAMETERS], osparams) for node, nres in result.items(): @@ -8494,33 +8635,39 @@ class LUInstanceCreate(LogicalUnit): if einfo.has_option(constants.INISECT_INS, "disk_template"): self.op.disk_template = einfo.get(constants.INISECT_INS, "disk_template") + if self.op.disk_template not in constants.DISK_TEMPLATES: + raise errors.OpPrereqError("Disk template specified in configuration" + " file is not one of the allowed values:" + " %s" % " ".join(constants.DISK_TEMPLATES)) else: raise errors.OpPrereqError("No disk template specified and the export" " is missing the disk_template information", errors.ECODE_INVAL) if not self.op.disks: - if einfo.has_option(constants.INISECT_INS, "disk_count"): - disks = [] - # TODO: import the disk iv_name too - for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")): + disks = [] + # TODO: import the disk iv_name too + for idx in range(constants.MAX_DISKS): + if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx): disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx) disks.append({constants.IDISK_SIZE: disk_sz}) - self.op.disks = disks - else: + self.op.disks = disks + if not disks and self.op.disk_template != constants.DT_DISKLESS: raise errors.OpPrereqError("No disk info specified and the export" " is missing the disk information", errors.ECODE_INVAL) - if (not self.op.nics and - einfo.has_option(constants.INISECT_INS, "nic_count")): + if not self.op.nics: nics = [] - for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")): - ndict = {} - for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]: - v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name)) - ndict[name] = v - nics.append(ndict) + for idx in range(constants.MAX_NICS): + if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx): + ndict = {} + for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]: + v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name)) + ndict[name] = v + nics.append(ndict) + else: + break self.op.nics = nics if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"): @@ -8624,7 +8771,8 @@ class LUInstanceCreate(LogicalUnit): raise errors.OpPrereqError("Cluster does not support lvm-based" " instances", errors.ECODE_STATE) - if self.op.hypervisor is None: + if (self.op.hypervisor is None or + self.op.hypervisor == constants.VALUE_AUTO): self.op.hypervisor = self.cfg.GetHypervisorType() cluster = self.cfg.GetClusterInfo() @@ -8650,6 +8798,10 @@ class LUInstanceCreate(LogicalUnit): _CheckGlobalHvParams(self.op.hvparams) # fill and remember the beparams dict + default_beparams = cluster.beparams[constants.PP_DEFAULT] + for param, value in self.op.beparams.iteritems(): + if value == constants.VALUE_AUTO: + self.op.beparams[param] = default_beparams[param] utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES) self.be_full = cluster.SimpleFillBE(self.op.beparams) @@ -8666,7 +8818,7 @@ class LUInstanceCreate(LogicalUnit): for idx, nic in enumerate(self.op.nics): nic_mode_req = nic.get(constants.INIC_MODE, None) nic_mode = nic_mode_req - if nic_mode is None: + if nic_mode is None or nic_mode == constants.VALUE_AUTO: nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE] # in routed mode, for the first nic, the default ip is 'auto' @@ -8710,9 +8862,11 @@ class LUInstanceCreate(LogicalUnit): # Build nic parameters link = nic.get(constants.INIC_LINK, None) + if link == constants.VALUE_AUTO: + link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK] nicparams = {} if nic_mode_req: - nicparams[constants.NIC_MODE] = nic_mode_req + nicparams[constants.NIC_MODE] = nic_mode if link: nicparams[constants.NIC_LINK] = link @@ -8749,18 +8903,8 @@ class LUInstanceCreate(LogicalUnit): self.disks.append(new_disk) if self.op.mode == constants.INSTANCE_IMPORT: - - # Check that the new instance doesn't have less disks than the export - instance_disks = len(self.disks) - export_disks = export_info.getint(constants.INISECT_INS, 'disk_count') - if instance_disks < export_disks: - raise errors.OpPrereqError("Not enough disks to import." - " (instance: %d, export: %d)" % - (instance_disks, export_disks), - errors.ECODE_INVAL) - disk_images = [] - for idx in range(export_disks): + for idx in range(len(self.disks)): option = "disk%d_dump" % idx if export_info.has_option(constants.INISECT_INS, option): # FIXME: are the old os-es, disk sizes, etc. useful? @@ -8773,15 +8917,9 @@ class LUInstanceCreate(LogicalUnit): self.src_images = disk_images old_name = export_info.get(constants.INISECT_INS, "name") - try: - exp_nic_count = export_info.getint(constants.INISECT_INS, "nic_count") - except (TypeError, ValueError), err: - raise errors.OpPrereqError("Invalid export file, nic_count is not" - " an integer: %s" % str(err), - errors.ECODE_STATE) if self.op.instance_name == old_name: for idx, nic in enumerate(self.nics): - if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx: + if nic.mac == constants.VALUE_AUTO: nic_mac_ini = "nic%d_mac" % idx nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini) @@ -9059,7 +9197,7 @@ class LUInstanceCreate(LogicalUnit): feedback_fn("* running the instance OS create scripts...") # FIXME: pass debug option from opcode to backend os_add_result = \ - self.rpc.call_instance_os_add(pnode_name, iobj, False, + self.rpc.call_instance_os_add(pnode_name, (iobj, None), False, self.op.debug_level) if pause_sync: feedback_fn("* resuming disk sync") @@ -9139,8 +9277,8 @@ class LUInstanceCreate(LogicalUnit): self.cfg.Update(iobj, feedback_fn) logging.info("Starting instance %s on node %s", instance, pnode_name) feedback_fn("* starting instance...") - result = self.rpc.call_instance_start(pnode_name, iobj, - None, None, False) + result = self.rpc.call_instance_start(pnode_name, (iobj, None, None), + False) result.Raise("Could not start instance") return list(iobj.all_nodes) @@ -9724,7 +9862,7 @@ class TLReplaceDisks(Tasklet): lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size, logical_id=(vg_data, names[0])) vg_meta = dev.children[1].logical_id[0] - lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128, + lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE, logical_id=(vg_meta, names[1])) new_lvs = [lv_data, lv_meta] @@ -10877,9 +11015,11 @@ class LUInstanceSetParams(LogicalUnit): # local check hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new) _CheckHVParams(self, nodelist, instance.hypervisor, hv_new) - self.hv_new = hv_new # the new actual values + self.hv_proposed = self.hv_new = hv_new # the new actual values self.hv_inst = i_hvdict # the new dict (without defaults) else: + self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os, + instance.hvparams) self.hv_new = self.hv_inst = {} # beparams processing @@ -10888,12 +11028,40 @@ class LUInstanceSetParams(LogicalUnit): use_none=True) utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES) be_new = cluster.SimpleFillBE(i_bedict) - self.be_new = be_new # the new actual values + self.be_proposed = self.be_new = be_new # the new actual values self.be_inst = i_bedict # the new dict (without defaults) else: self.be_new = self.be_inst = {} + self.be_proposed = cluster.SimpleFillBE(instance.beparams) be_old = cluster.FillBE(instance) + # CPU param validation -- checking every time a paramtere is + # changed to cover all cases where either CPU mask or vcpus have + # changed + if (constants.BE_VCPUS in self.be_proposed and + constants.HV_CPU_MASK in self.hv_proposed): + cpu_list = \ + utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK]) + # Verify mask is consistent with number of vCPUs. Can skip this + # test if only 1 entry in the CPU mask, which means same mask + # is applied to all vCPUs. + if (len(cpu_list) > 1 and + len(cpu_list) != self.be_proposed[constants.BE_VCPUS]): + raise errors.OpPrereqError("Number of vCPUs [%d] does not match the" + " CPU mask [%s]" % + (self.be_proposed[constants.BE_VCPUS], + self.hv_proposed[constants.HV_CPU_MASK]), + errors.ECODE_INVAL) + + # Only perform this test if a new CPU mask is given + if constants.HV_CPU_MASK in self.hv_new: + # Calculate the largest CPU number requested + max_requested_cpu = max(map(max, cpu_list)) + # Check that all of the instance's nodes have enough physical CPUs to + # satisfy the requested CPU mask + _CheckNodesPhysicalCPUs(self, instance.all_nodes, + max_requested_cpu + 1, instance.hypervisor) + # osparams processing if self.op.osparams: i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams) @@ -11755,8 +11923,8 @@ class LUBackupExport(LogicalUnit): not self.op.remove_instance): assert not activate_disks feedback_fn("Starting instance %s" % instance.name) - result = self.rpc.call_instance_start(src_node, instance, - None, None, False) + result = self.rpc.call_instance_start(src_node, + (instance, None, None), False) msg = result.fail_msg if msg: feedback_fn("Failed to start instance: %s" % msg) @@ -12888,9 +13056,9 @@ class IAllocator(object): # pylint: disable=R0902 # lots of instance attributes - def __init__(self, cfg, rpc, mode, **kwargs): + def __init__(self, cfg, rpc_runner, mode, **kwargs): self.cfg = cfg - self.rpc = rpc + self.rpc = rpc_runner # init buffer variables self.in_text = self.out_text = self.in_data = self.out_data = None # init all input fields so that pylint is happy