X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/692738fc141e8b85ca1d170df9534b014fd988a0..313bceadb9cec8b3a0b627d8f274054a495eb671:/lib/cmdlib.py diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 70c577f..93f9e4f 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -94,7 +94,7 @@ class LogicalUnit(object): self.dry_run_result = None # Tasklets - self.tasklets = [] + self.tasklets = None for attr_name in self._OP_REQP: attr_val = getattr(op, attr_name, None) @@ -213,8 +213,10 @@ class LogicalUnit(object): their canonical form if it hasn't been done by ExpandNames before. """ - if self.tasklets: - for tl in self.tasklets: + if self.tasklets is not None: + for (idx, tl) in enumerate(self.tasklets): + logging.debug("Checking prerequisites for tasklet %s/%s", + idx + 1, len(self.tasklets)) tl.CheckPrereq() else: raise NotImplementedError @@ -227,8 +229,9 @@ class LogicalUnit(object): code, or expected. """ - if self.tasklets: - for tl in self.tasklets: + if self.tasklets is not None: + for (idx, tl) in enumerate(self.tasklets): + logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets)) tl.Exec(feedback_fn) else: raise NotImplementedError @@ -367,6 +370,13 @@ class Tasklet: - Implement Exec """ + def __init__(self, lu): + self.lu = lu + + # Shortcuts + self.cfg = lu.cfg + self.rpc = lu.rpc + def CheckPrereq(self): """Check prerequisites for this tasklets. @@ -597,6 +607,7 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status, return env + def _NICListToTuple(lu, nics): """Build a list of nic information tuples. @@ -620,6 +631,7 @@ def _NICListToTuple(lu, nics): hooks_nics.append((ip, mac, mode, link)) return hooks_nics + def _BuildInstanceHookEnvByObject(lu, instance, override=None): """Builds instance related env variables for hooks from an object. @@ -699,17 +711,90 @@ def _CheckInstanceBridgesExist(lu, instance, node=None): _CheckNicsBridgesExist(lu, instance.nics, node) +def _GetNodeInstancesInner(cfg, fn): + return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)] + + +def _GetNodeInstances(cfg, node_name): + """Returns a list of all primary and secondary instances on a node. + + """ + + return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes) + + +def _GetNodePrimaryInstances(cfg, node_name): + """Returns primary instances on a node. + + """ + return _GetNodeInstancesInner(cfg, + lambda inst: node_name == inst.primary_node) + + def _GetNodeSecondaryInstances(cfg, node_name): """Returns secondary instances on a node. """ - instances = [] + return _GetNodeInstancesInner(cfg, + lambda inst: node_name in inst.secondary_nodes) + + +def _GetStorageTypeArgs(cfg, storage_type): + """Returns the arguments for a storage type. - for (_, inst) in cfg.GetAllInstancesInfo().iteritems(): - if node_name in inst.secondary_nodes: - instances.append(inst) + """ + # Special case for file storage + if storage_type == constants.ST_FILE: + # storage.FileStorage wants a list of storage directories + return [[cfg.GetFileStorageDir()]] + + return [] + + +def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq): + faulty = [] + + for dev in instance.disks: + cfg.SetDiskID(dev, node_name) - return instances + result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks) + result.Raise("Failed to get disk status from node %s" % node_name, + prereq=prereq) + + for idx, bdev_status in enumerate(result.payload): + if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY: + faulty.append(idx) + + return faulty + + +class LUPostInitCluster(LogicalUnit): + """Logical unit for running hooks after cluster initialization. + + """ + HPATH = "cluster-init" + HTYPE = constants.HTYPE_CLUSTER + _OP_REQP = [] + + def BuildHooksEnv(self): + """Build hooks env. + + """ + env = {"OP_TARGET": self.cfg.GetClusterName()} + mn = self.cfg.GetMasterNode() + return env, [], [mn] + + def CheckPrereq(self): + """No prerequisites to check. + + """ + return True + + def Exec(self, feedback_fn): + """Nothing to do. + + """ + return True class LUDestroyCluster(NoHooksLU): @@ -1434,6 +1519,100 @@ class LUVerifyDisks(NoHooksLU): return result +class LURepairDiskSizes(NoHooksLU): + """Verifies the cluster disks sizes. + + """ + _OP_REQP = ["instances"] + REQ_BGL = False + + def ExpandNames(self): + + if not isinstance(self.op.instances, list): + raise errors.OpPrereqError("Invalid argument type 'instances'") + + if self.op.instances: + self.wanted_names = [] + for name in self.op.instances: + full_name = self.cfg.ExpandInstanceName(name) + if full_name is None: + raise errors.OpPrereqError("Instance '%s' not known" % name) + self.wanted_names.append(full_name) + self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names + self.needed_locks = { + locking.LEVEL_NODE: [], + locking.LEVEL_INSTANCE: self.wanted_names, + } + self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE + else: + self.wanted_names = None + self.needed_locks = { + locking.LEVEL_NODE: locking.ALL_SET, + locking.LEVEL_INSTANCE: locking.ALL_SET, + } + self.share_locks = dict(((i, 1) for i in locking.LEVELS)) + + def DeclareLocks(self, level): + if level == locking.LEVEL_NODE and self.wanted_names is not None: + self._LockInstancesNodes(primary_only=True) + + def CheckPrereq(self): + """Check prerequisites. + + This only checks the optional instance list against the existing names. + + """ + if self.wanted_names is None: + self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE] + + self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name + in self.wanted_names] + + def Exec(self, feedback_fn): + """Verify the size of cluster disks. + + """ + # TODO: check child disks too + # TODO: check differences in size between primary/secondary nodes + per_node_disks = {} + for instance in self.wanted_instances: + pnode = instance.primary_node + if pnode not in per_node_disks: + per_node_disks[pnode] = [] + for idx, disk in enumerate(instance.disks): + per_node_disks[pnode].append((instance, idx, disk)) + + changed = [] + for node, dskl in per_node_disks.items(): + result = self.rpc.call_blockdev_getsizes(node, [v[2] for v in dskl]) + if result.failed: + self.LogWarning("Failure in blockdev_getsizes call to node" + " %s, ignoring", node) + continue + if len(result.data) != len(dskl): + self.LogWarning("Invalid result from node %s, ignoring node results", + node) + continue + for ((instance, idx, disk), size) in zip(dskl, result.data): + if size is None: + self.LogWarning("Disk %d of instance %s did not return size" + " information, ignoring", idx, instance.name) + continue + if not isinstance(size, (int, long)): + self.LogWarning("Disk %d of instance %s did not return valid" + " size information, ignoring", idx, instance.name) + continue + size = size >> 20 + if size != disk.size: + self.LogInfo("Disk %d of instance %s has mismatched size," + " correcting: recorded %d, actual %d", idx, + instance.name, disk.size, size) + disk.size = size + self.cfg.Update(instance) + changed.append((instance.name, idx, size)) + return changed + + class LURenameCluster(LogicalUnit): """Rename the cluster. @@ -1796,18 +1975,18 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False): lu.LogWarning("Can't compute data for node %s/%s", node, instance.disks[i].iv_name) continue - # we ignore the ldisk parameter - perc_done, est_time, is_degraded, _ = mstat - cumul_degraded = cumul_degraded or (is_degraded and perc_done is None) - if perc_done is not None: + + cumul_degraded = (cumul_degraded or + (mstat.is_degraded and mstat.sync_percent is None)) + if mstat.sync_percent is not None: done = False - if est_time is not None: - rem_time = "%d estimated seconds remaining" % est_time - max_time = est_time + if mstat.estimated_time is not None: + rem_time = "%d estimated seconds remaining" % mstat.estimated_time + max_time = mstat.estimated_time else: rem_time = "no time estimate" lu.proc.LogInfo("- device %s: %5.2f%% done, %s" % - (instance.disks[i].iv_name, perc_done, rem_time)) + (instance.disks[i].iv_name, mstat.sync_percent, rem_time)) # if we're done but degraded, let's do a few small retries, to # make sure we see a stable and not transient situation; therefore @@ -1837,12 +2016,9 @@ def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False): """ lu.cfg.SetDiskID(dev, node) - if ldisk: - idx = 6 - else: - idx = 5 result = True + if on_primary or dev.AssembleOnSecondary(): rstats = lu.rpc.call_blockdev_find(node, dev) msg = rstats.fail_msg @@ -1853,7 +2029,11 @@ def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False): lu.LogWarning("Can't find disk on node %s", node) result = False else: - result = result and (not rstats.payload[idx]) + if ldisk: + result = result and rstats.payload.ldisk_status == constants.LDS_OKAY + else: + result = result and not rstats.payload.is_degraded + if dev.children: for child in dev.children: result = result and _CheckDiskConsistency(lu, child, node, on_primary) @@ -2044,7 +2224,7 @@ class LUQueryNodes(NoHooksLU): "name", "pinst_cnt", "sinst_cnt", "pinst_list", "sinst_list", "pip", "sip", "tags", - "serial_no", + "serial_no", "ctime", "mtime", "master_candidate", "master", "offline", @@ -2168,6 +2348,10 @@ class LUQueryNodes(NoHooksLU): val = list(node.GetTags()) elif field == "serial_no": val = node.serial_no + elif field == "ctime": + val = node.ctime + elif field == "mtime": + val = node.mtime elif field == "master_candidate": val = node.master_candidate elif field == "master": @@ -2283,6 +2467,154 @@ class LUQueryNodeVolumes(NoHooksLU): return output +class LUQueryNodeStorage(NoHooksLU): + """Logical unit for getting information on storage units on node(s). + + """ + _OP_REQP = ["nodes", "storage_type", "output_fields"] + REQ_BGL = False + _FIELDS_STATIC = utils.FieldSet("node") + + def ExpandNames(self): + storage_type = self.op.storage_type + + if storage_type not in constants.VALID_STORAGE_FIELDS: + raise errors.OpPrereqError("Unknown storage type: %s" % storage_type) + + dynamic_fields = constants.VALID_STORAGE_FIELDS[storage_type] + + _CheckOutputFields(static=self._FIELDS_STATIC, + dynamic=utils.FieldSet(*dynamic_fields), + selected=self.op.output_fields) + + self.needed_locks = {} + self.share_locks[locking.LEVEL_NODE] = 1 + + if self.op.nodes: + self.needed_locks[locking.LEVEL_NODE] = \ + _GetWantedNodes(self, self.op.nodes) + else: + self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET + + def CheckPrereq(self): + """Check prerequisites. + + This checks that the fields required are valid output fields. + + """ + self.op.name = getattr(self.op, "name", None) + + self.nodes = self.acquired_locks[locking.LEVEL_NODE] + + def Exec(self, feedback_fn): + """Computes the list of nodes and their attributes. + + """ + # Always get name to sort by + if constants.SF_NAME in self.op.output_fields: + fields = self.op.output_fields[:] + else: + fields = [constants.SF_NAME] + self.op.output_fields + + # Never ask for node as it's only known to the LU + while "node" in fields: + fields.remove("node") + + field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)]) + name_idx = field_idx[constants.SF_NAME] + + st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type) + data = self.rpc.call_storage_list(self.nodes, + self.op.storage_type, st_args, + self.op.name, fields) + + result = [] + + for node in utils.NiceSort(self.nodes): + nresult = data[node] + if nresult.offline: + continue + + msg = nresult.fail_msg + if msg: + self.LogWarning("Can't get storage data from node %s: %s", node, msg) + continue + + rows = dict([(row[name_idx], row) for row in nresult.payload]) + + for name in utils.NiceSort(rows.keys()): + row = rows[name] + + out = [] + + for field in self.op.output_fields: + if field == "node": + val = node + elif field in field_idx: + val = row[field_idx[field]] + else: + raise errors.ParameterError(field) + + out.append(val) + + result.append(out) + + return result + + +class LUModifyNodeStorage(NoHooksLU): + """Logical unit for modifying a storage volume on a node. + + """ + _OP_REQP = ["node_name", "storage_type", "name", "changes"] + REQ_BGL = False + + def CheckArguments(self): + node_name = self.cfg.ExpandNodeName(self.op.node_name) + if node_name is None: + raise errors.OpPrereqError("Invalid node name '%s'" % self.op.node_name) + + self.op.node_name = node_name + + storage_type = self.op.storage_type + if storage_type not in constants.VALID_STORAGE_FIELDS: + raise errors.OpPrereqError("Unknown storage type: %s" % storage_type) + + def ExpandNames(self): + self.needed_locks = { + locking.LEVEL_NODE: self.op.node_name, + } + + def CheckPrereq(self): + """Check prerequisites. + + """ + storage_type = self.op.storage_type + + try: + modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type] + except KeyError: + raise errors.OpPrereqError("Storage units of type '%s' can not be" + " modified" % storage_type) + + diff = set(self.op.changes.keys()) - modifiable + if diff: + raise errors.OpPrereqError("The following fields can not be modified for" + " storage units of type '%s': %r" % + (storage_type, list(diff))) + + def Exec(self, feedback_fn): + """Computes the list of nodes and their attributes. + + """ + st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type) + result = self.rpc.call_storage_modify(self.op.node_name, + self.op.storage_type, st_args, + self.op.name, self.op.changes) + result.Raise("Failed to modify storage unit '%s' on %s" % + (self.op.name, self.op.node_name)) + + class LUAddNode(LogicalUnit): """Logical unit for adding node to the cluster. @@ -2650,7 +2982,7 @@ class LUPowercycleNode(NoHooksLU): def ExpandNames(self): """Locking for PowercycleNode. - This is a last-resource option and shouldn't block on other + This is a last-resort option and shouldn't block on other jobs. Therefore, we grab no locks. """ @@ -2714,6 +3046,8 @@ class LUQueryClusterInfo(NoHooksLU): "master_netdev": cluster.master_netdev, "volume_group_name": cluster.volume_group_name, "file_storage_dir": cluster.file_storage_dir, + "ctime": cluster.ctime, + "mtime": cluster.mtime, } return result @@ -2785,19 +3119,24 @@ class LUActivateInstanceDisks(NoHooksLU): assert self.instance is not None, \ "Cannot retrieve locked instance %s" % self.op.instance_name _CheckNodeOnline(self, self.instance.primary_node) + if not hasattr(self.op, "ignore_size"): + self.op.ignore_size = False def Exec(self, feedback_fn): """Activate the disks. """ - disks_ok, disks_info = _AssembleInstanceDisks(self, self.instance) + disks_ok, disks_info = \ + _AssembleInstanceDisks(self, self.instance, + ignore_size=self.op.ignore_size) if not disks_ok: raise errors.OpExecError("Cannot activate block devices") return disks_info -def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False): +def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False, + ignore_size=False): """Prepare the block devices for an instance. This sets up the block devices on all nodes. @@ -2809,6 +3148,10 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False): @type ignore_secondaries: boolean @param ignore_secondaries: if true, errors on secondary nodes won't result in an error return from the function + @type ignore_size: boolean + @param ignore_size: if true, the current known size of the disk + will not be used during the disk activation, useful for cases + when the size is wrong @return: False if the operation failed, otherwise a list of (host, instance_visible_name, node_visible_name) with the mapping from node devices to instance devices @@ -2829,6 +3172,9 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False): # 1st pass, assemble on all nodes in secondary mode for inst_disk in instance.disks: for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node): + if ignore_size: + node_disk = node_disk.Copy() + node_disk.UnsetSize() lu.cfg.SetDiskID(node_disk, node) result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False) msg = result.fail_msg @@ -2846,6 +3192,9 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False): for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node): if node != instance.primary_node: continue + if ignore_size: + node_disk = node_disk.Copy() + node_disk.UnsetSize() lu.cfg.SetDiskID(node_disk, node) result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True) msg = result.fail_msg @@ -3302,6 +3651,89 @@ class LUReinstallInstance(LogicalUnit): _ShutdownInstanceDisks(self, inst) +class LURecreateInstanceDisks(LogicalUnit): + """Recreate an instance's missing disks. + + """ + HPATH = "instance-recreate-disks" + HTYPE = constants.HTYPE_INSTANCE + _OP_REQP = ["instance_name", "disks"] + REQ_BGL = False + + def CheckArguments(self): + """Check the arguments. + + """ + if not isinstance(self.op.disks, list): + raise errors.OpPrereqError("Invalid disks parameter") + for item in self.op.disks: + if (not isinstance(item, int) or + item < 0): + raise errors.OpPrereqError("Invalid disk specification '%s'" % + str(item)) + + def ExpandNames(self): + self._ExpandAndLockInstance() + + def BuildHooksEnv(self): + """Build hooks env. + + This runs on master, primary and secondary nodes of the instance. + + """ + env = _BuildInstanceHookEnvByObject(self, self.instance) + nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes) + return env, nl, nl + + def CheckPrereq(self): + """Check prerequisites. + + This checks that the instance is in the cluster and is not running. + + """ + instance = self.cfg.GetInstanceInfo(self.op.instance_name) + assert instance is not None, \ + "Cannot retrieve locked instance %s" % self.op.instance_name + _CheckNodeOnline(self, instance.primary_node) + + if instance.disk_template == constants.DT_DISKLESS: + raise errors.OpPrereqError("Instance '%s' has no disks" % + self.op.instance_name) + if instance.admin_up: + raise errors.OpPrereqError("Instance '%s' is marked to be up" % + self.op.instance_name) + remote_info = self.rpc.call_instance_info(instance.primary_node, + instance.name, + instance.hypervisor) + remote_info.Raise("Error checking node %s" % instance.primary_node, + prereq=True) + if remote_info.payload: + raise errors.OpPrereqError("Instance '%s' is running on the node %s" % + (self.op.instance_name, + instance.primary_node)) + + if not self.op.disks: + self.op.disks = range(len(instance.disks)) + else: + for idx in self.op.disks: + if idx >= len(instance.disks): + raise errors.OpPrereqError("Invalid disk index passed '%s'" % idx) + + self.instance = instance + + def Exec(self, feedback_fn): + """Recreate the disks. + + """ + to_skip = [] + for idx, disk in enumerate(self.instance.disks): + if idx not in self.op.disks: # disk idx has not been passed in + to_skip.append(idx) + continue + + _CreateDisks(self, self.instance, to_skip=to_skip) + + class LURenameInstance(LogicalUnit): """Rename an instance. @@ -3493,7 +3925,9 @@ class LUQueryInstances(NoHooksLU): r"(nic)\.(bridge)/([0-9]+)", r"(nic)\.(macs|ips|modes|links|bridges)", r"(disk|nic)\.(count)", - "serial_no", "hypervisor", "hvparams",] + + "serial_no", "hypervisor", "hvparams", + "ctime", "mtime", + ] + ["hv/%s" % name for name in constants.HVS_PARAMETERS] + ["be/%s" % name @@ -3678,6 +4112,10 @@ class LUQueryInstances(NoHooksLU): val = list(instance.GetTags()) elif field == "serial_no": val = instance.serial_no + elif field == "ctime": + val = instance.ctime + elif field == "mtime": + val = instance.mtime elif field == "network_port": val = instance.network_port elif field == "hypervisor": @@ -3755,26 +4193,204 @@ class LUQueryInstances(NoHooksLU): iout.append(val) output.append(iout) - return output + return output + + +class LUFailoverInstance(LogicalUnit): + """Failover an instance. + + """ + HPATH = "instance-failover" + HTYPE = constants.HTYPE_INSTANCE + _OP_REQP = ["instance_name", "ignore_consistency"] + REQ_BGL = False + + def ExpandNames(self): + self._ExpandAndLockInstance() + self.needed_locks[locking.LEVEL_NODE] = [] + self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE + + def DeclareLocks(self, level): + if level == locking.LEVEL_NODE: + self._LockInstancesNodes() + + def BuildHooksEnv(self): + """Build hooks env. + + This runs on master, primary and secondary nodes of the instance. + + """ + env = { + "IGNORE_CONSISTENCY": self.op.ignore_consistency, + } + env.update(_BuildInstanceHookEnvByObject(self, self.instance)) + nl = [self.cfg.GetMasterNode()] + list(self.instance.secondary_nodes) + return env, nl, nl + + def CheckPrereq(self): + """Check prerequisites. + + This checks that the instance is in the cluster. + + """ + self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name) + assert self.instance is not None, \ + "Cannot retrieve locked instance %s" % self.op.instance_name + + bep = self.cfg.GetClusterInfo().FillBE(instance) + if instance.disk_template not in constants.DTS_NET_MIRROR: + raise errors.OpPrereqError("Instance's disk layout is not" + " network mirrored, cannot failover.") + + secondary_nodes = instance.secondary_nodes + if not secondary_nodes: + raise errors.ProgrammerError("no secondary node but using " + "a mirrored disk template") + + target_node = secondary_nodes[0] + _CheckNodeOnline(self, target_node) + _CheckNodeNotDrained(self, target_node) + if instance.admin_up: + # check memory requirements on the secondary node + _CheckNodeFreeMemory(self, target_node, "failing over instance %s" % + instance.name, bep[constants.BE_MEMORY], + instance.hypervisor) + else: + self.LogInfo("Not checking memory on the secondary node as" + " instance will not be started") + + # check bridge existance + _CheckInstanceBridgesExist(self, instance, node=target_node) + + def Exec(self, feedback_fn): + """Failover an instance. + + The failover is done by shutting it down on its present node and + starting it on the secondary. + + """ + instance = self.instance + + source_node = instance.primary_node + target_node = instance.secondary_nodes[0] + + feedback_fn("* checking disk consistency between source and target") + for dev in instance.disks: + # for drbd, these are drbd over lvm + if not _CheckDiskConsistency(self, dev, target_node, False): + if instance.admin_up and not self.op.ignore_consistency: + raise errors.OpExecError("Disk %s is degraded on target node," + " aborting failover." % dev.iv_name) + + feedback_fn("* shutting down instance on source node") + logging.info("Shutting down instance %s on node %s", + instance.name, source_node) + + result = self.rpc.call_instance_shutdown(source_node, instance) + msg = result.fail_msg + if msg: + if self.op.ignore_consistency: + self.proc.LogWarning("Could not shutdown instance %s on node %s." + " Proceeding anyway. Please make sure node" + " %s is down. Error details: %s", + instance.name, source_node, source_node, msg) + else: + raise errors.OpExecError("Could not shutdown instance %s on" + " node %s: %s" % + (instance.name, source_node, msg)) + + feedback_fn("* deactivating the instance's disks on source node") + if not _ShutdownInstanceDisks(self, instance, ignore_primary=True): + raise errors.OpExecError("Can't shut down the instance's disks.") + + instance.primary_node = target_node + # distribute new instance config to the other nodes + self.cfg.Update(instance) + + # Only start the instance if it's marked as up + if instance.admin_up: + feedback_fn("* activating the instance's disks on target node") + logging.info("Starting instance %s on node %s", + instance.name, target_node) + + disks_ok, _ = _AssembleInstanceDisks(self, instance, + ignore_secondaries=True) + if not disks_ok: + _ShutdownInstanceDisks(self, instance) + raise errors.OpExecError("Can't activate the instance's disks") + + feedback_fn("* starting the instance on the target node") + result = self.rpc.call_instance_start(target_node, instance, None, None) + msg = result.fail_msg + if msg: + _ShutdownInstanceDisks(self, instance) + raise errors.OpExecError("Could not start instance %s on node %s: %s" % + (instance.name, target_node, msg)) + + +class LUMigrateInstance(LogicalUnit): + """Migrate an instance. + + This is migration without shutting down, compared to the failover, + which is done with shutdown. + + """ + HPATH = "instance-migrate" + HTYPE = constants.HTYPE_INSTANCE + _OP_REQP = ["instance_name", "live", "cleanup"] + + REQ_BGL = False + + def ExpandNames(self): + self._ExpandAndLockInstance() + + self.needed_locks[locking.LEVEL_NODE] = [] + self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE + + self._migrater = TLMigrateInstance(self, self.op.instance_name, + self.op.live, self.op.cleanup) + self.tasklets = [self._migrater] + + def DeclareLocks(self, level): + if level == locking.LEVEL_NODE: + self._LockInstancesNodes() + + def BuildHooksEnv(self): + """Build hooks env. + + This runs on master, primary and secondary nodes of the instance. + + """ + instance = self._migrater.instance + env = _BuildInstanceHookEnvByObject(self, instance) + env["MIGRATE_LIVE"] = self.op.live + env["MIGRATE_CLEANUP"] = self.op.cleanup + nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes) + return env, nl, nl -class LUFailoverInstance(LogicalUnit): - """Failover an instance. +class LUMoveInstance(LogicalUnit): + """Move an instance by data-copying. """ - HPATH = "instance-failover" + HPATH = "instance-move" HTYPE = constants.HTYPE_INSTANCE - _OP_REQP = ["instance_name", "ignore_consistency"] + _OP_REQP = ["instance_name", "target_node"] REQ_BGL = False def ExpandNames(self): self._ExpandAndLockInstance() - self.needed_locks[locking.LEVEL_NODE] = [] - self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE + target_node = self.cfg.ExpandNodeName(self.op.target_node) + if target_node is None: + raise errors.OpPrereqError("Node '%s' not known" % + self.op.target_node) + self.op.target_node = target_node + self.needed_locks[locking.LEVEL_NODE] = [target_node] + self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND def DeclareLocks(self, level): if level == locking.LEVEL_NODE: - self._LockInstancesNodes() + self._LockInstancesNodes(primary_only=True) def BuildHooksEnv(self): """Build hooks env. @@ -3783,10 +4399,11 @@ class LUFailoverInstance(LogicalUnit): """ env = { - "IGNORE_CONSISTENCY": self.op.ignore_consistency, + "TARGET_NODE": self.op.target_node, } env.update(_BuildInstanceHookEnvByObject(self, self.instance)) - nl = [self.cfg.GetMasterNode()] + list(self.instance.secondary_nodes) + nl = [self.cfg.GetMasterNode()] + [self.instance.primary_node, + self.op.target_node] return env, nl, nl def CheckPrereq(self): @@ -3799,19 +4416,26 @@ class LUFailoverInstance(LogicalUnit): assert self.instance is not None, \ "Cannot retrieve locked instance %s" % self.op.instance_name + node = self.cfg.GetNodeInfo(self.op.target_node) + assert node is not None, \ + "Cannot retrieve locked node %s" % self.op.target_node + + self.target_node = target_node = node.name + + if target_node == instance.primary_node: + raise errors.OpPrereqError("Instance %s is already on the node %s" % + (instance.name, target_node)) + bep = self.cfg.GetClusterInfo().FillBE(instance) - if instance.disk_template not in constants.DTS_NET_MIRROR: - raise errors.OpPrereqError("Instance's disk layout is not" - " network mirrored, cannot failover.") - secondary_nodes = instance.secondary_nodes - if not secondary_nodes: - raise errors.ProgrammerError("no secondary node but using " - "a mirrored disk template") + for idx, dsk in enumerate(instance.disks): + if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE): + raise errors.OpPrereqError("Instance disk %d has a complex layout," + " cannot copy") - target_node = secondary_nodes[0] _CheckNodeOnline(self, target_node) _CheckNodeNotDrained(self, target_node) + if instance.admin_up: # check memory requirements on the secondary node _CheckNodeFreeMemory(self, target_node, "failing over instance %s" % @@ -3825,27 +4449,18 @@ class LUFailoverInstance(LogicalUnit): _CheckInstanceBridgesExist(self, instance, node=target_node) def Exec(self, feedback_fn): - """Failover an instance. + """Move an instance. - The failover is done by shutting it down on its present node and - starting it on the secondary. + The move is done by shutting it down on its present node, copying + the data over (slow) and starting it on the new node. """ instance = self.instance source_node = instance.primary_node - target_node = instance.secondary_nodes[0] - - feedback_fn("* checking disk consistency between source and target") - for dev in instance.disks: - # for drbd, these are drbd over lvm - if not _CheckDiskConsistency(self, dev, target_node, False): - if instance.admin_up and not self.op.ignore_consistency: - raise errors.OpExecError("Disk %s is degraded on target node," - " aborting failover." % dev.iv_name) + target_node = self.target_node - feedback_fn("* shutting down instance on source node") - logging.info("Shutting down instance %s on node %s", + self.LogInfo("Shutting down instance %s on source node %s", instance.name, source_node) result = self.rpc.call_instance_shutdown(source_node, instance) @@ -3861,27 +4476,66 @@ class LUFailoverInstance(LogicalUnit): " node %s: %s" % (instance.name, source_node, msg)) - feedback_fn("* deactivating the instance's disks on source node") - if not _ShutdownInstanceDisks(self, instance, ignore_primary=True): - raise errors.OpExecError("Can't shut down the instance's disks.") + # create the target disks + try: + _CreateDisks(self, instance, target_node=target_node) + except errors.OpExecError: + self.LogWarning("Device creation failed, reverting...") + try: + _RemoveDisks(self, instance, target_node=target_node) + finally: + self.cfg.ReleaseDRBDMinors(instance.name) + raise + + cluster_name = self.cfg.GetClusterInfo().cluster_name + + errs = [] + # activate, get path, copy the data over + for idx, disk in enumerate(instance.disks): + self.LogInfo("Copying data for disk %d", idx) + result = self.rpc.call_blockdev_assemble(target_node, disk, + instance.name, True) + if result.fail_msg: + self.LogWarning("Can't assemble newly created disk %d: %s", + idx, result.fail_msg) + errs.append(result.fail_msg) + break + dev_path = result.payload + result = self.rpc.call_blockdev_export(source_node, disk, + target_node, dev_path, + cluster_name) + if result.fail_msg: + self.LogWarning("Can't copy data over for disk %d: %s", + idx, result.fail_msg) + errs.append(result.fail_msg) + break + + if errs: + self.LogWarning("Some disks failed to copy, aborting") + try: + _RemoveDisks(self, instance, target_node=target_node) + finally: + self.cfg.ReleaseDRBDMinors(instance.name) + raise errors.OpExecError("Errors during disk copy: %s" % + (",".join(errs),)) instance.primary_node = target_node - # distribute new instance config to the other nodes self.cfg.Update(instance) + self.LogInfo("Removing the disks on the original node") + _RemoveDisks(self, instance, target_node=source_node) + # Only start the instance if it's marked as up if instance.admin_up: - feedback_fn("* activating the instance's disks on target node") - logging.info("Starting instance %s on node %s", + self.LogInfo("Starting instance %s on node %s", instance.name, target_node) disks_ok, _ = _AssembleInstanceDisks(self, instance, - ignore_secondaries=True) + ignore_secondaries=True) if not disks_ok: _ShutdownInstanceDisks(self, instance) raise errors.OpExecError("Can't activate the instance's disks") - feedback_fn("* starting the instance on the target node") result = self.rpc.call_instance_start(target_node, instance, None, None) msg = result.fail_msg if msg: @@ -3890,23 +4544,40 @@ class LUFailoverInstance(LogicalUnit): (instance.name, target_node, msg)) -class LUMigrateInstance(LogicalUnit): - """Migrate an instance. - - This is migration without shutting down, compared to the failover, - which is done with shutdown. +class LUMigrateNode(LogicalUnit): + """Migrate all instances from a node. """ - HPATH = "instance-migrate" - HTYPE = constants.HTYPE_INSTANCE - _OP_REQP = ["instance_name", "live", "cleanup"] - + HPATH = "node-migrate" + HTYPE = constants.HTYPE_NODE + _OP_REQP = ["node_name", "live"] REQ_BGL = False def ExpandNames(self): - self._ExpandAndLockInstance() - self.needed_locks[locking.LEVEL_NODE] = [] - self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE + self.op.node_name = self.cfg.ExpandNodeName(self.op.node_name) + if self.op.node_name is None: + raise errors.OpPrereqError("Node '%s' not known" % self.op.node_name) + + self.needed_locks = { + locking.LEVEL_NODE: [self.op.node_name], + } + + self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND + + # Create tasklets for migrating instances for all instances on this node + names = [] + tasklets = [] + + for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name): + logging.debug("Migrating instance %s", inst.name) + names.append(inst.name) + + tasklets.append(TLMigrateInstance(self, inst.name, self.op.live, False)) + + self.tasklets = tasklets + + # Declare instance locks + self.needed_locks[locking.LEVEL_INSTANCE] = names def DeclareLocks(self, level): if level == locking.LEVEL_NODE: @@ -3915,14 +4586,29 @@ class LUMigrateInstance(LogicalUnit): def BuildHooksEnv(self): """Build hooks env. - This runs on master, primary and secondary nodes of the instance. + This runs on the master, the primary and all the secondaries. """ - env = _BuildInstanceHookEnvByObject(self, self.instance) - env["MIGRATE_LIVE"] = self.op.live - env["MIGRATE_CLEANUP"] = self.op.cleanup - nl = [self.cfg.GetMasterNode()] + list(self.instance.secondary_nodes) - return env, nl, nl + env = { + "NODE_NAME": self.op.node_name, + } + + nl = [self.cfg.GetMasterNode()] + + return (env, nl, nl) + + +class TLMigrateInstance(Tasklet): + def __init__(self, lu, instance_name, live, cleanup): + """Initializes this class. + + """ + Tasklet.__init__(self, lu) + + # Parameters + self.instance_name = instance_name + self.live = live + self.cleanup = cleanup def CheckPrereq(self): """Check prerequisites. @@ -3931,10 +4617,10 @@ class LUMigrateInstance(LogicalUnit): """ instance = self.cfg.GetInstanceInfo( - self.cfg.ExpandInstanceName(self.op.instance_name)) + self.cfg.ExpandInstanceName(self.instance_name)) if instance is None: raise errors.OpPrereqError("Instance '%s' not known" % - self.op.instance_name) + self.instance_name) if instance.disk_template != constants.DT_DRBD8: raise errors.OpPrereqError("Instance's disk layout is not" @@ -3956,7 +4642,7 @@ class LUMigrateInstance(LogicalUnit): # check bridge existance _CheckInstanceBridgesExist(self, instance, node=target_node) - if not self.op.cleanup: + if not self.cleanup: _CheckNodeNotDrained(self, target_node) result = self.rpc.call_instance_migratable(instance.primary_node, instance) @@ -4103,10 +4789,10 @@ class LUMigrateInstance(LogicalUnit): self._GoReconnect(False) self._WaitUntilSync() except errors.OpExecError, err: - self.LogWarning("Migration failed and I can't reconnect the" - " drives: error '%s'\n" - "Please look and recover the instance status" % - str(err)) + self.lu.LogWarning("Migration failed and I can't reconnect the" + " drives: error '%s'\n" + "Please look and recover the instance status" % + str(err)) def _AbortMigration(self): """Call the hypervisor code to abort a started migration. @@ -4186,7 +4872,7 @@ class LUMigrateInstance(LogicalUnit): time.sleep(10) result = self.rpc.call_instance_migrate(source_node, instance, self.nodes_ip[target_node], - self.op.live) + self.live) msg = result.fail_msg if msg: logging.error("Instance migration failed, trying to revert" @@ -4224,6 +4910,8 @@ class LUMigrateInstance(LogicalUnit): """Perform the migration. """ + feedback_fn("Migrating instance %s" % self.instance.name) + self.feedback_fn = feedback_fn self.source_node = self.instance.primary_node @@ -4233,7 +4921,8 @@ class LUMigrateInstance(LogicalUnit): self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip, self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip, } - if self.op.cleanup: + + if self.cleanup: return self._ExecCleanup() else: return self._ExecMigration() @@ -4418,7 +5107,7 @@ def _GetInstanceInfoText(instance): return "originstname+%s" % instance.name -def _CreateDisks(lu, instance): +def _CreateDisks(lu, instance, to_skip=None, target_node=None): """Create all disks for an instance. This abstracts away some work from AddInstance. @@ -4427,12 +5116,21 @@ def _CreateDisks(lu, instance): @param lu: the logical unit on whose behalf we execute @type instance: L{objects.Instance} @param instance: the instance whose disks we should create + @type to_skip: list + @param to_skip: list of indices to skip + @type target_node: string + @param target_node: if passed, overrides the target node for creation @rtype: boolean @return: the success of the creation """ info = _GetInstanceInfoText(instance) - pnode = instance.primary_node + if target_node is None: + pnode = instance.primary_node + all_nodes = instance.all_nodes + else: + pnode = target_node + all_nodes = [pnode] if instance.disk_template == constants.DT_FILE: file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1]) @@ -4443,16 +5141,18 @@ def _CreateDisks(lu, instance): # Note: this needs to be kept in sync with adding of disks in # LUSetInstanceParams - for device in instance.disks: + for idx, device in enumerate(instance.disks): + if to_skip and idx in to_skip: + continue logging.info("Creating volume %s for instance %s", device.iv_name, instance.name) #HARDCODE - for node in instance.all_nodes: + for node in all_nodes: f_create = node == pnode _CreateBlockDev(lu, node, instance, device, f_create, info, f_create) -def _RemoveDisks(lu, instance): +def _RemoveDisks(lu, instance, target_node=None): """Remove all disks for an instance. This abstracts away some work from `AddInstance()` and @@ -4464,6 +5164,8 @@ def _RemoveDisks(lu, instance): @param lu: the logical unit on whose behalf we execute @type instance: L{objects.Instance} @param instance: the instance whose disks we should remove + @type target_node: string + @param target_node: used to override the node on which to remove the disks @rtype: boolean @return: the success of the removal @@ -4472,7 +5174,11 @@ def _RemoveDisks(lu, instance): all_result = True for device in instance.disks: - for node, disk in device.ComputeNodeTree(instance.primary_node): + if target_node: + edata = [(target_node, device)] + else: + edata = device.ComputeNodeTree(instance.primary_node) + for node, disk in edata: lu.cfg.SetDiskID(disk, node) msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg if msg: @@ -4482,12 +5188,14 @@ def _RemoveDisks(lu, instance): if instance.disk_template == constants.DT_FILE: file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1]) - result = lu.rpc.call_file_storage_dir_remove(instance.primary_node, - file_storage_dir) - msg = result.fail_msg - if msg: + if target_node is node: + tgt = instance.primary_node + else: + tgt = instance.target_node + result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir) + if result.fail_msg: lu.LogWarning("Could not remove directory '%s' on node %s: %s", - file_storage_dir, instance.primary_node, msg) + file_storage_dir, instance.primary_node, result.fail_msg) all_result = False return all_result @@ -5190,8 +5898,8 @@ class LUReplaceDisks(LogicalUnit): if not hasattr(self.op, "iallocator"): self.op.iallocator = None - _DiskReplacer.CheckArguments(self.op.mode, self.op.remote_node, - self.op.iallocator) + TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node, + self.op.iallocator) def ExpandNames(self): self._ExpandAndLockInstance() @@ -5218,9 +5926,11 @@ class LUReplaceDisks(LogicalUnit): self.needed_locks[locking.LEVEL_NODE] = [] self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE - self.replacer = _DiskReplacer(self, self.op.instance_name, self.op.mode, - self.op.iallocator, self.op.remote_node, - self.op.disks) + self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode, + self.op.iallocator, self.op.remote_node, + self.op.disks) + + self.tasklets = [self.replacer] def DeclareLocks(self, level): # If we're not already locking all nodes in the set we have to declare the @@ -5250,24 +5960,101 @@ class LUReplaceDisks(LogicalUnit): nl.append(self.op.remote_node) return env, nl, nl - def CheckPrereq(self): - """Check prerequisites. - This checks that the instance is in the cluster. +class LUEvacuateNode(LogicalUnit): + """Relocate the secondary instances from a node. - """ - self.replacer.CheckPrereq() + """ + HPATH = "node-evacuate" + HTYPE = constants.HTYPE_NODE + _OP_REQP = ["node_name"] + REQ_BGL = False - def Exec(self, feedback_fn): - """Execute disk replacement. + def CheckArguments(self): + if not hasattr(self.op, "remote_node"): + self.op.remote_node = None + if not hasattr(self.op, "iallocator"): + self.op.iallocator = None - This dispatches the disk replacement to the appropriate handler. + TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG, + self.op.remote_node, + self.op.iallocator) + + def ExpandNames(self): + self.op.node_name = self.cfg.ExpandNodeName(self.op.node_name) + if self.op.node_name is None: + raise errors.OpPrereqError("Node '%s' not known" % self.op.node_name) + + self.needed_locks = {} + + # Declare node locks + if self.op.iallocator is not None: + self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET + + elif self.op.remote_node is not None: + remote_node = self.cfg.ExpandNodeName(self.op.remote_node) + if remote_node is None: + raise errors.OpPrereqError("Node '%s' not known" % + self.op.remote_node) + + self.op.remote_node = remote_node + + # Warning: do not remove the locking of the new secondary here + # unless DRBD8.AddChildren is changed to work in parallel; + # currently it doesn't since parallel invocations of + # FindUnusedMinor will conflict + self.needed_locks[locking.LEVEL_NODE] = [remote_node] + self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND + + else: + raise errors.OpPrereqError("Invalid parameters") + + # Create tasklets for replacing disks for all secondary instances on this + # node + names = [] + tasklets = [] + + for inst in _GetNodeSecondaryInstances(self.cfg, self.op.node_name): + logging.debug("Replacing disks for instance %s", inst.name) + names.append(inst.name) + + replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG, + self.op.iallocator, self.op.remote_node, []) + tasklets.append(replacer) + + self.tasklets = tasklets + self.instance_names = names + + # Declare instance locks + self.needed_locks[locking.LEVEL_INSTANCE] = self.instance_names + + def DeclareLocks(self, level): + # If we're not already locking all nodes in the set we have to declare the + # instance's primary/secondary nodes. + if (level == locking.LEVEL_NODE and + self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET): + self._LockInstancesNodes() + + def BuildHooksEnv(self): + """Build hooks env. + + This runs on the master, the primary and all the secondaries. """ - self.replacer.Exec() + env = { + "NODE_NAME": self.op.node_name, + } + + nl = [self.cfg.GetMasterNode()] + + if self.op.remote_node is not None: + env["NEW_SECONDARY"] = self.op.remote_node + nl.append(self.op.remote_node) + return (env, nl, nl) -class _DiskReplacer: + +class TLReplaceDisks(Tasklet): """Replaces disks for an instance. Note: Locking is not within the scope of this class. @@ -5278,18 +6065,15 @@ class _DiskReplacer: """Initializes this class. """ + Tasklet.__init__(self, lu) + # Parameters - self.lu = lu self.instance_name = instance_name self.mode = mode self.iallocator_name = iallocator_name self.remote_node = remote_node self.disks = disks - # Shortcuts - self.cfg = lu.cfg - self.rpc = lu.rpc - # Runtime data self.instance = None self.new_node = None @@ -5300,21 +6084,25 @@ class _DiskReplacer: @staticmethod def CheckArguments(mode, remote_node, iallocator): + """Helper function for users of this class. + + """ # check for valid parameter combination - cnt = [remote_node, iallocator].count(None) if mode == constants.REPLACE_DISK_CHG: - if cnt == 2: + if remote_node is None and iallocator is None: raise errors.OpPrereqError("When changing the secondary either an" " iallocator script must be used or the" " new node given") - elif cnt == 0: + + if remote_node is not None and iallocator is not None: raise errors.OpPrereqError("Give either the iallocator or the new" " secondary, not both") - else: # not replacing the secondary - if cnt != 2: - raise errors.OpPrereqError("The iallocator and new node options can" - " be used only when changing the" - " secondary node") + + elif remote_node is not None or iallocator is not None: + # Not replacing the secondary + raise errors.OpPrereqError("The iallocator and new node options can" + " only be used when changing the" + " secondary node") @staticmethod def _RunAllocator(lu, iallocator_name, instance_name, relocate_from): @@ -5344,6 +6132,10 @@ class _DiskReplacer: return remote_node_name + def _FindFaultyDisks(self, node_name): + return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance, + node_name, True) + def CheckPrereq(self): """Check prerequisites. @@ -5386,35 +6178,64 @@ class _DiskReplacer: raise errors.OpPrereqError("The specified node is already the" " secondary node of the instance.") - if self.mode == constants.REPLACE_DISK_PRI: - self.target_node = self.instance.primary_node - self.other_node = secondary_node - check_nodes = [self.target_node, self.other_node] + if self.disks and self.mode in (constants.REPLACE_DISK_AUTO, + constants.REPLACE_DISK_CHG): + raise errors.OpPrereqError("Cannot specify disks to be replaced") + + if self.mode == constants.REPLACE_DISK_AUTO: + faulty_primary = self._FindFaultyDisks(self.instance.primary_node) + faulty_secondary = self._FindFaultyDisks(secondary_node) + + if faulty_primary and faulty_secondary: + raise errors.OpPrereqError("Instance %s has faulty disks on more than" + " one node and can not be repaired" + " automatically" % self.instance_name) + + if faulty_primary: + self.disks = faulty_primary + self.target_node = self.instance.primary_node + self.other_node = secondary_node + check_nodes = [self.target_node, self.other_node] + elif faulty_secondary: + self.disks = faulty_secondary + self.target_node = secondary_node + self.other_node = self.instance.primary_node + check_nodes = [self.target_node, self.other_node] + else: + self.disks = [] + check_nodes = [] + + else: + # Non-automatic modes + if self.mode == constants.REPLACE_DISK_PRI: + self.target_node = self.instance.primary_node + self.other_node = secondary_node + check_nodes = [self.target_node, self.other_node] - elif self.mode == constants.REPLACE_DISK_SEC: - self.target_node = secondary_node - self.other_node = self.instance.primary_node - check_nodes = [self.target_node, self.other_node] + elif self.mode == constants.REPLACE_DISK_SEC: + self.target_node = secondary_node + self.other_node = self.instance.primary_node + check_nodes = [self.target_node, self.other_node] - elif self.mode == constants.REPLACE_DISK_CHG: - self.new_node = remote_node - self.other_node = self.instance.primary_node - self.target_node = secondary_node - check_nodes = [self.new_node, self.other_node] + elif self.mode == constants.REPLACE_DISK_CHG: + self.new_node = remote_node + self.other_node = self.instance.primary_node + self.target_node = secondary_node + check_nodes = [self.new_node, self.other_node] - _CheckNodeNotDrained(self.lu, remote_node) + _CheckNodeNotDrained(self.lu, remote_node) - else: - raise errors.ProgrammerError("Unhandled disk replace mode (%s)" % - self.mode) + else: + raise errors.ProgrammerError("Unhandled disk replace mode (%s)" % + self.mode) + + # If not specified all disks should be replaced + if not self.disks: + self.disks = range(len(self.instance.disks)) for node in check_nodes: _CheckNodeOnline(self.lu, node) - # If not specified all disks should be replaced - if not self.disks: - self.disks = range(len(self.instance.disks)) - # Check whether disks are valid for disk_idx in self.disks: self.instance.FindDisk(disk_idx) @@ -5428,12 +6249,19 @@ class _DiskReplacer: self.node_secondary_ip = node_2nd_ip - def Exec(self): + def Exec(self, feedback_fn): """Execute disk replacement. This dispatches the disk replacement to the appropriate handler. """ + if not self.disks: + feedback_fn("No disks need replacement") + return + + feedback_fn("Replacing disk(s) %s for %s" % + (", ".join([str(i) for i in self.disks]), self.instance.name)) + activate_disks = (not self.instance.admin_up) # Activate the instance disks if we're replacing them on a down instance @@ -5441,7 +6269,8 @@ class _DiskReplacer: _StartInstanceDisks(self.lu, self.instance, True) try: - if self.mode == constants.REPLACE_DISK_CHG: + # Should we replace the secondary node? + if self.new_node is not None: return self._ExecDrbd8Secondary() else: return self._ExecDrbd8DiskOnly() @@ -5545,7 +6374,7 @@ class _DiskReplacer: raise errors.OpExecError("Can't find DRBD device %s: %s" % (name, msg)) - if result.payload[5]: + if result.payload.is_degraded: raise errors.OpExecError("DRBD device %s is degraded!" % name) def _RemoveOldStorage(self, node_name, iv_names): @@ -5814,6 +6643,62 @@ class _DiskReplacer: self._RemoveOldStorage(self.target_node, iv_names) +class LURepairNodeStorage(NoHooksLU): + """Repairs the volume group on a node. + + """ + _OP_REQP = ["node_name"] + REQ_BGL = False + + def CheckArguments(self): + node_name = self.cfg.ExpandNodeName(self.op.node_name) + if node_name is None: + raise errors.OpPrereqError("Invalid node name '%s'" % self.op.node_name) + + self.op.node_name = node_name + + def ExpandNames(self): + self.needed_locks = { + locking.LEVEL_NODE: [self.op.node_name], + } + + def _CheckFaultyDisks(self, instance, node_name): + if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance, + node_name, True): + raise errors.OpPrereqError("Instance '%s' has faulty disks on" + " node '%s'" % (inst.name, node_name)) + + def CheckPrereq(self): + """Check prerequisites. + + """ + storage_type = self.op.storage_type + + if (constants.SO_FIX_CONSISTENCY not in + constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])): + raise errors.OpPrereqError("Storage units of type '%s' can not be" + " repaired" % storage_type) + + # Check whether any instance on this node has faulty disks + for inst in _GetNodeInstances(self.cfg, self.op.node_name): + check_nodes = set(inst.all_nodes) + check_nodes.discard(self.op.node_name) + for inst_node_name in check_nodes: + self._CheckFaultyDisks(inst, inst_node_name) + + def Exec(self, feedback_fn): + feedback_fn("Repairing storage unit '%s' on %s ..." % + (self.op.name, self.op.node_name)) + + st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type) + result = self.rpc.call_storage_execute(self.op.node_name, + self.op.storage_type, st_args, + self.op.name, + constants.SO_FIX_CONSISTENCY) + result.Raise("Failed to repair storage unit '%s' on %s" % + (self.op.name, self.op.node_name)) + + class LUGrowDisk(LogicalUnit): """Grow a disk of an instance. @@ -5950,22 +6835,33 @@ class LUQueryInstanceData(NoHooksLU): in self.wanted_names] return + def _ComputeBlockdevStatus(self, node, instance_name, dev): + """Returns the status of a block device + + """ + if self.op.static or not node: + return None + + self.cfg.SetDiskID(dev, node) + + result = self.rpc.call_blockdev_find(node, dev) + if result.offline: + return None + + result.Raise("Can't compute disk status for %s" % instance_name) + + status = result.payload + if status is None: + return None + + return (status.dev_path, status.major, status.minor, + status.sync_percent, status.estimated_time, + status.is_degraded, status.ldisk_status) + def _ComputeDiskStatus(self, instance, snode, dev): """Compute block device status. """ - static = self.op.static - if not static: - self.cfg.SetDiskID(dev, instance.primary_node) - dev_pstatus = self.rpc.call_blockdev_find(instance.primary_node, dev) - if dev_pstatus.offline: - dev_pstatus = None - else: - dev_pstatus.Raise("Can't compute disk status for %s" % instance.name) - dev_pstatus = dev_pstatus.payload - else: - dev_pstatus = None - if dev.dev_type in constants.LDS_DRBD: # we change the snode then (otherwise we use the one passed in) if dev.logical_id[0] == instance.primary_node: @@ -5973,16 +6869,9 @@ class LUQueryInstanceData(NoHooksLU): else: snode = dev.logical_id[0] - if snode and not static: - self.cfg.SetDiskID(dev, snode) - dev_sstatus = self.rpc.call_blockdev_find(snode, dev) - if dev_sstatus.offline: - dev_sstatus = None - else: - dev_sstatus.Raise("Can't compute disk status for %s" % instance.name) - dev_sstatus = dev_sstatus.payload - else: - dev_sstatus = None + dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node, + instance.name, dev) + dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev) if dev.children: dev_children = [self._ComputeDiskStatus(instance, snode, child) @@ -6047,6 +6936,9 @@ class LUQueryInstanceData(NoHooksLU): "hv_actual": cluster.FillHV(instance), "be_instance": instance.beparams, "be_actual": cluster.FillBE(instance), + "serial_no": instance.serial_no, + "mtime": instance.mtime, + "ctime": instance.ctime, } result[instance.name] = idict @@ -6694,6 +7586,8 @@ class LUExportInstance(LogicalUnit): for disk in instance.disks: self.cfg.SetDiskID(disk, src_node) + # per-disk results + dresults = [] try: for idx, disk in enumerate(instance.disks): # result.payload will be a snapshot of an lvm leaf of the one we passed @@ -6729,16 +7623,23 @@ class LUExportInstance(LogicalUnit): if msg: self.LogWarning("Could not export disk/%s from node %s to" " node %s: %s", idx, src_node, dst_node.name, msg) + dresults.append(False) + else: + dresults.append(True) msg = self.rpc.call_blockdev_remove(src_node, dev).fail_msg if msg: self.LogWarning("Could not remove snapshot for disk/%d from node" " %s: %s", idx, src_node, msg) + else: + dresults.append(False) result = self.rpc.call_finalize_export(dst_node.name, instance, snap_disks) + fin_resu = True msg = result.fail_msg if msg: self.LogWarning("Could not finalize export for instance %s" " on node %s: %s", instance.name, dst_node.name, msg) + fin_resu = False nodelist = self.cfg.GetNodeList() nodelist.remove(dst_node.name) @@ -6757,6 +7658,7 @@ class LUExportInstance(LogicalUnit): if msg: self.LogWarning("Could not remove older export for instance %s" " on node %s: %s", iname, node, msg) + return fin_resu, dresults class LURemoveExport(NoHooksLU): @@ -7120,11 +8022,12 @@ class IAllocator(object): "master_candidate": ninfo.master_candidate, } - if not ninfo.offline: + if not (ninfo.offline or ninfo.drained): nresult.Raise("Can't get data for node %s" % nname) node_iinfo[nname].Raise("Can't get node instance info from node %s" % nname) remote_info = nresult.payload + for attr in ['memory_total', 'memory_free', 'memory_dom0', 'vg_size', 'vg_free', 'cpu_total']: if attr not in remote_info: