+ REQ_BGL = False
+
def ExpandNames(self):
  """Declare the initial locks and set up the migration tasklet.

  After calling _ExpandAndLockInstance, the node-level lock list is left
  empty and flagged with LOCKS_REPLACE for recalculation.  One
  TLMigrateInstance tasklet is built from the opcode's instance_name,
  live and cleanup fields and registered as the only tasklet.

  """
  self._ExpandAndLockInstance()

  node_level = locking.LEVEL_NODE
  self.needed_locks[node_level] = []
  self.recalculate_locks[node_level] = constants.LOCKS_REPLACE

  opcode = self.op
  migrater = TLMigrateInstance(self, opcode.instance_name,
                               opcode.live, opcode.cleanup)
  # Also kept as an attribute: BuildHooksEnv reads the instance from it
  self._migrater = migrater
  self.tasklets = [migrater]
+
def DeclareLocks(self, level):
  """Delegate to _LockInstancesNodes at the node locking level.

  Every other locking level is ignored.

  """
  if level != locking.LEVEL_NODE:
    return

  self._LockInstancesNodes()
+
def BuildHooksEnv(self):
  """Build the hooks environment and the hook node lists.

  The environment is whatever _BuildInstanceHookEnvByObject returns for
  the instance held by the migration tasklet, extended with the
  MIGRATE_LIVE and MIGRATE_CLEANUP opcode flags.  The same node list (the
  master node followed by the instance's secondary nodes) is returned
  twice.

  NOTE(review): the previous docstring stated that the hooks also run on
  the primary node, but the primary node is not added to the list here --
  confirm which one is intended.

  """
  inst = self._migrater.instance

  hooks_env = _BuildInstanceHookEnvByObject(self, inst)
  opcode = self.op
  hooks_env["MIGRATE_LIVE"] = opcode.live
  hooks_env["MIGRATE_CLEANUP"] = opcode.cleanup

  node_list = [self.cfg.GetMasterNode()]
  node_list.extend(inst.secondary_nodes)

  return hooks_env, node_list, node_list
+
+
class LUMigrateNode(LogicalUnit):
  """Migrate all instances from a node.

  One TLMigrateInstance tasklet is created for every instance returned by
  _GetNodePrimaryInstances for the requested node.

  """
  HPATH = "node-migrate"
  HTYPE = constants.HTYPE_NODE
  _OP_REQP = ["node_name", "live"]
  REQ_BGL = False

  def ExpandNames(self):
    """Expand the node name, declare locks and create the tasklets.

    Raises errors.OpPrereqError if the configuration cannot expand the
    requested node name.

    """
    # Keep the name as given by the user: the expansion result is None for
    # an unknown node, which would make the error message useless
    requested_name = self.op.node_name
    self.op.node_name = self.cfg.ExpandNodeName(requested_name)
    if self.op.node_name is None:
      raise errors.OpPrereqError("Node '%s' not known" % requested_name)

    self.needed_locks = {
      locking.LEVEL_NODE: [self.op.node_name],
      }

    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND

    # Create tasklets for migrating instances for all instances on this node
    names = []
    tasklets = []

    for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
      logging.debug("Migrating instance %s", inst.name)
      names.append(inst.name)

      # The last argument is the tasklet's cleanup flag; node migration
      # never runs in cleanup mode
      tasklets.append(TLMigrateInstance(self, inst.name, self.op.live, False))

    self.tasklets = tasklets

    # Declare instance locks
    self.needed_locks[locking.LEVEL_INSTANCE] = names

  def DeclareLocks(self, level):
    """Delegate to _LockInstancesNodes at the node locking level.

    Every other locking level is ignored.

    """
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()

  def BuildHooksEnv(self):
    """Build hooks env.

    The environment only carries the node name, and the node list used
    for both remaining tuple slots contains only the master node.

    """
    env = {
      "NODE_NAME": self.op.node_name,
      }

    nl = [self.cfg.GetMasterNode()]

    return (env, nl, nl)
+
+
+class TLMigrateInstance(Tasklet):
def __init__(self, lu, instance_name, live, cleanup):
  """Remember the migration parameters.

  The logical unit is handed to the Tasklet base initializer; the other
  arguments are stored unchanged as attributes of the same name.

  """
  Tasklet.__init__(self, lu)

  # Settings consulted by the other methods of this tasklet
  self.cleanup = cleanup
  self.live = live
  self.instance_name = instance_name
+
def CheckPrereq(self):
  """Check prerequisites.

  Verifies, in this order, that:
    - the instance is known to the configuration
    - its disk template is drbd8
    - it has at least one secondary node (the first one is the target)
    - the memory and bridge checks pass for the target node
    - unless in cleanup mode: the target node passes the "not drained"
      check and the primary node reports the instance as migratable

  On success the instance object is stored in self.instance.

  Raises errors.OpPrereqError for an unknown instance or a non-drbd8 disk
  template, and errors.ConfigurationError if there is no secondary node.

  """
  full_name = self.cfg.ExpandInstanceName(self.instance_name)
  inst = self.cfg.GetInstanceInfo(full_name)
  if inst is None:
    raise errors.OpPrereqError("Instance '%s' not known" %
                               self.instance_name)

  if inst.disk_template != constants.DT_DRBD8:
    raise errors.OpPrereqError("Instance's disk layout is not"
                               " drbd8, cannot migrate.")

  secondaries = inst.secondary_nodes
  if not secondaries:
    raise errors.ConfigurationError("No secondary node but using"
                                    " drbd8 disk template")

  be_params = self.cfg.GetClusterInfo().FillBE(inst)

  migration_target = secondaries[0]

  # check memory requirements on the secondary node
  memory_reason = "migrating instance %s" % inst.name
  _CheckNodeFreeMemory(self, migration_target, memory_reason,
                       be_params[constants.BE_MEMORY], inst.hypervisor)

  # check bridge existence
  _CheckInstanceBridgesExist(self, inst, node=migration_target)

  if not self.cleanup:
    _CheckNodeNotDrained(self, migration_target)
    migratable = self.rpc.call_instance_migratable(inst.primary_node, inst)
    migratable.Raise("Can't migrate, please use failover", prereq=True)

  self.instance = inst
+
+ def _WaitUntilSync(self):
+ """Poll with custom rpc for disk sync.
+
+ This uses our own step-based rpc call.
+
+ """
+ self.feedback_fn("* wait until resync is done")
+ all_done = False
+ while not all_done:
+ all_done = True
+ result = self.rpc.call_drbd_wait_sync(self.all_nodes,
+ self.nodes_ip,
+ self.instance.disks)
+ min_percent = 100
+ for node, nres in result.items():
+ nres.Raise("Cannot resync disks on node %s" % node)
+ node_done, node_percent = nres.payload
+ all_done = all_done and node_done
+ if node_percent is not None:
+ min_percent = min(min_percent, node_percent)
+ if not all_done:
+ if min_percent < 100:
+ self.feedback_fn(" - progress: %.1f%%" % min_percent)
+ time.sleep(2)
+
+ def _EnsureSecondary(self, node):
+ """Demote a node to secondary.
+
+ """
+ self.feedback_fn("* switching node %s to secondary mode" % node)
+
+ for dev in self.instance.disks:
+ self.cfg.SetDiskID(dev, node)
+
+ result = self.rpc.call_blockdev_close(node, self.instance.name,
+ self.instance.disks)
+ result.Raise("Cannot change disk to secondary on node %s" % node)
+
+ def _GoStandalone(self):
+ """Disconnect from the network.
+
+ """
+ self.feedback_fn("* changing into standalone mode")
+ result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
+ self.instance.disks)
+ for node, nres in result.items():
+ nres.Raise("Cannot disconnect disks node %s" % node)
+
+ def _GoReconnect(self, multimaster):
+ """Reconnect to the network.
+
+ """
+ if multimaster:
+ msg = "dual-master"
+ else:
+ msg = "single-master"
+ self.feedback_fn("* changing disks into %s mode" % msg)
+ result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
+ self.instance.disks,
+ self.instance.name, multimaster)
+ for node, nres in result.items():
+ nres.Raise("Cannot change disks config on node %s" % node)
+
def _ExecCleanup(self):
  """Try to clean up after a failed migration.

  The steps are:
    - find out on which of the two nodes the instance is listed as
      running; abort if that is both or neither of them
    - if it is listed on the target node, make the target the primary
      node in the configuration
    - switch the disks of the other node to secondary
    - wait for the disk sync, ignoring failures of this first wait
    - disconnect from the network
    - reconnect the disks in single-master mode
    - wait again until the disks are fully synchronized

  Raises errors.OpExecError if the instance is found on both nodes or on
  neither of them.

  """
  inst = self.instance
  src_node = self.source_node
  dst_node = self.target_node

  # check running on only one node
  self.feedback_fn("* checking where the instance actually runs"
                   " (if this hangs, the hypervisor might be in"
                   " a bad state)")
  listings = self.rpc.call_instance_list(self.all_nodes, [inst.hypervisor])
  for node_name, node_result in listings.items():
    node_result.Raise("Can't contact node %s" % node_name)

  on_source = inst.name in listings[src_node].payload
  on_target = inst.name in listings[dst_node].payload

  if on_source and on_target:
    raise errors.OpExecError("Instance seems to be running on two nodes,"
                             " or the hypervisor is confused. You will have"
                             " to ensure manually that it runs only on one"
                             " and restart this operation.")

  if not on_source and not on_target:
    raise errors.OpExecError("Instance does not seem to be running at all."
                             " In this case, it's safer to repair by"
                             " running 'gnt-instance stop' to ensure disk"
                             " shutdown, and then restarting it.")

  if on_target:
    # the migration has actually succeeded, we need to update the config
    self.feedback_fn("* instance running on secondary node (%s),"
                     " updating config" % dst_node)
    inst.primary_node = dst_node
    self.cfg.Update(inst)
    node_to_demote = src_node
  else:
    self.feedback_fn("* instance confirmed to be running on its"
                     " primary node (%s)" % src_node)
    node_to_demote = dst_node

  self._EnsureSecondary(node_to_demote)
  try:
    self._WaitUntilSync()
  except errors.OpExecError:
    # errors are deliberately ignored here: a standalone device is not
    # able to sync
    pass
  self._GoStandalone()
  self._GoReconnect(False)
  self._WaitUntilSync()

  self.feedback_fn("* done")
+
+ def _RevertDiskStatus(self):
+ """Try to revert the disk status after a failed migration.
+
+ """
+ target_node = self.target_node
+ try:
+ self._EnsureSecondary(target_node)
+ self._GoStandalone()
+ self._GoReconnect(False)
+ self._WaitUntilSync()
+ except errors.OpExecError, err:
+ self.lu.LogWarning("Migration failed and I can't reconnect the"
+ " drives: error '%s'\n"
+ "Please look and recover the instance status" %
+ str(err))
+
+ def _AbortMigration(self):
+ """Call the hypervisor code to abort a started migration.
+
+ """
+ instance = self.instance
+ target_node = self.target_node
+ migration_info = self.migration_info
+
+ abort_result = self.rpc.call_finalize_migration(target_node,
+ instance,
+ migration_info,
+ False)
+ abort_msg = abort_result.fail_msg
+ if abort_msg:
+ logging.error("Aborting migration failed on target node %s: %s" %
+ (target_node, abort_msg))
+ # Don't raise an exception here, as we stil have to try to revert the
+ # disk status, even if this step failed.
+
def _ExecMigration(self):
  """Migrate an instance.

  The migrate is done by:
    - check that the disks are consistent on the target node
    - fetch the migration information from the source node
    - change the disks into dual-master mode
    - wait until disks are fully synchronized again
    - prepare the target node to accept the instance
    - migrate the instance
    - record the target node as primary node in the configuration
    - finalize the migration on the target node
    - change disks on the new secondary node (the old primary) to secondary
    - wait until disks are fully synchronized
    - change disks into single-master mode
    - wait until disks are fully synchronized

  If preparing the target node or the migration itself fails, the
  migration is aborted through _AbortMigration and the disks are reverted
  through _RevertDiskStatus before the error is raised.  A failed
  finalization is raised without any such revert.

  Raises errors.OpExecError if a disk is not consistent on the target
  node, or if fetching the migration information, preparing the target,
  migrating or finalizing fails.

  """
  instance = self.instance
  target_node = self.target_node
  source_node = self.source_node

  self.feedback_fn("* checking disk consistency between source and target")
  for dev in instance.disks:
    if not _CheckDiskConsistency(self, dev, target_node, False):
      raise errors.OpExecError("Disk %s is degraded or not fully"
                               " synchronized on target node,"
                               " aborting migrate." % dev.iv_name)

  # First get the migration information from the remote node
  result = self.rpc.call_migration_info(source_node, instance)
  msg = result.fail_msg
  if msg:
    # formatted once because the same text is logged and raised
    log_err = ("Failed fetching source migration information from %s: %s" %
               (source_node, msg))
    logging.error(log_err)
    raise errors.OpExecError(log_err)

  # stored on the tasklet as well, since _AbortMigration reads it from there
  self.migration_info = migration_info = result.payload

  # Then switch the disks to master/master mode
  self._EnsureSecondary(target_node)
  self._GoStandalone()
  self._GoReconnect(True)
  self._WaitUntilSync()

  self.feedback_fn("* preparing %s to accept the instance" % target_node)
  result = self.rpc.call_accept_instance(target_node,
                                         instance,
                                         migration_info,
                                         self.nodes_ip[target_node])

  msg = result.fail_msg
  if msg:
    logging.error("Instance pre-migration failed, trying to revert"
                  " disk status: %s", msg)
    self._AbortMigration()
    self._RevertDiskStatus()
    raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                             (instance.name, msg))

  self.feedback_fn("* migrating instance to %s" % target_node)
  # NOTE(review): fixed pause before the migration RPC; the reason is not
  # visible in this code -- confirm it is still needed
  time.sleep(10)
  result = self.rpc.call_instance_migrate(source_node, instance,
                                          self.nodes_ip[target_node],
                                          self.live)
  msg = result.fail_msg
  if msg:
    logging.error("Instance migration failed, trying to revert"
                  " disk status: %s", msg)
    self._AbortMigration()
    self._RevertDiskStatus()
    raise errors.OpExecError("Could not migrate instance %s: %s" %
                             (instance.name, msg))
  # NOTE(review): second fixed pause, after the migration RPC -- same
  # question as above
  time.sleep(10)

  instance.primary_node = target_node
  # distribute new instance config to the other nodes
  self.cfg.Update(instance)

  result = self.rpc.call_finalize_migration(target_node,
                                            instance,
                                            migration_info,
                                            True)
  msg = result.fail_msg
  if msg:
    # arguments are passed to the logging call like in the two error
    # paths above, instead of being formatted eagerly
    logging.error("Instance migration succeeded, but finalization failed:"
                  " %s", msg)
    raise errors.OpExecError("Could not finalize instance migration: %s" %
                             msg)

  self._EnsureSecondary(source_node)
  self._WaitUntilSync()
  self._GoStandalone()
  self._GoReconnect(False)
  self._WaitUntilSync()

  self.feedback_fn("* done")