27 |
27 |
import logging
|
28 |
28 |
import operator
|
29 |
29 |
import os
|
30 |
|
import time
|
31 |
30 |
|
32 |
31 |
from ganeti import compat
|
33 |
32 |
from ganeti import constants
|
... | ... | |
47 |
46 |
from ganeti import query
|
48 |
47 |
|
49 |
48 |
from ganeti.cmdlib.base import NoHooksLU, LogicalUnit, _QueryBase, \
|
50 |
|
ResultWithJobs, Tasklet
|
|
49 |
ResultWithJobs
|
51 |
50 |
|
52 |
51 |
from ganeti.cmdlib.common import INSTANCE_ONLINE, INSTANCE_DOWN, \
|
53 |
52 |
INSTANCE_NOT_RUNNING, CAN_CHANGE_INSTANCE_OFFLINE, _CheckNodeOnline, \
|
... | ... | |
58 |
57 |
_GetUpdatedParams, _ExpandInstanceName, _ComputeIPolicySpecViolation, \
|
59 |
58 |
_CheckInstanceState, _ExpandNodeName
|
60 |
59 |
from ganeti.cmdlib.instance_storage import _CreateDisks, \
|
61 |
|
_CheckNodesFreeDiskPerVG, _WipeDisks, _WaitForSync, _CheckDiskConsistency, \
|
|
60 |
_CheckNodesFreeDiskPerVG, _WipeDisks, _WaitForSync, \
|
62 |
61 |
_IsExclusiveStorageEnabledNodeName, _CreateSingleBlockDev, _ComputeDisks, \
|
63 |
62 |
_CheckRADOSFreeSpace, _ComputeDiskSizePerVG, _GenerateDiskTemplate, \
|
64 |
63 |
_CreateBlockDev, _StartInstanceDisks, _ShutdownInstanceDisks, \
|
65 |
|
_AssembleInstanceDisks, _ExpandCheckDisks
|
|
64 |
_AssembleInstanceDisks
|
66 |
65 |
from ganeti.cmdlib.instance_utils import _BuildInstanceHookEnvByObject, \
|
67 |
66 |
_GetClusterDomainSecret, _BuildInstanceHookEnv, _NICListToTuple, \
|
68 |
67 |
_NICToTuple, _CheckNodeNotDrained, _RemoveInstance, _CopyLockList, \
|
69 |
68 |
_ReleaseLocks, _CheckNodeVmCapable, _CheckTargetNodeIPolicy, \
|
70 |
|
_GetInstanceInfoText, _RemoveDisks
|
|
69 |
_GetInstanceInfoText, _RemoveDisks, _CheckNodeFreeMemory, \
|
|
70 |
_CheckInstanceBridgesExist, _CheckNicsBridgesExist
|
71 |
71 |
|
72 |
72 |
import ganeti.masterd.instance
|
73 |
73 |
|
... | ... | |
338 |
338 |
_CheckOSVariant(result.payload, os_name)
|
339 |
339 |
|
340 |
340 |
|
341 |
|
def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Verify that every bridge required by a list of NICs exists.

  Only NICs in bridged mode need a bridge; for those, the NIC's link
  parameter names the bridge that must be present on the target node.

  @param lu: the logical unit on whose behalf we check
  @param target_nics: list of NIC objects to inspect
  @param target_node: name of the node the bridges must exist on
  @raise errors.OpPrereqError: if a required bridge is missing

  """
  cluster = lu.cfg.GetClusterInfo()
  bridges = []
  for nic in target_nics:
    filled = cluster.SimpleFillNIC(nic.nicparams)
    if filled[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
      bridges.append(filled[constants.NIC_LINK])

  if bridges:
    result = lu.rpc.call_bridges_exist(target_node, bridges)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
|
353 |
|
|
354 |
|
|
355 |
|
def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
|
356 |
|
"""Checks if a node has enough free memory.
|
357 |
|
|
358 |
|
This function checks if a given node has the needed amount of free
|
359 |
|
memory. In case the node has less memory or we cannot get the
|
360 |
|
information from the node, this function raises an OpPrereqError
|
361 |
|
exception.
|
362 |
|
|
363 |
|
@type lu: C{LogicalUnit}
|
364 |
|
@param lu: a logical unit from which we get configuration data
|
365 |
|
@type node: C{str}
|
366 |
|
@param node: the node to check
|
367 |
|
@type reason: C{str}
|
368 |
|
@param reason: string to use in the error message
|
369 |
|
@type requested: C{int}
|
370 |
|
@param requested: the amount of memory in MiB to check for
|
371 |
|
@type hypervisor_name: C{str}
|
372 |
|
@param hypervisor_name: the hypervisor to ask for memory stats
|
373 |
|
@rtype: integer
|
374 |
|
@return: node current free memory
|
375 |
|
@raise errors.OpPrereqError: if the node doesn't have enough memory, or
|
376 |
|
we cannot check the node
|
377 |
|
|
378 |
|
"""
|
379 |
|
nodeinfo = lu.rpc.call_node_info([node], None, [hypervisor_name], False)
|
380 |
|
nodeinfo[node].Raise("Can't get data from node %s" % node,
|
381 |
|
prereq=True, ecode=errors.ECODE_ENVIRON)
|
382 |
|
(_, _, (hv_info, )) = nodeinfo[node].payload
|
383 |
|
|
384 |
|
free_mem = hv_info.get("memory_free", None)
|
385 |
|
if not isinstance(free_mem, int):
|
386 |
|
raise errors.OpPrereqError("Can't compute free memory on node %s, result"
|
387 |
|
" was '%s'" % (node, free_mem),
|
388 |
|
errors.ECODE_ENVIRON)
|
389 |
|
if requested > free_mem:
|
390 |
|
raise errors.OpPrereqError("Not enough memory on node %s for %s:"
|
391 |
|
" needed %s MiB, available %s MiB" %
|
392 |
|
(node, reason, requested, free_mem),
|
393 |
|
errors.ECODE_NORES)
|
394 |
|
return free_mem
|
395 |
|
|
396 |
|
|
397 |
341 |
class LUInstanceCreate(LogicalUnit):
|
398 |
342 |
"""Create an instance.
|
399 |
343 |
|
... | ... | |
1680 |
1624 |
_RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
|
1681 |
1625 |
|
1682 |
1626 |
|
1683 |
|
def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Ensure all bridges needed by an instance's NICs exist.

  @param lu: the logical unit on whose behalf we check
  @param instance: the instance whose NICs are inspected
  @param node: node to check the bridges on; defaults to the
      instance's primary node

  """
  check_node = instance.primary_node if node is None else node
  _CheckNicsBridgesExist(lu, instance.nics, check_node)
|
1690 |
|
|
1691 |
|
|
1692 |
1627 |
class LUInstanceMove(LogicalUnit):
|
1693 |
1628 |
"""Move an instance by data-copying.
|
1694 |
1629 |
|
... | ... | |
2743 |
2678 |
return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
|
2744 |
2679 |
|
2745 |
2680 |
|
2746 |
|
def _DeclareLocksForMigration(lu, level):
  """Declares locks for L{TLMigrateInstance}.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf locks are declared
  @param level: Lock level

  """
  if level == locking.LEVEL_NODE_ALLOC:
    # the instance lock must already be held at this point
    assert lu.op.instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)

    instance = lu.cfg.GetInstanceInfo(lu.op.instance_name)

    # Node locks are already declared here rather than at LEVEL_NODE as we need
    # the instance object anyway to declare the node allocation lock.
    if instance.disk_template in constants.DTS_EXT_MIRROR:
      if lu.op.target_node is None:
        # no explicit target node: any node may be chosen, so lock them all
        lu.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
        lu.needed_locks[locking.LEVEL_NODE_ALLOC] = locking.ALL_SET
      else:
        # explicit target node: only the current primary and the target
        # need to be locked; no recalculation is necessary in that case
        lu.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
                                               lu.op.target_node]
        del lu.recalculate_locks[locking.LEVEL_NODE]
    else:
      lu._LockInstancesNodes() # pylint: disable=W0212

  elif level == locking.LEVEL_NODE:
    # Node locks are declared together with the node allocation lock
    assert (lu.needed_locks[locking.LEVEL_NODE] or
            lu.needed_locks[locking.LEVEL_NODE] is locking.ALL_SET)

  elif level == locking.LEVEL_NODE_RES:
    # Copy node locks
    lu.needed_locks[locking.LEVEL_NODE_RES] = \
      _CopyLockList(lu.needed_locks[locking.LEVEL_NODE])
|
2780 |
|
|
2781 |
|
|
2782 |
|
def _ExpandNamesForMigration(lu):
  """Prepare name expansion and lock structures for L{TLMigrateInstance}.

  Expands an explicitly given target node name and initializes the
  node-level lock bookkeeping that DeclareLocks will later fill in.

  @type lu: L{LogicalUnit}

  """
  if lu.op.target_node is not None:
    lu.op.target_node = _ExpandNodeName(lu.cfg, lu.op.target_node)

  for lock_level in (locking.LEVEL_NODE, locking.LEVEL_NODE_RES):
    lu.needed_locks[lock_level] = []
    lu.recalculate_locks[lock_level] = constants.LOCKS_REPLACE

  # The node allocation lock is actually only needed for externally replicated
  # instances (e.g. sharedfile or RBD) and if an iallocator is used.
  lu.needed_locks[locking.LEVEL_NODE_ALLOC] = []
|
2800 |
|
|
2801 |
|
|
2802 |
|
class LUInstanceFailover(LogicalUnit):
  """Failover an instance.

  """
  HPATH = "instance-failover"
  HTYPE = constants.HTYPE_INSTANCE
  REQ_BGL = False

  def CheckArguments(self):
    """Check the arguments.

    """
    # both parameters are optional on the opcode, hence getattr with default
    self.iallocator = getattr(self.op, "iallocator", None)
    self.target_node = getattr(self.op, "target_node", None)

  def ExpandNames(self):
    """Expand names and set up the tasklet doing the actual work."""
    self._ExpandAndLockInstance()
    _ExpandNamesForMigration(self)

    # Failover is implemented by TLMigrateInstance with cleanup=False,
    # failover=True, fallback=False and allow_runtime_changes=True
    self._migrater = \
      TLMigrateInstance(self, self.op.instance_name, False, True, False,
                        self.op.ignore_consistency, True,
                        self.op.shutdown_timeout, self.op.ignore_ipolicy)

    self.tasklets = [self._migrater]

  def DeclareLocks(self, level):
    """Declare locks; delegated to the shared migration helper."""
    _DeclareLocksForMigration(self, level)

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    instance = self._migrater.instance
    source_node = instance.primary_node
    target_node = self.op.target_node
    env = {
      "IGNORE_CONSISTENCY": self.op.ignore_consistency,
      "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
      "OLD_PRIMARY": source_node,
      "NEW_PRIMARY": target_node,
      }

    if instance.disk_template in constants.DTS_INT_MIRROR:
      # after the failover the old primary becomes the new secondary
      env["OLD_SECONDARY"] = instance.secondary_nodes[0]
      env["NEW_SECONDARY"] = source_node
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""

    env.update(_BuildInstanceHookEnvByObject(self, instance))

    return env

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    instance = self._migrater.instance
    nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
    # post-hooks additionally run on the (old) primary node
    return (nl, nl + [instance.primary_node])
|
2864 |
|
|
2865 |
|
|
2866 |
|
class LUInstanceMigrate(LogicalUnit):
  """Migrate an instance.

  This is migration without shutting down, compared to the failover,
  which is done with shutdown.

  """
  HPATH = "instance-migrate"
  HTYPE = constants.HTYPE_INSTANCE
  REQ_BGL = False

  def ExpandNames(self):
    """Expand names and set up the tasklet doing the actual work."""
    self._ExpandAndLockInstance()
    _ExpandNamesForMigration(self)

    # Migration is implemented by TLMigrateInstance with failover=False,
    # ignore_consistency=False and the default cluster shutdown timeout
    self._migrater = \
      TLMigrateInstance(self, self.op.instance_name, self.op.cleanup,
                        False, self.op.allow_failover, False,
                        self.op.allow_runtime_changes,
                        constants.DEFAULT_SHUTDOWN_TIMEOUT,
                        self.op.ignore_ipolicy)

    self.tasklets = [self._migrater]

  def DeclareLocks(self, level):
    """Declare locks; delegated to the shared migration helper."""
    _DeclareLocksForMigration(self, level)

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    instance = self._migrater.instance
    source_node = instance.primary_node
    target_node = self.op.target_node
    env = _BuildInstanceHookEnvByObject(self, instance)
    env.update({
      "MIGRATE_LIVE": self._migrater.live,
      "MIGRATE_CLEANUP": self.op.cleanup,
      "OLD_PRIMARY": source_node,
      "NEW_PRIMARY": target_node,
      "ALLOW_RUNTIME_CHANGES": self.op.allow_runtime_changes,
      })

    if instance.disk_template in constants.DTS_INT_MIRROR:
      # after a migration the two nodes swap roles
      env["OLD_SECONDARY"] = target_node
      env["NEW_SECONDARY"] = source_node
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None

    return env

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    instance = self._migrater.instance
    snodes = list(instance.secondary_nodes)
    nl = [self.cfg.GetMasterNode(), instance.primary_node] + snodes
    return (nl, nl)
|
2927 |
|
|
2928 |
|
|
2929 |
2681 |
class LUInstanceMultiAlloc(NoHooksLU):
|
2930 |
2682 |
"""Allocates multiple instances at the same time.
|
2931 |
2683 |
|
... | ... | |
4592 |
4344 |
" instance '%s'", len(jobs), self.op.instance_name)
|
4593 |
4345 |
|
4594 |
4346 |
return ResultWithJobs(jobs)
|
4595 |
|
|
4596 |
|
|
4597 |
|
class TLMigrateInstance(Tasklet):
|
4598 |
|
"""Tasklet class for instance migration.
|
4599 |
|
|
4600 |
|
@type live: boolean
|
4601 |
|
@ivar live: whether the migration will be done live or non-live;
|
4602 |
|
this variable is initalized only after CheckPrereq has run
|
4603 |
|
@type cleanup: boolean
|
4604 |
|
@ivar cleanup: Wheater we cleanup from a failed migration
|
4605 |
|
@type iallocator: string
|
4606 |
|
@ivar iallocator: The iallocator used to determine target_node
|
4607 |
|
@type target_node: string
|
4608 |
|
@ivar target_node: If given, the target_node to reallocate the instance to
|
4609 |
|
@type failover: boolean
|
4610 |
|
@ivar failover: Whether operation results in failover or migration
|
4611 |
|
@type fallback: boolean
|
4612 |
|
@ivar fallback: Whether fallback to failover is allowed if migration not
|
4613 |
|
possible
|
4614 |
|
@type ignore_consistency: boolean
|
4615 |
|
@ivar ignore_consistency: Wheter we should ignore consistency between source
|
4616 |
|
and target node
|
4617 |
|
@type shutdown_timeout: int
|
4618 |
|
@ivar shutdown_timeout: In case of failover timeout of the shutdown
|
4619 |
|
@type ignore_ipolicy: bool
|
4620 |
|
@ivar ignore_ipolicy: If true, we can ignore instance policy when migrating
|
4621 |
|
|
4622 |
|
"""
|
4623 |
|
|
4624 |
|
# Constants
|
4625 |
|
_MIGRATION_POLL_INTERVAL = 1 # seconds
|
4626 |
|
_MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
|
4627 |
|
|
4628 |
|
  def __init__(self, lu, instance_name, cleanup, failover, fallback,
               ignore_consistency, allow_runtime_changes, shutdown_timeout,
               ignore_ipolicy):
    """Initializes this class.

    @param lu: the logical unit owning this tasklet
    @param instance_name: name of the instance to migrate or fail over
    @param cleanup: whether to clean up after a failed migration
    @param failover: whether the operation is a failover (not a migration)
    @param fallback: whether falling back to failover is allowed
    @param ignore_consistency: whether to ignore disk consistency between
        source and target node
    @param allow_runtime_changes: whether runtime changes are allowed
    @param shutdown_timeout: shutdown timeout used in case of failover
    @param ignore_ipolicy: whether to ignore the instance policy

    """
    Tasklet.__init__(self, lu)

    # Parameters
    self.instance_name = instance_name
    self.cleanup = cleanup
    self.live = False # will be overridden later
    self.failover = failover
    self.fallback = fallback
    self.ignore_consistency = ignore_consistency
    self.shutdown_timeout = shutdown_timeout
    self.ignore_ipolicy = ignore_ipolicy
    self.allow_runtime_changes = allow_runtime_changes
|
4646 |
|
|
4647 |
|
  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
    instance = self.cfg.GetInstanceInfo(instance_name)
    assert instance is not None
    self.instance = instance
    cluster = self.cfg.GetClusterInfo()

    # an instance that is not up cannot be live-migrated; if a fallback is
    # allowed, silently turn the operation into a failover
    if (not self.cleanup and
        not instance.admin_state == constants.ADMINST_UP and
        not self.failover and self.fallback):
      self.lu.LogInfo("Instance is marked down or offline, fallback allowed,"
                      " switching to failover")
      self.failover = True

    if instance.disk_template not in constants.DTS_MIRRORED:
      if self.failover:
        text = "failovers"
      else:
        text = "migrations"
      raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
                                 " %s" % (instance.disk_template, text),
                                 errors.ECODE_STATE)

    if instance.disk_template in constants.DTS_EXT_MIRROR:
      # externally mirrored: target may be any node, chosen either
      # explicitly or via an iallocator
      _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")

      if self.lu.op.iallocator:
        assert locking.NAL in self.lu.owned_locks(locking.LEVEL_NODE_ALLOC)
        self._RunAllocator()
      else:
        # We set self.target_node as it is required by
        # BuildHooksEnv
        self.target_node = self.lu.op.target_node

        # Check that the target node is correct in terms of instance policy
        nodeinfo = self.cfg.GetNodeInfo(self.target_node)
        group_info = self.cfg.GetNodeGroup(nodeinfo.group)
        ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                                group_info)
        _CheckTargetNodeIPolicy(self.lu, ipolicy, instance, nodeinfo, self.cfg,
                                ignore=self.ignore_ipolicy)

      # self.target_node is already populated, either directly or by the
      # iallocator run
      target_node = self.target_node
      if self.target_node == instance.primary_node:
        raise errors.OpPrereqError("Cannot migrate instance %s"
                                   " to its primary (%s)" %
                                   (instance.name, instance.primary_node),
                                   errors.ECODE_STATE)

      if len(self.lu.tasklets) == 1:
        # It is safe to release locks only when we're the only tasklet
        # in the LU
        _ReleaseLocks(self.lu, locking.LEVEL_NODE,
                      keep=[instance.primary_node, self.target_node])
        _ReleaseLocks(self.lu, locking.LEVEL_NODE_ALLOC)

      else:
        assert not self.lu.glm.is_owned(locking.LEVEL_NODE_ALLOC)

    else:
      # internally mirrored: the target is fixed to the secondary node
      secondary_nodes = instance.secondary_nodes
      if not secondary_nodes:
        raise errors.ConfigurationError("No secondary node but using"
                                        " %s disk template" %
                                        instance.disk_template)
      target_node = secondary_nodes[0]
      if self.lu.op.iallocator or (self.lu.op.target_node and
                                   self.lu.op.target_node != target_node):
        if self.failover:
          text = "failed over"
        else:
          text = "migrated"
        raise errors.OpPrereqError("Instances with disk template %s cannot"
                                   " be %s to arbitrary nodes"
                                   " (neither an iallocator nor a target"
                                   " node can be passed)" %
                                   (instance.disk_template, text),
                                   errors.ECODE_INVAL)
      nodeinfo = self.cfg.GetNodeInfo(target_node)
      group_info = self.cfg.GetNodeGroup(nodeinfo.group)
      ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                              group_info)
      _CheckTargetNodeIPolicy(self.lu, ipolicy, instance, nodeinfo, self.cfg,
                              ignore=self.ignore_ipolicy)

    i_be = cluster.FillBE(instance)

    # check memory requirements on the secondary node
    if (not self.cleanup and
        (not self.failover or instance.admin_state == constants.ADMINST_UP)):
      self.tgt_free_mem = _CheckNodeFreeMemory(self.lu, target_node,
                                               "migrating instance %s" %
                                               instance.name,
                                               i_be[constants.BE_MINMEM],
                                               instance.hypervisor)
    else:
      self.lu.LogInfo("Not checking memory on the secondary node as"
                      " instance will not be started")

    # check if failover must be forced instead of migration
    if (not self.cleanup and not self.failover and
        i_be[constants.BE_ALWAYS_FAILOVER]):
      self.lu.LogInfo("Instance configured to always failover; fallback"
                      " to failover")
      self.failover = True

    # check bridge existance
    _CheckInstanceBridgesExist(self.lu, instance, node=target_node)

    if not self.cleanup:
      _CheckNodeNotDrained(self.lu, target_node)
      if not self.failover:
        result = self.rpc.call_instance_migratable(instance.primary_node,
                                                   instance)
        if result.fail_msg and self.fallback:
          self.lu.LogInfo("Can't migrate, instance offline, fallback to"
                          " failover")
          self.failover = True
        else:
          result.Raise("Can't migrate, please use failover",
                       prereq=True, ecode=errors.ECODE_STATE)

    assert not (self.failover and self.cleanup)

    if not self.failover:
      # determine live vs. non-live mode: 'live' and 'mode' are mutually
      # exclusive opcode parameters
      if self.lu.op.live is not None and self.lu.op.mode is not None:
        raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
                                   " parameters are accepted",
                                   errors.ECODE_INVAL)
      if self.lu.op.live is not None:
        if self.lu.op.live:
          self.lu.op.mode = constants.HT_MIGRATION_LIVE
        else:
          self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
        # reset the 'live' parameter to None so that repeated
        # invocations of CheckPrereq do not raise an exception
        self.lu.op.live = None
      elif self.lu.op.mode is None:
        # read the default value from the hypervisor
        i_hv = cluster.FillHV(self.instance, skip_globals=False)
        self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]

      self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
    else:
      # Failover is never live
      self.live = False

    if not (self.failover or self.cleanup):
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
      remote_info.Raise("Error checking instance on node %s" %
                        instance.primary_node)
      instance_running = bool(remote_info.payload)
      if instance_running:
        # remember the current memory usage of the running instance
        self.current_mem = int(remote_info.payload["memory"])
|
4809 |
|
|
4810 |
|
  def _RunAllocator(self):
    """Run the allocator based on input opcode.

    On success sets C{self.target_node} to the node chosen by the
    iallocator; raises OpPrereqError otherwise.

    """
    assert locking.NAL in self.lu.owned_locks(locking.LEVEL_NODE_ALLOC)

    # FIXME: add a self.ignore_ipolicy option
    req = iallocator.IAReqRelocate(name=self.instance_name,
                                   relocate_from=[self.instance.primary_node])
    ial = iallocator.IAllocator(self.cfg, self.rpc, req)

    ial.Run(self.lu.op.iallocator)

    if not ial.success:
      raise errors.OpPrereqError("Can't compute nodes using"
                                 " iallocator '%s': %s" %
                                 (self.lu.op.iallocator, ial.info),
                                 errors.ECODE_NORES)
    # the first entry of the allocator result is the migration target
    self.target_node = ial.result[0]
    self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
                    self.instance_name, self.lu.op.iallocator,
                    utils.CommaJoin(ial.result))
|
4832 |
|
|
4833 |
|
  def _WaitUntilSync(self):
    """Poll with custom rpc for disk sync.

    This uses our own step-based rpc call.

    """
    self.feedback_fn("* wait until resync is done")
    all_done = False
    while not all_done:
      all_done = True
      result = self.rpc.call_drbd_wait_sync(self.all_nodes,
                                            self.nodes_ip,
                                            (self.instance.disks,
                                             self.instance))
      min_percent = 100
      for node, nres in result.items():
        nres.Raise("Cannot resync disks on node %s" % node)
        # payload is a (done, sync percentage) pair per node
        node_done, node_percent = nres.payload
        all_done = all_done and node_done
        if node_percent is not None:
          min_percent = min(min_percent, node_percent)
      if not all_done:
        # report progress of the slowest node, then wait before re-polling
        if min_percent < 100:
          self.feedback_fn(" - progress: %.1f%%" % min_percent)
        time.sleep(2)
|
4858 |
|
|
4859 |
|
def _EnsureSecondary(self, node):
|
4860 |
|
"""Demote a node to secondary.
|
4861 |
|
|
4862 |
|
"""
|
4863 |
|
self.feedback_fn("* switching node %s to secondary mode" % node)
|
4864 |
|
|
4865 |
|
for dev in self.instance.disks:
|
4866 |
|
self.cfg.SetDiskID(dev, node)
|
4867 |
|
|
4868 |
|
result = self.rpc.call_blockdev_close(node, self.instance.name,
|
4869 |
|
self.instance.disks)
|
4870 |
|
result.Raise("Cannot change disk to secondary on node %s" % node)
|
4871 |
|
|
4872 |
|
def _GoStandalone(self):
|
4873 |
|
"""Disconnect from the network.
|
4874 |
|
|
4875 |
|
"""
|
4876 |
|
self.feedback_fn("* changing into standalone mode")
|
4877 |
|
result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
|
4878 |
|
self.instance.disks)
|
4879 |
|
for node, nres in result.items():
|
4880 |
|
nres.Raise("Cannot disconnect disks node %s" % node)
|
4881 |
|
|
4882 |
|
def _GoReconnect(self, multimaster):
|
4883 |
|
"""Reconnect to the network.
|
4884 |
|
|
4885 |
|
"""
|
4886 |
|
if multimaster:
|
4887 |
|
msg = "dual-master"
|
4888 |
|
else:
|
4889 |
|
msg = "single-master"
|
4890 |
|
self.feedback_fn("* changing disks into %s mode" % msg)
|
4891 |
|
result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
|
4892 |
|
(self.instance.disks, self.instance),
|
4893 |
|
self.instance.name, multimaster)
|
4894 |
|
for node, nres in result.items():
|
4895 |
|
nres.Raise("Cannot change disks config on node %s" % node)
|
4896 |
|
|
4897 |
|
  def _ExecCleanup(self):
    """Try to cleanup after a failed migration.

    The cleanup is done by:
      - check that the instance is running only on one node
        (and update the config if needed)
      - change disks on its secondary node to secondary
      - wait until disks are fully synchronized
      - disconnect from the network
      - change disks into single-master mode
      - wait again until disks are fully synchronized

    """
    instance = self.instance
    target_node = self.target_node
    source_node = self.source_node

    # check running on only one node
    self.feedback_fn("* checking where the instance actually runs"
                     " (if this hangs, the hypervisor might be in"
                     " a bad state)")
    ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
    for node, result in ins_l.items():
      result.Raise("Can't contact node %s" % node)

    runningon_source = instance.name in ins_l[source_node].payload
    runningon_target = instance.name in ins_l[target_node].payload

    # running on both or on neither node cannot be repaired automatically
    if runningon_source and runningon_target:
      raise errors.OpExecError("Instance seems to be running on two nodes,"
                               " or the hypervisor is confused; you will have"
                               " to ensure manually that it runs only on one"
                               " and restart this operation")

    if not (runningon_source or runningon_target):
      raise errors.OpExecError("Instance does not seem to be running at all;"
                               " in this case it's safer to repair by"
                               " running 'gnt-instance stop' to ensure disk"
                               " shutdown, and then restarting it")

    if runningon_target:
      # the migration has actually succeeded, we need to update the config
      self.feedback_fn("* instance running on secondary node (%s),"
                       " updating config" % target_node)
      instance.primary_node = target_node
      self.cfg.Update(instance, self.feedback_fn)
      demoted_node = source_node
    else:
      self.feedback_fn("* instance confirmed to be running on its"
                       " primary node (%s)" % source_node)
      demoted_node = target_node

    # the node not running the instance gets its disks demoted to
    # secondary and the DRBD pair is brought back to single-master
    if instance.disk_template in constants.DTS_INT_MIRROR:
      self._EnsureSecondary(demoted_node)
      try:
        self._WaitUntilSync()
      except errors.OpExecError:
        # we ignore here errors, since if the device is standalone, it
        # won't be able to sync
        pass
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()

    self.feedback_fn("* done")
|
4962 |
|
|
4963 |
|
  def _RevertDiskStatus(self):
    """Try to revert the disk status after a failed migration.

    """
    target_node = self.target_node
    if self.instance.disk_template in constants.DTS_EXT_MIRROR:
      # externally mirrored disks have no DRBD state to revert
      return

    try:
      self._EnsureSecondary(target_node)
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()
    except errors.OpExecError, err:
      # only warn: at this point the migration already failed and there is
      # nothing more we can do automatically
      self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
                         " please try to recover the instance manually;"
                         " error '%s'" % str(err))
|
4980 |
|
|
4981 |
|
  def _AbortMigration(self):
    """Call the hypervisor code to abort a started migration.

    Finalizes (aborts) the migration on the target node first, then on
    the source node; failures are logged but never raised so that the
    subsequent disk-status revert still runs.

    """
    instance = self.instance
    target_node = self.target_node
    source_node = self.source_node
    migration_info = self.migration_info

    abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
                                                                 instance,
                                                                 migration_info,
                                                                 False)
    abort_msg = abort_result.fail_msg
    if abort_msg:
      logging.error("Aborting migration failed on target node %s: %s",
                    target_node, abort_msg)
      # Don't raise an exception here, as we still have to try to revert the
      # disk status, even if this step failed.

    abort_result = self.rpc.call_instance_finalize_migration_src(
      source_node, instance, False, self.live)
    abort_msg = abort_result.fail_msg
    if abort_msg:
      logging.error("Aborting migration failed on source node %s: %s",
                    source_node, abort_msg)
|
5007 |
|
|
5008 |
|
  def _ExecMigration(self):
    """Migrate an instance.

    The migrate is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    Raises errors.OpExecError on any unrecoverable failure; before
    raising during the transfer itself, the migration is aborted and the
    disk status reverted.

    """
    # NOTE(review): this method uses time, _CheckDiskConsistency and
    # _ExpandCheckDisks, whose imports the top-of-file hunk removes —
    # confirm they remain in scope after this change.
    instance = self.instance
    target_node = self.target_node
    source_node = self.source_node

    # Check for hypervisor version mismatch and warn the user.
    nodeinfo = self.rpc.call_node_info([source_node, target_node],
                                       None, [self.instance.hypervisor], False)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[source_node].payload
    (_, _, (dst_info, )) = nodeinfo[target_node].payload

    # Only warn (do not abort) on a version mismatch, and only when both
    # nodes actually report a hypervisor version.
    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))

    # A degraded disk on the target would be unable to take over cleanly.
    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(instance.disks):
      if not _CheckDiskConsistency(self.lu, instance, dev, target_node, False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

    # If the target lacks free memory, balloon the instance down to fit
    # (only when runtime changes are allowed).
    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (instance.name, target_node,
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(instance.primary_node,
                                                     instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(source_node, instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (source_node, msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      # Then switch the disks to master/master mode
      self._EnsureSecondary(target_node)
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    # Prepare the target; on failure, abort and revert before raising.
    self.feedback_fn("* preparing %s to accept the instance" % target_node)
    result = self.rpc.call_accept_instance(target_node,
                                           instance,
                                           migration_info,
                                           self.nodes_ip[target_node])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (instance.name, msg))

    # Kick off the actual migration on the source node.
    self.feedback_fn("* migrating instance to %s" % target_node)
    result = self.rpc.call_instance_migrate(source_node, instance,
                                            self.nodes_ip[target_node],
                                            self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (instance.name, msg))

    # Poll the source node until the memory transfer finishes (or fails),
    # reporting progress at most every _MIGRATION_FEEDBACK_INTERVAL.
    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()
    while True:
      result = self.rpc.call_instance_get_migration_status(source_node,
                                                           instance)
      msg = result.fail_msg
      ms = result.payload   # MigrationStatus instance
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (instance.name, msg))

      if result.payload.status != constants.HV_MIGRATION_ACTIVE:
        self.feedback_fn("* memory transfer complete")
        break

      # transferred_ram may be unset for hypervisors that don't report it
      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    # Finalize on the source node first (success=True).
    result = self.rpc.call_instance_finalize_migration_src(source_node,
                                                           instance,
                                                           True,
                                                           self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    instance.primary_node = target_node

    # distribute new instance config to the other nodes
    self.cfg.Update(instance, self.feedback_fn)

    result = self.rpc.call_instance_finalize_migration_dst(target_node,
                                                           instance,
                                                           migration_info,
                                                           True)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    # For internally mirrored disks, demote the old primary back to
    # secondary and return to single-master mode.
    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      self._EnsureSecondary(source_node)
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    if self.instance.disk_template in (constants.DT_RBD, constants.DT_EXT):
      disks = _ExpandCheckDisks(instance, instance.disks)
      self.feedback_fn("* unmapping instance's disks from %s" % source_node)
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(source_node, (disk, instance))
        msg = result.fail_msg
        if msg:
          # Unmap failure is non-fatal at this point: the migration itself
          # succeeded, so only log instructions for manual cleanup.
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name, source_node, msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name, source_node)

    self.feedback_fn("* done")
5192 |
|
  def _ExecFailover(self):
    """Failover an instance.

    The failover is done by shutting it down on its present node and
    starting it on the secondary.

    Raises errors.OpExecError when disks are degraded (and consistency
    is not ignored), when the shutdown fails on a reachable node, or
    when disks/instance cannot be started on the target.

    """
    # NOTE(review): this method uses _CheckDiskConsistency, whose import
    # the top-of-file hunk removes — confirm it remains in scope.
    instance = self.instance
    # Full node object, needed for the .offline checks below
    primary_node = self.cfg.GetNodeInfo(instance.primary_node)

    source_node = instance.primary_node
    target_node = self.target_node

    # Disk consistency is only meaningful while the instance is up.
    if instance.admin_state == constants.ADMINST_UP:
      self.feedback_fn("* checking disk consistency between source and target")
      for (idx, dev) in enumerate(instance.disks):
        # for drbd, these are drbd over lvm
        if not _CheckDiskConsistency(self.lu, instance, dev, target_node,
                                     False):
          if primary_node.offline:
            # Offline primary: nothing better can be expected, carry on
            self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
                             " target node %s" %
                             (primary_node.name, idx, target_node))
          elif not self.ignore_consistency:
            raise errors.OpExecError("Disk %s is degraded on target node,"
                                     " aborting failover" % idx)
    else:
      self.feedback_fn("* not checking disk consistency as instance is not"
                       " running")

    self.feedback_fn("* shutting down instance on source node")
    logging.info("Shutting down instance %s on node %s",
                 instance.name, source_node)

    result = self.rpc.call_instance_shutdown(source_node, instance,
                                             self.shutdown_timeout,
                                             self.lu.op.reason)
    msg = result.fail_msg
    if msg:
      # A failed shutdown is tolerated only when consistency is ignored
      # or the primary is already offline; otherwise abort.
      if self.ignore_consistency or primary_node.offline:
        self.lu.LogWarning("Could not shutdown instance %s on node %s,"
                           " proceeding anyway; please make sure node"
                           " %s is down; error details: %s",
                           instance.name, source_node, source_node, msg)
      else:
        raise errors.OpExecError("Could not shutdown instance %s on"
                                 " node %s: %s" %
                                 (instance.name, source_node, msg))

    self.feedback_fn("* deactivating the instance's disks on source node")
    if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
      raise errors.OpExecError("Can't shut down the instance's disks")

    instance.primary_node = target_node
    # distribute new instance config to the other nodes
    self.cfg.Update(instance, self.feedback_fn)

    # Only start the instance if it's marked as up
    if instance.admin_state == constants.ADMINST_UP:
      self.feedback_fn("* activating the instance's disks on target node %s" %
                       target_node)
      logging.info("Starting instance %s on node %s",
                   instance.name, target_node)

      disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
                                           ignore_secondaries=True)
      if not disks_ok:
        # Roll back the disk activation before giving up
        _ShutdownInstanceDisks(self.lu, instance)
        raise errors.OpExecError("Can't activate the instance's disks")

      self.feedback_fn("* starting the instance on the target node %s" %
                       target_node)
      result = self.rpc.call_instance_start(target_node, (instance, None, None),
                                            False, self.lu.op.reason)
      msg = result.fail_msg
      if msg:
        # Roll back the disk activation before giving up
        _ShutdownInstanceDisks(self.lu, instance)
        raise errors.OpExecError("Could not start instance %s on node %s: %s" %
                                 (instance.name, target_node, msg))
5272 |
|
def Exec(self, feedback_fn):
|
5273 |
|
"""Perform the migration.
|
5274 |
|
|
5275 |
|
"""
|
5276 |
|
self.feedback_fn = feedback_fn
|
5277 |
|
self.source_node = self.instance.primary_node
|
5278 |
|
|
5279 |
|
# FIXME: if we implement migrate-to-any in DRBD, this needs fixing
|
5280 |
|
if self.instance.disk_template in constants.DTS_INT_MIRROR:
|
5281 |
|
self.target_node = self.instance.secondary_nodes[0]
|
5282 |
|
# Otherwise self.target_node has been populated either
|
5283 |
|
# directly, or through an iallocator.
|
5284 |
|
|
5285 |
|
self.all_nodes = [self.source_node, self.target_node]
|
5286 |
|
self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
|
5287 |
|
in self.cfg.GetMultiNodeInfo(self.all_nodes))
|
5288 |
|
|
5289 |
|
if self.failover:
|
5290 |
|
feedback_fn("Failover instance %s" % self.instance.name)
|
5291 |
|
self._ExecFailover()
|
5292 |
|
else:
|
5293 |
|
feedback_fn("Migrating instance %s" % self.instance.name)
|
5294 |
|
|
5295 |
|
if self.cleanup:
|
5296 |
|
return self._ExecCleanup()
|
5297 |
|
else:
|
5298 |
|
return self._ExecMigration()
|