4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
45 from ganeti import ssh
46 from ganeti import utils
47 from ganeti import errors
48 from ganeti import hypervisor
49 from ganeti import locking
50 from ganeti import constants
51 from ganeti import objects
52 from ganeti import serializer
53 from ganeti import ssconf
54 from ganeti import uidpool
55 from ganeti import compat
56 from ganeti import masterd
57 from ganeti import netutils
58 from ganeti import query
59 from ganeti import qlang
60 from ganeti import opcodes
63 import ganeti.masterd.instance # pylint: disable=W0611
66 #: Size of DRBD meta block device
71 """Data container for LU results with jobs.
73 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
74 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
75 contained in the C{jobs} attribute and include the job IDs in the opcode
79 def __init__(self, jobs, **kwargs):
80 """Initializes this class.
82 Additional return values can be specified as keyword arguments.
84     @type jobs: list of lists of L{opcodes.OpCode}
85 @param jobs: A list of lists of opcode objects
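    A minimal usage sketch from an LU's Exec; the opcode and the extra
    keyword argument are purely illustrative::

      # submit one job made of a single opcode and return an extra value
      return ResultWithJobs([[opcodes.OpClusterVerifyConfig()]],
                            summary="verification submitted")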
92 class LogicalUnit(object):
93 """Logical Unit base class.
95 Subclasses must follow these rules:
96 - implement ExpandNames
97 - implement CheckPrereq (except when tasklets are used)
98 - implement Exec (except when tasklets are used)
99 - implement BuildHooksEnv
100 - implement BuildHooksNodes
101 - redefine HPATH and HTYPE
102 - optionally redefine their run requirements:
103 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
105 Note that all commands require root permissions.
107 @ivar dry_run_result: the value (if any) that will be returned to the caller
108 in dry-run mode (signalled by opcode dry_run parameter)
115 def __init__(self, processor, op, context, rpc):
116 """Constructor for LogicalUnit.
118 This needs to be overridden in derived classes in order to check op
122 self.proc = processor
124 self.cfg = context.cfg
125 self.glm = context.glm
127 self.owned_locks = context.glm.list_owned
128 self.context = context
130 # Dicts used to declare locking needs to mcpu
131 self.needed_locks = None
132 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
134 self.remove_locks = {}
135 # Used to force good behavior when calling helper functions
136 self.recalculate_locks = {}
138 self.Log = processor.Log # pylint: disable=C0103
139 self.LogWarning = processor.LogWarning # pylint: disable=C0103
140 self.LogInfo = processor.LogInfo # pylint: disable=C0103
141 self.LogStep = processor.LogStep # pylint: disable=C0103
142 # support for dry-run
143 self.dry_run_result = None
144 # support for generic debug attribute
145 if (not hasattr(self.op, "debug_level") or
146 not isinstance(self.op.debug_level, int)):
147 self.op.debug_level = 0
152 # Validate opcode parameters and set defaults
153 self.op.Validate(True)
155 self.CheckArguments()
157 def CheckArguments(self):
158 """Check syntactic validity for the opcode arguments.
160 This method is for doing a simple syntactic check and ensure
161 validity of opcode parameters, without any cluster-related
162 checks. While the same can be accomplished in ExpandNames and/or
163     CheckPrereq, doing these separately is better because:
165       - ExpandNames is left as purely a lock-related function
166 - CheckPrereq is run after we have acquired locks (and possible
169 The function is allowed to change the self.op attribute so that
170     later methods need not worry about missing parameters.
175 def ExpandNames(self):
176 """Expand names for this LU.
178 This method is called before starting to execute the opcode, and it should
179 update all the parameters of the opcode to their canonical form (e.g. a
180 short node name must be fully expanded after this method has successfully
181 completed). This way locking, hooks, logging, etc. can work correctly.
183 LUs which implement this method must also populate the self.needed_locks
184 member, as a dict with lock levels as keys, and a list of needed lock names
187 - use an empty dict if you don't need any lock
188 - if you don't need any lock at a particular level omit that level
189 - don't put anything for the BGL level
190 - if you want all locks at a level use locking.ALL_SET as a value
192 If you need to share locks (rather than acquire them exclusively) at one
193 level you can modify self.share_locks, setting a true value (usually 1) for
194 that level. By default locks are not shared.
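    For example, to acquire node locks in shared mode (a sketch; the level
    shown is only an illustration)::

      self.share_locks[locking.LEVEL_NODE] = 1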
196 This function can also define a list of tasklets, which then will be
197 executed in order instead of the usual LU-level CheckPrereq and Exec
198 functions, if those are not defined by the LU.
202 # Acquire all nodes and one instance
203 self.needed_locks = {
204 locking.LEVEL_NODE: locking.ALL_SET,
205 locking.LEVEL_INSTANCE: ['instance1.example.com'],
207 # Acquire just two nodes
208 self.needed_locks = {
209 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
212 self.needed_locks = {} # No, you can't leave it to the default value None
215 # The implementation of this method is mandatory only if the new LU is
216     # concurrent, so that old LUs don't need to be changed all at the same time
219 self.needed_locks = {} # Exclusive LUs don't need locks.
221 raise NotImplementedError
223 def DeclareLocks(self, level):
224 """Declare LU locking needs for a level
226 While most LUs can just declare their locking needs at ExpandNames time,
227 sometimes there's the need to calculate some locks after having acquired
228 the ones before. This function is called just before acquiring locks at a
229 particular level, but after acquiring the ones at lower levels, and permits
230 such calculations. It can be used to modify self.needed_locks, and by
231 default it does nothing.
233 This function is only called if you have something already set in
234 self.needed_locks for the level.
236 @param level: Locking level which is going to be locked
237 @type level: member of ganeti.locking.LEVELS
241 def CheckPrereq(self):
242 """Check prerequisites for this LU.
244 This method should check that the prerequisites for the execution
245 of this LU are fulfilled. It can do internode communication, but
246     it should be idempotent - no cluster or system changes are allowed.
249 The method should raise errors.OpPrereqError in case something is
250 not fulfilled. Its return value is ignored.
252 This method should also update all the parameters of the opcode to
253 their canonical form if it hasn't been done by ExpandNames before.
256 if self.tasklets is not None:
257 for (idx, tl) in enumerate(self.tasklets):
258 logging.debug("Checking prerequisites for tasklet %s/%s",
259 idx + 1, len(self.tasklets))
264 def Exec(self, feedback_fn):
267 This method should implement the actual work. It should raise
268 errors.OpExecError for failures that are somewhat dealt with in
272 if self.tasklets is not None:
273 for (idx, tl) in enumerate(self.tasklets):
274 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
277 raise NotImplementedError
279 def BuildHooksEnv(self):
280 """Build hooks environment for this LU.
283 @return: Dictionary containing the environment that will be used for
284 running the hooks for this LU. The keys of the dict must not be prefixed
285 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
286 will extend the environment with additional variables. If no environment
287 should be defined, an empty dictionary should be returned (not C{None}).
288 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
292 raise NotImplementedError
294 def BuildHooksNodes(self):
295 """Build list of nodes to run LU's hooks.
297 @rtype: tuple; (list, list)
298 @return: Tuple containing a list of node names on which the hook
299 should run before the execution and a list of node names on which the
300 hook should run after the execution. No nodes should be returned as an
301 empty list (and not None).
302 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
306 raise NotImplementedError
308 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
309 """Notify the LU about the results of its hooks.
311 This method is called every time a hooks phase is executed, and notifies
312 the Logical Unit about the hooks' result. The LU can then use it to alter
313 its result based on the hooks. By default the method does nothing and the
314 previous result is passed back unchanged but any LU can define it if it
315 wants to use the local cluster hook-scripts somehow.
317 @param phase: one of L{constants.HOOKS_PHASE_POST} or
318 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
319 @param hook_results: the results of the multi-node hooks rpc call
320     @param feedback_fn: function used to send feedback back to the caller
321 @param lu_result: the previous Exec result this LU had, or None
323 @return: the new Exec result, based on the previous result
327     # API must be kept, thus we ignore the unused-argument and
328     # could-be-a-function pylint warnings
329 # pylint: disable=W0613,R0201
332 def _ExpandAndLockInstance(self):
333 """Helper function to expand and lock an instance.
335 Many LUs that work on an instance take its name in self.op.instance_name
336 and need to expand it and then declare the expanded name for locking. This
337 function does it, and then updates self.op.instance_name to the expanded
338 name. It also initializes needed_locks as a dict, if this hasn't been done
342 if self.needed_locks is None:
343 self.needed_locks = {}
345 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
346 "_ExpandAndLockInstance called with instance-level locks set"
347 self.op.instance_name = _ExpandInstanceName(self.cfg,
348 self.op.instance_name)
349 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
351 def _LockInstancesNodes(self, primary_only=False):
352 """Helper function to declare instances' nodes for locking.
354 This function should be called after locking one or more instances to lock
355 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
356 with all primary or secondary nodes for instances already locked and
357 present in self.needed_locks[locking.LEVEL_INSTANCE].
359 It should be called from DeclareLocks, and for safety only works if
360 self.recalculate_locks[locking.LEVEL_NODE] is set.
362     In the future it may grow parameters to just lock some instances' nodes, or
363 to just lock primaries or secondary nodes, if needed.
365     It should be called in DeclareLocks in a way similar to::
367 if level == locking.LEVEL_NODE:
368 self._LockInstancesNodes()
370 @type primary_only: boolean
371 @param primary_only: only lock primary nodes of locked instances
374 assert locking.LEVEL_NODE in self.recalculate_locks, \
375 "_LockInstancesNodes helper function called with no nodes to recalculate"
377     # TODO: check if we've really been called with the instance locks held
379 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
380 # future we might want to have different behaviors depending on the value
381 # of self.recalculate_locks[locking.LEVEL_NODE]
383 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
384 for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
385 wanted_nodes.append(instance.primary_node)
387 wanted_nodes.extend(instance.secondary_nodes)
389 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
390 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
391 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
392 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
394 del self.recalculate_locks[locking.LEVEL_NODE]
397 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
398 """Simple LU which runs no hooks.
400 This LU is intended as a parent for other LogicalUnits which will
401 run no hooks, in order to reduce duplicate code.
407 def BuildHooksEnv(self):
408 """Empty BuildHooksEnv for NoHooksLu.
410 This just raises an error.
413 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
415 def BuildHooksNodes(self):
416 """Empty BuildHooksNodes for NoHooksLU.
419 raise AssertionError("BuildHooksNodes called for NoHooksLU")
423 """Tasklet base class.
425 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
426 they can mix legacy code with tasklets. Locking needs to be done in the LU,
427 tasklets know nothing about locks.
429 Subclasses must follow these rules:
430 - Implement CheckPrereq
434 def __init__(self, lu):
441 def CheckPrereq(self):
442 """Check prerequisites for this tasklets.
444 This method should check whether the prerequisites for the execution of
445 this tasklet are fulfilled. It can do internode communication, but it
446 should be idempotent - no cluster or system changes are allowed.
448 The method should raise errors.OpPrereqError in case something is not
449 fulfilled. Its return value is ignored.
451 This method should also update all parameters to their canonical form if it
452 hasn't been done before.
457 def Exec(self, feedback_fn):
458 """Execute the tasklet.
460 This method should implement the actual work. It should raise
461 errors.OpExecError for failures that are somewhat dealt with in code, or
465 raise NotImplementedError
469 """Base for query utility classes.
472 #: Attribute holding field definitions
475 def __init__(self, qfilter, fields, use_locking):
476 """Initializes this class.
479 self.use_locking = use_locking
481 self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
483 self.requested_data = self.query.RequestedData()
484 self.names = self.query.RequestedNames()
486 # Sort only if no names were requested
487 self.sort_by_name = not self.names
489 self.do_locking = None
492 def _GetNames(self, lu, all_names, lock_level):
493 """Helper function to determine names asked for in the query.
497 names = lu.owned_locks(lock_level)
501 if self.wanted == locking.ALL_SET:
502 assert not self.names
503 # caller didn't specify names, so ordering is not important
504 return utils.NiceSort(names)
506 # caller specified names and we must keep the same order
508 assert not self.do_locking or lu.glm.is_owned(lock_level)
510 missing = set(self.wanted).difference(names)
512 raise errors.OpExecError("Some items were removed before retrieving"
513 " their data: %s" % missing)
515 # Return expanded names
518 def ExpandNames(self, lu):
519 """Expand names for this query.
521 See L{LogicalUnit.ExpandNames}.
524 raise NotImplementedError()
526 def DeclareLocks(self, lu, level):
527 """Declare locks for this query.
529 See L{LogicalUnit.DeclareLocks}.
532 raise NotImplementedError()
534 def _GetQueryData(self, lu):
535 """Collects all data for this query.
537 @return: Query data object
540 raise NotImplementedError()
542 def NewStyleQuery(self, lu):
543 """Collect data and execute query.
546 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
547 sort_by_name=self.sort_by_name)
549 def OldStyleQuery(self, lu):
550 """Collect data and execute query.
553 return self.query.OldStyleQuery(self._GetQueryData(lu),
554 sort_by_name=self.sort_by_name)
558 """Returns a dict declaring all lock levels shared.
561 return dict.fromkeys(locking.LEVELS, 1)
564 def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
565 """Checks if the owned node groups are still correct for an instance.
567 @type cfg: L{config.ConfigWriter}
568 @param cfg: The cluster configuration
569 @type instance_name: string
570 @param instance_name: Instance name
571 @type owned_groups: set or frozenset
572 @param owned_groups: List of currently owned node groups
575 inst_groups = cfg.GetInstanceNodeGroups(instance_name)
577 if not owned_groups.issuperset(inst_groups):
578 raise errors.OpPrereqError("Instance %s's node groups changed since"
579 " locks were acquired, current groups are"
580 " are '%s', owning groups '%s'; retry the"
583 utils.CommaJoin(inst_groups),
584 utils.CommaJoin(owned_groups)),
590 def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
591 """Checks if the instances in a node group are still correct.
593 @type cfg: L{config.ConfigWriter}
594 @param cfg: The cluster configuration
595 @type group_uuid: string
596 @param group_uuid: Node group UUID
597 @type owned_instances: set or frozenset
598 @param owned_instances: List of currently owned instances
601 wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
602 if owned_instances != wanted_instances:
603 raise errors.OpPrereqError("Instances in node group '%s' changed since"
604 " locks were acquired, wanted '%s', have '%s';"
605 " retry the operation" %
607 utils.CommaJoin(wanted_instances),
608 utils.CommaJoin(owned_instances)),
611 return wanted_instances
614 def _SupportsOob(cfg, node):
615 """Tells if node supports OOB.
617 @type cfg: L{config.ConfigWriter}
618 @param cfg: The cluster configuration
619 @type node: L{objects.Node}
620 @param node: The node
621 @return: The OOB script if supported or an empty string otherwise
624 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
627 def _GetWantedNodes(lu, nodes):
628 """Returns list of checked and expanded node names.
630 @type lu: L{LogicalUnit}
631 @param lu: the logical unit on whose behalf we execute
633 @param nodes: list of node names or None for all nodes
635 @return: the list of nodes, sorted
636 @raise errors.ProgrammerError: if the nodes parameter is wrong type
640 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
642 return utils.NiceSort(lu.cfg.GetNodeList())
645 def _GetWantedInstances(lu, instances):
646 """Returns list of checked and expanded instance names.
648 @type lu: L{LogicalUnit}
649 @param lu: the logical unit on whose behalf we execute
650 @type instances: list
651 @param instances: list of instance names or None for all instances
653 @return: the list of instances, sorted
654 @raise errors.OpPrereqError: if the instances parameter is wrong type
655 @raise errors.OpPrereqError: if any of the passed instances is not found
659 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
661 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
665 def _GetUpdatedParams(old_params, update_dict,
666 use_default=True, use_none=False):
667 """Return the new version of a parameter dictionary.
669 @type old_params: dict
670 @param old_params: old parameters
671 @type update_dict: dict
672 @param update_dict: dict containing new parameter values, or
673 constants.VALUE_DEFAULT to reset the parameter to its default
675   @type use_default: boolean
676   @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
677       values as 'to be deleted' values
678   @type use_none: boolean
679   @param use_none: whether to recognise C{None} values as 'to be
682 @return: the new parameter dictionary
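
  A worked example (the parameter names and values are illustrative)::

    # old_params   = {"mem": 128, "cpus": 2}
    # update_dict  = {"mem": constants.VALUE_DEFAULT, "cpus": 4}
    # with use_default=True the result is {"cpus": 4}; "mem" is dropped so
    # that it falls back to its default value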
685 params_copy = copy.deepcopy(old_params)
686 for key, val in update_dict.iteritems():
687 if ((use_default and val == constants.VALUE_DEFAULT) or
688 (use_none and val is None)):
694 params_copy[key] = val
698 def _ReleaseLocks(lu, level, names=None, keep=None):
699 """Releases locks owned by an LU.
701 @type lu: L{LogicalUnit}
702 @param level: Lock level
703 @type names: list or None
704 @param names: Names of locks to release
705 @type keep: list or None
706 @param keep: Names of locks to retain
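
  A typical call from within an LU, keeping only the locks still needed
  (the opcode attribute shown is hypothetical)::

    _ReleaseLocks(self, locking.LEVEL_NODE,
                  keep=[self.op.target_node])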
709 assert not (keep is not None and names is not None), \
710 "Only one of the 'names' and the 'keep' parameters can be given"
712 if names is not None:
713 should_release = names.__contains__
715 should_release = lambda name: name not in keep
717 should_release = None
723 # Determine which locks to release
724 for name in lu.owned_locks(level):
725 if should_release(name):
730 assert len(lu.owned_locks(level)) == (len(retain) + len(release))
732 # Release just some locks
733 lu.glm.release(level, names=release)
735 assert frozenset(lu.owned_locks(level)) == frozenset(retain)
738 lu.glm.release(level)
740 assert not lu.glm.is_owned(level), "No locks should be owned"
743 def _MapInstanceDisksToNodes(instances):
744 """Creates a map from (node, volume) to instance name.
746 @type instances: list of L{objects.Instance}
747 @rtype: dict; tuple of (node name, volume name) as key, instance name as value
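
  The result maps each (node, volume) pair to its owning instance, e.g.
  (names are illustrative)::

    {("node1.example.com", "xenvg/disk0"): "inst1.example.com",
     ("node2.example.com", "xenvg/disk0"): "inst1.example.com"}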
750 return dict(((node, vol), inst.name)
751 for inst in instances
752 for (node, vols) in inst.MapLVsByNode().items()
756 def _RunPostHook(lu, node_name):
757 """Runs the post-hook for an opcode on a single node.
760 hm = lu.proc.hmclass(lu.rpc.call_hooks_runner, lu)
762 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
764 # pylint: disable=W0702
765 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
768 def _CheckOutputFields(static, dynamic, selected):
769 """Checks whether all selected fields are valid.
771 @type static: L{utils.FieldSet}
772 @param static: static fields set
773 @type dynamic: L{utils.FieldSet}
774 @param dynamic: dynamic fields set
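
  A usage sketch (the field names are illustrative)::

    _CheckOutputFields(static=utils.FieldSet("name"),
                       dynamic=utils.FieldSet("free_memory"),
                       selected=self.op.output_fields)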
781 delta = f.NonMatching(selected)
783 raise errors.OpPrereqError("Unknown output fields selected: %s"
784 % ",".join(delta), errors.ECODE_INVAL)
787 def _CheckGlobalHvParams(params):
788 """Validates that given hypervisor params are not global ones.
790 This will ensure that instances don't get customised versions of
794 used_globals = constants.HVC_GLOBALS.intersection(params)
796 msg = ("The following hypervisor parameters are global and cannot"
797 " be customized at instance level, please modify them at"
798 " cluster level: %s" % utils.CommaJoin(used_globals))
799 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
802 def _CheckNodeOnline(lu, node, msg=None):
803 """Ensure that a given node is online.
805 @param lu: the LU on behalf of which we make the check
806 @param node: the node to check
807 @param msg: if passed, should be a message to replace the default one
808 @raise errors.OpPrereqError: if the node is offline
812 msg = "Can't use offline node"
813 if lu.cfg.GetNodeInfo(node).offline:
814 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
817 def _CheckNodeNotDrained(lu, node):
818 """Ensure that a given node is not drained.
820 @param lu: the LU on behalf of which we make the check
821 @param node: the node to check
822 @raise errors.OpPrereqError: if the node is drained
825 if lu.cfg.GetNodeInfo(node).drained:
826 raise errors.OpPrereqError("Can't use drained node %s" % node,
830 def _CheckNodeVmCapable(lu, node):
831 """Ensure that a given node is vm capable.
833 @param lu: the LU on behalf of which we make the check
834 @param node: the node to check
835 @raise errors.OpPrereqError: if the node is not vm capable
838 if not lu.cfg.GetNodeInfo(node).vm_capable:
839 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
843 def _CheckNodeHasOS(lu, node, os_name, force_variant):
844 """Ensure that a node supports a given OS.
846 @param lu: the LU on behalf of which we make the check
847 @param node: the node to check
848 @param os_name: the OS to query about
849 @param force_variant: whether to ignore variant errors
850 @raise errors.OpPrereqError: if the node is not supporting the OS
853 result = lu.rpc.call_os_get(node, os_name)
854 result.Raise("OS '%s' not in supported OS list for node %s" %
856 prereq=True, ecode=errors.ECODE_INVAL)
857 if not force_variant:
858 _CheckOSVariant(result.payload, os_name)
861 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
862 """Ensure that a node has the given secondary ip.
864 @type lu: L{LogicalUnit}
865 @param lu: the LU on behalf of which we make the check
867 @param node: the node to check
868 @type secondary_ip: string
869 @param secondary_ip: the ip to check
870 @type prereq: boolean
871 @param prereq: whether to throw a prerequisite or an execute error
872 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
873 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
876 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
877 result.Raise("Failure checking secondary ip on node %s" % node,
878 prereq=prereq, ecode=errors.ECODE_ENVIRON)
879 if not result.payload:
880 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
881 " please fix and re-run this command" % secondary_ip)
883 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
885 raise errors.OpExecError(msg)
888 def _GetClusterDomainSecret():
889 """Reads the cluster domain secret.
892 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
896 def _CheckInstanceDown(lu, instance, reason):
897 """Ensure that an instance is not running."""
898 if instance.admin_up:
899 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
900 (instance.name, reason), errors.ECODE_STATE)
902 pnode = instance.primary_node
903 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
904 ins_l.Raise("Can't contact node %s for instance information" % pnode,
905 prereq=True, ecode=errors.ECODE_ENVIRON)
907 if instance.name in ins_l.payload:
908 raise errors.OpPrereqError("Instance %s is running, %s" %
909 (instance.name, reason), errors.ECODE_STATE)
912 def _ExpandItemName(fn, name, kind):
913 """Expand an item name.
915 @param fn: the function to use for expansion
916 @param name: requested item name
917 @param kind: text description ('Node' or 'Instance')
918 @return: the resolved (full) name
919 @raise errors.OpPrereqError: if the item is not found
923 if full_name is None:
924 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
929 def _ExpandNodeName(cfg, name):
930 """Wrapper over L{_ExpandItemName} for nodes."""
931 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
934 def _ExpandInstanceName(cfg, name):
935 """Wrapper over L{_ExpandItemName} for instance."""
936 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
939 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
940 memory, vcpus, nics, disk_template, disks,
941 bep, hvp, hypervisor_name, tags):
942 """Builds instance related env variables for hooks
944 This builds the hook environment from individual variables.
947 @param name: the name of the instance
948 @type primary_node: string
949 @param primary_node: the name of the instance's primary node
950 @type secondary_nodes: list
951 @param secondary_nodes: list of secondary nodes as strings
952 @type os_type: string
953 @param os_type: the name of the instance's OS
954 @type status: boolean
955 @param status: the should_run status of the instance
957 @param memory: the memory size of the instance
959 @param vcpus: the count of VCPUs the instance has
961 @param nics: list of tuples (ip, mac, mode, link) representing
962 the NICs the instance has
963 @type disk_template: string
964 @param disk_template: the disk template of the instance
966 @param disks: the list of (size, mode) pairs
968 @param bep: the backend parameters for the instance
970 @param hvp: the hypervisor parameters for the instance
971 @type hypervisor_name: string
972 @param hypervisor_name: the hypervisor for the instance
974 @param tags: list of instance tags as strings
976 @return: the hook environment for this instance
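
  An excerpt of the resulting environment (values are illustrative; the
  "GANETI_" prefix is added later by the hooks runner)::

    {"INSTANCE_NAME": "inst1.example.com",
     "INSTANCE_PRIMARY": "node1.example.com",
     "INSTANCE_NIC_COUNT": 1,
     "INSTANCE_NIC0_MAC": "aa:00:00:35:cd:1a",
     "INSTANCE_DISK_COUNT": 1,
     "INSTANCE_DISK0_SIZE": 1024}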
985 "INSTANCE_NAME": name,
986 "INSTANCE_PRIMARY": primary_node,
987 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
988 "INSTANCE_OS_TYPE": os_type,
989 "INSTANCE_STATUS": str_status,
990 "INSTANCE_MEMORY": memory,
991 "INSTANCE_VCPUS": vcpus,
992 "INSTANCE_DISK_TEMPLATE": disk_template,
993 "INSTANCE_HYPERVISOR": hypervisor_name,
997 nic_count = len(nics)
998 for idx, (ip, mac, mode, link) in enumerate(nics):
1001 env["INSTANCE_NIC%d_IP" % idx] = ip
1002 env["INSTANCE_NIC%d_MAC" % idx] = mac
1003 env["INSTANCE_NIC%d_MODE" % idx] = mode
1004 env["INSTANCE_NIC%d_LINK" % idx] = link
1005 if mode == constants.NIC_MODE_BRIDGED:
1006 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
1010 env["INSTANCE_NIC_COUNT"] = nic_count
1013 disk_count = len(disks)
1014 for idx, (size, mode) in enumerate(disks):
1015 env["INSTANCE_DISK%d_SIZE" % idx] = size
1016 env["INSTANCE_DISK%d_MODE" % idx] = mode
1020 env["INSTANCE_DISK_COUNT"] = disk_count
1025 env["INSTANCE_TAGS"] = " ".join(tags)
1027 for source, kind in [(bep, "BE"), (hvp, "HV")]:
1028 for key, value in source.items():
1029 env["INSTANCE_%s_%s" % (kind, key)] = value
1034 def _NICListToTuple(lu, nics):
1035 """Build a list of nic information tuples.
1037 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
1038 value in LUInstanceQueryData.
1040 @type lu: L{LogicalUnit}
1041 @param lu: the logical unit on whose behalf we execute
1042 @type nics: list of L{objects.NIC}
1043 @param nics: list of nics to convert to hooks tuples
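
  Each NIC becomes an (ip, mac, mode, link) tuple, e.g. (values are
  illustrative)::

    [("198.51.100.10", "aa:00:00:35:cd:1a", constants.NIC_MODE_BRIDGED,
      "xen-br0")]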
1047 cluster = lu.cfg.GetClusterInfo()
1051 filled_params = cluster.SimpleFillNIC(nic.nicparams)
1052 mode = filled_params[constants.NIC_MODE]
1053 link = filled_params[constants.NIC_LINK]
1054 hooks_nics.append((ip, mac, mode, link))
1058 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
1059 """Builds instance related env variables for hooks from an object.
1061 @type lu: L{LogicalUnit}
1062 @param lu: the logical unit on whose behalf we execute
1063 @type instance: L{objects.Instance}
1064 @param instance: the instance for which we should build the
1066 @type override: dict
1067 @param override: dictionary with key/values that will override
1070 @return: the hook environment dictionary
1073 cluster = lu.cfg.GetClusterInfo()
1074 bep = cluster.FillBE(instance)
1075 hvp = cluster.FillHV(instance)
1077 "name": instance.name,
1078 "primary_node": instance.primary_node,
1079 "secondary_nodes": instance.secondary_nodes,
1080 "os_type": instance.os,
1081 "status": instance.admin_up,
1082 "memory": bep[constants.BE_MEMORY],
1083 "vcpus": bep[constants.BE_VCPUS],
1084 "nics": _NICListToTuple(lu, instance.nics),
1085 "disk_template": instance.disk_template,
1086 "disks": [(disk.size, disk.mode) for disk in instance.disks],
1089 "hypervisor_name": instance.hypervisor,
1090 "tags": instance.tags,
1093 args.update(override)
1094 return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
1097 def _AdjustCandidatePool(lu, exceptions):
1098 """Adjust the candidate pool after node operations.
1101 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1103 lu.LogInfo("Promoted nodes to master candidate role: %s",
1104 utils.CommaJoin(node.name for node in mod_list))
1105 for name in mod_list:
1106 lu.context.ReaddNode(name)
1107 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1109 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1113 def _DecideSelfPromotion(lu, exceptions=None):
1114 """Decide whether I should promote myself as a master candidate.
1117 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1118 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1119   # the new node will increase mc_max by one, so:
1120 mc_should = min(mc_should + 1, cp_size)
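  # e.g. (illustrative numbers) with candidate_pool_size=10, mc_now=3 and
  # mc_should=3 before the adjustment, mc_should becomes 4 and the new node
  # promotes itself (3 < 4)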
1121 return mc_now < mc_should
1124 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1125 """Check that the brigdes needed by a list of nics exist.
1128 cluster = lu.cfg.GetClusterInfo()
1129 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1130 brlist = [params[constants.NIC_LINK] for params in paramslist
1131 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1133 result = lu.rpc.call_bridges_exist(target_node, brlist)
1134 result.Raise("Error checking bridges on destination node '%s'" %
1135 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1138 def _CheckInstanceBridgesExist(lu, instance, node=None):
1139 """Check that the brigdes needed by an instance exist.
1143 node = instance.primary_node
1144 _CheckNicsBridgesExist(lu, instance.nics, node)
1147 def _CheckOSVariant(os_obj, name):
1148 """Check whether an OS name conforms to the os variants specification.
1150 @type os_obj: L{objects.OS}
1151 @param os_obj: OS object to check
1153 @param name: OS name passed by the user, to check for validity
1156 variant = objects.OS.GetVariant(name)
1157 if not os_obj.supported_variants:
1159 raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
1160 " passed)" % (os_obj.name, variant),
1164 raise errors.OpPrereqError("OS name must include a variant",
1167 if variant not in os_obj.supported_variants:
1168 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1171 def _GetNodeInstancesInner(cfg, fn):
1172 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1175 def _GetNodeInstances(cfg, node_name):
1176 """Returns a list of all primary and secondary instances on a node.
1180 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1183 def _GetNodePrimaryInstances(cfg, node_name):
1184 """Returns primary instances on a node.
1187 return _GetNodeInstancesInner(cfg,
1188 lambda inst: node_name == inst.primary_node)
1191 def _GetNodeSecondaryInstances(cfg, node_name):
1192 """Returns secondary instances on a node.
1195 return _GetNodeInstancesInner(cfg,
1196 lambda inst: node_name in inst.secondary_nodes)
1199 def _GetStorageTypeArgs(cfg, storage_type):
1200 """Returns the arguments for a storage type.
1203 # Special case for file storage
1204 if storage_type == constants.ST_FILE:
1205 # storage.FileStorage wants a list of storage directories
1206 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1211 def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
1214 for dev in instance.disks:
1215 cfg.SetDiskID(dev, node_name)
1217 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1218 result.Raise("Failed to get disk status from node %s" % node_name,
1219 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1221 for idx, bdev_status in enumerate(result.payload):
1222 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1228 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1229 """Check the sanity of iallocator and node arguments and use the
1230 cluster-wide iallocator if appropriate.
1232 Check that at most one of (iallocator, node) is specified. If none is
1233 specified, then the LU's opcode's iallocator slot is filled with the
1234 cluster-wide default iallocator.
1236 @type iallocator_slot: string
1237 @param iallocator_slot: the name of the opcode iallocator slot
1238 @type node_slot: string
1239 @param node_slot: the name of the opcode target node slot
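
  A usage sketch from an LU's CheckArguments (the slot names depend on the
  opcode and are only an example here)::

    _CheckIAllocatorOrNode(self, "iallocator", "remote_node")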
1242 node = getattr(lu.op, node_slot, None)
1243 iallocator = getattr(lu.op, iallocator_slot, None)
1245 if node is not None and iallocator is not None:
1246 raise errors.OpPrereqError("Do not specify both, iallocator and node",
1248 elif node is None and iallocator is None:
1249 default_iallocator = lu.cfg.GetDefaultIAllocator()
1250 if default_iallocator:
1251 setattr(lu.op, iallocator_slot, default_iallocator)
1253 raise errors.OpPrereqError("No iallocator or node given and no"
1254 " cluster-wide default iallocator found;"
1255 " please specify either an iallocator or a"
1256 " node, or set a cluster-wide default"
1260 def _GetDefaultIAllocator(cfg, iallocator):
1261 """Decides on which iallocator to use.
1263 @type cfg: L{config.ConfigWriter}
1264 @param cfg: Cluster configuration object
1265 @type iallocator: string or None
1266 @param iallocator: Iallocator specified in opcode
1268 @return: Iallocator name
1272 # Use default iallocator
1273 iallocator = cfg.GetDefaultIAllocator()
1276 raise errors.OpPrereqError("No iallocator was specified, neither in the"
1277 " opcode nor as a cluster-wide default",
1283 class LUClusterPostInit(LogicalUnit):
1284 """Logical unit for running hooks after cluster initialization.
1287 HPATH = "cluster-init"
1288 HTYPE = constants.HTYPE_CLUSTER
1290 def BuildHooksEnv(self):
1295 "OP_TARGET": self.cfg.GetClusterName(),
1298 def BuildHooksNodes(self):
1299 """Build hooks nodes.
1302 return ([], [self.cfg.GetMasterNode()])
1304 def Exec(self, feedback_fn):
1311 class LUClusterDestroy(LogicalUnit):
1312 """Logical unit for destroying the cluster.
1315 HPATH = "cluster-destroy"
1316 HTYPE = constants.HTYPE_CLUSTER
1318 def BuildHooksEnv(self):
1323 "OP_TARGET": self.cfg.GetClusterName(),
1326 def BuildHooksNodes(self):
1327 """Build hooks nodes.
1332 def CheckPrereq(self):
1333 """Check prerequisites.
1335 This checks whether the cluster is empty.
1337 Any errors are signaled by raising errors.OpPrereqError.
1340 master = self.cfg.GetMasterNode()
1342 nodelist = self.cfg.GetNodeList()
1343 if len(nodelist) != 1 or nodelist[0] != master:
1344 raise errors.OpPrereqError("There are still %d node(s) in"
1345 " this cluster." % (len(nodelist) - 1),
1347 instancelist = self.cfg.GetInstanceList()
1349 raise errors.OpPrereqError("There are still %d instance(s) in"
1350 " this cluster." % len(instancelist),
1353 def Exec(self, feedback_fn):
1354 """Destroys the cluster.
1357 master = self.cfg.GetMasterNode()
1359 # Run post hooks on master node before it's removed
1360 _RunPostHook(self, master)
1362 result = self.rpc.call_node_deactivate_master_ip(master)
1363 result.Raise("Could not disable the master role")
1368 def _VerifyCertificate(filename):
1369 """Verifies a certificate for L{LUClusterVerifyConfig}.
1371 @type filename: string
1372 @param filename: Path to PEM file
1376 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1377 utils.ReadFile(filename))
1378 except Exception, err: # pylint: disable=W0703
1379 return (LUClusterVerifyConfig.ETYPE_ERROR,
1380 "Failed to load X509 certificate %s: %s" % (filename, err))
1383 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1384 constants.SSL_CERT_EXPIRATION_ERROR)
1387 fnamemsg = "While verifying %s: %s" % (filename, msg)
1392 return (None, fnamemsg)
1393 elif errcode == utils.CERT_WARNING:
1394 return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
1395 elif errcode == utils.CERT_ERROR:
1396 return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
1398 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1401 def _GetAllHypervisorParameters(cluster, instances):
1402 """Compute the set of all hypervisor parameters.
1404 @type cluster: L{objects.Cluster}
1405 @param cluster: the cluster object
1406   @type instances: list of L{objects.Instance}
1407 @param instances: additional instances from which to obtain parameters
1408 @rtype: list of (origin, hypervisor, parameters)
1409 @return: a list with all parameters found, indicating the hypervisor they
1410 apply to, and the origin (can be "cluster", "os X", or "instance Y")
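
  An illustrative return value (hypervisor name and parameters are only an
  example)::

    [("cluster", "xen-pvm", {"kernel_path": "/boot/vmlinuz-2.6-xenU"}),
     ("os debian-image", "xen-pvm", {"kernel_path": "/boot/vmlinuz-2.6-xenU"}),
     ("instance inst1.example.com", "xen-pvm",
      {"kernel_path": "/boot/vmlinuz-2.6-xenU"})]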
1415 for hv_name in cluster.enabled_hypervisors:
1416 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1418 for os_name, os_hvp in cluster.os_hvp.items():
1419 for hv_name, hv_params in os_hvp.items():
1421 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1422 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1424 # TODO: collapse identical parameter values in a single one
1425 for instance in instances:
1426 if instance.hvparams:
1427 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1428 cluster.FillHV(instance)))
1433 class _VerifyErrors(object):
1434 """Mix-in for cluster/group verify LUs.
1436 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1437 self.op and self._feedback_fn to be available.)
1441 ETYPE_FIELD = "code"
1442 ETYPE_ERROR = "ERROR"
1443 ETYPE_WARNING = "WARNING"
1445 def _Error(self, ecode, item, msg, *args, **kwargs):
1446 """Format an error message.
1448 Based on the opcode's error_codes parameter, either format a
1449 parseable error code, or a simpler error string.
1451 This must be called only from Exec and functions called from Exec.
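
    Depending on C{self.op.error_codes} the reported line is either the
    machine-parseable or the human-readable form; both lines below are
    illustrative only::

      ERROR:ENODESSH:node:node1.example.com:ssh communication failed
      ERROR: node node1.example.com: ssh communication failed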
1454 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1455 itype, etxt, _ = ecode
1456 # first complete the msg
1459 # then format the whole message
1460 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
1461 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1467 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1468 # and finally report it via the feedback_fn
1469 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
1471 def _ErrorIf(self, cond, ecode, *args, **kwargs):
1472 """Log an error message if the passed condition is True.
1476 or self.op.debug_simulate_errors) # pylint: disable=E1101
1478     # If the error code is in the list of ignored errors, demote the error to a warning
1480 (_, etxt, _) = ecode
1481 if etxt in self.op.ignore_errors: # pylint: disable=E1101
1482 kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING
1485 self._Error(ecode, *args, **kwargs)
1487 # do not mark the operation as failed for WARN cases only
1488 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1489 self.bad = self.bad or cond
1492 class LUClusterVerify(NoHooksLU):
1493 """Submits all jobs necessary to verify the cluster.
1498 def ExpandNames(self):
1499 self.needed_locks = {}
1501 def Exec(self, feedback_fn):
1504 if self.op.group_name:
1505 groups = [self.op.group_name]
1506 depends_fn = lambda: None
1508 groups = self.cfg.GetNodeGroupList()
1510 # Verify global configuration
1512 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
1515 # Always depend on global verification
1516 depends_fn = lambda: [(-len(jobs), [])]
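      # The dependency is expressed relative to this submission: each
      # group-verification job added below refers back to the job submitted
      # len(jobs) positions earlier, i.e. the config-verification job above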
1518 jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
1519 ignore_errors=self.op.ignore_errors,
1520 depends=depends_fn())]
1521 for group in groups)
1523 # Fix up all parameters
1524 for op in itertools.chain(*jobs): # pylint: disable=W0142
1525 op.debug_simulate_errors = self.op.debug_simulate_errors
1526 op.verbose = self.op.verbose
1527 op.error_codes = self.op.error_codes
1529 op.skip_checks = self.op.skip_checks
1530 except AttributeError:
1531 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1533 return ResultWithJobs(jobs)
1536 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1537 """Verifies the cluster config.
1542 def _VerifyHVP(self, hvp_data):
1543 """Verifies locally the syntax of the hypervisor parameters.
1546 for item, hv_name, hv_params in hvp_data:
1547 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
1550 hv_class = hypervisor.GetHypervisor(hv_name)
1551 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1552 hv_class.CheckParameterSyntax(hv_params)
1553 except errors.GenericError, err:
1554 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
1556 def ExpandNames(self):
1557     # Information can be safely retrieved as the BGL is acquired in exclusive mode
1559 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
1560 self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
1561 self.all_node_info = self.cfg.GetAllNodesInfo()
1562 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1563 self.needed_locks = {}
1565 def Exec(self, feedback_fn):
1566 """Verify integrity of cluster, performing various test on nodes.
1570 self._feedback_fn = feedback_fn
1572 feedback_fn("* Verifying cluster config")
1574 for msg in self.cfg.VerifyConfig():
1575 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
1577 feedback_fn("* Verifying cluster certificate files")
1579 for cert_filename in constants.ALL_CERT_FILES:
1580 (errcode, msg) = _VerifyCertificate(cert_filename)
1581 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)
1583 feedback_fn("* Verifying hypervisor parameters")
1585 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1586 self.all_inst_info.values()))
1588 feedback_fn("* Verifying all nodes belong to an existing group")
1590 # We do this verification here because, should this bogus circumstance
1591 # occur, it would never be caught by VerifyGroup, which only acts on
1592 # nodes/instances reachable from existing node groups.
1594 dangling_nodes = set(node.name for node in self.all_node_info.values()
1595 if node.group not in self.all_group_info)
1597 dangling_instances = {}
1598 no_node_instances = []
1600 for inst in self.all_inst_info.values():
1601 if inst.primary_node in dangling_nodes:
1602 dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
1603 elif inst.primary_node not in self.all_node_info:
1604 no_node_instances.append(inst.name)
1609 utils.CommaJoin(dangling_instances.get(node.name,
1611 for node in dangling_nodes]
1613 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
1615 "the following nodes (and their instances) belong to a non"
1616 " existing group: %s", utils.CommaJoin(pretty_dangling))
1618 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
1620 "the following instances have a non-existing primary-node:"
1621 " %s", utils.CommaJoin(no_node_instances))
1626 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1627 """Verifies the status of a node group.
1630 HPATH = "cluster-verify"
1631 HTYPE = constants.HTYPE_CLUSTER
1634 _HOOKS_INDENT_RE = re.compile("^", re.M)
1636 class NodeImage(object):
1637 """A class representing the logical and physical status of a node.
1640 @ivar name: the node name to which this object refers
1641 @ivar volumes: a structure as returned from
1642 L{ganeti.backend.GetVolumeList} (runtime)
1643 @ivar instances: a list of running instances (runtime)
1644 @ivar pinst: list of configured primary instances (config)
1645 @ivar sinst: list of configured secondary instances (config)
1646 @ivar sbp: dictionary of {primary-node: list of instances} for all
1647 instances for which this node is secondary (config)
1648 @ivar mfree: free memory, as reported by hypervisor (runtime)
1649 @ivar dfree: free disk, as reported by the node (runtime)
1650 @ivar offline: the offline status (config)
1651 @type rpc_fail: boolean
1652     @ivar rpc_fail: whether the RPC verify call was successful (overall,
1653 not whether the individual keys were correct) (runtime)
1654 @type lvm_fail: boolean
1655 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1656 @type hyp_fail: boolean
1657 @ivar hyp_fail: whether the RPC call didn't return the instance list
1658 @type ghost: boolean
1659 @ivar ghost: whether this is a known node or not (config)
1660 @type os_fail: boolean
1661 @ivar os_fail: whether the RPC call didn't return valid OS data
1663 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1664 @type vm_capable: boolean
1665 @ivar vm_capable: whether the node can host instances
1668 def __init__(self, offline=False, name=None, vm_capable=True):
1677 self.offline = offline
1678 self.vm_capable = vm_capable
1679 self.rpc_fail = False
1680 self.lvm_fail = False
1681 self.hyp_fail = False
1683 self.os_fail = False
1686 def ExpandNames(self):
1687 # This raises errors.OpPrereqError on its own:
1688 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
1690 # Get instances in node group; this is unsafe and needs verification later
1691 inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
1693 self.needed_locks = {
1694 locking.LEVEL_INSTANCE: inst_names,
1695 locking.LEVEL_NODEGROUP: [self.group_uuid],
1696 locking.LEVEL_NODE: [],
1699 self.share_locks = _ShareAll()
1701 def DeclareLocks(self, level):
1702 if level == locking.LEVEL_NODE:
1703 # Get members of node group; this is unsafe and needs verification later
1704 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
1706 all_inst_info = self.cfg.GetAllInstancesInfo()
1708 # In Exec(), we warn about mirrored instances that have primary and
1709 # secondary living in separate node groups. To fully verify that
1710 # volumes for these instances are healthy, we will need to do an
1711 # extra call to their secondaries. We ensure here those nodes will
1713 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
1714 # Important: access only the instances whose lock is owned
1715 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
1716 nodes.update(all_inst_info[inst].secondary_nodes)
1718 self.needed_locks[locking.LEVEL_NODE] = nodes
1720 def CheckPrereq(self):
1721 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1722 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1724 group_nodes = set(self.group_info.members)
1725 group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
1728 group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1730 unlocked_instances = \
1731 group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
1734 raise errors.OpPrereqError("Missing lock for nodes: %s" %
1735 utils.CommaJoin(unlocked_nodes))
1737 if unlocked_instances:
1738 raise errors.OpPrereqError("Missing lock for instances: %s" %
1739 utils.CommaJoin(unlocked_instances))
1741 self.all_node_info = self.cfg.GetAllNodesInfo()
1742 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1744 self.my_node_names = utils.NiceSort(group_nodes)
1745 self.my_inst_names = utils.NiceSort(group_instances)
1747 self.my_node_info = dict((name, self.all_node_info[name])
1748 for name in self.my_node_names)
1750 self.my_inst_info = dict((name, self.all_inst_info[name])
1751 for name in self.my_inst_names)
1753 # We detect here the nodes that will need the extra RPC calls for verifying
1754 # split LV volumes; they should be locked.
1755 extra_lv_nodes = set()
1757 for inst in self.my_inst_info.values():
1758 if inst.disk_template in constants.DTS_INT_MIRROR:
1759 group = self.my_node_info[inst.primary_node].group
1760 for nname in inst.secondary_nodes:
1761 if self.all_node_info[nname].group != group:
1762 extra_lv_nodes.add(nname)
1764 unlocked_lv_nodes = \
1765 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1767 if unlocked_lv_nodes:
1768       raise errors.OpPrereqError("Missing lock for LV-check nodes: %s" %
1769 utils.CommaJoin(unlocked_lv_nodes))
1770 self.extra_lv_nodes = list(extra_lv_nodes)
1772 def _VerifyNode(self, ninfo, nresult):
1773 """Perform some basic validation on data returned from a node.
1775 - check the result data structure is well formed and has all the
1777 - check ganeti version
1779 @type ninfo: L{objects.Node}
1780 @param ninfo: the node to check
1781 @param nresult: the results from the node
1783 @return: whether overall this call was successful (and we can expect
1784         reasonable values in the response)
1788 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1790 # main result, nresult should be a non-empty dict
1791 test = not nresult or not isinstance(nresult, dict)
1792 _ErrorIf(test, constants.CV_ENODERPC, node,
1793 "unable to verify node: no data returned")
1797 # compares ganeti version
1798 local_version = constants.PROTOCOL_VERSION
1799 remote_version = nresult.get("version", None)
1800 test = not (remote_version and
1801 isinstance(remote_version, (list, tuple)) and
1802 len(remote_version) == 2)
1803 _ErrorIf(test, constants.CV_ENODERPC, node,
1804 "connection to node returned invalid data")
1808 test = local_version != remote_version[0]
1809 _ErrorIf(test, constants.CV_ENODEVERSION, node,
1810 "incompatible protocol versions: master %s,"
1811 " node %s", local_version, remote_version[0])
1815 # node seems compatible, we can actually try to look into its results
1817 # full package version
1818 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1819 constants.CV_ENODEVERSION, node,
1820 "software version mismatch: master %s, node %s",
1821 constants.RELEASE_VERSION, remote_version[1],
1822 code=self.ETYPE_WARNING)
1824 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1825 if ninfo.vm_capable and isinstance(hyp_result, dict):
1826 for hv_name, hv_result in hyp_result.iteritems():
1827 test = hv_result is not None
1828 _ErrorIf(test, constants.CV_ENODEHV, node,
1829 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1831 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1832 if ninfo.vm_capable and isinstance(hvp_result, list):
1833 for item, hv_name, hv_result in hvp_result:
1834 _ErrorIf(True, constants.CV_ENODEHV, node,
1835 "hypervisor %s parameter verify failure (source %s): %s",
1836 hv_name, item, hv_result)
1838 test = nresult.get(constants.NV_NODESETUP,
1839 ["Missing NODESETUP results"])
1840 _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
1845 def _VerifyNodeTime(self, ninfo, nresult,
1846 nvinfo_starttime, nvinfo_endtime):
1847 """Check the node time.
1849 @type ninfo: L{objects.Node}
1850 @param ninfo: the node to check
1851 @param nresult: the remote results for the node
1852 @param nvinfo_starttime: the start time of the RPC call
1853 @param nvinfo_endtime: the end time of the RPC call
1857 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1859 ntime = nresult.get(constants.NV_TIME, None)
1861 ntime_merged = utils.MergeTime(ntime)
1862 except (ValueError, TypeError):
1863 _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
1866 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1867 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1868 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1869 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1873 _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
1874 "Node time diverges by at least %s from master node time",
1877 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1878 """Check the node LVM results.
1880 @type ninfo: L{objects.Node}
1881 @param ninfo: the node to check
1882 @param nresult: the remote results for the node
1883 @param vg_name: the configured VG name
1890 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1892 # checks vg existence and size > 20G
1893 vglist = nresult.get(constants.NV_VGLIST, None)
1895 _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
1897 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1898 constants.MIN_VG_SIZE)
1899 _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)
1902 pvlist = nresult.get(constants.NV_PVLIST, None)
1903 test = pvlist is None
1904 _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
1906 # check that ':' is not present in PV names, since it's a
1907 # special character for lvcreate (denotes the range of PEs to
1909 for _, pvname, owner_vg in pvlist:
1910 test = ":" in pvname
1911 _ErrorIf(test, constants.CV_ENODELVM, node,
1912 "Invalid character ':' in PV '%s' of VG '%s'",
1915 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1916 """Check the node bridges.
1918 @type ninfo: L{objects.Node}
1919 @param ninfo: the node to check
1920 @param nresult: the remote results for the node
1921 @param bridges: the expected list of bridges
1928 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1930 missing = nresult.get(constants.NV_BRIDGES, None)
1931 test = not isinstance(missing, list)
1932 _ErrorIf(test, constants.CV_ENODENET, node,
1933 "did not return valid bridge information")
1935 _ErrorIf(bool(missing), constants.CV_ENODENET, node,
1936 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
1938 def _VerifyNodeNetwork(self, ninfo, nresult):
1939 """Check the node network connectivity results.
1941 @type ninfo: L{objects.Node}
1942 @param ninfo: the node to check
1943 @param nresult: the remote results for the node
1947 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1949 test = constants.NV_NODELIST not in nresult
1950 _ErrorIf(test, constants.CV_ENODESSH, node,
1951 "node hasn't returned node ssh connectivity data")
1953 if nresult[constants.NV_NODELIST]:
1954 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1955 _ErrorIf(True, constants.CV_ENODESSH, node,
1956 "ssh communication with node '%s': %s", a_node, a_msg)
1958 test = constants.NV_NODENETTEST not in nresult
1959 _ErrorIf(test, constants.CV_ENODENET, node,
1960 "node hasn't returned node tcp connectivity data")
1962 if nresult[constants.NV_NODENETTEST]:
1963 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1965 _ErrorIf(True, constants.CV_ENODENET, node,
1966 "tcp communication with node '%s': %s",
1967 anode, nresult[constants.NV_NODENETTEST][anode])
1969 test = constants.NV_MASTERIP not in nresult
1970 _ErrorIf(test, constants.CV_ENODENET, node,
1971 "node hasn't returned node master IP reachability data")
1973 if not nresult[constants.NV_MASTERIP]:
1974 if node == self.master_node:
1975 msg = "the master node cannot reach the master IP (not configured?)"
1977 msg = "cannot reach the master IP"
1978 _ErrorIf(True, constants.CV_ENODENET, node, msg)
1980 def _VerifyInstance(self, instance, instanceconfig, node_image,
1982 """Verify an instance.
1984 This function checks to see if the required block devices are
1985 available on the instance's node.
1988 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1989 node_current = instanceconfig.primary_node
1991 node_vol_should = {}
1992 instanceconfig.MapLVsByNode(node_vol_should)
1994 for node in node_vol_should:
1995 n_img = node_image[node]
1996 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1997 # ignore missing volumes on offline or broken nodes
1999 for volume in node_vol_should[node]:
2000 test = volume not in n_img.volumes
2001 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
2002 "volume %s missing on node %s", volume, node)
2004 if instanceconfig.admin_up:
2005 pri_img = node_image[node_current]
2006 test = instance not in pri_img.instances and not pri_img.offline
2007 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
2008 "instance not running on its primary node %s",
2011 diskdata = [(nname, success, status, idx)
2012 for (nname, disks) in diskstatus.items()
2013 for idx, (success, status) in enumerate(disks)]
2015 for nname, success, bdev_status, idx in diskdata:
2016 # the 'ghost node' construction in Exec() ensures that we have a
2018 snode = node_image[nname]
2019 bad_snode = snode.ghost or snode.offline
2020 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2021 constants.CV_EINSTANCEFAULTYDISK, instance,
2022 "couldn't retrieve status for disk/%s on %s: %s",
2023 idx, nname, bdev_status)
2024 _ErrorIf((instanceconfig.admin_up and success and
2025 bdev_status.ldisk_status == constants.LDS_FAULTY),
2026 constants.CV_EINSTANCEFAULTYDISK, instance,
2027 "disk/%s on %s is faulty", idx, nname)
2029 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2030 """Verify if there are any unknown volumes in the cluster.
2032 The .os, .swap and backup volumes are ignored. All other volumes are
2033 reported as unknown.
2035 @type reserved: L{ganeti.utils.FieldSet}
2036 @param reserved: a FieldSet of reserved volume names
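# Illustrative example: a volume such as "xenvg/stray-lv" found on a healthy
# node, not expected by any instance on that node and not matching a
# reserved pattern, is reported as CV_ENODEORPHANLV below.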
2039 for node, n_img in node_image.items():
2040 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2041 # skip non-healthy nodes
2043 for volume in n_img.volumes:
2044 test = ((node not in node_vol_should or
2045 volume not in node_vol_should[node]) and
2046 not reserved.Matches(volume))
2047 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2048 "volume %s is unknown", volume)
2050 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2051 """Verify N+1 Memory Resilience.
2053 Check that if one single node dies we can still start all the
2054 instances it was primary for.
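# Worked example (invented numbers): if node B is primary for two
# auto-balanced instances of 2048 MiB each and node A is their secondary,
# node A needs 4096 MiB free; an mfree of 2048 MiB on A triggers CV_ENODEN1.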
2057 cluster_info = self.cfg.GetClusterInfo()
2058 for node, n_img in node_image.items():
2059 # This code checks that every node which is now listed as
2060 # secondary has enough memory to host all instances it is
2061 # supposed to, should a single other node in the cluster fail.
2062 # FIXME: not ready for failover to an arbitrary node
2063 # FIXME: does not support file-backed instances
2064 # WARNING: we currently take into account down instances as well
2065 # as up ones, considering that even if they're down someone
2066 # might want to start them even in the event of a node failure.
2068 # we're skipping offline nodes from the N+1 warning, since
2069 # most likely we don't have good memory information from them;
2070 # we already list instances living on such nodes, and that's
2073 for prinode, instances in n_img.sbp.items():
2075 for instance in instances:
2076 bep = cluster_info.FillBE(instance_cfg[instance])
2077 if bep[constants.BE_AUTO_BALANCE]:
2078 needed_mem += bep[constants.BE_MEMORY]
2079 test = n_img.mfree < needed_mem
2080 self._ErrorIf(test, constants.CV_ENODEN1, node,
2081 "not enough memory to accomodate instance failovers"
2082 " should node %s fail (%dMiB needed, %dMiB available)",
2083 prinode, needed_mem, n_img.mfree)
2086 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2087 (files_all, files_opt, files_mc, files_vm)):
2088 """Verifies file checksums collected from all nodes.
2090 @param errorif: Callback for reporting errors
2091 @param nodeinfo: List of L{objects.Node} objects
2092 @param master_node: Name of master node
2093 @param all_nvinfo: RPC results
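# Illustrative (assumed) shape of the structures built below:
#   nodefiles ~ {"/path/to/file": frozenset(["node1", "node2"]), ...}
#   fileinfo  ~ {"/path/to/file": {"0123abcd...": set(["node1"])}}
# i.e. per file, the nodes expected to have it and the checksums seen.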
2096 # Define functions determining which nodes to consider for a file
2099 (files_mc, lambda node: (node.master_candidate or
2100 node.name == master_node)),
2101 (files_vm, lambda node: node.vm_capable),
2104 # Build mapping from filename to list of nodes which should have the file
2106 for (files, fn) in files2nodefn:
2108 filenodes = nodeinfo
2110 filenodes = filter(fn, nodeinfo)
2111 nodefiles.update((filename,
2112 frozenset(map(operator.attrgetter("name"), filenodes)))
2113 for filename in files)
2115 assert set(nodefiles) == (files_all | files_mc | files_vm)
2117 fileinfo = dict((filename, {}) for filename in nodefiles)
2118 ignore_nodes = set()
2120 for node in nodeinfo:
2122 ignore_nodes.add(node.name)
2125 nresult = all_nvinfo[node.name]
2127 if nresult.fail_msg or not nresult.payload:
2130 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2132 test = not (node_files and isinstance(node_files, dict))
2133 errorif(test, constants.CV_ENODEFILECHECK, node.name,
2134 "Node did not return file checksum data")
2136 ignore_nodes.add(node.name)
2139 # Build per-checksum mapping from filename to nodes having it
2140 for (filename, checksum) in node_files.items():
2141 assert filename in nodefiles
2142 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2144 for (filename, checksums) in fileinfo.items():
2145 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2147 # Nodes having the file
2148 with_file = frozenset(node_name
2149 for nodes in fileinfo[filename].values()
2150 for node_name in nodes) - ignore_nodes
2152 expected_nodes = nodefiles[filename] - ignore_nodes
2154 # Nodes missing file
2155 missing_file = expected_nodes - with_file
2157 if filename in files_opt:
2159 errorif(missing_file and missing_file != expected_nodes,
2160 constants.CV_ECLUSTERFILECHECK, None,
2161 "File %s is optional, but it must exist on all or no"
2162 " nodes (not found on %s)",
2163 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2165 errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2166 "File %s is missing from node(s) %s", filename,
2167 utils.CommaJoin(utils.NiceSort(missing_file)))
2169 # Warn if a node has a file it shouldn't
2170 unexpected = with_file - expected_nodes
2172 constants.CV_ECLUSTERFILECHECK, None,
2173 "File %s should not exist on node(s) %s",
2174 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2176 # See if there are multiple versions of the file
2177 test = len(checksums) > 1
2179 variants = ["variant %s on %s" %
2180 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2181 for (idx, (checksum, nodes)) in
2182 enumerate(sorted(checksums.items()))]
2186 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2187 "File %s found with %s different checksums (%s)",
2188 filename, len(checksums), "; ".join(variants))
2190 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2192 """Verifies and the node DRBD status.
2194 @type ninfo: L{objects.Node}
2195 @param ninfo: the node to check
2196 @param nresult: the remote results for the node
2197 @param instanceinfo: the dict of instances
2198 @param drbd_helper: the configured DRBD usermode helper
2199 @param drbd_map: the DRBD map as returned by
2200 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
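# The helper mapping built below is roughly (illustration only):
#   node_drbd ~ {0: ("instance1.example.com", True), 3: ("ghost-inst", False)}
# i.e. DRBD minor -> (instance name, whether the minor must be in use).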
2204 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2207 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2208 test = (helper_result is None)
2209 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2210 "no drbd usermode helper returned")
2212 status, payload = helper_result
2214 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2215 "drbd usermode helper check unsuccessful: %s", payload)
2216 test = status and (payload != drbd_helper)
2217 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2218 "wrong drbd usermode helper: %s", payload)
2220 # compute the DRBD minors
2222 for minor, instance in drbd_map[node].items():
2223 test = instance not in instanceinfo
2224 _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2225 "ghost instance '%s' in temporary DRBD map", instance)
2226 # ghost instance should not be running, but otherwise we
2227 # don't give double warnings (both ghost instance and
2228 # unallocated minor in use)
2230 node_drbd[minor] = (instance, False)
2232 instance = instanceinfo[instance]
2233 node_drbd[minor] = (instance.name, instance.admin_up)
2235 # and now check them
2236 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2237 test = not isinstance(used_minors, (tuple, list))
2238 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2239 "cannot parse drbd status file: %s", str(used_minors))
2241 # we cannot check drbd status
2244 for minor, (iname, must_exist) in node_drbd.items():
2245 test = minor not in used_minors and must_exist
2246 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2247 "drbd minor %d of instance %s is not active", minor, iname)
2248 for minor in used_minors:
2249 test = minor not in node_drbd
2250 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2251 "unallocated drbd minor %d is in use", minor)
2253 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2254 """Builds the node OS structures.
2256 @type ninfo: L{objects.Node}
2257 @param ninfo: the node to check
2258 @param nresult: the remote results for the node
2259 @param nimg: the node image object
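# Each NV_OSLIST entry is expected to be a 7-element list:
#   [name, path, status, diagnose, variants, parameters, api_versions]
# and is folded below into nimg.oslist, keyed by OS name.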
2263 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2265 remote_os = nresult.get(constants.NV_OSLIST, None)
2266 test = (not isinstance(remote_os, list) or
2267 not compat.all(isinstance(v, list) and len(v) == 7
2268 for v in remote_os))
2270 _ErrorIf(test, constants.CV_ENODEOS, node,
2271 "node hasn't returned valid OS data")
2280 for (name, os_path, status, diagnose,
2281 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2283 if name not in os_dict:
2286 # parameters is a list of lists instead of list of tuples due to
2287 # JSON lacking a real tuple type, fix it:
2288 parameters = [tuple(v) for v in parameters]
2289 os_dict[name].append((os_path, status, diagnose,
2290 set(variants), set(parameters), set(api_ver)))
2292 nimg.oslist = os_dict
2294 def _VerifyNodeOS(self, ninfo, nimg, base):
2295 """Verifies the node OS list.
2297 @type ninfo: L{objects.Node}
2298 @param ninfo: the node to check
2299 @param nimg: the node image object
2300 @param base: the 'template' node we match against (e.g. from the master)
2304 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2306 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2308 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2309 for os_name, os_data in nimg.oslist.items():
2310 assert os_data, "Empty OS status for OS %s?!" % os_name
2311 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2312 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2313 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2314 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2315 "OS '%s' has multiple entries (first one shadows the rest): %s",
2316 os_name, utils.CommaJoin([v[0] for v in os_data]))
2317 # comparisons with the 'base' image
2318 test = os_name not in base.oslist
2319 _ErrorIf(test, constants.CV_ENODEOS, node,
2320 "Extra OS %s not present on reference node (%s)",
2324 assert base.oslist[os_name], "Base node has empty OS status?"
2325 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2327 # base OS is invalid, skipping
2329 for kind, a, b in [("API version", f_api, b_api),
2330 ("variants list", f_var, b_var),
2331 ("parameters", beautify_params(f_param),
2332 beautify_params(b_param))]:
2333 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2334 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2335 kind, os_name, base.name,
2336 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2338 # check any missing OSes
2339 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2340 _ErrorIf(missing, constants.CV_ENODEOS, node,
2341 "OSes present on reference node %s but missing on this node: %s",
2342 base.name, utils.CommaJoin(missing))
2344 def _VerifyOob(self, ninfo, nresult):
2345 """Verifies out of band functionality of a node.
2347 @type ninfo: L{objects.Node}
2348 @param ninfo: the node to check
2349 @param nresult: the remote results for the node
2353 # We just have to verify the paths on master and/or master candidates
2354 # as the oob helper is invoked on the master
2355 if ((ninfo.master_candidate or ninfo.master_capable) and
2356 constants.NV_OOB_PATHS in nresult):
2357 for path_result in nresult[constants.NV_OOB_PATHS]:
2358 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2360 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2361 """Verifies and updates the node volume data.
2363 This function will update a L{NodeImage}'s internal structures
2364 with data from the remote call.
2366 @type ninfo: L{objects.Node}
2367 @param ninfo: the node to check
2368 @param nresult: the remote results for the node
2369 @param nimg: the node image object
2370 @param vg_name: the configured VG name
2374 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2376 nimg.lvm_fail = True
2377 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2380 elif isinstance(lvdata, basestring):
2381 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2382 utils.SafeEncode(lvdata))
2383 elif not isinstance(lvdata, dict):
2384 _ErrorIf(True, constants.CV_ENODELVM, node,
2385 "rpc call to node failed (lvlist)")
2387 nimg.volumes = lvdata
2388 nimg.lvm_fail = False
2390 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2391 """Verifies and updates the node instance list.
2393 If the listing was successful, then updates this node's instance
2394 list. Otherwise, it marks the RPC call as failed for the instance
2397 @type ninfo: L{objects.Node}
2398 @param ninfo: the node to check
2399 @param nresult: the remote results for the node
2400 @param nimg: the node image object
2403 idata = nresult.get(constants.NV_INSTANCELIST, None)
2404 test = not isinstance(idata, list)
2405 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2406 "rpc call to node failed (instancelist): %s",
2407 utils.SafeEncode(str(idata)))
2409 nimg.hyp_fail = True
2411 nimg.instances = idata
2413 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2414 """Verifies and computes a node information map
2416 @type ninfo: L{objects.Node}
2417 @param ninfo: the node to check
2418 @param nresult: the remote results for the node
2419 @param nimg: the node image object
2420 @param vg_name: the configured VG name
2424 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2426 # try to read free memory (from the hypervisor)
2427 hv_info = nresult.get(constants.NV_HVINFO, None)
2428 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2429 _ErrorIf(test, constants.CV_ENODEHV, node,
2430 "rpc call to node failed (hvinfo)")
2433 nimg.mfree = int(hv_info["memory_free"])
2434 except (ValueError, TypeError):
2435 _ErrorIf(True, constants.CV_ENODERPC, node,
2436 "node returned invalid nodeinfo, check hypervisor")
2438 # FIXME: devise a free space model for file based instances as well
2439 if vg_name is not None:
2440 test = (constants.NV_VGLIST not in nresult or
2441 vg_name not in nresult[constants.NV_VGLIST])
2442 _ErrorIf(test, constants.CV_ENODELVM, node,
2443 "node didn't return data for the volume group '%s'"
2444 " - it is either missing or broken", vg_name)
2447 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2448 except (ValueError, TypeError):
2449 _ErrorIf(True, constants.CV_ENODERPC, node,
2450 "node returned invalid LVM info, check LVM status")
2452 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2453 """Gets per-disk status information for all instances.
2455 @type nodelist: list of strings
2456 @param nodelist: Node names
2457 @type node_image: dict of (name, L{objects.Node})
2458 @param node_image: Node objects
2459 @type instanceinfo: dict of (name, L{objects.Instance})
2460 @param instanceinfo: Instance objects
2461 @rtype: {instance: {node: [(success, payload)]}}
2462 @return: a dictionary of per-instance dictionaries with nodes as
2463 keys and disk information as values; the disk information is a
2464 list of tuples (success, payload)
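# Illustrative (assumed) example of the structure returned:
#   {"inst1.example.com": {"node1.example.com": [(True, <status>),
#                                                (False, "node offline")]}}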
2467 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2470 node_disks_devonly = {}
2471 diskless_instances = set()
2472 diskless = constants.DT_DISKLESS
2474 for nname in nodelist:
2475 node_instances = list(itertools.chain(node_image[nname].pinst,
2476 node_image[nname].sinst))
2477 diskless_instances.update(inst for inst in node_instances
2478 if instanceinfo[inst].disk_template == diskless)
2479 disks = [(inst, disk)
2480 for inst in node_instances
2481 for disk in instanceinfo[inst].disks]
2484 # No need to collect data
2487 node_disks[nname] = disks
2489 # Creating copies as SetDiskID below will modify the objects and that can
2490 # lead to incorrect data returned from nodes
2491 devonly = [dev.Copy() for (_, dev) in disks]
2494 self.cfg.SetDiskID(dev, nname)
2496 node_disks_devonly[nname] = devonly
2498 assert len(node_disks) == len(node_disks_devonly)
2500 # Collect data from all nodes with disks
2501 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2504 assert len(result) == len(node_disks)
2508 for (nname, nres) in result.items():
2509 disks = node_disks[nname]
2512 # No data from this node
2513 data = len(disks) * [(False, "node offline")]
2516 _ErrorIf(msg, constants.CV_ENODERPC, nname,
2517 "while getting disk information: %s", msg)
2519 # No data from this node
2520 data = len(disks) * [(False, msg)]
2523 for idx, i in enumerate(nres.payload):
2524 if isinstance(i, (tuple, list)) and len(i) == 2:
2527 logging.warning("Invalid result from node %s, entry %d: %s",
2529 data.append((False, "Invalid result from the remote node"))
2531 for ((inst, _), status) in zip(disks, data):
2532 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2534 # Add empty entries for diskless instances.
2535 for inst in diskless_instances:
2536 assert inst not in instdisk
2539 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2540 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2541 compat.all(isinstance(s, (tuple, list)) and
2542 len(s) == 2 for s in statuses)
2543 for inst, nnames in instdisk.items()
2544 for nname, statuses in nnames.items())
2545 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
2550 def _SshNodeSelector(group_uuid, all_nodes):
2551 """Create endless iterators for all potential SSH check hosts.
2554 nodes = [node for node in all_nodes
2555 if (node.group != group_uuid and
2557 keyfunc = operator.attrgetter("group")
2559 return map(itertools.cycle,
2560 [sorted(map(operator.attrgetter("name"), names))
2561 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2565 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2566 """Choose which nodes should talk to which other nodes.
2568 We will make nodes contact all nodes in their group, and one node from
2569 every other group.
2571 @warning: This algorithm has a known issue if one node group is much
2572 smaller than others (e.g. just one node). In such a case all other
2573 nodes will talk to the single node.
2576 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2577 sel = cls._SshNodeSelector(group_uuid, all_nodes)
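# Rough shape of the value returned below (illustrative only):
#   (["node1", "node2"], {"node1": ["peerA", "peerB"], ...})
# i.e. the group's online nodes plus, per node, one peer from each other
# group picked from the round-robin iterators of _SshNodeSelector.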
2579 return (online_nodes,
2580 dict((name, sorted([i.next() for i in sel]))
2581 for name in online_nodes))
2583 def BuildHooksEnv(self):
2586 Cluster-Verify hooks are only run in the post phase; if they fail, their
2587 output is logged in the verify output and the verification fails.
2591 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2594 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2595 for node in self.my_node_info.values())
2599 def BuildHooksNodes(self):
2600 """Build hooks nodes.
2603 return ([], self.my_node_names)
2605 def Exec(self, feedback_fn):
2606 """Verify integrity of the node group, performing various test on nodes.
2609 # This method has too many local variables. pylint: disable=R0914
2610 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2612 if not self.my_node_names:
2614 feedback_fn("* Empty node group, skipping verification")
2618 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2619 verbose = self.op.verbose
2620 self._feedback_fn = feedback_fn
2622 vg_name = self.cfg.GetVGName()
2623 drbd_helper = self.cfg.GetDRBDHelper()
2624 cluster = self.cfg.GetClusterInfo()
2625 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2626 hypervisors = cluster.enabled_hypervisors
2627 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2629 i_non_redundant = [] # Non redundant instances
2630 i_non_a_balanced = [] # Non auto-balanced instances
2631 n_offline = 0 # Count of offline nodes
2632 n_drained = 0 # Count of nodes being drained
2633 node_vol_should = {}
2635 # FIXME: verify OS list
2638 filemap = _ComputeAncillaryFiles(cluster, False)
2640 # do local checksums
2641 master_node = self.master_node = self.cfg.GetMasterNode()
2642 master_ip = self.cfg.GetMasterIP()
2644 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2646 node_verify_param = {
2647 constants.NV_FILELIST:
2648 utils.UniqueSequence(filename
2649 for files in filemap
2650 for filename in files),
2651 constants.NV_NODELIST:
2652 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2653 self.all_node_info.values()),
2654 constants.NV_HYPERVISOR: hypervisors,
2655 constants.NV_HVPARAMS:
2656 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2657 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2658 for node in node_data_list
2659 if not node.offline],
2660 constants.NV_INSTANCELIST: hypervisors,
2661 constants.NV_VERSION: None,
2662 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2663 constants.NV_NODESETUP: None,
2664 constants.NV_TIME: None,
2665 constants.NV_MASTERIP: (master_node, master_ip),
2666 constants.NV_OSLIST: None,
2667 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2670 if vg_name is not None:
2671 node_verify_param[constants.NV_VGLIST] = None
2672 node_verify_param[constants.NV_LVLIST] = vg_name
2673 node_verify_param[constants.NV_PVLIST] = [vg_name]
2674 node_verify_param[constants.NV_DRBDLIST] = None
2677 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2680 # FIXME: this needs to be changed per node-group, not cluster-wide
2682 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2683 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2684 bridges.add(default_nicpp[constants.NIC_LINK])
2685 for instance in self.my_inst_info.values():
2686 for nic in instance.nics:
2687 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2688 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2689 bridges.add(full_nic[constants.NIC_LINK])
2692 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2694 # Build our expected cluster state
2695 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2697 vm_capable=node.vm_capable))
2698 for node in node_data_list)
2702 for node in self.all_node_info.values():
2703 path = _SupportsOob(self.cfg, node)
2704 if path and path not in oob_paths:
2705 oob_paths.append(path)
2708 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2710 for instance in self.my_inst_names:
2711 inst_config = self.my_inst_info[instance]
2713 for nname in inst_config.all_nodes:
2714 if nname not in node_image:
2715 gnode = self.NodeImage(name=nname)
2716 gnode.ghost = (nname not in self.all_node_info)
2717 node_image[nname] = gnode
2719 inst_config.MapLVsByNode(node_vol_should)
2721 pnode = inst_config.primary_node
2722 node_image[pnode].pinst.append(instance)
2724 for snode in inst_config.secondary_nodes:
2725 nimg = node_image[snode]
2726 nimg.sinst.append(instance)
2727 if pnode not in nimg.sbp:
2728 nimg.sbp[pnode] = []
2729 nimg.sbp[pnode].append(instance)
2731 # At this point, we have the in-memory data structures complete,
2732 # except for the runtime information, which we'll gather next
2734 # Due to the way our RPC system works, exact response times cannot be
2735 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2736 # time before and after executing the request, we can at least have a time
2738 nvinfo_starttime = time.time()
2739 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2741 self.cfg.GetClusterName())
2742 nvinfo_endtime = time.time()
2744 if self.extra_lv_nodes and vg_name is not None:
2746 self.rpc.call_node_verify(self.extra_lv_nodes,
2747 {constants.NV_LVLIST: vg_name},
2748 self.cfg.GetClusterName())
2750 extra_lv_nvinfo = {}
2752 all_drbd_map = self.cfg.ComputeDRBDMap()
2754 feedback_fn("* Gathering disk information (%s nodes)" %
2755 len(self.my_node_names))
2756 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2759 feedback_fn("* Verifying configuration file consistency")
2761 # If not all nodes are being checked, we need to make sure the master node
2762 # and a non-checked vm_capable node are in the list.
2763 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2765 vf_nvinfo = all_nvinfo.copy()
2766 vf_node_info = list(self.my_node_info.values())
2767 additional_nodes = []
2768 if master_node not in self.my_node_info:
2769 additional_nodes.append(master_node)
2770 vf_node_info.append(self.all_node_info[master_node])
2771 # Add the first vm_capable node we find which is not included
2772 for node in absent_nodes:
2773 nodeinfo = self.all_node_info[node]
2774 if nodeinfo.vm_capable and not nodeinfo.offline:
2775 additional_nodes.append(node)
2776 vf_node_info.append(self.all_node_info[node])
2778 key = constants.NV_FILELIST
2779 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2780 {key: node_verify_param[key]},
2781 self.cfg.GetClusterName()))
2783 vf_nvinfo = all_nvinfo
2784 vf_node_info = self.my_node_info.values()
2786 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2788 feedback_fn("* Verifying node status")
2792 for node_i in node_data_list:
2794 nimg = node_image[node]
2798 feedback_fn("* Skipping offline node %s" % (node,))
2802 if node == master_node:
2804 elif node_i.master_candidate:
2805 ntype = "master candidate"
2806 elif node_i.drained:
2812 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2814 msg = all_nvinfo[node].fail_msg
2815 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
2818 nimg.rpc_fail = True
2821 nresult = all_nvinfo[node].payload
2823 nimg.call_ok = self._VerifyNode(node_i, nresult)
2824 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2825 self._VerifyNodeNetwork(node_i, nresult)
2826 self._VerifyOob(node_i, nresult)
2829 self._VerifyNodeLVM(node_i, nresult, vg_name)
2830 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2833 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2834 self._UpdateNodeInstances(node_i, nresult, nimg)
2835 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2836 self._UpdateNodeOS(node_i, nresult, nimg)
2838 if not nimg.os_fail:
2839 if refos_img is None:
2841 self._VerifyNodeOS(node_i, nimg, refos_img)
2842 self._VerifyNodeBridges(node_i, nresult, bridges)
2844 # Check whether all running instances are primary for the node. (This
2845 # can no longer be done from _VerifyInstance below, since some of the
2846 # wrong instances could be from other node groups.)
2847 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2849 for inst in non_primary_inst:
2850 test = inst in self.all_inst_info
2851 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
2852 "instance should not run on node %s", node_i.name)
2853 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2854 "node is running unknown instance %s", inst)
2856 for node, result in extra_lv_nvinfo.items():
2857 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2858 node_image[node], vg_name)
2860 feedback_fn("* Verifying instance status")
2861 for instance in self.my_inst_names:
2863 feedback_fn("* Verifying instance %s" % instance)
2864 inst_config = self.my_inst_info[instance]
2865 self._VerifyInstance(instance, inst_config, node_image,
2867 inst_nodes_offline = []
2869 pnode = inst_config.primary_node
2870 pnode_img = node_image[pnode]
2871 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2872 constants.CV_ENODERPC, pnode, "instance %s, connection to"
2873 " primary node failed", instance)
2875 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2876 constants.CV_EINSTANCEBADNODE, instance,
2877 "instance is marked as running and lives on offline node %s",
2878 inst_config.primary_node)
2880 # If the instance is non-redundant we cannot survive losing its primary
2881 # node, so we are not N+1 compliant. On the other hand we have no disk
2882 # templates with more than one secondary so that situation is not well
2884 # FIXME: does not support file-backed instances
2885 if not inst_config.secondary_nodes:
2886 i_non_redundant.append(instance)
2888 _ErrorIf(len(inst_config.secondary_nodes) > 1,
2889 constants.CV_EINSTANCELAYOUT,
2890 instance, "instance has multiple secondary nodes: %s",
2891 utils.CommaJoin(inst_config.secondary_nodes),
2892 code=self.ETYPE_WARNING)
2894 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2895 pnode = inst_config.primary_node
2896 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2897 instance_groups = {}
2899 for node in instance_nodes:
2900 instance_groups.setdefault(self.all_node_info[node].group,
2904 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2905 # Sort so that we always list the primary node first.
2906 for group, nodes in sorted(instance_groups.items(),
2907 key=lambda (_, nodes): pnode in nodes,
2910 self._ErrorIf(len(instance_groups) > 1,
2911 constants.CV_EINSTANCESPLITGROUPS,
2912 instance, "instance has primary and secondary nodes in"
2913 " different groups: %s", utils.CommaJoin(pretty_list),
2914 code=self.ETYPE_WARNING)
2916 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2917 i_non_a_balanced.append(instance)
2919 for snode in inst_config.secondary_nodes:
2920 s_img = node_image[snode]
2921 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
2922 snode, "instance %s, connection to secondary node failed",
2926 inst_nodes_offline.append(snode)
2928 # warn that the instance lives on offline nodes
2929 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
2930 "instance has offline secondary node(s) %s",
2931 utils.CommaJoin(inst_nodes_offline))
2932 # ... or ghost/non-vm_capable nodes
2933 for node in inst_config.all_nodes:
2934 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
2935 instance, "instance lives on ghost node %s", node)
2936 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
2937 instance, "instance lives on non-vm_capable node %s", node)
2939 feedback_fn("* Verifying orphan volumes")
2940 reserved = utils.FieldSet(*cluster.reserved_lvs)
2942 # We will get spurious "unknown volume" warnings if any node of this group
2943 # is secondary for an instance whose primary is in another group. To avoid
2944 # them, we find these instances and add their volumes to node_vol_should.
2945 for inst in self.all_inst_info.values():
2946 for secondary in inst.secondary_nodes:
2947 if (secondary in self.my_node_info
2948 and inst.name not in self.my_inst_info):
2949 inst.MapLVsByNode(node_vol_should)
2952 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2954 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2955 feedback_fn("* Verifying N+1 Memory redundancy")
2956 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2958 feedback_fn("* Other Notes")
2960 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2961 % len(i_non_redundant))
2963 if i_non_a_balanced:
2964 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2965 % len(i_non_a_balanced))
2968 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2971 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2975 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2976 """Analyze the post-hooks' result
2978 This method analyses the hook result, handles it, and sends some
2979 nicely-formatted feedback back to the user.
2981 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2982 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2983 @param hooks_results: the results of the multi-node hooks rpc call
2984 @param feedback_fn: function used to send feedback back to the caller
2985 @param lu_result: previous Exec result
2986 @return: the new Exec result, based on the previous result
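# Each hooks_results entry is (assumed) an RPC result whose payload is a
# list of (script, status, output) tuples; the loop below flags entries
# whose status is constants.HKR_FAIL.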
2990 # We only really run POST phase hooks, only for non-empty groups,
2991 # and are only interested in their results
2992 if not self.my_node_names:
2995 elif phase == constants.HOOKS_PHASE_POST:
2996 # Used to change hooks' output to proper indentation
2997 feedback_fn("* Hooks Results")
2998 assert hooks_results, "invalid result from hooks"
3000 for node_name in hooks_results:
3001 res = hooks_results[node_name]
3003 test = msg and not res.offline
3004 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3005 "Communication failure in hooks execution: %s", msg)
3006 if res.offline or msg:
3007 # No need to investigate payload if node is offline or gave
3010 for script, hkr, output in res.payload:
3011 test = hkr == constants.HKR_FAIL
3012 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3013 "Script %s failed, output:", script)
3015 output = self._HOOKS_INDENT_RE.sub(" ", output)
3016 feedback_fn("%s" % output)
3022 class LUClusterVerifyDisks(NoHooksLU):
3023 """Verifies the cluster disks status.
3028 def ExpandNames(self):
3029 self.share_locks = _ShareAll()
3030 self.needed_locks = {
3031 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3034 def Exec(self, feedback_fn):
3035 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3037 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3038 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3039 for group in group_names])
3042 class LUGroupVerifyDisks(NoHooksLU):
3043 """Verifies the status of all disks in a node group.
3048 def ExpandNames(self):
3049 # Raises errors.OpPrereqError on its own if group can't be found
3050 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3052 self.share_locks = _ShareAll()
3053 self.needed_locks = {
3054 locking.LEVEL_INSTANCE: [],
3055 locking.LEVEL_NODEGROUP: [],
3056 locking.LEVEL_NODE: [],
3059 def DeclareLocks(self, level):
3060 if level == locking.LEVEL_INSTANCE:
3061 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3063 # Lock instances optimistically, needs verification once node and group
3064 # locks have been acquired
3065 self.needed_locks[locking.LEVEL_INSTANCE] = \
3066 self.cfg.GetNodeGroupInstances(self.group_uuid)
3068 elif level == locking.LEVEL_NODEGROUP:
3069 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3071 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3072 set([self.group_uuid] +
3073 # Lock all groups used by instances optimistically; this requires
3074 # going via the node before it's locked, requiring verification
3077 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3078 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3080 elif level == locking.LEVEL_NODE:
3081 # This will only lock the nodes in the group to be verified which contain
3083 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3084 self._LockInstancesNodes()
3086 # Lock all nodes in group to be verified
3087 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3088 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3089 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3091 def CheckPrereq(self):
3092 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3093 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3094 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3096 assert self.group_uuid in owned_groups
3098 # Check if locked instances are still correct
3099 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3101 # Get instance information
3102 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3104 # Check if node groups for locked instances are still correct
3105 for (instance_name, inst) in self.instances.items():
3106 assert owned_nodes.issuperset(inst.all_nodes), \
3107 "Instance %s's nodes changed while we kept the lock" % instance_name
3109 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3112 assert self.group_uuid in inst_groups, \
3113 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3115 def Exec(self, feedback_fn):
3116 """Verify integrity of cluster disks.
3118 @rtype: tuple of three items
3119 @return: a tuple of (dict of node-to-node_error, list of instances
3120 which need activate-disks, dict of instance: (node, volume) for
3125 res_instances = set()
3128 nv_dict = _MapInstanceDisksToNodes([inst
3129 for inst in self.instances.values()
3133 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3134 set(self.cfg.GetVmCapableNodeList()))
3136 node_lvs = self.rpc.call_lv_list(nodes, [])
3138 for (node, node_res) in node_lvs.items():
3139 if node_res.offline:
3142 msg = node_res.fail_msg
3144 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3145 res_nodes[node] = msg
3148 for lv_name, (_, _, lv_online) in node_res.payload.items():
3149 inst = nv_dict.pop((node, lv_name), None)
3150 if not (lv_online or inst is None):
3151 res_instances.add(inst)
3153 # any leftover items in nv_dict are missing LVs, let's arrange the data
3155 for key, inst in nv_dict.iteritems():
3156 res_missing.setdefault(inst, []).append(key)
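# Illustrative (assumed) example of the tuple returned below:
#   ({"node3": "rpc error ..."},             # nodes that could not be queried
#    ["inst2"],                              # instances needing activate-disks
#    {"inst5": [("node1", "xenvg/disk0")]})  # instances with missing LVs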
3158 return (res_nodes, list(res_instances), res_missing)
3161 class LUClusterRepairDiskSizes(NoHooksLU):
3162 """Verifies the cluster disks sizes.
3167 def ExpandNames(self):
3168 if self.op.instances:
3169 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3170 self.needed_locks = {
3171 locking.LEVEL_NODE: [],
3172 locking.LEVEL_INSTANCE: self.wanted_names,
3174 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3176 self.wanted_names = None
3177 self.needed_locks = {
3178 locking.LEVEL_NODE: locking.ALL_SET,
3179 locking.LEVEL_INSTANCE: locking.ALL_SET,
3181 self.share_locks = _ShareAll()
3183 def DeclareLocks(self, level):
3184 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3185 self._LockInstancesNodes(primary_only=True)
3187 def CheckPrereq(self):
3188 """Check prerequisites.
3190 This only checks the optional instance list against the existing names.
3193 if self.wanted_names is None:
3194 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3196 self.wanted_instances = \
3197 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3199 def _EnsureChildSizes(self, disk):
3200 """Ensure children of the disk have the needed disk size.
3202 This is valid mainly for DRBD8 and fixes an issue where the
3203 children have a smaller disk size.
3205 @param disk: an L{ganeti.objects.Disk} object
3208 if disk.dev_type == constants.LD_DRBD8:
3209 assert disk.children, "Empty children for DRBD8?"
3210 fchild = disk.children[0]
3211 mismatch = fchild.size < disk.size
3213 self.LogInfo("Child disk has size %d, parent %d, fixing",
3214 fchild.size, disk.size)
3215 fchild.size = disk.size
3217 # and we recurse on this child only, not on the metadev
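# Illustrative example (invented sizes): a DRBD8 disk recorded as 10240 MiB
# whose data child is only 10200 MiB gets the child's configured size bumped
# to 10240 MiB and True is returned so the caller knows to save the config.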
3218 return self._EnsureChildSizes(fchild) or mismatch
3222 def Exec(self, feedback_fn):
3223 """Verify the size of cluster disks.
3226 # TODO: check child disks too
3227 # TODO: check differences in size between primary/secondary nodes
3229 for instance in self.wanted_instances:
3230 pnode = instance.primary_node
3231 if pnode not in per_node_disks:
3232 per_node_disks[pnode] = []
3233 for idx, disk in enumerate(instance.disks):
3234 per_node_disks[pnode].append((instance, idx, disk))
3237 for node, dskl in per_node_disks.items():
3238 newl = [v[2].Copy() for v in dskl]
3240 self.cfg.SetDiskID(dsk, node)
3241 result = self.rpc.call_blockdev_getsize(node, newl)
3243 self.LogWarning("Failure in blockdev_getsize call to node"
3244 " %s, ignoring", node)
3246 if len(result.payload) != len(dskl):
3247 logging.warning("Invalid result from node %s: len(dskl)=%d,"
3248 " result.payload=%s", node, len(dskl), result.payload)
3249 self.LogWarning("Invalid result from node %s, ignoring node results",
3252 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3254 self.LogWarning("Disk %d of instance %s did not return size"
3255 " information, ignoring", idx, instance.name)
3257 if not isinstance(size, (int, long)):
3258 self.LogWarning("Disk %d of instance %s did not return valid"
3259 " size information, ignoring", idx, instance.name)
3262 if size != disk.size:
3263 self.LogInfo("Disk %d of instance %s has mismatched size,"
3264 " correcting: recorded %d, actual %d", idx,
3265 instance.name, disk.size, size)
3267 self.cfg.Update(instance, feedback_fn)
3268 changed.append((instance.name, idx, size))
3269 if self._EnsureChildSizes(disk):
3270 self.cfg.Update(instance, feedback_fn)
3271 changed.append((instance.name, idx, disk.size))
3275 class LUClusterRename(LogicalUnit):
3276 """Rename the cluster.
3279 HPATH = "cluster-rename"
3280 HTYPE = constants.HTYPE_CLUSTER
3282 def BuildHooksEnv(self):
3287 "OP_TARGET": self.cfg.GetClusterName(),
3288 "NEW_NAME": self.op.name,
3291 def BuildHooksNodes(self):
3292 """Build hooks nodes.
3295 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3297 def CheckPrereq(self):
3298 """Verify that the passed name is a valid one.
3301 hostname = netutils.GetHostname(name=self.op.name,
3302 family=self.cfg.GetPrimaryIPFamily())
3304 new_name = hostname.name
3305 self.ip = new_ip = hostname.ip
3306 old_name = self.cfg.GetClusterName()
3307 old_ip = self.cfg.GetMasterIP()
3308 if new_name == old_name and new_ip == old_ip:
3309 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3310 " cluster has changed",
3312 if new_ip != old_ip:
3313 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3314 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3315 " reachable on the network" %
3316 new_ip, errors.ECODE_NOTUNIQUE)
3318 self.op.name = new_name
3320 def Exec(self, feedback_fn):
3321 """Rename the cluster.
3324 clustername = self.op.name
3327 # shutdown the master IP
3328 master = self.cfg.GetMasterNode()
3329 result = self.rpc.call_node_deactivate_master_ip(master)
3330 result.Raise("Could not disable the master role")
3333 cluster = self.cfg.GetClusterInfo()
3334 cluster.cluster_name = clustername
3335 cluster.master_ip = ip
3336 self.cfg.Update(cluster, feedback_fn)
3338 # update the known hosts file
3339 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3340 node_list = self.cfg.GetOnlineNodeList()
3342 node_list.remove(master)
3345 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3347 result = self.rpc.call_node_activate_master_ip(master)
3348 msg = result.fail_msg
3350 self.LogWarning("Could not re-enable the master role on"
3351 " the master, please restart manually: %s", msg)
3356 def _ValidateNetmask(cfg, netmask):
3357 """Checks if a netmask is valid.
3359 @type cfg: L{config.ConfigWriter}
3360 @param cfg: The cluster configuration
3362 @param netmask: the netmask to be verified
3363 @raise errors.OpPrereqError: if the validation fails
3366 ip_family = cfg.GetPrimaryIPFamily()
3368 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3369 except errors.ProgrammerError:
3370 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3372 if not ipcls.ValidateNetmask(netmask):
3373 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3377 class LUClusterSetParams(LogicalUnit):
3378 """Change the parameters of the cluster.
3381 HPATH = "cluster-modify"
3382 HTYPE = constants.HTYPE_CLUSTER
3385 def CheckArguments(self):
3389 if self.op.uid_pool:
3390 uidpool.CheckUidPool(self.op.uid_pool)
3392 if self.op.add_uids:
3393 uidpool.CheckUidPool(self.op.add_uids)
3395 if self.op.remove_uids:
3396 uidpool.CheckUidPool(self.op.remove_uids)
3398 if self.op.master_netmask is not None:
3399 _ValidateNetmask(self.cfg, self.op.master_netmask)
3401 def ExpandNames(self):
3402 # FIXME: in the future maybe other cluster params won't require checking on
3403 # all nodes to be modified.
3404 self.needed_locks = {
3405 locking.LEVEL_NODE: locking.ALL_SET,
3407 self.share_locks[locking.LEVEL_NODE] = 1
3409 def BuildHooksEnv(self):
3414 "OP_TARGET": self.cfg.GetClusterName(),
3415 "NEW_VG_NAME": self.op.vg_name,
3418 def BuildHooksNodes(self):
3419 """Build hooks nodes.
3422 mn = self.cfg.GetMasterNode()
3425 def CheckPrereq(self):
3426 """Check prerequisites.
3428 This checks that the given parameters do not conflict with each other and
3429 that the given volume group is valid.
3432 if self.op.vg_name is not None and not self.op.vg_name:
3433 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3434 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3435 " instances exist", errors.ECODE_INVAL)
3437 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3438 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3439 raise errors.OpPrereqError("Cannot disable drbd helper while"
3440 " drbd-based instances exist",
3443 node_list = self.owned_locks(locking.LEVEL_NODE)
3445 # if vg_name not None, checks given volume group on all nodes
3447 vglist = self.rpc.call_vg_list(node_list)
3448 for node in node_list:
3449 msg = vglist[node].fail_msg
3451 # ignoring down node
3452 self.LogWarning("Error while gathering data on node %s"
3453 " (ignoring node): %s", node, msg)
3455 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3457 constants.MIN_VG_SIZE)
3459 raise errors.OpPrereqError("Error on node '%s': %s" %
3460 (node, vgstatus), errors.ECODE_ENVIRON)
3462 if self.op.drbd_helper:
3463 # checks given drbd helper on all nodes
3464 helpers = self.rpc.call_drbd_helper(node_list)
3465 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3467 self.LogInfo("Not checking drbd helper on offline node %s", node)
3469 msg = helpers[node].fail_msg
3471 raise errors.OpPrereqError("Error checking drbd helper on node"
3472 " '%s': %s" % (node, msg),
3473 errors.ECODE_ENVIRON)
3474 node_helper = helpers[node].payload
3475 if node_helper != self.op.drbd_helper:
3476 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3477 (node, node_helper), errors.ECODE_ENVIRON)
3479 self.cluster = cluster = self.cfg.GetClusterInfo()
3480 # validate params changes
3481 if self.op.beparams:
3482 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3483 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3485 if self.op.ndparams:
3486 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3487 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3489 # TODO: we need a more general way to handle resetting
3490 # cluster-level parameters to default values
3491 if self.new_ndparams["oob_program"] == "":
3492 self.new_ndparams["oob_program"] = \
3493 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3495 if self.op.nicparams:
3496 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3497 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3498 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3501 # check all instances for consistency
3502 for instance in self.cfg.GetAllInstancesInfo().values():
3503 for nic_idx, nic in enumerate(instance.nics):
3504 params_copy = copy.deepcopy(nic.nicparams)
3505 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3507 # check parameter syntax
3509 objects.NIC.CheckParameterSyntax(params_filled)
3510 except errors.ConfigurationError, err:
3511 nic_errors.append("Instance %s, nic/%d: %s" %
3512 (instance.name, nic_idx, err))
3514 # if we're moving instances to routed, check that they have an ip
3515 target_mode = params_filled[constants.NIC_MODE]
3516 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3517 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3518 " address" % (instance.name, nic_idx))
3520 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3521 "\n".join(nic_errors))
3523 # hypervisor list/parameters
3524 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3525 if self.op.hvparams:
3526 for hv_name, hv_dict in self.op.hvparams.items():
3527 if hv_name not in self.new_hvparams:
3528 self.new_hvparams[hv_name] = hv_dict
3530 self.new_hvparams[hv_name].update(hv_dict)
3532 # os hypervisor parameters
3533 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3535 for os_name, hvs in self.op.os_hvp.items():
3536 if os_name not in self.new_os_hvp:
3537 self.new_os_hvp[os_name] = hvs
3539 for hv_name, hv_dict in hvs.items():
3540 if hv_name not in self.new_os_hvp[os_name]:
3541 self.new_os_hvp[os_name][hv_name] = hv_dict
3543 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3546 self.new_osp = objects.FillDict(cluster.osparams, {})
3547 if self.op.osparams:
3548 for os_name, osp in self.op.osparams.items():
3549 if os_name not in self.new_osp:
3550 self.new_osp[os_name] = {}
3552 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3555 if not self.new_osp[os_name]:
3556 # we removed all parameters
3557 del self.new_osp[os_name]
3559 # check the parameter validity (remote check)
3560 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3561 os_name, self.new_osp[os_name])
3563 # changes to the hypervisor list
3564 if self.op.enabled_hypervisors is not None:
3565 self.hv_list = self.op.enabled_hypervisors
3566 for hv in self.hv_list:
3567 # if the hypervisor doesn't already exist in the cluster
3568 # hvparams, we initialize it to empty, and then (in both
3569 # cases) we make sure to fill the defaults, as we might not
3570 # have a complete defaults list if the hypervisor wasn't
3572 if hv not in new_hvp:
3574 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3575 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3577 self.hv_list = cluster.enabled_hypervisors
3579 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3580 # either the enabled list has changed, or the parameters have, validate
3581 for hv_name, hv_params in self.new_hvparams.items():
3582 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3583 (self.op.enabled_hypervisors and
3584 hv_name in self.op.enabled_hypervisors)):
3585 # either this is a new hypervisor, or its parameters have changed
3586 hv_class = hypervisor.GetHypervisor(hv_name)
3587 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3588 hv_class.CheckParameterSyntax(hv_params)
3589 _CheckHVParams(self, node_list, hv_name, hv_params)
3592 # no need to check any newly-enabled hypervisors, since the
3593 # defaults have already been checked in the above code-block
3594 for os_name, os_hvp in self.new_os_hvp.items():
3595 for hv_name, hv_params in os_hvp.items():
3596 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3597 # we need to fill in the new os_hvp on top of the actual hv_p
3598 cluster_defaults = self.new_hvparams.get(hv_name, {})
3599 new_osp = objects.FillDict(cluster_defaults, hv_params)
3600 hv_class = hypervisor.GetHypervisor(hv_name)
3601 hv_class.CheckParameterSyntax(new_osp)
3602 _CheckHVParams(self, node_list, hv_name, new_osp)
3604 if self.op.default_iallocator:
3605 alloc_script = utils.FindFile(self.op.default_iallocator,
3606 constants.IALLOCATOR_SEARCH_PATH,
3608 if alloc_script is None:
3609 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3610 " specified" % self.op.default_iallocator,
3613 def Exec(self, feedback_fn):
3614 """Change the parameters of the cluster.
3617 if self.op.vg_name is not None:
3618 new_volume = self.op.vg_name
3621 if new_volume != self.cfg.GetVGName():
3622 self.cfg.SetVGName(new_volume)
3624 feedback_fn("Cluster LVM configuration already in desired"
3625 " state, not changing")
3626 if self.op.drbd_helper is not None:
3627 new_helper = self.op.drbd_helper
3630 if new_helper != self.cfg.GetDRBDHelper():
3631 self.cfg.SetDRBDHelper(new_helper)
3633 feedback_fn("Cluster DRBD helper already in desired state,"
3635 if self.op.hvparams:
3636 self.cluster.hvparams = self.new_hvparams
3638 self.cluster.os_hvp = self.new_os_hvp
3639 if self.op.enabled_hypervisors is not None:
3640 self.cluster.hvparams = self.new_hvparams
3641 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3642 if self.op.beparams:
3643 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3644 if self.op.nicparams:
3645 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3646 if self.op.osparams:
3647 self.cluster.osparams = self.new_osp
3648 if self.op.ndparams:
3649 self.cluster.ndparams = self.new_ndparams
3651 if self.op.candidate_pool_size is not None:
3652 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3653 # we need to update the pool size here, otherwise the save will fail
3654 _AdjustCandidatePool(self, [])
3656 if self.op.maintain_node_health is not None:
3657 self.cluster.maintain_node_health = self.op.maintain_node_health
3659 if self.op.prealloc_wipe_disks is not None:
3660 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3662 if self.op.add_uids is not None:
3663 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3665 if self.op.remove_uids is not None:
3666 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3668 if self.op.uid_pool is not None:
3669 self.cluster.uid_pool = self.op.uid_pool
3671 if self.op.default_iallocator is not None:
3672 self.cluster.default_iallocator = self.op.default_iallocator
3674 if self.op.reserved_lvs is not None:
3675 self.cluster.reserved_lvs = self.op.reserved_lvs
3677 def helper_os(aname, mods, desc):
3679 lst = getattr(self.cluster, aname)
3680 for key, val in mods:
3681 if key == constants.DDM_ADD:
3683 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3686 elif key == constants.DDM_REMOVE:
3690 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3692 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3694 if self.op.hidden_os:
3695 helper_os("hidden_os", self.op.hidden_os, "hidden")
3697 if self.op.blacklisted_os:
3698 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3700 if self.op.master_netdev:
3701 master = self.cfg.GetMasterNode()
3702 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3703 self.cluster.master_netdev)
3704 result = self.rpc.call_node_deactivate_master_ip(master)
3705 result.Raise("Could not disable the master ip")
3706 feedback_fn("Changing master_netdev from %s to %s" %
3707 (self.cluster.master_netdev, self.op.master_netdev))
3708 self.cluster.master_netdev = self.op.master_netdev
3710 if self.op.master_netmask:
3711 master = self.cfg.GetMasterNode()
3712 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
3713 result = self.rpc.call_node_change_master_netmask(master,
3714 self.op.master_netmask)
3716 msg = "Could not change the master IP netmask: %s" % result.fail_msg
3717 self.LogWarning(msg)
3720 self.cluster.master_netmask = self.op.master_netmask
3722 self.cfg.Update(self.cluster, feedback_fn)
3724 if self.op.master_netdev:
3725 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3726 self.op.master_netdev)
3727 result = self.rpc.call_node_activate_master_ip(master)
3729 self.LogWarning("Could not re-enable the master ip on"
3730 " the master, please restart manually: %s",
3734 def _UploadHelper(lu, nodes, fname):
3735 """Helper for uploading a file and showing warnings.
3738 if os.path.exists(fname):
3739 result = lu.rpc.call_upload_file(nodes, fname)
3740 for to_node, to_result in result.items():
3741 msg = to_result.fail_msg
3743 msg = ("Copy of file %s to node %s failed: %s" %
3744 (fname, to_node, msg))
3745 lu.proc.LogWarning(msg)
3748 def _ComputeAncillaryFiles(cluster, redist):
3749 """Compute files external to Ganeti which need to be consistent.
3751 @type redist: boolean
3752 @param redist: Whether to include files which need to be redistributed
3755 # Compute files for all nodes
3757 constants.SSH_KNOWN_HOSTS_FILE,
3758 constants.CONFD_HMAC_KEY,
3759 constants.CLUSTER_DOMAIN_SECRET_FILE,
3760 constants.SPICE_CERT_FILE,
3761 constants.SPICE_CACERT_FILE,
3762 constants.RAPI_USERS_FILE,
3766 files_all.update(constants.ALL_CERT_FILES)
3767 files_all.update(ssconf.SimpleStore().GetFileList())
3769 # we need to ship at least the RAPI certificate
3770 files_all.add(constants.RAPI_CERT_FILE)
3772 if cluster.modify_etc_hosts:
3773 files_all.add(constants.ETC_HOSTS)
3775 # Files which are optional; these must:
3776 # - be present in one other category as well
3777 # - either exist or not exist on all nodes of that category (mc, vm all)
3779 constants.RAPI_USERS_FILE,
3782 # Files which should only be on master candidates
3785 files_mc.add(constants.CLUSTER_CONF_FILE)
3787 # Files which should only be on VM-capable nodes
3788 files_vm = set(filename
3789 for hv_name in cluster.enabled_hypervisors
3790 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3792 files_opt |= set(filename
3793 for hv_name in cluster.enabled_hypervisors
3794 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3796 # Filenames in each category must be unique
3797 all_files_set = files_all | files_mc | files_vm
3798 assert (len(all_files_set) ==
3799 sum(map(len, [files_all, files_mc, files_vm]))), \
3800 "Found file listed in more than one file list"
3802 # Optional files must be present in one other category
3803 assert all_files_set.issuperset(files_opt), \
3804 "Optional file not in a different required list"
3806 return (files_all, files_opt, files_mc, files_vm)
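# Illustrative sketch (not part of Ganeti; the helper name is hypothetical):
# the two assertions above amount to plain set algebra over the returned
# categories.  Assuming ordinary Python sets, the same invariants could be
# stated as:
def _ExampleCheckFileCategories(files_all, files_opt, files_mc, files_vm):
  """Return True iff the ancillary file categories are mutually consistent."""
  combined = files_all | files_mc | files_vm
  # no file may be listed in more than one required category
  disjoint = len(combined) == len(files_all) + len(files_mc) + len(files_vm)
  # every optional file must also appear in one of the required categories
  optional_known = combined.issuperset(files_opt)
  return disjoint and optional_known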
3809 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3810 """Distribute additional files which are part of the cluster configuration.
3812 ConfigWriter takes care of distributing the config and ssconf files, but
3813 there are more files which should be distributed to all nodes. This function
3814 makes sure those are copied.
3816 @param lu: calling logical unit
3817 @param additional_nodes: list of nodes not in the config to distribute to
3818 @type additional_vm: boolean
3819 @param additional_vm: whether the additional nodes are vm-capable or not
3822 # Gather target nodes
3823 cluster = lu.cfg.GetClusterInfo()
3824 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3826 online_nodes = lu.cfg.GetOnlineNodeList()
3827 vm_nodes = lu.cfg.GetVmCapableNodeList()
3829 if additional_nodes is not None:
3830 online_nodes.extend(additional_nodes)
3832 vm_nodes.extend(additional_nodes)
3834 # Never distribute to master node
3835 for nodelist in [online_nodes, vm_nodes]:
3836 if master_info.name in nodelist:
3837 nodelist.remove(master_info.name)
3840 (files_all, _, files_mc, files_vm) = \
3841 _ComputeAncillaryFiles(cluster, True)
3843 # Never re-distribute configuration file from here
3844 assert not (constants.CLUSTER_CONF_FILE in files_all or
3845 constants.CLUSTER_CONF_FILE in files_vm)
3846 assert not files_mc, "Master candidates not handled in this function"
3849 (online_nodes, files_all),
3850 (vm_nodes, files_vm),
3854 for (node_list, files) in filemap:
3856 _UploadHelper(lu, node_list, fname)
3859 class LUClusterRedistConf(NoHooksLU):
3860 """Force the redistribution of cluster configuration.
3862 This is a very simple LU.
3867 def ExpandNames(self):
3868 self.needed_locks = {
3869 locking.LEVEL_NODE: locking.ALL_SET,
3871 self.share_locks[locking.LEVEL_NODE] = 1
3873 def Exec(self, feedback_fn):
3874 """Redistribute the configuration.
3877 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3878 _RedistributeAncillaryFiles(self)
3881 class LUClusterActivateMasterIp(NoHooksLU):
3882 """Activate the master IP on the master node.
3885 def Exec(self, feedback_fn):
3886 """Activate the master IP.
3889 master = self.cfg.GetMasterNode()
3890 self.rpc.call_node_activate_master_ip(master)
3893 class LUClusterDeactivateMasterIp(NoHooksLU):
3894 """Deactivate the master IP on the master node.
3897 def Exec(self, feedback_fn):
3898 """Deactivate the master IP.
3901 master = self.cfg.GetMasterNode()
3902 self.rpc.call_node_deactivate_master_ip(master)
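# Note: these two trivial LUs expose the same node_activate_master_ip /
# node_deactivate_master_ip RPCs that LUClusterSetParams.Exec above uses when
# switching master_netdev, so the master IP can also be toggled on its own.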
3905 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3906 """Sleep and poll for an instance's disk to sync.
3909 if not instance.disks or (disks is not None and not disks):
3912 disks = _ExpandCheckDisks(instance, disks)
3915 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3917 node = instance.primary_node
3920 lu.cfg.SetDiskID(dev, node)
3922 # TODO: Convert to utils.Retry
3925 degr_retries = 10 # in seconds, as we sleep 1 second each time
3929 cumul_degraded = False
3930 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3931 msg = rstats.fail_msg
3933 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3936 raise errors.RemoteError("Can't contact node %s for mirror data,"
3937 " aborting." % node)
3940 rstats = rstats.payload
3942 for i, mstat in enumerate(rstats):
3944 lu.LogWarning("Can't compute data for node %s/%s",
3945 node, disks[i].iv_name)
3948 cumul_degraded = (cumul_degraded or
3949 (mstat.is_degraded and mstat.sync_percent is None))
3950 if mstat.sync_percent is not None:
3952 if mstat.estimated_time is not None:
3953 rem_time = ("%s remaining (estimated)" %
3954 utils.FormatSeconds(mstat.estimated_time))
3955 max_time = mstat.estimated_time
3957 rem_time = "no time estimate"
3958 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3959 (disks[i].iv_name, mstat.sync_percent, rem_time))
3961 # if we're done but degraded, let's do a few small retries, to
3962 # make sure we see a stable and not transient situation; therefore
3963 # we force restart of the loop
3964 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3965 logging.info("Degraded disks found, %d retries left", degr_retries)
3973 time.sleep(min(60, max_time))
3976 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3977 return not cumul_degraded
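# Illustrative sketch (not part of Ganeti; the helper below is hypothetical
# and simplified): _WaitForSync above polls blockdev_getmirrorstatus, sleeps
# based on the reported sync time estimate capped at 60 seconds, and allows a
# handful of extra rounds when the mirrors claim to be done but are still
# degraded.  The sleep-interval choice in isolation could look like:
def _ExampleSyncSleepInterval(estimated_times, cap=60):
  """Derive a sleep interval (seconds) from per-disk estimates (None allowed)."""
  known = [t for t in estimated_times if t is not None]
  if not known:
    return 1
  return min(cap, max(known))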
3980 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3981 """Check that mirrors are not degraded.
3983 The ldisk parameter, if True, will change the test from the
3984 is_degraded attribute (which represents overall non-ok status for
3985 the device(s)) to the ldisk (representing the local storage status).
3988 lu.cfg.SetDiskID(dev, node)
3992 if on_primary or dev.AssembleOnSecondary():
3993 rstats = lu.rpc.call_blockdev_find(node, dev)
3994 msg = rstats.fail_msg
3996 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3998 elif not rstats.payload:
3999 lu.LogWarning("Can't find disk on node %s", node)
4003 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4005 result = result and not rstats.payload.is_degraded
4008 for child in dev.children:
4009 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
4014 class LUOobCommand(NoHooksLU):
4015 """Logical unit for OOB handling.
4019 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4021 def ExpandNames(self):
4022 """Gather locks we need.
4025 if self.op.node_names:
4026 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4027 lock_names = self.op.node_names
4029 lock_names = locking.ALL_SET
4031 self.needed_locks = {
4032 locking.LEVEL_NODE: lock_names,
4035 def CheckPrereq(self):
4036 """Check prerequisites.
4039 - the node exists in the configuration
4042 Any errors are signaled by raising errors.OpPrereqError.
4046 self.master_node = self.cfg.GetMasterNode()
4048 assert self.op.power_delay >= 0.0
4050 if self.op.node_names:
4051 if (self.op.command in self._SKIP_MASTER and
4052 self.master_node in self.op.node_names):
4053 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4054 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4056 if master_oob_handler:
4057 additional_text = ("run '%s %s %s' if you want to operate on the"
4058 " master regardless") % (master_oob_handler,
4062 additional_text = "it does not support out-of-band operations"
4064 raise errors.OpPrereqError(("Operating on the master node %s is not"
4065 " allowed for %s; %s") %
4066 (self.master_node, self.op.command,
4067 additional_text), errors.ECODE_INVAL)
4069 self.op.node_names = self.cfg.GetNodeList()
4070 if self.op.command in self._SKIP_MASTER:
4071 self.op.node_names.remove(self.master_node)
4073 if self.op.command in self._SKIP_MASTER:
4074 assert self.master_node not in self.op.node_names
4076 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4078 raise errors.OpPrereqError("Node %s not found" % node_name,
4081 self.nodes.append(node)
4083 if (not self.op.ignore_status and
4084 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4085 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4086 " not marked offline") % node_name,
4089 def Exec(self, feedback_fn):
4090 """Execute OOB and return result if we expect any.
4093 master_node = self.master_node
4096 for idx, node in enumerate(utils.NiceSort(self.nodes,
4097 key=lambda node: node.name)):
4098 node_entry = [(constants.RS_NORMAL, node.name)]
4099 ret.append(node_entry)
4101 oob_program = _SupportsOob(self.cfg, node)
4104 node_entry.append((constants.RS_UNAVAIL, None))
4107 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4108 self.op.command, oob_program, node.name)
4109 result = self.rpc.call_run_oob(master_node, oob_program,
4110 self.op.command, node.name,
4114 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4115 node.name, result.fail_msg)
4116 node_entry.append((constants.RS_NODATA, None))
4119 self._CheckPayload(result)
4120 except errors.OpExecError, err:
4121 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4123 node_entry.append((constants.RS_NODATA, None))
4125 if self.op.command == constants.OOB_HEALTH:
4126 # For health we should log important events
4127 for item, status in result.payload:
4128 if status in [constants.OOB_STATUS_WARNING,
4129 constants.OOB_STATUS_CRITICAL]:
4130 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4131 item, node.name, status)
4133 if self.op.command == constants.OOB_POWER_ON:
4135 elif self.op.command == constants.OOB_POWER_OFF:
4136 node.powered = False
4137 elif self.op.command == constants.OOB_POWER_STATUS:
4138 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4139 if powered != node.powered:
4140 logging.warning(("Recorded power state (%s) of node '%s' does not"
4141 " match actual power state (%s)"), node.powered,
4144 # For configuration changing commands we should update the node
4145 if self.op.command in (constants.OOB_POWER_ON,
4146 constants.OOB_POWER_OFF):
4147 self.cfg.Update(node, feedback_fn)
4149 node_entry.append((constants.RS_NORMAL, result.payload))
4151 if (self.op.command == constants.OOB_POWER_ON and
4152 idx < len(self.nodes) - 1):
4153 time.sleep(self.op.power_delay)
4157 def _CheckPayload(self, result):
4158 """Checks if the payload is valid.
4160 @param result: RPC result
4161 @raises errors.OpExecError: If payload is not valid
4165 if self.op.command == constants.OOB_HEALTH:
4166 if not isinstance(result.payload, list):
4167 errs.append("command 'health' is expected to return a list but got %s" %
4168 type(result.payload))
4170 for item, status in result.payload:
4171 if status not in constants.OOB_STATUSES:
4172 errs.append("health item '%s' has invalid status '%s'" %
4175 if self.op.command == constants.OOB_POWER_STATUS:
4176 if not isinstance(result.payload, dict):
4177 errs.append("power-status is expected to return a dict but got %s" %
4178 type(result.payload))
4180 if self.op.command in [
4181 constants.OOB_POWER_ON,
4182 constants.OOB_POWER_OFF,
4183 constants.OOB_POWER_CYCLE,
4185 if result.payload is not None:
4186 errs.append("%s is expected to not return payload but got '%s'" %
4187 (self.op.command, result.payload))
4190 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4191 utils.CommaJoin(errs))
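# Illustrative note (an assumption, not taken from this file): given the
# checks in _CheckPayload above, a well-formed "power-status" payload is a
# dict along the lines of {constants.OOB_POWER_STATUS_POWERED: True}, a
# "health" payload is a list of (item, status) pairs with statuses drawn from
# constants.OOB_STATUSES, and the power-on/off/cycle commands return no
# payload at all.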
4194 class _OsQuery(_QueryBase):
4195 FIELDS = query.OS_FIELDS
4197 def ExpandNames(self, lu):
4198 # Lock all nodes in shared mode
4199 # Temporary removal of locks, should be reverted later
4200 # TODO: reintroduce locks when they are lighter-weight
4201 lu.needed_locks = {}
4202 #self.share_locks[locking.LEVEL_NODE] = 1
4203 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4205 # The following variables interact with _QueryBase._GetNames
4207 self.wanted = self.names
4209 self.wanted = locking.ALL_SET
4211 self.do_locking = self.use_locking
4213 def DeclareLocks(self, lu, level):
4217 def _DiagnoseByOS(rlist):
4218 """Remaps a per-node return list into an a per-os per-node dictionary
4220 @param rlist: a map with node names as keys and OS objects as values
4223 @return: a dictionary with osnames as keys and as value another
4224 map, with nodes as keys and tuples of (path, status, diagnose,
4225 variants, parameters, api_versions) as values, eg::
4227 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4228 (/srv/..., False, "invalid api")],
4229 "node2": [(/srv/..., True, "", [], [])]}
4234 # we build here the list of nodes that didn't fail the RPC (at RPC
4235 # level), so that nodes with a non-responding node daemon don't
4236 # make all OSes invalid
4237 good_nodes = [node_name for node_name in rlist
4238 if not rlist[node_name].fail_msg]
4239 for node_name, nr in rlist.items():
4240 if nr.fail_msg or not nr.payload:
4242 for (name, path, status, diagnose, variants,
4243 params, api_versions) in nr.payload:
4244 if name not in all_os:
4245 # build a list of nodes for this os containing empty lists
4246 # for each node in node_list
4248 for nname in good_nodes:
4249 all_os[name][nname] = []
4250 # convert params from [name, help] to (name, help)
4251 params = [tuple(v) for v in params]
4252 all_os[name][node_name].append((path, status, diagnose,
4253 variants, params, api_versions))
4256 def _GetQueryData(self, lu):
4257 """Computes the list of nodes and their attributes.
4260 # Locking is not used
4261 assert not (compat.any(lu.glm.is_owned(level)
4262 for level in locking.LEVELS
4263 if level != locking.LEVEL_CLUSTER) or
4264 self.do_locking or self.use_locking)
4266 valid_nodes = [node.name
4267 for node in lu.cfg.GetAllNodesInfo().values()
4268 if not node.offline and node.vm_capable]
4269 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4270 cluster = lu.cfg.GetClusterInfo()
4274 for (os_name, os_data) in pol.items():
4275 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4276 hidden=(os_name in cluster.hidden_os),
4277 blacklisted=(os_name in cluster.blacklisted_os))
4281 api_versions = set()
4283 for idx, osl in enumerate(os_data.values()):
4284 info.valid = bool(info.valid and osl and osl[0][1])
4288 (node_variants, node_params, node_api) = osl[0][3:6]
4291 variants.update(node_variants)
4292 parameters.update(node_params)
4293 api_versions.update(node_api)
4295 # Filter out inconsistent values
4296 variants.intersection_update(node_variants)
4297 parameters.intersection_update(node_params)
4298 api_versions.intersection_update(node_api)
4300 info.variants = list(variants)
4301 info.parameters = list(parameters)
4302 info.api_versions = list(api_versions)
4304 data[os_name] = info
4306 # Prepare data in requested order
4307 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4311 class LUOsDiagnose(NoHooksLU):
4312 """Logical unit for OS diagnose/query.
4318 def _BuildFilter(fields, names):
4319 """Builds a filter for querying OSes.
4322 name_filter = qlang.MakeSimpleFilter("name", names)
4324 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4325 # respective field is not requested
4326 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4327 for fname in ["hidden", "blacklisted"]
4328 if fname not in fields]
4329 if "valid" not in fields:
4330 status_filter.append([qlang.OP_TRUE, "valid"])
4333 status_filter.insert(0, qlang.OP_AND)
4335 status_filter = None
4337 if name_filter and status_filter:
4338 return [qlang.OP_AND, name_filter, status_filter]
4342 return status_filter
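# Worked example (derived from the code above): for a default query that asks
# only for the "name" field and gives no OS names (so there is no name
# filter), the resulting filter is
#   [qlang.OP_AND,
#    [qlang.OP_NOT, [qlang.OP_TRUE, "hidden"]],
#    [qlang.OP_NOT, [qlang.OP_TRUE, "blacklisted"]],
#    [qlang.OP_TRUE, "valid"]]
# i.e. hidden, blacklisted and invalid OSes are filtered out unless the
# corresponding field was explicitly requested.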
4344 def CheckArguments(self):
4345 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4346 self.op.output_fields, False)
4348 def ExpandNames(self):
4349 self.oq.ExpandNames(self)
4351 def Exec(self, feedback_fn):
4352 return self.oq.OldStyleQuery(self)
4355 class LUNodeRemove(LogicalUnit):
4356 """Logical unit for removing a node.
4359 HPATH = "node-remove"
4360 HTYPE = constants.HTYPE_NODE
4362 def BuildHooksEnv(self):
4365 This doesn't run on the target node in the pre phase as a failed
4366 node would then be impossible to remove.
4370 "OP_TARGET": self.op.node_name,
4371 "NODE_NAME": self.op.node_name,
4374 def BuildHooksNodes(self):
4375 """Build hooks nodes.
4378 all_nodes = self.cfg.GetNodeList()
4380 all_nodes.remove(self.op.node_name)
4382 logging.warning("Node '%s', which is about to be removed, was not found"
4383 " in the list of all nodes", self.op.node_name)
4384 return (all_nodes, all_nodes)
4386 def CheckPrereq(self):
4387 """Check prerequisites.
4390 - the node exists in the configuration
4391 - it does not have primary or secondary instances
4392 - it's not the master
4394 Any errors are signaled by raising errors.OpPrereqError.
4397 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4398 node = self.cfg.GetNodeInfo(self.op.node_name)
4399 assert node is not None
4401 masternode = self.cfg.GetMasterNode()
4402 if node.name == masternode:
4403 raise errors.OpPrereqError("Node is the master node, failover to another"
4404 " node is required", errors.ECODE_INVAL)
4406 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4407 if node.name in instance.all_nodes:
4408 raise errors.OpPrereqError("Instance %s is still running on the node,"
4409 " please remove first" % instance_name,
4411 self.op.node_name = node.name
4414 def Exec(self, feedback_fn):
4415 """Removes the node from the cluster.
4419 logging.info("Stopping the node daemon and removing configs from node %s",
4422 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4424 # Promote nodes to master candidate as needed
4425 _AdjustCandidatePool(self, exceptions=[node.name])
4426 self.context.RemoveNode(node.name)
4428 # Run post hooks on the node before it's removed
4429 _RunPostHook(self, node.name)
4431 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4432 msg = result.fail_msg
4434 self.LogWarning("Errors encountered on the remote node while leaving"
4435 " the cluster: %s", msg)
4437 # Remove node from our /etc/hosts
4438 if self.cfg.GetClusterInfo().modify_etc_hosts:
4439 master_node = self.cfg.GetMasterNode()
4440 result = self.rpc.call_etc_hosts_modify(master_node,
4441 constants.ETC_HOSTS_REMOVE,
4443 result.Raise("Can't update hosts file with new host data")
4444 _RedistributeAncillaryFiles(self)
4447 class _NodeQuery(_QueryBase):
4448 FIELDS = query.NODE_FIELDS
4450 def ExpandNames(self, lu):
4451 lu.needed_locks = {}
4452 lu.share_locks = _ShareAll()
4455 self.wanted = _GetWantedNodes(lu, self.names)
4457 self.wanted = locking.ALL_SET
4459 self.do_locking = (self.use_locking and
4460 query.NQ_LIVE in self.requested_data)
4463 # If any non-static field is requested we need to lock the nodes
4464 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4466 def DeclareLocks(self, lu, level):
4469 def _GetQueryData(self, lu):
4470 """Computes the list of nodes and their attributes.
4473 all_info = lu.cfg.GetAllNodesInfo()
4475 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4477 # Gather data as requested
4478 if query.NQ_LIVE in self.requested_data:
4479 # filter out non-vm_capable nodes
4480 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4482 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4483 lu.cfg.GetHypervisorType())
4484 live_data = dict((name, nresult.payload)
4485 for (name, nresult) in node_data.items()
4486 if not nresult.fail_msg and nresult.payload)
4490 if query.NQ_INST in self.requested_data:
4491 node_to_primary = dict([(name, set()) for name in nodenames])
4492 node_to_secondary = dict([(name, set()) for name in nodenames])
4494 inst_data = lu.cfg.GetAllInstancesInfo()
4496 for inst in inst_data.values():
4497 if inst.primary_node in node_to_primary:
4498 node_to_primary[inst.primary_node].add(inst.name)
4499 for secnode in inst.secondary_nodes:
4500 if secnode in node_to_secondary:
4501 node_to_secondary[secnode].add(inst.name)
4503 node_to_primary = None
4504 node_to_secondary = None
4506 if query.NQ_OOB in self.requested_data:
4507 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4508 for name, node in all_info.iteritems())
4512 if query.NQ_GROUP in self.requested_data:
4513 groups = lu.cfg.GetAllNodeGroupsInfo()
4517 return query.NodeQueryData([all_info[name] for name in nodenames],
4518 live_data, lu.cfg.GetMasterNode(),
4519 node_to_primary, node_to_secondary, groups,
4520 oob_support, lu.cfg.GetClusterInfo())
4523 class LUNodeQuery(NoHooksLU):
4524 """Logical unit for querying nodes.
4527 # pylint: disable=W0142
4530 def CheckArguments(self):
4531 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4532 self.op.output_fields, self.op.use_locking)
4534 def ExpandNames(self):
4535 self.nq.ExpandNames(self)
4537 def Exec(self, feedback_fn):
4538 return self.nq.OldStyleQuery(self)
4541 class LUNodeQueryvols(NoHooksLU):
4542 """Logical unit for getting volumes on node(s).
4546 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4547 _FIELDS_STATIC = utils.FieldSet("node")
4549 def CheckArguments(self):
4550 _CheckOutputFields(static=self._FIELDS_STATIC,
4551 dynamic=self._FIELDS_DYNAMIC,
4552 selected=self.op.output_fields)
4554 def ExpandNames(self):
4555 self.needed_locks = {}
4556 self.share_locks[locking.LEVEL_NODE] = 1
4557 if not self.op.nodes:
4558 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4560 self.needed_locks[locking.LEVEL_NODE] = \
4561 _GetWantedNodes(self, self.op.nodes)
4563 def Exec(self, feedback_fn):
4564 """Computes the list of nodes and their attributes.
4567 nodenames = self.owned_locks(locking.LEVEL_NODE)
4568 volumes = self.rpc.call_node_volumes(nodenames)
4570 ilist = self.cfg.GetAllInstancesInfo()
4571 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4574 for node in nodenames:
4575 nresult = volumes[node]
4578 msg = nresult.fail_msg
4580 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4583 node_vols = sorted(nresult.payload,
4584 key=operator.itemgetter("dev"))
4586 for vol in node_vols:
4588 for field in self.op.output_fields:
4591 elif field == "phys":
4595 elif field == "name":
4597 elif field == "size":
4598 val = int(float(vol["size"]))
4599 elif field == "instance":
4600 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4602 raise errors.ParameterError(field)
4603 node_output.append(str(val))
4605 output.append(node_output)
4610 class LUNodeQueryStorage(NoHooksLU):
4611 """Logical unit for getting information on storage units on node(s).
4614 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4617 def CheckArguments(self):
4618 _CheckOutputFields(static=self._FIELDS_STATIC,
4619 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4620 selected=self.op.output_fields)
4622 def ExpandNames(self):
4623 self.needed_locks = {}
4624 self.share_locks[locking.LEVEL_NODE] = 1
4627 self.needed_locks[locking.LEVEL_NODE] = \
4628 _GetWantedNodes(self, self.op.nodes)
4630 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4632 def Exec(self, feedback_fn):
4633 """Computes the list of nodes and their attributes.
4636 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4638 # Always get name to sort by
4639 if constants.SF_NAME in self.op.output_fields:
4640 fields = self.op.output_fields[:]
4642 fields = [constants.SF_NAME] + self.op.output_fields
4644 # Never ask for node or type as it's only known to the LU
4645 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4646 while extra in fields:
4647 fields.remove(extra)
4649 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4650 name_idx = field_idx[constants.SF_NAME]
4652 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4653 data = self.rpc.call_storage_list(self.nodes,
4654 self.op.storage_type, st_args,
4655 self.op.name, fields)
4659 for node in utils.NiceSort(self.nodes):
4660 nresult = data[node]
4664 msg = nresult.fail_msg
4666 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4669 rows = dict([(row[name_idx], row) for row in nresult.payload])
4671 for name in utils.NiceSort(rows.keys()):
4676 for field in self.op.output_fields:
4677 if field == constants.SF_NODE:
4679 elif field == constants.SF_TYPE:
4680 val = self.op.storage_type
4681 elif field in field_idx:
4682 val = row[field_idx[field]]
4684 raise errors.ParameterError(field)
4693 class _InstanceQuery(_QueryBase):
4694 FIELDS = query.INSTANCE_FIELDS
4696 def ExpandNames(self, lu):
4697 lu.needed_locks = {}
4698 lu.share_locks = _ShareAll()
4701 self.wanted = _GetWantedInstances(lu, self.names)
4703 self.wanted = locking.ALL_SET
4705 self.do_locking = (self.use_locking and
4706 query.IQ_LIVE in self.requested_data)
4708 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4709 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4710 lu.needed_locks[locking.LEVEL_NODE] = []
4711 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4713 self.do_grouplocks = (self.do_locking and
4714 query.IQ_NODES in self.requested_data)
4716 def DeclareLocks(self, lu, level):
4718 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4719 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4721 # Lock all groups used by instances optimistically; this requires going
4722 # via the node before it's locked, requiring verification later on
4723 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4725 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4726 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4727 elif level == locking.LEVEL_NODE:
4728 lu._LockInstancesNodes() # pylint: disable=W0212
4731 def _CheckGroupLocks(lu):
4732 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4733 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4735 # Check if node groups for locked instances are still correct
4736 for instance_name in owned_instances:
4737 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4739 def _GetQueryData(self, lu):
4740 """Computes the list of instances and their attributes.
4743 if self.do_grouplocks:
4744 self._CheckGroupLocks(lu)
4746 cluster = lu.cfg.GetClusterInfo()
4747 all_info = lu.cfg.GetAllInstancesInfo()
4749 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4751 instance_list = [all_info[name] for name in instance_names]
4752 nodes = frozenset(itertools.chain(*(inst.all_nodes
4753 for inst in instance_list)))
4754 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4757 wrongnode_inst = set()
4759 # Gather data as requested
4760 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4762 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4764 result = node_data[name]
4766 # offline nodes will be in both lists
4767 assert result.fail_msg
4768 offline_nodes.append(name)
4770 bad_nodes.append(name)
4771 elif result.payload:
4772 for inst in result.payload:
4773 if inst in all_info:
4774 if all_info[inst].primary_node == name:
4775 live_data.update(result.payload)
4777 wrongnode_inst.add(inst)
4779 # orphan instance; we don't list it here as we don't
4780 # handle this case yet in the output of instance listing
4781 logging.warning("Orphan instance '%s' found on node %s",
4783 # else no instance is alive
4787 if query.IQ_DISKUSAGE in self.requested_data:
4788 disk_usage = dict((inst.name,
4789 _ComputeDiskSize(inst.disk_template,
4790 [{constants.IDISK_SIZE: disk.size}
4791 for disk in inst.disks]))
4792 for inst in instance_list)
4796 if query.IQ_CONSOLE in self.requested_data:
4798 for inst in instance_list:
4799 if inst.name in live_data:
4800 # Instance is running
4801 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4803 consinfo[inst.name] = None
4804 assert set(consinfo.keys()) == set(instance_names)
4808 if query.IQ_NODES in self.requested_data:
4809 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4811 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4812 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4813 for uuid in set(map(operator.attrgetter("group"),
4819 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4820 disk_usage, offline_nodes, bad_nodes,
4821 live_data, wrongnode_inst, consinfo,
4825 class LUQuery(NoHooksLU):
4826 """Query for resources/items of a certain kind.
4829 # pylint: disable=W0142
4832 def CheckArguments(self):
4833 qcls = _GetQueryImplementation(self.op.what)
4835 self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
4837 def ExpandNames(self):
4838 self.impl.ExpandNames(self)
4840 def DeclareLocks(self, level):
4841 self.impl.DeclareLocks(self, level)
4843 def Exec(self, feedback_fn):
4844 return self.impl.NewStyleQuery(self)
4847 class LUQueryFields(NoHooksLU):
4848 """Query for resources/items of a certain kind.
4851 # pylint: disable=W0142
4854 def CheckArguments(self):
4855 self.qcls = _GetQueryImplementation(self.op.what)
4857 def ExpandNames(self):
4858 self.needed_locks = {}
4860 def Exec(self, feedback_fn):
4861 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4864 class LUNodeModifyStorage(NoHooksLU):
4865 """Logical unit for modifying a storage volume on a node.
4870 def CheckArguments(self):
4871 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4873 storage_type = self.op.storage_type
4876 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4878 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4879 " modified" % storage_type,
4882 diff = set(self.op.changes.keys()) - modifiable
4884 raise errors.OpPrereqError("The following fields can not be modified for"
4885 " storage units of type '%s': %r" %
4886 (storage_type, list(diff)),
4889 def ExpandNames(self):
4890 self.needed_locks = {
4891 locking.LEVEL_NODE: self.op.node_name,
4894 def Exec(self, feedback_fn):
4895 """Computes the list of nodes and their attributes.
4898 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4899 result = self.rpc.call_storage_modify(self.op.node_name,
4900 self.op.storage_type, st_args,
4901 self.op.name, self.op.changes)
4902 result.Raise("Failed to modify storage unit '%s' on %s" %
4903 (self.op.name, self.op.node_name))
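# Illustrative note (an assumption, not taken from this file): for an LVM
# physical volume the modifiable fields would typically be limited to its
# allocatable flag, so self.op.changes might look like
# {constants.SF_ALLOCATABLE: False}; any other key is rejected by the
# MODIFIABLE_STORAGE_FIELDS check in CheckArguments above.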
4906 class LUNodeAdd(LogicalUnit):
4907 """Logical unit for adding node to the cluster.
4911 HTYPE = constants.HTYPE_NODE
4912 _NFLAGS = ["master_capable", "vm_capable"]
4914 def CheckArguments(self):
4915 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4916 # validate/normalize the node name
4917 self.hostname = netutils.GetHostname(name=self.op.node_name,
4918 family=self.primary_ip_family)
4919 self.op.node_name = self.hostname.name
4921 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4922 raise errors.OpPrereqError("Cannot readd the master node",
4925 if self.op.readd and self.op.group:
4926 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4927 " being readded", errors.ECODE_INVAL)
4929 def BuildHooksEnv(self):
4932 This will run on all nodes before, and on all nodes + the new node after.
4936 "OP_TARGET": self.op.node_name,
4937 "NODE_NAME": self.op.node_name,
4938 "NODE_PIP": self.op.primary_ip,
4939 "NODE_SIP": self.op.secondary_ip,
4940 "MASTER_CAPABLE": str(self.op.master_capable),
4941 "VM_CAPABLE": str(self.op.vm_capable),
4944 def BuildHooksNodes(self):
4945 """Build hooks nodes.
4948 # Exclude added node
4949 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4950 post_nodes = pre_nodes + [self.op.node_name, ]
4952 return (pre_nodes, post_nodes)
4954 def CheckPrereq(self):
4955 """Check prerequisites.
4958 - the new node is not already in the config
4960 - its parameters (single/dual homed) matches the cluster
4962 Any errors are signaled by raising errors.OpPrereqError.
4966 hostname = self.hostname
4967 node = hostname.name
4968 primary_ip = self.op.primary_ip = hostname.ip
4969 if self.op.secondary_ip is None:
4970 if self.primary_ip_family == netutils.IP6Address.family:
4971 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
4972 " IPv4 address must be given as secondary",
4974 self.op.secondary_ip = primary_ip
4976 secondary_ip = self.op.secondary_ip
4977 if not netutils.IP4Address.IsValid(secondary_ip):
4978 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4979 " address" % secondary_ip, errors.ECODE_INVAL)
4981 node_list = cfg.GetNodeList()
4982 if not self.op.readd and node in node_list:
4983 raise errors.OpPrereqError("Node %s is already in the configuration" %
4984 node, errors.ECODE_EXISTS)
4985 elif self.op.readd and node not in node_list:
4986 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4989 self.changed_primary_ip = False
4991 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
4992 if self.op.readd and node == existing_node_name:
4993 if existing_node.secondary_ip != secondary_ip:
4994 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4995 " address configuration as before",
4997 if existing_node.primary_ip != primary_ip:
4998 self.changed_primary_ip = True
5002 if (existing_node.primary_ip == primary_ip or
5003 existing_node.secondary_ip == primary_ip or
5004 existing_node.primary_ip == secondary_ip or
5005 existing_node.secondary_ip == secondary_ip):
5006 raise errors.OpPrereqError("New node ip address(es) conflict with"
5007 " existing node %s" % existing_node.name,
5008 errors.ECODE_NOTUNIQUE)
5010 # After this 'if' block, None is no longer a valid value for the
5011 # _capable op attributes
5013 old_node = self.cfg.GetNodeInfo(node)
5014 assert old_node is not None, "Can't retrieve locked node %s" % node
5015 for attr in self._NFLAGS:
5016 if getattr(self.op, attr) is None:
5017 setattr(self.op, attr, getattr(old_node, attr))
5019 for attr in self._NFLAGS:
5020 if getattr(self.op, attr) is None:
5021 setattr(self.op, attr, True)
5023 if self.op.readd and not self.op.vm_capable:
5024 pri, sec = cfg.GetNodeInstances(node)
5026 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5027 " flag set to false, but it already holds"
5028 " instances" % node,
5031 # check that the type of the node (single versus dual homed) is the
5032 # same as for the master
5033 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5034 master_singlehomed = myself.secondary_ip == myself.primary_ip
5035 newbie_singlehomed = secondary_ip == primary_ip
5036 if master_singlehomed != newbie_singlehomed:
5037 if master_singlehomed:
5038 raise errors.OpPrereqError("The master has no secondary ip but the"
5039 " new node has one",
5042 raise errors.OpPrereqError("The master has a secondary ip but the"
5043 " new node doesn't have one",
5046 # checks reachability
5047 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5048 raise errors.OpPrereqError("Node not reachable by ping",
5049 errors.ECODE_ENVIRON)
5051 if not newbie_singlehomed:
5052 # check reachability from my secondary ip to newbie's secondary ip
5053 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5054 source=myself.secondary_ip):
5055 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5056 " based ping to node daemon port",
5057 errors.ECODE_ENVIRON)
5064 if self.op.master_capable:
5065 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5067 self.master_candidate = False
5070 self.new_node = old_node
5072 node_group = cfg.LookupNodeGroup(self.op.group)
5073 self.new_node = objects.Node(name=node,
5074 primary_ip=primary_ip,
5075 secondary_ip=secondary_ip,
5076 master_candidate=self.master_candidate,
5077 offline=False, drained=False,
5080 if self.op.ndparams:
5081 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5083 def Exec(self, feedback_fn):
5084 """Adds the new node to the cluster.
5087 new_node = self.new_node
5088 node = new_node.name
5090 # We are adding a new node, so we assume it's powered
5091 new_node.powered = True
5093 # for re-adds, reset the offline/drained/master-candidate flags;
5094 # we need to reset here, otherwise offline would prevent RPC calls
5095 # later in the procedure; this also means that if the re-add
5096 # fails, we are left with a non-offlined, broken node
5098 new_node.drained = new_node.offline = False # pylint: disable=W0201
5099 self.LogInfo("Readding a node, the offline/drained flags were reset")
5100 # if we demote the node, we do cleanup later in the procedure
5101 new_node.master_candidate = self.master_candidate
5102 if self.changed_primary_ip:
5103 new_node.primary_ip = self.op.primary_ip
5105 # copy the master/vm_capable flags
5106 for attr in self._NFLAGS:
5107 setattr(new_node, attr, getattr(self.op, attr))
5109 # notify the user about any possible mc promotion
5110 if new_node.master_candidate:
5111 self.LogInfo("Node will be a master candidate")
5113 if self.op.ndparams:
5114 new_node.ndparams = self.op.ndparams
5116 new_node.ndparams = {}
5118 # check connectivity
5119 result = self.rpc.call_version([node])[node]
5120 result.Raise("Can't get version information from node %s" % node)
5121 if constants.PROTOCOL_VERSION == result.payload:
5122 logging.info("Communication to node %s fine, sw version %s match",
5123 node, result.payload)
5125 raise errors.OpExecError("Version mismatch master version %s,"
5126 " node version %s" %
5127 (constants.PROTOCOL_VERSION, result.payload))
5129 # Add node to our /etc/hosts, and add key to known_hosts
5130 if self.cfg.GetClusterInfo().modify_etc_hosts:
5131 master_node = self.cfg.GetMasterNode()
5132 result = self.rpc.call_etc_hosts_modify(master_node,
5133 constants.ETC_HOSTS_ADD,
5136 result.Raise("Can't update hosts file with new host data")
5138 if new_node.secondary_ip != new_node.primary_ip:
5139 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5142 node_verify_list = [self.cfg.GetMasterNode()]
5143 node_verify_param = {
5144 constants.NV_NODELIST: ([node], {}),
5145 # TODO: do a node-net-test as well?
5148 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5149 self.cfg.GetClusterName())
5150 for verifier in node_verify_list:
5151 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5152 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5154 for failed in nl_payload:
5155 feedback_fn("ssh/hostname verification failed"
5156 " (checking from %s): %s" %
5157 (verifier, nl_payload[failed]))
5158 raise errors.OpExecError("ssh/hostname verification failed")
5161 _RedistributeAncillaryFiles(self)
5162 self.context.ReaddNode(new_node)
5163 # make sure we redistribute the config
5164 self.cfg.Update(new_node, feedback_fn)
5165 # and make sure the new node will not have old files around
5166 if not new_node.master_candidate:
5167 result = self.rpc.call_node_demote_from_mc(new_node.name)
5168 msg = result.fail_msg
5170 self.LogWarning("Node failed to demote itself from master"
5171 " candidate status: %s" % msg)
5173 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5174 additional_vm=self.op.vm_capable)
5175 self.context.AddNode(new_node, self.proc.GetECId())
5178 class LUNodeSetParams(LogicalUnit):
5179 """Modifies the parameters of a node.
5181 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5182 to the node role (as _ROLE_*)
5183 @cvar _R2F: a dictionary from node role to tuples of flags
5184 @cvar _FLAGS: a list of attribute names corresponding to the flags
5187 HPATH = "node-modify"
5188 HTYPE = constants.HTYPE_NODE
5190 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5192 (True, False, False): _ROLE_CANDIDATE,
5193 (False, True, False): _ROLE_DRAINED,
5194 (False, False, True): _ROLE_OFFLINE,
5195 (False, False, False): _ROLE_REGULAR,
5197 _R2F = dict((v, k) for k, v in _F2R.items())
5198 _FLAGS = ["master_candidate", "drained", "offline"]
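# Worked example (derived from the tables above): a master candidate node
# corresponds to the flag tuple (master_candidate=True, drained=False,
# offline=False), so _F2R[(True, False, False)] is _ROLE_CANDIDATE and,
# conversely, _R2F[_ROLE_OFFLINE] gives back (False, False, True).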
5200 def CheckArguments(self):
5201 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5202 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5203 self.op.master_capable, self.op.vm_capable,
5204 self.op.secondary_ip, self.op.ndparams]
5205 if all_mods.count(None) == len(all_mods):
5206 raise errors.OpPrereqError("Please pass at least one modification",
5208 if all_mods.count(True) > 1:
5209 raise errors.OpPrereqError("Can't set the node into more than one"
5210 " state at the same time",
5213 # Boolean value that tells us whether we might be demoting from MC
5214 self.might_demote = (self.op.master_candidate == False or
5215 self.op.offline == True or
5216 self.op.drained == True or
5217 self.op.master_capable == False)
5219 if self.op.secondary_ip:
5220 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5221 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5222 " address" % self.op.secondary_ip,
5225 self.lock_all = self.op.auto_promote and self.might_demote
5226 self.lock_instances = self.op.secondary_ip is not None
5228 def ExpandNames(self):
5230 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5232 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5234 if self.lock_instances:
5235 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
5237 def DeclareLocks(self, level):
5238 # If we have locked all instances, before waiting to lock nodes, release
5239 # all the ones living on nodes unrelated to the current operation.
5240 if level == locking.LEVEL_NODE and self.lock_instances:
5241 self.affected_instances = []
5242 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
5245 # Build list of instances to release
5246 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
5247 for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
5248 if (instance.disk_template in constants.DTS_INT_MIRROR and
5249 self.op.node_name in instance.all_nodes):
5250 instances_keep.append(instance_name)
5251 self.affected_instances.append(instance)
5253 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
5255 assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
5256 set(instances_keep))
5258 def BuildHooksEnv(self):
5261 This runs on the master node.
5265 "OP_TARGET": self.op.node_name,
5266 "MASTER_CANDIDATE": str(self.op.master_candidate),
5267 "OFFLINE": str(self.op.offline),
5268 "DRAINED": str(self.op.drained),
5269 "MASTER_CAPABLE": str(self.op.master_capable),
5270 "VM_CAPABLE": str(self.op.vm_capable),
5273 def BuildHooksNodes(self):
5274 """Build hooks nodes.
5277 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5280 def CheckPrereq(self):
5281 """Check prerequisites.
5283 This only checks the instance list against the existing names.
5286 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5288 if (self.op.master_candidate is not None or
5289 self.op.drained is not None or
5290 self.op.offline is not None):
5291 # we can't change the master's node flags
5292 if self.op.node_name == self.cfg.GetMasterNode():
5293 raise errors.OpPrereqError("The master role can be changed"
5294 " only via master-failover",
5297 if self.op.master_candidate and not node.master_capable:
5298 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5299 " it a master candidate" % node.name,
5302 if self.op.vm_capable == False:
5303 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5305 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5306 " the vm_capable flag" % node.name,
5309 if node.master_candidate and self.might_demote and not self.lock_all:
5310 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5311 # check if after removing the current node, we're missing master
5313 (mc_remaining, mc_should, _) = \
5314 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5315 if mc_remaining < mc_should:
5316 raise errors.OpPrereqError("Not enough master candidates, please"
5317 " pass auto promote option to allow"
5318 " promotion", errors.ECODE_STATE)
5320 self.old_flags = old_flags = (node.master_candidate,
5321 node.drained, node.offline)
5322 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5323 self.old_role = old_role = self._F2R[old_flags]
5325 # Check for ineffective changes
5326 for attr in self._FLAGS:
5327 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5328 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5329 setattr(self.op, attr, None)
5331 # Past this point, any flag change to False means a transition
5332 # away from the respective state, as only real changes are kept
5334 # TODO: We might query the real power state if it supports OOB
5335 if _SupportsOob(self.cfg, node):
5336 if self.op.offline is False and not (node.powered or
5337 self.op.powered == True):
5338 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5339 " offline status can be reset") %
5341 elif self.op.powered is not None:
5342 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5343 " as it does not support out-of-band"
5344 " handling") % self.op.node_name)
5346 # If we're being deofflined/drained, we'll MC ourself if needed
5347 if (self.op.drained == False or self.op.offline == False or
5348 (self.op.master_capable and not node.master_capable)):
5349 if _DecideSelfPromotion(self):
5350 self.op.master_candidate = True
5351 self.LogInfo("Auto-promoting node to master candidate")
5353 # If we're no longer master capable, we'll demote ourselves from MC
5354 if self.op.master_capable == False and node.master_candidate:
5355 self.LogInfo("Demoting from master candidate")
5356 self.op.master_candidate = False
5359 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5360 if self.op.master_candidate:
5361 new_role = self._ROLE_CANDIDATE
5362 elif self.op.drained:
5363 new_role = self._ROLE_DRAINED
5364 elif self.op.offline:
5365 new_role = self._ROLE_OFFLINE
5366 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5367 # False is still in new flags, which means we're un-setting (the
5369 new_role = self._ROLE_REGULAR
5370 else: # no new flags, nothing, keep old role
5373 self.new_role = new_role
5375 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5376 # Trying to transition out of offline status
5377 result = self.rpc.call_version([node.name])[node.name]
5379 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5380 " to report its version: %s" %
5381 (node.name, result.fail_msg),
5384 self.LogWarning("Transitioning node from offline to online state"
5385 " without using re-add. Please make sure the node"
5388 if self.op.secondary_ip:
5389 # Ok even without locking, because this can't be changed by any LU
5390 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5391 master_singlehomed = master.secondary_ip == master.primary_ip
5392 if master_singlehomed and self.op.secondary_ip:
5393 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5394 " homed cluster", errors.ECODE_INVAL)
5397 if self.affected_instances:
5398 raise errors.OpPrereqError("Cannot change secondary ip: offline"
5399 " node has instances (%s) configured"
5400 " to use it" % self.affected_instances)
5402 # On online nodes, check that no instances are running, and that
5403 # the node has the new ip and we can reach it.
5404 for instance in self.affected_instances:
5405 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5407 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5408 if master.name != node.name:
5409 # check reachability from master secondary ip to new secondary ip
5410 if not netutils.TcpPing(self.op.secondary_ip,
5411 constants.DEFAULT_NODED_PORT,
5412 source=master.secondary_ip):
5413 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5414 " based ping to node daemon port",
5415 errors.ECODE_ENVIRON)
5417 if self.op.ndparams:
5418 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5419 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5420 self.new_ndparams = new_ndparams
5422 def Exec(self, feedback_fn):
5427 old_role = self.old_role
5428 new_role = self.new_role
5432 if self.op.ndparams:
5433 node.ndparams = self.new_ndparams
5435 if self.op.powered is not None:
5436 node.powered = self.op.powered
5438 for attr in ["master_capable", "vm_capable"]:
5439 val = getattr(self.op, attr)
5441 setattr(node, attr, val)
5442 result.append((attr, str(val)))
5444 if new_role != old_role:
5445 # Tell the node to demote itself, if no longer MC and not offline
5446 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5447 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5449 self.LogWarning("Node failed to demote itself: %s", msg)
5451 new_flags = self._R2F[new_role]
5452 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5454 result.append((desc, str(nf)))
5455 (node.master_candidate, node.drained, node.offline) = new_flags
5457 # we locked all nodes, we adjust the CP before updating this node
5459 _AdjustCandidatePool(self, [node.name])
5461 if self.op.secondary_ip:
5462 node.secondary_ip = self.op.secondary_ip
5463 result.append(("secondary_ip", self.op.secondary_ip))
5465 # this will trigger configuration file update, if needed
5466 self.cfg.Update(node, feedback_fn)
5468 # this will trigger job queue propagation or cleanup if the mc
5470 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5471 self.context.ReaddNode(node)
5476 class LUNodePowercycle(NoHooksLU):
5477 """Powercycles a node.
5482 def CheckArguments(self):
5483 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5484 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5485 raise errors.OpPrereqError("The node is the master and the force"
5486 " parameter was not set",
5489 def ExpandNames(self):
5490 """Locking for PowercycleNode.
5492 This is a last-resort option and shouldn't block on other
5493 jobs. Therefore, we grab no locks.
5496 self.needed_locks = {}
5498 def Exec(self, feedback_fn):
5502 result = self.rpc.call_node_powercycle(self.op.node_name,
5503 self.cfg.GetHypervisorType())
5504 result.Raise("Failed to schedule the reboot")
5505 return result.payload
5508 class LUClusterQuery(NoHooksLU):
5509 """Query cluster configuration.
5514 def ExpandNames(self):
5515 self.needed_locks = {}
5517 def Exec(self, feedback_fn):
5518 """Return cluster config.
5521 cluster = self.cfg.GetClusterInfo()
5524 # Filter just for enabled hypervisors
5525 for os_name, hv_dict in cluster.os_hvp.items():
5526 os_hvp[os_name] = {}
5527 for hv_name, hv_params in hv_dict.items():
5528 if hv_name in cluster.enabled_hypervisors:
5529 os_hvp[os_name][hv_name] = hv_params
5531 # Convert ip_family to ip_version
5532 primary_ip_version = constants.IP4_VERSION
5533 if cluster.primary_ip_family == netutils.IP6Address.family:
5534 primary_ip_version = constants.IP6_VERSION
5537 "software_version": constants.RELEASE_VERSION,
5538 "protocol_version": constants.PROTOCOL_VERSION,
5539 "config_version": constants.CONFIG_VERSION,
5540 "os_api_version": max(constants.OS_API_VERSIONS),
5541 "export_version": constants.EXPORT_VERSION,
5542 "architecture": (platform.architecture()[0], platform.machine()),
5543 "name": cluster.cluster_name,
5544 "master": cluster.master_node,
5545 "default_hypervisor": cluster.enabled_hypervisors[0],
5546 "enabled_hypervisors": cluster.enabled_hypervisors,
5547 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5548 for hypervisor_name in cluster.enabled_hypervisors]),
5550 "beparams": cluster.beparams,
5551 "osparams": cluster.osparams,
5552 "nicparams": cluster.nicparams,
5553 "ndparams": cluster.ndparams,
5554 "candidate_pool_size": cluster.candidate_pool_size,
5555 "master_netdev": cluster.master_netdev,
5556 "master_netmask": cluster.master_netmask,
5557 "volume_group_name": cluster.volume_group_name,
5558 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5559 "file_storage_dir": cluster.file_storage_dir,
5560 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5561 "maintain_node_health": cluster.maintain_node_health,
5562 "ctime": cluster.ctime,
5563 "mtime": cluster.mtime,
5564 "uuid": cluster.uuid,
5565 "tags": list(cluster.GetTags()),
5566 "uid_pool": cluster.uid_pool,
5567 "default_iallocator": cluster.default_iallocator,
5568 "reserved_lvs": cluster.reserved_lvs,
5569 "primary_ip_version": primary_ip_version,
5570 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5571 "hidden_os": cluster.hidden_os,
5572 "blacklisted_os": cluster.blacklisted_os,
5578 class LUClusterConfigQuery(NoHooksLU):
5579 """Return configuration values.
5583 _FIELDS_DYNAMIC = utils.FieldSet()
5584 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5585 "watcher_pause", "volume_group_name")
5587 def CheckArguments(self):
5588 _CheckOutputFields(static=self._FIELDS_STATIC,
5589 dynamic=self._FIELDS_DYNAMIC,
5590 selected=self.op.output_fields)
5592 def ExpandNames(self):
5593 self.needed_locks = {}
5595 def Exec(self, feedback_fn):
5596 """Dump a representation of the cluster config to the standard output.
5600 for field in self.op.output_fields:
5601 if field == "cluster_name":
5602 entry = self.cfg.GetClusterName()
5603 elif field == "master_node":
5604 entry = self.cfg.GetMasterNode()
5605 elif field == "drain_flag":
5606 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5607 elif field == "watcher_pause":
5608 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5609 elif field == "volume_group_name":
5610 entry = self.cfg.GetVGName()
5612 raise errors.ParameterError(field)
5613 values.append(entry)
5617 class LUInstanceActivateDisks(NoHooksLU):
5618 """Bring up an instance's disks.
5623 def ExpandNames(self):
5624 self._ExpandAndLockInstance()
5625 self.needed_locks[locking.LEVEL_NODE] = []
5626 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5628 def DeclareLocks(self, level):
5629 if level == locking.LEVEL_NODE:
5630 self._LockInstancesNodes()
5632 def CheckPrereq(self):
5633 """Check prerequisites.
5635 This checks that the instance is in the cluster.
5638 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5639 assert self.instance is not None, \
5640 "Cannot retrieve locked instance %s" % self.op.instance_name
5641 _CheckNodeOnline(self, self.instance.primary_node)
5643 def Exec(self, feedback_fn):
5644 """Activate the disks.
5647 disks_ok, disks_info = \
5648 _AssembleInstanceDisks(self, self.instance,
5649 ignore_size=self.op.ignore_size)
5651 raise errors.OpExecError("Cannot activate block devices")
5656 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5658 """Prepare the block devices for an instance.
5660 This sets up the block devices on all nodes.
5662 @type lu: L{LogicalUnit}
5663 @param lu: the logical unit on whose behalf we execute
5664 @type instance: L{objects.Instance}
5665 @param instance: the instance for whose disks we assemble
5666 @type disks: list of L{objects.Disk} or None
5667 @param disks: which disks to assemble (or all, if None)
5668 @type ignore_secondaries: boolean
5669 @param ignore_secondaries: if true, errors on secondary nodes
5670 won't result in an error return from the function
5671 @type ignore_size: boolean
5672 @param ignore_size: if true, the current known size of the disk
5673 will not be used during the disk activation, useful for cases
5674 when the size is wrong
5675 @return: False if the operation failed, otherwise a list of
5676 (host, instance_visible_name, node_visible_name)
5677 with the mapping from node devices to instance devices
5682 iname = instance.name
5683 disks = _ExpandCheckDisks(instance, disks)
5685 # With the two-pass mechanism we try to reduce the window of
5686 # opportunity for the race condition of switching DRBD to primary
5687 # before handshaking has occurred, but we do not eliminate it
5689 # The proper fix would be to wait (with some limits) until the
5690 # connection has been made and drbd transitions from WFConnection
5691 # into any other network-connected state (Connected, SyncTarget,
5694 # 1st pass, assemble on all nodes in secondary mode
5695 for idx, inst_disk in enumerate(disks):
5696 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5698 node_disk = node_disk.Copy()
5699 node_disk.UnsetSize()
5700 lu.cfg.SetDiskID(node_disk, node)
5701 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5702 msg = result.fail_msg
5704 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5705 " (is_primary=False, pass=1): %s",
5706 inst_disk.iv_name, node, msg)
5707 if not ignore_secondaries:
5710 # FIXME: race condition on drbd migration to primary
5712 # 2nd pass, do only the primary node
5713 for idx, inst_disk in enumerate(disks):
5716 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5717 if node != instance.primary_node:
5720 node_disk = node_disk.Copy()
5721 node_disk.UnsetSize()
5722 lu.cfg.SetDiskID(node_disk, node)
5723 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5724 msg = result.fail_msg
5726 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5727 " (is_primary=True, pass=2): %s",
5728 inst_disk.iv_name, node, msg)
5731 dev_path = result.payload
5733 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5735 # leave the disks configured for the primary node
5736 # this is a workaround that would be fixed better by
5737 # improving the logical/physical id handling
5739 lu.cfg.SetDiskID(disk, instance.primary_node)
5741 return disks_ok, device_info
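# Illustrative sketch of the return value shape, assuming a hypothetical
# two-disk instance (node and device names below are made up):
#
#   disks_ok, device_info = _AssembleInstanceDisks(lu, instance)
#   # disks_ok    -> whether all required disks assembled successfully
#   # device_info -> [("node1.example.com", "disk/0", "/dev/drbd0"),
#   #                 ("node1.example.com", "disk/1", "/dev/drbd1")]
#   # i.e. (primary node, instance-visible disk name, device path), one
#   # entry per disk, as appended in the second (primary) pass above.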
5744 def _StartInstanceDisks(lu, instance, force):
5745 """Start the disks of an instance.
5748 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5749 ignore_secondaries=force)
5751 _ShutdownInstanceDisks(lu, instance)
5752 if force is not None and not force:
5753 lu.proc.LogWarning("", hint="If the message above refers to a"
5755 " secondary node, you can retry the operation using '--force'.")
5756 raise errors.OpExecError("Disk consistency error")
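# Note: _StartInstanceDisks maps the caller's "force" flag onto the
# ignore_secondaries parameter of _AssembleInstanceDisks, so a forced start
# tolerates assembly errors on secondary nodes; on any failure the partially
# assembled disks are shut down again before the OpExecError is raised.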
5759 class LUInstanceDeactivateDisks(NoHooksLU):
5760 """Shutdown an instance's disks.
5765 def ExpandNames(self):
5766 self._ExpandAndLockInstance()
5767 self.needed_locks[locking.LEVEL_NODE] = []
5768 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5770 def DeclareLocks(self, level):
5771 if level == locking.LEVEL_NODE:
5772 self._LockInstancesNodes()
5774 def CheckPrereq(self):
5775 """Check prerequisites.
5777 This checks that the instance is in the cluster.
5780 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5781 assert self.instance is not None, \
5782 "Cannot retrieve locked instance %s" % self.op.instance_name
5784 def Exec(self, feedback_fn):
5785 """Deactivate the disks
5788 instance = self.instance
5790 _ShutdownInstanceDisks(self, instance)
5792 _SafeShutdownInstanceDisks(self, instance)
5795 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5796 """Shutdown block devices of an instance.
5798 This function checks if an instance is running, before calling
5799 _ShutdownInstanceDisks.
5802 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5803 _ShutdownInstanceDisks(lu, instance, disks=disks)
5806 def _ExpandCheckDisks(instance, disks):
5807 """Return the instance disks selected by the disks list
5809 @type disks: list of L{objects.Disk} or None
5810 @param disks: selected disks
5811 @rtype: list of L{objects.Disk}
5812 @return: selected instance disks to act on
5816 return instance.disks
5818 if not set(disks).issubset(instance.disks):
5819 raise errors.ProgrammerError("Can only act on disks belonging to the"
5824 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5825 """Shutdown block devices of an instance.
5827 This does the shutdown on all nodes of the instance.
5829 If ignore_primary is false, errors on the primary node make the shutdown fail.
5834 disks = _ExpandCheckDisks(instance, disks)
5837 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5838 lu.cfg.SetDiskID(top_disk, node)
5839 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5840 msg = result.fail_msg
5842 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5843 disk.iv_name, node, msg)
5844 if ((node == instance.primary_node and not ignore_primary) or
5845 (node != instance.primary_node and not result.offline)):
5850 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5851 """Checks if a node has enough free memory.
5853 This function checks if a given node has the needed amount of free
5854 memory. In case the node has less memory or we cannot get the
5855 information from the node, this function raises an OpPrereqError
5858 @type lu: C{LogicalUnit}
5859 @param lu: a logical unit from which we get configuration data
5861 @param node: the node to check
5862 @type reason: C{str}
5863 @param reason: string to use in the error message
5864 @type requested: C{int}
5865 @param requested: the amount of memory in MiB to check for
5866 @type hypervisor_name: C{str}
5867 @param hypervisor_name: the hypervisor to ask for memory stats
5868 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5869 we cannot check the node
5872 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5873 nodeinfo[node].Raise("Can't get data from node %s" % node,
5874 prereq=True, ecode=errors.ECODE_ENVIRON)
5875 free_mem = nodeinfo[node].payload.get("memory_free", None)
5876 if not isinstance(free_mem, int):
5877 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5878 " was '%s'" % (node, free_mem),
5879 errors.ECODE_ENVIRON)
5880 if requested > free_mem:
5881 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5882 " needed %s MiB, available %s MiB" %
5883 (node, reason, requested, free_mem),
5887 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5888 """Checks if nodes have enough free disk space in all VGs.
5890 This function checks if all given nodes have the needed amount of
5891 free disk. In case any node has less disk or we cannot get the
5892 information from the node, this function raises an OpPrereqError
5895 @type lu: C{LogicalUnit}
5896 @param lu: a logical unit from which we get configuration data
5897 @type nodenames: C{list}
5898 @param nodenames: the list of node names to check
5899 @type req_sizes: C{dict}
5900 @param req_sizes: the hash of vg and corresponding amount of disk in
5902 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5903 or we cannot check the node
5906 for vg, req_size in req_sizes.items():
5907 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
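# Illustrative example (hypothetical node and volume group names): req_sizes
# maps each volume group to the total amount of disk space, in MiB, that must
# be free on every node in nodenames, e.g.
#
#   _CheckNodesFreeDiskPerVG(lu, ["node1.example.com", "node2.example.com"],
#                            {"xenvg": 10240, "ssdvg": 2048})
#
# which performs one _CheckNodesFreeDiskOnVG call per volume group.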
5910 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5911 """Checks if nodes have enough free disk space in the specified VG.
5913 This function checks if all given nodes have the needed amount of
5914 free disk. In case any node has less disk or we cannot get the
5915 information from the node, this function raises an OpPrereqError
5918 @type lu: C{LogicalUnit}
5919 @param lu: a logical unit from which we get configuration data
5920 @type nodenames: C{list}
5921 @param nodenames: the list of node names to check
5923 @param vg: the volume group to check
5924 @type requested: C{int}
5925 @param requested: the amount of disk in MiB to check for
5926 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5927 or we cannot check the node
5930 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5931 for node in nodenames:
5932 info = nodeinfo[node]
5933 info.Raise("Cannot get current information from node %s" % node,
5934 prereq=True, ecode=errors.ECODE_ENVIRON)
5935 vg_free = info.payload.get("vg_free", None)
5936 if not isinstance(vg_free, int):
5937 raise errors.OpPrereqError("Can't compute free disk space on node"
5938 " %s for vg %s, result was '%s'" %
5939 (node, vg, vg_free), errors.ECODE_ENVIRON)
5940 if requested > vg_free:
5941 raise errors.OpPrereqError("Not enough disk space on target node %s"
5942 " vg %s: required %d MiB, available %d MiB" %
5943 (node, vg, requested, vg_free),
5947 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
5948 """Checks if nodes have enough physical CPUs
5950 This function checks if all given nodes have the needed number of
5951 physical CPUs. In case any node has less CPUs or we cannot get the
5952 information from the node, this function raises an OpPrereqError
5955 @type lu: C{LogicalUnit}
5956 @param lu: a logical unit from which we get configuration data
5957 @type nodenames: C{list}
5958 @param nodenames: the list of node names to check
5959 @type requested: C{int}
5960 @param requested: the minimum acceptable number of physical CPUs
5961 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
5962 or we cannot check the node
5965 nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name)
5966 for node in nodenames:
5967 info = nodeinfo[node]
5968 info.Raise("Cannot get current information from node %s" % node,
5969 prereq=True, ecode=errors.ECODE_ENVIRON)
5970 num_cpus = info.payload.get("cpu_total", None)
5971 if not isinstance(num_cpus, int):
5972 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
5973 " on node %s, result was '%s'" %
5974 (node, num_cpus), errors.ECODE_ENVIRON)
5975 if requested > num_cpus:
5976 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
5977 "required" % (node, num_cpus, requested),
5981 class LUInstanceStartup(LogicalUnit):
5982 """Starts an instance.
5985 HPATH = "instance-start"
5986 HTYPE = constants.HTYPE_INSTANCE
5989 def CheckArguments(self):
5991 if self.op.beparams:
5992 # fill the beparams dict
5993 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5995 def ExpandNames(self):
5996 self._ExpandAndLockInstance()
5998 def BuildHooksEnv(self):
6001 This runs on master, primary and secondary nodes of the instance.
6005 "FORCE": self.op.force,
6008 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6012 def BuildHooksNodes(self):
6013 """Build hooks nodes.
6016 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6019 def CheckPrereq(self):
6020 """Check prerequisites.
6022 This checks that the instance is in the cluster.
6025 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6026 assert self.instance is not None, \
6027 "Cannot retrieve locked instance %s" % self.op.instance_name
6030 if self.op.hvparams:
6031 # check hypervisor parameter syntax (locally)
6032 cluster = self.cfg.GetClusterInfo()
6033 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6034 filled_hvp = cluster.FillHV(instance)
6035 filled_hvp.update(self.op.hvparams)
6036 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6037 hv_type.CheckParameterSyntax(filled_hvp)
6038 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6040 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6042 if self.primary_offline and self.op.ignore_offline_nodes:
6043 self.proc.LogWarning("Ignoring offline primary node")
6045 if self.op.hvparams or self.op.beparams:
6046 self.proc.LogWarning("Overridden parameters are ignored")
6048 _CheckNodeOnline(self, instance.primary_node)
6050 bep = self.cfg.GetClusterInfo().FillBE(instance)
6052 # check bridges existence
6053 _CheckInstanceBridgesExist(self, instance)
6055 remote_info = self.rpc.call_instance_info(instance.primary_node,
6057 instance.hypervisor)
6058 remote_info.Raise("Error checking node %s" % instance.primary_node,
6059 prereq=True, ecode=errors.ECODE_ENVIRON)
6060 if not remote_info.payload: # not running already
6061 _CheckNodeFreeMemory(self, instance.primary_node,
6062 "starting instance %s" % instance.name,
6063 bep[constants.BE_MEMORY], instance.hypervisor)
6065 def Exec(self, feedback_fn):
6066 """Start the instance.
6069 instance = self.instance
6070 force = self.op.force
6072 if not self.op.no_remember:
6073 self.cfg.MarkInstanceUp(instance.name)
6075 if self.primary_offline:
6076 assert self.op.ignore_offline_nodes
6077 self.proc.LogInfo("Primary node offline, marked instance as started")
6079 node_current = instance.primary_node
6081 _StartInstanceDisks(self, instance, force)
6084 self.rpc.call_instance_start(node_current,
6085 (instance, self.op.hvparams,
6087 self.op.startup_paused)
6088 msg = result.fail_msg
6090 _ShutdownInstanceDisks(self, instance)
6091 raise errors.OpExecError("Could not start instance: %s" % msg)
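# Illustrative client-side sketch (assumed usage, hypothetical instance name):
# this LU is driven by an OpInstanceStartup opcode whose fields mirror the
# self.op attributes used above, roughly
#
#   op = opcodes.OpInstanceStartup(instance_name="inst1.example.com",
#                                  force=False, ignore_offline_nodes=False)
#
# with hvparams/beparams overrides, no_remember and startup_paused optional.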
6094 class LUInstanceReboot(LogicalUnit):
6095 """Reboot an instance.
6098 HPATH = "instance-reboot"
6099 HTYPE = constants.HTYPE_INSTANCE
6102 def ExpandNames(self):
6103 self._ExpandAndLockInstance()
6105 def BuildHooksEnv(self):
6108 This runs on master, primary and secondary nodes of the instance.
6112 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6113 "REBOOT_TYPE": self.op.reboot_type,
6114 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6117 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6121 def BuildHooksNodes(self):
6122 """Build hooks nodes.
6125 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6128 def CheckPrereq(self):
6129 """Check prerequisites.
6131 This checks that the instance is in the cluster.
6134 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6135 assert self.instance is not None, \
6136 "Cannot retrieve locked instance %s" % self.op.instance_name
6138 _CheckNodeOnline(self, instance.primary_node)
6140 # check bridges existence
6141 _CheckInstanceBridgesExist(self, instance)
6143 def Exec(self, feedback_fn):
6144 """Reboot the instance.
6147 instance = self.instance
6148 ignore_secondaries = self.op.ignore_secondaries
6149 reboot_type = self.op.reboot_type
6151 remote_info = self.rpc.call_instance_info(instance.primary_node,
6153 instance.hypervisor)
6154 remote_info.Raise("Error checking node %s" % instance.primary_node)
6155 instance_running = bool(remote_info.payload)
6157 node_current = instance.primary_node
6159 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6160 constants.INSTANCE_REBOOT_HARD]:
6161 for disk in instance.disks:
6162 self.cfg.SetDiskID(disk, node_current)
6163 result = self.rpc.call_instance_reboot(node_current, instance,
6165 self.op.shutdown_timeout)
6166 result.Raise("Could not reboot instance")
6168 if instance_running:
6169 result = self.rpc.call_instance_shutdown(node_current, instance,
6170 self.op.shutdown_timeout)
6171 result.Raise("Could not shutdown instance for full reboot")
6172 _ShutdownInstanceDisks(self, instance)
6174 self.LogInfo("Instance %s was already stopped, starting now",
6176 _StartInstanceDisks(self, instance, ignore_secondaries)
6177 result = self.rpc.call_instance_start(node_current,
6178 (instance, None, None), False)
6179 msg = result.fail_msg
6181 _ShutdownInstanceDisks(self, instance)
6182 raise errors.OpExecError("Could not start instance for"
6183 " full reboot: %s" % msg)
6185 self.cfg.MarkInstanceUp(instance.name)
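# Note on reboot types: for a running instance, INSTANCE_REBOOT_SOFT and
# INSTANCE_REBOOT_HARD are delegated to the hypervisor via
# call_instance_reboot; any other reboot type, or a reboot of a stopped
# instance, is handled as a full cycle: shutdown (if running), deactivate
# the disks, reactivate them and start the instance again.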
6188 class LUInstanceShutdown(LogicalUnit):
6189 """Shutdown an instance.
6192 HPATH = "instance-stop"
6193 HTYPE = constants.HTYPE_INSTANCE
6196 def ExpandNames(self):
6197 self._ExpandAndLockInstance()
6199 def BuildHooksEnv(self):
6202 This runs on master, primary and secondary nodes of the instance.
6205 env = _BuildInstanceHookEnvByObject(self, self.instance)
6206 env["TIMEOUT"] = self.op.timeout
6209 def BuildHooksNodes(self):
6210 """Build hooks nodes.
6213 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6216 def CheckPrereq(self):
6217 """Check prerequisites.
6219 This checks that the instance is in the cluster.
6222 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6223 assert self.instance is not None, \
6224 "Cannot retrieve locked instance %s" % self.op.instance_name
6226 self.primary_offline = \
6227 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6229 if self.primary_offline and self.op.ignore_offline_nodes:
6230 self.proc.LogWarning("Ignoring offline primary node")
6232 _CheckNodeOnline(self, self.instance.primary_node)
6234 def Exec(self, feedback_fn):
6235 """Shutdown the instance.
6238 instance = self.instance
6239 node_current = instance.primary_node
6240 timeout = self.op.timeout
6242 if not self.op.no_remember:
6243 self.cfg.MarkInstanceDown(instance.name)
6245 if self.primary_offline:
6246 assert self.op.ignore_offline_nodes
6247 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6249 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6250 msg = result.fail_msg
6252 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6254 _ShutdownInstanceDisks(self, instance)
6257 class LUInstanceReinstall(LogicalUnit):
6258 """Reinstall an instance.
6261 HPATH = "instance-reinstall"
6262 HTYPE = constants.HTYPE_INSTANCE
6265 def ExpandNames(self):
6266 self._ExpandAndLockInstance()
6268 def BuildHooksEnv(self):
6271 This runs on master, primary and secondary nodes of the instance.
6274 return _BuildInstanceHookEnvByObject(self, self.instance)
6276 def BuildHooksNodes(self):
6277 """Build hooks nodes.
6280 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6283 def CheckPrereq(self):
6284 """Check prerequisites.
6286 This checks that the instance is in the cluster and is not running.
6289 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6290 assert instance is not None, \
6291 "Cannot retrieve locked instance %s" % self.op.instance_name
6292 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6293 " offline, cannot reinstall")
6294 for node in instance.secondary_nodes:
6295 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6296 " cannot reinstall")
6298 if instance.disk_template == constants.DT_DISKLESS:
6299 raise errors.OpPrereqError("Instance '%s' has no disks" %
6300 self.op.instance_name,
6302 _CheckInstanceDown(self, instance, "cannot reinstall")
6304 if self.op.os_type is not None:
6306 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6307 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6308 instance_os = self.op.os_type
6310 instance_os = instance.os
6312 nodelist = list(instance.all_nodes)
6314 if self.op.osparams:
6315 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6316 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6317 self.os_inst = i_osdict # the new dict (without defaults)
6321 self.instance = instance
6323 def Exec(self, feedback_fn):
6324 """Reinstall the instance.
6327 inst = self.instance
6329 if self.op.os_type is not None:
6330 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6331 inst.os = self.op.os_type
6332 # Write to configuration
6333 self.cfg.Update(inst, feedback_fn)
6335 _StartInstanceDisks(self, inst, None)
6337 feedback_fn("Running the instance OS create scripts...")
6338 # FIXME: pass debug option from opcode to backend
6339 result = self.rpc.call_instance_os_add(inst.primary_node,
6340 (inst, self.os_inst), True,
6341 self.op.debug_level)
6342 result.Raise("Could not install OS for instance %s on node %s" %
6343 (inst.name, inst.primary_node))
6345 _ShutdownInstanceDisks(self, inst)
6348 class LUInstanceRecreateDisks(LogicalUnit):
6349 """Recreate an instance's missing disks.
6352 HPATH = "instance-recreate-disks"
6353 HTYPE = constants.HTYPE_INSTANCE
6356 def CheckArguments(self):
6357 # normalise the disk list
6358 self.op.disks = sorted(frozenset(self.op.disks))
6360 def ExpandNames(self):
6361 self._ExpandAndLockInstance()
6362 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6364 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6365 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6367 self.needed_locks[locking.LEVEL_NODE] = []
6369 def DeclareLocks(self, level):
6370 if level == locking.LEVEL_NODE:
6371 # if we replace the nodes, we only need to lock the old primary,
6372 # otherwise we need to lock all nodes for disk re-creation
6373 primary_only = bool(self.op.nodes)
6374 self._LockInstancesNodes(primary_only=primary_only)
6376 def BuildHooksEnv(self):
6379 This runs on master, primary and secondary nodes of the instance.
6382 return _BuildInstanceHookEnvByObject(self, self.instance)
6384 def BuildHooksNodes(self):
6385 """Build hooks nodes.
6388 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6391 def CheckPrereq(self):
6392 """Check prerequisites.
6394 This checks that the instance is in the cluster and is not running.
6397 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6398 assert instance is not None, \
6399 "Cannot retrieve locked instance %s" % self.op.instance_name
6401 if len(self.op.nodes) != len(instance.all_nodes):
6402 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6403 " %d replacement nodes were specified" %
6404 (instance.name, len(instance.all_nodes),
6405 len(self.op.nodes)),
6407 assert instance.disk_template != constants.DT_DRBD8 or \
6408 len(self.op.nodes) == 2
6409 assert instance.disk_template != constants.DT_PLAIN or \
6410 len(self.op.nodes) == 1
6411 primary_node = self.op.nodes[0]
6413 primary_node = instance.primary_node
6414 _CheckNodeOnline(self, primary_node)
6416 if instance.disk_template == constants.DT_DISKLESS:
6417 raise errors.OpPrereqError("Instance '%s' has no disks" %
6418 self.op.instance_name, errors.ECODE_INVAL)
6419 # if we replace nodes *and* the old primary is offline, we don't
6421 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6422 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6423 if not (self.op.nodes and old_pnode.offline):
6424 _CheckInstanceDown(self, instance, "cannot recreate disks")
6426 if not self.op.disks:
6427 self.op.disks = range(len(instance.disks))
6429 for idx in self.op.disks:
6430 if idx >= len(instance.disks):
6431 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6433 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6434 raise errors.OpPrereqError("Can't recreate disks partially and"
6435 " change the nodes at the same time",
6437 self.instance = instance
6439 def Exec(self, feedback_fn):
6440 """Recreate the disks.
6443 instance = self.instance
6446 mods = [] # keeps track of needed logical_id changes
6448 for idx, disk in enumerate(instance.disks):
6449 if idx not in self.op.disks: # disk idx has not been passed in
6452 # update secondaries for disks, if needed
6454 if disk.dev_type == constants.LD_DRBD8:
6455 # need to update the nodes and minors
6456 assert len(self.op.nodes) == 2
6457 assert len(disk.logical_id) == 6 # otherwise disk internals
6459 (_, _, old_port, _, _, old_secret) = disk.logical_id
6460 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6461 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6462 new_minors[0], new_minors[1], old_secret)
6463 assert len(disk.logical_id) == len(new_id)
6464 mods.append((idx, new_id))
6466 # now that we have passed all asserts above, we can apply the mods
6467 # in a single run (to avoid partial changes)
6468 for idx, new_id in mods:
6469 instance.disks[idx].logical_id = new_id
6471 # change primary node, if needed
6473 instance.primary_node = self.op.nodes[0]
6474 self.LogWarning("Changing the instance's nodes, you will have to"
6475 " remove any disks left on the older nodes manually")
6478 self.cfg.Update(instance, feedback_fn)
6480 _CreateDisks(self, instance, to_skip=to_skip)
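# Illustrative note (values below are hypothetical): for DRBD8 disks the
# logical_id rewritten above is the 6-tuple
# (nodeA, nodeB, port, minorA, minorB, secret), so recreating on new nodes
# only swaps the node names and the freshly allocated minors while the port
# and the shared secret are preserved, e.g.
#
#   ("node3.example.com", "node4.example.com", 11000, 0, 1, "0123abcd")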
6483 class LUInstanceRename(LogicalUnit):
6484 """Rename an instance.
6487 HPATH = "instance-rename"
6488 HTYPE = constants.HTYPE_INSTANCE
6490 def CheckArguments(self):
6494 if self.op.ip_check and not self.op.name_check:
6495 # TODO: make the ip check more flexible and not depend on the name check
6496 raise errors.OpPrereqError("IP address check requires a name check",
6499 def BuildHooksEnv(self):
6502 This runs on master, primary and secondary nodes of the instance.
6505 env = _BuildInstanceHookEnvByObject(self, self.instance)
6506 env["INSTANCE_NEW_NAME"] = self.op.new_name
6509 def BuildHooksNodes(self):
6510 """Build hooks nodes.
6513 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6516 def CheckPrereq(self):
6517 """Check prerequisites.
6519 This checks that the instance is in the cluster and is not running.
6522 self.op.instance_name = _ExpandInstanceName(self.cfg,
6523 self.op.instance_name)
6524 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6525 assert instance is not None
6526 _CheckNodeOnline(self, instance.primary_node)
6527 _CheckInstanceDown(self, instance, "cannot rename")
6528 self.instance = instance
6530 new_name = self.op.new_name
6531 if self.op.name_check:
6532 hostname = netutils.GetHostname(name=new_name)
6533 if hostname != new_name:
6534 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6536 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6537 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6538 " same as given hostname '%s'") %
6539 (hostname.name, self.op.new_name),
6541 new_name = self.op.new_name = hostname.name
6542 if (self.op.ip_check and
6543 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6544 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6545 (hostname.ip, new_name),
6546 errors.ECODE_NOTUNIQUE)
6548 instance_list = self.cfg.GetInstanceList()
6549 if new_name in instance_list and new_name != instance.name:
6550 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6551 new_name, errors.ECODE_EXISTS)
6553 def Exec(self, feedback_fn):
6554 """Rename the instance.
6557 inst = self.instance
6558 old_name = inst.name
6560 rename_file_storage = False
6561 if (inst.disk_template in constants.DTS_FILEBASED and
6562 self.op.new_name != inst.name):
6563 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6564 rename_file_storage = True
6566 self.cfg.RenameInstance(inst.name, self.op.new_name)
6567 # Change the instance lock. This is definitely safe while we hold the BGL.
6568 # Otherwise the new lock would have to be added in acquired mode.
6570 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6571 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6573 # re-read the instance from the configuration after rename
6574 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6576 if rename_file_storage:
6577 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6578 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6579 old_file_storage_dir,
6580 new_file_storage_dir)
6581 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6582 " (but the instance has been renamed in Ganeti)" %
6583 (inst.primary_node, old_file_storage_dir,
6584 new_file_storage_dir))
6586 _StartInstanceDisks(self, inst, None)
6588 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6589 old_name, self.op.debug_level)
6590 msg = result.fail_msg
6592 msg = ("Could not run OS rename script for instance %s on node %s"
6593 " (but the instance has been renamed in Ganeti): %s" %
6594 (inst.name, inst.primary_node, msg))
6595 self.proc.LogWarning(msg)
6597 _ShutdownInstanceDisks(self, inst)
6602 class LUInstanceRemove(LogicalUnit):
6603 """Remove an instance.
6606 HPATH = "instance-remove"
6607 HTYPE = constants.HTYPE_INSTANCE
6610 def ExpandNames(self):
6611 self._ExpandAndLockInstance()
6612 self.needed_locks[locking.LEVEL_NODE] = []
6613 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6615 def DeclareLocks(self, level):
6616 if level == locking.LEVEL_NODE:
6617 self._LockInstancesNodes()
6619 def BuildHooksEnv(self):
6622 This runs on master, primary and secondary nodes of the instance.
6625 env = _BuildInstanceHookEnvByObject(self, self.instance)
6626 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6629 def BuildHooksNodes(self):
6630 """Build hooks nodes.
6633 nl = [self.cfg.GetMasterNode()]
6634 nl_post = list(self.instance.all_nodes) + nl
6635 return (nl, nl_post)
6637 def CheckPrereq(self):
6638 """Check prerequisites.
6640 This checks that the instance is in the cluster.
6643 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6644 assert self.instance is not None, \
6645 "Cannot retrieve locked instance %s" % self.op.instance_name
6647 def Exec(self, feedback_fn):
6648 """Remove the instance.
6651 instance = self.instance
6652 logging.info("Shutting down instance %s on node %s",
6653 instance.name, instance.primary_node)
6655 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6656 self.op.shutdown_timeout)
6657 msg = result.fail_msg
6659 if self.op.ignore_failures:
6660 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6662 raise errors.OpExecError("Could not shutdown instance %s on"
6664 (instance.name, instance.primary_node, msg))
6666 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6669 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6670 """Utility function to remove an instance.
6673 logging.info("Removing block devices for instance %s", instance.name)
6675 if not _RemoveDisks(lu, instance):
6676 if not ignore_failures:
6677 raise errors.OpExecError("Can't remove instance's disks")
6678 feedback_fn("Warning: can't remove instance's disks")
6680 logging.info("Removing instance %s out of cluster config", instance.name)
6682 lu.cfg.RemoveInstance(instance.name)
6684 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6685 "Instance lock removal conflict"
6687 # Remove lock for the instance
6688 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6691 class LUInstanceQuery(NoHooksLU):
6692 """Logical unit for querying instances.
6695 # pylint: disable=W0142
6698 def CheckArguments(self):
6699 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6700 self.op.output_fields, self.op.use_locking)
6702 def ExpandNames(self):
6703 self.iq.ExpandNames(self)
6705 def DeclareLocks(self, level):
6706 self.iq.DeclareLocks(self, level)
6708 def Exec(self, feedback_fn):
6709 return self.iq.OldStyleQuery(self)
6712 class LUInstanceFailover(LogicalUnit):
6713 """Failover an instance.
6716 HPATH = "instance-failover"
6717 HTYPE = constants.HTYPE_INSTANCE
6720 def CheckArguments(self):
6721 """Check the arguments.
6724 self.iallocator = getattr(self.op, "iallocator", None)
6725 self.target_node = getattr(self.op, "target_node", None)
6727 def ExpandNames(self):
6728 self._ExpandAndLockInstance()
6730 if self.op.target_node is not None:
6731 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6733 self.needed_locks[locking.LEVEL_NODE] = []
6734 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6736 ignore_consistency = self.op.ignore_consistency
6737 shutdown_timeout = self.op.shutdown_timeout
6738 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6741 ignore_consistency=ignore_consistency,
6742 shutdown_timeout=shutdown_timeout)
6743 self.tasklets = [self._migrater]
6745 def DeclareLocks(self, level):
6746 if level == locking.LEVEL_NODE:
6747 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6748 if instance.disk_template in constants.DTS_EXT_MIRROR:
6749 if self.op.target_node is None:
6750 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6752 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6753 self.op.target_node]
6754 del self.recalculate_locks[locking.LEVEL_NODE]
6756 self._LockInstancesNodes()
6758 def BuildHooksEnv(self):
6761 This runs on master, primary and secondary nodes of the instance.
6764 instance = self._migrater.instance
6765 source_node = instance.primary_node
6766 target_node = self.op.target_node
6768 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6769 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6770 "OLD_PRIMARY": source_node,
6771 "NEW_PRIMARY": target_node,
6774 if instance.disk_template in constants.DTS_INT_MIRROR:
6775 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6776 env["NEW_SECONDARY"] = source_node
6778 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6780 env.update(_BuildInstanceHookEnvByObject(self, instance))
6784 def BuildHooksNodes(self):
6785 """Build hooks nodes.
6788 instance = self._migrater.instance
6789 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6790 return (nl, nl + [instance.primary_node])
6793 class LUInstanceMigrate(LogicalUnit):
6794 """Migrate an instance.
6796 This is migration without shutting down the instance, as opposed to
6797 failover, which is done with a shutdown.
6800 HPATH = "instance-migrate"
6801 HTYPE = constants.HTYPE_INSTANCE
6804 def ExpandNames(self):
6805 self._ExpandAndLockInstance()
6807 if self.op.target_node is not None:
6808 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6810 self.needed_locks[locking.LEVEL_NODE] = []
6811 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6813 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6814 cleanup=self.op.cleanup,
6816 fallback=self.op.allow_failover)
6817 self.tasklets = [self._migrater]
6819 def DeclareLocks(self, level):
6820 if level == locking.LEVEL_NODE:
6821 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6822 if instance.disk_template in constants.DTS_EXT_MIRROR:
6823 if self.op.target_node is None:
6824 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6826 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6827 self.op.target_node]
6828 del self.recalculate_locks[locking.LEVEL_NODE]
6830 self._LockInstancesNodes()
6832 def BuildHooksEnv(self):
6835 This runs on master, primary and secondary nodes of the instance.
6838 instance = self._migrater.instance
6839 source_node = instance.primary_node
6840 target_node = self.op.target_node
6841 env = _BuildInstanceHookEnvByObject(self, instance)
6843 "MIGRATE_LIVE": self._migrater.live,
6844 "MIGRATE_CLEANUP": self.op.cleanup,
6845 "OLD_PRIMARY": source_node,
6846 "NEW_PRIMARY": target_node,
6849 if instance.disk_template in constants.DTS_INT_MIRROR:
6850 env["OLD_SECONDARY"] = target_node
6851 env["NEW_SECONDARY"] = source_node
6853 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6857 def BuildHooksNodes(self):
6858 """Build hooks nodes.
6861 instance = self._migrater.instance
6862 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6863 return (nl, nl + [instance.primary_node])
6866 class LUInstanceMove(LogicalUnit):
6867 """Move an instance by data-copying.
6870 HPATH = "instance-move"
6871 HTYPE = constants.HTYPE_INSTANCE
6874 def ExpandNames(self):
6875 self._ExpandAndLockInstance()
6876 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6877 self.op.target_node = target_node
6878 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6879 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6881 def DeclareLocks(self, level):
6882 if level == locking.LEVEL_NODE:
6883 self._LockInstancesNodes(primary_only=True)
6885 def BuildHooksEnv(self):
6888 This runs on master, primary and secondary nodes of the instance.
6892 "TARGET_NODE": self.op.target_node,
6893 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6895 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6898 def BuildHooksNodes(self):
6899 """Build hooks nodes.
6903 self.cfg.GetMasterNode(),
6904 self.instance.primary_node,
6905 self.op.target_node,
6909 def CheckPrereq(self):
6910 """Check prerequisites.
6912 This checks that the instance is in the cluster.
6915 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6916 assert self.instance is not None, \
6917 "Cannot retrieve locked instance %s" % self.op.instance_name
6919 node = self.cfg.GetNodeInfo(self.op.target_node)
6920 assert node is not None, \
6921 "Cannot retrieve locked node %s" % self.op.target_node
6923 self.target_node = target_node = node.name
6925 if target_node == instance.primary_node:
6926 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6927 (instance.name, target_node),
6930 bep = self.cfg.GetClusterInfo().FillBE(instance)
6932 for idx, dsk in enumerate(instance.disks):
6933 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6934 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6935 " cannot copy" % idx, errors.ECODE_STATE)
6937 _CheckNodeOnline(self, target_node)
6938 _CheckNodeNotDrained(self, target_node)
6939 _CheckNodeVmCapable(self, target_node)
6941 if instance.admin_up:
6942 # check memory requirements on the secondary node
6943 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6944 instance.name, bep[constants.BE_MEMORY],
6945 instance.hypervisor)
6947 self.LogInfo("Not checking memory on the secondary node as"
6948 " instance will not be started")
6950 # check bridge existence
6951 _CheckInstanceBridgesExist(self, instance, node=target_node)
6953 def Exec(self, feedback_fn):
6954 """Move an instance.
6956 The move is done by shutting it down on its present node, copying
6957 the data over (slow) and starting it on the new node.
6960 instance = self.instance
6962 source_node = instance.primary_node
6963 target_node = self.target_node
6965 self.LogInfo("Shutting down instance %s on source node %s",
6966 instance.name, source_node)
6968 result = self.rpc.call_instance_shutdown(source_node, instance,
6969 self.op.shutdown_timeout)
6970 msg = result.fail_msg
6972 if self.op.ignore_consistency:
6973 self.proc.LogWarning("Could not shutdown instance %s on node %s."
6974 " Proceeding anyway. Please make sure node"
6975 " %s is down. Error details: %s",
6976 instance.name, source_node, source_node, msg)
6978 raise errors.OpExecError("Could not shutdown instance %s on"
6980 (instance.name, source_node, msg))
6982 # create the target disks
6984 _CreateDisks(self, instance, target_node=target_node)
6985 except errors.OpExecError:
6986 self.LogWarning("Device creation failed, reverting...")
6988 _RemoveDisks(self, instance, target_node=target_node)
6990 self.cfg.ReleaseDRBDMinors(instance.name)
6993 cluster_name = self.cfg.GetClusterInfo().cluster_name
6996 # activate, get path, copy the data over
6997 for idx, disk in enumerate(instance.disks):
6998 self.LogInfo("Copying data for disk %d", idx)
6999 result = self.rpc.call_blockdev_assemble(target_node, disk,
7000 instance.name, True, idx)
7002 self.LogWarning("Can't assemble newly created disk %d: %s",
7003 idx, result.fail_msg)
7004 errs.append(result.fail_msg)
7006 dev_path = result.payload
7007 result = self.rpc.call_blockdev_export(source_node, disk,
7008 target_node, dev_path,
7011 self.LogWarning("Can't copy data over for disk %d: %s",
7012 idx, result.fail_msg)
7013 errs.append(result.fail_msg)
7017 self.LogWarning("Some disks failed to copy, aborting")
7019 _RemoveDisks(self, instance, target_node=target_node)
7021 self.cfg.ReleaseDRBDMinors(instance.name)
7022 raise errors.OpExecError("Errors during disk copy: %s" %
7025 instance.primary_node = target_node
7026 self.cfg.Update(instance, feedback_fn)
7028 self.LogInfo("Removing the disks on the original node")
7029 _RemoveDisks(self, instance, target_node=source_node)
7031 # Only start the instance if it's marked as up
7032 if instance.admin_up:
7033 self.LogInfo("Starting instance %s on node %s",
7034 instance.name, target_node)
7036 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7037 ignore_secondaries=True)
7039 _ShutdownInstanceDisks(self, instance)
7040 raise errors.OpExecError("Can't activate the instance's disks")
7042 result = self.rpc.call_instance_start(target_node,
7043 (instance, None, None), False)
7044 msg = result.fail_msg
7046 _ShutdownInstanceDisks(self, instance)
7047 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7048 (instance.name, target_node, msg))
7051 class LUNodeMigrate(LogicalUnit):
7052 """Migrate all instances from a node.
7055 HPATH = "node-migrate"
7056 HTYPE = constants.HTYPE_NODE
7059 def CheckArguments(self):
7062 def ExpandNames(self):
7063 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7065 self.share_locks = _ShareAll()
7066 self.needed_locks = {
7067 locking.LEVEL_NODE: [self.op.node_name],
7070 def BuildHooksEnv(self):
7073 This runs on the master, the primary and all the secondaries.
7077 "NODE_NAME": self.op.node_name,
7080 def BuildHooksNodes(self):
7081 """Build hooks nodes.
7084 nl = [self.cfg.GetMasterNode()]
7087 def CheckPrereq(self):
7090 def Exec(self, feedback_fn):
7091 # Prepare jobs for migration instances
7093 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7096 iallocator=self.op.iallocator,
7097 target_node=self.op.target_node)]
7098 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7101 # TODO: Run iallocator in this opcode and pass correct placement options to
7102 # OpInstanceMigrate. Since other jobs can modify the cluster between
7103 # running the iallocator and the actual migration, a good consistency model
7104 # will have to be found.
7106 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7107 frozenset([self.op.node_name]))
7109 return ResultWithJobs(jobs)
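# Note: the jobs value built above is a list of single-opcode job definitions,
# one OpInstanceMigrate per primary instance of the node being evacuated, so
# each migration runs as its own job and can succeed or fail independently.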
7112 class TLMigrateInstance(Tasklet):
7113 """Tasklet class for instance migration.
7116 @ivar live: whether the migration will be done live or non-live;
7117 this variable is initialized only after CheckPrereq has run
7118 @type cleanup: boolean
7119 @ivar cleanup: Whether we clean up from a failed migration
7120 @type iallocator: string
7121 @ivar iallocator: The iallocator used to determine target_node
7122 @type target_node: string
7123 @ivar target_node: If given, the target_node to reallocate the instance to
7124 @type failover: boolean
7125 @ivar failover: Whether operation results in failover or migration
7126 @type fallback: boolean
7127 @ivar fallback: Whether fallback to failover is allowed if migration not
7129 @type ignore_consistency: boolean
7130 @ivar ignore_consistency: Whether we should ignore consistency between source
7132 @type shutdown_timeout: int
7133 @ivar shutdown_timeout: timeout to use for the instance shutdown in case of failover
7138 _MIGRATION_POLL_INTERVAL = 1 # seconds
7139 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7141 def __init__(self, lu, instance_name, cleanup=False,
7142 failover=False, fallback=False,
7143 ignore_consistency=False,
7144 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7145 """Initializes this class.
7148 Tasklet.__init__(self, lu)
7151 self.instance_name = instance_name
7152 self.cleanup = cleanup
7153 self.live = False # will be overridden later
7154 self.failover = failover
7155 self.fallback = fallback
7156 self.ignore_consistency = ignore_consistency
7157 self.shutdown_timeout = shutdown_timeout
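# Illustrative instantiation (as done by the owning LUs above; the exact
# keyword values shown are only examples): a failover-style use would look
# roughly like
#
#   TLMigrateInstance(self, self.op.instance_name, failover=True,
#                     ignore_consistency=self.op.ignore_consistency,
#                     shutdown_timeout=self.op.shutdown_timeout)
#
# while LUInstanceMigrate passes cleanup=self.op.cleanup and
# fallback=self.op.allow_failover instead.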
7159 def CheckPrereq(self):
7160 """Check prerequisites.
7162 This checks that the instance is in the cluster.
7165 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7166 instance = self.cfg.GetInstanceInfo(instance_name)
7167 assert instance is not None
7168 self.instance = instance
7170 if (not self.cleanup and not instance.admin_up and not self.failover and
7172 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7174 self.failover = True
7176 if instance.disk_template not in constants.DTS_MIRRORED:
7181 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7182 " %s" % (instance.disk_template, text),
7185 if instance.disk_template in constants.DTS_EXT_MIRROR:
7186 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7188 if self.lu.op.iallocator:
7189 self._RunAllocator()
7191 # We set self.target_node as it is required by
7193 self.target_node = self.lu.op.target_node
7195 # self.target_node is already populated, either directly or by the
7197 target_node = self.target_node
7198 if self.target_node == instance.primary_node:
7199 raise errors.OpPrereqError("Cannot migrate instance %s"
7200 " to its primary (%s)" %
7201 (instance.name, instance.primary_node))
7203 if len(self.lu.tasklets) == 1:
7204 # It is safe to release locks only when we're the only tasklet
7206 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7207 keep=[instance.primary_node, self.target_node])
7210 secondary_nodes = instance.secondary_nodes
7211 if not secondary_nodes:
7212 raise errors.ConfigurationError("No secondary node but using"
7213 " %s disk template" %
7214 instance.disk_template)
7215 target_node = secondary_nodes[0]
7216 if self.lu.op.iallocator or (self.lu.op.target_node and
7217 self.lu.op.target_node != target_node):
7219 text = "failed over"
7222 raise errors.OpPrereqError("Instances with disk template %s cannot"
7223 " be %s to arbitrary nodes"
7224 " (neither an iallocator nor a target"
7225 " node can be passed)" %
7226 (instance.disk_template, text),
7229 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7231 # check memory requirements on the secondary node
7232 if not self.failover or instance.admin_up:
7233 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7234 instance.name, i_be[constants.BE_MEMORY],
7235 instance.hypervisor)
7237 self.lu.LogInfo("Not checking memory on the secondary node as"
7238 " instance will not be started")
7240 # check bridge existance
7241 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7243 if not self.cleanup:
7244 _CheckNodeNotDrained(self.lu, target_node)
7245 if not self.failover:
7246 result = self.rpc.call_instance_migratable(instance.primary_node,
7248 if result.fail_msg and self.fallback:
7249 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7251 self.failover = True
7253 result.Raise("Can't migrate, please use failover",
7254 prereq=True, ecode=errors.ECODE_STATE)
7256 assert not (self.failover and self.cleanup)
7258 if not self.failover:
7259 if self.lu.op.live is not None and self.lu.op.mode is not None:
7260 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7261 " parameters is accepted",
7263 if self.lu.op.live is not None:
7265 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7267 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7268 # reset the 'live' parameter to None so that repeated
7269 # invocations of CheckPrereq do not raise an exception
7270 self.lu.op.live = None
7271 elif self.lu.op.mode is None:
7272 # read the default value from the hypervisor
7273 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7275 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7277 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7279 # Failover is never live
7282 def _RunAllocator(self):
7283 """Run the allocator based on input opcode.
7286 ial = IAllocator(self.cfg, self.rpc,
7287 mode=constants.IALLOCATOR_MODE_RELOC,
7288 name=self.instance_name,
7289 # TODO See why hail breaks with a single node below
7290 relocate_from=[self.instance.primary_node,
7291 self.instance.primary_node],
7294 ial.Run(self.lu.op.iallocator)
7297 raise errors.OpPrereqError("Can't compute nodes using"
7298 " iallocator '%s': %s" %
7299 (self.lu.op.iallocator, ial.info),
7301 if len(ial.result) != ial.required_nodes:
7302 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7303 " of nodes (%s), required %s" %
7304 (self.lu.op.iallocator, len(ial.result),
7305 ial.required_nodes), errors.ECODE_FAULT)
7306 self.target_node = ial.result[0]
7307 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7308 self.instance_name, self.lu.op.iallocator,
7309 utils.CommaJoin(ial.result))
7311 def _WaitUntilSync(self):
7312 """Poll with custom rpc for disk sync.
7314 This uses our own step-based rpc call.
7317 self.feedback_fn("* wait until resync is done")
7321 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7323 self.instance.disks)
7325 for node, nres in result.items():
7326 nres.Raise("Cannot resync disks on node %s" % node)
7327 node_done, node_percent = nres.payload
7328 all_done = all_done and node_done
7329 if node_percent is not None:
7330 min_percent = min(min_percent, node_percent)
7332 if min_percent < 100:
7333 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7336 def _EnsureSecondary(self, node):
7337 """Demote a node to secondary.
7340 self.feedback_fn("* switching node %s to secondary mode" % node)
7342 for dev in self.instance.disks:
7343 self.cfg.SetDiskID(dev, node)
7345 result = self.rpc.call_blockdev_close(node, self.instance.name,
7346 self.instance.disks)
7347 result.Raise("Cannot change disk to secondary on node %s" % node)
7349 def _GoStandalone(self):
7350 """Disconnect from the network.
7353 self.feedback_fn("* changing into standalone mode")
7354 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7355 self.instance.disks)
7356 for node, nres in result.items():
7357 nres.Raise("Cannot disconnect disks node %s" % node)
7359 def _GoReconnect(self, multimaster):
7360 """Reconnect to the network.
7366 msg = "single-master"
7367 self.feedback_fn("* changing disks into %s mode" % msg)
7368 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7369 self.instance.disks,
7370 self.instance.name, multimaster)
7371 for node, nres in result.items():
7372 nres.Raise("Cannot change disks config on node %s" % node)
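# Note: _GoStandalone/_GoReconnect implement the DRBD network dance used by
# the migration and cleanup code below: disconnect all disks from the
# network, then re-attach them either in dual-master mode (multimaster=True,
# needed while both nodes may run the instance) or back in single-master mode
# once only one primary remains.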
7374 def _ExecCleanup(self):
7375 """Try to cleanup after a failed migration.
7377 The cleanup is done by:
7378 - check that the instance is running only on one node
7379 (and update the config if needed)
7380 - change disks on its secondary node to secondary
7381 - wait until disks are fully synchronized
7382 - disconnect from the network
7383 - change disks into single-master mode
7384 - wait again until disks are fully synchronized
7387 instance = self.instance
7388 target_node = self.target_node
7389 source_node = self.source_node
7391 # check running on only one node
7392 self.feedback_fn("* checking where the instance actually runs"
7393 " (if this hangs, the hypervisor might be in"
7395 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7396 for node, result in ins_l.items():
7397 result.Raise("Can't contact node %s" % node)
7399 runningon_source = instance.name in ins_l[source_node].payload
7400 runningon_target = instance.name in ins_l[target_node].payload
7402 if runningon_source and runningon_target:
7403 raise errors.OpExecError("Instance seems to be running on two nodes,"
7404 " or the hypervisor is confused; you will have"
7405 " to ensure manually that it runs only on one"
7406 " and restart this operation")
7408 if not (runningon_source or runningon_target):
7409 raise errors.OpExecError("Instance does not seem to be running at all;"
7410 " in this case it's safer to repair by"
7411 " running 'gnt-instance stop' to ensure disk"
7412 " shutdown, and then restarting it")
7414 if runningon_target:
7415 # the migration has actually succeeded, we need to update the config
7416 self.feedback_fn("* instance running on secondary node (%s),"
7417 " updating config" % target_node)
7418 instance.primary_node = target_node
7419 self.cfg.Update(instance, self.feedback_fn)
7420 demoted_node = source_node
7422 self.feedback_fn("* instance confirmed to be running on its"
7423 " primary node (%s)" % source_node)
7424 demoted_node = target_node
7426 if instance.disk_template in constants.DTS_INT_MIRROR:
7427 self._EnsureSecondary(demoted_node)
7429 self._WaitUntilSync()
7430 except errors.OpExecError:
7431 # we ignore here errors, since if the device is standalone, it
7432 # won't be able to sync
7434 self._GoStandalone()
7435 self._GoReconnect(False)
7436 self._WaitUntilSync()
7438 self.feedback_fn("* done")
7440 def _RevertDiskStatus(self):
7441 """Try to revert the disk status after a failed migration.
7444 target_node = self.target_node
7445 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7449 self._EnsureSecondary(target_node)
7450 self._GoStandalone()
7451 self._GoReconnect(False)
7452 self._WaitUntilSync()
7453 except errors.OpExecError, err:
7454 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7455 " please try to recover the instance manually;"
7456 " error '%s'" % str(err))
7458 def _AbortMigration(self):
7459 """Call the hypervisor code to abort a started migration.
7462 instance = self.instance
7463 target_node = self.target_node
7464 source_node = self.source_node
7465 migration_info = self.migration_info
7467 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
7471 abort_msg = abort_result.fail_msg
7473 logging.error("Aborting migration failed on target node %s: %s",
7474 target_node, abort_msg)
7475 # Don't raise an exception here, as we still have to try to revert the
7476 # disk status, even if this step failed.
7478 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
7479 instance, False, self.live)
7480 abort_msg = abort_result.fail_msg
7482 logging.error("Aborting migration failed on source node %s: %s",
7483 source_node, abort_msg)
7485 def _ExecMigration(self):
7486 """Migrate an instance.
7488 The migration is done by:
7489 - change the disks into dual-master mode
7490 - wait until disks are fully synchronized again
7491 - migrate the instance
7492 - change disks on the new secondary node (the old primary) to secondary
7493 - wait until disks are fully synchronized
7494 - change disks into single-master mode
7497 instance = self.instance
7498 target_node = self.target_node
7499 source_node = self.source_node
7501 # Check for hypervisor version mismatch and warn the user.
7502 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7503 None, self.instance.hypervisor)
7504 src_info = nodeinfo[source_node]
7505 dst_info = nodeinfo[target_node]
7507 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7508 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7509 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7510 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7511 if src_version != dst_version:
7512 self.feedback_fn("* warning: hypervisor version mismatch between"
7513 " source (%s) and target (%s) node" %
7514 (src_version, dst_version))
7516 self.feedback_fn("* checking disk consistency between source and target")
7517 for dev in instance.disks:
7518 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7519 raise errors.OpExecError("Disk %s is degraded or not fully"
7520 " synchronized on target node,"
7521 " aborting migration" % dev.iv_name)
7523 # First get the migration information from the remote node
7524 result = self.rpc.call_migration_info(source_node, instance)
7525 msg = result.fail_msg
7527 log_err = ("Failed fetching source migration information from %s: %s" %
7529 logging.error(log_err)
7530 raise errors.OpExecError(log_err)
7532 self.migration_info = migration_info = result.payload
7534 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7535 # Then switch the disks to master/master mode
7536 self._EnsureSecondary(target_node)
7537 self._GoStandalone()
7538 self._GoReconnect(True)
7539 self._WaitUntilSync()
7541 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7542 result = self.rpc.call_accept_instance(target_node,
7545 self.nodes_ip[target_node])
7547 msg = result.fail_msg
7549 logging.error("Instance pre-migration failed, trying to revert"
7550 " disk status: %s", msg)
7551 self.feedback_fn("Pre-migration failed, aborting")
7552 self._AbortMigration()
7553 self._RevertDiskStatus()
7554 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7555 (instance.name, msg))
7557 self.feedback_fn("* migrating instance to %s" % target_node)
7558 result = self.rpc.call_instance_migrate(source_node, instance,
7559 self.nodes_ip[target_node],
7561 msg = result.fail_msg
7563 logging.error("Instance migration failed, trying to revert"
7564 " disk status: %s", msg)
7565 self.feedback_fn("Migration failed, aborting")
7566 self._AbortMigration()
7567 self._RevertDiskStatus()
7568 raise errors.OpExecError("Could not migrate instance %s: %s" %
7569 (instance.name, msg))
7571 self.feedback_fn("* starting memory transfer")
7572 last_feedback = time.time()
7574 result = self.rpc.call_instance_get_migration_status(source_node,
7576 msg = result.fail_msg
7577 ms = result.payload # MigrationStatus instance
7578 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
7579 logging.error("Instance migration failed, trying to revert"
7580 " disk status: %s", msg)
7581 self.feedback_fn("Migration failed, aborting")
7582 self._AbortMigration()
7583 self._RevertDiskStatus()
7584 raise errors.OpExecError("Could not migrate instance %s: %s" %
7585 (instance.name, msg))
7587 if result.payload.status != constants.HV_MIGRATION_ACTIVE:
7588 self.feedback_fn("* memory transfer complete")
7589 break
7591 if (utils.TimeoutExpired(last_feedback,
7592 self._MIGRATION_FEEDBACK_INTERVAL) and
7593 ms.transferred_ram is not None):
7594 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
7595 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
7596 last_feedback = time.time()
7598 time.sleep(self._MIGRATION_POLL_INTERVAL)
7600 result = self.rpc.call_instance_finalize_migration_src(source_node,
7601 instance,
7602 True,
7603 self.live)
7604 msg = result.fail_msg
7605 if msg:
7606 logging.error("Instance migration succeeded, but finalization failed"
7607 " on the source node: %s", msg)
7608 raise errors.OpExecError("Could not finalize instance migration: %s" %
7609 msg)
7611 instance.primary_node = target_node
7613 # distribute new instance config to the other nodes
7614 self.cfg.Update(instance, self.feedback_fn)
7616 result = self.rpc.call_instance_finalize_migration_dst(target_node,
7617 instance,
7618 migration_info,
7619 True)
7620 msg = result.fail_msg
7621 if msg:
7622 logging.error("Instance migration succeeded, but finalization failed"
7623 " on the target node: %s", msg)
7624 raise errors.OpExecError("Could not finalize instance migration: %s" %
7625 msg)
7627 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7628 self._EnsureSecondary(source_node)
7629 self._WaitUntilSync()
7630 self._GoStandalone()
7631 self._GoReconnect(False)
7632 self._WaitUntilSync()
7634 self.feedback_fn("* done")
7636 def _ExecFailover(self):
7637 """Failover an instance.
7639 The failover is done by shutting it down on its present node and
7640 starting it on the secondary.
7643 instance = self.instance
7644 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7646 source_node = instance.primary_node
7647 target_node = self.target_node
7649 if instance.admin_up:
7650 self.feedback_fn("* checking disk consistency between source and target")
7651 for dev in instance.disks:
7652 # for drbd, these are drbd over lvm
7653 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7654 if primary_node.offline:
7655 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7656 " target node %s" %
7657 (primary_node.name, dev.iv_name, target_node))
7658 elif not self.ignore_consistency:
7659 raise errors.OpExecError("Disk %s is degraded on target node,"
7660 " aborting failover" % dev.iv_name)
7661 else:
7662 self.feedback_fn("* not checking disk consistency as instance is not"
7663 " running")
7665 self.feedback_fn("* shutting down instance on source node")
7666 logging.info("Shutting down instance %s on node %s",
7667 instance.name, source_node)
7669 result = self.rpc.call_instance_shutdown(source_node, instance,
7670 self.shutdown_timeout)
7671 msg = result.fail_msg
7672 if msg:
7673 if self.ignore_consistency or primary_node.offline:
7674 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7675 " proceeding anyway; please make sure node"
7676 " %s is down; error details: %s",
7677 instance.name, source_node, source_node, msg)
7678 else:
7679 raise errors.OpExecError("Could not shutdown instance %s on"
7680 " node %s: %s" %
7681 (instance.name, source_node, msg))
7683 self.feedback_fn("* deactivating the instance's disks on source node")
7684 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7685 raise errors.OpExecError("Can't shut down the instance's disks")
7687 instance.primary_node = target_node
7688 # distribute new instance config to the other nodes
7689 self.cfg.Update(instance, self.feedback_fn)
7691 # Only start the instance if it's marked as up
7692 if instance.admin_up:
7693 self.feedback_fn("* activating the instance's disks on target node %s" %
7694 target_node)
7695 logging.info("Starting instance %s on node %s",
7696 instance.name, target_node)
7698 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7699 ignore_secondaries=True)
7700 if not disks_ok:
7701 _ShutdownInstanceDisks(self.lu, instance)
7702 raise errors.OpExecError("Can't activate the instance's disks")
7704 self.feedback_fn("* starting the instance on the target node %s" %
7705 target_node)
7706 result = self.rpc.call_instance_start(target_node, (instance, None, None),
7707 False)
7708 msg = result.fail_msg
7709 if msg:
7710 _ShutdownInstanceDisks(self.lu, instance)
7711 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7712 (instance.name, target_node, msg))
7714 def Exec(self, feedback_fn):
7715 """Perform the migration.
7718 self.feedback_fn = feedback_fn
7719 self.source_node = self.instance.primary_node
7721 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7722 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7723 self.target_node = self.instance.secondary_nodes[0]
7724 # Otherwise self.target_node has been populated either
7725 # directly, or through an iallocator.
7727 self.all_nodes = [self.source_node, self.target_node]
7728 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7729 in self.cfg.GetMultiNodeInfo(self.all_nodes))
7731 if self.failover:
7732 feedback_fn("Failover instance %s" % self.instance.name)
7733 self._ExecFailover()
7734 else:
7735 feedback_fn("Migrating instance %s" % self.instance.name)
7737 if self.cleanup:
7738 return self._ExecCleanup()
7739 else:
7740 return self._ExecMigration()
7743 def _CreateBlockDev(lu, node, instance, device, force_create,
7744 info, force_open):
7745 """Create a tree of block devices on a given node.
7747 If this device type has to be created on secondaries, create it and
7750 If not, just recurse to children keeping the same 'force' value.
7752 @param lu: the lu on whose behalf we execute
7753 @param node: the node on which to create the device
7754 @type instance: L{objects.Instance}
7755 @param instance: the instance which owns the device
7756 @type device: L{objects.Disk}
7757 @param device: the device to create
7758 @type force_create: boolean
7759 @param force_create: whether to force creation of this device; this
7760 will be changed to True whenever we find a device which has
7761 CreateOnSecondary() attribute
7762 @param info: the extra 'metadata' we should attach to the device
7763 (this will be represented as a LVM tag)
7764 @type force_open: boolean
7765 @param force_open: this parameter will be passed to the
7766 L{backend.BlockdevCreate} function where it specifies
7767 whether we run on primary or not, and it affects both
7768 the child assembly and the device's own Open() execution
7771 if device.CreateOnSecondary():
7772 force_create = True
7774 if device.children:
7775 for child in device.children:
7776 _CreateBlockDev(lu, node, instance, child, force_create,
7777 info, force_open)
7779 if not force_create:
7780 return
7782 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
7785 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7786 """Create a single block device on a given node.
7788 This will not recurse over children of the device, so they must be
7789 created in advance.
7791 @param lu: the lu on whose behalf we execute
7792 @param node: the node on which to create the device
7793 @type instance: L{objects.Instance}
7794 @param instance: the instance which owns the device
7795 @type device: L{objects.Disk}
7796 @param device: the device to create
7797 @param info: the extra 'metadata' we should attach to the device
7798 (this will be represented as a LVM tag)
7799 @type force_open: boolean
7800 @param force_open: this parameter will be passed to the
7801 L{backend.BlockdevCreate} function where it specifies
7802 whether we run on primary or not, and it affects both
7803 the child assembly and the device's own Open() execution
7806 lu.cfg.SetDiskID(device, node)
7807 result = lu.rpc.call_blockdev_create(node, device, device.size,
7808 instance.name, force_open, info)
7809 result.Raise("Can't create block device %s on"
7810 " node %s for instance %s" % (device, node, instance.name))
7811 if device.physical_id is None:
7812 device.physical_id = result.payload
7815 def _GenerateUniqueNames(lu, exts):
7816 """Generate a suitable LV name.
7818 This will generate a logical volume name for the given instance.
7821 results = []
7822 for val in exts:
7823 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7824 results.append("%s%s" % (new_id, val))
7826 return results
7828 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7829 iv_name, p_minor, s_minor):
7830 """Generate a drbd8 device complete with its children.
7833 assert len(vgnames) == len(names) == 2
7834 port = lu.cfg.AllocatePort()
7835 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7836 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7837 logical_id=(vgnames[0], names[0]))
7838 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
7839 logical_id=(vgnames[1], names[1]))
7840 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7841 logical_id=(primary, secondary, port,
7842 p_minor, s_minor,
7843 shared_secret),
7844 children=[dev_data, dev_meta],
7845 iv_name=iv_name)
7846 return drbd_dev
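# Sketch of the resulting device tree (illustrative, not part of the original
# source): the returned drbd_dev is an LD_DRBD8 disk of the requested size
# whose children are the data LV (vgnames[0]/names[0], size MiB) and the
# metadata LV (vgnames[1]/names[1], DRBD_META_SIZE MiB); its logical_id
# carries both node names, the allocated port, the two minors and the
# generated shared secret.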
7849 def _GenerateDiskTemplate(lu, template_name,
7850 instance_name, primary_node,
7851 secondary_nodes, disk_info,
7852 file_storage_dir, file_driver,
7853 base_index, feedback_fn):
7854 """Generate the entire disk layout for a given template type.
7857 #TODO: compute space requirements
7859 vgname = lu.cfg.GetVGName()
7860 disk_count = len(disk_info)
7861 disks = []
7862 if template_name == constants.DT_DISKLESS:
7863 pass
7864 elif template_name == constants.DT_PLAIN:
7865 if len(secondary_nodes) != 0:
7866 raise errors.ProgrammerError("Wrong template configuration")
7868 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7869 for i in range(disk_count)])
7870 for idx, disk in enumerate(disk_info):
7871 disk_index = idx + base_index
7872 vg = disk.get(constants.IDISK_VG, vgname)
7873 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7874 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7875 size=disk[constants.IDISK_SIZE],
7876 logical_id=(vg, names[idx]),
7877 iv_name="disk/%d" % disk_index,
7878 mode=disk[constants.IDISK_MODE])
7879 disks.append(disk_dev)
7880 elif template_name == constants.DT_DRBD8:
7881 if len(secondary_nodes) != 1:
7882 raise errors.ProgrammerError("Wrong template configuration")
7883 remote_node = secondary_nodes[0]
7884 minors = lu.cfg.AllocateDRBDMinor(
7885 [primary_node, remote_node] * len(disk_info), instance_name)
7887 names = []
7888 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7889 for i in range(disk_count)]):
7890 names.append(lv_prefix + "_data")
7891 names.append(lv_prefix + "_meta")
7892 for idx, disk in enumerate(disk_info):
7893 disk_index = idx + base_index
7894 data_vg = disk.get(constants.IDISK_VG, vgname)
7895 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7896 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7897 disk[constants.IDISK_SIZE],
7898 [data_vg, meta_vg],
7899 names[idx * 2:idx * 2 + 2],
7900 "disk/%d" % disk_index,
7901 minors[idx * 2], minors[idx * 2 + 1])
7902 disk_dev.mode = disk[constants.IDISK_MODE]
7903 disks.append(disk_dev)
7904 elif template_name == constants.DT_FILE:
7905 if len(secondary_nodes) != 0:
7906 raise errors.ProgrammerError("Wrong template configuration")
7908 opcodes.RequireFileStorage()
7910 for idx, disk in enumerate(disk_info):
7911 disk_index = idx + base_index
7912 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7913 size=disk[constants.IDISK_SIZE],
7914 iv_name="disk/%d" % disk_index,
7915 logical_id=(file_driver,
7916 "%s/disk%d" % (file_storage_dir,
7917 disk_index)),
7918 mode=disk[constants.IDISK_MODE])
7919 disks.append(disk_dev)
7920 elif template_name == constants.DT_SHARED_FILE:
7921 if len(secondary_nodes) != 0:
7922 raise errors.ProgrammerError("Wrong template configuration")
7924 opcodes.RequireSharedFileStorage()
7926 for idx, disk in enumerate(disk_info):
7927 disk_index = idx + base_index
7928 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7929 size=disk[constants.IDISK_SIZE],
7930 iv_name="disk/%d" % disk_index,
7931 logical_id=(file_driver,
7932 "%s/disk%d" % (file_storage_dir,
7933 disk_index)),
7934 mode=disk[constants.IDISK_MODE])
7935 disks.append(disk_dev)
7936 elif template_name == constants.DT_BLOCK:
7937 if len(secondary_nodes) != 0:
7938 raise errors.ProgrammerError("Wrong template configuration")
7940 for idx, disk in enumerate(disk_info):
7941 disk_index = idx + base_index
7942 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7943 size=disk[constants.IDISK_SIZE],
7944 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7945 disk[constants.IDISK_ADOPT]),
7946 iv_name="disk/%d" % disk_index,
7947 mode=disk[constants.IDISK_MODE])
7948 disks.append(disk_dev)
7950 else:
7951 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7953 return disks
7955 def _GetInstanceInfoText(instance):
7956 """Compute that text that should be added to the disk's metadata.
7959 return "originstname+%s" % instance.name
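# Example (illustrative): for an instance named "instance1.example.com"
# (hypothetical name) the text attached to its disks would be
# "originstname+instance1.example.com".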
7962 def _CalcEta(time_taken, written, total_size):
7963 """Calculates the ETA based on size written and total size.
7965 @param time_taken: The time taken so far
7966 @param written: amount written so far
7967 @param total_size: The total size of data to be written
7968 @return: The remaining time in seconds
7971 avg_time = time_taken / float(written)
7972 return (total_size - written) * avg_time
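# Worked example (illustrative): if 1024 MiB of a 4096 MiB disk were written
# in 30 seconds, avg_time is 30 / 1024.0 seconds per MiB and _CalcEta
# returns (4096 - 1024) * (30 / 1024.0) = 90.0 seconds remaining.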
7975 def _WipeDisks(lu, instance):
7976 """Wipes instance disks.
7978 @type lu: L{LogicalUnit}
7979 @param lu: the logical unit on whose behalf we execute
7980 @type instance: L{objects.Instance}
7981 @param instance: the instance whose disks we should create
7982 @return: the success of the wipe
7985 node = instance.primary_node
7987 for device in instance.disks:
7988 lu.cfg.SetDiskID(device, node)
7990 logging.info("Pause sync of instance %s disks", instance.name)
7991 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
7993 for idx, success in enumerate(result.payload):
7994 if not success:
7995 logging.warn("pause-sync of instance %s for disks %d failed",
7996 instance.name, idx)
7998 try:
7999 for idx, device in enumerate(instance.disks):
8000 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
8001 # MAX_WIPE_CHUNK at max
8002 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
8003 constants.MIN_WIPE_CHUNK_PERCENT)
8004 # we _must_ make this an int, otherwise rounding errors will
8005 # occur
8006 wipe_chunk_size = int(wipe_chunk_size)
8008 lu.LogInfo("* Wiping disk %d", idx)
8009 logging.info("Wiping disk %d for instance %s, node %s using"
8010 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
8012 offset = 0
8013 size = device.size
8014 last_output = 0
8015 start_time = time.time()
8017 while offset < size:
8018 wipe_size = min(wipe_chunk_size, size - offset)
8019 logging.debug("Wiping disk %d, offset %s, chunk %s",
8020 idx, offset, wipe_size)
8021 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
8022 result.Raise("Could not wipe disk %d at offset %d for size %d" %
8023 (idx, offset, wipe_size))
8024 now = time.time()
8025 offset += wipe_size
8026 if now - last_output >= 60:
8027 eta = _CalcEta(now - start_time, offset, size)
8028 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8029 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8030 last_output = now
8031 finally:
8032 logging.info("Resume sync of instance %s disks", instance.name)
8034 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8036 for idx, success in enumerate(result.payload):
8037 if not success:
8038 lu.LogWarning("Resume sync of disk %d failed, please have a"
8039 " look at the status and troubleshoot the issue", idx)
8040 logging.warn("resume-sync of instance %s for disks %d failed",
8041 instance.name, idx)
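# Chunk-size example (illustrative; assumes constants.MAX_WIPE_CHUNK is
# 1024 MiB and constants.MIN_WIPE_CHUNK_PERCENT is 10, values defined in
# constants.py and not shown here): a 4096 MiB disk is wiped in chunks of
# int(min(1024, 4096 / 100.0 * 10)) = 409 MiB, while a 20480 MiB disk is
# capped at 1024 MiB per chunk.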
8044 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8045 """Create all disks for an instance.
8047 This abstracts away some work from AddInstance.
8049 @type lu: L{LogicalUnit}
8050 @param lu: the logical unit on whose behalf we execute
8051 @type instance: L{objects.Instance}
8052 @param instance: the instance whose disks we should create
8054 @param to_skip: list of indices to skip
8055 @type target_node: string
8056 @param target_node: if passed, overrides the target node for creation
8058 @return: the success of the creation
8061 info = _GetInstanceInfoText(instance)
8062 if target_node is None:
8063 pnode = instance.primary_node
8064 all_nodes = instance.all_nodes
8065 else:
8066 pnode = target_node
8067 all_nodes = [pnode]
8069 if instance.disk_template in constants.DTS_FILEBASED:
8070 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8071 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8073 result.Raise("Failed to create directory '%s' on"
8074 " node %s" % (file_storage_dir, pnode))
8076 # Note: this needs to be kept in sync with adding of disks in
8077 # LUInstanceSetParams
8078 for idx, device in enumerate(instance.disks):
8079 if to_skip and idx in to_skip:
8080 continue
8081 logging.info("Creating volume %s for instance %s",
8082 device.iv_name, instance.name)
8084 for node in all_nodes:
8085 f_create = node == pnode
8086 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
8089 def _RemoveDisks(lu, instance, target_node=None):
8090 """Remove all disks for an instance.
8092 This abstracts away some work from `AddInstance()` and
8093 `RemoveInstance()`. Note that in case some of the devices couldn't
8094 be removed, the removal will continue with the other ones (compare
8095 with `_CreateDisks()`).
8097 @type lu: L{LogicalUnit}
8098 @param lu: the logical unit on whose behalf we execute
8099 @type instance: L{objects.Instance}
8100 @param instance: the instance whose disks we should remove
8101 @type target_node: string
8102 @param target_node: used to override the node on which to remove the disks
8104 @return: the success of the removal
8107 logging.info("Removing block devices for instance %s", instance.name)
8109 all_result = True
8110 for device in instance.disks:
8111 if target_node:
8112 edata = [(target_node, device)]
8113 else:
8114 edata = device.ComputeNodeTree(instance.primary_node)
8115 for node, disk in edata:
8116 lu.cfg.SetDiskID(disk, node)
8117 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8118 if msg:
8119 lu.LogWarning("Could not remove block device %s on node %s,"
8120 " continuing anyway: %s", device.iv_name, node, msg)
8121 all_result = False
8123 if instance.disk_template == constants.DT_FILE:
8124 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8125 if target_node:
8126 tgt = target_node
8127 else:
8128 tgt = instance.primary_node
8129 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8130 if result.fail_msg:
8131 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8132 file_storage_dir, instance.primary_node, result.fail_msg)
8133 all_result = False
8135 return all_result
8138 def _ComputeDiskSizePerVG(disk_template, disks):
8139 """Compute disk size requirements in the volume group
8142 def _compute(disks, payload):
8143 """Universal algorithm.
8146 vgs = {}
8147 for disk in disks:
8148 vgs[disk[constants.IDISK_VG]] = \
8149 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8151 return vgs
8153 # Required free disk space as a function of disk and swap space
8154 req_size_dict = {
8155 constants.DT_DISKLESS: {},
8156 constants.DT_PLAIN: _compute(disks, 0),
8157 # 128 MB are added for drbd metadata for each disk
8158 constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
8159 constants.DT_FILE: {},
8160 constants.DT_SHARED_FILE: {},
8161 }
8163 if disk_template not in req_size_dict:
8164 raise errors.ProgrammerError("Disk template '%s' size requirement"
8165 " is unknown" % disk_template)
8167 return req_size_dict[disk_template]
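# Example (illustrative): for two DRBD8 disks, 1024 MiB in VG "xenvg" and
# 512 MiB in VG "fastvg" (hypothetical VG names), this returns
# {"xenvg": 1024 + DRBD_META_SIZE, "fastvg": 512 + DRBD_META_SIZE}, i.e.
# every disk pays for its DRBD metadata in its own volume group.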
8170 def _ComputeDiskSize(disk_template, disks):
8171 """Compute disk size requirements in the volume group
8174 # Required free disk space as a function of disk and swap space
8175 req_size_dict = {
8176 constants.DT_DISKLESS: None,
8177 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8178 # 128 MB are added for drbd metadata for each disk
8179 constants.DT_DRBD8:
8180 sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
8181 constants.DT_FILE: None,
8182 constants.DT_SHARED_FILE: 0,
8183 constants.DT_BLOCK: 0,
8184 }
8186 if disk_template not in req_size_dict:
8187 raise errors.ProgrammerError("Disk template '%s' size requirement"
8188 " is unknown" % disk_template)
8190 return req_size_dict[disk_template]
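# Example (illustrative): the same two DRBD8 disks of 1024 MiB and 512 MiB
# yield a flat requirement of (1024 + DRBD_META_SIZE) +
# (512 + DRBD_META_SIZE) MiB here, whereas DT_PLAIN would only need
# 1024 + 512 MiB.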
8193 def _FilterVmNodes(lu, nodenames):
8194 """Filters out non-vm_capable nodes from a list.
8196 @type lu: L{LogicalUnit}
8197 @param lu: the logical unit for which we check
8198 @type nodenames: list
8199 @param nodenames: the list of nodes on which we should check
8201 @return: the list of vm-capable nodes
8204 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8205 return [name for name in nodenames if name not in vm_nodes]
8208 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8209 """Hypervisor parameter validation.
8211 This function abstract the hypervisor parameter validation to be
8212 used in both instance create and instance modify.
8214 @type lu: L{LogicalUnit}
8215 @param lu: the logical unit for which we check
8216 @type nodenames: list
8217 @param nodenames: the list of nodes on which we should check
8218 @type hvname: string
8219 @param hvname: the name of the hypervisor we should use
8220 @type hvparams: dict
8221 @param hvparams: the parameters which we need to check
8222 @raise errors.OpPrereqError: if the parameters are not valid
8225 nodenames = _FilterVmNodes(lu, nodenames)
8227 cluster = lu.cfg.GetClusterInfo()
8228 hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
8230 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
8231 for node in nodenames:
8232 info = hvinfo[node]
8233 if info.offline:
8234 continue
8235 info.Raise("Hypervisor parameter validation failed on node %s" % node)
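# Typical usage, as in LUInstanceCreate.CheckPrereq further down:
#   _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)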
8238 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8239 """OS parameters validation.
8241 @type lu: L{LogicalUnit}
8242 @param lu: the logical unit for which we check
8243 @type required: boolean
8244 @param required: whether the validation should fail if the OS is not
8245 found
8246 @type nodenames: list
8247 @param nodenames: the list of nodes on which we should check
8248 @type osname: string
8249 @param osname: the name of the OS we should use
8250 @type osparams: dict
8251 @param osparams: the parameters which we need to check
8252 @raise errors.OpPrereqError: if the parameters are not valid
8255 nodenames = _FilterVmNodes(lu, nodenames)
8256 result = lu.rpc.call_os_validate(nodenames, required, osname,
8257 [constants.OS_VALIDATE_PARAMETERS],
8258 osparams)
8259 for node, nres in result.items():
8260 # we don't check for offline cases since this should be run only
8261 # against the master node and/or an instance's nodes
8262 nres.Raise("OS Parameters validation failed on node %s" % node)
8263 if not nres.payload:
8264 lu.LogInfo("OS %s not found on node %s, validation skipped",
8265 osname, node)
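# Typical usage, as in LUInstanceCreate.CheckPrereq further down:
#   _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)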
8268 class LUInstanceCreate(LogicalUnit):
8269 """Create an instance.
8272 HPATH = "instance-add"
8273 HTYPE = constants.HTYPE_INSTANCE
8276 def CheckArguments(self):
8280 # do not require name_check to ease forward/backward compatibility
8282 if self.op.no_install and self.op.start:
8283 self.LogInfo("No-installation mode selected, disabling startup")
8284 self.op.start = False
8285 # validate/normalize the instance name
8286 self.op.instance_name = \
8287 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8289 if self.op.ip_check and not self.op.name_check:
8290 # TODO: make the ip check more flexible and not depend on the name check
8291 raise errors.OpPrereqError("Cannot do IP address check without a name"
8292 " check", errors.ECODE_INVAL)
8294 # check nics' parameter names
8295 for nic in self.op.nics:
8296 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8298 # check disks. parameter names and consistent adopt/no-adopt strategy
8299 has_adopt = has_no_adopt = False
8300 for disk in self.op.disks:
8301 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8302 if constants.IDISK_ADOPT in disk:
8303 has_adopt = True
8304 else:
8305 has_no_adopt = True
8306 if has_adopt and has_no_adopt:
8307 raise errors.OpPrereqError("Either all disks are adopted or none is",
8308 errors.ECODE_INVAL)
8309 if has_adopt:
8310 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8311 raise errors.OpPrereqError("Disk adoption is not supported for the"
8312 " '%s' disk template" %
8313 self.op.disk_template,
8315 if self.op.iallocator is not None:
8316 raise errors.OpPrereqError("Disk adoption not allowed with an"
8317 " iallocator script", errors.ECODE_INVAL)
8318 if self.op.mode == constants.INSTANCE_IMPORT:
8319 raise errors.OpPrereqError("Disk adoption not allowed for"
8320 " instance import", errors.ECODE_INVAL)
8322 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8323 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8324 " but no 'adopt' parameter given" %
8325 self.op.disk_template,
8328 self.adopt_disks = has_adopt
8330 # instance name verification
8331 if self.op.name_check:
8332 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8333 self.op.instance_name = self.hostname1.name
8334 # used in CheckPrereq for ip ping check
8335 self.check_ip = self.hostname1.ip
8336 else:
8337 self.check_ip = None
8339 # file storage checks
8340 if (self.op.file_driver and
8341 not self.op.file_driver in constants.FILE_DRIVER):
8342 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8343 self.op.file_driver, errors.ECODE_INVAL)
8345 if self.op.disk_template == constants.DT_FILE:
8346 opcodes.RequireFileStorage()
8347 elif self.op.disk_template == constants.DT_SHARED_FILE:
8348 opcodes.RequireSharedFileStorage()
8350 ### Node/iallocator related checks
8351 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8353 if self.op.pnode is not None:
8354 if self.op.disk_template in constants.DTS_INT_MIRROR:
8355 if self.op.snode is None:
8356 raise errors.OpPrereqError("The networked disk templates need"
8357 " a mirror node", errors.ECODE_INVAL)
8358 elif self.op.snode:
8359 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8360 " template")
8361 self.op.snode = None
8363 self._cds = _GetClusterDomainSecret()
8365 if self.op.mode == constants.INSTANCE_IMPORT:
8366 # On import force_variant must be True, because if we forced it at
8367 # initial install, our only chance when importing it back is that it
8369 self.op.force_variant = True
8371 if self.op.no_install:
8372 self.LogInfo("No-installation mode has no effect during import")
8374 elif self.op.mode == constants.INSTANCE_CREATE:
8375 if self.op.os_type is None:
8376 raise errors.OpPrereqError("No guest OS specified",
8378 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8379 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8380 " installation" % self.op.os_type,
8382 if self.op.disk_template is None:
8383 raise errors.OpPrereqError("No disk template specified",
8386 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8387 # Check handshake to ensure both clusters have the same domain secret
8388 src_handshake = self.op.source_handshake
8389 if not src_handshake:
8390 raise errors.OpPrereqError("Missing source handshake",
8391 errors.ECODE_INVAL)
8393 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8394 src_handshake)
8395 if errmsg:
8396 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8397 errors.ECODE_INVAL)
8399 # Load and check source CA
8400 self.source_x509_ca_pem = self.op.source_x509_ca
8401 if not self.source_x509_ca_pem:
8402 raise errors.OpPrereqError("Missing source X509 CA",
8403 errors.ECODE_INVAL)
8405 try:
8406 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8407 self._cds)
8408 except OpenSSL.crypto.Error, err:
8409 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8410 (err, ), errors.ECODE_INVAL)
8412 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8413 if errcode is not None:
8414 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8417 self.source_x509_ca = cert
8419 src_instance_name = self.op.source_instance_name
8420 if not src_instance_name:
8421 raise errors.OpPrereqError("Missing source instance name",
8424 self.source_instance_name = \
8425 netutils.GetHostname(name=src_instance_name).name
8427 else:
8428 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8429 self.op.mode, errors.ECODE_INVAL)
8431 def ExpandNames(self):
8432 """ExpandNames for CreateInstance.
8434 Figure out the right locks for instance creation.
8437 self.needed_locks = {}
8439 instance_name = self.op.instance_name
8440 # this is just a preventive check, but someone might still add this
8441 # instance in the meantime, and creation will fail at lock-add time
8442 if instance_name in self.cfg.GetInstanceList():
8443 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8444 instance_name, errors.ECODE_EXISTS)
8446 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8448 if self.op.iallocator:
8449 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8451 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8452 nodelist = [self.op.pnode]
8453 if self.op.snode is not None:
8454 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8455 nodelist.append(self.op.snode)
8456 self.needed_locks[locking.LEVEL_NODE] = nodelist
8458 # in case of import lock the source node too
8459 if self.op.mode == constants.INSTANCE_IMPORT:
8460 src_node = self.op.src_node
8461 src_path = self.op.src_path
8463 if src_path is None:
8464 self.op.src_path = src_path = self.op.instance_name
8466 if src_node is None:
8467 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8468 self.op.src_node = None
8469 if os.path.isabs(src_path):
8470 raise errors.OpPrereqError("Importing an instance from a path"
8471 " requires a source node option",
8474 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8475 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8476 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8477 if not os.path.isabs(src_path):
8478 self.op.src_path = src_path = \
8479 utils.PathJoin(constants.EXPORT_DIR, src_path)
8481 def _RunAllocator(self):
8482 """Run the allocator based on input opcode.
8485 nics = [n.ToDict() for n in self.nics]
8486 ial = IAllocator(self.cfg, self.rpc,
8487 mode=constants.IALLOCATOR_MODE_ALLOC,
8488 name=self.op.instance_name,
8489 disk_template=self.op.disk_template,
8490 tags=self.op.tags,
8491 os=self.op.os_type,
8492 vcpus=self.be_full[constants.BE_VCPUS],
8493 memory=self.be_full[constants.BE_MEMORY],
8494 disks=self.disks,
8495 nics=nics,
8496 hypervisor=self.op.hypervisor,
8497 )
8499 ial.Run(self.op.iallocator)
8501 if not ial.success:
8502 raise errors.OpPrereqError("Can't compute nodes using"
8503 " iallocator '%s': %s" %
8504 (self.op.iallocator, ial.info),
8506 if len(ial.result) != ial.required_nodes:
8507 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8508 " of nodes (%s), required %s" %
8509 (self.op.iallocator, len(ial.result),
8510 ial.required_nodes), errors.ECODE_FAULT)
8511 self.op.pnode = ial.result[0]
8512 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8513 self.op.instance_name, self.op.iallocator,
8514 utils.CommaJoin(ial.result))
8515 if ial.required_nodes == 2:
8516 self.op.snode = ial.result[1]
8518 def BuildHooksEnv(self):
8521 This runs on master, primary and secondary nodes of the instance.
8525 "ADD_MODE": self.op.mode,
8527 if self.op.mode == constants.INSTANCE_IMPORT:
8528 env["SRC_NODE"] = self.op.src_node
8529 env["SRC_PATH"] = self.op.src_path
8530 env["SRC_IMAGES"] = self.src_images
8532 env.update(_BuildInstanceHookEnv(
8533 name=self.op.instance_name,
8534 primary_node=self.op.pnode,
8535 secondary_nodes=self.secondaries,
8536 status=self.op.start,
8537 os_type=self.op.os_type,
8538 memory=self.be_full[constants.BE_MEMORY],
8539 vcpus=self.be_full[constants.BE_VCPUS],
8540 nics=_NICListToTuple(self, self.nics),
8541 disk_template=self.op.disk_template,
8542 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8543 for d in self.disks],
8546 hypervisor_name=self.op.hypervisor,
8548 ))
8550 return env
8552 def BuildHooksNodes(self):
8553 """Build hooks nodes.
8556 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8557 return nl, nl
8559 def _ReadExportInfo(self):
8560 """Reads the export information from disk.
8562 It will override the opcode source node and path with the actual
8563 information, if these two were not specified before.
8565 @return: the export information
8568 assert self.op.mode == constants.INSTANCE_IMPORT
8570 src_node = self.op.src_node
8571 src_path = self.op.src_path
8573 if src_node is None:
8574 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8575 exp_list = self.rpc.call_export_list(locked_nodes)
8576 found = False
8577 for node in exp_list:
8578 if exp_list[node].fail_msg:
8579 continue
8580 if src_path in exp_list[node].payload:
8581 found = True
8582 self.op.src_node = src_node = node
8583 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8584 src_path)
8585 break
8586 if not found:
8587 raise errors.OpPrereqError("No export found for relative path %s" %
8588 src_path, errors.ECODE_INVAL)
8590 _CheckNodeOnline(self, src_node)
8591 result = self.rpc.call_export_info(src_node, src_path)
8592 result.Raise("No export or invalid export found in dir %s" % src_path)
8594 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8595 if not export_info.has_section(constants.INISECT_EXP):
8596 raise errors.ProgrammerError("Corrupted export config",
8597 errors.ECODE_ENVIRON)
8599 ei_version = export_info.get(constants.INISECT_EXP, "version")
8600 if (int(ei_version) != constants.EXPORT_VERSION):
8601 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8602 (ei_version, constants.EXPORT_VERSION),
8603 errors.ECODE_ENVIRON)
8605 return export_info
8606 def _ReadExportParams(self, einfo):
8607 """Use export parameters as defaults.
8609 In case the opcode doesn't specify (as in override) some instance
8610 parameters, then try to use them from the export information, if
8614 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8616 if self.op.disk_template is None:
8617 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8618 self.op.disk_template = einfo.get(constants.INISECT_INS,
8619 "disk_template")
8620 if self.op.disk_template not in constants.DISK_TEMPLATES:
8621 raise errors.OpPrereqError("Disk template specified in configuration"
8622 " file is not one of the allowed values:"
8623 " %s" % " ".join(constants.DISK_TEMPLATES))
8624 else:
8625 raise errors.OpPrereqError("No disk template specified and the export"
8626 " is missing the disk_template information",
8629 if not self.op.disks:
8630 disks = []
8631 # TODO: import the disk iv_name too
8632 for idx in range(constants.MAX_DISKS):
8633 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
8634 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8635 disks.append({constants.IDISK_SIZE: disk_sz})
8636 self.op.disks = disks
8637 if not disks and self.op.disk_template != constants.DT_DISKLESS:
8638 raise errors.OpPrereqError("No disk info specified and the export"
8639 " is missing the disk information",
8642 if not self.op.nics:
8643 nics = []
8644 for idx in range(constants.MAX_NICS):
8645 if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
8646 ndict = {}
8647 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8648 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8649 ndict[name] = v
8650 nics.append(ndict)
8651 else:
8652 break
8653 self.op.nics = nics
8655 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8656 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8658 if (self.op.hypervisor is None and
8659 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8660 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8662 if einfo.has_section(constants.INISECT_HYP):
8663 # use the export parameters but do not override the ones
8664 # specified by the user
8665 for name, value in einfo.items(constants.INISECT_HYP):
8666 if name not in self.op.hvparams:
8667 self.op.hvparams[name] = value
8669 if einfo.has_section(constants.INISECT_BEP):
8670 # use the parameters, without overriding
8671 for name, value in einfo.items(constants.INISECT_BEP):
8672 if name not in self.op.beparams:
8673 self.op.beparams[name] = value
8675 # try to read the parameters old style, from the main section
8676 for name in constants.BES_PARAMETERS:
8677 if (name not in self.op.beparams and
8678 einfo.has_option(constants.INISECT_INS, name)):
8679 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8681 if einfo.has_section(constants.INISECT_OSP):
8682 # use the parameters, without overriding
8683 for name, value in einfo.items(constants.INISECT_OSP):
8684 if name not in self.op.osparams:
8685 self.op.osparams[name] = value
8687 def _RevertToDefaults(self, cluster):
8688 """Revert the instance parameters to the default values.
8692 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8693 for name in self.op.hvparams.keys():
8694 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8695 del self.op.hvparams[name]
8697 be_defs = cluster.SimpleFillBE({})
8698 for name in self.op.beparams.keys():
8699 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8700 del self.op.beparams[name]
8702 nic_defs = cluster.SimpleFillNIC({})
8703 for nic in self.op.nics:
8704 for name in constants.NICS_PARAMETERS:
8705 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8706 del nic[name]
8708 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8709 for name in self.op.osparams.keys():
8710 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8711 del self.op.osparams[name]
8713 def _CalculateFileStorageDir(self):
8714 """Calculate final instance file storage dir.
8717 # file storage dir calculation/check
8718 self.instance_file_storage_dir = None
8719 if self.op.disk_template in constants.DTS_FILEBASED:
8720 # build the full file storage dir path
8721 joinargs = []
8723 if self.op.disk_template == constants.DT_SHARED_FILE:
8724 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8725 else:
8726 get_fsd_fn = self.cfg.GetFileStorageDir
8728 cfg_storagedir = get_fsd_fn()
8729 if not cfg_storagedir:
8730 raise errors.OpPrereqError("Cluster file storage dir not defined")
8731 joinargs.append(cfg_storagedir)
8733 if self.op.file_storage_dir is not None:
8734 joinargs.append(self.op.file_storage_dir)
8736 joinargs.append(self.op.instance_name)
8738 # pylint: disable=W0142
8739 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
8741 def CheckPrereq(self):
8742 """Check prerequisites.
8745 self._CalculateFileStorageDir()
8747 if self.op.mode == constants.INSTANCE_IMPORT:
8748 export_info = self._ReadExportInfo()
8749 self._ReadExportParams(export_info)
8751 if (not self.cfg.GetVGName() and
8752 self.op.disk_template not in constants.DTS_NOT_LVM):
8753 raise errors.OpPrereqError("Cluster does not support lvm-based"
8754 " instances", errors.ECODE_STATE)
8756 if (self.op.hypervisor is None or
8757 self.op.hypervisor == constants.VALUE_AUTO):
8758 self.op.hypervisor = self.cfg.GetHypervisorType()
8760 cluster = self.cfg.GetClusterInfo()
8761 enabled_hvs = cluster.enabled_hypervisors
8762 if self.op.hypervisor not in enabled_hvs:
8763 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8764 " cluster (%s)" % (self.op.hypervisor,
8765 ",".join(enabled_hvs)),
8768 # Check tag validity
8769 for tag in self.op.tags:
8770 objects.TaggableObject.ValidateTag(tag)
8772 # check hypervisor parameter syntax (locally)
8773 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8774 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8775 self.op.hvparams)
8776 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8777 hv_type.CheckParameterSyntax(filled_hvp)
8778 self.hv_full = filled_hvp
8779 # check that we don't specify global parameters on an instance
8780 _CheckGlobalHvParams(self.op.hvparams)
8782 # fill and remember the beparams dict
8783 default_beparams = cluster.beparams[constants.PP_DEFAULT]
8784 for param, value in self.op.beparams.iteritems():
8785 if value == constants.VALUE_AUTO:
8786 self.op.beparams[param] = default_beparams[param]
8787 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8788 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8790 # build os parameters
8791 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8793 # now that hvp/bep are in final format, let's reset to defaults,
8794 # if told to do so
8795 if self.op.identify_defaults:
8796 self._RevertToDefaults(cluster)
8798 # NIC buildup
8799 self.nics = []
8800 for idx, nic in enumerate(self.op.nics):
8801 nic_mode_req = nic.get(constants.INIC_MODE, None)
8802 nic_mode = nic_mode_req
8803 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
8804 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8806 # in routed mode, for the first nic, the default ip is 'auto'
8807 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8808 default_ip_mode = constants.VALUE_AUTO
8810 default_ip_mode = constants.VALUE_NONE
8812 # ip validity checks
8813 ip = nic.get(constants.INIC_IP, default_ip_mode)
8814 if ip is None or ip.lower() == constants.VALUE_NONE:
8815 nic_ip = None
8816 elif ip.lower() == constants.VALUE_AUTO:
8817 if not self.op.name_check:
8818 raise errors.OpPrereqError("IP address set to auto but name checks"
8819 " have been skipped",
8820 errors.ECODE_INVAL)
8821 nic_ip = self.hostname1.ip
8822 else:
8823 if not netutils.IPAddress.IsValid(ip):
8824 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8825 errors.ECODE_INVAL)
8826 nic_ip = ip
8828 # TODO: check the ip address for uniqueness
8829 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8830 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8833 # MAC address verification
8834 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8835 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8836 mac = utils.NormalizeAndValidateMac(mac)
8838 try:
8839 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8840 except errors.ReservationError:
8841 raise errors.OpPrereqError("MAC address %s already in use"
8842 " in cluster" % mac,
8843 errors.ECODE_NOTUNIQUE)
8845 # Build nic parameters
8846 link = nic.get(constants.INIC_LINK, None)
8847 if link == constants.VALUE_AUTO:
8848 link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
8850 nicparams = {}
8851 nicparams[constants.NIC_MODE] = nic_mode
8852 if link:
8853 nicparams[constants.NIC_LINK] = link
8855 check_params = cluster.SimpleFillNIC(nicparams)
8856 objects.NIC.CheckParameterSyntax(check_params)
8857 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8859 # disk checks/pre-build
8860 default_vg = self.cfg.GetVGName()
8861 self.disks = []
8862 for disk in self.op.disks:
8863 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8864 if mode not in constants.DISK_ACCESS_SET:
8865 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8866 mode, errors.ECODE_INVAL)
8867 size = disk.get(constants.IDISK_SIZE, None)
8868 if size is None:
8869 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8870 try:
8871 size = int(size)
8872 except (TypeError, ValueError):
8873 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8874 errors.ECODE_INVAL)
8876 data_vg = disk.get(constants.IDISK_VG, default_vg)
8877 new_disk = {
8878 constants.IDISK_SIZE: size,
8879 constants.IDISK_MODE: mode,
8880 constants.IDISK_VG: data_vg,
8881 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8882 }
8883 if constants.IDISK_ADOPT in disk:
8884 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8885 self.disks.append(new_disk)
8887 if self.op.mode == constants.INSTANCE_IMPORT:
8888 disk_images = []
8889 for idx in range(len(self.disks)):
8890 option = "disk%d_dump" % idx
8891 if export_info.has_option(constants.INISECT_INS, option):
8892 # FIXME: are the old os-es, disk sizes, etc. useful?
8893 export_name = export_info.get(constants.INISECT_INS, option)
8894 image = utils.PathJoin(self.op.src_path, export_name)
8895 disk_images.append(image)
8896 else:
8897 disk_images.append(False)
8899 self.src_images = disk_images
8901 old_name = export_info.get(constants.INISECT_INS, "name")
8902 if self.op.instance_name == old_name:
8903 for idx, nic in enumerate(self.nics):
8904 if nic.mac == constants.VALUE_AUTO:
8905 nic_mac_ini = "nic%d_mac" % idx
8906 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8908 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8910 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8911 if self.op.ip_check:
8912 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8913 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8914 (self.check_ip, self.op.instance_name),
8915 errors.ECODE_NOTUNIQUE)
8917 #### mac address generation
8918 # By generating here the mac address both the allocator and the hooks get
8919 # the real final mac address rather than the 'auto' or 'generate' value.
8920 # There is a race condition between the generation and the instance object
8921 # creation, which means that we know the mac is valid now, but we're not
8922 # sure it will be when we actually add the instance. If things go bad
8923 # adding the instance will abort because of a duplicate mac, and the
8924 # creation job will fail.
8925 for nic in self.nics:
8926 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8927 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8931 if self.op.iallocator is not None:
8932 self._RunAllocator()
8934 #### node related checks
8936 # check primary node
8937 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8938 assert self.pnode is not None, \
8939 "Cannot retrieve locked node %s" % self.op.pnode
8941 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8942 pnode.name, errors.ECODE_STATE)
8944 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8945 pnode.name, errors.ECODE_STATE)
8946 if not pnode.vm_capable:
8947 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8948 " '%s'" % pnode.name, errors.ECODE_STATE)
8950 self.secondaries = []
8952 # mirror node verification
8953 if self.op.disk_template in constants.DTS_INT_MIRROR:
8954 if self.op.snode == pnode.name:
8955 raise errors.OpPrereqError("The secondary node cannot be the"
8956 " primary node", errors.ECODE_INVAL)
8957 _CheckNodeOnline(self, self.op.snode)
8958 _CheckNodeNotDrained(self, self.op.snode)
8959 _CheckNodeVmCapable(self, self.op.snode)
8960 self.secondaries.append(self.op.snode)
8962 nodenames = [pnode.name] + self.secondaries
8964 if not self.adopt_disks:
8965 # Check lv size requirements, if not adopting
8966 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8967 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8969 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8970 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8971 disk[constants.IDISK_ADOPT])
8972 for disk in self.disks])
8973 if len(all_lvs) != len(self.disks):
8974 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8976 for lv_name in all_lvs:
8977 try:
8978 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
8979 # to ReserveLV uses the same syntax
8980 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8981 except errors.ReservationError:
8982 raise errors.OpPrereqError("LV named %s used by another instance" %
8983 lv_name, errors.ECODE_NOTUNIQUE)
8985 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8986 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8988 node_lvs = self.rpc.call_lv_list([pnode.name],
8989 vg_names.payload.keys())[pnode.name]
8990 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
8991 node_lvs = node_lvs.payload
8993 delta = all_lvs.difference(node_lvs.keys())
8994 if delta:
8995 raise errors.OpPrereqError("Missing logical volume(s): %s" %
8996 utils.CommaJoin(delta),
8997 errors.ECODE_INVAL)
8998 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
8999 if online_lvs:
9000 raise errors.OpPrereqError("Online logical volumes found, cannot"
9001 " adopt: %s" % utils.CommaJoin(online_lvs),
9002 errors.ECODE_STATE)
9003 # update the size of disk based on what is found
9004 for dsk in self.disks:
9005 dsk[constants.IDISK_SIZE] = \
9006 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
9007 dsk[constants.IDISK_ADOPT])][0]))
9009 elif self.op.disk_template == constants.DT_BLOCK:
9010 # Normalize and de-duplicate device paths
9011 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
9012 for disk in self.disks])
9013 if len(all_disks) != len(self.disks):
9014 raise errors.OpPrereqError("Duplicate disk names given for adoption",
9016 baddisks = [d for d in all_disks
9017 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
9018 if baddisks:
9019 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
9020 " cannot be adopted" %
9021 (", ".join(baddisks),
9022 constants.ADOPTABLE_BLOCKDEV_ROOT),
9025 node_disks = self.rpc.call_bdev_sizes([pnode.name],
9026 list(all_disks))[pnode.name]
9027 node_disks.Raise("Cannot get block device information from node %s" %
9028 pnode.name)
9029 node_disks = node_disks.payload
9030 delta = all_disks.difference(node_disks.keys())
9031 if delta:
9032 raise errors.OpPrereqError("Missing block device(s): %s" %
9033 utils.CommaJoin(delta),
9034 errors.ECODE_INVAL)
9035 for dsk in self.disks:
9036 dsk[constants.IDISK_SIZE] = \
9037 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9039 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9041 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9042 # check OS parameters (remotely)
9043 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9045 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9047 # memory check on primary node
9048 if self.op.start:
9049 _CheckNodeFreeMemory(self, self.pnode.name,
9050 "creating instance %s" % self.op.instance_name,
9051 self.be_full[constants.BE_MEMORY],
9052 self.op.hypervisor)
9054 self.dry_run_result = list(nodenames)
9056 def Exec(self, feedback_fn):
9057 """Create and add the instance to the cluster.
9060 instance = self.op.instance_name
9061 pnode_name = self.pnode.name
9063 ht_kind = self.op.hypervisor
9064 if ht_kind in constants.HTS_REQ_PORT:
9065 network_port = self.cfg.AllocatePort()
9066 else:
9067 network_port = None
9069 disks = _GenerateDiskTemplate(self,
9070 self.op.disk_template,
9071 instance, pnode_name,
9072 self.secondaries,
9073 self.disks,
9074 self.instance_file_storage_dir,
9075 self.op.file_driver,
9076 0,
9077 feedback_fn)
9079 iobj = objects.Instance(name=instance, os=self.op.os_type,
9080 primary_node=pnode_name,
9081 nics=self.nics, disks=disks,
9082 disk_template=self.op.disk_template,
9083 admin_up=False,
9084 network_port=network_port,
9085 beparams=self.op.beparams,
9086 hvparams=self.op.hvparams,
9087 hypervisor=self.op.hypervisor,
9088 osparams=self.op.osparams,
9089 )
9091 if self.op.tags:
9092 for tag in self.op.tags:
9093 iobj.AddTag(tag)
9095 if self.adopt_disks:
9096 if self.op.disk_template == constants.DT_PLAIN:
9097 # rename LVs to the newly-generated names; we need to construct
9098 # 'fake' LV disks with the old data, plus the new unique_id
9099 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9100 rename_to = []
9101 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9102 rename_to.append(t_dsk.logical_id)
9103 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9104 self.cfg.SetDiskID(t_dsk, pnode_name)
9105 result = self.rpc.call_blockdev_rename(pnode_name,
9106 zip(tmp_disks, rename_to))
9107 result.Raise("Failed to rename adopted LVs")
9108 else:
9109 feedback_fn("* creating instance disks...")
9110 try:
9111 _CreateDisks(self, iobj)
9112 except errors.OpExecError:
9113 self.LogWarning("Device creation failed, reverting...")
9114 try:
9115 _RemoveDisks(self, iobj)
9116 finally:
9117 self.cfg.ReleaseDRBDMinors(instance)
9118 raise
9120 feedback_fn("adding instance %s to cluster config" % instance)
9122 self.cfg.AddInstance(iobj, self.proc.GetECId())
9124 # Declare that we don't want to remove the instance lock anymore, as we've
9125 # added the instance to the config
9126 del self.remove_locks[locking.LEVEL_INSTANCE]
9128 if self.op.mode == constants.INSTANCE_IMPORT:
9129 # Release unused nodes
9130 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9131 else:
9132 # Release all nodes
9133 _ReleaseLocks(self, locking.LEVEL_NODE)
9135 disk_abort = False
9136 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9137 feedback_fn("* wiping instance disks...")
9138 try:
9139 _WipeDisks(self, iobj)
9140 except errors.OpExecError, err:
9141 logging.exception("Wiping disks failed")
9142 self.LogWarning("Wiping instance disks failed (%s)", err)
9143 disk_abort = True
9145 if disk_abort:
9146 # Something is already wrong with the disks, don't do anything else
9147 pass
9148 elif self.op.wait_for_sync:
9149 disk_abort = not _WaitForSync(self, iobj)
9150 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9151 # make sure the disks are not degraded (still sync-ing is ok)
9152 feedback_fn("* checking mirrors status")
9153 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9154 else:
9155 disk_abort = False
9157 if disk_abort:
9158 _RemoveDisks(self, iobj)
9159 self.cfg.RemoveInstance(iobj.name)
9160 # Make sure the instance lock gets removed
9161 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9162 raise errors.OpExecError("There are some degraded disks for"
9163 " this instance")
9165 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9166 if self.op.mode == constants.INSTANCE_CREATE:
9167 if not self.op.no_install:
9168 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9169 not self.op.wait_for_sync)
9170 if pause_sync:
9171 feedback_fn("* pausing disk sync to install instance OS")
9172 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9173 iobj.disks, True)
9174 for idx, success in enumerate(result.payload):
9175 if not success:
9176 logging.warn("pause-sync of instance %s for disk %d failed",
9177 instance, idx)
9179 feedback_fn("* running the instance OS create scripts...")
9180 # FIXME: pass debug option from opcode to backend
9181 os_add_result = \
9182 self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
9183 self.op.debug_level)
9184 if pause_sync:
9185 feedback_fn("* resuming disk sync")
9186 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9187 iobj.disks, False)
9188 for idx, success in enumerate(result.payload):
9189 if not success:
9190 logging.warn("resume-sync of instance %s for disk %d failed",
9191 instance, idx)
9193 os_add_result.Raise("Could not add os for instance %s"
9194 " on node %s" % (instance, pnode_name))
9196 elif self.op.mode == constants.INSTANCE_IMPORT:
9197 feedback_fn("* running the instance OS import scripts...")
9199 transfers = []
9201 for idx, image in enumerate(self.src_images):
9202 if not image:
9203 continue
9205 # FIXME: pass debug option from opcode to backend
9206 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9207 constants.IEIO_FILE, (image, ),
9208 constants.IEIO_SCRIPT,
9209 (iobj.disks[idx], idx),
9210 None)
9211 transfers.append(dt)
9213 import_result = \
9214 masterd.instance.TransferInstanceData(self, feedback_fn,
9215 self.op.src_node, pnode_name,
9216 self.pnode.secondary_ip,
9217 iobj, transfers)
9218 if not compat.all(import_result):
9219 self.LogWarning("Some disks for instance %s on node %s were not"
9220 " imported successfully" % (instance, pnode_name))
9222 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9223 feedback_fn("* preparing remote import...")
9224 # The source cluster will stop the instance before attempting to make a
9225 # connection. In some cases stopping an instance can take a long time,
9226 # hence the shutdown timeout is added to the connection timeout.
9227 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9228 self.op.source_shutdown_timeout)
9229 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9231 assert iobj.primary_node == self.pnode.name
9232 disk_results = \
9233 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9234 self.source_x509_ca,
9235 self._cds, timeouts)
9236 if not compat.all(disk_results):
9237 # TODO: Should the instance still be started, even if some disks
9238 # failed to import (valid for local imports, too)?
9239 self.LogWarning("Some disks for instance %s on node %s were not"
9240 " imported successfully" % (instance, pnode_name))
9242 # Run rename script on newly imported instance
9243 assert iobj.name == instance
9244 feedback_fn("Running rename script for %s" % instance)
9245 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9246 self.source_instance_name,
9247 self.op.debug_level)
9248 if result.fail_msg:
9249 self.LogWarning("Failed to run rename script for %s on node"
9250 " %s: %s" % (instance, pnode_name, result.fail_msg))
9252 else:
9253 # also checked in the prereq part
9254 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9255 % self.op.mode)
9257 if self.op.start:
9258 iobj.admin_up = True
9259 self.cfg.Update(iobj, feedback_fn)
9260 logging.info("Starting instance %s on node %s", instance, pnode_name)
9261 feedback_fn("* starting instance...")
9262 result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
9263 False)
9264 result.Raise("Could not start instance")
9266 return list(iobj.all_nodes)
9269 class LUInstanceConsole(NoHooksLU):
9270 """Connect to an instance's console.
9272 This is somewhat special in that it returns the command line that
9273 you need to run on the master node in order to connect to the
9274 console.
9279 def ExpandNames(self):
9280 self._ExpandAndLockInstance()
9282 def CheckPrereq(self):
9283 """Check prerequisites.
9285 This checks that the instance is in the cluster.
9288 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9289 assert self.instance is not None, \
9290 "Cannot retrieve locked instance %s" % self.op.instance_name
9291 _CheckNodeOnline(self, self.instance.primary_node)
9293 def Exec(self, feedback_fn):
9294 """Connect to the console of an instance
9297 instance = self.instance
9298 node = instance.primary_node
9300 node_insts = self.rpc.call_instance_list([node],
9301 [instance.hypervisor])[node]
9302 node_insts.Raise("Can't get node information from %s" % node)
9304 if instance.name not in node_insts.payload:
9305 if instance.admin_up:
9306 state = constants.INSTST_ERRORDOWN
9308 state = constants.INSTST_ADMINDOWN
9309 raise errors.OpExecError("Instance %s is not running (state %s)" %
9310 (instance.name, state))
9312 logging.debug("Connecting to console of %s on %s", instance.name, node)
9314 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9317 def _GetInstanceConsole(cluster, instance):
9318 """Returns console information for an instance.
9320 @type cluster: L{objects.Cluster}
9321 @type instance: L{objects.Instance}
9325 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9326 # beparams and hvparams are passed separately, to avoid editing the
9327 # instance and then saving the defaults in the instance itself.
9328 hvparams = cluster.FillHV(instance)
9329 beparams = cluster.FillBE(instance)
9330 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9332 assert console.instance == instance.name
9333 assert console.Validate()
9335 return console.ToDict()
9338 class LUInstanceReplaceDisks(LogicalUnit):
9339 """Replace the disks of an instance.
9342 HPATH = "mirrors-replace"
9343 HTYPE = constants.HTYPE_INSTANCE
9346 def CheckArguments(self):
9347 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9350 def ExpandNames(self):
9351 self._ExpandAndLockInstance()
9353 assert locking.LEVEL_NODE not in self.needed_locks
9354 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9356 assert self.op.iallocator is None or self.op.remote_node is None, \
9357 "Conflicting options"
9359 if self.op.remote_node is not None:
9360 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9362 # Warning: do not remove the locking of the new secondary here
9363 # unless DRBD8.AddChildren is changed to work in parallel;
9364 # currently it doesn't since parallel invocations of
9365 # FindUnusedMinor will conflict
9366 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9367 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9369 self.needed_locks[locking.LEVEL_NODE] = []
9370 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9372 if self.op.iallocator is not None:
9373 # iallocator will select a new node in the same group
9374 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9376 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9377 self.op.iallocator, self.op.remote_node,
9378 self.op.disks, False, self.op.early_release)
9380 self.tasklets = [self.replacer]
9382 def DeclareLocks(self, level):
9383 if level == locking.LEVEL_NODEGROUP:
9384 assert self.op.remote_node is None
9385 assert self.op.iallocator is not None
9386 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9388 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9389 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9390 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9392 elif level == locking.LEVEL_NODE:
9393 if self.op.iallocator is not None:
9394 assert self.op.remote_node is None
9395 assert not self.needed_locks[locking.LEVEL_NODE]
9397 # Lock member nodes of all locked groups
9398 self.needed_locks[locking.LEVEL_NODE] = [node_name
9399 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9400 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9402 self._LockInstancesNodes()
9404 def BuildHooksEnv(self):
9407 This runs on the master, the primary and all the secondaries.
9410 instance = self.replacer.instance
9412 "MODE": self.op.mode,
9413 "NEW_SECONDARY": self.op.remote_node,
9414 "OLD_SECONDARY": instance.secondary_nodes[0],
9416 env.update(_BuildInstanceHookEnvByObject(self, instance))
9419 def BuildHooksNodes(self):
9420 """Build hooks nodes.
9423 instance = self.replacer.instance
9425 self.cfg.GetMasterNode(),
9426 instance.primary_node,
9428 if self.op.remote_node is not None:
9429 nl.append(self.op.remote_node)
9432 def CheckPrereq(self):
9433 """Check prerequisites.
9436 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9437 self.op.iallocator is None)
9439 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9441 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9443 return LogicalUnit.CheckPrereq(self)
9446 class TLReplaceDisks(Tasklet):
9447 """Replaces disks for an instance.
9449 Note: Locking is not within the scope of this class.
9452 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9453 disks, delay_iallocator, early_release):
9454 """Initializes this class.
9457 Tasklet.__init__(self, lu)
9460 self.instance_name = instance_name
9462 self.iallocator_name = iallocator_name
9463 self.remote_node = remote_node
9465 self.delay_iallocator = delay_iallocator
9466 self.early_release = early_release
9469 self.instance = None
9470 self.new_node = None
9471 self.target_node = None
9472 self.other_node = None
9473 self.remote_node_info = None
9474 self.node_secondary_ip = None
9477 def CheckArguments(mode, remote_node, iallocator):
9478 """Helper function for users of this class.
9481 # check for valid parameter combination
9482 if mode == constants.REPLACE_DISK_CHG:
9483 if remote_node is None and iallocator is None:
9484 raise errors.OpPrereqError("When changing the secondary either an"
9485 " iallocator script must be used or the"
9486 " new node given", errors.ECODE_INVAL)
9488 if remote_node is not None and iallocator is not None:
9489 raise errors.OpPrereqError("Give either the iallocator or the new"
9490 " secondary, not both", errors.ECODE_INVAL)
9492 elif remote_node is not None or iallocator is not None:
9493 # Not replacing the secondary
9494 raise errors.OpPrereqError("The iallocator and new node options can"
9495 " only be used when changing the"
9496 " secondary node", errors.ECODE_INVAL)
9499 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9500 """Compute a new secondary node using an IAllocator.
9503 ial = IAllocator(lu.cfg, lu.rpc,
9504 mode=constants.IALLOCATOR_MODE_RELOC,
9506 relocate_from=list(relocate_from))
9508 ial.Run(iallocator_name)
9511 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9512 " %s" % (iallocator_name, ial.info),
9515 if len(ial.result) != ial.required_nodes:
9516 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9517 " of nodes (%s), required %s" %
9519 len(ial.result), ial.required_nodes),
9522 remote_node_name = ial.result[0]
9524 lu.LogInfo("Selected new secondary for instance '%s': %s",
9525 instance_name, remote_node_name)
9527 return remote_node_name
9529 def _FindFaultyDisks(self, node_name):
9530 """Wrapper for L{_FindFaultyInstanceDisks}.
9533 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9536 def _CheckDisksActivated(self, instance):
9537 """Checks if the instance disks are activated.
9539 @param instance: The instance to check disks
9540 @return: True if they are activated, False otherwise
9543 nodes = instance.all_nodes
9545 for idx, dev in enumerate(instance.disks):
9547 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9548 self.cfg.SetDiskID(dev, node)
9550 result = self.rpc.call_blockdev_find(node, dev)
9554 elif result.fail_msg or not result.payload:
9559 def CheckPrereq(self):
9560 """Check prerequisites.
9562 This checks that the instance is in the cluster.
9565 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9566 assert instance is not None, \
9567 "Cannot retrieve locked instance %s" % self.instance_name
9569 if instance.disk_template != constants.DT_DRBD8:
9570 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9571 " instances", errors.ECODE_INVAL)
9573 if len(instance.secondary_nodes) != 1:
9574 raise errors.OpPrereqError("The instance has a strange layout,"
9575 " expected one secondary but found %d" %
9576 len(instance.secondary_nodes),
9579 if not self.delay_iallocator:
9580 self._CheckPrereq2()
9582 def _CheckPrereq2(self):
9583 """Check prerequisites, second part.
9585 This function should always be part of CheckPrereq. It was separated and is
9586 now called from Exec because during node evacuation iallocator was only
9587 called with an unmodified cluster model, not taking planned changes into
9591 instance = self.instance
9592 secondary_node = instance.secondary_nodes[0]
9594 if self.iallocator_name is None:
9595 remote_node = self.remote_node
9597 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9598 instance.name, instance.secondary_nodes)
9600 if remote_node is None:
9601 self.remote_node_info = None
9603 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9604 "Remote node '%s' is not locked" % remote_node
9606 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9607 assert self.remote_node_info is not None, \
9608 "Cannot retrieve locked node %s" % remote_node
9610 if remote_node == self.instance.primary_node:
9611 raise errors.OpPrereqError("The specified node is the primary node of"
9612 " the instance", errors.ECODE_INVAL)
9614 if remote_node == secondary_node:
9615 raise errors.OpPrereqError("The specified node is already the"
9616 " secondary node of the instance",
9619 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9620 constants.REPLACE_DISK_CHG):
9621 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9624 if self.mode == constants.REPLACE_DISK_AUTO:
9625 if not self._CheckDisksActivated(instance):
9626 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9627 " first" % self.instance_name,
9629 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9630 faulty_secondary = self._FindFaultyDisks(secondary_node)
9632 if faulty_primary and faulty_secondary:
9633 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9634 " one node and can not be repaired"
9635 " automatically" % self.instance_name,
9639 self.disks = faulty_primary
9640 self.target_node = instance.primary_node
9641 self.other_node = secondary_node
9642 check_nodes = [self.target_node, self.other_node]
9643 elif faulty_secondary:
9644 self.disks = faulty_secondary
9645 self.target_node = secondary_node
9646 self.other_node = instance.primary_node
9647 check_nodes = [self.target_node, self.other_node]
9653 # Non-automatic modes
9654 if self.mode == constants.REPLACE_DISK_PRI:
9655 self.target_node = instance.primary_node
9656 self.other_node = secondary_node
9657 check_nodes = [self.target_node, self.other_node]
9659 elif self.mode == constants.REPLACE_DISK_SEC:
9660 self.target_node = secondary_node
9661 self.other_node = instance.primary_node
9662 check_nodes = [self.target_node, self.other_node]
9664 elif self.mode == constants.REPLACE_DISK_CHG:
9665 self.new_node = remote_node
9666 self.other_node = instance.primary_node
9667 self.target_node = secondary_node
9668 check_nodes = [self.new_node, self.other_node]
9670 _CheckNodeNotDrained(self.lu, remote_node)
9671 _CheckNodeVmCapable(self.lu, remote_node)
9673 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9674 assert old_node_info is not None
9675 if old_node_info.offline and not self.early_release:
9676 # doesn't make sense to delay the release
9677 self.early_release = True
9678 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9679 " early-release mode", secondary_node)
9682 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9685 # If not specified, all disks should be replaced
9687 self.disks = range(len(self.instance.disks))
9689 for node in check_nodes:
9690 _CheckNodeOnline(self.lu, node)
9692 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9695 if node_name is not None)
9697 # Release unneeded node locks
9698 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9700 # Release any owned node group
9701 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9702 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9704 # Check whether disks are valid
9705 for disk_idx in self.disks:
9706 instance.FindDisk(disk_idx)
9708 # Get secondary node IP addresses
9709 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9710 in self.cfg.GetMultiNodeInfo(touched_nodes))
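# node_secondary_ip maps node name -> secondary IP for every touched node; it
# is used both to verify the node locks held in Exec and as the address list
# for the DRBD network RPCs (disconnect/attach) when changing the secondary.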
9712 def Exec(self, feedback_fn):
9713 """Execute disk replacement.
9715 This dispatches the disk replacement to the appropriate handler.
9718 if self.delay_iallocator:
9719 self._CheckPrereq2()
9722 # Verify owned locks before starting operation
9723 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9724 assert set(owned_nodes) == set(self.node_secondary_ip), \
9725 ("Incorrect node locks, owning %s, expected %s" %
9726 (owned_nodes, self.node_secondary_ip.keys()))
9728 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9729 assert list(owned_instances) == [self.instance_name], \
9730 "Instance '%s' not locked" % self.instance_name
9732 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9733 "Should not own any node group lock at this point"
9736 feedback_fn("No disks need replacement")
9739 feedback_fn("Replacing disk(s) %s for %s" %
9740 (utils.CommaJoin(self.disks), self.instance.name))
9742 activate_disks = (not self.instance.admin_up)
9744 # Activate the instance disks if we're replacing them on a down instance
9746 _StartInstanceDisks(self.lu, self.instance, True)
9749 # Should we replace the secondary node?
9750 if self.new_node is not None:
9751 fn = self._ExecDrbd8Secondary
9753 fn = self._ExecDrbd8DiskOnly
9755 result = fn(feedback_fn)
9757 # Deactivate the instance disks if we're replacing them on a
9760 _SafeShutdownInstanceDisks(self.lu, self.instance)
9763 # Verify owned locks
9764 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9765 nodes = frozenset(self.node_secondary_ip)
9766 assert ((self.early_release and not owned_nodes) or
9767 (not self.early_release and not (set(owned_nodes) - nodes))), \
9768 ("Not owning the correct locks, early_release=%s, owned=%r,"
9769 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9773 def _CheckVolumeGroup(self, nodes):
9774 self.lu.LogInfo("Checking volume groups")
9776 vgname = self.cfg.GetVGName()
9778 # Make sure volume group exists on all involved nodes
9779 results = self.rpc.call_vg_list(nodes)
9781 raise errors.OpExecError("Can't list volume groups on the nodes")
9785 res.Raise("Error checking node %s" % node)
9786 if vgname not in res.payload:
9787 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9790 def _CheckDisksExistence(self, nodes):
9791 # Check disk existence
9792 for idx, dev in enumerate(self.instance.disks):
9793 if idx not in self.disks:
9797 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9798 self.cfg.SetDiskID(dev, node)
9800 result = self.rpc.call_blockdev_find(node, dev)
9802 msg = result.fail_msg
9803 if msg or not result.payload:
9805 msg = "disk not found"
9806 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9809 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9810 for idx, dev in enumerate(self.instance.disks):
9811 if idx not in self.disks:
9814 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9817 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9819 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9820 " replace disks for instance %s" %
9821 (node_name, self.instance.name))
9823 def _CreateNewStorage(self, node_name):
9824 """Create new storage on the primary or secondary node.
9826 This is only used for same-node replaces, not for changing the
9827 secondary node, hence we don't want to modify the existing disk.
9832 for idx, dev in enumerate(self.instance.disks):
9833 if idx not in self.disks:
9836 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9838 self.cfg.SetDiskID(dev, node_name)
9840 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9841 names = _GenerateUniqueNames(self.lu, lv_names)
9843 vg_data = dev.children[0].logical_id[0]
9844 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9845 logical_id=(vg_data, names[0]))
9846 vg_meta = dev.children[1].logical_id[0]
9847 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
9848 logical_id=(vg_meta, names[1]))
9850 new_lvs = [lv_data, lv_meta]
9851 old_lvs = [child.Copy() for child in dev.children]
9852 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
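# iv_names maps the DRBD device's iv_name (e.g. "disk/0") to a tuple of
# (drbd disk object, old LV children, newly created LVs); the subsequent
# detach/rename/attach and cleanup steps all iterate over this mapping.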
9854 # we pass force_create=True to force the LVM creation
9855 for new_lv in new_lvs:
9856 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9857 _GetInstanceInfoText(self.instance), False)
9861 def _CheckDevices(self, node_name, iv_names):
9862 for name, (dev, _, _) in iv_names.iteritems():
9863 self.cfg.SetDiskID(dev, node_name)
9865 result = self.rpc.call_blockdev_find(node_name, dev)
9867 msg = result.fail_msg
9868 if msg or not result.payload:
9870 msg = "disk not found"
9871 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9874 if result.payload.is_degraded:
9875 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9877 def _RemoveOldStorage(self, node_name, iv_names):
9878 for name, (_, old_lvs, _) in iv_names.iteritems():
9879 self.lu.LogInfo("Remove logical volumes for %s" % name)
9882 self.cfg.SetDiskID(lv, node_name)
9884 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9886 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9887 hint="remove unused LVs manually")
9889 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9890 """Replace a disk on the primary or secondary for DRBD 8.
9892 The algorithm for replace is quite complicated:
9894 1. for each disk to be replaced:
9896 1. create new LVs on the target node with unique names
9897 1. detach old LVs from the drbd device
9898 1. rename old LVs to name_replaced.<time_t>
9899 1. rename new LVs to old LVs
9900 1. attach the new LVs (with the old names now) to the drbd device
9902 1. wait for sync across all devices
9904 1. for each modified disk:
9906 1. remove old LVs (which have the name name_replaces.<time_t>)
9908 Failures are not very well handled.
9913 # Step: check device activation
9914 self.lu.LogStep(1, steps_total, "Check device existence")
9915 self._CheckDisksExistence([self.other_node, self.target_node])
9916 self._CheckVolumeGroup([self.target_node, self.other_node])
9918 # Step: check other node consistency
9919 self.lu.LogStep(2, steps_total, "Check peer consistency")
9920 self._CheckDisksConsistency(self.other_node,
9921 self.other_node == self.instance.primary_node,
9924 # Step: create new storage
9925 self.lu.LogStep(3, steps_total, "Allocate new storage")
9926 iv_names = self._CreateNewStorage(self.target_node)
9928 # Step: for each lv, detach+rename*2+attach
9929 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9930 for dev, old_lvs, new_lvs in iv_names.itervalues():
9931 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9933 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9935 result.Raise("Can't detach drbd from local storage on node"
9936 " %s for device %s" % (self.target_node, dev.iv_name))
9938 #cfg.Update(instance)
9940 # ok, we created the new LVs, so now we know we have the needed
9941 # storage; as such, we proceed on the target node to rename
9942 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9943 # using the assumption that logical_id == physical_id (which in
9944 # turn is the unique_id on that node)
9946 # FIXME(iustin): use a better name for the replaced LVs
9947 temp_suffix = int(time.time())
9948 ren_fn = lambda d, suff: (d.physical_id[0],
9949 d.physical_id[1] + "_replaced-%s" % suff)
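# Illustrative example (LV names are only assumed here): an LV whose
# physical_id is ("xenvg", "<uuid>.disk0_data") would be renamed to
# ("xenvg", "<uuid>.disk0_data_replaced-1400000000") -- only the LV name
# changes, the volume group stays the same.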
9951 # Build the rename list based on what LVs exist on the node
9952 rename_old_to_new = []
9953 for to_ren in old_lvs:
9954 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9955 if not result.fail_msg and result.payload:
9957 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9959 self.lu.LogInfo("Renaming the old LVs on the target node")
9960 result = self.rpc.call_blockdev_rename(self.target_node,
9962 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9964 # Now we rename the new LVs to the old LVs
9965 self.lu.LogInfo("Renaming the new LVs on the target node")
9966 rename_new_to_old = [(new, old.physical_id)
9967 for old, new in zip(old_lvs, new_lvs)]
9968 result = self.rpc.call_blockdev_rename(self.target_node,
9970 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9972 # Intermediate steps of in memory modifications
9973 for old, new in zip(old_lvs, new_lvs):
9974 new.logical_id = old.logical_id
9975 self.cfg.SetDiskID(new, self.target_node)
9977 # We need to modify old_lvs so that removal later removes the
9978 # right LVs, not the newly added ones; note that old_lvs is a
9980 for disk in old_lvs:
9981 disk.logical_id = ren_fn(disk, temp_suffix)
9982 self.cfg.SetDiskID(disk, self.target_node)
9984 # Now that the new lvs have the old name, we can add them to the device
9985 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9986 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9988 msg = result.fail_msg
9990 for new_lv in new_lvs:
9991 msg2 = self.rpc.call_blockdev_remove(self.target_node,
9994 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
9995 hint=("cleanup manually the unused logical"
9997 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
10000 if self.early_release:
10001 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10003 self._RemoveOldStorage(self.target_node, iv_names)
10004 # WARNING: we release both node locks here, do not do other RPCs
10005 # than WaitForSync to the primary node
10006 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10007 names=[self.target_node, self.other_node])
10010 # This can fail as the old devices are degraded and _WaitForSync
10011 # only reports a combined result over all disks, so we don't check its return value
10012 self.lu.LogStep(cstep, steps_total, "Sync devices")
10014 _WaitForSync(self.lu, self.instance)
10016 # Check all devices manually
10017 self._CheckDevices(self.instance.primary_node, iv_names)
10019 # Step: remove old storage
10020 if not self.early_release:
10021 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10023 self._RemoveOldStorage(self.target_node, iv_names)
10025 def _ExecDrbd8Secondary(self, feedback_fn):
10026 """Replace the secondary node for DRBD 8.
10028 The algorithm for replace is quite complicated:
10029 - for all disks of the instance:
10030 - create new LVs on the new node with same names
10031 - shutdown the drbd device on the old secondary
10032 - disconnect the drbd network on the primary
10033 - create the drbd device on the new secondary
10034 - network attach the drbd on the primary, using an artifice:
10035 the drbd code for Attach() will connect to the network if it
10036 finds a device which is connected to the good local disks but
10037 not network enabled
10038 - wait for sync across all devices
10039 - remove all disks from the old secondary
10041 Failures are not very well handled.
10046 pnode = self.instance.primary_node
10048 # Step: check device activation
10049 self.lu.LogStep(1, steps_total, "Check device existence")
10050 self._CheckDisksExistence([self.instance.primary_node])
10051 self._CheckVolumeGroup([self.instance.primary_node])
10053 # Step: check other node consistency
10054 self.lu.LogStep(2, steps_total, "Check peer consistency")
10055 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10057 # Step: create new storage
10058 self.lu.LogStep(3, steps_total, "Allocate new storage")
10059 for idx, dev in enumerate(self.instance.disks):
10060 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10061 (self.new_node, idx))
10062 # we pass force_create=True to force LVM creation
10063 for new_lv in dev.children:
10064 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10065 _GetInstanceInfoText(self.instance), False)
10067 # Step 4: drbd minors and drbd setup changes
10068 # after this, we must manually remove the drbd minors on both the
10069 # error and the success paths
10070 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10071 minors = self.cfg.AllocateDRBDMinor([self.new_node
10072 for dev in self.instance.disks],
10073 self.instance.name)
10074 logging.debug("Allocated minors %r", minors)
10077 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10078 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10079 (self.new_node, idx))
10080 # create new devices on new_node; note that we create two IDs:
10081 # one without port, so the drbd will be activated without
10082 # networking information on the new node at this stage, and one
10083 # with network, for the latter activation in step 4
10084 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10085 if self.instance.primary_node == o_node1:
10088 assert self.instance.primary_node == o_node2, "Three-node instance?"
10091 new_alone_id = (self.instance.primary_node, self.new_node, None,
10092 p_minor, new_minor, o_secret)
10093 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10094 p_minor, new_minor, o_secret)
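# DRBD8 logical_id layout: (node_A, node_B, port, minor_A, minor_B, secret).
# new_alone_id deliberately carries no port, so the device comes up on the
# new node without networking; new_net_id keeps the original port and is what
# gets stored in the configuration for the later re-attach on the primary.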
10096 iv_names[idx] = (dev, dev.children, new_net_id)
10097 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10099 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10100 logical_id=new_alone_id,
10101 children=dev.children,
10104 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10105 _GetInstanceInfoText(self.instance), False)
10106 except errors.GenericError:
10107 self.cfg.ReleaseDRBDMinors(self.instance.name)
10110 # We have new devices, shutdown the drbd on the old secondary
10111 for idx, dev in enumerate(self.instance.disks):
10112 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10113 self.cfg.SetDiskID(dev, self.target_node)
10114 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10116 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10117 "node: %s" % (idx, msg),
10118 hint=("Please cleanup this device manually as"
10119 " soon as possible"))
10121 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10122 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10123 self.instance.disks)[pnode]
10125 msg = result.fail_msg
10127 # detaches didn't succeed (unlikely)
10128 self.cfg.ReleaseDRBDMinors(self.instance.name)
10129 raise errors.OpExecError("Can't detach the disks from the network on"
10130 " old node: %s" % (msg,))
10132 # if we managed to detach at least one, we update all the disks of
10133 # the instance to point to the new secondary
10134 self.lu.LogInfo("Updating instance configuration")
10135 for dev, _, new_logical_id in iv_names.itervalues():
10136 dev.logical_id = new_logical_id
10137 self.cfg.SetDiskID(dev, self.instance.primary_node)
10139 self.cfg.Update(self.instance, feedback_fn)
10141 # and now perform the drbd attach
10142 self.lu.LogInfo("Attaching primary drbds to new secondary"
10143 " (standalone => connected)")
10144 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10146 self.node_secondary_ip,
10147 self.instance.disks,
10148 self.instance.name,
10150 for to_node, to_result in result.items():
10151 msg = to_result.fail_msg
10153 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10155 hint=("please do a gnt-instance info to see the"
10156 " status of disks"))
10158 if self.early_release:
10159 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10161 self._RemoveOldStorage(self.target_node, iv_names)
10162 # WARNING: we release all node locks here, do not do other RPCs
10163 # than WaitForSync to the primary node
10164 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10165 names=[self.instance.primary_node,
10170 # This can fail as the old devices are degraded and _WaitForSync
10171 # only reports a combined result over all disks, so we don't check its return value
10172 self.lu.LogStep(cstep, steps_total, "Sync devices")
10174 _WaitForSync(self.lu, self.instance)
10176 # Check all devices manually
10177 self._CheckDevices(self.instance.primary_node, iv_names)
10179 # Step: remove old storage
10180 if not self.early_release:
10181 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10182 self._RemoveOldStorage(self.target_node, iv_names)
10185 class LURepairNodeStorage(NoHooksLU):
10186 """Repairs the volume group on a node.
10191 def CheckArguments(self):
10192 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10194 storage_type = self.op.storage_type
10196 if (constants.SO_FIX_CONSISTENCY not in
10197 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10198 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10199 " repaired" % storage_type,
10200 errors.ECODE_INVAL)
10202 def ExpandNames(self):
10203 self.needed_locks = {
10204 locking.LEVEL_NODE: [self.op.node_name],
10207 def _CheckFaultyDisks(self, instance, node_name):
10208 """Ensure faulty disks abort the opcode or at least warn."""
10210 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10212 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10213 " node '%s'" % (instance.name, node_name),
10214 errors.ECODE_STATE)
10215 except errors.OpPrereqError, err:
10216 if self.op.ignore_consistency:
10217 self.proc.LogWarning(str(err.args[0]))
10221 def CheckPrereq(self):
10222 """Check prerequisites.
10225 # Check whether any instance on this node has faulty disks
10226 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10227 if not inst.admin_up:
10229 check_nodes = set(inst.all_nodes)
10230 check_nodes.discard(self.op.node_name)
10231 for inst_node_name in check_nodes:
10232 self._CheckFaultyDisks(inst, inst_node_name)
10234 def Exec(self, feedback_fn):
10235 feedback_fn("Repairing storage unit '%s' on %s ..." %
10236 (self.op.name, self.op.node_name))
10238 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10239 result = self.rpc.call_storage_execute(self.op.node_name,
10240 self.op.storage_type, st_args,
10242 constants.SO_FIX_CONSISTENCY)
10243 result.Raise("Failed to repair storage unit '%s' on %s" %
10244 (self.op.name, self.op.node_name))
10247 class LUNodeEvacuate(NoHooksLU):
10248 """Evacuates instances off a list of nodes.
10253 def CheckArguments(self):
10254 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10256 def ExpandNames(self):
10257 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10259 if self.op.remote_node is not None:
10260 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10261 assert self.op.remote_node
10263 if self.op.remote_node == self.op.node_name:
10264 raise errors.OpPrereqError("Can not use evacuated node as a new"
10265 " secondary node", errors.ECODE_INVAL)
10267 if self.op.mode != constants.IALLOCATOR_NEVAC_SEC:
10268 raise errors.OpPrereqError("Without the use of an iallocator only"
10269 " secondary instances can be evacuated",
10270 errors.ECODE_INVAL)
10273 self.share_locks = _ShareAll()
10274 self.needed_locks = {
10275 locking.LEVEL_INSTANCE: [],
10276 locking.LEVEL_NODEGROUP: [],
10277 locking.LEVEL_NODE: [],
10280 if self.op.remote_node is None:
10281 # Iallocator will choose any node(s) in the same group
10282 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10284 group_nodes = frozenset([self.op.remote_node])
10286 # Determine nodes to be locked
10287 self.lock_nodes = set([self.op.node_name]) | group_nodes
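# lock_nodes thus contains the evacuated node plus either the single chosen
# remote node or all nodes of the evacuated node's group(s), since an
# iallocator may pick any node in the same group as the new secondary.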
10289 def _DetermineInstances(self):
10290 """Builds list of instances to operate on.
10293 assert self.op.mode in constants.IALLOCATOR_NEVAC_MODES
10295 if self.op.mode == constants.IALLOCATOR_NEVAC_PRI:
10296 # Primary instances only
10297 inst_fn = _GetNodePrimaryInstances
10298 assert self.op.remote_node is None, \
10299 "Evacuating primary instances requires iallocator"
10300 elif self.op.mode == constants.IALLOCATOR_NEVAC_SEC:
10301 # Secondary instances only
10302 inst_fn = _GetNodeSecondaryInstances
10305 assert self.op.mode == constants.IALLOCATOR_NEVAC_ALL
10306 inst_fn = _GetNodeInstances
10308 return inst_fn(self.cfg, self.op.node_name)
10310 def DeclareLocks(self, level):
10311 if level == locking.LEVEL_INSTANCE:
10312 # Lock instances optimistically, needs verification once node and group
10313 # locks have been acquired
10314 self.needed_locks[locking.LEVEL_INSTANCE] = \
10315 set(i.name for i in self._DetermineInstances())
10317 elif level == locking.LEVEL_NODEGROUP:
10318 # Lock node groups optimistically, needs verification once nodes have
10320 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10321 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10323 elif level == locking.LEVEL_NODE:
10324 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10326 def CheckPrereq(self):
10328 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10329 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10330 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10332 assert owned_nodes == self.lock_nodes
10334 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10335 if owned_groups != wanted_groups:
10336 raise errors.OpExecError("Node groups changed since locks were acquired,"
10337 " current groups are '%s', used to be '%s'" %
10338 (utils.CommaJoin(wanted_groups),
10339 utils.CommaJoin(owned_groups)))
10341 # Determine affected instances
10342 self.instances = self._DetermineInstances()
10343 self.instance_names = [i.name for i in self.instances]
10345 if set(self.instance_names) != owned_instances:
10346 raise errors.OpExecError("Instances on node '%s' changed since locks"
10347 " were acquired, current instances are '%s',"
10348 " used to be '%s'" %
10349 (self.op.node_name,
10350 utils.CommaJoin(self.instance_names),
10351 utils.CommaJoin(owned_instances)))
10353 if self.instance_names:
10354 self.LogInfo("Evacuating instances from node '%s': %s",
10356 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10358 self.LogInfo("No instances to evacuate from node '%s'",
10361 if self.op.remote_node is not None:
10362 for i in self.instances:
10363 if i.primary_node == self.op.remote_node:
10364 raise errors.OpPrereqError("Node %s is the primary node of"
10365 " instance %s, cannot use it as"
10367 (self.op.remote_node, i.name),
10368 errors.ECODE_INVAL)
10370 def Exec(self, feedback_fn):
10371 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10373 if not self.instance_names:
10374 # No instances to evacuate
10377 elif self.op.iallocator is not None:
10378 # TODO: Implement relocation to other group
10379 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10380 evac_mode=self.op.mode,
10381 instances=list(self.instance_names))
10383 ial.Run(self.op.iallocator)
10385 if not ial.success:
10386 raise errors.OpPrereqError("Can't compute node evacuation using"
10387 " iallocator '%s': %s" %
10388 (self.op.iallocator, ial.info),
10389 errors.ECODE_NORES)
10391 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10393 elif self.op.remote_node is not None:
10394 assert self.op.mode == constants.IALLOCATOR_NEVAC_SEC
10396 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10397 remote_node=self.op.remote_node,
10399 mode=constants.REPLACE_DISK_CHG,
10400 early_release=self.op.early_release)]
10401 for instance_name in self.instance_names
10405 raise errors.ProgrammerError("No iallocator or remote node")
10407 return ResultWithJobs(jobs)
10410 def _SetOpEarlyRelease(early_release, op):
10411 """Sets C{early_release} flag on opcodes if available.
10415 op.early_release = early_release
10416 except AttributeError:
10417 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10422 def _NodeEvacDest(use_nodes, group, nodes):
10423 """Returns group or nodes depending on caller's choice.
10427 return utils.CommaJoin(nodes)
10432 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10433 """Unpacks the result of change-group and node-evacuate iallocator requests.
10435 Used for the iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10436 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10438 @type lu: L{LogicalUnit}
10439 @param lu: Logical unit instance
10440 @type alloc_result: tuple/list
10441 @param alloc_result: Result from iallocator
10442 @type early_release: bool
10443 @param early_release: Whether to release locks early if possible
10444 @type use_nodes: bool
10445 @param use_nodes: Whether to display node names instead of groups
10448 (moved, failed, jobs) = alloc_result
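# alloc_result layout: moved is a list of (instance, target group, target
# nodes) tuples, failed a list of (instance, reason) tuples, and jobs a list
# of job definitions, each a list of serialized opcodes that are loaded via
# OpCode.LoadOpCode below.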
10451 lu.LogWarning("Unable to evacuate instances %s",
10452 utils.CommaJoin("%s (%s)" % (name, reason)
10453 for (name, reason) in failed))
10456 lu.LogInfo("Instances to be moved: %s",
10457 utils.CommaJoin("%s (to %s)" %
10458 (name, _NodeEvacDest(use_nodes, group, nodes))
10459 for (name, group, nodes) in moved))
10461 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10462 map(opcodes.OpCode.LoadOpCode, ops))
10466 class LUInstanceGrowDisk(LogicalUnit):
10467 """Grow a disk of an instance.
10470 HPATH = "disk-grow"
10471 HTYPE = constants.HTYPE_INSTANCE
10474 def ExpandNames(self):
10475 self._ExpandAndLockInstance()
10476 self.needed_locks[locking.LEVEL_NODE] = []
10477 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10479 def DeclareLocks(self, level):
10480 if level == locking.LEVEL_NODE:
10481 self._LockInstancesNodes()
10483 def BuildHooksEnv(self):
10484 """Build hooks env.
10486 This runs on the master, the primary and all the secondaries.
10490 "DISK": self.op.disk,
10491 "AMOUNT": self.op.amount,
10493 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10496 def BuildHooksNodes(self):
10497 """Build hooks nodes.
10500 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10503 def CheckPrereq(self):
10504 """Check prerequisites.
10506 This checks that the instance is in the cluster.
10509 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10510 assert instance is not None, \
10511 "Cannot retrieve locked instance %s" % self.op.instance_name
10512 nodenames = list(instance.all_nodes)
10513 for node in nodenames:
10514 _CheckNodeOnline(self, node)
10516 self.instance = instance
10518 if instance.disk_template not in constants.DTS_GROWABLE:
10519 raise errors.OpPrereqError("Instance's disk layout does not support"
10520 " growing", errors.ECODE_INVAL)
10522 self.disk = instance.FindDisk(self.op.disk)
10524 if instance.disk_template not in (constants.DT_FILE,
10525 constants.DT_SHARED_FILE):
10526 # TODO: check the free disk space for file, when that feature will be
10528 _CheckNodesFreeDiskPerVG(self, nodenames,
10529 self.disk.ComputeGrowth(self.op.amount))
10531 def Exec(self, feedback_fn):
10532 """Execute disk grow.
10535 instance = self.instance
10538 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10540 raise errors.OpExecError("Cannot activate block device to grow")
10542 # First run all grow ops in dry-run mode
10543 for node in instance.all_nodes:
10544 self.cfg.SetDiskID(disk, node)
10545 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10546 result.Raise("Grow request failed to node %s" % node)
10548 # We know that (as far as we can test) operations across different
10549 # nodes will succeed, time to run it for real
10550 for node in instance.all_nodes:
10551 self.cfg.SetDiskID(disk, node)
10552 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10553 result.Raise("Grow request failed to node %s" % node)
10555 # TODO: Rewrite code to work properly
10556 # DRBD goes into sync mode for a short amount of time after executing the
10557 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10558 # calling "resize" in sync mode fails. Sleeping for a short amount of
10559 # time is a work-around.
10562 disk.RecordGrow(self.op.amount)
10563 self.cfg.Update(instance, feedback_fn)
10564 if self.op.wait_for_sync:
10565 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10567 self.proc.LogWarning("Disk sync-ing has not returned a good"
10568 " status; please check the instance")
10569 if not instance.admin_up:
10570 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10571 elif not instance.admin_up:
10572 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10573 " not supposed to be running because no wait for"
10574 " sync mode was requested")
10577 class LUInstanceQueryData(NoHooksLU):
10578 """Query runtime instance data.
10583 def ExpandNames(self):
10584 self.needed_locks = {}
10586 # Use locking if requested or when non-static information is wanted
10587 if not (self.op.static or self.op.use_locking):
10588 self.LogWarning("Non-static data requested, locks need to be acquired")
10589 self.op.use_locking = True
10591 if self.op.instances or not self.op.use_locking:
10592 # Expand instance names right here
10593 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10595 # Will use acquired locks
10596 self.wanted_names = None
10598 if self.op.use_locking:
10599 self.share_locks = _ShareAll()
10601 if self.wanted_names is None:
10602 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10604 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10606 self.needed_locks[locking.LEVEL_NODE] = []
10607 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10609 def DeclareLocks(self, level):
10610 if self.op.use_locking and level == locking.LEVEL_NODE:
10611 self._LockInstancesNodes()
10613 def CheckPrereq(self):
10614 """Check prerequisites.
10616 This only checks the optional instance list against the existing names.
10619 if self.wanted_names is None:
10620 assert self.op.use_locking, "Locking was not used"
10621 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
10623 self.wanted_instances = \
10624 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
10626 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10627 """Returns the status of a block device
10630 if self.op.static or not node:
10633 self.cfg.SetDiskID(dev, node)
10635 result = self.rpc.call_blockdev_find(node, dev)
10639 result.Raise("Can't compute disk status for %s" % instance_name)
10641 status = result.payload
10645 return (status.dev_path, status.major, status.minor,
10646 status.sync_percent, status.estimated_time,
10647 status.is_degraded, status.ldisk_status)
10649 def _ComputeDiskStatus(self, instance, snode, dev):
10650 """Compute block device status.
10653 if dev.dev_type in constants.LDS_DRBD:
10654 # for DRBD devices, derive the secondary node from the logical_id (otherwise use the snode passed in)
10655 if dev.logical_id[0] == instance.primary_node:
10656 snode = dev.logical_id[1]
10658 snode = dev.logical_id[0]
10660 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10661 instance.name, dev)
10662 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10665 dev_children = map(compat.partial(self._ComputeDiskStatus,
10672 "iv_name": dev.iv_name,
10673 "dev_type": dev.dev_type,
10674 "logical_id": dev.logical_id,
10675 "physical_id": dev.physical_id,
10676 "pstatus": dev_pstatus,
10677 "sstatus": dev_sstatus,
10678 "children": dev_children,
10683 def Exec(self, feedback_fn):
10684 """Gather and return data"""
10687 cluster = self.cfg.GetClusterInfo()
10689 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
10690 for i in self.wanted_instances)
10691 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
10692 if self.op.static or pnode.offline:
10693 remote_state = None
10695 self.LogWarning("Primary node %s is marked offline, returning static"
10696 " information only for instance %s" %
10697 (pnode.name, instance.name))
10699 remote_info = self.rpc.call_instance_info(instance.primary_node,
10701 instance.hypervisor)
10702 remote_info.Raise("Error checking node %s" % instance.primary_node)
10703 remote_info = remote_info.payload
10704 if remote_info and "state" in remote_info:
10705 remote_state = "up"
10707 remote_state = "down"
10709 if instance.admin_up:
10710 config_state = "up"
10712 config_state = "down"
10714 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10717 result[instance.name] = {
10718 "name": instance.name,
10719 "config_state": config_state,
10720 "run_state": remote_state,
10721 "pnode": instance.primary_node,
10722 "snodes": instance.secondary_nodes,
10724 # this happens to be the same format used for hooks
10725 "nics": _NICListToTuple(self, instance.nics),
10726 "disk_template": instance.disk_template,
10728 "hypervisor": instance.hypervisor,
10729 "network_port": instance.network_port,
10730 "hv_instance": instance.hvparams,
10731 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10732 "be_instance": instance.beparams,
10733 "be_actual": cluster.FillBE(instance),
10734 "os_instance": instance.osparams,
10735 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10736 "serial_no": instance.serial_no,
10737 "mtime": instance.mtime,
10738 "ctime": instance.ctime,
10739 "uuid": instance.uuid,
10745 class LUInstanceSetParams(LogicalUnit):
10746 """Modifies an instances's parameters.
10749 HPATH = "instance-modify"
10750 HTYPE = constants.HTYPE_INSTANCE
10753 def CheckArguments(self):
10754 if not (self.op.nics or self.op.disks or self.op.disk_template or
10755 self.op.hvparams or self.op.beparams or self.op.os_name):
10756 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10758 if self.op.hvparams:
10759 _CheckGlobalHvParams(self.op.hvparams)
10763 for disk_op, disk_dict in self.op.disks:
10764 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10765 if disk_op == constants.DDM_REMOVE:
10766 disk_addremove += 1
10768 elif disk_op == constants.DDM_ADD:
10769 disk_addremove += 1
10771 if not isinstance(disk_op, int):
10772 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10773 if not isinstance(disk_dict, dict):
10774 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10775 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10777 if disk_op == constants.DDM_ADD:
10778 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10779 if mode not in constants.DISK_ACCESS_SET:
10780 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10781 errors.ECODE_INVAL)
10782 size = disk_dict.get(constants.IDISK_SIZE, None)
10784 raise errors.OpPrereqError("Required disk parameter size missing",
10785 errors.ECODE_INVAL)
10788 except (TypeError, ValueError), err:
10789 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10790 str(err), errors.ECODE_INVAL)
10791 disk_dict[constants.IDISK_SIZE] = size
10793 # modification of disk
10794 if constants.IDISK_SIZE in disk_dict:
10795 raise errors.OpPrereqError("Disk size change not possible, use"
10796 " grow-disk", errors.ECODE_INVAL)
10798 if disk_addremove > 1:
10799 raise errors.OpPrereqError("Only one disk add or remove operation"
10800 " supported at a time", errors.ECODE_INVAL)
10802 if self.op.disks and self.op.disk_template is not None:
10803 raise errors.OpPrereqError("Disk template conversion and other disk"
10804 " changes not supported at the same time",
10805 errors.ECODE_INVAL)
10807 if (self.op.disk_template and
10808 self.op.disk_template in constants.DTS_INT_MIRROR and
10809 self.op.remote_node is None):
10810 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10811 " one requires specifying a secondary node",
10812 errors.ECODE_INVAL)
10816 for nic_op, nic_dict in self.op.nics:
10817 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10818 if nic_op == constants.DDM_REMOVE:
10821 elif nic_op == constants.DDM_ADD:
10824 if not isinstance(nic_op, int):
10825 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10826 if not isinstance(nic_dict, dict):
10827 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10828 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10830 # nic_dict should be a dict
10831 nic_ip = nic_dict.get(constants.INIC_IP, None)
10832 if nic_ip is not None:
10833 if nic_ip.lower() == constants.VALUE_NONE:
10834 nic_dict[constants.INIC_IP] = None
10836 if not netutils.IPAddress.IsValid(nic_ip):
10837 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10838 errors.ECODE_INVAL)
10840 nic_bridge = nic_dict.get("bridge", None)
10841 nic_link = nic_dict.get(constants.INIC_LINK, None)
10842 if nic_bridge and nic_link:
10843 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10844 " at the same time", errors.ECODE_INVAL)
10845 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10846 nic_dict["bridge"] = None
10847 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10848 nic_dict[constants.INIC_LINK] = None
10850 if nic_op == constants.DDM_ADD:
10851 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10852 if nic_mac is None:
10853 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10855 if constants.INIC_MAC in nic_dict:
10856 nic_mac = nic_dict[constants.INIC_MAC]
10857 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10858 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10860 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10861 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10862 " modifying an existing nic",
10863 errors.ECODE_INVAL)
10865 if nic_addremove > 1:
10866 raise errors.OpPrereqError("Only one NIC add or remove operation"
10867 " supported at a time", errors.ECODE_INVAL)
10869 def ExpandNames(self):
10870 self._ExpandAndLockInstance()
10871 self.needed_locks[locking.LEVEL_NODE] = []
10872 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10874 def DeclareLocks(self, level):
10875 if level == locking.LEVEL_NODE:
10876 self._LockInstancesNodes()
10877 if self.op.disk_template and self.op.remote_node:
10878 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10879 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10881 def BuildHooksEnv(self):
10882 """Build hooks env.
10884 This runs on the master, primary and secondaries.
10888 if constants.BE_MEMORY in self.be_new:
10889 args["memory"] = self.be_new[constants.BE_MEMORY]
10890 if constants.BE_VCPUS in self.be_new:
10891 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10892 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10893 # information at all.
10896 nic_override = dict(self.op.nics)
10897 for idx, nic in enumerate(self.instance.nics):
10898 if idx in nic_override:
10899 this_nic_override = nic_override[idx]
10901 this_nic_override = {}
10902 if constants.INIC_IP in this_nic_override:
10903 ip = this_nic_override[constants.INIC_IP]
10906 if constants.INIC_MAC in this_nic_override:
10907 mac = this_nic_override[constants.INIC_MAC]
10910 if idx in self.nic_pnew:
10911 nicparams = self.nic_pnew[idx]
10913 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10914 mode = nicparams[constants.NIC_MODE]
10915 link = nicparams[constants.NIC_LINK]
10916 args["nics"].append((ip, mac, mode, link))
10917 if constants.DDM_ADD in nic_override:
10918 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10919 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
10920 nicparams = self.nic_pnew[constants.DDM_ADD]
10921 mode = nicparams[constants.NIC_MODE]
10922 link = nicparams[constants.NIC_LINK]
10923 args["nics"].append((ip, mac, mode, link))
10924 elif constants.DDM_REMOVE in nic_override:
10925 del args["nics"][-1]
10927 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10928 if self.op.disk_template:
10929 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10933 def BuildHooksNodes(self):
10934 """Build hooks nodes.
10937 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10940 def CheckPrereq(self):
10941 """Check prerequisites.
10943 This only checks the instance list against the existing names.
10946 # checking the new params on the primary/secondary nodes
10948 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10949 cluster = self.cluster = self.cfg.GetClusterInfo()
10950 assert self.instance is not None, \
10951 "Cannot retrieve locked instance %s" % self.op.instance_name
10952 pnode = instance.primary_node
10953 nodelist = list(instance.all_nodes)
10956 if self.op.os_name and not self.op.force:
10957 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10958 self.op.force_variant)
10959 instance_os = self.op.os_name
10961 instance_os = instance.os
10963 if self.op.disk_template:
10964 if instance.disk_template == self.op.disk_template:
10965 raise errors.OpPrereqError("Instance already has disk template %s" %
10966 instance.disk_template, errors.ECODE_INVAL)
10968 if (instance.disk_template,
10969 self.op.disk_template) not in self._DISK_CONVERSIONS:
10970 raise errors.OpPrereqError("Unsupported disk template conversion from"
10971 " %s to %s" % (instance.disk_template,
10972 self.op.disk_template),
10973 errors.ECODE_INVAL)
10974 _CheckInstanceDown(self, instance, "cannot change disk template")
10975 if self.op.disk_template in constants.DTS_INT_MIRROR:
10976 if self.op.remote_node == pnode:
10977 raise errors.OpPrereqError("Given new secondary node %s is the same"
10978 " as the primary node of the instance" %
10979 self.op.remote_node, errors.ECODE_STATE)
10980 _CheckNodeOnline(self, self.op.remote_node)
10981 _CheckNodeNotDrained(self, self.op.remote_node)
10982 # FIXME: here we assume that the old instance type is DT_PLAIN
10983 assert instance.disk_template == constants.DT_PLAIN
10984 disks = [{constants.IDISK_SIZE: d.size,
10985 constants.IDISK_VG: d.logical_id[0]}
10986 for d in instance.disks]
10987 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10988 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
10990 # hvparams processing
10991 if self.op.hvparams:
10992 hv_type = instance.hypervisor
10993 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
10994 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
10995 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
10998 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
10999 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
11000 self.hv_proposed = self.hv_new = hv_new # the new actual values
11001 self.hv_inst = i_hvdict # the new dict (without defaults)
11003 self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
11005 self.hv_new = self.hv_inst = {}
11007 # beparams processing
11008 if self.op.beparams:
11009 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
11011 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11012 be_new = cluster.SimpleFillBE(i_bedict)
11013 self.be_proposed = self.be_new = be_new # the new actual values
11014 self.be_inst = i_bedict # the new dict (without defaults)
11016 self.be_new = self.be_inst = {}
11017 self.be_proposed = cluster.SimpleFillBE(instance.beparams)
11018 be_old = cluster.FillBE(instance)
11020 # CPU param validation -- checking every time a parameter is
11021 # changed to cover all cases where either CPU mask or vcpus have
11023 if (constants.BE_VCPUS in self.be_proposed and
11024 constants.HV_CPU_MASK in self.hv_proposed):
11026 utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
11027 # Verify mask is consistent with number of vCPUs. Can skip this
11028 # test if only 1 entry in the CPU mask, which means same mask
11029 # is applied to all vCPUs.
11030 if (len(cpu_list) > 1 and
11031 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11032 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11034 (self.be_proposed[constants.BE_VCPUS],
11035 self.hv_proposed[constants.HV_CPU_MASK]),
11036 errors.ECODE_INVAL)
11038 # Only perform this test if a new CPU mask is given
11039 if constants.HV_CPU_MASK in self.hv_new:
11040 # Calculate the largest CPU number requested
11041 max_requested_cpu = max(map(max, cpu_list))
11042 # Check that all of the instance's nodes have enough physical CPUs to
11043 # satisfy the requested CPU mask
11044 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11045 max_requested_cpu + 1, instance.hypervisor)
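# Illustrative example (mask syntax as assumed here): a mask such as
# "0:2-3:5" yields three per-vCPU lists ([0], [2, 3], [5]); with
# BE_VCPUS == 3 the lengths match, max_requested_cpu is 5, and every node
# must therefore expose at least 6 physical CPUs.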
11047 # osparams processing
11048 if self.op.osparams:
11049 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11050 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11051 self.os_inst = i_osdict # the new dict (without defaults)
11057 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
11058 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
11059 mem_check_list = [pnode]
11060 if be_new[constants.BE_AUTO_BALANCE]:
11061 # either we changed auto_balance to yes or it was from before
11062 mem_check_list.extend(instance.secondary_nodes)
11063 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11064 instance.hypervisor)
11065 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11066 instance.hypervisor)
11067 pninfo = nodeinfo[pnode]
11068 msg = pninfo.fail_msg
11070 # Assume the primary node is unreachable and go ahead
11071 self.warn.append("Can't get info from primary node %s: %s" %
11073 elif not isinstance(pninfo.payload.get("memory_free", None), int):
11074 self.warn.append("Node data from primary node %s doesn't contain"
11075 " free memory information" % pnode)
11076 elif instance_info.fail_msg:
11077 self.warn.append("Can't get instance runtime information: %s" %
11078 instance_info.fail_msg)
11080 if instance_info.payload:
11081 current_mem = int(instance_info.payload["memory"])
11082 else:
11083 # Assume instance not running
11084 # (there is a slight race condition here, but it's not very probable,
11085 # and we have no other way to check)
11086 current_mem = 0
11087 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
11088 pninfo.payload["memory_free"])
11089 if miss_mem > 0:
11090 raise errors.OpPrereqError("This change will prevent the instance"
11091 " from starting, due to %d MB of memory"
11092 " missing on its primary node" % miss_mem,
11093 errors.ECODE_NORES)
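# Worked example (made-up numbers): raising BE_MEMORY to 4096 MB while the
# instance currently uses 1024 MB and the primary node reports 2048 MB free
# gives miss_mem = 4096 - 1024 - 2048 = 1024 MB, so the change is refused.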
11095 if be_new[constants.BE_AUTO_BALANCE]:
11096 for node, nres in nodeinfo.items():
11097 if node not in instance.secondary_nodes:
11098 continue
11099 nres.Raise("Can't get info from secondary node %s" % node,
11100 prereq=True, ecode=errors.ECODE_STATE)
11101 if not isinstance(nres.payload.get("memory_free", None), int):
11102 raise errors.OpPrereqError("Secondary node %s didn't return free"
11103 " memory information" % node,
11104 errors.ECODE_STATE)
11105 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11106 raise errors.OpPrereqError("This change will prevent the instance"
11107 " from failover to its secondary node"
11108 " %s, due to not enough memory" % node,
11109 errors.ECODE_STATE)
11113 self.nic_pinst = {}
11114 for nic_op, nic_dict in self.op.nics:
11115 if nic_op == constants.DDM_REMOVE:
11116 if not instance.nics:
11117 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11118 errors.ECODE_INVAL)
11120 if nic_op != constants.DDM_ADD:
11122 if not instance.nics:
11123 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11124 " no NICs" % nic_op,
11125 errors.ECODE_INVAL)
11126 if nic_op < 0 or nic_op >= len(instance.nics):
11127 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11128 " are 0 to %d" %
11129 (nic_op, len(instance.nics) - 1),
11130 errors.ECODE_INVAL)
11131 old_nic_params = instance.nics[nic_op].nicparams
11132 old_nic_ip = instance.nics[nic_op].ip
11133 else:
11134 old_nic_params = {}
11135 old_nic_ip = None
11137 update_params_dict = dict([(key, nic_dict[key])
11138 for key in constants.NICS_PARAMETERS
11139 if key in nic_dict])
11141 if "bridge" in nic_dict:
11142 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
11144 new_nic_params = _GetUpdatedParams(old_nic_params,
11145 update_params_dict)
11146 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11147 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11148 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11149 self.nic_pinst[nic_op] = new_nic_params
11150 self.nic_pnew[nic_op] = new_filled_nic_params
11151 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11153 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11154 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11155 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11157 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11159 self.warn.append(msg)
11161 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11162 if new_nic_mode == constants.NIC_MODE_ROUTED:
11163 if constants.INIC_IP in nic_dict:
11164 nic_ip = nic_dict[constants.INIC_IP]
11165 else:
11166 nic_ip = old_nic_ip
11167 if nic_ip is None:
11168 raise errors.OpPrereqError("Cannot set the nic ip to None"
11169 " on a routed nic", errors.ECODE_INVAL)
11170 if constants.INIC_MAC in nic_dict:
11171 nic_mac = nic_dict[constants.INIC_MAC]
11172 if nic_mac is None:
11173 raise errors.OpPrereqError("Cannot set the nic mac to None",
11174 errors.ECODE_INVAL)
11175 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11176 # otherwise generate the mac
11177 nic_dict[constants.INIC_MAC] = \
11178 self.cfg.GenerateMAC(self.proc.GetECId())
11179 else:
11180 # or validate/reserve the current one
11181 try:
11182 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11183 except errors.ReservationError:
11184 raise errors.OpPrereqError("MAC address %s already in use"
11185 " in cluster" % nic_mac,
11186 errors.ECODE_NOTUNIQUE)
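# Illustration (hypothetical input): a per-NIC modification such as
# {"bridge": "br0", constants.INIC_MAC: constants.VALUE_GENERATE} is folded
# into NIC_LINK="br0" via update_params_dict above and receives a freshly
# generated MAC, while an explicit MAC already used in the cluster is
# rejected with ECODE_NOTUNIQUE.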
11189 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11190 raise errors.OpPrereqError("Disk operations not supported for"
11191 " diskless instances",
11192 errors.ECODE_INVAL)
11193 for disk_op, _ in self.op.disks:
11194 if disk_op == constants.DDM_REMOVE:
11195 if len(instance.disks) == 1:
11196 raise errors.OpPrereqError("Cannot remove the last disk of"
11197 " an instance", errors.ECODE_INVAL)
11198 _CheckInstanceDown(self, instance, "cannot remove disks")
11200 if (disk_op == constants.DDM_ADD and
11201 len(instance.disks) >= constants.MAX_DISKS):
11202 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11203 " add more" % constants.MAX_DISKS,
11204 errors.ECODE_STATE)
11205 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11207 if disk_op < 0 or disk_op >= len(instance.disks):
11208 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11209 " are 0 to %d" %
11210 (disk_op, len(instance.disks)),
11211 errors.ECODE_INVAL)
11215 def _ConvertPlainToDrbd(self, feedback_fn):
11216 """Converts an instance from plain to drbd.
11219 feedback_fn("Converting template to drbd")
11220 instance = self.instance
11221 pnode = instance.primary_node
11222 snode = self.op.remote_node
11224 # create a fake disk info for _GenerateDiskTemplate
11225 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11226 constants.IDISK_VG: d.logical_id[0]}
11227 for d in instance.disks]
11228 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11229 instance.name, pnode, [snode],
11230 disk_info, None, None, 0, feedback_fn)
11231 info = _GetInstanceInfoText(instance)
11232 feedback_fn("Creating additional volumes...")
11233 # first, create the missing data and meta devices
11234 for disk in new_disks:
11235 # unfortunately this is... not too nice
11236 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11237 info, True)
11238 for child in disk.children:
11239 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11240 # at this stage, all new LVs have been created, we can rename the
11241 # old ones
11242 feedback_fn("Renaming original volumes...")
11243 rename_list = [(o, n.children[0].logical_id)
11244 for (o, n) in zip(instance.disks, new_disks)]
11245 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11246 result.Raise("Failed to rename original LVs")
11248 feedback_fn("Initializing DRBD devices...")
11249 # all child devices are in place, we can now create the DRBD devices
11250 for disk in new_disks:
11251 for node in [pnode, snode]:
11252 f_create = node == pnode
11253 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11255 # at this point, the instance has been modified
11256 instance.disk_template = constants.DT_DRBD8
11257 instance.disks = new_disks
11258 self.cfg.Update(instance, feedback_fn)
11260 # disks are created, waiting for sync
11261 disk_abort = not _WaitForSync(self, instance,
11262 oneshot=not self.op.wait_for_sync)
11264 raise errors.OpExecError("There are some degraded disks for"
11265 " this instance, please cleanup manually")
11267 def _ConvertDrbdToPlain(self, feedback_fn):
11268 """Converts an instance from drbd to plain.
11271 instance = self.instance
11272 assert len(instance.secondary_nodes) == 1
11273 pnode = instance.primary_node
11274 snode = instance.secondary_nodes[0]
11275 feedback_fn("Converting template to plain")
11277 old_disks = instance.disks
11278 new_disks = [d.children[0] for d in old_disks]
11280 # copy over size and mode
11281 for parent, child in zip(old_disks, new_disks):
11282 child.size = parent.size
11283 child.mode = parent.mode
11285 # update instance structure
11286 instance.disks = new_disks
11287 instance.disk_template = constants.DT_PLAIN
11288 self.cfg.Update(instance, feedback_fn)
11290 feedback_fn("Removing volumes on the secondary node...")
11291 for disk in old_disks:
11292 self.cfg.SetDiskID(disk, snode)
11293 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11295 self.LogWarning("Could not remove block device %s on node %s,"
11296 " continuing anyway: %s", disk.iv_name, snode, msg)
11298 feedback_fn("Removing unneeded volumes on the primary node...")
11299 for idx, disk in enumerate(old_disks):
11300 meta = disk.children[1]
11301 self.cfg.SetDiskID(meta, pnode)
11302 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11304 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11305 " continuing anyway: %s", idx, pnode, msg)
11307 def Exec(self, feedback_fn):
11308 """Modifies an instance.
11310 All parameters take effect only at the next restart of the instance.
11313 # Process here the warnings from CheckPrereq, as we don't have a
11314 # feedback_fn there.
11315 for warn in self.warn:
11316 feedback_fn("WARNING: %s" % warn)
11318 result = []
11319 instance = self.instance
11321 for disk_op, disk_dict in self.op.disks:
11322 if disk_op == constants.DDM_REMOVE:
11323 # remove the last disk
11324 device = instance.disks.pop()
11325 device_idx = len(instance.disks)
11326 for node, disk in device.ComputeNodeTree(instance.primary_node):
11327 self.cfg.SetDiskID(disk, node)
11328 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11330 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11331 " continuing anyway", device_idx, node, msg)
11332 result.append(("disk/%d" % device_idx, "remove"))
11333 elif disk_op == constants.DDM_ADD:
11335 if instance.disk_template in (constants.DT_FILE,
11336 constants.DT_SHARED_FILE):
11337 file_driver, file_path = instance.disks[0].logical_id
11338 file_path = os.path.dirname(file_path)
11339 else:
11340 file_driver = file_path = None
11341 disk_idx_base = len(instance.disks)
11342 new_disk = _GenerateDiskTemplate(self,
11343 instance.disk_template,
11344 instance.name, instance.primary_node,
11345 instance.secondary_nodes,
11346 [disk_dict],
11347 file_path,
11348 file_driver,
11349 disk_idx_base, feedback_fn)[0]
11350 instance.disks.append(new_disk)
11351 info = _GetInstanceInfoText(instance)
11353 logging.info("Creating volume %s for instance %s",
11354 new_disk.iv_name, instance.name)
11355 # Note: this needs to be kept in sync with _CreateDisks
11357 for node in instance.all_nodes:
11358 f_create = node == instance.primary_node
11359 try:
11360 _CreateBlockDev(self, node, instance, new_disk,
11361 f_create, info, f_create)
11362 except errors.OpExecError, err:
11363 self.LogWarning("Failed to create volume %s (%s) on"
11364 " node %s: %s",
11365 new_disk.iv_name, new_disk, node, err)
11366 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11367 (new_disk.size, new_disk.mode)))
11368 else:
11369 # change a given disk
11370 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11371 result.append(("disk.mode/%d" % disk_op,
11372 disk_dict[constants.IDISK_MODE]))
11374 if self.op.disk_template:
11375 r_shut = _ShutdownInstanceDisks(self, instance)
11376 if not r_shut:
11377 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11378 " proceed with disk template conversion")
11379 mode = (instance.disk_template, self.op.disk_template)
11380 try:
11381 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11382 except:
11383 self.cfg.ReleaseDRBDMinors(instance.name)
11384 raise
11385 result.append(("disk_template", self.op.disk_template))
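# For reference, the change list built here is a flat sequence of
# (field, value) pairs, e.g. (hypothetical) [("disk/1", "add:size=1024,mode=rw"),
# ("disk_template", "drbd")], and is what Exec eventually returns.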
11388 for nic_op, nic_dict in self.op.nics:
11389 if nic_op == constants.DDM_REMOVE:
11390 # remove the last nic
11391 del instance.nics[-1]
11392 result.append(("nic.%d" % len(instance.nics), "remove"))
11393 elif nic_op == constants.DDM_ADD:
11394 # mac and bridge should be set, by now
11395 mac = nic_dict[constants.INIC_MAC]
11396 ip = nic_dict.get(constants.INIC_IP, None)
11397 nicparams = self.nic_pinst[constants.DDM_ADD]
11398 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11399 instance.nics.append(new_nic)
11400 result.append(("nic.%d" % (len(instance.nics) - 1),
11401 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11402 (new_nic.mac, new_nic.ip,
11403 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11404 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11407 for key in (constants.INIC_MAC, constants.INIC_IP):
11408 if key in nic_dict:
11409 setattr(instance.nics[nic_op], key, nic_dict[key])
11410 if nic_op in self.nic_pinst:
11411 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11412 for key, val in nic_dict.iteritems():
11413 result.append(("nic.%s/%d" % (key, nic_op), val))
11416 if self.op.hvparams:
11417 instance.hvparams = self.hv_inst
11418 for key, val in self.op.hvparams.iteritems():
11419 result.append(("hv/%s" % key, val))
11422 if self.op.beparams:
11423 instance.beparams = self.be_inst
11424 for key, val in self.op.beparams.iteritems():
11425 result.append(("be/%s" % key, val))
11428 if self.op.os_name:
11429 instance.os = self.op.os_name
11432 if self.op.osparams:
11433 instance.osparams = self.os_inst
11434 for key, val in self.op.osparams.iteritems():
11435 result.append(("os/%s" % key, val))
11437 self.cfg.Update(instance, feedback_fn)
11439 return result
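# Dispatch table for template conversions: Exec() looks up the
# (current, requested) pair below, so only plain<->drbd8 conversions are
# defined here; other combinations are expected to be rejected earlier,
# during the prerequisite checks.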
11441 _DISK_CONVERSIONS = {
11442 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11443 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
11447 class LUInstanceChangeGroup(LogicalUnit):
11448 HPATH = "instance-change-group"
11449 HTYPE = constants.HTYPE_INSTANCE
11452 def ExpandNames(self):
11453 self.share_locks = _ShareAll()
11454 self.needed_locks = {
11455 locking.LEVEL_NODEGROUP: [],
11456 locking.LEVEL_NODE: [],
11459 self._ExpandAndLockInstance()
11461 if self.op.target_groups:
11462 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11463 self.op.target_groups)
11465 self.req_target_uuids = None
11467 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11469 def DeclareLocks(self, level):
11470 if level == locking.LEVEL_NODEGROUP:
11471 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11473 if self.req_target_uuids:
11474 lock_groups = set(self.req_target_uuids)
11476 # Lock all groups used by instance optimistically; this requires going
11477 # via the node before it's locked, requiring verification later on
11478 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11479 lock_groups.update(instance_groups)
11481 # No target groups, need to lock all of them
11482 lock_groups = locking.ALL_SET
11484 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11486 elif level == locking.LEVEL_NODE:
11487 if self.req_target_uuids:
11488 # Lock all nodes used by instances
11489 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11490 self._LockInstancesNodes()
11492 # Lock all nodes in all potential target groups
11493 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11494 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11495 member_nodes = [node_name
11496 for group in lock_groups
11497 for node_name in self.cfg.GetNodeGroup(group).members]
11498 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11500 # Lock all nodes as all groups are potential targets
11501 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11503 def CheckPrereq(self):
11504 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11505 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11506 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11508 assert (self.req_target_uuids is None or
11509 owned_groups.issuperset(self.req_target_uuids))
11510 assert owned_instances == set([self.op.instance_name])
11512 # Get instance information
11513 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11515 # Check if node groups for locked instance are still correct
11516 assert owned_nodes.issuperset(self.instance.all_nodes), \
11517 ("Instance %s's nodes changed while we kept the lock" %
11518 self.op.instance_name)
11520 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11521 owned_groups)
11523 if self.req_target_uuids:
11524 # User requested specific target groups
11525 self.target_uuids = self.req_target_uuids
11527 # All groups except those used by the instance are potential targets
11528 self.target_uuids = owned_groups - inst_groups
11530 conflicting_groups = self.target_uuids & inst_groups
11531 if conflicting_groups:
11532 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11533 " used by the instance '%s'" %
11534 (utils.CommaJoin(conflicting_groups),
11535 self.op.instance_name),
11536 errors.ECODE_INVAL)
11538 if not self.target_uuids:
11539 raise errors.OpPrereqError("There are no possible target groups",
11540 errors.ECODE_INVAL)
11542 def BuildHooksEnv(self):
11543 """Build hooks env.
11546 assert self.target_uuids
11549 "TARGET_GROUPS": " ".join(self.target_uuids),
11552 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11556 def BuildHooksNodes(self):
11557 """Build hooks nodes.
11560 mn = self.cfg.GetMasterNode()
11561 return ([mn], [mn])
11563 def Exec(self, feedback_fn):
11564 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11566 assert instances == [self.op.instance_name], "Instance not locked"
11568 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11569 instances=instances, target_groups=list(self.target_uuids))
11571 ial.Run(self.op.iallocator)
11573 if not ial.success:
11574 raise errors.OpPrereqError("Can't compute solution for changing group of"
11575 " instance '%s' using iallocator '%s': %s" %
11576 (self.op.instance_name, self.op.iallocator,
11577 ial.info),
11578 errors.ECODE_NORES)
11580 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
11582 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11583 " instance '%s'", len(jobs), self.op.instance_name)
11585 return ResultWithJobs(jobs)
11588 class LUBackupQuery(NoHooksLU):
11589 """Query the exports list
11594 def ExpandNames(self):
11595 self.needed_locks = {}
11596 self.share_locks[locking.LEVEL_NODE] = 1
11597 if not self.op.nodes:
11598 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11600 self.needed_locks[locking.LEVEL_NODE] = \
11601 _GetWantedNodes(self, self.op.nodes)
11603 def Exec(self, feedback_fn):
11604 """Compute the list of all the exported system images.
11607 @return: a dictionary with the structure node->(export-list)
11608 where export-list is a list of the instances exported on
11609 that node.
11612 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11613 rpcresult = self.rpc.call_export_list(self.nodes)
11614 result = {}
11615 for node in rpcresult:
11616 if rpcresult[node].fail_msg:
11617 result[node] = False
11618 else:
11619 result[node] = rpcresult[node].payload
11621 return result
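# Shape of the returned mapping (hypothetical names):
# {"node1.example.com": ["inst1.example.com"], "node2.example.com": False}
# where False flags nodes whose export list could not be retrieved.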
11624 class LUBackupPrepare(NoHooksLU):
11625 """Prepares an instance for an export and returns useful information.
11630 def ExpandNames(self):
11631 self._ExpandAndLockInstance()
11633 def CheckPrereq(self):
11634 """Check prerequisites.
11637 instance_name = self.op.instance_name
11639 self.instance = self.cfg.GetInstanceInfo(instance_name)
11640 assert self.instance is not None, \
11641 "Cannot retrieve locked instance %s" % self.op.instance_name
11642 _CheckNodeOnline(self, self.instance.primary_node)
11644 self._cds = _GetClusterDomainSecret()
11646 def Exec(self, feedback_fn):
11647 """Prepares an instance for an export.
11650 instance = self.instance
11652 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11653 salt = utils.GenerateSecret(8)
11655 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11656 result = self.rpc.call_x509_cert_create(instance.primary_node,
11657 constants.RIE_CERT_VALIDITY)
11658 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11660 (name, cert_pem) = result.payload
11662 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11666 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11667 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
11669 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11675 class LUBackupExport(LogicalUnit):
11676 """Export an instance to an image in the cluster.
11679 HPATH = "instance-export"
11680 HTYPE = constants.HTYPE_INSTANCE
11683 def CheckArguments(self):
11684 """Check the arguments.
11687 self.x509_key_name = self.op.x509_key_name
11688 self.dest_x509_ca_pem = self.op.destination_x509_ca
11690 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11691 if not self.x509_key_name:
11692 raise errors.OpPrereqError("Missing X509 key name for encryption",
11693 errors.ECODE_INVAL)
11695 if not self.dest_x509_ca_pem:
11696 raise errors.OpPrereqError("Missing destination X509 CA",
11697 errors.ECODE_INVAL)
11699 def ExpandNames(self):
11700 self._ExpandAndLockInstance()
11702 # Lock all nodes for local exports
11703 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11704 # FIXME: lock only instance primary and destination node
11706 # Sad but true, for now we have to lock all nodes, as we don't know where
11707 # the previous export might be, and in this LU we search for it and
11708 # remove it from its current node. In the future we could fix this by:
11709 # - making a tasklet to search (share-lock all), then create the
11710 # new one, then one to remove, after
11711 # - removing the removal operation altogether
11712 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11714 def DeclareLocks(self, level):
11715 """Last minute lock declaration."""
11716 # All nodes are locked anyway, so nothing to do here.
11718 def BuildHooksEnv(self):
11719 """Build hooks env.
11721 This will run on the master, primary node and target node.
11725 "EXPORT_MODE": self.op.mode,
11726 "EXPORT_NODE": self.op.target_node,
11727 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11728 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11729 # TODO: Generic function for boolean env variables
11730 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11733 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11737 def BuildHooksNodes(self):
11738 """Build hooks nodes.
11741 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11743 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11744 nl.append(self.op.target_node)
11748 def CheckPrereq(self):
11749 """Check prerequisites.
11751 This checks that the instance and node names are valid.
11754 instance_name = self.op.instance_name
11756 self.instance = self.cfg.GetInstanceInfo(instance_name)
11757 assert self.instance is not None, \
11758 "Cannot retrieve locked instance %s" % self.op.instance_name
11759 _CheckNodeOnline(self, self.instance.primary_node)
11761 if (self.op.remove_instance and self.instance.admin_up and
11762 not self.op.shutdown):
11763 raise errors.OpPrereqError("Can not remove instance without shutting it"
11764 " down before", errors.ECODE_STATE)
11766 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11767 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11768 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11769 assert self.dst_node is not None
11771 _CheckNodeOnline(self, self.dst_node.name)
11772 _CheckNodeNotDrained(self, self.dst_node.name)
11775 self.dest_disk_info = None
11776 self.dest_x509_ca = None
11778 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11779 self.dst_node = None
11781 if len(self.op.target_node) != len(self.instance.disks):
11782 raise errors.OpPrereqError(("Received destination information for %s"
11783 " disks, but instance %s has %s disks") %
11784 (len(self.op.target_node), instance_name,
11785 len(self.instance.disks)),
11786 errors.ECODE_INVAL)
11788 cds = _GetClusterDomainSecret()
11790 # Check X509 key name
11792 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11793 except (TypeError, ValueError), err:
11794 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11796 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11797 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11798 errors.ECODE_INVAL)
11800 # Load and verify CA
11802 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11803 except OpenSSL.crypto.Error, err:
11804 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11805 (err, ), errors.ECODE_INVAL)
11807 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11808 if errcode is not None:
11809 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11810 (msg, ), errors.ECODE_INVAL)
11812 self.dest_x509_ca = cert
11814 # Verify target information
11816 for idx, disk_data in enumerate(self.op.target_node):
11818 (host, port, magic) = \
11819 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11820 except errors.GenericError, err:
11821 raise errors.OpPrereqError("Target info for disk %s: %s" %
11822 (idx, err), errors.ECODE_INVAL)
11824 disk_info.append((host, port, magic))
11826 assert len(disk_info) == len(self.op.target_node)
11827 self.dest_disk_info = disk_info
11830 raise errors.ProgrammerError("Unhandled export mode %r" %
11831 self.op.mode)
11833 # instance disk type verification
11834 # TODO: Implement export support for file-based disks
11835 for disk in self.instance.disks:
11836 if disk.dev_type == constants.LD_FILE:
11837 raise errors.OpPrereqError("Export not supported for instances with"
11838 " file-based disks", errors.ECODE_INVAL)
11840 def _CleanupExports(self, feedback_fn):
11841 """Removes exports of current instance from all other nodes.
11843 If an instance in a cluster with nodes A..D was exported to node C, its
11844 exports will be removed from the nodes A, B and D.
11847 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11849 nodelist = self.cfg.GetNodeList()
11850 nodelist.remove(self.dst_node.name)
11852 # on one-node clusters nodelist will be empty after the removal
11853 # if we proceed the backup would be removed because OpBackupQuery
11854 # substitutes an empty list with the full cluster node list.
11855 iname = self.instance.name
11857 feedback_fn("Removing old exports for instance %s" % iname)
11858 exportlist = self.rpc.call_export_list(nodelist)
11859 for node in exportlist:
11860 if exportlist[node].fail_msg:
11862 if iname in exportlist[node].payload:
11863 msg = self.rpc.call_export_remove(node, iname).fail_msg
11865 self.LogWarning("Could not remove older export for instance %s"
11866 " on node %s: %s", iname, node, msg)
11868 def Exec(self, feedback_fn):
11869 """Export an instance to an image in the cluster.
11872 assert self.op.mode in constants.EXPORT_MODES
11874 instance = self.instance
11875 src_node = instance.primary_node
11877 if self.op.shutdown:
11878 # shutdown the instance, but not the disks
11879 feedback_fn("Shutting down instance %s" % instance.name)
11880 result = self.rpc.call_instance_shutdown(src_node, instance,
11881 self.op.shutdown_timeout)
11882 # TODO: Maybe ignore failures if ignore_remove_failures is set
11883 result.Raise("Could not shutdown instance %s on"
11884 " node %s" % (instance.name, src_node))
11886 # set the disks ID correctly since call_instance_start needs the
11887 # correct drbd minor to create the symlinks
11888 for disk in instance.disks:
11889 self.cfg.SetDiskID(disk, src_node)
11891 activate_disks = (not instance.admin_up)
11894 # Activate the instance disks if we're exporting a stopped instance
11895 feedback_fn("Activating disks for %s" % instance.name)
11896 _StartInstanceDisks(self, instance, None)
11899 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11902 helper.CreateSnapshots()
11904 if (self.op.shutdown and instance.admin_up and
11905 not self.op.remove_instance):
11906 assert not activate_disks
11907 feedback_fn("Starting instance %s" % instance.name)
11908 result = self.rpc.call_instance_start(src_node,
11909 (instance, None, None), False)
11910 msg = result.fail_msg
11912 feedback_fn("Failed to start instance: %s" % msg)
11913 _ShutdownInstanceDisks(self, instance)
11914 raise errors.OpExecError("Could not start instance: %s" % msg)
11916 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11917 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11918 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11919 connect_timeout = constants.RIE_CONNECT_TIMEOUT
11920 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
11922 (key_name, _, _) = self.x509_key_name
11925 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
11928 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
11929 key_name, dest_ca_pem,
11934 # Check for backwards compatibility
11935 assert len(dresults) == len(instance.disks)
11936 assert compat.all(isinstance(i, bool) for i in dresults), \
11937 "Not all results are boolean: %r" % dresults
11941 feedback_fn("Deactivating disks for %s" % instance.name)
11942 _ShutdownInstanceDisks(self, instance)
11944 if not (compat.all(dresults) and fin_resu):
11947 failures.append("export finalization")
11948 if not compat.all(dresults):
11949 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
11951 failures.append("disk export: disk(s) %s" % fdsk)
11953 raise errors.OpExecError("Export failed, errors in %s" %
11954 utils.CommaJoin(failures))
11956 # At this point, the export was successful, we can cleanup/finish
11958 # Remove instance if requested
11959 if self.op.remove_instance:
11960 feedback_fn("Removing instance %s" % instance.name)
11961 _RemoveInstance(self, feedback_fn, instance,
11962 self.op.ignore_remove_failures)
11964 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11965 self._CleanupExports(feedback_fn)
11967 return fin_resu, dresults
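# fin_resu is the overall export finalization status and dresults holds one
# boolean per instance disk, as the assertions above enforce.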
11970 class LUBackupRemove(NoHooksLU):
11971 """Remove exports related to the named instance.
11976 def ExpandNames(self):
11977 self.needed_locks = {}
11978 # We need all nodes to be locked in order for RemoveExport to work, but we
11979 # don't need to lock the instance itself, as nothing will happen to it (and
11980 # we can remove exports also for a removed instance)
11981 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11983 def Exec(self, feedback_fn):
11984 """Remove any export.
11987 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
11988 # If the instance was not found we'll try with the name that was passed in.
11989 # This will only work if it was an FQDN, though.
11991 if not instance_name:
11993 instance_name = self.op.instance_name
11995 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
11996 exportlist = self.rpc.call_export_list(locked_nodes)
11998 for node in exportlist:
11999 msg = exportlist[node].fail_msg
12001 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
12003 if instance_name in exportlist[node].payload:
12005 result = self.rpc.call_export_remove(node, instance_name)
12006 msg = result.fail_msg
12008 logging.error("Could not remove export for instance %s"
12009 " on node %s: %s", instance_name, node, msg)
12011 if fqdn_warn and not found:
12012 feedback_fn("Export not found. If trying to remove an export belonging"
12013 " to a deleted instance please use its Fully Qualified"
12017 class LUGroupAdd(LogicalUnit):
12018 """Logical unit for creating node groups.
12021 HPATH = "group-add"
12022 HTYPE = constants.HTYPE_GROUP
12025 def ExpandNames(self):
12026 # We need the new group's UUID here so that we can create and acquire the
12027 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12028 # that it should not check whether the UUID exists in the configuration.
12029 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12030 self.needed_locks = {}
12031 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12033 def CheckPrereq(self):
12034 """Check prerequisites.
12036 This checks that the given group name is not an existing node group
12041 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12042 except errors.OpPrereqError:
12045 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12046 " node group (UUID: %s)" %
12047 (self.op.group_name, existing_uuid),
12048 errors.ECODE_EXISTS)
12050 if self.op.ndparams:
12051 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12053 def BuildHooksEnv(self):
12054 """Build hooks env.
12058 "GROUP_NAME": self.op.group_name,
12061 def BuildHooksNodes(self):
12062 """Build hooks nodes.
12065 mn = self.cfg.GetMasterNode()
12066 return ([mn], [mn])
12068 def Exec(self, feedback_fn):
12069 """Add the node group to the cluster.
12072 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12073 uuid=self.group_uuid,
12074 alloc_policy=self.op.alloc_policy,
12075 ndparams=self.op.ndparams)
12077 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12078 del self.remove_locks[locking.LEVEL_NODEGROUP]
12081 class LUGroupAssignNodes(NoHooksLU):
12082 """Logical unit for assigning nodes to groups.
12087 def ExpandNames(self):
12088 # These raise errors.OpPrereqError on their own:
12089 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12090 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12092 # We want to lock all the affected nodes and groups. We have readily
12093 # available the list of nodes, and the *destination* group. To gather the
12094 # list of "source" groups, we need to fetch node information later on.
12095 self.needed_locks = {
12096 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12097 locking.LEVEL_NODE: self.op.nodes,
12100 def DeclareLocks(self, level):
12101 if level == locking.LEVEL_NODEGROUP:
12102 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12104 # Try to get all affected nodes' groups without having the group or node
12105 # lock yet. Needs verification later in the code flow.
12106 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12108 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12110 def CheckPrereq(self):
12111 """Check prerequisites.
12114 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12115 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12116 frozenset(self.op.nodes))
12118 expected_locks = (set([self.group_uuid]) |
12119 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12120 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12121 if actual_locks != expected_locks:
12122 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12123 " current groups are '%s', used to be '%s'" %
12124 (utils.CommaJoin(expected_locks),
12125 utils.CommaJoin(actual_locks)))
12127 self.node_data = self.cfg.GetAllNodesInfo()
12128 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12129 instance_data = self.cfg.GetAllInstancesInfo()
12131 if self.group is None:
12132 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12133 (self.op.group_name, self.group_uuid))
12135 (new_splits, previous_splits) = \
12136 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12137 for node in self.op.nodes],
12138 self.node_data, instance_data)
12141 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12143 if not self.op.force:
12144 raise errors.OpExecError("The following instances get split by this"
12145 " change and --force was not given: %s" %
12148 self.LogWarning("This operation will split the following instances: %s",
12151 if previous_splits:
12152 self.LogWarning("In addition, these already-split instances continue"
12153 " to be split across groups: %s",
12154 utils.CommaJoin(utils.NiceSort(previous_splits)))
12156 def Exec(self, feedback_fn):
12157 """Assign nodes to a new group.
12160 for node in self.op.nodes:
12161 self.node_data[node].group = self.group_uuid
12163 # FIXME: Depends on side-effects of modifying the result of
12164 # C{cfg.GetAllNodesInfo}
12166 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
12169 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12170 """Check for split instances after a node assignment.
12172 This method considers a series of node assignments as an atomic operation,
12173 and returns information about split instances after applying the set of
12174 changes.
12176 In particular, it returns information about newly split instances, and
12177 instances that were already split, and remain so after the change.
12179 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12180 considered.
12182 @type changes: list of (node_name, new_group_uuid) pairs.
12183 @param changes: list of node assignments to consider.
12184 @param node_data: a dict with data for all nodes
12185 @param instance_data: a dict with all instances to consider
12186 @rtype: a two-tuple
12187 @return: a list of instances that were previously okay and result split as a
12188 consequence of this change, and a list of instances that were previously
12189 split and this change does not fix.
12192 changed_nodes = dict((node, group) for node, group in changes
12193 if node_data[node].group != group)
12195 all_split_instances = set()
12196 previously_split_instances = set()
12198 def InstanceNodes(instance):
12199 return [instance.primary_node] + list(instance.secondary_nodes)
12201 for inst in instance_data.values():
12202 if inst.disk_template not in constants.DTS_INT_MIRROR:
12203 continue
12205 instance_nodes = InstanceNodes(inst)
12207 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12208 previously_split_instances.add(inst.name)
12210 if len(set(changed_nodes.get(node, node_data[node].group)
12211 for node in instance_nodes)) > 1:
12212 all_split_instances.add(inst.name)
12214 return (list(all_split_instances - previously_split_instances),
12215 list(previously_split_instances & all_split_instances))
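# Illustration (hypothetical cluster): a DRBD instance on nodes n1/n2, both
# currently in group G1, becomes newly split when changes contain ("n2", "G2");
# an instance whose nodes already span two groups and are left untouched ends
# up in the second, "previously split" list instead.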
12218 class _GroupQuery(_QueryBase):
12219 FIELDS = query.GROUP_FIELDS
12221 def ExpandNames(self, lu):
12222 lu.needed_locks = {}
12224 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12225 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12228 self.wanted = [name_to_uuid[name]
12229 for name in utils.NiceSort(name_to_uuid.keys())]
12231 # Accept names to be either names or UUIDs.
12234 all_uuid = frozenset(self._all_groups.keys())
12236 for name in self.names:
12237 if name in all_uuid:
12238 self.wanted.append(name)
12239 elif name in name_to_uuid:
12240 self.wanted.append(name_to_uuid[name])
12242 missing.append(name)
12245 raise errors.OpPrereqError("Some groups do not exist: %s" %
12246 utils.CommaJoin(missing),
12247 errors.ECODE_NOENT)
12249 def DeclareLocks(self, lu, level):
12252 def _GetQueryData(self, lu):
12253 """Computes the list of node groups and their attributes.
12256 do_nodes = query.GQ_NODE in self.requested_data
12257 do_instances = query.GQ_INST in self.requested_data
12259 group_to_nodes = None
12260 group_to_instances = None
12262 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12263 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12264 # latter GetAllInstancesInfo() is not enough, for we have to go through
12265 # instance->node. Hence, we will need to process nodes even if we only need
12266 # instance information.
12267 if do_nodes or do_instances:
12268 all_nodes = lu.cfg.GetAllNodesInfo()
12269 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12270 node_to_group = {}
12272 for node in all_nodes.values():
12273 if node.group in group_to_nodes:
12274 group_to_nodes[node.group].append(node.name)
12275 node_to_group[node.name] = node.group
12277 if do_instances:
12278 all_instances = lu.cfg.GetAllInstancesInfo()
12279 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12281 for instance in all_instances.values():
12282 node = instance.primary_node
12283 if node in node_to_group:
12284 group_to_instances[node_to_group[node]].append(instance.name)
12286 if not do_nodes:
12287 # Do not pass on node information if it was not requested.
12288 group_to_nodes = None
12290 return query.GroupQueryData([self._all_groups[uuid]
12291 for uuid in self.wanted],
12292 group_to_nodes, group_to_instances)
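# When requested, the helper maps have the shape (hypothetical)
# group_to_nodes = {group_uuid: ["node1", "node2"]} and
# group_to_instances = {group_uuid: ["inst1"]}, instances being attributed to
# the group of their primary node; otherwise None is passed instead.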
12295 class LUGroupQuery(NoHooksLU):
12296 """Logical unit for querying node groups.
12301 def CheckArguments(self):
12302 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12303 self.op.output_fields, False)
12305 def ExpandNames(self):
12306 self.gq.ExpandNames(self)
12308 def DeclareLocks(self, level):
12309 self.gq.DeclareLocks(self, level)
12311 def Exec(self, feedback_fn):
12312 return self.gq.OldStyleQuery(self)
12315 class LUGroupSetParams(LogicalUnit):
12316 """Modifies the parameters of a node group.
12319 HPATH = "group-modify"
12320 HTYPE = constants.HTYPE_GROUP
12323 def CheckArguments(self):
12326 self.op.alloc_policy,
12329 if all_changes.count(None) == len(all_changes):
12330 raise errors.OpPrereqError("Please pass at least one modification",
12331 errors.ECODE_INVAL)
12333 def ExpandNames(self):
12334 # This raises errors.OpPrereqError on its own:
12335 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12337 self.needed_locks = {
12338 locking.LEVEL_NODEGROUP: [self.group_uuid],
12341 def CheckPrereq(self):
12342 """Check prerequisites.
12345 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12347 if self.group is None:
12348 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12349 (self.op.group_name, self.group_uuid))
12351 if self.op.ndparams:
12352 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12353 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12354 self.new_ndparams = new_ndparams
12356 def BuildHooksEnv(self):
12357 """Build hooks env.
12361 "GROUP_NAME": self.op.group_name,
12362 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12365 def BuildHooksNodes(self):
12366 """Build hooks nodes.
12369 mn = self.cfg.GetMasterNode()
12370 return ([mn], [mn])
12372 def Exec(self, feedback_fn):
12373 """Modifies the node group.
12376 result = []
12378 if self.op.ndparams:
12379 self.group.ndparams = self.new_ndparams
12380 result.append(("ndparams", str(self.group.ndparams)))
12382 if self.op.alloc_policy:
12383 self.group.alloc_policy = self.op.alloc_policy
12385 self.cfg.Update(self.group, feedback_fn)
12387 return result
12389 class LUGroupRemove(LogicalUnit):
12390 HPATH = "group-remove"
12391 HTYPE = constants.HTYPE_GROUP
12394 def ExpandNames(self):
12395 # This raises errors.OpPrereqError on its own:
12396 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12397 self.needed_locks = {
12398 locking.LEVEL_NODEGROUP: [self.group_uuid],
12401 def CheckPrereq(self):
12402 """Check prerequisites.
12404 This checks that the given group name exists as a node group, that is
12405 empty (i.e., contains no nodes), and that it is not the last group of the
12406 cluster.
12409 # Verify that the group is empty.
12410 group_nodes = [node.name
12411 for node in self.cfg.GetAllNodesInfo().values()
12412 if node.group == self.group_uuid]
12414 if group_nodes:
12415 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12416 " nodes: %s" %
12417 (self.op.group_name,
12418 utils.CommaJoin(utils.NiceSort(group_nodes))),
12419 errors.ECODE_STATE)
12421 # Verify the cluster would not be left group-less.
12422 if len(self.cfg.GetNodeGroupList()) == 1:
12423 raise errors.OpPrereqError("Group '%s' is the only group,"
12424 " cannot be removed" %
12425 self.op.group_name,
12426 errors.ECODE_STATE)
12428 def BuildHooksEnv(self):
12429 """Build hooks env.
12433 "GROUP_NAME": self.op.group_name,
12436 def BuildHooksNodes(self):
12437 """Build hooks nodes.
12440 mn = self.cfg.GetMasterNode()
12441 return ([mn], [mn])
12443 def Exec(self, feedback_fn):
12444 """Remove the node group.
12448 self.cfg.RemoveNodeGroup(self.group_uuid)
12449 except errors.ConfigurationError:
12450 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12451 (self.op.group_name, self.group_uuid))
12453 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12456 class LUGroupRename(LogicalUnit):
12457 HPATH = "group-rename"
12458 HTYPE = constants.HTYPE_GROUP
12461 def ExpandNames(self):
12462 # This raises errors.OpPrereqError on its own:
12463 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12465 self.needed_locks = {
12466 locking.LEVEL_NODEGROUP: [self.group_uuid],
12469 def CheckPrereq(self):
12470 """Check prerequisites.
12472 Ensures requested new name is not yet used.
12476 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12477 except errors.OpPrereqError:
12480 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12481 " node group (UUID: %s)" %
12482 (self.op.new_name, new_name_uuid),
12483 errors.ECODE_EXISTS)
12485 def BuildHooksEnv(self):
12486 """Build hooks env.
12490 "OLD_NAME": self.op.group_name,
12491 "NEW_NAME": self.op.new_name,
12494 def BuildHooksNodes(self):
12495 """Build hooks nodes.
12498 mn = self.cfg.GetMasterNode()
12500 all_nodes = self.cfg.GetAllNodesInfo()
12501 all_nodes.pop(mn, None)
12503 run_nodes = [mn]
12504 run_nodes.extend(node.name for node in all_nodes.values()
12505 if node.group == self.group_uuid)
12507 return (run_nodes, run_nodes)
12509 def Exec(self, feedback_fn):
12510 """Rename the node group.
12513 group = self.cfg.GetNodeGroup(self.group_uuid)
12515 if group is None:
12516 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12517 (self.op.group_name, self.group_uuid))
12519 group.name = self.op.new_name
12520 self.cfg.Update(group, feedback_fn)
12522 return self.op.new_name
12525 class LUGroupEvacuate(LogicalUnit):
12526 HPATH = "group-evacuate"
12527 HTYPE = constants.HTYPE_GROUP
12530 def ExpandNames(self):
12531 # This raises errors.OpPrereqError on its own:
12532 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12534 if self.op.target_groups:
12535 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12536 self.op.target_groups)
12538 self.req_target_uuids = []
12540 if self.group_uuid in self.req_target_uuids:
12541 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12542 " as a target group (targets are %s)" %
12543 (self.group_uuid,
12544 utils.CommaJoin(self.req_target_uuids)),
12545 errors.ECODE_INVAL)
12547 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12549 self.share_locks = _ShareAll()
12550 self.needed_locks = {
12551 locking.LEVEL_INSTANCE: [],
12552 locking.LEVEL_NODEGROUP: [],
12553 locking.LEVEL_NODE: [],
12556 def DeclareLocks(self, level):
12557 if level == locking.LEVEL_INSTANCE:
12558 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12560 # Lock instances optimistically, needs verification once node and group
12561 # locks have been acquired
12562 self.needed_locks[locking.LEVEL_INSTANCE] = \
12563 self.cfg.GetNodeGroupInstances(self.group_uuid)
12565 elif level == locking.LEVEL_NODEGROUP:
12566 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12568 if self.req_target_uuids:
12569 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12571 # Lock all groups used by instances optimistically; this requires going
12572 # via the node before it's locked, requiring verification later on
12573 lock_groups.update(group_uuid
12574 for instance_name in
12575 self.owned_locks(locking.LEVEL_INSTANCE)
12576 for group_uuid in
12577 self.cfg.GetInstanceNodeGroups(instance_name))
12579 # No target groups, need to lock all of them
12580 lock_groups = locking.ALL_SET
12582 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12584 elif level == locking.LEVEL_NODE:
12585 # This will only lock the nodes in the group to be evacuated which
12586 # contain actual instances
12587 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12588 self._LockInstancesNodes()
12590 # Lock all nodes in group to be evacuated and target groups
12591 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12592 assert self.group_uuid in owned_groups
12593 member_nodes = [node_name
12594 for group in owned_groups
12595 for node_name in self.cfg.GetNodeGroup(group).members]
12596 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12598 def CheckPrereq(self):
12599 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12600 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12601 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12603 assert owned_groups.issuperset(self.req_target_uuids)
12604 assert self.group_uuid in owned_groups
12606 # Check if locked instances are still correct
12607 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12609 # Get instance information
12610 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12612 # Check if node groups for locked instances are still correct
12613 for instance_name in owned_instances:
12614 inst = self.instances[instance_name]
12615 assert owned_nodes.issuperset(inst.all_nodes), \
12616 "Instance %s's nodes changed while we kept the lock" % instance_name
12618 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
12619 owned_groups)
12621 assert self.group_uuid in inst_groups, \
12622 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
12624 if self.req_target_uuids:
12625 # User requested specific target groups
12626 self.target_uuids = self.req_target_uuids
12628 # All groups except the one to be evacuated are potential targets
12629 self.target_uuids = [group_uuid for group_uuid in owned_groups
12630 if group_uuid != self.group_uuid]
12632 if not self.target_uuids:
12633 raise errors.OpPrereqError("There are no possible target groups",
12634 errors.ECODE_INVAL)
12636 def BuildHooksEnv(self):
12637 """Build hooks env.
12641 "GROUP_NAME": self.op.group_name,
12642 "TARGET_GROUPS": " ".join(self.target_uuids),
12645 def BuildHooksNodes(self):
12646 """Build hooks nodes.
12649 mn = self.cfg.GetMasterNode()
12651 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12653 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12655 return (run_nodes, run_nodes)
12657 def Exec(self, feedback_fn):
12658 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12660 assert self.group_uuid not in self.target_uuids
12662 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12663 instances=instances, target_groups=self.target_uuids)
12665 ial.Run(self.op.iallocator)
12667 if not ial.success:
12668 raise errors.OpPrereqError("Can't compute group evacuation using"
12669 " iallocator '%s': %s" %
12670 (self.op.iallocator, ial.info),
12671 errors.ECODE_NORES)
12673 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12675 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12676 len(jobs), self.op.group_name)
12678 return ResultWithJobs(jobs)
12681 class TagsLU(NoHooksLU): # pylint: disable=W0223
12682 """Generic tags LU.
12684 This is an abstract class which is the parent of all the other tags LUs.
12687 def ExpandNames(self):
12688 self.group_uuid = None
12689 self.needed_locks = {}
12690 if self.op.kind == constants.TAG_NODE:
12691 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12692 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12693 elif self.op.kind == constants.TAG_INSTANCE:
12694 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12695 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12696 elif self.op.kind == constants.TAG_NODEGROUP:
12697 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12699 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12700 # not possible to acquire the BGL based on opcode parameters)
12702 def CheckPrereq(self):
12703 """Check prerequisites.
12706 if self.op.kind == constants.TAG_CLUSTER:
12707 self.target = self.cfg.GetClusterInfo()
12708 elif self.op.kind == constants.TAG_NODE:
12709 self.target = self.cfg.GetNodeInfo(self.op.name)
12710 elif self.op.kind == constants.TAG_INSTANCE:
12711 self.target = self.cfg.GetInstanceInfo(self.op.name)
12712 elif self.op.kind == constants.TAG_NODEGROUP:
12713 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12715 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12716 str(self.op.kind), errors.ECODE_INVAL)
12719 class LUTagsGet(TagsLU):
12720 """Returns the tags of a given object.
12725 def ExpandNames(self):
12726 TagsLU.ExpandNames(self)
12728 # Share locks as this is only a read operation
12729 self.share_locks = _ShareAll()
12731 def Exec(self, feedback_fn):
12732 """Returns the tag list.
12735 return list(self.target.GetTags())
12738 class LUTagsSearch(NoHooksLU):
12739 """Searches the tags for a given pattern.
12744 def ExpandNames(self):
12745 self.needed_locks = {}
12747 def CheckPrereq(self):
12748 """Check prerequisites.
12750 This checks the pattern passed for validity by compiling it.
12754 self.re = re.compile(self.op.pattern)
12755 except re.error, err:
12756 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12757 (self.op.pattern, err), errors.ECODE_INVAL)
12759 def Exec(self, feedback_fn):
12760 """Returns the tag list.
12764 tgts = [("/cluster", cfg.GetClusterInfo())]
12765 ilist = cfg.GetAllInstancesInfo().values()
12766 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12767 nlist = cfg.GetAllNodesInfo().values()
12768 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12769 tgts.extend(("/nodegroup/%s" % n.name, n)
12770 for n in cfg.GetAllNodeGroupsInfo().values())
12772 for path, target in tgts:
12773 for tag in target.GetTags():
12774 if self.re.search(tag):
12775 results.append((path, tag))
12779 class LUTagsSet(TagsLU):
12780 """Sets a tag on a given object.
12785 def CheckPrereq(self):
12786 """Check prerequisites.
12788 This checks the type and length of the tag name and value.
12791 TagsLU.CheckPrereq(self)
12792 for tag in self.op.tags:
12793 objects.TaggableObject.ValidateTag(tag)
12795 def Exec(self, feedback_fn):
12799 try:
12800 for tag in self.op.tags:
12801 self.target.AddTag(tag)
12802 except errors.TagError, err:
12803 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12804 self.cfg.Update(self.target, feedback_fn)
12807 class LUTagsDel(TagsLU):
12808 """Delete a list of tags from a given object.
12813 def CheckPrereq(self):
12814 """Check prerequisites.
12816 This checks that we have the given tag.
12819 TagsLU.CheckPrereq(self)
12820 for tag in self.op.tags:
12821 objects.TaggableObject.ValidateTag(tag)
12822 del_tags = frozenset(self.op.tags)
12823 cur_tags = self.target.GetTags()
12825 diff_tags = del_tags - cur_tags
12826 if diff_tags:
12827 diff_names = ("'%s'" % i for i in sorted(diff_tags))
12828 raise errors.OpPrereqError("Tag(s) %s not found" %
12829 (utils.CommaJoin(diff_names), ),
12830 errors.ECODE_NOENT)
12832 def Exec(self, feedback_fn):
12833 """Remove the tag from the object.
12836 for tag in self.op.tags:
12837 self.target.RemoveTag(tag)
12838 self.cfg.Update(self.target, feedback_fn)
12841 class LUTestDelay(NoHooksLU):
12842 """Sleep for a specified amount of time.
12844 This LU sleeps on the master and/or nodes for a specified amount of
12850 def ExpandNames(self):
12851 """Expand names and set required locks.
12853 This expands the node list, if any.
12856 self.needed_locks = {}
12857 if self.op.on_nodes:
12858 # _GetWantedNodes can be used here, but is not always appropriate to use
12859 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
12860 # more information.
12861 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
12862 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
12864 def _TestDelay(self):
12865 """Do the actual sleep.
12868 if self.op.on_master:
12869 if not utils.TestDelay(self.op.duration):
12870 raise errors.OpExecError("Error during master delay test")
12871 if self.op.on_nodes:
12872 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
12873 for node, node_result in result.items():
12874 node_result.Raise("Failure during rpc call to node %s" % node)
12876 def Exec(self, feedback_fn):
12877 """Execute the test delay opcode, with the wanted repetitions.
12880 if self.op.repeat == 0:
12883 top_value = self.op.repeat - 1
12884 for i in range(self.op.repeat):
12885 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
12889 class LUTestJqueue(NoHooksLU):
12890 """Utility LU to test some aspects of the job queue.
12895 # Must be lower than default timeout for WaitForJobChange to see whether it
12896 # notices changed jobs
12897 _CLIENT_CONNECT_TIMEOUT = 20.0
12898 _CLIENT_CONFIRM_TIMEOUT = 60.0
12901 def _NotifyUsingSocket(cls, cb, errcls):
12902 """Opens a Unix socket and waits for another program to connect.
12905 @param cb: Callback to send socket name to client
12906 @type errcls: class
12907 @param errcls: Exception class to use for errors
12910 # Using a temporary directory as there's no easy way to create temporary
12911 # sockets without writing a custom loop around tempfile.mktemp and
12912 # socket.bind
12913 tmpdir = tempfile.mkdtemp()
12915 tmpsock = utils.PathJoin(tmpdir, "sock")
12917 logging.debug("Creating temporary socket at %s", tmpsock)
12918 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
12923 # Send details to client
12926 # Wait for client to connect before continuing
12927 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
12929 (conn, _) = sock.accept()
12930 except socket.error, err:
12931 raise errcls("Client didn't connect in time (%s)" % err)
12935 # Remove as soon as client is connected
12936 shutil.rmtree(tmpdir)
12938 # Wait for client to close
12941 # pylint: disable=E1101
12942 # Instance of '_socketobject' has no ... member
12943 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
12945 except socket.error, err:
12946 raise errcls("Client failed to confirm notification (%s)" % err)
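  # Sketch of the client side (hypothetical code, values assumed): the peer
  # that receives the socket path through the callback is expected to connect
  # within _CLIENT_CONNECT_TIMEOUT and then close the connection within
  # _CLIENT_CONFIRM_TIMEOUT to confirm the notification, roughly:
  #
  #   conn = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  #   conn.connect(sockname)  # path announced via the callback
  #   conn.close()            # closing confirms receipt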
12950 def _SendNotification(self, test, arg, sockname):
12951 """Sends a notification to the client.
12954 @param test: Test name
12955 @param arg: Test argument (depends on test)
12956 @type sockname: string
12957 @param sockname: Socket path
12960 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
12962 def _Notify(self, prereq, test, arg):
12963 """Notifies the client of a test.
12966 @param prereq: Whether this is a prereq-phase test
12968 @param test: Test name
12969 @param arg: Test argument (depends on test)
    if prereq:
      errcls = errors.OpPrereqError
    else:
      errcls = errors.OpExecError
    return self._NotifyUsingSocket(compat.partial(self._SendNotification,
                                                  test, arg), errcls)
12981 def CheckArguments(self):
12982 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
12983 self.expandnames_calls = 0
12985 def ExpandNames(self):
12986 checkargs_calls = getattr(self, "checkargs_calls", 0)
12987 if checkargs_calls < 1:
12988 raise errors.ProgrammerError("CheckArguments was not called")
12990 self.expandnames_calls += 1
12992 if self.op.notify_waitlock:
12993 self._Notify(True, constants.JQT_EXPANDNAMES, None)
12995 self.LogInfo("Expanding names")
12997 # Get lock on master node (just to get a lock, not for a particular reason)
12998 self.needed_locks = {
      locking.LEVEL_NODE: self.cfg.GetMasterNode(),
      }
13002 def Exec(self, feedback_fn):
13003 if self.expandnames_calls < 1:
13004 raise errors.ProgrammerError("ExpandNames was not called")
13006 if self.op.notify_exec:
13007 self._Notify(False, constants.JQT_EXEC, None)
13009 self.LogInfo("Executing")
13011 if self.op.log_messages:
13012 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
13013 for idx, msg in enumerate(self.op.log_messages):
13014 self.LogInfo("Sending log message %s", idx + 1)
13015 feedback_fn(constants.JQT_MSGPREFIX + msg)
13016 # Report how many test messages have been sent
13017 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
    if self.op.fail:
      raise errors.OpExecError("Opcode failure was requested")
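# Illustration of the protocol implemented by LUTestJqueue.Exec above, with a
# hypothetical single message: every entry of op.log_messages is sent through
# feedback_fn prefixed with constants.JQT_MSGPREFIX, bracketed by
# JQT_STARTMSG/JQT_LOGMSG notifications carrying the message count:
#
#   self._Notify(False, constants.JQT_STARTMSG, 1)
#   feedback_fn(constants.JQT_MSGPREFIX + "hello")
#   self._Notify(False, constants.JQT_LOGMSG, 1)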
13025 class IAllocator(object):
13026 """IAllocator framework.
  An IAllocator instance has four sets of attributes:
    - cfg that is needed to query the cluster
    - input data (all members of the _KEYS class attribute are required)
    - four buffer attributes (in|out_data|text), that represent the
      input (to the external script) in text and data structure format,
      and the output from it, again in two formats
    - the result variables from the script (success, info, nodes) for
      easy usage
13038 # pylint: disable=R0902
13039 # lots of instance attributes
  def __init__(self, cfg, rpc, mode, **kwargs):
    self.cfg = cfg
    self.rpc = rpc
    self.mode = mode
    # init buffer variables
    self.in_text = self.out_text = self.in_data = self.out_data = None
13046 # init all input fields so that pylint is happy
13048 self.memory = self.disks = self.disk_template = None
13049 self.os = self.tags = self.nics = self.vcpus = None
13050 self.hypervisor = None
13051 self.relocate_from = None
13053 self.instances = None
13054 self.evac_mode = None
13055 self.target_groups = []
13057 self.required_nodes = None
13058 # init result fields
13059 self.success = self.info = self.result = None
    try:
      (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
    except KeyError:
      raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                   " IAllocator" % self.mode)
    keyset = [n for (n, _) in keydata]
    for key in kwargs:
      if key not in keyset:
        raise errors.ProgrammerError("Invalid input parameter '%s' to"
                                     " IAllocator" % key)
      setattr(self, key, kwargs[key])
    for key in keyset:
      if key not in kwargs:
        raise errors.ProgrammerError("Missing input parameter '%s' to"
                                     " IAllocator" % key)
13079 self._BuildInputData(compat.partial(fn, self), keydata)
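  # A minimal usage sketch (hypothetical names, mirroring the
  # LUTestAllocator.Exec calls near the end of this file): each mode accepts
  # exactly the keyword arguments listed in _MODE_DATA, e.g. for a relocation:
  #
  #   ial = IAllocator(cfg, rpc, constants.IALLOCATOR_MODE_RELOC,
  #                    name="instance1.example.com",
  #                    relocate_from=["node2.example.com"])
  #   ial.Run("hail")  # "hail" is just an example allocator name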
13081 def _ComputeClusterData(self):
13082 """Compute the generic allocator input data.
13084 This is the data that is independent of the actual operation.
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
13093 "cluster_tags": list(cluster_info.GetTags()),
13094 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
13095 # we don't have job IDs
13097 ninfo = cfg.GetAllNodesInfo()
13098 iinfo = cfg.GetAllInstancesInfo().values()
13099 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
13102 node_list = [n.name for n in ninfo.values() if n.vm_capable]
13104 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
13105 hypervisor_name = self.hypervisor
13106 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
13107 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
13109 hypervisor_name = cluster_info.enabled_hypervisors[0]
    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)
13117 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
13119 config_ndata = self._ComputeBasicNodeData(ninfo)
13120 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
13121 i_list, config_ndata)
13122 assert len(data["nodes"]) == len(ninfo), \
13123 "Incomplete node data computed"
13125 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
13127 self.in_data = data
13130 def _ComputeNodeGroupData(cfg):
13131 """Compute node groups data.
13134 ng = dict((guuid, {
13135 "name": gdata.name,
      "alloc_policy": gdata.alloc_policy,
      })
      for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())
    return ng
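  # Sketch of the returned structure (hypothetical UUID and values): one entry
  # per node group, e.g.:
  #
  #   {
  #     "e1f2a3b4-0000-0000-0000-000000000000": {
  #       "name": "default",
  #       "alloc_policy": "preferred",
  #     },
  #   }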
13143 def _ComputeBasicNodeData(node_cfg):
13144 """Compute global node data.
13147 @returns: a dict of name: (node dict, node config)
13150 # fill in static (config-based) values
13151 node_results = dict((ninfo.name, {
13152 "tags": list(ninfo.GetTags()),
13153 "primary_ip": ninfo.primary_ip,
13154 "secondary_ip": ninfo.secondary_ip,
13155 "offline": ninfo.offline,
13156 "drained": ninfo.drained,
13157 "master_candidate": ninfo.master_candidate,
13158 "group": ninfo.group,
13159 "master_capable": ninfo.master_capable,
      "vm_capable": ninfo.vm_capable,
      })
      for ninfo in node_cfg.values())
13164 return node_results
  def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
                              node_results):
13169 """Compute global node data.
13171 @param node_results: the basic node structures as filled from the config
13174 # make a copy of the current dict
13175 node_results = dict(node_results)
13176 for nname, nresult in node_data.items():
13177 assert nname in node_results, "Missing basic data for node %s" % nname
13178 ninfo = node_cfg[nname]
13180 if not (ninfo.offline or ninfo.drained):
13181 nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
13184 remote_info = nresult.payload
13186 for attr in ["memory_total", "memory_free", "memory_dom0",
13187 "vg_size", "vg_free", "cpu_total"]:
13188 if attr not in remote_info:
13189 raise errors.OpExecError("Node '%s' didn't return attribute"
13190 " '%s'" % (nname, attr))
13191 if not isinstance(remote_info[attr], int):
          raise errors.OpExecError("Node '%s' returned invalid value"
                                   " for '%s': %s" %
                                   (nname, attr, remote_info[attr]))
13195 # compute memory used by primary instances
13196 i_p_mem = i_p_up_mem = 0
13197 for iinfo, beinfo in i_list:
13198 if iinfo.primary_node == nname:
13199 i_p_mem += beinfo[constants.BE_MEMORY]
          if iinfo.name not in node_iinfo[nname].payload:
            i_used_mem = 0
          else:
            i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
          i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
          remote_info["memory_free"] -= max(0, i_mem_diff)
          if iinfo.admin_up:
            i_p_up_mem += beinfo[constants.BE_MEMORY]
      # compute memory used by instances
      pnr_dyn = {
        "total_memory": remote_info["memory_total"],
13213 "reserved_memory": remote_info["memory_dom0"],
13214 "free_memory": remote_info["memory_free"],
13215 "total_disk": remote_info["vg_size"],
13216 "free_disk": remote_info["vg_free"],
13217 "total_cpus": remote_info["cpu_total"],
13218 "i_pri_memory": i_p_mem,
13219 "i_pri_up_memory": i_p_up_mem,
13221 pnr_dyn.update(node_results[nname])
13222 node_results[nname] = pnr_dyn
13224 return node_results
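  # Worked example with made-up numbers: the memory accounting above
  # subtracts, for every primary instance, the part of its configured memory
  # that the hypervisor does not currently report as used.  For an instance
  # with BE_MEMORY=1024 reported as using 600:
  #
  #   i_mem_diff = 1024 - 600                      # = 424
  #   remote_info["memory_free"] -= max(0, 424)
  #
  # so "free_memory" already accounts for the instance growing back to its
  # full configured size.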
13227 def _ComputeInstanceData(cluster_info, i_list):
13228 """Compute global instance data.
13232 for iinfo, beinfo in i_list:
13234 for nic in iinfo.nics:
13235 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
13239 "mode": filled_params[constants.NIC_MODE],
13240 "link": filled_params[constants.NIC_LINK],
13242 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
13243 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
13244 nic_data.append(nic_dict)
      pir = {
        "tags": list(iinfo.GetTags()),
13247 "admin_up": iinfo.admin_up,
13248 "vcpus": beinfo[constants.BE_VCPUS],
13249 "memory": beinfo[constants.BE_MEMORY],
13251 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
13253 "disks": [{constants.IDISK_SIZE: dsk.size,
13254 constants.IDISK_MODE: dsk.mode}
13255 for dsk in iinfo.disks],
13256 "disk_template": iinfo.disk_template,
13257 "hypervisor": iinfo.hypervisor,
13259 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
13261 instance_data[iinfo.name] = pir
13263 return instance_data
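  # Sketch of the returned structure (hypothetical instance, some fields
  # elided): each entry of the dict returned above looks roughly like:
  #
  #   "instance1.example.com": {
  #     "tags": [],
  #     "admin_up": True,
  #     "vcpus": 2,
  #     "memory": 1024,
  #     "nodes": ["node1.example.com", "node2.example.com"],
  #     "disks": [{"size": 10240, "mode": "rw"}],
  #     "disk_template": "drbd",
  #     "hypervisor": "xen-pvm",
  #     "disk_space_total": ...,   # as computed by _ComputeDiskSize
  #     ...
  #   }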
13265 def _AddNewInstance(self):
13266 """Add new instance data to allocator structure.
    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.
13275 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
13277 if self.disk_template in constants.DTS_INT_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1
13284 "disk_template": self.disk_template,
13287 "vcpus": self.vcpus,
13288 "memory": self.memory,
13289 "disks": self.disks,
13290 "disk_space_total": disk_space,
13292 "required_nodes": self.required_nodes,
13293 "hypervisor": self.hypervisor,
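  # Sketch of the request built above (hypothetical values; the "type" key is
  # added later by _BuildInputData) for IALLOCATOR_MODE_ALLOC:
  #
  #   {
  #     "name": "instance1.example.com",
  #     "disk_template": "drbd",
  #     "disks": [{"size": 10240, "mode": "rw"}],
  #     "disk_space_total": ...,      # result of _ComputeDiskSize above
  #     "memory": 1024,
  #     "vcpus": 2,
  #     "required_nodes": 2,          # 2 only for internally mirrored templates
  #     "hypervisor": "xen-pvm",
  #     ...
  #   }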
13298 def _AddRelocateInstance(self):
13299 """Add relocate instance data to allocator structure.
    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.
13308 instance = self.cfg.GetInstanceInfo(self.name)
13309 if instance is None:
13310 raise errors.ProgrammerError("Unknown instance '%s' passed to"
13311 " IAllocator" % self.name)
13313 if instance.disk_template not in constants.DTS_MIRRORED:
13314 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
13315 errors.ECODE_INVAL)
13317 if instance.disk_template in constants.DTS_INT_MIRROR and \
13318 len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)
13322 self.required_nodes = 1
13323 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
13324 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
13328 "disk_space_total": disk_space,
13329 "required_nodes": self.required_nodes,
13330 "relocate_from": self.relocate_from,
13334 def _AddNodeEvacuate(self):
13335 """Get data for node-evacuate requests.
    return {
      "instances": self.instances,
      "evac_mode": self.evac_mode,
      }
13343 def _AddChangeGroup(self):
    """Get data for change-group requests.
    return {
      "instances": self.instances,
      "target_groups": self.target_groups,
      }
13352 def _BuildInputData(self, fn, keydata):
13353 """Build input data structures.
13356 self._ComputeClusterData()
    request = fn()
    request["type"] = self.mode
13360 for keyname, keytype in keydata:
13361 if keyname not in request:
        raise errors.ProgrammerError("Request parameter %s is missing" %
                                     keyname)
13364 val = request[keyname]
13365 if not keytype(val):
13366 raise errors.ProgrammerError("Request parameter %s doesn't pass"
13367 " validation, value %s, expected"
13368 " type %s" % (keyname, val, keytype))
13369 self.in_data["request"] = request
13371 self.in_text = serializer.Dump(self.in_data)
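  # Illustration: in_text is simply the serialized form of in_data, i.e. the
  # cluster description from _ComputeClusterData plus the per-mode request
  # validated above, roughly:
  #
  #   {
  #     "version": constants.IALLOCATOR_VERSION,
  #     "cluster_name": ...,
  #     "nodegroups": {...},
  #     "nodes": {...},
  #     "instances": {...},
  #     "request": {"type": ..., ...},
  #   }
  #
  # The keytype checks are plain ht predicates, e.g. ht.TInt(1024) is True
  # while ht.TInt("1024") is False and would trigger the ProgrammerError.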
13373 _STRING_LIST = ht.TListOf(ht.TString)
13374 _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
13375 # pylint: disable=E1101
13376 # Class '...' has no 'OP_ID' member
13377 "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
13378 opcodes.OpInstanceMigrate.OP_ID,
13379 opcodes.OpInstanceReplaceDisks.OP_ID])
  _NEVAC_MOVED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(3),
13384 ht.TItems([ht.TNonEmptyString,
13385 ht.TNonEmptyString,
13386 ht.TListOf(ht.TNonEmptyString),
  _NEVAC_FAILED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(2),
13390 ht.TItems([ht.TNonEmptyString,
13393 _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
13394 ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
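  # Sketch of a payload accepted by _NEVAC_RESULT (hypothetical data): a
  # node-evacuate/change-group answer is a (moved, failed, jobs) triple, e.g.:
  #
  #   [
  #     [["instance1.example.com", "group1", ["node3.example.com"]]],  # moved
  #     [],                                                            # failed
  #     [[{"OP_ID": "OP_INSTANCE_MIGRATE", ...}]],                     # jobs
  #   ]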
  _MODE_DATA = {
    constants.IALLOCATOR_MODE_ALLOC:
      (_AddNewInstance, [
        ("name", ht.TString),
13401 ("memory", ht.TInt),
13402 ("disks", ht.TListOf(ht.TDict)),
13403 ("disk_template", ht.TString),
13404 ("os", ht.TString),
13405 ("tags", _STRING_LIST),
13406 ("nics", ht.TListOf(ht.TDict)),
13407 ("vcpus", ht.TInt),
13408 ("hypervisor", ht.TString),
13410 constants.IALLOCATOR_MODE_RELOC:
13411 (_AddRelocateInstance,
13412 [("name", ht.TString), ("relocate_from", _STRING_LIST)],
13414 constants.IALLOCATOR_MODE_NODE_EVAC:
13415 (_AddNodeEvacuate, [
13416 ("instances", _STRING_LIST),
13417 ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
13419 constants.IALLOCATOR_MODE_CHG_GROUP:
13420 (_AddChangeGroup, [
13421 ("instances", _STRING_LIST),
13422 ("target_groups", _STRING_LIST),
13426 def Run(self, name, validate=True, call_fn=None):
13427 """Run an instance allocator and return the results.
13430 if call_fn is None:
13431 call_fn = self.rpc.call_iallocator_runner
13433 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
13434 result.Raise("Failure while running the iallocator script")
13436 self.out_text = result.payload
    if validate:
      self._ValidateResult()
13440 def _ValidateResult(self):
13441 """Process the allocator results.
    This will process and, if successful, save the result in
    self.out_data and the other parameters.
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
13452 if not isinstance(rdict, dict):
13453 raise errors.OpExecError("Can't parse iallocator results: not a dict")
    # TODO: remove backwards compatibility in later versions
13456 if "nodes" in rdict and "result" not in rdict:
13457 rdict["result"] = rdict["nodes"]
13460 for key in "success", "info", "result":
13461 if key not in rdict:
13462 raise errors.OpExecError("Can't parse iallocator results:"
13463 " missing key '%s'" % key)
13464 setattr(self, key, rdict[key])
13466 if not self._result_check(self.result):
13467 raise errors.OpExecError("Iallocator returned invalid result,"
13468 " expected %s, got %s" %
13469 (self._result_check, self.result),
13470 errors.ECODE_INVAL)
13472 if self.mode == constants.IALLOCATOR_MODE_RELOC:
13473 assert self.relocate_from is not None
13474 assert self.required_nodes == 1
13476 node2group = dict((name, ndata["group"])
13477 for (name, ndata) in self.in_data["nodes"].items())
13479 fn = compat.partial(self._NodesToGroups, node2group,
13480 self.in_data["nodegroups"])
13482 instance = self.cfg.GetInstanceInfo(self.name)
13483 request_groups = fn(self.relocate_from + [instance.primary_node])
13484 result_groups = fn(rdict["result"] + [instance.primary_node])
13486 if self.success and not set(result_groups).issubset(request_groups):
13487 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
13488 " differ from original groups (%s)" %
13489 (utils.CommaJoin(result_groups),
13490 utils.CommaJoin(request_groups)))
13492 elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13493 assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES
13495 self.out_data = rdict
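  # Sketch of a minimal valid payload (hypothetical values): for
  # _ValidateResult to succeed the script must return a JSON object with the
  # three mandatory keys, e.g. for an allocation request:
  #
  #   {"success": true, "info": "allocation successful",
  #    "result": ["node1.example.com", "node2.example.com"]}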
13498 def _NodesToGroups(node2group, groups, nodes):
13499 """Returns a list of unique group names for a list of nodes.
13501 @type node2group: dict
13502 @param node2group: Map from node name to group UUID
13504 @param groups: Group information
13506 @param nodes: Node names
    result = set()
    for node in nodes:
      try:
        group_uuid = node2group[node]
      except KeyError:
        # Ignore unknown node
        continue
      try:
        group = groups[group_uuid]
      except KeyError:
        # Can't find group, let's use UUID
        group_name = group_uuid
      else:
        group_name = group["name"]
      result.add(group_name)
    return sorted(result)
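  # Example with made-up data: _NodesToGroups collapses node names into their
  # unique group names, falling back to the UUID for groups that cannot be
  # resolved and silently skipping unknown nodes:
  #
  #   node2group = {"node1": "uuid-a", "node2": "uuid-a", "node3": "uuid-b"}
  #   groups = {"uuid-a": {"name": "default"}}
  #   _NodesToGroups(node2group, groups, ["node1", "node2", "node3", "nodeX"])
  #   # -> ["default", "uuid-b"]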
13531 class LUTestAllocator(NoHooksLU):
13532 """Run allocator tests.
  This LU runs the allocator tests.
13537 def CheckPrereq(self):
13538 """Check prerequisites.
    This checks the opcode parameters depending on the requested direction
    and mode.
13543 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13544 for attr in ["memory", "disks", "disk_template",
13545 "os", "tags", "nics", "vcpus"]:
13546 if not hasattr(self.op, attr):
13547 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
13548 attr, errors.ECODE_INVAL)
13549 iname = self.cfg.ExpandInstanceName(self.op.name)
13550 if iname is not None:
13551 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
13552 iname, errors.ECODE_EXISTS)
13553 if not isinstance(self.op.nics, list):
13554 raise errors.OpPrereqError("Invalid parameter 'nics'",
13555 errors.ECODE_INVAL)
13556 if not isinstance(self.op.disks, list):
13557 raise errors.OpPrereqError("Invalid parameter 'disks'",
13558 errors.ECODE_INVAL)
13559 for row in self.op.disks:
13560 if (not isinstance(row, dict) or
13561 constants.IDISK_SIZE not in row or
13562 not isinstance(row[constants.IDISK_SIZE], int) or
13563 constants.IDISK_MODE not in row or
13564 row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
13565 raise errors.OpPrereqError("Invalid contents of the 'disks'"
13566 " parameter", errors.ECODE_INVAL)
13567 if self.op.hypervisor is None:
13568 self.op.hypervisor = self.cfg.GetHypervisorType()
13569 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13570 fname = _ExpandInstanceName(self.cfg, self.op.name)
13571 self.op.name = fname
13572 self.relocate_from = \
13573 list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
13574 elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
13575 constants.IALLOCATOR_MODE_NODE_EVAC):
13576 if not self.op.instances:
13577 raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
13578 self.op.instances = _GetWantedInstances(self, self.op.instances)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)
13583 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
13584 if self.op.allocator is None:
13585 raise errors.OpPrereqError("Missing allocator name",
13586 errors.ECODE_INVAL)
13587 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
13588 raise errors.OpPrereqError("Wrong allocator test '%s'" %
13589 self.op.direction, errors.ECODE_INVAL)
13591 def Exec(self, feedback_fn):
13592 """Run the allocator test.
13595 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13596 ial = IAllocator(self.cfg, self.rpc,
13599 memory=self.op.memory,
13600 disks=self.op.disks,
13601 disk_template=self.op.disk_template,
13605 vcpus=self.op.vcpus,
13606 hypervisor=self.op.hypervisor,
13608 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13609 ial = IAllocator(self.cfg, self.rpc,
13612 relocate_from=list(self.relocate_from),
13614 elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
13615 ial = IAllocator(self.cfg, self.rpc,
13617 instances=self.op.instances,
13618 target_groups=self.op.target_groups)
13619 elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13620 ial = IAllocator(self.cfg, self.rpc,
13622 instances=self.op.instances,
13623 evac_mode=self.op.evac_mode)
    else:
      raise errors.ProgrammerError("Unhandled mode %s in"
                                   " LUTestAllocator.Exec", self.op.mode)
13628 if self.op.direction == constants.IALLOCATOR_DIR_IN:
13629 result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result
#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  constants.QR_OS: _OsQuery,
  }
13644 assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
13647 def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_VIA_OP}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)
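# Illustration: callers resolve a query backend by resource name, e.g.:
#
#   impl = _GetQueryImplementation(constants.QR_INSTANCE)  # -> _InstanceQuery
#
# Unknown names raise OpPrereqError with ECODE_INVAL.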