4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have way too many lines in this module
45 from ganeti import ssh
46 from ganeti import utils
47 from ganeti import errors
48 from ganeti import hypervisor
49 from ganeti import locking
50 from ganeti import constants
51 from ganeti import objects
52 from ganeti import serializer
53 from ganeti import ssconf
54 from ganeti import uidpool
55 from ganeti import compat
56 from ganeti import masterd
57 from ganeti import netutils
58 from ganeti import query
59 from ganeti import qlang
60 from ganeti import opcodes
63 import ganeti.masterd.instance # pylint: disable=W0611
67 """Data container for LU results with jobs.
69 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
70 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
71 contained in the C{jobs} attribute and include the job IDs in the opcode
75 def __init__(self, jobs, **kwargs):
76 """Initializes this class.
78 Additional return values can be specified as keyword arguments.
80   @type jobs: list of lists of L{opcodes.OpCode}
81 @param jobs: A list of lists of opcode objects
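  # Illustrative sketch, not part of the original module: an LU whose work
  # continues in separate jobs can return something like
  #
  #   return ResultWithJobs([[opcodes.OpClusterVerifyGroup(group_name=g)]
  #                          for g in group_names],
  #                         submitted_groups=group_names)
  #
  # mcpu.Processor._ProcessResult() then submits each inner list as one job
  # and includes the resulting job IDs in the opcode result; extra keyword
  # arguments ("submitted_groups" here is hypothetical) are returned
  # alongside them.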
88 class LogicalUnit(object):
89 """Logical Unit base class.
91 Subclasses must follow these rules:
92 - implement ExpandNames
93 - implement CheckPrereq (except when tasklets are used)
94 - implement Exec (except when tasklets are used)
95 - implement BuildHooksEnv
96 - implement BuildHooksNodes
97 - redefine HPATH and HTYPE
98 - optionally redefine their run requirements:
99 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
101 Note that all commands require root permissions.
103 @ivar dry_run_result: the value (if any) that will be returned to the caller
104 in dry-run mode (signalled by opcode dry_run parameter)
111 def __init__(self, processor, op, context, rpc):
112 """Constructor for LogicalUnit.
114 This needs to be overridden in derived classes in order to check op
118 self.proc = processor
120 self.cfg = context.cfg
121 self.glm = context.glm
123 self.owned_locks = context.glm.list_owned
124 self.context = context
126 # Dicts used to declare locking needs to mcpu
127 self.needed_locks = None
128 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
130 self.remove_locks = {}
131 # Used to force good behavior when calling helper functions
132 self.recalculate_locks = {}
134 self.Log = processor.Log # pylint: disable=C0103
135 self.LogWarning = processor.LogWarning # pylint: disable=C0103
136 self.LogInfo = processor.LogInfo # pylint: disable=C0103
137 self.LogStep = processor.LogStep # pylint: disable=C0103
138 # support for dry-run
139 self.dry_run_result = None
140 # support for generic debug attribute
141 if (not hasattr(self.op, "debug_level") or
142 not isinstance(self.op.debug_level, int)):
143 self.op.debug_level = 0
148 # Validate opcode parameters and set defaults
149 self.op.Validate(True)
151 self.CheckArguments()
153 def CheckArguments(self):
154 """Check syntactic validity for the opcode arguments.
156     This method is for doing a simple syntactic check, ensuring the
157     validity of opcode parameters, without any cluster-related
158 checks. While the same can be accomplished in ExpandNames and/or
159     CheckPrereq, doing these separately is better because:
161       - ExpandNames is left as purely a lock-related function
162 - CheckPrereq is run after we have acquired locks (and possible
165 The function is allowed to change the self.op attribute so that
166     later methods need not worry about missing parameters.
171 def ExpandNames(self):
172 """Expand names for this LU.
174 This method is called before starting to execute the opcode, and it should
175 update all the parameters of the opcode to their canonical form (e.g. a
176 short node name must be fully expanded after this method has successfully
177 completed). This way locking, hooks, logging, etc. can work correctly.
179 LUs which implement this method must also populate the self.needed_locks
180 member, as a dict with lock levels as keys, and a list of needed lock names
183 - use an empty dict if you don't need any lock
184       - if you don't need any lock at a particular level, omit that level
185 - don't put anything for the BGL level
186 - if you want all locks at a level use locking.ALL_SET as a value
188 If you need to share locks (rather than acquire them exclusively) at one
189 level you can modify self.share_locks, setting a true value (usually 1) for
190 that level. By default locks are not shared.
192 This function can also define a list of tasklets, which then will be
193 executed in order instead of the usual LU-level CheckPrereq and Exec
194 functions, if those are not defined by the LU.
198 # Acquire all nodes and one instance
199 self.needed_locks = {
200 locking.LEVEL_NODE: locking.ALL_SET,
201 locking.LEVEL_INSTANCE: ['instance1.example.com'],
203 # Acquire just two nodes
204 self.needed_locks = {
205 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
208 self.needed_locks = {} # No, you can't leave it to the default value None
211 # The implementation of this method is mandatory only if the new LU is
212 # concurrent, so that old LUs don't need to be changed all at the same
215 self.needed_locks = {} # Exclusive LUs don't need locks.
217 raise NotImplementedError
219 def DeclareLocks(self, level):
220 """Declare LU locking needs for a level
222 While most LUs can just declare their locking needs at ExpandNames time,
223 sometimes there's the need to calculate some locks after having acquired
224 the ones before. This function is called just before acquiring locks at a
225 particular level, but after acquiring the ones at lower levels, and permits
226 such calculations. It can be used to modify self.needed_locks, and by
227 default it does nothing.
229 This function is only called if you have something already set in
230 self.needed_locks for the level.
232 @param level: Locking level which is going to be locked
233 @type level: member of ganeti.locking.LEVELS
237 def CheckPrereq(self):
238 """Check prerequisites for this LU.
240 This method should check that the prerequisites for the execution
241 of this LU are fulfilled. It can do internode communication, but
242 it should be idempotent - no cluster or system changes are
245 The method should raise errors.OpPrereqError in case something is
246 not fulfilled. Its return value is ignored.
248 This method should also update all the parameters of the opcode to
249 their canonical form if it hasn't been done by ExpandNames before.
252 if self.tasklets is not None:
253 for (idx, tl) in enumerate(self.tasklets):
254 logging.debug("Checking prerequisites for tasklet %s/%s",
255 idx + 1, len(self.tasklets))
260 def Exec(self, feedback_fn):
263 This method should implement the actual work. It should raise
264 errors.OpExecError for failures that are somewhat dealt with in
268 if self.tasklets is not None:
269 for (idx, tl) in enumerate(self.tasklets):
270 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
273 raise NotImplementedError
275 def BuildHooksEnv(self):
276 """Build hooks environment for this LU.
279 @return: Dictionary containing the environment that will be used for
280 running the hooks for this LU. The keys of the dict must not be prefixed
281 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
282 will extend the environment with additional variables. If no environment
283 should be defined, an empty dictionary should be returned (not C{None}).
284 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
288 raise NotImplementedError
290 def BuildHooksNodes(self):
291 """Build list of nodes to run LU's hooks.
293 @rtype: tuple; (list, list)
294 @return: Tuple containing a list of node names on which the hook
295 should run before the execution and a list of node names on which the
296       hook should run after the execution. If there are no nodes, an empty
297       list should be returned (and not C{None}).
298 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
302 raise NotImplementedError
304 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
305 """Notify the LU about the results of its hooks.
307 This method is called every time a hooks phase is executed, and notifies
308 the Logical Unit about the hooks' result. The LU can then use it to alter
309 its result based on the hooks. By default the method does nothing and the
310     previous result is passed back unchanged, but any LU can override it if it
311 wants to use the local cluster hook-scripts somehow.
313 @param phase: one of L{constants.HOOKS_PHASE_POST} or
314 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
315 @param hook_results: the results of the multi-node hooks rpc call
316     @param feedback_fn: function used to send feedback back to the caller
317 @param lu_result: the previous Exec result this LU had, or None
319 @return: the new Exec result, based on the previous result
323     # API must be kept, thus we ignore the unused argument and the
324     # 'could be a function' warnings
325 # pylint: disable=W0613,R0201
328 def _ExpandAndLockInstance(self):
329 """Helper function to expand and lock an instance.
331 Many LUs that work on an instance take its name in self.op.instance_name
332 and need to expand it and then declare the expanded name for locking. This
333 function does it, and then updates self.op.instance_name to the expanded
334 name. It also initializes needed_locks as a dict, if this hasn't been done
338 if self.needed_locks is None:
339 self.needed_locks = {}
341 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
342 "_ExpandAndLockInstance called with instance-level locks set"
343 self.op.instance_name = _ExpandInstanceName(self.cfg,
344 self.op.instance_name)
345 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
347 def _LockInstancesNodes(self, primary_only=False):
348 """Helper function to declare instances' nodes for locking.
350 This function should be called after locking one or more instances to lock
351 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
352 with all primary or secondary nodes for instances already locked and
353 present in self.needed_locks[locking.LEVEL_INSTANCE].
355 It should be called from DeclareLocks, and for safety only works if
356 self.recalculate_locks[locking.LEVEL_NODE] is set.
358 In the future it may grow parameters to just lock some instance's nodes, or
359     to just lock primary or secondary nodes, if needed.
361     It should be called from DeclareLocks in a way similar to::
363 if level == locking.LEVEL_NODE:
364 self._LockInstancesNodes()
366 @type primary_only: boolean
367 @param primary_only: only lock primary nodes of locked instances
370 assert locking.LEVEL_NODE in self.recalculate_locks, \
371 "_LockInstancesNodes helper function called with no nodes to recalculate"
373     # TODO: check if we've really been called with the instance locks held
375 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
376 # future we might want to have different behaviors depending on the value
377 # of self.recalculate_locks[locking.LEVEL_NODE]
379 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
380 for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
381 wanted_nodes.append(instance.primary_node)
383 wanted_nodes.extend(instance.secondary_nodes)
385 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
386 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
387 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
388 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
390 del self.recalculate_locks[locking.LEVEL_NODE]
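# Illustrative sketch, not part of the original module: the usual pattern
# for an instance-level LU that also needs its instance's node locks.  The
# class name and hook path are hypothetical, and self.instance is assumed
# to have been stored by CheckPrereq.
class _ExampleInstanceLU(LogicalUnit):
  HPATH = "instance-example"
  HTYPE = constants.HTYPE_INSTANCE

  def ExpandNames(self):
    self._ExpandAndLockInstance()
    # Node locks are computed later, once the instance lock is held
    self.needed_locks[locking.LEVEL_NODE] = []
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()

  def BuildHooksEnv(self):
    return _BuildInstanceHookEnvByObject(self, self.instance)

  def BuildHooksNodes(self):
    nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
    return (nl, nl)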
393 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
394 """Simple LU which runs no hooks.
396 This LU is intended as a parent for other LogicalUnits which will
397 run no hooks, in order to reduce duplicate code.
403 def BuildHooksEnv(self):
404 """Empty BuildHooksEnv for NoHooksLu.
406 This just raises an error.
409 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
411 def BuildHooksNodes(self):
412 """Empty BuildHooksNodes for NoHooksLU.
415 raise AssertionError("BuildHooksNodes called for NoHooksLU")
419 """Tasklet base class.
421 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
422 they can mix legacy code with tasklets. Locking needs to be done in the LU,
423 tasklets know nothing about locks.
425 Subclasses must follow these rules:
426 - Implement CheckPrereq
430 def __init__(self, lu):
437 def CheckPrereq(self):
438 """Check prerequisites for this tasklets.
440 This method should check whether the prerequisites for the execution of
441 this tasklet are fulfilled. It can do internode communication, but it
442 should be idempotent - no cluster or system changes are allowed.
444 The method should raise errors.OpPrereqError in case something is not
445 fulfilled. Its return value is ignored.
447 This method should also update all parameters to their canonical form if it
448 hasn't been done before.
453 def Exec(self, feedback_fn):
454 """Execute the tasklet.
456 This method should implement the actual work. It should raise
457 errors.OpExecError for failures that are somewhat dealt with in code, or
461 raise NotImplementedError
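# Illustrative sketch, not part of the original module: a minimal tasklet.
# An LU would create it in ExpandNames, e.g.
# "self.tasklets = [_ExampleNoopTasklet(self)]", and the LU-level
# CheckPrereq/Exec above then iterate over it automatically.
class _ExampleNoopTasklet(Tasklet):
  """Tasklet sketch that verifies nothing and changes nothing."""
  def CheckPrereq(self):
    pass

  def Exec(self, feedback_fn):
    feedback_fn("Nothing to do")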
465 """Base for query utility classes.
468 #: Attribute holding field definitions
471 def __init__(self, filter_, fields, use_locking):
472 """Initializes this class.
475 self.use_locking = use_locking
477 self.query = query.Query(self.FIELDS, fields, filter_=filter_,
479 self.requested_data = self.query.RequestedData()
480 self.names = self.query.RequestedNames()
482 # Sort only if no names were requested
483 self.sort_by_name = not self.names
485 self.do_locking = None
488 def _GetNames(self, lu, all_names, lock_level):
489 """Helper function to determine names asked for in the query.
493 names = lu.owned_locks(lock_level)
497 if self.wanted == locking.ALL_SET:
498 assert not self.names
499 # caller didn't specify names, so ordering is not important
500 return utils.NiceSort(names)
502 # caller specified names and we must keep the same order
504 assert not self.do_locking or lu.glm.is_owned(lock_level)
506 missing = set(self.wanted).difference(names)
508 raise errors.OpExecError("Some items were removed before retrieving"
509 " their data: %s" % missing)
511 # Return expanded names
514 def ExpandNames(self, lu):
515 """Expand names for this query.
517 See L{LogicalUnit.ExpandNames}.
520 raise NotImplementedError()
522 def DeclareLocks(self, lu, level):
523 """Declare locks for this query.
525 See L{LogicalUnit.DeclareLocks}.
528 raise NotImplementedError()
530 def _GetQueryData(self, lu):
531 """Collects all data for this query.
533 @return: Query data object
536 raise NotImplementedError()
538 def NewStyleQuery(self, lu):
539 """Collect data and execute query.
542 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
543 sort_by_name=self.sort_by_name)
545 def OldStyleQuery(self, lu):
546 """Collect data and execute query.
549 return self.query.OldStyleQuery(self._GetQueryData(lu),
550 sort_by_name=self.sort_by_name)
554 """Returns a dict declaring all lock levels shared.
557 return dict.fromkeys(locking.LEVELS, 1)
560 def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
561 """Checks if the owned node groups are still correct for an instance.
563 @type cfg: L{config.ConfigWriter}
564 @param cfg: The cluster configuration
565 @type instance_name: string
566 @param instance_name: Instance name
567 @type owned_groups: set or frozenset
568 @param owned_groups: List of currently owned node groups
571 inst_groups = cfg.GetInstanceNodeGroups(instance_name)
573 if not owned_groups.issuperset(inst_groups):
574 raise errors.OpPrereqError("Instance %s's node groups changed since"
575 " locks were acquired, current groups are"
576 " are '%s', owning groups '%s'; retry the"
579 utils.CommaJoin(inst_groups),
580 utils.CommaJoin(owned_groups)),
586 def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
587 """Checks if the instances in a node group are still correct.
589 @type cfg: L{config.ConfigWriter}
590 @param cfg: The cluster configuration
591 @type group_uuid: string
592 @param group_uuid: Node group UUID
593 @type owned_instances: set or frozenset
594 @param owned_instances: List of currently owned instances
597 wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
598 if owned_instances != wanted_instances:
599 raise errors.OpPrereqError("Instances in node group '%s' changed since"
600 " locks were acquired, wanted '%s', have '%s';"
601 " retry the operation" %
603 utils.CommaJoin(wanted_instances),
604 utils.CommaJoin(owned_instances)),
607 return wanted_instances
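# Illustrative sketch, not part of the original module: a group-level LU
# would typically re-run this check from CheckPrereq after acquiring its
# locks, for example via a helper like the following (name hypothetical).
def _ExampleRecheckGroupInstances(lu, group_uuid):
  """Sketch: re-validate that the owned instance locks still match."""
  owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
  return _CheckNodeGroupInstances(lu.cfg, group_uuid, owned_instances)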
610 def _SupportsOob(cfg, node):
611 """Tells if node supports OOB.
613 @type cfg: L{config.ConfigWriter}
614 @param cfg: The cluster configuration
615 @type node: L{objects.Node}
616 @param node: The node
617 @return: The OOB script if supported or an empty string otherwise
620 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
623 def _GetWantedNodes(lu, nodes):
624 """Returns list of checked and expanded node names.
626 @type lu: L{LogicalUnit}
627 @param lu: the logical unit on whose behalf we execute
629 @param nodes: list of node names or None for all nodes
631 @return: the list of nodes, sorted
632   @raise errors.ProgrammerError: if the nodes parameter is of the wrong type
636 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
638 return utils.NiceSort(lu.cfg.GetNodeList())
641 def _GetWantedInstances(lu, instances):
642 """Returns list of checked and expanded instance names.
644 @type lu: L{LogicalUnit}
645 @param lu: the logical unit on whose behalf we execute
646 @type instances: list
647 @param instances: list of instance names or None for all instances
649 @return: the list of instances, sorted
650   @raise errors.OpPrereqError: if the instances parameter is of the wrong type
651 @raise errors.OpPrereqError: if any of the passed instances is not found
655 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
657 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
661 def _GetUpdatedParams(old_params, update_dict,
662 use_default=True, use_none=False):
663 """Return the new version of a parameter dictionary.
665 @type old_params: dict
666 @param old_params: old parameters
667 @type update_dict: dict
668 @param update_dict: dict containing new parameter values, or
669 constants.VALUE_DEFAULT to reset the parameter to its default
671   @type use_default: boolean
672   @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
673       values as 'to be deleted' values
674   @type use_none: boolean
675   @param use_none: whether to recognise C{None} values as 'to be
678 @return: the new parameter dictionary
681 params_copy = copy.deepcopy(old_params)
682 for key, val in update_dict.iteritems():
683 if ((use_default and val == constants.VALUE_DEFAULT) or
684 (use_none and val is None)):
690 params_copy[key] = val
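# Illustrative sketch, not part of the original module: how the
# VALUE_DEFAULT marker resets a key while ordinary values override it.
# The parameter names below are hypothetical.
def _ExampleUpdatedParams():
  """Sketch: returns {"root_path": "/dev/sda1"}."""
  old = {"kernel_path": "/boot/vmlinuz", "root_path": "/dev/vda1"}
  update = {"kernel_path": constants.VALUE_DEFAULT,
            "root_path": "/dev/sda1"}
  # "kernel_path" is dropped, so the cluster-level default applies again;
  # "root_path" simply takes the new value.
  return _GetUpdatedParams(old, update)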
694 def _ReleaseLocks(lu, level, names=None, keep=None):
695 """Releases locks owned by an LU.
697 @type lu: L{LogicalUnit}
698 @param level: Lock level
699 @type names: list or None
700 @param names: Names of locks to release
701 @type keep: list or None
702 @param keep: Names of locks to retain
705 assert not (keep is not None and names is not None), \
706 "Only one of the 'names' and the 'keep' parameters can be given"
708 if names is not None:
709 should_release = names.__contains__
711 should_release = lambda name: name not in keep
713 should_release = None
719 # Determine which locks to release
720 for name in lu.owned_locks(level):
721 if should_release(name):
726 assert len(lu.owned_locks(level)) == (len(retain) + len(release))
728 # Release just some locks
729 lu.glm.release(level, names=release)
731 assert frozenset(lu.owned_locks(level)) == frozenset(retain)
734 lu.glm.release(level)
736 assert not lu.glm.is_owned(level), "No locks should be owned"
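# Illustrative sketch, not part of the original module: once only the
# primary node is still interesting, an LU can drop the other node locks.
# The helper name is hypothetical.
def _ExampleKeepOnlyPrimaryLock(lu, instance):
  """Sketch: retain the primary node's lock and release all others."""
  _ReleaseLocks(lu, locking.LEVEL_NODE, keep=[instance.primary_node])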
739 def _MapInstanceDisksToNodes(instances):
740 """Creates a map from (node, volume) to instance name.
742 @type instances: list of L{objects.Instance}
743 @rtype: dict; tuple of (node name, volume name) as key, instance name as value
746 return dict(((node, vol), inst.name)
747 for inst in instances
748 for (node, vols) in inst.MapLVsByNode().items()
752 def _RunPostHook(lu, node_name):
753 """Runs the post-hook for an opcode on a single node.
756 hm = lu.proc.BuildHooksManager(lu)
758 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
760 # pylint: disable=W0702
761 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
764 def _CheckOutputFields(static, dynamic, selected):
765 """Checks whether all selected fields are valid.
767 @type static: L{utils.FieldSet}
768 @param static: static fields set
769 @type dynamic: L{utils.FieldSet}
770 @param dynamic: dynamic fields set
777 delta = f.NonMatching(selected)
779 raise errors.OpPrereqError("Unknown output fields selected: %s"
780 % ",".join(delta), errors.ECODE_INVAL)
783 def _CheckGlobalHvParams(params):
784 """Validates that given hypervisor params are not global ones.
786 This will ensure that instances don't get customised versions of
790 used_globals = constants.HVC_GLOBALS.intersection(params)
792 msg = ("The following hypervisor parameters are global and cannot"
793 " be customized at instance level, please modify them at"
794 " cluster level: %s" % utils.CommaJoin(used_globals))
795 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
798 def _CheckNodeOnline(lu, node, msg=None):
799 """Ensure that a given node is online.
801 @param lu: the LU on behalf of which we make the check
802 @param node: the node to check
803 @param msg: if passed, should be a message to replace the default one
804 @raise errors.OpPrereqError: if the node is offline
808 msg = "Can't use offline node"
809 if lu.cfg.GetNodeInfo(node).offline:
810 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
813 def _CheckNodeNotDrained(lu, node):
814 """Ensure that a given node is not drained.
816 @param lu: the LU on behalf of which we make the check
817 @param node: the node to check
818 @raise errors.OpPrereqError: if the node is drained
821 if lu.cfg.GetNodeInfo(node).drained:
822 raise errors.OpPrereqError("Can't use drained node %s" % node,
826 def _CheckNodeVmCapable(lu, node):
827 """Ensure that a given node is vm capable.
829 @param lu: the LU on behalf of which we make the check
830 @param node: the node to check
831 @raise errors.OpPrereqError: if the node is not vm capable
834 if not lu.cfg.GetNodeInfo(node).vm_capable:
835 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
839 def _CheckNodeHasOS(lu, node, os_name, force_variant):
840 """Ensure that a node supports a given OS.
842 @param lu: the LU on behalf of which we make the check
843 @param node: the node to check
844 @param os_name: the OS to query about
845 @param force_variant: whether to ignore variant errors
846   @raise errors.OpPrereqError: if the node does not support the OS
849 result = lu.rpc.call_os_get(node, os_name)
850 result.Raise("OS '%s' not in supported OS list for node %s" %
852 prereq=True, ecode=errors.ECODE_INVAL)
853 if not force_variant:
854 _CheckOSVariant(result.payload, os_name)
857 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
858 """Ensure that a node has the given secondary ip.
860 @type lu: L{LogicalUnit}
861 @param lu: the LU on behalf of which we make the check
863 @param node: the node to check
864 @type secondary_ip: string
865 @param secondary_ip: the ip to check
866 @type prereq: boolean
867 @param prereq: whether to throw a prerequisite or an execute error
868 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
869 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
872 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
873 result.Raise("Failure checking secondary ip on node %s" % node,
874 prereq=prereq, ecode=errors.ECODE_ENVIRON)
875 if not result.payload:
876 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
877 " please fix and re-run this command" % secondary_ip)
879 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
881 raise errors.OpExecError(msg)
884 def _GetClusterDomainSecret():
885 """Reads the cluster domain secret.
888 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
892 def _CheckInstanceDown(lu, instance, reason):
893 """Ensure that an instance is not running."""
894 if instance.admin_up:
895 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
896 (instance.name, reason), errors.ECODE_STATE)
898 pnode = instance.primary_node
899 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
900 ins_l.Raise("Can't contact node %s for instance information" % pnode,
901 prereq=True, ecode=errors.ECODE_ENVIRON)
903 if instance.name in ins_l.payload:
904 raise errors.OpPrereqError("Instance %s is running, %s" %
905 (instance.name, reason), errors.ECODE_STATE)
908 def _ExpandItemName(fn, name, kind):
909 """Expand an item name.
911 @param fn: the function to use for expansion
912 @param name: requested item name
913 @param kind: text description ('Node' or 'Instance')
914 @return: the resolved (full) name
915 @raise errors.OpPrereqError: if the item is not found
919 if full_name is None:
920 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
925 def _ExpandNodeName(cfg, name):
926 """Wrapper over L{_ExpandItemName} for nodes."""
927 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
930 def _ExpandInstanceName(cfg, name):
931 """Wrapper over L{_ExpandItemName} for instance."""
932 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
935 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
936 memory, vcpus, nics, disk_template, disks,
937 bep, hvp, hypervisor_name, tags):
938 """Builds instance related env variables for hooks
940 This builds the hook environment from individual variables.
943 @param name: the name of the instance
944 @type primary_node: string
945 @param primary_node: the name of the instance's primary node
946 @type secondary_nodes: list
947 @param secondary_nodes: list of secondary nodes as strings
948 @type os_type: string
949 @param os_type: the name of the instance's OS
950 @type status: boolean
951 @param status: the should_run status of the instance
953 @param memory: the memory size of the instance
955 @param vcpus: the count of VCPUs the instance has
957 @param nics: list of tuples (ip, mac, mode, link) representing
958 the NICs the instance has
959 @type disk_template: string
960 @param disk_template: the disk template of the instance
962 @param disks: the list of (size, mode) pairs
964 @param bep: the backend parameters for the instance
966 @param hvp: the hypervisor parameters for the instance
967 @type hypervisor_name: string
968 @param hypervisor_name: the hypervisor for the instance
970 @param tags: list of instance tags as strings
972 @return: the hook environment for this instance
981 "INSTANCE_NAME": name,
982 "INSTANCE_PRIMARY": primary_node,
983 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
984 "INSTANCE_OS_TYPE": os_type,
985 "INSTANCE_STATUS": str_status,
986 "INSTANCE_MEMORY": memory,
987 "INSTANCE_VCPUS": vcpus,
988 "INSTANCE_DISK_TEMPLATE": disk_template,
989 "INSTANCE_HYPERVISOR": hypervisor_name,
993 nic_count = len(nics)
994 for idx, (ip, mac, mode, link) in enumerate(nics):
997 env["INSTANCE_NIC%d_IP" % idx] = ip
998 env["INSTANCE_NIC%d_MAC" % idx] = mac
999 env["INSTANCE_NIC%d_MODE" % idx] = mode
1000 env["INSTANCE_NIC%d_LINK" % idx] = link
1001 if mode == constants.NIC_MODE_BRIDGED:
1002 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
1006 env["INSTANCE_NIC_COUNT"] = nic_count
1009 disk_count = len(disks)
1010 for idx, (size, mode) in enumerate(disks):
1011 env["INSTANCE_DISK%d_SIZE" % idx] = size
1012 env["INSTANCE_DISK%d_MODE" % idx] = mode
1016 env["INSTANCE_DISK_COUNT"] = disk_count
1021 env["INSTANCE_TAGS"] = " ".join(tags)
1023 for source, kind in [(bep, "BE"), (hvp, "HV")]:
1024 for key, value in source.items():
1025 env["INSTANCE_%s_%s" % (kind, key)] = value
1030 def _NICListToTuple(lu, nics):
1031 """Build a list of nic information tuples.
1033 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
1034 value in LUInstanceQueryData.
1036 @type lu: L{LogicalUnit}
1037 @param lu: the logical unit on whose behalf we execute
1038 @type nics: list of L{objects.NIC}
1039 @param nics: list of nics to convert to hooks tuples
1043 cluster = lu.cfg.GetClusterInfo()
1047 filled_params = cluster.SimpleFillNIC(nic.nicparams)
1048 mode = filled_params[constants.NIC_MODE]
1049 link = filled_params[constants.NIC_LINK]
1050 hooks_nics.append((ip, mac, mode, link))
1054 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
1055 """Builds instance related env variables for hooks from an object.
1057 @type lu: L{LogicalUnit}
1058 @param lu: the logical unit on whose behalf we execute
1059 @type instance: L{objects.Instance}
1060 @param instance: the instance for which we should build the
1062 @type override: dict
1063 @param override: dictionary with key/values that will override
1066 @return: the hook environment dictionary
1069 cluster = lu.cfg.GetClusterInfo()
1070 bep = cluster.FillBE(instance)
1071 hvp = cluster.FillHV(instance)
1073 "name": instance.name,
1074 "primary_node": instance.primary_node,
1075 "secondary_nodes": instance.secondary_nodes,
1076 "os_type": instance.os,
1077 "status": instance.admin_up,
1078 "memory": bep[constants.BE_MEMORY],
1079 "vcpus": bep[constants.BE_VCPUS],
1080 "nics": _NICListToTuple(lu, instance.nics),
1081 "disk_template": instance.disk_template,
1082 "disks": [(disk.size, disk.mode) for disk in instance.disks],
1085 "hypervisor_name": instance.hypervisor,
1086 "tags": instance.tags,
1089 args.update(override)
1090 return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
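# Illustrative sketch, not part of the original module: the "override"
# dictionary replaces individual arguments before the environment is built,
# e.g. to advertise a new name in a hypothetical rename hook.
def _ExampleRenameHookEnv(lu, instance, new_name):
  """Sketch: hook environment reporting C{new_name} as the instance name."""
  return _BuildInstanceHookEnvByObject(lu, instance,
                                       override={"name": new_name})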
1093 def _AdjustCandidatePool(lu, exceptions):
1094 """Adjust the candidate pool after node operations.
1097 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1099 lu.LogInfo("Promoted nodes to master candidate role: %s",
1100 utils.CommaJoin(node.name for node in mod_list))
1101 for name in mod_list:
1102 lu.context.ReaddNode(name)
1103 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1105 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1109 def _DecideSelfPromotion(lu, exceptions=None):
1110 """Decide whether I should promote myself as a master candidate.
1113 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1114 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1115   # the new node will increase mc_max by one, so:
1116 mc_should = min(mc_should + 1, cp_size)
1117 return mc_now < mc_should
1120 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1121 """Check that the brigdes needed by a list of nics exist.
1124 cluster = lu.cfg.GetClusterInfo()
1125 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1126 brlist = [params[constants.NIC_LINK] for params in paramslist
1127 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1129 result = lu.rpc.call_bridges_exist(target_node, brlist)
1130 result.Raise("Error checking bridges on destination node '%s'" %
1131 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1134 def _CheckInstanceBridgesExist(lu, instance, node=None):
1135 """Check that the brigdes needed by an instance exist.
1139 node = instance.primary_node
1140 _CheckNicsBridgesExist(lu, instance.nics, node)
1143 def _CheckOSVariant(os_obj, name):
1144 """Check whether an OS name conforms to the os variants specification.
1146 @type os_obj: L{objects.OS}
1147 @param os_obj: OS object to check
1149 @param name: OS name passed by the user, to check for validity
1152 variant = objects.OS.GetVariant(name)
1153 if not os_obj.supported_variants:
1155 raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
1156 " passed)" % (os_obj.name, variant),
1160 raise errors.OpPrereqError("OS name must include a variant",
1163 if variant not in os_obj.supported_variants:
1164 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1167 def _GetNodeInstancesInner(cfg, fn):
1168 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1171 def _GetNodeInstances(cfg, node_name):
1172 """Returns a list of all primary and secondary instances on a node.
1176 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1179 def _GetNodePrimaryInstances(cfg, node_name):
1180 """Returns primary instances on a node.
1183 return _GetNodeInstancesInner(cfg,
1184 lambda inst: node_name == inst.primary_node)
1187 def _GetNodeSecondaryInstances(cfg, node_name):
1188 """Returns secondary instances on a node.
1191 return _GetNodeInstancesInner(cfg,
1192 lambda inst: node_name in inst.secondary_nodes)
1195 def _GetStorageTypeArgs(cfg, storage_type):
1196 """Returns the arguments for a storage type.
1199 # Special case for file storage
1200 if storage_type == constants.ST_FILE:
1201 # storage.FileStorage wants a list of storage directories
1202 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1207 def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
1210 for dev in instance.disks:
1211 cfg.SetDiskID(dev, node_name)
1213 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1214 result.Raise("Failed to get disk status from node %s" % node_name,
1215 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1217 for idx, bdev_status in enumerate(result.payload):
1218 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1224 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1225 """Check the sanity of iallocator and node arguments and use the
1226 cluster-wide iallocator if appropriate.
1228 Check that at most one of (iallocator, node) is specified. If none is
1229 specified, then the LU's opcode's iallocator slot is filled with the
1230 cluster-wide default iallocator.
1232 @type iallocator_slot: string
1233 @param iallocator_slot: the name of the opcode iallocator slot
1234 @type node_slot: string
1235 @param node_slot: the name of the opcode target node slot
1238 node = getattr(lu.op, node_slot, None)
1239 iallocator = getattr(lu.op, iallocator_slot, None)
1241 if node is not None and iallocator is not None:
1242 raise errors.OpPrereqError("Do not specify both, iallocator and node",
1244 elif node is None and iallocator is None:
1245 default_iallocator = lu.cfg.GetDefaultIAllocator()
1246 if default_iallocator:
1247 setattr(lu.op, iallocator_slot, default_iallocator)
1249 raise errors.OpPrereqError("No iallocator or node given and no"
1250 " cluster-wide default iallocator found;"
1251 " please specify either an iallocator or a"
1252 " node, or set a cluster-wide default"
1256 def _GetDefaultIAllocator(cfg, iallocator):
1257 """Decides on which iallocator to use.
1259 @type cfg: L{config.ConfigWriter}
1260 @param cfg: Cluster configuration object
1261 @type iallocator: string or None
1262 @param iallocator: Iallocator specified in opcode
1264 @return: Iallocator name
1268 # Use default iallocator
1269 iallocator = cfg.GetDefaultIAllocator()
1272 raise errors.OpPrereqError("No iallocator was specified, neither in the"
1273 " opcode nor as a cluster-wide default",
1279 class LUClusterPostInit(LogicalUnit):
1280 """Logical unit for running hooks after cluster initialization.
1283 HPATH = "cluster-init"
1284 HTYPE = constants.HTYPE_CLUSTER
1286 def BuildHooksEnv(self):
1291 "OP_TARGET": self.cfg.GetClusterName(),
1294 def BuildHooksNodes(self):
1295 """Build hooks nodes.
1298 return ([], [self.cfg.GetMasterNode()])
1300 def Exec(self, feedback_fn):
1307 class LUClusterDestroy(LogicalUnit):
1308 """Logical unit for destroying the cluster.
1311 HPATH = "cluster-destroy"
1312 HTYPE = constants.HTYPE_CLUSTER
1314 def BuildHooksEnv(self):
1319 "OP_TARGET": self.cfg.GetClusterName(),
1322 def BuildHooksNodes(self):
1323 """Build hooks nodes.
1328 def CheckPrereq(self):
1329 """Check prerequisites.
1331 This checks whether the cluster is empty.
1333 Any errors are signaled by raising errors.OpPrereqError.
1336 master = self.cfg.GetMasterNode()
1338 nodelist = self.cfg.GetNodeList()
1339 if len(nodelist) != 1 or nodelist[0] != master:
1340 raise errors.OpPrereqError("There are still %d node(s) in"
1341 " this cluster." % (len(nodelist) - 1),
1343 instancelist = self.cfg.GetInstanceList()
1345 raise errors.OpPrereqError("There are still %d instance(s) in"
1346 " this cluster." % len(instancelist),
1349 def Exec(self, feedback_fn):
1350 """Destroys the cluster.
1353 master = self.cfg.GetMasterNode()
1355 # Run post hooks on master node before it's removed
1356 _RunPostHook(self, master)
1358 result = self.rpc.call_node_deactivate_master_ip(master)
1359 result.Raise("Could not disable the master role")
1364 def _VerifyCertificate(filename):
1365 """Verifies a certificate for L{LUClusterVerifyConfig}.
1367 @type filename: string
1368 @param filename: Path to PEM file
1372 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1373 utils.ReadFile(filename))
1374 except Exception, err: # pylint: disable=W0703
1375 return (LUClusterVerifyConfig.ETYPE_ERROR,
1376 "Failed to load X509 certificate %s: %s" % (filename, err))
1379 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1380 constants.SSL_CERT_EXPIRATION_ERROR)
1383 fnamemsg = "While verifying %s: %s" % (filename, msg)
1388 return (None, fnamemsg)
1389 elif errcode == utils.CERT_WARNING:
1390 return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
1391 elif errcode == utils.CERT_ERROR:
1392 return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
1394 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1397 def _GetAllHypervisorParameters(cluster, instances):
1398 """Compute the set of all hypervisor parameters.
1400 @type cluster: L{objects.Cluster}
1401 @param cluster: the cluster object
1402   @type instances: list of L{objects.Instance}
1403 @param instances: additional instances from which to obtain parameters
1404 @rtype: list of (origin, hypervisor, parameters)
1405 @return: a list with all parameters found, indicating the hypervisor they
1406 apply to, and the origin (can be "cluster", "os X", or "instance Y")
1411 for hv_name in cluster.enabled_hypervisors:
1412 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1414 for os_name, os_hvp in cluster.os_hvp.items():
1415 for hv_name, hv_params in os_hvp.items():
1417 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1418 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1420 # TODO: collapse identical parameter values in a single one
1421 for instance in instances:
1422 if instance.hvparams:
1423 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1424 cluster.FillHV(instance)))
1429 class _VerifyErrors(object):
1430 """Mix-in for cluster/group verify LUs.
1432 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1433 self.op and self._feedback_fn to be available.)
1436 TCLUSTER = "cluster"
1438 TINSTANCE = "instance"
1440 ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
1441 ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
1442 ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK")
1443 ECLUSTERDANGLINGNODES = (TNODE, "ECLUSTERDANGLINGNODES")
1444 ECLUSTERDANGLINGINST = (TNODE, "ECLUSTERDANGLINGINST")
1445 EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
1446 EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
1447 EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
1448 EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
1449 EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
1450 EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
1451 EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS")
1452 ENODEDRBD = (TNODE, "ENODEDRBD")
1453 ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
1454 ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
1455 ENODEHOOKS = (TNODE, "ENODEHOOKS")
1456 ENODEHV = (TNODE, "ENODEHV")
1457 ENODELVM = (TNODE, "ENODELVM")
1458 ENODEN1 = (TNODE, "ENODEN1")
1459 ENODENET = (TNODE, "ENODENET")
1460 ENODEOS = (TNODE, "ENODEOS")
1461 ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
1462 ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
1463 ENODERPC = (TNODE, "ENODERPC")
1464 ENODESSH = (TNODE, "ENODESSH")
1465 ENODEVERSION = (TNODE, "ENODEVERSION")
1466 ENODESETUP = (TNODE, "ENODESETUP")
1467 ENODETIME = (TNODE, "ENODETIME")
1468 ENODEOOBPATH = (TNODE, "ENODEOOBPATH")
1470 ETYPE_FIELD = "code"
1471 ETYPE_ERROR = "ERROR"
1472 ETYPE_WARNING = "WARNING"
1474 def _Error(self, ecode, item, msg, *args, **kwargs):
1475 """Format an error message.
1477 Based on the opcode's error_codes parameter, either format a
1478 parseable error code, or a simpler error string.
1480 This must be called only from Exec and functions called from Exec.
1483 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1485 # first complete the msg
1488 # then format the whole message
1489 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
1490 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1496 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1497 # and finally report it via the feedback_fn
1498 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
1500 def _ErrorIf(self, cond, *args, **kwargs):
1501 """Log an error message if the passed condition is True.
1505 or self.op.debug_simulate_errors) # pylint: disable=E1101
1507 self._Error(*args, **kwargs)
1508 # do not mark the operation as failed for WARN cases only
1509 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1510 self.bad = self.bad or cond
1513 class LUClusterVerify(NoHooksLU):
1514 """Submits all jobs necessary to verify the cluster.
1519 def ExpandNames(self):
1520 self.needed_locks = {}
1522 def Exec(self, feedback_fn):
1525 if self.op.group_name:
1526 groups = [self.op.group_name]
1527 depends_fn = lambda: None
1529 groups = self.cfg.GetNodeGroupList()
1531 # Verify global configuration
1532 jobs.append([opcodes.OpClusterVerifyConfig()])
1534 # Always depend on global verification
1535 depends_fn = lambda: [(-len(jobs), [])]
1537 jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
1538 depends=depends_fn())]
1539 for group in groups)
1541 # Fix up all parameters
1542 for op in itertools.chain(*jobs): # pylint: disable=W0142
1543 op.debug_simulate_errors = self.op.debug_simulate_errors
1544 op.verbose = self.op.verbose
1545 op.error_codes = self.op.error_codes
1547 op.skip_checks = self.op.skip_checks
1548 except AttributeError:
1549 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1551 return ResultWithJobs(jobs)
1554 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1555 """Verifies the cluster config.
1560 def _VerifyHVP(self, hvp_data):
1561 """Verifies locally the syntax of the hypervisor parameters.
1564 for item, hv_name, hv_params in hvp_data:
1565 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
1568 hv_class = hypervisor.GetHypervisor(hv_name)
1569 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1570 hv_class.CheckParameterSyntax(hv_params)
1571 except errors.GenericError, err:
1572 self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
1574 def ExpandNames(self):
1575 # Information can be safely retrieved as the BGL is acquired in exclusive
1577 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
1578 self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
1579 self.all_node_info = self.cfg.GetAllNodesInfo()
1580 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1581 self.needed_locks = {}
1583 def Exec(self, feedback_fn):
1584 """Verify integrity of cluster, performing various test on nodes.
1588 self._feedback_fn = feedback_fn
1590 feedback_fn("* Verifying cluster config")
1592 for msg in self.cfg.VerifyConfig():
1593 self._ErrorIf(True, self.ECLUSTERCFG, None, msg)
1595 feedback_fn("* Verifying cluster certificate files")
1597 for cert_filename in constants.ALL_CERT_FILES:
1598 (errcode, msg) = _VerifyCertificate(cert_filename)
1599 self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
1601 feedback_fn("* Verifying hypervisor parameters")
1603 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1604 self.all_inst_info.values()))
1606 feedback_fn("* Verifying all nodes belong to an existing group")
1608 # We do this verification here because, should this bogus circumstance
1609 # occur, it would never be caught by VerifyGroup, which only acts on
1610 # nodes/instances reachable from existing node groups.
1612 dangling_nodes = set(node.name for node in self.all_node_info.values()
1613 if node.group not in self.all_group_info)
1615 dangling_instances = {}
1616 no_node_instances = []
1618 for inst in self.all_inst_info.values():
1619 if inst.primary_node in dangling_nodes:
1620 dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
1621 elif inst.primary_node not in self.all_node_info:
1622 no_node_instances.append(inst.name)
1627 utils.CommaJoin(dangling_instances.get(node.name,
1629 for node in dangling_nodes]
1631 self._ErrorIf(bool(dangling_nodes), self.ECLUSTERDANGLINGNODES, None,
1632 "the following nodes (and their instances) belong to a non"
1633 " existing group: %s", utils.CommaJoin(pretty_dangling))
1635 self._ErrorIf(bool(no_node_instances), self.ECLUSTERDANGLINGINST, None,
1636 "the following instances have a non-existing primary-node:"
1637 " %s", utils.CommaJoin(no_node_instances))
1642 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1643 """Verifies the status of a node group.
1646 HPATH = "cluster-verify"
1647 HTYPE = constants.HTYPE_CLUSTER
1650 _HOOKS_INDENT_RE = re.compile("^", re.M)
1652 class NodeImage(object):
1653 """A class representing the logical and physical status of a node.
1656 @ivar name: the node name to which this object refers
1657 @ivar volumes: a structure as returned from
1658 L{ganeti.backend.GetVolumeList} (runtime)
1659 @ivar instances: a list of running instances (runtime)
1660 @ivar pinst: list of configured primary instances (config)
1661 @ivar sinst: list of configured secondary instances (config)
1662 @ivar sbp: dictionary of {primary-node: list of instances} for all
1663 instances for which this node is secondary (config)
1664 @ivar mfree: free memory, as reported by hypervisor (runtime)
1665 @ivar dfree: free disk, as reported by the node (runtime)
1666 @ivar offline: the offline status (config)
1667 @type rpc_fail: boolean
1668     @ivar rpc_fail: whether the RPC verify call was successful (overall,
1669 not whether the individual keys were correct) (runtime)
1670 @type lvm_fail: boolean
1671 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1672 @type hyp_fail: boolean
1673 @ivar hyp_fail: whether the RPC call didn't return the instance list
1674 @type ghost: boolean
1675 @ivar ghost: whether this is a known node or not (config)
1676 @type os_fail: boolean
1677 @ivar os_fail: whether the RPC call didn't return valid OS data
1679 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1680 @type vm_capable: boolean
1681 @ivar vm_capable: whether the node can host instances
1684 def __init__(self, offline=False, name=None, vm_capable=True):
1693 self.offline = offline
1694 self.vm_capable = vm_capable
1695 self.rpc_fail = False
1696 self.lvm_fail = False
1697 self.hyp_fail = False
1699 self.os_fail = False
1702 def ExpandNames(self):
1703 # This raises errors.OpPrereqError on its own:
1704 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
1706 # Get instances in node group; this is unsafe and needs verification later
1707 inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
1709 self.needed_locks = {
1710 locking.LEVEL_INSTANCE: inst_names,
1711 locking.LEVEL_NODEGROUP: [self.group_uuid],
1712 locking.LEVEL_NODE: [],
1715 self.share_locks = _ShareAll()
1717 def DeclareLocks(self, level):
1718 if level == locking.LEVEL_NODE:
1719 # Get members of node group; this is unsafe and needs verification later
1720 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
1722 all_inst_info = self.cfg.GetAllInstancesInfo()
1724 # In Exec(), we warn about mirrored instances that have primary and
1725 # secondary living in separate node groups. To fully verify that
1726 # volumes for these instances are healthy, we will need to do an
1727 # extra call to their secondaries. We ensure here those nodes will
1729 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
1730 # Important: access only the instances whose lock is owned
1731 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
1732 nodes.update(all_inst_info[inst].secondary_nodes)
1734 self.needed_locks[locking.LEVEL_NODE] = nodes
1736 def CheckPrereq(self):
1737 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1738 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1740 group_nodes = set(self.group_info.members)
1741 group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
1744 group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1746 unlocked_instances = \
1747 group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
1750 raise errors.OpPrereqError("Missing lock for nodes: %s" %
1751 utils.CommaJoin(unlocked_nodes))
1753 if unlocked_instances:
1754 raise errors.OpPrereqError("Missing lock for instances: %s" %
1755 utils.CommaJoin(unlocked_instances))
1757 self.all_node_info = self.cfg.GetAllNodesInfo()
1758 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1760 self.my_node_names = utils.NiceSort(group_nodes)
1761 self.my_inst_names = utils.NiceSort(group_instances)
1763 self.my_node_info = dict((name, self.all_node_info[name])
1764 for name in self.my_node_names)
1766 self.my_inst_info = dict((name, self.all_inst_info[name])
1767 for name in self.my_inst_names)
1769 # We detect here the nodes that will need the extra RPC calls for verifying
1770 # split LV volumes; they should be locked.
1771 extra_lv_nodes = set()
1773 for inst in self.my_inst_info.values():
1774 if inst.disk_template in constants.DTS_INT_MIRROR:
1775 group = self.my_node_info[inst.primary_node].group
1776 for nname in inst.secondary_nodes:
1777 if self.all_node_info[nname].group != group:
1778 extra_lv_nodes.add(nname)
1780 unlocked_lv_nodes = \
1781 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1783 if unlocked_lv_nodes:
1784 raise errors.OpPrereqError("these nodes could be locked: %s" %
1785 utils.CommaJoin(unlocked_lv_nodes))
1786 self.extra_lv_nodes = list(extra_lv_nodes)
1788 def _VerifyNode(self, ninfo, nresult):
1789 """Perform some basic validation on data returned from a node.
1791 - check the result data structure is well formed and has all the
1793 - check ganeti version
1795 @type ninfo: L{objects.Node}
1796 @param ninfo: the node to check
1797 @param nresult: the results from the node
1799 @return: whether overall this call was successful (and we can expect
1800         reasonable values in the response)
1804 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1806 # main result, nresult should be a non-empty dict
1807 test = not nresult or not isinstance(nresult, dict)
1808 _ErrorIf(test, self.ENODERPC, node,
1809 "unable to verify node: no data returned")
1813 # compares ganeti version
1814 local_version = constants.PROTOCOL_VERSION
1815 remote_version = nresult.get("version", None)
1816 test = not (remote_version and
1817 isinstance(remote_version, (list, tuple)) and
1818 len(remote_version) == 2)
1819 _ErrorIf(test, self.ENODERPC, node,
1820 "connection to node returned invalid data")
1824 test = local_version != remote_version[0]
1825 _ErrorIf(test, self.ENODEVERSION, node,
1826 "incompatible protocol versions: master %s,"
1827 " node %s", local_version, remote_version[0])
1831 # node seems compatible, we can actually try to look into its results
1833 # full package version
1834 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1835 self.ENODEVERSION, node,
1836 "software version mismatch: master %s, node %s",
1837 constants.RELEASE_VERSION, remote_version[1],
1838 code=self.ETYPE_WARNING)
1840 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1841 if ninfo.vm_capable and isinstance(hyp_result, dict):
1842 for hv_name, hv_result in hyp_result.iteritems():
1843 test = hv_result is not None
1844 _ErrorIf(test, self.ENODEHV, node,
1845 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1847 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1848 if ninfo.vm_capable and isinstance(hvp_result, list):
1849 for item, hv_name, hv_result in hvp_result:
1850 _ErrorIf(True, self.ENODEHV, node,
1851 "hypervisor %s parameter verify failure (source %s): %s",
1852 hv_name, item, hv_result)
1854 test = nresult.get(constants.NV_NODESETUP,
1855 ["Missing NODESETUP results"])
1856 _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1861 def _VerifyNodeTime(self, ninfo, nresult,
1862 nvinfo_starttime, nvinfo_endtime):
1863 """Check the node time.
1865 @type ninfo: L{objects.Node}
1866 @param ninfo: the node to check
1867 @param nresult: the remote results for the node
1868 @param nvinfo_starttime: the start time of the RPC call
1869 @param nvinfo_endtime: the end time of the RPC call
1873 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1875 ntime = nresult.get(constants.NV_TIME, None)
1877 ntime_merged = utils.MergeTime(ntime)
1878 except (ValueError, TypeError):
1879 _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1882 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1883 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1884 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1885 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1889 _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1890 "Node time diverges by at least %s from master node time",
1893 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1894 """Check the node LVM results.
1896 @type ninfo: L{objects.Node}
1897 @param ninfo: the node to check
1898 @param nresult: the remote results for the node
1899 @param vg_name: the configured VG name
1906 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1908 # checks vg existence and size > 20G
1909 vglist = nresult.get(constants.NV_VGLIST, None)
1911 _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1913 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1914 constants.MIN_VG_SIZE)
1915 _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1918 pvlist = nresult.get(constants.NV_PVLIST, None)
1919 test = pvlist is None
1920 _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1922 # check that ':' is not present in PV names, since it's a
1923 # special character for lvcreate (denotes the range of PEs to
1925 for _, pvname, owner_vg in pvlist:
1926 test = ":" in pvname
1927 _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1928 " '%s' of VG '%s'", pvname, owner_vg)
1930 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1931 """Check the node bridges.
1933 @type ninfo: L{objects.Node}
1934 @param ninfo: the node to check
1935 @param nresult: the remote results for the node
1936 @param bridges: the expected list of bridges
1943 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1945 missing = nresult.get(constants.NV_BRIDGES, None)
1946 test = not isinstance(missing, list)
1947 _ErrorIf(test, self.ENODENET, node,
1948 "did not return valid bridge information")
1950 _ErrorIf(bool(missing), self.ENODENET, node, "missing bridges: %s" %
1951 utils.CommaJoin(sorted(missing)))
1953 def _VerifyNodeNetwork(self, ninfo, nresult):
1954 """Check the node network connectivity results.
1956 @type ninfo: L{objects.Node}
1957 @param ninfo: the node to check
1958 @param nresult: the remote results for the node
1962 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1964 test = constants.NV_NODELIST not in nresult
1965 _ErrorIf(test, self.ENODESSH, node,
1966 "node hasn't returned node ssh connectivity data")
1968 if nresult[constants.NV_NODELIST]:
1969 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1970 _ErrorIf(True, self.ENODESSH, node,
1971 "ssh communication with node '%s': %s", a_node, a_msg)
1973 test = constants.NV_NODENETTEST not in nresult
1974 _ErrorIf(test, self.ENODENET, node,
1975 "node hasn't returned node tcp connectivity data")
1977 if nresult[constants.NV_NODENETTEST]:
1978 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1980 _ErrorIf(True, self.ENODENET, node,
1981 "tcp communication with node '%s': %s",
1982 anode, nresult[constants.NV_NODENETTEST][anode])
1984 test = constants.NV_MASTERIP not in nresult
1985 _ErrorIf(test, self.ENODENET, node,
1986 "node hasn't returned node master IP reachability data")
1988 if not nresult[constants.NV_MASTERIP]:
1989 if node == self.master_node:
1990 msg = "the master node cannot reach the master IP (not configured?)"
1992 msg = "cannot reach the master IP"
1993 _ErrorIf(True, self.ENODENET, node, msg)
1995 def _VerifyInstance(self, instance, instanceconfig, node_image,
1997 """Verify an instance.
1999     This function checks whether the required block devices are
2000     available on the instance's nodes.
2003 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2004 node_current = instanceconfig.primary_node
2006 node_vol_should = {}
2007 instanceconfig.MapLVsByNode(node_vol_should)
2009 for node in node_vol_should:
2010 n_img = node_image[node]
2011 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2012 # ignore missing volumes on offline or broken nodes
2014 for volume in node_vol_should[node]:
2015 test = volume not in n_img.volumes
2016 _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
2017 "volume %s missing on node %s", volume, node)
2019 if instanceconfig.admin_up:
2020 pri_img = node_image[node_current]
2021 test = instance not in pri_img.instances and not pri_img.offline
2022 _ErrorIf(test, self.EINSTANCEDOWN, instance,
2023 "instance not running on its primary node %s",
2026 diskdata = [(nname, success, status, idx)
2027 for (nname, disks) in diskstatus.items()
2028 for idx, (success, status) in enumerate(disks)]
2030 for nname, success, bdev_status, idx in diskdata:
2031 # the 'ghost node' construction in Exec() ensures that we have a
2033 snode = node_image[nname]
2034 bad_snode = snode.ghost or snode.offline
2035 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2036 self.EINSTANCEFAULTYDISK, instance,
2037 "couldn't retrieve status for disk/%s on %s: %s",
2038 idx, nname, bdev_status)
2039 _ErrorIf((instanceconfig.admin_up and success and
2040 bdev_status.ldisk_status == constants.LDS_FAULTY),
2041 self.EINSTANCEFAULTYDISK, instance,
2042 "disk/%s on %s is faulty", idx, nname)
2044 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2045 """Verify if there are any unknown volumes in the cluster.
2047 The .os, .swap and backup volumes are ignored. All other volumes are
2048 reported as unknown.
2050 @type reserved: L{ganeti.utils.FieldSet}
2051 @param reserved: a FieldSet of reserved volume names
2054 for node, n_img in node_image.items():
2055 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2056 # skip non-healthy nodes
2058 for volume in n_img.volumes:
2059 test = ((node not in node_vol_should or
2060 volume not in node_vol_should[node]) and
2061 not reserved.Matches(volume))
2062 self._ErrorIf(test, self.ENODEORPHANLV, node,
2063 "volume %s is unknown", volume)
2065 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2066 """Verify N+1 Memory Resilience.
2068 Check that if one single node dies we can still start all the
2069 instances it was primary for.
2072 cluster_info = self.cfg.GetClusterInfo()
2073 for node, n_img in node_image.items():
2074       # This code checks that every node which is now listed as
2075       # secondary has enough memory to host all instances it is
2076       # supposed to host, should a single other node in the cluster fail.
2077 # FIXME: not ready for failover to an arbitrary node
2078 # FIXME: does not support file-backed instances
2079 # WARNING: we currently take into account down instances as well
2080 # as up ones, considering that even if they're down someone
2081 # might want to start them even in the event of a node failure.
2083 # we're skipping offline nodes from the N+1 warning, since
2084       # most likely we don't have good memory information from them;
2085 # we already list instances living on such nodes, and that's
2088 for prinode, instances in n_img.sbp.items():
2090 for instance in instances:
2091 bep = cluster_info.FillBE(instance_cfg[instance])
2092 if bep[constants.BE_AUTO_BALANCE]:
2093 needed_mem += bep[constants.BE_MEMORY]
2094 test = n_img.mfree < needed_mem
2095 self._ErrorIf(test, self.ENODEN1, node,
2096 "not enough memory to accomodate instance failovers"
2097 " should node %s fail (%dMiB needed, %dMiB available)",
2098 prinode, needed_mem, n_img.mfree)
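  # Illustrative sketch (not part of this LU) of the per-primary memory demand
  # computed above: for each primary node whose instances this node holds as
  # secondaries, sum the memory of auto-balanced instances; the node must have
  # at least that much free memory. The names are hypothetical; 'sbp' maps
  # primary node -> instances this node is secondary for, and 'fill_be_fn'
  # stands for a callable such as
  # lambda name: cluster_info.FillBE(instance_cfg[name]).
  #
  #   def _NeededMemPerPrimary(sbp, fill_be_fn):
  #     """Return a dict of primary node -> memory needed if that node fails."""
  #     needed = {}
  #     for prinode, instances in sbp.items():
  #       needed[prinode] = sum(fill_be_fn(inst)[constants.BE_MEMORY]
  #                             for inst in instances
  #                             if fill_be_fn(inst)[constants.BE_AUTO_BALANCE])
  #     return needed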
2101 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2102 (files_all, files_opt, files_mc, files_vm)):
2103 """Verifies file checksums collected from all nodes.
2105 @param errorif: Callback for reporting errors
2106 @param nodeinfo: List of L{objects.Node} objects
2107 @param master_node: Name of master node
2108 @param all_nvinfo: RPC results
2111 # Define functions determining which nodes to consider for a file
2114 (files_mc, lambda node: (node.master_candidate or
2115 node.name == master_node)),
2116 (files_vm, lambda node: node.vm_capable),
2119 # Build mapping from filename to list of nodes which should have the file
2121 for (files, fn) in files2nodefn:
2123 filenodes = nodeinfo
2125 filenodes = filter(fn, nodeinfo)
2126 nodefiles.update((filename,
2127 frozenset(map(operator.attrgetter("name"), filenodes)))
2128 for filename in files)
2130 assert set(nodefiles) == (files_all | files_mc | files_vm)
2132 fileinfo = dict((filename, {}) for filename in nodefiles)
2133 ignore_nodes = set()
2135 for node in nodeinfo:
2137 ignore_nodes.add(node.name)
2140 nresult = all_nvinfo[node.name]
2142 if nresult.fail_msg or not nresult.payload:
2145 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2147 test = not (node_files and isinstance(node_files, dict))
2148 errorif(test, cls.ENODEFILECHECK, node.name,
2149 "Node did not return file checksum data")
2151 ignore_nodes.add(node.name)
2154 # Build per-checksum mapping from filename to nodes having it
2155 for (filename, checksum) in node_files.items():
2156 assert filename in nodefiles
2157 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2159 for (filename, checksums) in fileinfo.items():
2160 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2162 # Nodes having the file
2163 with_file = frozenset(node_name
2164 for nodes in fileinfo[filename].values()
2165 for node_name in nodes) - ignore_nodes
2167 expected_nodes = nodefiles[filename] - ignore_nodes
2169 # Nodes missing file
2170 missing_file = expected_nodes - with_file
2172 if filename in files_opt:
2174 errorif(missing_file and missing_file != expected_nodes,
2175 cls.ECLUSTERFILECHECK, None,
2176 "File %s is optional, but it must exist on all or no"
2177 " nodes (not found on %s)",
2178 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2180 # Non-optional files
2181 errorif(missing_file, cls.ECLUSTERFILECHECK, None,
2182 "File %s is missing from node(s) %s", filename,
2183 utils.CommaJoin(utils.NiceSort(missing_file)))
2185 # Warn if a node has a file it shouldn't
2186 unexpected = with_file - expected_nodes
2188 cls.ECLUSTERFILECHECK, None,
2189 "File %s should not exist on node(s) %s",
2190 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2192 # See if there are multiple versions of the file
2193 test = len(checksums) > 1
2195 variants = ["variant %s on %s" %
2196 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2197 for (idx, (checksum, nodes)) in
2198 enumerate(sorted(checksums.items()))]
2202 errorif(test, cls.ECLUSTERFILECHECK, None,
2203 "File %s found with %s different checksums (%s)",
2204 filename, len(checksums), "; ".join(variants))
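  # Illustrative sketch (not part of _VerifyFiles): detecting divergent file
  # contents from the per-file mapping built above, i.e. a dict of
  # checksum -> set(node names). More than one key means the file differs
  # between nodes. The helper name is hypothetical.
  #
  #   def _DescribeVariants(checksums):
  #     """Return a human-readable description of each checksum variant."""
  #     return ["variant %s on %s" %
  #             (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
  #             for (idx, (_, nodes)) in enumerate(sorted(checksums.items()))]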
2206 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2208 """Verifies and the node DRBD status.
2210 @type ninfo: L{objects.Node}
2211 @param ninfo: the node to check
2212 @param nresult: the remote results for the node
2213 @param instanceinfo: the dict of instances
2214 @param drbd_helper: the configured DRBD usermode helper
2215 @param drbd_map: the DRBD map as returned by
2216 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2220 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2223 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2224       test = (helper_result is None)
2225 _ErrorIf(test, self.ENODEDRBDHELPER, node,
2226 "no drbd usermode helper returned")
2228 status, payload = helper_result
2230 _ErrorIf(test, self.ENODEDRBDHELPER, node,
2231 "drbd usermode helper check unsuccessful: %s", payload)
2232 test = status and (payload != drbd_helper)
2233 _ErrorIf(test, self.ENODEDRBDHELPER, node,
2234 "wrong drbd usermode helper: %s", payload)
2236 # compute the DRBD minors
2238 for minor, instance in drbd_map[node].items():
2239 test = instance not in instanceinfo
2240 _ErrorIf(test, self.ECLUSTERCFG, None,
2241 "ghost instance '%s' in temporary DRBD map", instance)
2242 # ghost instance should not be running, but otherwise we
2243 # don't give double warnings (both ghost instance and
2244 # unallocated minor in use)
2246 node_drbd[minor] = (instance, False)
2248 instance = instanceinfo[instance]
2249 node_drbd[minor] = (instance.name, instance.admin_up)
2251 # and now check them
2252 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2253 test = not isinstance(used_minors, (tuple, list))
2254 _ErrorIf(test, self.ENODEDRBD, node,
2255 "cannot parse drbd status file: %s", str(used_minors))
2257 # we cannot check drbd status
2260 for minor, (iname, must_exist) in node_drbd.items():
2261 test = minor not in used_minors and must_exist
2262 _ErrorIf(test, self.ENODEDRBD, node,
2263 "drbd minor %d of instance %s is not active", minor, iname)
2264 for minor in used_minors:
2265 test = minor not in node_drbd
2266 _ErrorIf(test, self.ENODEDRBD, node,
2267 "unallocated drbd minor %d is in use", minor)
2269 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2270 """Builds the node OS structures.
2272 @type ninfo: L{objects.Node}
2273 @param ninfo: the node to check
2274 @param nresult: the remote results for the node
2275 @param nimg: the node image object
2279 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2281 remote_os = nresult.get(constants.NV_OSLIST, None)
2282 test = (not isinstance(remote_os, list) or
2283 not compat.all(isinstance(v, list) and len(v) == 7
2284 for v in remote_os))
2286 _ErrorIf(test, self.ENODEOS, node,
2287 "node hasn't returned valid OS data")
2296 for (name, os_path, status, diagnose,
2297 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2299 if name not in os_dict:
2302 # parameters is a list of lists instead of list of tuples due to
2303 # JSON lacking a real tuple type, fix it:
2304 parameters = [tuple(v) for v in parameters]
2305 os_dict[name].append((os_path, status, diagnose,
2306 set(variants), set(parameters), set(api_ver)))
2308 nimg.oslist = os_dict
2310 def _VerifyNodeOS(self, ninfo, nimg, base):
2311 """Verifies the node OS list.
2313 @type ninfo: L{objects.Node}
2314 @param ninfo: the node to check
2315 @param nimg: the node image object
2316 @param base: the 'template' node we match against (e.g. from the master)
2320 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2322 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2324 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2325 for os_name, os_data in nimg.oslist.items():
2326 assert os_data, "Empty OS status for OS %s?!" % os_name
2327 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2328 _ErrorIf(not f_status, self.ENODEOS, node,
2329 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2330 _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
2331 "OS '%s' has multiple entries (first one shadows the rest): %s",
2332 os_name, utils.CommaJoin([v[0] for v in os_data]))
2333 # comparisons with the 'base' image
2334 test = os_name not in base.oslist
2335 _ErrorIf(test, self.ENODEOS, node,
2336 "Extra OS %s not present on reference node (%s)",
2340 assert base.oslist[os_name], "Base node has empty OS status?"
2341 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2343 # base OS is invalid, skipping
2345 for kind, a, b in [("API version", f_api, b_api),
2346 ("variants list", f_var, b_var),
2347 ("parameters", beautify_params(f_param),
2348 beautify_params(b_param))]:
2349 _ErrorIf(a != b, self.ENODEOS, node,
2350 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2351 kind, os_name, base.name,
2352 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2354 # check any missing OSes
2355 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2356 _ErrorIf(missing, self.ENODEOS, node,
2357 "OSes present on reference node %s but missing on this node: %s",
2358 base.name, utils.CommaJoin(missing))
2360 def _VerifyOob(self, ninfo, nresult):
2361 """Verifies out of band functionality of a node.
2363 @type ninfo: L{objects.Node}
2364 @param ninfo: the node to check
2365 @param nresult: the remote results for the node
2369 # We just have to verify the paths on master and/or master candidates
2370 # as the oob helper is invoked on the master
2371 if ((ninfo.master_candidate or ninfo.master_capable) and
2372 constants.NV_OOB_PATHS in nresult):
2373 for path_result in nresult[constants.NV_OOB_PATHS]:
2374 self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result)
2376 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2377 """Verifies and updates the node volume data.
2379 This function will update a L{NodeImage}'s internal structures
2380 with data from the remote call.
2382 @type ninfo: L{objects.Node}
2383 @param ninfo: the node to check
2384 @param nresult: the remote results for the node
2385 @param nimg: the node image object
2386 @param vg_name: the configured VG name
2390 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2392 nimg.lvm_fail = True
2393 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2396 elif isinstance(lvdata, basestring):
2397 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
2398 utils.SafeEncode(lvdata))
2399 elif not isinstance(lvdata, dict):
2400 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
2402 nimg.volumes = lvdata
2403 nimg.lvm_fail = False
2405 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2406 """Verifies and updates the node instance list.
2408 If the listing was successful, then updates this node's instance
2409     list. Otherwise, it marks the RPC call as failed for the instance list key.
2412 @type ninfo: L{objects.Node}
2413 @param ninfo: the node to check
2414 @param nresult: the remote results for the node
2415 @param nimg: the node image object
2418 idata = nresult.get(constants.NV_INSTANCELIST, None)
2419 test = not isinstance(idata, list)
2420 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
2421 " (instancelist): %s", utils.SafeEncode(str(idata)))
2423 nimg.hyp_fail = True
2425 nimg.instances = idata
2427 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2428 """Verifies and computes a node information map
2430 @type ninfo: L{objects.Node}
2431 @param ninfo: the node to check
2432 @param nresult: the remote results for the node
2433 @param nimg: the node image object
2434 @param vg_name: the configured VG name
2438 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2440 # try to read free memory (from the hypervisor)
2441 hv_info = nresult.get(constants.NV_HVINFO, None)
2442 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2443 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
2446 nimg.mfree = int(hv_info["memory_free"])
2447 except (ValueError, TypeError):
2448 _ErrorIf(True, self.ENODERPC, node,
2449 "node returned invalid nodeinfo, check hypervisor")
2451 # FIXME: devise a free space model for file based instances as well
2452 if vg_name is not None:
2453 test = (constants.NV_VGLIST not in nresult or
2454 vg_name not in nresult[constants.NV_VGLIST])
2455 _ErrorIf(test, self.ENODELVM, node,
2456 "node didn't return data for the volume group '%s'"
2457 " - it is either missing or broken", vg_name)
2460 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2461 except (ValueError, TypeError):
2462 _ErrorIf(True, self.ENODERPC, node,
2463 "node returned invalid LVM info, check LVM status")
2465 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2466 """Gets per-disk status information for all instances.
2468 @type nodelist: list of strings
2469 @param nodelist: Node names
2470 @type node_image: dict of (name, L{objects.Node})
2471 @param node_image: Node objects
2472 @type instanceinfo: dict of (name, L{objects.Instance})
2473 @param instanceinfo: Instance objects
2474     @rtype: {instance: {node: [(success, payload)]}}
2475 @return: a dictionary of per-instance dictionaries with nodes as
2476 keys and disk information as values; the disk information is a
2477 list of tuples (success, payload)
2480 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2483 node_disks_devonly = {}
2484 diskless_instances = set()
2485 diskless = constants.DT_DISKLESS
2487 for nname in nodelist:
2488 node_instances = list(itertools.chain(node_image[nname].pinst,
2489 node_image[nname].sinst))
2490 diskless_instances.update(inst for inst in node_instances
2491 if instanceinfo[inst].disk_template == diskless)
2492 disks = [(inst, disk)
2493 for inst in node_instances
2494 for disk in instanceinfo[inst].disks]
2497 # No need to collect data
2500 node_disks[nname] = disks
2502 # Creating copies as SetDiskID below will modify the objects and that can
2503 # lead to incorrect data returned from nodes
2504 devonly = [dev.Copy() for (_, dev) in disks]
2507 self.cfg.SetDiskID(dev, nname)
2509 node_disks_devonly[nname] = devonly
2511 assert len(node_disks) == len(node_disks_devonly)
2513 # Collect data from all nodes with disks
2514 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2517 assert len(result) == len(node_disks)
2521 for (nname, nres) in result.items():
2522 disks = node_disks[nname]
2525 # No data from this node
2526 data = len(disks) * [(False, "node offline")]
2529 _ErrorIf(msg, self.ENODERPC, nname,
2530 "while getting disk information: %s", msg)
2532 # No data from this node
2533 data = len(disks) * [(False, msg)]
2536 for idx, i in enumerate(nres.payload):
2537 if isinstance(i, (tuple, list)) and len(i) == 2:
2540 logging.warning("Invalid result from node %s, entry %d: %s",
2542 data.append((False, "Invalid result from the remote node"))
2544 for ((inst, _), status) in zip(disks, data):
2545 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2547 # Add empty entries for diskless instances.
2548 for inst in diskless_instances:
2549 assert inst not in instdisk
2552 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2553 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2554 compat.all(isinstance(s, (tuple, list)) and
2555 len(s) == 2 for s in statuses)
2556 for inst, nnames in instdisk.items()
2557 for nname, statuses in nnames.items())
2558 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
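  # Illustrative note (not part of this LU): with hypothetical names, the
  # structure built above looks like
  #
  #   instdisk = {
  #     "instance1": {"node1": [(True, status0), (True, status1)]},
  #     "instance2": {"node2": [(False, "node offline")]},
  #     "diskless-instance": {},
  #   }
  #
  # i.e. instance name -> node name -> one (success, payload) pair per disk.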
2563 def _SshNodeSelector(group_uuid, all_nodes):
2564 """Create endless iterators for all potential SSH check hosts.
2567 nodes = [node for node in all_nodes
2568 if (node.group != group_uuid and
2570 keyfunc = operator.attrgetter("group")
2572 return map(itertools.cycle,
2573 [sorted(map(operator.attrgetter("name"), names))
2574 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2578 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2579 """Choose which nodes should talk to which other nodes.
2581     We will make nodes contact all nodes in their group, and one node from every other node group.
2584 @warning: This algorithm has a known issue if one node group is much
2585 smaller than others (e.g. just one node). In such a case all other
2586 nodes will talk to the single node.
2589 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2590 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2592 return (online_nodes,
2593 dict((name, sorted([i.next() for i in sel]))
2594 for name in online_nodes))
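  # Illustrative example (not part of this LU), with hypothetical node names:
  # each online node of this group is paired with one node from every other
  # group by drawing from the cycling per-group iterators, so the cross-group
  # SSH checks are spread evenly.
  #
  #   sel = [itertools.cycle(["g2-n1", "g2-n2"]), itertools.cycle(["g3-n1"])]
  #   targets = dict((name, sorted([i.next() for i in sel]))
  #                  for name in ["n1", "n2", "n3"])
  #   # n1 -> ["g2-n1", "g3-n1"], n2 -> ["g2-n2", "g3-n1"],
  #   # n3 -> ["g2-n1", "g3-n1"]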
2596 def BuildHooksEnv(self):
2599     Cluster-Verify hooks are run only in the post phase; if they fail, their
2600     output is logged in the verify output and the verification fails.
2604 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2607 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2608 for node in self.my_node_info.values())
2612 def BuildHooksNodes(self):
2613 """Build hooks nodes.
2616 return ([], self.my_node_names)
2618 def Exec(self, feedback_fn):
2619 """Verify integrity of the node group, performing various test on nodes.
2622 # This method has too many local variables. pylint: disable=R0914
2623 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2625 if not self.my_node_names:
2627 feedback_fn("* Empty node group, skipping verification")
2631 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2632 verbose = self.op.verbose
2633 self._feedback_fn = feedback_fn
2635 vg_name = self.cfg.GetVGName()
2636 drbd_helper = self.cfg.GetDRBDHelper()
2637 cluster = self.cfg.GetClusterInfo()
2638 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2639 hypervisors = cluster.enabled_hypervisors
2640 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2642 i_non_redundant = [] # Non redundant instances
2643 i_non_a_balanced = [] # Non auto-balanced instances
2644 n_offline = 0 # Count of offline nodes
2645 n_drained = 0 # Count of nodes being drained
2646 node_vol_should = {}
2648 # FIXME: verify OS list
2651 filemap = _ComputeAncillaryFiles(cluster, False)
2653 # do local checksums
2654 master_node = self.master_node = self.cfg.GetMasterNode()
2655 master_ip = self.cfg.GetMasterIP()
2657 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2659 node_verify_param = {
2660 constants.NV_FILELIST:
2661 utils.UniqueSequence(filename
2662 for files in filemap
2663 for filename in files),
2664 constants.NV_NODELIST:
2665 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2666 self.all_node_info.values()),
2667 constants.NV_HYPERVISOR: hypervisors,
2668 constants.NV_HVPARAMS:
2669 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2670 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2671 for node in node_data_list
2672 if not node.offline],
2673 constants.NV_INSTANCELIST: hypervisors,
2674 constants.NV_VERSION: None,
2675 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2676 constants.NV_NODESETUP: None,
2677 constants.NV_TIME: None,
2678 constants.NV_MASTERIP: (master_node, master_ip),
2679 constants.NV_OSLIST: None,
2680 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2683 if vg_name is not None:
2684 node_verify_param[constants.NV_VGLIST] = None
2685 node_verify_param[constants.NV_LVLIST] = vg_name
2686 node_verify_param[constants.NV_PVLIST] = [vg_name]
2687 node_verify_param[constants.NV_DRBDLIST] = None
2690 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2693 # FIXME: this needs to be changed per node-group, not cluster-wide
2695 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2696 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2697 bridges.add(default_nicpp[constants.NIC_LINK])
2698 for instance in self.my_inst_info.values():
2699 for nic in instance.nics:
2700 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2701 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2702 bridges.add(full_nic[constants.NIC_LINK])
2705 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2707 # Build our expected cluster state
2708 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2710 vm_capable=node.vm_capable))
2711 for node in node_data_list)
2715 for node in self.all_node_info.values():
2716 path = _SupportsOob(self.cfg, node)
2717 if path and path not in oob_paths:
2718 oob_paths.append(path)
2721 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2723 for instance in self.my_inst_names:
2724 inst_config = self.my_inst_info[instance]
2726 for nname in inst_config.all_nodes:
2727 if nname not in node_image:
2728 gnode = self.NodeImage(name=nname)
2729 gnode.ghost = (nname not in self.all_node_info)
2730 node_image[nname] = gnode
2732 inst_config.MapLVsByNode(node_vol_should)
2734 pnode = inst_config.primary_node
2735 node_image[pnode].pinst.append(instance)
2737 for snode in inst_config.secondary_nodes:
2738 nimg = node_image[snode]
2739 nimg.sinst.append(instance)
2740 if pnode not in nimg.sbp:
2741 nimg.sbp[pnode] = []
2742 nimg.sbp[pnode].append(instance)
2744 # At this point, we have the in-memory data structures complete,
2745 # except for the runtime information, which we'll gather next
2747 # Due to the way our RPC system works, exact response times cannot be
2748 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2749     # time before and after executing the request, we can at least have a time window.
2751 nvinfo_starttime = time.time()
2752 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2754 self.cfg.GetClusterName())
2755 nvinfo_endtime = time.time()
2757 if self.extra_lv_nodes and vg_name is not None:
2759 self.rpc.call_node_verify(self.extra_lv_nodes,
2760 {constants.NV_LVLIST: vg_name},
2761 self.cfg.GetClusterName())
2763 extra_lv_nvinfo = {}
2765 all_drbd_map = self.cfg.ComputeDRBDMap()
2767 feedback_fn("* Gathering disk information (%s nodes)" %
2768 len(self.my_node_names))
2769 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2772 feedback_fn("* Verifying configuration file consistency")
2774 # If not all nodes are being checked, we need to make sure the master node
2775 # and a non-checked vm_capable node are in the list.
2776 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2778 vf_nvinfo = all_nvinfo.copy()
2779 vf_node_info = list(self.my_node_info.values())
2780 additional_nodes = []
2781 if master_node not in self.my_node_info:
2782 additional_nodes.append(master_node)
2783 vf_node_info.append(self.all_node_info[master_node])
2784 # Add the first vm_capable node we find which is not included
2785 for node in absent_nodes:
2786 nodeinfo = self.all_node_info[node]
2787 if nodeinfo.vm_capable and not nodeinfo.offline:
2788 additional_nodes.append(node)
2789 vf_node_info.append(self.all_node_info[node])
2791 key = constants.NV_FILELIST
2792 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2793 {key: node_verify_param[key]},
2794 self.cfg.GetClusterName()))
2796 vf_nvinfo = all_nvinfo
2797 vf_node_info = self.my_node_info.values()
2799 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2801 feedback_fn("* Verifying node status")
2805 for node_i in node_data_list:
2807 nimg = node_image[node]
2811 feedback_fn("* Skipping offline node %s" % (node,))
2815 if node == master_node:
2817 elif node_i.master_candidate:
2818 ntype = "master candidate"
2819 elif node_i.drained:
2825 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2827 msg = all_nvinfo[node].fail_msg
2828 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2830 nimg.rpc_fail = True
2833 nresult = all_nvinfo[node].payload
2835 nimg.call_ok = self._VerifyNode(node_i, nresult)
2836 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2837 self._VerifyNodeNetwork(node_i, nresult)
2838 self._VerifyOob(node_i, nresult)
2841 self._VerifyNodeLVM(node_i, nresult, vg_name)
2842 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2845 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2846 self._UpdateNodeInstances(node_i, nresult, nimg)
2847 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2848 self._UpdateNodeOS(node_i, nresult, nimg)
2850 if not nimg.os_fail:
2851 if refos_img is None:
2853 self._VerifyNodeOS(node_i, nimg, refos_img)
2854 self._VerifyNodeBridges(node_i, nresult, bridges)
2856       # Check whether all running instances are primary for the node. (This
2857 # can no longer be done from _VerifyInstance below, since some of the
2858 # wrong instances could be from other node groups.)
2859 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2861 for inst in non_primary_inst:
2862 test = inst in self.all_inst_info
2863 _ErrorIf(test, self.EINSTANCEWRONGNODE, inst,
2864 "instance should not run on node %s", node_i.name)
2865 _ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name,
2866 "node is running unknown instance %s", inst)
2868 for node, result in extra_lv_nvinfo.items():
2869 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2870 node_image[node], vg_name)
2872 feedback_fn("* Verifying instance status")
2873 for instance in self.my_inst_names:
2875 feedback_fn("* Verifying instance %s" % instance)
2876 inst_config = self.my_inst_info[instance]
2877 self._VerifyInstance(instance, inst_config, node_image,
2879 inst_nodes_offline = []
2881 pnode = inst_config.primary_node
2882 pnode_img = node_image[pnode]
2883 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2884 self.ENODERPC, pnode, "instance %s, connection to"
2885 " primary node failed", instance)
2887 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2888 self.EINSTANCEBADNODE, instance,
2889 "instance is marked as running and lives on offline node %s",
2890 inst_config.primary_node)
2892 # If the instance is non-redundant we cannot survive losing its primary
2893 # node, so we are not N+1 compliant. On the other hand we have no disk
2894       # templates with more than one secondary so that situation is not well handled.
2896 # FIXME: does not support file-backed instances
2897 if not inst_config.secondary_nodes:
2898 i_non_redundant.append(instance)
2900 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2901 instance, "instance has multiple secondary nodes: %s",
2902 utils.CommaJoin(inst_config.secondary_nodes),
2903 code=self.ETYPE_WARNING)
2905 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2906 pnode = inst_config.primary_node
2907 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2908 instance_groups = {}
2910 for node in instance_nodes:
2911 instance_groups.setdefault(self.all_node_info[node].group,
2915 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2916 # Sort so that we always list the primary node first.
2917 for group, nodes in sorted(instance_groups.items(),
2918 key=lambda (_, nodes): pnode in nodes,
2921 self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS,
2922 instance, "instance has primary and secondary nodes in"
2923 " different groups: %s", utils.CommaJoin(pretty_list),
2924 code=self.ETYPE_WARNING)
2926 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2927 i_non_a_balanced.append(instance)
2929 for snode in inst_config.secondary_nodes:
2930 s_img = node_image[snode]
2931 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2932 "instance %s, connection to secondary node failed", instance)
2935 inst_nodes_offline.append(snode)
2937 # warn that the instance lives on offline nodes
2938 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2939 "instance has offline secondary node(s) %s",
2940 utils.CommaJoin(inst_nodes_offline))
2941 # ... or ghost/non-vm_capable nodes
2942 for node in inst_config.all_nodes:
2943 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2944 "instance lives on ghost node %s", node)
2945 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2946 instance, "instance lives on non-vm_capable node %s", node)
2948 feedback_fn("* Verifying orphan volumes")
2949 reserved = utils.FieldSet(*cluster.reserved_lvs)
2951 # We will get spurious "unknown volume" warnings if any node of this group
2952 # is secondary for an instance whose primary is in another group. To avoid
2953 # them, we find these instances and add their volumes to node_vol_should.
2954 for inst in self.all_inst_info.values():
2955 for secondary in inst.secondary_nodes:
2956 if (secondary in self.my_node_info
2957 and inst.name not in self.my_inst_info):
2958 inst.MapLVsByNode(node_vol_should)
2961 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2963 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2964 feedback_fn("* Verifying N+1 Memory redundancy")
2965 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2967 feedback_fn("* Other Notes")
2969 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2970 % len(i_non_redundant))
2972 if i_non_a_balanced:
2973 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2974 % len(i_non_a_balanced))
2977 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2980 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2984 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2985 """Analyze the post-hooks' result
2987 This method analyses the hook result, handles it, and sends some
2988 nicely-formatted feedback back to the user.
2990 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2991 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2992 @param hooks_results: the results of the multi-node hooks rpc call
2993     @param feedback_fn: function used to send feedback back to the caller
2994 @param lu_result: previous Exec result
2995 @return: the new Exec result, based on the previous result
2999 # We only really run POST phase hooks, only for non-empty groups,
3000 # and are only interested in their results
3001 if not self.my_node_names:
3004 elif phase == constants.HOOKS_PHASE_POST:
3005 # Used to change hooks' output to proper indentation
3006 feedback_fn("* Hooks Results")
3007 assert hooks_results, "invalid result from hooks"
3009 for node_name in hooks_results:
3010 res = hooks_results[node_name]
3012 test = msg and not res.offline
3013 self._ErrorIf(test, self.ENODEHOOKS, node_name,
3014 "Communication failure in hooks execution: %s", msg)
3015 if res.offline or msg:
3016 # No need to investigate payload if node is offline or gave
3019 for script, hkr, output in res.payload:
3020 test = hkr == constants.HKR_FAIL
3021 self._ErrorIf(test, self.ENODEHOOKS, node_name,
3022 "Script %s failed, output:", script)
3024 output = self._HOOKS_INDENT_RE.sub(" ", output)
3025 feedback_fn("%s" % output)
3031 class LUClusterVerifyDisks(NoHooksLU):
3032 """Verifies the cluster disks status.
3037 def ExpandNames(self):
3038 self.share_locks = _ShareAll()
3039 self.needed_locks = {
3040 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3043 def Exec(self, feedback_fn):
3044 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3046 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3047 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3048 for group in group_names])
3051 class LUGroupVerifyDisks(NoHooksLU):
3052 """Verifies the status of all disks in a node group.
3057 def ExpandNames(self):
3058 # Raises errors.OpPrereqError on its own if group can't be found
3059 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3061 self.share_locks = _ShareAll()
3062 self.needed_locks = {
3063 locking.LEVEL_INSTANCE: [],
3064 locking.LEVEL_NODEGROUP: [],
3065 locking.LEVEL_NODE: [],
3068 def DeclareLocks(self, level):
3069 if level == locking.LEVEL_INSTANCE:
3070 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3072 # Lock instances optimistically, needs verification once node and group
3073 # locks have been acquired
3074 self.needed_locks[locking.LEVEL_INSTANCE] = \
3075 self.cfg.GetNodeGroupInstances(self.group_uuid)
3077 elif level == locking.LEVEL_NODEGROUP:
3078 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3080 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3081 set([self.group_uuid] +
3082 # Lock all groups used by instances optimistically; this requires
3083 # going via the node before it's locked, requiring verification
3086 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3087 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3089 elif level == locking.LEVEL_NODE:
3090       # This will only lock the nodes in the group to be verified which contain actual instances
3092 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3093 self._LockInstancesNodes()
3095 # Lock all nodes in group to be verified
3096 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3097 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3098 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3100 def CheckPrereq(self):
3101 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3102 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3103 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3105 assert self.group_uuid in owned_groups
3107 # Check if locked instances are still correct
3108 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3110 # Get instance information
3111 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3113 # Check if node groups for locked instances are still correct
3114 for (instance_name, inst) in self.instances.items():
3115 assert owned_nodes.issuperset(inst.all_nodes), \
3116 "Instance %s's nodes changed while we kept the lock" % instance_name
3118 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3121 assert self.group_uuid in inst_groups, \
3122 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3124 def Exec(self, feedback_fn):
3125 """Verify integrity of cluster disks.
3127 @rtype: tuple of three items
3128 @return: a tuple of (dict of node-to-node_error, list of instances
3129 which need activate-disks, dict of instance: (node, volume) for
3134 res_instances = set()
3137 nv_dict = _MapInstanceDisksToNodes([inst
3138 for inst in self.instances.values()
3142 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3143 set(self.cfg.GetVmCapableNodeList()))
3145 node_lvs = self.rpc.call_lv_list(nodes, [])
3147 for (node, node_res) in node_lvs.items():
3148 if node_res.offline:
3151 msg = node_res.fail_msg
3153 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3154 res_nodes[node] = msg
3157 for lv_name, (_, _, lv_online) in node_res.payload.items():
3158 inst = nv_dict.pop((node, lv_name), None)
3159 if not (lv_online or inst is None):
3160 res_instances.add(inst)
3162 # any leftover items in nv_dict are missing LVs, let's arrange the data
3164 for key, inst in nv_dict.iteritems():
3165 res_missing.setdefault(inst, []).append(list(key))
3167 return (res_nodes, list(res_instances), res_missing)
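# Illustrative note (not part of this LU): the tuple returned by Exec above
# could be consumed as follows (hypothetical variable names):
#
#   (node_errors, offline_disk_instances, missing_lvs) = result
#   for node, err in node_errors.items():
#     print "node %s could not be queried: %s" % (node, err)
#   for inst in offline_disk_instances:
#     print "instance %s needs activate-disks" % inst
#   for inst, vols in missing_lvs.items():
#     print "instance %s is missing LVs: %s" % (inst, vols)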
3170 class LUClusterRepairDiskSizes(NoHooksLU):
3171 """Verifies the cluster disks sizes.
3176 def ExpandNames(self):
3177 if self.op.instances:
3178 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3179 self.needed_locks = {
3180 locking.LEVEL_NODE: [],
3181 locking.LEVEL_INSTANCE: self.wanted_names,
3183 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3185 self.wanted_names = None
3186 self.needed_locks = {
3187 locking.LEVEL_NODE: locking.ALL_SET,
3188 locking.LEVEL_INSTANCE: locking.ALL_SET,
3190 self.share_locks = {
3191 locking.LEVEL_NODE: 1,
3192 locking.LEVEL_INSTANCE: 0,
3195 def DeclareLocks(self, level):
3196 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3197 self._LockInstancesNodes(primary_only=True)
3199 def CheckPrereq(self):
3200 """Check prerequisites.
3202 This only checks the optional instance list against the existing names.
3205 if self.wanted_names is None:
3206 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3208 self.wanted_instances = \
3209 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3211 def _EnsureChildSizes(self, disk):
3212 """Ensure children of the disk have the needed disk size.
3214 This is valid mainly for DRBD8 and fixes an issue where the
3215 children have smaller disk size.
3217 @param disk: an L{ganeti.objects.Disk} object
3220 if disk.dev_type == constants.LD_DRBD8:
3221 assert disk.children, "Empty children for DRBD8?"
3222 fchild = disk.children[0]
3223 mismatch = fchild.size < disk.size
3225 self.LogInfo("Child disk has size %d, parent %d, fixing",
3226 fchild.size, disk.size)
3227 fchild.size = disk.size
3229 # and we recurse on this child only, not on the metadev
3230 return self._EnsureChildSizes(fchild) or mismatch
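  # Illustrative example (not part of this LU), with hypothetical sizes: for a
  # DRBD8 disk of size 1024 whose data child reports size 1000, the call above
  # sets the child's size to 1024 in place and returns True, so the caller
  # knows the configuration needs to be written out.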
3234 def Exec(self, feedback_fn):
3235 """Verify the size of cluster disks.
3238 # TODO: check child disks too
3239 # TODO: check differences in size between primary/secondary nodes
3241 for instance in self.wanted_instances:
3242 pnode = instance.primary_node
3243 if pnode not in per_node_disks:
3244 per_node_disks[pnode] = []
3245 for idx, disk in enumerate(instance.disks):
3246 per_node_disks[pnode].append((instance, idx, disk))
3249 for node, dskl in per_node_disks.items():
3250 newl = [v[2].Copy() for v in dskl]
3252 self.cfg.SetDiskID(dsk, node)
3253 result = self.rpc.call_blockdev_getsize(node, newl)
3255 self.LogWarning("Failure in blockdev_getsize call to node"
3256 " %s, ignoring", node)
3258 if len(result.payload) != len(dskl):
3259 logging.warning("Invalid result from node %s: len(dksl)=%d,"
3260 " result.payload=%s", node, len(dskl), result.payload)
3261 self.LogWarning("Invalid result from node %s, ignoring node results",
3264 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3266 self.LogWarning("Disk %d of instance %s did not return size"
3267 " information, ignoring", idx, instance.name)
3269 if not isinstance(size, (int, long)):
3270 self.LogWarning("Disk %d of instance %s did not return valid"
3271 " size information, ignoring", idx, instance.name)
3274 if size != disk.size:
3275 self.LogInfo("Disk %d of instance %s has mismatched size,"
3276 " correcting: recorded %d, actual %d", idx,
3277 instance.name, disk.size, size)
3279 self.cfg.Update(instance, feedback_fn)
3280 changed.append((instance.name, idx, size))
3281 if self._EnsureChildSizes(disk):
3282 self.cfg.Update(instance, feedback_fn)
3283 changed.append((instance.name, idx, disk.size))
3287 class LUClusterRename(LogicalUnit):
3288 """Rename the cluster.
3291 HPATH = "cluster-rename"
3292 HTYPE = constants.HTYPE_CLUSTER
3294 def BuildHooksEnv(self):
3299 "OP_TARGET": self.cfg.GetClusterName(),
3300 "NEW_NAME": self.op.name,
3303 def BuildHooksNodes(self):
3304 """Build hooks nodes.
3307 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3309 def CheckPrereq(self):
3310 """Verify that the passed name is a valid one.
3313 hostname = netutils.GetHostname(name=self.op.name,
3314 family=self.cfg.GetPrimaryIPFamily())
3316 new_name = hostname.name
3317 self.ip = new_ip = hostname.ip
3318 old_name = self.cfg.GetClusterName()
3319 old_ip = self.cfg.GetMasterIP()
3320 if new_name == old_name and new_ip == old_ip:
3321 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3322 " cluster has changed",
3324 if new_ip != old_ip:
3325 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3326 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3327 " reachable on the network" %
3328 new_ip, errors.ECODE_NOTUNIQUE)
3330 self.op.name = new_name
3332 def Exec(self, feedback_fn):
3333 """Rename the cluster.
3336 clustername = self.op.name
3339 # shutdown the master IP
3340 master = self.cfg.GetMasterNode()
3341 result = self.rpc.call_node_deactivate_master_ip(master)
3342 result.Raise("Could not disable the master role")
3345 cluster = self.cfg.GetClusterInfo()
3346 cluster.cluster_name = clustername
3347 cluster.master_ip = ip
3348 self.cfg.Update(cluster, feedback_fn)
3350 # update the known hosts file
3351 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3352 node_list = self.cfg.GetOnlineNodeList()
3354 node_list.remove(master)
3357 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3359 result = self.rpc.call_node_activate_master_ip(master)
3360 msg = result.fail_msg
3362 self.LogWarning("Could not re-enable the master role on"
3363 " the master, please restart manually: %s", msg)
3368 class LUClusterSetParams(LogicalUnit):
3369 """Change the parameters of the cluster.
3372 HPATH = "cluster-modify"
3373 HTYPE = constants.HTYPE_CLUSTER
3376 def CheckArguments(self):
3380 if self.op.uid_pool:
3381 uidpool.CheckUidPool(self.op.uid_pool)
3383 if self.op.add_uids:
3384 uidpool.CheckUidPool(self.op.add_uids)
3386 if self.op.remove_uids:
3387 uidpool.CheckUidPool(self.op.remove_uids)
3389 def ExpandNames(self):
3390 # FIXME: in the future maybe other cluster params won't require checking on
3391 # all nodes to be modified.
3392 self.needed_locks = {
3393 locking.LEVEL_NODE: locking.ALL_SET,
3395 self.share_locks[locking.LEVEL_NODE] = 1
3397 def BuildHooksEnv(self):
3402 "OP_TARGET": self.cfg.GetClusterName(),
3403 "NEW_VG_NAME": self.op.vg_name,
3406 def BuildHooksNodes(self):
3407 """Build hooks nodes.
3410 mn = self.cfg.GetMasterNode()
3413 def CheckPrereq(self):
3414 """Check prerequisites.
3416     This checks that the given parameters don't conflict with each other and
3417     that the given volume group is valid.
3420 if self.op.vg_name is not None and not self.op.vg_name:
3421 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3422 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3423 " instances exist", errors.ECODE_INVAL)
3425 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3426 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3427 raise errors.OpPrereqError("Cannot disable drbd helper while"
3428 " drbd-based instances exist",
3431 node_list = self.owned_locks(locking.LEVEL_NODE)
3433 # if vg_name not None, checks given volume group on all nodes
3435 vglist = self.rpc.call_vg_list(node_list)
3436 for node in node_list:
3437 msg = vglist[node].fail_msg
3439 # ignoring down node
3440 self.LogWarning("Error while gathering data on node %s"
3441 " (ignoring node): %s", node, msg)
3443 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3445 constants.MIN_VG_SIZE)
3447 raise errors.OpPrereqError("Error on node '%s': %s" %
3448 (node, vgstatus), errors.ECODE_ENVIRON)
3450 if self.op.drbd_helper:
3451 # checks given drbd helper on all nodes
3452 helpers = self.rpc.call_drbd_helper(node_list)
3453 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3455 self.LogInfo("Not checking drbd helper on offline node %s", node)
3457 msg = helpers[node].fail_msg
3459 raise errors.OpPrereqError("Error checking drbd helper on node"
3460 " '%s': %s" % (node, msg),
3461 errors.ECODE_ENVIRON)
3462 node_helper = helpers[node].payload
3463 if node_helper != self.op.drbd_helper:
3464 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3465 (node, node_helper), errors.ECODE_ENVIRON)
3467 self.cluster = cluster = self.cfg.GetClusterInfo()
3468 # validate params changes
3469 if self.op.beparams:
3470 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3471 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3473 if self.op.ndparams:
3474 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3475 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3477 # TODO: we need a more general way to handle resetting
3478 # cluster-level parameters to default values
3479 if self.new_ndparams["oob_program"] == "":
3480 self.new_ndparams["oob_program"] = \
3481 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3483 if self.op.nicparams:
3484 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3485 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3486 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3489 # check all instances for consistency
3490 for instance in self.cfg.GetAllInstancesInfo().values():
3491 for nic_idx, nic in enumerate(instance.nics):
3492 params_copy = copy.deepcopy(nic.nicparams)
3493 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3495 # check parameter syntax
3497 objects.NIC.CheckParameterSyntax(params_filled)
3498 except errors.ConfigurationError, err:
3499 nic_errors.append("Instance %s, nic/%d: %s" %
3500 (instance.name, nic_idx, err))
3502 # if we're moving instances to routed, check that they have an ip
3503 target_mode = params_filled[constants.NIC_MODE]
3504 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3505 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3506 " address" % (instance.name, nic_idx))
3508 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3509 "\n".join(nic_errors))
3511 # hypervisor list/parameters
3512 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3513 if self.op.hvparams:
3514 for hv_name, hv_dict in self.op.hvparams.items():
3515 if hv_name not in self.new_hvparams:
3516 self.new_hvparams[hv_name] = hv_dict
3518 self.new_hvparams[hv_name].update(hv_dict)
3520 # os hypervisor parameters
3521 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3523 for os_name, hvs in self.op.os_hvp.items():
3524 if os_name not in self.new_os_hvp:
3525 self.new_os_hvp[os_name] = hvs
3527 for hv_name, hv_dict in hvs.items():
3528 if hv_name not in self.new_os_hvp[os_name]:
3529 self.new_os_hvp[os_name][hv_name] = hv_dict
3531 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3534 self.new_osp = objects.FillDict(cluster.osparams, {})
3535 if self.op.osparams:
3536 for os_name, osp in self.op.osparams.items():
3537 if os_name not in self.new_osp:
3538 self.new_osp[os_name] = {}
3540 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3543 if not self.new_osp[os_name]:
3544 # we removed all parameters
3545 del self.new_osp[os_name]
3547 # check the parameter validity (remote check)
3548 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3549 os_name, self.new_osp[os_name])
3551 # changes to the hypervisor list
3552 if self.op.enabled_hypervisors is not None:
3553 self.hv_list = self.op.enabled_hypervisors
3554 for hv in self.hv_list:
3555 # if the hypervisor doesn't already exist in the cluster
3556 # hvparams, we initialize it to empty, and then (in both
3557 # cases) we make sure to fill the defaults, as we might not
3558         # have a complete defaults list if the hypervisor wasn't enabled before.
3560 if hv not in new_hvp:
3562 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3563 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3565 self.hv_list = cluster.enabled_hypervisors
3567 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3568 # either the enabled list has changed, or the parameters have, validate
3569 for hv_name, hv_params in self.new_hvparams.items():
3570 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3571 (self.op.enabled_hypervisors and
3572 hv_name in self.op.enabled_hypervisors)):
3573 # either this is a new hypervisor, or its parameters have changed
3574 hv_class = hypervisor.GetHypervisor(hv_name)
3575 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3576 hv_class.CheckParameterSyntax(hv_params)
3577 _CheckHVParams(self, node_list, hv_name, hv_params)
3580 # no need to check any newly-enabled hypervisors, since the
3581 # defaults have already been checked in the above code-block
3582 for os_name, os_hvp in self.new_os_hvp.items():
3583 for hv_name, hv_params in os_hvp.items():
3584 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3585 # we need to fill in the new os_hvp on top of the actual hv_p
3586 cluster_defaults = self.new_hvparams.get(hv_name, {})
3587 new_osp = objects.FillDict(cluster_defaults, hv_params)
3588 hv_class = hypervisor.GetHypervisor(hv_name)
3589 hv_class.CheckParameterSyntax(new_osp)
3590 _CheckHVParams(self, node_list, hv_name, new_osp)
3592 if self.op.default_iallocator:
3593 alloc_script = utils.FindFile(self.op.default_iallocator,
3594 constants.IALLOCATOR_SEARCH_PATH,
3596 if alloc_script is None:
3597 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3598 " specified" % self.op.default_iallocator,
3601 def Exec(self, feedback_fn):
3602 """Change the parameters of the cluster.
3605 if self.op.vg_name is not None:
3606 new_volume = self.op.vg_name
3609 if new_volume != self.cfg.GetVGName():
3610 self.cfg.SetVGName(new_volume)
3612 feedback_fn("Cluster LVM configuration already in desired"
3613 " state, not changing")
3614 if self.op.drbd_helper is not None:
3615 new_helper = self.op.drbd_helper
3618 if new_helper != self.cfg.GetDRBDHelper():
3619 self.cfg.SetDRBDHelper(new_helper)
3621 feedback_fn("Cluster DRBD helper already in desired state,"
3623 if self.op.hvparams:
3624 self.cluster.hvparams = self.new_hvparams
3626 self.cluster.os_hvp = self.new_os_hvp
3627 if self.op.enabled_hypervisors is not None:
3628 self.cluster.hvparams = self.new_hvparams
3629 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3630 if self.op.beparams:
3631 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3632 if self.op.nicparams:
3633 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3634 if self.op.osparams:
3635 self.cluster.osparams = self.new_osp
3636 if self.op.ndparams:
3637 self.cluster.ndparams = self.new_ndparams
3639 if self.op.candidate_pool_size is not None:
3640 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3641 # we need to update the pool size here, otherwise the save will fail
3642 _AdjustCandidatePool(self, [])
3644 if self.op.maintain_node_health is not None:
3645 self.cluster.maintain_node_health = self.op.maintain_node_health
3647 if self.op.prealloc_wipe_disks is not None:
3648 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3650 if self.op.add_uids is not None:
3651 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3653 if self.op.remove_uids is not None:
3654 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3656 if self.op.uid_pool is not None:
3657 self.cluster.uid_pool = self.op.uid_pool
3659 if self.op.default_iallocator is not None:
3660 self.cluster.default_iallocator = self.op.default_iallocator
3662 if self.op.reserved_lvs is not None:
3663 self.cluster.reserved_lvs = self.op.reserved_lvs
3665 def helper_os(aname, mods, desc):
3667 lst = getattr(self.cluster, aname)
3668 for key, val in mods:
3669 if key == constants.DDM_ADD:
3671 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3674 elif key == constants.DDM_REMOVE:
3678 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3680 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3682 if self.op.hidden_os:
3683 helper_os("hidden_os", self.op.hidden_os, "hidden")
3685 if self.op.blacklisted_os:
3686 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3688 if self.op.master_netdev:
3689 master = self.cfg.GetMasterNode()
3690 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3691 self.cluster.master_netdev)
3692 result = self.rpc.call_node_deactivate_master_ip(master)
3693 result.Raise("Could not disable the master ip")
3694 feedback_fn("Changing master_netdev from %s to %s" %
3695 (self.cluster.master_netdev, self.op.master_netdev))
3696 self.cluster.master_netdev = self.op.master_netdev
3698 self.cfg.Update(self.cluster, feedback_fn)
3700 if self.op.master_netdev:
3701 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3702 self.op.master_netdev)
3703 result = self.rpc.call_node_activate_master_ip(master)
3705 self.LogWarning("Could not re-enable the master ip on"
3706 " the master, please restart manually: %s",
3710 def _UploadHelper(lu, nodes, fname):
3711 """Helper for uploading a file and showing warnings.
3714 if os.path.exists(fname):
3715 result = lu.rpc.call_upload_file(nodes, fname)
3716 for to_node, to_result in result.items():
3717 msg = to_result.fail_msg
3719 msg = ("Copy of file %s to node %s failed: %s" %
3720 (fname, to_node, msg))
3721 lu.proc.LogWarning(msg)
3724 def _ComputeAncillaryFiles(cluster, redist):
3725 """Compute files external to Ganeti which need to be consistent.
3727 @type redist: boolean
3728 @param redist: Whether to include files which need to be redistributed
3731 # Compute files for all nodes
3732 files_all = set([
3733 constants.SSH_KNOWN_HOSTS_FILE,
3734 constants.CONFD_HMAC_KEY,
3735 constants.CLUSTER_DOMAIN_SECRET_FILE,
3736 constants.RAPI_USERS_FILE,
3737 ])
3739 if not redist:
3740 files_all.update(constants.ALL_CERT_FILES)
3741 files_all.update(ssconf.SimpleStore().GetFileList())
3742 else:
3743 # we need to ship at least the RAPI certificate
3744 files_all.add(constants.RAPI_CERT_FILE)
3746 if cluster.modify_etc_hosts:
3747 files_all.add(constants.ETC_HOSTS)
3749 # Files which are optional, these must:
3750 # - be present in one other category as well
3751 # - either exist or not exist on all nodes of that category (mc, vm all)
3752 files_opt = set([
3753 constants.RAPI_USERS_FILE,
3754 ])
3756 # Files which should only be on master candidates
3757 files_mc = set()
3758 if not redist:
3759 files_mc.add(constants.CLUSTER_CONF_FILE)
3761 # Files which should only be on VM-capable nodes
3762 files_vm = set(filename
3763 for hv_name in cluster.enabled_hypervisors
3764 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3766 files_opt |= set(filename
3767 for hv_name in cluster.enabled_hypervisors
3768 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3770 # Filenames in each category must be unique
3771 all_files_set = files_all | files_mc | files_vm
3772 assert (len(all_files_set) ==
3773 sum(map(len, [files_all, files_mc, files_vm]))), \
3774 "Found file listed in more than one file list"
3776 # Optional files must be present in one other category
3777 assert all_files_set.issuperset(files_opt), \
3778 "Optional file not in a different required list"
3780 return (files_all, files_opt, files_mc, files_vm)
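# Editor's sketch (not part of the original module): the three required
# categories returned above are meant to be disjoint, and every optional
# file must also belong to one of them. A caller could re-check those
# invariants with a hypothetical helper like this one:
def _ExampleCheckAncillaryCategories(files_all, files_opt, files_mc, files_vm):
  """Illustrative re-statement of the assertions in _ComputeAncillaryFiles."""
  all_files = files_all | files_mc | files_vm
  disjoint = len(all_files) == len(files_all) + len(files_mc) + len(files_vm)
  # every optional file must also appear in one of the required categories
  return disjoint and all_files.issuperset(files_opt)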
3783 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3784 """Distribute additional files which are part of the cluster configuration.
3786 ConfigWriter takes care of distributing the config and ssconf files, but
3787 there are more files which should be distributed to all nodes. This function
3788 makes sure those are copied.
3790 @param lu: calling logical unit
3791 @param additional_nodes: list of nodes not in the config to distribute to
3792 @type additional_vm: boolean
3793 @param additional_vm: whether the additional nodes are vm-capable or not
3796 # Gather target nodes
3797 cluster = lu.cfg.GetClusterInfo()
3798 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3800 online_nodes = lu.cfg.GetOnlineNodeList()
3801 vm_nodes = lu.cfg.GetVmCapableNodeList()
3803 if additional_nodes is not None:
3804 online_nodes.extend(additional_nodes)
3806 vm_nodes.extend(additional_nodes)
3808 # Never distribute to master node
3809 for nodelist in [online_nodes, vm_nodes]:
3810 if master_info.name in nodelist:
3811 nodelist.remove(master_info.name)
3814 (files_all, _, files_mc, files_vm) = \
3815 _ComputeAncillaryFiles(cluster, True)
3817 # Never re-distribute configuration file from here
3818 assert not (constants.CLUSTER_CONF_FILE in files_all or
3819 constants.CLUSTER_CONF_FILE in files_vm)
3820 assert not files_mc, "Master candidates not handled in this function"
3822 filemap = [
3823 (online_nodes, files_all),
3824 (vm_nodes, files_vm),
3825 ]
3828 for (node_list, files) in filemap:
3829 for fname in files:
3830 _UploadHelper(lu, node_list, fname)
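# Editor's note (illustrative usage): most callers simply run
#   _RedistributeAncillaryFiles(lu)
# after a configuration change, while node addition passes the new node
# explicitly, e.g.
#   _RedistributeAncillaryFiles(lu, additional_nodes=[node], additional_vm=True)
# because that node is not yet part of the configuration.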
3833 class LUClusterRedistConf(NoHooksLU):
3834 """Force the redistribution of cluster configuration.
3836 This is a very simple LU.
3841 def ExpandNames(self):
3842 self.needed_locks = {
3843 locking.LEVEL_NODE: locking.ALL_SET,
3845 self.share_locks[locking.LEVEL_NODE] = 1
3847 def Exec(self, feedback_fn):
3848 """Redistribute the configuration.
3851 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3852 _RedistributeAncillaryFiles(self)
3855 class LUClusterActivateMasterIp(NoHooksLU):
3856 """Activate the master IP on the master node.
3859 def Exec(self, feedback_fn):
3860 """Activate the master IP.
3863 master = self.cfg.GetMasterNode()
3864 result = self.rpc.call_node_activate_master_ip(master)
3865 result.Raise("Could not activate the master IP")
3868 class LUClusterDeactivateMasterIp(NoHooksLU):
3869 """Deactivate the master IP on the master node.
3872 def Exec(self, feedback_fn):
3873 """Deactivate the master IP.
3876 master = self.cfg.GetMasterNode()
3877 result = self.rpc.call_node_deactivate_master_ip(master)
3878 result.Raise("Could not deactivate the master IP")
3881 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3882 """Sleep and poll for an instance's disk to sync.
3885 if not instance.disks or disks is not None and not disks:
3888 disks = _ExpandCheckDisks(instance, disks)
3891 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3893 node = instance.primary_node
3896 lu.cfg.SetDiskID(dev, node)
3898 # TODO: Convert to utils.Retry
3901 degr_retries = 10 # in seconds, as we sleep 1 second each time
3905 cumul_degraded = False
3906 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3907 msg = rstats.fail_msg
3909 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3912 raise errors.RemoteError("Can't contact node %s for mirror data,"
3913 " aborting." % node)
3916 rstats = rstats.payload
3918 for i, mstat in enumerate(rstats):
3920 lu.LogWarning("Can't compute data for node %s/%s",
3921 node, disks[i].iv_name)
3924 cumul_degraded = (cumul_degraded or
3925 (mstat.is_degraded and mstat.sync_percent is None))
3926 if mstat.sync_percent is not None:
3928 if mstat.estimated_time is not None:
3929 rem_time = ("%s remaining (estimated)" %
3930 utils.FormatSeconds(mstat.estimated_time))
3931 max_time = mstat.estimated_time
3932 else:
3933 rem_time = "no time estimate"
3934 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3935 (disks[i].iv_name, mstat.sync_percent, rem_time))
3937 # if we're done but degraded, let's do a few small retries, to
3938 # make sure we see a stable and not transient situation; therefore
3939 # we force restart of the loop
3940 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3941 logging.info("Degraded disks found, %d retries left", degr_retries)
3949 time.sleep(min(60, max_time))
3952 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3953 return not cumul_degraded
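# Editor's sketch (standalone illustration, not used by this module): the
# polling pattern above reduced to its core - re-check a degraded status a
# limited number of times before giving up. All names below are hypothetical.
def _ExamplePollUntilClean(get_degraded, retries=10, sleep_fn=time.sleep):
  """Return True once get_degraded() reports False, within "retries" checks."""
  while retries > 0:
    if not get_degraded():
      return True
    retries -= 1
    sleep_fn(1)
  return False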
3956 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3957 """Check that mirrors are not degraded.
3959 The ldisk parameter, if True, will change the test from the
3960 is_degraded attribute (which represents overall non-ok status for
3961 the device(s)) to the ldisk (representing the local storage status).
3964 lu.cfg.SetDiskID(dev, node)
3966 result = True
3968 if on_primary or dev.AssembleOnSecondary():
3969 rstats = lu.rpc.call_blockdev_find(node, dev)
3970 msg = rstats.fail_msg
3972 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3974 elif not rstats.payload:
3975 lu.LogWarning("Can't find disk on node %s", node)
3978 if ldisk:
3979 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
3980 else:
3981 result = result and not rstats.payload.is_degraded
3984 for child in dev.children:
3985 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
3987 return result
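# Editor's note: callers that only care about one node's local storage (for
# example before failing over to it) pass ldisk=True, which compares
# payload.ldisk_status against constants.LDS_OKAY; with the default
# ldisk=False the overall payload.is_degraded flag is checked instead.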
3990 class LUOobCommand(NoHooksLU):
3991 """Logical unit for OOB handling.
3995 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
3997 def ExpandNames(self):
3998 """Gather locks we need.
4001 if self.op.node_names:
4002 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4003 lock_names = self.op.node_names
4004 else:
4005 lock_names = locking.ALL_SET
4007 self.needed_locks = {
4008 locking.LEVEL_NODE: lock_names,
4011 def CheckPrereq(self):
4012 """Check prerequisites.
4015 - the node exists in the configuration
4018 Any errors are signaled by raising errors.OpPrereqError.
4022 self.master_node = self.cfg.GetMasterNode()
4024 assert self.op.power_delay >= 0.0
4026 if self.op.node_names:
4027 if (self.op.command in self._SKIP_MASTER and
4028 self.master_node in self.op.node_names):
4029 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4030 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4032 if master_oob_handler:
4033 additional_text = ("run '%s %s %s' if you want to operate on the"
4034 " master regardless") % (master_oob_handler,
4035                          self.op.command,
4036                          self.master_node)
4037 else:
4038 additional_text = "it does not support out-of-band operations"
4040 raise errors.OpPrereqError(("Operating on the master node %s is not"
4041 " allowed for %s; %s") %
4042 (self.master_node, self.op.command,
4043 additional_text), errors.ECODE_INVAL)
4044 else:
4045 self.op.node_names = self.cfg.GetNodeList()
4046 if self.op.command in self._SKIP_MASTER:
4047 self.op.node_names.remove(self.master_node)
4049 if self.op.command in self._SKIP_MASTER:
4050 assert self.master_node not in self.op.node_names
4052 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4053 if node is None:
4054 raise errors.OpPrereqError("Node %s not found" % node_name,
4055                            errors.ECODE_NOENT)
4056 else:
4057 self.nodes.append(node)
4059 if (not self.op.ignore_status and
4060 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4061 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4062 " not marked offline") % node_name,
4065 def Exec(self, feedback_fn):
4066 """Execute OOB and return result if we expect any.
4069 master_node = self.master_node
4072 for idx, node in enumerate(utils.NiceSort(self.nodes,
4073 key=lambda node: node.name)):
4074 node_entry = [(constants.RS_NORMAL, node.name)]
4075 ret.append(node_entry)
4077 oob_program = _SupportsOob(self.cfg, node)
4079 if not oob_program:
4080 node_entry.append((constants.RS_UNAVAIL, None))
4081 continue
4083 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4084 self.op.command, oob_program, node.name)
4085 result = self.rpc.call_run_oob(master_node, oob_program,
4086 self.op.command, node.name,
4087 self.op.timeout)
4089 if result.fail_msg:
4090 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4091 node.name, result.fail_msg)
4092 node_entry.append((constants.RS_NODATA, None))
4093 else:
4094 try:
4095 self._CheckPayload(result)
4096 except errors.OpExecError, err:
4097 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4098                 node.name, err)
4099 node_entry.append((constants.RS_NODATA, None))
4100 else:
4101 if self.op.command == constants.OOB_HEALTH:
4102 # For health we should log important events
4103 for item, status in result.payload:
4104 if status in [constants.OOB_STATUS_WARNING,
4105 constants.OOB_STATUS_CRITICAL]:
4106 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4107 item, node.name, status)
4109 if self.op.command == constants.OOB_POWER_ON:
4110 node.powered = True
4111 elif self.op.command == constants.OOB_POWER_OFF:
4112 node.powered = False
4113 elif self.op.command == constants.OOB_POWER_STATUS:
4114 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4115 if powered != node.powered:
4116 logging.warning(("Recorded power state (%s) of node '%s' does not"
4117 " match actual power state (%s)"), node.powered,
4118 node.name, powered)
4120 # For configuration changing commands we should update the node
4121 if self.op.command in (constants.OOB_POWER_ON,
4122 constants.OOB_POWER_OFF):
4123 self.cfg.Update(node, feedback_fn)
4125 node_entry.append((constants.RS_NORMAL, result.payload))
4127 if (self.op.command == constants.OOB_POWER_ON and
4128 idx < len(self.nodes) - 1):
4129 time.sleep(self.op.power_delay)
4133 def _CheckPayload(self, result):
4134 """Checks if the payload is valid.
4136 @param result: RPC result
4137 @raises errors.OpExecError: If payload is not valid
4141 if self.op.command == constants.OOB_HEALTH:
4142 if not isinstance(result.payload, list):
4143 errs.append("command 'health' is expected to return a list but got %s" %
4144 type(result.payload))
4146 for item, status in result.payload:
4147 if status not in constants.OOB_STATUSES:
4148 errs.append("health item '%s' has invalid status '%s'" %
4149             (item, status))
4151 if self.op.command == constants.OOB_POWER_STATUS:
4152 if not isinstance(result.payload, dict):
4153 errs.append("power-status is expected to return a dict but got %s" %
4154 type(result.payload))
4156 if self.op.command in [
4157 constants.OOB_POWER_ON,
4158 constants.OOB_POWER_OFF,
4159 constants.OOB_POWER_CYCLE,
4161 if result.payload is not None:
4162 errs.append("%s is expected to not return payload but got '%s'" %
4163 (self.op.command, result.payload))
4165 if errs:
4166 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4167 utils.CommaJoin(errs))
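# Editor's note (illustrative payload shapes enforced above): "health"
# returns a list of (item, status) pairs with statuses drawn from
# constants.OOB_STATUSES, "power-status" returns a dict such as
#   {constants.OOB_POWER_STATUS_POWERED: True}
# and power-on/power-off/power-cycle are expected to return no payload.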
4170 class _OsQuery(_QueryBase):
4171 FIELDS = query.OS_FIELDS
4173 def ExpandNames(self, lu):
4174 # Lock all nodes in shared mode
4175 # Temporary removal of locks, should be reverted later
4176 # TODO: reintroduce locks when they are lighter-weight
4177 lu.needed_locks = {}
4178 #self.share_locks[locking.LEVEL_NODE] = 1
4179 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4181 # The following variables interact with _QueryBase._GetNames
4182 if self.names:
4183 self.wanted = self.names
4184 else:
4185 self.wanted = locking.ALL_SET
4187 self.do_locking = self.use_locking
4189 def DeclareLocks(self, lu, level):
4193 def _DiagnoseByOS(rlist):
4194 """Remaps a per-node return list into a per-os per-node dictionary
4196 @param rlist: a map with node names as keys and OS objects as values
4199 @return: a dictionary with osnames as keys and as value another
4200 map, with nodes as keys and tuples of (path, status, diagnose,
4201 variants, parameters, api_versions) as values, eg::
4203 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4204 (/srv/..., False, "invalid api")],
4205 "node2": [(/srv/..., True, "", [], [])]}
4208 """
4209 all_os = {}
4210 # we build here the list of nodes that didn't fail the RPC (at RPC
4211 # level), so that nodes with a non-responding node daemon don't
4212 # make all OSes invalid
4213 good_nodes = [node_name for node_name in rlist
4214 if not rlist[node_name].fail_msg]
4215 for node_name, nr in rlist.items():
4216 if nr.fail_msg or not nr.payload:
4217 continue
4218 for (name, path, status, diagnose, variants,
4219 params, api_versions) in nr.payload:
4220 if name not in all_os:
4221 # build a list of nodes for this os containing empty lists
4222 # for each node in node_list
4223 all_os[name] = {}
4224 for nname in good_nodes:
4225 all_os[name][nname] = []
4226 # convert params from [name, help] to (name, help)
4227 params = [tuple(v) for v in params]
4228 all_os[name][node_name].append((path, status, diagnose,
4229 variants, params, api_versions))
4231 return all_os
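# Editor's sketch (standalone illustration of the regrouping performed above,
# using plain dictionaries instead of RPC results; all names are hypothetical):
def _ExampleGroupByOs(node_payloads):
  """Turn {node: [(os_name, data), ...]} into {os_name: {node: [data, ...]}}."""
  grouped = {}
  for node_name, entries in node_payloads.items():
    for (os_name, data) in entries:
      grouped.setdefault(os_name, {}).setdefault(node_name, []).append(data)
  return grouped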
4232 def _GetQueryData(self, lu):
4233 """Computes the list of nodes and their attributes.
4236 # Locking is not used
4237 assert not (compat.any(lu.glm.is_owned(level)
4238 for level in locking.LEVELS
4239 if level != locking.LEVEL_CLUSTER) or
4240 self.do_locking or self.use_locking)
4242 valid_nodes = [node.name
4243 for node in lu.cfg.GetAllNodesInfo().values()
4244 if not node.offline and node.vm_capable]
4245 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4246 cluster = lu.cfg.GetClusterInfo()
4250 for (os_name, os_data) in pol.items():
4251 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4252 hidden=(os_name in cluster.hidden_os),
4253 blacklisted=(os_name in cluster.blacklisted_os))
4257 api_versions = set()
4259 for idx, osl in enumerate(os_data.values()):
4260 info.valid = bool(info.valid and osl and osl[0][1])
4264 (node_variants, node_params, node_api) = osl[0][3:6]
4265 if idx == 0:
4266 # First entry
4267 variants.update(node_variants)
4268 parameters.update(node_params)
4269 api_versions.update(node_api)
4270 else:
4271 # Filter out inconsistent values
4272 variants.intersection_update(node_variants)
4273 parameters.intersection_update(node_params)
4274 api_versions.intersection_update(node_api)
4276 info.variants = list(variants)
4277 info.parameters = list(parameters)
4278 info.api_versions = list(api_versions)
4280 data[os_name] = info
4282 # Prepare data in requested order
4283 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4284         if name in data]
4287 class LUOsDiagnose(NoHooksLU):
4288 """Logical unit for OS diagnose/query.
4294 def _BuildFilter(fields, names):
4295 """Builds a filter for querying OSes.
4298 name_filter = qlang.MakeSimpleFilter("name", names)
4300 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4301 # respective field is not requested
4302 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4303 for fname in ["hidden", "blacklisted"]
4304 if fname not in fields]
4305 if "valid" not in fields:
4306 status_filter.append([qlang.OP_TRUE, "valid"])
4308 if status_filter:
4309 status_filter.insert(0, qlang.OP_AND)
4310 else:
4311 status_filter = None
4313 if name_filter and status_filter:
4314 return [qlang.OP_AND, name_filter, status_filter]
4315 elif name_filter:
4316 return name_filter
4317 else:
4318 return status_filter
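# Editor's note (illustrative): for fields=["name"] and names=["debian-8"]
# the method above would build a filter roughly of the form
#   [qlang.OP_AND, <name filter for "debian-8">,
#    [qlang.OP_AND, [qlang.OP_NOT, [qlang.OP_TRUE, "hidden"]],
#                   [qlang.OP_NOT, [qlang.OP_TRUE, "blacklisted"]],
#                   [qlang.OP_TRUE, "valid"]]]
# i.e. hidden, blacklisted and invalid OSes are excluded unless the
# corresponding field was explicitly requested.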
4320 def CheckArguments(self):
4321 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4322 self.op.output_fields, False)
4324 def ExpandNames(self):
4325 self.oq.ExpandNames(self)
4327 def Exec(self, feedback_fn):
4328 return self.oq.OldStyleQuery(self)
4331 class LUNodeRemove(LogicalUnit):
4332 """Logical unit for removing a node.
4335 HPATH = "node-remove"
4336 HTYPE = constants.HTYPE_NODE
4338 def BuildHooksEnv(self):
4341 This doesn't run on the target node in the pre phase as a failed
4342 node would then be impossible to remove.
4346 "OP_TARGET": self.op.node_name,
4347 "NODE_NAME": self.op.node_name,
4350 def BuildHooksNodes(self):
4351 """Build hooks nodes.
4354 all_nodes = self.cfg.GetNodeList()
4355 try:
4356 all_nodes.remove(self.op.node_name)
4357 except ValueError:
4358 logging.warning("Node '%s', which is about to be removed, was not found"
4359 " in the list of all nodes", self.op.node_name)
4360 return (all_nodes, all_nodes)
4362 def CheckPrereq(self):
4363 """Check prerequisites.
4366 - the node exists in the configuration
4367 - it does not have primary or secondary instances
4368 - it's not the master
4370 Any errors are signaled by raising errors.OpPrereqError.
4373 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4374 node = self.cfg.GetNodeInfo(self.op.node_name)
4375 assert node is not None
4377 masternode = self.cfg.GetMasterNode()
4378 if node.name == masternode:
4379 raise errors.OpPrereqError("Node is the master node, failover to another"
4380 " node is required", errors.ECODE_INVAL)
4382 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4383 if node.name in instance.all_nodes:
4384 raise errors.OpPrereqError("Instance %s is still running on the node,"
4385 " please remove first" % instance_name,
4387 self.op.node_name = node.name
4390 def Exec(self, feedback_fn):
4391 """Removes the node from the cluster.
4395 logging.info("Stopping the node daemon and removing configs from node %s",
4398 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4400 # Promote nodes to master candidate as needed
4401 _AdjustCandidatePool(self, exceptions=[node.name])
4402 self.context.RemoveNode(node.name)
4404 # Run post hooks on the node before it's removed
4405 _RunPostHook(self, node.name)
4407 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4408 msg = result.fail_msg
4410 self.LogWarning("Errors encountered on the remote node while leaving"
4411 " the cluster: %s", msg)
4413 # Remove node from our /etc/hosts
4414 if self.cfg.GetClusterInfo().modify_etc_hosts:
4415 master_node = self.cfg.GetMasterNode()
4416 result = self.rpc.call_etc_hosts_modify(master_node,
4417 constants.ETC_HOSTS_REMOVE,
4419 result.Raise("Can't update hosts file with new host data")
4420 _RedistributeAncillaryFiles(self)
4423 class _NodeQuery(_QueryBase):
4424 FIELDS = query.NODE_FIELDS
4426 def ExpandNames(self, lu):
4427 lu.needed_locks = {}
4428 lu.share_locks = _ShareAll()
4430 if self.names:
4431 self.wanted = _GetWantedNodes(lu, self.names)
4432 else:
4433 self.wanted = locking.ALL_SET
4435 self.do_locking = (self.use_locking and
4436 query.NQ_LIVE in self.requested_data)
4439 # If any non-static field is requested we need to lock the nodes
4440 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4442 def DeclareLocks(self, lu, level):
4445 def _GetQueryData(self, lu):
4446 """Computes the list of nodes and their attributes.
4449 all_info = lu.cfg.GetAllNodesInfo()
4451 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4453 # Gather data as requested
4454 if query.NQ_LIVE in self.requested_data:
4455 # filter out non-vm_capable nodes
4456 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4458 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4459 lu.cfg.GetHypervisorType())
4460 live_data = dict((name, nresult.payload)
4461 for (name, nresult) in node_data.items()
4462 if not nresult.fail_msg and nresult.payload)
4466 if query.NQ_INST in self.requested_data:
4467 node_to_primary = dict([(name, set()) for name in nodenames])
4468 node_to_secondary = dict([(name, set()) for name in nodenames])
4470 inst_data = lu.cfg.GetAllInstancesInfo()
4472 for inst in inst_data.values():
4473 if inst.primary_node in node_to_primary:
4474 node_to_primary[inst.primary_node].add(inst.name)
4475 for secnode in inst.secondary_nodes:
4476 if secnode in node_to_secondary:
4477 node_to_secondary[secnode].add(inst.name)
4478 else:
4479 node_to_primary = None
4480 node_to_secondary = None
4482 if query.NQ_OOB in self.requested_data:
4483 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4484 for name, node in all_info.iteritems())
4488 if query.NQ_GROUP in self.requested_data:
4489 groups = lu.cfg.GetAllNodeGroupsInfo()
4493 return query.NodeQueryData([all_info[name] for name in nodenames],
4494 live_data, lu.cfg.GetMasterNode(),
4495 node_to_primary, node_to_secondary, groups,
4496 oob_support, lu.cfg.GetClusterInfo())
4499 class LUNodeQuery(NoHooksLU):
4500 """Logical unit for querying nodes.
4503 # pylint: disable=W0142
4506 def CheckArguments(self):
4507 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4508 self.op.output_fields, self.op.use_locking)
4510 def ExpandNames(self):
4511 self.nq.ExpandNames(self)
4513 def Exec(self, feedback_fn):
4514 return self.nq.OldStyleQuery(self)
4517 class LUNodeQueryvols(NoHooksLU):
4518 """Logical unit for getting volumes on node(s).
4522 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4523 _FIELDS_STATIC = utils.FieldSet("node")
4525 def CheckArguments(self):
4526 _CheckOutputFields(static=self._FIELDS_STATIC,
4527 dynamic=self._FIELDS_DYNAMIC,
4528 selected=self.op.output_fields)
4530 def ExpandNames(self):
4531 self.needed_locks = {}
4532 self.share_locks[locking.LEVEL_NODE] = 1
4533 if not self.op.nodes:
4534 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4536 self.needed_locks[locking.LEVEL_NODE] = \
4537 _GetWantedNodes(self, self.op.nodes)
4539 def Exec(self, feedback_fn):
4540 """Computes the list of nodes and their attributes.
4543 nodenames = self.owned_locks(locking.LEVEL_NODE)
4544 volumes = self.rpc.call_node_volumes(nodenames)
4546 ilist = self.cfg.GetAllInstancesInfo()
4547 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4550 for node in nodenames:
4551 nresult = volumes[node]
4554 msg = nresult.fail_msg
4556 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4559 node_vols = sorted(nresult.payload,
4560 key=operator.itemgetter("dev"))
4562 for vol in node_vols:
4563 node_output = []
4564 for field in self.op.output_fields:
4565 if field == "node":
4566 val = node
4567 elif field == "phys":
4568 val = vol["dev"]
4569 elif field == "vg":
4570 val = vol["vg"]
4571 elif field == "name":
4572 val = vol["name"]
4573 elif field == "size":
4574 val = int(float(vol["size"]))
4575 elif field == "instance":
4576 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4578 raise errors.ParameterError(field)
4579 node_output.append(str(val))
4581 output.append(node_output)
4586 class LUNodeQueryStorage(NoHooksLU):
4587 """Logical unit for getting information on storage units on node(s).
4590 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4593 def CheckArguments(self):
4594 _CheckOutputFields(static=self._FIELDS_STATIC,
4595 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4596 selected=self.op.output_fields)
4598 def ExpandNames(self):
4599 self.needed_locks = {}
4600 self.share_locks[locking.LEVEL_NODE] = 1
4603 self.needed_locks[locking.LEVEL_NODE] = \
4604 _GetWantedNodes(self, self.op.nodes)
4606 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4608 def Exec(self, feedback_fn):
4609 """Computes the list of nodes and their attributes.
4612 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4614 # Always get name to sort by
4615 if constants.SF_NAME in self.op.output_fields:
4616 fields = self.op.output_fields[:]
4618 fields = [constants.SF_NAME] + self.op.output_fields
4620 # Never ask for node or type as it's only known to the LU
4621 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4622 while extra in fields:
4623 fields.remove(extra)
4625 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4626 name_idx = field_idx[constants.SF_NAME]
4628 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4629 data = self.rpc.call_storage_list(self.nodes,
4630 self.op.storage_type, st_args,
4631 self.op.name, fields)
4635 for node in utils.NiceSort(self.nodes):
4636 nresult = data[node]
4640 msg = nresult.fail_msg
4642 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4645 rows = dict([(row[name_idx], row) for row in nresult.payload])
4647 for name in utils.NiceSort(rows.keys()):
4652 for field in self.op.output_fields:
4653 if field == constants.SF_NODE:
4655 elif field == constants.SF_TYPE:
4656 val = self.op.storage_type
4657 elif field in field_idx:
4658 val = row[field_idx[field]]
4660 raise errors.ParameterError(field)
4669 class _InstanceQuery(_QueryBase):
4670 FIELDS = query.INSTANCE_FIELDS
4672 def ExpandNames(self, lu):
4673 lu.needed_locks = {}
4674 lu.share_locks = _ShareAll()
4677 self.wanted = _GetWantedInstances(lu, self.names)
4679 self.wanted = locking.ALL_SET
4681 self.do_locking = (self.use_locking and
4682 query.IQ_LIVE in self.requested_data)
4684 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4685 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4686 lu.needed_locks[locking.LEVEL_NODE] = []
4687 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4689 self.do_grouplocks = (self.do_locking and
4690 query.IQ_NODES in self.requested_data)
4692 def DeclareLocks(self, lu, level):
4694 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4695 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4697 # Lock all groups used by instances optimistically; this requires going
4698 # via the node before it's locked, requiring verification later on
4699 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4701 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4702 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4703 elif level == locking.LEVEL_NODE:
4704 lu._LockInstancesNodes() # pylint: disable=W0212
4707 def _CheckGroupLocks(lu):
4708 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4709 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4711 # Check if node groups for locked instances are still correct
4712 for instance_name in owned_instances:
4713 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4715 def _GetQueryData(self, lu):
4716 """Computes the list of instances and their attributes.
4719 if self.do_grouplocks:
4720 self._CheckGroupLocks(lu)
4722 cluster = lu.cfg.GetClusterInfo()
4723 all_info = lu.cfg.GetAllInstancesInfo()
4725 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4727 instance_list = [all_info[name] for name in instance_names]
4728 nodes = frozenset(itertools.chain(*(inst.all_nodes
4729 for inst in instance_list)))
4730 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4733 wrongnode_inst = set()
4735 # Gather data as requested
4736 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4738 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4740 result = node_data[name]
4742 # offline nodes will be in both lists
4743 assert result.fail_msg
4744 offline_nodes.append(name)
4746 bad_nodes.append(name)
4747 elif result.payload:
4748 for inst in result.payload:
4749 if inst in all_info:
4750 if all_info[inst].primary_node == name:
4751 live_data.update(result.payload)
4753 wrongnode_inst.add(inst)
4755 # orphan instance; we don't list it here as we don't
4756 # handle this case yet in the output of instance listing
4757 logging.warning("Orphan instance '%s' found on node %s",
4759 # else no instance is alive
4763 if query.IQ_DISKUSAGE in self.requested_data:
4764 disk_usage = dict((inst.name,
4765 _ComputeDiskSize(inst.disk_template,
4766 [{constants.IDISK_SIZE: disk.size}
4767 for disk in inst.disks]))
4768 for inst in instance_list)
4772 if query.IQ_CONSOLE in self.requested_data:
4774 for inst in instance_list:
4775 if inst.name in live_data:
4776 # Instance is running
4777 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4779 consinfo[inst.name] = None
4780 assert set(consinfo.keys()) == set(instance_names)
4784 if query.IQ_NODES in self.requested_data:
4785 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4787 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4788 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4789 for uuid in set(map(operator.attrgetter("group"),
4795 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4796 disk_usage, offline_nodes, bad_nodes,
4797 live_data, wrongnode_inst, consinfo,
4801 class LUQuery(NoHooksLU):
4802 """Query for resources/items of a certain kind.
4805 # pylint: disable=W0142
4808 def CheckArguments(self):
4809 qcls = _GetQueryImplementation(self.op.what)
4811 self.impl = qcls(self.op.filter, self.op.fields, self.op.use_locking)
4813 def ExpandNames(self):
4814 self.impl.ExpandNames(self)
4816 def DeclareLocks(self, level):
4817 self.impl.DeclareLocks(self, level)
4819 def Exec(self, feedback_fn):
4820 return self.impl.NewStyleQuery(self)
4823 class LUQueryFields(NoHooksLU):
4824 """Query for resources/items of a certain kind.
4827 # pylint: disable=W0142
4830 def CheckArguments(self):
4831 self.qcls = _GetQueryImplementation(self.op.what)
4833 def ExpandNames(self):
4834 self.needed_locks = {}
4836 def Exec(self, feedback_fn):
4837 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4840 class LUNodeModifyStorage(NoHooksLU):
4841 """Logical unit for modifying a storage volume on a node.
4846 def CheckArguments(self):
4847 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4849 storage_type = self.op.storage_type
4852 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4854 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4855 " modified" % storage_type,
4858 diff = set(self.op.changes.keys()) - modifiable
4860 raise errors.OpPrereqError("The following fields can not be modified for"
4861 " storage units of type '%s': %r" %
4862 (storage_type, list(diff)),
4865 def ExpandNames(self):
4866 self.needed_locks = {
4867 locking.LEVEL_NODE: self.op.node_name,
4870 def Exec(self, feedback_fn):
4871 """Computes the list of nodes and their attributes.
4874 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4875 result = self.rpc.call_storage_modify(self.op.node_name,
4876 self.op.storage_type, st_args,
4877 self.op.name, self.op.changes)
4878 result.Raise("Failed to modify storage unit '%s' on %s" %
4879 (self.op.name, self.op.node_name))
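# Editor's note (hypothetical example): for an LVM physical volume this LU
# would typically be called with storage_type=constants.ST_LVM_PV and
# changes={constants.SF_ALLOCATABLE: False}; CheckArguments rejects any key
# that is not listed for that type in constants.MODIFIABLE_STORAGE_FIELDS.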
4882 class LUNodeAdd(LogicalUnit):
4883 """Logical unit for adding node to the cluster.
4887 HTYPE = constants.HTYPE_NODE
4888 _NFLAGS = ["master_capable", "vm_capable"]
4890 def CheckArguments(self):
4891 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4892 # validate/normalize the node name
4893 self.hostname = netutils.GetHostname(name=self.op.node_name,
4894 family=self.primary_ip_family)
4895 self.op.node_name = self.hostname.name
4897 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4898 raise errors.OpPrereqError("Cannot readd the master node",
4901 if self.op.readd and self.op.group:
4902 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4903 " being readded", errors.ECODE_INVAL)
4905 def BuildHooksEnv(self):
4908 This will run on all nodes before, and on all nodes + the new node after.
4912 "OP_TARGET": self.op.node_name,
4913 "NODE_NAME": self.op.node_name,
4914 "NODE_PIP": self.op.primary_ip,
4915 "NODE_SIP": self.op.secondary_ip,
4916 "MASTER_CAPABLE": str(self.op.master_capable),
4917 "VM_CAPABLE": str(self.op.vm_capable),
4920 def BuildHooksNodes(self):
4921 """Build hooks nodes.
4924 # Exclude added node
4925 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4926 post_nodes = pre_nodes + [self.op.node_name, ]
4928 return (pre_nodes, post_nodes)
4930 def CheckPrereq(self):
4931 """Check prerequisites.
4934 - the new node is not already in the config
4936 - its parameters (single/dual homed) matches the cluster
4938 Any errors are signaled by raising errors.OpPrereqError.
4942 hostname = self.hostname
4943 node = hostname.name
4944 primary_ip = self.op.primary_ip = hostname.ip
4945 if self.op.secondary_ip is None:
4946 if self.primary_ip_family == netutils.IP6Address.family:
4947 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
4948 " IPv4 address must be given as secondary",
4949 errors.ECODE_INVAL)
4950 self.op.secondary_ip = primary_ip
4952 secondary_ip = self.op.secondary_ip
4953 if not netutils.IP4Address.IsValid(secondary_ip):
4954 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4955 " address" % secondary_ip, errors.ECODE_INVAL)
4957 node_list = cfg.GetNodeList()
4958 if not self.op.readd and node in node_list:
4959 raise errors.OpPrereqError("Node %s is already in the configuration" %
4960 node, errors.ECODE_EXISTS)
4961 elif self.op.readd and node not in node_list:
4962 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4965 self.changed_primary_ip = False
4967 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
4968 if self.op.readd and node == existing_node_name:
4969 if existing_node.secondary_ip != secondary_ip:
4970 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4971 " address configuration as before",
4973 if existing_node.primary_ip != primary_ip:
4974 self.changed_primary_ip = True
4978 if (existing_node.primary_ip == primary_ip or
4979 existing_node.secondary_ip == primary_ip or
4980 existing_node.primary_ip == secondary_ip or
4981 existing_node.secondary_ip == secondary_ip):
4982 raise errors.OpPrereqError("New node ip address(es) conflict with"
4983 " existing node %s" % existing_node.name,
4984 errors.ECODE_NOTUNIQUE)
4986 # After this 'if' block, None is no longer a valid value for the
4987 # _capable op attributes
4989 old_node = self.cfg.GetNodeInfo(node)
4990 assert old_node is not None, "Can't retrieve locked node %s" % node
4991 for attr in self._NFLAGS:
4992 if getattr(self.op, attr) is None:
4993 setattr(self.op, attr, getattr(old_node, attr))
4995 for attr in self._NFLAGS:
4996 if getattr(self.op, attr) is None:
4997 setattr(self.op, attr, True)
4999 if self.op.readd and not self.op.vm_capable:
5000 pri, sec = cfg.GetNodeInstances(node)
5002 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5003 " flag set to false, but it already holds"
5004 " instances" % node,
5007 # check that the type of the node (single versus dual homed) is the
5008 # same as for the master
5009 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5010 master_singlehomed = myself.secondary_ip == myself.primary_ip
5011 newbie_singlehomed = secondary_ip == primary_ip
5012 if master_singlehomed != newbie_singlehomed:
5013 if master_singlehomed:
5014 raise errors.OpPrereqError("The master has no secondary ip but the"
5015 " new node has one",
5018 raise errors.OpPrereqError("The master has a secondary ip but the"
5019 " new node doesn't have one",
5022 # checks reachability
5023 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5024 raise errors.OpPrereqError("Node not reachable by ping",
5025 errors.ECODE_ENVIRON)
5027 if not newbie_singlehomed:
5028 # check reachability from my secondary ip to newbie's secondary ip
5029 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5030 source=myself.secondary_ip):
5031 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5032 " based ping to node daemon port",
5033 errors.ECODE_ENVIRON)
5040 if self.op.master_capable:
5041 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5043 self.master_candidate = False
5045 if self.op.readd:
5046 self.new_node = old_node
5047 else:
5048 node_group = cfg.LookupNodeGroup(self.op.group)
5049 self.new_node = objects.Node(name=node,
5050 primary_ip=primary_ip,
5051 secondary_ip=secondary_ip,
5052 master_candidate=self.master_candidate,
5053 offline=False, drained=False,
5054 group=node_group)
5056 if self.op.ndparams:
5057 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5059 def Exec(self, feedback_fn):
5060 """Adds the new node to the cluster.
5063 new_node = self.new_node
5064 node = new_node.name
5066 # We are adding a new node, so we assume it's powered
5067 new_node.powered = True
5069 # for re-adds, reset the offline/drained/master-candidate flags;
5070 # we need to reset here, otherwise offline would prevent RPC calls
5071 # later in the procedure; this also means that if the re-add
5072 # fails, we are left with a non-offlined, broken node
5074 new_node.drained = new_node.offline = False # pylint: disable=W0201
5075 self.LogInfo("Readding a node, the offline/drained flags were reset")
5076 # if we demote the node, we do cleanup later in the procedure
5077 new_node.master_candidate = self.master_candidate
5078 if self.changed_primary_ip:
5079 new_node.primary_ip = self.op.primary_ip
5081 # copy the master/vm_capable flags
5082 for attr in self._NFLAGS:
5083 setattr(new_node, attr, getattr(self.op, attr))
5085 # notify the user about any possible mc promotion
5086 if new_node.master_candidate:
5087 self.LogInfo("Node will be a master candidate")
5089 if self.op.ndparams:
5090 new_node.ndparams = self.op.ndparams
5092 new_node.ndparams = {}
5094 # check connectivity
5095 result = self.rpc.call_version([node])[node]
5096 result.Raise("Can't get version information from node %s" % node)
5097 if constants.PROTOCOL_VERSION == result.payload:
5098 logging.info("Communication to node %s fine, sw version %s match",
5099 node, result.payload)
5101 raise errors.OpExecError("Version mismatch master version %s,"
5102 " node version %s" %
5103 (constants.PROTOCOL_VERSION, result.payload))
5105 # Add node to our /etc/hosts, and add key to known_hosts
5106 if self.cfg.GetClusterInfo().modify_etc_hosts:
5107 master_node = self.cfg.GetMasterNode()
5108 result = self.rpc.call_etc_hosts_modify(master_node,
5109 constants.ETC_HOSTS_ADD,
5110 self.hostname.name,
5111 self.hostname.ip)
5112 result.Raise("Can't update hosts file with new host data")
5114 if new_node.secondary_ip != new_node.primary_ip:
5115 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5118 node_verify_list = [self.cfg.GetMasterNode()]
5119 node_verify_param = {
5120 constants.NV_NODELIST: ([node], {}),
5121 # TODO: do a node-net-test as well?
5124 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5125 self.cfg.GetClusterName())
5126 for verifier in node_verify_list:
5127 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5128 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5130 for failed in nl_payload:
5131 feedback_fn("ssh/hostname verification failed"
5132 " (checking from %s): %s" %
5133 (verifier, nl_payload[failed]))
5134 raise errors.OpExecError("ssh/hostname verification failed")
5136 if self.op.readd:
5137 _RedistributeAncillaryFiles(self)
5138 self.context.ReaddNode(new_node)
5139 # make sure we redistribute the config
5140 self.cfg.Update(new_node, feedback_fn)
5141 # and make sure the new node will not have old files around
5142 if not new_node.master_candidate:
5143 result = self.rpc.call_node_demote_from_mc(new_node.name)
5144 msg = result.fail_msg
5145 if msg:
5146 self.LogWarning("Node failed to demote itself from master"
5147 " candidate status: %s" % msg)
5148 else:
5149 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5150 additional_vm=self.op.vm_capable)
5151 self.context.AddNode(new_node, self.proc.GetECId())
5154 class LUNodeSetParams(LogicalUnit):
5155 """Modifies the parameters of a node.
5157 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5158 to the node role (as _ROLE_*)
5159 @cvar _R2F: a dictionary from node role to tuples of flags
5160 @cvar _FLAGS: a list of attribute names corresponding to the flags
5163 HPATH = "node-modify"
5164 HTYPE = constants.HTYPE_NODE
5166 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5167 _F2R = {
5168 (True, False, False): _ROLE_CANDIDATE,
5169 (False, True, False): _ROLE_DRAINED,
5170 (False, False, True): _ROLE_OFFLINE,
5171 (False, False, False): _ROLE_REGULAR,
5172 }
5173 _R2F = dict((v, k) for k, v in _F2R.items())
5174 _FLAGS = ["master_candidate", "drained", "offline"]
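# Editor's note: the flag tuples above are ordered (master_candidate,
# drained, offline), so for example _F2R[(False, False, True)] is
# _ROLE_OFFLINE and _R2F[_ROLE_REGULAR] is (False, False, False).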
5176 def CheckArguments(self):
5177 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5178 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5179 self.op.master_capable, self.op.vm_capable,
5180 self.op.secondary_ip, self.op.ndparams]
5181 if all_mods.count(None) == len(all_mods):
5182 raise errors.OpPrereqError("Please pass at least one modification",
5184 if all_mods.count(True) > 1:
5185 raise errors.OpPrereqError("Can't set the node into more than one"
5186 " state at the same time",
5189 # Boolean value that tells us whether we might be demoting from MC
5190 self.might_demote = (self.op.master_candidate == False or
5191 self.op.offline == True or
5192 self.op.drained == True or
5193 self.op.master_capable == False)
5195 if self.op.secondary_ip:
5196 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5197 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5198 " address" % self.op.secondary_ip,
5201 self.lock_all = self.op.auto_promote and self.might_demote
5202 self.lock_instances = self.op.secondary_ip is not None
5204 def ExpandNames(self):
5205 if self.lock_all:
5206 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5207 else:
5208 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5210 if self.lock_instances:
5211 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
5213 def DeclareLocks(self, level):
5214 # If we have locked all instances, before waiting to lock nodes, release
5215 # all the ones living on nodes unrelated to the current operation.
5216 if level == locking.LEVEL_NODE and self.lock_instances:
5217 self.affected_instances = []
5218 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
5219 instances_keep = []
5221 # Build list of instances to release
5222 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
5223 for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
5224 if (instance.disk_template in constants.DTS_INT_MIRROR and
5225 self.op.node_name in instance.all_nodes):
5226 instances_keep.append(instance_name)
5227 self.affected_instances.append(instance)
5229 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
5231 assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
5232 set(instances_keep))
5234 def BuildHooksEnv(self):
5237 This runs on the master node.
5241 "OP_TARGET": self.op.node_name,
5242 "MASTER_CANDIDATE": str(self.op.master_candidate),
5243 "OFFLINE": str(self.op.offline),
5244 "DRAINED": str(self.op.drained),
5245 "MASTER_CAPABLE": str(self.op.master_capable),
5246 "VM_CAPABLE": str(self.op.vm_capable),
5249 def BuildHooksNodes(self):
5250 """Build hooks nodes.
5253 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5256 def CheckPrereq(self):
5257 """Check prerequisites.
5259 This only checks the instance list against the existing names.
5262 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5264 if (self.op.master_candidate is not None or
5265 self.op.drained is not None or
5266 self.op.offline is not None):
5267 # we can't change the master's node flags
5268 if self.op.node_name == self.cfg.GetMasterNode():
5269 raise errors.OpPrereqError("The master role can be changed"
5270 " only via master-failover",
5273 if self.op.master_candidate and not node.master_capable:
5274 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5275 " it a master candidate" % node.name,
5278 if self.op.vm_capable == False:
5279 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5281 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5282 " the vm_capable flag" % node.name,
5285 if node.master_candidate and self.might_demote and not self.lock_all:
5286 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5287 # check if after removing the current node, we're missing master
5289 (mc_remaining, mc_should, _) = \
5290 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5291 if mc_remaining < mc_should:
5292 raise errors.OpPrereqError("Not enough master candidates, please"
5293 " pass auto promote option to allow"
5294 " promotion", errors.ECODE_STATE)
5296 self.old_flags = old_flags = (node.master_candidate,
5297 node.drained, node.offline)
5298 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5299 self.old_role = old_role = self._F2R[old_flags]
5301 # Check for ineffective changes
5302 for attr in self._FLAGS:
5303 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5304 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5305 setattr(self.op, attr, None)
5307 # Past this point, any flag change to False means a transition
5308 # away from the respective state, as only real changes are kept
5310 # TODO: We might query the real power state if it supports OOB
5311 if _SupportsOob(self.cfg, node):
5312 if self.op.offline is False and not (node.powered or
5313 self.op.powered == True):
5314 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5315 " offline status can be reset") %
5317 elif self.op.powered is not None:
5318 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5319 " as it does not support out-of-band"
5320 " handling") % self.op.node_name)
5322 # If we're being deofflined/drained, we'll MC ourself if needed
5323 if (self.op.drained == False or self.op.offline == False or
5324 (self.op.master_capable and not node.master_capable)):
5325 if _DecideSelfPromotion(self):
5326 self.op.master_candidate = True
5327 self.LogInfo("Auto-promoting node to master candidate")
5329 # If we're no longer master capable, we'll demote ourselves from MC
5330 if self.op.master_capable == False and node.master_candidate:
5331 self.LogInfo("Demoting from master candidate")
5332 self.op.master_candidate = False
5335 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5336 if self.op.master_candidate:
5337 new_role = self._ROLE_CANDIDATE
5338 elif self.op.drained:
5339 new_role = self._ROLE_DRAINED
5340 elif self.op.offline:
5341 new_role = self._ROLE_OFFLINE
5342 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5343 # False is still in new flags, which means we're un-setting (the
5345 new_role = self._ROLE_REGULAR
5346 else: # no new flags, nothing, keep old role
5349 self.new_role = new_role
5351 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5352 # Trying to transition out of offline status
5353 result = self.rpc.call_version([node.name])[node.name]
5354 if result.fail_msg:
5355 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5356 " to report its version: %s" %
5357 (node.name, result.fail_msg),
5358 errors.ECODE_STATE)
5359 else:
5360 self.LogWarning("Transitioning node from offline to online state"
5361 " without using re-add. Please make sure the node"
5362 " is healthy!")
5364 if self.op.secondary_ip:
5365 # Ok even without locking, because this can't be changed by any LU
5366 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5367 master_singlehomed = master.secondary_ip == master.primary_ip
5368 if master_singlehomed and self.op.secondary_ip:
5369 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5370 " homed cluster", errors.ECODE_INVAL)
5373 if self.affected_instances:
5374 raise errors.OpPrereqError("Cannot change secondary ip: offline"
5375 " node has instances (%s) configured"
5376 " to use it" % self.affected_instances)
5378 # On online nodes, check that no instances are running, and that
5379 # the node has the new ip and we can reach it.
5380 for instance in self.affected_instances:
5381 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5383 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5384 if master.name != node.name:
5385 # check reachability from master secondary ip to new secondary ip
5386 if not netutils.TcpPing(self.op.secondary_ip,
5387 constants.DEFAULT_NODED_PORT,
5388 source=master.secondary_ip):
5389 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5390 " based ping to node daemon port",
5391 errors.ECODE_ENVIRON)
5393 if self.op.ndparams:
5394 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5395 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5396 self.new_ndparams = new_ndparams
5398 def Exec(self, feedback_fn):
5402 node = self.node
5403 old_role = self.old_role
5404 new_role = self.new_role
5406 result = []
5408 if self.op.ndparams:
5409 node.ndparams = self.new_ndparams
5411 if self.op.powered is not None:
5412 node.powered = self.op.powered
5414 for attr in ["master_capable", "vm_capable"]:
5415 val = getattr(self.op, attr)
5417 setattr(node, attr, val)
5418 result.append((attr, str(val)))
5420 if new_role != old_role:
5421 # Tell the node to demote itself, if no longer MC and not offline
5422 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5423 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5425 self.LogWarning("Node failed to demote itself: %s", msg)
5427 new_flags = self._R2F[new_role]
5428 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5430 result.append((desc, str(nf)))
5431 (node.master_candidate, node.drained, node.offline) = new_flags
5433 # we locked all nodes, we adjust the CP before updating this node
5435 _AdjustCandidatePool(self, [node.name])
5437 if self.op.secondary_ip:
5438 node.secondary_ip = self.op.secondary_ip
5439 result.append(("secondary_ip", self.op.secondary_ip))
5441 # this will trigger configuration file update, if needed
5442 self.cfg.Update(node, feedback_fn)
5444 # this will trigger job queue propagation or cleanup if the mc
5446 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5447 self.context.ReaddNode(node)
5449 return result
5452 class LUNodePowercycle(NoHooksLU):
5453 """Powercycles a node.
5458 def CheckArguments(self):
5459 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5460 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5461 raise errors.OpPrereqError("The node is the master and the force"
5462 " parameter was not set",
5465 def ExpandNames(self):
5466 """Locking for PowercycleNode.
5468 This is a last-resort option and shouldn't block on other
5469 jobs. Therefore, we grab no locks.
5472 self.needed_locks = {}
5474 def Exec(self, feedback_fn):
5478 result = self.rpc.call_node_powercycle(self.op.node_name,
5479 self.cfg.GetHypervisorType())
5480 result.Raise("Failed to schedule the reboot")
5481 return result.payload
5484 class LUClusterQuery(NoHooksLU):
5485 """Query cluster configuration.
5490 def ExpandNames(self):
5491 self.needed_locks = {}
5493 def Exec(self, feedback_fn):
5494 """Return cluster config.
5497 cluster = self.cfg.GetClusterInfo()
5499 os_hvp = {}
5500 # Filter just for enabled hypervisors
5501 for os_name, hv_dict in cluster.os_hvp.items():
5502 os_hvp[os_name] = {}
5503 for hv_name, hv_params in hv_dict.items():
5504 if hv_name in cluster.enabled_hypervisors:
5505 os_hvp[os_name][hv_name] = hv_params
5507 # Convert ip_family to ip_version
5508 primary_ip_version = constants.IP4_VERSION
5509 if cluster.primary_ip_family == netutils.IP6Address.family:
5510 primary_ip_version = constants.IP6_VERSION
5512 result = {
5513 "software_version": constants.RELEASE_VERSION,
5514 "protocol_version": constants.PROTOCOL_VERSION,
5515 "config_version": constants.CONFIG_VERSION,
5516 "os_api_version": max(constants.OS_API_VERSIONS),
5517 "export_version": constants.EXPORT_VERSION,
5518 "architecture": (platform.architecture()[0], platform.machine()),
5519 "name": cluster.cluster_name,
5520 "master": cluster.master_node,
5521 "default_hypervisor": cluster.enabled_hypervisors[0],
5522 "enabled_hypervisors": cluster.enabled_hypervisors,
5523 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5524 for hypervisor_name in cluster.enabled_hypervisors]),
5525 "os_hvp": os_hvp,
5526 "beparams": cluster.beparams,
5527 "osparams": cluster.osparams,
5528 "nicparams": cluster.nicparams,
5529 "ndparams": cluster.ndparams,
5530 "candidate_pool_size": cluster.candidate_pool_size,
5531 "master_netdev": cluster.master_netdev,
5532 "volume_group_name": cluster.volume_group_name,
5533 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5534 "file_storage_dir": cluster.file_storage_dir,
5535 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5536 "maintain_node_health": cluster.maintain_node_health,
5537 "ctime": cluster.ctime,
5538 "mtime": cluster.mtime,
5539 "uuid": cluster.uuid,
5540 "tags": list(cluster.GetTags()),
5541 "uid_pool": cluster.uid_pool,
5542 "default_iallocator": cluster.default_iallocator,
5543 "reserved_lvs": cluster.reserved_lvs,
5544 "primary_ip_version": primary_ip_version,
5545 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5546 "hidden_os": cluster.hidden_os,
5547 "blacklisted_os": cluster.blacklisted_os,
5548 }
5550 return result
5553 class LUClusterConfigQuery(NoHooksLU):
5554 """Return configuration values.
5558 _FIELDS_DYNAMIC = utils.FieldSet()
5559 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5560 "watcher_pause", "volume_group_name")
5562 def CheckArguments(self):
5563 _CheckOutputFields(static=self._FIELDS_STATIC,
5564 dynamic=self._FIELDS_DYNAMIC,
5565 selected=self.op.output_fields)
5567 def ExpandNames(self):
5568 self.needed_locks = {}
5570 def Exec(self, feedback_fn):
5571 """Dump a representation of the cluster config to the standard output.
5573 """
5574 values = []
5575 for field in self.op.output_fields:
5576 if field == "cluster_name":
5577 entry = self.cfg.GetClusterName()
5578 elif field == "master_node":
5579 entry = self.cfg.GetMasterNode()
5580 elif field == "drain_flag":
5581 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5582 elif field == "watcher_pause":
5583 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5584 elif field == "volume_group_name":
5585 entry = self.cfg.GetVGName()
5587 raise errors.ParameterError(field)
5588 values.append(entry)
5589 return values
5592 class LUInstanceActivateDisks(NoHooksLU):
5593 """Bring up an instance's disks.
5598 def ExpandNames(self):
5599 self._ExpandAndLockInstance()
5600 self.needed_locks[locking.LEVEL_NODE] = []
5601 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5603 def DeclareLocks(self, level):
5604 if level == locking.LEVEL_NODE:
5605 self._LockInstancesNodes()
5607 def CheckPrereq(self):
5608 """Check prerequisites.
5610 This checks that the instance is in the cluster.
5613 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5614 assert self.instance is not None, \
5615 "Cannot retrieve locked instance %s" % self.op.instance_name
5616 _CheckNodeOnline(self, self.instance.primary_node)
5618 def Exec(self, feedback_fn):
5619 """Activate the disks.
5622 disks_ok, disks_info = \
5623 _AssembleInstanceDisks(self, self.instance,
5624 ignore_size=self.op.ignore_size)
5625 if not disks_ok:
5626 raise errors.OpExecError("Cannot activate block devices")
5628 return disks_info
5631 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5632                            ignore_size=False):
5633 """Prepare the block devices for an instance.
5635 This sets up the block devices on all nodes.
5637 @type lu: L{LogicalUnit}
5638 @param lu: the logical unit on whose behalf we execute
5639 @type instance: L{objects.Instance}
5640 @param instance: the instance for whose disks we assemble
5641 @type disks: list of L{objects.Disk} or None
5642 @param disks: which disks to assemble (or all, if None)
5643 @type ignore_secondaries: boolean
5644 @param ignore_secondaries: if true, errors on secondary nodes
5645 won't result in an error return from the function
5646 @type ignore_size: boolean
5647 @param ignore_size: if true, the current known size of the disk
5648 will not be used during the disk activation, useful for cases
5649 when the size is wrong
5650 @return: False if the operation failed, otherwise a list of
5651 (host, instance_visible_name, node_visible_name)
5652 with the mapping from node devices to instance devices
5654 """
5655 device_info = []
5656 disks_ok = True
5657 iname = instance.name
5658 disks = _ExpandCheckDisks(instance, disks)
5660 # With the two-pass mechanism we try to reduce the window of
5661 # opportunity for the race condition of switching DRBD to primary
5662 # before handshaking occurred, but we do not eliminate it
5664 # The proper fix would be to wait (with some limits) until the
5665 # connection has been made and drbd transitions from WFConnection
5666 # into any other network-connected state (Connected, SyncTarget,
5669 # 1st pass, assemble on all nodes in secondary mode
5670 for idx, inst_disk in enumerate(disks):
5671 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5673 node_disk = node_disk.Copy()
5674 node_disk.UnsetSize()
5675 lu.cfg.SetDiskID(node_disk, node)
5676 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5677 msg = result.fail_msg
5679 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5680 " (is_primary=False, pass=1): %s",
5681 inst_disk.iv_name, node, msg)
5682 if not ignore_secondaries:
5685 # FIXME: race condition on drbd migration to primary
5687 # 2nd pass, do only the primary node
5688 for idx, inst_disk in enumerate(disks):
5691 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5692 if node != instance.primary_node:
5695 node_disk = node_disk.Copy()
5696 node_disk.UnsetSize()
5697 lu.cfg.SetDiskID(node_disk, node)
5698 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5699 msg = result.fail_msg
5701 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5702 " (is_primary=True, pass=2): %s",
5703 inst_disk.iv_name, node, msg)
5706 dev_path = result.payload
5708 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5710 # leave the disks configured for the primary node
5711 # this is a workaround that would be fixed better by
5712 # improving the logical/physical id handling
5714 lu.cfg.SetDiskID(disk, instance.primary_node)
5716 return disks_ok, device_info
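# Illustrative usage sketch (not part of the original code): callers treat the
# return value as a (success, mapping) pair, e.g.
#
#   disks_ok, device_info = _AssembleInstanceDisks(self, instance)
#   if not disks_ok:
#     raise errors.OpExecError("Cannot activate block devices")
#   for node, iv_name, dev_path in device_info:
#     feedback_fn("%s: %s visible as %s" % (node, iv_name, dev_path))
#
# LUInstanceActivateDisks above follows this pattern and hands device_info
# back to the opcode caller.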
5719 def _StartInstanceDisks(lu, instance, force):
5720 """Start the disks of an instance.
5723 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5724 ignore_secondaries=force)
5726 _ShutdownInstanceDisks(lu, instance)
5727 if force is not None and not force:
5728 lu.proc.LogWarning("", hint="If the message above refers to a"
5730 " you can retry the operation using '--force'.")
5731 raise errors.OpExecError("Disk consistency error")
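# Note: callers in this module pass force=None for purely internal disk
# activations (e.g. reinstall and rename); assembly failures then still abort
# the operation, but the "--force" hint above is suppressed because there is
# no user-visible force flag to retry with.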
5734 class LUInstanceDeactivateDisks(NoHooksLU):
5735 """Shut down an instance's disks.
5740 def ExpandNames(self):
5741 self._ExpandAndLockInstance()
5742 self.needed_locks[locking.LEVEL_NODE] = []
5743 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5745 def DeclareLocks(self, level):
5746 if level == locking.LEVEL_NODE:
5747 self._LockInstancesNodes()
5749 def CheckPrereq(self):
5750 """Check prerequisites.
5752 This checks that the instance is in the cluster.
5755 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5756 assert self.instance is not None, \
5757 "Cannot retrieve locked instance %s" % self.op.instance_name
5759 def Exec(self, feedback_fn):
5760 """Deactivate the disks.
5763 instance = self.instance
5765 _ShutdownInstanceDisks(self, instance)
5767 _SafeShutdownInstanceDisks(self, instance)
5770 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5771 """Shut down block devices of an instance.
5773 This function checks that the instance is not running before calling
5774 _ShutdownInstanceDisks.
5777 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5778 _ShutdownInstanceDisks(lu, instance, disks=disks)
5781 def _ExpandCheckDisks(instance, disks):
5782 """Return the instance disks selected by the disks list
5784 @type disks: list of L{objects.Disk} or None
5785 @param disks: selected disks
5786 @rtype: list of L{objects.Disk}
5787 @return: selected instance disks to act on
5791 return instance.disks
5793 if not set(disks).issubset(instance.disks):
5794 raise errors.ProgrammerError("Can only act on disks belonging to the"
5799 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5800 """Shut down block devices of an instance.
5802 This does the shutdown on all nodes of the instance.
5804 If ignore_primary is false, errors on the primary node are
5809 disks = _ExpandCheckDisks(instance, disks)
5812 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5813 lu.cfg.SetDiskID(top_disk, node)
5814 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5815 msg = result.fail_msg
5817 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5818 disk.iv_name, node, msg)
5819 if ((node == instance.primary_node and not ignore_primary) or
5820 (node != instance.primary_node and not result.offline)):
5825 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5826 """Checks if a node has enough free memory.
5828 This function checks if a given node has the needed amount of free
5829 memory. In case the node has less memory or we cannot get the
5830 information from the node, this function raises an OpPrereqError
5833 @type lu: C{LogicalUnit}
5834 @param lu: a logical unit from which we get configuration data
5836 @param node: the node to check
5837 @type reason: C{str}
5838 @param reason: string to use in the error message
5839 @type requested: C{int}
5840 @param requested: the amount of memory in MiB to check for
5841 @type hypervisor_name: C{str}
5842 @param hypervisor_name: the hypervisor to ask for memory stats
5843 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5844 we cannot check the node
5847 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5848 nodeinfo[node].Raise("Can't get data from node %s" % node,
5849 prereq=True, ecode=errors.ECODE_ENVIRON)
5850 free_mem = nodeinfo[node].payload.get("memory_free", None)
5851 if not isinstance(free_mem, int):
5852 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5853 " was '%s'" % (node, free_mem),
5854 errors.ECODE_ENVIRON)
5855 if requested > free_mem:
5856 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5857 " needed %s MiB, available %s MiB" %
5858 (node, reason, requested, free_mem),
5862 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5863 """Checks if nodes have enough free disk space in all the VGs.
5865 This function checks if all given nodes have the needed amount of
5866 free disk. In case any node has less disk or we cannot get the
5867 information from the node, this function raises an OpPrereqError
5870 @type lu: C{LogicalUnit}
5871 @param lu: a logical unit from which we get configuration data
5872 @type nodenames: C{list}
5873 @param nodenames: the list of node names to check
5874 @type req_sizes: C{dict}
5875 @param req_sizes: the hash of vg and corresponding amount of disk in
5877 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5878 or we cannot check the node
5881 for vg, req_size in req_sizes.items():
5882 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
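# Illustrative example (the VG name is made up): req_sizes maps each volume
# group to the space that must be free on every node, e.g.
#
#   _CheckNodesFreeDiskPerVG(self, nodenames, {"xenvg": 10 * 1024})
#
# which performs one 10 GiB _CheckNodesFreeDiskOnVG check for that group.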
5885 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5886 """Checks if nodes have enough free disk space in the specified VG.
5888 This function checks if all given nodes have the needed amount of
5889 free disk. In case any node has less disk or we cannot get the
5890 information from the node, this function raises an OpPrereqError
5893 @type lu: C{LogicalUnit}
5894 @param lu: a logical unit from which we get configuration data
5895 @type nodenames: C{list}
5896 @param nodenames: the list of node names to check
5898 @param vg: the volume group to check
5899 @type requested: C{int}
5900 @param requested: the amount of disk in MiB to check for
5901 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5902 or we cannot check the node
5905 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5906 for node in nodenames:
5907 info = nodeinfo[node]
5908 info.Raise("Cannot get current information from node %s" % node,
5909 prereq=True, ecode=errors.ECODE_ENVIRON)
5910 vg_free = info.payload.get("vg_free", None)
5911 if not isinstance(vg_free, int):
5912 raise errors.OpPrereqError("Can't compute free disk space on node"
5913 " %s for vg %s, result was '%s'" %
5914 (node, vg, vg_free), errors.ECODE_ENVIRON)
5915 if requested > vg_free:
5916 raise errors.OpPrereqError("Not enough disk space on target node %s"
5917 " vg %s: required %d MiB, available %d MiB" %
5918 (node, vg, requested, vg_free),
5922 class LUInstanceStartup(LogicalUnit):
5923 """Starts an instance.
5926 HPATH = "instance-start"
5927 HTYPE = constants.HTYPE_INSTANCE
5930 def CheckArguments(self):
5932 if self.op.beparams:
5933 # fill the beparams dict
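# ForceDictType validates (and where possible coerces) the user-supplied
# backend parameter values against the declared parameter types before they
# are used for this start request.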
5934 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5936 def ExpandNames(self):
5937 self._ExpandAndLockInstance()
5939 def BuildHooksEnv(self):
5942 This runs on master, primary and secondary nodes of the instance.
5946 "FORCE": self.op.force,
5949 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5953 def BuildHooksNodes(self):
5954 """Build hooks nodes.
5957 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5960 def CheckPrereq(self):
5961 """Check prerequisites.
5963 This checks that the instance is in the cluster.
5966 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5967 assert self.instance is not None, \
5968 "Cannot retrieve locked instance %s" % self.op.instance_name
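# When one-off hypervisor parameters are supplied with the start request,
# they are merged on top of the cluster/instance defaults below and the
# resulting set is validated both syntactically and on the instance's nodes.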
5971 if self.op.hvparams:
5972 # check hypervisor parameter syntax (locally)
5973 cluster = self.cfg.GetClusterInfo()
5974 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
5975 filled_hvp = cluster.FillHV(instance)
5976 filled_hvp.update(self.op.hvparams)
5977 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
5978 hv_type.CheckParameterSyntax(filled_hvp)
5979 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
5981 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
5983 if self.primary_offline and self.op.ignore_offline_nodes:
5984 self.proc.LogWarning("Ignoring offline primary node")
5986 if self.op.hvparams or self.op.beparams:
5987 self.proc.LogWarning("Overridden parameters are ignored")
5989 _CheckNodeOnline(self, instance.primary_node)
5991 bep = self.cfg.GetClusterInfo().FillBE(instance)
5993 # check bridges existence
5994 _CheckInstanceBridgesExist(self, instance)
5996 remote_info = self.rpc.call_instance_info(instance.primary_node,
5998 instance.hypervisor)
5999 remote_info.Raise("Error checking node %s" % instance.primary_node,
6000 prereq=True, ecode=errors.ECODE_ENVIRON)
6001 if not remote_info.payload: # not running already
6002 _CheckNodeFreeMemory(self, instance.primary_node,
6003 "starting instance %s" % instance.name,
6004 bep[constants.BE_MEMORY], instance.hypervisor)
6006 def Exec(self, feedback_fn):
6007 """Start the instance.
6010 instance = self.instance
6011 force = self.op.force
6013 if not self.op.no_remember:
6014 self.cfg.MarkInstanceUp(instance.name)
6016 if self.primary_offline:
6017 assert self.op.ignore_offline_nodes
6018 self.proc.LogInfo("Primary node offline, marked instance as started")
6020 node_current = instance.primary_node
6022 _StartInstanceDisks(self, instance, force)
6024 result = self.rpc.call_instance_start(node_current, instance,
6025 self.op.hvparams, self.op.beparams,
6026 self.op.startup_paused)
6027 msg = result.fail_msg
6029 _ShutdownInstanceDisks(self, instance)
6030 raise errors.OpExecError("Could not start instance: %s" % msg)
6033 class LUInstanceReboot(LogicalUnit):
6034 """Reboot an instance.
6037 HPATH = "instance-reboot"
6038 HTYPE = constants.HTYPE_INSTANCE
6041 def ExpandNames(self):
6042 self._ExpandAndLockInstance()
6044 def BuildHooksEnv(self):
6047 This runs on master, primary and secondary nodes of the instance.
6051 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6052 "REBOOT_TYPE": self.op.reboot_type,
6053 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6056 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6060 def BuildHooksNodes(self):
6061 """Build hooks nodes.
6064 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6067 def CheckPrereq(self):
6068 """Check prerequisites.
6070 This checks that the instance is in the cluster.
6073 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6074 assert self.instance is not None, \
6075 "Cannot retrieve locked instance %s" % self.op.instance_name
6077 _CheckNodeOnline(self, instance.primary_node)
6079 # check bridges existence
6080 _CheckInstanceBridgesExist(self, instance)
6082 def Exec(self, feedback_fn):
6083 """Reboot the instance.
6086 instance = self.instance
6087 ignore_secondaries = self.op.ignore_secondaries
6088 reboot_type = self.op.reboot_type
6090 remote_info = self.rpc.call_instance_info(instance.primary_node,
6092 instance.hypervisor)
6093 remote_info.Raise("Error checking node %s" % instance.primary_node)
6094 instance_running = bool(remote_info.payload)
6096 node_current = instance.primary_node
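# A soft or hard reboot of a running instance is delegated to the hypervisor
# on the primary node; in all other cases (full reboot, or the instance is
# not running) the instance is stopped, its disks re-assembled and it is
# started again from scratch.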
6098 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6099 constants.INSTANCE_REBOOT_HARD]:
6100 for disk in instance.disks:
6101 self.cfg.SetDiskID(disk, node_current)
6102 result = self.rpc.call_instance_reboot(node_current, instance,
6104 self.op.shutdown_timeout)
6105 result.Raise("Could not reboot instance")
6107 if instance_running:
6108 result = self.rpc.call_instance_shutdown(node_current, instance,
6109 self.op.shutdown_timeout)
6110 result.Raise("Could not shutdown instance for full reboot")
6111 _ShutdownInstanceDisks(self, instance)
6113 self.LogInfo("Instance %s was already stopped, starting now",
6115 _StartInstanceDisks(self, instance, ignore_secondaries)
6116 result = self.rpc.call_instance_start(node_current, instance,
6118 msg = result.fail_msg
6120 _ShutdownInstanceDisks(self, instance)
6121 raise errors.OpExecError("Could not start instance for"
6122 " full reboot: %s" % msg)
6124 self.cfg.MarkInstanceUp(instance.name)
6127 class LUInstanceShutdown(LogicalUnit):
6128 """Shut down an instance.
6131 HPATH = "instance-stop"
6132 HTYPE = constants.HTYPE_INSTANCE
6135 def ExpandNames(self):
6136 self._ExpandAndLockInstance()
6138 def BuildHooksEnv(self):
6141 This runs on master, primary and secondary nodes of the instance.
6144 env = _BuildInstanceHookEnvByObject(self, self.instance)
6145 env["TIMEOUT"] = self.op.timeout
6148 def BuildHooksNodes(self):
6149 """Build hooks nodes.
6152 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6155 def CheckPrereq(self):
6156 """Check prerequisites.
6158 This checks that the instance is in the cluster.
6161 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6162 assert self.instance is not None, \
6163 "Cannot retrieve locked instance %s" % self.op.instance_name
6165 self.primary_offline = \
6166 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6168 if self.primary_offline and self.op.ignore_offline_nodes:
6169 self.proc.LogWarning("Ignoring offline primary node")
6171 _CheckNodeOnline(self, self.instance.primary_node)
6173 def Exec(self, feedback_fn):
6174 """Shut down the instance.
6177 instance = self.instance
6178 node_current = instance.primary_node
6179 timeout = self.op.timeout
6181 if not self.op.no_remember:
6182 self.cfg.MarkInstanceDown(instance.name)
6184 if self.primary_offline:
6185 assert self.op.ignore_offline_nodes
6186 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6188 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6189 msg = result.fail_msg
6191 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6193 _ShutdownInstanceDisks(self, instance)
6196 class LUInstanceReinstall(LogicalUnit):
6197 """Reinstall an instance.
6200 HPATH = "instance-reinstall"
6201 HTYPE = constants.HTYPE_INSTANCE
6204 def ExpandNames(self):
6205 self._ExpandAndLockInstance()
6207 def BuildHooksEnv(self):
6210 This runs on master, primary and secondary nodes of the instance.
6213 return _BuildInstanceHookEnvByObject(self, self.instance)
6215 def BuildHooksNodes(self):
6216 """Build hooks nodes.
6219 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6222 def CheckPrereq(self):
6223 """Check prerequisites.
6225 This checks that the instance is in the cluster and is not running.
6228 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6229 assert instance is not None, \
6230 "Cannot retrieve locked instance %s" % self.op.instance_name
6231 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6232 " offline, cannot reinstall")
6233 for node in instance.secondary_nodes:
6234 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6235 " cannot reinstall")
6237 if instance.disk_template == constants.DT_DISKLESS:
6238 raise errors.OpPrereqError("Instance '%s' has no disks" %
6239 self.op.instance_name,
6241 _CheckInstanceDown(self, instance, "cannot reinstall")
6243 if self.op.os_type is not None:
6245 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6246 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6247 instance_os = self.op.os_type
6249 instance_os = instance.os
6251 nodelist = list(instance.all_nodes)
6253 if self.op.osparams:
6254 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6255 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6256 self.os_inst = i_osdict # the new dict (without defaults)
6260 self.instance = instance
6262 def Exec(self, feedback_fn):
6263 """Reinstall the instance.
6266 inst = self.instance
6268 if self.op.os_type is not None:
6269 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6270 inst.os = self.op.os_type
6271 # Write to configuration
6272 self.cfg.Update(inst, feedback_fn)
6274 _StartInstanceDisks(self, inst, None)
6276 feedback_fn("Running the instance OS create scripts...")
6277 # FIXME: pass debug option from opcode to backend
6278 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
6279 self.op.debug_level,
6280 osparams=self.os_inst)
6281 result.Raise("Could not install OS for instance %s on node %s" %
6282 (inst.name, inst.primary_node))
6284 _ShutdownInstanceDisks(self, inst)
6287 class LUInstanceRecreateDisks(LogicalUnit):
6288 """Recreate an instance's missing disks.
6291 HPATH = "instance-recreate-disks"
6292 HTYPE = constants.HTYPE_INSTANCE
6295 def CheckArguments(self):
6296 # normalise the disk list
6297 self.op.disks = sorted(frozenset(self.op.disks))
6299 def ExpandNames(self):
6300 self._ExpandAndLockInstance()
6301 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6303 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6304 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6306 self.needed_locks[locking.LEVEL_NODE] = []
6308 def DeclareLocks(self, level):
6309 if level == locking.LEVEL_NODE:
6310 # if we replace the nodes, we only need to lock the old primary,
6311 # otherwise we need to lock all nodes for disk re-creation
6312 primary_only = bool(self.op.nodes)
6313 self._LockInstancesNodes(primary_only=primary_only)
6315 def BuildHooksEnv(self):
6318 This runs on master, primary and secondary nodes of the instance.
6321 return _BuildInstanceHookEnvByObject(self, self.instance)
6323 def BuildHooksNodes(self):
6324 """Build hooks nodes.
6327 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6330 def CheckPrereq(self):
6331 """Check prerequisites.
6333 This checks that the instance is in the cluster and is not running.
6336 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6337 assert instance is not None, \
6338 "Cannot retrieve locked instance %s" % self.op.instance_name
6340 if len(self.op.nodes) != len(instance.all_nodes):
6341 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6342 " %d replacement nodes were specified" %
6343 (instance.name, len(instance.all_nodes),
6344 len(self.op.nodes)),
6346 assert instance.disk_template != constants.DT_DRBD8 or \
6347 len(self.op.nodes) == 2
6348 assert instance.disk_template != constants.DT_PLAIN or \
6349 len(self.op.nodes) == 1
6350 primary_node = self.op.nodes[0]
6352 primary_node = instance.primary_node
6353 _CheckNodeOnline(self, primary_node)
6355 if instance.disk_template == constants.DT_DISKLESS:
6356 raise errors.OpPrereqError("Instance '%s' has no disks" %
6357 self.op.instance_name, errors.ECODE_INVAL)
6358 # if we replace nodes *and* the old primary is offline, we don't
6360 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6361 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6362 if not (self.op.nodes and old_pnode.offline):
6363 _CheckInstanceDown(self, instance, "cannot recreate disks")
6365 if not self.op.disks:
6366 self.op.disks = range(len(instance.disks))
6368 for idx in self.op.disks:
6369 if idx >= len(instance.disks):
6370 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6372 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6373 raise errors.OpPrereqError("Can't recreate disks partially and"
6374 " change the nodes at the same time",
6376 self.instance = instance
6378 def Exec(self, feedback_fn):
6379 """Recreate the disks.
6382 instance = self.instance
6385 mods = [] # keeps track of needed logical_id changes
6387 for idx, disk in enumerate(instance.disks):
6388 if idx not in self.op.disks: # disk idx has not been passed in
6391 # update secondaries for disks, if needed
6393 if disk.dev_type == constants.LD_DRBD8:
6394 # need to update the nodes and minors
6395 assert len(self.op.nodes) == 2
6396 assert len(disk.logical_id) == 6 # otherwise disk internals
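# For DRBD8 the logical_id tuple is (node_a, node_b, port, minor_a, minor_b,
# secret): only the nodes and the minors change when recreating the disks on
# new nodes, the port and the shared secret are kept.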
6398 (_, _, old_port, _, _, old_secret) = disk.logical_id
6399 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6400 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6401 new_minors[0], new_minors[1], old_secret)
6402 assert len(disk.logical_id) == len(new_id)
6403 mods.append((idx, new_id))
6405 # now that we have passed all asserts above, we can apply the mods
6406 # in a single run (to avoid partial changes)
6407 for idx, new_id in mods:
6408 instance.disks[idx].logical_id = new_id
6410 # change primary node, if needed
6412 instance.primary_node = self.op.nodes[0]
6413 self.LogWarning("Changing the instance's nodes, you will have to"
6414 " remove any disks left on the older nodes manually")
6417 self.cfg.Update(instance, feedback_fn)
6419 _CreateDisks(self, instance, to_skip=to_skip)
6422 class LUInstanceRename(LogicalUnit):
6423 """Rename an instance.
6426 HPATH = "instance-rename"
6427 HTYPE = constants.HTYPE_INSTANCE
6429 def CheckArguments(self):
6433 if self.op.ip_check and not self.op.name_check:
6434 # TODO: make the ip check more flexible and not depend on the name check
6435 raise errors.OpPrereqError("IP address check requires a name check",
6438 def BuildHooksEnv(self):
6441 This runs on master, primary and secondary nodes of the instance.
6444 env = _BuildInstanceHookEnvByObject(self, self.instance)
6445 env["INSTANCE_NEW_NAME"] = self.op.new_name
6448 def BuildHooksNodes(self):
6449 """Build hooks nodes.
6452 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6455 def CheckPrereq(self):
6456 """Check prerequisites.
6458 This checks that the instance is in the cluster and is not running.
6461 self.op.instance_name = _ExpandInstanceName(self.cfg,
6462 self.op.instance_name)
6463 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6464 assert instance is not None
6465 _CheckNodeOnline(self, instance.primary_node)
6466 _CheckInstanceDown(self, instance, "cannot rename")
6467 self.instance = instance
6469 new_name = self.op.new_name
6470 if self.op.name_check:
6471 hostname = netutils.GetHostname(name=new_name)
6472 if hostname.name != new_name:
6473 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6475 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6476 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6477 " same as given hostname '%s'") %
6478 (hostname.name, self.op.new_name),
6480 new_name = self.op.new_name = hostname.name
6481 if (self.op.ip_check and
6482 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6483 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6484 (hostname.ip, new_name),
6485 errors.ECODE_NOTUNIQUE)
6487 instance_list = self.cfg.GetInstanceList()
6488 if new_name in instance_list and new_name != instance.name:
6489 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6490 new_name, errors.ECODE_EXISTS)
6492 def Exec(self, feedback_fn):
6493 """Rename the instance.
6496 inst = self.instance
6497 old_name = inst.name
6499 rename_file_storage = False
6500 if (inst.disk_template in constants.DTS_FILEBASED and
6501 self.op.new_name != inst.name):
6502 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6503 rename_file_storage = True
6505 self.cfg.RenameInstance(inst.name, self.op.new_name)
6506 # Change the instance lock. This is definitely safe while we hold the BGL.
6507 # Otherwise the new lock would have to be added in acquired mode.
6509 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6510 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6512 # re-read the instance from the configuration after rename
6513 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6515 if rename_file_storage:
6516 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6517 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6518 old_file_storage_dir,
6519 new_file_storage_dir)
6520 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6521 " (but the instance has been renamed in Ganeti)" %
6522 (inst.primary_node, old_file_storage_dir,
6523 new_file_storage_dir))
6525 _StartInstanceDisks(self, inst, None)
6527 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6528 old_name, self.op.debug_level)
6529 msg = result.fail_msg
6531 msg = ("Could not run OS rename script for instance %s on node %s"
6532 " (but the instance has been renamed in Ganeti): %s" %
6533 (inst.name, inst.primary_node, msg))
6534 self.proc.LogWarning(msg)
6536 _ShutdownInstanceDisks(self, inst)
6541 class LUInstanceRemove(LogicalUnit):
6542 """Remove an instance.
6545 HPATH = "instance-remove"
6546 HTYPE = constants.HTYPE_INSTANCE
6549 def ExpandNames(self):
6550 self._ExpandAndLockInstance()
6551 self.needed_locks[locking.LEVEL_NODE] = []
6552 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6554 def DeclareLocks(self, level):
6555 if level == locking.LEVEL_NODE:
6556 self._LockInstancesNodes()
6558 def BuildHooksEnv(self):
6561 This runs on master, primary and secondary nodes of the instance.
6564 env = _BuildInstanceHookEnvByObject(self, self.instance)
6565 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6568 def BuildHooksNodes(self):
6569 """Build hooks nodes.
6572 nl = [self.cfg.GetMasterNode()]
6573 nl_post = list(self.instance.all_nodes) + nl
6574 return (nl, nl_post)
6576 def CheckPrereq(self):
6577 """Check prerequisites.
6579 This checks that the instance is in the cluster.
6582 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6583 assert self.instance is not None, \
6584 "Cannot retrieve locked instance %s" % self.op.instance_name
6586 def Exec(self, feedback_fn):
6587 """Remove the instance.
6590 instance = self.instance
6591 logging.info("Shutting down instance %s on node %s",
6592 instance.name, instance.primary_node)
6594 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6595 self.op.shutdown_timeout)
6596 msg = result.fail_msg
6598 if self.op.ignore_failures:
6599 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6601 raise errors.OpExecError("Could not shutdown instance %s on"
6603 (instance.name, instance.primary_node, msg))
6605 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6608 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6609 """Utility function to remove an instance.
6612 logging.info("Removing block devices for instance %s", instance.name)
6614 if not _RemoveDisks(lu, instance):
6615 if not ignore_failures:
6616 raise errors.OpExecError("Can't remove instance's disks")
6617 feedback_fn("Warning: can't remove instance's disks")
6619 logging.info("Removing instance %s out of cluster config", instance.name)
6621 lu.cfg.RemoveInstance(instance.name)
6623 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6624 "Instance lock removal conflict"
6626 # Remove lock for the instance
6627 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6630 class LUInstanceQuery(NoHooksLU):
6631 """Logical unit for querying instances.
6634 # pylint: disable=W0142
6637 def CheckArguments(self):
6638 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6639 self.op.output_fields, self.op.use_locking)
6641 def ExpandNames(self):
6642 self.iq.ExpandNames(self)
6644 def DeclareLocks(self, level):
6645 self.iq.DeclareLocks(self, level)
6647 def Exec(self, feedback_fn):
6648 return self.iq.OldStyleQuery(self)
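# This LU is a thin wrapper: the _InstanceQuery helper built in CheckArguments
# performs the field selection, locking and data gathering, while Exec merely
# returns the data in the old-style query result format.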
6651 class LUInstanceFailover(LogicalUnit):
6652 """Failover an instance.
6655 HPATH = "instance-failover"
6656 HTYPE = constants.HTYPE_INSTANCE
6659 def CheckArguments(self):
6660 """Check the arguments.
6663 self.iallocator = getattr(self.op, "iallocator", None)
6664 self.target_node = getattr(self.op, "target_node", None)
6666 def ExpandNames(self):
6667 self._ExpandAndLockInstance()
6669 if self.op.target_node is not None:
6670 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6672 self.needed_locks[locking.LEVEL_NODE] = []
6673 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6675 ignore_consistency = self.op.ignore_consistency
6676 shutdown_timeout = self.op.shutdown_timeout
6677 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6680 ignore_consistency=ignore_consistency,
6681 shutdown_timeout=shutdown_timeout)
6682 self.tasklets = [self._migrater]
6684 def DeclareLocks(self, level):
6685 if level == locking.LEVEL_NODE:
6686 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6687 if instance.disk_template in constants.DTS_EXT_MIRROR:
6688 if self.op.target_node is None:
6689 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6691 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6692 self.op.target_node]
6693 del self.recalculate_locks[locking.LEVEL_NODE]
6695 self._LockInstancesNodes()
6697 def BuildHooksEnv(self):
6700 This runs on master, primary and secondary nodes of the instance.
6703 instance = self._migrater.instance
6704 source_node = instance.primary_node
6705 target_node = self.op.target_node
6707 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6708 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6709 "OLD_PRIMARY": source_node,
6710 "NEW_PRIMARY": target_node,
6713 if instance.disk_template in constants.DTS_INT_MIRROR:
6714 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6715 env["NEW_SECONDARY"] = source_node
6717 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6719 env.update(_BuildInstanceHookEnvByObject(self, instance))
6723 def BuildHooksNodes(self):
6724 """Build hooks nodes.
6727 instance = self._migrater.instance
6728 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6729 return (nl, nl + [instance.primary_node])
6732 class LUInstanceMigrate(LogicalUnit):
6733 """Migrate an instance.
6735 This is migration without shutting down: unlike a failover, the
6736 instance is kept running while it is moved to another node.
6739 HPATH = "instance-migrate"
6740 HTYPE = constants.HTYPE_INSTANCE
6743 def ExpandNames(self):
6744 self._ExpandAndLockInstance()
6746 if self.op.target_node is not None:
6747 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6749 self.needed_locks[locking.LEVEL_NODE] = []
6750 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6752 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6753 cleanup=self.op.cleanup,
6755 fallback=self.op.allow_failover)
6756 self.tasklets = [self._migrater]
6758 def DeclareLocks(self, level):
6759 if level == locking.LEVEL_NODE:
6760 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6761 if instance.disk_template in constants.DTS_EXT_MIRROR:
6762 if self.op.target_node is None:
6763 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6765 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6766 self.op.target_node]
6767 del self.recalculate_locks[locking.LEVEL_NODE]
6769 self._LockInstancesNodes()
6771 def BuildHooksEnv(self):
6774 This runs on master, primary and secondary nodes of the instance.
6777 instance = self._migrater.instance
6778 source_node = instance.primary_node
6779 target_node = self.op.target_node
6780 env = _BuildInstanceHookEnvByObject(self, instance)
6782 "MIGRATE_LIVE": self._migrater.live,
6783 "MIGRATE_CLEANUP": self.op.cleanup,
6784 "OLD_PRIMARY": source_node,
6785 "NEW_PRIMARY": target_node,
6788 if instance.disk_template in constants.DTS_INT_MIRROR:
6789 env["OLD_SECONDARY"] = target_node
6790 env["NEW_SECONDARY"] = source_node
6792 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6796 def BuildHooksNodes(self):
6797 """Build hooks nodes.
6800 instance = self._migrater.instance
6801 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6802 return (nl, nl + [instance.primary_node])
6805 class LUInstanceMove(LogicalUnit):
6806 """Move an instance by data-copying.
6809 HPATH = "instance-move"
6810 HTYPE = constants.HTYPE_INSTANCE
6813 def ExpandNames(self):
6814 self._ExpandAndLockInstance()
6815 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6816 self.op.target_node = target_node
6817 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6818 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6820 def DeclareLocks(self, level):
6821 if level == locking.LEVEL_NODE:
6822 self._LockInstancesNodes(primary_only=True)
6824 def BuildHooksEnv(self):
6827 This runs on master, primary and secondary nodes of the instance.
6831 "TARGET_NODE": self.op.target_node,
6832 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6834 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6837 def BuildHooksNodes(self):
6838 """Build hooks nodes.
6842 self.cfg.GetMasterNode(),
6843 self.instance.primary_node,
6844 self.op.target_node,
6848 def CheckPrereq(self):
6849 """Check prerequisites.
6851 This checks that the instance is in the cluster.
6854 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6855 assert self.instance is not None, \
6856 "Cannot retrieve locked instance %s" % self.op.instance_name
6858 node = self.cfg.GetNodeInfo(self.op.target_node)
6859 assert node is not None, \
6860 "Cannot retrieve locked node %s" % self.op.target_node
6862 self.target_node = target_node = node.name
6864 if target_node == instance.primary_node:
6865 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6866 (instance.name, target_node),
6869 bep = self.cfg.GetClusterInfo().FillBE(instance)
6871 for idx, dsk in enumerate(instance.disks):
6872 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6873 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6874 " cannot copy" % idx, errors.ECODE_STATE)
6876 _CheckNodeOnline(self, target_node)
6877 _CheckNodeNotDrained(self, target_node)
6878 _CheckNodeVmCapable(self, target_node)
6880 if instance.admin_up:
6881 # check memory requirements on the secondary node
6882 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6883 instance.name, bep[constants.BE_MEMORY],
6884 instance.hypervisor)
6886 self.LogInfo("Not checking memory on the secondary node as"
6887 " instance will not be started")
6889 # check bridge existence
6890 _CheckInstanceBridgesExist(self, instance, node=target_node)
6892 def Exec(self, feedback_fn):
6893 """Move an instance.
6895 The move is done by shutting it down on its present node, copying
6896 the data over (slow) and starting it on the new node.
6899 instance = self.instance
6901 source_node = instance.primary_node
6902 target_node = self.target_node
6904 self.LogInfo("Shutting down instance %s on source node %s",
6905 instance.name, source_node)
6907 result = self.rpc.call_instance_shutdown(source_node, instance,
6908 self.op.shutdown_timeout)
6909 msg = result.fail_msg
6911 if self.op.ignore_consistency:
6912 self.proc.LogWarning("Could not shutdown instance %s on node %s."
6913 " Proceeding anyway. Please make sure node"
6914 " %s is down. Error details: %s",
6915 instance.name, source_node, source_node, msg)
6917 raise errors.OpExecError("Could not shutdown instance %s on"
6919 (instance.name, source_node, msg))
6921 # create the target disks
6923 _CreateDisks(self, instance, target_node=target_node)
6924 except errors.OpExecError:
6925 self.LogWarning("Device creation failed, reverting...")
6927 _RemoveDisks(self, instance, target_node=target_node)
6929 self.cfg.ReleaseDRBDMinors(instance.name)
6932 cluster_name = self.cfg.GetClusterInfo().cluster_name
6935 # activate, get path, copy the data over
6936 for idx, disk in enumerate(instance.disks):
6937 self.LogInfo("Copying data for disk %d", idx)
6938 result = self.rpc.call_blockdev_assemble(target_node, disk,
6939 instance.name, True, idx)
6941 self.LogWarning("Can't assemble newly created disk %d: %s",
6942 idx, result.fail_msg)
6943 errs.append(result.fail_msg)
6945 dev_path = result.payload
6946 result = self.rpc.call_blockdev_export(source_node, disk,
6947 target_node, dev_path,
6950 self.LogWarning("Can't copy data over for disk %d: %s",
6951 idx, result.fail_msg)
6952 errs.append(result.fail_msg)
6956 self.LogWarning("Some disks failed to copy, aborting")
6958 _RemoveDisks(self, instance, target_node=target_node)
6960 self.cfg.ReleaseDRBDMinors(instance.name)
6961 raise errors.OpExecError("Errors during disk copy: %s" %
6964 instance.primary_node = target_node
6965 self.cfg.Update(instance, feedback_fn)
6967 self.LogInfo("Removing the disks on the original node")
6968 _RemoveDisks(self, instance, target_node=source_node)
6970 # Only start the instance if it's marked as up
6971 if instance.admin_up:
6972 self.LogInfo("Starting instance %s on node %s",
6973 instance.name, target_node)
6975 disks_ok, _ = _AssembleInstanceDisks(self, instance,
6976 ignore_secondaries=True)
6978 _ShutdownInstanceDisks(self, instance)
6979 raise errors.OpExecError("Can't activate the instance's disks")
6981 result = self.rpc.call_instance_start(target_node, instance,
6983 msg = result.fail_msg
6985 _ShutdownInstanceDisks(self, instance)
6986 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
6987 (instance.name, target_node, msg))
6990 class LUNodeMigrate(LogicalUnit):
6991 """Migrate all instances from a node.
6994 HPATH = "node-migrate"
6995 HTYPE = constants.HTYPE_NODE
6998 def CheckArguments(self):
7001 def ExpandNames(self):
7002 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7004 self.share_locks = _ShareAll()
7005 self.needed_locks = {
7006 locking.LEVEL_NODE: [self.op.node_name],
7009 def BuildHooksEnv(self):
7012 This runs on the master, the primary and all the secondaries.
7016 "NODE_NAME": self.op.node_name,
7019 def BuildHooksNodes(self):
7020 """Build hooks nodes.
7023 nl = [self.cfg.GetMasterNode()]
7026 def CheckPrereq(self):
7029 def Exec(self, feedback_fn):
7030 # Prepare jobs for migration instances
7032 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7035 iallocator=self.op.iallocator,
7036 target_node=self.op.target_node)]
7037 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7040 # TODO: Run iallocator in this opcode and pass correct placement options to
7041 # OpInstanceMigrate. Since other jobs can modify the cluster between
7042 # running the iallocator and the actual migration, a good consistency model
7043 # will have to be found.
7045 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7046 frozenset([self.op.node_name]))
7048 return ResultWithJobs(jobs)
7051 class TLMigrateInstance(Tasklet):
7052 """Tasklet class for instance migration.
7055 @ivar live: whether the migration will be done live or non-live;
7056 this variable is initialized only after CheckPrereq has run
7057 @type cleanup: boolean
7058 @ivar cleanup: Whether we clean up from a failed migration
7059 @type iallocator: string
7060 @ivar iallocator: The iallocator used to determine target_node
7061 @type target_node: string
7062 @ivar target_node: If given, the target_node to reallocate the instance to
7063 @type failover: boolean
7064 @ivar failover: Whether operation results in failover or migration
7065 @type fallback: boolean
7066 @ivar fallback: Whether fallback to failover is allowed if migration not
7068 @type ignore_consistency: boolean
7069 @ivar ignore_consistency: Whether we should ignore consistency between source
7071 @type shutdown_timeout: int
7072 @ivar shutdown_timeout: In case of failover, the timeout of the shutdown
7075 def __init__(self, lu, instance_name, cleanup=False,
7076 failover=False, fallback=False,
7077 ignore_consistency=False,
7078 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7079 """Initializes this class.
7082 Tasklet.__init__(self, lu)
7085 self.instance_name = instance_name
7086 self.cleanup = cleanup
7087 self.live = False # will be overridden later
7088 self.failover = failover
7089 self.fallback = fallback
7090 self.ignore_consistency = ignore_consistency
7091 self.shutdown_timeout = shutdown_timeout
7093 def CheckPrereq(self):
7094 """Check prerequisites.
7096 This checks that the instance is in the cluster.
7099 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7100 instance = self.cfg.GetInstanceInfo(instance_name)
7101 assert instance is not None
7102 self.instance = instance
7104 if (not self.cleanup and not instance.admin_up and not self.failover and
7106 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7108 self.failover = True
7110 if instance.disk_template not in constants.DTS_MIRRORED:
7115 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7116 " %s" % (instance.disk_template, text),
7119 if instance.disk_template in constants.DTS_EXT_MIRROR:
7120 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7122 if self.lu.op.iallocator:
7123 self._RunAllocator()
7125 # We set self.target_node as it is required by
7127 self.target_node = self.lu.op.target_node
7129 # self.target_node is already populated, either directly or by the
7131 target_node = self.target_node
7132 if self.target_node == instance.primary_node:
7133 raise errors.OpPrereqError("Cannot migrate instance %s"
7134 " to its primary (%s)" %
7135 (instance.name, instance.primary_node))
7137 if len(self.lu.tasklets) == 1:
7138 # It is safe to release locks only when we're the only tasklet
7140 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7141 keep=[instance.primary_node, self.target_node])
7144 secondary_nodes = instance.secondary_nodes
7145 if not secondary_nodes:
7146 raise errors.ConfigurationError("No secondary node but using"
7147 " %s disk template" %
7148 instance.disk_template)
7149 target_node = secondary_nodes[0]
7150 if self.lu.op.iallocator or (self.lu.op.target_node and
7151 self.lu.op.target_node != target_node):
7153 text = "failed over"
7156 raise errors.OpPrereqError("Instances with disk template %s cannot"
7157 " be %s to arbitrary nodes"
7158 " (neither an iallocator nor a target"
7159 " node can be passed)" %
7160 (instance.disk_template, text),
7163 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7165 # check memory requirements on the secondary node
7166 if not self.failover or instance.admin_up:
7167 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7168 instance.name, i_be[constants.BE_MEMORY],
7169 instance.hypervisor)
7171 self.lu.LogInfo("Not checking memory on the secondary node as"
7172 " instance will not be started")
7174 # check bridge existence
7175 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7177 if not self.cleanup:
7178 _CheckNodeNotDrained(self.lu, target_node)
7179 if not self.failover:
7180 result = self.rpc.call_instance_migratable(instance.primary_node,
7182 if result.fail_msg and self.fallback:
7183 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7185 self.failover = True
7187 result.Raise("Can't migrate, please use failover",
7188 prereq=True, ecode=errors.ECODE_STATE)
7190 assert not (self.failover and self.cleanup)
7192 if not self.failover:
7193 if self.lu.op.live is not None and self.lu.op.mode is not None:
7194 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7195 " parameters are accepted",
7197 if self.lu.op.live is not None:
7199 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7201 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7202 # reset the 'live' parameter to None so that repeated
7203 # invocations of CheckPrereq do not raise an exception
7204 self.lu.op.live = None
7205 elif self.lu.op.mode is None:
7206 # read the default value from the hypervisor
7207 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7209 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7211 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7213 # Failover is never live
7216 def _RunAllocator(self):
7217 """Run the allocator based on input opcode.
7220 ial = IAllocator(self.cfg, self.rpc,
7221 mode=constants.IALLOCATOR_MODE_RELOC,
7222 name=self.instance_name,
7223 # TODO See why hail breaks with a single node below
7224 relocate_from=[self.instance.primary_node,
7225 self.instance.primary_node],
7228 ial.Run(self.lu.op.iallocator)
7231 raise errors.OpPrereqError("Can't compute nodes using"
7232 " iallocator '%s': %s" %
7233 (self.lu.op.iallocator, ial.info),
7235 if len(ial.result) != ial.required_nodes:
7236 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7237 " of nodes (%s), required %s" %
7238 (self.lu.op.iallocator, len(ial.result),
7239 ial.required_nodes), errors.ECODE_FAULT)
7240 self.target_node = ial.result[0]
7241 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7242 self.instance_name, self.lu.op.iallocator,
7243 utils.CommaJoin(ial.result))
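# Note: the iallocator must return exactly the number of nodes it reports as
# required; the first node of the result becomes the migration/failover
# target whenever the opcode specified an iallocator instead of an explicit
# target_node.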
7245 def _WaitUntilSync(self):
7246 """Poll with custom rpc for disk sync.
7248 This uses our own step-based rpc call.
7251 self.feedback_fn("* wait until resync is done")
7255 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7257 self.instance.disks)
7259 for node, nres in result.items():
7260 nres.Raise("Cannot resync disks on node %s" % node)
7261 node_done, node_percent = nres.payload
7262 all_done = all_done and node_done
7263 if node_percent is not None:
7264 min_percent = min(min_percent, node_percent)
7266 if min_percent < 100:
7267 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7270 def _EnsureSecondary(self, node):
7271 """Demote a node to secondary.
7274 self.feedback_fn("* switching node %s to secondary mode" % node)
7276 for dev in self.instance.disks:
7277 self.cfg.SetDiskID(dev, node)
7279 result = self.rpc.call_blockdev_close(node, self.instance.name,
7280 self.instance.disks)
7281 result.Raise("Cannot change disk to secondary on node %s" % node)
7283 def _GoStandalone(self):
7284 """Disconnect from the network.
7287 self.feedback_fn("* changing into standalone mode")
7288 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7289 self.instance.disks)
7290 for node, nres in result.items():
7291 nres.Raise("Cannot disconnect disks node %s" % node)
7293 def _GoReconnect(self, multimaster):
7294 """Reconnect to the network.
7300 msg = "single-master"
7301 self.feedback_fn("* changing disks into %s mode" % msg)
7302 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7303 self.instance.disks,
7304 self.instance.name, multimaster)
7305 for node, nres in result.items():
7306 nres.Raise("Cannot change disks config on node %s" % node)
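# _GoReconnect(True) puts the DRBD disks into dual-master mode just before a
# live migration, while _GoReconnect(False) returns them to single-master
# mode afterwards; the cleanup and revert paths below only ever use the
# single-master variant.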
7308 def _ExecCleanup(self):
7309 """Try to cleanup after a failed migration.
7311 The cleanup is done by:
7312 - check that the instance is running only on one node
7313 (and update the config if needed)
7314 - change disks on its secondary node to secondary
7315 - wait until disks are fully synchronized
7316 - disconnect from the network
7317 - change disks into single-master mode
7318 - wait again until disks are fully synchronized
7321 instance = self.instance
7322 target_node = self.target_node
7323 source_node = self.source_node
7325 # check running on only one node
7326 self.feedback_fn("* checking where the instance actually runs"
7327 " (if this hangs, the hypervisor might be in"
7329 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7330 for node, result in ins_l.items():
7331 result.Raise("Can't contact node %s" % node)
7333 runningon_source = instance.name in ins_l[source_node].payload
7334 runningon_target = instance.name in ins_l[target_node].payload
7336 if runningon_source and runningon_target:
7337 raise errors.OpExecError("Instance seems to be running on two nodes,"
7338 " or the hypervisor is confused; you will have"
7339 " to ensure manually that it runs only on one"
7340 " and restart this operation")
7342 if not (runningon_source or runningon_target):
7343 raise errors.OpExecError("Instance does not seem to be running at all;"
7344 " in this case it's safer to repair by"
7345 " running 'gnt-instance stop' to ensure disk"
7346 " shutdown, and then restarting it")
7348 if runningon_target:
7349 # the migration has actually succeeded, we need to update the config
7350 self.feedback_fn("* instance running on secondary node (%s),"
7351 " updating config" % target_node)
7352 instance.primary_node = target_node
7353 self.cfg.Update(instance, self.feedback_fn)
7354 demoted_node = source_node
7356 self.feedback_fn("* instance confirmed to be running on its"
7357 " primary node (%s)" % source_node)
7358 demoted_node = target_node
7360 if instance.disk_template in constants.DTS_INT_MIRROR:
7361 self._EnsureSecondary(demoted_node)
7363 self._WaitUntilSync()
7364 except errors.OpExecError:
7365 # we ignore here errors, since if the device is standalone, it
7366 # won't be able to sync
7368 self._GoStandalone()
7369 self._GoReconnect(False)
7370 self._WaitUntilSync()
7372 self.feedback_fn("* done")
7374 def _RevertDiskStatus(self):
7375 """Try to revert the disk status after a failed migration.
7378 target_node = self.target_node
7379 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7383 self._EnsureSecondary(target_node)
7384 self._GoStandalone()
7385 self._GoReconnect(False)
7386 self._WaitUntilSync()
7387 except errors.OpExecError, err:
7388 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7389 " please try to recover the instance manually;"
7390 " error '%s'" % str(err))
7392 def _AbortMigration(self):
7393 """Call the hypervisor code to abort a started migration.
7396 instance = self.instance
7397 target_node = self.target_node
7398 migration_info = self.migration_info
7400 abort_result = self.rpc.call_finalize_migration(target_node,
7404 abort_msg = abort_result.fail_msg
7406 logging.error("Aborting migration failed on target node %s: %s",
7407 target_node, abort_msg)
7408 # Don't raise an exception here, as we still have to try to revert the
7409 # disk status, even if this step failed.
7411 def _ExecMigration(self):
7412 """Migrate an instance.
7414 The migrate is done by:
7415 - change the disks into dual-master mode
7416 - wait until disks are fully synchronized again
7417 - migrate the instance
7418 - change disks on the new secondary node (the old primary) to secondary
7419 - wait until disks are fully synchronized
7420 - change disks into single-master mode
7423 instance = self.instance
7424 target_node = self.target_node
7425 source_node = self.source_node
7427 # Check for hypervisor version mismatch and warn the user.
7428 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7429 None, self.instance.hypervisor)
7430 src_info = nodeinfo[source_node]
7431 dst_info = nodeinfo[target_node]
7433 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7434 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7435 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7436 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7437 if src_version != dst_version:
7438 self.feedback_fn("* warning: hypervisor version mismatch between"
7439 " source (%s) and target (%s) node" %
7440 (src_version, dst_version))
7442 self.feedback_fn("* checking disk consistency between source and target")
7443 for dev in instance.disks:
7444 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7445 raise errors.OpExecError("Disk %s is degraded or not fully"
7446 " synchronized on target node,"
7447 " aborting migration" % dev.iv_name)
7449 # First get the migration information from the remote node
7450 result = self.rpc.call_migration_info(source_node, instance)
7451 msg = result.fail_msg
7453 log_err = ("Failed fetching source migration information from %s: %s" %
7455 logging.error(log_err)
7456 raise errors.OpExecError(log_err)
7458 self.migration_info = migration_info = result.payload
7460 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7461 # Then switch the disks to master/master mode
7462 self._EnsureSecondary(target_node)
7463 self._GoStandalone()
7464 self._GoReconnect(True)
7465 self._WaitUntilSync()
7467 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7468 result = self.rpc.call_accept_instance(target_node,
7471 self.nodes_ip[target_node])
7473 msg = result.fail_msg
7475 logging.error("Instance pre-migration failed, trying to revert"
7476 " disk status: %s", msg)
7477 self.feedback_fn("Pre-migration failed, aborting")
7478 self._AbortMigration()
7479 self._RevertDiskStatus()
7480 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7481 (instance.name, msg))
7483 self.feedback_fn("* migrating instance to %s" % target_node)
7484 result = self.rpc.call_instance_migrate(source_node, instance,
7485 self.nodes_ip[target_node],
7487 msg = result.fail_msg
7489 logging.error("Instance migration failed, trying to revert"
7490 " disk status: %s", msg)
7491 self.feedback_fn("Migration failed, aborting")
7492 self._AbortMigration()
7493 self._RevertDiskStatus()
7494 raise errors.OpExecError("Could not migrate instance %s: %s" %
7495 (instance.name, msg))
7497 instance.primary_node = target_node
7498 # distribute new instance config to the other nodes
7499 self.cfg.Update(instance, self.feedback_fn)
7501 result = self.rpc.call_finalize_migration(target_node,
7505 msg = result.fail_msg
7507 logging.error("Instance migration succeeded, but finalization failed:"
7509 raise errors.OpExecError("Could not finalize instance migration: %s" %
7512 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7513 self._EnsureSecondary(source_node)
7514 self._WaitUntilSync()
7515 self._GoStandalone()
7516 self._GoReconnect(False)
7517 self._WaitUntilSync()
7519 self.feedback_fn("* done")
7521 def _ExecFailover(self):
7522 """Failover an instance.
7524 The failover is done by shutting it down on its present node and
7525 starting it on the secondary.
7528 instance = self.instance
7529 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7531 source_node = instance.primary_node
7532 target_node = self.target_node
7534 if instance.admin_up:
7535 self.feedback_fn("* checking disk consistency between source and target")
7536 for dev in instance.disks:
7537 # for drbd, these are drbd over lvm
7538 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7539 if primary_node.offline:
7540 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7542 (primary_node.name, dev.iv_name, target_node))
7543 elif not self.ignore_consistency:
7544 raise errors.OpExecError("Disk %s is degraded on target node,"
7545 " aborting failover" % dev.iv_name)
7547 self.feedback_fn("* not checking disk consistency as instance is not"
7550 self.feedback_fn("* shutting down instance on source node")
7551 logging.info("Shutting down instance %s on node %s",
7552 instance.name, source_node)
7554 result = self.rpc.call_instance_shutdown(source_node, instance,
7555 self.shutdown_timeout)
7556 msg = result.fail_msg
7558 if self.ignore_consistency or primary_node.offline:
7559 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7560 " proceeding anyway; please make sure node"
7561 " %s is down; error details: %s",
7562 instance.name, source_node, source_node, msg)
7564 raise errors.OpExecError("Could not shutdown instance %s on"
7566 (instance.name, source_node, msg))
7568 self.feedback_fn("* deactivating the instance's disks on source node")
7569 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7570 raise errors.OpExecError("Can't shut down the instance's disks")
7572 instance.primary_node = target_node
7573 # distribute new instance config to the other nodes
7574 self.cfg.Update(instance, self.feedback_fn)
7576 # Only start the instance if it's marked as up
7577 if instance.admin_up:
7578 self.feedback_fn("* activating the instance's disks on target node %s" %
7580 logging.info("Starting instance %s on node %s",
7581 instance.name, target_node)
7583 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7584 ignore_secondaries=True)
7586 _ShutdownInstanceDisks(self.lu, instance)
7587 raise errors.OpExecError("Can't activate the instance's disks")
7589 self.feedback_fn("* starting the instance on the target node %s" %
7591 result = self.rpc.call_instance_start(target_node, instance, None, None,
7593 msg = result.fail_msg
7595 _ShutdownInstanceDisks(self.lu, instance)
7596 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7597 (instance.name, target_node, msg))
7599 def Exec(self, feedback_fn):
7600 """Perform the migration.
7603 self.feedback_fn = feedback_fn
7604 self.source_node = self.instance.primary_node
7606 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7607 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7608 self.target_node = self.instance.secondary_nodes[0]
7609 # Otherwise self.target_node has been populated either
7610 # directly, or through an iallocator.
7612 self.all_nodes = [self.source_node, self.target_node]
7613 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7614 in self.cfg.GetMultiNodeInfo(self.all_nodes))
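# Illustrative note: nodes_ip maps each involved node name to its secondary
# (replication) IP address, e.g. (made-up names and addresses):
#   {"node1.example.com": "192.0.2.11", "node2.example.com": "192.0.2.12"}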
7617 feedback_fn("Failover instance %s" % self.instance.name)
7618 self._ExecFailover()
7620 feedback_fn("Migrating instance %s" % self.instance.name)
7623 return self._ExecCleanup()
7625 return self._ExecMigration()
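# In short (sketch of the dispatch above): a failover request is handled by
# _ExecFailover, a cleanup request by _ExecCleanup, and a normal live
# migration by _ExecMigration, all using the node/IP data prepared just above.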
7628 def _CreateBlockDev(lu, node, instance, device, force_create,
7630 """Create a tree of block devices on a given node.
7632 If this device type has to be created on secondaries, create it and all its children.
7635 If not, just recurse to children keeping the same 'force' value.
7637 @param lu: the lu on whose behalf we execute
7638 @param node: the node on which to create the device
7639 @type instance: L{objects.Instance}
7640 @param instance: the instance which owns the device
7641 @type device: L{objects.Disk}
7642 @param device: the device to create
7643 @type force_create: boolean
7644 @param force_create: whether to force creation of this device; this
7645 will be changed to True whenever we find a device for which
7646 CreateOnSecondary() returns True
7647 @param info: the extra 'metadata' we should attach to the device
7648 (this will be represented as a LVM tag)
7649 @type force_open: boolean
7650 @param force_open: this parameter will be passed to the
7651 L{backend.BlockdevCreate} function where it specifies
7652 whether we run on the primary node or not, and it affects both
7653 the child assembly and the device's own Open() execution
7656 if device.CreateOnSecondary():
7660 for child in device.children:
7661 _CreateBlockDev(lu, node, instance, child, force_create,
7664 if not force_create:
7667 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
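# Rough illustration of the recursion above (not an exact trace): when
# device.CreateOnSecondary() is true, force_create is switched on and, via the
# loop over device.children, propagated to all children, so e.g. a DRBD8
# device gets its data/metadata LVs created before the DRBD device itself;
# when force_create remains False the device itself is skipped after its
# children have been visited.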
7670 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7671 """Create a single block device on a given node.
7673 This will not recurse over children of the device, so they must be created in advance.
7676 @param lu: the lu on whose behalf we execute
7677 @param node: the node on which to create the device
7678 @type instance: L{objects.Instance}
7679 @param instance: the instance which owns the device
7680 @type device: L{objects.Disk}
7681 @param device: the device to create
7682 @param info: the extra 'metadata' we should attach to the device
7683 (this will be represented as a LVM tag)
7684 @type force_open: boolean
7685 @param force_open: this parameter will be passed to the
7686 L{backend.BlockdevCreate} function where it specifies
7687 whether we run on the primary node or not, and it affects both
7688 the child assembly and the device's own Open() execution
7691 lu.cfg.SetDiskID(device, node)
7692 result = lu.rpc.call_blockdev_create(node, device, device.size,
7693 instance.name, force_open, info)
7694 result.Raise("Can't create block device %s on"
7695 " node %s for instance %s" % (device, node, instance.name))
7696 if device.physical_id is None:
7697 device.physical_id = result.payload
7700 def _GenerateUniqueNames(lu, exts):
7701 """Generate suitable LV names.
7703 This will generate a unique logical volume name for each requested extension.
7708 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7709 results.append("%s%s" % (new_id, val))
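# Example of the naming scheme above (schematic UUIDs):
#   _GenerateUniqueNames(lu, [".disk0", ".disk1"])
#   => ["<uuid-1>.disk0", "<uuid-2>.disk1"]
# where each <uuid-N> stands for a freshly generated unique ID.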
7713 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7714 iv_name, p_minor, s_minor):
7715 """Generate a drbd8 device complete with its children.
7718 assert len(vgnames) == len(names) == 2
7719 port = lu.cfg.AllocatePort()
7720 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7721 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7722 logical_id=(vgnames[0], names[0]))
7723 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
7724 logical_id=(vgnames[1], names[1]))
7725 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7726 logical_id=(primary, secondary, port,
7729 children=[dev_data, dev_meta],
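# Resulting device tree (sketch): a DRBD8 Disk whose logical_id holds the two
# nodes, the allocated port, the two minors and the shared secret, with two
# children: the data LV of the requested size and a 128 MB metadata LV, both
# placed in the volume groups given by vgnames.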
7734 def _GenerateDiskTemplate(lu, template_name,
7735 instance_name, primary_node,
7736 secondary_nodes, disk_info,
7737 file_storage_dir, file_driver,
7738 base_index, feedback_fn):
7739 """Generate the entire disk layout for a given template type.
7742 #TODO: compute space requirements
7744 vgname = lu.cfg.GetVGName()
7745 disk_count = len(disk_info)
7747 if template_name == constants.DT_DISKLESS:
7749 elif template_name == constants.DT_PLAIN:
7750 if len(secondary_nodes) != 0:
7751 raise errors.ProgrammerError("Wrong template configuration")
7753 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7754 for i in range(disk_count)])
7755 for idx, disk in enumerate(disk_info):
7756 disk_index = idx + base_index
7757 vg = disk.get(constants.IDISK_VG, vgname)
7758 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7759 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7760 size=disk[constants.IDISK_SIZE],
7761 logical_id=(vg, names[idx]),
7762 iv_name="disk/%d" % disk_index,
7763 mode=disk[constants.IDISK_MODE])
7764 disks.append(disk_dev)
7765 elif template_name == constants.DT_DRBD8:
7766 if len(secondary_nodes) != 1:
7767 raise errors.ProgrammerError("Wrong template configuration")
7768 remote_node = secondary_nodes[0]
7769 minors = lu.cfg.AllocateDRBDMinor(
7770 [primary_node, remote_node] * len(disk_info), instance_name)
7773 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7774 for i in range(disk_count)]):
7775 names.append(lv_prefix + "_data")
7776 names.append(lv_prefix + "_meta")
7777 for idx, disk in enumerate(disk_info):
7778 disk_index = idx + base_index
7779 data_vg = disk.get(constants.IDISK_VG, vgname)
7780 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7781 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7782 disk[constants.IDISK_SIZE],
7784 names[idx * 2:idx * 2 + 2],
7785 "disk/%d" % disk_index,
7786 minors[idx * 2], minors[idx * 2 + 1])
7787 disk_dev.mode = disk[constants.IDISK_MODE]
7788 disks.append(disk_dev)
7789 elif template_name == constants.DT_FILE:
7790 if len(secondary_nodes) != 0:
7791 raise errors.ProgrammerError("Wrong template configuration")
7793 opcodes.RequireFileStorage()
7795 for idx, disk in enumerate(disk_info):
7796 disk_index = idx + base_index
7797 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7798 size=disk[constants.IDISK_SIZE],
7799 iv_name="disk/%d" % disk_index,
7800 logical_id=(file_driver,
7801 "%s/disk%d" % (file_storage_dir,
7803 mode=disk[constants.IDISK_MODE])
7804 disks.append(disk_dev)
7805 elif template_name == constants.DT_SHARED_FILE:
7806 if len(secondary_nodes) != 0:
7807 raise errors.ProgrammerError("Wrong template configuration")
7809 opcodes.RequireSharedFileStorage()
7811 for idx, disk in enumerate(disk_info):
7812 disk_index = idx + base_index
7813 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7814 size=disk[constants.IDISK_SIZE],
7815 iv_name="disk/%d" % disk_index,
7816 logical_id=(file_driver,
7817 "%s/disk%d" % (file_storage_dir,
7819 mode=disk[constants.IDISK_MODE])
7820 disks.append(disk_dev)
7821 elif template_name == constants.DT_BLOCK:
7822 if len(secondary_nodes) != 0:
7823 raise errors.ProgrammerError("Wrong template configuration")
7825 for idx, disk in enumerate(disk_info):
7826 disk_index = idx + base_index
7827 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7828 size=disk[constants.IDISK_SIZE],
7829 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7830 disk[constants.IDISK_ADOPT]),
7831 iv_name="disk/%d" % disk_index,
7832 mode=disk[constants.IDISK_MODE])
7833 disks.append(disk_dev)
7836 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
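# Example outcome (sketch, hypothetical sizes): DT_PLAIN with one 1024 MB disk
# yields a single LV-backed Disk with iv_name "disk/0"; DT_DRBD8 yields one
# DRBD8 tree (as sketched above) per disk; the file-based templates yield
# Disks pointing at files under file_storage_dir, and DT_BLOCK Disks wrapping
# the manually provided block devices.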
7840 def _GetInstanceInfoText(instance):
7841 """Compute the text that should be added to the disk's metadata.
7844 return "originstname+%s" % instance.name
7847 def _CalcEta(time_taken, written, total_size):
7848 """Calculates the ETA based on size written and total size.
7850 @param time_taken: The time taken so far
7851 @param written: amount written so far
7852 @param total_size: The total size of data to be written
7853 @return: The remaining time in seconds
7856 avg_time = time_taken / float(written)
7857 return (total_size - written) * avg_time
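# Worked example (made-up numbers, any consistent unit):
#   _CalcEta(120.0, 1024, 4096) == (4096 - 1024) * (120.0 / 1024) == 360.0
# i.e. after writing a quarter of the data in two minutes, six more minutes
# are estimated to remain.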
7860 def _WipeDisks(lu, instance):
7861 """Wipes instance disks.
7863 @type lu: L{LogicalUnit}
7864 @param lu: the logical unit on whose behalf we execute
7865 @type instance: L{objects.Instance}
7866 @param instance: the instance whose disks we should create
7867 @return: the success of the wipe
7870 node = instance.primary_node
7872 for device in instance.disks:
7873 lu.cfg.SetDiskID(device, node)
7875 logging.info("Pause sync of instance %s disks", instance.name)
7876 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
7878 for idx, success in enumerate(result.payload):
7880 logging.warn("pause-sync of instance %s for disks %d failed",
7884 for idx, device in enumerate(instance.disks):
7885 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk size,
7886 # but at most MAX_WIPE_CHUNK
7887 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
7888 constants.MIN_WIPE_CHUNK_PERCENT)
7889 # we _must_ make this an int, otherwise rounding errors will occur
7891 wipe_chunk_size = int(wipe_chunk_size)
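# Worked example (assuming MIN_WIPE_CHUNK_PERCENT == 10 and
# MAX_WIPE_CHUNK == 1024; the real constants may differ): a 20480 MB disk
# gives min(1024, 20480 / 100.0 * 10) = 1024 MB chunks, while a 5120 MB disk
# gives min(1024, 512) = 512 MB chunks.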
7893 lu.LogInfo("* Wiping disk %d", idx)
7894 logging.info("Wiping disk %d for instance %s, node %s using"
7895 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
7900 start_time = time.time()
7902 while offset < size:
7903 wipe_size = min(wipe_chunk_size, size - offset)
7904 logging.debug("Wiping disk %d, offset %s, chunk %s",
7905 idx, offset, wipe_size)
7906 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
7907 result.Raise("Could not wipe disk %d at offset %d for size %d" %
7908 (idx, offset, wipe_size))
7911 if now - last_output >= 60:
7912 eta = _CalcEta(now - start_time, offset, size)
7913 lu.LogInfo(" - done: %.1f%% ETA: %s" %
7914 (offset / float(size) * 100, utils.FormatSeconds(eta)))
7917 logging.info("Resume sync of instance %s disks", instance.name)
7919 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
7921 for idx, success in enumerate(result.payload):
7923 lu.LogWarning("Resume sync of disk %d failed, please have a"
7924 " look at the status and troubleshoot the issue", idx)
7925 logging.warn("resume-sync of instance %s for disks %d failed",
7929 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
7930 """Create all disks for an instance.
7932 This abstracts away some work from AddInstance.
7934 @type lu: L{LogicalUnit}
7935 @param lu: the logical unit on whose behalf we execute
7936 @type instance: L{objects.Instance}
7937 @param instance: the instance whose disks we should create
7939 @param to_skip: list of indices to skip
7940 @type target_node: string
7941 @param target_node: if passed, overrides the target node for creation
7943 @return: the success of the creation
7946 info = _GetInstanceInfoText(instance)
7947 if target_node is None:
7948 pnode = instance.primary_node
7949 all_nodes = instance.all_nodes
7954 if instance.disk_template in constants.DTS_FILEBASED:
7955 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
7956 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
7958 result.Raise("Failed to create directory '%s' on"
7959 " node %s" % (file_storage_dir, pnode))
7961 # Note: this needs to be kept in sync with adding of disks in
7962 # LUInstanceSetParams
7963 for idx, device in enumerate(instance.disks):
7964 if to_skip and idx in to_skip:
7966 logging.info("Creating volume %s for instance %s",
7967 device.iv_name, instance.name)
7969 for node in all_nodes:
7970 f_create = node == pnode
7971 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
7974 def _RemoveDisks(lu, instance, target_node=None):
7975 """Remove all disks for an instance.
7977 This abstracts away some work from `AddInstance()` and
7978 `RemoveInstance()`. Note that in case some of the devices couldn't
7979 be removed, the removal will continue with the other ones (compare
7980 with `_CreateDisks()`).
7982 @type lu: L{LogicalUnit}
7983 @param lu: the logical unit on whose behalf we execute
7984 @type instance: L{objects.Instance}
7985 @param instance: the instance whose disks we should remove
7986 @type target_node: string
7987 @param target_node: used to override the node on which to remove the disks
7989 @return: the success of the removal
7992 logging.info("Removing block devices for instance %s", instance.name)
7995 for device in instance.disks:
7997 edata = [(target_node, device)]
7999 edata = device.ComputeNodeTree(instance.primary_node)
8000 for node, disk in edata:
8001 lu.cfg.SetDiskID(disk, node)
8002 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8004 lu.LogWarning("Could not remove block device %s on node %s,"
8005 " continuing anyway: %s", device.iv_name, node, msg)
8008 # if this is a DRBD disk, return its port to the pool
8009 if device.dev_type in constants.LDS_DRBD:
8010 tcp_port = device.logical_id[2]
8011 lu.cfg.AddTcpUdpPort(tcp_port)
8013 if instance.disk_template == constants.DT_FILE:
8014 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8018 tgt = instance.primary_node
8019 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8021 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8022 file_storage_dir, instance.primary_node, result.fail_msg)
8028 def _ComputeDiskSizePerVG(disk_template, disks):
8029 """Compute disk size requirements in the volume group
8032 def _compute(disks, payload):
8033 """Universal algorithm.
8038 vgs[disk[constants.IDISK_VG]] = \
8039 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8043 # Required free disk space as a function of disk and swap space
8045 constants.DT_DISKLESS: {},
8046 constants.DT_PLAIN: _compute(disks, 0),
8047 # 128 MB are added for drbd metadata for each disk
8048 constants.DT_DRBD8: _compute(disks, 128),
8049 constants.DT_FILE: {},
8050 constants.DT_SHARED_FILE: {},
8053 if disk_template not in req_size_dict:
8054 raise errors.ProgrammerError("Disk template '%s' size requirement"
8055 " is unknown" % disk_template)
8057 return req_size_dict[disk_template]
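# Worked example (hypothetical input, keys shown schematically for
# constants.IDISK_VG / constants.IDISK_SIZE): for constants.DT_DRBD8 and
# disks = [{vg: "xenvg", size: 1024}, {vg: "xenvg", size: 2048}] the result is
# {"xenvg": (1024 + 128) + (2048 + 128)} == {"xenvg": 3328}, i.e. each disk
# contributes its size plus 128 MB of DRBD metadata to its volume group.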
8060 def _ComputeDiskSize(disk_template, disks):
8061 """Compute the total disk size requirement for the given disk template.
8064 # Required free disk space as a function of disk and swap space
8066 constants.DT_DISKLESS: None,
8067 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8068 # 128 MB are added for drbd metadata for each disk
8069 constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks),
8070 constants.DT_FILE: None,
8071 constants.DT_SHARED_FILE: 0,
8072 constants.DT_BLOCK: 0,
8075 if disk_template not in req_size_dict:
8076 raise errors.ProgrammerError("Disk template '%s' size requirement"
8077 " is unknown" % disk_template)
8079 return req_size_dict[disk_template]
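# Worked example (hypothetical sizes): two disks of 1024 MB and 2048 MB need
# 3072 MB for constants.DT_PLAIN and 3072 + 2 * 128 = 3328 MB for
# constants.DT_DRBD8, while the file-based, block and diskless templates need
# no volume group space at all.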
8082 def _FilterVmNodes(lu, nodenames):
8083 """Filters out non-vm_capable nodes from a list.
8085 @type lu: L{LogicalUnit}
8086 @param lu: the logical unit for which we check
8087 @type nodenames: list
8088 @param nodenames: the list of nodes on which we should check
8090 @return: the list of vm-capable nodes
8093 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8094 return [name for name in nodenames if name not in vm_nodes]
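# Example (hypothetical names): with nodenames = ["node1", "node2", "node3"]
# and "node2" marked as not vm_capable, this returns ["node1", "node3"], so
# subsequent hypervisor/OS checks only run on nodes that can host instances.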
8097 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8098 """Hypervisor parameter validation.
8100 This function abstracts the hypervisor parameter validation to be
8101 used in both instance create and instance modify.
8103 @type lu: L{LogicalUnit}
8104 @param lu: the logical unit for which we check
8105 @type nodenames: list
8106 @param nodenames: the list of nodes on which we should check
8107 @type hvname: string
8108 @param hvname: the name of the hypervisor we should use
8109 @type hvparams: dict
8110 @param hvparams: the parameters which we need to check
8111 @raise errors.OpPrereqError: if the parameters are not valid
8114 nodenames = _FilterVmNodes(lu, nodenames)
8115 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
8118 for node in nodenames:
8122 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8125 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8126 """OS parameters validation.
8128 @type lu: L{LogicalUnit}
8129 @param lu: the logical unit for which we check
8130 @type required: boolean
8131 @param required: whether the validation should fail if the OS is not found
8133 @type nodenames: list
8134 @param nodenames: the list of nodes on which we should check
8135 @type osname: string
8136 @param osname: the name of the OS we should use
8137 @type osparams: dict
8138 @param osparams: the parameters which we need to check
8139 @raise errors.OpPrereqError: if the parameters are not valid
8142 nodenames = _FilterVmNodes(lu, nodenames)
8143 result = lu.rpc.call_os_validate(required, nodenames, osname,
8144 [constants.OS_VALIDATE_PARAMETERS],
8146 for node, nres in result.items():
8147 # we don't check for offline cases since this should be run only
8148 # against the master node and/or an instance's nodes
8149 nres.Raise("OS Parameters validation failed on node %s" % node)
8150 if not nres.payload:
8151 lu.LogInfo("OS %s not found on node %s, validation skipped",
8155 class LUInstanceCreate(LogicalUnit):
8156 """Create an instance.
8159 HPATH = "instance-add"
8160 HTYPE = constants.HTYPE_INSTANCE
8163 def CheckArguments(self):
8167 # do not require name_check to ease forward/backward compatibility
8169 if self.op.no_install and self.op.start:
8170 self.LogInfo("No-installation mode selected, disabling startup")
8171 self.op.start = False
8172 # validate/normalize the instance name
8173 self.op.instance_name = \
8174 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8176 if self.op.ip_check and not self.op.name_check:
8177 # TODO: make the ip check more flexible and not depend on the name check
8178 raise errors.OpPrereqError("Cannot do IP address check without a name"
8179 " check", errors.ECODE_INVAL)
8181 # check nics' parameter names
8182 for nic in self.op.nics:
8183 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8185 # check disks. parameter names and consistent adopt/no-adopt strategy
8186 has_adopt = has_no_adopt = False
8187 for disk in self.op.disks:
8188 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8189 if constants.IDISK_ADOPT in disk:
8193 if has_adopt and has_no_adopt:
8194 raise errors.OpPrereqError("Either all disks are adopted or none is",
8197 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8198 raise errors.OpPrereqError("Disk adoption is not supported for the"
8199 " '%s' disk template" %
8200 self.op.disk_template,
8202 if self.op.iallocator is not None:
8203 raise errors.OpPrereqError("Disk adoption not allowed with an"
8204 " iallocator script", errors.ECODE_INVAL)
8205 if self.op.mode == constants.INSTANCE_IMPORT:
8206 raise errors.OpPrereqError("Disk adoption not allowed for"
8207 " instance import", errors.ECODE_INVAL)
8209 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8210 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8211 " but no 'adopt' parameter given" %
8212 self.op.disk_template,
8215 self.adopt_disks = has_adopt
8217 # instance name verification
8218 if self.op.name_check:
8219 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8220 self.op.instance_name = self.hostname1.name
8221 # used in CheckPrereq for ip ping check
8222 self.check_ip = self.hostname1.ip
8224 self.check_ip = None
8226 # file storage checks
8227 if (self.op.file_driver and
8228 not self.op.file_driver in constants.FILE_DRIVER):
8229 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8230 self.op.file_driver, errors.ECODE_INVAL)
8232 if self.op.disk_template == constants.DT_FILE:
8233 opcodes.RequireFileStorage()
8234 elif self.op.disk_template == constants.DT_SHARED_FILE:
8235 opcodes.RequireSharedFileStorage()
8237 ### Node/iallocator related checks
8238 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8240 if self.op.pnode is not None:
8241 if self.op.disk_template in constants.DTS_INT_MIRROR:
8242 if self.op.snode is None:
8243 raise errors.OpPrereqError("The networked disk templates need"
8244 " a mirror node", errors.ECODE_INVAL)
8246 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8248 self.op.snode = None
8250 self._cds = _GetClusterDomainSecret()
8252 if self.op.mode == constants.INSTANCE_IMPORT:
8253 # On import force_variant must be True, because if we forced it at
8254 # initial install, our only chance when importing it back is that it still works
8256 self.op.force_variant = True
8258 if self.op.no_install:
8259 self.LogInfo("No-installation mode has no effect during import")
8261 elif self.op.mode == constants.INSTANCE_CREATE:
8262 if self.op.os_type is None:
8263 raise errors.OpPrereqError("No guest OS specified",
8265 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8266 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8267 " installation" % self.op.os_type,
8269 if self.op.disk_template is None:
8270 raise errors.OpPrereqError("No disk template specified",
8273 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8274 # Check handshake to ensure both clusters have the same domain secret
8275 src_handshake = self.op.source_handshake
8276 if not src_handshake:
8277 raise errors.OpPrereqError("Missing source handshake",
8280 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8283 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8286 # Load and check source CA
8287 self.source_x509_ca_pem = self.op.source_x509_ca
8288 if not self.source_x509_ca_pem:
8289 raise errors.OpPrereqError("Missing source X509 CA",
8293 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8295 except OpenSSL.crypto.Error, err:
8296 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8297 (err, ), errors.ECODE_INVAL)
8299 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8300 if errcode is not None:
8301 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8304 self.source_x509_ca = cert
8306 src_instance_name = self.op.source_instance_name
8307 if not src_instance_name:
8308 raise errors.OpPrereqError("Missing source instance name",
8311 self.source_instance_name = \
8312 netutils.GetHostname(name=src_instance_name).name
8315 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8316 self.op.mode, errors.ECODE_INVAL)
8318 def ExpandNames(self):
8319 """ExpandNames for CreateInstance.
8321 Figure out the right locks for instance creation.
8324 self.needed_locks = {}
8326 instance_name = self.op.instance_name
8327 # this is just a preventive check, but someone might still add this
8328 # instance in the meantime, and creation will fail at lock-add time
8329 if instance_name in self.cfg.GetInstanceList():
8330 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8331 instance_name, errors.ECODE_EXISTS)
8333 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8335 if self.op.iallocator:
8336 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8338 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8339 nodelist = [self.op.pnode]
8340 if self.op.snode is not None:
8341 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8342 nodelist.append(self.op.snode)
8343 self.needed_locks[locking.LEVEL_NODE] = nodelist
8345 # in case of import lock the source node too
8346 if self.op.mode == constants.INSTANCE_IMPORT:
8347 src_node = self.op.src_node
8348 src_path = self.op.src_path
8350 if src_path is None:
8351 self.op.src_path = src_path = self.op.instance_name
8353 if src_node is None:
8354 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8355 self.op.src_node = None
8356 if os.path.isabs(src_path):
8357 raise errors.OpPrereqError("Importing an instance from a path"
8358 " requires a source node option",
8361 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8362 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8363 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8364 if not os.path.isabs(src_path):
8365 self.op.src_path = src_path = \
8366 utils.PathJoin(constants.EXPORT_DIR, src_path)
8368 def _RunAllocator(self):
8369 """Run the allocator based on input opcode.
8372 nics = [n.ToDict() for n in self.nics]
8373 ial = IAllocator(self.cfg, self.rpc,
8374 mode=constants.IALLOCATOR_MODE_ALLOC,
8375 name=self.op.instance_name,
8376 disk_template=self.op.disk_template,
8379 vcpus=self.be_full[constants.BE_VCPUS],
8380 memory=self.be_full[constants.BE_MEMORY],
8383 hypervisor=self.op.hypervisor,
8386 ial.Run(self.op.iallocator)
8389 raise errors.OpPrereqError("Can't compute nodes using"
8390 " iallocator '%s': %s" %
8391 (self.op.iallocator, ial.info),
8393 if len(ial.result) != ial.required_nodes:
8394 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8395 " of nodes (%s), required %s" %
8396 (self.op.iallocator, len(ial.result),
8397 ial.required_nodes), errors.ECODE_FAULT)
8398 self.op.pnode = ial.result[0]
8399 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8400 self.op.instance_name, self.op.iallocator,
8401 utils.CommaJoin(ial.result))
8402 if ial.required_nodes == 2:
8403 self.op.snode = ial.result[1]
8405 def BuildHooksEnv(self):
8408 This runs on master, primary and secondary nodes of the instance.
8412 "ADD_MODE": self.op.mode,
8414 if self.op.mode == constants.INSTANCE_IMPORT:
8415 env["SRC_NODE"] = self.op.src_node
8416 env["SRC_PATH"] = self.op.src_path
8417 env["SRC_IMAGES"] = self.src_images
8419 env.update(_BuildInstanceHookEnv(
8420 name=self.op.instance_name,
8421 primary_node=self.op.pnode,
8422 secondary_nodes=self.secondaries,
8423 status=self.op.start,
8424 os_type=self.op.os_type,
8425 memory=self.be_full[constants.BE_MEMORY],
8426 vcpus=self.be_full[constants.BE_VCPUS],
8427 nics=_NICListToTuple(self, self.nics),
8428 disk_template=self.op.disk_template,
8429 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8430 for d in self.disks],
8433 hypervisor_name=self.op.hypervisor,
8439 def BuildHooksNodes(self):
8440 """Build hooks nodes.
8443 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8446 def _ReadExportInfo(self):
8447 """Reads the export information from disk.
8449 It will override the opcode source node and path with the actual
8450 information, if these two were not specified before.
8452 @return: the export information
8455 assert self.op.mode == constants.INSTANCE_IMPORT
8457 src_node = self.op.src_node
8458 src_path = self.op.src_path
8460 if src_node is None:
8461 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8462 exp_list = self.rpc.call_export_list(locked_nodes)
8464 for node in exp_list:
8465 if exp_list[node].fail_msg:
8467 if src_path in exp_list[node].payload:
8469 self.op.src_node = src_node = node
8470 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8474 raise errors.OpPrereqError("No export found for relative path %s" %
8475 src_path, errors.ECODE_INVAL)
8477 _CheckNodeOnline(self, src_node)
8478 result = self.rpc.call_export_info(src_node, src_path)
8479 result.Raise("No export or invalid export found in dir %s" % src_path)
8481 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8482 if not export_info.has_section(constants.INISECT_EXP):
8483 raise errors.ProgrammerError("Corrupted export config",
8484 errors.ECODE_ENVIRON)
8486 ei_version = export_info.get(constants.INISECT_EXP, "version")
8487 if (int(ei_version) != constants.EXPORT_VERSION):
8488 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8489 (ei_version, constants.EXPORT_VERSION),
8490 errors.ECODE_ENVIRON)
8493 def _ReadExportParams(self, einfo):
8494 """Use export parameters as defaults.
8496 In case the opcode doesn't specify (as in override) some instance
8497 parameters, then try to use them from the export information, if
8501 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8503 if self.op.disk_template is None:
8504 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8505 self.op.disk_template = einfo.get(constants.INISECT_INS,
8508 raise errors.OpPrereqError("No disk template specified and the export"
8509 " is missing the disk_template information",
8512 if not self.op.disks:
8513 if einfo.has_option(constants.INISECT_INS, "disk_count"):
8515 # TODO: import the disk iv_name too
8516 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
8517 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8518 disks.append({constants.IDISK_SIZE: disk_sz})
8519 self.op.disks = disks
8521 raise errors.OpPrereqError("No disk info specified and the export"
8522 " is missing the disk information",
8525 if (not self.op.nics and
8526 einfo.has_option(constants.INISECT_INS, "nic_count")):
8528 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
8530 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8531 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8536 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8537 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8539 if (self.op.hypervisor is None and
8540 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8541 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8543 if einfo.has_section(constants.INISECT_HYP):
8544 # use the export parameters but do not override the ones
8545 # specified by the user
8546 for name, value in einfo.items(constants.INISECT_HYP):
8547 if name not in self.op.hvparams:
8548 self.op.hvparams[name] = value
8550 if einfo.has_section(constants.INISECT_BEP):
8551 # use the parameters, without overriding
8552 for name, value in einfo.items(constants.INISECT_BEP):
8553 if name not in self.op.beparams:
8554 self.op.beparams[name] = value
8556 # try to read the parameters old style, from the main section
8557 for name in constants.BES_PARAMETERS:
8558 if (name not in self.op.beparams and
8559 einfo.has_option(constants.INISECT_INS, name)):
8560 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8562 if einfo.has_section(constants.INISECT_OSP):
8563 # use the parameters, without overriding
8564 for name, value in einfo.items(constants.INISECT_OSP):
8565 if name not in self.op.osparams:
8566 self.op.osparams[name] = value
8568 def _RevertToDefaults(self, cluster):
8569 """Revert the instance parameters to the default values.
8573 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8574 for name in self.op.hvparams.keys():
8575 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8576 del self.op.hvparams[name]
8578 be_defs = cluster.SimpleFillBE({})
8579 for name in self.op.beparams.keys():
8580 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8581 del self.op.beparams[name]
8583 nic_defs = cluster.SimpleFillNIC({})
8584 for nic in self.op.nics:
8585 for name in constants.NICS_PARAMETERS:
8586 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8589 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8590 for name in self.op.osparams.keys():
8591 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8592 del self.op.osparams[name]
8594 def _CalculateFileStorageDir(self):
8595 """Calculate final instance file storage dir.
8598 # file storage dir calculation/check
8599 self.instance_file_storage_dir = None
8600 if self.op.disk_template in constants.DTS_FILEBASED:
8601 # build the full file storage dir path
8604 if self.op.disk_template == constants.DT_SHARED_FILE:
8605 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8607 get_fsd_fn = self.cfg.GetFileStorageDir
8609 cfg_storagedir = get_fsd_fn()
8610 if not cfg_storagedir:
8611 raise errors.OpPrereqError("Cluster file storage dir not defined")
8612 joinargs.append(cfg_storagedir)
8614 if self.op.file_storage_dir is not None:
8615 joinargs.append(self.op.file_storage_dir)
8617 joinargs.append(self.op.instance_name)
8619 # pylint: disable=W0142
8620 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
8622 def CheckPrereq(self):
8623 """Check prerequisites.
8626 self._CalculateFileStorageDir()
8628 if self.op.mode == constants.INSTANCE_IMPORT:
8629 export_info = self._ReadExportInfo()
8630 self._ReadExportParams(export_info)
8632 if (not self.cfg.GetVGName() and
8633 self.op.disk_template not in constants.DTS_NOT_LVM):
8634 raise errors.OpPrereqError("Cluster does not support lvm-based"
8635 " instances", errors.ECODE_STATE)
8637 if self.op.hypervisor is None:
8638 self.op.hypervisor = self.cfg.GetHypervisorType()
8640 cluster = self.cfg.GetClusterInfo()
8641 enabled_hvs = cluster.enabled_hypervisors
8642 if self.op.hypervisor not in enabled_hvs:
8643 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8644 " cluster (%s)" % (self.op.hypervisor,
8645 ",".join(enabled_hvs)),
8648 # Check tag validity
8649 for tag in self.op.tags:
8650 objects.TaggableObject.ValidateTag(tag)
8652 # check hypervisor parameter syntax (locally)
8653 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8654 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8656 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8657 hv_type.CheckParameterSyntax(filled_hvp)
8658 self.hv_full = filled_hvp
8659 # check that we don't specify global parameters on an instance
8660 _CheckGlobalHvParams(self.op.hvparams)
8662 # fill and remember the beparams dict
8663 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8664 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8666 # build os parameters
8667 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8669 # now that hvp/bep are in final format, let's reset to defaults,
8671 if self.op.identify_defaults:
8672 self._RevertToDefaults(cluster)
8676 for idx, nic in enumerate(self.op.nics):
8677 nic_mode_req = nic.get(constants.INIC_MODE, None)
8678 nic_mode = nic_mode_req
8679 if nic_mode is None:
8680 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8682 # in routed mode, for the first nic, the default ip is 'auto'
8683 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8684 default_ip_mode = constants.VALUE_AUTO
8686 default_ip_mode = constants.VALUE_NONE
8688 # ip validity checks
8689 ip = nic.get(constants.INIC_IP, default_ip_mode)
8690 if ip is None or ip.lower() == constants.VALUE_NONE:
8692 elif ip.lower() == constants.VALUE_AUTO:
8693 if not self.op.name_check:
8694 raise errors.OpPrereqError("IP address set to auto but name checks"
8695 " have been skipped",
8697 nic_ip = self.hostname1.ip
8699 if not netutils.IPAddress.IsValid(ip):
8700 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8704 # TODO: check the ip address for uniqueness
8705 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8706 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8709 # MAC address verification
8710 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8711 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8712 mac = utils.NormalizeAndValidateMac(mac)
8715 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8716 except errors.ReservationError:
8717 raise errors.OpPrereqError("MAC address %s already in use"
8718 " in cluster" % mac,
8719 errors.ECODE_NOTUNIQUE)
8721 # Build nic parameters
8722 link = nic.get(constants.INIC_LINK, None)
8725 nicparams[constants.NIC_MODE] = nic_mode_req
8727 nicparams[constants.NIC_LINK] = link
8729 check_params = cluster.SimpleFillNIC(nicparams)
8730 objects.NIC.CheckParameterSyntax(check_params)
8731 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8733 # disk checks/pre-build
8734 default_vg = self.cfg.GetVGName()
8736 for disk in self.op.disks:
8737 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8738 if mode not in constants.DISK_ACCESS_SET:
8739 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8740 mode, errors.ECODE_INVAL)
8741 size = disk.get(constants.IDISK_SIZE, None)
8743 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8746 except (TypeError, ValueError):
8747 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8750 data_vg = disk.get(constants.IDISK_VG, default_vg)
8752 constants.IDISK_SIZE: size,
8753 constants.IDISK_MODE: mode,
8754 constants.IDISK_VG: data_vg,
8755 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8757 if constants.IDISK_ADOPT in disk:
8758 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8759 self.disks.append(new_disk)
8761 if self.op.mode == constants.INSTANCE_IMPORT:
8763 # Check that the new instance doesn't have less disks than the export
8764 instance_disks = len(self.disks)
8765 export_disks = export_info.getint(constants.INISECT_INS, "disk_count")
8766 if instance_disks < export_disks:
8767 raise errors.OpPrereqError("Not enough disks to import."
8768 " (instance: %d, export: %d)" %
8769 (instance_disks, export_disks),
8773 for idx in range(export_disks):
8774 option = "disk%d_dump" % idx
8775 if export_info.has_option(constants.INISECT_INS, option):
8776 # FIXME: are the old os-es, disk sizes, etc. useful?
8777 export_name = export_info.get(constants.INISECT_INS, option)
8778 image = utils.PathJoin(self.op.src_path, export_name)
8779 disk_images.append(image)
8781 disk_images.append(False)
8783 self.src_images = disk_images
8785 old_name = export_info.get(constants.INISECT_INS, "name")
8787 exp_nic_count = export_info.getint(constants.INISECT_INS, "nic_count")
8788 except (TypeError, ValueError), err:
8789 raise errors.OpPrereqError("Invalid export file, nic_count is not"
8790 " an integer: %s" % str(err),
8792 if self.op.instance_name == old_name:
8793 for idx, nic in enumerate(self.nics):
8794 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
8795 nic_mac_ini = "nic%d_mac" % idx
8796 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8798 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8800 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8801 if self.op.ip_check:
8802 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8803 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8804 (self.check_ip, self.op.instance_name),
8805 errors.ECODE_NOTUNIQUE)
8807 #### mac address generation
8808 # By generating the MAC address here, both the allocator and the hooks get
8809 # the real final MAC address rather than the 'auto' or 'generate' value.
8810 # There is a race condition between the generation and the instance object
8811 # creation, which means that we know the mac is valid now, but we're not
8812 # sure it will be when we actually add the instance. If things go bad
8813 # adding the instance will abort because of a duplicate mac, and the
8814 # creation job will fail.
8815 for nic in self.nics:
8816 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8817 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8821 if self.op.iallocator is not None:
8822 self._RunAllocator()
8824 # Release all unneeded node locks
8825 _ReleaseLocks(self, locking.LEVEL_NODE,
8826 keep=filter(None, [self.op.pnode, self.op.snode,
8829 #### node related checks
8831 # check primary node
8832 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8833 assert self.pnode is not None, \
8834 "Cannot retrieve locked node %s" % self.op.pnode
8836 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8837 pnode.name, errors.ECODE_STATE)
8839 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8840 pnode.name, errors.ECODE_STATE)
8841 if not pnode.vm_capable:
8842 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8843 " '%s'" % pnode.name, errors.ECODE_STATE)
8845 self.secondaries = []
8847 # mirror node verification
8848 if self.op.disk_template in constants.DTS_INT_MIRROR:
8849 if self.op.snode == pnode.name:
8850 raise errors.OpPrereqError("The secondary node cannot be the"
8851 " primary node", errors.ECODE_INVAL)
8852 _CheckNodeOnline(self, self.op.snode)
8853 _CheckNodeNotDrained(self, self.op.snode)
8854 _CheckNodeVmCapable(self, self.op.snode)
8855 self.secondaries.append(self.op.snode)
8857 nodenames = [pnode.name] + self.secondaries
8859 if not self.adopt_disks:
8860 # Check lv size requirements, if not adopting
8861 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8862 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8864 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8865 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8866 disk[constants.IDISK_ADOPT])
8867 for disk in self.disks])
8868 if len(all_lvs) != len(self.disks):
8869 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8871 for lv_name in all_lvs:
8873 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
8874 # to ReserveLV uses the same syntax
8875 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8876 except errors.ReservationError:
8877 raise errors.OpPrereqError("LV named %s used by another instance" %
8878 lv_name, errors.ECODE_NOTUNIQUE)
8880 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8881 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8883 node_lvs = self.rpc.call_lv_list([pnode.name],
8884 vg_names.payload.keys())[pnode.name]
8885 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
8886 node_lvs = node_lvs.payload
8888 delta = all_lvs.difference(node_lvs.keys())
8890 raise errors.OpPrereqError("Missing logical volume(s): %s" %
8891 utils.CommaJoin(delta),
8893 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
8895 raise errors.OpPrereqError("Online logical volumes found, cannot"
8896 " adopt: %s" % utils.CommaJoin(online_lvs),
8898 # update the size of disk based on what is found
8899 for dsk in self.disks:
8900 dsk[constants.IDISK_SIZE] = \
8901 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
8902 dsk[constants.IDISK_ADOPT])][0]))
8904 elif self.op.disk_template == constants.DT_BLOCK:
8905 # Normalize and de-duplicate device paths
8906 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
8907 for disk in self.disks])
8908 if len(all_disks) != len(self.disks):
8909 raise errors.OpPrereqError("Duplicate disk names given for adoption",
8911 baddisks = [d for d in all_disks
8912 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
8914 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
8915 " cannot be adopted" %
8916 (", ".join(baddisks),
8917 constants.ADOPTABLE_BLOCKDEV_ROOT),
8920 node_disks = self.rpc.call_bdev_sizes([pnode.name],
8921 list(all_disks))[pnode.name]
8922 node_disks.Raise("Cannot get block device information from node %s" %
8924 node_disks = node_disks.payload
8925 delta = all_disks.difference(node_disks.keys())
8927 raise errors.OpPrereqError("Missing block device(s): %s" %
8928 utils.CommaJoin(delta),
8930 for dsk in self.disks:
8931 dsk[constants.IDISK_SIZE] = \
8932 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
8934 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
8936 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
8937 # check OS parameters (remotely)
8938 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
8940 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
8942 # memory check on primary node
8944 _CheckNodeFreeMemory(self, self.pnode.name,
8945 "creating instance %s" % self.op.instance_name,
8946 self.be_full[constants.BE_MEMORY],
8949 self.dry_run_result = list(nodenames)
8951 def Exec(self, feedback_fn):
8952 """Create and add the instance to the cluster.
8955 instance = self.op.instance_name
8956 pnode_name = self.pnode.name
8958 ht_kind = self.op.hypervisor
8959 if ht_kind in constants.HTS_REQ_PORT:
8960 network_port = self.cfg.AllocatePort()
8964 disks = _GenerateDiskTemplate(self,
8965 self.op.disk_template,
8966 instance, pnode_name,
8969 self.instance_file_storage_dir,
8970 self.op.file_driver,
8974 iobj = objects.Instance(name=instance, os=self.op.os_type,
8975 primary_node=pnode_name,
8976 nics=self.nics, disks=disks,
8977 disk_template=self.op.disk_template,
8979 network_port=network_port,
8980 beparams=self.op.beparams,
8981 hvparams=self.op.hvparams,
8982 hypervisor=self.op.hypervisor,
8983 osparams=self.op.osparams,
8987 for tag in self.op.tags:
8990 if self.adopt_disks:
8991 if self.op.disk_template == constants.DT_PLAIN:
8992 # rename LVs to the newly-generated names; we need to construct
8993 # 'fake' LV disks with the old data, plus the new unique_id
8994 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
8996 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
8997 rename_to.append(t_dsk.logical_id)
8998 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
8999 self.cfg.SetDiskID(t_dsk, pnode_name)
9000 result = self.rpc.call_blockdev_rename(pnode_name,
9001 zip(tmp_disks, rename_to))
9002 result.Raise("Failed to rename adopted LVs")
9004 feedback_fn("* creating instance disks...")
9006 _CreateDisks(self, iobj)
9007 except errors.OpExecError:
9008 self.LogWarning("Device creation failed, reverting...")
9010 _RemoveDisks(self, iobj)
9012 self.cfg.ReleaseDRBDMinors(instance)
9015 feedback_fn("adding instance %s to cluster config" % instance)
9017 self.cfg.AddInstance(iobj, self.proc.GetECId())
9019 # Declare that we don't want to remove the instance lock anymore, as we've
9020 # added the instance to the config
9021 del self.remove_locks[locking.LEVEL_INSTANCE]
9023 if self.op.mode == constants.INSTANCE_IMPORT:
9024 # Release unused nodes
9025 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9028 _ReleaseLocks(self, locking.LEVEL_NODE)
9031 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9032 feedback_fn("* wiping instance disks...")
9034 _WipeDisks(self, iobj)
9035 except errors.OpExecError, err:
9036 logging.exception("Wiping disks failed")
9037 self.LogWarning("Wiping instance disks failed (%s)", err)
9041 # Something is already wrong with the disks, don't do anything else
9043 elif self.op.wait_for_sync:
9044 disk_abort = not _WaitForSync(self, iobj)
9045 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9046 # make sure the disks are not degraded (still sync-ing is ok)
9047 feedback_fn("* checking mirrors status")
9048 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9053 _RemoveDisks(self, iobj)
9054 self.cfg.RemoveInstance(iobj.name)
9055 # Make sure the instance lock gets removed
9056 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9057 raise errors.OpExecError("There are some degraded disks for"
9060 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9061 if self.op.mode == constants.INSTANCE_CREATE:
9062 if not self.op.no_install:
9063 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9064 not self.op.wait_for_sync)
9066 feedback_fn("* pausing disk sync to install instance OS")
9067 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9069 for idx, success in enumerate(result.payload):
9071 logging.warn("pause-sync of instance %s for disk %d failed",
9074 feedback_fn("* running the instance OS create scripts...")
9075 # FIXME: pass debug option from opcode to backend
9077 self.rpc.call_instance_os_add(pnode_name, iobj, False,
9078 self.op.debug_level)
9080 feedback_fn("* resuming disk sync")
9081 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9083 for idx, success in enumerate(result.payload):
9085 logging.warn("resume-sync of instance %s for disk %d failed",
9088 os_add_result.Raise("Could not add os for instance %s"
9089 " on node %s" % (instance, pnode_name))
9091 elif self.op.mode == constants.INSTANCE_IMPORT:
9092 feedback_fn("* running the instance OS import scripts...")
9096 for idx, image in enumerate(self.src_images):
9100 # FIXME: pass debug option from opcode to backend
9101 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9102 constants.IEIO_FILE, (image, ),
9103 constants.IEIO_SCRIPT,
9104 (iobj.disks[idx], idx),
9106 transfers.append(dt)
9109 masterd.instance.TransferInstanceData(self, feedback_fn,
9110 self.op.src_node, pnode_name,
9111 self.pnode.secondary_ip,
9113 if not compat.all(import_result):
9114 self.LogWarning("Some disks for instance %s on node %s were not"
9115 " imported successfully" % (instance, pnode_name))
9117 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9118 feedback_fn("* preparing remote import...")
9119 # The source cluster will stop the instance before attempting to make a
9120 # connection. In some cases stopping an instance can take a long time,
9121 # hence the shutdown timeout is added to the connection timeout.
9122 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9123 self.op.source_shutdown_timeout)
9124 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9126 assert iobj.primary_node == self.pnode.name
9128 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9129 self.source_x509_ca,
9130 self._cds, timeouts)
9131 if not compat.all(disk_results):
9132 # TODO: Should the instance still be started, even if some disks
9133 # failed to import (valid for local imports, too)?
9134 self.LogWarning("Some disks for instance %s on node %s were not"
9135 " imported successfully" % (instance, pnode_name))
9137 # Run rename script on newly imported instance
9138 assert iobj.name == instance
9139 feedback_fn("Running rename script for %s" % instance)
9140 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9141 self.source_instance_name,
9142 self.op.debug_level)
9144 self.LogWarning("Failed to run rename script for %s on node"
9145 " %s: %s" % (instance, pnode_name, result.fail_msg))
9148 # also checked in the prereq part
9149 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9153 iobj.admin_up = True
9154 self.cfg.Update(iobj, feedback_fn)
9155 logging.info("Starting instance %s on node %s", instance, pnode_name)
9156 feedback_fn("* starting instance...")
9157 result = self.rpc.call_instance_start(pnode_name, iobj,
9159 result.Raise("Could not start instance")
9161 return list(iobj.all_nodes)
9164 class LUInstanceConsole(NoHooksLU):
9165 """Connect to an instance's console.
9167 This is somewhat special in that it returns the command line that
9168 you need to run on the master node in order to connect to the
9174 def ExpandNames(self):
9175 self._ExpandAndLockInstance()
9177 def CheckPrereq(self):
9178 """Check prerequisites.
9180 This checks that the instance is in the cluster.
9183 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9184 assert self.instance is not None, \
9185 "Cannot retrieve locked instance %s" % self.op.instance_name
9186 _CheckNodeOnline(self, self.instance.primary_node)
9188 def Exec(self, feedback_fn):
9189 """Connect to the console of an instance
9192 instance = self.instance
9193 node = instance.primary_node
9195 node_insts = self.rpc.call_instance_list([node],
9196 [instance.hypervisor])[node]
9197 node_insts.Raise("Can't get node information from %s" % node)
9199 if instance.name not in node_insts.payload:
9200 if instance.admin_up:
9201 state = constants.INSTST_ERRORDOWN
9203 state = constants.INSTST_ADMINDOWN
9204 raise errors.OpExecError("Instance %s is not running (state %s)" %
9205 (instance.name, state))
9207 logging.debug("Connecting to console of %s on %s", instance.name, node)
9209 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9212 def _GetInstanceConsole(cluster, instance):
9213 """Returns console information for an instance.
9215 @type cluster: L{objects.Cluster}
9216 @type instance: L{objects.Instance}
9220 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9221 # beparams and hvparams are passed separately, to avoid editing the
9222 # instance and then saving the defaults in the instance itself.
9223 hvparams = cluster.FillHV(instance)
9224 beparams = cluster.FillBE(instance)
9225 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9227 assert console.instance == instance.name
9228 assert console.Validate()
9230 return console.ToDict()
9233 class LUInstanceReplaceDisks(LogicalUnit):
9234 """Replace the disks of an instance.
9237 HPATH = "mirrors-replace"
9238 HTYPE = constants.HTYPE_INSTANCE
9241 def CheckArguments(self):
9242 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9245 def ExpandNames(self):
9246 self._ExpandAndLockInstance()
9248 assert locking.LEVEL_NODE not in self.needed_locks
9249 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9251 assert self.op.iallocator is None or self.op.remote_node is None, \
9252 "Conflicting options"
9254 if self.op.remote_node is not None:
9255 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9257 # Warning: do not remove the locking of the new secondary here
9258 # unless DRBD8.AddChildren is changed to work in parallel;
9259 # currently it doesn't since parallel invocations of
9260 # FindUnusedMinor will conflict
9261 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9262 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9264 self.needed_locks[locking.LEVEL_NODE] = []
9265 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9267 if self.op.iallocator is not None:
9268 # iallocator will select a new node in the same group
9269 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9271 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9272 self.op.iallocator, self.op.remote_node,
9273 self.op.disks, False, self.op.early_release)
9275 self.tasklets = [self.replacer]
9277 def DeclareLocks(self, level):
9278 if level == locking.LEVEL_NODEGROUP:
9279 assert self.op.remote_node is None
9280 assert self.op.iallocator is not None
9281 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9283 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9284 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9285 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9287 elif level == locking.LEVEL_NODE:
9288 if self.op.iallocator is not None:
9289 assert self.op.remote_node is None
9290 assert not self.needed_locks[locking.LEVEL_NODE]
9292 # Lock member nodes of all locked groups
9293 self.needed_locks[locking.LEVEL_NODE] = [node_name
9294 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9295 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9297 self._LockInstancesNodes()
9299 def BuildHooksEnv(self):
9302 This runs on the master, the primary and all the secondaries.
9305 instance = self.replacer.instance
9307 "MODE": self.op.mode,
9308 "NEW_SECONDARY": self.op.remote_node,
9309 "OLD_SECONDARY": instance.secondary_nodes[0],
9311 env.update(_BuildInstanceHookEnvByObject(self, instance))
9314 def BuildHooksNodes(self):
9315 """Build hooks nodes.
9318 instance = self.replacer.instance
9320 self.cfg.GetMasterNode(),
9321 instance.primary_node,
9323 if self.op.remote_node is not None:
9324 nl.append(self.op.remote_node)
9327 def CheckPrereq(self):
9328 """Check prerequisites.
9331 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9332 self.op.iallocator is None)
9334 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9336 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9338 return LogicalUnit.CheckPrereq(self)
9341 class TLReplaceDisks(Tasklet):
9342 """Replaces disks for an instance.
9344 Note: Locking is not within the scope of this class.
9347 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9348 disks, delay_iallocator, early_release):
9349 """Initializes this class.
9352 Tasklet.__init__(self, lu)
9355 self.instance_name = instance_name
9357 self.iallocator_name = iallocator_name
9358 self.remote_node = remote_node
9360 self.delay_iallocator = delay_iallocator
9361 self.early_release = early_release
9364 self.instance = None
9365 self.new_node = None
9366 self.target_node = None
9367 self.other_node = None
9368 self.remote_node_info = None
9369 self.node_secondary_ip = None
9372 def CheckArguments(mode, remote_node, iallocator):
9373 """Helper function for users of this class.
9376 # check for valid parameter combination
9377 if mode == constants.REPLACE_DISK_CHG:
9378 if remote_node is None and iallocator is None:
9379 raise errors.OpPrereqError("When changing the secondary either an"
9380 " iallocator script must be used or the"
9381 " new node given", errors.ECODE_INVAL)
9383 if remote_node is not None and iallocator is not None:
9384 raise errors.OpPrereqError("Give either the iallocator or the new"
9385 " secondary, not both", errors.ECODE_INVAL)
9387 elif remote_node is not None or iallocator is not None:
9388 # Not replacing the secondary
9389 raise errors.OpPrereqError("The iallocator and new node options can"
9390 " only be used when changing the"
9391 " secondary node", errors.ECODE_INVAL)
9394 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9395 """Compute a new secondary node using an IAllocator.
9398 ial = IAllocator(lu.cfg, lu.rpc,
9399 mode=constants.IALLOCATOR_MODE_RELOC,
9401 relocate_from=list(relocate_from))
9403 ial.Run(iallocator_name)
9406 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9407 " %s" % (iallocator_name, ial.info),
9410 if len(ial.result) != ial.required_nodes:
9411 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9412 " of nodes (%s), required %s" %
9414 len(ial.result), ial.required_nodes),
9417 remote_node_name = ial.result[0]
9419 lu.LogInfo("Selected new secondary for instance '%s': %s",
9420 instance_name, remote_node_name)
9422 return remote_node_name
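# Illustrative note (assumption, not from the original code): for the
# IALLOCATOR_MODE_RELOC request built above, ial.result is a list of node
# names whose length must equal ial.required_nodes (a single node when
# changing a DRBD secondary), e.g.
#   ial.result == ["node3.example.com"]
# which is why only ial.result[0] is used.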
9424 def _FindFaultyDisks(self, node_name):
9425 """Wrapper for L{_FindFaultyInstanceDisks}.
9428 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9431 def _CheckDisksActivated(self, instance):
9432 """Checks if the instance disks are activated.
9434 @param instance: The instance to check disks
9435 @return: True if they are activated, False otherwise
9438 nodes = instance.all_nodes
9440 for idx, dev in enumerate(instance.disks):
9442 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9443 self.cfg.SetDiskID(dev, node)
9445 result = self.rpc.call_blockdev_find(node, dev)
9449 elif result.fail_msg or not result.payload:
9454 def CheckPrereq(self):
9455 """Check prerequisites.
9457 This checks that the instance is in the cluster.
9460 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9461 assert instance is not None, \
9462 "Cannot retrieve locked instance %s" % self.instance_name
9464 if instance.disk_template != constants.DT_DRBD8:
9465 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9466 " instances", errors.ECODE_INVAL)
9468 if len(instance.secondary_nodes) != 1:
9469 raise errors.OpPrereqError("The instance has a strange layout,"
9470 " expected one secondary but found %d" %
9471 len(instance.secondary_nodes),
9474 if not self.delay_iallocator:
9475 self._CheckPrereq2()
9477 def _CheckPrereq2(self):
9478 """Check prerequisites, second part.
9480 This function should always be part of CheckPrereq. It was separated and is
9481 now called from Exec because during node evacuation iallocator was only
9482 called with an unmodified cluster model, not taking planned changes into account.
9486 instance = self.instance
9487 secondary_node = instance.secondary_nodes[0]
9489 if self.iallocator_name is None:
9490 remote_node = self.remote_node
9492 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9493 instance.name, instance.secondary_nodes)
9495 if remote_node is None:
9496 self.remote_node_info = None
9498 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9499 "Remote node '%s' is not locked" % remote_node
9501 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9502 assert self.remote_node_info is not None, \
9503 "Cannot retrieve locked node %s" % remote_node
9505 if remote_node == self.instance.primary_node:
9506 raise errors.OpPrereqError("The specified node is the primary node of"
9507 " the instance", errors.ECODE_INVAL)
9509 if remote_node == secondary_node:
9510 raise errors.OpPrereqError("The specified node is already the"
9511 " secondary node of the instance",
9514 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9515 constants.REPLACE_DISK_CHG):
9516 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9519 if self.mode == constants.REPLACE_DISK_AUTO:
9520 if not self._CheckDisksActivated(instance):
9521 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9522 " first" % self.instance_name,
9524 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9525 faulty_secondary = self._FindFaultyDisks(secondary_node)
9527 if faulty_primary and faulty_secondary:
9528 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9529 " one node and can not be repaired"
9530 " automatically" % self.instance_name,
9534 self.disks = faulty_primary
9535 self.target_node = instance.primary_node
9536 self.other_node = secondary_node
9537 check_nodes = [self.target_node, self.other_node]
9538 elif faulty_secondary:
9539 self.disks = faulty_secondary
9540 self.target_node = secondary_node
9541 self.other_node = instance.primary_node
9542 check_nodes = [self.target_node, self.other_node]
9548 # Non-automatic modes
9549 if self.mode == constants.REPLACE_DISK_PRI:
9550 self.target_node = instance.primary_node
9551 self.other_node = secondary_node
9552 check_nodes = [self.target_node, self.other_node]
9554 elif self.mode == constants.REPLACE_DISK_SEC:
9555 self.target_node = secondary_node
9556 self.other_node = instance.primary_node
9557 check_nodes = [self.target_node, self.other_node]
9559 elif self.mode == constants.REPLACE_DISK_CHG:
9560 self.new_node = remote_node
9561 self.other_node = instance.primary_node
9562 self.target_node = secondary_node
9563 check_nodes = [self.new_node, self.other_node]
9565 _CheckNodeNotDrained(self.lu, remote_node)
9566 _CheckNodeVmCapable(self.lu, remote_node)
9568 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9569 assert old_node_info is not None
9570 if old_node_info.offline and not self.early_release:
9571 # doesn't make sense to delay the release
9572 self.early_release = True
9573 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9574 " early-release mode", secondary_node)
9577 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9580 # If not specified all disks should be replaced
9582 self.disks = range(len(self.instance.disks))
9584 for node in check_nodes:
9585 _CheckNodeOnline(self.lu, node)
9587 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9590 if node_name is not None)
9592 # Release unneeded node locks
9593 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9595 # Release any owned node group
9596 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9597 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9599 # Check whether disks are valid
9600 for disk_idx in self.disks:
9601 instance.FindDisk(disk_idx)
9603 # Get secondary node IP addresses
9604 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9605 in self.cfg.GetMultiNodeInfo(touched_nodes))
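# Illustrative sketch (addresses are made up): node_secondary_ip maps the
# touched node names to their secondary (replication network) addresses, e.g.
#   {"node1.example.com": "192.0.2.1", "node2.example.com": "192.0.2.2"}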
9607 def Exec(self, feedback_fn):
9608 """Execute disk replacement.
9610 This dispatches the disk replacement to the appropriate handler.
9613 if self.delay_iallocator:
9614 self._CheckPrereq2()
9617 # Verify owned locks before starting operation
9618 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9619 assert set(owned_nodes) == set(self.node_secondary_ip), \
9620 ("Incorrect node locks, owning %s, expected %s" %
9621 (owned_nodes, self.node_secondary_ip.keys()))
9623 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9624 assert list(owned_instances) == [self.instance_name], \
9625 "Instance '%s' not locked" % self.instance_name
9627 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9628 "Should not own any node group lock at this point"
9631 feedback_fn("No disks need replacement")
9634 feedback_fn("Replacing disk(s) %s for %s" %
9635 (utils.CommaJoin(self.disks), self.instance.name))
9637 activate_disks = (not self.instance.admin_up)
9639 # Activate the instance disks if we're replacing them on a down instance
9641 _StartInstanceDisks(self.lu, self.instance, True)
9644 # Should we replace the secondary node?
9645 if self.new_node is not None:
9646 fn = self._ExecDrbd8Secondary
9648 fn = self._ExecDrbd8DiskOnly
9650 result = fn(feedback_fn)
9652 # Deactivate the instance disks if we're replacing them on a down instance
9655 _SafeShutdownInstanceDisks(self.lu, self.instance)
9658 # Verify owned locks
9659 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9660 nodes = frozenset(self.node_secondary_ip)
9661 assert ((self.early_release and not owned_nodes) or
9662 (not self.early_release and not (set(owned_nodes) - nodes))), \
9663 ("Not owning the correct locks, early_release=%s, owned=%r,"
9664 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9668 def _CheckVolumeGroup(self, nodes):
9669 self.lu.LogInfo("Checking volume groups")
9671 vgname = self.cfg.GetVGName()
9673 # Make sure volume group exists on all involved nodes
9674 results = self.rpc.call_vg_list(nodes)
9676 raise errors.OpExecError("Can't list volume groups on the nodes")
9680 res.Raise("Error checking node %s" % node)
9681 if vgname not in res.payload:
9682 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9685 def _CheckDisksExistence(self, nodes):
9686 # Check disk existence
9687 for idx, dev in enumerate(self.instance.disks):
9688 if idx not in self.disks:
9692 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9693 self.cfg.SetDiskID(dev, node)
9695 result = self.rpc.call_blockdev_find(node, dev)
9697 msg = result.fail_msg
9698 if msg or not result.payload:
9700 msg = "disk not found"
9701 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9704 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9705 for idx, dev in enumerate(self.instance.disks):
9706 if idx not in self.disks:
9709 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9712 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9714 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9715 " replace disks for instance %s" %
9716 (node_name, self.instance.name))
9718 def _CreateNewStorage(self, node_name):
9719 """Create new storage on the primary or secondary node.
9721 This is only used for same-node replaces, not for changing the
9722 secondary node, hence we don't want to modify the existing disk.
9727 for idx, dev in enumerate(self.instance.disks):
9728 if idx not in self.disks:
9731 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9733 self.cfg.SetDiskID(dev, node_name)
9735 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9736 names = _GenerateUniqueNames(self.lu, lv_names)
9738 vg_data = dev.children[0].logical_id[0]
9739 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9740 logical_id=(vg_data, names[0]))
9741 vg_meta = dev.children[1].logical_id[0]
9742 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
9743 logical_id=(vg_meta, names[1]))
9745 new_lvs = [lv_data, lv_meta]
9746 old_lvs = [child.Copy() for child in dev.children]
9747 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9749 # we pass force_create=True to force the LVM creation
9750 for new_lv in new_lvs:
9751 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9752 _GetInstanceInfoText(self.instance), False)
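# Illustrative sketch (names are made up): by the time this helper returns,
# iv_names maps each DRBD device's iv_name to its old and new LV children,
# e.g.
#   {"disk/0": (drbd_dev, [old_data_lv, old_meta_lv], [new_data_lv, new_meta_lv])}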
9756 def _CheckDevices(self, node_name, iv_names):
9757 for name, (dev, _, _) in iv_names.iteritems():
9758 self.cfg.SetDiskID(dev, node_name)
9760 result = self.rpc.call_blockdev_find(node_name, dev)
9762 msg = result.fail_msg
9763 if msg or not result.payload:
9765 msg = "disk not found"
9766 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9769 if result.payload.is_degraded:
9770 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9772 def _RemoveOldStorage(self, node_name, iv_names):
9773 for name, (_, old_lvs, _) in iv_names.iteritems():
9774 self.lu.LogInfo("Remove logical volumes for %s" % name)
9777 self.cfg.SetDiskID(lv, node_name)
9779 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9781 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9782 hint="remove unused LVs manually")
9784 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9785 """Replace a disk on the primary or secondary for DRBD 8.
9787 The algorithm for replace is quite complicated:
9789 1. for each disk to be replaced:
9791 1. create new LVs on the target node with unique names
9792 1. detach old LVs from the drbd device
9793 1. rename old LVs to name_replaced.<time_t>
9794 1. rename new LVs to old LVs
9795 1. attach the new LVs (with the old names now) to the drbd device
9797 1. wait for sync across all devices
9799 1. for each modified disk:
9801 1. remove old LVs (which have the name name_replaced.<time_t>)
9803 Failures are not very well handled.
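As an illustration only (LV names are simplified), for one disk the rename
sequence on the target node looks like:

  old_data_lv -> old_data_lv_replaced-<time_t>
  new_data_lv -> old_data_lv

after which the new LV (now carrying the old name) is re-attached to the
drbd device and the "_replaced" LV is removed once the device has synced.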
9808 # Step: check device activation
9809 self.lu.LogStep(1, steps_total, "Check device existence")
9810 self._CheckDisksExistence([self.other_node, self.target_node])
9811 self._CheckVolumeGroup([self.target_node, self.other_node])
9813 # Step: check other node consistency
9814 self.lu.LogStep(2, steps_total, "Check peer consistency")
9815 self._CheckDisksConsistency(self.other_node,
9816 self.other_node == self.instance.primary_node,
9819 # Step: create new storage
9820 self.lu.LogStep(3, steps_total, "Allocate new storage")
9821 iv_names = self._CreateNewStorage(self.target_node)
9823 # Step: for each lv, detach+rename*2+attach
9824 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9825 for dev, old_lvs, new_lvs in iv_names.itervalues():
9826 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9828 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9830 result.Raise("Can't detach drbd from local storage on node"
9831 " %s for device %s" % (self.target_node, dev.iv_name))
9833 #cfg.Update(instance)
9835 # ok, we created the new LVs, so now we know we have the needed
9836 # storage; as such, we proceed on the target node to rename
9837 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9838 # using the assumption that logical_id == physical_id (which in
9839 # turn is the unique_id on that node)
9841 # FIXME(iustin): use a better name for the replaced LVs
9842 temp_suffix = int(time.time())
9843 ren_fn = lambda d, suff: (d.physical_id[0],
9844 d.physical_id[1] + "_replaced-%s" % suff)
9846 # Build the rename list based on what LVs exist on the node
9847 rename_old_to_new = []
9848 for to_ren in old_lvs:
9849 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9850 if not result.fail_msg and result.payload:
9852 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9854 self.lu.LogInfo("Renaming the old LVs on the target node")
9855 result = self.rpc.call_blockdev_rename(self.target_node,
9857 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9859 # Now we rename the new LVs to the old LVs
9860 self.lu.LogInfo("Renaming the new LVs on the target node")
9861 rename_new_to_old = [(new, old.physical_id)
9862 for old, new in zip(old_lvs, new_lvs)]
9863 result = self.rpc.call_blockdev_rename(self.target_node,
9865 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9867 # Intermediate steps of in-memory modifications
9868 for old, new in zip(old_lvs, new_lvs):
9869 new.logical_id = old.logical_id
9870 self.cfg.SetDiskID(new, self.target_node)
9872 # We need to modify old_lvs so that removal later removes the
9873 # right LVs, not the newly added ones; note that old_lvs is a copy here
9875 for disk in old_lvs:
9876 disk.logical_id = ren_fn(disk, temp_suffix)
9877 self.cfg.SetDiskID(disk, self.target_node)
9879 # Now that the new lvs have the old name, we can add them to the device
9880 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9881 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9883 msg = result.fail_msg
9885 for new_lv in new_lvs:
9886 msg2 = self.rpc.call_blockdev_remove(self.target_node,
9889 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
9890 hint=("cleanup manually the unused logical"
9892 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
9895 if self.early_release:
9896 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9898 self._RemoveOldStorage(self.target_node, iv_names)
9899 # WARNING: we release both node locks here, do not do other RPCs
9900 # than WaitForSync to the primary node
9901 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
9902 names=[self.target_node, self.other_node])
9905 # This can fail as the old devices are degraded and _WaitForSync
9906 # does a combined result over all disks, so we don't check its return value
9907 self.lu.LogStep(cstep, steps_total, "Sync devices")
9909 _WaitForSync(self.lu, self.instance)
9911 # Check all devices manually
9912 self._CheckDevices(self.instance.primary_node, iv_names)
9914 # Step: remove old storage
9915 if not self.early_release:
9916 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9918 self._RemoveOldStorage(self.target_node, iv_names)
9920 def _ExecDrbd8Secondary(self, feedback_fn):
9921 """Replace the secondary node for DRBD 8.
9923 The algorithm for replace is quite complicated:
9924 - for all disks of the instance:
9925 - create new LVs on the new node with same names
9926 - shutdown the drbd device on the old secondary
9927 - disconnect the drbd network on the primary
9928 - create the drbd device on the new secondary
9929 - network attach the drbd on the primary, using an artifice:
9930 the drbd code for Attach() will connect to the network if it
9931 finds a device which is connected to the good local disks but
9933 - wait for sync across all devices
9934 - remove all disks from the old secondary
9936 Failures are not very well handled.
9941 pnode = self.instance.primary_node
9943 # Step: check device activation
9944 self.lu.LogStep(1, steps_total, "Check device existence")
9945 self._CheckDisksExistence([self.instance.primary_node])
9946 self._CheckVolumeGroup([self.instance.primary_node])
9948 # Step: check other node consistency
9949 self.lu.LogStep(2, steps_total, "Check peer consistency")
9950 self._CheckDisksConsistency(self.instance.primary_node, True, True)
9952 # Step: create new storage
9953 self.lu.LogStep(3, steps_total, "Allocate new storage")
9954 for idx, dev in enumerate(self.instance.disks):
9955 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
9956 (self.new_node, idx))
9957 # we pass force_create=True to force LVM creation
9958 for new_lv in dev.children:
9959 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
9960 _GetInstanceInfoText(self.instance), False)
9962 # Step 4: drbd minors and drbd setup changes
9963 # after this, we must manually remove the drbd minors on both the
9964 # error and the success paths
9965 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9966 minors = self.cfg.AllocateDRBDMinor([self.new_node
9967 for dev in self.instance.disks],
9969 logging.debug("Allocated minors %r", minors)
9972 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
9973 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
9974 (self.new_node, idx))
9975 # create new devices on new_node; note that we create two IDs:
9976 # one without port, so the drbd will be activated without
9977 # networking information on the new node at this stage, and one
9978 # with network, for the later activation in step 4
9979 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
9980 if self.instance.primary_node == o_node1:
9983 assert self.instance.primary_node == o_node2, "Three-node instance?"
9986 new_alone_id = (self.instance.primary_node, self.new_node, None,
9987 p_minor, new_minor, o_secret)
9988 new_net_id = (self.instance.primary_node, self.new_node, o_port,
9989 p_minor, new_minor, o_secret)
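# Illustrative sketch (values are made up): for a disk whose old logical_id
# was ("nodeA", "nodeB", 11000, 0, 1, "secret"), with nodeA the primary node
# and nodeC the new secondary getting minor 3, the IDs built above would be
#   new_alone_id = ("nodeA", "nodeC", None, 0, 3, "secret")
#   new_net_id = ("nodeA", "nodeC", 11000, 0, 3, "secret")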
9991 iv_names[idx] = (dev, dev.children, new_net_id)
9992 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
9994 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
9995 logical_id=new_alone_id,
9996 children=dev.children,
9999 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10000 _GetInstanceInfoText(self.instance), False)
10001 except errors.GenericError:
10002 self.cfg.ReleaseDRBDMinors(self.instance.name)
10005 # We have new devices, shutdown the drbd on the old secondary
10006 for idx, dev in enumerate(self.instance.disks):
10007 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10008 self.cfg.SetDiskID(dev, self.target_node)
10009 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10011 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10012 "node: %s" % (idx, msg),
10013 hint=("Please cleanup this device manually as"
10014 " soon as possible"))
10016 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10017 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10018 self.instance.disks)[pnode]
10020 msg = result.fail_msg
10022 # detaches didn't succeed (unlikely)
10023 self.cfg.ReleaseDRBDMinors(self.instance.name)
10024 raise errors.OpExecError("Can't detach the disks from the network on"
10025 " old node: %s" % (msg,))
10027 # if we managed to detach at least one, we update all the disks of
10028 # the instance to point to the new secondary
10029 self.lu.LogInfo("Updating instance configuration")
10030 for dev, _, new_logical_id in iv_names.itervalues():
10031 dev.logical_id = new_logical_id
10032 self.cfg.SetDiskID(dev, self.instance.primary_node)
10034 self.cfg.Update(self.instance, feedback_fn)
10036 # and now perform the drbd attach
10037 self.lu.LogInfo("Attaching primary drbds to new secondary"
10038 " (standalone => connected)")
10039 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10041 self.node_secondary_ip,
10042 self.instance.disks,
10043 self.instance.name,
10045 for to_node, to_result in result.items():
10046 msg = to_result.fail_msg
10048 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10050 hint=("please do a gnt-instance info to see the"
10051 " status of disks"))
10053 if self.early_release:
10054 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10056 self._RemoveOldStorage(self.target_node, iv_names)
10057 # WARNING: we release all node locks here, do not do other RPCs
10058 # than WaitForSync to the primary node
10059 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10060 names=[self.instance.primary_node,
10065 # This can fail as the old devices are degraded and _WaitForSync
10066 # does a combined result over all disks, so we don't check its return value
10067 self.lu.LogStep(cstep, steps_total, "Sync devices")
10069 _WaitForSync(self.lu, self.instance)
10071 # Check all devices manually
10072 self._CheckDevices(self.instance.primary_node, iv_names)
10074 # Step: remove old storage
10075 if not self.early_release:
10076 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10077 self._RemoveOldStorage(self.target_node, iv_names)
10080 class LURepairNodeStorage(NoHooksLU):
10081 """Repairs the volume group on a node.
10086 def CheckArguments(self):
10087 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10089 storage_type = self.op.storage_type
10091 if (constants.SO_FIX_CONSISTENCY not in
10092 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10093 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10094 " repaired" % storage_type,
10095 errors.ECODE_INVAL)
10097 def ExpandNames(self):
10098 self.needed_locks = {
10099 locking.LEVEL_NODE: [self.op.node_name],
10102 def _CheckFaultyDisks(self, instance, node_name):
10103 """Ensure faulty disks abort the opcode or at least warn."""
10105 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10107 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10108 " node '%s'" % (instance.name, node_name),
10109 errors.ECODE_STATE)
10110 except errors.OpPrereqError, err:
10111 if self.op.ignore_consistency:
10112 self.proc.LogWarning(str(err.args[0]))
10116 def CheckPrereq(self):
10117 """Check prerequisites.
10120 # Check whether any instance on this node has faulty disks
10121 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10122 if not inst.admin_up:
10124 check_nodes = set(inst.all_nodes)
10125 check_nodes.discard(self.op.node_name)
10126 for inst_node_name in check_nodes:
10127 self._CheckFaultyDisks(inst, inst_node_name)
10129 def Exec(self, feedback_fn):
10130 feedback_fn("Repairing storage unit '%s' on %s ..." %
10131 (self.op.name, self.op.node_name))
10133 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10134 result = self.rpc.call_storage_execute(self.op.node_name,
10135 self.op.storage_type, st_args,
10137 constants.SO_FIX_CONSISTENCY)
10138 result.Raise("Failed to repair storage unit '%s' on %s" %
10139 (self.op.name, self.op.node_name))
10142 class LUNodeEvacuate(NoHooksLU):
10143 """Evacuates instances off a list of nodes.
10148 _MODE2IALLOCATOR = {
10149 constants.NODE_EVAC_PRI: constants.IALLOCATOR_NEVAC_PRI,
10150 constants.NODE_EVAC_SEC: constants.IALLOCATOR_NEVAC_SEC,
10151 constants.NODE_EVAC_ALL: constants.IALLOCATOR_NEVAC_ALL,
10153 assert frozenset(_MODE2IALLOCATOR.keys()) == constants.NODE_EVAC_MODES
10154 assert (frozenset(_MODE2IALLOCATOR.values()) ==
10155 constants.IALLOCATOR_NEVAC_MODES)
10157 def CheckArguments(self):
10158 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10160 def ExpandNames(self):
10161 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10163 if self.op.remote_node is not None:
10164 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10165 assert self.op.remote_node
10167 if self.op.remote_node == self.op.node_name:
10168 raise errors.OpPrereqError("Can not use evacuated node as a new"
10169 " secondary node", errors.ECODE_INVAL)
10171 if self.op.mode != constants.NODE_EVAC_SEC:
10172 raise errors.OpPrereqError("Without the use of an iallocator only"
10173 " secondary instances can be evacuated",
10174 errors.ECODE_INVAL)
10177 self.share_locks = _ShareAll()
10178 self.needed_locks = {
10179 locking.LEVEL_INSTANCE: [],
10180 locking.LEVEL_NODEGROUP: [],
10181 locking.LEVEL_NODE: [],
10184 # Determine nodes (via group) optimistically, needs verification once locks
10185 # have been acquired
10186 self.lock_nodes = self._DetermineNodes()
10188 def _DetermineNodes(self):
10189 """Gets the list of nodes to operate on.
10192 if self.op.remote_node is None:
10193 # Iallocator will choose any node(s) in the same group
10194 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10196 group_nodes = frozenset([self.op.remote_node])
10198 # Determine nodes to be locked
10199 return set([self.op.node_name]) | group_nodes
10201 def _DetermineInstances(self):
10202 """Builds list of instances to operate on.
10205 assert self.op.mode in constants.NODE_EVAC_MODES
10207 if self.op.mode == constants.NODE_EVAC_PRI:
10208 # Primary instances only
10209 inst_fn = _GetNodePrimaryInstances
10210 assert self.op.remote_node is None, \
10211 "Evacuating primary instances requires iallocator"
10212 elif self.op.mode == constants.NODE_EVAC_SEC:
10213 # Secondary instances only
10214 inst_fn = _GetNodeSecondaryInstances
10217 assert self.op.mode == constants.NODE_EVAC_ALL
10218 inst_fn = _GetNodeInstances
10219 # TODO: In 2.6, change the iallocator interface to take an evacuation mode
10221 raise errors.OpPrereqError("Due to an issue with the iallocator"
10222 " interface it is not possible to evacuate"
10223 " all instances at once; specify explicitly"
10224 " whether to evacuate primary or secondary"
10226 errors.ECODE_INVAL)
10228 return inst_fn(self.cfg, self.op.node_name)
10230 def DeclareLocks(self, level):
10231 if level == locking.LEVEL_INSTANCE:
10232 # Lock instances optimistically, needs verification once node and group
10233 # locks have been acquired
10234 self.needed_locks[locking.LEVEL_INSTANCE] = \
10235 set(i.name for i in self._DetermineInstances())
10237 elif level == locking.LEVEL_NODEGROUP:
10238 # Lock node groups for all potential target nodes optimistically, needs
10239 # verification once nodes have been acquired
10240 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10241 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10243 elif level == locking.LEVEL_NODE:
10244 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10246 def CheckPrereq(self):
10248 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10249 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10250 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10252 need_nodes = self._DetermineNodes()
10254 if not owned_nodes.issuperset(need_nodes):
10255 raise errors.OpPrereqError("Nodes in same group as '%s' changed since"
10256 " locks were acquired, current nodes are"
10257 " are '%s', used to be '%s'; retry the"
10259 (self.op.node_name,
10260 utils.CommaJoin(need_nodes),
10261 utils.CommaJoin(owned_nodes)),
10262 errors.ECODE_STATE)
10264 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10265 if owned_groups != wanted_groups:
10266 raise errors.OpExecError("Node groups changed since locks were acquired,"
10267 " current groups are '%s', used to be '%s';"
10268 " retry the operation" %
10269 (utils.CommaJoin(wanted_groups),
10270 utils.CommaJoin(owned_groups)))
10272 # Determine affected instances
10273 self.instances = self._DetermineInstances()
10274 self.instance_names = [i.name for i in self.instances]
10276 if set(self.instance_names) != owned_instances:
10277 raise errors.OpExecError("Instances on node '%s' changed since locks"
10278 " were acquired, current instances are '%s',"
10279 " used to be '%s'; retry the operation" %
10280 (self.op.node_name,
10281 utils.CommaJoin(self.instance_names),
10282 utils.CommaJoin(owned_instances)))
10284 if self.instance_names:
10285 self.LogInfo("Evacuating instances from node '%s': %s",
10287 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10289 self.LogInfo("No instances to evacuate from node '%s'",
10292 if self.op.remote_node is not None:
10293 for i in self.instances:
10294 if i.primary_node == self.op.remote_node:
10295 raise errors.OpPrereqError("Node %s is the primary node of"
10296 " instance %s, cannot use it as"
10298 (self.op.remote_node, i.name),
10299 errors.ECODE_INVAL)
10301 def Exec(self, feedback_fn):
10302 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10304 if not self.instance_names:
10305 # No instances to evacuate
10308 elif self.op.iallocator is not None:
10309 # TODO: Implement relocation to other group
10310 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10311 evac_mode=self._MODE2IALLOCATOR[self.op.mode],
10312 instances=list(self.instance_names))
10314 ial.Run(self.op.iallocator)
10316 if not ial.success:
10317 raise errors.OpPrereqError("Can't compute node evacuation using"
10318 " iallocator '%s': %s" %
10319 (self.op.iallocator, ial.info),
10320 errors.ECODE_NORES)
10322 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10324 elif self.op.remote_node is not None:
10325 assert self.op.mode == constants.NODE_EVAC_SEC
10327 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10328 remote_node=self.op.remote_node,
10330 mode=constants.REPLACE_DISK_CHG,
10331 early_release=self.op.early_release)]
10332 for instance_name in self.instance_names
10336 raise errors.ProgrammerError("No iallocator or remote node")
10338 return ResultWithJobs(jobs)
10341 def _SetOpEarlyRelease(early_release, op):
10342 """Sets C{early_release} flag on opcodes if available.
10346 op.early_release = early_release
10347 except AttributeError:
10348 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10353 def _NodeEvacDest(use_nodes, group, nodes):
10354 """Returns group or nodes depending on caller's choice.
10358 return utils.CommaJoin(nodes)
10363 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10364 """Unpacks the result of change-group and node-evacuate iallocator requests.
10366 Handles results from the L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10367 L{constants.IALLOCATOR_MODE_CHG_GROUP} iallocator modes.
10369 @type lu: L{LogicalUnit}
10370 @param lu: Logical unit instance
10371 @type alloc_result: tuple/list
10372 @param alloc_result: Result from iallocator
10373 @type early_release: bool
10374 @param early_release: Whether to release locks early if possible
10375 @type use_nodes: bool
10376 @param use_nodes: Whether to display node names instead of groups
10379 (moved, failed, jobs) = alloc_result
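# Illustrative sketch (made-up values) of the tuple unpacked above:
#   moved = [("inst1", "group1", ["node2", "node3"])]
#   failed = [("inst2", "not enough memory on target nodes")]
#   jobs = [[<serialized opcode>, ...], ...]
# each inner list in jobs is converted to opcode objects below and submitted
# as a single job by the caller.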
10382 failreason = utils.CommaJoin("%s (%s)" % (name, reason)
10383 for (name, reason) in failed)
10384 lu.LogWarning("Unable to evacuate instances %s", failreason)
10385 raise errors.OpExecError("Unable to evacuate instances %s" % failreason)
10388 lu.LogInfo("Instances to be moved: %s",
10389 utils.CommaJoin("%s (to %s)" %
10390 (name, _NodeEvacDest(use_nodes, group, nodes))
10391 for (name, group, nodes) in moved))
10393 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10394 map(opcodes.OpCode.LoadOpCode, ops))
10398 class LUInstanceGrowDisk(LogicalUnit):
10399 """Grow a disk of an instance.
10402 HPATH = "disk-grow"
10403 HTYPE = constants.HTYPE_INSTANCE
10406 def ExpandNames(self):
10407 self._ExpandAndLockInstance()
10408 self.needed_locks[locking.LEVEL_NODE] = []
10409 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10411 def DeclareLocks(self, level):
10412 if level == locking.LEVEL_NODE:
10413 self._LockInstancesNodes()
10415 def BuildHooksEnv(self):
10416 """Build hooks env.
10418 This runs on the master, the primary and all the secondaries.
10422 "DISK": self.op.disk,
10423 "AMOUNT": self.op.amount,
10425 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10428 def BuildHooksNodes(self):
10429 """Build hooks nodes.
10432 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10435 def CheckPrereq(self):
10436 """Check prerequisites.
10438 This checks that the instance is in the cluster.
10441 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10442 assert instance is not None, \
10443 "Cannot retrieve locked instance %s" % self.op.instance_name
10444 nodenames = list(instance.all_nodes)
10445 for node in nodenames:
10446 _CheckNodeOnline(self, node)
10448 self.instance = instance
10450 if instance.disk_template not in constants.DTS_GROWABLE:
10451 raise errors.OpPrereqError("Instance's disk layout does not support"
10452 " growing", errors.ECODE_INVAL)
10454 self.disk = instance.FindDisk(self.op.disk)
10456 if instance.disk_template not in (constants.DT_FILE,
10457 constants.DT_SHARED_FILE):
10458 # TODO: check the free disk space for file, when that feature will be
10460 _CheckNodesFreeDiskPerVG(self, nodenames,
10461 self.disk.ComputeGrowth(self.op.amount))
10463 def Exec(self, feedback_fn):
10464 """Execute disk grow.
10467 instance = self.instance
10470 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10472 raise errors.OpExecError("Cannot activate block device to grow")
10474 # First run all grow ops in dry-run mode
10475 for node in instance.all_nodes:
10476 self.cfg.SetDiskID(disk, node)
10477 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10478 result.Raise("Grow request failed to node %s" % node)
10480 # We know that (as far as we can test) operations across different
10481 # nodes will succeed, time to run it for real
10482 for node in instance.all_nodes:
10483 self.cfg.SetDiskID(disk, node)
10484 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10485 result.Raise("Grow request failed to node %s" % node)
10487 # TODO: Rewrite code to work properly
10488 # DRBD goes into sync mode for a short amount of time after executing the
10489 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10490 # calling "resize" in sync mode fails. Sleeping for a short amount of
10491 # time is a work-around.
10494 disk.RecordGrow(self.op.amount)
10495 self.cfg.Update(instance, feedback_fn)
10496 if self.op.wait_for_sync:
10497 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10499 self.proc.LogWarning("Disk sync-ing has not returned a good"
10500 " status; please check the instance")
10501 if not instance.admin_up:
10502 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10503 elif not instance.admin_up:
10504 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10505 " not supposed to be running because no wait for"
10506 " sync mode was requested")
10509 class LUInstanceQueryData(NoHooksLU):
10510 """Query runtime instance data.
10515 def ExpandNames(self):
10516 self.needed_locks = {}
10518 # Use locking if requested or when non-static information is wanted
10519 if not (self.op.static or self.op.use_locking):
10520 self.LogWarning("Non-static data requested, locks need to be acquired")
10521 self.op.use_locking = True
10523 if self.op.instances or not self.op.use_locking:
10524 # Expand instance names right here
10525 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10527 # Will use acquired locks
10528 self.wanted_names = None
10530 if self.op.use_locking:
10531 self.share_locks = _ShareAll()
10533 if self.wanted_names is None:
10534 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10536 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10538 self.needed_locks[locking.LEVEL_NODE] = []
10539 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10541 def DeclareLocks(self, level):
10542 if self.op.use_locking and level == locking.LEVEL_NODE:
10543 self._LockInstancesNodes()
10545 def CheckPrereq(self):
10546 """Check prerequisites.
10548 This only checks the optional instance list against the existing names.
10551 if self.wanted_names is None:
10552 assert self.op.use_locking, "Locking was not used"
10553 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
10555 self.wanted_instances = \
10556 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
10558 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10559 """Returns the status of a block device
10562 if self.op.static or not node:
10565 self.cfg.SetDiskID(dev, node)
10567 result = self.rpc.call_blockdev_find(node, dev)
10571 result.Raise("Can't compute disk status for %s" % instance_name)
10573 status = result.payload
10577 return (status.dev_path, status.major, status.minor,
10578 status.sync_percent, status.estimated_time,
10579 status.is_degraded, status.ldisk_status)
10581 def _ComputeDiskStatus(self, instance, snode, dev):
10582 """Compute block device status.
10585 if dev.dev_type in constants.LDS_DRBD:
10586 # we change the snode then (otherwise we use the one passed in)
10587 if dev.logical_id[0] == instance.primary_node:
10588 snode = dev.logical_id[1]
10590 snode = dev.logical_id[0]
10592 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10593 instance.name, dev)
10594 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10597 dev_children = map(compat.partial(self._ComputeDiskStatus,
10604 "iv_name": dev.iv_name,
10605 "dev_type": dev.dev_type,
10606 "logical_id": dev.logical_id,
10607 "physical_id": dev.physical_id,
10608 "pstatus": dev_pstatus,
10609 "sstatus": dev_sstatus,
10610 "children": dev_children,
10615 def Exec(self, feedback_fn):
10616 """Gather and return data"""
10619 cluster = self.cfg.GetClusterInfo()
10621 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
10622 for i in self.wanted_instances)
10623 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
10624 if self.op.static or pnode.offline:
10625 remote_state = None
10627 self.LogWarning("Primary node %s is marked offline, returning static"
10628 " information only for instance %s" %
10629 (pnode.name, instance.name))
10631 remote_info = self.rpc.call_instance_info(instance.primary_node,
10633 instance.hypervisor)
10634 remote_info.Raise("Error checking node %s" % instance.primary_node)
10635 remote_info = remote_info.payload
10636 if remote_info and "state" in remote_info:
10637 remote_state = "up"
10639 remote_state = "down"
10641 if instance.admin_up:
10642 config_state = "up"
10644 config_state = "down"
10646 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10649 result[instance.name] = {
10650 "name": instance.name,
10651 "config_state": config_state,
10652 "run_state": remote_state,
10653 "pnode": instance.primary_node,
10654 "snodes": instance.secondary_nodes,
10656 # this happens to be the same format used for hooks
10657 "nics": _NICListToTuple(self, instance.nics),
10658 "disk_template": instance.disk_template,
10660 "hypervisor": instance.hypervisor,
10661 "network_port": instance.network_port,
10662 "hv_instance": instance.hvparams,
10663 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10664 "be_instance": instance.beparams,
10665 "be_actual": cluster.FillBE(instance),
10666 "os_instance": instance.osparams,
10667 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10668 "serial_no": instance.serial_no,
10669 "mtime": instance.mtime,
10670 "ctime": instance.ctime,
10671 "uuid": instance.uuid,
10677 class LUInstanceSetParams(LogicalUnit):
10678 """Modifies an instances's parameters.
10681 HPATH = "instance-modify"
10682 HTYPE = constants.HTYPE_INSTANCE
10685 def CheckArguments(self):
10686 if not (self.op.nics or self.op.disks or self.op.disk_template or
10687 self.op.hvparams or self.op.beparams or self.op.os_name):
10688 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10690 if self.op.hvparams:
10691 _CheckGlobalHvParams(self.op.hvparams)
10695 for disk_op, disk_dict in self.op.disks:
10696 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10697 if disk_op == constants.DDM_REMOVE:
10698 disk_addremove += 1
10700 elif disk_op == constants.DDM_ADD:
10701 disk_addremove += 1
10703 if not isinstance(disk_op, int):
10704 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10705 if not isinstance(disk_dict, dict):
10706 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10707 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10709 if disk_op == constants.DDM_ADD:
10710 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10711 if mode not in constants.DISK_ACCESS_SET:
10712 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10713 errors.ECODE_INVAL)
10714 size = disk_dict.get(constants.IDISK_SIZE, None)
10716 raise errors.OpPrereqError("Required disk parameter size missing",
10717 errors.ECODE_INVAL)
10720 except (TypeError, ValueError), err:
10721 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10722 str(err), errors.ECODE_INVAL)
10723 disk_dict[constants.IDISK_SIZE] = size
10725 # modification of disk
10726 if constants.IDISK_SIZE in disk_dict:
10727 raise errors.OpPrereqError("Disk size change not possible, use"
10728 " grow-disk", errors.ECODE_INVAL)
10730 if disk_addremove > 1:
10731 raise errors.OpPrereqError("Only one disk add or remove operation"
10732 " supported at a time", errors.ECODE_INVAL)
10734 if self.op.disks and self.op.disk_template is not None:
10735 raise errors.OpPrereqError("Disk template conversion and other disk"
10736 " changes not supported at the same time",
10737 errors.ECODE_INVAL)
10739 if (self.op.disk_template and
10740 self.op.disk_template in constants.DTS_INT_MIRROR and
10741 self.op.remote_node is None):
10742 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10743 " one requires specifying a secondary node",
10744 errors.ECODE_INVAL)
10748 for nic_op, nic_dict in self.op.nics:
10749 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10750 if nic_op == constants.DDM_REMOVE:
10753 elif nic_op == constants.DDM_ADD:
10756 if not isinstance(nic_op, int):
10757 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10758 if not isinstance(nic_dict, dict):
10759 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10760 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10762 # nic_dict should be a dict
10763 nic_ip = nic_dict.get(constants.INIC_IP, None)
10764 if nic_ip is not None:
10765 if nic_ip.lower() == constants.VALUE_NONE:
10766 nic_dict[constants.INIC_IP] = None
10768 if not netutils.IPAddress.IsValid(nic_ip):
10769 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10770 errors.ECODE_INVAL)
10772 nic_bridge = nic_dict.get("bridge", None)
10773 nic_link = nic_dict.get(constants.INIC_LINK, None)
10774 if nic_bridge and nic_link:
10775 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10776 " at the same time", errors.ECODE_INVAL)
10777 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10778 nic_dict["bridge"] = None
10779 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10780 nic_dict[constants.INIC_LINK] = None
10782 if nic_op == constants.DDM_ADD:
10783 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10784 if nic_mac is None:
10785 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10787 if constants.INIC_MAC in nic_dict:
10788 nic_mac = nic_dict[constants.INIC_MAC]
10789 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10790 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10792 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10793 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10794 " modifying an existing nic",
10795 errors.ECODE_INVAL)
10797 if nic_addremove > 1:
10798 raise errors.OpPrereqError("Only one NIC add or remove operation"
10799 " supported at a time", errors.ECODE_INVAL)
10801 def ExpandNames(self):
10802 self._ExpandAndLockInstance()
10803 self.needed_locks[locking.LEVEL_NODE] = []
10804 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10806 def DeclareLocks(self, level):
10807 if level == locking.LEVEL_NODE:
10808 self._LockInstancesNodes()
10809 if self.op.disk_template and self.op.remote_node:
10810 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10811 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10813 def BuildHooksEnv(self):
10814 """Build hooks env.
10816 This runs on the master, primary and secondaries.
10820 if constants.BE_MEMORY in self.be_new:
10821 args["memory"] = self.be_new[constants.BE_MEMORY]
10822 if constants.BE_VCPUS in self.be_new:
10823 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10824 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10825 # information at all.
10828 nic_override = dict(self.op.nics)
10829 for idx, nic in enumerate(self.instance.nics):
10830 if idx in nic_override:
10831 this_nic_override = nic_override[idx]
10833 this_nic_override = {}
10834 if constants.INIC_IP in this_nic_override:
10835 ip = this_nic_override[constants.INIC_IP]
10838 if constants.INIC_MAC in this_nic_override:
10839 mac = this_nic_override[constants.INIC_MAC]
10842 if idx in self.nic_pnew:
10843 nicparams = self.nic_pnew[idx]
10845 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10846 mode = nicparams[constants.NIC_MODE]
10847 link = nicparams[constants.NIC_LINK]
10848 args["nics"].append((ip, mac, mode, link))
10849 if constants.DDM_ADD in nic_override:
10850 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10851 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
10852 nicparams = self.nic_pnew[constants.DDM_ADD]
10853 mode = nicparams[constants.NIC_MODE]
10854 link = nicparams[constants.NIC_LINK]
10855 args["nics"].append((ip, mac, mode, link))
10856 elif constants.DDM_REMOVE in nic_override:
10857 del args["nics"][-1]
10859 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10860 if self.op.disk_template:
10861 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10865 def BuildHooksNodes(self):
10866 """Build hooks nodes.
10869 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10872 def CheckPrereq(self):
10873 """Check prerequisites.
10875 This only checks the instance list against the existing names.
10878 # checking the new params on the primary/secondary nodes
10880 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10881 cluster = self.cluster = self.cfg.GetClusterInfo()
10882 assert self.instance is not None, \
10883 "Cannot retrieve locked instance %s" % self.op.instance_name
10884 pnode = instance.primary_node
10885 nodelist = list(instance.all_nodes)
10888 if self.op.os_name and not self.op.force:
10889 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10890 self.op.force_variant)
10891 instance_os = self.op.os_name
10893 instance_os = instance.os
10895 if self.op.disk_template:
10896 if instance.disk_template == self.op.disk_template:
10897 raise errors.OpPrereqError("Instance already has disk template %s" %
10898 instance.disk_template, errors.ECODE_INVAL)
10900 if (instance.disk_template,
10901 self.op.disk_template) not in self._DISK_CONVERSIONS:
10902 raise errors.OpPrereqError("Unsupported disk template conversion from"
10903 " %s to %s" % (instance.disk_template,
10904 self.op.disk_template),
10905 errors.ECODE_INVAL)
10906 _CheckInstanceDown(self, instance, "cannot change disk template")
10907 if self.op.disk_template in constants.DTS_INT_MIRROR:
10908 if self.op.remote_node == pnode:
10909 raise errors.OpPrereqError("Given new secondary node %s is the same"
10910 " as the primary node of the instance" %
10911 self.op.remote_node, errors.ECODE_STATE)
10912 _CheckNodeOnline(self, self.op.remote_node)
10913 _CheckNodeNotDrained(self, self.op.remote_node)
10914 # FIXME: here we assume that the old disk template is DT_PLAIN
10915 assert instance.disk_template == constants.DT_PLAIN
10916 disks = [{constants.IDISK_SIZE: d.size,
10917 constants.IDISK_VG: d.logical_id[0]}
10918 for d in instance.disks]
10919 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10920 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
10922 # hvparams processing
10923 if self.op.hvparams:
10924 hv_type = instance.hypervisor
10925 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
10926 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
10927 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
10930 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
10931 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
10932 self.hv_new = hv_new # the new actual values
10933 self.hv_inst = i_hvdict # the new dict (without defaults)
10935 self.hv_new = self.hv_inst = {}
10937 # beparams processing
10938 if self.op.beparams:
10939 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
10941 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
10942 be_new = cluster.SimpleFillBE(i_bedict)
10943 self.be_new = be_new # the new actual values
10944 self.be_inst = i_bedict # the new dict (without defaults)
10946 self.be_new = self.be_inst = {}
10947 be_old = cluster.FillBE(instance)
10949 # osparams processing
10950 if self.op.osparams:
10951 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
10952 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
10953 self.os_inst = i_osdict # the new dict (without defaults)
10959 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
10960 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
10961 mem_check_list = [pnode]
10962 if be_new[constants.BE_AUTO_BALANCE]:
10963 # either we changed auto_balance to yes or it was from before
10964 mem_check_list.extend(instance.secondary_nodes)
10965 instance_info = self.rpc.call_instance_info(pnode, instance.name,
10966 instance.hypervisor)
10967 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
10968 instance.hypervisor)
10969 pninfo = nodeinfo[pnode]
10970 msg = pninfo.fail_msg
10972 # Assume the primary node is unreachable and go ahead
10973 self.warn.append("Can't get info from primary node %s: %s" %
10975 elif not isinstance(pninfo.payload.get("memory_free", None), int):
10976 self.warn.append("Node data from primary node %s doesn't contain"
10977 " free memory information" % pnode)
10978 elif instance_info.fail_msg:
10979 self.warn.append("Can't get instance runtime information: %s" %
10980 instance_info.fail_msg)
10982 if instance_info.payload:
10983 current_mem = int(instance_info.payload["memory"])
10985 # Assume instance not running
10986 # (there is a slight race condition here, but it's not very probable,
10987 # and we have no other way to check)
10989 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
10990 pninfo.payload["memory_free"])
10992 raise errors.OpPrereqError("This change will prevent the instance"
10993 " from starting, due to %d MB of memory"
10994 " missing on its primary node" % miss_mem,
10995 errors.ECODE_NORES)
10997 if be_new[constants.BE_AUTO_BALANCE]:
10998 for node, nres in nodeinfo.items():
10999 if node not in instance.secondary_nodes:
11001 nres.Raise("Can't get info from secondary node %s" % node,
11002 prereq=True, ecode=errors.ECODE_STATE)
11003 if not isinstance(nres.payload.get("memory_free", None), int):
11004 raise errors.OpPrereqError("Secondary node %s didn't return free"
11005 " memory information" % node,
11006 errors.ECODE_STATE)
11007 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11008 raise errors.OpPrereqError("This change will prevent the instance"
11009 " from failover to its secondary node"
11010 " %s, due to not enough memory" % node,
11011 errors.ECODE_STATE)
11015 self.nic_pinst = {}
11016 for nic_op, nic_dict in self.op.nics:
11017 if nic_op == constants.DDM_REMOVE:
11018 if not instance.nics:
11019 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11020 errors.ECODE_INVAL)
11022 if nic_op != constants.DDM_ADD:
11024 if not instance.nics:
11025 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11026 " no NICs" % nic_op,
11027 errors.ECODE_INVAL)
11028 if nic_op < 0 or nic_op >= len(instance.nics):
11029 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11031 (nic_op, len(instance.nics) - 1),
11032 errors.ECODE_INVAL)
11033 old_nic_params = instance.nics[nic_op].nicparams
11034 old_nic_ip = instance.nics[nic_op].ip
11036 old_nic_params = {}
11039 update_params_dict = dict([(key, nic_dict[key])
11040 for key in constants.NICS_PARAMETERS
11041 if key in nic_dict])
11043 if "bridge" in nic_dict:
11044 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
11046 new_nic_params = _GetUpdatedParams(old_nic_params,
11047 update_params_dict)
11048 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11049 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11050 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11051 self.nic_pinst[nic_op] = new_nic_params
11052 self.nic_pnew[nic_op] = new_filled_nic_params
11053 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11055 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11056 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11057 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11059 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11061 self.warn.append(msg)
11063 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11064 if new_nic_mode == constants.NIC_MODE_ROUTED:
11065 if constants.INIC_IP in nic_dict:
11066 nic_ip = nic_dict[constants.INIC_IP]
11068 nic_ip = old_nic_ip
11070 raise errors.OpPrereqError("Cannot set the nic ip to None"
11071 " on a routed nic", errors.ECODE_INVAL)
11072 if constants.INIC_MAC in nic_dict:
11073 nic_mac = nic_dict[constants.INIC_MAC]
11074 if nic_mac is None:
11075 raise errors.OpPrereqError("Cannot set the nic mac to None",
11076 errors.ECODE_INVAL)
11077 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11078 # otherwise generate the mac
11079 nic_dict[constants.INIC_MAC] = \
11080 self.cfg.GenerateMAC(self.proc.GetECId())
11081 else:
11082 # or validate/reserve the current one
11083 try:
11084 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11085 except errors.ReservationError:
11086 raise errors.OpPrereqError("MAC address %s already in use"
11087 " in cluster" % nic_mac,
11088 errors.ECODE_NOTUNIQUE)
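# Disk changes: self.op.disks is a list of (disk_op, disk_dict) pairs,
# where disk_op is DDM_ADD, DDM_REMOVE or the index of an existing disk;
# only indices and limits are validated here, the actual modifications are
# applied in Exec().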
11091 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11092 raise errors.OpPrereqError("Disk operations not supported for"
11093 " diskless instances",
11094 errors.ECODE_INVAL)
11095 for disk_op, _ in self.op.disks:
11096 if disk_op == constants.DDM_REMOVE:
11097 if len(instance.disks) == 1:
11098 raise errors.OpPrereqError("Cannot remove the last disk of"
11099 " an instance", errors.ECODE_INVAL)
11100 _CheckInstanceDown(self, instance, "cannot remove disks")
11102 if (disk_op == constants.DDM_ADD and
11103 len(instance.disks) >= constants.MAX_DISKS):
11104 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11105 " add more" % constants.MAX_DISKS,
11106 errors.ECODE_STATE)
11107 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11109 if disk_op < 0 or disk_op >= len(instance.disks):
11110 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11111 " are 0 to %d" %
11112 (disk_op, len(instance.disks)),
11113 errors.ECODE_INVAL)
11117 def _ConvertPlainToDrbd(self, feedback_fn):
11118 """Converts an instance from plain to drbd.
11121 feedback_fn("Converting template to drbd")
11122 instance = self.instance
11123 pnode = instance.primary_node
11124 snode = self.op.remote_node
11126 # create a fake disk info for _GenerateDiskTemplate
11127 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11128 constants.IDISK_VG: d.logical_id[0]}
11129 for d in instance.disks]
11130 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11131 instance.name, pnode, [snode],
11132 disk_info, None, None, 0, feedback_fn)
11133 info = _GetInstanceInfoText(instance)
11134 feedback_fn("Creating additional volumes...")
11135 # first, create the missing data and meta devices
11136 for disk in new_disks:
11137 # unfortunately this is... not too nice
11138 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11139 info, True)
11140 for child in disk.children:
11141 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11142 # at this stage, all new LVs have been created, we can rename the
11143 # old ones
11144 feedback_fn("Renaming original volumes...")
11145 rename_list = [(o, n.children[0].logical_id)
11146 for (o, n) in zip(instance.disks, new_disks)]
11147 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11148 result.Raise("Failed to rename original LVs")
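# After the rename the original LVs carry the names expected for the data
# children (children[0]) of the new DRBD disks, so the DRBD devices created
# below are assembled on top of the existing data without copying it.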
11150 feedback_fn("Initializing DRBD devices...")
11151 # all child devices are in place, we can now create the DRBD devices
11152 for disk in new_disks:
11153 for node in [pnode, snode]:
11154 f_create = node == pnode
11155 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11157 # at this point, the instance has been modified
11158 instance.disk_template = constants.DT_DRBD8
11159 instance.disks = new_disks
11160 self.cfg.Update(instance, feedback_fn)
11162 # disks are created, waiting for sync
11163 disk_abort = not _WaitForSync(self, instance,
11164 oneshot=not self.op.wait_for_sync)
11165 if disk_abort:
11166 raise errors.OpExecError("There are some degraded disks for"
11167 " this instance, please cleanup manually")
11169 def _ConvertDrbdToPlain(self, feedback_fn):
11170 """Converts an instance from drbd to plain.
11173 instance = self.instance
11174 assert len(instance.secondary_nodes) == 1
11175 pnode = instance.primary_node
11176 snode = instance.secondary_nodes[0]
11177 feedback_fn("Converting template to plain")
11179 old_disks = instance.disks
11180 new_disks = [d.children[0] for d in old_disks]
11182 # copy over size and mode
11183 for parent, child in zip(old_disks, new_disks):
11184 child.size = parent.size
11185 child.mode = parent.mode
11187 # update instance structure
11188 instance.disks = new_disks
11189 instance.disk_template = constants.DT_PLAIN
11190 self.cfg.Update(instance, feedback_fn)
11192 feedback_fn("Removing volumes on the secondary node...")
11193 for disk in old_disks:
11194 self.cfg.SetDiskID(disk, snode)
11195 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11196 if msg:
11197 self.LogWarning("Could not remove block device %s on node %s,"
11198 " continuing anyway: %s", disk.iv_name, snode, msg)
11200 feedback_fn("Removing unneeded volumes on the primary node...")
11201 for idx, disk in enumerate(old_disks):
11202 meta = disk.children[1]
11203 self.cfg.SetDiskID(meta, pnode)
11204 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11205 if msg:
11206 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11207 " continuing anyway: %s", idx, pnode, msg)
11209 # this is a DRBD disk, return its port to the pool
11210 for disk in old_disks:
11211 tcp_port = disk.logical_id[2]
11212 self.cfg.AddTcpUdpPort(tcp_port)
11214 def Exec(self, feedback_fn):
11215 """Modifies an instance.
11217 All parameters take effect only at the next restart of the instance.
11220 # Process here the warnings from CheckPrereq, as we don't have a
11221 # feedback_fn there.
11222 for warn in self.warn:
11223 feedback_fn("WARNING: %s" % warn)
11225 result = []
11226 instance = self.instance
11228 for disk_op, disk_dict in self.op.disks:
11229 if disk_op == constants.DDM_REMOVE:
11230 # remove the last disk
11231 device = instance.disks.pop()
11232 device_idx = len(instance.disks)
11233 for node, disk in device.ComputeNodeTree(instance.primary_node):
11234 self.cfg.SetDiskID(disk, node)
11235 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11236 if msg:
11237 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11238 " continuing anyway", device_idx, node, msg)
11239 result.append(("disk/%d" % device_idx, "remove"))
11241 # if this is a DRBD disk, return its port to the pool
11242 if device.dev_type in constants.LDS_DRBD:
11243 tcp_port = device.logical_id[2]
11244 self.cfg.AddTcpUdpPort(tcp_port)
11245 elif disk_op == constants.DDM_ADD:
11247 if instance.disk_template in (constants.DT_FILE,
11248 constants.DT_SHARED_FILE):
11249 file_driver, file_path = instance.disks[0].logical_id
11250 file_path = os.path.dirname(file_path)
11251 else:
11252 file_driver = file_path = None
11253 disk_idx_base = len(instance.disks)
11254 new_disk = _GenerateDiskTemplate(self,
11255 instance.disk_template,
11256 instance.name, instance.primary_node,
11257 instance.secondary_nodes,
11258 [disk_dict],
11259 file_path,
11260 file_driver,
11261 disk_idx_base, feedback_fn)[0]
11262 instance.disks.append(new_disk)
11263 info = _GetInstanceInfoText(instance)
11265 logging.info("Creating volume %s for instance %s",
11266 new_disk.iv_name, instance.name)
11267 # Note: this needs to be kept in sync with _CreateDisks
11269 for node in instance.all_nodes:
11270 f_create = node == instance.primary_node
11271 try:
11272 _CreateBlockDev(self, node, instance, new_disk,
11273 f_create, info, f_create)
11274 except errors.OpExecError, err:
11275 self.LogWarning("Failed to create volume %s (%s) on"
11276 " node %s: %s",
11277 new_disk.iv_name, new_disk, node, err)
11278 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11279 (new_disk.size, new_disk.mode)))
11280 else:
11281 # change a given disk
11282 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11283 result.append(("disk.mode/%d" % disk_op,
11284 disk_dict[constants.IDISK_MODE]))
11286 if self.op.disk_template:
11287 r_shut = _ShutdownInstanceDisks(self, instance)
11288 if not r_shut:
11289 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11290 " proceed with disk template conversion")
11291 mode = (instance.disk_template, self.op.disk_template)
11292 try:
11293 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11294 except:
11295 self.cfg.ReleaseDRBDMinors(instance.name)
11296 raise
11297 result.append(("disk_template", self.op.disk_template))
11300 for nic_op, nic_dict in self.op.nics:
11301 if nic_op == constants.DDM_REMOVE:
11302 # remove the last nic
11303 del instance.nics[-1]
11304 result.append(("nic.%d" % len(instance.nics), "remove"))
11305 elif nic_op == constants.DDM_ADD:
11306 # mac and bridge should be set, by now
11307 mac = nic_dict[constants.INIC_MAC]
11308 ip = nic_dict.get(constants.INIC_IP, None)
11309 nicparams = self.nic_pinst[constants.DDM_ADD]
11310 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11311 instance.nics.append(new_nic)
11312 result.append(("nic.%d" % (len(instance.nics) - 1),
11313 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11314 (new_nic.mac, new_nic.ip,
11315 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11316 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11317 )))
11318 else:
11319 for key in (constants.INIC_MAC, constants.INIC_IP):
11320 if key in nic_dict:
11321 setattr(instance.nics[nic_op], key, nic_dict[key])
11322 if nic_op in self.nic_pinst:
11323 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11324 for key, val in nic_dict.iteritems():
11325 result.append(("nic.%s/%d" % (key, nic_op), val))
11328 if self.op.hvparams:
11329 instance.hvparams = self.hv_inst
11330 for key, val in self.op.hvparams.iteritems():
11331 result.append(("hv/%s" % key, val))
11334 if self.op.beparams:
11335 instance.beparams = self.be_inst
11336 for key, val in self.op.beparams.iteritems():
11337 result.append(("be/%s" % key, val))
11340 if self.op.os_name:
11341 instance.os = self.op.os_name
11344 if self.op.osparams:
11345 instance.osparams = self.os_inst
11346 for key, val in self.op.osparams.iteritems():
11347 result.append(("os/%s" % key, val))
11349 self.cfg.Update(instance, feedback_fn)
11351 return result
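# Dispatch table used by Exec() above: maps (current template, requested
# template) pairs to the conversion helpers defined earlier in this class.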
11353 _DISK_CONVERSIONS = {
11354 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11355 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
11356 }
11359 class LUInstanceChangeGroup(LogicalUnit):
11360 HPATH = "instance-change-group"
11361 HTYPE = constants.HTYPE_INSTANCE
11364 def ExpandNames(self):
11365 self.share_locks = _ShareAll()
11366 self.needed_locks = {
11367 locking.LEVEL_NODEGROUP: [],
11368 locking.LEVEL_NODE: [],
11369 }
11371 self._ExpandAndLockInstance()
11373 if self.op.target_groups:
11374 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11375 self.op.target_groups)
11376 else:
11377 self.req_target_uuids = None
11379 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11381 def DeclareLocks(self, level):
11382 if level == locking.LEVEL_NODEGROUP:
11383 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11385 if self.req_target_uuids:
11386 lock_groups = set(self.req_target_uuids)
11388 # Lock all groups used by instance optimistically; this requires going
11389 # via the node before it's locked, requiring verification later on
11390 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11391 lock_groups.update(instance_groups)
11392 else:
11393 # No target groups, need to lock all of them
11394 lock_groups = locking.ALL_SET
11396 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11398 elif level == locking.LEVEL_NODE:
11399 if self.req_target_uuids:
11400 # Lock all nodes used by instances
11401 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11402 self._LockInstancesNodes()
11404 # Lock all nodes in all potential target groups
11405 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11406 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11407 member_nodes = [node_name
11408 for group in lock_groups
11409 for node_name in self.cfg.GetNodeGroup(group).members]
11410 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11411 else:
11412 # Lock all nodes as all groups are potential targets
11413 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11415 def CheckPrereq(self):
11416 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11417 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11418 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11420 assert (self.req_target_uuids is None or
11421 owned_groups.issuperset(self.req_target_uuids))
11422 assert owned_instances == set([self.op.instance_name])
11424 # Get instance information
11425 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11427 # Check if node groups for locked instance are still correct
11428 assert owned_nodes.issuperset(self.instance.all_nodes), \
11429 ("Instance %s's nodes changed while we kept the lock" %
11430 self.op.instance_name)
11432 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11433 owned_groups)
11435 if self.req_target_uuids:
11436 # User requested specific target groups
11437 self.target_uuids = self.req_target_uuids
11438 else:
11439 # All groups except those used by the instance are potential targets
11440 self.target_uuids = owned_groups - inst_groups
11440 self.target_uuids = owned_groups - inst_groups
11442 conflicting_groups = self.target_uuids & inst_groups
11443 if conflicting_groups:
11444 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11445 " used by the instance '%s'" %
11446 (utils.CommaJoin(conflicting_groups),
11447 self.op.instance_name),
11448 errors.ECODE_INVAL)
11450 if not self.target_uuids:
11451 raise errors.OpPrereqError("There are no possible target groups",
11452 errors.ECODE_INVAL)
11454 def BuildHooksEnv(self):
11455 """Build hooks env.
11458 assert self.target_uuids
11460 env = {
11461 "TARGET_GROUPS": " ".join(self.target_uuids),
11462 }
11464 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11466 return env
11468 def BuildHooksNodes(self):
11469 """Build hooks nodes.
11472 mn = self.cfg.GetMasterNode()
11473 return ([mn], [mn])
11475 def Exec(self, feedback_fn):
11476 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11478 assert instances == [self.op.instance_name], "Instance not locked"
11480 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11481 instances=instances, target_groups=list(self.target_uuids))
11483 ial.Run(self.op.iallocator)
11485 if not ial.success:
11486 raise errors.OpPrereqError("Can't compute solution for changing group of"
11487 " instance '%s' using iallocator '%s': %s" %
11488 (self.op.instance_name, self.op.iallocator,
11489 ial.info),
11490 errors.ECODE_NORES)
11492 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
11494 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11495 " instance '%s'", len(jobs), self.op.instance_name)
11497 return ResultWithJobs(jobs)
11500 class LUBackupQuery(NoHooksLU):
11501 """Query the exports list
11506 def ExpandNames(self):
11507 self.needed_locks = {}
11508 self.share_locks[locking.LEVEL_NODE] = 1
11509 if not self.op.nodes:
11510 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11511 else:
11512 self.needed_locks[locking.LEVEL_NODE] = \
11513 _GetWantedNodes(self, self.op.nodes)
11515 def Exec(self, feedback_fn):
11516 """Compute the list of all the exported system images.
11519 @return: a dictionary with the structure node->(export-list)
11520 where export-list is a list of the instances exported on
11524 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11525 rpcresult = self.rpc.call_export_list(self.nodes)
11526 result = {}
11527 for node in rpcresult:
11528 if rpcresult[node].fail_msg:
11529 result[node] = False
11530 else:
11531 result[node] = rpcresult[node].payload
11533 return result
11536 class LUBackupPrepare(NoHooksLU):
11537 """Prepares an instance for an export and returns useful information.
11542 def ExpandNames(self):
11543 self._ExpandAndLockInstance()
11545 def CheckPrereq(self):
11546 """Check prerequisites.
11549 instance_name = self.op.instance_name
11551 self.instance = self.cfg.GetInstanceInfo(instance_name)
11552 assert self.instance is not None, \
11553 "Cannot retrieve locked instance %s" % self.op.instance_name
11554 _CheckNodeOnline(self, self.instance.primary_node)
11556 self._cds = _GetClusterDomainSecret()
11558 def Exec(self, feedback_fn):
11559 """Prepares an instance for an export.
11562 instance = self.instance
11564 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11565 salt = utils.GenerateSecret(8)
11567 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11568 result = self.rpc.call_x509_cert_create(instance.primary_node,
11569 constants.RIE_CERT_VALIDITY)
11570 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11572 (name, cert_pem) = result.payload
11574 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11575 cert_pem)
11577 return {
11578 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11579 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
11580 salt),
11581 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11582 }
11584 return None
11587 class LUBackupExport(LogicalUnit):
11588 """Export an instance to an image in the cluster.
11591 HPATH = "instance-export"
11592 HTYPE = constants.HTYPE_INSTANCE
11595 def CheckArguments(self):
11596 """Check the arguments.
11599 self.x509_key_name = self.op.x509_key_name
11600 self.dest_x509_ca_pem = self.op.destination_x509_ca
11602 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11603 if not self.x509_key_name:
11604 raise errors.OpPrereqError("Missing X509 key name for encryption",
11605 errors.ECODE_INVAL)
11607 if not self.dest_x509_ca_pem:
11608 raise errors.OpPrereqError("Missing destination X509 CA",
11609 errors.ECODE_INVAL)
11611 def ExpandNames(self):
11612 self._ExpandAndLockInstance()
11614 # Lock all nodes for local exports
11615 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11616 # FIXME: lock only instance primary and destination node
11618 # Sad but true, for now we have to lock all nodes, as we don't know where
11619 # the previous export might be, and in this LU we search for it and
11620 # remove it from its current node. In the future we could fix this by:
11621 # - making a tasklet to search (share-lock all), then create the
11622 # new one, then one to remove, after
11623 # - removing the removal operation altogether
11624 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11626 def DeclareLocks(self, level):
11627 """Last minute lock declaration."""
11628 # All nodes are locked anyway, so nothing to do here.
11630 def BuildHooksEnv(self):
11631 """Build hooks env.
11633 This will run on the master, primary node and target node.
11636 env = {
11637 "EXPORT_MODE": self.op.mode,
11638 "EXPORT_NODE": self.op.target_node,
11639 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11640 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11641 # TODO: Generic function for boolean env variables
11642 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11643 }
11645 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11647 return env
11649 def BuildHooksNodes(self):
11650 """Build hooks nodes.
11653 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11655 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11656 nl.append(self.op.target_node)
11658 return (nl, nl)
11660 def CheckPrereq(self):
11661 """Check prerequisites.
11663 This checks that the instance and node names are valid.
11666 instance_name = self.op.instance_name
11668 self.instance = self.cfg.GetInstanceInfo(instance_name)
11669 assert self.instance is not None, \
11670 "Cannot retrieve locked instance %s" % self.op.instance_name
11671 _CheckNodeOnline(self, self.instance.primary_node)
11673 if (self.op.remove_instance and self.instance.admin_up and
11674 not self.op.shutdown):
11675 raise errors.OpPrereqError("Can not remove instance without shutting it"
11676 " down before")
11678 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11679 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11680 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11681 assert self.dst_node is not None
11683 _CheckNodeOnline(self, self.dst_node.name)
11684 _CheckNodeNotDrained(self, self.dst_node.name)
11686 self._cds = None
11687 self.dest_disk_info = None
11688 self.dest_x509_ca = None
11690 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11691 self.dst_node = None
11693 if len(self.op.target_node) != len(self.instance.disks):
11694 raise errors.OpPrereqError(("Received destination information for %s"
11695 " disks, but instance %s has %s disks") %
11696 (len(self.op.target_node), instance_name,
11697 len(self.instance.disks)),
11698 errors.ECODE_INVAL)
11700 cds = _GetClusterDomainSecret()
11702 # Check X509 key name
11703 try:
11704 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11705 except (TypeError, ValueError), err:
11706 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11708 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11709 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11710 errors.ECODE_INVAL)
11712 # Load and verify CA
11713 try:
11714 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11715 except OpenSSL.crypto.Error, err:
11716 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11717 (err, ), errors.ECODE_INVAL)
11719 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11720 if errcode is not None:
11721 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11722 (msg, ), errors.ECODE_INVAL)
11724 self.dest_x509_ca = cert
11726 # Verify target information
11727 disk_info = []
11728 for idx, disk_data in enumerate(self.op.target_node):
11729 try:
11730 (host, port, magic) = \
11731 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11732 except errors.GenericError, err:
11733 raise errors.OpPrereqError("Target info for disk %s: %s" %
11734 (idx, err), errors.ECODE_INVAL)
11736 disk_info.append((host, port, magic))
11738 assert len(disk_info) == len(self.op.target_node)
11739 self.dest_disk_info = disk_info
11741 else:
11742 raise errors.ProgrammerError("Unhandled export mode %r" %
11743 self.op.mode)
11745 # instance disk type verification
11746 # TODO: Implement export support for file-based disks
11747 for disk in self.instance.disks:
11748 if disk.dev_type == constants.LD_FILE:
11749 raise errors.OpPrereqError("Export not supported for instances with"
11750 " file-based disks", errors.ECODE_INVAL)
11752 def _CleanupExports(self, feedback_fn):
11753 """Removes exports of current instance from all other nodes.
11755 If an instance in a cluster with nodes A..D was exported to node C, its
11756 exports will be removed from the nodes A, B and D.
11759 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11761 nodelist = self.cfg.GetNodeList()
11762 nodelist.remove(self.dst_node.name)
11764 # on one-node clusters nodelist will be empty after the removal
11765 # if we proceed the backup would be removed because OpBackupQuery
11766 # substitutes an empty list with the full cluster node list.
11767 iname = self.instance.name
11768 if nodelist:
11769 feedback_fn("Removing old exports for instance %s" % iname)
11770 exportlist = self.rpc.call_export_list(nodelist)
11771 for node in exportlist:
11772 if exportlist[node].fail_msg:
11773 continue
11774 if iname in exportlist[node].payload:
11775 msg = self.rpc.call_export_remove(node, iname).fail_msg
11776 if msg:
11777 self.LogWarning("Could not remove older export for instance %s"
11778 " on node %s: %s", iname, node, msg)
11780 def Exec(self, feedback_fn):
11781 """Export an instance to an image in the cluster.
11784 assert self.op.mode in constants.EXPORT_MODES
11786 instance = self.instance
11787 src_node = instance.primary_node
11789 if self.op.shutdown:
11790 # shutdown the instance, but not the disks
11791 feedback_fn("Shutting down instance %s" % instance.name)
11792 result = self.rpc.call_instance_shutdown(src_node, instance,
11793 self.op.shutdown_timeout)
11794 # TODO: Maybe ignore failures if ignore_remove_failures is set
11795 result.Raise("Could not shutdown instance %s on"
11796 " node %s" % (instance.name, src_node))
11798 # set the disks ID correctly since call_instance_start needs the
11799 # correct drbd minor to create the symlinks
11800 for disk in instance.disks:
11801 self.cfg.SetDiskID(disk, src_node)
11803 activate_disks = (not instance.admin_up)
11805 if activate_disks:
11806 # Activate the instance disks if we're exporting a stopped instance
11807 feedback_fn("Activating disks for %s" % instance.name)
11808 _StartInstanceDisks(self, instance, None)
11810 try:
11811 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11812 instance)
11814 helper.CreateSnapshots()
11815 try:
11816 if (self.op.shutdown and instance.admin_up and
11817 not self.op.remove_instance):
11818 assert not activate_disks
11819 feedback_fn("Starting instance %s" % instance.name)
11820 result = self.rpc.call_instance_start(src_node, instance,
11822 msg = result.fail_msg
11823 if msg:
11824 feedback_fn("Failed to start instance: %s" % msg)
11825 _ShutdownInstanceDisks(self, instance)
11826 raise errors.OpExecError("Could not start instance: %s" % msg)
11828 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11829 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11830 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11831 connect_timeout = constants.RIE_CONNECT_TIMEOUT
11832 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
11834 (key_name, _, _) = self.x509_key_name
11836 dest_ca_pem = \
11837 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
11838 self.dest_x509_ca)
11840 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
11841 key_name, dest_ca_pem,
11842 timeouts)
11843 finally:
11844 helper.Cleanup()
11846 # Check for backwards compatibility
11847 assert len(dresults) == len(instance.disks)
11848 assert compat.all(isinstance(i, bool) for i in dresults), \
11849 "Not all results are boolean: %r" % dresults
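# dresults holds one boolean per instance disk (success of that disk's
# export); fin_resu reports whether export finalization succeeded. Both are
# checked below and turned into a single OpExecError on failure.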
11851 if activate_disks:
11853 feedback_fn("Deactivating disks for %s" % instance.name)
11854 _ShutdownInstanceDisks(self, instance)
11856 if not (compat.all(dresults) and fin_resu):
11857 failures = []
11858 if not fin_resu:
11859 failures.append("export finalization")
11860 if not compat.all(dresults):
11861 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
11862 if not dsk)
11863 failures.append("disk export: disk(s) %s" % fdsk)
11865 raise errors.OpExecError("Export failed, errors in %s" %
11866 utils.CommaJoin(failures))
11868 # At this point, the export was successful, we can cleanup/finish
11870 # Remove instance if requested
11871 if self.op.remove_instance:
11872 feedback_fn("Removing instance %s" % instance.name)
11873 _RemoveInstance(self, feedback_fn, instance,
11874 self.op.ignore_remove_failures)
11876 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11877 self._CleanupExports(feedback_fn)
11879 return fin_resu, dresults
11882 class LUBackupRemove(NoHooksLU):
11883 """Remove exports related to the named instance.
11888 def ExpandNames(self):
11889 self.needed_locks = {}
11890 # We need all nodes to be locked in order for RemoveExport to work, but we
11891 # don't need to lock the instance itself, as nothing will happen to it (and
11892 # we can remove exports also for a removed instance)
11893 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11895 def Exec(self, feedback_fn):
11896 """Remove any export.
11899 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
11900 # If the instance was not found we'll try with the name that was passed in.
11901 # This will only work if it was an FQDN, though.
11902 fqdn_warn = False
11903 if not instance_name:
11904 fqdn_warn = True
11905 instance_name = self.op.instance_name
11907 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
11908 exportlist = self.rpc.call_export_list(locked_nodes)
11909 found = False
11910 for node in exportlist:
11911 msg = exportlist[node].fail_msg
11912 if msg:
11913 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
11914 continue
11915 if instance_name in exportlist[node].payload:
11916 found = True
11917 result = self.rpc.call_export_remove(node, instance_name)
11918 msg = result.fail_msg
11919 if msg:
11920 logging.error("Could not remove export for instance %s"
11921 " on node %s: %s", instance_name, node, msg)
11923 if fqdn_warn and not found:
11924 feedback_fn("Export not found. If trying to remove an export belonging"
11925 " to a deleted instance please use its Fully Qualified"
11926 " Domain Name.")
11929 class LUGroupAdd(LogicalUnit):
11930 """Logical unit for creating node groups.
11933 HPATH = "group-add"
11934 HTYPE = constants.HTYPE_GROUP
11937 def ExpandNames(self):
11938 # We need the new group's UUID here so that we can create and acquire the
11939 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
11940 # that it should not check whether the UUID exists in the configuration.
11941 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
11942 self.needed_locks = {}
11943 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
11945 def CheckPrereq(self):
11946 """Check prerequisites.
11948 This checks that the given group name is not an existing node group
11952 try:
11953 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11954 except errors.OpPrereqError:
11955 pass
11956 else:
11957 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
11958 " node group (UUID: %s)" %
11959 (self.op.group_name, existing_uuid),
11960 errors.ECODE_EXISTS)
11962 if self.op.ndparams:
11963 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
11965 def BuildHooksEnv(self):
11966 """Build hooks env.
11969 return {
11970 "GROUP_NAME": self.op.group_name,
11971 }
11973 def BuildHooksNodes(self):
11974 """Build hooks nodes.
11977 mn = self.cfg.GetMasterNode()
11978 return ([mn], [mn])
11980 def Exec(self, feedback_fn):
11981 """Add the node group to the cluster.
11984 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
11985 uuid=self.group_uuid,
11986 alloc_policy=self.op.alloc_policy,
11987 ndparams=self.op.ndparams)
11989 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
11990 del self.remove_locks[locking.LEVEL_NODEGROUP]
11993 class LUGroupAssignNodes(NoHooksLU):
11994 """Logical unit for assigning nodes to groups.
11999 def ExpandNames(self):
12000 # These raise errors.OpPrereqError on their own:
12001 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12002 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12004 # We want to lock all the affected nodes and groups. We have readily
12005 # available the list of nodes, and the *destination* group. To gather the
12006 # list of "source" groups, we need to fetch node information later on.
12007 self.needed_locks = {
12008 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12009 locking.LEVEL_NODE: self.op.nodes,
12010 }
12012 def DeclareLocks(self, level):
12013 if level == locking.LEVEL_NODEGROUP:
12014 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12016 # Try to get all affected nodes' groups without having the group or node
12017 # lock yet. Needs verification later in the code flow.
12018 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12020 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12022 def CheckPrereq(self):
12023 """Check prerequisites.
12026 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12027 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12028 frozenset(self.op.nodes))
12030 expected_locks = (set([self.group_uuid]) |
12031 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12032 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12033 if actual_locks != expected_locks:
12034 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12035 " current groups are '%s', used to be '%s'" %
12036 (utils.CommaJoin(expected_locks),
12037 utils.CommaJoin(actual_locks)))
12039 self.node_data = self.cfg.GetAllNodesInfo()
12040 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12041 instance_data = self.cfg.GetAllInstancesInfo()
12043 if self.group is None:
12044 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12045 (self.op.group_name, self.group_uuid))
12047 (new_splits, previous_splits) = \
12048 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12049 for node in self.op.nodes],
12050 self.node_data, instance_data)
12052 if new_splits:
12053 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12055 if not self.op.force:
12056 raise errors.OpExecError("The following instances get split by this"
12057 " change and --force was not given: %s" %
12058 fmt_new_splits)
12059 else:
12060 self.LogWarning("This operation will split the following instances: %s",
12061 fmt_new_splits)
12063 if previous_splits:
12064 self.LogWarning("In addition, these already-split instances continue"
12065 " to be split across groups: %s",
12066 utils.CommaJoin(utils.NiceSort(previous_splits)))
12068 def Exec(self, feedback_fn):
12069 """Assign nodes to a new group.
12072 mods = [(node_name, self.group_uuid) for node_name in self.op.nodes]
12074 self.cfg.AssignGroupNodes(mods)
12076 @staticmethod
12077 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12078 """Check for split instances after a node assignment.
12080 This method considers a series of node assignments as an atomic operation,
12081 and returns information about split instances after applying the set of
12084 In particular, it returns information about newly split instances, and
12085 instances that were already split, and remain so after the change.
12087 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12090 @type changes: list of (node_name, new_group_uuid) pairs.
12091 @param changes: list of node assignments to consider.
12092 @param node_data: a dict with data for all nodes
12093 @param instance_data: a dict with all instances to consider
12094 @rtype: a two-tuple
12095 @return: a list of instances that were previously okay and result split as a
12096 consequence of this change, and a list of instances that were previously
12097 split and this change does not fix.
12100 changed_nodes = dict((node, group) for node, group in changes
12101 if node_data[node].group != group)
12103 all_split_instances = set()
12104 previously_split_instances = set()
12106 def InstanceNodes(instance):
12107 return [instance.primary_node] + list(instance.secondary_nodes)
12109 for inst in instance_data.values():
12110 if inst.disk_template not in constants.DTS_INT_MIRROR:
12111 continue
12113 instance_nodes = InstanceNodes(inst)
12115 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12116 previously_split_instances.add(inst.name)
12118 if len(set(changed_nodes.get(node, node_data[node].group)
12119 for node in instance_nodes)) > 1:
12120 all_split_instances.add(inst.name)
12122 return (list(all_split_instances - previously_split_instances),
12123 list(previously_split_instances & all_split_instances))
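# Illustration (hypothetical names): for a DRBD instance with primary node1
# and secondary node2, both currently in group g1, a changes list of
# [("node2", "g2-uuid")] makes the instance span two groups, so it appears
# in the first returned list (newly split); an instance that already spans
# two groups and stays split appears in the second list instead.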
12126 class _GroupQuery(_QueryBase):
12127 FIELDS = query.GROUP_FIELDS
12129 def ExpandNames(self, lu):
12130 lu.needed_locks = {}
12132 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12133 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12135 if not self.names:
12136 self.wanted = [name_to_uuid[name]
12137 for name in utils.NiceSort(name_to_uuid.keys())]
12138 else:
12139 # Accept names to be either names or UUIDs.
12140 missing = []
12141 self.wanted = []
12142 all_uuid = frozenset(self._all_groups.keys())
12144 for name in self.names:
12145 if name in all_uuid:
12146 self.wanted.append(name)
12147 elif name in name_to_uuid:
12148 self.wanted.append(name_to_uuid[name])
12149 else:
12150 missing.append(name)
12152 if missing:
12153 raise errors.OpPrereqError("Some groups do not exist: %s" %
12154 utils.CommaJoin(missing),
12155 errors.ECODE_NOENT)
12157 def DeclareLocks(self, lu, level):
12158 pass
12160 def _GetQueryData(self, lu):
12161 """Computes the list of node groups and their attributes.
12164 do_nodes = query.GQ_NODE in self.requested_data
12165 do_instances = query.GQ_INST in self.requested_data
12167 group_to_nodes = None
12168 group_to_instances = None
12170 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12171 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12172 # latter GetAllInstancesInfo() is not enough, for we have to go through
12173 # instance->node. Hence, we will need to process nodes even if we only need
12174 # instance information.
12175 if do_nodes or do_instances:
12176 all_nodes = lu.cfg.GetAllNodesInfo()
12177 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12178 node_to_group = {}
12180 for node in all_nodes.values():
12181 if node.group in group_to_nodes:
12182 group_to_nodes[node.group].append(node.name)
12183 node_to_group[node.name] = node.group
12185 if do_instances:
12186 all_instances = lu.cfg.GetAllInstancesInfo()
12187 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12189 for instance in all_instances.values():
12190 node = instance.primary_node
12191 if node in node_to_group:
12192 group_to_instances[node_to_group[node]].append(instance.name)
12194 if not do_nodes:
12195 # Do not pass on node information if it was not requested.
12196 group_to_nodes = None
12198 return query.GroupQueryData([self._all_groups[uuid]
12199 for uuid in self.wanted],
12200 group_to_nodes, group_to_instances)
12203 class LUGroupQuery(NoHooksLU):
12204 """Logical unit for querying node groups.
12209 def CheckArguments(self):
12210 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12211 self.op.output_fields, False)
12213 def ExpandNames(self):
12214 self.gq.ExpandNames(self)
12216 def DeclareLocks(self, level):
12217 self.gq.DeclareLocks(self, level)
12219 def Exec(self, feedback_fn):
12220 return self.gq.OldStyleQuery(self)
12223 class LUGroupSetParams(LogicalUnit):
12224 """Modifies the parameters of a node group.
12227 HPATH = "group-modify"
12228 HTYPE = constants.HTYPE_GROUP
12231 def CheckArguments(self):
12232 all_changes = [
12233 self.op.ndparams,
12234 self.op.alloc_policy,
12235 ]
12237 if all_changes.count(None) == len(all_changes):
12238 raise errors.OpPrereqError("Please pass at least one modification",
12239 errors.ECODE_INVAL)
12241 def ExpandNames(self):
12242 # This raises errors.OpPrereqError on its own:
12243 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12245 self.needed_locks = {
12246 locking.LEVEL_NODEGROUP: [self.group_uuid],
12249 def CheckPrereq(self):
12250 """Check prerequisites.
12253 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12255 if self.group is None:
12256 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12257 (self.op.group_name, self.group_uuid))
12259 if self.op.ndparams:
12260 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12261 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12262 self.new_ndparams = new_ndparams
12264 def BuildHooksEnv(self):
12265 """Build hooks env.
12268 return {
12269 "GROUP_NAME": self.op.group_name,
12270 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12271 }
12273 def BuildHooksNodes(self):
12274 """Build hooks nodes.
12277 mn = self.cfg.GetMasterNode()
12278 return ([mn], [mn])
12280 def Exec(self, feedback_fn):
12281 """Modifies the node group.
12285 result = []
12286 if self.op.ndparams:
12287 self.group.ndparams = self.new_ndparams
12288 result.append(("ndparams", str(self.group.ndparams)))
12290 if self.op.alloc_policy:
12291 self.group.alloc_policy = self.op.alloc_policy
12293 self.cfg.Update(self.group, feedback_fn)
12295 return result
12297 class LUGroupRemove(LogicalUnit):
12298 HPATH = "group-remove"
12299 HTYPE = constants.HTYPE_GROUP
12302 def ExpandNames(self):
12303 # This raises errors.OpPrereqError on its own:
12304 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12305 self.needed_locks = {
12306 locking.LEVEL_NODEGROUP: [self.group_uuid],
12309 def CheckPrereq(self):
12310 """Check prerequisites.
12312 This checks that the given group name exists as a node group, that is
12313 empty (i.e., contains no nodes), and that is not the last group of the
12317 # Verify that the group is empty.
12318 group_nodes = [node.name
12319 for node in self.cfg.GetAllNodesInfo().values()
12320 if node.group == self.group_uuid]
12322 if group_nodes:
12323 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12324 " nodes: %s" %
12325 (self.op.group_name,
12326 utils.CommaJoin(utils.NiceSort(group_nodes))),
12327 errors.ECODE_STATE)
12329 # Verify the cluster would not be left group-less.
12330 if len(self.cfg.GetNodeGroupList()) == 1:
12331 raise errors.OpPrereqError("Group '%s' is the only group,"
12332 " cannot be removed" %
12333 self.op.group_name,
12334 errors.ECODE_STATE)
12336 def BuildHooksEnv(self):
12337 """Build hooks env.
12340 return {
12341 "GROUP_NAME": self.op.group_name,
12342 }
12344 def BuildHooksNodes(self):
12345 """Build hooks nodes.
12348 mn = self.cfg.GetMasterNode()
12349 return ([mn], [mn])
12351 def Exec(self, feedback_fn):
12352 """Remove the node group.
12355 try:
12356 self.cfg.RemoveNodeGroup(self.group_uuid)
12357 except errors.ConfigurationError:
12358 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12359 (self.op.group_name, self.group_uuid))
12361 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12364 class LUGroupRename(LogicalUnit):
12365 HPATH = "group-rename"
12366 HTYPE = constants.HTYPE_GROUP
12369 def ExpandNames(self):
12370 # This raises errors.OpPrereqError on its own:
12371 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12373 self.needed_locks = {
12374 locking.LEVEL_NODEGROUP: [self.group_uuid],
12377 def CheckPrereq(self):
12378 """Check prerequisites.
12380 Ensures requested new name is not yet used.
12383 try:
12384 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12385 except errors.OpPrereqError:
12386 pass
12387 else:
12388 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12389 " node group (UUID: %s)" %
12390 (self.op.new_name, new_name_uuid),
12391 errors.ECODE_EXISTS)
12393 def BuildHooksEnv(self):
12394 """Build hooks env.
12397 return {
12398 "OLD_NAME": self.op.group_name,
12399 "NEW_NAME": self.op.new_name,
12400 }
12402 def BuildHooksNodes(self):
12403 """Build hooks nodes.
12406 mn = self.cfg.GetMasterNode()
12408 all_nodes = self.cfg.GetAllNodesInfo()
12409 all_nodes.pop(mn, None)
12411 run_nodes = [mn]
12412 run_nodes.extend(node.name for node in all_nodes.values()
12413 if node.group == self.group_uuid)
12415 return (run_nodes, run_nodes)
12417 def Exec(self, feedback_fn):
12418 """Rename the node group.
12421 group = self.cfg.GetNodeGroup(self.group_uuid)
12423 if group is None:
12424 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12425 (self.op.group_name, self.group_uuid))
12427 group.name = self.op.new_name
12428 self.cfg.Update(group, feedback_fn)
12430 return self.op.new_name
12433 class LUGroupEvacuate(LogicalUnit):
12434 HPATH = "group-evacuate"
12435 HTYPE = constants.HTYPE_GROUP
12438 def ExpandNames(self):
12439 # This raises errors.OpPrereqError on its own:
12440 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12442 if self.op.target_groups:
12443 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12444 self.op.target_groups)
12445 else:
12446 self.req_target_uuids = []
12448 if self.group_uuid in self.req_target_uuids:
12449 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12450 " as a target group (targets are %s)" %
12451 (self.group_uuid,
12452 utils.CommaJoin(self.req_target_uuids)),
12453 errors.ECODE_INVAL)
12455 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12457 self.share_locks = _ShareAll()
12458 self.needed_locks = {
12459 locking.LEVEL_INSTANCE: [],
12460 locking.LEVEL_NODEGROUP: [],
12461 locking.LEVEL_NODE: [],
12464 def DeclareLocks(self, level):
12465 if level == locking.LEVEL_INSTANCE:
12466 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12468 # Lock instances optimistically, needs verification once node and group
12469 # locks have been acquired
12470 self.needed_locks[locking.LEVEL_INSTANCE] = \
12471 self.cfg.GetNodeGroupInstances(self.group_uuid)
12473 elif level == locking.LEVEL_NODEGROUP:
12474 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12476 if self.req_target_uuids:
12477 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12478 else:
12479 # Lock all groups used by instances optimistically; this requires going
12480 # via the node before it's locked, requiring verification later on
12481 lock_groups.update(group_uuid
12482 for instance_name in
12483 self.owned_locks(locking.LEVEL_INSTANCE)
12484 for group_uuid in
12485 self.cfg.GetInstanceNodeGroups(instance_name))
12486 else:
12487 # No target groups, need to lock all of them
12488 lock_groups = locking.ALL_SET
12490 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12492 elif level == locking.LEVEL_NODE:
12493 # This will only lock the nodes in the group to be evacuated which
12494 # contain actual instances
12495 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12496 self._LockInstancesNodes()
12498 # Lock all nodes in group to be evacuated and target groups
12499 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12500 assert self.group_uuid in owned_groups
12501 member_nodes = [node_name
12502 for group in owned_groups
12503 for node_name in self.cfg.GetNodeGroup(group).members]
12504 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12506 def CheckPrereq(self):
12507 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12508 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12509 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12511 assert owned_groups.issuperset(self.req_target_uuids)
12512 assert self.group_uuid in owned_groups
12514 # Check if locked instances are still correct
12515 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12517 # Get instance information
12518 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12520 # Check if node groups for locked instances are still correct
12521 for instance_name in owned_instances:
12522 inst = self.instances[instance_name]
12523 assert owned_nodes.issuperset(inst.all_nodes), \
12524 "Instance %s's nodes changed while we kept the lock" % instance_name
12526 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
12527 owned_groups)
12529 assert self.group_uuid in inst_groups, \
12530 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
12532 if self.req_target_uuids:
12533 # User requested specific target groups
12534 self.target_uuids = self.req_target_uuids
12535 else:
12536 # All groups except the one to be evacuated are potential targets
12537 self.target_uuids = [group_uuid for group_uuid in owned_groups
12538 if group_uuid != self.group_uuid]
12540 if not self.target_uuids:
12541 raise errors.OpPrereqError("There are no possible target groups",
12542 errors.ECODE_INVAL)
12544 def BuildHooksEnv(self):
12545 """Build hooks env.
12548 return {
12549 "GROUP_NAME": self.op.group_name,
12550 "TARGET_GROUPS": " ".join(self.target_uuids),
12551 }
12553 def BuildHooksNodes(self):
12554 """Build hooks nodes.
12557 mn = self.cfg.GetMasterNode()
12559 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12561 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12563 return (run_nodes, run_nodes)
12565 def Exec(self, feedback_fn):
12566 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12568 assert self.group_uuid not in self.target_uuids
12570 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12571 instances=instances, target_groups=self.target_uuids)
12573 ial.Run(self.op.iallocator)
12575 if not ial.success:
12576 raise errors.OpPrereqError("Can't compute group evacuation using"
12577 " iallocator '%s': %s" %
12578 (self.op.iallocator, ial.info),
12579 errors.ECODE_NORES)
12581 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12583 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12584 len(jobs), self.op.group_name)
12586 return ResultWithJobs(jobs)
12589 class TagsLU(NoHooksLU): # pylint: disable=W0223
12590 """Generic tags LU.
12592 This is an abstract class which is the parent of all the other tags LUs.
12595 def ExpandNames(self):
12596 self.group_uuid = None
12597 self.needed_locks = {}
12598 if self.op.kind == constants.TAG_NODE:
12599 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12600 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12601 elif self.op.kind == constants.TAG_INSTANCE:
12602 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12603 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12604 elif self.op.kind == constants.TAG_NODEGROUP:
12605 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12607 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12608 # not possible to acquire the BGL based on opcode parameters)
12610 def CheckPrereq(self):
12611 """Check prerequisites.
12614 if self.op.kind == constants.TAG_CLUSTER:
12615 self.target = self.cfg.GetClusterInfo()
12616 elif self.op.kind == constants.TAG_NODE:
12617 self.target = self.cfg.GetNodeInfo(self.op.name)
12618 elif self.op.kind == constants.TAG_INSTANCE:
12619 self.target = self.cfg.GetInstanceInfo(self.op.name)
12620 elif self.op.kind == constants.TAG_NODEGROUP:
12621 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12622 else:
12623 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12624 str(self.op.kind), errors.ECODE_INVAL)
12627 class LUTagsGet(TagsLU):
12628 """Returns the tags of a given object.
12633 def ExpandNames(self):
12634 TagsLU.ExpandNames(self)
12636 # Share locks as this is only a read operation
12637 self.share_locks = _ShareAll()
12639 def Exec(self, feedback_fn):
12640 """Returns the tag list.
12643 return list(self.target.GetTags())
12646 class LUTagsSearch(NoHooksLU):
12647 """Searches the tags for a given pattern.
12652 def ExpandNames(self):
12653 self.needed_locks = {}
12655 def CheckPrereq(self):
12656 """Check prerequisites.
12658 This checks the pattern passed for validity by compiling it.
12661 try:
12662 self.re = re.compile(self.op.pattern)
12663 except re.error, err:
12664 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12665 (self.op.pattern, err), errors.ECODE_INVAL)
12667 def Exec(self, feedback_fn):
12668 """Returns the tag list.
12671 cfg = self.cfg
12672 tgts = [("/cluster", cfg.GetClusterInfo())]
12673 ilist = cfg.GetAllInstancesInfo().values()
12674 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12675 nlist = cfg.GetAllNodesInfo().values()
12676 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12677 tgts.extend(("/nodegroup/%s" % n.name, n)
12678 for n in cfg.GetAllNodeGroupsInfo().values())
12679 results = []
12680 for path, target in tgts:
12681 for tag in target.GetTags():
12682 if self.re.search(tag):
12683 results.append((path, tag))
12685 return results
12687 class LUTagsSet(TagsLU):
12688 """Sets a tag on a given object.
12693 def CheckPrereq(self):
12694 """Check prerequisites.
12696 This checks the type and length of the tag name and value.
12699 TagsLU.CheckPrereq(self)
12700 for tag in self.op.tags:
12701 objects.TaggableObject.ValidateTag(tag)
12703 def Exec(self, feedback_fn):
12707 try:
12708 for tag in self.op.tags:
12709 self.target.AddTag(tag)
12710 except errors.TagError, err:
12711 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12712 self.cfg.Update(self.target, feedback_fn)
12715 class LUTagsDel(TagsLU):
12716 """Delete a list of tags from a given object.
12721 def CheckPrereq(self):
12722 """Check prerequisites.
12724 This checks that we have the given tag.
12727 TagsLU.CheckPrereq(self)
12728 for tag in self.op.tags:
12729 objects.TaggableObject.ValidateTag(tag)
12730 del_tags = frozenset(self.op.tags)
12731 cur_tags = self.target.GetTags()
12733 diff_tags = del_tags - cur_tags
12734 if diff_tags:
12735 diff_names = ("'%s'" % i for i in sorted(diff_tags))
12736 raise errors.OpPrereqError("Tag(s) %s not found" %
12737 (utils.CommaJoin(diff_names), ),
12738 errors.ECODE_NOENT)
12740 def Exec(self, feedback_fn):
12741 """Remove the tag from the object.
12744 for tag in self.op.tags:
12745 self.target.RemoveTag(tag)
12746 self.cfg.Update(self.target, feedback_fn)
12749 class LUTestDelay(NoHooksLU):
12750 """Sleep for a specified amount of time.
12752 This LU sleeps on the master and/or nodes for a specified amount of
12758 def ExpandNames(self):
12759 """Expand names and set required locks.
12761 This expands the node list, if any.
12764 self.needed_locks = {}
12765 if self.op.on_nodes:
12766 # _GetWantedNodes can be used here, but is not always appropriate to use
12767 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
12768 # more information.
12769 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
12770 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
12772 def _TestDelay(self):
12773 """Do the actual sleep.
12776 if self.op.on_master:
12777 if not utils.TestDelay(self.op.duration):
12778 raise errors.OpExecError("Error during master delay test")
12779 if self.op.on_nodes:
12780 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
12781 for node, node_result in result.items():
12782 node_result.Raise("Failure during rpc call to node %s" % node)
12784 def Exec(self, feedback_fn):
12785 """Execute the test delay opcode, with the wanted repetitions.
12788 if self.op.repeat == 0:
12789 self._TestDelay()
12790 else:
12791 top_value = self.op.repeat - 1
12792 for i in range(self.op.repeat):
12793 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
12794 self._TestDelay()
12797 class LUTestJqueue(NoHooksLU):
12798 """Utility LU to test some aspects of the job queue.
12803 # Must be lower than default timeout for WaitForJobChange to see whether it
12804 # notices changed jobs
12805 _CLIENT_CONNECT_TIMEOUT = 20.0
12806 _CLIENT_CONFIRM_TIMEOUT = 60.0
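# These timeouts bound the two phases of the notification handshake used by
# _NotifyUsingSocket below: first waiting for the test client to connect to
# the temporary socket, then waiting for it to confirm by closing its end.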
12808 @classmethod
12809 def _NotifyUsingSocket(cls, cb, errcls):
12810 """Opens a Unix socket and waits for another program to connect.
12813 @param cb: Callback to send socket name to client
12814 @type errcls: class
12815 @param errcls: Exception class to use for errors
12818 # Using a temporary directory as there's no easy way to create temporary
12819 # sockets without writing a custom loop around tempfile.mktemp and
12821 tmpdir = tempfile.mkdtemp()
12822 try:
12823 tmpsock = utils.PathJoin(tmpdir, "sock")
12825 logging.debug("Creating temporary socket at %s", tmpsock)
12826 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
12827 try:
12828 sock.bind(tmpsock)
12829 sock.listen(1)
12831 # Send details to client
12832 cb(tmpsock)
12834 # Wait for client to connect before continuing
12835 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
12836 try:
12837 (conn, _) = sock.accept()
12838 except socket.error, err:
12839 raise errcls("Client didn't connect in time (%s)" % err)
12840 finally:
12841 sock.close()
12842 finally:
12843 # Remove as soon as client is connected
12844 shutil.rmtree(tmpdir)
12846 # Wait for client to close
12847 try:
12848 try:
12849 # pylint: disable=E1101
12850 # Instance of '_socketobject' has no ... member
12851 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
12852 conn.recv(1)
12853 except socket.error, err:
12854 raise errcls("Client failed to confirm notification (%s)" % err)
12855 finally:
12856 conn.close()
12858 def _SendNotification(self, test, arg, sockname):
12859 """Sends a notification to the client.
12862 @param test: Test name
12863 @param arg: Test argument (depends on test)
12864 @type sockname: string
12865 @param sockname: Socket path
12868 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
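# The notification is emitted as a regular LU log entry with the special
# ELOG_JQUEUE_TEST type; the test client is expected to observe it through
# the job's log messages (cf. the WaitForJobChange note above).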
12870 def _Notify(self, prereq, test, arg):
12871 """Notifies the client of a test.
12874 @param prereq: Whether this is a prereq-phase test
12876 @param test: Test name
12877 @param arg: Test argument (depends on test)
12880 if prereq:
12881 errcls = errors.OpPrereqError
12882 else:
12883 errcls = errors.OpExecError
12885 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
12886 test, arg),
12887 errcls)
12889 def CheckArguments(self):
12890 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
12891 self.expandnames_calls = 0
12893 def ExpandNames(self):
12894 checkargs_calls = getattr(self, "checkargs_calls", 0)
12895 if checkargs_calls < 1:
12896 raise errors.ProgrammerError("CheckArguments was not called")
12898 self.expandnames_calls += 1
12900 if self.op.notify_waitlock:
12901 self._Notify(True, constants.JQT_EXPANDNAMES, None)
12903 self.LogInfo("Expanding names")
12905 # Get lock on master node (just to get a lock, not for a particular reason)
12906 self.needed_locks = {
12907 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
12910 def Exec(self, feedback_fn):
12911 if self.expandnames_calls < 1:
12912 raise errors.ProgrammerError("ExpandNames was not called")
12914 if self.op.notify_exec:
12915 self._Notify(False, constants.JQT_EXEC, None)
12917 self.LogInfo("Executing")
12919 if self.op.log_messages:
12920 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
12921 for idx, msg in enumerate(self.op.log_messages):
12922 self.LogInfo("Sending log message %s", idx + 1)
12923 feedback_fn(constants.JQT_MSGPREFIX + msg)
12924 # Report how many test messages have been sent
12925 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
12927 if self.op.fail:
12928 raise errors.OpExecError("Opcode failure was requested")
12930 return True
12933 class IAllocator(object):
12934 """IAllocator framework.
12936 An IAllocator instance has three sets of attributes:
12937 - cfg that is needed to query the cluster
12938 - input data (all members of the _KEYS class attribute are required)
12939 - four buffer attributes (in|out_data|text), that represent the
12940 input (to the external script) in text and data structure format,
12941 and the output from it, again in two formats
12942 - the result variables from the script (success, info, nodes) for
12943 easy usage
12946 # pylint: disable=R0902
12947 # lots of instance attributes
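# Typical usage, as seen in LUInstanceChangeGroup.Exec and
# LUGroupEvacuate.Exec above: build a request for one of the supported
# modes, run the chosen allocator script and inspect the result, e.g.:
#   ial = IAllocator(cfg, rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
#                    instances=instances, target_groups=target_groups)
#   ial.Run(op.iallocator)
#   if not ial.success:
#     raise errors.OpPrereqError(...)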
12949 def __init__(self, cfg, rpc, mode, **kwargs):
12950 self.cfg = cfg
12951 self.rpc = rpc
12952 # init buffer variables
12953 self.in_text = self.out_text = self.in_data = self.out_data = None
12954 # init all input fields so that pylint is happy
12955 self.mode = mode
12956 self.memory = self.disks = self.disk_template = None
12957 self.os = self.tags = self.nics = self.vcpus = None
12958 self.hypervisor = None
12959 self.relocate_from = None
12961 self.instances = None
12962 self.evac_mode = None
12963 self.target_groups = []
12965 self.required_nodes = None
12966 # init result fields
12967 self.success = self.info = self.result = None
12970 (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
12972 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
12973 " IAllocator" % self.mode)
12975 keyset = [n for (n, _) in keydata]
12978 if key not in keyset:
12979 raise errors.ProgrammerError("Invalid input parameter '%s' to"
12980 " IAllocator" % key)
12981 setattr(self, key, kwargs[key])
12984 if key not in kwargs:
12985 raise errors.ProgrammerError("Missing input parameter '%s' to"
12986 " IAllocator" % key)
12987 self._BuildInputData(compat.partial(fn, self), keydata)
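
  # Illustrative sketch only: how a caller (typically an LU) drives this class
  # for an allocation request. The literal values are invented; the keyword
  # arguments are the ones required by _MODE_DATA for IALLOCATOR_MODE_ALLOC,
  # and "hail" is just an example allocator script name.
  #
  #   ial = IAllocator(self.cfg, self.rpc,
  #                    mode=constants.IALLOCATOR_MODE_ALLOC,
  #                    name="instance1.example.com",
  #                    memory=1024, disks=[{"size": 10240, "mode": "rw"}],
  #                    disk_template="drbd", os="debian-image",
  #                    tags=[], nics=[{}], vcpus=1, hypervisor="xen-pvm")
  #   ial.Run("hail")  # runs the external script and validates its output
  #   if not ial.success:
  #     raise errors.OpPrereqError(ial.info, errors.ECODE_NORES)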

  def _ComputeClusterData(self):
    """Compute the generic allocator input data.

    This is the data that is independent of the actual operation.

    """
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    # cluster data
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
      "cluster_tags": list(cluster_info.GetTags()),
      "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
      # we don't have job IDs
      }
    ninfo = cfg.GetAllNodesInfo()
    iinfo = cfg.GetAllInstancesInfo().values()
    i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]

    # node data
    node_list = [n.name for n in ninfo.values() if n.vm_capable]

    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      hypervisor_name = self.hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
    else:
      hypervisor_name = cluster_info.enabled_hypervisors[0]

    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)

    data["nodegroups"] = self._ComputeNodeGroupData(cfg)

    config_ndata = self._ComputeBasicNodeData(ninfo)
    data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
                                                 i_list, config_ndata)
    assert len(data["nodes"]) == len(ninfo), \
        "Incomplete node data computed"

    data["instances"] = self._ComputeInstanceData(cluster_info, i_list)

    self.in_data = data
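
  # For reference (illustrative, abridged): after _ComputeClusterData the
  # in_data dictionary has roughly this shape; all values below are invented
  # and "<group-uuid>" is a placeholder.
  #
  #   {
  #     "version": 2,
  #     "cluster_name": "cluster.example.com",
  #     "cluster_tags": [],
  #     "enabled_hypervisors": ["xen-pvm"],
  #     "nodegroups": {"<group-uuid>": {"name": "default",
  #                                     "alloc_policy": "preferred"}},
  #     "nodes": {...},      # per-node dicts, see _Compute*NodeData below
  #     "instances": {...},  # per-instance dicts, see _ComputeInstanceData
  #     "request": {...},    # added later by _BuildInputData
  #   }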

  @staticmethod
  def _ComputeNodeGroupData(cfg):
    """Compute node groups data.

    """
    ng = dict((guuid, {
      "name": gdata.name,
      "alloc_policy": gdata.alloc_policy,
      })
      for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())

    return ng

  @staticmethod
  def _ComputeBasicNodeData(node_cfg):
    """Compute global node data.

    @rtype: dict
    @returns: a dict of node name: node data dict, with the static
        (config-based) values only

    """
    # fill in static (config-based) values
    node_results = dict((ninfo.name, {
      "tags": list(ninfo.GetTags()),
      "primary_ip": ninfo.primary_ip,
      "secondary_ip": ninfo.secondary_ip,
      "offline": ninfo.offline,
      "drained": ninfo.drained,
      "master_candidate": ninfo.master_candidate,
      "group": ninfo.group,
      "master_capable": ninfo.master_capable,
      "vm_capable": ninfo.vm_capable,
      })
      for ninfo in node_cfg.values())

    return node_results

  @staticmethod
  def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
                              node_results):
    """Compute global dynamic node data.

    @param node_results: the basic node structures as filled from the config

    """
    # make a copy of the current dict
    node_results = dict(node_results)
    for nname, nresult in node_data.items():
      assert nname in node_results, "Missing basic data for node %s" % nname
      ninfo = node_cfg[nname]

      if not (ninfo.offline or ninfo.drained):
        nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
        remote_info = nresult.payload

        for attr in ["memory_total", "memory_free", "memory_dom0",
                     "vg_size", "vg_free", "cpu_total"]:
          if attr not in remote_info:
            raise errors.OpExecError("Node '%s' didn't return attribute"
                                     " '%s'" % (nname, attr))
          if not isinstance(remote_info[attr], int):
            raise errors.OpExecError("Node '%s' returned invalid value"
                                     " for '%s': %s" %
                                     (nname, attr, remote_info[attr]))
        # compute memory used by primary instances
        i_p_mem = i_p_up_mem = 0
        for iinfo, beinfo in i_list:
          if iinfo.primary_node == nname:
            i_p_mem += beinfo[constants.BE_MEMORY]
            if iinfo.name not in node_iinfo[nname].payload:
              i_used_mem = 0
            else:
              i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
            i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
            remote_info["memory_free"] -= max(0, i_mem_diff)
            if iinfo.admin_up:
              i_p_up_mem += beinfo[constants.BE_MEMORY]

        # compute memory used by instances
        pnr_dyn = {
          "total_memory": remote_info["memory_total"],
          "reserved_memory": remote_info["memory_dom0"],
          "free_memory": remote_info["memory_free"],
          "total_disk": remote_info["vg_size"],
          "free_disk": remote_info["vg_free"],
          "total_cpus": remote_info["cpu_total"],
          "i_pri_memory": i_p_mem,
          "i_pri_up_memory": i_p_up_mem,
          }
        pnr_dyn.update(node_results[nname])
        node_results[nname] = pnr_dyn

    return node_results
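
  # Illustrative example of one entry in the resulting "nodes" dict for an
  # online, vm_capable node (all numbers invented, "<group-uuid>" is a
  # placeholder): the static keys come from _ComputeBasicNodeData, the dynamic
  # ones are added above.
  #
  #   "node1.example.com": {
  #     "tags": [], "primary_ip": "192.0.2.1", "secondary_ip": "198.51.100.1",
  #     "offline": False, "drained": False, "master_candidate": True,
  #     "group": "<group-uuid>", "master_capable": True, "vm_capable": True,
  #     "total_memory": 32768, "reserved_memory": 1024, "free_memory": 20480,
  #     "total_disk": 512000, "free_disk": 256000, "total_cpus": 8,
  #     "i_pri_memory": 8192, "i_pri_up_memory": 6144,
  #   }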

  @staticmethod
  def _ComputeInstanceData(cluster_info, i_list):
    """Compute global instance data.

    """
    instance_data = {}
    for iinfo, beinfo in i_list:
      nic_data = []
      for nic in iinfo.nics:
        filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
        nic_dict = {
          "mode": filled_params[constants.NIC_MODE],
          "link": filled_params[constants.NIC_LINK],
          }
        if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          nic_dict["bridge"] = filled_params[constants.NIC_LINK]
        nic_data.append(nic_dict)
      pir = {
        "tags": list(iinfo.GetTags()),
        "admin_up": iinfo.admin_up,
        "vcpus": beinfo[constants.BE_VCPUS],
        "memory": beinfo[constants.BE_MEMORY],
        "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
        "nics": nic_data,
        "disks": [{constants.IDISK_SIZE: dsk.size,
                   constants.IDISK_MODE: dsk.mode}
                  for dsk in iinfo.disks],
        "disk_template": iinfo.disk_template,
        "hypervisor": iinfo.hypervisor,
        }
      pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
                                                 pir["disks"])
      instance_data[iinfo.name] = pir

    return instance_data
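
  # Illustrative example of one entry in the "instances" dict (values made up):
  #
  #   "inst1.example.com": {
  #     "tags": [], "admin_up": True, "vcpus": 2, "memory": 1024,
  #     "nodes": ["node1.example.com", "node2.example.com"],
  #     "nics": [{"mode": "bridged", "link": "xen-br0", "bridge": "xen-br0"}],
  #     "disks": [{"size": 10240, "mode": "rw"}],
  #     "disk_template": "drbd", "hypervisor": "xen-pvm",
  #     "disk_space_total": 10368,
  #   }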

  def _AddNewInstance(self):
    """Add new instance data to allocator structure.

    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    disk_space = _ComputeDiskSize(self.disk_template, self.disks)

    if self.disk_template in constants.DTS_INT_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1

    request = {
      "name": self.name,
      "disk_template": self.disk_template,
      "tags": self.tags,
      "os": self.os,
      "vcpus": self.vcpus,
      "memory": self.memory,
      "disks": self.disks,
      "disk_space_total": disk_space,
      "nics": self.nics,
      "required_nodes": self.required_nodes,
      "hypervisor": self.hypervisor,
      }
    return request

  def _AddRelocateInstance(self):
    """Add relocate instance data to allocator structure.

    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    instance = self.cfg.GetInstanceInfo(self.name)
    if instance is None:
      raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                   " IAllocator" % self.name)

    if instance.disk_template not in constants.DTS_MIRRORED:
      raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                                 errors.ECODE_INVAL)

    if instance.disk_template in constants.DTS_INT_MIRROR and \
        len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)

    self.required_nodes = 1
    disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
    disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)

    request = {
      "name": self.name,
      "disk_space_total": disk_space,
      "required_nodes": self.required_nodes,
      "relocate_from": self.relocate_from,
      }
    return request

  def _AddNodeEvacuate(self):
    """Get data for node-evacuate requests.

    """
    return {
      "instances": self.instances,
      "evac_mode": self.evac_mode,
      }

  def _AddChangeGroup(self):
    """Get data for group-change requests.

    """
    return {
      "instances": self.instances,
      "target_groups": self.target_groups,
      }

  def _BuildInputData(self, fn, keydata):
    """Build input data structures.

    """
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
    for keyname, keytype in keydata:
      if keyname not in request:
        raise errors.ProgrammerError("Request parameter %s is missing" %
                                     keyname)
      val = request[keyname]
      if not keytype(val):
        raise errors.ProgrammerError("Request parameter %s doesn't pass"
                                     " validation, value %s, expected"
                                     " type %s" % (keyname, val, keytype))
    self.in_data["request"] = request

    self.in_text = serializer.Dump(self.in_data)
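
  # Illustrative sketch of the final in_text handed to the iallocator script
  # for a relocation request (abridged, values invented): it is the cluster
  # data from _ComputeClusterData plus the mode-specific "request" dict with
  # its "type" key added above.
  #
  #   {
  #     "version": 2, "cluster_name": "cluster.example.com",
  #     "nodegroups": {...}, "nodes": {...}, "instances": {...},
  #     "request": {
  #       "type": "relocate",
  #       "name": "inst1.example.com",
  #       "disk_space_total": 10368,
  #       "required_nodes": 1,
  #       "relocate_from": ["node2.example.com"],
  #     },
  #   }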

  _STRING_LIST = ht.TListOf(ht.TString)
  _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
     # pylint: disable=E1101
     # Class '...' has no 'OP_ID' member
     "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
                          opcodes.OpInstanceMigrate.OP_ID,
                          opcodes.OpInstanceReplaceDisks.OP_ID])
     })))

  _NEVAC_MOVED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(3),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TNonEmptyString,
                                  ht.TListOf(ht.TNonEmptyString),
                                  ])))
  _NEVAC_FAILED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(2),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TMaybeString,
                                  ])))
  _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
                          ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
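
  # Illustrative example of a value accepted by _NEVAC_RESULT (all names and
  # values invented): a list of moved instances, a list of failed instances,
  # and a list of job sets containing only the whitelisted opcodes above.
  #
  #   [
  #     [["inst1.example.com", "group2", ["node3.example.com"]]],
  #     [["inst2.example.com", "not enough memory"]],
  #     [[{"OP_ID": "OP_INSTANCE_MIGRATE",
  #        "instance_name": "inst1.example.com"}]],
  #   ]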

  _MODE_DATA = {
    constants.IALLOCATOR_MODE_ALLOC:
      (_AddNewInstance, [
        ("name", ht.TString),
        ("memory", ht.TInt),
        ("disks", ht.TListOf(ht.TDict)),
        ("disk_template", ht.TString),
        ("os", ht.TString),
        ("tags", _STRING_LIST),
        ("nics", ht.TListOf(ht.TDict)),
        ("vcpus", ht.TInt),
        ("hypervisor", ht.TString),
        ], ht.TList),
    constants.IALLOCATOR_MODE_RELOC:
      (_AddRelocateInstance,
       [("name", ht.TString), ("relocate_from", _STRING_LIST)],
       ht.TList),
    constants.IALLOCATOR_MODE_NODE_EVAC:
      (_AddNodeEvacuate, [
        ("instances", _STRING_LIST),
        ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
        ], _NEVAC_RESULT),
    constants.IALLOCATOR_MODE_CHG_GROUP:
      (_AddChangeGroup, [
        ("instances", _STRING_LIST),
        ("target_groups", _STRING_LIST),
        ], _NEVAC_RESULT),
    }

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()

  def _ValidateResult(self):
    """Process the allocator results.

    This will process and if successful save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not self._result_check(self.result):
      raise errors.OpExecError("Iallocator returned invalid result,"
                               " expected %s, got %s" %
                               (self._result_check, self.result),
                               errors.ECODE_INVAL)

    if self.mode == constants.IALLOCATOR_MODE_RELOC:
      assert self.relocate_from is not None
      assert self.required_nodes == 1

      node2group = dict((name, ndata["group"])
                        for (name, ndata) in self.in_data["nodes"].items())

      fn = compat.partial(self._NodesToGroups, node2group,
                          self.in_data["nodegroups"])

      instance = self.cfg.GetInstanceInfo(self.name)
      request_groups = fn(self.relocate_from + [instance.primary_node])
      result_groups = fn(rdict["result"] + [instance.primary_node])

      if self.success and not set(result_groups).issubset(request_groups):
        raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
                                 " differ from original groups (%s)" %
                                 (utils.CommaJoin(result_groups),
                                  utils.CommaJoin(request_groups)))

    elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES

    self.out_data = rdict
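
  # Illustrative example of a minimal result accepted for an allocation
  # request (values invented): the script must return a JSON object with
  # "success", "info" and "result" keys, e.g.
  #
  #   {"success": true, "info": "allocation successful",
  #    "result": ["node1.example.com", "node2.example.com"]}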

  @staticmethod
  def _NodesToGroups(node2group, groups, nodes):
    """Returns a list of unique group names for a list of nodes.

    @type node2group: dict
    @param node2group: Map from node name to group UUID
    @param groups: Group information
    @param nodes: Node names

    """
    result = set()
    for node in nodes:
      try:
        group_uuid = node2group[node]
      except KeyError:
        # Ignore unknown node
        continue
      try:
        group = groups[group_uuid]
      except KeyError:
        # Can't find group, let's use UUID
        group_name = group_uuid
      else:
        group_name = group["name"]
      result.add(group_name)

    return sorted(result)
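
  # Illustrative usage of _NodesToGroups (data made up): unknown nodes are
  # skipped and unknown group UUIDs fall back to the UUID itself.
  #
  #   node2group = {"node1": "uuid-a", "node2": "uuid-b"}
  #   groups = {"uuid-a": {"name": "default"}}
  #   IAllocator._NodesToGroups(node2group, groups, ["node1", "node2", "node9"])
  #   # => ["default", "uuid-b"]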

class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["memory", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            constants.IDISK_SIZE not in row or
            not isinstance(row[constants.IDISK_SIZE], int) or
            constants.IDISK_MODE not in row or
            row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = \
        list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
    elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
                          constants.IALLOCATOR_MODE_NODE_EVAC):
      if not self.op.instances:
        raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
      self.op.instances = _GetWantedInstances(self, self.op.instances)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)

    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       memory=self.op.memory,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       target_groups=self.op.target_groups)
    elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       evac_mode=self.op.evac_mode)
    else:
      raise errors.ProgrammerError("Unhandled mode %s in"
                                   " LUTestAllocator.Exec", self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result

#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  constants.QR_OS: _OsQuery,
  }

assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP


def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_VIA_OP}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)
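
# Illustrative usage (no new behaviour): resolving the query implementation
# for an opcode-driven query; an unknown resource name raises OpPrereqError.
#
#   impl = _GetQueryImplementation(constants.QR_NODE)   # -> _NodeQuery
#   impl = _GetQueryImplementation("no-such-resource")  # raises OpPrereqError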