4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
44 from ganeti import ssh
45 from ganeti import utils
46 from ganeti import errors
47 from ganeti import hypervisor
48 from ganeti import locking
49 from ganeti import constants
50 from ganeti import objects
51 from ganeti import serializer
52 from ganeti import ssconf
53 from ganeti import uidpool
54 from ganeti import compat
55 from ganeti import masterd
56 from ganeti import netutils
57 from ganeti import query
58 from ganeti import qlang
59 from ganeti import opcodes
61 from ganeti import runtime
63 import ganeti.masterd.instance # pylint: disable=W0611
67 """Data container for LU results with jobs.
69 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
70 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
71 contained in the C{jobs} attribute and include the job IDs in the opcode
75 def __init__(self, jobs, **kwargs):
76 """Initializes this class.
78 Additional return values can be specified as keyword arguments.
80   @type jobs: list of lists of L{opcodes.OpCode}
81 @param jobs: A list of lists of opcode objects
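# Illustrative sketch, not part of the original module: an LU's Exec could hand
# follow-up jobs back to the processor roughly like this (the opcode chosen
# here is only an example):
#
#   def Exec(self, feedback_fn):
#     jobs = [[opcodes.OpClusterVerifyConfig()]]
#     return ResultWithJobs(jobs)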
88 class LogicalUnit(object):
89 """Logical Unit base class.
91 Subclasses must follow these rules:
92 - implement ExpandNames
93 - implement CheckPrereq (except when tasklets are used)
94 - implement Exec (except when tasklets are used)
95 - implement BuildHooksEnv
96 - implement BuildHooksNodes
97 - redefine HPATH and HTYPE
98 - optionally redefine their run requirements:
99 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
101 Note that all commands require root permissions.
103 @ivar dry_run_result: the value (if any) that will be returned to the caller
104 in dry-run mode (signalled by opcode dry_run parameter)
111 def __init__(self, processor, op, context, rpc):
112 """Constructor for LogicalUnit.
114 This needs to be overridden in derived classes in order to check op
118 self.proc = processor
120 self.cfg = context.cfg
121 self.glm = context.glm
123 self.owned_locks = context.glm.list_owned
124 self.context = context
126 # Dicts used to declare locking needs to mcpu
127 self.needed_locks = None
128 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
130 self.remove_locks = {}
131 # Used to force good behavior when calling helper functions
132 self.recalculate_locks = {}
134 self.Log = processor.Log # pylint: disable=C0103
135 self.LogWarning = processor.LogWarning # pylint: disable=C0103
136 self.LogInfo = processor.LogInfo # pylint: disable=C0103
137 self.LogStep = processor.LogStep # pylint: disable=C0103
138 # support for dry-run
139 self.dry_run_result = None
140 # support for generic debug attribute
141 if (not hasattr(self.op, "debug_level") or
142 not isinstance(self.op.debug_level, int)):
143 self.op.debug_level = 0
148 # Validate opcode parameters and set defaults
149 self.op.Validate(True)
151 self.CheckArguments()
153 def CheckArguments(self):
154 """Check syntactic validity for the opcode arguments.
156     This method is for doing a simple syntactic check and ensuring the
157 validity of opcode parameters, without any cluster-related
158 checks. While the same can be accomplished in ExpandNames and/or
159     CheckPrereq, doing these separately is better because:
161       - ExpandNames is left as purely a lock-related function
162 - CheckPrereq is run after we have acquired locks (and possible
165 The function is allowed to change the self.op attribute so that
166     later methods don't need to worry about missing parameters.
171 def ExpandNames(self):
172 """Expand names for this LU.
174 This method is called before starting to execute the opcode, and it should
175 update all the parameters of the opcode to their canonical form (e.g. a
176 short node name must be fully expanded after this method has successfully
177 completed). This way locking, hooks, logging, etc. can work correctly.
179 LUs which implement this method must also populate the self.needed_locks
180 member, as a dict with lock levels as keys, and a list of needed lock names
183 - use an empty dict if you don't need any lock
184       - if you don't need any lock at a particular level, omit that level
185 - don't put anything for the BGL level
186 - if you want all locks at a level use locking.ALL_SET as a value
188 If you need to share locks (rather than acquire them exclusively) at one
189 level you can modify self.share_locks, setting a true value (usually 1) for
190 that level. By default locks are not shared.
192 This function can also define a list of tasklets, which then will be
193 executed in order instead of the usual LU-level CheckPrereq and Exec
194 functions, if those are not defined by the LU.
198 # Acquire all nodes and one instance
199 self.needed_locks = {
200 locking.LEVEL_NODE: locking.ALL_SET,
201 locking.LEVEL_INSTANCE: ['instance1.example.com'],
203 # Acquire just two nodes
204 self.needed_locks = {
205 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
208 self.needed_locks = {} # No, you can't leave it to the default value None
211 # The implementation of this method is mandatory only if the new LU is
212     # concurrent, so that old LUs don't need to be changed all at the same time.
215 self.needed_locks = {} # Exclusive LUs don't need locks.
217 raise NotImplementedError
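  # Illustrative sketch, not part of the original module: a read-only LU could
  # acquire all node locks in shared mode, as described above:
  #
  #   def ExpandNames(self):
  #     self.needed_locks = {
  #       locking.LEVEL_NODE: locking.ALL_SET,
  #     }
  #     self.share_locks[locking.LEVEL_NODE] = 1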
219 def DeclareLocks(self, level):
220 """Declare LU locking needs for a level
222 While most LUs can just declare their locking needs at ExpandNames time,
223 sometimes there's the need to calculate some locks after having acquired
224 the ones before. This function is called just before acquiring locks at a
225 particular level, but after acquiring the ones at lower levels, and permits
226 such calculations. It can be used to modify self.needed_locks, and by
227 default it does nothing.
229 This function is only called if you have something already set in
230 self.needed_locks for the level.
232 @param level: Locking level which is going to be locked
233 @type level: member of ganeti.locking.LEVELS
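  # Illustrative sketch, not part of the original module: a typical DeclareLocks
  # override for an LU that locked instances in ExpandNames and now needs their
  # nodes, delegating to the _LockInstancesNodes helper defined below:
  #
  #   def DeclareLocks(self, level):
  #     if level == locking.LEVEL_NODE:
  #       self._LockInstancesNodes(primary_only=True)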
237 def CheckPrereq(self):
238 """Check prerequisites for this LU.
240 This method should check that the prerequisites for the execution
241 of this LU are fulfilled. It can do internode communication, but
242     it should be idempotent - no cluster or system changes are allowed.
245 The method should raise errors.OpPrereqError in case something is
246 not fulfilled. Its return value is ignored.
248 This method should also update all the parameters of the opcode to
249 their canonical form if it hasn't been done by ExpandNames before.
252 if self.tasklets is not None:
253 for (idx, tl) in enumerate(self.tasklets):
254 logging.debug("Checking prerequisites for tasklet %s/%s",
255 idx + 1, len(self.tasklets))
260 def Exec(self, feedback_fn):
263 This method should implement the actual work. It should raise
264 errors.OpExecError for failures that are somewhat dealt with in
268 if self.tasklets is not None:
269 for (idx, tl) in enumerate(self.tasklets):
270 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
273 raise NotImplementedError
275 def BuildHooksEnv(self):
276 """Build hooks environment for this LU.
279 @return: Dictionary containing the environment that will be used for
280 running the hooks for this LU. The keys of the dict must not be prefixed
281 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
282 will extend the environment with additional variables. If no environment
283 should be defined, an empty dictionary should be returned (not C{None}).
284 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
288 raise NotImplementedError
290 def BuildHooksNodes(self):
291 """Build list of nodes to run LU's hooks.
293 @rtype: tuple; (list, list)
294 @return: Tuple containing a list of node names on which the hook
295 should run before the execution and a list of node names on which the
296       hook should run after the execution. If no nodes are needed for a phase,
297       an empty list should be returned (not None).
298 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
302 raise NotImplementedError
304 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
305 """Notify the LU about the results of its hooks.
307 This method is called every time a hooks phase is executed, and notifies
308 the Logical Unit about the hooks' result. The LU can then use it to alter
309 its result based on the hooks. By default the method does nothing and the
310     previous result is passed back unchanged, but any LU can define it if it
311 wants to use the local cluster hook-scripts somehow.
313 @param phase: one of L{constants.HOOKS_PHASE_POST} or
314 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
315 @param hook_results: the results of the multi-node hooks rpc call
316     @param feedback_fn: function used to send feedback back to the caller
317 @param lu_result: the previous Exec result this LU had, or None
319 @return: the new Exec result, based on the previous result
323     # API must be kept, thus we ignore the unused-argument and
324     # "could be a function" warnings
325 # pylint: disable=W0613,R0201
328 def _ExpandAndLockInstance(self):
329 """Helper function to expand and lock an instance.
331 Many LUs that work on an instance take its name in self.op.instance_name
332 and need to expand it and then declare the expanded name for locking. This
333 function does it, and then updates self.op.instance_name to the expanded
334 name. It also initializes needed_locks as a dict, if this hasn't been done
338 if self.needed_locks is None:
339 self.needed_locks = {}
341 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
342 "_ExpandAndLockInstance called with instance-level locks set"
343 self.op.instance_name = _ExpandInstanceName(self.cfg,
344 self.op.instance_name)
345 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
347 def _LockInstancesNodes(self, primary_only=False):
348 """Helper function to declare instances' nodes for locking.
350 This function should be called after locking one or more instances to lock
351 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
352 with all primary or secondary nodes for instances already locked and
353 present in self.needed_locks[locking.LEVEL_INSTANCE].
355 It should be called from DeclareLocks, and for safety only works if
356 self.recalculate_locks[locking.LEVEL_NODE] is set.
358     In the future it may grow parameters to lock only some instances' nodes, or
359     to lock only primary or secondary nodes, if needed.
361     It should be called from DeclareLocks in a way similar to::
363 if level == locking.LEVEL_NODE:
364 self._LockInstancesNodes()
366 @type primary_only: boolean
367 @param primary_only: only lock primary nodes of locked instances
370 assert locking.LEVEL_NODE in self.recalculate_locks, \
371 "_LockInstancesNodes helper function called with no nodes to recalculate"
373     # TODO: check if we've really been called with the instance locks held
375 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
376 # future we might want to have different behaviors depending on the value
377 # of self.recalculate_locks[locking.LEVEL_NODE]
379 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
380 for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
381 wanted_nodes.append(instance.primary_node)
383 wanted_nodes.extend(instance.secondary_nodes)
385 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
386 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
387 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
388 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
390 del self.recalculate_locks[locking.LEVEL_NODE]
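  # Illustrative sketch, not part of the original module: the ExpandNames side
  # of the pattern served by _LockInstancesNodes, assuming self.op.instance_name
  # is set by the opcode:
  #
  #   def ExpandNames(self):
  #     self._ExpandAndLockInstance()
  #     self.needed_locks[locking.LEVEL_NODE] = []
  #     self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE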
393 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
394 """Simple LU which runs no hooks.
396 This LU is intended as a parent for other LogicalUnits which will
397 run no hooks, in order to reduce duplicate code.
403 def BuildHooksEnv(self):
404 """Empty BuildHooksEnv for NoHooksLu.
406 This just raises an error.
409 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
411 def BuildHooksNodes(self):
412 """Empty BuildHooksNodes for NoHooksLU.
415 raise AssertionError("BuildHooksNodes called for NoHooksLU")
419 """Tasklet base class.
421 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
422 they can mix legacy code with tasklets. Locking needs to be done in the LU,
423 tasklets know nothing about locks.
425 Subclasses must follow these rules:
426 - Implement CheckPrereq
430 def __init__(self, lu):
437 def CheckPrereq(self):
438     """Check prerequisites for this tasklet.
440 This method should check whether the prerequisites for the execution of
441 this tasklet are fulfilled. It can do internode communication, but it
442 should be idempotent - no cluster or system changes are allowed.
444 The method should raise errors.OpPrereqError in case something is not
445 fulfilled. Its return value is ignored.
447 This method should also update all parameters to their canonical form if it
448 hasn't been done before.
453 def Exec(self, feedback_fn):
454 """Execute the tasklet.
456 This method should implement the actual work. It should raise
457 errors.OpExecError for failures that are somewhat dealt with in code, or
461 raise NotImplementedError
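# Illustrative sketch, not part of the original module: a minimal tasklet and
# how an LU could schedule it from ExpandNames instead of implementing its own
# CheckPrereq/Exec (the class name is a placeholder):
#
#   class _ExampleTasklet(Tasklet):
#     def CheckPrereq(self):
#       pass  # verify preconditions, raising errors.OpPrereqError on failure
#
#     def Exec(self, feedback_fn):
#       feedback_fn("doing the actual work")
#
#   # in the owning LU's ExpandNames:
#   #   self.tasklets = [_ExampleTasklet(self)]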
465 """Base for query utility classes.
468 #: Attribute holding field definitions
471 def __init__(self, filter_, fields, use_locking):
472 """Initializes this class.
475 self.use_locking = use_locking
477 self.query = query.Query(self.FIELDS, fields, filter_=filter_,
479 self.requested_data = self.query.RequestedData()
480 self.names = self.query.RequestedNames()
482 # Sort only if no names were requested
483 self.sort_by_name = not self.names
485 self.do_locking = None
488 def _GetNames(self, lu, all_names, lock_level):
489 """Helper function to determine names asked for in the query.
493 names = lu.owned_locks(lock_level)
497 if self.wanted == locking.ALL_SET:
498 assert not self.names
499 # caller didn't specify names, so ordering is not important
500 return utils.NiceSort(names)
502 # caller specified names and we must keep the same order
504 assert not self.do_locking or lu.glm.is_owned(lock_level)
506 missing = set(self.wanted).difference(names)
508 raise errors.OpExecError("Some items were removed before retrieving"
509 " their data: %s" % missing)
511 # Return expanded names
514 def ExpandNames(self, lu):
515 """Expand names for this query.
517 See L{LogicalUnit.ExpandNames}.
520 raise NotImplementedError()
522 def DeclareLocks(self, lu, level):
523 """Declare locks for this query.
525 See L{LogicalUnit.DeclareLocks}.
528 raise NotImplementedError()
530 def _GetQueryData(self, lu):
531 """Collects all data for this query.
533 @return: Query data object
536 raise NotImplementedError()
538 def NewStyleQuery(self, lu):
539 """Collect data and execute query.
542 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
543 sort_by_name=self.sort_by_name)
545 def OldStyleQuery(self, lu):
546 """Collect data and execute query.
549 return self.query.OldStyleQuery(self._GetQueryData(lu),
550 sort_by_name=self.sort_by_name)
554 """Returns a dict declaring all lock levels shared.
557 return dict.fromkeys(locking.LEVELS, 1)
560 def _CheckInstancesNodeGroups(cfg, instances, owned_groups, owned_nodes,
562 """Checks if node groups for locked instances are still correct.
564 @type cfg: L{config.ConfigWriter}
565 @param cfg: Cluster configuration
566 @type instances: dict; string as key, L{objects.Instance} as value
567 @param instances: Dictionary, instance name as key, instance object as value
568 @type owned_groups: iterable of string
569 @param owned_groups: List of owned groups
570 @type owned_nodes: iterable of string
571 @param owned_nodes: List of owned nodes
572 @type cur_group_uuid: string or None
573   @param cur_group_uuid: Optional group UUID to check against instance's groups
576 for (name, inst) in instances.items():
577 assert owned_nodes.issuperset(inst.all_nodes), \
578 "Instance %s's nodes changed while we kept the lock" % name
580 inst_groups = _CheckInstanceNodeGroups(cfg, name, owned_groups)
582 assert cur_group_uuid is None or cur_group_uuid in inst_groups, \
583 "Instance %s has no node in group %s" % (name, cur_group_uuid)
586 def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
587 """Checks if the owned node groups are still correct for an instance.
589 @type cfg: L{config.ConfigWriter}
590 @param cfg: The cluster configuration
591 @type instance_name: string
592 @param instance_name: Instance name
593 @type owned_groups: set or frozenset
594 @param owned_groups: List of currently owned node groups
597 inst_groups = cfg.GetInstanceNodeGroups(instance_name)
599 if not owned_groups.issuperset(inst_groups):
600 raise errors.OpPrereqError("Instance %s's node groups changed since"
601 " locks were acquired, current groups are"
602                                " '%s', owning groups '%s'; retry the"
605 utils.CommaJoin(inst_groups),
606 utils.CommaJoin(owned_groups)),
612 def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
613 """Checks if the instances in a node group are still correct.
615 @type cfg: L{config.ConfigWriter}
616 @param cfg: The cluster configuration
617 @type group_uuid: string
618 @param group_uuid: Node group UUID
619 @type owned_instances: set or frozenset
620 @param owned_instances: List of currently owned instances
623 wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
624 if owned_instances != wanted_instances:
625 raise errors.OpPrereqError("Instances in node group '%s' changed since"
626 " locks were acquired, wanted '%s', have '%s';"
627 " retry the operation" %
629 utils.CommaJoin(wanted_instances),
630 utils.CommaJoin(owned_instances)),
633 return wanted_instances
636 def _SupportsOob(cfg, node):
637 """Tells if node supports OOB.
639 @type cfg: L{config.ConfigWriter}
640 @param cfg: The cluster configuration
641 @type node: L{objects.Node}
642 @param node: The node
643 @return: The OOB script if supported or an empty string otherwise
646 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
649 def _GetWantedNodes(lu, nodes):
650 """Returns list of checked and expanded node names.
652 @type lu: L{LogicalUnit}
653 @param lu: the logical unit on whose behalf we execute
655 @param nodes: list of node names or None for all nodes
657 @return: the list of nodes, sorted
658 @raise errors.ProgrammerError: if the nodes parameter is wrong type
662 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
664 return utils.NiceSort(lu.cfg.GetNodeList())
667 def _GetWantedInstances(lu, instances):
668 """Returns list of checked and expanded instance names.
670 @type lu: L{LogicalUnit}
671 @param lu: the logical unit on whose behalf we execute
672 @type instances: list
673 @param instances: list of instance names or None for all instances
675 @return: the list of instances, sorted
676 @raise errors.OpPrereqError: if the instances parameter is wrong type
677 @raise errors.OpPrereqError: if any of the passed instances is not found
681 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
683 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
687 def _GetUpdatedParams(old_params, update_dict,
688 use_default=True, use_none=False):
689 """Return the new version of a parameter dictionary.
691 @type old_params: dict
692 @param old_params: old parameters
693 @type update_dict: dict
694 @param update_dict: dict containing new parameter values, or
695 constants.VALUE_DEFAULT to reset the parameter to its default
697   @type use_default: boolean
698   @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
699       values as 'to be deleted' values
700   @type use_none: boolean
701   @param use_none: whether to recognise C{None} values as 'to be
704 @return: the new parameter dictionary
707 params_copy = copy.deepcopy(old_params)
708 for key, val in update_dict.iteritems():
709 if ((use_default and val == constants.VALUE_DEFAULT) or
710 (use_none and val is None)):
716 params_copy[key] = val
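# Illustrative example, not part of the original module, of the semantics
# described above (with the default use_default=True):
#
#   _GetUpdatedParams({"a": 1, "b": 2},
#                     {"a": constants.VALUE_DEFAULT, "c": 3})
#   # -> {"b": 2, "c": 3}: "a" is removed so it reverts to its default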
720 def _ReleaseLocks(lu, level, names=None, keep=None):
721 """Releases locks owned by an LU.
723 @type lu: L{LogicalUnit}
724 @param level: Lock level
725 @type names: list or None
726 @param names: Names of locks to release
727 @type keep: list or None
728 @param keep: Names of locks to retain
731 assert not (keep is not None and names is not None), \
732 "Only one of the 'names' and the 'keep' parameters can be given"
734 if names is not None:
735 should_release = names.__contains__
737 should_release = lambda name: name not in keep
739 should_release = None
745 # Determine which locks to release
746 for name in lu.owned_locks(level):
747 if should_release(name):
752 assert len(lu.owned_locks(level)) == (len(retain) + len(release))
754 # Release just some locks
755 lu.glm.release(level, names=release)
757 assert frozenset(lu.owned_locks(level)) == frozenset(retain)
760 lu.glm.release(level)
762 assert not lu.glm.is_owned(level), "No locks should be owned"
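# Illustrative usage, not part of the original module: once an LU has narrowed
# down the nodes it actually touches, it could drop the remaining node locks
# ("still_needed" is a placeholder):
#
#   _ReleaseLocks(lu, locking.LEVEL_NODE, keep=still_needed)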
765 def _MapInstanceDisksToNodes(instances):
766 """Creates a map from (node, volume) to instance name.
768 @type instances: list of L{objects.Instance}
769 @rtype: dict; tuple of (node name, volume name) as key, instance name as value
772 return dict(((node, vol), inst.name)
773 for inst in instances
774 for (node, vols) in inst.MapLVsByNode().items()
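# Illustrative example, not part of the original module, of the resulting map;
# node, volume and instance names are placeholders:
#
#   {("node1.example.com", "xenvg/disk0"): "instance1.example.com",
#    ("node2.example.com", "xenvg/disk0"): "instance1.example.com"}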
778 def _RunPostHook(lu, node_name):
779 """Runs the post-hook for an opcode on a single node.
782 hm = lu.proc.BuildHooksManager(lu)
784 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
786 # pylint: disable=W0702
787 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
790 def _CheckOutputFields(static, dynamic, selected):
791 """Checks whether all selected fields are valid.
793 @type static: L{utils.FieldSet}
794 @param static: static fields set
795 @type dynamic: L{utils.FieldSet}
796 @param dynamic: dynamic fields set
803 delta = f.NonMatching(selected)
805 raise errors.OpPrereqError("Unknown output fields selected: %s"
806 % ",".join(delta), errors.ECODE_INVAL)
809 def _CheckGlobalHvParams(params):
810 """Validates that given hypervisor params are not global ones.
812 This will ensure that instances don't get customised versions of
816 used_globals = constants.HVC_GLOBALS.intersection(params)
818 msg = ("The following hypervisor parameters are global and cannot"
819 " be customized at instance level, please modify them at"
820 " cluster level: %s" % utils.CommaJoin(used_globals))
821 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
824 def _CheckNodeOnline(lu, node, msg=None):
825 """Ensure that a given node is online.
827 @param lu: the LU on behalf of which we make the check
828 @param node: the node to check
829 @param msg: if passed, should be a message to replace the default one
830 @raise errors.OpPrereqError: if the node is offline
834 msg = "Can't use offline node"
835 if lu.cfg.GetNodeInfo(node).offline:
836 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
839 def _CheckNodeNotDrained(lu, node):
840 """Ensure that a given node is not drained.
842 @param lu: the LU on behalf of which we make the check
843 @param node: the node to check
844 @raise errors.OpPrereqError: if the node is drained
847 if lu.cfg.GetNodeInfo(node).drained:
848 raise errors.OpPrereqError("Can't use drained node %s" % node,
852 def _CheckNodeVmCapable(lu, node):
853 """Ensure that a given node is vm capable.
855 @param lu: the LU on behalf of which we make the check
856 @param node: the node to check
857 @raise errors.OpPrereqError: if the node is not vm capable
860 if not lu.cfg.GetNodeInfo(node).vm_capable:
861 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
865 def _CheckNodeHasOS(lu, node, os_name, force_variant):
866 """Ensure that a node supports a given OS.
868 @param lu: the LU on behalf of which we make the check
869 @param node: the node to check
870 @param os_name: the OS to query about
871 @param force_variant: whether to ignore variant errors
872 @raise errors.OpPrereqError: if the node is not supporting the OS
875 result = lu.rpc.call_os_get(node, os_name)
876 result.Raise("OS '%s' not in supported OS list for node %s" %
878 prereq=True, ecode=errors.ECODE_INVAL)
879 if not force_variant:
880 _CheckOSVariant(result.payload, os_name)
883 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
884 """Ensure that a node has the given secondary ip.
886 @type lu: L{LogicalUnit}
887 @param lu: the LU on behalf of which we make the check
889 @param node: the node to check
890 @type secondary_ip: string
891 @param secondary_ip: the ip to check
892 @type prereq: boolean
893 @param prereq: whether to throw a prerequisite or an execute error
894 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
895 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
898 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
899 result.Raise("Failure checking secondary ip on node %s" % node,
900 prereq=prereq, ecode=errors.ECODE_ENVIRON)
901 if not result.payload:
902 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
903 " please fix and re-run this command" % secondary_ip)
905 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
907 raise errors.OpExecError(msg)
910 def _GetClusterDomainSecret():
911 """Reads the cluster domain secret.
914 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
918 def _CheckInstanceDown(lu, instance, reason):
919 """Ensure that an instance is not running."""
920 if instance.admin_up:
921 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
922 (instance.name, reason), errors.ECODE_STATE)
924 pnode = instance.primary_node
925 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
926 ins_l.Raise("Can't contact node %s for instance information" % pnode,
927 prereq=True, ecode=errors.ECODE_ENVIRON)
929 if instance.name in ins_l.payload:
930 raise errors.OpPrereqError("Instance %s is running, %s" %
931 (instance.name, reason), errors.ECODE_STATE)
934 def _ExpandItemName(fn, name, kind):
935 """Expand an item name.
937 @param fn: the function to use for expansion
938 @param name: requested item name
939 @param kind: text description ('Node' or 'Instance')
940 @return: the resolved (full) name
941 @raise errors.OpPrereqError: if the item is not found
945 if full_name is None:
946 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
951 def _ExpandNodeName(cfg, name):
952 """Wrapper over L{_ExpandItemName} for nodes."""
953 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
956 def _ExpandInstanceName(cfg, name):
957 """Wrapper over L{_ExpandItemName} for instance."""
958 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
961 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
962 memory, vcpus, nics, disk_template, disks,
963 bep, hvp, hypervisor_name, tags):
964 """Builds instance related env variables for hooks
966 This builds the hook environment from individual variables.
969 @param name: the name of the instance
970 @type primary_node: string
971 @param primary_node: the name of the instance's primary node
972 @type secondary_nodes: list
973 @param secondary_nodes: list of secondary nodes as strings
974 @type os_type: string
975 @param os_type: the name of the instance's OS
976 @type status: boolean
977 @param status: the should_run status of the instance
979 @param memory: the memory size of the instance
981 @param vcpus: the count of VCPUs the instance has
983 @param nics: list of tuples (ip, mac, mode, link) representing
984 the NICs the instance has
985 @type disk_template: string
986 @param disk_template: the disk template of the instance
988 @param disks: the list of (size, mode) pairs
990 @param bep: the backend parameters for the instance
992 @param hvp: the hypervisor parameters for the instance
993 @type hypervisor_name: string
994 @param hypervisor_name: the hypervisor for the instance
996 @param tags: list of instance tags as strings
998 @return: the hook environment for this instance
1007 "INSTANCE_NAME": name,
1008 "INSTANCE_PRIMARY": primary_node,
1009 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
1010 "INSTANCE_OS_TYPE": os_type,
1011 "INSTANCE_STATUS": str_status,
1012 "INSTANCE_MEMORY": memory,
1013 "INSTANCE_VCPUS": vcpus,
1014 "INSTANCE_DISK_TEMPLATE": disk_template,
1015 "INSTANCE_HYPERVISOR": hypervisor_name,
1019 nic_count = len(nics)
1020 for idx, (ip, mac, mode, link) in enumerate(nics):
1023 env["INSTANCE_NIC%d_IP" % idx] = ip
1024 env["INSTANCE_NIC%d_MAC" % idx] = mac
1025 env["INSTANCE_NIC%d_MODE" % idx] = mode
1026 env["INSTANCE_NIC%d_LINK" % idx] = link
1027 if mode == constants.NIC_MODE_BRIDGED:
1028 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
1032 env["INSTANCE_NIC_COUNT"] = nic_count
1035 disk_count = len(disks)
1036 for idx, (size, mode) in enumerate(disks):
1037 env["INSTANCE_DISK%d_SIZE" % idx] = size
1038 env["INSTANCE_DISK%d_MODE" % idx] = mode
1042 env["INSTANCE_DISK_COUNT"] = disk_count
1047 env["INSTANCE_TAGS"] = " ".join(tags)
1049 for source, kind in [(bep, "BE"), (hvp, "HV")]:
1050 for key, value in source.items():
1051 env["INSTANCE_%s_%s" % (kind, key)] = value
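# Illustrative example, not part of the original module, of the kind of
# environment built above for a one-NIC, one-disk instance; all values are
# placeholders:
#
#   {
#     "INSTANCE_NAME": "instance1.example.com",
#     "INSTANCE_PRIMARY": "node1.example.com",
#     "INSTANCE_NIC_COUNT": 1,
#     "INSTANCE_NIC0_MODE": constants.NIC_MODE_BRIDGED,
#     "INSTANCE_DISK_COUNT": 1,
#     "INSTANCE_DISK0_SIZE": 10240,
#     ...
#   }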
1056 def _NICListToTuple(lu, nics):
1057 """Build a list of nic information tuples.
1059 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
1060 value in LUInstanceQueryData.
1062 @type lu: L{LogicalUnit}
1063 @param lu: the logical unit on whose behalf we execute
1064 @type nics: list of L{objects.NIC}
1065 @param nics: list of nics to convert to hooks tuples
1069 cluster = lu.cfg.GetClusterInfo()
1073 filled_params = cluster.SimpleFillNIC(nic.nicparams)
1074 mode = filled_params[constants.NIC_MODE]
1075 link = filled_params[constants.NIC_LINK]
1076 hooks_nics.append((ip, mac, mode, link))
1080 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
1081 """Builds instance related env variables for hooks from an object.
1083 @type lu: L{LogicalUnit}
1084 @param lu: the logical unit on whose behalf we execute
1085 @type instance: L{objects.Instance}
1086 @param instance: the instance for which we should build the
1088 @type override: dict
1089 @param override: dictionary with key/values that will override
1092 @return: the hook environment dictionary
1095 cluster = lu.cfg.GetClusterInfo()
1096 bep = cluster.FillBE(instance)
1097 hvp = cluster.FillHV(instance)
1099 "name": instance.name,
1100 "primary_node": instance.primary_node,
1101 "secondary_nodes": instance.secondary_nodes,
1102 "os_type": instance.os,
1103 "status": instance.admin_up,
1104 "memory": bep[constants.BE_MEMORY],
1105 "vcpus": bep[constants.BE_VCPUS],
1106 "nics": _NICListToTuple(lu, instance.nics),
1107 "disk_template": instance.disk_template,
1108 "disks": [(disk.size, disk.mode) for disk in instance.disks],
1111 "hypervisor_name": instance.hypervisor,
1112 "tags": instance.tags,
1115 args.update(override)
1116 return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
1119 def _AdjustCandidatePool(lu, exceptions):
1120 """Adjust the candidate pool after node operations.
1123 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1125 lu.LogInfo("Promoted nodes to master candidate role: %s",
1126 utils.CommaJoin(node.name for node in mod_list))
1127 for name in mod_list:
1128 lu.context.ReaddNode(name)
1129 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1131 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1135 def _DecideSelfPromotion(lu, exceptions=None):
1136 """Decide whether I should promote myself as a master candidate.
1139 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1140 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1141   # the new node will increase mc_max by one, so:
1142 mc_should = min(mc_should + 1, cp_size)
1143 return mc_now < mc_should
1146 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1147   """Check that the bridges needed by a list of nics exist.
1150 cluster = lu.cfg.GetClusterInfo()
1151 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1152 brlist = [params[constants.NIC_LINK] for params in paramslist
1153 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1155 result = lu.rpc.call_bridges_exist(target_node, brlist)
1156 result.Raise("Error checking bridges on destination node '%s'" %
1157 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1160 def _CheckInstanceBridgesExist(lu, instance, node=None):
1161   """Check that the bridges needed by an instance exist.
1165 node = instance.primary_node
1166 _CheckNicsBridgesExist(lu, instance.nics, node)
1169 def _CheckOSVariant(os_obj, name):
1170 """Check whether an OS name conforms to the os variants specification.
1172 @type os_obj: L{objects.OS}
1173 @param os_obj: OS object to check
1175 @param name: OS name passed by the user, to check for validity
1178 variant = objects.OS.GetVariant(name)
1179 if not os_obj.supported_variants:
1181 raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
1182 " passed)" % (os_obj.name, variant),
1186 raise errors.OpPrereqError("OS name must include a variant",
1189 if variant not in os_obj.supported_variants:
1190 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
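# Illustrative note, not part of the original module: variants are requested by
# appending them to the OS name, e.g. (assuming "debootstrap" supports a
# "default" variant):
#
#   _CheckOSVariant(os_obj, "debootstrap+default")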
1193 def _GetNodeInstancesInner(cfg, fn):
1194 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1197 def _GetNodeInstances(cfg, node_name):
1198 """Returns a list of all primary and secondary instances on a node.
1202 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1205 def _GetNodePrimaryInstances(cfg, node_name):
1206 """Returns primary instances on a node.
1209 return _GetNodeInstancesInner(cfg,
1210 lambda inst: node_name == inst.primary_node)
1213 def _GetNodeSecondaryInstances(cfg, node_name):
1214 """Returns secondary instances on a node.
1217 return _GetNodeInstancesInner(cfg,
1218 lambda inst: node_name in inst.secondary_nodes)
1221 def _GetStorageTypeArgs(cfg, storage_type):
1222 """Returns the arguments for a storage type.
1225 # Special case for file storage
1226 if storage_type == constants.ST_FILE:
1227 # storage.FileStorage wants a list of storage directories
1228 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1233 def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
1236 for dev in instance.disks:
1237 cfg.SetDiskID(dev, node_name)
1239 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1240 result.Raise("Failed to get disk status from node %s" % node_name,
1241 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1243 for idx, bdev_status in enumerate(result.payload):
1244 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1250 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1251 """Check the sanity of iallocator and node arguments and use the
1252 cluster-wide iallocator if appropriate.
1254 Check that at most one of (iallocator, node) is specified. If none is
1255 specified, then the LU's opcode's iallocator slot is filled with the
1256 cluster-wide default iallocator.
1258 @type iallocator_slot: string
1259 @param iallocator_slot: the name of the opcode iallocator slot
1260 @type node_slot: string
1261 @param node_slot: the name of the opcode target node slot
1264 node = getattr(lu.op, node_slot, None)
1265 iallocator = getattr(lu.op, iallocator_slot, None)
1267 if node is not None and iallocator is not None:
1268 raise errors.OpPrereqError("Do not specify both, iallocator and node",
1270 elif node is None and iallocator is None:
1271 default_iallocator = lu.cfg.GetDefaultIAllocator()
1272 if default_iallocator:
1273 setattr(lu.op, iallocator_slot, default_iallocator)
1275 raise errors.OpPrereqError("No iallocator or node given and no"
1276 " cluster-wide default iallocator found;"
1277 " please specify either an iallocator or a"
1278 " node, or set a cluster-wide default"
1282 def _GetDefaultIAllocator(cfg, iallocator):
1283 """Decides on which iallocator to use.
1285 @type cfg: L{config.ConfigWriter}
1286 @param cfg: Cluster configuration object
1287 @type iallocator: string or None
1288 @param iallocator: Iallocator specified in opcode
1290 @return: Iallocator name
1294 # Use default iallocator
1295 iallocator = cfg.GetDefaultIAllocator()
1298 raise errors.OpPrereqError("No iallocator was specified, neither in the"
1299 " opcode nor as a cluster-wide default",
1305 class LUClusterPostInit(LogicalUnit):
1306 """Logical unit for running hooks after cluster initialization.
1309 HPATH = "cluster-init"
1310 HTYPE = constants.HTYPE_CLUSTER
1312 def BuildHooksEnv(self):
1317 "OP_TARGET": self.cfg.GetClusterName(),
1320 def BuildHooksNodes(self):
1321 """Build hooks nodes.
1324 return ([], [self.cfg.GetMasterNode()])
1326 def Exec(self, feedback_fn):
1333 class LUClusterDestroy(LogicalUnit):
1334 """Logical unit for destroying the cluster.
1337 HPATH = "cluster-destroy"
1338 HTYPE = constants.HTYPE_CLUSTER
1340 def BuildHooksEnv(self):
1345 "OP_TARGET": self.cfg.GetClusterName(),
1348 def BuildHooksNodes(self):
1349 """Build hooks nodes.
1354 def CheckPrereq(self):
1355 """Check prerequisites.
1357 This checks whether the cluster is empty.
1359 Any errors are signaled by raising errors.OpPrereqError.
1362 master = self.cfg.GetMasterNode()
1364 nodelist = self.cfg.GetNodeList()
1365 if len(nodelist) != 1 or nodelist[0] != master:
1366 raise errors.OpPrereqError("There are still %d node(s) in"
1367 " this cluster." % (len(nodelist) - 1),
1369 instancelist = self.cfg.GetInstanceList()
1371 raise errors.OpPrereqError("There are still %d instance(s) in"
1372 " this cluster." % len(instancelist),
1375 def Exec(self, feedback_fn):
1376 """Destroys the cluster.
1379 master = self.cfg.GetMasterNode()
1381 # Run post hooks on master node before it's removed
1382 _RunPostHook(self, master)
1384 result = self.rpc.call_node_deactivate_master_ip(master)
1385 result.Raise("Could not disable the master role")
1390 def _VerifyCertificate(filename):
1391 """Verifies a certificate for L{LUClusterVerifyConfig}.
1393 @type filename: string
1394 @param filename: Path to PEM file
1398 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1399 utils.ReadFile(filename))
1400 except Exception, err: # pylint: disable=W0703
1401 return (LUClusterVerifyConfig.ETYPE_ERROR,
1402 "Failed to load X509 certificate %s: %s" % (filename, err))
1405 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1406 constants.SSL_CERT_EXPIRATION_ERROR)
1409 fnamemsg = "While verifying %s: %s" % (filename, msg)
1414 return (None, fnamemsg)
1415 elif errcode == utils.CERT_WARNING:
1416 return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
1417 elif errcode == utils.CERT_ERROR:
1418 return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
1420 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1423 def _GetAllHypervisorParameters(cluster, instances):
1424 """Compute the set of all hypervisor parameters.
1426 @type cluster: L{objects.Cluster}
1427 @param cluster: the cluster object
1428   @type instances: list of L{objects.Instance}
1429 @param instances: additional instances from which to obtain parameters
1430 @rtype: list of (origin, hypervisor, parameters)
1431 @return: a list with all parameters found, indicating the hypervisor they
1432 apply to, and the origin (can be "cluster", "os X", or "instance Y")
1437 for hv_name in cluster.enabled_hypervisors:
1438 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1440 for os_name, os_hvp in cluster.os_hvp.items():
1441 for hv_name, hv_params in os_hvp.items():
1443 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1444 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1446 # TODO: collapse identical parameter values in a single one
1447 for instance in instances:
1448 if instance.hvparams:
1449 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1450 cluster.FillHV(instance)))
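# Illustrative example, not part of the original module, of the list built
# above; hypervisor names and parameter dicts are placeholders:
#
#   [("cluster", "xen-pvm", {...}),
#    ("os debootstrap", "xen-pvm", {...}),
#    ("instance instance1.example.com", "xen-pvm", {...})]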
1455 class _VerifyErrors(object):
1456 """Mix-in for cluster/group verify LUs.
1458 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1459 self.op and self._feedback_fn to be available.)
1462 TCLUSTER = "cluster"
1464 TINSTANCE = "instance"
1466 ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
1467 ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
1468 ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK")
1469 ECLUSTERDANGLINGNODES = (TNODE, "ECLUSTERDANGLINGNODES")
1470 ECLUSTERDANGLINGINST = (TNODE, "ECLUSTERDANGLINGINST")
1471 EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
1472 EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
1473 EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
1474 EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
1475 EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
1476 EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
1477 EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS")
1478 ENODEDRBD = (TNODE, "ENODEDRBD")
1479 ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
1480 ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
1481 ENODEHOOKS = (TNODE, "ENODEHOOKS")
1482 ENODEHV = (TNODE, "ENODEHV")
1483 ENODELVM = (TNODE, "ENODELVM")
1484 ENODEN1 = (TNODE, "ENODEN1")
1485 ENODENET = (TNODE, "ENODENET")
1486 ENODEOS = (TNODE, "ENODEOS")
1487 ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
1488 ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
1489 ENODERPC = (TNODE, "ENODERPC")
1490 ENODESSH = (TNODE, "ENODESSH")
1491 ENODEVERSION = (TNODE, "ENODEVERSION")
1492 ENODESETUP = (TNODE, "ENODESETUP")
1493 ENODETIME = (TNODE, "ENODETIME")
1494 ENODEOOBPATH = (TNODE, "ENODEOOBPATH")
1496 ETYPE_FIELD = "code"
1497 ETYPE_ERROR = "ERROR"
1498 ETYPE_WARNING = "WARNING"
1500 def _Error(self, ecode, item, msg, *args, **kwargs):
1501 """Format an error message.
1503 Based on the opcode's error_codes parameter, either format a
1504 parseable error code, or a simpler error string.
1506 This must be called only from Exec and functions called from Exec.
1509 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1511 # first complete the msg
1514 # then format the whole message
1515 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
1516 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1522 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1523 # and finally report it via the feedback_fn
1524 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
1526 def _ErrorIf(self, cond, *args, **kwargs):
1527 """Log an error message if the passed condition is True.
1531 or self.op.debug_simulate_errors) # pylint: disable=E1101
1533 self._Error(*args, **kwargs)
1534 # do not mark the operation as failed for WARN cases only
1535 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1536 self.bad = self.bad or cond
1539 class LUClusterVerify(NoHooksLU):
1540 """Submits all jobs necessary to verify the cluster.
1545 def ExpandNames(self):
1546 self.needed_locks = {}
1548 def Exec(self, feedback_fn):
1551 if self.op.group_name:
1552 groups = [self.op.group_name]
1553 depends_fn = lambda: None
1555 groups = self.cfg.GetNodeGroupList()
1557 # Verify global configuration
1558 jobs.append([opcodes.OpClusterVerifyConfig()])
1560 # Always depend on global verification
1561 depends_fn = lambda: [(-len(jobs), [])]
1563 jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
1564 depends=depends_fn())]
1565 for group in groups)
1567 # Fix up all parameters
1568 for op in itertools.chain(*jobs): # pylint: disable=W0142
1569 op.debug_simulate_errors = self.op.debug_simulate_errors
1570 op.verbose = self.op.verbose
1571 op.error_codes = self.op.error_codes
1573 op.skip_checks = self.op.skip_checks
1574 except AttributeError:
1575 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1577 return ResultWithJobs(jobs)
1580 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1581 """Verifies the cluster config.
1586 def _VerifyHVP(self, hvp_data):
1587 """Verifies locally the syntax of the hypervisor parameters.
1590 for item, hv_name, hv_params in hvp_data:
1591 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
1594 hv_class = hypervisor.GetHypervisor(hv_name)
1595 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1596 hv_class.CheckParameterSyntax(hv_params)
1597 except errors.GenericError, err:
1598 self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
1600 def ExpandNames(self):
1601 # Information can be safely retrieved as the BGL is acquired in exclusive
1603 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
1604 self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
1605 self.all_node_info = self.cfg.GetAllNodesInfo()
1606 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1607 self.needed_locks = {}
1609 def Exec(self, feedback_fn):
1610 """Verify integrity of cluster, performing various test on nodes.
1614 self._feedback_fn = feedback_fn
1616 feedback_fn("* Verifying cluster config")
1618 for msg in self.cfg.VerifyConfig():
1619 self._ErrorIf(True, self.ECLUSTERCFG, None, msg)
1621 feedback_fn("* Verifying cluster certificate files")
1623 for cert_filename in constants.ALL_CERT_FILES:
1624 (errcode, msg) = _VerifyCertificate(cert_filename)
1625 self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
1627 feedback_fn("* Verifying hypervisor parameters")
1629 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1630 self.all_inst_info.values()))
1632 feedback_fn("* Verifying all nodes belong to an existing group")
1634 # We do this verification here because, should this bogus circumstance
1635 # occur, it would never be caught by VerifyGroup, which only acts on
1636 # nodes/instances reachable from existing node groups.
1638 dangling_nodes = set(node.name for node in self.all_node_info.values()
1639 if node.group not in self.all_group_info)
1641 dangling_instances = {}
1642 no_node_instances = []
1644 for inst in self.all_inst_info.values():
1645 if inst.primary_node in dangling_nodes:
1646 dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
1647 elif inst.primary_node not in self.all_node_info:
1648 no_node_instances.append(inst.name)
1653 utils.CommaJoin(dangling_instances.get(node.name,
1655 for node in dangling_nodes]
1657 self._ErrorIf(bool(dangling_nodes), self.ECLUSTERDANGLINGNODES, None,
1658 "the following nodes (and their instances) belong to a non"
1659 " existing group: %s", utils.CommaJoin(pretty_dangling))
1661 self._ErrorIf(bool(no_node_instances), self.ECLUSTERDANGLINGINST, None,
1662 "the following instances have a non-existing primary-node:"
1663 " %s", utils.CommaJoin(no_node_instances))
1668 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1669 """Verifies the status of a node group.
1672 HPATH = "cluster-verify"
1673 HTYPE = constants.HTYPE_CLUSTER
1676 _HOOKS_INDENT_RE = re.compile("^", re.M)
1678 class NodeImage(object):
1679 """A class representing the logical and physical status of a node.
1682 @ivar name: the node name to which this object refers
1683 @ivar volumes: a structure as returned from
1684 L{ganeti.backend.GetVolumeList} (runtime)
1685 @ivar instances: a list of running instances (runtime)
1686 @ivar pinst: list of configured primary instances (config)
1687 @ivar sinst: list of configured secondary instances (config)
1688 @ivar sbp: dictionary of {primary-node: list of instances} for all
1689 instances for which this node is secondary (config)
1690 @ivar mfree: free memory, as reported by hypervisor (runtime)
1691 @ivar dfree: free disk, as reported by the node (runtime)
1692 @ivar offline: the offline status (config)
1693 @type rpc_fail: boolean
1694     @ivar rpc_fail: whether the RPC verify call was successful (overall,
1695 not whether the individual keys were correct) (runtime)
1696 @type lvm_fail: boolean
1697 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1698 @type hyp_fail: boolean
1699 @ivar hyp_fail: whether the RPC call didn't return the instance list
1700 @type ghost: boolean
1701 @ivar ghost: whether this is a known node or not (config)
1702 @type os_fail: boolean
1703 @ivar os_fail: whether the RPC call didn't return valid OS data
1705 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1706 @type vm_capable: boolean
1707 @ivar vm_capable: whether the node can host instances
1710 def __init__(self, offline=False, name=None, vm_capable=True):
1719 self.offline = offline
1720 self.vm_capable = vm_capable
1721 self.rpc_fail = False
1722 self.lvm_fail = False
1723 self.hyp_fail = False
1725 self.os_fail = False
1728 def ExpandNames(self):
1729 # This raises errors.OpPrereqError on its own:
1730 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
1732 # Get instances in node group; this is unsafe and needs verification later
1734 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
1736 self.needed_locks = {
1737 locking.LEVEL_INSTANCE: inst_names,
1738 locking.LEVEL_NODEGROUP: [self.group_uuid],
1739 locking.LEVEL_NODE: [],
1742 self.share_locks = _ShareAll()
1744 def DeclareLocks(self, level):
1745 if level == locking.LEVEL_NODE:
1746 # Get members of node group; this is unsafe and needs verification later
1747 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
1749 all_inst_info = self.cfg.GetAllInstancesInfo()
1751 # In Exec(), we warn about mirrored instances that have primary and
1752 # secondary living in separate node groups. To fully verify that
1753 # volumes for these instances are healthy, we will need to do an
1754 # extra call to their secondaries. We ensure here those nodes will
1756 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
1757 # Important: access only the instances whose lock is owned
1758 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
1759 nodes.update(all_inst_info[inst].secondary_nodes)
1761 self.needed_locks[locking.LEVEL_NODE] = nodes
1763 def CheckPrereq(self):
1764 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1765 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1767 group_nodes = set(self.group_info.members)
1769 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
1772 group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1774 unlocked_instances = \
1775 group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
1778 raise errors.OpPrereqError("Missing lock for nodes: %s" %
1779 utils.CommaJoin(unlocked_nodes),
1782 if unlocked_instances:
1783 raise errors.OpPrereqError("Missing lock for instances: %s" %
1784 utils.CommaJoin(unlocked_instances),
1787 self.all_node_info = self.cfg.GetAllNodesInfo()
1788 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1790 self.my_node_names = utils.NiceSort(group_nodes)
1791 self.my_inst_names = utils.NiceSort(group_instances)
1793 self.my_node_info = dict((name, self.all_node_info[name])
1794 for name in self.my_node_names)
1796 self.my_inst_info = dict((name, self.all_inst_info[name])
1797 for name in self.my_inst_names)
1799 # We detect here the nodes that will need the extra RPC calls for verifying
1800 # split LV volumes; they should be locked.
1801 extra_lv_nodes = set()
1803 for inst in self.my_inst_info.values():
1804 if inst.disk_template in constants.DTS_INT_MIRROR:
1805 for nname in inst.all_nodes:
1806 if self.all_node_info[nname].group != self.group_uuid:
1807 extra_lv_nodes.add(nname)
1809 unlocked_lv_nodes = \
1810 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1812 if unlocked_lv_nodes:
1813 raise errors.OpPrereqError("Missing node locks for LV check: %s" %
1814 utils.CommaJoin(unlocked_lv_nodes),
1816 self.extra_lv_nodes = list(extra_lv_nodes)
1818 def _VerifyNode(self, ninfo, nresult):
1819 """Perform some basic validation on data returned from a node.
1821 - check the result data structure is well formed and has all the
1823 - check ganeti version
1825 @type ninfo: L{objects.Node}
1826 @param ninfo: the node to check
1827 @param nresult: the results from the node
1829 @return: whether overall this call was successful (and we can expect
1830         reasonable values in the response)
1834 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1836 # main result, nresult should be a non-empty dict
1837 test = not nresult or not isinstance(nresult, dict)
1838 _ErrorIf(test, self.ENODERPC, node,
1839 "unable to verify node: no data returned")
1843 # compares ganeti version
1844 local_version = constants.PROTOCOL_VERSION
1845 remote_version = nresult.get("version", None)
1846 test = not (remote_version and
1847 isinstance(remote_version, (list, tuple)) and
1848 len(remote_version) == 2)
1849 _ErrorIf(test, self.ENODERPC, node,
1850 "connection to node returned invalid data")
1854 test = local_version != remote_version[0]
1855 _ErrorIf(test, self.ENODEVERSION, node,
1856 "incompatible protocol versions: master %s,"
1857 " node %s", local_version, remote_version[0])
1861 # node seems compatible, we can actually try to look into its results
1863 # full package version
1864 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1865 self.ENODEVERSION, node,
1866 "software version mismatch: master %s, node %s",
1867 constants.RELEASE_VERSION, remote_version[1],
1868 code=self.ETYPE_WARNING)
1870 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1871 if ninfo.vm_capable and isinstance(hyp_result, dict):
1872 for hv_name, hv_result in hyp_result.iteritems():
1873 test = hv_result is not None
1874 _ErrorIf(test, self.ENODEHV, node,
1875 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1877 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1878 if ninfo.vm_capable and isinstance(hvp_result, list):
1879 for item, hv_name, hv_result in hvp_result:
1880 _ErrorIf(True, self.ENODEHV, node,
1881 "hypervisor %s parameter verify failure (source %s): %s",
1882 hv_name, item, hv_result)
1884 test = nresult.get(constants.NV_NODESETUP,
1885 ["Missing NODESETUP results"])
1886 _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1891 def _VerifyNodeTime(self, ninfo, nresult,
1892 nvinfo_starttime, nvinfo_endtime):
1893 """Check the node time.
1895 @type ninfo: L{objects.Node}
1896 @param ninfo: the node to check
1897 @param nresult: the remote results for the node
1898 @param nvinfo_starttime: the start time of the RPC call
1899 @param nvinfo_endtime: the end time of the RPC call
1903 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1905 ntime = nresult.get(constants.NV_TIME, None)
1907 ntime_merged = utils.MergeTime(ntime)
1908 except (ValueError, TypeError):
1909 _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1912 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1913 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1914 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1915 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1919 _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1920 "Node time diverges by at least %s from master node time",
1923 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1924 """Check the node LVM results.
1926 @type ninfo: L{objects.Node}
1927 @param ninfo: the node to check
1928 @param nresult: the remote results for the node
1929 @param vg_name: the configured VG name
1936 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1938 # checks vg existence and size > 20G
1939 vglist = nresult.get(constants.NV_VGLIST, None)
1941 _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1943 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1944 constants.MIN_VG_SIZE)
1945 _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1948 pvlist = nresult.get(constants.NV_PVLIST, None)
1949 test = pvlist is None
1950 _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1952 # check that ':' is not present in PV names, since it's a
1953 # special character for lvcreate (denotes the range of PEs to
1955 for _, pvname, owner_vg in pvlist:
1956 test = ":" in pvname
1957 _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1958 " '%s' of VG '%s'", pvname, owner_vg)
1960 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1961 """Check the node bridges.
1963 @type ninfo: L{objects.Node}
1964 @param ninfo: the node to check
1965 @param nresult: the remote results for the node
1966 @param bridges: the expected list of bridges
1973 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1975 missing = nresult.get(constants.NV_BRIDGES, None)
1976 test = not isinstance(missing, list)
1977 _ErrorIf(test, self.ENODENET, node,
1978 "did not return valid bridge information")
1980 _ErrorIf(bool(missing), self.ENODENET, node, "missing bridges: %s" %
1981 utils.CommaJoin(sorted(missing)))
1983 def _VerifyNodeNetwork(self, ninfo, nresult):
1984 """Check the node network connectivity results.
1986 @type ninfo: L{objects.Node}
1987 @param ninfo: the node to check
1988 @param nresult: the remote results for the node
1992 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1994 test = constants.NV_NODELIST not in nresult
1995 _ErrorIf(test, self.ENODESSH, node,
1996 "node hasn't returned node ssh connectivity data")
1998 if nresult[constants.NV_NODELIST]:
1999 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
2000 _ErrorIf(True, self.ENODESSH, node,
2001 "ssh communication with node '%s': %s", a_node, a_msg)
2003 test = constants.NV_NODENETTEST not in nresult
2004 _ErrorIf(test, self.ENODENET, node,
2005 "node hasn't returned node tcp connectivity data")
2007 if nresult[constants.NV_NODENETTEST]:
2008 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
2010 _ErrorIf(True, self.ENODENET, node,
2011 "tcp communication with node '%s': %s",
2012 anode, nresult[constants.NV_NODENETTEST][anode])
2014 test = constants.NV_MASTERIP not in nresult
2015 _ErrorIf(test, self.ENODENET, node,
2016 "node hasn't returned node master IP reachability data")
2018 if not nresult[constants.NV_MASTERIP]:
2019 if node == self.master_node:
2020 msg = "the master node cannot reach the master IP (not configured?)"
2022 msg = "cannot reach the master IP"
2023 _ErrorIf(True, self.ENODENET, node, msg)
2025 def _VerifyInstance(self, instance, instanceconfig, node_image,
2027 """Verify an instance.
2029 This function checks to see if the required block devices are
2030 available on the instance's node.
2033 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2034 node_current = instanceconfig.primary_node
2036 node_vol_should = {}
2037 instanceconfig.MapLVsByNode(node_vol_should)
2039 for node in node_vol_should:
2040 n_img = node_image[node]
2041 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2042 # ignore missing volumes on offline or broken nodes
2044 for volume in node_vol_should[node]:
2045 test = volume not in n_img.volumes
2046 _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
2047 "volume %s missing on node %s", volume, node)
2049 if instanceconfig.admin_up:
2050 pri_img = node_image[node_current]
2051 test = instance not in pri_img.instances and not pri_img.offline
2052 _ErrorIf(test, self.EINSTANCEDOWN, instance,
2053 "instance not running on its primary node %s",
2056 diskdata = [(nname, success, status, idx)
2057 for (nname, disks) in diskstatus.items()
2058 for idx, (success, status) in enumerate(disks)]
2060 for nname, success, bdev_status, idx in diskdata:
2061 # the 'ghost node' construction in Exec() ensures that we have a
2063 snode = node_image[nname]
2064 bad_snode = snode.ghost or snode.offline
2065 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2066 self.EINSTANCEFAULTYDISK, instance,
2067 "couldn't retrieve status for disk/%s on %s: %s",
2068 idx, nname, bdev_status)
2069 _ErrorIf((instanceconfig.admin_up and success and
2070 bdev_status.ldisk_status == constants.LDS_FAULTY),
2071 self.EINSTANCEFAULTYDISK, instance,
2072 "disk/%s on %s is faulty", idx, nname)
2074 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2075 """Verify if there are any unknown volumes in the cluster.
2077 The .os, .swap and backup volumes are ignored. All other volumes are
2078 reported as unknown.
2080 @type reserved: L{ganeti.utils.FieldSet}
2081 @param reserved: a FieldSet of reserved volume names
2084 for node, n_img in node_image.items():
2085 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
2086 self.all_node_info[node].group != self.group_uuid):
2087 # skip non-healthy nodes
2089 for volume in n_img.volumes:
2090 test = ((node not in node_vol_should or
2091 volume not in node_vol_should[node]) and
2092 not reserved.Matches(volume))
2093 self._ErrorIf(test, self.ENODEORPHANLV, node,
2094 "volume %s is unknown", volume)
2096 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2097 """Verify N+1 Memory Resilience.
2099 Check that if one single node dies we can still start all the
2100 instances it was primary for.
2103 cluster_info = self.cfg.GetClusterInfo()
2104 for node, n_img in node_image.items():
2105 # This code checks that every node which is now listed as a
2106 # secondary has enough memory to host all the instances it is
2107 # supposed to, should a single other node in the cluster fail.
2108 # FIXME: not ready for failover to an arbitrary node
2109 # FIXME: does not support file-backed instances
2110 # WARNING: we currently take into account down instances as well
2111 # as up ones, considering that even if they're down someone
2112 # might want to start them even in the event of a node failure.
2113 if n_img.offline or self.all_node_info[node].group != self.group_uuid:
2114 # we're skipping nodes marked offline and nodes in other groups from
2115 # the N+1 warning, since most likely we don't have good memory
2116 # information from them; we already list instances living on such
2117 # nodes, and that's enough warning
2119 for prinode, instances in n_img.sbp.items():
2121 for instance in instances:
2122 bep = cluster_info.FillBE(instance_cfg[instance])
2123 if bep[constants.BE_AUTO_BALANCE]:
2124 needed_mem += bep[constants.BE_MEMORY]
2125 test = n_img.mfree < needed_mem
2126 self._ErrorIf(test, self.ENODEN1, node,
2127 "not enough memory to accomodate instance failovers"
2128 " should node %s fail (%dMiB needed, %dMiB available)",
2129 prinode, needed_mem, n_img.mfree)
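  # Illustrative sketch (hypothetical numbers): if node B is secondary for
  # two auto-balanced instances whose primary is node A, with BE_MEMORY of
  # 1024 and 2048 MiB, then B must report mfree >= 3072 MiB, otherwise an
  # ENODEN1 error is emitted for B naming A as the failing primary.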
2132 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2133 (files_all, files_opt, files_mc, files_vm)):
2134 """Verifies file checksums collected from all nodes.
2136 @param errorif: Callback for reporting errors
2137 @param nodeinfo: List of L{objects.Node} objects
2138 @param master_node: Name of master node
2139 @param all_nvinfo: RPC results
2142 # Define functions determining which nodes to consider for a file
2145 (files_mc, lambda node: (node.master_candidate or
2146 node.name == master_node)),
2147 (files_vm, lambda node: node.vm_capable),
2150 # Build mapping from filename to list of nodes which should have the file
2152 for (files, fn) in files2nodefn:
2154 filenodes = nodeinfo
2156 filenodes = filter(fn, nodeinfo)
2157 nodefiles.update((filename,
2158 frozenset(map(operator.attrgetter("name"), filenodes)))
2159 for filename in files)
2161 assert set(nodefiles) == (files_all | files_mc | files_vm)
2163 fileinfo = dict((filename, {}) for filename in nodefiles)
2164 ignore_nodes = set()
2166 for node in nodeinfo:
2168 ignore_nodes.add(node.name)
2171 nresult = all_nvinfo[node.name]
2173 if nresult.fail_msg or not nresult.payload:
2176 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2178 test = not (node_files and isinstance(node_files, dict))
2179 errorif(test, cls.ENODEFILECHECK, node.name,
2180 "Node did not return file checksum data")
2182 ignore_nodes.add(node.name)
2185 # Build per-checksum mapping from filename to nodes having it
2186 for (filename, checksum) in node_files.items():
2187 assert filename in nodefiles
2188 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2190 for (filename, checksums) in fileinfo.items():
2191 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2193 # Nodes having the file
2194 with_file = frozenset(node_name
2195 for nodes in fileinfo[filename].values()
2196 for node_name in nodes) - ignore_nodes
2198 expected_nodes = nodefiles[filename] - ignore_nodes
2200 # Nodes missing file
2201 missing_file = expected_nodes - with_file
2203 if filename in files_opt:
2205 errorif(missing_file and missing_file != expected_nodes,
2206 cls.ECLUSTERFILECHECK, None,
2207 "File %s is optional, but it must exist on all or no"
2208 " nodes (not found on %s)",
2209 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2211 # Non-optional files
2212 errorif(missing_file, cls.ECLUSTERFILECHECK, None,
2213 "File %s is missing from node(s) %s", filename,
2214 utils.CommaJoin(utils.NiceSort(missing_file)))
2216 # Warn if a node has a file it shouldn't
2217 unexpected = with_file - expected_nodes
2219 cls.ECLUSTERFILECHECK, None,
2220 "File %s should not exist on node(s) %s",
2221 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2223 # See if there are multiple versions of the file
2224 test = len(checksums) > 1
2226 variants = ["variant %s on %s" %
2227 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2228 for (idx, (checksum, nodes)) in
2229 enumerate(sorted(checksums.items()))]
2233 errorif(test, cls.ECLUSTERFILECHECK, None,
2234 "File %s found with %s different checksums (%s)",
2235 filename, len(checksums), "; ".join(variants))
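  # For reference, the bookkeeping built above has roughly this shape:
  #   nodefiles: {filename: frozenset of node names expected to hold it}
  #   fileinfo:  {filename: {checksum: set of node names reporting it}}
  # so a file reported with two different checksums produces two entries in
  # the inner dict and is flagged by the "different checksums" check.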
2237 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2239 """Verifies and the node DRBD status.
2241 @type ninfo: L{objects.Node}
2242 @param ninfo: the node to check
2243 @param nresult: the remote results for the node
2244 @param instanceinfo: the dict of instances
2245 @param drbd_helper: the configured DRBD usermode helper
2246 @param drbd_map: the DRBD map as returned by
2247 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2251 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2254 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2255 test = (helper_result is None)
2256 _ErrorIf(test, self.ENODEDRBDHELPER, node,
2257 "no drbd usermode helper returned")
2259 status, payload = helper_result
2261 _ErrorIf(test, self.ENODEDRBDHELPER, node,
2262 "drbd usermode helper check unsuccessful: %s", payload)
2263 test = status and (payload != drbd_helper)
2264 _ErrorIf(test, self.ENODEDRBDHELPER, node,
2265 "wrong drbd usermode helper: %s", payload)
2267 # compute the DRBD minors
2269 for minor, instance in drbd_map[node].items():
2270 test = instance not in instanceinfo
2271 _ErrorIf(test, self.ECLUSTERCFG, None,
2272 "ghost instance '%s' in temporary DRBD map", instance)
2273 # ghost instance should not be running, but otherwise we
2274 # don't give double warnings (both ghost instance and
2275 # unallocated minor in use)
2277 node_drbd[minor] = (instance, False)
2279 instance = instanceinfo[instance]
2280 node_drbd[minor] = (instance.name, instance.admin_up)
2282 # and now check them
2283 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2284 test = not isinstance(used_minors, (tuple, list))
2285 _ErrorIf(test, self.ENODEDRBD, node,
2286 "cannot parse drbd status file: %s", str(used_minors))
2288 # we cannot check drbd status
2291 for minor, (iname, must_exist) in node_drbd.items():
2292 test = minor not in used_minors and must_exist
2293 _ErrorIf(test, self.ENODEDRBD, node,
2294 "drbd minor %d of instance %s is not active", minor, iname)
2295 for minor in used_minors:
2296 test = minor not in node_drbd
2297 _ErrorIf(test, self.ENODEDRBD, node,
2298 "unallocated drbd minor %d is in use", minor)
2300 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2301 """Builds the node OS structures.
2303 @type ninfo: L{objects.Node}
2304 @param ninfo: the node to check
2305 @param nresult: the remote results for the node
2306 @param nimg: the node image object
2310 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2312 remote_os = nresult.get(constants.NV_OSLIST, None)
2313 test = (not isinstance(remote_os, list) or
2314 not compat.all(isinstance(v, list) and len(v) == 7
2315 for v in remote_os))
2317 _ErrorIf(test, self.ENODEOS, node,
2318 "node hasn't returned valid OS data")
2327 for (name, os_path, status, diagnose,
2328 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2330 if name not in os_dict:
2333 # parameters is a list of lists instead of list of tuples due to
2334 # JSON lacking a real tuple type, fix it:
2335 parameters = [tuple(v) for v in parameters]
2336 os_dict[name].append((os_path, status, diagnose,
2337 set(variants), set(parameters), set(api_ver)))
2339 nimg.oslist = os_dict
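  # For reference, nimg.oslist built above has the shape
  #   {os_name: [(path, status, diagnose, set(variants), set(parameters),
  #               set(api_versions)), ...]}
  # with one tuple per entry the node reported for that OS name; duplicate
  # entries are reported later by _VerifyNodeOS.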
2341 def _VerifyNodeOS(self, ninfo, nimg, base):
2342 """Verifies the node OS list.
2344 @type ninfo: L{objects.Node}
2345 @param ninfo: the node to check
2346 @param nimg: the node image object
2347 @param base: the 'template' node we match against (e.g. from the master)
2351 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2353 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2355 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2356 for os_name, os_data in nimg.oslist.items():
2357 assert os_data, "Empty OS status for OS %s?!" % os_name
2358 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2359 _ErrorIf(not f_status, self.ENODEOS, node,
2360 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2361 _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
2362 "OS '%s' has multiple entries (first one shadows the rest): %s",
2363 os_name, utils.CommaJoin([v[0] for v in os_data]))
2364 # comparisons with the 'base' image
2365 test = os_name not in base.oslist
2366 _ErrorIf(test, self.ENODEOS, node,
2367 "Extra OS %s not present on reference node (%s)",
2371 assert base.oslist[os_name], "Base node has empty OS status?"
2372 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2374 # base OS is invalid, skipping
2376 for kind, a, b in [("API version", f_api, b_api),
2377 ("variants list", f_var, b_var),
2378 ("parameters", beautify_params(f_param),
2379 beautify_params(b_param))]:
2380 _ErrorIf(a != b, self.ENODEOS, node,
2381 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2382 kind, os_name, base.name,
2383 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2385 # check any missing OSes
2386 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2387 _ErrorIf(missing, self.ENODEOS, node,
2388 "OSes present on reference node %s but missing on this node: %s",
2389 base.name, utils.CommaJoin(missing))
2391 def _VerifyOob(self, ninfo, nresult):
2392 """Verifies out of band functionality of a node.
2394 @type ninfo: L{objects.Node}
2395 @param ninfo: the node to check
2396 @param nresult: the remote results for the node
2400 # We just have to verify the paths on master and/or master candidates
2401 # as the oob helper is invoked on the master
2402 if ((ninfo.master_candidate or ninfo.master_capable) and
2403 constants.NV_OOB_PATHS in nresult):
2404 for path_result in nresult[constants.NV_OOB_PATHS]:
2405 self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result)
2407 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2408 """Verifies and updates the node volume data.
2410 This function will update a L{NodeImage}'s internal structures
2411 with data from the remote call.
2413 @type ninfo: L{objects.Node}
2414 @param ninfo: the node to check
2415 @param nresult: the remote results for the node
2416 @param nimg: the node image object
2417 @param vg_name: the configured VG name
2421 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2423 nimg.lvm_fail = True
2424 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2427 elif isinstance(lvdata, basestring):
2428 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
2429 utils.SafeEncode(lvdata))
2430 elif not isinstance(lvdata, dict):
2431 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
2433 nimg.volumes = lvdata
2434 nimg.lvm_fail = False
2436 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2437 """Verifies and updates the node instance list.
2439 If the listing was successful, then updates this node's instance
2440 list. Otherwise, it marks the RPC call as failed for the instance
2443 @type ninfo: L{objects.Node}
2444 @param ninfo: the node to check
2445 @param nresult: the remote results for the node
2446 @param nimg: the node image object
2449 idata = nresult.get(constants.NV_INSTANCELIST, None)
2450 test = not isinstance(idata, list)
2451 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
2452 " (instancelist): %s", utils.SafeEncode(str(idata)))
2454 nimg.hyp_fail = True
2456 nimg.instances = idata
2458 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2459 """Verifies and computes a node information map
2461 @type ninfo: L{objects.Node}
2462 @param ninfo: the node to check
2463 @param nresult: the remote results for the node
2464 @param nimg: the node image object
2465 @param vg_name: the configured VG name
2469 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2471 # try to read free memory (from the hypervisor)
2472 hv_info = nresult.get(constants.NV_HVINFO, None)
2473 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2474 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
2477 nimg.mfree = int(hv_info["memory_free"])
2478 except (ValueError, TypeError):
2479 _ErrorIf(True, self.ENODERPC, node,
2480 "node returned invalid nodeinfo, check hypervisor")
2482 # FIXME: devise a free space model for file based instances as well
2483 if vg_name is not None:
2484 test = (constants.NV_VGLIST not in nresult or
2485 vg_name not in nresult[constants.NV_VGLIST])
2486 _ErrorIf(test, self.ENODELVM, node,
2487 "node didn't return data for the volume group '%s'"
2488 " - it is either missing or broken", vg_name)
2491 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2492 except (ValueError, TypeError):
2493 _ErrorIf(True, self.ENODERPC, node,
2494 "node returned invalid LVM info, check LVM status")
2496 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2497 """Gets per-disk status information for all instances.
2499 @type nodelist: list of strings
2500 @param nodelist: Node names
2501 @type node_image: dict of (name, L{objects.Node})
2502 @param node_image: Node objects
2503 @type instanceinfo: dict of (name, L{objects.Instance})
2504 @param instanceinfo: Instance objects
2505 @rtype: {instance: {node: [(success, payload)]}}
2506 @return: a dictionary of per-instance dictionaries with nodes as
2507 keys and disk information as values; the disk information is a
2508 list of tuples (success, payload)
2511 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2514 node_disks_devonly = {}
2515 diskless_instances = set()
2516 diskless = constants.DT_DISKLESS
2518 for nname in nodelist:
2519 node_instances = list(itertools.chain(node_image[nname].pinst,
2520 node_image[nname].sinst))
2521 diskless_instances.update(inst for inst in node_instances
2522 if instanceinfo[inst].disk_template == diskless)
2523 disks = [(inst, disk)
2524 for inst in node_instances
2525 for disk in instanceinfo[inst].disks]
2528 # No need to collect data
2531 node_disks[nname] = disks
2533 # Creating copies as SetDiskID below will modify the objects and that can
2534 # lead to incorrect data returned from nodes
2535 devonly = [dev.Copy() for (_, dev) in disks]
2538 self.cfg.SetDiskID(dev, nname)
2540 node_disks_devonly[nname] = devonly
2542 assert len(node_disks) == len(node_disks_devonly)
2544 # Collect data from all nodes with disks
2545 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2548 assert len(result) == len(node_disks)
2552 for (nname, nres) in result.items():
2553 disks = node_disks[nname]
2556 # No data from this node
2557 data = len(disks) * [(False, "node offline")]
2560 _ErrorIf(msg, self.ENODERPC, nname,
2561 "while getting disk information: %s", msg)
2563 # No data from this node
2564 data = len(disks) * [(False, msg)]
2567 for idx, i in enumerate(nres.payload):
2568 if isinstance(i, (tuple, list)) and len(i) == 2:
2571 logging.warning("Invalid result from node %s, entry %d: %s",
2573 data.append((False, "Invalid result from the remote node"))
2575 for ((inst, _), status) in zip(disks, data):
2576 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2578 # Add empty entries for diskless instances.
2579 for inst in diskless_instances:
2580 assert inst not in instdisk
2583 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2584 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2585 compat.all(isinstance(s, (tuple, list)) and
2586 len(s) == 2 for s in statuses)
2587 for inst, nnames in instdisk.items()
2588 for nname, statuses in nnames.items())
2589 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
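  # Illustrative shape of the result (hypothetical names, two disks on one
  # node): {"inst1": {"nodeA": [(True, status0), (False, "error msg")]}},
  # i.e. one (success, payload) pair per disk, in disk index order.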
2594 def _SshNodeSelector(group_uuid, all_nodes):
2595 """Create endless iterators for all potential SSH check hosts.
2598 nodes = [node for node in all_nodes
2599 if (node.group != group_uuid and
2601 keyfunc = operator.attrgetter("group")
2603 return map(itertools.cycle,
2604 [sorted(map(operator.attrgetter("name"), names))
2605 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2609 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2610 """Choose which nodes should talk to which other nodes.
2612 We will make nodes contact all nodes in their group, and one node from
2615 @warning: This algorithm has a known issue if one node group is much
2616 smaller than others (e.g. just one node). In such a case all other
2617 nodes will talk to the single node.
2620 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2621 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2623 return (online_nodes,
2624 dict((name, sorted([i.next() for i in sel]))
2625 for name in online_nodes))
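  # Illustrative example (hypothetical names): with the verified group
  # containing n1 and n2 and a second group containing only n3, this returns
  # (["n1", "n2"], {"n1": ["n3"], "n2": ["n3"]}); each online node of the
  # verified group is asked to contact one node, drawn round-robin via
  # itertools.cycle, from every other group.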
2627 def BuildHooksEnv(self):
2630 Cluster-Verify hooks are run only in the post phase; their failure is
2631 logged in the verify output and makes the verification fail.
2635 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2638 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2639 for node in self.my_node_info.values())
2643 def BuildHooksNodes(self):
2644 """Build hooks nodes.
2647 return ([], self.my_node_names)
2649 def Exec(self, feedback_fn):
2650 """Verify integrity of the node group, performing various test on nodes.
2653 # This method has too many local variables. pylint: disable=R0914
2654 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2656 if not self.my_node_names:
2658 feedback_fn("* Empty node group, skipping verification")
2662 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2663 verbose = self.op.verbose
2664 self._feedback_fn = feedback_fn
2666 vg_name = self.cfg.GetVGName()
2667 drbd_helper = self.cfg.GetDRBDHelper()
2668 cluster = self.cfg.GetClusterInfo()
2669 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2670 hypervisors = cluster.enabled_hypervisors
2671 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2673 i_non_redundant = [] # Non redundant instances
2674 i_non_a_balanced = [] # Non auto-balanced instances
2675 n_offline = 0 # Count of offline nodes
2676 n_drained = 0 # Count of nodes being drained
2677 node_vol_should = {}
2679 # FIXME: verify OS list
2682 filemap = _ComputeAncillaryFiles(cluster, False)
2684 # do local checksums
2685 master_node = self.master_node = self.cfg.GetMasterNode()
2686 master_ip = self.cfg.GetMasterIP()
2688 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2690 node_verify_param = {
2691 constants.NV_FILELIST:
2692 utils.UniqueSequence(filename
2693 for files in filemap
2694 for filename in files),
2695 constants.NV_NODELIST:
2696 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2697 self.all_node_info.values()),
2698 constants.NV_HYPERVISOR: hypervisors,
2699 constants.NV_HVPARAMS:
2700 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2701 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2702 for node in node_data_list
2703 if not node.offline],
2704 constants.NV_INSTANCELIST: hypervisors,
2705 constants.NV_VERSION: None,
2706 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2707 constants.NV_NODESETUP: None,
2708 constants.NV_TIME: None,
2709 constants.NV_MASTERIP: (master_node, master_ip),
2710 constants.NV_OSLIST: None,
2711 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2714 if vg_name is not None:
2715 node_verify_param[constants.NV_VGLIST] = None
2716 node_verify_param[constants.NV_LVLIST] = vg_name
2717 node_verify_param[constants.NV_PVLIST] = [vg_name]
2718 node_verify_param[constants.NV_DRBDLIST] = None
2721 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2724 # FIXME: this needs to be changed per node-group, not cluster-wide
2726 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2727 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2728 bridges.add(default_nicpp[constants.NIC_LINK])
2729 for instance in self.my_inst_info.values():
2730 for nic in instance.nics:
2731 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2732 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2733 bridges.add(full_nic[constants.NIC_LINK])
2736 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2738 # Build our expected cluster state
2739 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2741 vm_capable=node.vm_capable))
2742 for node in node_data_list)
2746 for node in self.all_node_info.values():
2747 path = _SupportsOob(self.cfg, node)
2748 if path and path not in oob_paths:
2749 oob_paths.append(path)
2752 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2754 for instance in self.my_inst_names:
2755 inst_config = self.my_inst_info[instance]
2757 for nname in inst_config.all_nodes:
2758 if nname not in node_image:
2759 gnode = self.NodeImage(name=nname)
2760 gnode.ghost = (nname not in self.all_node_info)
2761 node_image[nname] = gnode
2763 inst_config.MapLVsByNode(node_vol_should)
2765 pnode = inst_config.primary_node
2766 node_image[pnode].pinst.append(instance)
2768 for snode in inst_config.secondary_nodes:
2769 nimg = node_image[snode]
2770 nimg.sinst.append(instance)
2771 if pnode not in nimg.sbp:
2772 nimg.sbp[pnode] = []
2773 nimg.sbp[pnode].append(instance)
2775 # At this point, we have the in-memory data structures complete,
2776 # except for the runtime information, which we'll gather next
2778 # Due to the way our RPC system works, exact response times cannot be
2779 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2780 # time before and after executing the request, we can at least have a time
2782 nvinfo_starttime = time.time()
2783 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2785 self.cfg.GetClusterName())
2786 nvinfo_endtime = time.time()
2788 if self.extra_lv_nodes and vg_name is not None:
2790 self.rpc.call_node_verify(self.extra_lv_nodes,
2791 {constants.NV_LVLIST: vg_name},
2792 self.cfg.GetClusterName())
2794 extra_lv_nvinfo = {}
2796 all_drbd_map = self.cfg.ComputeDRBDMap()
2798 feedback_fn("* Gathering disk information (%s nodes)" %
2799 len(self.my_node_names))
2800 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2803 feedback_fn("* Verifying configuration file consistency")
2805 # If not all nodes are being checked, we need to make sure the master node
2806 # and a non-checked vm_capable node are in the list.
2807 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2809 vf_nvinfo = all_nvinfo.copy()
2810 vf_node_info = list(self.my_node_info.values())
2811 additional_nodes = []
2812 if master_node not in self.my_node_info:
2813 additional_nodes.append(master_node)
2814 vf_node_info.append(self.all_node_info[master_node])
2815 # Add the first vm_capable node we find which is not included
2816 for node in absent_nodes:
2817 nodeinfo = self.all_node_info[node]
2818 if nodeinfo.vm_capable and not nodeinfo.offline:
2819 additional_nodes.append(node)
2820 vf_node_info.append(self.all_node_info[node])
2822 key = constants.NV_FILELIST
2823 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2824 {key: node_verify_param[key]},
2825 self.cfg.GetClusterName()))
2827 vf_nvinfo = all_nvinfo
2828 vf_node_info = self.my_node_info.values()
2830 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2832 feedback_fn("* Verifying node status")
2836 for node_i in node_data_list:
2838 nimg = node_image[node]
2842 feedback_fn("* Skipping offline node %s" % (node,))
2846 if node == master_node:
2848 elif node_i.master_candidate:
2849 ntype = "master candidate"
2850 elif node_i.drained:
2856 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2858 msg = all_nvinfo[node].fail_msg
2859 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2861 nimg.rpc_fail = True
2864 nresult = all_nvinfo[node].payload
2866 nimg.call_ok = self._VerifyNode(node_i, nresult)
2867 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2868 self._VerifyNodeNetwork(node_i, nresult)
2869 self._VerifyOob(node_i, nresult)
2872 self._VerifyNodeLVM(node_i, nresult, vg_name)
2873 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2876 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2877 self._UpdateNodeInstances(node_i, nresult, nimg)
2878 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2879 self._UpdateNodeOS(node_i, nresult, nimg)
2881 if not nimg.os_fail:
2882 if refos_img is None:
2884 self._VerifyNodeOS(node_i, nimg, refos_img)
2885 self._VerifyNodeBridges(node_i, nresult, bridges)
2887 # Check whether all running instances are primary for the node. (This
2888 # can no longer be done from _VerifyInstance below, since some of the
2889 # wrong instances could be from other node groups.)
2890 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2892 for inst in non_primary_inst:
2893 test = inst in self.all_inst_info
2894 _ErrorIf(test, self.EINSTANCEWRONGNODE, inst,
2895 "instance should not run on node %s", node_i.name)
2896 _ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name,
2897 "node is running unknown instance %s", inst)
2899 for node, result in extra_lv_nvinfo.items():
2900 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2901 node_image[node], vg_name)
2903 feedback_fn("* Verifying instance status")
2904 for instance in self.my_inst_names:
2906 feedback_fn("* Verifying instance %s" % instance)
2907 inst_config = self.my_inst_info[instance]
2908 self._VerifyInstance(instance, inst_config, node_image,
2910 inst_nodes_offline = []
2912 pnode = inst_config.primary_node
2913 pnode_img = node_image[pnode]
2914 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2915 self.ENODERPC, pnode, "instance %s, connection to"
2916 " primary node failed", instance)
2918 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2919 self.EINSTANCEBADNODE, instance,
2920 "instance is marked as running and lives on offline node %s",
2921 inst_config.primary_node)
2923 # If the instance is non-redundant we cannot survive losing its primary
2924 # node, so we are not N+1 compliant. On the other hand we have no disk
2925 # templates with more than one secondary so that situation is not well
2927 # FIXME: does not support file-backed instances
2928 if not inst_config.secondary_nodes:
2929 i_non_redundant.append(instance)
2931 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2932 instance, "instance has multiple secondary nodes: %s",
2933 utils.CommaJoin(inst_config.secondary_nodes),
2934 code=self.ETYPE_WARNING)
2936 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2937 pnode = inst_config.primary_node
2938 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2939 instance_groups = {}
2941 for node in instance_nodes:
2942 instance_groups.setdefault(self.all_node_info[node].group,
2946 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2947 # Sort so that we always list the primary node first.
2948 for group, nodes in sorted(instance_groups.items(),
2949 key=lambda (_, nodes): pnode in nodes,
2952 self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS,
2953 instance, "instance has primary and secondary nodes in"
2954 " different groups: %s", utils.CommaJoin(pretty_list),
2955 code=self.ETYPE_WARNING)
2957 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2958 i_non_a_balanced.append(instance)
2960 for snode in inst_config.secondary_nodes:
2961 s_img = node_image[snode]
2962 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2963 "instance %s, connection to secondary node failed", instance)
2966 inst_nodes_offline.append(snode)
2968 # warn that the instance lives on offline nodes
2969 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2970 "instance has offline secondary node(s) %s",
2971 utils.CommaJoin(inst_nodes_offline))
2972 # ... or ghost/non-vm_capable nodes
2973 for node in inst_config.all_nodes:
2974 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2975 "instance lives on ghost node %s", node)
2976 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2977 instance, "instance lives on non-vm_capable node %s", node)
2979 feedback_fn("* Verifying orphan volumes")
2980 reserved = utils.FieldSet(*cluster.reserved_lvs)
2982 # We will get spurious "unknown volume" warnings if any node of this group
2983 # is secondary for an instance whose primary is in another group. To avoid
2984 # them, we find these instances and add their volumes to node_vol_should.
2985 for inst in self.all_inst_info.values():
2986 for secondary in inst.secondary_nodes:
2987 if (secondary in self.my_node_info
2988 and inst.name not in self.my_inst_info):
2989 inst.MapLVsByNode(node_vol_should)
2992 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2994 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2995 feedback_fn("* Verifying N+1 Memory redundancy")
2996 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2998 feedback_fn("* Other Notes")
3000 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
3001 % len(i_non_redundant))
3003 if i_non_a_balanced:
3004 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
3005 % len(i_non_a_balanced))
3008 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
3011 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
3015 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
3016 """Analyze the post-hooks' result
3018 This method analyses the hook result, handles it, and sends some
3019 nicely-formatted feedback back to the user.
3021 @param phase: one of L{constants.HOOKS_PHASE_POST} or
3022 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
3023 @param hooks_results: the results of the multi-node hooks rpc call
3024 @param feedback_fn: function used to send feedback back to the caller
3025 @param lu_result: previous Exec result
3026 @return: the new Exec result, based on the previous result
3030 # We only really run POST phase hooks, only for non-empty groups,
3031 # and are only interested in their results
3032 if not self.my_node_names:
3035 elif phase == constants.HOOKS_PHASE_POST:
3036 # Used to change hooks' output to proper indentation
3037 feedback_fn("* Hooks Results")
3038 assert hooks_results, "invalid result from hooks"
3040 for node_name in hooks_results:
3041 res = hooks_results[node_name]
3043 test = msg and not res.offline
3044 self._ErrorIf(test, self.ENODEHOOKS, node_name,
3045 "Communication failure in hooks execution: %s", msg)
3046 if res.offline or msg:
3047 # No need to investigate payload if node is offline or gave
3050 for script, hkr, output in res.payload:
3051 test = hkr == constants.HKR_FAIL
3052 self._ErrorIf(test, self.ENODEHOOKS, node_name,
3053 "Script %s failed, output:", script)
3055 output = self._HOOKS_INDENT_RE.sub(" ", output)
3056 feedback_fn("%s" % output)
3062 class LUClusterVerifyDisks(NoHooksLU):
3063 """Verifies the cluster disks status.
3068 def ExpandNames(self):
3069 self.share_locks = _ShareAll()
3070 self.needed_locks = {
3071 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3074 def Exec(self, feedback_fn):
3075 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3077 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3078 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3079 for group in group_names])
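  # Note: each node group gets its own single-opcode job here, so the disk
  # verification of different groups can run independently of each other.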
3082 class LUGroupVerifyDisks(NoHooksLU):
3083 """Verifies the status of all disks in a node group.
3088 def ExpandNames(self):
3089 # Raises errors.OpPrereqError on its own if group can't be found
3090 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3092 self.share_locks = _ShareAll()
3093 self.needed_locks = {
3094 locking.LEVEL_INSTANCE: [],
3095 locking.LEVEL_NODEGROUP: [],
3096 locking.LEVEL_NODE: [],
3099 def DeclareLocks(self, level):
3100 if level == locking.LEVEL_INSTANCE:
3101 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3103 # Lock instances optimistically, needs verification once node and group
3104 # locks have been acquired
3105 self.needed_locks[locking.LEVEL_INSTANCE] = \
3106 self.cfg.GetNodeGroupInstances(self.group_uuid)
3108 elif level == locking.LEVEL_NODEGROUP:
3109 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3111 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3112 set([self.group_uuid] +
3113 # Lock all groups used by instances optimistically; this requires
3114 # going via the node before it's locked, requiring verification
3117 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3118 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3120 elif level == locking.LEVEL_NODE:
3121 # This will only lock the nodes in the group to be verified which contain
3123 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3124 self._LockInstancesNodes()
3126 # Lock all nodes in group to be verified
3127 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3128 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3129 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3131 def CheckPrereq(self):
3132 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3133 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3134 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3136 assert self.group_uuid in owned_groups
3138 # Check if locked instances are still correct
3139 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3141 # Get instance information
3142 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3144 # Check if node groups for locked instances are still correct
3145 _CheckInstancesNodeGroups(self.cfg, self.instances,
3146 owned_groups, owned_nodes, self.group_uuid)
3148 def Exec(self, feedback_fn):
3149 """Verify integrity of cluster disks.
3151 @rtype: tuple of three items
3152 @return: a tuple of (dict of node-to-node_error, list of instances
3153 which need activate-disks, dict of instance: (node, volume) for
3158 res_instances = set()
3161 nv_dict = _MapInstanceDisksToNodes([inst
3162 for inst in self.instances.values()
3166 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3167 set(self.cfg.GetVmCapableNodeList()))
3169 node_lvs = self.rpc.call_lv_list(nodes, [])
3171 for (node, node_res) in node_lvs.items():
3172 if node_res.offline:
3175 msg = node_res.fail_msg
3177 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3178 res_nodes[node] = msg
3181 for lv_name, (_, _, lv_online) in node_res.payload.items():
3182 inst = nv_dict.pop((node, lv_name), None)
3183 if not (lv_online or inst is None):
3184 res_instances.add(inst)
3186 # any leftover items in nv_dict are missing LVs, let's arrange the data
3188 for key, inst in nv_dict.iteritems():
3189 res_missing.setdefault(inst, []).append(list(key))
3191 return (res_nodes, list(res_instances), res_missing)
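  # Illustrative shape of the result (hypothetical names):
  #   ({"node1": "Error enumerating LVs ..."},      # per-node errors
  #    ["inst2"],                                   # need activate-disks
  #    {"inst3": [["node1", "xenvg/disk0"]]})       # missing (node, volume)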
3194 class LUClusterRepairDiskSizes(NoHooksLU):
3195 """Verifies the cluster disks sizes.
3200 def ExpandNames(self):
3201 if self.op.instances:
3202 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3203 self.needed_locks = {
3204 locking.LEVEL_NODE: [],
3205 locking.LEVEL_INSTANCE: self.wanted_names,
3207 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3209 self.wanted_names = None
3210 self.needed_locks = {
3211 locking.LEVEL_NODE: locking.ALL_SET,
3212 locking.LEVEL_INSTANCE: locking.ALL_SET,
3214 self.share_locks = {
3215 locking.LEVEL_NODE: 1,
3216 locking.LEVEL_INSTANCE: 0,
3219 def DeclareLocks(self, level):
3220 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3221 self._LockInstancesNodes(primary_only=True)
3223 def CheckPrereq(self):
3224 """Check prerequisites.
3226 This only checks the optional instance list against the existing names.
3229 if self.wanted_names is None:
3230 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3232 self.wanted_instances = \
3233 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3235 def _EnsureChildSizes(self, disk):
3236 """Ensure children of the disk have the needed disk size.
3238 This is valid mainly for DRBD8 and fixes an issue where the
3239 children have smaller disk size.
3241 @param disk: an L{ganeti.objects.Disk} object
3244 if disk.dev_type == constants.LD_DRBD8:
3245 assert disk.children, "Empty children for DRBD8?"
3246 fchild = disk.children[0]
3247 mismatch = fchild.size < disk.size
3249 self.LogInfo("Child disk has size %d, parent %d, fixing",
3250 fchild.size, disk.size)
3251 fchild.size = disk.size
3253 # and we recurse on this child only, not on the metadev
3254 return self._EnsureChildSizes(fchild) or mismatch
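  # Illustrative example (hypothetical sizes): a DRBD8 disk of size 10240
  # whose data child is recorded with size 10236 gets the child grown to
  # 10240 and True returned, telling the caller to write the config back.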
3258 def Exec(self, feedback_fn):
3259 """Verify the size of cluster disks.
3262 # TODO: check child disks too
3263 # TODO: check differences in size between primary/secondary nodes
3265 for instance in self.wanted_instances:
3266 pnode = instance.primary_node
3267 if pnode not in per_node_disks:
3268 per_node_disks[pnode] = []
3269 for idx, disk in enumerate(instance.disks):
3270 per_node_disks[pnode].append((instance, idx, disk))
3273 for node, dskl in per_node_disks.items():
3274 newl = [v[2].Copy() for v in dskl]
3276 self.cfg.SetDiskID(dsk, node)
3277 result = self.rpc.call_blockdev_getsize(node, newl)
3279 self.LogWarning("Failure in blockdev_getsize call to node"
3280 " %s, ignoring", node)
3282 if len(result.payload) != len(dskl):
3283 logging.warning("Invalid result from node %s: len(dskl)=%d,"
3284 " result.payload=%s", node, len(dskl), result.payload)
3285 self.LogWarning("Invalid result from node %s, ignoring node results",
3288 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3290 self.LogWarning("Disk %d of instance %s did not return size"
3291 " information, ignoring", idx, instance.name)
3293 if not isinstance(size, (int, long)):
3294 self.LogWarning("Disk %d of instance %s did not return valid"
3295 " size information, ignoring", idx, instance.name)
3298 if size != disk.size:
3299 self.LogInfo("Disk %d of instance %s has mismatched size,"
3300 " correcting: recorded %d, actual %d", idx,
3301 instance.name, disk.size, size)
3303 self.cfg.Update(instance, feedback_fn)
3304 changed.append((instance.name, idx, size))
3305 if self._EnsureChildSizes(disk):
3306 self.cfg.Update(instance, feedback_fn)
3307 changed.append((instance.name, idx, disk.size))
3311 class LUClusterRename(LogicalUnit):
3312 """Rename the cluster.
3315 HPATH = "cluster-rename"
3316 HTYPE = constants.HTYPE_CLUSTER
3318 def BuildHooksEnv(self):
3323 "OP_TARGET": self.cfg.GetClusterName(),
3324 "NEW_NAME": self.op.name,
3327 def BuildHooksNodes(self):
3328 """Build hooks nodes.
3331 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3333 def CheckPrereq(self):
3334 """Verify that the passed name is a valid one.
3337 hostname = netutils.GetHostname(name=self.op.name,
3338 family=self.cfg.GetPrimaryIPFamily())
3340 new_name = hostname.name
3341 self.ip = new_ip = hostname.ip
3342 old_name = self.cfg.GetClusterName()
3343 old_ip = self.cfg.GetMasterIP()
3344 if new_name == old_name and new_ip == old_ip:
3345 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3346 " cluster has changed",
3348 if new_ip != old_ip:
3349 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3350 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3351 " reachable on the network" %
3352 new_ip, errors.ECODE_NOTUNIQUE)
3354 self.op.name = new_name
3356 def Exec(self, feedback_fn):
3357 """Rename the cluster.
3360 clustername = self.op.name
3363 # shutdown the master IP
3364 master = self.cfg.GetMasterNode()
3365 result = self.rpc.call_node_deactivate_master_ip(master)
3366 result.Raise("Could not disable the master role")
3369 cluster = self.cfg.GetClusterInfo()
3370 cluster.cluster_name = clustername
3371 cluster.master_ip = ip
3372 self.cfg.Update(cluster, feedback_fn)
3374 # update the known hosts file
3375 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3376 node_list = self.cfg.GetOnlineNodeList()
3378 node_list.remove(master)
3381 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3383 result = self.rpc.call_node_activate_master_ip(master)
3384 msg = result.fail_msg
3386 self.LogWarning("Could not re-enable the master role on"
3387 " the master, please restart manually: %s", msg)
3392 class LUClusterSetParams(LogicalUnit):
3393 """Change the parameters of the cluster.
3396 HPATH = "cluster-modify"
3397 HTYPE = constants.HTYPE_CLUSTER
3400 def CheckArguments(self):
3404 if self.op.uid_pool:
3405 uidpool.CheckUidPool(self.op.uid_pool)
3407 if self.op.add_uids:
3408 uidpool.CheckUidPool(self.op.add_uids)
3410 if self.op.remove_uids:
3411 uidpool.CheckUidPool(self.op.remove_uids)
3413 def ExpandNames(self):
3414 # FIXME: in the future maybe other cluster params won't require checking on
3415 # all nodes to be modified.
3416 self.needed_locks = {
3417 locking.LEVEL_NODE: locking.ALL_SET,
3419 self.share_locks[locking.LEVEL_NODE] = 1
3421 def BuildHooksEnv(self):
3426 "OP_TARGET": self.cfg.GetClusterName(),
3427 "NEW_VG_NAME": self.op.vg_name,
3430 def BuildHooksNodes(self):
3431 """Build hooks nodes.
3434 mn = self.cfg.GetMasterNode()
3437 def CheckPrereq(self):
3438 """Check prerequisites.
3440 This checks whether the given params don't conflict and
3441 if the given volume group is valid.
3444 if self.op.vg_name is not None and not self.op.vg_name:
3445 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3446 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3447 " instances exist", errors.ECODE_INVAL)
3449 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3450 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3451 raise errors.OpPrereqError("Cannot disable drbd helper while"
3452 " drbd-based instances exist",
3455 node_list = self.owned_locks(locking.LEVEL_NODE)
3457 # if vg_name not None, checks given volume group on all nodes
3459 vglist = self.rpc.call_vg_list(node_list)
3460 for node in node_list:
3461 msg = vglist[node].fail_msg
3463 # ignoring down node
3464 self.LogWarning("Error while gathering data on node %s"
3465 " (ignoring node): %s", node, msg)
3467 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3469 constants.MIN_VG_SIZE)
3471 raise errors.OpPrereqError("Error on node '%s': %s" %
3472 (node, vgstatus), errors.ECODE_ENVIRON)
3474 if self.op.drbd_helper:
3475 # checks given drbd helper on all nodes
3476 helpers = self.rpc.call_drbd_helper(node_list)
3477 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3479 self.LogInfo("Not checking drbd helper on offline node %s", node)
3481 msg = helpers[node].fail_msg
3483 raise errors.OpPrereqError("Error checking drbd helper on node"
3484 " '%s': %s" % (node, msg),
3485 errors.ECODE_ENVIRON)
3486 node_helper = helpers[node].payload
3487 if node_helper != self.op.drbd_helper:
3488 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3489 (node, node_helper), errors.ECODE_ENVIRON)
3491 self.cluster = cluster = self.cfg.GetClusterInfo()
3492 # validate params changes
3493 if self.op.beparams:
3494 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3495 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3497 if self.op.ndparams:
3498 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3499 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3501 # TODO: we need a more general way to handle resetting
3502 # cluster-level parameters to default values
3503 if self.new_ndparams["oob_program"] == "":
3504 self.new_ndparams["oob_program"] = \
3505 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3507 if self.op.nicparams:
3508 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3509 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3510 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3513 # check all instances for consistency
3514 for instance in self.cfg.GetAllInstancesInfo().values():
3515 for nic_idx, nic in enumerate(instance.nics):
3516 params_copy = copy.deepcopy(nic.nicparams)
3517 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3519 # check parameter syntax
3521 objects.NIC.CheckParameterSyntax(params_filled)
3522 except errors.ConfigurationError, err:
3523 nic_errors.append("Instance %s, nic/%d: %s" %
3524 (instance.name, nic_idx, err))
3526 # if we're moving instances to routed, check that they have an ip
3527 target_mode = params_filled[constants.NIC_MODE]
3528 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3529 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3530 " address" % (instance.name, nic_idx))
3532 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3533 "\n".join(nic_errors))
3535 # hypervisor list/parameters
3536 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3537 if self.op.hvparams:
3538 for hv_name, hv_dict in self.op.hvparams.items():
3539 if hv_name not in self.new_hvparams:
3540 self.new_hvparams[hv_name] = hv_dict
3542 self.new_hvparams[hv_name].update(hv_dict)
3544 # os hypervisor parameters
3545 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3547 for os_name, hvs in self.op.os_hvp.items():
3548 if os_name not in self.new_os_hvp:
3549 self.new_os_hvp[os_name] = hvs
3551 for hv_name, hv_dict in hvs.items():
3552 if hv_name not in self.new_os_hvp[os_name]:
3553 self.new_os_hvp[os_name][hv_name] = hv_dict
3555 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3558 self.new_osp = objects.FillDict(cluster.osparams, {})
3559 if self.op.osparams:
3560 for os_name, osp in self.op.osparams.items():
3561 if os_name not in self.new_osp:
3562 self.new_osp[os_name] = {}
3564 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3567 if not self.new_osp[os_name]:
3568 # we removed all parameters
3569 del self.new_osp[os_name]
3571 # check the parameter validity (remote check)
3572 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3573 os_name, self.new_osp[os_name])
3575 # changes to the hypervisor list
3576 if self.op.enabled_hypervisors is not None:
3577 self.hv_list = self.op.enabled_hypervisors
3578 for hv in self.hv_list:
3579 # if the hypervisor doesn't already exist in the cluster
3580 # hvparams, we initialize it to empty, and then (in both
3581 # cases) we make sure to fill the defaults, as we might not
3582 # have a complete defaults list if the hypervisor wasn't
3584 if hv not in new_hvp:
3586 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3587 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3589 self.hv_list = cluster.enabled_hypervisors
3591 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3592 # either the enabled list has changed, or the parameters have, validate
3593 for hv_name, hv_params in self.new_hvparams.items():
3594 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3595 (self.op.enabled_hypervisors and
3596 hv_name in self.op.enabled_hypervisors)):
3597 # either this is a new hypervisor, or its parameters have changed
3598 hv_class = hypervisor.GetHypervisor(hv_name)
3599 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3600 hv_class.CheckParameterSyntax(hv_params)
3601 _CheckHVParams(self, node_list, hv_name, hv_params)
3604 # no need to check any newly-enabled hypervisors, since the
3605 # defaults have already been checked in the above code-block
3606 for os_name, os_hvp in self.new_os_hvp.items():
3607 for hv_name, hv_params in os_hvp.items():
3608 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3609 # we need to fill in the new os_hvp on top of the actual hv_p
3610 cluster_defaults = self.new_hvparams.get(hv_name, {})
3611 new_osp = objects.FillDict(cluster_defaults, hv_params)
3612 hv_class = hypervisor.GetHypervisor(hv_name)
3613 hv_class.CheckParameterSyntax(new_osp)
3614 _CheckHVParams(self, node_list, hv_name, new_osp)
3616 if self.op.default_iallocator:
3617 alloc_script = utils.FindFile(self.op.default_iallocator,
3618 constants.IALLOCATOR_SEARCH_PATH,
3620 if alloc_script is None:
3621 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3622 " specified" % self.op.default_iallocator,
3625 def Exec(self, feedback_fn):
3626 """Change the parameters of the cluster.
3629 if self.op.vg_name is not None:
3630 new_volume = self.op.vg_name
3633 if new_volume != self.cfg.GetVGName():
3634 self.cfg.SetVGName(new_volume)
3636 feedback_fn("Cluster LVM configuration already in desired"
3637 " state, not changing")
3638 if self.op.drbd_helper is not None:
3639 new_helper = self.op.drbd_helper
3642 if new_helper != self.cfg.GetDRBDHelper():
3643 self.cfg.SetDRBDHelper(new_helper)
3645 feedback_fn("Cluster DRBD helper already in desired state,"
3647 if self.op.hvparams:
3648 self.cluster.hvparams = self.new_hvparams
3650 self.cluster.os_hvp = self.new_os_hvp
3651 if self.op.enabled_hypervisors is not None:
3652 self.cluster.hvparams = self.new_hvparams
3653 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3654 if self.op.beparams:
3655 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3656 if self.op.nicparams:
3657 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3658 if self.op.osparams:
3659 self.cluster.osparams = self.new_osp
3660 if self.op.ndparams:
3661 self.cluster.ndparams = self.new_ndparams
3663 if self.op.candidate_pool_size is not None:
3664 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3665 # we need to update the pool size here, otherwise the save will fail
3666 _AdjustCandidatePool(self, [])
3668 if self.op.maintain_node_health is not None:
3669 self.cluster.maintain_node_health = self.op.maintain_node_health
3671 if self.op.prealloc_wipe_disks is not None:
3672 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3674 if self.op.add_uids is not None:
3675 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3677 if self.op.remove_uids is not None:
3678 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3680 if self.op.uid_pool is not None:
3681 self.cluster.uid_pool = self.op.uid_pool
3683 if self.op.default_iallocator is not None:
3684 self.cluster.default_iallocator = self.op.default_iallocator
3686 if self.op.reserved_lvs is not None:
3687 self.cluster.reserved_lvs = self.op.reserved_lvs
3689 def helper_os(aname, mods, desc):
3691 lst = getattr(self.cluster, aname)
3692 for key, val in mods:
3693 if key == constants.DDM_ADD:
3695 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3698 elif key == constants.DDM_REMOVE:
3702 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3704 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3706 if self.op.hidden_os:
3707 helper_os("hidden_os", self.op.hidden_os, "hidden")
3709 if self.op.blacklisted_os:
3710 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3712 if self.op.master_netdev:
3713 master = self.cfg.GetMasterNode()
3714 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3715 self.cluster.master_netdev)
3716 result = self.rpc.call_node_deactivate_master_ip(master)
3717 result.Raise("Could not disable the master ip")
3718 feedback_fn("Changing master_netdev from %s to %s" %
3719 (self.cluster.master_netdev, self.op.master_netdev))
3720 self.cluster.master_netdev = self.op.master_netdev
3722 self.cfg.Update(self.cluster, feedback_fn)
3724 if self.op.master_netdev:
3725 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3726 self.op.master_netdev)
3727 result = self.rpc.call_node_activate_master_ip(master)
3729 self.LogWarning("Could not re-enable the master ip on"
3730 " the master, please restart manually: %s",
3734 def _UploadHelper(lu, nodes, fname):
3735 """Helper for uploading a file and showing warnings.
3738 if os.path.exists(fname):
3739 result = lu.rpc.call_upload_file(nodes, fname)
3740 for to_node, to_result in result.items():
3741 msg = to_result.fail_msg
3743 msg = ("Copy of file %s to node %s failed: %s" %
3744 (fname, to_node, msg))
3745 lu.proc.LogWarning(msg)
3748 def _ComputeAncillaryFiles(cluster, redist):
3749 """Compute files external to Ganeti which need to be consistent.
3751 @type redist: boolean
3752 @param redist: Whether to include files which need to be redistributed
3755 # Compute files for all nodes
3757 constants.SSH_KNOWN_HOSTS_FILE,
3758 constants.CONFD_HMAC_KEY,
3759 constants.CLUSTER_DOMAIN_SECRET_FILE,
3760 constants.RAPI_USERS_FILE,
3764 files_all.update(constants.ALL_CERT_FILES)
3765 files_all.update(ssconf.SimpleStore().GetFileList())
3767 # we need to ship at least the RAPI certificate
3768 files_all.add(constants.RAPI_CERT_FILE)
3770 if cluster.modify_etc_hosts:
3771 files_all.add(constants.ETC_HOSTS)
3773 # Files which are optional, these must:
3774 # - be present in one other category as well
3775 # - either exist or not exist on all nodes of that category (mc, vm, all)
3777 constants.RAPI_USERS_FILE,
3780 # Files which should only be on master candidates
3783 files_mc.add(constants.CLUSTER_CONF_FILE)
3785 # Files which should only be on VM-capable nodes
3786 files_vm = set(filename
3787 for hv_name in cluster.enabled_hypervisors
3788 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3790 files_opt |= set(filename
3791 for hv_name in cluster.enabled_hypervisors
3792 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3794 # Filenames in each category must be unique
3795 all_files_set = files_all | files_mc | files_vm
3796 assert (len(all_files_set) ==
3797 sum(map(len, [files_all, files_mc, files_vm]))), \
3798 "Found file listed in more than one file list"
3800 # Optional files must be present in one other category
3801 assert all_files_set.issuperset(files_opt), \
3802 "Optional file not in a different required list"
3804 return (files_all, files_opt, files_mc, files_vm)
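# Sketch of typical use (added, hedged): callers unpack the returned tuple
# and pair each file set with the node list it applies to, much like
# _RedistributeAncillaryFiles below. The node lists here are placeholders:
#
#   (files_all, _, files_mc, files_vm) = _ComputeAncillaryFiles(cluster, True)
#   filemap = [
#     (online_nodes, files_all),
#     (vm_nodes, files_vm),
#     ]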
3807 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3808 """Distribute additional files which are part of the cluster configuration.
3810 ConfigWriter takes care of distributing the config and ssconf files, but
3811 there are more files which should be distributed to all nodes. This function
3812 makes sure those are copied.
3814 @param lu: calling logical unit
3815 @param additional_nodes: list of nodes not in the config to distribute to
3816 @type additional_vm: boolean
3817 @param additional_vm: whether the additional nodes are vm-capable or not
3820 # Gather target nodes
3821 cluster = lu.cfg.GetClusterInfo()
3822 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3824 online_nodes = lu.cfg.GetOnlineNodeList()
3825 vm_nodes = lu.cfg.GetVmCapableNodeList()
3827 if additional_nodes is not None:
3828 online_nodes.extend(additional_nodes)
3830 vm_nodes.extend(additional_nodes)
3832 # Never distribute to master node
3833 for nodelist in [online_nodes, vm_nodes]:
3834 if master_info.name in nodelist:
3835 nodelist.remove(master_info.name)
3838 (files_all, _, files_mc, files_vm) = \
3839 _ComputeAncillaryFiles(cluster, True)
3841 # Never re-distribute configuration file from here
3842 assert not (constants.CLUSTER_CONF_FILE in files_all or
3843 constants.CLUSTER_CONF_FILE in files_vm)
3844 assert not files_mc, "Master candidates not handled in this function"
3847 (online_nodes, files_all),
3848 (vm_nodes, files_vm),
3852 for (node_list, files) in filemap:
3854 _UploadHelper(lu, node_list, fname)
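# Usage note (added): LUs that change cluster-wide state typically call this
# right after saving the configuration, as LUClusterRedistConf.Exec below
# does:
#
#   self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
#   _RedistributeAncillaryFiles(self)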
3857 class LUClusterRedistConf(NoHooksLU):
3858 """Force the redistribution of cluster configuration.
3860 This is a very simple LU.
3865 def ExpandNames(self):
3866 self.needed_locks = {
3867 locking.LEVEL_NODE: locking.ALL_SET,
3869 self.share_locks[locking.LEVEL_NODE] = 1
3871 def Exec(self, feedback_fn):
3872 """Redistribute the configuration.
3875 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3876 _RedistributeAncillaryFiles(self)
3879 class LUClusterActivateMasterIp(NoHooksLU):
3880 """Activate the master IP on the master node.
3883 def Exec(self, feedback_fn):
3884 """Activate the master IP.
3887 master = self.cfg.GetMasterNode()
3888 result = self.rpc.call_node_activate_master_ip(master)
3889 result.Raise("Could not activate the master IP")
3892 class LUClusterDeactivateMasterIp(NoHooksLU):
3893 """Deactivate the master IP on the master node.
3896 def Exec(self, feedback_fn):
3897 """Deactivate the master IP.
3900 master = self.cfg.GetMasterNode()
3901 result = self.rpc.call_node_deactivate_master_ip(master)
3902 result.Raise("Could not deactivate the master IP")
3905 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3906 """Sleep and poll for an instance's disk to sync.
3909 if not instance.disks or disks is not None and not disks:
3912 disks = _ExpandCheckDisks(instance, disks)
3915 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3917 node = instance.primary_node
3920 lu.cfg.SetDiskID(dev, node)
3922 # TODO: Convert to utils.Retry
3925 degr_retries = 10 # in seconds, as we sleep 1 second each time
3929 cumul_degraded = False
3930 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3931 msg = rstats.fail_msg
3933 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3936 raise errors.RemoteError("Can't contact node %s for mirror data,"
3937 " aborting." % node)
3940 rstats = rstats.payload
3942 for i, mstat in enumerate(rstats):
3944 lu.LogWarning("Can't compute data for node %s/%s",
3945 node, disks[i].iv_name)
3948 cumul_degraded = (cumul_degraded or
3949 (mstat.is_degraded and mstat.sync_percent is None))
3950 if mstat.sync_percent is not None:
3952 if mstat.estimated_time is not None:
3953 rem_time = ("%s remaining (estimated)" %
3954 utils.FormatSeconds(mstat.estimated_time))
3955 max_time = mstat.estimated_time
3957 rem_time = "no time estimate"
3958 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3959 (disks[i].iv_name, mstat.sync_percent, rem_time))
3961 # if we're done but degraded, let's do a few small retries, to
3962 # make sure we see a stable and not transient situation; therefore
3963 # we force restart of the loop
3964 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3965 logging.info("Degraded disks found, %d retries left", degr_retries)
3973 time.sleep(min(60, max_time))
3976 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3977 return not cumul_degraded
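# Caller-side sketch (added, illustrative only): the boolean return value
# reports whether the disks ended up fully synced, so a caller inside an LU
# might do:
#
#   if not _WaitForSync(self, instance):
#       raise errors.OpExecError("Some disks of instance %s are degraded" %
#                                instance.name)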
3980 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3981 """Check that mirrors are not degraded.
3983 The ldisk parameter, if True, will change the test from the
3984 is_degraded attribute (which represents overall non-ok status for
3985 the device(s)) to the ldisk (representing the local storage status).
3988 lu.cfg.SetDiskID(dev, node)
3992 if on_primary or dev.AssembleOnSecondary():
3993 rstats = lu.rpc.call_blockdev_find(node, dev)
3994 msg = rstats.fail_msg
3996 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3998 elif not rstats.payload:
3999 lu.LogWarning("Can't find disk on node %s", node)
4003 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4005 result = result and not rstats.payload.is_degraded
4008 for child in dev.children:
4009 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
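# Illustrative call (added, with hypothetical "dev"/"node" values): with
# ldisk=False the overall is_degraded flag is tested, with ldisk=True only
# the local storage status is considered:
#
#   if not _CheckDiskConsistency(self, dev, node, False, ldisk=True):
#       raise errors.OpExecError("Disk %s is degraded on node %s" %
#                                (dev.iv_name, node))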
4014 class LUOobCommand(NoHooksLU):
4015 """Logical unit for OOB handling.
4019 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4021 def ExpandNames(self):
4022 """Gather locks we need.
4025 if self.op.node_names:
4026 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4027 lock_names = self.op.node_names
4029 lock_names = locking.ALL_SET
4031 self.needed_locks = {
4032 locking.LEVEL_NODE: lock_names,
4035 def CheckPrereq(self):
4036 """Check prerequisites.
4039 - the node exists in the configuration
4042 Any errors are signaled by raising errors.OpPrereqError.
4046 self.master_node = self.cfg.GetMasterNode()
4048 assert self.op.power_delay >= 0.0
4050 if self.op.node_names:
4051 if (self.op.command in self._SKIP_MASTER and
4052 self.master_node in self.op.node_names):
4053 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4054 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4056 if master_oob_handler:
4057 additional_text = ("run '%s %s %s' if you want to operate on the"
4058 " master regardless") % (master_oob_handler,
4062 additional_text = "it does not support out-of-band operations"
4064 raise errors.OpPrereqError(("Operating on the master node %s is not"
4065 " allowed for %s; %s") %
4066 (self.master_node, self.op.command,
4067 additional_text), errors.ECODE_INVAL)
4069 self.op.node_names = self.cfg.GetNodeList()
4070 if self.op.command in self._SKIP_MASTER:
4071 self.op.node_names.remove(self.master_node)
4073 if self.op.command in self._SKIP_MASTER:
4074 assert self.master_node not in self.op.node_names
4076 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4078 raise errors.OpPrereqError("Node %s not found" % node_name,
4081 self.nodes.append(node)
4083 if (not self.op.ignore_status and
4084 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4085 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4086 " not marked offline") % node_name,
4089 def Exec(self, feedback_fn):
4090 """Execute OOB and return result if we expect any.
4093 master_node = self.master_node
4096 for idx, node in enumerate(utils.NiceSort(self.nodes,
4097 key=lambda node: node.name)):
4098 node_entry = [(constants.RS_NORMAL, node.name)]
4099 ret.append(node_entry)
4101 oob_program = _SupportsOob(self.cfg, node)
4104 node_entry.append((constants.RS_UNAVAIL, None))
4107 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4108 self.op.command, oob_program, node.name)
4109 result = self.rpc.call_run_oob(master_node, oob_program,
4110 self.op.command, node.name,
4114 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4115 node.name, result.fail_msg)
4116 node_entry.append((constants.RS_NODATA, None))
4119 self._CheckPayload(result)
4120 except errors.OpExecError, err:
4121 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4123 node_entry.append((constants.RS_NODATA, None))
4125 if self.op.command == constants.OOB_HEALTH:
4126 # For health we should log important events
4127 for item, status in result.payload:
4128 if status in [constants.OOB_STATUS_WARNING,
4129 constants.OOB_STATUS_CRITICAL]:
4130 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4131 item, node.name, status)
4133 if self.op.command == constants.OOB_POWER_ON:
4135 elif self.op.command == constants.OOB_POWER_OFF:
4136 node.powered = False
4137 elif self.op.command == constants.OOB_POWER_STATUS:
4138 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4139 if powered != node.powered:
4140 logging.warning(("Recorded power state (%s) of node '%s' does not"
4141 " match actual power state (%s)"), node.powered,
4144 # For configuration changing commands we should update the node
4145 if self.op.command in (constants.OOB_POWER_ON,
4146 constants.OOB_POWER_OFF):
4147 self.cfg.Update(node, feedback_fn)
4149 node_entry.append((constants.RS_NORMAL, result.payload))
4151 if (self.op.command == constants.OOB_POWER_ON and
4152 idx < len(self.nodes) - 1):
4153 time.sleep(self.op.power_delay)
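# Note on the result shape (added, derived from the code above): "ret" holds
# one list per node, each containing (status, data) tuples. For a successful
# power-status query on a hypothetical node it would look like:
#
#   [[(constants.RS_NORMAL, "node1.example.com"),
#     (constants.RS_NORMAL, {constants.OOB_POWER_STATUS_POWERED: True})]]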
4157 def _CheckPayload(self, result):
4158 """Checks if the payload is valid.
4160 @param result: RPC result
4161 @raises errors.OpExecError: If payload is not valid
4165 if self.op.command == constants.OOB_HEALTH:
4166 if not isinstance(result.payload, list):
4167 errs.append("command 'health' is expected to return a list but got %s" %
4168 type(result.payload))
4170 for item, status in result.payload:
4171 if status not in constants.OOB_STATUSES:
4172 errs.append("health item '%s' has invalid status '%s'" %
4175 if self.op.command == constants.OOB_POWER_STATUS:
4176 if not isinstance(result.payload, dict):
4177 errs.append("power-status is expected to return a dict but got %s" %
4178 type(result.payload))
4180 if self.op.command in [
4181 constants.OOB_POWER_ON,
4182 constants.OOB_POWER_OFF,
4183 constants.OOB_POWER_CYCLE,
4185 if result.payload is not None:
4186 errs.append("%s is expected to not return payload but got '%s'" %
4187 (self.op.command, result.payload))
4190 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4191 utils.CommaJoin(errs))
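# Summary of the checks above (added comment): the expected payload depends
# on the OOB command:
#   health             -> list of (item, status) pairs, status in OOB_STATUSES
#   power-status       -> dict containing OOB_POWER_STATUS_POWERED
#   power-on/off/cycle -> no payload at all (None)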
4194 class _OsQuery(_QueryBase):
4195 FIELDS = query.OS_FIELDS
4197 def ExpandNames(self, lu):
4198 # Lock all nodes in shared mode
4199 # Temporary removal of locks, should be reverted later
4200 # TODO: reintroduce locks when they are lighter-weight
4201 lu.needed_locks = {}
4202 #self.share_locks[locking.LEVEL_NODE] = 1
4203 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4205 # The following variables interact with _QueryBase._GetNames
4207 self.wanted = self.names
4209 self.wanted = locking.ALL_SET
4211 self.do_locking = self.use_locking
4213 def DeclareLocks(self, lu, level):
4217 def _DiagnoseByOS(rlist):
4218 """Remaps a per-node return list into an a per-os per-node dictionary
4220 @param rlist: a map with node names as keys and OS objects as values
4223 @return: a dictionary with osnames as keys and as value another
4224 map, with nodes as keys and tuples of (path, status, diagnose,
4225 variants, parameters, api_versions) as values, eg::
4227 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4228 (/srv/..., False, "invalid api")],
4229 "node2": [(/srv/..., True, "", [], [])]}
4234 # we build here the list of nodes that didn't fail the RPC (at RPC
4235 # level), so that nodes with a non-responding node daemon don't
4236 # make all OSes invalid
4237 good_nodes = [node_name for node_name in rlist
4238 if not rlist[node_name].fail_msg]
4239 for node_name, nr in rlist.items():
4240 if nr.fail_msg or not nr.payload:
4242 for (name, path, status, diagnose, variants,
4243 params, api_versions) in nr.payload:
4244 if name not in all_os:
4245 # build a list of nodes for this os containing empty lists
4246 # for each node in node_list
4248 for nname in good_nodes:
4249 all_os[name][nname] = []
4250 # convert params from [name, help] to (name, help)
4251 params = [tuple(v) for v in params]
4252 all_os[name][node_name].append((path, status, diagnose,
4253 variants, params, api_versions))
4256 def _GetQueryData(self, lu):
4257 """Computes the list of nodes and their attributes.
4260 # Locking is not used
4261 assert not (compat.any(lu.glm.is_owned(level)
4262 for level in locking.LEVELS
4263 if level != locking.LEVEL_CLUSTER) or
4264 self.do_locking or self.use_locking)
4266 valid_nodes = [node.name
4267 for node in lu.cfg.GetAllNodesInfo().values()
4268 if not node.offline and node.vm_capable]
4269 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4270 cluster = lu.cfg.GetClusterInfo()
4274 for (os_name, os_data) in pol.items():
4275 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4276 hidden=(os_name in cluster.hidden_os),
4277 blacklisted=(os_name in cluster.blacklisted_os))
4281 api_versions = set()
4283 for idx, osl in enumerate(os_data.values()):
4284 info.valid = bool(info.valid and osl and osl[0][1])
4288 (node_variants, node_params, node_api) = osl[0][3:6]
4291 variants.update(node_variants)
4292 parameters.update(node_params)
4293 api_versions.update(node_api)
4295 # Filter out inconsistent values
4296 variants.intersection_update(node_variants)
4297 parameters.intersection_update(node_params)
4298 api_versions.intersection_update(node_api)
4300 info.variants = list(variants)
4301 info.parameters = list(parameters)
4302 info.api_versions = list(api_versions)
4304 data[os_name] = info
4306 # Prepare data in requested order
4307 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4311 class LUOsDiagnose(NoHooksLU):
4312 """Logical unit for OS diagnose/query.
4318 def _BuildFilter(fields, names):
4319 """Builds a filter for querying OSes.
4322 name_filter = qlang.MakeSimpleFilter("name", names)
4324 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4325 # respective field is not requested
4326 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4327 for fname in ["hidden", "blacklisted"]
4328 if fname not in fields]
4329 if "valid" not in fields:
4330 status_filter.append([qlang.OP_TRUE, "valid"])
4333 status_filter.insert(0, qlang.OP_AND)
4335 status_filter = None
4337 if name_filter and status_filter:
4338 return [qlang.OP_AND, name_filter, status_filter]
4342 return status_filter
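# Illustrative example (added): for a field list that requests none of
# "hidden", "blacklisted" or "valid", the status filter built above is
#
#   [qlang.OP_AND,
#    [qlang.OP_NOT, [qlang.OP_TRUE, "hidden"]],
#    [qlang.OP_NOT, [qlang.OP_TRUE, "blacklisted"]],
#    [qlang.OP_TRUE, "valid"]]
#
# and, if a name filter is present as well, both are combined under another
# qlang.OP_AND.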
4344 def CheckArguments(self):
4345 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4346 self.op.output_fields, False)
4348 def ExpandNames(self):
4349 self.oq.ExpandNames(self)
4351 def Exec(self, feedback_fn):
4352 return self.oq.OldStyleQuery(self)
4355 class LUNodeRemove(LogicalUnit):
4356 """Logical unit for removing a node.
4359 HPATH = "node-remove"
4360 HTYPE = constants.HTYPE_NODE
4362 def BuildHooksEnv(self):
4365 This doesn't run on the target node in the pre phase as a failed
4366 node would then be impossible to remove.
4370 "OP_TARGET": self.op.node_name,
4371 "NODE_NAME": self.op.node_name,
4374 def BuildHooksNodes(self):
4375 """Build hooks nodes.
4378 all_nodes = self.cfg.GetNodeList()
4380 all_nodes.remove(self.op.node_name)
4382 logging.warning("Node '%s', which is about to be removed, was not found"
4383 " in the list of all nodes", self.op.node_name)
4384 return (all_nodes, all_nodes)
4386 def CheckPrereq(self):
4387 """Check prerequisites.
4390 - the node exists in the configuration
4391 - it does not have primary or secondary instances
4392 - it's not the master
4394 Any errors are signaled by raising errors.OpPrereqError.
4397 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4398 node = self.cfg.GetNodeInfo(self.op.node_name)
4399 assert node is not None
4401 masternode = self.cfg.GetMasterNode()
4402 if node.name == masternode:
4403 raise errors.OpPrereqError("Node is the master node, failover to another"
4404 " node is required", errors.ECODE_INVAL)
4406 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4407 if node.name in instance.all_nodes:
4408 raise errors.OpPrereqError("Instance %s is still running on the node,"
4409 " please remove first" % instance_name,
4411 self.op.node_name = node.name
4414 def Exec(self, feedback_fn):
4415 """Removes the node from the cluster.
4419 logging.info("Stopping the node daemon and removing configs from node %s",
4422 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4424 # Promote nodes to master candidate as needed
4425 _AdjustCandidatePool(self, exceptions=[node.name])
4426 self.context.RemoveNode(node.name)
4428 # Run post hooks on the node before it's removed
4429 _RunPostHook(self, node.name)
4431 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4432 msg = result.fail_msg
4434 self.LogWarning("Errors encountered on the remote node while leaving"
4435 " the cluster: %s", msg)
4437 # Remove node from our /etc/hosts
4438 if self.cfg.GetClusterInfo().modify_etc_hosts:
4439 master_node = self.cfg.GetMasterNode()
4440 result = self.rpc.call_etc_hosts_modify(master_node,
4441 constants.ETC_HOSTS_REMOVE,
4443 result.Raise("Can't update hosts file with new host data")
4444 _RedistributeAncillaryFiles(self)
4447 class _NodeQuery(_QueryBase):
4448 FIELDS = query.NODE_FIELDS
4450 def ExpandNames(self, lu):
4451 lu.needed_locks = {}
4452 lu.share_locks = _ShareAll()
4455 self.wanted = _GetWantedNodes(lu, self.names)
4457 self.wanted = locking.ALL_SET
4459 self.do_locking = (self.use_locking and
4460 query.NQ_LIVE in self.requested_data)
4463 # If any non-static field is requested we need to lock the nodes
4464 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4466 def DeclareLocks(self, lu, level):
4469 def _GetQueryData(self, lu):
4470 """Computes the list of nodes and their attributes.
4473 all_info = lu.cfg.GetAllNodesInfo()
4475 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4477 # Gather data as requested
4478 if query.NQ_LIVE in self.requested_data:
4479 # filter out non-vm_capable nodes
4480 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4482 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4483 lu.cfg.GetHypervisorType())
4484 live_data = dict((name, nresult.payload)
4485 for (name, nresult) in node_data.items()
4486 if not nresult.fail_msg and nresult.payload)
4490 if query.NQ_INST in self.requested_data:
4491 node_to_primary = dict([(name, set()) for name in nodenames])
4492 node_to_secondary = dict([(name, set()) for name in nodenames])
4494 inst_data = lu.cfg.GetAllInstancesInfo()
4496 for inst in inst_data.values():
4497 if inst.primary_node in node_to_primary:
4498 node_to_primary[inst.primary_node].add(inst.name)
4499 for secnode in inst.secondary_nodes:
4500 if secnode in node_to_secondary:
4501 node_to_secondary[secnode].add(inst.name)
4503 node_to_primary = None
4504 node_to_secondary = None
4506 if query.NQ_OOB in self.requested_data:
4507 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4508 for name, node in all_info.iteritems())
4512 if query.NQ_GROUP in self.requested_data:
4513 groups = lu.cfg.GetAllNodeGroupsInfo()
4517 return query.NodeQueryData([all_info[name] for name in nodenames],
4518 live_data, lu.cfg.GetMasterNode(),
4519 node_to_primary, node_to_secondary, groups,
4520 oob_support, lu.cfg.GetClusterInfo())
4523 class LUNodeQuery(NoHooksLU):
4524 """Logical unit for querying nodes.
4527 # pylint: disable=W0142
4530 def CheckArguments(self):
4531 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4532 self.op.output_fields, self.op.use_locking)
4534 def ExpandNames(self):
4535 self.nq.ExpandNames(self)
4537 def Exec(self, feedback_fn):
4538 return self.nq.OldStyleQuery(self)
4541 class LUNodeQueryvols(NoHooksLU):
4542 """Logical unit for getting volumes on node(s).
4546 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4547 _FIELDS_STATIC = utils.FieldSet("node")
4549 def CheckArguments(self):
4550 _CheckOutputFields(static=self._FIELDS_STATIC,
4551 dynamic=self._FIELDS_DYNAMIC,
4552 selected=self.op.output_fields)
4554 def ExpandNames(self):
4555 self.needed_locks = {}
4556 self.share_locks[locking.LEVEL_NODE] = 1
4557 if not self.op.nodes:
4558 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4560 self.needed_locks[locking.LEVEL_NODE] = \
4561 _GetWantedNodes(self, self.op.nodes)
4563 def Exec(self, feedback_fn):
4564 """Computes the list of nodes and their attributes.
4567 nodenames = self.owned_locks(locking.LEVEL_NODE)
4568 volumes = self.rpc.call_node_volumes(nodenames)
4570 ilist = self.cfg.GetAllInstancesInfo()
4571 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4574 for node in nodenames:
4575 nresult = volumes[node]
4578 msg = nresult.fail_msg
4580 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4583 node_vols = sorted(nresult.payload,
4584 key=operator.itemgetter("dev"))
4586 for vol in node_vols:
4588 for field in self.op.output_fields:
4591 elif field == "phys":
4595 elif field == "name":
4597 elif field == "size":
4598 val = int(float(vol["size"]))
4599 elif field == "instance":
4600 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4602 raise errors.ParameterError(field)
4603 node_output.append(str(val))
4605 output.append(node_output)
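# Example of a single output row (added, values are hypothetical): with
# output_fields ["node", "size", "instance"] one volume could produce
#
#   ["node1.example.com", "10240", "instance1.example.com"]
#
# i.e. one string per requested field, appended per volume and per node.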
4610 class LUNodeQueryStorage(NoHooksLU):
4611 """Logical unit for getting information on storage units on node(s).
4614 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4617 def CheckArguments(self):
4618 _CheckOutputFields(static=self._FIELDS_STATIC,
4619 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4620 selected=self.op.output_fields)
4622 def ExpandNames(self):
4623 self.needed_locks = {}
4624 self.share_locks[locking.LEVEL_NODE] = 1
4627 self.needed_locks[locking.LEVEL_NODE] = \
4628 _GetWantedNodes(self, self.op.nodes)
4630 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4632 def Exec(self, feedback_fn):
4633 """Computes the list of nodes and their attributes.
4636 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4638 # Always get name to sort by
4639 if constants.SF_NAME in self.op.output_fields:
4640 fields = self.op.output_fields[:]
4642 fields = [constants.SF_NAME] + self.op.output_fields
4644 # Never ask for node or type as it's only known to the LU
4645 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4646 while extra in fields:
4647 fields.remove(extra)
4649 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4650 name_idx = field_idx[constants.SF_NAME]
4652 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4653 data = self.rpc.call_storage_list(self.nodes,
4654 self.op.storage_type, st_args,
4655 self.op.name, fields)
4659 for node in utils.NiceSort(self.nodes):
4660 nresult = data[node]
4664 msg = nresult.fail_msg
4666 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4669 rows = dict([(row[name_idx], row) for row in nresult.payload])
4671 for name in utils.NiceSort(rows.keys()):
4676 for field in self.op.output_fields:
4677 if field == constants.SF_NODE:
4679 elif field == constants.SF_TYPE:
4680 val = self.op.storage_type
4681 elif field in field_idx:
4682 val = row[field_idx[field]]
4684 raise errors.ParameterError(field)
4693 class _InstanceQuery(_QueryBase):
4694 FIELDS = query.INSTANCE_FIELDS
4696 def ExpandNames(self, lu):
4697 lu.needed_locks = {}
4698 lu.share_locks = _ShareAll()
4701 self.wanted = _GetWantedInstances(lu, self.names)
4703 self.wanted = locking.ALL_SET
4705 self.do_locking = (self.use_locking and
4706 query.IQ_LIVE in self.requested_data)
4708 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4709 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4710 lu.needed_locks[locking.LEVEL_NODE] = []
4711 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4713 self.do_grouplocks = (self.do_locking and
4714 query.IQ_NODES in self.requested_data)
4716 def DeclareLocks(self, lu, level):
4718 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4719 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4721 # Lock all groups used by instances optimistically; this requires going
4722 # via the node before it's locked, requiring verification later on
4723 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4725 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4726 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4727 elif level == locking.LEVEL_NODE:
4728 lu._LockInstancesNodes() # pylint: disable=W0212
4731 def _CheckGroupLocks(lu):
4732 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4733 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4735 # Check if node groups for locked instances are still correct
4736 for instance_name in owned_instances:
4737 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4739 def _GetQueryData(self, lu):
4740 """Computes the list of instances and their attributes.
4743 if self.do_grouplocks:
4744 self._CheckGroupLocks(lu)
4746 cluster = lu.cfg.GetClusterInfo()
4747 all_info = lu.cfg.GetAllInstancesInfo()
4749 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4751 instance_list = [all_info[name] for name in instance_names]
4752 nodes = frozenset(itertools.chain(*(inst.all_nodes
4753 for inst in instance_list)))
4754 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4757 wrongnode_inst = set()
4759 # Gather data as requested
4760 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4762 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4764 result = node_data[name]
4766 # offline nodes will be in both lists
4767 assert result.fail_msg
4768 offline_nodes.append(name)
4770 bad_nodes.append(name)
4771 elif result.payload:
4772 for inst in result.payload:
4773 if inst in all_info:
4774 if all_info[inst].primary_node == name:
4775 live_data.update(result.payload)
4777 wrongnode_inst.add(inst)
4779 # orphan instance; we don't list it here as we don't
4780 # handle this case yet in the output of instance listing
4781 logging.warning("Orphan instance '%s' found on node %s",
4783 # else no instance is alive
4787 if query.IQ_DISKUSAGE in self.requested_data:
4788 disk_usage = dict((inst.name,
4789 _ComputeDiskSize(inst.disk_template,
4790 [{constants.IDISK_SIZE: disk.size}
4791 for disk in inst.disks]))
4792 for inst in instance_list)
4796 if query.IQ_CONSOLE in self.requested_data:
4798 for inst in instance_list:
4799 if inst.name in live_data:
4800 # Instance is running
4801 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4803 consinfo[inst.name] = None
4804 assert set(consinfo.keys()) == set(instance_names)
4808 if query.IQ_NODES in self.requested_data:
4809 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4811 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4812 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4813 for uuid in set(map(operator.attrgetter("group"),
4819 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4820 disk_usage, offline_nodes, bad_nodes,
4821 live_data, wrongnode_inst, consinfo,
4825 class LUQuery(NoHooksLU):
4826 """Query for resources/items of a certain kind.
4829 # pylint: disable=W0142
4832 def CheckArguments(self):
4833 qcls = _GetQueryImplementation(self.op.what)
4835 self.impl = qcls(self.op.filter, self.op.fields, self.op.use_locking)
4837 def ExpandNames(self):
4838 self.impl.ExpandNames(self)
4840 def DeclareLocks(self, level):
4841 self.impl.DeclareLocks(self, level)
4843 def Exec(self, feedback_fn):
4844 return self.impl.NewStyleQuery(self)
4847 class LUQueryFields(NoHooksLU):
4848 """Query for resources/items of a certain kind.
4851 # pylint: disable=W0142
4854 def CheckArguments(self):
4855 self.qcls = _GetQueryImplementation(self.op.what)
4857 def ExpandNames(self):
4858 self.needed_locks = {}
4860 def Exec(self, feedback_fn):
4861 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4864 class LUNodeModifyStorage(NoHooksLU):
4865 """Logical unit for modifying a storage volume on a node.
4870 def CheckArguments(self):
4871 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4873 storage_type = self.op.storage_type
4876 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4878 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4879 " modified" % storage_type,
4882 diff = set(self.op.changes.keys()) - modifiable
4884 raise errors.OpPrereqError("The following fields can not be modified for"
4885 " storage units of type '%s': %r" %
4886 (storage_type, list(diff)),
4889 def ExpandNames(self):
4890 self.needed_locks = {
4891 locking.LEVEL_NODE: self.op.node_name,
4894 def Exec(self, feedback_fn):
4895 """Computes the list of nodes and their attributes.
4898 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4899 result = self.rpc.call_storage_modify(self.op.node_name,
4900 self.op.storage_type, st_args,
4901 self.op.name, self.op.changes)
4902 result.Raise("Failed to modify storage unit '%s' on %s" %
4903 (self.op.name, self.op.node_name))
4906 class LUNodeAdd(LogicalUnit):
4907 """Logical unit for adding node to the cluster.
4911 HTYPE = constants.HTYPE_NODE
4912 _NFLAGS = ["master_capable", "vm_capable"]
4914 def CheckArguments(self):
4915 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4916 # validate/normalize the node name
4917 self.hostname = netutils.GetHostname(name=self.op.node_name,
4918 family=self.primary_ip_family)
4919 self.op.node_name = self.hostname.name
4921 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4922 raise errors.OpPrereqError("Cannot readd the master node",
4925 if self.op.readd and self.op.group:
4926 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4927 " being readded", errors.ECODE_INVAL)
4929 def BuildHooksEnv(self):
4932 This will run on all nodes before, and on all nodes + the new node after.
4936 "OP_TARGET": self.op.node_name,
4937 "NODE_NAME": self.op.node_name,
4938 "NODE_PIP": self.op.primary_ip,
4939 "NODE_SIP": self.op.secondary_ip,
4940 "MASTER_CAPABLE": str(self.op.master_capable),
4941 "VM_CAPABLE": str(self.op.vm_capable),
4944 def BuildHooksNodes(self):
4945 """Build hooks nodes.
4948 # Exclude added node
4949 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4950 post_nodes = pre_nodes + [self.op.node_name, ]
4952 return (pre_nodes, post_nodes)
4954 def CheckPrereq(self):
4955 """Check prerequisites.
4958 - the new node is not already in the config
4960 - its parameters (single/dual homed) match the cluster
4962 Any errors are signaled by raising errors.OpPrereqError.
4966 hostname = self.hostname
4967 node = hostname.name
4968 primary_ip = self.op.primary_ip = hostname.ip
4969 if self.op.secondary_ip is None:
4970 if self.primary_ip_family == netutils.IP6Address.family:
4971 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
4972 " IPv4 address must be given as secondary",
4974 self.op.secondary_ip = primary_ip
4976 secondary_ip = self.op.secondary_ip
4977 if not netutils.IP4Address.IsValid(secondary_ip):
4978 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4979 " address" % secondary_ip, errors.ECODE_INVAL)
4981 node_list = cfg.GetNodeList()
4982 if not self.op.readd and node in node_list:
4983 raise errors.OpPrereqError("Node %s is already in the configuration" %
4984 node, errors.ECODE_EXISTS)
4985 elif self.op.readd and node not in node_list:
4986 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4989 self.changed_primary_ip = False
4991 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
4992 if self.op.readd and node == existing_node_name:
4993 if existing_node.secondary_ip != secondary_ip:
4994 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4995 " address configuration as before",
4997 if existing_node.primary_ip != primary_ip:
4998 self.changed_primary_ip = True
5002 if (existing_node.primary_ip == primary_ip or
5003 existing_node.secondary_ip == primary_ip or
5004 existing_node.primary_ip == secondary_ip or
5005 existing_node.secondary_ip == secondary_ip):
5006 raise errors.OpPrereqError("New node ip address(es) conflict with"
5007 " existing node %s" % existing_node.name,
5008 errors.ECODE_NOTUNIQUE)
5010 # After this 'if' block, None is no longer a valid value for the
5011 # _capable op attributes
5013 old_node = self.cfg.GetNodeInfo(node)
5014 assert old_node is not None, "Can't retrieve locked node %s" % node
5015 for attr in self._NFLAGS:
5016 if getattr(self.op, attr) is None:
5017 setattr(self.op, attr, getattr(old_node, attr))
5019 for attr in self._NFLAGS:
5020 if getattr(self.op, attr) is None:
5021 setattr(self.op, attr, True)
5023 if self.op.readd and not self.op.vm_capable:
5024 pri, sec = cfg.GetNodeInstances(node)
5026 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5027 " flag set to false, but it already holds"
5028 " instances" % node,
5031 # check that the type of the node (single versus dual homed) is the
5032 # same as for the master
5033 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5034 master_singlehomed = myself.secondary_ip == myself.primary_ip
5035 newbie_singlehomed = secondary_ip == primary_ip
5036 if master_singlehomed != newbie_singlehomed:
5037 if master_singlehomed:
5038 raise errors.OpPrereqError("The master has no secondary ip but the"
5039 " new node has one",
5042 raise errors.OpPrereqError("The master has a secondary ip but the"
5043 " new node doesn't have one",
5046 # checks reachability
5047 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5048 raise errors.OpPrereqError("Node not reachable by ping",
5049 errors.ECODE_ENVIRON)
5051 if not newbie_singlehomed:
5052 # check reachability from my secondary ip to newbie's secondary ip
5053 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5054 source=myself.secondary_ip):
5055 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5056 " based ping to node daemon port",
5057 errors.ECODE_ENVIRON)
5064 if self.op.master_capable:
5065 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5067 self.master_candidate = False
5070 self.new_node = old_node
5072 node_group = cfg.LookupNodeGroup(self.op.group)
5073 self.new_node = objects.Node(name=node,
5074 primary_ip=primary_ip,
5075 secondary_ip=secondary_ip,
5076 master_candidate=self.master_candidate,
5077 offline=False, drained=False,
5080 if self.op.ndparams:
5081 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5083 # check connectivity
5084 result = self.rpc.call_version([self.new_node.name])[self.new_node.name]
5085 result.Raise("Can't get version information from node %s" % node)
5086 if constants.PROTOCOL_VERSION == result.payload:
5087 logging.info("Communication to node %s fine, sw version %s match",
5088 node, result.payload)
5090 raise errors.OpPrereqError("Version mismatch master version %s,"
5091 " node version %s" %
5092 (constants.PROTOCOL_VERSION, result.payload),
5093 errors.ECODE_ENVIRON)
5095 def Exec(self, feedback_fn):
5096 """Adds the new node to the cluster.
5099 new_node = self.new_node
5100 node = new_node.name
5102 # We are adding a new node, so we assume it is powered
5103 new_node.powered = True
5105 # for re-adds, reset the offline/drained/master-candidate flags;
5106 # we need to reset here, otherwise offline would prevent RPC calls
5107 # later in the procedure; this also means that if the re-add
5108 # fails, we are left with a non-offlined, broken node
5110 new_node.drained = new_node.offline = False # pylint: disable=W0201
5111 self.LogInfo("Readding a node, the offline/drained flags were reset")
5112 # if we demote the node, we do cleanup later in the procedure
5113 new_node.master_candidate = self.master_candidate
5114 if self.changed_primary_ip:
5115 new_node.primary_ip = self.op.primary_ip
5117 # copy the master/vm_capable flags
5118 for attr in self._NFLAGS:
5119 setattr(new_node, attr, getattr(self.op, attr))
5121 # notify the user about any possible mc promotion
5122 if new_node.master_candidate:
5123 self.LogInfo("Node will be a master candidate")
5125 if self.op.ndparams:
5126 new_node.ndparams = self.op.ndparams
5128 new_node.ndparams = {}
5130 # Add node to our /etc/hosts, and add key to known_hosts
5131 if self.cfg.GetClusterInfo().modify_etc_hosts:
5132 master_node = self.cfg.GetMasterNode()
5133 result = self.rpc.call_etc_hosts_modify(master_node,
5134 constants.ETC_HOSTS_ADD,
5137 result.Raise("Can't update hosts file with new host data")
5139 if new_node.secondary_ip != new_node.primary_ip:
5140 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5143 node_verify_list = [self.cfg.GetMasterNode()]
5144 node_verify_param = {
5145 constants.NV_NODELIST: ([node], {}),
5146 # TODO: do a node-net-test as well?
5149 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5150 self.cfg.GetClusterName())
5151 for verifier in node_verify_list:
5152 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5153 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5155 for failed in nl_payload:
5156 feedback_fn("ssh/hostname verification failed"
5157 " (checking from %s): %s" %
5158 (verifier, nl_payload[failed]))
5159 raise errors.OpExecError("ssh/hostname verification failed")
5162 _RedistributeAncillaryFiles(self)
5163 self.context.ReaddNode(new_node)
5164 # make sure we redistribute the config
5165 self.cfg.Update(new_node, feedback_fn)
5166 # and make sure the new node will not have old files around
5167 if not new_node.master_candidate:
5168 result = self.rpc.call_node_demote_from_mc(new_node.name)
5169 msg = result.fail_msg
5171 self.LogWarning("Node failed to demote itself from master"
5172 " candidate status: %s" % msg)
5174 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5175 additional_vm=self.op.vm_capable)
5176 self.context.AddNode(new_node, self.proc.GetECId())
5179 class LUNodeSetParams(LogicalUnit):
5180 """Modifies the parameters of a node.
5182 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5183 to the node role (as _ROLE_*)
5184 @cvar _R2F: a dictionary from node role to tuples of flags
5185 @cvar _FLAGS: a list of attribute names corresponding to the flags
5188 HPATH = "node-modify"
5189 HTYPE = constants.HTYPE_NODE
5191 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5193 (True, False, False): _ROLE_CANDIDATE,
5194 (False, True, False): _ROLE_DRAINED,
5195 (False, False, True): _ROLE_OFFLINE,
5196 (False, False, False): _ROLE_REGULAR,
5198 _R2F = dict((v, k) for k, v in _F2R.items())
5199 _FLAGS = ["master_candidate", "drained", "offline"]
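# Illustrative reading of the tables above (added comment), with each tuple
# being (master_candidate, drained, offline) in the same order as _FLAGS:
#   _F2R[(True, False, False)] == _ROLE_CANDIDATE
#   _R2F[_ROLE_OFFLINE]        == (False, False, True)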
5201 def CheckArguments(self):
5202 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5203 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5204 self.op.master_capable, self.op.vm_capable,
5205 self.op.secondary_ip, self.op.ndparams]
5206 if all_mods.count(None) == len(all_mods):
5207 raise errors.OpPrereqError("Please pass at least one modification",
5209 if all_mods.count(True) > 1:
5210 raise errors.OpPrereqError("Can't set the node into more than one"
5211 " state at the same time",
5214 # Boolean value that tells us whether we might be demoting from MC
5215 self.might_demote = (self.op.master_candidate == False or
5216 self.op.offline == True or
5217 self.op.drained == True or
5218 self.op.master_capable == False)
5220 if self.op.secondary_ip:
5221 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5222 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5223 " address" % self.op.secondary_ip,
5226 self.lock_all = self.op.auto_promote and self.might_demote
5227 self.lock_instances = self.op.secondary_ip is not None
5229 def ExpandNames(self):
5231 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5233 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5235 if self.lock_instances:
5236 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
5238 def DeclareLocks(self, level):
5239 # If we have locked all instances, before waiting to lock nodes, release
5240 # all the ones living on nodes unrelated to the current operation.
5241 if level == locking.LEVEL_NODE and self.lock_instances:
5242 self.affected_instances = []
5243 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
5246 # Build list of instances to release
5247 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
5248 for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
5249 if (instance.disk_template in constants.DTS_INT_MIRROR and
5250 self.op.node_name in instance.all_nodes):
5251 instances_keep.append(instance_name)
5252 self.affected_instances.append(instance)
5254 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
5256 assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
5257 set(instances_keep))
5259 def BuildHooksEnv(self):
5262 This runs on the master node.
5266 "OP_TARGET": self.op.node_name,
5267 "MASTER_CANDIDATE": str(self.op.master_candidate),
5268 "OFFLINE": str(self.op.offline),
5269 "DRAINED": str(self.op.drained),
5270 "MASTER_CAPABLE": str(self.op.master_capable),
5271 "VM_CAPABLE": str(self.op.vm_capable),
5274 def BuildHooksNodes(self):
5275 """Build hooks nodes.
5278 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5281 def CheckPrereq(self):
5282 """Check prerequisites.
5284 This only checks the instance list against the existing names.
5287 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5289 if (self.op.master_candidate is not None or
5290 self.op.drained is not None or
5291 self.op.offline is not None):
5292 # we can't change the master's node flags
5293 if self.op.node_name == self.cfg.GetMasterNode():
5294 raise errors.OpPrereqError("The master role can be changed"
5295 " only via master-failover",
5298 if self.op.master_candidate and not node.master_capable:
5299 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5300 " it a master candidate" % node.name,
5303 if self.op.vm_capable == False:
5304 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5306 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5307 " the vm_capable flag" % node.name,
5310 if node.master_candidate and self.might_demote and not self.lock_all:
5311 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5312 # check if after removing the current node, we're missing master
5314 (mc_remaining, mc_should, _) = \
5315 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5316 if mc_remaining < mc_should:
5317 raise errors.OpPrereqError("Not enough master candidates, please"
5318 " pass auto promote option to allow"
5319 " promotion", errors.ECODE_STATE)
5321 self.old_flags = old_flags = (node.master_candidate,
5322 node.drained, node.offline)
5323 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5324 self.old_role = old_role = self._F2R[old_flags]
5326 # Check for ineffective changes
5327 for attr in self._FLAGS:
5328 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5329 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5330 setattr(self.op, attr, None)
5332 # Past this point, any flag change to False means a transition
5333 # away from the respective state, as only real changes are kept
5335 # TODO: We might query the real power state if it supports OOB
5336 if _SupportsOob(self.cfg, node):
5337 if self.op.offline is False and not (node.powered or
5338 self.op.powered == True):
5339 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5340 " offline status can be reset") %
5342 elif self.op.powered is not None:
5343 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5344 " as it does not support out-of-band"
5345 " handling") % self.op.node_name)
5347 # If we're being deofflined/drained, we'll MC ourself if needed
5348 if (self.op.drained == False or self.op.offline == False or
5349 (self.op.master_capable and not node.master_capable)):
5350 if _DecideSelfPromotion(self):
5351 self.op.master_candidate = True
5352 self.LogInfo("Auto-promoting node to master candidate")
5354 # If we're no longer master capable, we'll demote ourselves from MC
5355 if self.op.master_capable == False and node.master_candidate:
5356 self.LogInfo("Demoting from master candidate")
5357 self.op.master_candidate = False
5360 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5361 if self.op.master_candidate:
5362 new_role = self._ROLE_CANDIDATE
5363 elif self.op.drained:
5364 new_role = self._ROLE_DRAINED
5365 elif self.op.offline:
5366 new_role = self._ROLE_OFFLINE
5367 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5368 # False is still in new flags, which means we're un-setting (the
5370 new_role = self._ROLE_REGULAR
5371 else: # no new flags, nothing, keep old role
5374 self.new_role = new_role
5376 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5377 # Trying to transition out of offline status
5378 result = self.rpc.call_version([node.name])[node.name]
5380 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5381 " to report its version: %s" %
5382 (node.name, result.fail_msg),
5385 self.LogWarning("Transitioning node from offline to online state"
5386 " without using re-add. Please make sure the node"
5389 if self.op.secondary_ip:
5390 # Ok even without locking, because this can't be changed by any LU
5391 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5392 master_singlehomed = master.secondary_ip == master.primary_ip
5393 if master_singlehomed and self.op.secondary_ip:
5394 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5395 " homed cluster", errors.ECODE_INVAL)
5398 if self.affected_instances:
5399 raise errors.OpPrereqError("Cannot change secondary ip: offline"
5400 " node has instances (%s) configured"
5401 " to use it" % self.affected_instances)
5403 # On online nodes, check that no instances are running, and that
5404 # the node has the new ip and we can reach it.
5405 for instance in self.affected_instances:
5406 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5408 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5409 if master.name != node.name:
5410 # check reachability from master secondary ip to new secondary ip
5411 if not netutils.TcpPing(self.op.secondary_ip,
5412 constants.DEFAULT_NODED_PORT,
5413 source=master.secondary_ip):
5414 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5415 " based ping to node daemon port",
5416 errors.ECODE_ENVIRON)
5418 if self.op.ndparams:
5419 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5420 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5421 self.new_ndparams = new_ndparams
5423 def Exec(self, feedback_fn):
5428 old_role = self.old_role
5429 new_role = self.new_role
5433 if self.op.ndparams:
5434 node.ndparams = self.new_ndparams
5436 if self.op.powered is not None:
5437 node.powered = self.op.powered
5439 for attr in ["master_capable", "vm_capable"]:
5440 val = getattr(self.op, attr)
5442 setattr(node, attr, val)
5443 result.append((attr, str(val)))
5445 if new_role != old_role:
5446 # Tell the node to demote itself, if no longer MC and not offline
5447 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5448 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5450 self.LogWarning("Node failed to demote itself: %s", msg)
5452 new_flags = self._R2F[new_role]
5453 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5455 result.append((desc, str(nf)))
5456 (node.master_candidate, node.drained, node.offline) = new_flags
5458 # we locked all nodes, we adjust the CP before updating this node
5460 _AdjustCandidatePool(self, [node.name])
5462 if self.op.secondary_ip:
5463 node.secondary_ip = self.op.secondary_ip
5464 result.append(("secondary_ip", self.op.secondary_ip))
5466 # this will trigger configuration file update, if needed
5467 self.cfg.Update(node, feedback_fn)
5469 # this will trigger job queue propagation or cleanup if the mc
5471 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5472 self.context.ReaddNode(node)
5477 class LUNodePowercycle(NoHooksLU):
5478 """Powercycles a node.
5483 def CheckArguments(self):
5484 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5485 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5486 raise errors.OpPrereqError("The node is the master and the force"
5487 " parameter was not set",
5490 def ExpandNames(self):
5491 """Locking for PowercycleNode.
5493 This is a last-resort option and shouldn't block on other
5494 jobs. Therefore, we grab no locks.
5497 self.needed_locks = {}
5499 def Exec(self, feedback_fn):
5503 result = self.rpc.call_node_powercycle(self.op.node_name,
5504 self.cfg.GetHypervisorType())
5505 result.Raise("Failed to schedule the reboot")
5506 return result.payload
5509 class LUClusterQuery(NoHooksLU):
5510 """Query cluster configuration.
5515 def ExpandNames(self):
5516 self.needed_locks = {}
5518 def Exec(self, feedback_fn):
5519 """Return cluster config.
5522 cluster = self.cfg.GetClusterInfo()
5525 # Filter just for enabled hypervisors
5526 for os_name, hv_dict in cluster.os_hvp.items():
5527 os_hvp[os_name] = {}
5528 for hv_name, hv_params in hv_dict.items():
5529 if hv_name in cluster.enabled_hypervisors:
5530 os_hvp[os_name][hv_name] = hv_params
5532 # Convert ip_family to ip_version
5533 primary_ip_version = constants.IP4_VERSION
5534 if cluster.primary_ip_family == netutils.IP6Address.family:
5535 primary_ip_version = constants.IP6_VERSION
5538 "software_version": constants.RELEASE_VERSION,
5539 "protocol_version": constants.PROTOCOL_VERSION,
5540 "config_version": constants.CONFIG_VERSION,
5541 "os_api_version": max(constants.OS_API_VERSIONS),
5542 "export_version": constants.EXPORT_VERSION,
5543 "architecture": runtime.GetArchInfo(),
5544 "name": cluster.cluster_name,
5545 "master": cluster.master_node,
5546 "default_hypervisor": cluster.enabled_hypervisors[0],
5547 "enabled_hypervisors": cluster.enabled_hypervisors,
5548 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5549 for hypervisor_name in cluster.enabled_hypervisors]),
5551 "beparams": cluster.beparams,
5552 "osparams": cluster.osparams,
5553 "nicparams": cluster.nicparams,
5554 "ndparams": cluster.ndparams,
5555 "candidate_pool_size": cluster.candidate_pool_size,
5556 "master_netdev": cluster.master_netdev,
5557 "volume_group_name": cluster.volume_group_name,
5558 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5559 "file_storage_dir": cluster.file_storage_dir,
5560 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5561 "maintain_node_health": cluster.maintain_node_health,
5562 "ctime": cluster.ctime,
5563 "mtime": cluster.mtime,
5564 "uuid": cluster.uuid,
5565 "tags": list(cluster.GetTags()),
5566 "uid_pool": cluster.uid_pool,
5567 "default_iallocator": cluster.default_iallocator,
5568 "reserved_lvs": cluster.reserved_lvs,
5569 "primary_ip_version": primary_ip_version,
5570 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5571 "hidden_os": cluster.hidden_os,
5572 "blacklisted_os": cluster.blacklisted_os,
5578 class LUClusterConfigQuery(NoHooksLU):
5579 """Return configuration values.
5583 _FIELDS_DYNAMIC = utils.FieldSet()
5584 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5585 "watcher_pause", "volume_group_name")
5587 def CheckArguments(self):
5588 _CheckOutputFields(static=self._FIELDS_STATIC,
5589 dynamic=self._FIELDS_DYNAMIC,
5590 selected=self.op.output_fields)
5592 def ExpandNames(self):
5593 self.needed_locks = {}
5595 def Exec(self, feedback_fn):
5596 """Dump a representation of the cluster config to the standard output.
5600 for field in self.op.output_fields:
5601 if field == "cluster_name":
5602 entry = self.cfg.GetClusterName()
5603 elif field == "master_node":
5604 entry = self.cfg.GetMasterNode()
5605 elif field == "drain_flag":
5606 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5607 elif field == "watcher_pause":
5608 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5609 elif field == "volume_group_name":
5610 entry = self.cfg.GetVGName()
5612 raise errors.ParameterError(field)
5613 values.append(entry)
5617 class LUInstanceActivateDisks(NoHooksLU):
5618 """Bring up an instance's disks.
5623 def ExpandNames(self):
5624 self._ExpandAndLockInstance()
5625 self.needed_locks[locking.LEVEL_NODE] = []
5626 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5628 def DeclareLocks(self, level):
5629 if level == locking.LEVEL_NODE:
5630 self._LockInstancesNodes()
5632 def CheckPrereq(self):
5633 """Check prerequisites.
5635 This checks that the instance is in the cluster.
5638 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5639 assert self.instance is not None, \
5640 "Cannot retrieve locked instance %s" % self.op.instance_name
5641 _CheckNodeOnline(self, self.instance.primary_node)
5643 def Exec(self, feedback_fn):
5644 """Activate the disks.
5647 disks_ok, disks_info = \
5648 _AssembleInstanceDisks(self, self.instance,
5649 ignore_size=self.op.ignore_size)
5651 raise errors.OpExecError("Cannot activate block devices")
5656 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5658 """Prepare the block devices for an instance.
5660 This sets up the block devices on all nodes.
5662 @type lu: L{LogicalUnit}
5663 @param lu: the logical unit on whose behalf we execute
5664 @type instance: L{objects.Instance}
5665 @param instance: the instance for whose disks we assemble
5666 @type disks: list of L{objects.Disk} or None
5667 @param disks: which disks to assemble (or all, if None)
5668 @type ignore_secondaries: boolean
5669 @param ignore_secondaries: if true, errors on secondary nodes
5670 won't result in an error return from the function
5671 @type ignore_size: boolean
5672 @param ignore_size: if true, the current known size of the disk
5673 will not be used during the disk activation, useful for cases
5674 when the size is wrong
5675 @return: False if the operation failed, otherwise a list of
5676 (host, instance_visible_name, node_visible_name)
5677 with the mapping from node devices to instance devices
5682 iname = instance.name
5683 disks = _ExpandCheckDisks(instance, disks)
5685 # With the two-pass mechanism we try to reduce the window of
5686 # opportunity for the race condition of switching DRBD to primary
5687 # before handshaking has occurred, but we do not eliminate it
5689 # The proper fix would be to wait (with some limits) until the
5690 # connection has been made and drbd transitions from WFConnection
5691 # into any other network-connected state (Connected, SyncTarget,
5694 # 1st pass, assemble on all nodes in secondary mode
5695 for idx, inst_disk in enumerate(disks):
5696 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5698 node_disk = node_disk.Copy()
5699 node_disk.UnsetSize()
5700 lu.cfg.SetDiskID(node_disk, node)
5701 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5702 msg = result.fail_msg
5704 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5705 " (is_primary=False, pass=1): %s",
5706 inst_disk.iv_name, node, msg)
5707 if not ignore_secondaries:
5710 # FIXME: race condition on drbd migration to primary
5712 # 2nd pass, do only the primary node
5713 for idx, inst_disk in enumerate(disks):
5716 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5717 if node != instance.primary_node:
5720 node_disk = node_disk.Copy()
5721 node_disk.UnsetSize()
5722 lu.cfg.SetDiskID(node_disk, node)
5723 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5724 msg = result.fail_msg
5726 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5727 " (is_primary=True, pass=2): %s",
5728 inst_disk.iv_name, node, msg)
5731 dev_path = result.payload
5733 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5735 # leave the disks configured for the primary node
5736 # this is a workaround that would be fixed better by
5737 # improving the logical/physical id handling
5739 lu.cfg.SetDiskID(disk, instance.primary_node)
5741 return disks_ok, device_info
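# Minimal usage sketch (hypothetical, mirroring LUInstanceActivateDisks.Exec
# above): assemble all of an instance's disks and report where each one
# became visible.  "lu" and "instance" are assumed to be the usual
# LogicalUnit and objects.Instance arguments; this helper is not part of the
# module and is never called.
def _ExampleActivateAndReportDisks(lu, instance):
  """Assemble an instance's disks and return the device mapping."""
  disks_ok, device_info = _AssembleInstanceDisks(lu, instance)
  if not disks_ok:
    # mirror the error handling used by the callers in this module
    raise errors.OpExecError("Cannot activate block devices")
  for node, iv_name, dev_path in device_info:
    lu.LogInfo("Disk %s of %s is visible on node %s as %s",
               iv_name, instance.name, node, dev_path)
  return device_info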
5744 def _StartInstanceDisks(lu, instance, force):
5745 """Start the disks of an instance.
5748 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5749 ignore_secondaries=force)
5751 _ShutdownInstanceDisks(lu, instance)
5752 if force is not None and not force:
5753 lu.proc.LogWarning("", hint="If the message above refers to a"
5755 " you can retry the operation using '--force'.")
5756 raise errors.OpExecError("Disk consistency error")
5759 class LUInstanceDeactivateDisks(NoHooksLU):
5760 """Shutdown an instance's disks.
5765 def ExpandNames(self):
5766 self._ExpandAndLockInstance()
5767 self.needed_locks[locking.LEVEL_NODE] = []
5768 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5770 def DeclareLocks(self, level):
5771 if level == locking.LEVEL_NODE:
5772 self._LockInstancesNodes()
5774 def CheckPrereq(self):
5775 """Check prerequisites.
5777 This checks that the instance is in the cluster.
5780 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5781 assert self.instance is not None, \
5782 "Cannot retrieve locked instance %s" % self.op.instance_name
5784 def Exec(self, feedback_fn):
5785 """Deactivate the disks
5788 instance = self.instance
5790 _ShutdownInstanceDisks(self, instance)
5792 _SafeShutdownInstanceDisks(self, instance)
5795 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5796 """Shutdown block devices of an instance.
5798 This function checks if an instance is running, before calling
5799 _ShutdownInstanceDisks.
5802 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5803 _ShutdownInstanceDisks(lu, instance, disks=disks)
5806 def _ExpandCheckDisks(instance, disks):
5807 """Return the instance disks selected by the disks list
5809 @type disks: list of L{objects.Disk} or None
5810 @param disks: selected disks
5811 @rtype: list of L{objects.Disk}
5812 @return: selected instance disks to act on
5816 return instance.disks
5818 if not set(disks).issubset(instance.disks):
5819 raise errors.ProgrammerError("Can only act on disks belonging to the"
5824 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5825 """Shutdown block devices of an instance.
5827 This does the shutdown on all nodes of the instance.
5829 If ignore_primary is false, errors on the primary node are ignored.
5834 disks = _ExpandCheckDisks(instance, disks)
5837 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5838 lu.cfg.SetDiskID(top_disk, node)
5839 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5840 msg = result.fail_msg
5842 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5843 disk.iv_name, node, msg)
5844 if ((node == instance.primary_node and not ignore_primary) or
5845 (node != instance.primary_node and not result.offline)):
5850 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5851 """Checks if a node has enough free memory.
5853 This function checks if a given node has the needed amount of free
5854 memory. In case the node has less memory or we cannot get the
5855 information from the node, this function raises an OpPrereqError
5858 @type lu: C{LogicalUnit}
5859 @param lu: a logical unit from which we get configuration data
5861 @param node: the node to check
5862 @type reason: C{str}
5863 @param reason: string to use in the error message
5864 @type requested: C{int}
5865 @param requested: the amount of memory in MiB to check for
5866 @type hypervisor_name: C{str}
5867 @param hypervisor_name: the hypervisor to ask for memory stats
5868 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5869 we cannot check the node
5872 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5873 nodeinfo[node].Raise("Can't get data from node %s" % node,
5874 prereq=True, ecode=errors.ECODE_ENVIRON)
5875 free_mem = nodeinfo[node].payload.get("memory_free", None)
5876 if not isinstance(free_mem, int):
5877 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5878 " was '%s'" % (node, free_mem),
5879 errors.ECODE_ENVIRON)
5880 if requested > free_mem:
5881 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5882 " needed %s MiB, available %s MiB" %
5883 (node, reason, requested, free_mem),
5887 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5888 """Checks if nodes have enough free disk space in the all VGs.
5890 This function check if all given nodes have the needed amount of
5891 free disk. In case any node has less disk or we cannot get the
5892 information from the node, this function raise an OpPrereqError
5895 @type lu: C{LogicalUnit}
5896 @param lu: a logical unit from which we get configuration data
5897 @type nodenames: C{list}
5898 @param nodenames: the list of node names to check
5899 @type req_sizes: C{dict}
5900 @param req_sizes: the hash of vg and corresponding amount of disk in
5902 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5903 or we cannot check the node
5906 for vg, req_size in req_sizes.items():
5907 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
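# Illustrative sketch (hypothetical helper, invented VG names and sizes):
# req_sizes is simply a dict mapping a volume group name to the total amount
# of disk, in MiB, that has to be free in that VG on every node in
# nodenames.  Checking that 10 GiB are free in "xenvg" and 512 MiB in
# "metavg" on two candidate nodes could look like this.
def _ExampleCheckNewDiskSpace(lu):
  """Check free space for a hypothetical disk layout on two nodes."""
  req_sizes = {
    "xenvg": 10 * 1024,   # data volumes, in MiB
    "metavg": 512,        # DRBD metadata, in MiB
    }
  _CheckNodesFreeDiskPerVG(lu, ["node1.example.com", "node2.example.com"],
                           req_sizes)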
5910 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5911 """Checks if nodes have enough free disk space in the specified VG.
5913 This function checks if all given nodes have the needed amount of
5914 free disk. In case any node has less disk or we cannot get the
5915 information from the node, this function raises an OpPrereqError
5918 @type lu: C{LogicalUnit}
5919 @param lu: a logical unit from which we get configuration data
5920 @type nodenames: C{list}
5921 @param nodenames: the list of node names to check
5923 @param vg: the volume group to check
5924 @type requested: C{int}
5925 @param requested: the amount of disk in MiB to check for
5926 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5927 or we cannot check the node
5930 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5931 for node in nodenames:
5932 info = nodeinfo[node]
5933 info.Raise("Cannot get current information from node %s" % node,
5934 prereq=True, ecode=errors.ECODE_ENVIRON)
5935 vg_free = info.payload.get("vg_free", None)
5936 if not isinstance(vg_free, int):
5937 raise errors.OpPrereqError("Can't compute free disk space on node"
5938 " %s for vg %s, result was '%s'" %
5939 (node, vg, vg_free), errors.ECODE_ENVIRON)
5940 if requested > vg_free:
5941 raise errors.OpPrereqError("Not enough disk space on target node %s"
5942 " vg %s: required %d MiB, available %d MiB" %
5943 (node, vg, requested, vg_free),
5947 class LUInstanceStartup(LogicalUnit):
5948 """Starts an instance.
5951 HPATH = "instance-start"
5952 HTYPE = constants.HTYPE_INSTANCE
5955 def CheckArguments(self):
5957 if self.op.beparams:
5958 # fill the beparams dict
5959 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5961 def ExpandNames(self):
5962 self._ExpandAndLockInstance()
5964 def BuildHooksEnv(self):
5967 This runs on master, primary and secondary nodes of the instance.
5971 "FORCE": self.op.force,
5974 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5978 def BuildHooksNodes(self):
5979 """Build hooks nodes.
5982 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5985 def CheckPrereq(self):
5986 """Check prerequisites.
5988 This checks that the instance is in the cluster.
5991 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5992 assert self.instance is not None, \
5993 "Cannot retrieve locked instance %s" % self.op.instance_name
5996 if self.op.hvparams:
5997 # check hypervisor parameter syntax (locally)
5998 cluster = self.cfg.GetClusterInfo()
5999 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6000 filled_hvp = cluster.FillHV(instance)
6001 filled_hvp.update(self.op.hvparams)
6002 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6003 hv_type.CheckParameterSyntax(filled_hvp)
6004 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6006 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6008 if self.primary_offline and self.op.ignore_offline_nodes:
6009 self.proc.LogWarning("Ignoring offline primary node")
6011 if self.op.hvparams or self.op.beparams:
6012 self.proc.LogWarning("Overridden parameters are ignored")
6014 _CheckNodeOnline(self, instance.primary_node)
6016 bep = self.cfg.GetClusterInfo().FillBE(instance)
6018 # check bridges existence
6019 _CheckInstanceBridgesExist(self, instance)
6021 remote_info = self.rpc.call_instance_info(instance.primary_node,
6023 instance.hypervisor)
6024 remote_info.Raise("Error checking node %s" % instance.primary_node,
6025 prereq=True, ecode=errors.ECODE_ENVIRON)
6026 if not remote_info.payload: # not running already
6027 _CheckNodeFreeMemory(self, instance.primary_node,
6028 "starting instance %s" % instance.name,
6029 bep[constants.BE_MEMORY], instance.hypervisor)
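  # Note on the hvparams handling in CheckPrereq above (illustration only):
  # the effective hypervisor parameters are built by layering dicts, roughly
  #
  #   filled_hvp = cluster.FillHV(instance)   # cluster + instance defaults
  #   filled_hvp.update(self.op.hvparams)     # per-startup overrides win
  #   hv_type.CheckParameterSyntax(filled_hvp)
  #
  # so an override such as {"kernel_args": "single"} (a made-up example) is
  # validated against the fully merged dict, not in isolation.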
6031 def Exec(self, feedback_fn):
6032 """Start the instance.
6035 instance = self.instance
6036 force = self.op.force
6038 if not self.op.no_remember:
6039 self.cfg.MarkInstanceUp(instance.name)
6041 if self.primary_offline:
6042 assert self.op.ignore_offline_nodes
6043 self.proc.LogInfo("Primary node offline, marked instance as started")
6045 node_current = instance.primary_node
6047 _StartInstanceDisks(self, instance, force)
6049 result = self.rpc.call_instance_start(node_current, instance,
6050 self.op.hvparams, self.op.beparams,
6051 self.op.startup_paused)
6052 msg = result.fail_msg
6054 _ShutdownInstanceDisks(self, instance)
6055 raise errors.OpExecError("Could not start instance: %s" % msg)
6058 class LUInstanceReboot(LogicalUnit):
6059 """Reboot an instance.
6062 HPATH = "instance-reboot"
6063 HTYPE = constants.HTYPE_INSTANCE
6066 def ExpandNames(self):
6067 self._ExpandAndLockInstance()
6069 def BuildHooksEnv(self):
6072 This runs on master, primary and secondary nodes of the instance.
6076 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6077 "REBOOT_TYPE": self.op.reboot_type,
6078 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6081 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6085 def BuildHooksNodes(self):
6086 """Build hooks nodes.
6089 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6092 def CheckPrereq(self):
6093 """Check prerequisites.
6095 This checks that the instance is in the cluster.
6098 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6099 assert self.instance is not None, \
6100 "Cannot retrieve locked instance %s" % self.op.instance_name
6102 _CheckNodeOnline(self, instance.primary_node)
6104 # check bridges existence
6105 _CheckInstanceBridgesExist(self, instance)
6107 def Exec(self, feedback_fn):
6108 """Reboot the instance.
6111 instance = self.instance
6112 ignore_secondaries = self.op.ignore_secondaries
6113 reboot_type = self.op.reboot_type
6115 remote_info = self.rpc.call_instance_info(instance.primary_node,
6117 instance.hypervisor)
6118 remote_info.Raise("Error checking node %s" % instance.primary_node)
6119 instance_running = bool(remote_info.payload)
6121 node_current = instance.primary_node
6123 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6124 constants.INSTANCE_REBOOT_HARD]:
6125 for disk in instance.disks:
6126 self.cfg.SetDiskID(disk, node_current)
6127 result = self.rpc.call_instance_reboot(node_current, instance,
6129 self.op.shutdown_timeout)
6130 result.Raise("Could not reboot instance")
6132 if instance_running:
6133 result = self.rpc.call_instance_shutdown(node_current, instance,
6134 self.op.shutdown_timeout)
6135 result.Raise("Could not shutdown instance for full reboot")
6136 _ShutdownInstanceDisks(self, instance)
6138 self.LogInfo("Instance %s was already stopped, starting now",
6140 _StartInstanceDisks(self, instance, ignore_secondaries)
6141 result = self.rpc.call_instance_start(node_current, instance,
6143 msg = result.fail_msg
6145 _ShutdownInstanceDisks(self, instance)
6146 raise errors.OpExecError("Could not start instance for"
6147 " full reboot: %s" % msg)
6149 self.cfg.MarkInstanceUp(instance.name)
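# Hypothetical sketch of the decision made in LUInstanceReboot.Exec above:
# soft/hard reboots of a running instance are delegated to the hypervisor,
# while everything else (full reboots, or any reboot of a stopped instance)
# is performed as an explicit stop/start cycle at the Ganeti level.
def _ExampleRebootNeedsFullCycle(reboot_type, instance_running):
  """Return True if the reboot must be done as shutdown plus startup."""
  hypervisor_level = (constants.INSTANCE_REBOOT_SOFT,
                      constants.INSTANCE_REBOOT_HARD)
  return not (instance_running and reboot_type in hypervisor_level)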
6152 class LUInstanceShutdown(LogicalUnit):
6153 """Shutdown an instance.
6156 HPATH = "instance-stop"
6157 HTYPE = constants.HTYPE_INSTANCE
6160 def ExpandNames(self):
6161 self._ExpandAndLockInstance()
6163 def BuildHooksEnv(self):
6166 This runs on master, primary and secondary nodes of the instance.
6169 env = _BuildInstanceHookEnvByObject(self, self.instance)
6170 env["TIMEOUT"] = self.op.timeout
6173 def BuildHooksNodes(self):
6174 """Build hooks nodes.
6177 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6180 def CheckPrereq(self):
6181 """Check prerequisites.
6183 This checks that the instance is in the cluster.
6186 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6187 assert self.instance is not None, \
6188 "Cannot retrieve locked instance %s" % self.op.instance_name
6190 self.primary_offline = \
6191 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6193 if self.primary_offline and self.op.ignore_offline_nodes:
6194 self.proc.LogWarning("Ignoring offline primary node")
6196 _CheckNodeOnline(self, self.instance.primary_node)
6198 def Exec(self, feedback_fn):
6199 """Shutdown the instance.
6202 instance = self.instance
6203 node_current = instance.primary_node
6204 timeout = self.op.timeout
6206 if not self.op.no_remember:
6207 self.cfg.MarkInstanceDown(instance.name)
6209 if self.primary_offline:
6210 assert self.op.ignore_offline_nodes
6211 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6213 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6214 msg = result.fail_msg
6216 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6218 _ShutdownInstanceDisks(self, instance)
6221 class LUInstanceReinstall(LogicalUnit):
6222 """Reinstall an instance.
6225 HPATH = "instance-reinstall"
6226 HTYPE = constants.HTYPE_INSTANCE
6229 def ExpandNames(self):
6230 self._ExpandAndLockInstance()
6232 def BuildHooksEnv(self):
6235 This runs on master, primary and secondary nodes of the instance.
6238 return _BuildInstanceHookEnvByObject(self, self.instance)
6240 def BuildHooksNodes(self):
6241 """Build hooks nodes.
6244 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6247 def CheckPrereq(self):
6248 """Check prerequisites.
6250 This checks that the instance is in the cluster and is not running.
6253 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6254 assert instance is not None, \
6255 "Cannot retrieve locked instance %s" % self.op.instance_name
6256 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6257 " offline, cannot reinstall")
6258 for node in instance.secondary_nodes:
6259 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6260 " cannot reinstall")
6262 if instance.disk_template == constants.DT_DISKLESS:
6263 raise errors.OpPrereqError("Instance '%s' has no disks" %
6264 self.op.instance_name,
6266 _CheckInstanceDown(self, instance, "cannot reinstall")
6268 if self.op.os_type is not None:
6270 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6271 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6272 instance_os = self.op.os_type
6274 instance_os = instance.os
6276 nodelist = list(instance.all_nodes)
6278 if self.op.osparams:
6279 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6280 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6281 self.os_inst = i_osdict # the new dict (without defaults)
6285 self.instance = instance
6287 def Exec(self, feedback_fn):
6288 """Reinstall the instance.
6291 inst = self.instance
6293 if self.op.os_type is not None:
6294 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6295 inst.os = self.op.os_type
6296 # Write to configuration
6297 self.cfg.Update(inst, feedback_fn)
6299 _StartInstanceDisks(self, inst, None)
6301 feedback_fn("Running the instance OS create scripts...")
6302 # FIXME: pass debug option from opcode to backend
6303 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
6304 self.op.debug_level,
6305 osparams=self.os_inst)
6306 result.Raise("Could not install OS for instance %s on node %s" %
6307 (inst.name, inst.primary_node))
6309 _ShutdownInstanceDisks(self, inst)
6312 class LUInstanceRecreateDisks(LogicalUnit):
6313 """Recreate an instance's missing disks.
6316 HPATH = "instance-recreate-disks"
6317 HTYPE = constants.HTYPE_INSTANCE
6320 def CheckArguments(self):
6321 # normalise the disk list
6322 self.op.disks = sorted(frozenset(self.op.disks))
6324 def ExpandNames(self):
6325 self._ExpandAndLockInstance()
6326 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6328 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6329 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6331 self.needed_locks[locking.LEVEL_NODE] = []
6333 def DeclareLocks(self, level):
6334 if level == locking.LEVEL_NODE:
6335 # if we replace the nodes, we only need to lock the old primary,
6336 # otherwise we need to lock all nodes for disk re-creation
6337 primary_only = bool(self.op.nodes)
6338 self._LockInstancesNodes(primary_only=primary_only)
6340 def BuildHooksEnv(self):
6343 This runs on master, primary and secondary nodes of the instance.
6346 return _BuildInstanceHookEnvByObject(self, self.instance)
6348 def BuildHooksNodes(self):
6349 """Build hooks nodes.
6352 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6355 def CheckPrereq(self):
6356 """Check prerequisites.
6358 This checks that the instance is in the cluster and is not running.
6361 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6362 assert instance is not None, \
6363 "Cannot retrieve locked instance %s" % self.op.instance_name
6365 if len(self.op.nodes) != len(instance.all_nodes):
6366 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6367 " %d replacement nodes were specified" %
6368 (instance.name, len(instance.all_nodes),
6369 len(self.op.nodes)),
6371 assert instance.disk_template != constants.DT_DRBD8 or \
6372 len(self.op.nodes) == 2
6373 assert instance.disk_template != constants.DT_PLAIN or \
6374 len(self.op.nodes) == 1
6375 primary_node = self.op.nodes[0]
6377 primary_node = instance.primary_node
6378 _CheckNodeOnline(self, primary_node)
6380 if instance.disk_template == constants.DT_DISKLESS:
6381 raise errors.OpPrereqError("Instance '%s' has no disks" %
6382 self.op.instance_name, errors.ECODE_INVAL)
6383 # if we replace nodes *and* the old primary is offline, we don't
6385 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6386 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6387 if not (self.op.nodes and old_pnode.offline):
6388 _CheckInstanceDown(self, instance, "cannot recreate disks")
6390 if not self.op.disks:
6391 self.op.disks = range(len(instance.disks))
6393 for idx in self.op.disks:
6394 if idx >= len(instance.disks):
6395 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6397 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6398 raise errors.OpPrereqError("Can't recreate disks partially and"
6399 " change the nodes at the same time",
6401 self.instance = instance
6403 def Exec(self, feedback_fn):
6404 """Recreate the disks.
6407 instance = self.instance
6410 mods = [] # keeps track of needed logical_id changes
6412 for idx, disk in enumerate(instance.disks):
6413 if idx not in self.op.disks: # disk idx has not been passed in
6416 # update secondaries for disks, if needed
6418 if disk.dev_type == constants.LD_DRBD8:
6419 # need to update the nodes and minors
6420 assert len(self.op.nodes) == 2
6421 assert len(disk.logical_id) == 6 # otherwise disk internals
6423 (_, _, old_port, _, _, old_secret) = disk.logical_id
6424 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6425 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6426 new_minors[0], new_minors[1], old_secret)
6427 assert len(disk.logical_id) == len(new_id)
6428 mods.append((idx, new_id))
6430 # now that we have passed all asserts above, we can apply the mods
6431 # in a single run (to avoid partial changes)
6432 for idx, new_id in mods:
6433 instance.disks[idx].logical_id = new_id
6435 # change primary node, if needed
6437 instance.primary_node = self.op.nodes[0]
6438 self.LogWarning("Changing the instance's nodes, you will have to"
6439 " remove any disks left on the older nodes manually")
6442 self.cfg.Update(instance, feedback_fn)
6444 _CreateDisks(self, instance, to_skip=to_skip)
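# Illustrative sketch of the logical_id rewrite performed above for DRBD
# disks (hypothetical helper): a DRBD8 logical_id is the 6-tuple
# (node_a, node_b, port, minor_a, minor_b, secret), and recreating the disk
# on new nodes keeps the port and secret while replacing the node names and
# using the freshly allocated minors.
def _ExampleRemapDrbdLogicalId(old_logical_id, new_nodes, new_minors):
  """Return a new DRBD logical_id for a disk moved to new_nodes."""
  (_, _, old_port, _, _, old_secret) = old_logical_id
  return (new_nodes[0], new_nodes[1], old_port,
          new_minors[0], new_minors[1], old_secret)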
6447 class LUInstanceRename(LogicalUnit):
6448 """Rename an instance.
6451 HPATH = "instance-rename"
6452 HTYPE = constants.HTYPE_INSTANCE
6454 def CheckArguments(self):
6458 if self.op.ip_check and not self.op.name_check:
6459 # TODO: make the ip check more flexible and not depend on the name check
6460 raise errors.OpPrereqError("IP address check requires a name check",
6463 def BuildHooksEnv(self):
6466 This runs on master, primary and secondary nodes of the instance.
6469 env = _BuildInstanceHookEnvByObject(self, self.instance)
6470 env["INSTANCE_NEW_NAME"] = self.op.new_name
6473 def BuildHooksNodes(self):
6474 """Build hooks nodes.
6477 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6480 def CheckPrereq(self):
6481 """Check prerequisites.
6483 This checks that the instance is in the cluster and is not running.
6486 self.op.instance_name = _ExpandInstanceName(self.cfg,
6487 self.op.instance_name)
6488 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6489 assert instance is not None
6490 _CheckNodeOnline(self, instance.primary_node)
6491 _CheckInstanceDown(self, instance, "cannot rename")
6492 self.instance = instance
6494 new_name = self.op.new_name
6495 if self.op.name_check:
6496 hostname = netutils.GetHostname(name=new_name)
6497 if hostname.name != new_name:
6498 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6500 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6501 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6502 " same as given hostname '%s'") %
6503 (hostname.name, self.op.new_name),
6505 new_name = self.op.new_name = hostname.name
6506 if (self.op.ip_check and
6507 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6508 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6509 (hostname.ip, new_name),
6510 errors.ECODE_NOTUNIQUE)
6512 instance_list = self.cfg.GetInstanceList()
6513 if new_name in instance_list and new_name != instance.name:
6514 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6515 new_name, errors.ECODE_EXISTS)
6517 def Exec(self, feedback_fn):
6518 """Rename the instance.
6521 inst = self.instance
6522 old_name = inst.name
6524 rename_file_storage = False
6525 if (inst.disk_template in constants.DTS_FILEBASED and
6526 self.op.new_name != inst.name):
6527 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6528 rename_file_storage = True
6530 self.cfg.RenameInstance(inst.name, self.op.new_name)
6531 # Change the instance lock. This is definitely safe while we hold the BGL.
6532 # Otherwise the new lock would have to be added in acquired mode.
6534 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6535 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6537 # re-read the instance from the configuration after rename
6538 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6540 if rename_file_storage:
6541 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6542 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6543 old_file_storage_dir,
6544 new_file_storage_dir)
6545 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6546 " (but the instance has been renamed in Ganeti)" %
6547 (inst.primary_node, old_file_storage_dir,
6548 new_file_storage_dir))
6550 _StartInstanceDisks(self, inst, None)
6552 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6553 old_name, self.op.debug_level)
6554 msg = result.fail_msg
6556 msg = ("Could not run OS rename script for instance %s on node %s"
6557 " (but the instance has been renamed in Ganeti): %s" %
6558 (inst.name, inst.primary_node, msg))
6559 self.proc.LogWarning(msg)
6561 _ShutdownInstanceDisks(self, inst)
6566 class LUInstanceRemove(LogicalUnit):
6567 """Remove an instance.
6570 HPATH = "instance-remove"
6571 HTYPE = constants.HTYPE_INSTANCE
6574 def ExpandNames(self):
6575 self._ExpandAndLockInstance()
6576 self.needed_locks[locking.LEVEL_NODE] = []
6577 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6579 def DeclareLocks(self, level):
6580 if level == locking.LEVEL_NODE:
6581 self._LockInstancesNodes()
6583 def BuildHooksEnv(self):
6586 This runs on master, primary and secondary nodes of the instance.
6589 env = _BuildInstanceHookEnvByObject(self, self.instance)
6590 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6593 def BuildHooksNodes(self):
6594 """Build hooks nodes.
6597 nl = [self.cfg.GetMasterNode()]
6598 nl_post = list(self.instance.all_nodes) + nl
6599 return (nl, nl_post)
6601 def CheckPrereq(self):
6602 """Check prerequisites.
6604 This checks that the instance is in the cluster.
6607 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6608 assert self.instance is not None, \
6609 "Cannot retrieve locked instance %s" % self.op.instance_name
6611 def Exec(self, feedback_fn):
6612 """Remove the instance.
6615 instance = self.instance
6616 logging.info("Shutting down instance %s on node %s",
6617 instance.name, instance.primary_node)
6619 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6620 self.op.shutdown_timeout)
6621 msg = result.fail_msg
6623 if self.op.ignore_failures:
6624 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6626 raise errors.OpExecError("Could not shutdown instance %s on"
6628 (instance.name, instance.primary_node, msg))
6630 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6633 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6634 """Utility function to remove an instance.
6637 logging.info("Removing block devices for instance %s", instance.name)
6639 if not _RemoveDisks(lu, instance):
6640 if not ignore_failures:
6641 raise errors.OpExecError("Can't remove instance's disks")
6642 feedback_fn("Warning: can't remove instance's disks")
6644 logging.info("Removing instance %s out of cluster config", instance.name)
6646 lu.cfg.RemoveInstance(instance.name)
6648 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6649 "Instance lock removal conflict"
6651 # Remove lock for the instance
6652 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6655 class LUInstanceQuery(NoHooksLU):
6656 """Logical unit for querying instances.
6659 # pylint: disable=W0142
6662 def CheckArguments(self):
6663 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6664 self.op.output_fields, self.op.use_locking)
6666 def ExpandNames(self):
6667 self.iq.ExpandNames(self)
6669 def DeclareLocks(self, level):
6670 self.iq.DeclareLocks(self, level)
6672 def Exec(self, feedback_fn):
6673 return self.iq.OldStyleQuery(self)
6676 class LUInstanceFailover(LogicalUnit):
6677 """Failover an instance.
6680 HPATH = "instance-failover"
6681 HTYPE = constants.HTYPE_INSTANCE
6684 def CheckArguments(self):
6685 """Check the arguments.
6688 self.iallocator = getattr(self.op, "iallocator", None)
6689 self.target_node = getattr(self.op, "target_node", None)
6691 def ExpandNames(self):
6692 self._ExpandAndLockInstance()
6694 if self.op.target_node is not None:
6695 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6697 self.needed_locks[locking.LEVEL_NODE] = []
6698 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6700 ignore_consistency = self.op.ignore_consistency
6701 shutdown_timeout = self.op.shutdown_timeout
6702 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6705 ignore_consistency=ignore_consistency,
6706 shutdown_timeout=shutdown_timeout)
6707 self.tasklets = [self._migrater]
6709 def DeclareLocks(self, level):
6710 if level == locking.LEVEL_NODE:
6711 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6712 if instance.disk_template in constants.DTS_EXT_MIRROR:
6713 if self.op.target_node is None:
6714 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6716 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6717 self.op.target_node]
6718 del self.recalculate_locks[locking.LEVEL_NODE]
6720 self._LockInstancesNodes()
6722 def BuildHooksEnv(self):
6725 This runs on master, primary and secondary nodes of the instance.
6728 instance = self._migrater.instance
6729 source_node = instance.primary_node
6730 target_node = self.op.target_node
6732 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6733 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6734 "OLD_PRIMARY": source_node,
6735 "NEW_PRIMARY": target_node,
6738 if instance.disk_template in constants.DTS_INT_MIRROR:
6739 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6740 env["NEW_SECONDARY"] = source_node
6742 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6744 env.update(_BuildInstanceHookEnvByObject(self, instance))
6748 def BuildHooksNodes(self):
6749 """Build hooks nodes.
6752 instance = self._migrater.instance
6753 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6754 return (nl, nl + [instance.primary_node])
6757 class LUInstanceMigrate(LogicalUnit):
6758 """Migrate an instance.
6760 This is migration without shutting down, compared to the failover,
6761 which is done with shutdown.
6764 HPATH = "instance-migrate"
6765 HTYPE = constants.HTYPE_INSTANCE
6768 def ExpandNames(self):
6769 self._ExpandAndLockInstance()
6771 if self.op.target_node is not None:
6772 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6774 self.needed_locks[locking.LEVEL_NODE] = []
6775 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6777 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6778 cleanup=self.op.cleanup,
6780 fallback=self.op.allow_failover)
6781 self.tasklets = [self._migrater]
6783 def DeclareLocks(self, level):
6784 if level == locking.LEVEL_NODE:
6785 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6786 if instance.disk_template in constants.DTS_EXT_MIRROR:
6787 if self.op.target_node is None:
6788 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6790 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6791 self.op.target_node]
6792 del self.recalculate_locks[locking.LEVEL_NODE]
6794 self._LockInstancesNodes()
6796 def BuildHooksEnv(self):
6799 This runs on master, primary and secondary nodes of the instance.
6802 instance = self._migrater.instance
6803 source_node = instance.primary_node
6804 target_node = self.op.target_node
6805 env = _BuildInstanceHookEnvByObject(self, instance)
6807 "MIGRATE_LIVE": self._migrater.live,
6808 "MIGRATE_CLEANUP": self.op.cleanup,
6809 "OLD_PRIMARY": source_node,
6810 "NEW_PRIMARY": target_node,
6813 if instance.disk_template in constants.DTS_INT_MIRROR:
6814 env["OLD_SECONDARY"] = target_node
6815 env["NEW_SECONDARY"] = source_node
6817 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6821 def BuildHooksNodes(self):
6822 """Build hooks nodes.
6825 instance = self._migrater.instance
6826 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6827 return (nl, nl + [instance.primary_node])
6830 class LUInstanceMove(LogicalUnit):
6831 """Move an instance by data-copying.
6834 HPATH = "instance-move"
6835 HTYPE = constants.HTYPE_INSTANCE
6838 def ExpandNames(self):
6839 self._ExpandAndLockInstance()
6840 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6841 self.op.target_node = target_node
6842 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6843 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6845 def DeclareLocks(self, level):
6846 if level == locking.LEVEL_NODE:
6847 self._LockInstancesNodes(primary_only=True)
6849 def BuildHooksEnv(self):
6852 This runs on master, primary and secondary nodes of the instance.
6856 "TARGET_NODE": self.op.target_node,
6857 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6859 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6862 def BuildHooksNodes(self):
6863 """Build hooks nodes.
6867 self.cfg.GetMasterNode(),
6868 self.instance.primary_node,
6869 self.op.target_node,
6873 def CheckPrereq(self):
6874 """Check prerequisites.
6876 This checks that the instance is in the cluster.
6879 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6880 assert self.instance is not None, \
6881 "Cannot retrieve locked instance %s" % self.op.instance_name
6883 node = self.cfg.GetNodeInfo(self.op.target_node)
6884 assert node is not None, \
6885 "Cannot retrieve locked node %s" % self.op.target_node
6887 self.target_node = target_node = node.name
6889 if target_node == instance.primary_node:
6890 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6891 (instance.name, target_node),
6894 bep = self.cfg.GetClusterInfo().FillBE(instance)
6896 for idx, dsk in enumerate(instance.disks):
6897 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6898 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6899 " cannot copy" % idx, errors.ECODE_STATE)
6901 _CheckNodeOnline(self, target_node)
6902 _CheckNodeNotDrained(self, target_node)
6903 _CheckNodeVmCapable(self, target_node)
6905 if instance.admin_up:
6906 # check memory requirements on the secondary node
6907 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6908 instance.name, bep[constants.BE_MEMORY],
6909 instance.hypervisor)
6911 self.LogInfo("Not checking memory on the secondary node as"
6912 " instance will not be started")
6914 # check bridge existence
6915 _CheckInstanceBridgesExist(self, instance, node=target_node)
6917 def Exec(self, feedback_fn):
6918 """Move an instance.
6920 The move is done by shutting it down on its present node, copying
6921 the data over (slow) and starting it on the new node.
6924 instance = self.instance
6926 source_node = instance.primary_node
6927 target_node = self.target_node
6929 self.LogInfo("Shutting down instance %s on source node %s",
6930 instance.name, source_node)
6932 result = self.rpc.call_instance_shutdown(source_node, instance,
6933 self.op.shutdown_timeout)
6934 msg = result.fail_msg
6936 if self.op.ignore_consistency:
6937 self.proc.LogWarning("Could not shutdown instance %s on node %s."
6938 " Proceeding anyway. Please make sure node"
6939 " %s is down. Error details: %s",
6940 instance.name, source_node, source_node, msg)
6942 raise errors.OpExecError("Could not shutdown instance %s on"
6944 (instance.name, source_node, msg))
6946 # create the target disks
6948 _CreateDisks(self, instance, target_node=target_node)
6949 except errors.OpExecError:
6950 self.LogWarning("Device creation failed, reverting...")
6952 _RemoveDisks(self, instance, target_node=target_node)
6954 self.cfg.ReleaseDRBDMinors(instance.name)
6957 cluster_name = self.cfg.GetClusterInfo().cluster_name
6960 # activate, get path, copy the data over
6961 for idx, disk in enumerate(instance.disks):
6962 self.LogInfo("Copying data for disk %d", idx)
6963 result = self.rpc.call_blockdev_assemble(target_node, disk,
6964 instance.name, True, idx)
6966 self.LogWarning("Can't assemble newly created disk %d: %s",
6967 idx, result.fail_msg)
6968 errs.append(result.fail_msg)
6970 dev_path = result.payload
6971 result = self.rpc.call_blockdev_export(source_node, disk,
6972 target_node, dev_path,
6975 self.LogWarning("Can't copy data over for disk %d: %s",
6976 idx, result.fail_msg)
6977 errs.append(result.fail_msg)
6981 self.LogWarning("Some disks failed to copy, aborting")
6983 _RemoveDisks(self, instance, target_node=target_node)
6985 self.cfg.ReleaseDRBDMinors(instance.name)
6986 raise errors.OpExecError("Errors during disk copy: %s" %
6989 instance.primary_node = target_node
6990 self.cfg.Update(instance, feedback_fn)
6992 self.LogInfo("Removing the disks on the original node")
6993 _RemoveDisks(self, instance, target_node=source_node)
6995 # Only start the instance if it's marked as up
6996 if instance.admin_up:
6997 self.LogInfo("Starting instance %s on node %s",
6998 instance.name, target_node)
7000 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7001 ignore_secondaries=True)
7003 _ShutdownInstanceDisks(self, instance)
7004 raise errors.OpExecError("Can't activate the instance's disks")
7006 result = self.rpc.call_instance_start(target_node, instance,
7008 msg = result.fail_msg
7010 _ShutdownInstanceDisks(self, instance)
7011 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7012 (instance.name, target_node, msg))
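# Hypothetical sketch of the per-disk copy loop used by LUInstanceMove.Exec
# above: each disk is assembled on the target node and then exported from
# the source node to the resulting device path; failures are collected so
# that all disks are attempted before the operation is aborted.  The final
# cluster_name argument to call_blockdev_export is an assumption based on
# the surrounding code.
def _ExampleCopyDisksForMove(lu, instance, source_node, target_node):
  """Copy instance disks to target_node and collect error messages."""
  errs = []
  for idx, disk in enumerate(instance.disks):
    result = lu.rpc.call_blockdev_assemble(target_node, disk,
                                           instance.name, True, idx)
    if result.fail_msg:
      errs.append(result.fail_msg)
      continue
    dev_path = result.payload
    result = lu.rpc.call_blockdev_export(source_node, disk,
                                         target_node, dev_path,
                                         lu.cfg.GetClusterInfo().cluster_name)
    if result.fail_msg:
      errs.append(result.fail_msg)
  return errs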
7015 class LUNodeMigrate(LogicalUnit):
7016 """Migrate all instances from a node.
7019 HPATH = "node-migrate"
7020 HTYPE = constants.HTYPE_NODE
7023 def CheckArguments(self):
7026 def ExpandNames(self):
7027 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7029 self.share_locks = _ShareAll()
7030 self.needed_locks = {
7031 locking.LEVEL_NODE: [self.op.node_name],
7034 def BuildHooksEnv(self):
7037 This runs on the master, the primary and all the secondaries.
7041 "NODE_NAME": self.op.node_name,
7044 def BuildHooksNodes(self):
7045 """Build hooks nodes.
7048 nl = [self.cfg.GetMasterNode()]
7051 def CheckPrereq(self):
7054 def Exec(self, feedback_fn):
7055 # Prepare jobs for migration instances
7057 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7060 iallocator=self.op.iallocator,
7061 target_node=self.op.target_node)]
7062 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7065 # TODO: Run iallocator in this opcode and pass correct placement options to
7066 # OpInstanceMigrate. Since other jobs can modify the cluster between
7067 # running the iallocator and the actual migration, a good consistency model
7068 # will have to be found.
7070 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7071 frozenset([self.op.node_name]))
7073 return ResultWithJobs(jobs)
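# Hypothetical sketch of the value returned above: "jobs" is a list of jobs,
# each job itself being a list of opcodes, so migrating two primary
# instances off a node submits two independent single-opcode jobs, e.g.
#
#   jobs = [
#     [opcodes.OpInstanceMigrate(instance_name="inst1.example.com", ...)],
#     [opcodes.OpInstanceMigrate(instance_name="inst2.example.com", ...)],
#     ]
#   return ResultWithJobs(jobs)
#
# The instance names are invented; the "..." stands for the placement
# options forwarded from this opcode.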
7076 class TLMigrateInstance(Tasklet):
7077 """Tasklet class for instance migration.
7080 @ivar live: whether the migration will be done live or non-live;
7081 this variable is initialized only after CheckPrereq has run
7082 @type cleanup: boolean
7083 @ivar cleanup: Whether we clean up from a failed migration
7084 @type iallocator: string
7085 @ivar iallocator: The iallocator used to determine target_node
7086 @type target_node: string
7087 @ivar target_node: If given, the target_node to reallocate the instance to
7088 @type failover: boolean
7089 @ivar failover: Whether operation results in failover or migration
7090 @type fallback: boolean
7091 @ivar fallback: Whether fallback to failover is allowed if migration not
7093 @type ignore_consistency: boolean
7094 @ivar ignore_consistency: Whether we should ignore consistency between source
7096 @type shutdown_timeout: int
7097 @ivar shutdown_timeout: In case of failover, the timeout to use for the shutdown
7100 def __init__(self, lu, instance_name, cleanup=False,
7101 failover=False, fallback=False,
7102 ignore_consistency=False,
7103 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7104 """Initializes this class.
7107 Tasklet.__init__(self, lu)
7110 self.instance_name = instance_name
7111 self.cleanup = cleanup
7112 self.live = False # will be overridden later
7113 self.failover = failover
7114 self.fallback = fallback
7115 self.ignore_consistency = ignore_consistency
7116 self.shutdown_timeout = shutdown_timeout
7118 def CheckPrereq(self):
7119 """Check prerequisites.
7121 This checks that the instance is in the cluster.
7124 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7125 instance = self.cfg.GetInstanceInfo(instance_name)
7126 assert instance is not None
7127 self.instance = instance
7129 if (not self.cleanup and not instance.admin_up and not self.failover and
7131 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7133 self.failover = True
7135 if instance.disk_template not in constants.DTS_MIRRORED:
7140 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7141 " %s" % (instance.disk_template, text),
7144 if instance.disk_template in constants.DTS_EXT_MIRROR:
7145 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7147 if self.lu.op.iallocator:
7148 self._RunAllocator()
7150 # We set self.target_node as it is required by
7152 self.target_node = self.lu.op.target_node
7154 # self.target_node is already populated, either directly or by the
7156 target_node = self.target_node
7157 if self.target_node == instance.primary_node:
7158 raise errors.OpPrereqError("Cannot migrate instance %s"
7159 " to its primary (%s)" %
7160 (instance.name, instance.primary_node))
7162 if len(self.lu.tasklets) == 1:
7163 # It is safe to release locks only when we're the only tasklet
7165 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7166 keep=[instance.primary_node, self.target_node])
7169 secondary_nodes = instance.secondary_nodes
7170 if not secondary_nodes:
7171 raise errors.ConfigurationError("No secondary node but using"
7172 " %s disk template" %
7173 instance.disk_template)
7174 target_node = secondary_nodes[0]
7175 if self.lu.op.iallocator or (self.lu.op.target_node and
7176 self.lu.op.target_node != target_node):
7178 text = "failed over"
7181 raise errors.OpPrereqError("Instances with disk template %s cannot"
7182 " be %s to arbitrary nodes"
7183 " (neither an iallocator nor a target"
7184 " node can be passed)" %
7185 (instance.disk_template, text),
7188 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7190 # check memory requirements on the secondary node
7191 if not self.cleanup and (not self.failover or instance.admin_up):
7192 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7193 instance.name, i_be[constants.BE_MEMORY],
7194 instance.hypervisor)
7196 self.lu.LogInfo("Not checking memory on the secondary node as"
7197 " instance will not be started")
7199 # check bridge existence
7200 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7202 if not self.cleanup:
7203 _CheckNodeNotDrained(self.lu, target_node)
7204 if not self.failover:
7205 result = self.rpc.call_instance_migratable(instance.primary_node,
7207 if result.fail_msg and self.fallback:
7208 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7210 self.failover = True
7212 result.Raise("Can't migrate, please use failover",
7213 prereq=True, ecode=errors.ECODE_STATE)
7215 assert not (self.failover and self.cleanup)
7217 if not self.failover:
7218 if self.lu.op.live is not None and self.lu.op.mode is not None:
7219 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7220 " parameters are accepted",
7222 if self.lu.op.live is not None:
7224 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7226 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7227 # reset the 'live' parameter to None so that repeated
7228 # invocations of CheckPrereq do not raise an exception
7229 self.lu.op.live = None
7230 elif self.lu.op.mode is None:
7231 # read the default value from the hypervisor
7232 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7234 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7236 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7238 # Failover is never live
7241 def _RunAllocator(self):
7242 """Run the allocator based on input opcode.
7245 ial = IAllocator(self.cfg, self.rpc,
7246 mode=constants.IALLOCATOR_MODE_RELOC,
7247 name=self.instance_name,
7248 # TODO See why hail breaks with a single node below
7249 relocate_from=[self.instance.primary_node,
7250 self.instance.primary_node],
7253 ial.Run(self.lu.op.iallocator)
7256 raise errors.OpPrereqError("Can't compute nodes using"
7257 " iallocator '%s': %s" %
7258 (self.lu.op.iallocator, ial.info),
7260 if len(ial.result) != ial.required_nodes:
7261 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7262 " of nodes (%s), required %s" %
7263 (self.lu.op.iallocator, len(ial.result),
7264 ial.required_nodes), errors.ECODE_FAULT)
7265 self.target_node = ial.result[0]
7266 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7267 self.instance_name, self.lu.op.iallocator,
7268 utils.CommaJoin(ial.result))
7270 def _WaitUntilSync(self):
7271 """Poll with custom rpc for disk sync.
7273 This uses our own step-based rpc call.
7276 self.feedback_fn("* wait until resync is done")
7280 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7282 self.instance.disks)
7284 for node, nres in result.items():
7285 nres.Raise("Cannot resync disks on node %s" % node)
7286 node_done, node_percent = nres.payload
7287 all_done = all_done and node_done
7288 if node_percent is not None:
7289 min_percent = min(min_percent, node_percent)
7291 if min_percent < 100:
7292 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7295 def _EnsureSecondary(self, node):
7296 """Demote a node to secondary.
7299 self.feedback_fn("* switching node %s to secondary mode" % node)
7301 for dev in self.instance.disks:
7302 self.cfg.SetDiskID(dev, node)
7304 result = self.rpc.call_blockdev_close(node, self.instance.name,
7305 self.instance.disks)
7306 result.Raise("Cannot change disk to secondary on node %s" % node)
7308 def _GoStandalone(self):
7309 """Disconnect from the network.
7312 self.feedback_fn("* changing into standalone mode")
7313 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7314 self.instance.disks)
7315 for node, nres in result.items():
7316 nres.Raise("Cannot disconnect disks node %s" % node)
7318 def _GoReconnect(self, multimaster):
7319 """Reconnect to the network.
7325 msg = "single-master"
7326 self.feedback_fn("* changing disks into %s mode" % msg)
7327 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7328 self.instance.disks,
7329 self.instance.name, multimaster)
7330 for node, nres in result.items():
7331 nres.Raise("Cannot change disks config on node %s" % node)
7333 def _ExecCleanup(self):
7334 """Try to cleanup after a failed migration.
7336 The cleanup is done by:
7337 - check that the instance is running only on one node
7338 (and update the config if needed)
7339 - change disks on its secondary node to secondary
7340 - wait until disks are fully synchronized
7341 - disconnect from the network
7342 - change disks into single-master mode
7343 - wait again until disks are fully synchronized
7346 instance = self.instance
7347 target_node = self.target_node
7348 source_node = self.source_node
7350 # check running on only one node
7351 self.feedback_fn("* checking where the instance actually runs"
7352 " (if this hangs, the hypervisor might be in"
7354 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7355 for node, result in ins_l.items():
7356 result.Raise("Can't contact node %s" % node)
7358 runningon_source = instance.name in ins_l[source_node].payload
7359 runningon_target = instance.name in ins_l[target_node].payload
7361 if runningon_source and runningon_target:
7362 raise errors.OpExecError("Instance seems to be running on two nodes,"
7363 " or the hypervisor is confused; you will have"
7364 " to ensure manually that it runs only on one"
7365 " and restart this operation")
7367 if not (runningon_source or runningon_target):
7368 raise errors.OpExecError("Instance does not seem to be running at all;"
7369 " in this case it's safer to repair by"
7370 " running 'gnt-instance stop' to ensure disk"
7371 " shutdown, and then restarting it")
7373 if runningon_target:
7374 # the migration has actually succeeded, we need to update the config
7375 self.feedback_fn("* instance running on secondary node (%s),"
7376 " updating config" % target_node)
7377 instance.primary_node = target_node
7378 self.cfg.Update(instance, self.feedback_fn)
7379 demoted_node = source_node
7381 self.feedback_fn("* instance confirmed to be running on its"
7382 " primary node (%s)" % source_node)
7383 demoted_node = target_node
7385 if instance.disk_template in constants.DTS_INT_MIRROR:
7386 self._EnsureSecondary(demoted_node)
7388 self._WaitUntilSync()
7389 except errors.OpExecError:
7390 # we ignore here errors, since if the device is standalone, it
7391 # won't be able to sync
7393 self._GoStandalone()
7394 self._GoReconnect(False)
7395 self._WaitUntilSync()
7397 self.feedback_fn("* done")
7399 def _RevertDiskStatus(self):
7400 """Try to revert the disk status after a failed migration.
7403 target_node = self.target_node
7404 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7408 self._EnsureSecondary(target_node)
7409 self._GoStandalone()
7410 self._GoReconnect(False)
7411 self._WaitUntilSync()
7412 except errors.OpExecError, err:
7413 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7414 " please try to recover the instance manually;"
7415 " error '%s'" % str(err))
7417 def _AbortMigration(self):
7418 """Call the hypervisor code to abort a started migration.
7421 instance = self.instance
7422 target_node = self.target_node
7423 migration_info = self.migration_info
7425 abort_result = self.rpc.call_finalize_migration(target_node,
7429 abort_msg = abort_result.fail_msg
7431 logging.error("Aborting migration failed on target node %s: %s",
7432 target_node, abort_msg)
7433 # Don't raise an exception here, as we still have to try to revert the
7434 # disk status, even if this step failed.
7436 def _ExecMigration(self):
7437 """Migrate an instance.
7439 The migrate is done by:
7440 - change the disks into dual-master mode
7441 - wait until disks are fully synchronized again
7442 - migrate the instance
7443 - change disks on the new secondary node (the old primary) to secondary
7444 - wait until disks are fully synchronized
7445 - change disks into single-master mode
7448 instance = self.instance
7449 target_node = self.target_node
7450 source_node = self.source_node
7452 # Check for hypervisor version mismatch and warn the user.
7453 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7454 None, self.instance.hypervisor)
7455 src_info = nodeinfo[source_node]
7456 dst_info = nodeinfo[target_node]
7458 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7459 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7460 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7461 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7462 if src_version != dst_version:
7463 self.feedback_fn("* warning: hypervisor version mismatch between"
7464 " source (%s) and target (%s) node" %
7465 (src_version, dst_version))
7467 self.feedback_fn("* checking disk consistency between source and target")
7468 for dev in instance.disks:
7469 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7470 raise errors.OpExecError("Disk %s is degraded or not fully"
7471 " synchronized on target node,"
7472 " aborting migration" % dev.iv_name)
7474 # First get the migration information from the remote node
7475 result = self.rpc.call_migration_info(source_node, instance)
7476 msg = result.fail_msg
7478 log_err = ("Failed fetching source migration information from %s: %s" %
7480 logging.error(log_err)
7481 raise errors.OpExecError(log_err)
7483 self.migration_info = migration_info = result.payload
7485 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7486 # Then switch the disks to master/master mode
7487 self._EnsureSecondary(target_node)
7488 self._GoStandalone()
7489 self._GoReconnect(True)
7490 self._WaitUntilSync()
7492 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7493 result = self.rpc.call_accept_instance(target_node,
7496 self.nodes_ip[target_node])
7498 msg = result.fail_msg
7500 logging.error("Instance pre-migration failed, trying to revert"
7501 " disk status: %s", msg)
7502 self.feedback_fn("Pre-migration failed, aborting")
7503 self._AbortMigration()
7504 self._RevertDiskStatus()
7505 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7506 (instance.name, msg))
7508 self.feedback_fn("* migrating instance to %s" % target_node)
7509 result = self.rpc.call_instance_migrate(source_node, instance,
7510 self.nodes_ip[target_node],
7512 msg = result.fail_msg
7514 logging.error("Instance migration failed, trying to revert"
7515 " disk status: %s", msg)
7516 self.feedback_fn("Migration failed, aborting")
7517 self._AbortMigration()
7518 self._RevertDiskStatus()
7519 raise errors.OpExecError("Could not migrate instance %s: %s" %
7520 (instance.name, msg))
7522 instance.primary_node = target_node
7523 # distribute new instance config to the other nodes
7524 self.cfg.Update(instance, self.feedback_fn)
7526 result = self.rpc.call_finalize_migration(target_node,
7530 msg = result.fail_msg
7532 logging.error("Instance migration succeeded, but finalization failed:"
7534 raise errors.OpExecError("Could not finalize instance migration: %s" %
7537 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7538 self._EnsureSecondary(source_node)
7539 self._WaitUntilSync()
7540 self._GoStandalone()
7541 self._GoReconnect(False)
7542 self._WaitUntilSync()
7544 self.feedback_fn("* done")
7546 def _ExecFailover(self):
7547 """Failover an instance.
7549 The failover is done by shutting it down on its present node and
7550 starting it on the secondary.
7553 instance = self.instance
7554 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7556 source_node = instance.primary_node
7557 target_node = self.target_node
7559 if instance.admin_up:
7560 self.feedback_fn("* checking disk consistency between source and target")
7561 for dev in instance.disks:
7562 # for drbd, these are drbd over lvm
7563 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7564 if primary_node.offline:
7565 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7567 (primary_node.name, dev.iv_name, target_node))
7568 elif not self.ignore_consistency:
7569 raise errors.OpExecError("Disk %s is degraded on target node,"
7570 " aborting failover" % dev.iv_name)
7572 self.feedback_fn("* not checking disk consistency as instance is not"
7575 self.feedback_fn("* shutting down instance on source node")
7576 logging.info("Shutting down instance %s on node %s",
7577 instance.name, source_node)
7579 result = self.rpc.call_instance_shutdown(source_node, instance,
7580 self.shutdown_timeout)
7581 msg = result.fail_msg
7582 if msg:
7583 if self.ignore_consistency or primary_node.offline:
7584 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7585 " proceeding anyway; please make sure node"
7586 " %s is down; error details: %s",
7587 instance.name, source_node, source_node, msg)
7588 else:
7589 raise errors.OpExecError("Could not shutdown instance %s on"
7590 " node %s: %s" %
7591 (instance.name, source_node, msg))
7593 self.feedback_fn("* deactivating the instance's disks on source node")
7594 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7595 raise errors.OpExecError("Can't shut down the instance's disks")
7597 instance.primary_node = target_node
7598 # distribute new instance config to the other nodes
7599 self.cfg.Update(instance, self.feedback_fn)
7601 # Only start the instance if it's marked as up
7602 if instance.admin_up:
7603 self.feedback_fn("* activating the instance's disks on target node %s" %
7604 target_node)
7605 logging.info("Starting instance %s on node %s",
7606 instance.name, target_node)
7608 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7609 ignore_secondaries=True)
7610 if not disks_ok:
7611 _ShutdownInstanceDisks(self.lu, instance)
7612 raise errors.OpExecError("Can't activate the instance's disks")
7612 raise errors.OpExecError("Can't activate the instance's disks")
7614 self.feedback_fn("* starting the instance on the target node %s" %
7615 target_node)
7616 result = self.rpc.call_instance_start(target_node, instance, None, None,
7618 msg = result.fail_msg
7619 if msg:
7620 _ShutdownInstanceDisks(self.lu, instance)
7621 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7622 (instance.name, target_node, msg))
7624 def Exec(self, feedback_fn):
7625 """Perform the migration.
7628 self.feedback_fn = feedback_fn
7629 self.source_node = self.instance.primary_node
7631 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7632 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7633 self.target_node = self.instance.secondary_nodes[0]
7634 # Otherwise self.target_node has been populated either
7635 # directly, or through an iallocator.
7637 self.all_nodes = [self.source_node, self.target_node]
7638 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7639 in self.cfg.GetMultiNodeInfo(self.all_nodes))
7641 if self.failover:
7642 feedback_fn("Failover instance %s" % self.instance.name)
7643 self._ExecFailover()
7644 else:
7645 feedback_fn("Migrating instance %s" % self.instance.name)
7647 if self.cleanup:
7648 return self._ExecCleanup()
7649 else:
7650 return self._ExecMigration()
7653 def _CreateBlockDev(lu, node, instance, device, force_create,
7655 """Create a tree of block devices on a given node.
7657 If this device type has to be created on secondaries, create it and
7658 all its children.
7660 If not, just recurse to children keeping the same 'force' value.
7662 @param lu: the lu on whose behalf we execute
7663 @param node: the node on which to create the device
7664 @type instance: L{objects.Instance}
7665 @param instance: the instance which owns the device
7666 @type device: L{objects.Disk}
7667 @param device: the device to create
7668 @type force_create: boolean
7669 @param force_create: whether to force creation of this device; this
7670 will be changed to True whenever we find a device whose
7671 CreateOnSecondary() method returns True
7672 @param info: the extra 'metadata' we should attach to the device
7673 (this will be represented as a LVM tag)
7674 @type force_open: boolean
7675 @param force_open: this parameter will be passed to the
7676 L{backend.BlockdevCreate} function where it specifies
7677 whether we run on primary or not, and it affects both
7678 the child assembly and the device's own Open() execution
7681 if device.CreateOnSecondary():
7682 force_create = True
7684 if device.children:
7685 for child in device.children:
7686 _CreateBlockDev(lu, node, instance, child, force_create,
7687 info, force_open)
7689 if not force_create:
7690 return
7692 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
7695 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7696 """Create a single block device on a given node.
7698 This will not recurse over children of the device, so they must be
7701 @param lu: the lu on whose behalf we execute
7702 @param node: the node on which to create the device
7703 @type instance: L{objects.Instance}
7704 @param instance: the instance which owns the device
7705 @type device: L{objects.Disk}
7706 @param device: the device to create
7707 @param info: the extra 'metadata' we should attach to the device
7708 (this will be represented as a LVM tag)
7709 @type force_open: boolean
7710 @param force_open: this parameter will be passed to the
7711 L{backend.BlockdevCreate} function where it specifies
7712 whether we run on primary or not, and it affects both
7713 the child assembly and the device's own Open() execution
7716 lu.cfg.SetDiskID(device, node)
7717 result = lu.rpc.call_blockdev_create(node, device, device.size,
7718 instance.name, force_open, info)
7719 result.Raise("Can't create block device %s on"
7720 " node %s for instance %s" % (device, node, instance.name))
7721 if device.physical_id is None:
7722 device.physical_id = result.payload
7725 def _GenerateUniqueNames(lu, exts):
7726 """Generate a suitable LV name.
7728 This will generate a logical volume name for the given instance.
7730 """
7731 results = []
7732 for val in exts:
7733 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7734 results.append("%s%s" % (new_id, val))
7736 return results
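# Illustrative sketch (not part of the original module): the helper above
# simply prefixes each requested extension with a freshly generated unique
# ID, so for exts=[".disk0", ".disk1"] it would return names of the form
# ["<uuid>.disk0", "<uuid>.disk1"], where "<uuid>" stands for the UUID-like
# string produced by GenerateUniqueID (exact format is an assumption here).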
7738 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7739 iv_name, p_minor, s_minor):
7740 """Generate a drbd8 device complete with its children.
7743 assert len(vgnames) == len(names) == 2
7744 port = lu.cfg.AllocatePort()
7745 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7746 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7747 logical_id=(vgnames[0], names[0]))
7748 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
7749 logical_id=(vgnames[1], names[1]))
7750 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7751 logical_id=(primary, secondary, port,
7752 p_minor, s_minor,
7753 shared_secret),
7754 children=[dev_data, dev_meta],
7755 iv_name=iv_name)
7757 return drbd_dev
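# Illustrative sketch (not part of the original module; field values are
# assumptions for illustration only): the returned object tree for one
# DRBD8-backed disk looks roughly like
#   Disk(LD_DRBD8, size=size,
#        logical_id=(primary, secondary, port, p_minor, s_minor, secret),
#        children=[Disk(LD_LV, size=size, logical_id=(vg, "<uuid>_data")),
#                  Disk(LD_LV, size=128,  logical_id=(vg, "<uuid>_meta"))])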
7759 def _GenerateDiskTemplate(lu, template_name,
7760 instance_name, primary_node,
7761 secondary_nodes, disk_info,
7762 file_storage_dir, file_driver,
7763 base_index, feedback_fn):
7764 """Generate the entire disk layout for a given template type.
7767 #TODO: compute space requirements
7769 vgname = lu.cfg.GetVGName()
7770 disk_count = len(disk_info)
7772 if template_name == constants.DT_DISKLESS:
7774 elif template_name == constants.DT_PLAIN:
7775 if len(secondary_nodes) != 0:
7776 raise errors.ProgrammerError("Wrong template configuration")
7778 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7779 for i in range(disk_count)])
7780 for idx, disk in enumerate(disk_info):
7781 disk_index = idx + base_index
7782 vg = disk.get(constants.IDISK_VG, vgname)
7783 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7784 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7785 size=disk[constants.IDISK_SIZE],
7786 logical_id=(vg, names[idx]),
7787 iv_name="disk/%d" % disk_index,
7788 mode=disk[constants.IDISK_MODE])
7789 disks.append(disk_dev)
7790 elif template_name == constants.DT_DRBD8:
7791 if len(secondary_nodes) != 1:
7792 raise errors.ProgrammerError("Wrong template configuration")
7793 remote_node = secondary_nodes[0]
7794 minors = lu.cfg.AllocateDRBDMinor(
7795 [primary_node, remote_node] * len(disk_info), instance_name)
7798 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7799 for i in range(disk_count)]):
7800 names.append(lv_prefix + "_data")
7801 names.append(lv_prefix + "_meta")
7802 for idx, disk in enumerate(disk_info):
7803 disk_index = idx + base_index
7804 data_vg = disk.get(constants.IDISK_VG, vgname)
7805 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7806 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7807 disk[constants.IDISK_SIZE],
7809 names[idx * 2:idx * 2 + 2],
7810 "disk/%d" % disk_index,
7811 minors[idx * 2], minors[idx * 2 + 1])
7812 disk_dev.mode = disk[constants.IDISK_MODE]
7813 disks.append(disk_dev)
7814 elif template_name == constants.DT_FILE:
7815 if len(secondary_nodes) != 0:
7816 raise errors.ProgrammerError("Wrong template configuration")
7818 opcodes.RequireFileStorage()
7820 for idx, disk in enumerate(disk_info):
7821 disk_index = idx + base_index
7822 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7823 size=disk[constants.IDISK_SIZE],
7824 iv_name="disk/%d" % disk_index,
7825 logical_id=(file_driver,
7826 "%s/disk%d" % (file_storage_dir,
7828 mode=disk[constants.IDISK_MODE])
7829 disks.append(disk_dev)
7830 elif template_name == constants.DT_SHARED_FILE:
7831 if len(secondary_nodes) != 0:
7832 raise errors.ProgrammerError("Wrong template configuration")
7834 opcodes.RequireSharedFileStorage()
7836 for idx, disk in enumerate(disk_info):
7837 disk_index = idx + base_index
7838 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7839 size=disk[constants.IDISK_SIZE],
7840 iv_name="disk/%d" % disk_index,
7841 logical_id=(file_driver,
7842 "%s/disk%d" % (file_storage_dir,
7844 mode=disk[constants.IDISK_MODE])
7845 disks.append(disk_dev)
7846 elif template_name == constants.DT_BLOCK:
7847 if len(secondary_nodes) != 0:
7848 raise errors.ProgrammerError("Wrong template configuration")
7850 for idx, disk in enumerate(disk_info):
7851 disk_index = idx + base_index
7852 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7853 size=disk[constants.IDISK_SIZE],
7854 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7855 disk[constants.IDISK_ADOPT]),
7856 iv_name="disk/%d" % disk_index,
7857 mode=disk[constants.IDISK_MODE])
7858 disks.append(disk_dev)
7861 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7865 def _GetInstanceInfoText(instance):
7866 """Compute that text that should be added to the disk's metadata.
7869 return "originstname+%s" % instance.name
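# For example, an instance named "inst1.example.com" (a hypothetical name)
# would get the tag "originstname+inst1.example.com" attached to its block
# devices by the creation code below.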
7872 def _CalcEta(time_taken, written, total_size):
7873 """Calculates the ETA based on size written and total size.
7875 @param time_taken: The time taken so far
7876 @param written: amount written so far
7877 @param total_size: The total size of data to be written
7878 @return: The remaining time in seconds
7881 avg_time = time_taken / float(written)
7882 return (total_size - written) * avg_time
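# Worked example (illustrative only): if 512 MiB out of 2048 MiB were written
# in 30 seconds, the average is 30/512 seconds per MiB, so the remaining
# (2048 - 512) MiB are estimated at 1536 * 30 / 512 = 90 seconds:
#   _CalcEta(30.0, 512, 2048) == 90.0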
7885 def _WipeDisks(lu, instance):
7886 """Wipes instance disks.
7888 @type lu: L{LogicalUnit}
7889 @param lu: the logical unit on whose behalf we execute
7890 @type instance: L{objects.Instance}
7891 @param instance: the instance whose disks we should create
7892 @return: the success of the wipe
7895 node = instance.primary_node
7897 for device in instance.disks:
7898 lu.cfg.SetDiskID(device, node)
7900 logging.info("Pause sync of instance %s disks", instance.name)
7901 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
7903 for idx, success in enumerate(result.payload):
7904 if not success:
7905 logging.warn("pause-sync of instance %s for disk %d failed",
7906 instance.name, idx)
7908 try:
7909 for idx, device in enumerate(instance.disks):
7910 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
7911 # MAX_WIPE_CHUNK at max
7912 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
7913 constants.MIN_WIPE_CHUNK_PERCENT)
7914 # we _must_ make this an int, otherwise rounding errors will
7916 wipe_chunk_size = int(wipe_chunk_size)
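# Numeric sketch (constant values assumed for illustration): with
# MIN_WIPE_CHUNK_PERCENT = 10 and MAX_WIPE_CHUNK = 1024, a 2048 MiB disk
# yields min(1024, 2048 / 100.0 * 10) = 204 after the int() cast, while a
# very large disk is capped at 1024 MiB per wipe request.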
7918 lu.LogInfo("* Wiping disk %d", idx)
7919 logging.info("Wiping disk %d for instance %s, node %s using"
7920 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
7925 start_time = time.time()
7927 while offset < size:
7928 wipe_size = min(wipe_chunk_size, size - offset)
7929 logging.debug("Wiping disk %d, offset %s, chunk %s",
7930 idx, offset, wipe_size)
7931 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
7932 result.Raise("Could not wipe disk %d at offset %d for size %d" %
7933 (idx, offset, wipe_size))
7934 now = time.time()
7935 offset += wipe_size
7936 if now - last_output >= 60:
7937 eta = _CalcEta(now - start_time, offset, size)
7938 lu.LogInfo(" - done: %.1f%% ETA: %s" %
7939 (offset / float(size) * 100, utils.FormatSeconds(eta)))
7940 last_output = now
7941 finally:
7942 logging.info("Resume sync of instance %s disks", instance.name)
7944 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
7946 for idx, success in enumerate(result.payload):
7947 if not success:
7948 lu.LogWarning("Resume sync of disk %d failed, please have a"
7949 " look at the status and troubleshoot the issue", idx)
7950 logging.warn("resume-sync of instance %s for disk %d failed",
7951 instance.name, idx)
7954 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
7955 """Create all disks for an instance.
7957 This abstracts away some work from AddInstance.
7959 @type lu: L{LogicalUnit}
7960 @param lu: the logical unit on whose behalf we execute
7961 @type instance: L{objects.Instance}
7962 @param instance: the instance whose disks we should create
7964 @param to_skip: list of indices to skip
7965 @type target_node: string
7966 @param target_node: if passed, overrides the target node for creation
7968 @return: the success of the creation
7971 info = _GetInstanceInfoText(instance)
7972 if target_node is None:
7973 pnode = instance.primary_node
7974 all_nodes = instance.all_nodes
7979 if instance.disk_template in constants.DTS_FILEBASED:
7980 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
7981 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
7983 result.Raise("Failed to create directory '%s' on"
7984 " node %s" % (file_storage_dir, pnode))
7986 # Note: this needs to be kept in sync with adding of disks in
7987 # LUInstanceSetParams
7988 for idx, device in enumerate(instance.disks):
7989 if to_skip and idx in to_skip:
7991 logging.info("Creating volume %s for instance %s",
7992 device.iv_name, instance.name)
7994 for node in all_nodes:
7995 f_create = node == pnode
7996 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
7999 def _RemoveDisks(lu, instance, target_node=None):
8000 """Remove all disks for an instance.
8002 This abstracts away some work from `AddInstance()` and
8003 `RemoveInstance()`. Note that in case some of the devices couldn't
8004 be removed, the removal will continue with the other ones (compare
8005 with `_CreateDisks()`).
8007 @type lu: L{LogicalUnit}
8008 @param lu: the logical unit on whose behalf we execute
8009 @type instance: L{objects.Instance}
8010 @param instance: the instance whose disks we should remove
8011 @type target_node: string
8012 @param target_node: used to override the node on which to remove the disks
8014 @return: the success of the removal
8017 logging.info("Removing block devices for instance %s", instance.name)
8020 for device in instance.disks:
8022 edata = [(target_node, device)]
8024 edata = device.ComputeNodeTree(instance.primary_node)
8025 for node, disk in edata:
8026 lu.cfg.SetDiskID(disk, node)
8027 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8029 lu.LogWarning("Could not remove block device %s on node %s,"
8030 " continuing anyway: %s", device.iv_name, node, msg)
8033 # if this is a DRBD disk, return its port to the pool
8034 if device.dev_type in constants.LDS_DRBD:
8035 tcp_port = device.logical_id[2]
8036 lu.cfg.AddTcpUdpPort(tcp_port)
8038 if instance.disk_template == constants.DT_FILE:
8039 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8043 tgt = instance.primary_node
8044 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8046 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8047 file_storage_dir, instance.primary_node, result.fail_msg)
8053 def _ComputeDiskSizePerVG(disk_template, disks):
8054 """Compute disk size requirements in the volume group
8057 def _compute(disks, payload):
8058 """Universal algorithm.
8060 """
8061 vgs = {}
8062 for disk in disks:
8063 vgs[disk[constants.IDISK_VG]] = \
8064 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8066 return vgs
8068 # Required free disk space as a function of disk and swap space
8069 req_size_dict = {
8070 constants.DT_DISKLESS: {},
8071 constants.DT_PLAIN: _compute(disks, 0),
8072 # 128 MB are added for drbd metadata for each disk
8073 constants.DT_DRBD8: _compute(disks, 128),
8074 constants.DT_FILE: {},
8075 constants.DT_SHARED_FILE: {},
8076 }
8078 if disk_template not in req_size_dict:
8079 raise errors.ProgrammerError("Disk template '%s' size requirement"
8080 " is unknown" % disk_template)
8082 return req_size_dict[disk_template]
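# Illustrative result (input values are assumptions): for
# disk_template=DT_DRBD8 and two disks of 1024 and 512 MB in a volume group
# named "xenvg", the function returns {"xenvg": (1024 + 128) + (512 + 128)}
# = {"xenvg": 1792}, i.e. each disk is charged an extra 128 MB for its DRBD
# metadata volume.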
8085 def _ComputeDiskSize(disk_template, disks):
8086 """Compute disk size requirements in the volume group
8089 # Required free disk space as a function of disk and swap space
8090 req_size_dict = {
8091 constants.DT_DISKLESS: None,
8092 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8093 # 128 MB are added for drbd metadata for each disk
8094 constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks),
8095 constants.DT_FILE: None,
8096 constants.DT_SHARED_FILE: 0,
8097 constants.DT_BLOCK: 0,
8098 }
8100 if disk_template not in req_size_dict:
8101 raise errors.ProgrammerError("Disk template '%s' size requirement"
8102 " is unknown" % disk_template)
8104 return req_size_dict[disk_template]
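# Illustrative result (input values are assumptions): for the same two disks
# of 1024 and 512 MB, DT_PLAIN needs 1024 + 512 = 1536 MB while DT_DRBD8
# needs (1024 + 128) + (512 + 128) = 1792 MB; diskless and file-based
# templates need no space in the volume group.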
8107 def _FilterVmNodes(lu, nodenames):
8108 """Filters out non-vm_capable nodes from a list.
8110 @type lu: L{LogicalUnit}
8111 @param lu: the logical unit for which we check
8112 @type nodenames: list
8113 @param nodenames: the list of nodes on which we should check
8115 @return: the list of vm-capable nodes
8118 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8119 return [name for name in nodenames if name not in vm_nodes]
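# Usage sketch (node names are hypothetical): given nodenames=["node1",
# "node2"] on a cluster where only "node2" is marked non-vm_capable, the
# helper returns ["node1"], so later RPC calls skip nodes that cannot host
# instances.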
8122 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8123 """Hypervisor parameter validation.
8125 This function abstracts the hypervisor parameter validation to be
8126 used in both instance create and instance modify.
8128 @type lu: L{LogicalUnit}
8129 @param lu: the logical unit for which we check
8130 @type nodenames: list
8131 @param nodenames: the list of nodes on which we should check
8132 @type hvname: string
8133 @param hvname: the name of the hypervisor we should use
8134 @type hvparams: dict
8135 @param hvparams: the parameters which we need to check
8136 @raise errors.OpPrereqError: if the parameters are not valid
8139 nodenames = _FilterVmNodes(lu, nodenames)
8140 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
8141 hvname,
8142 hvparams)
8143 for node in nodenames:
8144 info = hvinfo[node]
8145 if info.offline:
8146 continue
8147 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8150 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8151 """OS parameters validation.
8153 @type lu: L{LogicalUnit}
8154 @param lu: the logical unit for which we check
8155 @type required: boolean
8156 @param required: whether the validation should fail if the OS is not
8157 found
8158 @type nodenames: list
8159 @param nodenames: the list of nodes on which we should check
8160 @type osname: string
8161 @param osname: the name of the OS we should use
8162 @type osparams: dict
8163 @param osparams: the parameters which we need to check
8164 @raise errors.OpPrereqError: if the parameters are not valid
8167 nodenames = _FilterVmNodes(lu, nodenames)
8168 result = lu.rpc.call_os_validate(required, nodenames, osname,
8169 [constants.OS_VALIDATE_PARAMETERS],
8170 osparams)
8171 for node, nres in result.items():
8172 # we don't check for offline cases since this should be run only
8173 # against the master node and/or an instance's nodes
8174 nres.Raise("OS Parameters validation failed on node %s" % node)
8175 if not nres.payload:
8176 lu.LogInfo("OS %s not found on node %s, validation skipped",
8177 osname, node)
8180 class LUInstanceCreate(LogicalUnit):
8181 """Create an instance.
8184 HPATH = "instance-add"
8185 HTYPE = constants.HTYPE_INSTANCE
8188 def CheckArguments(self):
8192 # do not require name_check to ease forward/backward compatibility
8194 if self.op.no_install and self.op.start:
8195 self.LogInfo("No-installation mode selected, disabling startup")
8196 self.op.start = False
8197 # validate/normalize the instance name
8198 self.op.instance_name = \
8199 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8201 if self.op.ip_check and not self.op.name_check:
8202 # TODO: make the ip check more flexible and not depend on the name check
8203 raise errors.OpPrereqError("Cannot do IP address check without a name"
8204 " check", errors.ECODE_INVAL)
8206 # check nics' parameter names
8207 for nic in self.op.nics:
8208 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8210 # check disks. parameter names and consistent adopt/no-adopt strategy
8211 has_adopt = has_no_adopt = False
8212 for disk in self.op.disks:
8213 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8214 if constants.IDISK_ADOPT in disk:
8218 if has_adopt and has_no_adopt:
8219 raise errors.OpPrereqError("Either all disks are adopted or none is",
8222 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8223 raise errors.OpPrereqError("Disk adoption is not supported for the"
8224 " '%s' disk template" %
8225 self.op.disk_template,
8227 if self.op.iallocator is not None:
8228 raise errors.OpPrereqError("Disk adoption not allowed with an"
8229 " iallocator script", errors.ECODE_INVAL)
8230 if self.op.mode == constants.INSTANCE_IMPORT:
8231 raise errors.OpPrereqError("Disk adoption not allowed for"
8232 " instance import", errors.ECODE_INVAL)
8234 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8235 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8236 " but no 'adopt' parameter given" %
8237 self.op.disk_template,
8240 self.adopt_disks = has_adopt
8242 # instance name verification
8243 if self.op.name_check:
8244 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8245 self.op.instance_name = self.hostname1.name
8246 # used in CheckPrereq for ip ping check
8247 self.check_ip = self.hostname1.ip
8249 self.check_ip = None
8251 # file storage checks
8252 if (self.op.file_driver and
8253 not self.op.file_driver in constants.FILE_DRIVER):
8254 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8255 self.op.file_driver, errors.ECODE_INVAL)
8257 if self.op.disk_template == constants.DT_FILE:
8258 opcodes.RequireFileStorage()
8259 elif self.op.disk_template == constants.DT_SHARED_FILE:
8260 opcodes.RequireSharedFileStorage()
8262 ### Node/iallocator related checks
8263 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8265 if self.op.pnode is not None:
8266 if self.op.disk_template in constants.DTS_INT_MIRROR:
8267 if self.op.snode is None:
8268 raise errors.OpPrereqError("The networked disk templates need"
8269 " a mirror node", errors.ECODE_INVAL)
8271 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8273 self.op.snode = None
8275 self._cds = _GetClusterDomainSecret()
8277 if self.op.mode == constants.INSTANCE_IMPORT:
8278 # On import force_variant must be True, because if we forced it at
8279 # initial install, our only chance when importing it back is that it
8281 self.op.force_variant = True
8283 if self.op.no_install:
8284 self.LogInfo("No-installation mode has no effect during import")
8286 elif self.op.mode == constants.INSTANCE_CREATE:
8287 if self.op.os_type is None:
8288 raise errors.OpPrereqError("No guest OS specified",
8290 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8291 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8292 " installation" % self.op.os_type,
8294 if self.op.disk_template is None:
8295 raise errors.OpPrereqError("No disk template specified",
8298 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8299 # Check handshake to ensure both clusters have the same domain secret
8300 src_handshake = self.op.source_handshake
8301 if not src_handshake:
8302 raise errors.OpPrereqError("Missing source handshake",
8305 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8308 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8311 # Load and check source CA
8312 self.source_x509_ca_pem = self.op.source_x509_ca
8313 if not self.source_x509_ca_pem:
8314 raise errors.OpPrereqError("Missing source X509 CA",
8318 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8320 except OpenSSL.crypto.Error, err:
8321 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8322 (err, ), errors.ECODE_INVAL)
8324 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8325 if errcode is not None:
8326 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8329 self.source_x509_ca = cert
8331 src_instance_name = self.op.source_instance_name
8332 if not src_instance_name:
8333 raise errors.OpPrereqError("Missing source instance name",
8336 self.source_instance_name = \
8337 netutils.GetHostname(name=src_instance_name).name
8340 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8341 self.op.mode, errors.ECODE_INVAL)
8343 def ExpandNames(self):
8344 """ExpandNames for CreateInstance.
8346 Figure out the right locks for instance creation.
8349 self.needed_locks = {}
8351 instance_name = self.op.instance_name
8352 # this is just a preventive check, but someone might still add this
8353 # instance in the meantime, and creation will fail at lock-add time
8354 if instance_name in self.cfg.GetInstanceList():
8355 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8356 instance_name, errors.ECODE_EXISTS)
8358 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8360 if self.op.iallocator:
8361 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8363 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8364 nodelist = [self.op.pnode]
8365 if self.op.snode is not None:
8366 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8367 nodelist.append(self.op.snode)
8368 self.needed_locks[locking.LEVEL_NODE] = nodelist
8370 # in case of import lock the source node too
8371 if self.op.mode == constants.INSTANCE_IMPORT:
8372 src_node = self.op.src_node
8373 src_path = self.op.src_path
8375 if src_path is None:
8376 self.op.src_path = src_path = self.op.instance_name
8378 if src_node is None:
8379 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8380 self.op.src_node = None
8381 if os.path.isabs(src_path):
8382 raise errors.OpPrereqError("Importing an instance from a path"
8383 " requires a source node option",
8386 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8387 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8388 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8389 if not os.path.isabs(src_path):
8390 self.op.src_path = src_path = \
8391 utils.PathJoin(constants.EXPORT_DIR, src_path)
8393 def _RunAllocator(self):
8394 """Run the allocator based on input opcode.
8397 nics = [n.ToDict() for n in self.nics]
8398 ial = IAllocator(self.cfg, self.rpc,
8399 mode=constants.IALLOCATOR_MODE_ALLOC,
8400 name=self.op.instance_name,
8401 disk_template=self.op.disk_template,
8404 vcpus=self.be_full[constants.BE_VCPUS],
8405 memory=self.be_full[constants.BE_MEMORY],
8408 hypervisor=self.op.hypervisor,
8411 ial.Run(self.op.iallocator)
8414 raise errors.OpPrereqError("Can't compute nodes using"
8415 " iallocator '%s': %s" %
8416 (self.op.iallocator, ial.info),
8418 if len(ial.result) != ial.required_nodes:
8419 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8420 " of nodes (%s), required %s" %
8421 (self.op.iallocator, len(ial.result),
8422 ial.required_nodes), errors.ECODE_FAULT)
8423 self.op.pnode = ial.result[0]
8424 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8425 self.op.instance_name, self.op.iallocator,
8426 utils.CommaJoin(ial.result))
8427 if ial.required_nodes == 2:
8428 self.op.snode = ial.result[1]
8430 def BuildHooksEnv(self):
8433 This runs on master, primary and secondary nodes of the instance.
8437 "ADD_MODE": self.op.mode,
8439 if self.op.mode == constants.INSTANCE_IMPORT:
8440 env["SRC_NODE"] = self.op.src_node
8441 env["SRC_PATH"] = self.op.src_path
8442 env["SRC_IMAGES"] = self.src_images
8444 env.update(_BuildInstanceHookEnv(
8445 name=self.op.instance_name,
8446 primary_node=self.op.pnode,
8447 secondary_nodes=self.secondaries,
8448 status=self.op.start,
8449 os_type=self.op.os_type,
8450 memory=self.be_full[constants.BE_MEMORY],
8451 vcpus=self.be_full[constants.BE_VCPUS],
8452 nics=_NICListToTuple(self, self.nics),
8453 disk_template=self.op.disk_template,
8454 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8455 for d in self.disks],
8458 hypervisor_name=self.op.hypervisor,
8464 def BuildHooksNodes(self):
8465 """Build hooks nodes.
8468 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8471 def _ReadExportInfo(self):
8472 """Reads the export information from disk.
8474 It will override the opcode source node and path with the actual
8475 information, if these two were not specified before.
8477 @return: the export information
8480 assert self.op.mode == constants.INSTANCE_IMPORT
8482 src_node = self.op.src_node
8483 src_path = self.op.src_path
8485 if src_node is None:
8486 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8487 exp_list = self.rpc.call_export_list(locked_nodes)
8489 for node in exp_list:
8490 if exp_list[node].fail_msg:
8492 if src_path in exp_list[node].payload:
8494 self.op.src_node = src_node = node
8495 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8499 raise errors.OpPrereqError("No export found for relative path %s" %
8500 src_path, errors.ECODE_INVAL)
8502 _CheckNodeOnline(self, src_node)
8503 result = self.rpc.call_export_info(src_node, src_path)
8504 result.Raise("No export or invalid export found in dir %s" % src_path)
8506 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8507 if not export_info.has_section(constants.INISECT_EXP):
8508 raise errors.ProgrammerError("Corrupted export config",
8509 errors.ECODE_ENVIRON)
8511 ei_version = export_info.get(constants.INISECT_EXP, "version")
8512 if (int(ei_version) != constants.EXPORT_VERSION):
8513 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8514 (ei_version, constants.EXPORT_VERSION),
8515 errors.ECODE_ENVIRON)
8518 def _ReadExportParams(self, einfo):
8519 """Use export parameters as defaults.
8521 In case the opcode doesn't specify (as in override) some instance
8522 parameters, then try to use them from the export information, if
8523 that declares them.
8526 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8528 if self.op.disk_template is None:
8529 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8530 self.op.disk_template = einfo.get(constants.INISECT_INS,
8533 raise errors.OpPrereqError("No disk template specified and the export"
8534 " is missing the disk_template information",
8537 if not self.op.disks:
8538 if einfo.has_option(constants.INISECT_INS, "disk_count"):
8540 # TODO: import the disk iv_name too
8541 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
8542 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8543 disks.append({constants.IDISK_SIZE: disk_sz})
8544 self.op.disks = disks
8546 raise errors.OpPrereqError("No disk info specified and the export"
8547 " is missing the disk information",
8550 if (not self.op.nics and
8551 einfo.has_option(constants.INISECT_INS, "nic_count")):
8553 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
8555 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8556 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8561 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8562 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8564 if (self.op.hypervisor is None and
8565 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8566 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8568 if einfo.has_section(constants.INISECT_HYP):
8569 # use the export parameters but do not override the ones
8570 # specified by the user
8571 for name, value in einfo.items(constants.INISECT_HYP):
8572 if name not in self.op.hvparams:
8573 self.op.hvparams[name] = value
8575 if einfo.has_section(constants.INISECT_BEP):
8576 # use the parameters, without overriding
8577 for name, value in einfo.items(constants.INISECT_BEP):
8578 if name not in self.op.beparams:
8579 self.op.beparams[name] = value
8581 # try to read the parameters old style, from the main section
8582 for name in constants.BES_PARAMETERS:
8583 if (name not in self.op.beparams and
8584 einfo.has_option(constants.INISECT_INS, name)):
8585 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8587 if einfo.has_section(constants.INISECT_OSP):
8588 # use the parameters, without overriding
8589 for name, value in einfo.items(constants.INISECT_OSP):
8590 if name not in self.op.osparams:
8591 self.op.osparams[name] = value
8593 def _RevertToDefaults(self, cluster):
8594 """Revert the instance parameters to the default values.
8598 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8599 for name in self.op.hvparams.keys():
8600 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8601 del self.op.hvparams[name]
8603 be_defs = cluster.SimpleFillBE({})
8604 for name in self.op.beparams.keys():
8605 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8606 del self.op.beparams[name]
8608 nic_defs = cluster.SimpleFillNIC({})
8609 for nic in self.op.nics:
8610 for name in constants.NICS_PARAMETERS:
8611 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8614 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8615 for name in self.op.osparams.keys():
8616 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8617 del self.op.osparams[name]
8619 def _CalculateFileStorageDir(self):
8620 """Calculate final instance file storage dir.
8623 # file storage dir calculation/check
8624 self.instance_file_storage_dir = None
8625 if self.op.disk_template in constants.DTS_FILEBASED:
8626 # build the full file storage dir path
8629 if self.op.disk_template == constants.DT_SHARED_FILE:
8630 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8632 get_fsd_fn = self.cfg.GetFileStorageDir
8634 cfg_storagedir = get_fsd_fn()
8635 if not cfg_storagedir:
8636 raise errors.OpPrereqError("Cluster file storage dir not defined")
8637 joinargs.append(cfg_storagedir)
8639 if self.op.file_storage_dir is not None:
8640 joinargs.append(self.op.file_storage_dir)
8642 joinargs.append(self.op.instance_name)
8644 # pylint: disable=W0142
8645 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
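# Illustrative sketch (paths are assumptions): with a cluster-wide file
# storage dir of "/srv/ganeti/file-storage", an opcode-level
# file_storage_dir of "mysubdir" and instance name "inst1.example.com",
# the computed directory would be
# "/srv/ganeti/file-storage/mysubdir/inst1.example.com".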
8647 def CheckPrereq(self):
8648 """Check prerequisites.
8651 self._CalculateFileStorageDir()
8653 if self.op.mode == constants.INSTANCE_IMPORT:
8654 export_info = self._ReadExportInfo()
8655 self._ReadExportParams(export_info)
8657 if (not self.cfg.GetVGName() and
8658 self.op.disk_template not in constants.DTS_NOT_LVM):
8659 raise errors.OpPrereqError("Cluster does not support lvm-based"
8660 " instances", errors.ECODE_STATE)
8662 if self.op.hypervisor is None:
8663 self.op.hypervisor = self.cfg.GetHypervisorType()
8665 cluster = self.cfg.GetClusterInfo()
8666 enabled_hvs = cluster.enabled_hypervisors
8667 if self.op.hypervisor not in enabled_hvs:
8668 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8669 " cluster (%s)" % (self.op.hypervisor,
8670 ",".join(enabled_hvs)),
8673 # Check tag validity
8674 for tag in self.op.tags:
8675 objects.TaggableObject.ValidateTag(tag)
8677 # check hypervisor parameter syntax (locally)
8678 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8679 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8681 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8682 hv_type.CheckParameterSyntax(filled_hvp)
8683 self.hv_full = filled_hvp
8684 # check that we don't specify global parameters on an instance
8685 _CheckGlobalHvParams(self.op.hvparams)
8687 # fill and remember the beparams dict
8688 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8689 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8691 # build os parameters
8692 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8694 # now that hvp/bep are in final format, let's reset to defaults,
8696 if self.op.identify_defaults:
8697 self._RevertToDefaults(cluster)
8701 for idx, nic in enumerate(self.op.nics):
8702 nic_mode_req = nic.get(constants.INIC_MODE, None)
8703 nic_mode = nic_mode_req
8704 if nic_mode is None:
8705 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8707 # in routed mode, for the first nic, the default ip is 'auto'
8708 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8709 default_ip_mode = constants.VALUE_AUTO
8711 default_ip_mode = constants.VALUE_NONE
8713 # ip validity checks
8714 ip = nic.get(constants.INIC_IP, default_ip_mode)
8715 if ip is None or ip.lower() == constants.VALUE_NONE:
8717 elif ip.lower() == constants.VALUE_AUTO:
8718 if not self.op.name_check:
8719 raise errors.OpPrereqError("IP address set to auto but name checks"
8720 " have been skipped",
8722 nic_ip = self.hostname1.ip
8724 if not netutils.IPAddress.IsValid(ip):
8725 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8729 # TODO: check the ip address for uniqueness
8730 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8731 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8734 # MAC address verification
8735 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8736 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8737 mac = utils.NormalizeAndValidateMac(mac)
8740 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8741 except errors.ReservationError:
8742 raise errors.OpPrereqError("MAC address %s already in use"
8743 " in cluster" % mac,
8744 errors.ECODE_NOTUNIQUE)
8746 # Build nic parameters
8747 link = nic.get(constants.INIC_LINK, None)
8750 nicparams[constants.NIC_MODE] = nic_mode_req
8752 nicparams[constants.NIC_LINK] = link
8754 check_params = cluster.SimpleFillNIC(nicparams)
8755 objects.NIC.CheckParameterSyntax(check_params)
8756 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8758 # disk checks/pre-build
8759 default_vg = self.cfg.GetVGName()
8761 for disk in self.op.disks:
8762 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8763 if mode not in constants.DISK_ACCESS_SET:
8764 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8765 mode, errors.ECODE_INVAL)
8766 size = disk.get(constants.IDISK_SIZE, None)
8767 if size is None:
8768 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8769 try:
8770 size = int(size)
8771 except (TypeError, ValueError):
8772 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8775 data_vg = disk.get(constants.IDISK_VG, default_vg)
8777 constants.IDISK_SIZE: size,
8778 constants.IDISK_MODE: mode,
8779 constants.IDISK_VG: data_vg,
8780 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8782 if constants.IDISK_ADOPT in disk:
8783 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8784 self.disks.append(new_disk)
8786 if self.op.mode == constants.INSTANCE_IMPORT:
8788 # Check that the new instance doesn't have less disks than the export
8789 instance_disks = len(self.disks)
8790 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
8791 if instance_disks < export_disks:
8792 raise errors.OpPrereqError("Not enough disks to import."
8793 " (instance: %d, export: %d)" %
8794 (instance_disks, export_disks),
8798 for idx in range(export_disks):
8799 option = "disk%d_dump" % idx
8800 if export_info.has_option(constants.INISECT_INS, option):
8801 # FIXME: are the old os-es, disk sizes, etc. useful?
8802 export_name = export_info.get(constants.INISECT_INS, option)
8803 image = utils.PathJoin(self.op.src_path, export_name)
8804 disk_images.append(image)
8806 disk_images.append(False)
8808 self.src_images = disk_images
8810 old_name = export_info.get(constants.INISECT_INS, "name")
8812 exp_nic_count = export_info.getint(constants.INISECT_INS, "nic_count")
8813 except (TypeError, ValueError), err:
8814 raise errors.OpPrereqError("Invalid export file, nic_count is not"
8815 " an integer: %s" % str(err),
8817 if self.op.instance_name == old_name:
8818 for idx, nic in enumerate(self.nics):
8819 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
8820 nic_mac_ini = "nic%d_mac" % idx
8821 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8823 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8825 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8826 if self.op.ip_check:
8827 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8828 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8829 (self.check_ip, self.op.instance_name),
8830 errors.ECODE_NOTUNIQUE)
8832 #### mac address generation
8833 # By generating here the mac address both the allocator and the hooks get
8834 # the real final mac address rather than the 'auto' or 'generate' value.
8835 # There is a race condition between the generation and the instance object
8836 # creation, which means that we know the mac is valid now, but we're not
8837 # sure it will be when we actually add the instance. If things go bad
8838 # adding the instance will abort because of a duplicate mac, and the
8839 # creation job will fail.
8840 for nic in self.nics:
8841 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8842 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8846 if self.op.iallocator is not None:
8847 self._RunAllocator()
8849 # Release all unneeded node locks
8850 _ReleaseLocks(self, locking.LEVEL_NODE,
8851 keep=filter(None, [self.op.pnode, self.op.snode,
8854 #### node related checks
8856 # check primary node
8857 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8858 assert self.pnode is not None, \
8859 "Cannot retrieve locked node %s" % self.op.pnode
8861 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8862 pnode.name, errors.ECODE_STATE)
8864 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8865 pnode.name, errors.ECODE_STATE)
8866 if not pnode.vm_capable:
8867 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8868 " '%s'" % pnode.name, errors.ECODE_STATE)
8870 self.secondaries = []
8872 # mirror node verification
8873 if self.op.disk_template in constants.DTS_INT_MIRROR:
8874 if self.op.snode == pnode.name:
8875 raise errors.OpPrereqError("The secondary node cannot be the"
8876 " primary node", errors.ECODE_INVAL)
8877 _CheckNodeOnline(self, self.op.snode)
8878 _CheckNodeNotDrained(self, self.op.snode)
8879 _CheckNodeVmCapable(self, self.op.snode)
8880 self.secondaries.append(self.op.snode)
8882 nodenames = [pnode.name] + self.secondaries
8884 if not self.adopt_disks:
8885 # Check lv size requirements, if not adopting
8886 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8887 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8889 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8890 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8891 disk[constants.IDISK_ADOPT])
8892 for disk in self.disks])
8893 if len(all_lvs) != len(self.disks):
8894 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8896 for lv_name in all_lvs:
8898 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
8899 # to ReserveLV uses the same syntax
8900 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8901 except errors.ReservationError:
8902 raise errors.OpPrereqError("LV named %s used by another instance" %
8903 lv_name, errors.ECODE_NOTUNIQUE)
8905 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8906 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8908 node_lvs = self.rpc.call_lv_list([pnode.name],
8909 vg_names.payload.keys())[pnode.name]
8910 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
8911 node_lvs = node_lvs.payload
8913 delta = all_lvs.difference(node_lvs.keys())
8914 if delta:
8915 raise errors.OpPrereqError("Missing logical volume(s): %s" %
8916 utils.CommaJoin(delta),
8918 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
8919 if online_lvs:
8920 raise errors.OpPrereqError("Online logical volumes found, cannot"
8921 " adopt: %s" % utils.CommaJoin(online_lvs),
8923 # update the size of disk based on what is found
8924 for dsk in self.disks:
8925 dsk[constants.IDISK_SIZE] = \
8926 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
8927 dsk[constants.IDISK_ADOPT])][0]))
8929 elif self.op.disk_template == constants.DT_BLOCK:
8930 # Normalize and de-duplicate device paths
8931 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
8932 for disk in self.disks])
8933 if len(all_disks) != len(self.disks):
8934 raise errors.OpPrereqError("Duplicate disk names given for adoption",
8936 baddisks = [d for d in all_disks
8937 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
8938 if baddisks:
8939 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
8940 " cannot be adopted" %
8941 (", ".join(baddisks),
8942 constants.ADOPTABLE_BLOCKDEV_ROOT),
8945 node_disks = self.rpc.call_bdev_sizes([pnode.name],
8946 list(all_disks))[pnode.name]
8947 node_disks.Raise("Cannot get block device information from node %s" %
8949 node_disks = node_disks.payload
8950 delta = all_disks.difference(node_disks.keys())
8951 if delta:
8952 raise errors.OpPrereqError("Missing block device(s): %s" %
8953 utils.CommaJoin(delta),
8955 for dsk in self.disks:
8956 dsk[constants.IDISK_SIZE] = \
8957 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
8959 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
8961 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
8962 # check OS parameters (remotely)
8963 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
8965 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
8967 # memory check on primary node
8969 _CheckNodeFreeMemory(self, self.pnode.name,
8970 "creating instance %s" % self.op.instance_name,
8971 self.be_full[constants.BE_MEMORY],
8974 self.dry_run_result = list(nodenames)
8976 def Exec(self, feedback_fn):
8977 """Create and add the instance to the cluster.
8980 instance = self.op.instance_name
8981 pnode_name = self.pnode.name
8983 ht_kind = self.op.hypervisor
8984 if ht_kind in constants.HTS_REQ_PORT:
8985 network_port = self.cfg.AllocatePort()
8989 disks = _GenerateDiskTemplate(self,
8990 self.op.disk_template,
8991 instance, pnode_name,
8994 self.instance_file_storage_dir,
8995 self.op.file_driver,
8999 iobj = objects.Instance(name=instance, os=self.op.os_type,
9000 primary_node=pnode_name,
9001 nics=self.nics, disks=disks,
9002 disk_template=self.op.disk_template,
9004 network_port=network_port,
9005 beparams=self.op.beparams,
9006 hvparams=self.op.hvparams,
9007 hypervisor=self.op.hypervisor,
9008 osparams=self.op.osparams,
9012 for tag in self.op.tags:
9015 if self.adopt_disks:
9016 if self.op.disk_template == constants.DT_PLAIN:
9017 # rename LVs to the newly-generated names; we need to construct
9018 # 'fake' LV disks with the old data, plus the new unique_id
9019 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9021 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9022 rename_to.append(t_dsk.logical_id)
9023 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9024 self.cfg.SetDiskID(t_dsk, pnode_name)
9025 result = self.rpc.call_blockdev_rename(pnode_name,
9026 zip(tmp_disks, rename_to))
9027 result.Raise("Failed to rename adopted LVs")
9029 feedback_fn("* creating instance disks...")
9031 _CreateDisks(self, iobj)
9032 except errors.OpExecError:
9033 self.LogWarning("Device creation failed, reverting...")
9035 _RemoveDisks(self, iobj)
9037 self.cfg.ReleaseDRBDMinors(instance)
9040 feedback_fn("adding instance %s to cluster config" % instance)
9042 self.cfg.AddInstance(iobj, self.proc.GetECId())
9044 # Declare that we don't want to remove the instance lock anymore, as we've
9045 # added the instance to the config
9046 del self.remove_locks[locking.LEVEL_INSTANCE]
9048 if self.op.mode == constants.INSTANCE_IMPORT:
9049 # Release unused nodes
9050 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9053 _ReleaseLocks(self, locking.LEVEL_NODE)
9056 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9057 feedback_fn("* wiping instance disks...")
9059 _WipeDisks(self, iobj)
9060 except errors.OpExecError, err:
9061 logging.exception("Wiping disks failed")
9062 self.LogWarning("Wiping instance disks failed (%s)", err)
9066 # Something is already wrong with the disks, don't do anything else
9068 elif self.op.wait_for_sync:
9069 disk_abort = not _WaitForSync(self, iobj)
9070 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9071 # make sure the disks are not degraded (still sync-ing is ok)
9072 feedback_fn("* checking mirrors status")
9073 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9078 _RemoveDisks(self, iobj)
9079 self.cfg.RemoveInstance(iobj.name)
9080 # Make sure the instance lock gets removed
9081 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9082 raise errors.OpExecError("There are some degraded disks for"
9085 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9086 if self.op.mode == constants.INSTANCE_CREATE:
9087 if not self.op.no_install:
9088 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9089 not self.op.wait_for_sync)
9091 feedback_fn("* pausing disk sync to install instance OS")
9092 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9094 for idx, success in enumerate(result.payload):
9096 logging.warn("pause-sync of instance %s for disk %d failed",
9099 feedback_fn("* running the instance OS create scripts...")
9100 # FIXME: pass debug option from opcode to backend
9102 self.rpc.call_instance_os_add(pnode_name, iobj, False,
9103 self.op.debug_level)
9105 feedback_fn("* resuming disk sync")
9106 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9108 for idx, success in enumerate(result.payload):
9110 logging.warn("resume-sync of instance %s for disk %d failed",
9113 os_add_result.Raise("Could not add os for instance %s"
9114 " on node %s" % (instance, pnode_name))
9116 elif self.op.mode == constants.INSTANCE_IMPORT:
9117 feedback_fn("* running the instance OS import scripts...")
9121 for idx, image in enumerate(self.src_images):
9125 # FIXME: pass debug option from opcode to backend
9126 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9127 constants.IEIO_FILE, (image, ),
9128 constants.IEIO_SCRIPT,
9129 (iobj.disks[idx], idx),
9131 transfers.append(dt)
9134 masterd.instance.TransferInstanceData(self, feedback_fn,
9135 self.op.src_node, pnode_name,
9136 self.pnode.secondary_ip,
9138 if not compat.all(import_result):
9139 self.LogWarning("Some disks for instance %s on node %s were not"
9140 " imported successfully" % (instance, pnode_name))
9142 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9143 feedback_fn("* preparing remote import...")
9144 # The source cluster will stop the instance before attempting to make a
9145 # connection. In some cases stopping an instance can take a long time,
9146 # hence the shutdown timeout is added to the connection timeout.
9147 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9148 self.op.source_shutdown_timeout)
9149 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9151 assert iobj.primary_node == self.pnode.name
9153 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9154 self.source_x509_ca,
9155 self._cds, timeouts)
9156 if not compat.all(disk_results):
9157 # TODO: Should the instance still be started, even if some disks
9158 # failed to import (valid for local imports, too)?
9159 self.LogWarning("Some disks for instance %s on node %s were not"
9160 " imported successfully" % (instance, pnode_name))
9162 # Run rename script on newly imported instance
9163 assert iobj.name == instance
9164 feedback_fn("Running rename script for %s" % instance)
9165 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9166 self.source_instance_name,
9167 self.op.debug_level)
9169 self.LogWarning("Failed to run rename script for %s on node"
9170 " %s: %s" % (instance, pnode_name, result.fail_msg))
9173 # also checked in the prereq part
9174 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9178 iobj.admin_up = True
9179 self.cfg.Update(iobj, feedback_fn)
9180 logging.info("Starting instance %s on node %s", instance, pnode_name)
9181 feedback_fn("* starting instance...")
9182 result = self.rpc.call_instance_start(pnode_name, iobj,
9184 result.Raise("Could not start instance")
9186 return list(iobj.all_nodes)
9189 class LUInstanceConsole(NoHooksLU):
9190 """Connect to an instance's console.
9192 This is somewhat special in that it returns the command line that
9193 you need to run on the master node in order to connect to the
9194 console.
9199 def ExpandNames(self):
9200 self._ExpandAndLockInstance()
9202 def CheckPrereq(self):
9203 """Check prerequisites.
9205 This checks that the instance is in the cluster.
9208 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9209 assert self.instance is not None, \
9210 "Cannot retrieve locked instance %s" % self.op.instance_name
9211 _CheckNodeOnline(self, self.instance.primary_node)
9213 def Exec(self, feedback_fn):
9214 """Connect to the console of an instance
9217 instance = self.instance
9218 node = instance.primary_node
9220 node_insts = self.rpc.call_instance_list([node],
9221 [instance.hypervisor])[node]
9222 node_insts.Raise("Can't get node information from %s" % node)
9224 if instance.name not in node_insts.payload:
9225 if instance.admin_up:
9226 state = constants.INSTST_ERRORDOWN
9228 state = constants.INSTST_ADMINDOWN
9229 raise errors.OpExecError("Instance %s is not running (state %s)" %
9230 (instance.name, state))
9232 logging.debug("Connecting to console of %s on %s", instance.name, node)
9234 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9237 def _GetInstanceConsole(cluster, instance):
9238 """Returns console information for an instance.
9240 @type cluster: L{objects.Cluster}
9241 @type instance: L{objects.Instance}
9245 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9246 # beparams and hvparams are passed separately, to avoid editing the
9247 # instance and then saving the defaults in the instance itself.
9248 hvparams = cluster.FillHV(instance)
9249 beparams = cluster.FillBE(instance)
9250 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9252 assert console.instance == instance.name
9253 assert console.Validate()
9255 return console.ToDict()
9258 class LUInstanceReplaceDisks(LogicalUnit):
9259 """Replace the disks of an instance.
9262 HPATH = "mirrors-replace"
9263 HTYPE = constants.HTYPE_INSTANCE
9266 def CheckArguments(self):
9267 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9270 def ExpandNames(self):
9271 self._ExpandAndLockInstance()
9273 assert locking.LEVEL_NODE not in self.needed_locks
9274 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9276 assert self.op.iallocator is None or self.op.remote_node is None, \
9277 "Conflicting options"
9279 if self.op.remote_node is not None:
9280 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9282 # Warning: do not remove the locking of the new secondary here
9283 # unless DRBD8.AddChildren is changed to work in parallel;
9284 # currently it doesn't since parallel invocations of
9285 # FindUnusedMinor will conflict
9286 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9287 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9289 self.needed_locks[locking.LEVEL_NODE] = []
9290 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9292 if self.op.iallocator is not None:
9293 # iallocator will select a new node in the same group
9294 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9296 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9297 self.op.iallocator, self.op.remote_node,
9298 self.op.disks, False, self.op.early_release)
9300 self.tasklets = [self.replacer]
9302 def DeclareLocks(self, level):
9303 if level == locking.LEVEL_NODEGROUP:
9304 assert self.op.remote_node is None
9305 assert self.op.iallocator is not None
9306 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9308 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9309 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9310 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9312 elif level == locking.LEVEL_NODE:
9313 if self.op.iallocator is not None:
9314 assert self.op.remote_node is None
9315 assert not self.needed_locks[locking.LEVEL_NODE]
9317 # Lock member nodes of all locked groups
9318 self.needed_locks[locking.LEVEL_NODE] = [node_name
9319 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9320 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9322 self._LockInstancesNodes()
9324 def BuildHooksEnv(self):
9327 This runs on the master, the primary and all the secondaries.
9330 instance = self.replacer.instance
9332 "MODE": self.op.mode,
9333 "NEW_SECONDARY": self.op.remote_node,
9334 "OLD_SECONDARY": instance.secondary_nodes[0],
9336 env.update(_BuildInstanceHookEnvByObject(self, instance))
9339 def BuildHooksNodes(self):
9340 """Build hooks nodes.
9343 instance = self.replacer.instance
9345 self.cfg.GetMasterNode(),
9346 instance.primary_node,
9348 if self.op.remote_node is not None:
9349 nl.append(self.op.remote_node)
9352 def CheckPrereq(self):
9353 """Check prerequisites.
9356 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9357 self.op.iallocator is None)
9359 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9361 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9363 return LogicalUnit.CheckPrereq(self)
9366 class TLReplaceDisks(Tasklet):
9367 """Replaces disks for an instance.
9369 Note: Locking is not within the scope of this class.
9372 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9373 disks, delay_iallocator, early_release):
9374 """Initializes this class.
9377 Tasklet.__init__(self, lu)
9380 self.instance_name = instance_name
9382 self.iallocator_name = iallocator_name
9383 self.remote_node = remote_node
9385 self.delay_iallocator = delay_iallocator
9386 self.early_release = early_release
9389 self.instance = None
9390 self.new_node = None
9391 self.target_node = None
9392 self.other_node = None
9393 self.remote_node_info = None
9394 self.node_secondary_ip = None
9397 def CheckArguments(mode, remote_node, iallocator):
9398 """Helper function for users of this class.
9401 # check for valid parameter combination
9402 if mode == constants.REPLACE_DISK_CHG:
9403 if remote_node is None and iallocator is None:
9404 raise errors.OpPrereqError("When changing the secondary either an"
9405 " iallocator script must be used or the"
9406 " new node given", errors.ECODE_INVAL)
9408 if remote_node is not None and iallocator is not None:
9409 raise errors.OpPrereqError("Give either the iallocator or the new"
9410 " secondary, not both", errors.ECODE_INVAL)
9412 elif remote_node is not None or iallocator is not None:
9413 # Not replacing the secondary
9414 raise errors.OpPrereqError("The iallocator and new node options can"
9415 " only be used when changing the"
9416 " secondary node", errors.ECODE_INVAL)
9419 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9420 """Compute a new secondary node using an IAllocator.
9423 ial = IAllocator(lu.cfg, lu.rpc,
9424 mode=constants.IALLOCATOR_MODE_RELOC,
9426 relocate_from=list(relocate_from))
9428 ial.Run(iallocator_name)
9431 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9432 " %s" % (iallocator_name, ial.info),
9435 if len(ial.result) != ial.required_nodes:
9436 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9437 " of nodes (%s), required %s" %
9439 len(ial.result), ial.required_nodes),
9442 remote_node_name = ial.result[0]
9444 lu.LogInfo("Selected new secondary for instance '%s': %s",
9445 instance_name, remote_node_name)
9447 return remote_node_name
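# Hypothetical use of the helper above, as done later in _CheckPrereq2: the
# instance's current secondaries are passed as the relocation source and a
# single replacement node name comes back ("hail" is just an example
# iallocator name).
#
#   new_secondary = self._RunAllocator(self.lu, "hail", instance.name,
#                                      instance.secondary_nodes)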
9449 def _FindFaultyDisks(self, node_name):
9450 """Wrapper for L{_FindFaultyInstanceDisks}.
9453 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9456 def _CheckDisksActivated(self, instance):
9457 """Checks if the instance disks are activated.
9459 @param instance: The instance to check disks
9460 @return: True if they are activated, False otherwise
9463 nodes = instance.all_nodes
9465 for idx, dev in enumerate(instance.disks):
9467 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9468 self.cfg.SetDiskID(dev, node)
9470 result = self.rpc.call_blockdev_find(node, dev)
9474 elif result.fail_msg or not result.payload:
9479 def CheckPrereq(self):
9480 """Check prerequisites.
9482 This checks that the instance is in the cluster.
9485 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9486 assert instance is not None, \
9487 "Cannot retrieve locked instance %s" % self.instance_name
9489 if instance.disk_template != constants.DT_DRBD8:
9490 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9491 " instances", errors.ECODE_INVAL)
9493 if len(instance.secondary_nodes) != 1:
9494 raise errors.OpPrereqError("The instance has a strange layout,"
9495 " expected one secondary but found %d" %
9496 len(instance.secondary_nodes),
9499 if not self.delay_iallocator:
9500 self._CheckPrereq2()
9502 def _CheckPrereq2(self):
9503 """Check prerequisites, second part.
9505 This function should always be part of CheckPrereq. It was separated and is
9506 now called from Exec because during node evacuation iallocator was only
9507 called with an unmodified cluster model, not taking planned changes into
9508 account.
9511 instance = self.instance
9512 secondary_node = instance.secondary_nodes[0]
9514 if self.iallocator_name is None:
9515 remote_node = self.remote_node
9517 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9518 instance.name, instance.secondary_nodes)
9520 if remote_node is None:
9521 self.remote_node_info = None
9523 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9524 "Remote node '%s' is not locked" % remote_node
9526 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9527 assert self.remote_node_info is not None, \
9528 "Cannot retrieve locked node %s" % remote_node
9530 if remote_node == self.instance.primary_node:
9531 raise errors.OpPrereqError("The specified node is the primary node of"
9532 " the instance", errors.ECODE_INVAL)
9534 if remote_node == secondary_node:
9535 raise errors.OpPrereqError("The specified node is already the"
9536 " secondary node of the instance",
9539 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9540 constants.REPLACE_DISK_CHG):
9541 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9544 if self.mode == constants.REPLACE_DISK_AUTO:
9545 if not self._CheckDisksActivated(instance):
9546 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9547 " first" % self.instance_name,
9549 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9550 faulty_secondary = self._FindFaultyDisks(secondary_node)
9552 if faulty_primary and faulty_secondary:
9553 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9554 " one node and can not be repaired"
9555 " automatically" % self.instance_name,
9559 self.disks = faulty_primary
9560 self.target_node = instance.primary_node
9561 self.other_node = secondary_node
9562 check_nodes = [self.target_node, self.other_node]
9563 elif faulty_secondary:
9564 self.disks = faulty_secondary
9565 self.target_node = secondary_node
9566 self.other_node = instance.primary_node
9567 check_nodes = [self.target_node, self.other_node]
9573 # Non-automatic modes
9574 if self.mode == constants.REPLACE_DISK_PRI:
9575 self.target_node = instance.primary_node
9576 self.other_node = secondary_node
9577 check_nodes = [self.target_node, self.other_node]
9579 elif self.mode == constants.REPLACE_DISK_SEC:
9580 self.target_node = secondary_node
9581 self.other_node = instance.primary_node
9582 check_nodes = [self.target_node, self.other_node]
9584 elif self.mode == constants.REPLACE_DISK_CHG:
9585 self.new_node = remote_node
9586 self.other_node = instance.primary_node
9587 self.target_node = secondary_node
9588 check_nodes = [self.new_node, self.other_node]
9590 _CheckNodeNotDrained(self.lu, remote_node)
9591 _CheckNodeVmCapable(self.lu, remote_node)
9593 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9594 assert old_node_info is not None
9595 if old_node_info.offline and not self.early_release:
9596 # doesn't make sense to delay the release
9597 self.early_release = True
9598 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9599 " early-release mode", secondary_node)
9602 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9605 # If not specified all disks should be replaced
9607 self.disks = range(len(self.instance.disks))
9609 for node in check_nodes:
9610 _CheckNodeOnline(self.lu, node)
9612 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9615 if node_name is not None)
9617 # Release unneeded node locks
9618 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9620 # Release any owned node group
9621 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9622 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9624 # Check whether disks are valid
9625 for disk_idx in self.disks:
9626 instance.FindDisk(disk_idx)
9628 # Get secondary node IP addresses
9629 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9630 in self.cfg.GetMultiNodeInfo(touched_nodes))
9632 def Exec(self, feedback_fn):
9633 """Execute disk replacement.
9635 This dispatches the disk replacement to the appropriate handler.
9638 if self.delay_iallocator:
9639 self._CheckPrereq2()
9642 # Verify owned locks before starting operation
9643 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9644 assert set(owned_nodes) == set(self.node_secondary_ip), \
9645 ("Incorrect node locks, owning %s, expected %s" %
9646 (owned_nodes, self.node_secondary_ip.keys()))
9648 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9649 assert list(owned_instances) == [self.instance_name], \
9650 "Instance '%s' not locked" % self.instance_name
9652 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9653 "Should not own any node group lock at this point"
9656 feedback_fn("No disks need replacement")
9659 feedback_fn("Replacing disk(s) %s for %s" %
9660 (utils.CommaJoin(self.disks), self.instance.name))
9662 activate_disks = (not self.instance.admin_up)
9664 # Activate the instance disks if we're replacing them on a down instance
9666 _StartInstanceDisks(self.lu, self.instance, True)
9669 # Should we replace the secondary node?
9670 if self.new_node is not None:
9671 fn = self._ExecDrbd8Secondary
9673 fn = self._ExecDrbd8DiskOnly
9675 result = fn(feedback_fn)
9677 # Deactivate the instance disks if we're replacing them on a
9678 # down instance
9680 _SafeShutdownInstanceDisks(self.lu, self.instance)
9683 # Verify owned locks
9684 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9685 nodes = frozenset(self.node_secondary_ip)
9686 assert ((self.early_release and not owned_nodes) or
9687 (not self.early_release and not (set(owned_nodes) - nodes))), \
9688 ("Not owning the correct locks, early_release=%s, owned=%r,"
9689 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9693 def _CheckVolumeGroup(self, nodes):
9694 self.lu.LogInfo("Checking volume groups")
9696 vgname = self.cfg.GetVGName()
9698 # Make sure volume group exists on all involved nodes
9699 results = self.rpc.call_vg_list(nodes)
9701 raise errors.OpExecError("Can't list volume groups on the nodes")
9705 res.Raise("Error checking node %s" % node)
9706 if vgname not in res.payload:
9707 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9710 def _CheckDisksExistence(self, nodes):
9711 # Check disk existence
9712 for idx, dev in enumerate(self.instance.disks):
9713 if idx not in self.disks:
9717 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9718 self.cfg.SetDiskID(dev, node)
9720 result = self.rpc.call_blockdev_find(node, dev)
9722 msg = result.fail_msg
9723 if msg or not result.payload:
9725 msg = "disk not found"
9726 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9729 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9730 for idx, dev in enumerate(self.instance.disks):
9731 if idx not in self.disks:
9734 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9737 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9739 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9740 " replace disks for instance %s" %
9741 (node_name, self.instance.name))
9743 def _CreateNewStorage(self, node_name):
9744 """Create new storage on the primary or secondary node.
9746 This is only used for same-node replaces, not for changing the
9747 secondary node, hence we don't want to modify the existing disk.
9752 for idx, dev in enumerate(self.instance.disks):
9753 if idx not in self.disks:
9756 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9758 self.cfg.SetDiskID(dev, node_name)
9760 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9761 names = _GenerateUniqueNames(self.lu, lv_names)
9763 vg_data = dev.children[0].logical_id[0]
9764 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9765 logical_id=(vg_data, names[0]))
9766 vg_meta = dev.children[1].logical_id[0]
9767 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
9768 logical_id=(vg_meta, names[1]))
9770 new_lvs = [lv_data, lv_meta]
9771 old_lvs = [child.Copy() for child in dev.children]
9772 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9774 # we pass force_create=True to force the LVM creation
9775 for new_lv in new_lvs:
9776 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9777 _GetInstanceInfoText(self.instance), False)
9781 def _CheckDevices(self, node_name, iv_names):
9782 for name, (dev, _, _) in iv_names.iteritems():
9783 self.cfg.SetDiskID(dev, node_name)
9785 result = self.rpc.call_blockdev_find(node_name, dev)
9787 msg = result.fail_msg
9788 if msg or not result.payload:
9790 msg = "disk not found"
9791 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9794 if result.payload.is_degraded:
9795 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9797 def _RemoveOldStorage(self, node_name, iv_names):
9798 for name, (_, old_lvs, _) in iv_names.iteritems():
9799 self.lu.LogInfo("Remove logical volumes for %s" % name)
9802 self.cfg.SetDiskID(lv, node_name)
9804 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9806 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9807 hint="remove unused LVs manually")
9809 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9810 """Replace a disk on the primary or secondary for DRBD 8.
9812 The algorithm for replace is quite complicated:
9814 1. for each disk to be replaced:
9816 1. create new LVs on the target node with unique names
9817 1. detach old LVs from the drbd device
9818 1. rename old LVs to name_replaced.<time_t>
9819 1. rename new LVs to old LVs
9820 1. attach the new LVs (with the old names now) to the drbd device
9822 1. wait for sync across all devices
9824 1. for each modified disk:
9826 1. remove old LVs (which have the name name_replaced.<time_t>)
9828 Failures are not very well handled.
9833 # Step: check device activation
9834 self.lu.LogStep(1, steps_total, "Check device existence")
9835 self._CheckDisksExistence([self.other_node, self.target_node])
9836 self._CheckVolumeGroup([self.target_node, self.other_node])
9838 # Step: check other node consistency
9839 self.lu.LogStep(2, steps_total, "Check peer consistency")
9840 self._CheckDisksConsistency(self.other_node,
9841 self.other_node == self.instance.primary_node,
9844 # Step: create new storage
9845 self.lu.LogStep(3, steps_total, "Allocate new storage")
9846 iv_names = self._CreateNewStorage(self.target_node)
9848 # Step: for each lv, detach+rename*2+attach
9849 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9850 for dev, old_lvs, new_lvs in iv_names.itervalues():
9851 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9853 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9855 result.Raise("Can't detach drbd from local storage on node"
9856 " %s for device %s" % (self.target_node, dev.iv_name))
9858 #cfg.Update(instance)
9860 # ok, we created the new LVs, so now we know we have the needed
9861 # storage; as such, we proceed on the target node to rename
9862 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9863 # using the assumption that logical_id == physical_id (which in
9864 # turn is the unique_id on that node)
9866 # FIXME(iustin): use a better name for the replaced LVs
9867 temp_suffix = int(time.time())
9868 ren_fn = lambda d, suff: (d.physical_id[0],
9869 d.physical_id[1] + "_replaced-%s" % suff)
9871 # Build the rename list based on what LVs exist on the node
9872 rename_old_to_new = []
9873 for to_ren in old_lvs:
9874 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9875 if not result.fail_msg and result.payload:
9877 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9879 self.lu.LogInfo("Renaming the old LVs on the target node")
9880 result = self.rpc.call_blockdev_rename(self.target_node,
9882 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9884 # Now we rename the new LVs to the old LVs
9885 self.lu.LogInfo("Renaming the new LVs on the target node")
9886 rename_new_to_old = [(new, old.physical_id)
9887 for old, new in zip(old_lvs, new_lvs)]
9888 result = self.rpc.call_blockdev_rename(self.target_node,
9890 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9892 # Intermediate steps of in-memory modifications
9893 for old, new in zip(old_lvs, new_lvs):
9894 new.logical_id = old.logical_id
9895 self.cfg.SetDiskID(new, self.target_node)
9897 # We need to modify old_lvs so that removal later removes the
9898 # right LVs, not the newly added ones; note that old_lvs is a
9899 # copy here
9900 for disk in old_lvs:
9901 disk.logical_id = ren_fn(disk, temp_suffix)
9902 self.cfg.SetDiskID(disk, self.target_node)
9904 # Now that the new lvs have the old name, we can add them to the device
9905 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9906 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9908 msg = result.fail_msg
9910 for new_lv in new_lvs:
9911 msg2 = self.rpc.call_blockdev_remove(self.target_node,
9914 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
9915 hint=("cleanup manually the unused logical"
9917 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
9920 if self.early_release:
9921 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9923 self._RemoveOldStorage(self.target_node, iv_names)
9924 # WARNING: we release both node locks here, do not do other RPCs
9925 # than WaitForSync to the primary node
9926 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
9927 names=[self.target_node, self.other_node])
9930 # This can fail as the old devices are degraded and _WaitForSync
9931 # does a combined result over all disks, so we don't check its return value
9932 self.lu.LogStep(cstep, steps_total, "Sync devices")
9934 _WaitForSync(self.lu, self.instance)
9936 # Check all devices manually
9937 self._CheckDevices(self.instance.primary_node, iv_names)
9939 # Step: remove old storage
9940 if not self.early_release:
9941 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9943 self._RemoveOldStorage(self.target_node, iv_names)
9945 def _ExecDrbd8Secondary(self, feedback_fn):
9946 """Replace the secondary node for DRBD 8.
9948 The algorithm for replace is quite complicated:
9949 - for all disks of the instance:
9950 - create new LVs on the new node with same names
9951 - shutdown the drbd device on the old secondary
9952 - disconnect the drbd network on the primary
9953 - create the drbd device on the new secondary
9954 - network attach the drbd on the primary, using an artifice:
9955 the drbd code for Attach() will connect to the network if it
9956 finds a device which is connected to the good local disks but
9957 not network enabled
9958 - wait for sync across all devices
9959 - remove all disks from the old secondary
9961 Failures are not very well handled.
9966 pnode = self.instance.primary_node
9968 # Step: check device activation
9969 self.lu.LogStep(1, steps_total, "Check device existence")
9970 self._CheckDisksExistence([self.instance.primary_node])
9971 self._CheckVolumeGroup([self.instance.primary_node])
9973 # Step: check other node consistency
9974 self.lu.LogStep(2, steps_total, "Check peer consistency")
9975 self._CheckDisksConsistency(self.instance.primary_node, True, True)
9977 # Step: create new storage
9978 self.lu.LogStep(3, steps_total, "Allocate new storage")
9979 for idx, dev in enumerate(self.instance.disks):
9980 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
9981 (self.new_node, idx))
9982 # we pass force_create=True to force LVM creation
9983 for new_lv in dev.children:
9984 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
9985 _GetInstanceInfoText(self.instance), False)
9987 # Step 4: drbd minors and drbd setup changes
9988 # after this, we must manually remove the drbd minors on both the
9989 # error and the success paths
9990 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9991 minors = self.cfg.AllocateDRBDMinor([self.new_node
9992 for dev in self.instance.disks],
9994 logging.debug("Allocated minors %r", minors)
9997 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
9998 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
9999 (self.new_node, idx))
10000 # create new devices on new_node; note that we create two IDs:
10001 # one without port, so the drbd will be activated without
10002 # networking information on the new node at this stage, and one
10003 # with network, for the latter activation in step 4
10004 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10005 if self.instance.primary_node == o_node1:
10008 assert self.instance.primary_node == o_node2, "Three-node instance?"
10011 new_alone_id = (self.instance.primary_node, self.new_node, None,
10012 p_minor, new_minor, o_secret)
10013 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10014 p_minor, new_minor, o_secret)
10016 iv_names[idx] = (dev, dev.children, new_net_id)
10017 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10019 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10020 logical_id=new_alone_id,
10021 children=dev.children,
10024 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10025 _GetInstanceInfoText(self.instance), False)
10026 except errors.GenericError:
10027 self.cfg.ReleaseDRBDMinors(self.instance.name)
10030 # We have new devices, shutdown the drbd on the old secondary
10031 for idx, dev in enumerate(self.instance.disks):
10032 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10033 self.cfg.SetDiskID(dev, self.target_node)
10034 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10036 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10037 "node: %s" % (idx, msg),
10038 hint=("Please cleanup this device manually as"
10039 " soon as possible"))
10041 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10042 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10043 self.instance.disks)[pnode]
10045 msg = result.fail_msg
10047 # detaches didn't succeed (unlikely)
10048 self.cfg.ReleaseDRBDMinors(self.instance.name)
10049 raise errors.OpExecError("Can't detach the disks from the network on"
10050 " old node: %s" % (msg,))
10052 # if we managed to detach at least one, we update all the disks of
10053 # the instance to point to the new secondary
10054 self.lu.LogInfo("Updating instance configuration")
10055 for dev, _, new_logical_id in iv_names.itervalues():
10056 dev.logical_id = new_logical_id
10057 self.cfg.SetDiskID(dev, self.instance.primary_node)
10059 self.cfg.Update(self.instance, feedback_fn)
10061 # and now perform the drbd attach
10062 self.lu.LogInfo("Attaching primary drbds to new secondary"
10063 " (standalone => connected)")
10064 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10066 self.node_secondary_ip,
10067 self.instance.disks,
10068 self.instance.name,
10070 for to_node, to_result in result.items():
10071 msg = to_result.fail_msg
10073 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10075 hint=("please do a gnt-instance info to see the"
10076 " status of disks"))
10078 if self.early_release:
10079 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10081 self._RemoveOldStorage(self.target_node, iv_names)
10082 # WARNING: we release all node locks here, do not do other RPCs
10083 # than WaitForSync to the primary node
10084 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10085 names=[self.instance.primary_node,
10090 # This can fail as the old devices are degraded and _WaitForSync
10091 # does a combined result over all disks, so we don't check its return value
10092 self.lu.LogStep(cstep, steps_total, "Sync devices")
10094 _WaitForSync(self.lu, self.instance)
10096 # Check all devices manually
10097 self._CheckDevices(self.instance.primary_node, iv_names)
10099 # Step: remove old storage
10100 if not self.early_release:
10101 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10102 self._RemoveOldStorage(self.target_node, iv_names)
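# How the pieces above fit together (illustrative sketch only): clients
# submit an OpInstanceReplaceDisks opcode, LUInstanceReplaceDisks sets up the
# locking and wraps the real work in a TLReplaceDisks tasklet, which then
# runs either _ExecDrbd8DiskOnly or _ExecDrbd8Secondary. All field values
# below are made up.
#
#   op = opcodes.OpInstanceReplaceDisks(instance_name="inst1.example.com",
#                                       mode=constants.REPLACE_DISK_CHG,
#                                       iallocator="hail",
#                                       disks=[],
#                                       early_release=False)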
10105 class LURepairNodeStorage(NoHooksLU):
10106 """Repairs the volume group on a node.
10111 def CheckArguments(self):
10112 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10114 storage_type = self.op.storage_type
10116 if (constants.SO_FIX_CONSISTENCY not in
10117 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10118 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10119 " repaired" % storage_type,
10120 errors.ECODE_INVAL)
10122 def ExpandNames(self):
10123 self.needed_locks = {
10124 locking.LEVEL_NODE: [self.op.node_name],
10127 def _CheckFaultyDisks(self, instance, node_name):
10128 """Ensure faulty disks abort the opcode or at least warn."""
10130 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10132 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10133 " node '%s'" % (instance.name, node_name),
10134 errors.ECODE_STATE)
10135 except errors.OpPrereqError, err:
10136 if self.op.ignore_consistency:
10137 self.proc.LogWarning(str(err.args[0]))
10141 def CheckPrereq(self):
10142 """Check prerequisites.
10145 # Check whether any instance on this node has faulty disks
10146 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10147 if not inst.admin_up:
10149 check_nodes = set(inst.all_nodes)
10150 check_nodes.discard(self.op.node_name)
10151 for inst_node_name in check_nodes:
10152 self._CheckFaultyDisks(inst, inst_node_name)
10154 def Exec(self, feedback_fn):
10155 feedback_fn("Repairing storage unit '%s' on %s ..." %
10156 (self.op.name, self.op.node_name))
10158 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10159 result = self.rpc.call_storage_execute(self.op.node_name,
10160 self.op.storage_type, st_args,
10162 constants.SO_FIX_CONSISTENCY)
10163 result.Raise("Failed to repair storage unit '%s' on %s" %
10164 (self.op.name, self.op.node_name))
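# Rough usage sketch for the LU above, assuming the usual LU/opcode pairing
# (an OpRepairNodeStorage opcode carrying the fields referenced here); the
# storage type, volume group name and node name are illustrative.
#
#   op = opcodes.OpRepairNodeStorage(node_name="node2.example.com",
#                                    storage_type=constants.ST_LVM_VG,
#                                    name="xenvg",
#                                    ignore_consistency=False)
#
# Only storage types whose constants.VALID_STORAGE_OPERATIONS entry includes
# constants.SO_FIX_CONSISTENCY pass CheckArguments.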
10167 class LUNodeEvacuate(NoHooksLU):
10168 """Evacuates instances off a list of nodes.
10173 _MODE2IALLOCATOR = {
10174 constants.NODE_EVAC_PRI: constants.IALLOCATOR_NEVAC_PRI,
10175 constants.NODE_EVAC_SEC: constants.IALLOCATOR_NEVAC_SEC,
10176 constants.NODE_EVAC_ALL: constants.IALLOCATOR_NEVAC_ALL,
10178 assert frozenset(_MODE2IALLOCATOR.keys()) == constants.NODE_EVAC_MODES
10179 assert (frozenset(_MODE2IALLOCATOR.values()) ==
10180 constants.IALLOCATOR_NEVAC_MODES)
10182 def CheckArguments(self):
10183 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10185 def ExpandNames(self):
10186 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10188 if self.op.remote_node is not None:
10189 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10190 assert self.op.remote_node
10192 if self.op.remote_node == self.op.node_name:
10193 raise errors.OpPrereqError("Can not use evacuated node as a new"
10194 " secondary node", errors.ECODE_INVAL)
10196 if self.op.mode != constants.NODE_EVAC_SEC:
10197 raise errors.OpPrereqError("Without the use of an iallocator only"
10198 " secondary instances can be evacuated",
10199 errors.ECODE_INVAL)
10202 self.share_locks = _ShareAll()
10203 self.needed_locks = {
10204 locking.LEVEL_INSTANCE: [],
10205 locking.LEVEL_NODEGROUP: [],
10206 locking.LEVEL_NODE: [],
10209 # Determine nodes (via group) optimistically, needs verification once locks
10210 # have been acquired
10211 self.lock_nodes = self._DetermineNodes()
10213 def _DetermineNodes(self):
10214 """Gets the list of nodes to operate on.
10217 if self.op.remote_node is None:
10218 # Iallocator will choose any node(s) in the same group
10219 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10221 group_nodes = frozenset([self.op.remote_node])
10223 # Determine nodes to be locked
10224 return set([self.op.node_name]) | group_nodes
10226 def _DetermineInstances(self):
10227 """Builds list of instances to operate on.
10230 assert self.op.mode in constants.NODE_EVAC_MODES
10232 if self.op.mode == constants.NODE_EVAC_PRI:
10233 # Primary instances only
10234 inst_fn = _GetNodePrimaryInstances
10235 assert self.op.remote_node is None, \
10236 "Evacuating primary instances requires iallocator"
10237 elif self.op.mode == constants.NODE_EVAC_SEC:
10238 # Secondary instances only
10239 inst_fn = _GetNodeSecondaryInstances
10242 assert self.op.mode == constants.NODE_EVAC_ALL
10243 inst_fn = _GetNodeInstances
10244 # TODO: In 2.6, change the iallocator interface to take an evacuation mode
10246 raise errors.OpPrereqError("Due to an issue with the iallocator"
10247 " interface it is not possible to evacuate"
10248 " all instances at once; specify explicitly"
10249 " whether to evacuate primary or secondary"
10251 errors.ECODE_INVAL)
10253 return inst_fn(self.cfg, self.op.node_name)
10255 def DeclareLocks(self, level):
10256 if level == locking.LEVEL_INSTANCE:
10257 # Lock instances optimistically, needs verification once node and group
10258 # locks have been acquired
10259 self.needed_locks[locking.LEVEL_INSTANCE] = \
10260 set(i.name for i in self._DetermineInstances())
10262 elif level == locking.LEVEL_NODEGROUP:
10263 # Lock node groups for all potential target nodes optimistically, needs
10264 # verification once nodes have been acquired
10265 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10266 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10268 elif level == locking.LEVEL_NODE:
10269 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10271 def CheckPrereq(self):
10273 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10274 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10275 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10277 need_nodes = self._DetermineNodes()
10279 if not owned_nodes.issuperset(need_nodes):
10280 raise errors.OpPrereqError("Nodes in same group as '%s' changed since"
10281 " locks were acquired, current nodes are"
10282 " are '%s', used to be '%s'; retry the"
10284 (self.op.node_name,
10285 utils.CommaJoin(need_nodes),
10286 utils.CommaJoin(owned_nodes)),
10287 errors.ECODE_STATE)
10289 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10290 if owned_groups != wanted_groups:
10291 raise errors.OpExecError("Node groups changed since locks were acquired,"
10292 " current groups are '%s', used to be '%s';"
10293 " retry the operation" %
10294 (utils.CommaJoin(wanted_groups),
10295 utils.CommaJoin(owned_groups)))
10297 # Determine affected instances
10298 self.instances = self._DetermineInstances()
10299 self.instance_names = [i.name for i in self.instances]
10301 if set(self.instance_names) != owned_instances:
10302 raise errors.OpExecError("Instances on node '%s' changed since locks"
10303 " were acquired, current instances are '%s',"
10304 " used to be '%s'; retry the operation" %
10305 (self.op.node_name,
10306 utils.CommaJoin(self.instance_names),
10307 utils.CommaJoin(owned_instances)))
10309 if self.instance_names:
10310 self.LogInfo("Evacuating instances from node '%s': %s",
10312 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10314 self.LogInfo("No instances to evacuate from node '%s'",
10317 if self.op.remote_node is not None:
10318 for i in self.instances:
10319 if i.primary_node == self.op.remote_node:
10320 raise errors.OpPrereqError("Node %s is the primary node of"
10321 " instance %s, cannot use it as"
10323 (self.op.remote_node, i.name),
10324 errors.ECODE_INVAL)
10326 def Exec(self, feedback_fn):
10327 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10329 if not self.instance_names:
10330 # No instances to evacuate
10333 elif self.op.iallocator is not None:
10334 # TODO: Implement relocation to other group
10335 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10336 evac_mode=self._MODE2IALLOCATOR[self.op.mode],
10337 instances=list(self.instance_names))
10339 ial.Run(self.op.iallocator)
10341 if not ial.success:
10342 raise errors.OpPrereqError("Can't compute node evacuation using"
10343 " iallocator '%s': %s" %
10344 (self.op.iallocator, ial.info),
10345 errors.ECODE_NORES)
10347 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10349 elif self.op.remote_node is not None:
10350 assert self.op.mode == constants.NODE_EVAC_SEC
10352 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10353 remote_node=self.op.remote_node,
10355 mode=constants.REPLACE_DISK_CHG,
10356 early_release=self.op.early_release)]
10357 for instance_name in self.instance_names
10361 raise errors.ProgrammerError("No iallocator or remote node")
10363 return ResultWithJobs(jobs)
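# Hypothetical submission for the LU above, assuming the usual LU/opcode
# pairing (an OpNodeEvacuate opcode); the field names match the self.op
# attributes used in this class and the values are made up.
#
#   op = opcodes.OpNodeEvacuate(node_name="node1.example.com",
#                               mode=constants.NODE_EVAC_SEC,
#                               remote_node=None,
#                               iallocator="hail",
#                               early_release=False)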
10366 def _SetOpEarlyRelease(early_release, op):
10367 """Sets C{early_release} flag on opcodes if available.
10371 op.early_release = early_release
10372 except AttributeError:
10373 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
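# Minimal illustration of the helper above (made-up opcode): opcodes exposing
# the attribute get it set, anything else is left untouched.
#
#   op = opcodes.OpInstanceReplaceDisks(instance_name="inst1.example.com",
#                                       mode=constants.REPLACE_DISK_AUTO)
#   _SetOpEarlyRelease(True, op)
#   # op.early_release is now True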
10378 def _NodeEvacDest(use_nodes, group, nodes):
10379 """Returns group or nodes depending on caller's choice.
10383 return utils.CommaJoin(nodes)
10388 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10389 """Unpacks the result of change-group and node-evacuate iallocator requests.
10391 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10392 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10394 @type lu: L{LogicalUnit}
10395 @param lu: Logical unit instance
10396 @type alloc_result: tuple/list
10397 @param alloc_result: Result from iallocator
10398 @type early_release: bool
10399 @param early_release: Whether to release locks early if possible
10400 @type use_nodes: bool
10401 @param use_nodes: Whether to display node names instead of groups
10404 (moved, failed, jobs) = alloc_result
10407 failreason = utils.CommaJoin("%s (%s)" % (name, reason)
10408 for (name, reason) in failed)
10409 lu.LogWarning("Unable to evacuate instances %s", failreason)
10410 raise errors.OpExecError("Unable to evacuate instances %s" % failreason)
10413 lu.LogInfo("Instances to be moved: %s",
10414 utils.CommaJoin("%s (to %s)" %
10415 (name, _NodeEvacDest(use_nodes, group, nodes))
10416 for (name, group, nodes) in moved))
10418 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10419 map(opcodes.OpCode.LoadOpCode, ops))
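# As the unpacking above suggests, alloc_result is expected to look roughly
# like this (purely hypothetical values): a list of planned moves, a list of
# (name, reason) failures, and a list of jobs, each job being a list of
# serialized opcodes.
#
#   alloc_result = (
#     [("inst1.example.com", "group1", ["node2.example.com"])],  # moved
#     [],                                                        # failed
#     [[{"OP_ID": "OP_INSTANCE_REPLACE_DISKS"}]],                # jobs
#   )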
10423 class LUInstanceGrowDisk(LogicalUnit):
10424 """Grow a disk of an instance.
10427 HPATH = "disk-grow"
10428 HTYPE = constants.HTYPE_INSTANCE
10431 def ExpandNames(self):
10432 self._ExpandAndLockInstance()
10433 self.needed_locks[locking.LEVEL_NODE] = []
10434 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10436 def DeclareLocks(self, level):
10437 if level == locking.LEVEL_NODE:
10438 self._LockInstancesNodes()
10440 def BuildHooksEnv(self):
10441 """Build hooks env.
10443 This runs on the master, the primary and all the secondaries.
10447 "DISK": self.op.disk,
10448 "AMOUNT": self.op.amount,
10450 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10453 def BuildHooksNodes(self):
10454 """Build hooks nodes.
10457 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10460 def CheckPrereq(self):
10461 """Check prerequisites.
10463 This checks that the instance is in the cluster.
10466 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10467 assert instance is not None, \
10468 "Cannot retrieve locked instance %s" % self.op.instance_name
10469 nodenames = list(instance.all_nodes)
10470 for node in nodenames:
10471 _CheckNodeOnline(self, node)
10473 self.instance = instance
10475 if instance.disk_template not in constants.DTS_GROWABLE:
10476 raise errors.OpPrereqError("Instance's disk layout does not support"
10477 " growing", errors.ECODE_INVAL)
10479 self.disk = instance.FindDisk(self.op.disk)
10481 if instance.disk_template not in (constants.DT_FILE,
10482 constants.DT_SHARED_FILE):
10483 # TODO: check the free disk space for file, when that feature will be
10485 _CheckNodesFreeDiskPerVG(self, nodenames,
10486 self.disk.ComputeGrowth(self.op.amount))
10488 def Exec(self, feedback_fn):
10489 """Execute disk grow.
10492 instance = self.instance
10495 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10497 raise errors.OpExecError("Cannot activate block device to grow")
10499 # First run all grow ops in dry-run mode
10500 for node in instance.all_nodes:
10501 self.cfg.SetDiskID(disk, node)
10502 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10503 result.Raise("Grow request failed to node %s" % node)
10505 # We know that (as far as we can test) operations across different
10506 # nodes will succeed, time to run it for real
10507 for node in instance.all_nodes:
10508 self.cfg.SetDiskID(disk, node)
10509 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10510 result.Raise("Grow request failed to node %s" % node)
10512 # TODO: Rewrite code to work properly
10513 # DRBD goes into sync mode for a short amount of time after executing the
10514 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10515 # calling "resize" in sync mode fails. Sleeping for a short amount of
10516 # time is a work-around.
10519 disk.RecordGrow(self.op.amount)
10520 self.cfg.Update(instance, feedback_fn)
10521 if self.op.wait_for_sync:
10522 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10524 self.proc.LogWarning("Disk sync-ing has not returned a good"
10525 " status; please check the instance")
10526 if not instance.admin_up:
10527 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10528 elif not instance.admin_up:
10529 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10530 " not supposed to be running because no wait for"
10531 " sync mode was requested")
10534 class LUInstanceQueryData(NoHooksLU):
10535 """Query runtime instance data.
10540 def ExpandNames(self):
10541 self.needed_locks = {}
10543 # Use locking if requested or when non-static information is wanted
10544 if not (self.op.static or self.op.use_locking):
10545 self.LogWarning("Non-static data requested, locks need to be acquired")
10546 self.op.use_locking = True
10548 if self.op.instances or not self.op.use_locking:
10549 # Expand instance names right here
10550 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10552 # Will use acquired locks
10553 self.wanted_names = None
10555 if self.op.use_locking:
10556 self.share_locks = _ShareAll()
10558 if self.wanted_names is None:
10559 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10561 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10563 self.needed_locks[locking.LEVEL_NODEGROUP] = []
10564 self.needed_locks[locking.LEVEL_NODE] = []
10565 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10567 def DeclareLocks(self, level):
10568 if self.op.use_locking:
10569 if level == locking.LEVEL_NODEGROUP:
10570 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10572 # Lock all groups used by instances optimistically; this requires going
10573 # via the node before it's locked, requiring verification later on
10574 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10575 frozenset(group_uuid
10576 for instance_name in owned_instances
10578 self.cfg.GetInstanceNodeGroups(instance_name))
10580 elif level == locking.LEVEL_NODE:
10581 self._LockInstancesNodes()
10583 def CheckPrereq(self):
10584 """Check prerequisites.
10586 This only checks the optional instance list against the existing names.
10589 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
10590 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
10591 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
10593 if self.wanted_names is None:
10594 assert self.op.use_locking, "Locking was not used"
10595 self.wanted_names = owned_instances
10597 instances = dict(self.cfg.GetMultiInstanceInfo(self.wanted_names))
10599 if self.op.use_locking:
10600 _CheckInstancesNodeGroups(self.cfg, instances, owned_groups, owned_nodes,
10603 assert not (owned_instances or owned_groups or owned_nodes)
10605 self.wanted_instances = instances.values()
10607 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10608 """Returns the status of a block device
10611 if self.op.static or not node:
10614 self.cfg.SetDiskID(dev, node)
10616 result = self.rpc.call_blockdev_find(node, dev)
10620 result.Raise("Can't compute disk status for %s" % instance_name)
10622 status = result.payload
10626 return (status.dev_path, status.major, status.minor,
10627 status.sync_percent, status.estimated_time,
10628 status.is_degraded, status.ldisk_status)
10630 def _ComputeDiskStatus(self, instance, snode, dev):
10631 """Compute block device status.
10634 if dev.dev_type in constants.LDS_DRBD:
10635 # we change the snode then (otherwise we use the one passed in)
10636 if dev.logical_id[0] == instance.primary_node:
10637 snode = dev.logical_id[1]
10639 snode = dev.logical_id[0]
10641 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10642 instance.name, dev)
10643 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10646 dev_children = map(compat.partial(self._ComputeDiskStatus,
10653 "iv_name": dev.iv_name,
10654 "dev_type": dev.dev_type,
10655 "logical_id": dev.logical_id,
10656 "physical_id": dev.physical_id,
10657 "pstatus": dev_pstatus,
10658 "sstatus": dev_sstatus,
10659 "children": dev_children,
10664 def Exec(self, feedback_fn):
10665 """Gather and return data"""
10668 cluster = self.cfg.GetClusterInfo()
10670 node_names = itertools.chain(*(i.all_nodes for i in self.wanted_instances))
10671 nodes = dict(self.cfg.GetMultiNodeInfo(node_names))
10673 groups = dict(self.cfg.GetMultiNodeGroupInfo(node.group
10674 for node in nodes.values()))
10676 group2name_fn = lambda uuid: groups[uuid].name
10678 for instance in self.wanted_instances:
10679 pnode = nodes[instance.primary_node]
10681 if self.op.static or pnode.offline:
10682 remote_state = None
10684 self.LogWarning("Primary node %s is marked offline, returning static"
10685 " information only for instance %s" %
10686 (pnode.name, instance.name))
10688 remote_info = self.rpc.call_instance_info(instance.primary_node,
10690 instance.hypervisor)
10691 remote_info.Raise("Error checking node %s" % instance.primary_node)
10692 remote_info = remote_info.payload
10693 if remote_info and "state" in remote_info:
10694 remote_state = "up"
10696 remote_state = "down"
10698 if instance.admin_up:
10699 config_state = "up"
10701 config_state = "down"
10703 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10706 snodes_group_uuids = [nodes[snode_name].group
10707 for snode_name in instance.secondary_nodes]
10709 result[instance.name] = {
10710 "name": instance.name,
10711 "config_state": config_state,
10712 "run_state": remote_state,
10713 "pnode": instance.primary_node,
10714 "pnode_group_uuid": pnode.group,
10715 "pnode_group_name": group2name_fn(pnode.group),
10716 "snodes": instance.secondary_nodes,
10717 "snodes_group_uuids": snodes_group_uuids,
10718 "snodes_group_names": map(group2name_fn, snodes_group_uuids),
10720 # this happens to be the same format used for hooks
10721 "nics": _NICListToTuple(self, instance.nics),
10722 "disk_template": instance.disk_template,
10724 "hypervisor": instance.hypervisor,
10725 "network_port": instance.network_port,
10726 "hv_instance": instance.hvparams,
10727 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10728 "be_instance": instance.beparams,
10729 "be_actual": cluster.FillBE(instance),
10730 "os_instance": instance.osparams,
10731 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10732 "serial_no": instance.serial_no,
10733 "mtime": instance.mtime,
10734 "ctime": instance.ctime,
10735 "uuid": instance.uuid,
10741 class LUInstanceSetParams(LogicalUnit):
10742 """Modifies an instances's parameters.
10745 HPATH = "instance-modify"
10746 HTYPE = constants.HTYPE_INSTANCE
10749 def CheckArguments(self):
10750 if not (self.op.nics or self.op.disks or self.op.disk_template or
10751 self.op.hvparams or self.op.beparams or self.op.os_name):
10752 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10754 if self.op.hvparams:
10755 _CheckGlobalHvParams(self.op.hvparams)
10759 for disk_op, disk_dict in self.op.disks:
10760 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10761 if disk_op == constants.DDM_REMOVE:
10762 disk_addremove += 1
10764 elif disk_op == constants.DDM_ADD:
10765 disk_addremove += 1
10767 if not isinstance(disk_op, int):
10768 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10769 if not isinstance(disk_dict, dict):
10770 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10771 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10773 if disk_op == constants.DDM_ADD:
10774 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10775 if mode not in constants.DISK_ACCESS_SET:
10776 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10777 errors.ECODE_INVAL)
10778 size = disk_dict.get(constants.IDISK_SIZE, None)
10780 raise errors.OpPrereqError("Required disk parameter size missing",
10781 errors.ECODE_INVAL)
10784 except (TypeError, ValueError), err:
10785 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10786 str(err), errors.ECODE_INVAL)
10787 disk_dict[constants.IDISK_SIZE] = size
10789 # modification of disk
10790 if constants.IDISK_SIZE in disk_dict:
10791 raise errors.OpPrereqError("Disk size change not possible, use"
10792 " grow-disk", errors.ECODE_INVAL)
10794 if disk_addremove > 1:
10795 raise errors.OpPrereqError("Only one disk add or remove operation"
10796 " supported at a time", errors.ECODE_INVAL)
10798 if self.op.disks and self.op.disk_template is not None:
10799 raise errors.OpPrereqError("Disk template conversion and other disk"
10800 " changes not supported at the same time",
10801 errors.ECODE_INVAL)
10803 if (self.op.disk_template and
10804 self.op.disk_template in constants.DTS_INT_MIRROR and
10805 self.op.remote_node is None):
10806 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10807 " one requires specifying a secondary node",
10808 errors.ECODE_INVAL)
10812 for nic_op, nic_dict in self.op.nics:
10813 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10814 if nic_op == constants.DDM_REMOVE:
10817 elif nic_op == constants.DDM_ADD:
10820 if not isinstance(nic_op, int):
10821 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10822 if not isinstance(nic_dict, dict):
10823 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10824 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10826 # nic_dict should be a dict
10827 nic_ip = nic_dict.get(constants.INIC_IP, None)
10828 if nic_ip is not None:
10829 if nic_ip.lower() == constants.VALUE_NONE:
10830 nic_dict[constants.INIC_IP] = None
10832 if not netutils.IPAddress.IsValid(nic_ip):
10833 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10834 errors.ECODE_INVAL)
10836 nic_bridge = nic_dict.get("bridge", None)
10837 nic_link = nic_dict.get(constants.INIC_LINK, None)
10838 if nic_bridge and nic_link:
10839 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10840 " at the same time", errors.ECODE_INVAL)
10841 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10842 nic_dict["bridge"] = None
10843 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10844 nic_dict[constants.INIC_LINK] = None
10846 if nic_op == constants.DDM_ADD:
10847 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10848 if nic_mac is None:
10849 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10851 if constants.INIC_MAC in nic_dict:
10852 nic_mac = nic_dict[constants.INIC_MAC]
10853 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10854 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10856 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10857 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10858 " modifying an existing nic",
10859 errors.ECODE_INVAL)
10861 if nic_addremove > 1:
10862 raise errors.OpPrereqError("Only one NIC add or remove operation"
10863 " supported at a time", errors.ECODE_INVAL)
10865 def ExpandNames(self):
10866 self._ExpandAndLockInstance()
10867 self.needed_locks[locking.LEVEL_NODE] = []
10868 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10870 def DeclareLocks(self, level):
10871 if level == locking.LEVEL_NODE:
10872 self._LockInstancesNodes()
10873 if self.op.disk_template and self.op.remote_node:
10874 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10875 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10877 def BuildHooksEnv(self):
10878 """Build hooks env.
10880 This runs on the master, primary and secondaries.
10884 if constants.BE_MEMORY in self.be_new:
10885 args["memory"] = self.be_new[constants.BE_MEMORY]
10886 if constants.BE_VCPUS in self.be_new:
10887 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10888 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10889 # information at all.
10892 nic_override = dict(self.op.nics)
10893 for idx, nic in enumerate(self.instance.nics):
10894 if idx in nic_override:
10895 this_nic_override = nic_override[idx]
10897 this_nic_override = {}
10898 if constants.INIC_IP in this_nic_override:
10899 ip = this_nic_override[constants.INIC_IP]
10902 if constants.INIC_MAC in this_nic_override:
10903 mac = this_nic_override[constants.INIC_MAC]
10906 if idx in self.nic_pnew:
10907 nicparams = self.nic_pnew[idx]
10909 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10910 mode = nicparams[constants.NIC_MODE]
10911 link = nicparams[constants.NIC_LINK]
10912 args["nics"].append((ip, mac, mode, link))
10913 if constants.DDM_ADD in nic_override:
10914 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10915 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
10916 nicparams = self.nic_pnew[constants.DDM_ADD]
10917 mode = nicparams[constants.NIC_MODE]
10918 link = nicparams[constants.NIC_LINK]
10919 args["nics"].append((ip, mac, mode, link))
10920 elif constants.DDM_REMOVE in nic_override:
10921 del args["nics"][-1]
10923 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10924 if self.op.disk_template:
10925 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10929 def BuildHooksNodes(self):
10930 """Build hooks nodes.
10933 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10936 def CheckPrereq(self):
10937 """Check prerequisites.
10939 This only checks the instance list against the existing names.
10942 # checking the new params on the primary/secondary nodes
10944 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10945 cluster = self.cluster = self.cfg.GetClusterInfo()
10946 assert self.instance is not None, \
10947 "Cannot retrieve locked instance %s" % self.op.instance_name
10948 pnode = instance.primary_node
10949 nodelist = list(instance.all_nodes)
10952 if self.op.os_name and not self.op.force:
10953 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10954 self.op.force_variant)
10955 instance_os = self.op.os_name
10957 instance_os = instance.os
10959 if self.op.disk_template:
10960 if instance.disk_template == self.op.disk_template:
10961 raise errors.OpPrereqError("Instance already has disk template %s" %
10962 instance.disk_template, errors.ECODE_INVAL)
10964 if (instance.disk_template,
10965 self.op.disk_template) not in self._DISK_CONVERSIONS:
10966 raise errors.OpPrereqError("Unsupported disk template conversion from"
10967 " %s to %s" % (instance.disk_template,
10968 self.op.disk_template),
10969 errors.ECODE_INVAL)
10970 _CheckInstanceDown(self, instance, "cannot change disk template")
10971 if self.op.disk_template in constants.DTS_INT_MIRROR:
10972 if self.op.remote_node == pnode:
10973 raise errors.OpPrereqError("Given new secondary node %s is the same"
10974 " as the primary node of the instance" %
10975 self.op.remote_node, errors.ECODE_STATE)
10976 _CheckNodeOnline(self, self.op.remote_node)
10977 _CheckNodeNotDrained(self, self.op.remote_node)
10978 # FIXME: here we assume that the old instance type is DT_PLAIN
10979 assert instance.disk_template == constants.DT_PLAIN
10980 disks = [{constants.IDISK_SIZE: d.size,
10981 constants.IDISK_VG: d.logical_id[0]}
10982 for d in instance.disks]
10983 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10984 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
10986 # hvparams processing
10987 if self.op.hvparams:
10988 hv_type = instance.hypervisor
10989 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
10990 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
10991 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
10994 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
10995 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
10996 self.hv_new = hv_new # the new actual values
10997 self.hv_inst = i_hvdict # the new dict (without defaults)
10999 self.hv_new = self.hv_inst = {}
11001 # beparams processing
11002 if self.op.beparams:
11003 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
11005 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11006 be_new = cluster.SimpleFillBE(i_bedict)
11007 self.be_new = be_new # the new actual values
11008 self.be_inst = i_bedict # the new dict (without defaults)
11010 self.be_new = self.be_inst = {}
11011 be_old = cluster.FillBE(instance)
11013 # osparams processing
11014 if self.op.osparams:
11015 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11016 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11017 self.os_inst = i_osdict # the new dict (without defaults)
11023 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
11024 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
11025 mem_check_list = [pnode]
11026 if be_new[constants.BE_AUTO_BALANCE]:
11027 # either we changed auto_balance to yes or it was already set before
11028 mem_check_list.extend(instance.secondary_nodes)
11029 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11030 instance.hypervisor)
11031 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11032 instance.hypervisor)
11033 pninfo = nodeinfo[pnode]
11034 msg = pninfo.fail_msg
11036 # Assume the primary node is unreachable and go ahead
11037 self.warn.append("Can't get info from primary node %s: %s" %
11039 elif not isinstance(pninfo.payload.get("memory_free", None), int):
11040 self.warn.append("Node data from primary node %s doesn't contain"
11041 " free memory information" % pnode)
11042 elif instance_info.fail_msg:
11043 self.warn.append("Can't get instance runtime information: %s" %
11044 instance_info.fail_msg)
11046 if instance_info.payload:
11047 current_mem = int(instance_info.payload["memory"])
11049 # Assume instance not running
11050 # (there is a slight race condition here, but it's not very probable,
11051 # and we have no other way to check)
11053 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
11054 pninfo.payload["memory_free"])
11056 raise errors.OpPrereqError("This change will prevent the instance"
11057 " from starting, due to %d MB of memory"
11058 " missing on its primary node" % miss_mem,
11059 errors.ECODE_NORES)
11061 if be_new[constants.BE_AUTO_BALANCE]:
11062 for node, nres in nodeinfo.items():
11063 if node not in instance.secondary_nodes:
11065 nres.Raise("Can't get info from secondary node %s" % node,
11066 prereq=True, ecode=errors.ECODE_STATE)
11067 if not isinstance(nres.payload.get("memory_free", None), int):
11068 raise errors.OpPrereqError("Secondary node %s didn't return free"
11069 " memory information" % node,
11070 errors.ECODE_STATE)
11071 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11072 raise errors.OpPrereqError("This change will prevent the instance"
11073 " from failover to its secondary node"
11074 " %s, due to not enough memory" % node,
11075 errors.ECODE_STATE)
11078 self.nic_pnew = {}
11079 self.nic_pinst = {}
11080 for nic_op, nic_dict in self.op.nics:
11081 if nic_op == constants.DDM_REMOVE:
11082 if not instance.nics:
11083 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11084 errors.ECODE_INVAL)
11086 if nic_op != constants.DDM_ADD:
11088 if not instance.nics:
11089 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11090 " no NICs" % nic_op,
11091 errors.ECODE_INVAL)
11092 if nic_op < 0 or nic_op >= len(instance.nics):
11093 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11095 (nic_op, len(instance.nics) - 1),
11096 errors.ECODE_INVAL)
11097 old_nic_params = instance.nics[nic_op].nicparams
11098 old_nic_ip = instance.nics[nic_op].ip
11099 else:
11100 old_nic_params = {}
11101 old_nic_ip = None
11103 update_params_dict = dict([(key, nic_dict[key])
11104 for key in constants.NICS_PARAMETERS
11105 if key in nic_dict])
11107 if "bridge" in nic_dict:
11108 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
11110 new_nic_params = _GetUpdatedParams(old_nic_params,
11111 update_params_dict)
11112 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11113 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11114 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11115 self.nic_pinst[nic_op] = new_nic_params
11116 self.nic_pnew[nic_op] = new_filled_nic_params
11117 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11119 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11120 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11121 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11122 if msg:
11123 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11124 if self.op.force:
11125 self.warn.append(msg)
11126 else:
11127 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11128 if new_nic_mode == constants.NIC_MODE_ROUTED:
11129 if constants.INIC_IP in nic_dict:
11130 nic_ip = nic_dict[constants.INIC_IP]
11131 else:
11132 nic_ip = old_nic_ip
11133 if nic_ip is None:
11134 raise errors.OpPrereqError("Cannot set the nic ip to None"
11135 " on a routed nic", errors.ECODE_INVAL)
11136 if constants.INIC_MAC in nic_dict:
11137 nic_mac = nic_dict[constants.INIC_MAC]
11138 if nic_mac is None:
11139 raise errors.OpPrereqError("Cannot set the nic mac to None",
11140 errors.ECODE_INVAL)
11141 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11142 # otherwise generate the mac
11143 nic_dict[constants.INIC_MAC] = \
11144 self.cfg.GenerateMAC(self.proc.GetECId())
11145 else:
11146 # or validate/reserve the current one
11147 try:
11148 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11149 except errors.ReservationError:
11150 raise errors.OpPrereqError("MAC address %s already in use"
11151 " in cluster" % nic_mac,
11152 errors.ECODE_NOTUNIQUE)
11155 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11156 raise errors.OpPrereqError("Disk operations not supported for"
11157 " diskless instances",
11158 errors.ECODE_INVAL)
11159 for disk_op, _ in self.op.disks:
11160 if disk_op == constants.DDM_REMOVE:
11161 if len(instance.disks) == 1:
11162 raise errors.OpPrereqError("Cannot remove the last disk of"
11163 " an instance", errors.ECODE_INVAL)
11164 _CheckInstanceDown(self, instance, "cannot remove disks")
11166 if (disk_op == constants.DDM_ADD and
11167 len(instance.disks) >= constants.MAX_DISKS):
11168 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11169 " add more" % constants.MAX_DISKS,
11170 errors.ECODE_STATE)
11171 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11173 if disk_op < 0 or disk_op >= len(instance.disks):
11174 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11176 (disk_op, len(instance.disks)),
11177 errors.ECODE_INVAL)
11181 def _ConvertPlainToDrbd(self, feedback_fn):
11182 """Converts an instance from plain to drbd.
11185 feedback_fn("Converting template to drbd")
11186 instance = self.instance
11187 pnode = instance.primary_node
11188 snode = self.op.remote_node
11190 # create a fake disk info for _GenerateDiskTemplate
11191 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11192 constants.IDISK_VG: d.logical_id[0]}
11193 for d in instance.disks]
11194 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11195 instance.name, pnode, [snode],
11196 disk_info, None, None, 0, feedback_fn)
11197 info = _GetInstanceInfoText(instance)
11198 feedback_fn("Creating aditional volumes...")
11199 # first, create the missing data and meta devices
11200 for disk in new_disks:
11201 # unfortunately this is... not too nice
11202 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11203 info, True)
11204 for child in disk.children:
11205 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11206 # at this stage, all new LVs have been created, we can rename the
11208 feedback_fn("Renaming original volumes...")
11209 rename_list = [(o, n.children[0].logical_id)
11210 for (o, n) in zip(instance.disks, new_disks)]
11211 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11212 result.Raise("Failed to rename original LVs")
11214 feedback_fn("Initializing DRBD devices...")
11215 # all child devices are in place, we can now create the DRBD devices
11216 for disk in new_disks:
11217 for node in [pnode, snode]:
11218 f_create = node == pnode
11219 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11221 # at this point, the instance has been modified
11222 instance.disk_template = constants.DT_DRBD8
11223 instance.disks = new_disks
11224 self.cfg.Update(instance, feedback_fn)
11226 # disks are created, waiting for sync
11227 disk_abort = not _WaitForSync(self, instance,
11228 oneshot=not self.op.wait_for_sync)
11229 if disk_abort:
11230 raise errors.OpExecError("There are some degraded disks for"
11231 " this instance, please cleanup manually")
11233 def _ConvertDrbdToPlain(self, feedback_fn):
11234 """Converts an instance from drbd to plain.
11237 instance = self.instance
11238 assert len(instance.secondary_nodes) == 1
11239 pnode = instance.primary_node
11240 snode = instance.secondary_nodes[0]
11241 feedback_fn("Converting template to plain")
11243 old_disks = instance.disks
11244 new_disks = [d.children[0] for d in old_disks]
11246 # copy over size and mode
11247 for parent, child in zip(old_disks, new_disks):
11248 child.size = parent.size
11249 child.mode = parent.mode
11251 # update instance structure
11252 instance.disks = new_disks
11253 instance.disk_template = constants.DT_PLAIN
11254 self.cfg.Update(instance, feedback_fn)
11256 feedback_fn("Removing volumes on the secondary node...")
11257 for disk in old_disks:
11258 self.cfg.SetDiskID(disk, snode)
11259 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11260 if msg:
11261 self.LogWarning("Could not remove block device %s on node %s,"
11262 " continuing anyway: %s", disk.iv_name, snode, msg)
11264 feedback_fn("Removing unneeded volumes on the primary node...")
11265 for idx, disk in enumerate(old_disks):
11266 meta = disk.children[1]
11267 self.cfg.SetDiskID(meta, pnode)
11268 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11269 if msg:
11270 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11271 " continuing anyway: %s", idx, pnode, msg)
11273 # this is a DRBD disk, return its port to the pool
11274 for disk in old_disks:
11275 tcp_port = disk.logical_id[2]
11276 self.cfg.AddTcpUdpPort(tcp_port)
11278 def Exec(self, feedback_fn):
11279 """Modifies an instance.
11281 All parameters take effect only at the next restart of the instance.
11283 """
11284 # Process here the warnings from CheckPrereq, as we don't have a
11285 # feedback_fn there.
11286 for warn in self.warn:
11287 feedback_fn("WARNING: %s" % warn)
11290 instance = self.instance
11292 for disk_op, disk_dict in self.op.disks:
11293 if disk_op == constants.DDM_REMOVE:
11294 # remove the last disk
11295 device = instance.disks.pop()
11296 device_idx = len(instance.disks)
11297 for node, disk in device.ComputeNodeTree(instance.primary_node):
11298 self.cfg.SetDiskID(disk, node)
11299 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11300 if msg:
11301 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11302 " continuing anyway", device_idx, node, msg)
11303 result.append(("disk/%d" % device_idx, "remove"))
11305 # if this is a DRBD disk, return its port to the pool
11306 if device.dev_type in constants.LDS_DRBD:
11307 tcp_port = device.logical_id[2]
11308 self.cfg.AddTcpUdpPort(tcp_port)
11309 elif disk_op == constants.DDM_ADD:
11311 if instance.disk_template in (constants.DT_FILE,
11312 constants.DT_SHARED_FILE):
11313 file_driver, file_path = instance.disks[0].logical_id
11314 file_path = os.path.dirname(file_path)
11315 else:
11316 file_driver = file_path = None
11317 disk_idx_base = len(instance.disks)
11318 new_disk = _GenerateDiskTemplate(self,
11319 instance.disk_template,
11320 instance.name, instance.primary_node,
11321 instance.secondary_nodes,
11322 [disk_dict],
11323 file_path,
11324 file_driver,
11325 disk_idx_base, feedback_fn)[0]
11326 instance.disks.append(new_disk)
11327 info = _GetInstanceInfoText(instance)
11329 logging.info("Creating volume %s for instance %s",
11330 new_disk.iv_name, instance.name)
11331 # Note: this needs to be kept in sync with _CreateDisks
11333 for node in instance.all_nodes:
11334 f_create = node == instance.primary_node
11335 try:
11336 _CreateBlockDev(self, node, instance, new_disk,
11337 f_create, info, f_create)
11338 except errors.OpExecError, err:
11339 self.LogWarning("Failed to create volume %s (%s) on"
11341 new_disk.iv_name, new_disk, node, err)
11342 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11343 (new_disk.size, new_disk.mode)))
11344 else:
11345 # change a given disk
11346 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11347 result.append(("disk.mode/%d" % disk_op,
11348 disk_dict[constants.IDISK_MODE]))
11350 if self.op.disk_template:
11351 r_shut = _ShutdownInstanceDisks(self, instance)
11352 if not r_shut:
11353 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11354 " proceed with disk template conversion")
11355 mode = (instance.disk_template, self.op.disk_template)
11356 try:
11357 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11358 except:
11359 self.cfg.ReleaseDRBDMinors(instance.name)
11360 raise
11361 result.append(("disk_template", self.op.disk_template))
11364 for nic_op, nic_dict in self.op.nics:
11365 if nic_op == constants.DDM_REMOVE:
11366 # remove the last nic
11367 del instance.nics[-1]
11368 result.append(("nic.%d" % len(instance.nics), "remove"))
11369 elif nic_op == constants.DDM_ADD:
11370 # mac and bridge should be set, by now
11371 mac = nic_dict[constants.INIC_MAC]
11372 ip = nic_dict.get(constants.INIC_IP, None)
11373 nicparams = self.nic_pinst[constants.DDM_ADD]
11374 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11375 instance.nics.append(new_nic)
11376 result.append(("nic.%d" % (len(instance.nics) - 1),
11377 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11378 (new_nic.mac, new_nic.ip,
11379 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11380 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11381 )))
11382 else:
11383 for key in (constants.INIC_MAC, constants.INIC_IP):
11384 if key in nic_dict:
11385 setattr(instance.nics[nic_op], key, nic_dict[key])
11386 if nic_op in self.nic_pinst:
11387 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11388 for key, val in nic_dict.iteritems():
11389 result.append(("nic.%s/%d" % (key, nic_op), val))
11392 if self.op.hvparams:
11393 instance.hvparams = self.hv_inst
11394 for key, val in self.op.hvparams.iteritems():
11395 result.append(("hv/%s" % key, val))
11398 if self.op.beparams:
11399 instance.beparams = self.be_inst
11400 for key, val in self.op.beparams.iteritems():
11401 result.append(("be/%s" % key, val))
11404 if self.op.os_name:
11405 instance.os = self.op.os_name
11408 if self.op.osparams:
11409 instance.osparams = self.os_inst
11410 for key, val in self.op.osparams.iteritems():
11411 result.append(("os/%s" % key, val))
11413 self.cfg.Update(instance, feedback_fn)
11415 return result
11417 _DISK_CONVERSIONS = {
11418 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11419 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
11420 }
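# Editorial sketch (not part of the original module): the mapping above is how
# Exec() dispatches a disk template conversion; a minimal illustration of the
# lookup, assuming a hypothetical LU instance "lu" that already holds the
# locked instance and a feedback function:
#
#   mode = (lu.instance.disk_template, lu.op.disk_template)
#   lu._DISK_CONVERSIONS[mode](lu, feedback_fn)
#
# Only the plain->drbd and drbd->plain pairs are registered, so any other
# combination is expected to have been rejected earlier; otherwise this
# lookup would fail with a KeyError.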
11423 class LUInstanceChangeGroup(LogicalUnit):
11424 HPATH = "instance-change-group"
11425 HTYPE = constants.HTYPE_INSTANCE
11428 def ExpandNames(self):
11429 self.share_locks = _ShareAll()
11430 self.needed_locks = {
11431 locking.LEVEL_NODEGROUP: [],
11432 locking.LEVEL_NODE: [],
11433 }
11435 self._ExpandAndLockInstance()
11437 if self.op.target_groups:
11438 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11439 self.op.target_groups)
11440 else:
11441 self.req_target_uuids = None
11443 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11445 def DeclareLocks(self, level):
11446 if level == locking.LEVEL_NODEGROUP:
11447 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11449 if self.req_target_uuids:
11450 lock_groups = set(self.req_target_uuids)
11452 # Lock all groups used by instance optimistically; this requires going
11453 # via the node before it's locked, requiring verification later on
11454 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11455 lock_groups.update(instance_groups)
11456 else:
11457 # No target groups, need to lock all of them
11458 lock_groups = locking.ALL_SET
11460 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11462 elif level == locking.LEVEL_NODE:
11463 if self.req_target_uuids:
11464 # Lock all nodes used by instances
11465 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11466 self._LockInstancesNodes()
11468 # Lock all nodes in all potential target groups
11469 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11470 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11471 member_nodes = [node_name
11472 for group in lock_groups
11473 for node_name in self.cfg.GetNodeGroup(group).members]
11474 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11475 else:
11476 # Lock all nodes as all groups are potential targets
11477 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11479 def CheckPrereq(self):
11480 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11481 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11482 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11484 assert (self.req_target_uuids is None or
11485 owned_groups.issuperset(self.req_target_uuids))
11486 assert owned_instances == set([self.op.instance_name])
11488 # Get instance information
11489 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11491 # Check if node groups for locked instance are still correct
11492 assert owned_nodes.issuperset(self.instance.all_nodes), \
11493 ("Instance %s's nodes changed while we kept the lock" %
11494 self.op.instance_name)
11496 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11497 owned_groups)
11499 if self.req_target_uuids:
11500 # User requested specific target groups
11501 self.target_uuids = self.req_target_uuids
11502 else:
11503 # All groups except those used by the instance are potential targets
11504 self.target_uuids = owned_groups - inst_groups
11506 conflicting_groups = self.target_uuids & inst_groups
11507 if conflicting_groups:
11508 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11509 " used by the instance '%s'" %
11510 (utils.CommaJoin(conflicting_groups),
11511 self.op.instance_name),
11512 errors.ECODE_INVAL)
11514 if not self.target_uuids:
11515 raise errors.OpPrereqError("There are no possible target groups",
11516 errors.ECODE_INVAL)
11518 def BuildHooksEnv(self):
11519 """Build hooks env.
11522 assert self.target_uuids
11525 "TARGET_GROUPS": " ".join(self.target_uuids),
11528 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11532 def BuildHooksNodes(self):
11533 """Build hooks nodes.
11536 mn = self.cfg.GetMasterNode()
11537 return ([mn], [mn])
11539 def Exec(self, feedback_fn):
11540 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11542 assert instances == [self.op.instance_name], "Instance not locked"
11544 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11545 instances=instances, target_groups=list(self.target_uuids))
11547 ial.Run(self.op.iallocator)
11549 if not ial.success:
11550 raise errors.OpPrereqError("Can't compute solution for changing group of"
11551 " instance '%s' using iallocator '%s': %s" %
11552 (self.op.instance_name, self.op.iallocator,
11553 ial.info),
11554 errors.ECODE_NORES)
11556 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
11558 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11559 " instance '%s'", len(jobs), self.op.instance_name)
11561 return ResultWithJobs(jobs)
11564 class LUBackupQuery(NoHooksLU):
11565 """Query the exports list
11570 def ExpandNames(self):
11571 self.needed_locks = {}
11572 self.share_locks[locking.LEVEL_NODE] = 1
11573 if not self.op.nodes:
11574 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11576 self.needed_locks[locking.LEVEL_NODE] = \
11577 _GetWantedNodes(self, self.op.nodes)
11579 def Exec(self, feedback_fn):
11580 """Compute the list of all the exported system images.
11583 @return: a dictionary with the structure node->(export-list)
11584 where export-list is a list of the instances exported on
11585 that node.
11587 """
11588 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11589 rpcresult = self.rpc.call_export_list(self.nodes)
11590 result = {}
11591 for node in rpcresult:
11592 if rpcresult[node].fail_msg:
11593 result[node] = False
11594 else:
11595 result[node] = rpcresult[node].payload
11597 return result
11600 class LUBackupPrepare(NoHooksLU):
11601 """Prepares an instance for an export and returns useful information.
11606 def ExpandNames(self):
11607 self._ExpandAndLockInstance()
11609 def CheckPrereq(self):
11610 """Check prerequisites.
11613 instance_name = self.op.instance_name
11615 self.instance = self.cfg.GetInstanceInfo(instance_name)
11616 assert self.instance is not None, \
11617 "Cannot retrieve locked instance %s" % self.op.instance_name
11618 _CheckNodeOnline(self, self.instance.primary_node)
11620 self._cds = _GetClusterDomainSecret()
11622 def Exec(self, feedback_fn):
11623 """Prepares an instance for an export.
11626 instance = self.instance
11628 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11629 salt = utils.GenerateSecret(8)
11631 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11632 result = self.rpc.call_x509_cert_create(instance.primary_node,
11633 constants.RIE_CERT_VALIDITY)
11634 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11636 (name, cert_pem) = result.payload
11638 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11639 cert_pem)
11641 return {
11642 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11643 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
11644 salt),
11645 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11646 }
11648 return None
11651 class LUBackupExport(LogicalUnit):
11652 """Export an instance to an image in the cluster.
11655 HPATH = "instance-export"
11656 HTYPE = constants.HTYPE_INSTANCE
11659 def CheckArguments(self):
11660 """Check the arguments.
11663 self.x509_key_name = self.op.x509_key_name
11664 self.dest_x509_ca_pem = self.op.destination_x509_ca
11666 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11667 if not self.x509_key_name:
11668 raise errors.OpPrereqError("Missing X509 key name for encryption",
11669 errors.ECODE_INVAL)
11671 if not self.dest_x509_ca_pem:
11672 raise errors.OpPrereqError("Missing destination X509 CA",
11673 errors.ECODE_INVAL)
11675 def ExpandNames(self):
11676 self._ExpandAndLockInstance()
11678 # Lock all nodes for local exports
11679 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11680 # FIXME: lock only instance primary and destination node
11682 # Sad but true, for now we have to lock all nodes, as we don't know where
11683 # the previous export might be, and in this LU we search for it and
11684 # remove it from its current node. In the future we could fix this by:
11685 # - making a tasklet to search (share-lock all), then create the
11686 # new one, then one to remove, after
11687 # - removing the removal operation altogether
11688 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11690 def DeclareLocks(self, level):
11691 """Last minute lock declaration."""
11692 # All nodes are locked anyway, so nothing to do here.
11694 def BuildHooksEnv(self):
11695 """Build hooks env.
11697 This will run on the master, primary node and target node.
11699 """
11700 env = {
11701 "EXPORT_MODE": self.op.mode,
11702 "EXPORT_NODE": self.op.target_node,
11703 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11704 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11705 # TODO: Generic function for boolean env variables
11706 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11709 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11713 def BuildHooksNodes(self):
11714 """Build hooks nodes.
11717 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11719 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11720 nl.append(self.op.target_node)
11724 def CheckPrereq(self):
11725 """Check prerequisites.
11727 This checks that the instance and node names are valid.
11730 instance_name = self.op.instance_name
11732 self.instance = self.cfg.GetInstanceInfo(instance_name)
11733 assert self.instance is not None, \
11734 "Cannot retrieve locked instance %s" % self.op.instance_name
11735 _CheckNodeOnline(self, self.instance.primary_node)
11737 if (self.op.remove_instance and self.instance.admin_up and
11738 not self.op.shutdown):
11739 raise errors.OpPrereqError("Can not remove instance without shutting it"
11742 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11743 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11744 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11745 assert self.dst_node is not None
11747 _CheckNodeOnline(self, self.dst_node.name)
11748 _CheckNodeNotDrained(self, self.dst_node.name)
11751 self.dest_disk_info = None
11752 self.dest_x509_ca = None
11754 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11755 self.dst_node = None
11757 if len(self.op.target_node) != len(self.instance.disks):
11758 raise errors.OpPrereqError(("Received destination information for %s"
11759 " disks, but instance %s has %s disks") %
11760 (len(self.op.target_node), instance_name,
11761 len(self.instance.disks)),
11762 errors.ECODE_INVAL)
11764 cds = _GetClusterDomainSecret()
11766 # Check X509 key name
11767 try:
11768 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11769 except (TypeError, ValueError), err:
11770 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11772 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11773 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11774 errors.ECODE_INVAL)
11776 # Load and verify CA
11777 try:
11778 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11779 except OpenSSL.crypto.Error, err:
11780 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11781 (err, ), errors.ECODE_INVAL)
11783 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11784 if errcode is not None:
11785 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11786 (msg, ), errors.ECODE_INVAL)
11788 self.dest_x509_ca = cert
11790 # Verify target information
11791 disk_info = []
11792 for idx, disk_data in enumerate(self.op.target_node):
11793 try:
11794 (host, port, magic) = \
11795 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11796 except errors.GenericError, err:
11797 raise errors.OpPrereqError("Target info for disk %s: %s" %
11798 (idx, err), errors.ECODE_INVAL)
11800 disk_info.append((host, port, magic))
11802 assert len(disk_info) == len(self.op.target_node)
11803 self.dest_disk_info = disk_info
11805 else:
11806 raise errors.ProgrammerError("Unhandled export mode %r" %
11807 self.op.mode)
11809 # instance disk type verification
11810 # TODO: Implement export support for file-based disks
11811 for disk in self.instance.disks:
11812 if disk.dev_type == constants.LD_FILE:
11813 raise errors.OpPrereqError("Export not supported for instances with"
11814 " file-based disks", errors.ECODE_INVAL)
11816 def _CleanupExports(self, feedback_fn):
11817 """Removes exports of current instance from all other nodes.
11819 If an instance in a cluster with nodes A..D was exported to node C, its
11820 exports will be removed from the nodes A, B and D.
11823 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11825 nodelist = self.cfg.GetNodeList()
11826 nodelist.remove(self.dst_node.name)
11828 # on one-node clusters nodelist will be empty after the removal
11829 # if we proceed the backup would be removed because OpBackupQuery
11830 # substitutes an empty list with the full cluster node list.
11831 iname = self.instance.name
11832 if nodelist:
11833 feedback_fn("Removing old exports for instance %s" % iname)
11834 exportlist = self.rpc.call_export_list(nodelist)
11835 for node in exportlist:
11836 if exportlist[node].fail_msg:
11837 continue
11838 if iname in exportlist[node].payload:
11839 msg = self.rpc.call_export_remove(node, iname).fail_msg
11840 if msg:
11841 self.LogWarning("Could not remove older export for instance %s"
11842 " on node %s: %s", iname, node, msg)
11844 def Exec(self, feedback_fn):
11845 """Export an instance to an image in the cluster.
11848 assert self.op.mode in constants.EXPORT_MODES
11850 instance = self.instance
11851 src_node = instance.primary_node
11853 if self.op.shutdown:
11854 # shutdown the instance, but not the disks
11855 feedback_fn("Shutting down instance %s" % instance.name)
11856 result = self.rpc.call_instance_shutdown(src_node, instance,
11857 self.op.shutdown_timeout)
11858 # TODO: Maybe ignore failures if ignore_remove_failures is set
11859 result.Raise("Could not shutdown instance %s on"
11860 " node %s" % (instance.name, src_node))
11862 # set the disks ID correctly since call_instance_start needs the
11863 # correct drbd minor to create the symlinks
11864 for disk in instance.disks:
11865 self.cfg.SetDiskID(disk, src_node)
11867 activate_disks = (not instance.admin_up)
11869 if activate_disks:
11870 # Activate the instance disks if we're exporting a stopped instance
11871 feedback_fn("Activating disks for %s" % instance.name)
11872 _StartInstanceDisks(self, instance, None)
11874 try:
11875 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11876 instance)
11878 helper.CreateSnapshots()
11879 try:
11880 if (self.op.shutdown and instance.admin_up and
11881 not self.op.remove_instance):
11882 assert not activate_disks
11883 feedback_fn("Starting instance %s" % instance.name)
11884 result = self.rpc.call_instance_start(src_node, instance,
11886 msg = result.fail_msg
11887 if msg:
11888 feedback_fn("Failed to start instance: %s" % msg)
11889 _ShutdownInstanceDisks(self, instance)
11890 raise errors.OpExecError("Could not start instance: %s" % msg)
11892 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11893 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11894 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11895 connect_timeout = constants.RIE_CONNECT_TIMEOUT
11896 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
11898 (key_name, _, _) = self.x509_key_name
11900 dest_ca_pem = \
11901 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
11902 self.dest_x509_ca)
11904 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
11905 key_name, dest_ca_pem,
11906 timeouts)
11908 finally:
11909 helper.Cleanup()
11910 # Check for backwards compatibility
11911 assert len(dresults) == len(instance.disks)
11912 assert compat.all(isinstance(i, bool) for i in dresults), \
11913 "Not all results are boolean: %r" % dresults
11917 feedback_fn("Deactivating disks for %s" % instance.name)
11918 _ShutdownInstanceDisks(self, instance)
11920 if not (compat.all(dresults) and fin_resu):
11921 failures = []
11922 if not fin_resu:
11923 failures.append("export finalization")
11924 if not compat.all(dresults):
11925 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
11926 if not dsk)
11927 failures.append("disk export: disk(s) %s" % fdsk)
11929 raise errors.OpExecError("Export failed, errors in %s" %
11930 utils.CommaJoin(failures))
11932 # At this point, the export was successful, we can cleanup/finish
11934 # Remove instance if requested
11935 if self.op.remove_instance:
11936 feedback_fn("Removing instance %s" % instance.name)
11937 _RemoveInstance(self, feedback_fn, instance,
11938 self.op.ignore_remove_failures)
11940 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11941 self._CleanupExports(feedback_fn)
11943 return fin_resu, dresults
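# Editorial sketch (not part of the original module): Exec() above ends by
# returning the export status as a pair; a hypothetical consumer of that
# opcode result could interpret it roughly like this:
#
#   (fin_resu, dresults) = op_result
#   # fin_resu: overall finalization status of the export
#   # dresults: one boolean per instance disk, True if that disk was exported
#   bad_disks = [idx for (idx, ok) in enumerate(dresults) if not ok]
#   if not fin_resu or bad_disks:
#       pass  # handle the partial failure, mirroring the checks in Exec()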
11946 class LUBackupRemove(NoHooksLU):
11947 """Remove exports related to the named instance.
11952 def ExpandNames(self):
11953 self.needed_locks = {}
11954 # We need all nodes to be locked in order for RemoveExport to work, but we
11955 # don't need to lock the instance itself, as nothing will happen to it (and
11956 # we can remove exports also for a removed instance)
11957 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11959 def Exec(self, feedback_fn):
11960 """Remove any export.
11963 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
11964 # If the instance was not found we'll try with the name that was passed in.
11965 # This will only work if it was an FQDN, though.
11966 fqdn_warn = False
11967 if not instance_name:
11968 fqdn_warn = True
11969 instance_name = self.op.instance_name
11971 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
11972 exportlist = self.rpc.call_export_list(locked_nodes)
11973 found = False
11974 for node in exportlist:
11975 msg = exportlist[node].fail_msg
11976 if msg:
11977 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
11978 continue
11979 if instance_name in exportlist[node].payload:
11980 found = True
11981 result = self.rpc.call_export_remove(node, instance_name)
11982 msg = result.fail_msg
11983 if msg:
11984 logging.error("Could not remove export for instance %s"
11985 " on node %s: %s", instance_name, node, msg)
11987 if fqdn_warn and not found:
11988 feedback_fn("Export not found. If trying to remove an export belonging"
11989 " to a deleted instance please use its Fully Qualified"
11993 class LUGroupAdd(LogicalUnit):
11994 """Logical unit for creating node groups.
11997 HPATH = "group-add"
11998 HTYPE = constants.HTYPE_GROUP
12001 def ExpandNames(self):
12002 # We need the new group's UUID here so that we can create and acquire the
12003 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12004 # that it should not check whether the UUID exists in the configuration.
12005 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12006 self.needed_locks = {}
12007 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12009 def CheckPrereq(self):
12010 """Check prerequisites.
12012 This checks that the given group name is not an existing node group
12013 already.
12015 """
12016 try:
12017 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12018 except errors.OpPrereqError:
12019 pass
12020 else:
12021 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12022 " node group (UUID: %s)" %
12023 (self.op.group_name, existing_uuid),
12024 errors.ECODE_EXISTS)
12026 if self.op.ndparams:
12027 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12029 def BuildHooksEnv(self):
12030 """Build hooks env.
12034 "GROUP_NAME": self.op.group_name,
12037 def BuildHooksNodes(self):
12038 """Build hooks nodes.
12041 mn = self.cfg.GetMasterNode()
12042 return ([mn], [mn])
12044 def Exec(self, feedback_fn):
12045 """Add the node group to the cluster.
12048 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12049 uuid=self.group_uuid,
12050 alloc_policy=self.op.alloc_policy,
12051 ndparams=self.op.ndparams)
12053 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12054 del self.remove_locks[locking.LEVEL_NODEGROUP]
12057 class LUGroupAssignNodes(NoHooksLU):
12058 """Logical unit for assigning nodes to groups.
12063 def ExpandNames(self):
12064 # These raise errors.OpPrereqError on their own:
12065 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12066 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12068 # We want to lock all the affected nodes and groups. We have readily
12069 # available the list of nodes, and the *destination* group. To gather the
12070 # list of "source" groups, we need to fetch node information later on.
12071 self.needed_locks = {
12072 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12073 locking.LEVEL_NODE: self.op.nodes,
12074 }
12076 def DeclareLocks(self, level):
12077 if level == locking.LEVEL_NODEGROUP:
12078 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12080 # Try to get all affected nodes' groups without having the group or node
12081 # lock yet. Needs verification later in the code flow.
12082 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12084 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12086 def CheckPrereq(self):
12087 """Check prerequisites.
12090 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12091 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12092 frozenset(self.op.nodes))
12094 expected_locks = (set([self.group_uuid]) |
12095 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12096 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12097 if actual_locks != expected_locks:
12098 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12099 " current groups are '%s', used to be '%s'" %
12100 (utils.CommaJoin(expected_locks),
12101 utils.CommaJoin(actual_locks)))
12103 self.node_data = self.cfg.GetAllNodesInfo()
12104 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12105 instance_data = self.cfg.GetAllInstancesInfo()
12107 if self.group is None:
12108 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12109 (self.op.group_name, self.group_uuid))
12111 (new_splits, previous_splits) = \
12112 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12113 for node in self.op.nodes],
12114 self.node_data, instance_data)
12116 if new_splits:
12117 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12119 if not self.op.force:
12120 raise errors.OpExecError("The following instances get split by this"
12121 " change and --force was not given: %s" %
12122 fmt_new_splits)
12123 else:
12124 self.LogWarning("This operation will split the following instances: %s",
12125 fmt_new_splits)
12127 if previous_splits:
12128 self.LogWarning("In addition, these already-split instances continue"
12129 " to be split across groups: %s",
12130 utils.CommaJoin(utils.NiceSort(previous_splits)))
12132 def Exec(self, feedback_fn):
12133 """Assign nodes to a new group.
12136 mods = [(node_name, self.group_uuid) for node_name in self.op.nodes]
12138 self.cfg.AssignGroupNodes(mods)
12140 @staticmethod
12141 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12142 """Check for split instances after a node assignment.
12144 This method considers a series of node assignments as an atomic operation,
12145 and returns information about split instances after applying the set of
12148 In particular, it returns information about newly split instances, and
12149 instances that were already split, and remain so after the change.
12151 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12154 @type changes: list of (node_name, new_group_uuid) pairs.
12155 @param changes: list of node assignments to consider.
12156 @param node_data: a dict with data for all nodes
12157 @param instance_data: a dict with all instances to consider
12158 @rtype: a two-tuple
12159 @return: a list of instances that were previously okay and result split as a
12160 consequence of this change, and a list of instances that were previously
12161 split and this change does not fix.
12163 """
12164 changed_nodes = dict((node, group) for node, group in changes
12165 if node_data[node].group != group)
12167 all_split_instances = set()
12168 previously_split_instances = set()
12170 def InstanceNodes(instance):
12171 return [instance.primary_node] + list(instance.secondary_nodes)
12173 for inst in instance_data.values():
12174 if inst.disk_template not in constants.DTS_INT_MIRROR:
12175 continue
12177 instance_nodes = InstanceNodes(inst)
12179 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12180 previously_split_instances.add(inst.name)
12182 if len(set(changed_nodes.get(node, node_data[node].group)
12183 for node in instance_nodes)) > 1:
12184 all_split_instances.add(inst.name)
12186 return (list(all_split_instances - previously_split_instances),
12187 list(previously_split_instances & all_split_instances))
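# Editorial sketch (not part of the original module): a small worked example
# of the split detection above, using made-up node, group and instance names.
# Suppose nodes n1/n2 are in group g1, node n3 is in group g2, and a DRBD
# instance "inst1" runs on (n1, n2):
#
#   changes = [("n2", "g2")]   # move n2 into g2
#   # Before the change inst1 spans only g1, so it is not previously split;
#   # after applying `changes` its nodes map to {g1, g2}, so it is reported
#   # in the first list ("newly split"), e.g. (["inst1"], []).
#
# An instance already spanning two groups before the change would instead be
# reported in the second list, and only DTS_INT_MIRROR templates are checked.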
12190 class _GroupQuery(_QueryBase):
12191 FIELDS = query.GROUP_FIELDS
12193 def ExpandNames(self, lu):
12194 lu.needed_locks = {}
12196 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12197 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12199 if not self.names:
12200 self.wanted = [name_to_uuid[name]
12201 for name in utils.NiceSort(name_to_uuid.keys())]
12202 else:
12203 # Accept names to be either names or UUIDs.
12204 missing = []
12205 self.wanted = []
12206 all_uuid = frozenset(self._all_groups.keys())
12208 for name in self.names:
12209 if name in all_uuid:
12210 self.wanted.append(name)
12211 elif name in name_to_uuid:
12212 self.wanted.append(name_to_uuid[name])
12213 else:
12214 missing.append(name)
12216 if missing:
12217 raise errors.OpPrereqError("Some groups do not exist: %s" %
12218 utils.CommaJoin(missing),
12219 errors.ECODE_NOENT)
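# Editorial sketch (not part of the original module): how the name/UUID
# resolution above behaves, with purely hypothetical values.  With
# name_to_uuid == {"group1": "uuid-1", "group2": "uuid-2"}:
#
#   self.names = []                    ->  self.wanted == ["uuid-1", "uuid-2"]
#   self.names = ["group2", "uuid-1"]  ->  self.wanted == ["uuid-2", "uuid-1"]
#   self.names = ["group2", "nosuch"]  ->  OpPrereqError("Some groups do not
#                                          exist: nosuch")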
12221 def DeclareLocks(self, lu, level):
12222 pass
12224 def _GetQueryData(self, lu):
12225 """Computes the list of node groups and their attributes.
12228 do_nodes = query.GQ_NODE in self.requested_data
12229 do_instances = query.GQ_INST in self.requested_data
12231 group_to_nodes = None
12232 group_to_instances = None
12234 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12235 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12236 # latter GetAllInstancesInfo() is not enough, for we have to go through
12237 # instance->node. Hence, we will need to process nodes even if we only need
12238 # instance information.
12239 if do_nodes or do_instances:
12240 all_nodes = lu.cfg.GetAllNodesInfo()
12241 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12242 node_to_group = {}
12244 for node in all_nodes.values():
12245 if node.group in group_to_nodes:
12246 group_to_nodes[node.group].append(node.name)
12247 node_to_group[node.name] = node.group
12249 if do_instances:
12250 all_instances = lu.cfg.GetAllInstancesInfo()
12251 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12253 for instance in all_instances.values():
12254 node = instance.primary_node
12255 if node in node_to_group:
12256 group_to_instances[node_to_group[node]].append(instance.name)
12258 if not do_nodes:
12259 # Do not pass on node information if it was not requested.
12260 group_to_nodes = None
12262 return query.GroupQueryData([self._all_groups[uuid]
12263 for uuid in self.wanted],
12264 group_to_nodes, group_to_instances)
12267 class LUGroupQuery(NoHooksLU):
12268 """Logical unit for querying node groups.
12273 def CheckArguments(self):
12274 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12275 self.op.output_fields, False)
12277 def ExpandNames(self):
12278 self.gq.ExpandNames(self)
12280 def DeclareLocks(self, level):
12281 self.gq.DeclareLocks(self, level)
12283 def Exec(self, feedback_fn):
12284 return self.gq.OldStyleQuery(self)
12287 class LUGroupSetParams(LogicalUnit):
12288 """Modifies the parameters of a node group.
12291 HPATH = "group-modify"
12292 HTYPE = constants.HTYPE_GROUP
12295 def CheckArguments(self):
12296 all_changes = [
12297 self.op.ndparams,
12298 self.op.alloc_policy,
12299 ]
12301 if all_changes.count(None) == len(all_changes):
12302 raise errors.OpPrereqError("Please pass at least one modification",
12303 errors.ECODE_INVAL)
12305 def ExpandNames(self):
12306 # This raises errors.OpPrereqError on its own:
12307 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12309 self.needed_locks = {
12310 locking.LEVEL_NODEGROUP: [self.group_uuid],
12311 }
12313 def CheckPrereq(self):
12314 """Check prerequisites.
12317 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12319 if self.group is None:
12320 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12321 (self.op.group_name, self.group_uuid))
12323 if self.op.ndparams:
12324 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12325 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12326 self.new_ndparams = new_ndparams
12328 def BuildHooksEnv(self):
12329 """Build hooks env.
12333 "GROUP_NAME": self.op.group_name,
12334 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12337 def BuildHooksNodes(self):
12338 """Build hooks nodes.
12341 mn = self.cfg.GetMasterNode()
12342 return ([mn], [mn])
12344 def Exec(self, feedback_fn):
12345 """Modifies the node group.
12350 if self.op.ndparams:
12351 self.group.ndparams = self.new_ndparams
12352 result.append(("ndparams", str(self.group.ndparams)))
12354 if self.op.alloc_policy:
12355 self.group.alloc_policy = self.op.alloc_policy
12357 self.cfg.Update(self.group, feedback_fn)
12359 return result
12361 class LUGroupRemove(LogicalUnit):
12362 HPATH = "group-remove"
12363 HTYPE = constants.HTYPE_GROUP
12366 def ExpandNames(self):
12367 # This will raise errors.OpPrereqError on its own:
12368 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12369 self.needed_locks = {
12370 locking.LEVEL_NODEGROUP: [self.group_uuid],
12371 }
12373 def CheckPrereq(self):
12374 """Check prerequisites.
12376 This checks that the given group name exists as a node group, that it is
12377 empty (i.e., contains no nodes), and that it is not the last group of the
12378 cluster.
12380 """
12381 # Verify that the group is empty.
12382 group_nodes = [node.name
12383 for node in self.cfg.GetAllNodesInfo().values()
12384 if node.group == self.group_uuid]
12386 if group_nodes:
12387 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12388 " nodes: %s" %
12389 (self.op.group_name,
12390 utils.CommaJoin(utils.NiceSort(group_nodes))),
12391 errors.ECODE_STATE)
12393 # Verify the cluster would not be left group-less.
12394 if len(self.cfg.GetNodeGroupList()) == 1:
12395 raise errors.OpPrereqError("Group '%s' is the only group,"
12396 " cannot be removed" %
12397 self.op.group_name,
12398 errors.ECODE_STATE)
12400 def BuildHooksEnv(self):
12401 """Build hooks env.
12405 "GROUP_NAME": self.op.group_name,
12408 def BuildHooksNodes(self):
12409 """Build hooks nodes.
12412 mn = self.cfg.GetMasterNode()
12413 return ([mn], [mn])
12415 def Exec(self, feedback_fn):
12416 """Remove the node group.
12420 self.cfg.RemoveNodeGroup(self.group_uuid)
12421 except errors.ConfigurationError:
12422 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12423 (self.op.group_name, self.group_uuid))
12425 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12428 class LUGroupRename(LogicalUnit):
12429 HPATH = "group-rename"
12430 HTYPE = constants.HTYPE_GROUP
12433 def ExpandNames(self):
12434 # This raises errors.OpPrereqError on its own:
12435 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12437 self.needed_locks = {
12438 locking.LEVEL_NODEGROUP: [self.group_uuid],
12439 }
12441 def CheckPrereq(self):
12442 """Check prerequisites.
12444 Ensures requested new name is not yet used.
12446 """
12447 try:
12448 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12449 except errors.OpPrereqError:
12450 pass
12451 else:
12452 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12453 " node group (UUID: %s)" %
12454 (self.op.new_name, new_name_uuid),
12455 errors.ECODE_EXISTS)
12457 def BuildHooksEnv(self):
12458 """Build hooks env.
12462 "OLD_NAME": self.op.group_name,
12463 "NEW_NAME": self.op.new_name,
12466 def BuildHooksNodes(self):
12467 """Build hooks nodes.
12470 mn = self.cfg.GetMasterNode()
12472 all_nodes = self.cfg.GetAllNodesInfo()
12473 all_nodes.pop(mn, None)
12476 run_nodes.extend(node.name for node in all_nodes.values()
12477 if node.group == self.group_uuid)
12479 return (run_nodes, run_nodes)
12481 def Exec(self, feedback_fn):
12482 """Rename the node group.
12485 group = self.cfg.GetNodeGroup(self.group_uuid)
12488 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12489 (self.op.group_name, self.group_uuid))
12491 group.name = self.op.new_name
12492 self.cfg.Update(group, feedback_fn)
12494 return self.op.new_name
12497 class LUGroupEvacuate(LogicalUnit):
12498 HPATH = "group-evacuate"
12499 HTYPE = constants.HTYPE_GROUP
12502 def ExpandNames(self):
12503 # This raises errors.OpPrereqError on its own:
12504 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12506 if self.op.target_groups:
12507 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12508 self.op.target_groups)
12509 else:
12510 self.req_target_uuids = []
12512 if self.group_uuid in self.req_target_uuids:
12513 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12514 " as a target group (targets are %s)" %
12516 utils.CommaJoin(self.req_target_uuids)),
12517 errors.ECODE_INVAL)
12519 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12521 self.share_locks = _ShareAll()
12522 self.needed_locks = {
12523 locking.LEVEL_INSTANCE: [],
12524 locking.LEVEL_NODEGROUP: [],
12525 locking.LEVEL_NODE: [],
12526 }
12528 def DeclareLocks(self, level):
12529 if level == locking.LEVEL_INSTANCE:
12530 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12532 # Lock instances optimistically, needs verification once node and group
12533 # locks have been acquired
12534 self.needed_locks[locking.LEVEL_INSTANCE] = \
12535 self.cfg.GetNodeGroupInstances(self.group_uuid)
12537 elif level == locking.LEVEL_NODEGROUP:
12538 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12540 if self.req_target_uuids:
12541 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12543 # Lock all groups used by instances optimistically; this requires going
12544 # via the node before it's locked, requiring verification later on
12545 lock_groups.update(group_uuid
12546 for instance_name in
12547 self.owned_locks(locking.LEVEL_INSTANCE)
12548 for group_uuid in
12549 self.cfg.GetInstanceNodeGroups(instance_name))
12550 else:
12551 # No target groups, need to lock all of them
12552 lock_groups = locking.ALL_SET
12554 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12556 elif level == locking.LEVEL_NODE:
12557 # This will only lock the nodes in the group to be evacuated which
12558 # contain actual instances
12559 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12560 self._LockInstancesNodes()
12562 # Lock all nodes in group to be evacuated and target groups
12563 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12564 assert self.group_uuid in owned_groups
12565 member_nodes = [node_name
12566 for group in owned_groups
12567 for node_name in self.cfg.GetNodeGroup(group).members]
12568 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12570 def CheckPrereq(self):
12571 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12572 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12573 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12575 assert owned_groups.issuperset(self.req_target_uuids)
12576 assert self.group_uuid in owned_groups
12578 # Check if locked instances are still correct
12579 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12581 # Get instance information
12582 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12584 # Check if node groups for locked instances are still correct
12585 _CheckInstancesNodeGroups(self.cfg, self.instances,
12586 owned_groups, owned_nodes, self.group_uuid)
12588 if self.req_target_uuids:
12589 # User requested specific target groups
12590 self.target_uuids = self.req_target_uuids
12591 else:
12592 # All groups except the one to be evacuated are potential targets
12593 self.target_uuids = [group_uuid for group_uuid in owned_groups
12594 if group_uuid != self.group_uuid]
12596 if not self.target_uuids:
12597 raise errors.OpPrereqError("There are no possible target groups",
12598 errors.ECODE_INVAL)
12600 def BuildHooksEnv(self):
12601 """Build hooks env.
12605 "GROUP_NAME": self.op.group_name,
12606 "TARGET_GROUPS": " ".join(self.target_uuids),
12609 def BuildHooksNodes(self):
12610 """Build hooks nodes.
12613 mn = self.cfg.GetMasterNode()
12615 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12617 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12619 return (run_nodes, run_nodes)
12621 def Exec(self, feedback_fn):
12622 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12624 assert self.group_uuid not in self.target_uuids
12626 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12627 instances=instances, target_groups=self.target_uuids)
12629 ial.Run(self.op.iallocator)
12631 if not ial.success:
12632 raise errors.OpPrereqError("Can't compute group evacuation using"
12633 " iallocator '%s': %s" %
12634 (self.op.iallocator, ial.info),
12635 errors.ECODE_NORES)
12637 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12639 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12640 len(jobs), self.op.group_name)
12642 return ResultWithJobs(jobs)
12645 class TagsLU(NoHooksLU): # pylint: disable=W0223
12646 """Generic tags LU.
12648 This is an abstract class which is the parent of all the other tags LUs.
12651 def ExpandNames(self):
12652 self.group_uuid = None
12653 self.needed_locks = {}
12654 if self.op.kind == constants.TAG_NODE:
12655 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12656 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12657 elif self.op.kind == constants.TAG_INSTANCE:
12658 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12659 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12660 elif self.op.kind == constants.TAG_NODEGROUP:
12661 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12662 self.needed_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12663 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12664 # not possible to acquire the BGL based on opcode parameters)
12666 def CheckPrereq(self):
12667 """Check prerequisites.
12670 if self.op.kind == constants.TAG_CLUSTER:
12671 self.target = self.cfg.GetClusterInfo()
12672 elif self.op.kind == constants.TAG_NODE:
12673 self.target = self.cfg.GetNodeInfo(self.op.name)
12674 elif self.op.kind == constants.TAG_INSTANCE:
12675 self.target = self.cfg.GetInstanceInfo(self.op.name)
12676 elif self.op.kind == constants.TAG_NODEGROUP:
12677 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12678 else:
12679 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12680 str(self.op.kind), errors.ECODE_INVAL)
12683 class LUTagsGet(TagsLU):
12684 """Returns the tags of a given object.
12689 def ExpandNames(self):
12690 TagsLU.ExpandNames(self)
12692 # Share locks as this is only a read operation
12693 self.share_locks = _ShareAll()
12695 def Exec(self, feedback_fn):
12696 """Returns the tag list.
12699 return list(self.target.GetTags())
12702 class LUTagsSearch(NoHooksLU):
12703 """Searches the tags for a given pattern.
12708 def ExpandNames(self):
12709 self.needed_locks = {}
12711 def CheckPrereq(self):
12712 """Check prerequisites.
12714 This checks the pattern passed for validity by compiling it.
12716 """
12717 try:
12718 self.re = re.compile(self.op.pattern)
12719 except re.error, err:
12720 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12721 (self.op.pattern, err), errors.ECODE_INVAL)
12723 def Exec(self, feedback_fn):
12724 """Returns the tag list.
12728 tgts = [("/cluster", cfg.GetClusterInfo())]
12729 ilist = cfg.GetAllInstancesInfo().values()
12730 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12731 nlist = cfg.GetAllNodesInfo().values()
12732 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12733 tgts.extend(("/nodegroup/%s" % n.name, n)
12734 for n in cfg.GetAllNodeGroupsInfo().values())
12735 results = []
12736 for path, target in tgts:
12737 for tag in target.GetTags():
12738 if self.re.search(tag):
12739 results.append((path, tag))
12741 return results
12743 class LUTagsSet(TagsLU):
12744 """Sets a tag on a given object.
12749 def CheckPrereq(self):
12750 """Check prerequisites.
12752 This checks the type and length of the tag name and value.
12755 TagsLU.CheckPrereq(self)
12756 for tag in self.op.tags:
12757 objects.TaggableObject.ValidateTag(tag)
12759 def Exec(self, feedback_fn):
12760 """Sets the tag.
12762 """
12763 try:
12764 for tag in self.op.tags:
12765 self.target.AddTag(tag)
12766 except errors.TagError, err:
12767 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12768 self.cfg.Update(self.target, feedback_fn)
12771 class LUTagsDel(TagsLU):
12772 """Delete a list of tags from a given object.
12777 def CheckPrereq(self):
12778 """Check prerequisites.
12780 This checks that we have the given tag.
12783 TagsLU.CheckPrereq(self)
12784 for tag in self.op.tags:
12785 objects.TaggableObject.ValidateTag(tag)
12786 del_tags = frozenset(self.op.tags)
12787 cur_tags = self.target.GetTags()
12789 diff_tags = del_tags - cur_tags
12790 if diff_tags:
12791 diff_names = ("'%s'" % i for i in sorted(diff_tags))
12792 raise errors.OpPrereqError("Tag(s) %s not found" %
12793 (utils.CommaJoin(diff_names), ),
12794 errors.ECODE_NOENT)
12796 def Exec(self, feedback_fn):
12797 """Remove the tag from the object.
12800 for tag in self.op.tags:
12801 self.target.RemoveTag(tag)
12802 self.cfg.Update(self.target, feedback_fn)
12805 class LUTestDelay(NoHooksLU):
12806 """Sleep for a specified amount of time.
12808 This LU sleeps on the master and/or nodes for a specified amount of
12809 time.
12811 """
12812 REQ_BGL = False
12814 def ExpandNames(self):
12815 """Expand names and set required locks.
12817 This expands the node list, if any.
12820 self.needed_locks = {}
12821 if self.op.on_nodes:
12822 # _GetWantedNodes can be used here, but is not always appropriate to use
12823 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
12824 # more information.
12825 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
12826 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
12828 def _TestDelay(self):
12829 """Do the actual sleep.
12832 if self.op.on_master:
12833 if not utils.TestDelay(self.op.duration):
12834 raise errors.OpExecError("Error during master delay test")
12835 if self.op.on_nodes:
12836 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
12837 for node, node_result in result.items():
12838 node_result.Raise("Failure during rpc call to node %s" % node)
12840 def Exec(self, feedback_fn):
12841 """Execute the test delay opcode, with the wanted repetitions.
12844 if self.op.repeat == 0:
12845 self._TestDelay()
12846 else:
12847 top_value = self.op.repeat - 1
12848 for i in range(self.op.repeat):
12849 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
12850 self._TestDelay()
12853 class LUTestJqueue(NoHooksLU):
12854 """Utility LU to test some aspects of the job queue.
12859 # Must be lower than default timeout for WaitForJobChange to see whether it
12860 # notices changed jobs
12861 _CLIENT_CONNECT_TIMEOUT = 20.0
12862 _CLIENT_CONFIRM_TIMEOUT = 60.0
12864 @classmethod
12865 def _NotifyUsingSocket(cls, cb, errcls):
12866 """Opens a Unix socket and waits for another program to connect.
12869 @param cb: Callback to send socket name to client
12870 @type errcls: class
12871 @param errcls: Exception class to use for errors
12873 """
12874 # Using a temporary directory as there's no easy way to create temporary
12875 # sockets without writing a custom loop around tempfile.mktemp and
12876 # socket.bind
12877 tmpdir = tempfile.mkdtemp()
12878 try:
12879 tmpsock = utils.PathJoin(tmpdir, "sock")
12881 logging.debug("Creating temporary socket at %s", tmpsock)
12882 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
12883 try:
12884 sock.bind(tmpsock)
12885 sock.listen(1)
12887 # Send details to client
12888 cb(tmpsock)
12890 # Wait for client to connect before continuing
12891 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
12892 try:
12893 (conn, _) = sock.accept()
12894 except socket.error, err:
12895 raise errcls("Client didn't connect in time (%s)" % err)
12896 finally:
12897 sock.close()
12898 finally:
12899 # Remove as soon as client is connected
12900 shutil.rmtree(tmpdir)
12902 # Wait for client to close
12903 try:
12904 try:
12905 # pylint: disable=E1101
12906 # Instance of '_socketobject' has no ... member
12907 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
12908 conn.recv(1)
12909 except socket.error, err:
12910 raise errcls("Client failed to confirm notification (%s)" % err)
12911 finally:
12912 conn.close()
12914 def _SendNotification(self, test, arg, sockname):
12915 """Sends a notification to the client.
12918 @param test: Test name
12919 @param arg: Test argument (depends on test)
12920 @type sockname: string
12921 @param sockname: Socket path
12924 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
12926 def _Notify(self, prereq, test, arg):
12927 """Notifies the client of a test.
12930 @param prereq: Whether this is a prereq-phase test
12932 @param test: Test name
12933 @param arg: Test argument (depends on test)
12935 """
12936 if prereq:
12937 errcls = errors.OpPrereqError
12938 else:
12939 errcls = errors.OpExecError
12941 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
12942 test, arg),
12943 errcls)
12945 def CheckArguments(self):
12946 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
12947 self.expandnames_calls = 0
12949 def ExpandNames(self):
12950 checkargs_calls = getattr(self, "checkargs_calls", 0)
12951 if checkargs_calls < 1:
12952 raise errors.ProgrammerError("CheckArguments was not called")
12954 self.expandnames_calls += 1
12956 if self.op.notify_waitlock:
12957 self._Notify(True, constants.JQT_EXPANDNAMES, None)
12959 self.LogInfo("Expanding names")
12961 # Get lock on master node (just to get a lock, not for a particular reason)
12962 self.needed_locks = {
12963 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
12964 }
12966 def Exec(self, feedback_fn):
12967 if self.expandnames_calls < 1:
12968 raise errors.ProgrammerError("ExpandNames was not called")
12970 if self.op.notify_exec:
12971 self._Notify(False, constants.JQT_EXEC, None)
12973 self.LogInfo("Executing")
12975 if self.op.log_messages:
12976 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
12977 for idx, msg in enumerate(self.op.log_messages):
12978 self.LogInfo("Sending log message %s", idx + 1)
12979 feedback_fn(constants.JQT_MSGPREFIX + msg)
12980 # Report how many test messages have been sent
12981 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
12983 if self.op.fail:
12984 raise errors.OpExecError("Opcode failure was requested")
12986 return True
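# Illustrative sketch, not part of the original module: the shape of the log
# entry a test client waits for. The tuple layout follows _SendNotification
# above; the socket path is an invented placeholder.
#
#   (constants.ELOG_JQUEUE_TEST, ("/tmp/tmpXXXX/sock", constants.JQT_EXEC, None))
#
# A client watching the job can pick this up, connect to the named Unix
# socket within _CLIENT_CONNECT_TIMEOUT and then close the connection to
# confirm the notification.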
12989 class IAllocator(object):
12990 """IAllocator framework.
12992 An IAllocator instance has four sets of attributes:
12993 - cfg that is needed to query the cluster
12994 - input data (all members of the _KEYS class attribute are required)
12995 - four buffer attributes (in|out_data|text), that represent the
12996 input (to the external script) in text and data structure format,
12997 and the output from it, again in two formats
12998 - the result variables from the script (success, info, result) for easy usage
13002 # pylint: disable=R0902
13003 # lots of instance attributes
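# Illustrative sketch, not part of the original module: how a logical unit
# typically drives this class. The keyword arguments must match the keydata
# declared in _MODE_DATA for the chosen mode; the allocator name and node
# names below are invented.
#
#   ial = IAllocator(self.cfg, self.rpc,
#                    mode=constants.IALLOCATOR_MODE_RELOC,
#                    name="instance1.example.com",
#                    relocate_from=["node2.example.com"])
#   ial.Run("hail")  # runs the external script on the master node
#   if not ial.success:
#     raise errors.OpPrereqError("Allocation failed: %s" % ial.info,
#                                errors.ECODE_NORES)
#   new_nodes = ial.result  # already checked against the mode's result type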
13005 def __init__(self, cfg, rpc, mode, **kwargs):
13006 self.cfg = cfg
13007 self.rpc = rpc
13008 # init buffer variables
13009 self.in_text = self.out_text = self.in_data = self.out_data = None
13010 # init all input fields so that pylint is happy
13011 self.mode = mode
13012 self.memory = self.disks = self.disk_template = None
13013 self.os = self.tags = self.nics = self.vcpus = None
13014 self.hypervisor = None
13015 self.relocate_from = None
13017 self.instances = None
13018 self.evac_mode = None
13019 self.target_groups = []
13021 self.required_nodes = None
13022 # init result fields
13023 self.success = self.info = self.result = None
13025 try:
13026 (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
13027 except KeyError:
13028 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
13029 " IAllocator" % self.mode)
13031 keyset = [n for (n, _) in keydata]
13033 for key in kwargs:
13034 if key not in keyset:
13035 raise errors.ProgrammerError("Invalid input parameter '%s' to"
13036 " IAllocator" % key)
13037 setattr(self, key, kwargs[key])
13039 for key in keyset:
13040 if key not in kwargs:
13041 raise errors.ProgrammerError("Missing input parameter '%s' to"
13042 " IAllocator" % key)
13043 self._BuildInputData(compat.partial(fn, self), keydata)
13045 def _ComputeClusterData(self):
13046 """Compute the generic allocator input data.
13048 This is the data that is independent of the actual operation.
13051 cfg = self.cfg
13052 cluster_info = cfg.GetClusterInfo()
13053 # cluster data
13054 data = {
13055 "version": constants.IALLOCATOR_VERSION,
13056 "cluster_name": cfg.GetClusterName(),
13057 "cluster_tags": list(cluster_info.GetTags()),
13058 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
13059 # we don't have job IDs
13060 }
13061 ninfo = cfg.GetAllNodesInfo()
13062 iinfo = cfg.GetAllInstancesInfo().values()
13063 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
13066 node_list = [n.name for n in ninfo.values() if n.vm_capable]
13068 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
13069 hypervisor_name = self.hypervisor
13070 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
13071 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
13072 else:
13073 hypervisor_name = cluster_info.enabled_hypervisors[0]
13075 node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
13076 hypervisor_name)
13077 node_iinfo = \
13078 self.rpc.call_all_instances_info(node_list,
13079 cluster_info.enabled_hypervisors)
13081 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
13083 config_ndata = self._ComputeBasicNodeData(ninfo)
13084 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
13085 i_list, config_ndata)
13086 assert len(data["nodes"]) == len(ninfo), \
13087 "Incomplete node data computed"
13089 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
13091 self.in_data = data
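# Illustrative sketch, not part of the original module: rough shape of the
# in_data dictionary assembled above, with invented values. The authoritative
# structure is whatever this method and its helpers actually build.
#
#   {
#     "version": constants.IALLOCATOR_VERSION,
#     "cluster_name": "cluster.example.com",
#     "cluster_tags": [],
#     "enabled_hypervisors": ["xen-pvm"],
#     "nodegroups": {...},  # from _ComputeNodeGroupData
#     "nodes": {...},       # static config data plus dynamic RPC data
#     "instances": {...},   # from _ComputeInstanceData
#   }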
13094 def _ComputeNodeGroupData(cfg):
13095 """Compute node groups data.
13098 ng = dict((guuid, {
13099 "name": gdata.name,
13100 "alloc_policy": gdata.alloc_policy,
13102 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())
13107 def _ComputeBasicNodeData(node_cfg):
13108 """Compute global node data.
13111 @returns: a dict of name: (node dict, node config)
13114 # fill in static (config-based) values
13115 node_results = dict((ninfo.name, {
13116 "tags": list(ninfo.GetTags()),
13117 "primary_ip": ninfo.primary_ip,
13118 "secondary_ip": ninfo.secondary_ip,
13119 "offline": ninfo.offline,
13120 "drained": ninfo.drained,
13121 "master_candidate": ninfo.master_candidate,
13122 "group": ninfo.group,
13123 "master_capable": ninfo.master_capable,
13124 "vm_capable": ninfo.vm_capable,
13126 for ninfo in node_cfg.values())
13128 return node_results
13131 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
13132 node_results):
13133 """Compute global node data.
13135 @param node_results: the basic node structures as filled from the config
13138 # make a copy of the current dict
13139 node_results = dict(node_results)
13140 for nname, nresult in node_data.items():
13141 assert nname in node_results, "Missing basic data for node %s" % nname
13142 ninfo = node_cfg[nname]
13144 if not (ninfo.offline or ninfo.drained):
13145 nresult.Raise("Can't get data for node %s" % nname)
13146 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
13147 nname)
13148 remote_info = nresult.payload
13150 for attr in ["memory_total", "memory_free", "memory_dom0",
13151 "vg_size", "vg_free", "cpu_total"]:
13152 if attr not in remote_info:
13153 raise errors.OpExecError("Node '%s' didn't return attribute"
13154 " '%s'" % (nname, attr))
13155 if not isinstance(remote_info[attr], int):
13156 raise errors.OpExecError("Node '%s' returned invalid value"
13158 (nname, attr, remote_info[attr]))
13159 # compute memory used by primary instances
13160 i_p_mem = i_p_up_mem = 0
13161 for iinfo, beinfo in i_list:
13162 if iinfo.primary_node == nname:
13163 i_p_mem += beinfo[constants.BE_MEMORY]
13164 if iinfo.name not in node_iinfo[nname].payload:
13165 i_used_mem = 0
13166 else:
13167 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
13168 i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
13169 remote_info["memory_free"] -= max(0, i_mem_diff)
13171 if iinfo.admin_up:
13172 i_p_up_mem += beinfo[constants.BE_MEMORY]
13174 # compute memory used by instances
13175 pnr_dyn = {
13176 "total_memory": remote_info["memory_total"],
13177 "reserved_memory": remote_info["memory_dom0"],
13178 "free_memory": remote_info["memory_free"],
13179 "total_disk": remote_info["vg_size"],
13180 "free_disk": remote_info["vg_free"],
13181 "total_cpus": remote_info["cpu_total"],
13182 "i_pri_memory": i_p_mem,
13183 "i_pri_up_memory": i_p_up_mem,
13185 pnr_dyn.update(node_results[nname])
13186 node_results[nname] = pnr_dyn
13188 return node_results
13191 def _ComputeInstanceData(cluster_info, i_list):
13192 """Compute global instance data.
13195 instance_data = {}
13196 for iinfo, beinfo in i_list:
13197 nic_data = []
13198 for nic in iinfo.nics:
13199 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
13200 nic_dict = {
13201 "mac": nic.mac,
13202 "ip": nic.ip,
13203 "mode": filled_params[constants.NIC_MODE],
13204 "link": filled_params[constants.NIC_LINK],
13206 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
13207 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
13208 nic_data.append(nic_dict)
13210 "tags": list(iinfo.GetTags()),
13211 "admin_up": iinfo.admin_up,
13212 "vcpus": beinfo[constants.BE_VCPUS],
13213 "memory": beinfo[constants.BE_MEMORY],
13215 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
13217 "disks": [{constants.IDISK_SIZE: dsk.size,
13218 constants.IDISK_MODE: dsk.mode}
13219 for dsk in iinfo.disks],
13220 "disk_template": iinfo.disk_template,
13221 "hypervisor": iinfo.hypervisor,
13223 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
13225 instance_data[iinfo.name] = pir
13227 return instance_data
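# Illustrative sketch, not part of the original module: a single entry of the
# instance_data dict returned above, with invented values.
#
#   instance_data["web1.example.com"] = {
#     "tags": [], "admin_up": True, "vcpus": 2, "memory": 1024,
#     "nodes": ["node1.example.com", "node2.example.com"],
#     "nics": [{"mode": "bridged", "link": "xen-br0", "bridge": "xen-br0"}],
#     "disks": [{constants.IDISK_SIZE: 10240, constants.IDISK_MODE: "rw"}],
#     "disk_template": "drbd", "hypervisor": "xen-pvm",
#     "disk_space_total": 10368,  # invented; filled in via _ComputeDiskSize
#   }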
13229 def _AddNewInstance(self):
13230 """Add new instance data to allocator structure.
13232 This in combination with _AllocatorGetClusterData will create the
13233 correct structure needed as input for the allocator.
13235 The checks for the completeness of the opcode must have already been
13236 done.
13239 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
13241 if self.disk_template in constants.DTS_INT_MIRROR:
13242 self.required_nodes = 2
13243 else:
13244 self.required_nodes = 1
13246 request = {
13247 "name": self.name,
13248 "disk_template": self.disk_template,
13251 "vcpus": self.vcpus,
13252 "memory": self.memory,
13253 "disks": self.disks,
13254 "disk_space_total": disk_space,
13256 "required_nodes": self.required_nodes,
13257 "hypervisor": self.hypervisor,
13262 def _AddRelocateInstance(self):
13263 """Add relocate instance data to allocator structure.
13265 This in combination with _IAllocatorGetClusterData will create the
13266 correct structure needed as input for the allocator.
13268 The checks for the completeness of the opcode must have already been
13269 done.
13272 instance = self.cfg.GetInstanceInfo(self.name)
13273 if instance is None:
13274 raise errors.ProgrammerError("Unknown instance '%s' passed to"
13275 " IAllocator" % self.name)
13277 if instance.disk_template not in constants.DTS_MIRRORED:
13278 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
13279 errors.ECODE_INVAL)
13281 if instance.disk_template in constants.DTS_INT_MIRROR and \
13282 len(instance.secondary_nodes) != 1:
13283 raise errors.OpPrereqError("Instance has not exactly one secondary node",
13284 errors.ECODE_STATE)
13286 self.required_nodes = 1
13287 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
13288 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
13292 "disk_space_total": disk_space,
13293 "required_nodes": self.required_nodes,
13294 "relocate_from": self.relocate_from,
13298 def _AddNodeEvacuate(self):
13299 """Get data for node-evacuate requests.
13303 "instances": self.instances,
13304 "evac_mode": self.evac_mode,
13307 def _AddChangeGroup(self):
13308 """Get data for node-evacuate requests.
13312 "instances": self.instances,
13313 "target_groups": self.target_groups,
13316 def _BuildInputData(self, fn, keydata):
13317 """Build input data structures.
13320 self._ComputeClusterData()
13323 request["type"] = self.mode
13324 for keyname, keytype in keydata:
13325 if keyname not in request:
13326 raise errors.ProgrammerError("Request parameter %s is missing" %
13328 val = request[keyname]
13329 if not keytype(val):
13330 raise errors.ProgrammerError("Request parameter %s doesn't pass"
13331 " validation, value %s, expected"
13332 " type %s" % (keyname, val, keytype))
13333 self.in_data["request"] = request
13335 self.in_text = serializer.Dump(self.in_data)
13337 _STRING_LIST = ht.TListOf(ht.TString)
13338 _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
13339 # pylint: disable=E1101
13340 # Class '...' has no 'OP_ID' member
13341 "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
13342 opcodes.OpInstanceMigrate.OP_ID,
13343 opcodes.OpInstanceReplaceDisks.OP_ID])
13344 })))
13346 _NEVAC_MOVED = \
13347 ht.TListOf(ht.TAnd(ht.TIsLength(3),
13348 ht.TItems([ht.TNonEmptyString,
13349 ht.TNonEmptyString,
13350 ht.TListOf(ht.TNonEmptyString),
13351 ])))
13352 _NEVAC_FAILED = \
13353 ht.TListOf(ht.TAnd(ht.TIsLength(2),
13354 ht.TItems([ht.TNonEmptyString,
13355 ht.TMaybeString,
13356 ])))
13357 _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
13358 ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
13360 _MODE_DATA = {
13361 constants.IALLOCATOR_MODE_ALLOC:
13362 (_AddNewInstance, [
13364 ("name", ht.TString),
13365 ("memory", ht.TInt),
13366 ("disks", ht.TListOf(ht.TDict)),
13367 ("disk_template", ht.TString),
13368 ("os", ht.TString),
13369 ("tags", _STRING_LIST),
13370 ("nics", ht.TListOf(ht.TDict)),
13371 ("vcpus", ht.TInt),
13372 ("hypervisor", ht.TString),
13374 constants.IALLOCATOR_MODE_RELOC:
13375 (_AddRelocateInstance,
13376 [("name", ht.TString), ("relocate_from", _STRING_LIST)],
13378 constants.IALLOCATOR_MODE_NODE_EVAC:
13379 (_AddNodeEvacuate, [
13380 ("instances", _STRING_LIST),
13381 ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
13383 constants.IALLOCATOR_MODE_CHG_GROUP:
13384 (_AddChangeGroup, [
13385 ("instances", _STRING_LIST),
13386 ("target_groups", _STRING_LIST),
13390 def Run(self, name, validate=True, call_fn=None):
13391 """Run an instance allocator and return the results.
13394 if call_fn is None:
13395 call_fn = self.rpc.call_iallocator_runner
13397 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
13398 result.Raise("Failure while running the iallocator script")
13400 self.out_text = result.payload
13401 if validate:
13402 self._ValidateResult()
13404 def _ValidateResult(self):
13405 """Process the allocator results.
13407 This will process and, if successful, save the result in
13408 self.out_data and the other parameters.
13411 try:
13412 rdict = serializer.Load(self.out_text)
13413 except Exception, err:
13414 raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
13416 if not isinstance(rdict, dict):
13417 raise errors.OpExecError("Can't parse iallocator results: not a dict")
13419 # TODO: remove backwards compatibility in later versions
13420 if "nodes" in rdict and "result" not in rdict:
13421 rdict["result"] = rdict["nodes"]
13424 for key in "success", "info", "result":
13425 if key not in rdict:
13426 raise errors.OpExecError("Can't parse iallocator results:"
13427 " missing key '%s'" % key)
13428 setattr(self, key, rdict[key])
13430 if not self._result_check(self.result):
13431 raise errors.OpExecError("Iallocator returned invalid result,"
13432 " expected %s, got %s" %
13433 (self._result_check, self.result),
13434 errors.ECODE_INVAL)
13436 if self.mode == constants.IALLOCATOR_MODE_RELOC:
13437 assert self.relocate_from is not None
13438 assert self.required_nodes == 1
13440 node2group = dict((name, ndata["group"])
13441 for (name, ndata) in self.in_data["nodes"].items())
13443 fn = compat.partial(self._NodesToGroups, node2group,
13444 self.in_data["nodegroups"])
13446 instance = self.cfg.GetInstanceInfo(self.name)
13447 request_groups = fn(self.relocate_from + [instance.primary_node])
13448 result_groups = fn(rdict["result"] + [instance.primary_node])
13450 if self.success and not set(result_groups).issubset(request_groups):
13451 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
13452 " differ from original groups (%s)" %
13453 (utils.CommaJoin(result_groups),
13454 utils.CommaJoin(request_groups)))
13456 elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13457 assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES
13459 self.out_data = rdict
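# Illustrative sketch, not part of the original module: the minimal reply an
# allocator script must produce for _ValidateResult to accept it. Values are
# invented; "result" must additionally satisfy self._result_check for the
# current mode.
#
#   {
#     "success": true,
#     "info": "allocation successful",
#     "result": ["node1.example.com", "node2.example.com"]
#   }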
13462 def _NodesToGroups(node2group, groups, nodes):
13463 """Returns a list of unique group names for a list of nodes.
13465 @type node2group: dict
13466 @param node2group: Map from node name to group UUID
13467 @type groups: dict
13468 @param groups: Group information
13469 @type nodes: list
13470 @param nodes: Node names
13473 result = set()
13475 for node in nodes:
13476 try:
13477 group_uuid = node2group[node]
13478 except KeyError:
13479 # Ignore unknown node
13480 pass
13481 else:
13482 try:
13483 group = groups[group_uuid]
13484 except KeyError:
13485 # Can't find group, let's use UUID
13486 group_name = group_uuid
13487 else:
13488 group_name = group["name"]
13490 result.add(group_name)
13492 return sorted(result)
13495 class LUTestAllocator(NoHooksLU):
13496 """Run allocator tests.
13498 This LU runs the allocator tests.
13501 def CheckPrereq(self):
13502 """Check prerequisites.
13504 This checks the opcode parameters depending on the direction and mode of the test.
13507 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13508 for attr in ["memory", "disks", "disk_template",
13509 "os", "tags", "nics", "vcpus"]:
13510 if not hasattr(self.op, attr):
13511 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
13512 attr, errors.ECODE_INVAL)
13513 iname = self.cfg.ExpandInstanceName(self.op.name)
13514 if iname is not None:
13515 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
13516 iname, errors.ECODE_EXISTS)
13517 if not isinstance(self.op.nics, list):
13518 raise errors.OpPrereqError("Invalid parameter 'nics'",
13519 errors.ECODE_INVAL)
13520 if not isinstance(self.op.disks, list):
13521 raise errors.OpPrereqError("Invalid parameter 'disks'",
13522 errors.ECODE_INVAL)
13523 for row in self.op.disks:
13524 if (not isinstance(row, dict) or
13525 constants.IDISK_SIZE not in row or
13526 not isinstance(row[constants.IDISK_SIZE], int) or
13527 constants.IDISK_MODE not in row or
13528 row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
13529 raise errors.OpPrereqError("Invalid contents of the 'disks'"
13530 " parameter", errors.ECODE_INVAL)
13531 if self.op.hypervisor is None:
13532 self.op.hypervisor = self.cfg.GetHypervisorType()
13533 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13534 fname = _ExpandInstanceName(self.cfg, self.op.name)
13535 self.op.name = fname
13536 self.relocate_from = \
13537 list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
13538 elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
13539 constants.IALLOCATOR_MODE_NODE_EVAC):
13540 if not self.op.instances:
13541 raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
13542 self.op.instances = _GetWantedInstances(self, self.op.instances)
13544 raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
13545 self.op.mode, errors.ECODE_INVAL)
13547 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
13548 if self.op.allocator is None:
13549 raise errors.OpPrereqError("Missing allocator name",
13550 errors.ECODE_INVAL)
13551 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
13552 raise errors.OpPrereqError("Wrong allocator test '%s'" %
13553 self.op.direction, errors.ECODE_INVAL)
13555 def Exec(self, feedback_fn):
13556 """Run the allocator test.
13559 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13560 ial = IAllocator(self.cfg, self.rpc,
13563 memory=self.op.memory,
13564 disks=self.op.disks,
13565 disk_template=self.op.disk_template,
13569 vcpus=self.op.vcpus,
13570 hypervisor=self.op.hypervisor,
13572 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13573 ial = IAllocator(self.cfg, self.rpc,
13576 relocate_from=list(self.relocate_from),
13578 elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
13579 ial = IAllocator(self.cfg, self.rpc,
13581 instances=self.op.instances,
13582 target_groups=self.op.target_groups)
13583 elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13584 ial = IAllocator(self.cfg, self.rpc,
13586 instances=self.op.instances,
13587 evac_mode=self.op.evac_mode)
13589 raise errors.ProgrammerError("Uncatched mode %s in"
13590 " LUTestAllocator.Exec", self.op.mode)
13592 if self.op.direction == constants.IALLOCATOR_DIR_IN:
13593 result = ial.in_text
13594 else:
13595 ial.Run(self.op.allocator, validate=False)
13596 result = ial.out_text
13597 return result
13600 #: Query type implementations
13601 _QUERY_IMPL = {
13602 constants.QR_INSTANCE: _InstanceQuery,
13603 constants.QR_NODE: _NodeQuery,
13604 constants.QR_GROUP: _GroupQuery,
13605 constants.QR_OS: _OsQuery,
13606 }
13608 assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
13611 def _GetQueryImplementation(name):
13612 """Returns the implemtnation for a query type.
13614 @param name: Query type, must be one of L{constants.QR_VIA_OP}
13617 try:
13618 return _QUERY_IMPL[name]
13619 except KeyError:
13620 raise errors.OpPrereqError("Unknown query resource '%s'" % name,
13621 errors.ECODE_INVAL)
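# Illustrative sketch, not part of the original module: typical use of the
# helper above by a query opcode; the variable name is invented.
#
#   impl_cls = _GetQueryImplementation(constants.QR_NODE)  # -> _NodeQuery
#   # An unknown resource raises OpPrereqError with ECODE_INVAL rather than
#   # leaking a KeyError to the caller.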