# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
22 """Module implementing the master-side code."""
# pylint: disable=W0201,C0302

# W0201 since most LU attributes are defined in CheckPrereq or similar
# functions

# C0302: since we have waaaay too many lines in this module
import re
import logging
import copy
import itertools

import OpenSSL

from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import hypervisor
from ganeti import locking
from ganeti import constants
from ganeti import objects
from ganeti import serializer
from ganeti import ssconf
from ganeti import uidpool
from ganeti import compat
from ganeti import masterd
from ganeti import netutils
from ganeti import query
from ganeti import qlang
from ganeti import opcodes

import ganeti.masterd.instance # pylint: disable=W0611
67 """Data container for LU results with jobs.
69 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
70 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
71 contained in the C{jobs} attribute and include the job IDs in the opcode
75 def __init__(self, jobs, **kwargs):
76 """Initializes this class.
78 Additional return values can be specified as keyword arguments.
80 @type jobs: list of lists of L{opcode.OpCode}
81 @param jobs: A list of lists of opcode objects
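
# Illustrative sketch (hypothetical LU, not defined in this module): an Exec
# method that wants follow-up jobs submitted on its behalf could end with
#
#   return ResultWithJobs([[opcodes.OpClusterVerifyConfig()]],
#                         warning="config check queued")
#
# The keyword arguments become additional return values, and the job IDs are
# added by mcpu.Processor._ProcessResult as described above.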
class LogicalUnit(object):
  """Logical Unit base class.

  Subclasses must follow these rules:
    - implement ExpandNames
    - implement CheckPrereq (except when tasklets are used)
    - implement Exec (except when tasklets are used)
    - implement BuildHooksEnv
    - implement BuildHooksNodes
    - redefine HPATH and HTYPE
    - optionally redefine their run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  Note that all commands require root permissions.

  @ivar dry_run_result: the value (if any) that will be returned to the caller
      in dry-run mode (signalled by opcode dry_run parameter)

  """
  HPATH = None
  HTYPE = None
  REQ_BGL = True
  def __init__(self, processor, op, context, rpc):
    """Constructor for LogicalUnit.

    This needs to be overridden in derived classes in order to check op
    validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.glm = context.glm
    # readability alias
    self.owned_locks = context.glm.list_owned
    self.context = context
    self.rpc = rpc
    # Dicts used to declare locking needs to mcpu
    self.needed_locks = None
    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
    self.add_locks = {}
    self.remove_locks = {}
    # Used to force good behavior when calling helper functions
    self.recalculate_locks = {}
    # logging
    self.Log = processor.Log # pylint: disable=C0103
    self.LogWarning = processor.LogWarning # pylint: disable=C0103
    self.LogInfo = processor.LogInfo # pylint: disable=C0103
    self.LogStep = processor.LogStep # pylint: disable=C0103
    # support for dry-run
    self.dry_run_result = None
    # support for generic debug attribute
    if (not hasattr(self.op, "debug_level") or
        not isinstance(self.op.debug_level, int)):
      self.op.debug_level = 0

    self.tasklets = None

    # Validate opcode parameters and set defaults
    self.op.Validate(True)

    self.CheckArguments()
  def CheckArguments(self):
    """Check syntactic validity for the opcode arguments.

    This method is for doing a simple syntactic check and ensuring the
    validity of opcode parameters, without any cluster-related
    checks. While the same can be accomplished in ExpandNames and/or
    CheckPrereq, doing these separately is better because:

      - ExpandNames is left as a purely lock-related function
      - CheckPrereq is run after we have acquired locks (and possibly
        waited for them)

    The function is allowed to change the self.op attribute so that
    later methods need no longer worry about missing parameters.

    """
  def ExpandNames(self):
    """Expand names for this LU.

    This method is called before starting to execute the opcode, and it should
    update all the parameters of the opcode to their canonical form (e.g. a
    short node name must be fully expanded after this method has successfully
    completed). This way locking, hooks, logging, etc. can work correctly.

    LUs which implement this method must also populate the self.needed_locks
    member, as a dict with lock levels as keys, and a list of needed lock names
    (same levels as locks) as values. Rules:

      - use an empty dict if you don't need any lock
      - if you don't need any lock at a particular level omit that level
      - don't put anything for the BGL level
      - if you want all locks at a level use locking.ALL_SET as a value

    If you need to share locks (rather than acquire them exclusively) at one
    level you can modify self.share_locks, setting a true value (usually 1) for
    that level. By default locks are not shared.

    This function can also define a list of tasklets, which then will be
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.com'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # The implementation of this method is mandatory only if the new LU is
    # concurrent, so that old LUs don't need to be changed all at the same
    # time.
    if self.REQ_BGL:
      self.needed_locks = {} # Exclusive LUs don't need locks.
    else:
      raise NotImplementedError
  def DeclareLocks(self, level):
    """Declare LU locking needs for a level.

    While most LUs can just declare their locking needs at ExpandNames time,
    sometimes there's the need to calculate some locks after having acquired
    the ones before. This function is called just before acquiring locks at a
    particular level, but after acquiring the ones at lower levels, and permits
    such calculations. It can be used to modify self.needed_locks, and by
    default it does nothing.

    This function is only called if you have something already set in
    self.needed_locks for the level.

    @type level: member of ganeti.locking.LEVELS
    @param level: Locking level which is going to be locked

    """
  def CheckPrereq(self):
    """Check prerequisites for this LU.

    This method should check that the prerequisites for the execution
    of this LU are fulfilled. It can do internode communication, but
    it should be idempotent - no cluster or system changes are
    allowed.

    The method should raise errors.OpPrereqError in case something is
    not fulfilled. Its return value is ignored.

    This method should also update all the parameters of the opcode to
    their canonical form if it hasn't been done by ExpandNames before.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Checking prerequisites for tasklet %s/%s",
                      idx + 1, len(self.tasklets))
        tl.CheckPrereq()
  def Exec(self, feedback_fn):
    """Execute the LU.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in
    code, or expected.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
        tl.Exec(feedback_fn)
    else:
      raise NotImplementedError
  def BuildHooksEnv(self):
    """Build hooks environment for this LU.

    @rtype: dict
    @return: Dictionary containing the environment that will be used for
      running the hooks for this LU. The keys of the dict must not be prefixed
      with "GANETI_"--that'll be added by the hooks runner. The hooks runner
      will extend the environment with additional variables. If no environment
      should be defined, an empty dictionary should be returned (not C{None}).
    @note: If the C{HPATH} attribute of the LU class is C{None}, this function
      will not be called.

    """
    raise NotImplementedError
  def BuildHooksNodes(self):
    """Build list of nodes to run LU's hooks.

    @rtype: tuple; (list, list)
    @return: Tuple containing a list of node names on which the hook
      should run before the execution and a list of node names on which the
      hook should run after the execution. No nodes should be returned as an
      empty list (and not None).
    @note: If the C{HPATH} attribute of the LU class is C{None}, this function
      will not be called.

    """
    raise NotImplementedError
  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    This method is called every time a hooks phase is executed, and notifies
    the Logical Unit about the hooks' result. The LU can then use it to alter
    its result based on the hooks. By default the method does nothing and the
    previous result is passed back unchanged but any LU can define it if it
    wants to use the local cluster hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None

    @return: the new Exec result, based on the previous result
        and hook results

    """
    # The API must be kept, so we ignore the "unused argument" and "could
    # be a function" warnings
    # pylint: disable=W0613,R0201
    return lu_result
  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs that work on an instance take its name in self.op.instance_name
    and need to expand it and then declare the expanded name for locking. This
    function does it, and then updates self.op.instance_name to the expanded
    name. It also initializes needed_locks as a dict, if this hasn't been done
    before.

    """
    if self.needed_locks is None:
      self.needed_locks = {}
    else:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    self.op.instance_name = _ExpandInstanceName(self.cfg,
                                                self.op.instance_name)
    self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
  def _LockInstancesNodes(self, primary_only=False):
    """Helper function to declare instances' nodes for locking.

    This function should be called after locking one or more instances to lock
    their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
    with all primary or secondary nodes for instances already locked and
    present in self.needed_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to just lock some instances' nodes, or
    to just lock primaries or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances

    """
    assert locking.LEVEL_NODE in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we've really been called with the instance locks held

    # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
    # future we might want to have different behaviors depending on the value
    # of self.recalculate_locks[locking.LEVEL_NODE]
    wanted_nodes = []
    locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
    for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
      wanted_nodes.append(instance.primary_node)
      if not primary_only:
        wanted_nodes.extend(instance.secondary_nodes)

    if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
      self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
    elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
      self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)

    del self.recalculate_locks[locking.LEVEL_NODE]
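
  # Illustrative sketch (hypothetical LU, not defined in this module): the
  # usual pattern combining ExpandNames and DeclareLocks with this helper is
  #
  #   def ExpandNames(self):
  #     self._ExpandAndLockInstance()
  #     self.needed_locks[locking.LEVEL_NODE] = []
  #     self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
  #
  #   def DeclareLocks(self, level):
  #     if level == locking.LEVEL_NODE:
  #       self._LockInstancesNodes()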
class NoHooksLU(LogicalUnit): # pylint: disable=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLu.

    This just raises an error.

    """
    raise AssertionError("BuildHooksEnv called for NoHooksLUs")

  def BuildHooksNodes(self):
    """Empty BuildHooksNodes for NoHooksLU.

    """
    raise AssertionError("BuildHooksNodes called for NoHooksLU")
419 """Tasklet base class.
421 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
422 they can mix legacy code with tasklets. Locking needs to be done in the LU,
423 tasklets know nothing about locks.
425 Subclasses must follow these rules:
426 - Implement CheckPrereq
  def __init__(self, lu):
    """Initializes this class.

    """
    self.lu = lu
  def CheckPrereq(self):
    """Check prerequisites for this tasklet.

    This method should check whether the prerequisites for the execution of
    this tasklet are fulfilled. It can do internode communication, but it
    should be idempotent - no cluster or system changes are allowed.

    The method should raise errors.OpPrereqError in case something is not
    fulfilled. Its return value is ignored.

    This method should also update all parameters to their canonical form if it
    hasn't been done before.

    """
    pass
  def Exec(self, feedback_fn):
    """Execute the tasklet.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in code, or
    expected.

    """
    raise NotImplementedError
465 """Base for query utility classes.
468 #: Attribute holding field definitions
471 def __init__(self, filter_, fields, use_locking):
472 """Initializes this class.
475 self.use_locking = use_locking
477 self.query = query.Query(self.FIELDS, fields, filter_=filter_,
479 self.requested_data = self.query.RequestedData()
480 self.names = self.query.RequestedNames()
482 # Sort only if no names were requested
483 self.sort_by_name = not self.names
485 self.do_locking = None
  def _GetNames(self, lu, all_names, lock_level):
    """Helper function to determine names asked for in the query.

    """
    if self.do_locking:
      names = lu.owned_locks(lock_level)
    else:
      names = all_names

    if self.wanted == locking.ALL_SET:
      assert not self.names
      # caller didn't specify names, so ordering is not important
      return utils.NiceSort(names)

    # caller specified names and we must keep the same order
    assert self.names
    assert not self.do_locking or lu.glm.is_owned(lock_level)

    missing = set(self.wanted).difference(names)
    if missing:
      raise errors.OpExecError("Some items were removed before retrieving"
                               " their data: %s" % missing)

    # Return expanded names
    return self.wanted
  def ExpandNames(self, lu):
    """Expand names for this query.

    See L{LogicalUnit.ExpandNames}.

    """
    raise NotImplementedError()

  def DeclareLocks(self, lu, level):
    """Declare locks for this query.

    See L{LogicalUnit.DeclareLocks}.

    """
    raise NotImplementedError()
  def _GetQueryData(self, lu):
    """Collects all data for this query.

    @return: Query data object

    """
    raise NotImplementedError()

  def NewStyleQuery(self, lu):
    """Collect data and execute query.

    """
    return query.GetQueryResponse(self.query, self._GetQueryData(lu),
                                  sort_by_name=self.sort_by_name)

  def OldStyleQuery(self, lu):
    """Collect data and execute query.

    """
    return self.query.OldStyleQuery(self._GetQueryData(lu),
                                    sort_by_name=self.sort_by_name)
554 """Returns a dict declaring all lock levels shared.
557 return dict.fromkeys(locking.LEVELS, 1)
def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
  """Checks if the owned node groups are still correct for an instance.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type instance_name: string
  @param instance_name: Instance name
  @type owned_groups: set or frozenset
  @param owned_groups: Set of currently owned node groups

  """
  inst_groups = cfg.GetInstanceNodeGroups(instance_name)

  if not owned_groups.issuperset(inst_groups):
    raise errors.OpPrereqError("Instance %s's node groups changed since"
                               " locks were acquired, current groups are"
                               " '%s', owning groups '%s'; retry the"
                               " operation" %
                               (instance_name,
                                utils.CommaJoin(inst_groups),
                                utils.CommaJoin(owned_groups)),
                               errors.ECODE_STATE)

  return inst_groups
def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
  """Checks if the instances in a node group are still correct.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type group_uuid: string
  @param group_uuid: Node group UUID
  @type owned_instances: set or frozenset
  @param owned_instances: Set of currently owned instances

  """
  wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
  if owned_instances != wanted_instances:
    raise errors.OpPrereqError("Instances in node group '%s' changed since"
                               " locks were acquired, wanted '%s', have '%s';"
                               " retry the operation" %
                               (group_uuid,
                                utils.CommaJoin(wanted_instances),
                                utils.CommaJoin(owned_instances)),
                               errors.ECODE_STATE)

  return wanted_instances
def _SupportsOob(cfg, node):
  """Tells if node supports OOB.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type node: L{objects.Node}
  @param node: The node
  @return: The OOB script if supported or an empty string otherwise

  """
  return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: list of node names or None for all nodes
  @rtype: list
  @return: the list of nodes, sorted
  @raise errors.ProgrammerError: if the nodes parameter is wrong type

  """
  if nodes:
    return [_ExpandNodeName(lu.cfg, name) for name in nodes]

  return utils.NiceSort(lu.cfg.GetNodeList())
def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names or None for all instances
  @rtype: list
  @return: the list of instances, sorted
  @raise errors.OpPrereqError: if the instances parameter is wrong type
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if instances:
    wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
  else:
    wanted = utils.NiceSort(lu.cfg.GetInstanceList())
  return wanted
def _GetUpdatedParams(old_params, update_dict,
                      use_default=True, use_none=False):
  """Return the new version of a parameter dictionary.

  @type old_params: dict
  @param old_params: old parameters
  @type update_dict: dict
  @param update_dict: dict containing new parameter values, or
      constants.VALUE_DEFAULT to reset the parameter to its default
      value
  @type use_default: boolean
  @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
      values as 'to be deleted' values
  @type use_none: boolean
  @param use_none: whether to recognise C{None} values as 'to be
      deleted' values
  @rtype: dict
  @return: the new parameter dictionary

  """
  params_copy = copy.deepcopy(old_params)
  for key, val in update_dict.iteritems():
    if ((use_default and val == constants.VALUE_DEFAULT) or
        (use_none and val is None)):
      try:
        del params_copy[key]
      except KeyError:
        pass
    else:
      params_copy[key] = val
  return params_copy
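
# Worked example (illustrative, hypothetical values): given
#
#   old_params = {"mem": 128, "vcpus": 2}
#   update_dict = {"mem": constants.VALUE_DEFAULT, "vcpus": 4}
#
# _GetUpdatedParams(old_params, update_dict) returns {"vcpus": 4}: "mem" is
# removed (reset to its default) while "vcpus" is overwritten; old_params
# itself stays untouched thanks to the deep copy.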
def _ReleaseLocks(lu, level, names=None, keep=None):
  """Releases locks owned by an LU.

  @type lu: L{LogicalUnit}
  @param level: Lock level
  @type names: list or None
  @param names: Names of locks to release
  @type keep: list or None
  @param keep: Names of locks to retain

  """
  assert not (keep is not None and names is not None), \
    "Only one of the 'names' and the 'keep' parameters can be given"

  if names is not None:
    should_release = names.__contains__
  elif keep:
    should_release = lambda name: name not in keep
  else:
    should_release = None

  if should_release:
    retain = []
    release = []

    # Determine which locks to release
    for name in lu.owned_locks(level):
      if should_release(name):
        release.append(name)
      else:
        retain.append(name)

    assert len(lu.owned_locks(level)) == (len(retain) + len(release))

    # Release just some locks
    lu.glm.release(level, names=release)

    assert frozenset(lu.owned_locks(level)) == frozenset(retain)
  else:
    # Release everything
    lu.glm.release(level)

    assert not lu.glm.is_owned(level), "No locks should be owned"
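
# Illustrative sketch (hypothetical attribute names): once an LU has settled
# on a final node it can narrow its node locks down to just that one with
#
#   _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.target_node])
#
# or drop everything at that level when done:
#
#   _ReleaseLocks(self, locking.LEVEL_NODE)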
def _MapInstanceDisksToNodes(instances):
  """Creates a map from (node, volume) to instance name.

  @type instances: list of L{objects.Instance}
  @rtype: dict; tuple of (node name, volume name) as key, instance name as
      value

  """
  return dict(((node, vol), inst.name)
              for inst in instances
              for (node, vols) in inst.MapLVsByNode().items()
              for vol in vols)
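
# Illustrative shape of the result (hypothetical names): for an instance
# "inst1" with the logical volume "xenvg/disk0" on node "node1", the returned
# dict contains {("node1", "xenvg/disk0"): "inst1"}, allowing reverse lookup
# from a (node, volume) pair to the owning instance.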
def _RunPostHook(lu, node_name):
  """Runs the post-hook for an opcode on a single node.

  """
  hm = lu.proc.hmclass(lu.rpc.call_hooks_runner, lu)
  try:
    hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
  except:
    # pylint: disable=W0702
    lu.LogWarning("Errors occurred running hooks on %s" % node_name)
def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set

  """
  f = utils.FieldSet()
  f.Extend(static)
  f.Extend(dynamic)

  delta = f.NonMatching(selected)
  if delta:
    raise errors.OpPrereqError("Unknown output fields selected: %s"
                               % ",".join(delta), errors.ECODE_INVAL)
def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.

  This will ensure that instances don't get customised versions of
  global params.

  """
  used_globals = constants.HVC_GLOBALS.intersection(params)
  if used_globals:
    msg = ("The following hypervisor parameters are global and cannot"
           " be customized at instance level, please modify them at"
           " cluster level: %s" % utils.CommaJoin(used_globals))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
def _CheckNodeOnline(lu, node, msg=None):
  """Ensure that a given node is online.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param msg: if passed, should be a message to replace the default one
  @raise errors.OpPrereqError: if the node is offline

  """
  if msg is None:
    msg = "Can't use offline node"
  if lu.cfg.GetNodeInfo(node).offline:
    raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  if lu.cfg.GetNodeInfo(node).drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_STATE)
def _CheckNodeVmCapable(lu, node):
  """Ensure that a given node is vm capable.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is not vm capable

  """
  if not lu.cfg.GetNodeInfo(node).vm_capable:
    raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
                               errors.ECODE_STATE)
def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node is not supporting the OS

  """
  result = lu.rpc.call_os_get(node, os_name)
  result.Raise("OS '%s' not in supported OS list for node %s" %
               (os_name, node),
               prereq=True, ecode=errors.ECODE_INVAL)
  if not force_variant:
    _CheckOSVariant(result.payload, os_name)
def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
  """Ensure that a node has the given secondary ip.

  @type lu: L{LogicalUnit}
  @param lu: the LU on behalf of which we make the check
  @type node: string
  @param node: the node to check
  @type secondary_ip: string
  @param secondary_ip: the ip to check
  @type prereq: boolean
  @param prereq: whether to throw a prerequisite or an execute error
  @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
  @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False

  """
  result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
  result.Raise("Failure checking secondary ip on node %s" % node,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)
  if not result.payload:
    msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
           " please fix and re-run this command" % secondary_ip)
    if prereq:
      raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
    else:
      raise errors.OpExecError(msg)
def _GetClusterDomainSecret():
  """Reads the cluster domain secret.

  """
  return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
                               strict=True)
def _CheckInstanceDown(lu, instance, reason):
  """Ensure that an instance is not running."""
  if instance.admin_up:
    raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
                               (instance.name, reason), errors.ECODE_STATE)

  pnode = instance.primary_node
  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
  ins_l.Raise("Can't contact node %s for instance information" % pnode,
              prereq=True, ecode=errors.ECODE_ENVIRON)

  if instance.name in ins_l.payload:
    raise errors.OpPrereqError("Instance %s is running, %s" %
                               (instance.name, reason), errors.ECODE_STATE)
def _ExpandItemName(fn, name, kind):
  """Expand an item name.

  @param fn: the function to use for expansion
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  full_name = fn(name)
  if full_name is None:
    raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                               errors.ECODE_NOENT)
  return full_name
def _ExpandNodeName(cfg, name):
  """Wrapper over L{_ExpandItemName} for nodes."""
  return _ExpandItemName(cfg.ExpandNodeName, name, "Node")


def _ExpandInstanceName(cfg, name):
  """Wrapper over L{_ExpandItemName} for instance."""
  return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
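
# Illustrative usage (hypothetical names): on a cluster whose domain is
# "example.com", both wrappers return the fully qualified name or raise
# OpPrereqError if the item is unknown:
#
#   _ExpandNodeName(self.cfg, "node1")          # -> "node1.example.com"
#   _ExpandInstanceName(self.cfg, "instance1")  # -> "instance1.example.com"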
def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          memory, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name, tags):
  """Builds instance related env variables for hooks.

  This builds the hook environment from individual variables.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: boolean
  @param status: the should_run status of the instance
  @type memory: string
  @param memory: the memory size of the instance
  @type vcpus: string
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @type tags: list
  @param tags: list of instance tags as strings
  @rtype: dict
  @return: the hook environment for this instance

  """
  if status:
    str_status = "up"
  else:
    str_status = "down"
  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": str_status,
    "INSTANCE_MEMORY": memory,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
  }

  if nics:
    nic_count = len(nics)
    for idx, (ip, mac, mode, link) in enumerate(nics):
      if ip is None:
        ip = ""
      env["INSTANCE_NIC%d_IP" % idx] = ip
      env["INSTANCE_NIC%d_MAC" % idx] = mac
      env["INSTANCE_NIC%d_MODE" % idx] = mode
      env["INSTANCE_NIC%d_LINK" % idx] = link
      if mode == constants.NIC_MODE_BRIDGED:
        env["INSTANCE_NIC%d_BRIDGE" % idx] = link
  else:
    nic_count = 0

  env["INSTANCE_NIC_COUNT"] = nic_count

  if disks:
    disk_count = len(disks)
    for idx, (size, mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % idx] = size
      env["INSTANCE_DISK%d_MODE" % idx] = mode
  else:
    disk_count = 0

  env["INSTANCE_DISK_COUNT"] = disk_count

  if not tags:
    tags = []

  env["INSTANCE_TAGS"] = " ".join(tags)

  for source, kind in [(bep, "BE"), (hvp, "HV")]:
    for key, value in source.items():
      env["INSTANCE_%s_%s" % (kind, key)] = value

  return env
def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.

  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
  value in LUInstanceQueryData.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples

  """
  hooks_nics = []
  cluster = lu.cfg.GetClusterInfo()
  for nic in nics:
    ip = nic.ip
    mac = nic.mac
    filled_params = cluster.SimpleFillNIC(nic.nicparams)
    mode = filled_params[constants.NIC_MODE]
    link = filled_params[constants.NIC_LINK]
    hooks_nics.append((ip, mac, mode, link))
  return hooks_nics
def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)
  args = {
    "name": instance.name,
    "primary_node": instance.primary_node,
    "secondary_nodes": instance.secondary_nodes,
    "os_type": instance.os,
    "status": instance.admin_up,
    "memory": bep[constants.BE_MEMORY],
    "vcpus": bep[constants.BE_VCPUS],
    "nics": _NICListToTuple(lu, instance.nics),
    "disk_template": instance.disk_template,
    "disks": [(disk.size, disk.mode) for disk in instance.disks],
    "bep": bep,
    "hvp": hvp,
    "hypervisor_name": instance.hypervisor,
    "tags": instance.tags,
  }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  """
  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
  if mod_list:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in mod_list))
    for name in mod_list:
      lu.context.ReaddNode(name)
  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))
def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  """
  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  # the new node will increase mc_max with one, so:
  mc_should = min(mc_should + 1, cp_size)
  return mc_now < mc_should
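
# Worked example (illustrative, hypothetical numbers): with
# candidate_pool_size = 10, mc_now = 3 current candidates and mc_should = 5
# wanted, mc_should becomes min(5 + 1, 10) = 6 and the function returns
# 3 < 6, i.e. True, so the newly added node should promote itself.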
def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Check that the bridges needed by a list of nics exist.

  """
  cluster = lu.cfg.GetClusterInfo()
  paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
  brlist = [params[constants.NIC_LINK] for params in paramslist
            if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
  if brlist:
    result = lu.rpc.call_bridges_exist(target_node, brlist)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  """
  if node is None:
    node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, node)
def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity

  """
  variant = objects.OS.GetVariant(name)
  if not os_obj.supported_variants:
    if variant:
      raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
                                 " passed)" % (os_obj.name, variant),
                                 errors.ECODE_INVAL)
    return
  if not variant:
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if variant not in os_obj.supported_variants:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
def _GetNodeInstancesInner(cfg, fn):
  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]


def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)


def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name == inst.primary_node)


def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name in inst.secondary_nodes)
def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  """
  # Special case for file storage
  if storage_type == constants.ST_FILE:
    # storage.FileStorage wants a list of storage directories
    return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]

  return []
def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
  faulty = []

  for dev in instance.disks:
    cfg.SetDiskID(dev, node_name)

  result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
  result.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  for idx, bdev_status in enumerate(result.payload):
    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
      faulty.append(idx)

  return faulty
def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
  """Check the sanity of iallocator and node arguments and use the
  cluster-wide iallocator if appropriate.

  Check that at most one of (iallocator, node) is specified. If none is
  specified, then the LU's opcode's iallocator slot is filled with the
  cluster-wide default iallocator.

  @type iallocator_slot: string
  @param iallocator_slot: the name of the opcode iallocator slot
  @type node_slot: string
  @param node_slot: the name of the opcode target node slot

  """
  node = getattr(lu.op, node_slot, None)
  iallocator = getattr(lu.op, iallocator_slot, None)

  if node is not None and iallocator is not None:
    raise errors.OpPrereqError("Do not specify both, iallocator and node",
                               errors.ECODE_INVAL)
  elif node is None and iallocator is None:
    default_iallocator = lu.cfg.GetDefaultIAllocator()
    if default_iallocator:
      setattr(lu.op, iallocator_slot, default_iallocator)
    else:
      raise errors.OpPrereqError("No iallocator or node given and no"
                                 " cluster-wide default iallocator found;"
                                 " please specify either an iallocator or a"
                                 " node, or set a cluster-wide default"
                                 " iallocator", errors.ECODE_INVAL)
def _GetDefaultIAllocator(cfg, iallocator):
  """Decides on which iallocator to use.

  @type cfg: L{config.ConfigWriter}
  @param cfg: Cluster configuration object
  @type iallocator: string or None
  @param iallocator: Iallocator specified in opcode
  @rtype: string
  @return: Iallocator name

  """
  if not iallocator:
    # Use default iallocator
    iallocator = cfg.GetDefaultIAllocator()

  if not iallocator:
    raise errors.OpPrereqError("No iallocator was specified, neither in the"
                               " opcode nor as a cluster-wide default",
                               errors.ECODE_INVAL)

  return iallocator
class LUClusterPostInit(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    return ([], [self.cfg.GetMasterNode()])

  def Exec(self, feedback_fn):
    """Nothing to do.

    """
    return True
class LUClusterDestroy(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    return ([], [])

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodelist = self.cfg.GetNodeList()
    if len(nodelist) != 1 or nodelist[0] != master:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodelist) - 1),
                                 errors.ECODE_INVAL)
    instancelist = self.cfg.GetInstanceList()
    if instancelist:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instancelist),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    """
    master = self.cfg.GetMasterNode()

    # Run post hooks on master node before it's removed
    _RunPostHook(self, master)

    result = self.rpc.call_node_deactivate_master_ip(master)
    result.Raise("Could not disable the master role")

    return master
def _VerifyCertificate(filename):
  """Verifies a certificate for L{LUClusterVerifyConfig}.

  @type filename: string
  @param filename: Path to PEM file

  """
  try:
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                           utils.ReadFile(filename))
  except Exception, err: # pylint: disable=W0703
    return (LUClusterVerifyConfig.ETYPE_ERROR,
            "Failed to load X509 certificate %s: %s" % (filename, err))

  (errcode, msg) = \
    utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
                                constants.SSL_CERT_EXPIRATION_ERROR)

  if msg:
    fnamemsg = "While verifying %s: %s" % (filename, msg)
  else:
    fnamemsg = None

  if errcode is None:
    return (None, fnamemsg)
  elif errcode == utils.CERT_WARNING:
    return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
  elif errcode == utils.CERT_ERROR:
    return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)

  raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
def _GetAllHypervisorParameters(cluster, instances):
  """Compute the set of all hypervisor parameters.

  @type cluster: L{objects.Cluster}
  @param cluster: the cluster object
  @type instances: list of L{objects.Instance}
  @param instances: additional instances from which to obtain parameters
  @rtype: list of (origin, hypervisor, parameters)
  @return: a list with all parameters found, indicating the hypervisor they
      apply to, and the origin (can be "cluster", "os X", or "instance Y")

  """
  hvp_data = []

  for hv_name in cluster.enabled_hypervisors:
    hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))

  for os_name, os_hvp in cluster.os_hvp.items():
    for hv_name, hv_params in os_hvp.items():
      if hv_params:
        full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
        hvp_data.append(("os %s" % os_name, hv_name, full_params))

  # TODO: collapse identical parameter values in a single one
  for instance in instances:
    if instance.hvparams:
      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
                       cluster.FillHV(instance)))

  return hvp_data
class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt = ecode
    # first complete the msg
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      if item:
        item = " " + item
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # and finally report it via the feedback_fn
    self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101

  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    cond = (bool(cond)
            or self.op.debug_simulate_errors) # pylint: disable=E1101
    if cond:
      self._Error(*args, **kwargs)
    # do not mark the operation as failed for WARN cases only
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
      self.bad = self.bad or cond
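
  # Illustrative usage (hypothetical condition and values): error codes are
  # (item-type, error-text) tuples, and the severity can be downgraded to a
  # warning through the ETYPE_FIELD ("code") keyword, as done elsewhere in
  # this module:
  #
  #   self._ErrorIf(free_mem < needed_mem, constants.CV_ENODEHV, node,
  #                 "not enough memory: %d < %d", free_mem, needed_mem,
  #                 code=self.ETYPE_WARNING)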
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    jobs = []

    if self.op.group_name:
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([opcodes.OpClusterVerifyConfig()])

      # Always depend on global verification
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
                                              depends=depends_fn())]
                for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs): # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
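
# Illustrative shape of the submitted jobs (hypothetical two-group cluster):
# with no group_name restriction, Exec() above produces
#
#   [[OpClusterVerifyConfig()],
#    [OpClusterVerifyGroup(group_name="group1", depends=[(-1, [])])],
#    [OpClusterVerifyGroup(group_name="group2", depends=[(-2, [])])]]
#
# i.e. one job per group, each depending (via a negative relative job index)
# on the global configuration verification job.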
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = True

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    """
    for item, hv_name, hv_params in hvp_data:
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisor(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError, err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
  def ExpandNames(self):
    # Information can be safely retrieved as the BGL is acquired in exclusive
    # mode
    assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.needed_locks = {}
  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in constants.ALL_CERT_FILES:
      (errcode, msg) = _VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node.name for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_nodes:
        dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst.name)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(dangling_instances.get(node.name,
                                                ["no instances"])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(no_node_instances))

    return not self.bad
class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
  """Verifies the status of a node group.

  """
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  REQ_BGL = False

  _HOOKS_INDENT_RE = re.compile("^", re.M)

  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type name: string
    @ivar name: the node name to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dictionary of {primary-node: list of instances} for all
        instances for which this node is secondary (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call was successful (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @type oslist: list
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS
    @type vm_capable: boolean
    @ivar vm_capable: whether the node can host instances

    """
    def __init__(self, offline=False, name=None, vm_capable=True):
      self.name = name
      self.volumes = {}
      self.instances = []
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.vm_capable = vm_capable
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}
  def ExpandNames(self):
    # This raises errors.OpPrereqError on its own:
    self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)

    # Get instances in node group; this is unsafe and needs verification later
    inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)

    self.needed_locks = {
      locking.LEVEL_INSTANCE: inst_names,
      locking.LEVEL_NODEGROUP: [self.group_uuid],
      locking.LEVEL_NODE: [],
      }

    self.share_locks = _ShareAll()
  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      # Get members of node group; this is unsafe and needs verification later
      nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)

      all_inst_info = self.cfg.GetAllInstancesInfo()

      # In Exec(), we warn about mirrored instances that have primary and
      # secondary living in separate node groups. To fully verify that
      # volumes for these instances are healthy, we will need to do an
      # extra call to their secondaries. We ensure here those nodes will
      # be locked.
      for inst in self.owned_locks(locking.LEVEL_INSTANCE):
        # Important: access only the instances whose lock is owned
        if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
          nodes.update(all_inst_info[inst].secondary_nodes)

      self.needed_locks[locking.LEVEL_NODE] = nodes
  def CheckPrereq(self):
    assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
    self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

    group_nodes = set(self.group_info.members)
    group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)

    unlocked_nodes = \
      group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    unlocked_instances = \
      group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))

    if unlocked_nodes:
      raise errors.OpPrereqError("Missing lock for nodes: %s" %
                                 utils.CommaJoin(unlocked_nodes))

    if unlocked_instances:
      raise errors.OpPrereqError("Missing lock for instances: %s" %
                                 utils.CommaJoin(unlocked_instances))

    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

    self.my_node_names = utils.NiceSort(group_nodes)
    self.my_inst_names = utils.NiceSort(group_instances)

    self.my_node_info = dict((name, self.all_node_info[name])
                             for name in self.my_node_names)

    self.my_inst_info = dict((name, self.all_inst_info[name])
                             for name in self.my_inst_names)

    # We detect here the nodes that will need the extra RPC calls for verifying
    # split LV volumes; they should be locked.
    extra_lv_nodes = set()

    for inst in self.my_inst_info.values():
      if inst.disk_template in constants.DTS_INT_MIRROR:
        group = self.my_node_info[inst.primary_node].group
        for nname in inst.secondary_nodes:
          if self.all_node_info[nname].group != group:
            extra_lv_nodes.add(nname)

    unlocked_lv_nodes = \
      extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    if unlocked_lv_nodes:
      raise errors.OpPrereqError("Missing lock for LV check nodes: %s" %
                                 utils.CommaJoin(unlocked_lv_nodes))
    self.extra_lv_nodes = list(extra_lv_nodes)
  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the response)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    _ErrorIf(test, constants.CV_ENODERPC, node,
             "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    _ErrorIf(test, constants.CV_ENODERPC, node,
             "connection to node returned invalid data")
    if test:
      return False

    test = local_version != remote_version[0]
    _ErrorIf(test, constants.CV_ENODEVERSION, node,
             "incompatible protocol versions: master %s,"
             " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  constants.CV_ENODEVERSION, node,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if ninfo.vm_capable and isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        test = hv_result is not None
        _ErrorIf(test, constants.CV_ENODEHV, node,
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    hvp_result = nresult.get(constants.NV_HVPARAMS, None)
    if ninfo.vm_capable and isinstance(hvp_result, list):
      for item, hv_name, hv_result in hvp_result:
        _ErrorIf(True, constants.CV_ENODEHV, node,
                 "hypervisor %s parameter verify failure (source %s): %s",
                 hv_name, item, hv_result)

    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
             "; ".join(test))

    return True
  def _VerifyNodeTime(self, ninfo, nresult,
                      nvinfo_starttime, nvinfo_endtime):
    """Check the node time.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nvinfo_starttime: the start time of the RPC call
    @param nvinfo_endtime: the end time of the RPC call

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    ntime = nresult.get(constants.NV_TIME, None)
    try:
      ntime_merged = utils.MergeTime(ntime)
    except (ValueError, TypeError):
      _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
      return

    if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
    elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
    else:
      ntime_diff = None

    _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
             "Node time diverges by at least %s from master node time",
             ntime_diff)
  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
    """Check the node LVM results.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param vg_name: the configured VG name

    """
    if vg_name is None:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    # checks vg existence and size > 20G
    vglist = nresult.get(constants.NV_VGLIST, None)
    test = not vglist
    _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
    if not test:
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                            constants.MIN_VG_SIZE)
      _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)

    pvlist = nresult.get(constants.NV_PVLIST, None)
    test = pvlist is None
    _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
    if not test:
      # check that ':' is not present in PV names, since it's a
      # special character for lvcreate (denotes the range of PEs to
      # use on the PV)
      for _, pvname, owner_vg in pvlist:
        test = ":" in pvname
        _ErrorIf(test, constants.CV_ENODELVM, node,
                 "Invalid character ':' in PV '%s' of VG '%s'",
                 pvname, owner_vg)
  def _VerifyNodeBridges(self, ninfo, nresult, bridges):
    """Check the node bridges.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param bridges: the expected list of bridges

    """
    if not bridges:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    missing = nresult.get(constants.NV_BRIDGES, None)
    test = not isinstance(missing, list)
    _ErrorIf(test, constants.CV_ENODENET, node,
             "did not return valid bridge information")
    if not test:
      _ErrorIf(bool(missing), constants.CV_ENODENET, node,
               "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
1923 def _VerifyNodeNetwork(self, ninfo, nresult):
1924 """Check the node network connectivity results.
1926 @type ninfo: L{objects.Node}
1927 @param ninfo: the node to check
1928 @param nresult: the remote results for the node
1932 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1934 test = constants.NV_NODELIST not in nresult
1935 _ErrorIf(test, constants.CV_ENODESSH, node,
1936 "node hasn't returned node ssh connectivity data")
1938 if nresult[constants.NV_NODELIST]:
1939 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1940 _ErrorIf(True, constants.CV_ENODESSH, node,
1941 "ssh communication with node '%s': %s", a_node, a_msg)
1943 test = constants.NV_NODENETTEST not in nresult
1944 _ErrorIf(test, constants.CV_ENODENET, node,
1945 "node hasn't returned node tcp connectivity data")
1947 if nresult[constants.NV_NODENETTEST]:
1948 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1950 _ErrorIf(True, constants.CV_ENODENET, node,
1951 "tcp communication with node '%s': %s",
1952 anode, nresult[constants.NV_NODENETTEST][anode])
1954 test = constants.NV_MASTERIP not in nresult
1955 _ErrorIf(test, constants.CV_ENODENET, node,
1956 "node hasn't returned node master IP reachability data")
1958 if not nresult[constants.NV_MASTERIP]:
1959 if node == self.master_node:
1960 msg = "the master node cannot reach the master IP (not configured?)"
1961 else:
1962   msg = "cannot reach the master IP"
1963 _ErrorIf(True, constants.CV_ENODENET, node, msg)
1965 def _VerifyInstance(self, instance, instanceconfig, node_image,
1966                     diskstatus):
1967 """Verify an instance.
1969 This function checks to see if the required block devices are
1970 available on the instance's node.
1973 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1974 node_current = instanceconfig.primary_node
1976 node_vol_should = {}
1977 instanceconfig.MapLVsByNode(node_vol_should)
1979 for node in node_vol_should:
1980 n_img = node_image[node]
1981 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1982 # ignore missing volumes on offline or broken nodes
1983 continue
1984 for volume in node_vol_should[node]:
1985 test = volume not in n_img.volumes
1986 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
1987 "volume %s missing on node %s", volume, node)
1989 if instanceconfig.admin_up:
1990 pri_img = node_image[node_current]
1991 test = instance not in pri_img.instances and not pri_img.offline
1992 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
1993 "instance not running on its primary node %s",
1996 diskdata = [(nname, success, status, idx)
1997 for (nname, disks) in diskstatus.items()
1998 for idx, (success, status) in enumerate(disks)]
2000 for nname, success, bdev_status, idx in diskdata:
2001 # the 'ghost node' construction in Exec() ensures that we have a
2003 snode = node_image[nname]
2004 bad_snode = snode.ghost or snode.offline
2005 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2006 constants.CV_EINSTANCEFAULTYDISK, instance,
2007 "couldn't retrieve status for disk/%s on %s: %s",
2008 idx, nname, bdev_status)
2009 _ErrorIf((instanceconfig.admin_up and success and
2010 bdev_status.ldisk_status == constants.LDS_FAULTY),
2011 constants.CV_EINSTANCEFAULTYDISK, instance,
2012 "disk/%s on %s is faulty", idx, nname)
2014 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2015 """Verify if there are any unknown volumes in the cluster.
2017 The .os, .swap and backup volumes are ignored. All other volumes are
2018 reported as unknown.
2020 @type reserved: L{ganeti.utils.FieldSet}
2021 @param reserved: a FieldSet of reserved volume names
2024 for node, n_img in node_image.items():
2025 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2026 # skip non-healthy nodes
2028 for volume in n_img.volumes:
2029 test = ((node not in node_vol_should or
2030 volume not in node_vol_should[node]) and
2031 not reserved.Matches(volume))
2032 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2033 "volume %s is unknown", volume)
2035 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2036 """Verify N+1 Memory Resilience.
2038 Check that if one single node dies we can still start all the
2039 instances it was primary for.
2042 cluster_info = self.cfg.GetClusterInfo()
2043 for node, n_img in node_image.items():
2044 # This code checks that every node which is now listed as
2045 # secondary has enough memory to host all instances it is
2046 # supposed to should a single other node in the cluster fail.
2047 # FIXME: not ready for failover to an arbitrary node
2048 # FIXME: does not support file-backed instances
2049 # WARNING: we currently take into account down instances as well
2050 # as up ones, considering that even if they're down someone
2051 # might want to start them even in the event of a node failure.
2052 if n_img.offline:
2053   # we're skipping offline nodes from the N+1 warning, since
2054   # most likely we don't have good memory information from them;
2055   # we already list instances living on such nodes, and that's
2056   # enough warning
2057   continue
2058 for prinode, instances in n_img.sbp.items():
2059   needed_mem = 0
2060 for instance in instances:
2061 bep = cluster_info.FillBE(instance_cfg[instance])
2062 if bep[constants.BE_AUTO_BALANCE]:
2063 needed_mem += bep[constants.BE_MEMORY]
2064 test = n_img.mfree < needed_mem
2065 self._ErrorIf(test, constants.CV_ENODEN1, node,
2066 "not enough memory to accomodate instance failovers"
2067 " should node %s fail (%dMiB needed, %dMiB available)",
2068 prinode, needed_mem, n_img.mfree)
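# The N+1 test above asks, node by node: if any single primary node fails,
# do all of its auto-balanced instances that are secondary here fit into
# this node's free memory? A standalone sketch of that computation; names
# and units (MiB) are illustrative:
def _ExampleNPlusOneCheck(mfree, sbp, instance_mem):
  # mfree: free memory on the checked node; sbp: dict of primary node ->
  # instance names secondary here; instance_mem: instance name -> BE memory.
  # Returns the (primary, needed) pairs whose failover would not fit.
  failing = []
  for prinode, instances in sbp.items():
    needed = sum(instance_mem[inst] for inst in instances)
    if needed > mfree:
      failing.append((prinode, needed))
  return failing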
2070 @classmethod
2071 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2072 (files_all, files_all_opt, files_mc, files_vm)):
2073 """Verifies file checksums collected from all nodes.
2075 @param errorif: Callback for reporting errors
2076 @param nodeinfo: List of L{objects.Node} objects
2077 @param master_node: Name of master node
2078 @param all_nvinfo: RPC results
2081 assert (len(files_all | files_all_opt | files_mc | files_vm) ==
2082 sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
2083 "Found file listed in more than one file list"
2085 # Define functions determining which nodes to consider for a file
2086 files2nodefn = [
2087   (files_all, None),
2088   (files_all_opt, None),
2089 (files_mc, lambda node: (node.master_candidate or
2090 node.name == master_node)),
2091 (files_vm, lambda node: node.vm_capable),
2094 # Build mapping from filename to list of nodes which should have the file
2095 nodefiles = {}
2096 for (files, fn) in files2nodefn:
2097   if fn is None:
2098     filenodes = nodeinfo
2099   else:
2100     filenodes = filter(fn, nodeinfo)
2101 nodefiles.update((filename,
2102 frozenset(map(operator.attrgetter("name"), filenodes)))
2103 for filename in files)
2105 assert set(nodefiles) == (files_all | files_all_opt | files_mc | files_vm)
2107 fileinfo = dict((filename, {}) for filename in nodefiles)
2108 ignore_nodes = set()
2110 for node in nodeinfo:
2112 ignore_nodes.add(node.name)
2115 nresult = all_nvinfo[node.name]
2117 if nresult.fail_msg or not nresult.payload:
2120 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2122 test = not (node_files and isinstance(node_files, dict))
2123 errorif(test, constants.CV_ENODEFILECHECK, node.name,
2124 "Node did not return file checksum data")
2126 ignore_nodes.add(node.name)
2129 # Build per-checksum mapping from filename to nodes having it
2130 for (filename, checksum) in node_files.items():
2131 assert filename in nodefiles
2132 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2134 for (filename, checksums) in fileinfo.items():
2135 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2137 # Nodes having the file
2138 with_file = frozenset(node_name
2139 for nodes in fileinfo[filename].values()
2140 for node_name in nodes) - ignore_nodes
2142 expected_nodes = nodefiles[filename] - ignore_nodes
2144 # Nodes missing file
2145 missing_file = expected_nodes - with_file
2147 if filename in files_all_opt:
2149 errorif(missing_file and missing_file != expected_nodes,
2150 constants.CV_ECLUSTERFILECHECK, None,
2151 "File %s is optional, but it must exist on all or no"
2152 " nodes (not found on %s)",
2153 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2155 errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2156 "File %s is missing from node(s) %s", filename,
2157 utils.CommaJoin(utils.NiceSort(missing_file)))
2159 # Warn if a node has a file it shouldn't
2160 unexpected = with_file - expected_nodes
2161 errorif(unexpected,
2162         constants.CV_ECLUSTERFILECHECK, None,
2163 "File %s should not exist on node(s) %s",
2164 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2166 # See if there are multiple versions of the file
2167 test = len(checksums) > 1
2169 variants = ["variant %s on %s" %
2170 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2171 for (idx, (checksum, nodes)) in
2172 enumerate(sorted(checksums.items()))]
2176 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2177 "File %s found with %s different checksums (%s)",
2178 filename, len(checksums), "; ".join(variants))
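# fileinfo is a two-level mapping, filename -> checksum -> set of node
# names: one checksum per file means consensus, several mean divergent
# copies, which is what the "variant" report enumerates. A sketch of the
# same report over that structure (illustrative helper, unused):
def _ExampleDescribeVariants(by_checksum):
  # by_checksum: dict of checksum -> set of node names; returns the report
  # lines, one per distinct checksum.
  return ["variant %s on %s" % (idx + 1, ", ".join(sorted(nodes)))
          for (idx, (_, nodes)) in enumerate(sorted(by_checksum.items()))]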
2180 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2181                     drbd_map):
2182 """Verifies the node DRBD status.
2184 @type ninfo: L{objects.Node}
2185 @param ninfo: the node to check
2186 @param nresult: the remote results for the node
2187 @param instanceinfo: the dict of instances
2188 @param drbd_helper: the configured DRBD usermode helper
2189 @param drbd_map: the DRBD map as returned by
2190 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2194 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2196 if drbd_helper:
2197   helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2198   test = helper_result is None
2199 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2200 "no drbd usermode helper returned")
2201 if helper_result:
2202   status, payload = helper_result
2203   test = not status
2204   _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2205 "drbd usermode helper check unsuccessful: %s", payload)
2206 test = status and (payload != drbd_helper)
2207 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2208 "wrong drbd usermode helper: %s", payload)
2210 # compute the DRBD minors
2211 node_drbd = {}
2212 for minor, instance in drbd_map[node].items():
2213 test = instance not in instanceinfo
2214 _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2215 "ghost instance '%s' in temporary DRBD map", instance)
2216 # ghost instance should not be running, but otherwise we
2217 # don't give double warnings (both ghost instance and
2218 # unallocated minor in use)
2219 if test:
2220   node_drbd[minor] = (instance, False)
2221 else:
2222   instance = instanceinfo[instance]
2223 node_drbd[minor] = (instance.name, instance.admin_up)
2225 # and now check them
2226 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2227 test = not isinstance(used_minors, (tuple, list))
2228 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2229 "cannot parse drbd status file: %s", str(used_minors))
2231 # we cannot check drbd status
2234 for minor, (iname, must_exist) in node_drbd.items():
2235 test = minor not in used_minors and must_exist
2236 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2237 "drbd minor %d of instance %s is not active", minor, iname)
2238 for minor in used_minors:
2239 test = minor not in node_drbd
2240 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2241 "unallocated drbd minor %d is in use", minor)
2243 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2244 """Builds the node OS structures.
2246 @type ninfo: L{objects.Node}
2247 @param ninfo: the node to check
2248 @param nresult: the remote results for the node
2249 @param nimg: the node image object
2253 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2255 remote_os = nresult.get(constants.NV_OSLIST, None)
2256 test = (not isinstance(remote_os, list) or
2257 not compat.all(isinstance(v, list) and len(v) == 7
2258 for v in remote_os))
2260 _ErrorIf(test, constants.CV_ENODEOS, node,
2261 "node hasn't returned valid OS data")
2270 for (name, os_path, status, diagnose,
2271 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2273 if name not in os_dict:
2274   os_dict[name] = []
2276 # parameters is a list of lists instead of list of tuples due to
2277 # JSON lacking a real tuple type, fix it:
2278 parameters = [tuple(v) for v in parameters]
2279 os_dict[name].append((os_path, status, diagnose,
2280 set(variants), set(parameters), set(api_ver)))
2282 nimg.oslist = os_dict
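# After this point nimg.oslist maps each OS name to a list of (path, status,
# diagnose, variants, parameters, api_versions) tuples, one per search-path
# entry, and only entry [0] is authoritative; later entries are shadowed.
# A sketch of consuming that layout (illustrative helper, unused):
def _ExampleValidOses(oslist):
  # Returns the names of all OSes whose first (authoritative) entry reports
  # a valid status (field index 1, as unpacked in _VerifyNodeOS below).
  return [name for (name, entries) in oslist.items() if entries[0][1]]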
2284 def _VerifyNodeOS(self, ninfo, nimg, base):
2285 """Verifies the node OS list.
2287 @type ninfo: L{objects.Node}
2288 @param ninfo: the node to check
2289 @param nimg: the node image object
2290 @param base: the 'template' node we match against (e.g. from the master)
2294 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2296 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2298 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2299 for os_name, os_data in nimg.oslist.items():
2300 assert os_data, "Empty OS status for OS %s?!" % os_name
2301 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2302 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2303 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2304 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2305 "OS '%s' has multiple entries (first one shadows the rest): %s",
2306 os_name, utils.CommaJoin([v[0] for v in os_data]))
2307 # comparisons with the 'base' image
2308 test = os_name not in base.oslist
2309 _ErrorIf(test, constants.CV_ENODEOS, node,
2310 "Extra OS %s not present on reference node (%s)",
2314 assert base.oslist[os_name], "Base node has empty OS status?"
2315 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2317 # base OS is invalid, skipping
2319 for kind, a, b in [("API version", f_api, b_api),
2320 ("variants list", f_var, b_var),
2321 ("parameters", beautify_params(f_param),
2322 beautify_params(b_param))]:
2323 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2324 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2325 kind, os_name, base.name,
2326 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2328 # check any missing OSes
2329 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2330 _ErrorIf(missing, constants.CV_ENODEOS, node,
2331 "OSes present on reference node %s but missing on this node: %s",
2332 base.name, utils.CommaJoin(missing))
2334 def _VerifyOob(self, ninfo, nresult):
2335 """Verifies out of band functionality of a node.
2337 @type ninfo: L{objects.Node}
2338 @param ninfo: the node to check
2339 @param nresult: the remote results for the node
2343 # We just have to verify the paths on master and/or master candidates
2344 # as the oob helper is invoked on the master
2345 if ((ninfo.master_candidate or ninfo.master_capable) and
2346 constants.NV_OOB_PATHS in nresult):
2347 for path_result in nresult[constants.NV_OOB_PATHS]:
2348 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2350 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2351 """Verifies and updates the node volume data.
2353 This function will update a L{NodeImage}'s internal structures
2354 with data from the remote call.
2356 @type ninfo: L{objects.Node}
2357 @param ninfo: the node to check
2358 @param nresult: the remote results for the node
2359 @param nimg: the node image object
2360 @param vg_name: the configured VG name
2364 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2366 nimg.lvm_fail = True
2367 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2368 if vg_name is None:
2369   pass
2370 elif isinstance(lvdata, basestring):
2371 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2372 utils.SafeEncode(lvdata))
2373 elif not isinstance(lvdata, dict):
2374 _ErrorIf(True, constants.CV_ENODELVM, node,
2375 "rpc call to node failed (lvlist)")
2376 else:
2377   nimg.volumes = lvdata
2378 nimg.lvm_fail = False
2380 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2381 """Verifies and updates the node instance list.
2383 If the listing was successful, then updates this node's instance
2384 list. Otherwise, it marks the RPC call as failed for the instance
2387 @type ninfo: L{objects.Node}
2388 @param ninfo: the node to check
2389 @param nresult: the remote results for the node
2390 @param nimg: the node image object
2393 idata = nresult.get(constants.NV_INSTANCELIST, None)
2394 test = not isinstance(idata, list)
2395 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2396 "rpc call to node failed (instancelist): %s",
2397 utils.SafeEncode(str(idata)))
2398 if test:
2399   nimg.hyp_fail = True
2400 else:
2401   nimg.instances = idata
2403 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2404 """Verifies and computes a node information map
2406 @type ninfo: L{objects.Node}
2407 @param ninfo: the node to check
2408 @param nresult: the remote results for the node
2409 @param nimg: the node image object
2410 @param vg_name: the configured VG name
2414 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2416 # try to read free memory (from the hypervisor)
2417 hv_info = nresult.get(constants.NV_HVINFO, None)
2418 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2419 _ErrorIf(test, constants.CV_ENODEHV, node,
2420 "rpc call to node failed (hvinfo)")
2421 if not test:
2422   try:
2423     nimg.mfree = int(hv_info["memory_free"])
2424 except (ValueError, TypeError):
2425 _ErrorIf(True, constants.CV_ENODERPC, node,
2426 "node returned invalid nodeinfo, check hypervisor")
2428 # FIXME: devise a free space model for file based instances as well
2429 if vg_name is not None:
2430 test = (constants.NV_VGLIST not in nresult or
2431 vg_name not in nresult[constants.NV_VGLIST])
2432 _ErrorIf(test, constants.CV_ENODELVM, node,
2433 "node didn't return data for the volume group '%s'"
2434 " - it is either missing or broken", vg_name)
2435 if not test:
2436   try:
2437     nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2438 except (ValueError, TypeError):
2439 _ErrorIf(True, constants.CV_ENODERPC, node,
2440 "node returned invalid LVM info, check LVM status")
2442 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2443 """Gets per-disk status information for all instances.
2445 @type nodelist: list of strings
2446 @param nodelist: Node names
2447 @type node_image: dict of (name, L{objects.Node})
2448 @param node_image: Node objects
2449 @type instanceinfo: dict of (name, L{objects.Instance})
2450 @param instanceinfo: Instance objects
2451 @rtype: {instance: {node: [(success, payload)]}}
2452 @return: a dictionary of per-instance dictionaries with nodes as
2453 keys and disk information as values; the disk information is a
2454 list of tuples (success, payload)
2457 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2459 node_disks = {}
2460 node_disks_devonly = {}
2461 diskless_instances = set()
2462 diskless = constants.DT_DISKLESS
2464 for nname in nodelist:
2465 node_instances = list(itertools.chain(node_image[nname].pinst,
2466 node_image[nname].sinst))
2467 diskless_instances.update(inst for inst in node_instances
2468 if instanceinfo[inst].disk_template == diskless)
2469 disks = [(inst, disk)
2470 for inst in node_instances
2471 for disk in instanceinfo[inst].disks]
2474 # No need to collect data
2477 node_disks[nname] = disks
2479 # Creating copies as SetDiskID below will modify the objects and that can
2480 # lead to incorrect data returned from nodes
2481 devonly = [dev.Copy() for (_, dev) in disks]
2484 self.cfg.SetDiskID(dev, nname)
2486 node_disks_devonly[nname] = devonly
2488 assert len(node_disks) == len(node_disks_devonly)
2490 # Collect data from all nodes with disks
2491 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2492                                                       node_disks_devonly)
2494 assert len(result) == len(node_disks)
2496 instdisk = {}
2498 for (nname, nres) in result.items():
2499 disks = node_disks[nname]
2502 # No data from this node
2503 data = len(disks) * [(False, "node offline")]
2506 _ErrorIf(msg, constants.CV_ENODERPC, nname,
2507 "while getting disk information: %s", msg)
2509 # No data from this node
2510 data = len(disks) * [(False, msg)]
2513 for idx, i in enumerate(nres.payload):
2514 if isinstance(i, (tuple, list)) and len(i) == 2:
2515   data.append(i)
2516 else:
2517   logging.warning("Invalid result from node %s, entry %d: %s",
2518                   nname, idx, i)
2519 data.append((False, "Invalid result from the remote node"))
2521 for ((inst, _), status) in zip(disks, data):
2522 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2524 # Add empty entries for diskless instances.
2525 for inst in diskless_instances:
2526 assert inst not in instdisk
2529 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2530 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2531 compat.all(isinstance(s, (tuple, list)) and
2532 len(s) == 2 for s in statuses)
2533 for inst, nnames in instdisk.items()
2534 for nname, statuses in nnames.items())
2535 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
2537 return instdisk
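# The instdisk structure returned here nests instance -> node -> list of
# (success, payload) tuples, one tuple per disk index. A sketch of
# flattening it back into per-disk records, i.e. the inverse of the
# grouping performed above (illustrative helper, unused):
def _ExampleFlattenInstdisk(instdisk):
  # Yields (instance, node, disk index, success, payload) tuples.
  for (inst, by_node) in instdisk.items():
    for (node, statuses) in by_node.items():
      for (idx, (success, payload)) in enumerate(statuses):
        yield (inst, node, idx, success, payload)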
2539 @staticmethod
2540 def _SshNodeSelector(group_uuid, all_nodes):
2541 """Create endless iterators for all potential SSH check hosts.
2544 nodes = [node for node in all_nodes
2545 if (node.group != group_uuid and
2546     not node.offline)]
2547 keyfunc = operator.attrgetter("group")
2549 return map(itertools.cycle,
2550 [sorted(map(operator.attrgetter("name"), names))
2551 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2552                                   keyfunc)])
2554 @classmethod
2555 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2556 """Choose which nodes should talk to which other nodes.
2558 We will make nodes contact all nodes in their group, and one node from
2559 every other group.
2561 @warning: This algorithm has a known issue if one node group is much
2562 smaller than others (e.g. just one node). In such a case all other
2563 nodes will talk to the single node.
2566 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2567 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2569 return (online_nodes,
2570 dict((name, sorted([i.next() for i in sel]))
2571 for name in online_nodes))
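# Taken together, the two helpers above give every online node its whole
# own group plus one node from each other group, advancing the per-group
# itertools.cycle iterators so consecutive nodes get different remote
# targets. A condensed sketch of the rotation over plain name lists
# (illustrative; the real selector works on node objects):
def _ExampleSshTargets(online_nodes, nodes_by_other_group):
  # nodes_by_other_group: dict of group -> sorted list of candidate names.
  cyclers = [itertools.cycle(names)
             for (_, names) in sorted(nodes_by_other_group.items())]
  return dict((name, sorted(c.next() for c in cyclers))
              for name in online_nodes)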
2573 def BuildHooksEnv(self):
2574 """Build hooks env.
2576 Cluster-Verify hooks are run only in the post phase; if they fail, their
2577 output is logged in the verify output and the verification fails.
2581 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2584 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2585 for node in self.my_node_info.values())
2587 return env
2589 def BuildHooksNodes(self):
2590 """Build hooks nodes.
2593 return ([], self.my_node_names)
2595 def Exec(self, feedback_fn):
2596 """Verify integrity of the node group, performing various test on nodes.
2599 # This method has too many local variables. pylint: disable=R0914
2600 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2602 if not self.my_node_names:
2604 feedback_fn("* Empty node group, skipping verification")
2608 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2609 verbose = self.op.verbose
2610 self._feedback_fn = feedback_fn
2612 vg_name = self.cfg.GetVGName()
2613 drbd_helper = self.cfg.GetDRBDHelper()
2614 cluster = self.cfg.GetClusterInfo()
2615 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2616 hypervisors = cluster.enabled_hypervisors
2617 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2619 i_non_redundant = [] # Non redundant instances
2620 i_non_a_balanced = [] # Non auto-balanced instances
2621 n_offline = 0 # Count of offline nodes
2622 n_drained = 0 # Count of nodes being drained
2623 node_vol_should = {}
2625 # FIXME: verify OS list
2628 filemap = _ComputeAncillaryFiles(cluster, False)
2630 # do local checksums
2631 master_node = self.master_node = self.cfg.GetMasterNode()
2632 master_ip = self.cfg.GetMasterIP()
2634 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2636 node_verify_param = {
2637 constants.NV_FILELIST:
2638 utils.UniqueSequence(filename
2639 for files in filemap
2640 for filename in files),
2641 constants.NV_NODELIST:
2642 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2643 self.all_node_info.values()),
2644 constants.NV_HYPERVISOR: hypervisors,
2645 constants.NV_HVPARAMS:
2646 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2647 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2648 for node in node_data_list
2649 if not node.offline],
2650 constants.NV_INSTANCELIST: hypervisors,
2651 constants.NV_VERSION: None,
2652 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2653 constants.NV_NODESETUP: None,
2654 constants.NV_TIME: None,
2655 constants.NV_MASTERIP: (master_node, master_ip),
2656 constants.NV_OSLIST: None,
2657 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2660 if vg_name is not None:
2661 node_verify_param[constants.NV_VGLIST] = None
2662 node_verify_param[constants.NV_LVLIST] = vg_name
2663 node_verify_param[constants.NV_PVLIST] = [vg_name]
2664 node_verify_param[constants.NV_DRBDLIST] = None
2667 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2670 # FIXME: this needs to be changed per node-group, not cluster-wide
2671 bridges = set()
2672 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2673 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2674 bridges.add(default_nicpp[constants.NIC_LINK])
2675 for instance in self.my_inst_info.values():
2676 for nic in instance.nics:
2677 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2678 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2679 bridges.add(full_nic[constants.NIC_LINK])
2682 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2684 # Build our expected cluster state
2685 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2686                                              name=node.name,
2687 vm_capable=node.vm_capable))
2688 for node in node_data_list)
2691 oob_paths = []
2692 for node in self.all_node_info.values():
2693 path = _SupportsOob(self.cfg, node)
2694 if path and path not in oob_paths:
2695 oob_paths.append(path)
2698 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2700 for instance in self.my_inst_names:
2701 inst_config = self.my_inst_info[instance]
2703 for nname in inst_config.all_nodes:
2704 if nname not in node_image:
2705 gnode = self.NodeImage(name=nname)
2706 gnode.ghost = (nname not in self.all_node_info)
2707 node_image[nname] = gnode
2709 inst_config.MapLVsByNode(node_vol_should)
2711 pnode = inst_config.primary_node
2712 node_image[pnode].pinst.append(instance)
2714 for snode in inst_config.secondary_nodes:
2715 nimg = node_image[snode]
2716 nimg.sinst.append(instance)
2717 if pnode not in nimg.sbp:
2718 nimg.sbp[pnode] = []
2719 nimg.sbp[pnode].append(instance)
2721 # At this point, we have the in-memory data structures complete,
2722 # except for the runtime information, which we'll gather next
2724 # Due to the way our RPC system works, exact response times cannot be
2725 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2726 # time before and after executing the request, we can at least have a time
2728 nvinfo_starttime = time.time()
2729 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2730                                        node_verify_param,
2731 self.cfg.GetClusterName())
2732 nvinfo_endtime = time.time()
2734 if self.extra_lv_nodes and vg_name is not None:
2735   extra_lv_nvinfo = \
2736       self.rpc.call_node_verify(self.extra_lv_nodes,
2737                                 {constants.NV_LVLIST: vg_name},
2738                                 self.cfg.GetClusterName())
2739 else:
2740   extra_lv_nvinfo = {}
2742 all_drbd_map = self.cfg.ComputeDRBDMap()
2744 feedback_fn("* Gathering disk information (%s nodes)" %
2745 len(self.my_node_names))
2746 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2747                                  self.my_inst_info)
2749 feedback_fn("* Verifying configuration file consistency")
2751 # If not all nodes are being checked, we need to make sure the master node
2752 # and a non-checked vm_capable node are in the list.
2753 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2755 vf_nvinfo = all_nvinfo.copy()
2756 vf_node_info = list(self.my_node_info.values())
2757 additional_nodes = []
2758 if master_node not in self.my_node_info:
2759 additional_nodes.append(master_node)
2760 vf_node_info.append(self.all_node_info[master_node])
2761 # Add the first vm_capable node we find which is not included
2762 for node in absent_nodes:
2763 nodeinfo = self.all_node_info[node]
2764 if nodeinfo.vm_capable and not nodeinfo.offline:
2765 additional_nodes.append(node)
2766 vf_node_info.append(self.all_node_info[node])
2768 key = constants.NV_FILELIST
2769 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2770 {key: node_verify_param[key]},
2771 self.cfg.GetClusterName()))
2773 vf_nvinfo = all_nvinfo
2774 vf_node_info = self.my_node_info.values()
2776 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2778 feedback_fn("* Verifying node status")
2782 for node_i in node_data_list:
2783 node = node_i.name
2784 nimg = node_image[node]
2788 feedback_fn("* Skipping offline node %s" % (node,))
2792 if node == master_node:
2793   ntype = "master"
2794 elif node_i.master_candidate:
2795   ntype = "master candidate"
2796 elif node_i.drained:
2797   ntype = "drained"
2798   n_drained += 1
2799 else:
2800   ntype = "regular"
2801 if verbose:
2802   feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2804 msg = all_nvinfo[node].fail_msg
2805 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
2808 nimg.rpc_fail = True
2811 nresult = all_nvinfo[node].payload
2813 nimg.call_ok = self._VerifyNode(node_i, nresult)
2814 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2815 self._VerifyNodeNetwork(node_i, nresult)
2816 self._VerifyOob(node_i, nresult)
2819 self._VerifyNodeLVM(node_i, nresult, vg_name)
2820 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2821                      all_drbd_map)
2823 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2824 self._UpdateNodeInstances(node_i, nresult, nimg)
2825 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2826 self._UpdateNodeOS(node_i, nresult, nimg)
2828 if not nimg.os_fail:
2829 if refos_img is None:
2830   refos_img = nimg
2831 self._VerifyNodeOS(node_i, nimg, refos_img)
2832 self._VerifyNodeBridges(node_i, nresult, bridges)
2834 # Check whether all running instances are primary for the node. (This
2835 # can no longer be done from _VerifyInstance below, since some of the
2836 # wrong instances could be from other node groups.)
2837 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2839 for inst in non_primary_inst:
2840 test = inst in self.all_inst_info
2841 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
2842 "instance should not run on node %s", node_i.name)
2843 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2844 "node is running unknown instance %s", inst)
2846 for node, result in extra_lv_nvinfo.items():
2847 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2848 node_image[node], vg_name)
2850 feedback_fn("* Verifying instance status")
2851 for instance in self.my_inst_names:
2853 feedback_fn("* Verifying instance %s" % instance)
2854 inst_config = self.my_inst_info[instance]
2855 self._VerifyInstance(instance, inst_config, node_image,
2856                      instdisk[instance])
2857 inst_nodes_offline = []
2859 pnode = inst_config.primary_node
2860 pnode_img = node_image[pnode]
2861 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2862 constants.CV_ENODERPC, pnode, "instance %s, connection to"
2863 " primary node failed", instance)
2865 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2866 constants.CV_EINSTANCEBADNODE, instance,
2867 "instance is marked as running and lives on offline node %s",
2868 inst_config.primary_node)
2870 # If the instance is non-redundant we cannot survive losing its primary
2871 # node, so we are not N+1 compliant. On the other hand we have no disk
2872 # templates with more than one secondary so that situation is not well
2873 # supported either.
2874 # FIXME: does not support file-backed instances
2875 if not inst_config.secondary_nodes:
2876 i_non_redundant.append(instance)
2878 _ErrorIf(len(inst_config.secondary_nodes) > 1,
2879 constants.CV_EINSTANCELAYOUT,
2880 instance, "instance has multiple secondary nodes: %s",
2881 utils.CommaJoin(inst_config.secondary_nodes),
2882 code=self.ETYPE_WARNING)
2884 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2885 pnode = inst_config.primary_node
2886 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2887 instance_groups = {}
2889 for node in instance_nodes:
2890 instance_groups.setdefault(self.all_node_info[node].group,
2891                            []).append(node)
2893 pretty_list = [
2894 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2895 # Sort so that we always list the primary node first.
2896 for group, nodes in sorted(instance_groups.items(),
2897 key=lambda (_, nodes): pnode in nodes,
2898 reverse=True)]
2900 self._ErrorIf(len(instance_groups) > 1,
2901 constants.CV_EINSTANCESPLITGROUPS,
2902 instance, "instance has primary and secondary nodes in"
2903 " different groups: %s", utils.CommaJoin(pretty_list),
2904 code=self.ETYPE_WARNING)
2906 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2907 i_non_a_balanced.append(instance)
2909 for snode in inst_config.secondary_nodes:
2910 s_img = node_image[snode]
2911 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
2912 snode, "instance %s, connection to secondary node failed",
2913 instance)
2915 if s_img.offline:
2916 inst_nodes_offline.append(snode)
2918 # warn that the instance lives on offline nodes
2919 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
2920 "instance has offline secondary node(s) %s",
2921 utils.CommaJoin(inst_nodes_offline))
2922 # ... or ghost/non-vm_capable nodes
2923 for node in inst_config.all_nodes:
2924 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
2925 instance, "instance lives on ghost node %s", node)
2926 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
2927 instance, "instance lives on non-vm_capable node %s", node)
2929 feedback_fn("* Verifying orphan volumes")
2930 reserved = utils.FieldSet(*cluster.reserved_lvs)
2932 # We will get spurious "unknown volume" warnings if any node of this group
2933 # is secondary for an instance whose primary is in another group. To avoid
2934 # them, we find these instances and add their volumes to node_vol_should.
2935 for inst in self.all_inst_info.values():
2936 for secondary in inst.secondary_nodes:
2937 if (secondary in self.my_node_info
2938 and inst.name not in self.my_inst_info):
2939 inst.MapLVsByNode(node_vol_should)
2942 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2944 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2945 feedback_fn("* Verifying N+1 Memory redundancy")
2946 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2948 feedback_fn("* Other Notes")
2950 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2951 % len(i_non_redundant))
2953 if i_non_a_balanced:
2954 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2955 % len(i_non_a_balanced))
2958 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2961 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2965 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2966 """Analyze the post-hooks' result
2968 This method analyses the hook result, handles it, and sends some
2969 nicely-formatted feedback back to the user.
2971 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2972 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2973 @param hooks_results: the results of the multi-node hooks rpc call
2974 @param feedback_fn: function used to send feedback back to the caller
2975 @param lu_result: previous Exec result
2976 @return: the new Exec result, based on the previous result
2980 # We only really run POST phase hooks, only for non-empty groups,
2981 # and are only interested in their results
2982 if not self.my_node_names:
2985 elif phase == constants.HOOKS_PHASE_POST:
2986 # Used to change hooks' output to proper indentation
2987 feedback_fn("* Hooks Results")
2988 assert hooks_results, "invalid result from hooks"
2990 for node_name in hooks_results:
2991 res = hooks_results[node_name]
2992 msg = res.fail_msg
2993 test = msg and not res.offline
2994 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2995 "Communication failure in hooks execution: %s", msg)
2996 if res.offline or msg:
2997 # No need to investigate payload if node is offline or gave
3000 for script, hkr, output in res.payload:
3001 test = hkr == constants.HKR_FAIL
3002 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3003 "Script %s failed, output:", script)
3004 if test:
3005   output = self._HOOKS_INDENT_RE.sub(" ", output)
3006   feedback_fn("%s" % output)
3007   lu_result = False
3009 return lu_result
3012 class LUClusterVerifyDisks(NoHooksLU):
3013 """Verifies the cluster disks status.
3018 def ExpandNames(self):
3019 self.share_locks = _ShareAll()
3020 self.needed_locks = {
3021 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3024 def Exec(self, feedback_fn):
3025 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3027 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3028 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3029 for group in group_names])
3032 class LUGroupVerifyDisks(NoHooksLU):
3033 """Verifies the status of all disks in a node group.
3038 def ExpandNames(self):
3039 # Raises errors.OpPrereqError on its own if group can't be found
3040 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3042 self.share_locks = _ShareAll()
3043 self.needed_locks = {
3044 locking.LEVEL_INSTANCE: [],
3045 locking.LEVEL_NODEGROUP: [],
3046 locking.LEVEL_NODE: [],
3049 def DeclareLocks(self, level):
3050 if level == locking.LEVEL_INSTANCE:
3051 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3053 # Lock instances optimistically, needs verification once node and group
3054 # locks have been acquired
3055 self.needed_locks[locking.LEVEL_INSTANCE] = \
3056 self.cfg.GetNodeGroupInstances(self.group_uuid)
3058 elif level == locking.LEVEL_NODEGROUP:
3059 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3061 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3062 set([self.group_uuid] +
3063 # Lock all groups used by instances optimistically; this requires
3064 # going via the node before it's locked, requiring verification
3065 # later on
3066 [group_uuid
3067  for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3068 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3070 elif level == locking.LEVEL_NODE:
3071 # This will only lock the nodes in the group to be verified which contain
3073 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3074 self._LockInstancesNodes()
3076 # Lock all nodes in group to be verified
3077 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3078 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3079 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3081 def CheckPrereq(self):
3082 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3083 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3084 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3086 assert self.group_uuid in owned_groups
3088 # Check if locked instances are still correct
3089 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3091 # Get instance information
3092 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3094 # Check if node groups for locked instances are still correct
3095 for (instance_name, inst) in self.instances.items():
3096 assert owned_nodes.issuperset(inst.all_nodes), \
3097 "Instance %s's nodes changed while we kept the lock" % instance_name
3099 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3100                                        owned_groups)
3102 assert self.group_uuid in inst_groups, \
3103 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3105 def Exec(self, feedback_fn):
3106 """Verify integrity of cluster disks.
3108 @rtype: tuple of three items
3109 @return: a tuple of (dict of node-to-node_error, list of instances
3110 which need activate-disks, dict of instance: (node, volume) for
3114 res_nodes = {}
3115 res_instances = set()
3116 res_missing = {}
3118 nv_dict = _MapInstanceDisksToNodes([inst
3119 for inst in self.instances.values()
3123 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3124 set(self.cfg.GetVmCapableNodeList()))
3126 node_lvs = self.rpc.call_lv_list(nodes, [])
3128 for (node, node_res) in node_lvs.items():
3129 if node_res.offline:
3132 msg = node_res.fail_msg
3134 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3135 res_nodes[node] = msg
3138 for lv_name, (_, _, lv_online) in node_res.payload.items():
3139 inst = nv_dict.pop((node, lv_name), None)
3140 if not (lv_online or inst is None):
3141 res_instances.add(inst)
3143 # any leftover items in nv_dict are missing LVs, let's arrange the data
3145 for key, inst in nv_dict.iteritems():
3146 res_missing.setdefault(inst, []).append(key)
3148 return (res_nodes, list(res_instances), res_missing)
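# nv_dict starts out mapping (node, lv_name) -> instance for every LV the
# configuration expects; each LV actually reported by a node is popped, so
# whatever is left is, by construction, missing. A sketch of that
# bookkeeping with plain containers (illustrative helper, unused):
def _ExampleMissingLvs(expected, reported):
  # expected: dict of (node, lv_name) -> instance; reported: set of
  # (node, lv_name) pairs seen on the nodes. Returns instance -> missing.
  missing = {}
  for (key, inst) in expected.items():
    if key not in reported:
      missing.setdefault(inst, []).append(key)
  return missing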
3151 class LUClusterRepairDiskSizes(NoHooksLU):
3152 """Verifies the cluster disks sizes.
3157 def ExpandNames(self):
3158 if self.op.instances:
3159 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3160 self.needed_locks = {
3161 locking.LEVEL_NODE: [],
3162 locking.LEVEL_INSTANCE: self.wanted_names,
3164 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3165 else:
3166   self.wanted_names = None
3167 self.needed_locks = {
3168 locking.LEVEL_NODE: locking.ALL_SET,
3169 locking.LEVEL_INSTANCE: locking.ALL_SET,
3171 self.share_locks = _ShareAll()
3173 def DeclareLocks(self, level):
3174 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3175 self._LockInstancesNodes(primary_only=True)
3177 def CheckPrereq(self):
3178 """Check prerequisites.
3180 This only checks the optional instance list against the existing names.
3183 if self.wanted_names is None:
3184 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3186 self.wanted_instances = \
3187 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3189 def _EnsureChildSizes(self, disk):
3190 """Ensure children of the disk have the needed disk size.
3192 This is valid mainly for DRBD8 and fixes an issue where the
3193 children have smaller disk size.
3195 @param disk: an L{ganeti.objects.Disk} object
3198 if disk.dev_type == constants.LD_DRBD8:
3199 assert disk.children, "Empty children for DRBD8?"
3200 fchild = disk.children[0]
3201 mismatch = fchild.size < disk.size
3203 self.LogInfo("Child disk has size %d, parent %d, fixing",
3204 fchild.size, disk.size)
3205 fchild.size = disk.size
3207 # and we recurse on this child only, not on the metadev
3208 return self._EnsureChildSizes(fchild) or mismatch
3209 else:
3210   return False
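# For DRBD8 the relevant child is the first one (the data device); the
# method grows it to the parent's size and recurses in case that child is
# itself a DRBD device with children of its own. A non-recursive sketch of
# the invariant being enforced (illustrative, pure data):
def _ExampleChildTooSmall(parent_size, child_sizes):
  # True when a first (data) child exists and is smaller than its parent.
  return bool(child_sizes) and child_sizes[0] < parent_size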
3212 def Exec(self, feedback_fn):
3213 """Verify the size of cluster disks.
3216 # TODO: check child disks too
3217 # TODO: check differences in size between primary/secondary nodes
3218 per_node_disks = {}
3219 for instance in self.wanted_instances:
3220 pnode = instance.primary_node
3221 if pnode not in per_node_disks:
3222 per_node_disks[pnode] = []
3223 for idx, disk in enumerate(instance.disks):
3224 per_node_disks[pnode].append((instance, idx, disk))
3226 changed = []
3227 for node, dskl in per_node_disks.items():
3228 newl = [v[2].Copy() for v in dskl]
3230 self.cfg.SetDiskID(dsk, node)
3231 result = self.rpc.call_blockdev_getsize(node, newl)
3233 self.LogWarning("Failure in blockdev_getsize call to node"
3234 " %s, ignoring", node)
3236 if len(result.payload) != len(dskl):
3237 logging.warning("Invalid result from node %s: len(dskl)=%d,"
3238                 " result.payload=%s", node, len(dskl), result.payload)
3239 self.LogWarning("Invalid result from node %s, ignoring node results",
3240                 node)
3241 continue
3242 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3244 self.LogWarning("Disk %d of instance %s did not return size"
3245 " information, ignoring", idx, instance.name)
3247 if not isinstance(size, (int, long)):
3248 self.LogWarning("Disk %d of instance %s did not return valid"
3249 " size information, ignoring", idx, instance.name)
3252 if size != disk.size:
3253 self.LogInfo("Disk %d of instance %s has mismatched size,"
3254 " correcting: recorded %d, actual %d", idx,
3255 instance.name, disk.size, size)
3256 disk.size = size
3257 self.cfg.Update(instance, feedback_fn)
3258 changed.append((instance.name, idx, size))
3259 if self._EnsureChildSizes(disk):
3260 self.cfg.Update(instance, feedback_fn)
3261 changed.append((instance.name, idx, disk.size))
3263 return changed
3265 class LUClusterRename(LogicalUnit):
3266 """Rename the cluster.
3269 HPATH = "cluster-rename"
3270 HTYPE = constants.HTYPE_CLUSTER
3272 def BuildHooksEnv(self):
3277 "OP_TARGET": self.cfg.GetClusterName(),
3278 "NEW_NAME": self.op.name,
3281 def BuildHooksNodes(self):
3282 """Build hooks nodes.
3285 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3287 def CheckPrereq(self):
3288 """Verify that the passed name is a valid one.
3291 hostname = netutils.GetHostname(name=self.op.name,
3292 family=self.cfg.GetPrimaryIPFamily())
3294 new_name = hostname.name
3295 self.ip = new_ip = hostname.ip
3296 old_name = self.cfg.GetClusterName()
3297 old_ip = self.cfg.GetMasterIP()
3298 if new_name == old_name and new_ip == old_ip:
3299 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3300 " cluster has changed",
3302 if new_ip != old_ip:
3303 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3304 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3305 " reachable on the network" %
3306 new_ip, errors.ECODE_NOTUNIQUE)
3308 self.op.name = new_name
3310 def Exec(self, feedback_fn):
3311 """Rename the cluster.
3314 clustername = self.op.name
3315 ip = self.ip
3317 # shutdown the master IP
3318 master = self.cfg.GetMasterNode()
3319 result = self.rpc.call_node_deactivate_master_ip(master)
3320 result.Raise("Could not disable the master role")
3323 cluster = self.cfg.GetClusterInfo()
3324 cluster.cluster_name = clustername
3325 cluster.master_ip = ip
3326 self.cfg.Update(cluster, feedback_fn)
3328 # update the known hosts file
3329 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3330 node_list = self.cfg.GetOnlineNodeList()
3332 node_list.remove(master)
3335 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3337 result = self.rpc.call_node_activate_master_ip(master)
3338 msg = result.fail_msg
3340 self.LogWarning("Could not re-enable the master role on"
3341 " the master, please restart manually: %s", msg)
3346 def _ValidateNetmask(cfg, netmask):
3347 """Checks if a netmask is valid.
3349 @type cfg: L{config.ConfigWriter}
3350 @param cfg: The cluster configuration
3352 @param netmask: the netmask to be verified
3353 @raise errors.OpPrereqError: if the validation fails
3356 ip_family = cfg.GetPrimaryIPFamily()
3358 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3359 except errors.ProgrammerError:
3360 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3361                            ip_family, errors.ECODE_INVAL)
3362 if not ipcls.ValidateNetmask(netmask):
3363 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3364                            netmask, errors.ECODE_INVAL)
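# For an IPv4 cluster the netmask parameter is a CIDR prefix length, so for
# instance 24 passes and 33 fails. A hedged standalone sketch of an
# equivalent bounds check, independent of netutils (illustrative only):
def _ExampleValidIPv4Prefix(netmask):
  # Valid IPv4 prefix lengths run from 0 to 32 inclusive.
  return isinstance(netmask, int) and 0 <= netmask <= 32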
3367 class LUClusterSetParams(LogicalUnit):
3368 """Change the parameters of the cluster.
3371 HPATH = "cluster-modify"
3372 HTYPE = constants.HTYPE_CLUSTER
3375 def CheckArguments(self):
3379 if self.op.uid_pool:
3380 uidpool.CheckUidPool(self.op.uid_pool)
3382 if self.op.add_uids:
3383 uidpool.CheckUidPool(self.op.add_uids)
3385 if self.op.remove_uids:
3386 uidpool.CheckUidPool(self.op.remove_uids)
3388 if self.op.master_netmask is not None:
3389 _ValidateNetmask(self.cfg, self.op.master_netmask)
3391 def ExpandNames(self):
3392 # FIXME: in the future maybe other cluster params won't require checking on
3393 # all nodes to be modified.
3394 self.needed_locks = {
3395 locking.LEVEL_NODE: locking.ALL_SET,
3397 self.share_locks[locking.LEVEL_NODE] = 1
3399 def BuildHooksEnv(self):
3404 "OP_TARGET": self.cfg.GetClusterName(),
3405 "NEW_VG_NAME": self.op.vg_name,
3408 def BuildHooksNodes(self):
3409 """Build hooks nodes.
3412 mn = self.cfg.GetMasterNode()
3413 return ([mn], [mn])
3415 def CheckPrereq(self):
3416 """Check prerequisites.
3418 This checks whether the given params don't conflict and
3419 if the given volume group is valid.
3422 if self.op.vg_name is not None and not self.op.vg_name:
3423 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3424 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3425 " instances exist", errors.ECODE_INVAL)
3427 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3428 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3429 raise errors.OpPrereqError("Cannot disable drbd helper while"
3430 " drbd-based instances exist",
3433 node_list = self.owned_locks(locking.LEVEL_NODE)
3435 # if vg_name not None, checks given volume group on all nodes
3437 vglist = self.rpc.call_vg_list(node_list)
3438 for node in node_list:
3439 msg = vglist[node].fail_msg
3441 # ignoring down node
3442 self.LogWarning("Error while gathering data on node %s"
3443 " (ignoring node): %s", node, msg)
3445 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3446                                       self.op.vg_name,
3447 constants.MIN_VG_SIZE)
3449 raise errors.OpPrereqError("Error on node '%s': %s" %
3450 (node, vgstatus), errors.ECODE_ENVIRON)
3452 if self.op.drbd_helper:
3453 # checks given drbd helper on all nodes
3454 helpers = self.rpc.call_drbd_helper(node_list)
3455 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3457 self.LogInfo("Not checking drbd helper on offline node %s", node)
3459 msg = helpers[node].fail_msg
3461 raise errors.OpPrereqError("Error checking drbd helper on node"
3462 " '%s': %s" % (node, msg),
3463 errors.ECODE_ENVIRON)
3464 node_helper = helpers[node].payload
3465 if node_helper != self.op.drbd_helper:
3466 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3467 (node, node_helper), errors.ECODE_ENVIRON)
3469 self.cluster = cluster = self.cfg.GetClusterInfo()
3470 # validate params changes
3471 if self.op.beparams:
3472 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3473 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3475 if self.op.ndparams:
3476 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3477 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3479 # TODO: we need a more general way to handle resetting
3480 # cluster-level parameters to default values
3481 if self.new_ndparams["oob_program"] == "":
3482 self.new_ndparams["oob_program"] = \
3483 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3485 if self.op.nicparams:
3486 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3487 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3488 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3491 # check all instances for consistency
3492 for instance in self.cfg.GetAllInstancesInfo().values():
3493 for nic_idx, nic in enumerate(instance.nics):
3494 params_copy = copy.deepcopy(nic.nicparams)
3495 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3497 # check parameter syntax
3499 objects.NIC.CheckParameterSyntax(params_filled)
3500 except errors.ConfigurationError, err:
3501 nic_errors.append("Instance %s, nic/%d: %s" %
3502 (instance.name, nic_idx, err))
3504 # if we're moving instances to routed, check that they have an ip
3505 target_mode = params_filled[constants.NIC_MODE]
3506 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3507 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3508 " address" % (instance.name, nic_idx))
3509 if nic_errors:
3510   raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3511 "\n".join(nic_errors))
3513 # hypervisor list/parameters
3514 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3515 if self.op.hvparams:
3516 for hv_name, hv_dict in self.op.hvparams.items():
3517 if hv_name not in self.new_hvparams:
3518 self.new_hvparams[hv_name] = hv_dict
3519 else:
3520   self.new_hvparams[hv_name].update(hv_dict)
3522 # os hypervisor parameters
3523 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3525 for os_name, hvs in self.op.os_hvp.items():
3526 if os_name not in self.new_os_hvp:
3527 self.new_os_hvp[os_name] = hvs
3528 else:
3529   for hv_name, hv_dict in hvs.items():
3530 if hv_name not in self.new_os_hvp[os_name]:
3531 self.new_os_hvp[os_name][hv_name] = hv_dict
3532 else:
3533   self.new_os_hvp[os_name][hv_name].update(hv_dict)
3536 self.new_osp = objects.FillDict(cluster.osparams, {})
3537 if self.op.osparams:
3538 for os_name, osp in self.op.osparams.items():
3539 if os_name not in self.new_osp:
3540 self.new_osp[os_name] = {}
3542 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3543                                           use_none=True)
3545 if not self.new_osp[os_name]:
3546 # we removed all parameters
3547 del self.new_osp[os_name]
3549 # check the parameter validity (remote check)
3550 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3551 os_name, self.new_osp[os_name])
3553 # changes to the hypervisor list
3554 if self.op.enabled_hypervisors is not None:
3555 self.hv_list = self.op.enabled_hypervisors
3556 for hv in self.hv_list:
3557 # if the hypervisor doesn't already exist in the cluster
3558 # hvparams, we initialize it to empty, and then (in both
3559 # cases) we make sure to fill the defaults, as we might not
3560 # have a complete defaults list if the hypervisor wasn't
3562 if hv not in new_hvp:
3564 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3565 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3566 else:
3567   self.hv_list = cluster.enabled_hypervisors
3569 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3570 # either the enabled list has changed, or the parameters have, validate
3571 for hv_name, hv_params in self.new_hvparams.items():
3572 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3573 (self.op.enabled_hypervisors and
3574 hv_name in self.op.enabled_hypervisors)):
3575 # either this is a new hypervisor, or its parameters have changed
3576 hv_class = hypervisor.GetHypervisor(hv_name)
3577 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3578 hv_class.CheckParameterSyntax(hv_params)
3579 _CheckHVParams(self, node_list, hv_name, hv_params)
3582 # no need to check any newly-enabled hypervisors, since the
3583 # defaults have already been checked in the above code-block
3584 for os_name, os_hvp in self.new_os_hvp.items():
3585 for hv_name, hv_params in os_hvp.items():
3586 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3587 # we need to fill in the new os_hvp on top of the actual hv_p
3588 cluster_defaults = self.new_hvparams.get(hv_name, {})
3589 new_osp = objects.FillDict(cluster_defaults, hv_params)
3590 hv_class = hypervisor.GetHypervisor(hv_name)
3591 hv_class.CheckParameterSyntax(new_osp)
3592 _CheckHVParams(self, node_list, hv_name, new_osp)
3594 if self.op.default_iallocator:
3595 alloc_script = utils.FindFile(self.op.default_iallocator,
3596 constants.IALLOCATOR_SEARCH_PATH,
3597 os.path.isfile)
3598 if alloc_script is None:
3599 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3600 " specified" % self.op.default_iallocator,
3603 def Exec(self, feedback_fn):
3604 """Change the parameters of the cluster.
3607 if self.op.vg_name is not None:
3608 new_volume = self.op.vg_name
3611 if new_volume != self.cfg.GetVGName():
3612 self.cfg.SetVGName(new_volume)
3614 feedback_fn("Cluster LVM configuration already in desired"
3615 " state, not changing")
3616 if self.op.drbd_helper is not None:
3617 new_helper = self.op.drbd_helper
3620 if new_helper != self.cfg.GetDRBDHelper():
3621 self.cfg.SetDRBDHelper(new_helper)
3623 feedback_fn("Cluster DRBD helper already in desired state,"
3625 if self.op.hvparams:
3626 self.cluster.hvparams = self.new_hvparams
3628 self.cluster.os_hvp = self.new_os_hvp
3629 if self.op.enabled_hypervisors is not None:
3630 self.cluster.hvparams = self.new_hvparams
3631 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3632 if self.op.beparams:
3633 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3634 if self.op.nicparams:
3635 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3636 if self.op.osparams:
3637 self.cluster.osparams = self.new_osp
3638 if self.op.ndparams:
3639 self.cluster.ndparams = self.new_ndparams
3641 if self.op.candidate_pool_size is not None:
3642 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3643 # we need to update the pool size here, otherwise the save will fail
3644 _AdjustCandidatePool(self, [])
3646 if self.op.maintain_node_health is not None:
3647 self.cluster.maintain_node_health = self.op.maintain_node_health
3649 if self.op.prealloc_wipe_disks is not None:
3650 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3652 if self.op.add_uids is not None:
3653 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3655 if self.op.remove_uids is not None:
3656 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3658 if self.op.uid_pool is not None:
3659 self.cluster.uid_pool = self.op.uid_pool
3661 if self.op.default_iallocator is not None:
3662 self.cluster.default_iallocator = self.op.default_iallocator
3664 if self.op.reserved_lvs is not None:
3665 self.cluster.reserved_lvs = self.op.reserved_lvs
3667 def helper_os(aname, mods, desc):
3669 lst = getattr(self.cluster, aname)
3670 for key, val in mods:
3671 if key == constants.DDM_ADD:
3673 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3676 elif key == constants.DDM_REMOVE:
3680 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3682 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3684 if self.op.hidden_os:
3685 helper_os("hidden_os", self.op.hidden_os, "hidden")
3687 if self.op.blacklisted_os:
3688 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3690 if self.op.master_netdev:
3691 master = self.cfg.GetMasterNode()
3692 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3693 self.cluster.master_netdev)
3694 result = self.rpc.call_node_deactivate_master_ip(master)
3695 result.Raise("Could not disable the master ip")
3696 feedback_fn("Changing master_netdev from %s to %s" %
3697 (self.cluster.master_netdev, self.op.master_netdev))
3698 self.cluster.master_netdev = self.op.master_netdev
3700 if self.op.master_netmask:
3701 master = self.cfg.GetMasterNode()
3702 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
3703 result = self.rpc.call_node_change_master_netmask(master,
3704 self.op.master_netmask)
3706 msg = "Could not change the master IP netmask: %s" % result.fail_msg
3707 self.LogWarning(msg)
3710 self.cluster.master_netmask = self.op.master_netmask
3712 self.cfg.Update(self.cluster, feedback_fn)
3714 if self.op.master_netdev:
3715 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3716 self.op.master_netdev)
3717 result = self.rpc.call_node_activate_master_ip(master)
3718 if result.fail_msg:
3719   self.LogWarning("Could not re-enable the master ip on"
3720                   " the master, please restart manually: %s",
3721                   result.fail_msg)
3724 def _UploadHelper(lu, nodes, fname):
3725 """Helper for uploading a file and showing warnings.
3728 if os.path.exists(fname):
3729 result = lu.rpc.call_upload_file(nodes, fname)
3730 for to_node, to_result in result.items():
3731 msg = to_result.fail_msg
3732 if msg:
3733 msg = ("Copy of file %s to node %s failed: %s" %
3734 (fname, to_node, msg))
3735 lu.proc.LogWarning(msg)
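# Illustrative usage sketch (not part of the original module; the node names
# are hypothetical): pushing a single ancillary file to a set of nodes.
# Failures surface as warnings only and do not abort the calling LU:
#
#   _UploadHelper(lu, ["node2.example.com", "node3.example.com"],
#                 constants.ETC_HOSTS)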
3738 def _ComputeAncillaryFiles(cluster, redist):
3739 """Compute files external to Ganeti which need to be consistent.
3741 @type redist: boolean
3742 @param redist: Whether to include files which need to be redistributed
3745 # Compute files for all nodes
3746 files_all = set([
3747 constants.SSH_KNOWN_HOSTS_FILE,
3748 constants.CONFD_HMAC_KEY,
3749 constants.CLUSTER_DOMAIN_SECRET_FILE,
3750 ])
3752 if not redist:
3753 files_all.update(constants.ALL_CERT_FILES)
3754 files_all.update(ssconf.SimpleStore().GetFileList())
3755 else:
3756 # we need to ship at least the RAPI certificate
3757 files_all.add(constants.RAPI_CERT_FILE)
3759 if cluster.modify_etc_hosts:
3760 files_all.add(constants.ETC_HOSTS)
3762 # Files which must either exist on all nodes or on none
3763 files_all_opt = set([
3764 constants.RAPI_USERS_FILE,
3765 ])
3767 # Files which should only be on master candidates
3768 files_mc = set()
3769 if not redist:
3770 files_mc.add(constants.CLUSTER_CONF_FILE)
3772 # Files which should only be on VM-capable nodes
3773 files_vm = set(filename
3774 for hv_name in cluster.enabled_hypervisors
3775 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles())
3777 # Filenames must be unique
3778 assert (len(files_all | files_all_opt | files_mc | files_vm) ==
3779 sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
3780 "Found file listed in more than one file list"
3782 return (files_all, files_all_opt, files_mc, files_vm)
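# Illustrative sketch (not part of the original module): how the returned
# tuple is typically unpacked by callers; the exact file sets depend on the
# cluster configuration:
#
#   (files_all, files_all_opt, files_mc, files_vm) = \
#     _ComputeAncillaryFiles(cluster, True)
#   # files_all:     shipped everywhere (known_hosts, HMAC key, RAPI cert, ...)
#   # files_all_opt: must exist on all nodes or on none (e.g. RAPI users file)
#   # files_mc:      empty when redist=True, as the config is handled elsewhere
#   # files_vm:      per-hypervisor ancillary files for VM-capable nodes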
3785 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3786 """Distribute additional files which are part of the cluster configuration.
3788 ConfigWriter takes care of distributing the config and ssconf files, but
3789 there are more files which should be distributed to all nodes. This function
3790 makes sure those are copied.
3792 @param lu: calling logical unit
3793 @param additional_nodes: list of nodes not in the config to distribute to
3794 @type additional_vm: boolean
3795 @param additional_vm: whether the additional nodes are vm-capable or not
3798 # Gather target nodes
3799 cluster = lu.cfg.GetClusterInfo()
3800 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3802 online_nodes = lu.cfg.GetOnlineNodeList()
3803 vm_nodes = lu.cfg.GetVmCapableNodeList()
3805 if additional_nodes is not None:
3806 online_nodes.extend(additional_nodes)
3808 vm_nodes.extend(additional_nodes)
3810 # Never distribute to master node
3811 for nodelist in [online_nodes, vm_nodes]:
3812 if master_info.name in nodelist:
3813 nodelist.remove(master_info.name)
3816 (files_all, files_all_opt, files_mc, files_vm) = \
3817 _ComputeAncillaryFiles(cluster, True)
3819 # Never re-distribute configuration file from here
3820 assert not (constants.CLUSTER_CONF_FILE in files_all or
3821 constants.CLUSTER_CONF_FILE in files_vm)
3822 assert not files_mc, "Master candidates not handled in this function"
3824 filemap = [
3825 (online_nodes, files_all),
3826 (online_nodes, files_all_opt),
3827 (vm_nodes, files_vm),
3828 ]
3830 # Upload the files
3831 for (node_list, files) in filemap:
3832 for fname in files:
3833 _UploadHelper(lu, node_list, fname)
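# Illustrative usage sketch (not part of the original module; the node name
# is hypothetical): LUs call this after membership or parameter changes, e.g.:
#
#   _RedistributeAncillaryFiles(lu)  # refresh all existing nodes
#   _RedistributeAncillaryFiles(lu, additional_nodes=["node9.example.com"],
#                               additional_vm=False)  # while adding a node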
3836 class LUClusterRedistConf(NoHooksLU):
3837 """Force the redistribution of cluster configuration.
3839 This is a very simple LU.
3844 def ExpandNames(self):
3845 self.needed_locks = {
3846 locking.LEVEL_NODE: locking.ALL_SET,
3847 }
3848 self.share_locks[locking.LEVEL_NODE] = 1
3850 def Exec(self, feedback_fn):
3851 """Redistribute the configuration.
3854 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3855 _RedistributeAncillaryFiles(self)
3858 class LUClusterActivateMasterIp(NoHooksLU):
3859 """Activate the master IP on the master node.
3862 def Exec(self, feedback_fn):
3863 """Activate the master IP.
3866 master = self.cfg.GetMasterNode()
3867 self.rpc.call_node_activate_master_ip(master)
3870 class LUClusterDeactivateMasterIp(NoHooksLU):
3871 """Deactivate the master IP on the master node.
3874 def Exec(self, feedback_fn):
3875 """Deactivate the master IP.
3878 master = self.cfg.GetMasterNode()
3879 self.rpc.call_node_deactivate_master_ip(master)
3882 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3883 """Sleep and poll for an instance's disk to sync.
3886 if not instance.disks or disks is not None and not disks:
3887 return True
3889 disks = _ExpandCheckDisks(instance, disks)
3892 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3894 node = instance.primary_node
3897 lu.cfg.SetDiskID(dev, node)
3899 # TODO: Convert to utils.Retry
3901 retries = 0
3902 degr_retries = 10 # in seconds, as we sleep 1 second each time
3903 while True:
3904 max_time = 0
3905 done = True
3906 cumul_degraded = False
3907 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3908 msg = rstats.fail_msg
3909 if msg:
3910 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3911 retries += 1
3912 if retries >= 10:
3913 raise errors.RemoteError("Can't contact node %s for mirror data,"
3914 " aborting." % node)
3915 time.sleep(6)
3916 continue
3917 rstats = rstats.payload
3918 retries = 0
3919 for i, mstat in enumerate(rstats):
3920 if mstat is None:
3921 lu.LogWarning("Can't compute data for node %s/%s",
3922 node, disks[i].iv_name)
3923 continue
3925 cumul_degraded = (cumul_degraded or
3926 (mstat.is_degraded and mstat.sync_percent is None))
3927 if mstat.sync_percent is not None:
3928 done = False
3929 if mstat.estimated_time is not None:
3930 rem_time = ("%s remaining (estimated)" %
3931 utils.FormatSeconds(mstat.estimated_time))
3932 max_time = mstat.estimated_time
3933 else:
3934 rem_time = "no time estimate"
3935 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3936 (disks[i].iv_name, mstat.sync_percent, rem_time))
3938 # if we're done but degraded, let's do a few small retries, to
3939 # make sure we see a stable and not transient situation; therefore
3940 # we force restart of the loop
3941 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3942 logging.info("Degraded disks found, %d retries left", degr_retries)
3943 degr_retries -= 1
3944 time.sleep(1)
3945 continue
3947 if done or oneshot:
3948 break
3950 time.sleep(min(60, max_time))
3952 if done:
3953 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3954 return not cumul_degraded
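# Illustrative usage sketch (not part of the original module):
#
#   _WaitForSync(lu, instance)                # block until disks are in sync
#   _WaitForSync(lu, instance, oneshot=True)  # poll once (plus the degraded
#                                             # retries) and return the status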
3957 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3958 """Check that mirrors are not degraded.
3960 The ldisk parameter, if True, will change the test from the
3961 is_degraded attribute (which represents overall non-ok status for
3962 the device(s)) to the ldisk (representing the local storage status).
3965 lu.cfg.SetDiskID(dev, node)
3967 result = True
3969 if on_primary or dev.AssembleOnSecondary():
3970 rstats = lu.rpc.call_blockdev_find(node, dev)
3971 msg = rstats.fail_msg
3972 if msg:
3973 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3974 result = False
3975 elif not rstats.payload:
3976 lu.LogWarning("Can't find disk on node %s", node)
3977 result = False
3978 else:
3979 if ldisk:
3980 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
3981 else:
3982 result = result and not rstats.payload.is_degraded
3984 if dev.children:
3985 for child in dev.children:
3986 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
3988 return result
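# Illustrative usage sketch (not part of the original module; the node
# variable is hypothetical): checking a disk's mirror on a secondary node,
# looking only at the local storage status:
#
#   ok = _CheckDiskConsistency(lu, dev, secondary_node, False, ldisk=True)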
3991 class LUOobCommand(NoHooksLU):
3992 """Logical unit for OOB handling.
3996 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
3998 def ExpandNames(self):
3999 """Gather locks we need.
4002 if self.op.node_names:
4003 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4004 lock_names = self.op.node_names
4005 else:
4006 lock_names = locking.ALL_SET
4008 self.needed_locks = {
4009 locking.LEVEL_NODE: lock_names,
4010 }
4012 def CheckPrereq(self):
4013 """Check prerequisites.
4016 - the node exists in the configuration
4019 Any errors are signaled by raising errors.OpPrereqError.
4021 """
4022 self.nodes = []
4023 self.master_node = self.cfg.GetMasterNode()
4025 assert self.op.power_delay >= 0.0
4027 if self.op.node_names:
4028 if (self.op.command in self._SKIP_MASTER and
4029 self.master_node in self.op.node_names):
4030 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4031 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4033 if master_oob_handler:
4034 additional_text = ("run '%s %s %s' if you want to operate on the"
4035 " master regardless") % (master_oob_handler,
4036 self.op.command,
4037 "--ignore-status")
4038 else:
4039 additional_text = "it does not support out-of-band operations"
4041 raise errors.OpPrereqError(("Operating on the master node %s is not"
4042 " allowed for %s; %s") %
4043 (self.master_node, self.op.command,
4044 additional_text), errors.ECODE_INVAL)
4045 else:
4046 self.op.node_names = self.cfg.GetNodeList()
4047 if self.op.command in self._SKIP_MASTER:
4048 self.op.node_names.remove(self.master_node)
4050 if self.op.command in self._SKIP_MASTER:
4051 assert self.master_node not in self.op.node_names
4053 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4055 raise errors.OpPrereqError("Node %s not found" % node_name,
4058 self.nodes.append(node)
4060 if (not self.op.ignore_status and
4061 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4062 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4063 " not marked offline") % node_name,
4066 def Exec(self, feedback_fn):
4067 """Execute OOB and return result if we expect any.
4069 """
4070 master_node = self.master_node
4071 ret = []
4073 for idx, node in enumerate(utils.NiceSort(self.nodes,
4074 key=lambda node: node.name)):
4075 node_entry = [(constants.RS_NORMAL, node.name)]
4076 ret.append(node_entry)
4078 oob_program = _SupportsOob(self.cfg, node)
4080 if not oob_program:
4081 node_entry.append((constants.RS_UNAVAIL, None))
4082 continue
4084 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4085 self.op.command, oob_program, node.name)
4086 result = self.rpc.call_run_oob(master_node, oob_program,
4087 self.op.command, node.name,
4088 self.op.timeout)
4090 if result.fail_msg:
4091 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4092 node.name, result.fail_msg)
4093 node_entry.append((constants.RS_NODATA, None))
4094 else:
4095 try:
4096 self._CheckPayload(result)
4097 except errors.OpExecError, err:
4098 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4099 node.name, err)
4100 node_entry.append((constants.RS_NODATA, None))
4101 else:
4102 if self.op.command == constants.OOB_HEALTH:
4103 # For health we should log important events
4104 for item, status in result.payload:
4105 if status in [constants.OOB_STATUS_WARNING,
4106 constants.OOB_STATUS_CRITICAL]:
4107 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4108 item, node.name, status)
4110 if self.op.command == constants.OOB_POWER_ON:
4111 node.powered = True
4112 elif self.op.command == constants.OOB_POWER_OFF:
4113 node.powered = False
4114 elif self.op.command == constants.OOB_POWER_STATUS:
4115 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4116 if powered != node.powered:
4117 logging.warning(("Recorded power state (%s) of node '%s' does not"
4118 " match actual power state (%s)"), node.powered,
4121 # For configuration changing commands we should update the node
4122 if self.op.command in (constants.OOB_POWER_ON,
4123 constants.OOB_POWER_OFF):
4124 self.cfg.Update(node, feedback_fn)
4126 node_entry.append((constants.RS_NORMAL, result.payload))
4128 if (self.op.command == constants.OOB_POWER_ON and
4129 idx < len(self.nodes) - 1):
4130 time.sleep(self.op.power_delay)
4132 return ret
4134 def _CheckPayload(self, result):
4135 """Checks if the payload is valid.
4137 @param result: RPC result
4138 @raises errors.OpExecError: If payload is not valid
4140 """
4141 errs = []
4142 if self.op.command == constants.OOB_HEALTH:
4143 if not isinstance(result.payload, list):
4144 errs.append("command 'health' is expected to return a list but got %s" %
4145 type(result.payload))
4146 else:
4147 for item, status in result.payload:
4148 if status not in constants.OOB_STATUSES:
4149 errs.append("health item '%s' has invalid status '%s'" %
4152 if self.op.command == constants.OOB_POWER_STATUS:
4153 if not isinstance(result.payload, dict):
4154 errs.append("power-status is expected to return a dict but got %s" %
4155 type(result.payload))
4157 if self.op.command in [
4158 constants.OOB_POWER_ON,
4159 constants.OOB_POWER_OFF,
4160 constants.OOB_POWER_CYCLE,
4161 ]:
4162 if result.payload is not None:
4163 errs.append("%s is expected to not return payload but got '%s'" %
4164 (self.op.command, result.payload))
4166 if errs:
4167 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4168 utils.CommaJoin(errs))
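# Illustrative sketch (not part of the original module): the payload shapes
# accepted by _CheckPayload above, per OOB command:
#
#   health:       [("disk0", constants.OOB_STATUS_OK),
#                  ("fan1", constants.OOB_STATUS_WARNING)]
#   power-status: {constants.OOB_POWER_STATUS_POWERED: True}
#   power-on, power-off, power-cycle: None (no payload expected)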
4171 class _OsQuery(_QueryBase):
4172 FIELDS = query.OS_FIELDS
4174 def ExpandNames(self, lu):
4175 # Lock all nodes in shared mode
4176 # Temporary removal of locks, should be reverted later
4177 # TODO: reintroduce locks when they are lighter-weight
4178 lu.needed_locks = {}
4179 #self.share_locks[locking.LEVEL_NODE] = 1
4180 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4182 # The following variables interact with _QueryBase._GetNames
4183 if self.names:
4184 self.wanted = self.names
4185 else:
4186 self.wanted = locking.ALL_SET
4188 self.do_locking = self.use_locking
4190 def DeclareLocks(self, lu, level):
4191 pass
4193 @staticmethod
4194 def _DiagnoseByOS(rlist):
4195 """Remaps a per-node return list into a per-os per-node dictionary
4197 @param rlist: a map with node names as keys and OS objects as values
4200 @return: a dictionary with osnames as keys and as value another
4201 map, with nodes as keys and tuples of (path, status, diagnose,
4202 variants, parameters, api_versions) as values, eg::
4204 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4205 (/srv/..., False, "invalid api")],
4206 "node2": [(/srv/..., True, "", [], [])]}
4207 """
4209 all_os = {}
4211 # we build here the list of nodes that didn't fail the RPC (at RPC
4212 # level), so that nodes with a non-responding node daemon don't
4213 # make all OSes invalid
4214 good_nodes = [node_name for node_name in rlist
4215 if not rlist[node_name].fail_msg]
4216 for node_name, nr in rlist.items():
4217 if nr.fail_msg or not nr.payload:
4218 continue
4219 for (name, path, status, diagnose, variants,
4220 params, api_versions) in nr.payload:
4221 if name not in all_os:
4222 # build a list of nodes for this os containing empty lists
4223 # for each node in node_list
4224 all_os[name] = {}
4225 for nname in good_nodes:
4226 all_os[name][nname] = []
4227 # convert params from [name, help] to (name, help)
4228 params = [tuple(v) for v in params]
4229 all_os[name][node_name].append((path, status, diagnose,
4230 variants, params, api_versions))
4232 return all_os
4233 def _GetQueryData(self, lu):
4234 """Computes the list of nodes and their attributes.
4237 # Locking is not used
4238 assert not (compat.any(lu.glm.is_owned(level)
4239 for level in locking.LEVELS
4240 if level != locking.LEVEL_CLUSTER) or
4241 self.do_locking or self.use_locking)
4243 valid_nodes = [node.name
4244 for node in lu.cfg.GetAllNodesInfo().values()
4245 if not node.offline and node.vm_capable]
4246 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4247 cluster = lu.cfg.GetClusterInfo()
4249 data = {}
4251 for (os_name, os_data) in pol.items():
4252 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4253 hidden=(os_name in cluster.hidden_os),
4254 blacklisted=(os_name in cluster.blacklisted_os))
4256 variants = set()
4257 parameters = set()
4258 api_versions = set()
4260 for idx, osl in enumerate(os_data.values()):
4261 info.valid = bool(info.valid and osl and osl[0][1])
4262 if not info.valid:
4263 break
4265 (node_variants, node_params, node_api) = osl[0][3:6]
4266 if idx == 0:
4267 # First entry
4268 variants.update(node_variants)
4269 parameters.update(node_params)
4270 api_versions.update(node_api)
4271 else:
4272 # Filter out inconsistent values
4273 variants.intersection_update(node_variants)
4274 parameters.intersection_update(node_params)
4275 api_versions.intersection_update(node_api)
4277 info.variants = list(variants)
4278 info.parameters = list(parameters)
4279 info.api_versions = list(api_versions)
4281 data[os_name] = info
4283 # Prepare data in requested order
4284 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4285 if name in data]
4288 class LUOsDiagnose(NoHooksLU):
4289 """Logical unit for OS diagnose/query.
4294 @staticmethod
4295 def _BuildFilter(fields, names):
4296 """Builds a filter for querying OSes.
4299 name_filter = qlang.MakeSimpleFilter("name", names)
4301 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4302 # respective field is not requested
4303 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4304 for fname in ["hidden", "blacklisted"]
4305 if fname not in fields]
4306 if "valid" not in fields:
4307 status_filter.append([qlang.OP_TRUE, "valid"])
4309 if status_filter:
4310 status_filter.insert(0, qlang.OP_AND)
4311 else:
4312 status_filter = None
4314 if name_filter and status_filter:
4315 return [qlang.OP_AND, name_filter, status_filter]
4316 elif name_filter:
4317 return name_filter
4318 else:
4319 return status_filter
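# Illustrative sketch (not part of the original module; the OS name is
# hypothetical): for fields ["name", "variants"] none of "hidden",
# "blacklisted" or "valid" is requested, so the legacy status filter is
# combined with the name filter roughly as:
#
#   [qlang.OP_AND,
#    qlang.MakeSimpleFilter("name", ["debian-etch"]),
#    [qlang.OP_AND,
#     [qlang.OP_NOT, [qlang.OP_TRUE, "hidden"]],
#     [qlang.OP_NOT, [qlang.OP_TRUE, "blacklisted"]],
#     [qlang.OP_TRUE, "valid"]]]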
4321 def CheckArguments(self):
4322 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4323 self.op.output_fields, False)
4325 def ExpandNames(self):
4326 self.oq.ExpandNames(self)
4328 def Exec(self, feedback_fn):
4329 return self.oq.OldStyleQuery(self)
4332 class LUNodeRemove(LogicalUnit):
4333 """Logical unit for removing a node.
4336 HPATH = "node-remove"
4337 HTYPE = constants.HTYPE_NODE
4339 def BuildHooksEnv(self):
4340 """Build hooks env.
4342 This doesn't run on the target node in the pre phase as a failed
4343 node would then be impossible to remove.
4345 """
4346 return {
4347 "OP_TARGET": self.op.node_name,
4348 "NODE_NAME": self.op.node_name,
4349 }
4351 def BuildHooksNodes(self):
4352 """Build hooks nodes.
4355 all_nodes = self.cfg.GetNodeList()
4356 try:
4357 all_nodes.remove(self.op.node_name)
4358 except ValueError:
4359 logging.warning("Node '%s', which is about to be removed, was not found"
4360 " in the list of all nodes", self.op.node_name)
4361 return (all_nodes, all_nodes)
4363 def CheckPrereq(self):
4364 """Check prerequisites.
4367 - the node exists in the configuration
4368 - it does not have primary or secondary instances
4369 - it's not the master
4371 Any errors are signaled by raising errors.OpPrereqError.
4374 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4375 node = self.cfg.GetNodeInfo(self.op.node_name)
4376 assert node is not None
4378 masternode = self.cfg.GetMasterNode()
4379 if node.name == masternode:
4380 raise errors.OpPrereqError("Node is the master node, failover to another"
4381 " node is required", errors.ECODE_INVAL)
4383 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4384 if node.name in instance.all_nodes:
4385 raise errors.OpPrereqError("Instance %s is still running on the node,"
4386 " please remove first" % instance_name,
4388 self.op.node_name = node.name
4391 def Exec(self, feedback_fn):
4392 """Removes the node from the cluster.
4394 """
4395 node = self.cfg.GetNodeInfo(self.op.node_name)
4396 logging.info("Stopping the node daemon and removing configs from node %s",
4397 node.name)
4399 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4401 # Promote nodes to master candidate as needed
4402 _AdjustCandidatePool(self, exceptions=[node.name])
4403 self.context.RemoveNode(node.name)
4405 # Run post hooks on the node before it's removed
4406 _RunPostHook(self, node.name)
4408 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4409 msg = result.fail_msg
4410 if msg:
4411 self.LogWarning("Errors encountered on the remote node while leaving"
4412 " the cluster: %s", msg)
4414 # Remove node from our /etc/hosts
4415 if self.cfg.GetClusterInfo().modify_etc_hosts:
4416 master_node = self.cfg.GetMasterNode()
4417 result = self.rpc.call_etc_hosts_modify(master_node,
4418 constants.ETC_HOSTS_REMOVE,
4419 node.name)
4420 result.Raise("Can't update hosts file with new host data")
4421 _RedistributeAncillaryFiles(self)
4424 class _NodeQuery(_QueryBase):
4425 FIELDS = query.NODE_FIELDS
4427 def ExpandNames(self, lu):
4428 lu.needed_locks = {}
4429 lu.share_locks = _ShareAll()
4431 if self.names:
4432 self.wanted = _GetWantedNodes(lu, self.names)
4433 else:
4434 self.wanted = locking.ALL_SET
4436 self.do_locking = (self.use_locking and
4437 query.NQ_LIVE in self.requested_data)
4439 if self.do_locking:
4440 # If any non-static field is requested we need to lock the nodes
4441 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4443 def DeclareLocks(self, lu, level):
4444 pass
4446 def _GetQueryData(self, lu):
4447 """Computes the list of nodes and their attributes.
4450 all_info = lu.cfg.GetAllNodesInfo()
4452 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4454 # Gather data as requested
4455 if query.NQ_LIVE in self.requested_data:
4456 # filter out non-vm_capable nodes
4457 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4459 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4460 lu.cfg.GetHypervisorType())
4461 live_data = dict((name, nresult.payload)
4462 for (name, nresult) in node_data.items()
4463 if not nresult.fail_msg and nresult.payload)
4464 else:
4465 live_data = None
4467 if query.NQ_INST in self.requested_data:
4468 node_to_primary = dict([(name, set()) for name in nodenames])
4469 node_to_secondary = dict([(name, set()) for name in nodenames])
4471 inst_data = lu.cfg.GetAllInstancesInfo()
4473 for inst in inst_data.values():
4474 if inst.primary_node in node_to_primary:
4475 node_to_primary[inst.primary_node].add(inst.name)
4476 for secnode in inst.secondary_nodes:
4477 if secnode in node_to_secondary:
4478 node_to_secondary[secnode].add(inst.name)
4479 else:
4480 node_to_primary = None
4481 node_to_secondary = None
4483 if query.NQ_OOB in self.requested_data:
4484 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4485 for name, node in all_info.iteritems())
4486 else:
4487 oob_support = None
4489 if query.NQ_GROUP in self.requested_data:
4490 groups = lu.cfg.GetAllNodeGroupsInfo()
4491 else:
4492 groups = None
4494 return query.NodeQueryData([all_info[name] for name in nodenames],
4495 live_data, lu.cfg.GetMasterNode(),
4496 node_to_primary, node_to_secondary, groups,
4497 oob_support, lu.cfg.GetClusterInfo())
4500 class LUNodeQuery(NoHooksLU):
4501 """Logical unit for querying nodes.
4504 # pylint: disable=W0142
4507 def CheckArguments(self):
4508 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4509 self.op.output_fields, self.op.use_locking)
4511 def ExpandNames(self):
4512 self.nq.ExpandNames(self)
4514 def Exec(self, feedback_fn):
4515 return self.nq.OldStyleQuery(self)
4518 class LUNodeQueryvols(NoHooksLU):
4519 """Logical unit for getting volumes on node(s).
4523 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4524 _FIELDS_STATIC = utils.FieldSet("node")
4526 def CheckArguments(self):
4527 _CheckOutputFields(static=self._FIELDS_STATIC,
4528 dynamic=self._FIELDS_DYNAMIC,
4529 selected=self.op.output_fields)
4531 def ExpandNames(self):
4532 self.needed_locks = {}
4533 self.share_locks[locking.LEVEL_NODE] = 1
4534 if not self.op.nodes:
4535 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4536 else:
4537 self.needed_locks[locking.LEVEL_NODE] = \
4538 _GetWantedNodes(self, self.op.nodes)
4540 def Exec(self, feedback_fn):
4541 """Computes the list of nodes and their attributes.
4544 nodenames = self.owned_locks(locking.LEVEL_NODE)
4545 volumes = self.rpc.call_node_volumes(nodenames)
4547 ilist = self.cfg.GetAllInstancesInfo()
4548 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4550 output = []
4551 for node in nodenames:
4552 nresult = volumes[node]
4553 if nresult.offline:
4554 continue
4555 msg = nresult.fail_msg
4556 if msg:
4557 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4558 continue
4560 node_vols = sorted(nresult.payload,
4561 key=operator.itemgetter("dev"))
4563 for vol in node_vols:
4564 node_output = []
4565 for field in self.op.output_fields:
4566 if field == "node":
4567 val = node
4568 elif field == "phys":
4569 val = vol["dev"]
4570 elif field == "vg":
4571 val = vol["vg"]
4572 elif field == "name":
4573 val = vol["name"]
4574 elif field == "size":
4575 val = int(float(vol["size"]))
4576 elif field == "instance":
4577 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4578 else:
4579 raise errors.ParameterError(field)
4580 node_output.append(str(val))
4582 output.append(node_output)
4584 return output
4587 class LUNodeQueryStorage(NoHooksLU):
4588 """Logical unit for getting information on storage units on node(s).
4591 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4594 def CheckArguments(self):
4595 _CheckOutputFields(static=self._FIELDS_STATIC,
4596 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4597 selected=self.op.output_fields)
4599 def ExpandNames(self):
4600 self.needed_locks = {}
4601 self.share_locks[locking.LEVEL_NODE] = 1
4603 if self.op.nodes:
4604 self.needed_locks[locking.LEVEL_NODE] = \
4605 _GetWantedNodes(self, self.op.nodes)
4606 else:
4607 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4609 def Exec(self, feedback_fn):
4610 """Computes the list of nodes and their attributes.
4613 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4615 # Always get name to sort by
4616 if constants.SF_NAME in self.op.output_fields:
4617 fields = self.op.output_fields[:]
4618 else:
4619 fields = [constants.SF_NAME] + self.op.output_fields
4621 # Never ask for node or type as it's only known to the LU
4622 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4623 while extra in fields:
4624 fields.remove(extra)
4626 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4627 name_idx = field_idx[constants.SF_NAME]
4629 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4630 data = self.rpc.call_storage_list(self.nodes,
4631 self.op.storage_type, st_args,
4632 self.op.name, fields)
4634 result = []
4636 for node in utils.NiceSort(self.nodes):
4637 nresult = data[node]
4638 if nresult.offline:
4639 continue
4641 msg = nresult.fail_msg
4642 if msg:
4643 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4644 continue
4646 rows = dict([(row[name_idx], row) for row in nresult.payload])
4648 for name in utils.NiceSort(rows.keys()):
4649 row = rows[name]
4651 out = []
4653 for field in self.op.output_fields:
4654 if field == constants.SF_NODE:
4655 val = node
4656 elif field == constants.SF_TYPE:
4657 val = self.op.storage_type
4658 elif field in field_idx:
4659 val = row[field_idx[field]]
4660 else:
4661 raise errors.ParameterError(field)
4663 out.append(val)
4665 result.append(out)
4667 return result
4670 class _InstanceQuery(_QueryBase):
4671 FIELDS = query.INSTANCE_FIELDS
4673 def ExpandNames(self, lu):
4674 lu.needed_locks = {}
4675 lu.share_locks = _ShareAll()
4677 if self.names:
4678 self.wanted = _GetWantedInstances(lu, self.names)
4679 else:
4680 self.wanted = locking.ALL_SET
4682 self.do_locking = (self.use_locking and
4683 query.IQ_LIVE in self.requested_data)
4684 if self.do_locking:
4685 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4686 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4687 lu.needed_locks[locking.LEVEL_NODE] = []
4688 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4690 self.do_grouplocks = (self.do_locking and
4691 query.IQ_NODES in self.requested_data)
4693 def DeclareLocks(self, lu, level):
4695 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4696 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4698 # Lock all groups used by instances optimistically; this requires going
4699 # via the node before it's locked, requiring verification later on
4700 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4701 set(group_uuid
4702 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4703 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4704 elif level == locking.LEVEL_NODE:
4705 lu._LockInstancesNodes() # pylint: disable=W0212
4707 @staticmethod
4708 def _CheckGroupLocks(lu):
4709 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4710 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4712 # Check if node groups for locked instances are still correct
4713 for instance_name in owned_instances:
4714 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4716 def _GetQueryData(self, lu):
4717 """Computes the list of instances and their attributes.
4720 if self.do_grouplocks:
4721 self._CheckGroupLocks(lu)
4723 cluster = lu.cfg.GetClusterInfo()
4724 all_info = lu.cfg.GetAllInstancesInfo()
4726 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4728 instance_list = [all_info[name] for name in instance_names]
4729 nodes = frozenset(itertools.chain(*(inst.all_nodes
4730 for inst in instance_list)))
4731 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4732 bad_nodes = []
4733 offline_nodes = []
4734 wrongnode_inst = set()
4736 # Gather data as requested
4737 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4738 live_data = {}
4739 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4740 for name in nodes:
4741 result = node_data[name]
4742 if result.offline:
4743 # offline nodes will be in both lists
4744 assert result.fail_msg
4745 offline_nodes.append(name)
4746 if result.fail_msg:
4747 bad_nodes.append(name)
4748 elif result.payload:
4749 for inst in result.payload:
4750 if inst in all_info:
4751 if all_info[inst].primary_node == name:
4752 live_data.update(result.payload)
4753 else:
4754 wrongnode_inst.add(inst)
4755 else:
4756 # orphan instance; we don't list it here as we don't
4757 # handle this case yet in the output of instance listing
4758 logging.warning("Orphan instance '%s' found on node %s",
4759 inst, name)
4760 # else no instance is alive
4761 else:
4762 live_data = {}
4764 if query.IQ_DISKUSAGE in self.requested_data:
4765 disk_usage = dict((inst.name,
4766 _ComputeDiskSize(inst.disk_template,
4767 [{constants.IDISK_SIZE: disk.size}
4768 for disk in inst.disks]))
4769 for inst in instance_list)
4770 else:
4771 disk_usage = None
4773 if query.IQ_CONSOLE in self.requested_data:
4774 consinfo = {}
4775 for inst in instance_list:
4776 if inst.name in live_data:
4777 # Instance is running
4778 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4780 consinfo[inst.name] = None
4781 assert set(consinfo.keys()) == set(instance_names)
4782 else:
4783 consinfo = None
4785 if query.IQ_NODES in self.requested_data:
4786 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4787 instance_list)))
4788 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4789 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4790 for uuid in set(map(operator.attrgetter("group"),
4796 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4797 disk_usage, offline_nodes, bad_nodes,
4798 live_data, wrongnode_inst, consinfo,
4799 nodes, groups)
4802 class LUQuery(NoHooksLU):
4803 """Query for resources/items of a certain kind.
4806 # pylint: disable=W0142
4809 def CheckArguments(self):
4810 qcls = _GetQueryImplementation(self.op.what)
4812 self.impl = qcls(self.op.filter, self.op.fields, self.op.use_locking)
4814 def ExpandNames(self):
4815 self.impl.ExpandNames(self)
4817 def DeclareLocks(self, level):
4818 self.impl.DeclareLocks(self, level)
4820 def Exec(self, feedback_fn):
4821 return self.impl.NewStyleQuery(self)
4824 class LUQueryFields(NoHooksLU):
4825 """Query for resources/items of a certain kind.
4828 # pylint: disable=W0142
4831 def CheckArguments(self):
4832 self.qcls = _GetQueryImplementation(self.op.what)
4834 def ExpandNames(self):
4835 self.needed_locks = {}
4837 def Exec(self, feedback_fn):
4838 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
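# Illustrative sketch (not part of the original module):
# _GetQueryImplementation, defined elsewhere in this file, dispatches on the
# resource kind roughly along these lines:
#
#   _QUERY_IMPL = {
#     constants.QR_INSTANCE: _InstanceQuery,
#     constants.QR_NODE: _NodeQuery,
#     constants.QR_OS: _OsQuery,
#     }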
4841 class LUNodeModifyStorage(NoHooksLU):
4842 """Logical unit for modifying a storage volume on a node.
4847 def CheckArguments(self):
4848 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4850 storage_type = self.op.storage_type
4852 try:
4853 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4854 except KeyError:
4855 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4856 " modified" % storage_type,
4857 errors.ECODE_INVAL)
4859 diff = set(self.op.changes.keys()) - modifiable
4860 if diff:
4861 raise errors.OpPrereqError("The following fields can not be modified for"
4862 " storage units of type '%s': %r" %
4863 (storage_type, list(diff)),
4864 errors.ECODE_INVAL)
4866 def ExpandNames(self):
4867 self.needed_locks = {
4868 locking.LEVEL_NODE: self.op.node_name,
4869 }
4871 def Exec(self, feedback_fn):
4872 """Computes the list of nodes and their attributes.
4875 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4876 result = self.rpc.call_storage_modify(self.op.node_name,
4877 self.op.storage_type, st_args,
4878 self.op.name, self.op.changes)
4879 result.Raise("Failed to modify storage unit '%s' on %s" %
4880 (self.op.name, self.op.node_name))
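# Illustrative sketch (not part of the original module; the node and VG names
# are hypothetical, and the field is assumed to be listed in
# constants.MODIFIABLE_STORAGE_FIELDS): marking an LVM volume group as
# non-allocatable on one node might look like:
#
#   op = opcodes.OpNodeModifyStorage(node_name="node1.example.com",
#                                    storage_type=constants.ST_LVM_VG,
#                                    name="xenvg",
#                                    changes={constants.SF_ALLOCATABLE: False})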
4883 class LUNodeAdd(LogicalUnit):
4884 """Logical unit for adding node to the cluster.
4887 HPATH = "node-add"
4888 HTYPE = constants.HTYPE_NODE
4889 _NFLAGS = ["master_capable", "vm_capable"]
4891 def CheckArguments(self):
4892 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4893 # validate/normalize the node name
4894 self.hostname = netutils.GetHostname(name=self.op.node_name,
4895 family=self.primary_ip_family)
4896 self.op.node_name = self.hostname.name
4898 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4899 raise errors.OpPrereqError("Cannot readd the master node",
4902 if self.op.readd and self.op.group:
4903 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4904 " being readded", errors.ECODE_INVAL)
4906 def BuildHooksEnv(self):
4907 """Build hooks env.
4909 This will run on all nodes before, and on all nodes + the new node after.
4911 """
4912 return {
4913 "OP_TARGET": self.op.node_name,
4914 "NODE_NAME": self.op.node_name,
4915 "NODE_PIP": self.op.primary_ip,
4916 "NODE_SIP": self.op.secondary_ip,
4917 "MASTER_CAPABLE": str(self.op.master_capable),
4918 "VM_CAPABLE": str(self.op.vm_capable),
4921 def BuildHooksNodes(self):
4922 """Build hooks nodes.
4925 # Exclude added node
4926 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4927 post_nodes = pre_nodes + [self.op.node_name, ]
4929 return (pre_nodes, post_nodes)
4931 def CheckPrereq(self):
4932 """Check prerequisites.
4935 - the new node is not already in the config
4937 - its parameters (single/dual homed) matches the cluster
4939 Any errors are signaled by raising errors.OpPrereqError.
4941 """
4942 cfg = self.cfg
4943 hostname = self.hostname
4944 node = hostname.name
4945 primary_ip = self.op.primary_ip = hostname.ip
4946 if self.op.secondary_ip is None:
4947 if self.primary_ip_family == netutils.IP6Address.family:
4948 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
4949 " IPv4 address must be given as secondary",
4951 self.op.secondary_ip = primary_ip
4953 secondary_ip = self.op.secondary_ip
4954 if not netutils.IP4Address.IsValid(secondary_ip):
4955 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4956 " address" % secondary_ip, errors.ECODE_INVAL)
4958 node_list = cfg.GetNodeList()
4959 if not self.op.readd and node in node_list:
4960 raise errors.OpPrereqError("Node %s is already in the configuration" %
4961 node, errors.ECODE_EXISTS)
4962 elif self.op.readd and node not in node_list:
4963 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4966 self.changed_primary_ip = False
4968 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
4969 if self.op.readd and node == existing_node_name:
4970 if existing_node.secondary_ip != secondary_ip:
4971 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4972 " address configuration as before",
4974 if existing_node.primary_ip != primary_ip:
4975 self.changed_primary_ip = True
4977 continue
4979 if (existing_node.primary_ip == primary_ip or
4980 existing_node.secondary_ip == primary_ip or
4981 existing_node.primary_ip == secondary_ip or
4982 existing_node.secondary_ip == secondary_ip):
4983 raise errors.OpPrereqError("New node ip address(es) conflict with"
4984 " existing node %s" % existing_node.name,
4985 errors.ECODE_NOTUNIQUE)
4987 # After this 'if' block, None is no longer a valid value for the
4988 # _capable op attributes
4989 if self.op.readd:
4990 old_node = self.cfg.GetNodeInfo(node)
4991 assert old_node is not None, "Can't retrieve locked node %s" % node
4992 for attr in self._NFLAGS:
4993 if getattr(self.op, attr) is None:
4994 setattr(self.op, attr, getattr(old_node, attr))
4995 else:
4996 for attr in self._NFLAGS:
4997 if getattr(self.op, attr) is None:
4998 setattr(self.op, attr, True)
5000 if self.op.readd and not self.op.vm_capable:
5001 pri, sec = cfg.GetNodeInstances(node)
5002 if pri or sec:
5003 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5004 " flag set to false, but it already holds"
5005 " instances" % node,
5006 errors.ECODE_STATE)
5008 # check that the type of the node (single versus dual homed) is the
5009 # same as for the master
5010 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5011 master_singlehomed = myself.secondary_ip == myself.primary_ip
5012 newbie_singlehomed = secondary_ip == primary_ip
5013 if master_singlehomed != newbie_singlehomed:
5014 if master_singlehomed:
5015 raise errors.OpPrereqError("The master has no secondary ip but the"
5016 " new node has one",
5019 raise errors.OpPrereqError("The master has a secondary ip but the"
5020 " new node doesn't have one",
5023 # checks reachability
5024 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5025 raise errors.OpPrereqError("Node not reachable by ping",
5026 errors.ECODE_ENVIRON)
5028 if not newbie_singlehomed:
5029 # check reachability from my secondary ip to newbie's secondary ip
5030 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5031 source=myself.secondary_ip):
5032 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5033 " based ping to node daemon port",
5034 errors.ECODE_ENVIRON)
5036 if self.op.readd:
5037 exceptions = [node]
5038 else:
5039 exceptions = []
5041 if self.op.master_capable:
5042 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5043 else:
5044 self.master_candidate = False
5046 if self.op.readd:
5047 self.new_node = old_node
5048 else:
5049 node_group = cfg.LookupNodeGroup(self.op.group)
5050 self.new_node = objects.Node(name=node,
5051 primary_ip=primary_ip,
5052 secondary_ip=secondary_ip,
5053 master_candidate=self.master_candidate,
5054 offline=False, drained=False,
5055 group=node_group)
5057 if self.op.ndparams:
5058 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5060 def Exec(self, feedback_fn):
5061 """Adds the new node to the cluster.
5064 new_node = self.new_node
5065 node = new_node.name
5067 # We adding a new node so we assume it's powered
5068 new_node.powered = True
5070 # for re-adds, reset the offline/drained/master-candidate flags;
5071 # we need to reset here, otherwise offline would prevent RPC calls
5072 # later in the procedure; this also means that if the re-add
5073 # fails, we are left with a non-offlined, broken node
5074 if self.op.readd:
5075 new_node.drained = new_node.offline = False # pylint: disable=W0201
5076 self.LogInfo("Readding a node, the offline/drained flags were reset")
5077 # if we demote the node, we do cleanup later in the procedure
5078 new_node.master_candidate = self.master_candidate
5079 if self.changed_primary_ip:
5080 new_node.primary_ip = self.op.primary_ip
5082 # copy the master/vm_capable flags
5083 for attr in self._NFLAGS:
5084 setattr(new_node, attr, getattr(self.op, attr))
5086 # notify the user about any possible mc promotion
5087 if new_node.master_candidate:
5088 self.LogInfo("Node will be a master candidate")
5090 if self.op.ndparams:
5091 new_node.ndparams = self.op.ndparams
5092 else:
5093 new_node.ndparams = {}
5095 # check connectivity
5096 result = self.rpc.call_version([node])[node]
5097 result.Raise("Can't get version information from node %s" % node)
5098 if constants.PROTOCOL_VERSION == result.payload:
5099 logging.info("Communication to node %s fine, sw version %s match",
5100 node, result.payload)
5102 raise errors.OpExecError("Version mismatch master version %s,"
5103 " node version %s" %
5104 (constants.PROTOCOL_VERSION, result.payload))
5106 # Add node to our /etc/hosts, and add key to known_hosts
5107 if self.cfg.GetClusterInfo().modify_etc_hosts:
5108 master_node = self.cfg.GetMasterNode()
5109 result = self.rpc.call_etc_hosts_modify(master_node,
5110 constants.ETC_HOSTS_ADD,
5111 self.hostname.name,
5112 self.hostname.ip)
5113 result.Raise("Can't update hosts file with new host data")
5115 if new_node.secondary_ip != new_node.primary_ip:
5116 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5117 False)
5119 node_verify_list = [self.cfg.GetMasterNode()]
5120 node_verify_param = {
5121 constants.NV_NODELIST: ([node], {}),
5122 # TODO: do a node-net-test as well?
5123 }
5125 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5126 self.cfg.GetClusterName())
5127 for verifier in node_verify_list:
5128 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5129 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5130 if nl_payload:
5131 for failed in nl_payload:
5132 feedback_fn("ssh/hostname verification failed"
5133 " (checking from %s): %s" %
5134 (verifier, nl_payload[failed]))
5135 raise errors.OpExecError("ssh/hostname verification failed")
5137 if self.op.readd:
5138 _RedistributeAncillaryFiles(self)
5139 self.context.ReaddNode(new_node)
5140 # make sure we redistribute the config
5141 self.cfg.Update(new_node, feedback_fn)
5142 # and make sure the new node will not have old files around
5143 if not new_node.master_candidate:
5144 result = self.rpc.call_node_demote_from_mc(new_node.name)
5145 msg = result.fail_msg
5147 self.LogWarning("Node failed to demote itself from master"
5148 " candidate status: %s" % msg)
5149 else:
5150 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5151 additional_vm=self.op.vm_capable)
5152 self.context.AddNode(new_node, self.proc.GetECId())
5155 class LUNodeSetParams(LogicalUnit):
5156 """Modifies the parameters of a node.
5158 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5159 to the node role (as _ROLE_*)
5160 @cvar _R2F: a dictionary from node role to tuples of flags
5161 @cvar _FLAGS: a list of attribute names corresponding to the flags
5164 HPATH = "node-modify"
5165 HTYPE = constants.HTYPE_NODE
5167 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5168 _F2R = {
5169 (True, False, False): _ROLE_CANDIDATE,
5170 (False, True, False): _ROLE_DRAINED,
5171 (False, False, True): _ROLE_OFFLINE,
5172 (False, False, False): _ROLE_REGULAR,
5173 }
5174 _R2F = dict((v, k) for k, v in _F2R.items())
5175 _FLAGS = ["master_candidate", "drained", "offline"]
5177 def CheckArguments(self):
5178 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5179 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5180 self.op.master_capable, self.op.vm_capable,
5181 self.op.secondary_ip, self.op.ndparams]
5182 if all_mods.count(None) == len(all_mods):
5183 raise errors.OpPrereqError("Please pass at least one modification",
5185 if all_mods.count(True) > 1:
5186 raise errors.OpPrereqError("Can't set the node into more than one"
5187 " state at the same time",
5190 # Boolean value that tells us whether we might be demoting from MC
5191 self.might_demote = (self.op.master_candidate == False or
5192 self.op.offline == True or
5193 self.op.drained == True or
5194 self.op.master_capable == False)
5196 if self.op.secondary_ip:
5197 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5198 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5199 " address" % self.op.secondary_ip,
5202 self.lock_all = self.op.auto_promote and self.might_demote
5203 self.lock_instances = self.op.secondary_ip is not None
5205 def ExpandNames(self):
5206 if self.lock_all:
5207 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5208 else:
5209 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5211 if self.lock_instances:
5212 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
5214 def DeclareLocks(self, level):
5215 # If we have locked all instances, before waiting to lock nodes, release
5216 # all the ones living on nodes unrelated to the current operation.
5217 if level == locking.LEVEL_NODE and self.lock_instances:
5218 self.affected_instances = []
5219 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
5220 instances_keep = []
5222 # Build list of instances to release
5223 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
5224 for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
5225 if (instance.disk_template in constants.DTS_INT_MIRROR and
5226 self.op.node_name in instance.all_nodes):
5227 instances_keep.append(instance_name)
5228 self.affected_instances.append(instance)
5230 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
5232 assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
5233 set(instances_keep))
5235 def BuildHooksEnv(self):
5236 """Build hooks env.
5238 This runs on the master node.
5240 """
5241 return {
5242 "OP_TARGET": self.op.node_name,
5243 "MASTER_CANDIDATE": str(self.op.master_candidate),
5244 "OFFLINE": str(self.op.offline),
5245 "DRAINED": str(self.op.drained),
5246 "MASTER_CAPABLE": str(self.op.master_capable),
5247 "VM_CAPABLE": str(self.op.vm_capable),
5250 def BuildHooksNodes(self):
5251 """Build hooks nodes.
5253 """
5254 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5255 return (nl, nl)
5257 def CheckPrereq(self):
5258 """Check prerequisites.
5260 This only checks the instance list against the existing names.
5263 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5265 if (self.op.master_candidate is not None or
5266 self.op.drained is not None or
5267 self.op.offline is not None):
5268 # we can't change the master's node flags
5269 if self.op.node_name == self.cfg.GetMasterNode():
5270 raise errors.OpPrereqError("The master role can be changed"
5271 " only via master-failover",
5274 if self.op.master_candidate and not node.master_capable:
5275 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5276 " it a master candidate" % node.name,
5279 if self.op.vm_capable == False:
5280 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5281 if ipri or isec:
5282 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5283 " the vm_capable flag" % node.name,
5284 errors.ECODE_STATE)
5286 if node.master_candidate and self.might_demote and not self.lock_all:
5287 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5288 # check if after removing the current node, we're missing master
5289 # candidates
5290 (mc_remaining, mc_should, _) = \
5291 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5292 if mc_remaining < mc_should:
5293 raise errors.OpPrereqError("Not enough master candidates, please"
5294 " pass auto promote option to allow"
5295 " promotion", errors.ECODE_STATE)
5297 self.old_flags = old_flags = (node.master_candidate,
5298 node.drained, node.offline)
5299 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5300 self.old_role = old_role = self._F2R[old_flags]
5302 # Check for ineffective changes
5303 for attr in self._FLAGS:
5304 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5305 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5306 setattr(self.op, attr, None)
5308 # Past this point, any flag change to False means a transition
5309 # away from the respective state, as only real changes are kept
5311 # TODO: We might query the real power state if it supports OOB
5312 if _SupportsOob(self.cfg, node):
5313 if self.op.offline is False and not (node.powered or
5314 self.op.powered == True):
5315 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5316 " offline status can be reset") %
5318 elif self.op.powered is not None:
5319 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5320 " as it does not support out-of-band"
5321 " handling") % self.op.node_name)
5323 # If we're being deofflined/drained, we'll MC ourself if needed
5324 if (self.op.drained == False or self.op.offline == False or
5325 (self.op.master_capable and not node.master_capable)):
5326 if _DecideSelfPromotion(self):
5327 self.op.master_candidate = True
5328 self.LogInfo("Auto-promoting node to master candidate")
5330 # If we're no longer master capable, we'll demote ourselves from MC
5331 if self.op.master_capable == False and node.master_candidate:
5332 self.LogInfo("Demoting from master candidate")
5333 self.op.master_candidate = False
5336 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5337 if self.op.master_candidate:
5338 new_role = self._ROLE_CANDIDATE
5339 elif self.op.drained:
5340 new_role = self._ROLE_DRAINED
5341 elif self.op.offline:
5342 new_role = self._ROLE_OFFLINE
5343 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5344 # False is still in new flags, which means we're un-setting (the
5345 # only) True flag
5346 new_role = self._ROLE_REGULAR
5347 else: # no new flags, nothing, keep old role
5348 new_role = old_role
5350 self.new_role = new_role
5352 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5353 # Trying to transition out of offline status
5354 result = self.rpc.call_version([node.name])[node.name]
5356 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5357 " to report its version: %s" %
5358 (node.name, result.fail_msg),
5359 errors.ECODE_ENVIRON)
5360 else:
5361 self.LogWarning("Transitioning node from offline to online state"
5362 " without using re-add. Please make sure the node"
5365 if self.op.secondary_ip:
5366 # Ok even without locking, because this can't be changed by any LU
5367 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5368 master_singlehomed = master.secondary_ip == master.primary_ip
5369 if master_singlehomed and self.op.secondary_ip:
5370 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5371 " homed cluster", errors.ECODE_INVAL)
5373 if node.offline:
5374 if self.affected_instances:
5375 raise errors.OpPrereqError("Cannot change secondary ip: offline"
5376 " node has instances (%s) configured"
5377 " to use it" % self.affected_instances)
5378 else:
5379 # On online nodes, check that no instances are running, and that
5380 # the node has the new ip and we can reach it.
5381 for instance in self.affected_instances:
5382 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5384 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5385 if master.name != node.name:
5386 # check reachability from master secondary ip to new secondary ip
5387 if not netutils.TcpPing(self.op.secondary_ip,
5388 constants.DEFAULT_NODED_PORT,
5389 source=master.secondary_ip):
5390 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5391 " based ping to node daemon port",
5392 errors.ECODE_ENVIRON)
5394 if self.op.ndparams:
5395 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5396 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5397 self.new_ndparams = new_ndparams
5399 def Exec(self, feedback_fn):
5400 """Modifies a node.
5402 """
5403 node = self.node
5404 old_role = self.old_role
5405 new_role = self.new_role
5407 result = []
5409 if self.op.ndparams:
5410 node.ndparams = self.new_ndparams
5412 if self.op.powered is not None:
5413 node.powered = self.op.powered
5415 for attr in ["master_capable", "vm_capable"]:
5416 val = getattr(self.op, attr)
5417 if val is not None:
5418 setattr(node, attr, val)
5419 result.append((attr, str(val)))
5421 if new_role != old_role:
5422 # Tell the node to demote itself, if no longer MC and not offline
5423 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5424 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5426 self.LogWarning("Node failed to demote itself: %s", msg)
5428 new_flags = self._R2F[new_role]
5429 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5430 if of != nf:
5431 result.append((desc, str(nf)))
5432 (node.master_candidate, node.drained, node.offline) = new_flags
5434 # we locked all nodes, we adjust the CP before updating this node
5435 if self.lock_all:
5436 _AdjustCandidatePool(self, [node.name])
5438 if self.op.secondary_ip:
5439 node.secondary_ip = self.op.secondary_ip
5440 result.append(("secondary_ip", self.op.secondary_ip))
5442 # this will trigger configuration file update, if needed
5443 self.cfg.Update(node, feedback_fn)
5445 # this will trigger job queue propagation or cleanup if the mc
5446 # flag changed
5447 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5448 self.context.ReaddNode(node)
5450 return result
5453 class LUNodePowercycle(NoHooksLU):
5454 """Powercycles a node.
5459 def CheckArguments(self):
5460 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5461 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5462 raise errors.OpPrereqError("The node is the master and the force"
5463 " parameter was not set",
5466 def ExpandNames(self):
5467 """Locking for PowercycleNode.
5469 This is a last-resort option and shouldn't block on other
5470 jobs. Therefore, we grab no locks.
5473 self.needed_locks = {}
5475 def Exec(self, feedback_fn):
5476 """Reboots a node.
5478 """
5479 result = self.rpc.call_node_powercycle(self.op.node_name,
5480 self.cfg.GetHypervisorType())
5481 result.Raise("Failed to schedule the reboot")
5482 return result.payload
5485 class LUClusterQuery(NoHooksLU):
5486 """Query cluster configuration.
5491 def ExpandNames(self):
5492 self.needed_locks = {}
5494 def Exec(self, feedback_fn):
5495 """Return cluster config.
5498 cluster = self.cfg.GetClusterInfo()
5500 os_hvp = {}
5501 # Filter just for enabled hypervisors
5502 for os_name, hv_dict in cluster.os_hvp.items():
5503 os_hvp[os_name] = {}
5504 for hv_name, hv_params in hv_dict.items():
5505 if hv_name in cluster.enabled_hypervisors:
5506 os_hvp[os_name][hv_name] = hv_params
5508 # Convert ip_family to ip_version
5509 primary_ip_version = constants.IP4_VERSION
5510 if cluster.primary_ip_family == netutils.IP6Address.family:
5511 primary_ip_version = constants.IP6_VERSION
5514 "software_version": constants.RELEASE_VERSION,
5515 "protocol_version": constants.PROTOCOL_VERSION,
5516 "config_version": constants.CONFIG_VERSION,
5517 "os_api_version": max(constants.OS_API_VERSIONS),
5518 "export_version": constants.EXPORT_VERSION,
5519 "architecture": (platform.architecture()[0], platform.machine()),
5520 "name": cluster.cluster_name,
5521 "master": cluster.master_node,
5522 "default_hypervisor": cluster.enabled_hypervisors[0],
5523 "enabled_hypervisors": cluster.enabled_hypervisors,
5524 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5525 for hypervisor_name in cluster.enabled_hypervisors]),
5527 "beparams": cluster.beparams,
5528 "osparams": cluster.osparams,
5529 "nicparams": cluster.nicparams,
5530 "ndparams": cluster.ndparams,
5531 "candidate_pool_size": cluster.candidate_pool_size,
5532 "master_netdev": cluster.master_netdev,
5533 "master_netmask": cluster.master_netmask,
5534 "volume_group_name": cluster.volume_group_name,
5535 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5536 "file_storage_dir": cluster.file_storage_dir,
5537 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5538 "maintain_node_health": cluster.maintain_node_health,
5539 "ctime": cluster.ctime,
5540 "mtime": cluster.mtime,
5541 "uuid": cluster.uuid,
5542 "tags": list(cluster.GetTags()),
5543 "uid_pool": cluster.uid_pool,
5544 "default_iallocator": cluster.default_iallocator,
5545 "reserved_lvs": cluster.reserved_lvs,
5546 "primary_ip_version": primary_ip_version,
5547 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5548 "hidden_os": cluster.hidden_os,
5549 "blacklisted_os": cluster.blacklisted_os,
5555 class LUClusterConfigQuery(NoHooksLU):
5556 """Return configuration values.
5560 _FIELDS_DYNAMIC = utils.FieldSet()
5561 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5562 "watcher_pause", "volume_group_name")
5564 def CheckArguments(self):
5565 _CheckOutputFields(static=self._FIELDS_STATIC,
5566 dynamic=self._FIELDS_DYNAMIC,
5567 selected=self.op.output_fields)
5569 def ExpandNames(self):
5570 self.needed_locks = {}
5572 def Exec(self, feedback_fn):
5573 """Dump a representation of the cluster config to the standard output.
5575 """
5576 values = []
5577 for field in self.op.output_fields:
5578 if field == "cluster_name":
5579 entry = self.cfg.GetClusterName()
5580 elif field == "master_node":
5581 entry = self.cfg.GetMasterNode()
5582 elif field == "drain_flag":
5583 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5584 elif field == "watcher_pause":
5585 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5586 elif field == "volume_group_name":
5587 entry = self.cfg.GetVGName()
5589 raise errors.ParameterError(field)
5590 values.append(entry)
5592 return values
5594 class LUInstanceActivateDisks(NoHooksLU):
5595 """Bring up an instance's disks.
5600 def ExpandNames(self):
5601 self._ExpandAndLockInstance()
5602 self.needed_locks[locking.LEVEL_NODE] = []
5603 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5605 def DeclareLocks(self, level):
5606 if level == locking.LEVEL_NODE:
5607 self._LockInstancesNodes()
5609 def CheckPrereq(self):
5610 """Check prerequisites.
5612 This checks that the instance is in the cluster.
5615 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5616 assert self.instance is not None, \
5617 "Cannot retrieve locked instance %s" % self.op.instance_name
5618 _CheckNodeOnline(self, self.instance.primary_node)
5620 def Exec(self, feedback_fn):
5621 """Activate the disks.
5624 disks_ok, disks_info = \
5625 _AssembleInstanceDisks(self, self.instance,
5626 ignore_size=self.op.ignore_size)
5628 raise errors.OpExecError("Cannot activate block devices")
5633 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5634 ignore_size=False):
5635 """Prepare the block devices for an instance.
5637 This sets up the block devices on all nodes.
5639 @type lu: L{LogicalUnit}
5640 @param lu: the logical unit on whose behalf we execute
5641 @type instance: L{objects.Instance}
5642 @param instance: the instance for whose disks we assemble
5643 @type disks: list of L{objects.Disk} or None
5644 @param disks: which disks to assemble (or all, if None)
5645 @type ignore_secondaries: boolean
5646 @param ignore_secondaries: if true, errors on secondary nodes
5647 won't result in an error return from the function
5648 @type ignore_size: boolean
5649 @param ignore_size: if true, the current known size of the disk
5650 will not be used during the disk activation, useful for cases
5651 when the size is wrong
5652 @return: False if the operation failed, otherwise a list of
5653 (host, instance_visible_name, node_visible_name)
5654 with the mapping from node devices to instance devices
5656 """
5657 device_info = []
5658 disks_ok = True
5659 iname = instance.name
5660 disks = _ExpandCheckDisks(instance, disks)
5662 # With the two passes mechanism we try to reduce the window of
5663 # opportunity for the race condition of switching DRBD to primary
5664 # before handshaking occured, but we do not eliminate it
5666 # The proper fix would be to wait (with some limits) until the
5667 # connection has been made and drbd transitions from WFConnection
5668 # into any other network-connected state (Connected, SyncTarget,
5671 # 1st pass, assemble on all nodes in secondary mode
5672 for idx, inst_disk in enumerate(disks):
5673 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5674 if ignore_size:
5675 node_disk = node_disk.Copy()
5676 node_disk.UnsetSize()
5677 lu.cfg.SetDiskID(node_disk, node)
5678 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5679 msg = result.fail_msg
5680 if msg:
5681 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5682 " (is_primary=False, pass=1): %s",
5683 inst_disk.iv_name, node, msg)
5684 if not ignore_secondaries:
5685 disks_ok = False
5687 # FIXME: race condition on drbd migration to primary
5689 # 2nd pass, do only the primary node
5690 for idx, inst_disk in enumerate(disks):
5691 dev_path = None
5693 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5694 if node != instance.primary_node:
5695 continue
5696 if ignore_size:
5697 node_disk = node_disk.Copy()
5698 node_disk.UnsetSize()
5699 lu.cfg.SetDiskID(node_disk, node)
5700 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5701 msg = result.fail_msg
5702 if msg:
5703 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5704 " (is_primary=True, pass=2): %s",
5705 inst_disk.iv_name, node, msg)
5706 disks_ok = False
5707 else:
5708 dev_path = result.payload
5710 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5712 # leave the disks configured for the primary node
5713 # this is a workaround that would be fixed better by
5714 # improving the logical/physical id handling
5715 for disk in disks:
5716 lu.cfg.SetDiskID(disk, instance.primary_node)
5718 return disks_ok, device_info
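# Hedged usage sketch (illustrative only, not from the original source):
# callers must check disks_ok before trusting device_info; the tuples
# are (node, iv_name, dev_path) as appended above:
#   disks_ok, device_info = _AssembleInstanceDisks(lu, instance)
#   if not disks_ok:
#     raise errors.OpExecError("Cannot activate block devices")
#   for node, iv_name, dev_path in device_info:
#     lu.LogInfo("%s on %s assembled at %s", iv_name, node, dev_path)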
5721 def _StartInstanceDisks(lu, instance, force):
5722 """Start the disks of an instance.
5725 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5726 ignore_secondaries=force)
5727 if not disks_ok:
5728 _ShutdownInstanceDisks(lu, instance)
5729 if force is not None and not force:
5730 lu.proc.LogWarning("", hint="If the message above refers to a"
5731 " secondary node,"
5732 " you can retry the operation using '--force'.")
5733 raise errors.OpExecError("Disk consistency error")
5736 class LUInstanceDeactivateDisks(NoHooksLU):
5737 """Shutdown an instance's disks.
5742 def ExpandNames(self):
5743 self._ExpandAndLockInstance()
5744 self.needed_locks[locking.LEVEL_NODE] = []
5745 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5747 def DeclareLocks(self, level):
5748 if level == locking.LEVEL_NODE:
5749 self._LockInstancesNodes()
5751 def CheckPrereq(self):
5752 """Check prerequisites.
5754 This checks that the instance is in the cluster.
5757 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5758 assert self.instance is not None, \
5759 "Cannot retrieve locked instance %s" % self.op.instance_name
5761 def Exec(self, feedback_fn):
5762 """Deactivate the disks
5765 instance = self.instance
5767 _ShutdownInstanceDisks(self, instance)
5769 _SafeShutdownInstanceDisks(self, instance)
5772 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5773 """Shutdown block devices of an instance.
5775 This function checks that the instance is down before calling
5776 _ShutdownInstanceDisks.
5778 """
5779 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5780 _ShutdownInstanceDisks(lu, instance, disks=disks)
5783 def _ExpandCheckDisks(instance, disks):
5784 """Return the instance disks selected by the disks list
5786 @type disks: list of L{objects.Disk} or None
5787 @param disks: selected disks
5788 @rtype: list of L{objects.Disk}
5789 @return: selected instance disks to act on
5791 """
5792 if disks is None:
5793 return instance.disks
5794 else:
5795 if not set(disks).issubset(instance.disks):
5796 raise errors.ProgrammerError("Can only act on disks belonging to the"
5797 " target instance")
5798 return disks
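# Illustrative behavior sketch (assumes plain Python semantics only;
# d0/d1 stand for disks of the instance):
#   _ExpandCheckDisks(instance, None)      -> instance.disks (all disks)
#   _ExpandCheckDisks(instance, [d0, d1])  -> [d0, d1] when both belong
#                                             to the instance, otherwise
#                                             ProgrammerError is raised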
5801 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5802 """Shutdown block devices of an instance.
5804 This does the shutdown on all nodes of the instance.
5806 If C{ignore_primary} is false, errors on the primary node cause a
5807 False return value; otherwise they are ignored.
5809 """
5810 all_result = True
5811 disks = _ExpandCheckDisks(instance, disks)
5813 for disk in disks:
5814 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5815 lu.cfg.SetDiskID(top_disk, node)
5816 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5817 msg = result.fail_msg
5818 if msg:
5819 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5820 disk.iv_name, node, msg)
5821 if ((node == instance.primary_node and not ignore_primary) or
5822 (node != instance.primary_node and not result.offline)):
5823 all_result = False
5824 return all_result
5827 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5828 """Checks if a node has enough free memory.
5830 This function checks if a given node has the needed amount of free
5831 memory. In case the node has less memory or we cannot get the
5832 information from the node, this function raises an OpPrereqError
5833 exception.
5835 @type lu: C{LogicalUnit}
5836 @param lu: a logical unit from which we get configuration data
5837 @type node: C{str}
5838 @param node: the node to check
5839 @type reason: C{str}
5840 @param reason: string to use in the error message
5841 @type requested: C{int}
5842 @param requested: the amount of memory in MiB to check for
5843 @type hypervisor_name: C{str}
5844 @param hypervisor_name: the hypervisor to ask for memory stats
5845 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5846 we cannot check the node
5848 """
5849 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5850 nodeinfo[node].Raise("Can't get data from node %s" % node,
5851 prereq=True, ecode=errors.ECODE_ENVIRON)
5852 free_mem = nodeinfo[node].payload.get("memory_free", None)
5853 if not isinstance(free_mem, int):
5854 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5855 " was '%s'" % (node, free_mem),
5856 errors.ECODE_ENVIRON)
5857 if requested > free_mem:
5858 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5859 " needed %s MiB, available %s MiB" %
5860 (node, reason, requested, free_mem),
5861 errors.ECODE_NORES)
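# Hedged usage sketch (hypothetical values, not from the original
# source): check for 1024 MiB before starting an instance:
#   _CheckNodeFreeMemory(self, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        1024, instance.hypervisor)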
5864 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5865 """Checks if nodes have enough free disk space in the all VGs.
5867 This function check if all given nodes have the needed amount of
5868 free disk. In case any node has less disk or we cannot get the
5869 information from the node, this function raise an OpPrereqError
5872 @type lu: C{LogicalUnit}
5873 @param lu: a logical unit from which we get configuration data
5874 @type nodenames: C{list}
5875 @param nodenames: the list of node names to check
5876 @type req_sizes: C{dict}
5877 @param req_sizes: the hash of vg and corresponding amount of disk in
5878 MiB to check for
5879 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5880 or we cannot check the node
5882 """
5883 for vg, req_size in req_sizes.items():
5884 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
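# Hedged example of the req_sizes shape expected above (hypothetical
# VG names; sizes in MiB):
#   req_sizes = {"xenvg": 10240, "datavg": 2048}
#   _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)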
5887 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5888 """Checks if nodes have enough free disk space in the specified VG.
5890 This function checks if all given nodes have the needed amount of
5891 free disk. In case any node has less disk or we cannot get the
5892 information from the node, this function raises an OpPrereqError
5893 exception.
5895 @type lu: C{LogicalUnit}
5896 @param lu: a logical unit from which we get configuration data
5897 @type nodenames: C{list}
5898 @param nodenames: the list of node names to check
5899 @type vg: C{str}
5900 @param vg: the volume group to check
5901 @type requested: C{int}
5902 @param requested: the amount of disk in MiB to check for
5903 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5904 or we cannot check the node
5906 """
5907 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5908 for node in nodenames:
5909 info = nodeinfo[node]
5910 info.Raise("Cannot get current information from node %s" % node,
5911 prereq=True, ecode=errors.ECODE_ENVIRON)
5912 vg_free = info.payload.get("vg_free", None)
5913 if not isinstance(vg_free, int):
5914 raise errors.OpPrereqError("Can't compute free disk space on node"
5915 " %s for vg %s, result was '%s'" %
5916 (node, vg, vg_free), errors.ECODE_ENVIRON)
5917 if requested > vg_free:
5918 raise errors.OpPrereqError("Not enough disk space on target node %s"
5919 " vg %s: required %d MiB, available %d MiB" %
5920 (node, vg, requested, vg_free),
5921 errors.ECODE_NORES)
5924 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
5925 """Checks if nodes have enough physical CPUs
5927 This function checks if all given nodes have the needed number of
5928 physical CPUs. In case any node has less CPUs or we cannot get the
5929 information from the node, this function raises an OpPrereqError
5932 @type lu: C{LogicalUnit}
5933 @param lu: a logical unit from which we get configuration data
5934 @type nodenames: C{list}
5935 @param nodenames: the list of node names to check
5936 @type requested: C{int}
5937 @param requested: the minimum acceptable number of physical CPUs
5938 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
5939 or we cannot check the node
5941 """
5942 nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name)
5943 for node in nodenames:
5944 info = nodeinfo[node]
5945 info.Raise("Cannot get current information from node %s" % node,
5946 prereq=True, ecode=errors.ECODE_ENVIRON)
5947 num_cpus = info.payload.get("cpu_total", None)
5948 if not isinstance(num_cpus, int):
5949 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
5950 " on node %s, result was '%s'" %
5951 (node, num_cpus), errors.ECODE_ENVIRON)
5952 if requested > num_cpus:
5953 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
5954 "required" % (node, num_cpus, requested),
5958 class LUInstanceStartup(LogicalUnit):
5959 """Starts an instance.
5962 HPATH = "instance-start"
5963 HTYPE = constants.HTYPE_INSTANCE
5964 REQ_BGL = False
5966 def CheckArguments(self):
5968 if self.op.beparams:
5969 # fill the beparams dict
5970 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5972 def ExpandNames(self):
5973 self._ExpandAndLockInstance()
5975 def BuildHooksEnv(self):
5976 """Build hooks env.
5978 This runs on master, primary and secondary nodes of the instance.
5980 """
5981 env = {
5982 "FORCE": self.op.force,
5985 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5987 return env
5989 def BuildHooksNodes(self):
5990 """Build hooks nodes.
5993 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5994 return (nl, nl)
5996 def CheckPrereq(self):
5997 """Check prerequisites.
5999 This checks that the instance is in the cluster.
6001 """
6002 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6003 assert self.instance is not None, \
6004 "Cannot retrieve locked instance %s" % self.op.instance_name
6007 if self.op.hvparams:
6008 # check hypervisor parameter syntax (locally)
6009 cluster = self.cfg.GetClusterInfo()
6010 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6011 filled_hvp = cluster.FillHV(instance)
6012 filled_hvp.update(self.op.hvparams)
6013 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6014 hv_type.CheckParameterSyntax(filled_hvp)
6015 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
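# Hedged sketch of the parameter layering above (values are made up):
# cluster-level defaults are filled in first, then the per-opcode
# overrides win:
#   filled_hvp = cluster.FillHV(instance)   # e.g. {"acpi": True}
#   filled_hvp.update(self.op.hvparams)     # e.g. {"acpi": False} wins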
6017 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6019 if self.primary_offline and self.op.ignore_offline_nodes:
6020 self.proc.LogWarning("Ignoring offline primary node")
6022 if self.op.hvparams or self.op.beparams:
6023 self.proc.LogWarning("Overridden parameters are ignored")
6024 else:
6025 _CheckNodeOnline(self, instance.primary_node)
6027 bep = self.cfg.GetClusterInfo().FillBE(instance)
6029 # check bridges existence
6030 _CheckInstanceBridgesExist(self, instance)
6032 remote_info = self.rpc.call_instance_info(instance.primary_node,
6033 instance.name,
6034 instance.hypervisor)
6035 remote_info.Raise("Error checking node %s" % instance.primary_node,
6036 prereq=True, ecode=errors.ECODE_ENVIRON)
6037 if not remote_info.payload: # not running already
6038 _CheckNodeFreeMemory(self, instance.primary_node,
6039 "starting instance %s" % instance.name,
6040 bep[constants.BE_MEMORY], instance.hypervisor)
6042 def Exec(self, feedback_fn):
6043 """Start the instance.
6046 instance = self.instance
6047 force = self.op.force
6049 if not self.op.no_remember:
6050 self.cfg.MarkInstanceUp(instance.name)
6052 if self.primary_offline:
6053 assert self.op.ignore_offline_nodes
6054 self.proc.LogInfo("Primary node offline, marked instance as started")
6055 else:
6056 node_current = instance.primary_node
6058 _StartInstanceDisks(self, instance, force)
6060 result = self.rpc.call_instance_start(node_current, instance,
6061 self.op.hvparams, self.op.beparams,
6062 self.op.startup_paused)
6063 msg = result.fail_msg
6064 if msg:
6065 _ShutdownInstanceDisks(self, instance)
6066 raise errors.OpExecError("Could not start instance: %s" % msg)
6069 class LUInstanceReboot(LogicalUnit):
6070 """Reboot an instance.
6073 HPATH = "instance-reboot"
6074 HTYPE = constants.HTYPE_INSTANCE
6075 REQ_BGL = False
6077 def ExpandNames(self):
6078 self._ExpandAndLockInstance()
6080 def BuildHooksEnv(self):
6081 """Build hooks env.
6083 This runs on master, primary and secondary nodes of the instance.
6085 """
6086 env = {
6087 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6088 "REBOOT_TYPE": self.op.reboot_type,
6089 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6092 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6094 return env
6096 def BuildHooksNodes(self):
6097 """Build hooks nodes.
6100 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6101 return (nl, nl)
6103 def CheckPrereq(self):
6104 """Check prerequisites.
6106 This checks that the instance is in the cluster.
6108 """
6109 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6110 assert self.instance is not None, \
6111 "Cannot retrieve locked instance %s" % self.op.instance_name
6113 _CheckNodeOnline(self, instance.primary_node)
6115 # check bridges existence
6116 _CheckInstanceBridgesExist(self, instance)
6118 def Exec(self, feedback_fn):
6119 """Reboot the instance.
6122 instance = self.instance
6123 ignore_secondaries = self.op.ignore_secondaries
6124 reboot_type = self.op.reboot_type
6126 remote_info = self.rpc.call_instance_info(instance.primary_node,
6127 instance.name,
6128 instance.hypervisor)
6129 remote_info.Raise("Error checking node %s" % instance.primary_node)
6130 instance_running = bool(remote_info.payload)
6132 node_current = instance.primary_node
6134 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6135 constants.INSTANCE_REBOOT_HARD]:
6136 for disk in instance.disks:
6137 self.cfg.SetDiskID(disk, node_current)
6138 result = self.rpc.call_instance_reboot(node_current, instance,
6139 reboot_type,
6140 self.op.shutdown_timeout)
6141 result.Raise("Could not reboot instance")
6142 else:
6143 if instance_running:
6144 result = self.rpc.call_instance_shutdown(node_current, instance,
6145 self.op.shutdown_timeout)
6146 result.Raise("Could not shutdown instance for full reboot")
6147 _ShutdownInstanceDisks(self, instance)
6148 else:
6149 self.LogInfo("Instance %s was already stopped, starting now",
6150 instance.name)
6151 _StartInstanceDisks(self, instance, ignore_secondaries)
6152 result = self.rpc.call_instance_start(node_current, instance,
6153 None, None, False)
6154 msg = result.fail_msg
6155 if msg:
6156 _ShutdownInstanceDisks(self, instance)
6157 raise errors.OpExecError("Could not start instance for"
6158 " full reboot: %s" % msg)
6160 self.cfg.MarkInstanceUp(instance.name)
6163 class LUInstanceShutdown(LogicalUnit):
6164 """Shutdown an instance.
6167 HPATH = "instance-stop"
6168 HTYPE = constants.HTYPE_INSTANCE
6169 REQ_BGL = False
6171 def ExpandNames(self):
6172 self._ExpandAndLockInstance()
6174 def BuildHooksEnv(self):
6175 """Build hooks env.
6177 This runs on master, primary and secondary nodes of the instance.
6179 """
6180 env = _BuildInstanceHookEnvByObject(self, self.instance)
6181 env["TIMEOUT"] = self.op.timeout
6184 def BuildHooksNodes(self):
6185 """Build hooks nodes.
6188 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6189 return (nl, nl)
6191 def CheckPrereq(self):
6192 """Check prerequisites.
6194 This checks that the instance is in the cluster.
6196 """
6197 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6198 assert self.instance is not None, \
6199 "Cannot retrieve locked instance %s" % self.op.instance_name
6201 self.primary_offline = \
6202 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6204 if self.primary_offline and self.op.ignore_offline_nodes:
6205 self.proc.LogWarning("Ignoring offline primary node")
6206 else:
6207 _CheckNodeOnline(self, self.instance.primary_node)
6209 def Exec(self, feedback_fn):
6210 """Shutdown the instance.
6213 instance = self.instance
6214 node_current = instance.primary_node
6215 timeout = self.op.timeout
6217 if not self.op.no_remember:
6218 self.cfg.MarkInstanceDown(instance.name)
6220 if self.primary_offline:
6221 assert self.op.ignore_offline_nodes
6222 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6223 else:
6224 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6225 msg = result.fail_msg
6226 if msg:
6227 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6229 _ShutdownInstanceDisks(self, instance)
6232 class LUInstanceReinstall(LogicalUnit):
6233 """Reinstall an instance.
6236 HPATH = "instance-reinstall"
6237 HTYPE = constants.HTYPE_INSTANCE
6238 REQ_BGL = False
6240 def ExpandNames(self):
6241 self._ExpandAndLockInstance()
6243 def BuildHooksEnv(self):
6244 """Build hooks env.
6246 This runs on master, primary and secondary nodes of the instance.
6248 """
6249 return _BuildInstanceHookEnvByObject(self, self.instance)
6251 def BuildHooksNodes(self):
6252 """Build hooks nodes.
6255 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6256 return (nl, nl)
6258 def CheckPrereq(self):
6259 """Check prerequisites.
6261 This checks that the instance is in the cluster and is not running.
6263 """
6264 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6265 assert instance is not None, \
6266 "Cannot retrieve locked instance %s" % self.op.instance_name
6267 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6268 " offline, cannot reinstall")
6269 for node in instance.secondary_nodes:
6270 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6271 " cannot reinstall")
6273 if instance.disk_template == constants.DT_DISKLESS:
6274 raise errors.OpPrereqError("Instance '%s' has no disks" %
6275 self.op.instance_name,
6276 errors.ECODE_INVAL)
6277 _CheckInstanceDown(self, instance, "cannot reinstall")
6279 if self.op.os_type is not None:
6281 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6282 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6283 instance_os = self.op.os_type
6284 else:
6285 instance_os = instance.os
6287 nodelist = list(instance.all_nodes)
6289 if self.op.osparams:
6290 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6291 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6292 self.os_inst = i_osdict # the new dict (without defaults)
6293 else:
6294 self.os_inst = {}
6296 self.instance = instance
6298 def Exec(self, feedback_fn):
6299 """Reinstall the instance.
6302 inst = self.instance
6304 if self.op.os_type is not None:
6305 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6306 inst.os = self.op.os_type
6307 # Write to configuration
6308 self.cfg.Update(inst, feedback_fn)
6310 _StartInstanceDisks(self, inst, None)
6311 try:
6312 feedback_fn("Running the instance OS create scripts...")
6313 # FIXME: pass debug option from opcode to backend
6314 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
6315 self.op.debug_level,
6316 osparams=self.os_inst)
6317 result.Raise("Could not install OS for instance %s on node %s" %
6318 (inst.name, inst.primary_node))
6319 finally:
6320 _ShutdownInstanceDisks(self, inst)
6323 class LUInstanceRecreateDisks(LogicalUnit):
6324 """Recreate an instance's missing disks.
6327 HPATH = "instance-recreate-disks"
6328 HTYPE = constants.HTYPE_INSTANCE
6329 REQ_BGL = False
6331 def CheckArguments(self):
6332 # normalise the disk list
6333 self.op.disks = sorted(frozenset(self.op.disks))
6335 def ExpandNames(self):
6336 self._ExpandAndLockInstance()
6337 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6338 if self.op.nodes:
6339 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6340 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6341 else:
6342 self.needed_locks[locking.LEVEL_NODE] = []
6344 def DeclareLocks(self, level):
6345 if level == locking.LEVEL_NODE:
6346 # if we replace the nodes, we only need to lock the old primary,
6347 # otherwise we need to lock all nodes for disk re-creation
6348 primary_only = bool(self.op.nodes)
6349 self._LockInstancesNodes(primary_only=primary_only)
6351 def BuildHooksEnv(self):
6352 """Build hooks env.
6354 This runs on master, primary and secondary nodes of the instance.
6356 """
6357 return _BuildInstanceHookEnvByObject(self, self.instance)
6359 def BuildHooksNodes(self):
6360 """Build hooks nodes.
6363 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6364 return (nl, nl)
6366 def CheckPrereq(self):
6367 """Check prerequisites.
6369 This checks that the instance is in the cluster and is not running.
6371 """
6372 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6373 assert instance is not None, \
6374 "Cannot retrieve locked instance %s" % self.op.instance_name
6376 if len(self.op.nodes) != len(instance.all_nodes):
6377 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6378 " %d replacement nodes were specified" %
6379 (instance.name, len(instance.all_nodes),
6380 len(self.op.nodes)),
6381 errors.ECODE_INVAL)
6382 assert instance.disk_template != constants.DT_DRBD8 or \
6383 len(self.op.nodes) == 2
6384 assert instance.disk_template != constants.DT_PLAIN or \
6385 len(self.op.nodes) == 1
6386 primary_node = self.op.nodes[0]
6387 else:
6388 primary_node = instance.primary_node
6389 _CheckNodeOnline(self, primary_node)
6391 if instance.disk_template == constants.DT_DISKLESS:
6392 raise errors.OpPrereqError("Instance '%s' has no disks" %
6393 self.op.instance_name, errors.ECODE_INVAL)
6394 # if we replace nodes *and* the old primary is offline, we don't
6395 # check that the instance is down
6396 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6397 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6398 if not (self.op.nodes and old_pnode.offline):
6399 _CheckInstanceDown(self, instance, "cannot recreate disks")
6401 if not self.op.disks:
6402 self.op.disks = range(len(instance.disks))
6403 else:
6404 for idx in self.op.disks:
6405 if idx >= len(instance.disks):
6406 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6407 errors.ECODE_INVAL)
6408 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6409 raise errors.OpPrereqError("Can't recreate disks partially and"
6410 " change the nodes at the same time",
6412 self.instance = instance
6414 def Exec(self, feedback_fn):
6415 """Recreate the disks.
6418 instance = self.instance
6420 to_skip = []
6421 mods = [] # keeps track of needed logical_id changes
6423 for idx, disk in enumerate(instance.disks):
6424 if idx not in self.op.disks: # disk idx has not been passed in
6425 to_skip.append(idx)
6426 continue
6427 # update secondaries for disks, if needed
6428 if self.op.nodes:
6429 if disk.dev_type == constants.LD_DRBD8:
6430 # need to update the nodes and minors
6431 assert len(self.op.nodes) == 2
6432 assert len(disk.logical_id) == 6 # otherwise disk internals
6433 # have changed
6434 (_, _, old_port, _, _, old_secret) = disk.logical_id
6435 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6436 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6437 new_minors[0], new_minors[1], old_secret)
6438 assert len(disk.logical_id) == len(new_id)
6439 mods.append((idx, new_id))
6441 # now that we have passed all asserts above, we can apply the mods
6442 # in a single run (to avoid partial changes)
6443 for idx, new_id in mods:
6444 instance.disks[idx].logical_id = new_id
6446 # change primary node, if needed
6447 if self.op.nodes:
6448 instance.primary_node = self.op.nodes[0]
6449 self.LogWarning("Changing the instance's nodes, you will have to"
6450 " remove any disks left on the older nodes manually")
6453 self.cfg.Update(instance, feedback_fn)
6455 _CreateDisks(self, instance, to_skip=to_skip)
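# Hedged illustration of the DRBD8 logical_id rewrite above; the
# 6-tuple layout follows the asserts in Exec, and all concrete values
# below are made up:
#   old logical_id: ("nodeA", "nodeB", 11000, 0, 1, "secret")
#   new logical_id: ("nodeC", "nodeD", 11000, new_minors[0],
#                    new_minors[1], "secret")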
6458 class LUInstanceRename(LogicalUnit):
6459 """Rename an instance.
6462 HPATH = "instance-rename"
6463 HTYPE = constants.HTYPE_INSTANCE
6465 def CheckArguments(self):
6466 """Check arguments.
6468 """
6469 if self.op.ip_check and not self.op.name_check:
6470 # TODO: make the ip check more flexible and not depend on the name check
6471 raise errors.OpPrereqError("IP address check requires a name check",
6472 errors.ECODE_INVAL)
6474 def BuildHooksEnv(self):
6475 """Build hooks env.
6477 This runs on master, primary and secondary nodes of the instance.
6479 """
6480 env = _BuildInstanceHookEnvByObject(self, self.instance)
6481 env["INSTANCE_NEW_NAME"] = self.op.new_name
6484 def BuildHooksNodes(self):
6485 """Build hooks nodes.
6488 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6489 return (nl, nl)
6491 def CheckPrereq(self):
6492 """Check prerequisites.
6494 This checks that the instance is in the cluster and is not running.
6496 """
6497 self.op.instance_name = _ExpandInstanceName(self.cfg,
6498 self.op.instance_name)
6499 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6500 assert instance is not None
6501 _CheckNodeOnline(self, instance.primary_node)
6502 _CheckInstanceDown(self, instance, "cannot rename")
6503 self.instance = instance
6505 new_name = self.op.new_name
6506 if self.op.name_check:
6507 hostname = netutils.GetHostname(name=new_name)
6508 if hostname != new_name:
6509 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6510 hostname.name)
6511 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6512 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6513 " same as given hostname '%s'") %
6514 (hostname.name, self.op.new_name),
6515 errors.ECODE_INVAL)
6516 new_name = self.op.new_name = hostname.name
6517 if (self.op.ip_check and
6518 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6519 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6520 (hostname.ip, new_name),
6521 errors.ECODE_NOTUNIQUE)
6523 instance_list = self.cfg.GetInstanceList()
6524 if new_name in instance_list and new_name != instance.name:
6525 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6526 new_name, errors.ECODE_EXISTS)
6528 def Exec(self, feedback_fn):
6529 """Rename the instance.
6532 inst = self.instance
6533 old_name = inst.name
6535 rename_file_storage = False
6536 if (inst.disk_template in constants.DTS_FILEBASED and
6537 self.op.new_name != inst.name):
6538 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6539 rename_file_storage = True
6541 self.cfg.RenameInstance(inst.name, self.op.new_name)
6542 # Change the instance lock. This is definitely safe while we hold the BGL.
6543 # Otherwise the new lock would have to be added in acquired mode.
6544 assert self.REQ_BGL
6545 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6546 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6548 # re-read the instance from the configuration after rename
6549 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6551 if rename_file_storage:
6552 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6553 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6554 old_file_storage_dir,
6555 new_file_storage_dir)
6556 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6557 " (but the instance has been renamed in Ganeti)" %
6558 (inst.primary_node, old_file_storage_dir,
6559 new_file_storage_dir))
6561 _StartInstanceDisks(self, inst, None)
6562 try:
6563 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6564 old_name, self.op.debug_level)
6565 msg = result.fail_msg
6566 if msg:
6567 msg = ("Could not run OS rename script for instance %s on node %s"
6568 " (but the instance has been renamed in Ganeti): %s" %
6569 (inst.name, inst.primary_node, msg))
6570 self.proc.LogWarning(msg)
6571 finally:
6572 _ShutdownInstanceDisks(self, inst)
6574 return inst.name
6577 class LUInstanceRemove(LogicalUnit):
6578 """Remove an instance.
6581 HPATH = "instance-remove"
6582 HTYPE = constants.HTYPE_INSTANCE
6583 REQ_BGL = False
6585 def ExpandNames(self):
6586 self._ExpandAndLockInstance()
6587 self.needed_locks[locking.LEVEL_NODE] = []
6588 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6590 def DeclareLocks(self, level):
6591 if level == locking.LEVEL_NODE:
6592 self._LockInstancesNodes()
6594 def BuildHooksEnv(self):
6595 """Build hooks env.
6597 This runs on master, primary and secondary nodes of the instance.
6599 """
6600 env = _BuildInstanceHookEnvByObject(self, self.instance)
6601 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6604 def BuildHooksNodes(self):
6605 """Build hooks nodes.
6608 nl = [self.cfg.GetMasterNode()]
6609 nl_post = list(self.instance.all_nodes) + nl
6610 return (nl, nl_post)
6612 def CheckPrereq(self):
6613 """Check prerequisites.
6615 This checks that the instance is in the cluster.
6617 """
6618 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6619 assert self.instance is not None, \
6620 "Cannot retrieve locked instance %s" % self.op.instance_name
6622 def Exec(self, feedback_fn):
6623 """Remove the instance.
6626 instance = self.instance
6627 logging.info("Shutting down instance %s on node %s",
6628 instance.name, instance.primary_node)
6630 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6631 self.op.shutdown_timeout)
6632 msg = result.fail_msg
6633 if msg:
6634 if self.op.ignore_failures:
6635 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6636 else:
6637 raise errors.OpExecError("Could not shutdown instance %s on"
6638 " node %s: %s" %
6639 (instance.name, instance.primary_node, msg))
6641 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6644 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6645 """Utility function to remove an instance.
6648 logging.info("Removing block devices for instance %s", instance.name)
6650 if not _RemoveDisks(lu, instance):
6651 if not ignore_failures:
6652 raise errors.OpExecError("Can't remove instance's disks")
6653 feedback_fn("Warning: can't remove instance's disks")
6655 logging.info("Removing instance %s out of cluster config", instance.name)
6657 lu.cfg.RemoveInstance(instance.name)
6659 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6660 "Instance lock removal conflict"
6662 # Remove lock for the instance
6663 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6666 class LUInstanceQuery(NoHooksLU):
6667 """Logical unit for querying instances.
6670 # pylint: disable=W0142
6671 REQ_BGL = False
6673 def CheckArguments(self):
6674 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6675 self.op.output_fields, self.op.use_locking)
6677 def ExpandNames(self):
6678 self.iq.ExpandNames(self)
6680 def DeclareLocks(self, level):
6681 self.iq.DeclareLocks(self, level)
6683 def Exec(self, feedback_fn):
6684 return self.iq.OldStyleQuery(self)
6687 class LUInstanceFailover(LogicalUnit):
6688 """Failover an instance.
6691 HPATH = "instance-failover"
6692 HTYPE = constants.HTYPE_INSTANCE
6693 REQ_BGL = False
6695 def CheckArguments(self):
6696 """Check the arguments.
6699 self.iallocator = getattr(self.op, "iallocator", None)
6700 self.target_node = getattr(self.op, "target_node", None)
6702 def ExpandNames(self):
6703 self._ExpandAndLockInstance()
6705 if self.op.target_node is not None:
6706 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6708 self.needed_locks[locking.LEVEL_NODE] = []
6709 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6711 ignore_consistency = self.op.ignore_consistency
6712 shutdown_timeout = self.op.shutdown_timeout
6713 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6714 cleanup=False,
6715 failover=True,
6716 ignore_consistency=ignore_consistency,
6717 shutdown_timeout=shutdown_timeout)
6718 self.tasklets = [self._migrater]
6720 def DeclareLocks(self, level):
6721 if level == locking.LEVEL_NODE:
6722 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6723 if instance.disk_template in constants.DTS_EXT_MIRROR:
6724 if self.op.target_node is None:
6725 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6727 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6728 self.op.target_node]
6729 del self.recalculate_locks[locking.LEVEL_NODE]
6730 else:
6731 self._LockInstancesNodes()
6733 def BuildHooksEnv(self):
6734 """Build hooks env.
6736 This runs on master, primary and secondary nodes of the instance.
6738 """
6739 instance = self._migrater.instance
6740 source_node = instance.primary_node
6741 target_node = self.op.target_node
6742 env = {
6743 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6744 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6745 "OLD_PRIMARY": source_node,
6746 "NEW_PRIMARY": target_node,
6749 if instance.disk_template in constants.DTS_INT_MIRROR:
6750 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6751 env["NEW_SECONDARY"] = source_node
6753 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6755 env.update(_BuildInstanceHookEnvByObject(self, instance))
6757 return env
6759 def BuildHooksNodes(self):
6760 """Build hooks nodes.
6763 instance = self._migrater.instance
6764 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6765 return (nl, nl + [instance.primary_node])
6768 class LUInstanceMigrate(LogicalUnit):
6769 """Migrate an instance.
6771 This is migration without shutting down, compared to the failover,
6772 which is done with shutdown.
6774 """
6775 HPATH = "instance-migrate"
6776 HTYPE = constants.HTYPE_INSTANCE
6777 REQ_BGL = False
6779 def ExpandNames(self):
6780 self._ExpandAndLockInstance()
6782 if self.op.target_node is not None:
6783 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6785 self.needed_locks[locking.LEVEL_NODE] = []
6786 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6788 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6789 cleanup=self.op.cleanup,
6790 failover=False,
6791 fallback=self.op.allow_failover)
6792 self.tasklets = [self._migrater]
6794 def DeclareLocks(self, level):
6795 if level == locking.LEVEL_NODE:
6796 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6797 if instance.disk_template in constants.DTS_EXT_MIRROR:
6798 if self.op.target_node is None:
6799 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6801 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6802 self.op.target_node]
6803 del self.recalculate_locks[locking.LEVEL_NODE]
6804 else:
6805 self._LockInstancesNodes()
6807 def BuildHooksEnv(self):
6808 """Build hooks env.
6810 This runs on master, primary and secondary nodes of the instance.
6812 """
6813 instance = self._migrater.instance
6814 source_node = instance.primary_node
6815 target_node = self.op.target_node
6816 env = _BuildInstanceHookEnvByObject(self, instance)
6817 env.update({
6818 "MIGRATE_LIVE": self._migrater.live,
6819 "MIGRATE_CLEANUP": self.op.cleanup,
6820 "OLD_PRIMARY": source_node,
6821 "NEW_PRIMARY": target_node,
6824 if instance.disk_template in constants.DTS_INT_MIRROR:
6825 env["OLD_SECONDARY"] = target_node
6826 env["NEW_SECONDARY"] = source_node
6828 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6832 def BuildHooksNodes(self):
6833 """Build hooks nodes.
6836 instance = self._migrater.instance
6837 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6838 return (nl, nl + [instance.primary_node])
6841 class LUInstanceMove(LogicalUnit):
6842 """Move an instance by data-copying.
6845 HPATH = "instance-move"
6846 HTYPE = constants.HTYPE_INSTANCE
6847 REQ_BGL = False
6849 def ExpandNames(self):
6850 self._ExpandAndLockInstance()
6851 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6852 self.op.target_node = target_node
6853 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6854 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6856 def DeclareLocks(self, level):
6857 if level == locking.LEVEL_NODE:
6858 self._LockInstancesNodes(primary_only=True)
6860 def BuildHooksEnv(self):
6861 """Build hooks env.
6863 This runs on master, primary and secondary nodes of the instance.
6865 """
6866 env = {
6867 "TARGET_NODE": self.op.target_node,
6868 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6870 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6871 return env
6873 def BuildHooksNodes(self):
6874 """Build hooks nodes.
6878 self.cfg.GetMasterNode(),
6879 self.instance.primary_node,
6880 self.op.target_node,
6881 ]
6883 return (nl, nl)
6884 def CheckPrereq(self):
6885 """Check prerequisites.
6887 This checks that the instance is in the cluster.
6889 """
6890 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6891 assert self.instance is not None, \
6892 "Cannot retrieve locked instance %s" % self.op.instance_name
6894 node = self.cfg.GetNodeInfo(self.op.target_node)
6895 assert node is not None, \
6896 "Cannot retrieve locked node %s" % self.op.target_node
6898 self.target_node = target_node = node.name
6900 if target_node == instance.primary_node:
6901 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6902 (instance.name, target_node),
6903 errors.ECODE_STATE)
6905 bep = self.cfg.GetClusterInfo().FillBE(instance)
6907 for idx, dsk in enumerate(instance.disks):
6908 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6909 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6910 " cannot copy" % idx, errors.ECODE_STATE)
6912 _CheckNodeOnline(self, target_node)
6913 _CheckNodeNotDrained(self, target_node)
6914 _CheckNodeVmCapable(self, target_node)
6916 if instance.admin_up:
6917 # check memory requirements on the secondary node
6918 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6919 instance.name, bep[constants.BE_MEMORY],
6920 instance.hypervisor)
6921 else:
6922 self.LogInfo("Not checking memory on the secondary node as"
6923 " instance will not be started")
6925 # check bridge existence
6926 _CheckInstanceBridgesExist(self, instance, node=target_node)
6928 def Exec(self, feedback_fn):
6929 """Move an instance.
6931 The move is done by shutting it down on its present node, copying
6932 the data over (slow) and starting it on the new node.
6934 """
6935 instance = self.instance
6937 source_node = instance.primary_node
6938 target_node = self.target_node
6940 self.LogInfo("Shutting down instance %s on source node %s",
6941 instance.name, source_node)
6943 result = self.rpc.call_instance_shutdown(source_node, instance,
6944 self.op.shutdown_timeout)
6945 msg = result.fail_msg
6946 if msg:
6947 if self.op.ignore_consistency:
6948 self.proc.LogWarning("Could not shutdown instance %s on node %s."
6949 " Proceeding anyway. Please make sure node"
6950 " %s is down. Error details: %s",
6951 instance.name, source_node, source_node, msg)
6952 else:
6953 raise errors.OpExecError("Could not shutdown instance %s on"
6954 " node %s: %s" %
6955 (instance.name, source_node, msg))
6957 # create the target disks
6958 try:
6959 _CreateDisks(self, instance, target_node=target_node)
6960 except errors.OpExecError:
6961 self.LogWarning("Device creation failed, reverting...")
6962 try:
6963 _RemoveDisks(self, instance, target_node=target_node)
6964 finally:
6965 self.cfg.ReleaseDRBDMinors(instance.name)
6966 raise
6968 cluster_name = self.cfg.GetClusterInfo().cluster_name
6970 errs = []
6971 # activate, get path, copy the data over
6972 for idx, disk in enumerate(instance.disks):
6973 self.LogInfo("Copying data for disk %d", idx)
6974 result = self.rpc.call_blockdev_assemble(target_node, disk,
6975 instance.name, True, idx)
6976 if result.fail_msg:
6977 self.LogWarning("Can't assemble newly created disk %d: %s",
6978 idx, result.fail_msg)
6979 errs.append(result.fail_msg)
6980 break
6981 dev_path = result.payload
6982 result = self.rpc.call_blockdev_export(source_node, disk,
6983 target_node, dev_path,
6984 cluster_name)
6985 if result.fail_msg:
6986 self.LogWarning("Can't copy data over for disk %d: %s",
6987 idx, result.fail_msg)
6988 errs.append(result.fail_msg)
6989 break
6991 if errs:
6992 self.LogWarning("Some disks failed to copy, aborting")
6993 try:
6994 _RemoveDisks(self, instance, target_node=target_node)
6995 finally:
6996 self.cfg.ReleaseDRBDMinors(instance.name)
6997 raise errors.OpExecError("Errors during disk copy: %s" %
6998 (",".join(errs),))
7000 instance.primary_node = target_node
7001 self.cfg.Update(instance, feedback_fn)
7003 self.LogInfo("Removing the disks on the original node")
7004 _RemoveDisks(self, instance, target_node=source_node)
7006 # Only start the instance if it's marked as up
7007 if instance.admin_up:
7008 self.LogInfo("Starting instance %s on node %s",
7009 instance.name, target_node)
7011 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7012 ignore_secondaries=True)
7013 if not disks_ok:
7014 _ShutdownInstanceDisks(self, instance)
7015 raise errors.OpExecError("Can't activate the instance's disks")
7017 result = self.rpc.call_instance_start(target_node, instance,
7018 None, None, False)
7019 msg = result.fail_msg
7020 if msg:
7021 _ShutdownInstanceDisks(self, instance)
7022 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7023 (instance.name, target_node, msg))
7026 class LUNodeMigrate(LogicalUnit):
7027 """Migrate all instances from a node.
7030 HPATH = "node-migrate"
7031 HTYPE = constants.HTYPE_NODE
7032 REQ_BGL = False
7034 def CheckArguments(self):
7035 pass
7037 def ExpandNames(self):
7038 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7040 self.share_locks = _ShareAll()
7041 self.needed_locks = {
7042 locking.LEVEL_NODE: [self.op.node_name],
7043 }
7045 def BuildHooksEnv(self):
7046 """Build hooks env.
7048 This runs on the master, the primary and all the secondaries.
7050 """
7051 return {
7052 "NODE_NAME": self.op.node_name,
7055 def BuildHooksNodes(self):
7056 """Build hooks nodes.
7059 nl = [self.cfg.GetMasterNode()]
7060 return (nl, nl)
7062 def CheckPrereq(self):
7063 pass
7065 def Exec(self, feedback_fn):
7066 # Prepare jobs to migrate instances
7067 jobs = [
7068 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7069 mode=self.op.mode,
7070 live=self.op.live,
7071 iallocator=self.op.iallocator,
7072 target_node=self.op.target_node)]
7073 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7074 ]
7076 # TODO: Run iallocator in this opcode and pass correct placement options to
7077 # OpInstanceMigrate. Since other jobs can modify the cluster between
7078 # running the iallocator and the actual migration, a good consistency model
7079 # will have to be found.
7081 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7082 frozenset([self.op.node_name]))
7084 return ResultWithJobs(jobs)
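# Hedged sketch of the value handed to ResultWithJobs above: one
# single-opcode job per primary instance on the node (instance names
# are made up):
#   jobs = [[opcodes.OpInstanceMigrate(instance_name="inst1", ...)],
#           [opcodes.OpInstanceMigrate(instance_name="inst2", ...)]]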
7087 class TLMigrateInstance(Tasklet):
7088 """Tasklet class for instance migration.
7091 @ivar live: whether the migration will be done live or non-live;
7092 this variable is initialized only after CheckPrereq has run
7093 @type cleanup: boolean
7094 @ivar cleanup: Whether we clean up from a failed migration
7095 @type iallocator: string
7096 @ivar iallocator: The iallocator used to determine target_node
7097 @type target_node: string
7098 @ivar target_node: If given, the target_node to reallocate the instance to
7099 @type failover: boolean
7100 @ivar failover: Whether operation results in failover or migration
7101 @type fallback: boolean
7102 @ivar fallback: Whether fallback to failover is allowed if migration is
7103 not possible
7104 @type ignore_consistency: boolean
7105 @ivar ignore_consistency: Whether we should ignore consistency between
7106 source and target node
7107 @type shutdown_timeout: int
7108 @ivar shutdown_timeout: In case of failover, timeout for the shutdown
7110 """
7113 _MIGRATION_POLL_INTERVAL = 1 # seconds
7114 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7116 def __init__(self, lu, instance_name, cleanup=False,
7117 failover=False, fallback=False,
7118 ignore_consistency=False,
7119 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7120 """Initializes this class.
7123 Tasklet.__init__(self, lu)
7125 # Parameters
7126 self.instance_name = instance_name
7127 self.cleanup = cleanup
7128 self.live = False # will be overridden later
7129 self.failover = failover
7130 self.fallback = fallback
7131 self.ignore_consistency = ignore_consistency
7132 self.shutdown_timeout = shutdown_timeout
7134 def CheckPrereq(self):
7135 """Check prerequisites.
7137 This checks that the instance is in the cluster.
7139 """
7140 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7141 instance = self.cfg.GetInstanceInfo(instance_name)
7142 assert instance is not None
7143 self.instance = instance
7145 if (not self.cleanup and not instance.admin_up and not self.failover and
7146 self.fallback):
7147 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7148 " to failover")
7149 self.failover = True
7151 if instance.disk_template not in constants.DTS_MIRRORED:
7152 if self.failover:
7153 text = "failovers"
7154 else:
7155 text = "migrations"
7156 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7157 " %s" % (instance.disk_template, text),
7160 if instance.disk_template in constants.DTS_EXT_MIRROR:
7161 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7163 if self.lu.op.iallocator:
7164 self._RunAllocator()
7165 else:
7166 # We set self.target_node as it is required by
7167 # BuildHooksEnv
7168 self.target_node = self.lu.op.target_node
7170 # self.target_node is already populated, either directly or by the
7171 # iallocator run
7172 target_node = self.target_node
7173 if self.target_node == instance.primary_node:
7174 raise errors.OpPrereqError("Cannot migrate instance %s"
7175 " to its primary (%s)" %
7176 (instance.name, instance.primary_node))
7178 if len(self.lu.tasklets) == 1:
7179 # It is safe to release locks only when we're the only tasklet
7180 # in the LU
7181 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7182 keep=[instance.primary_node, self.target_node])
7184 else:
7185 secondary_nodes = instance.secondary_nodes
7186 if not secondary_nodes:
7187 raise errors.ConfigurationError("No secondary node but using"
7188 " %s disk template" %
7189 instance.disk_template)
7190 target_node = secondary_nodes[0]
7191 if self.lu.op.iallocator or (self.lu.op.target_node and
7192 self.lu.op.target_node != target_node):
7193 if self.failover:
7194 text = "failed over"
7197 raise errors.OpPrereqError("Instances with disk template %s cannot"
7198 " be %s to arbitrary nodes"
7199 " (neither an iallocator nor a target"
7200 " node can be passed)" %
7201 (instance.disk_template, text),
7202 errors.ECODE_INVAL)
7204 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7206 # check memory requirements on the secondary node
7207 if not self.failover or instance.admin_up:
7208 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7209 instance.name, i_be[constants.BE_MEMORY],
7210 instance.hypervisor)
7211 else:
7212 self.lu.LogInfo("Not checking memory on the secondary node as"
7213 " instance will not be started")
7215 # check bridge existance
7216 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7218 if not self.cleanup:
7219 _CheckNodeNotDrained(self.lu, target_node)
7220 if not self.failover:
7221 result = self.rpc.call_instance_migratable(instance.primary_node,
7222 instance)
7223 if result.fail_msg and self.fallback:
7224 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7225 " failover")
7226 self.failover = True
7227 else:
7228 result.Raise("Can't migrate, please use failover",
7229 prereq=True, ecode=errors.ECODE_STATE)
7231 assert not (self.failover and self.cleanup)
7233 if not self.failover:
7234 if self.lu.op.live is not None and self.lu.op.mode is not None:
7235 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7236 " parameters are accepted",
7238 if self.lu.op.live is not None:
7239 if self.lu.op.live:
7240 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7241 else:
7242 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7243 # reset the 'live' parameter to None so that repeated
7244 # invocations of CheckPrereq do not raise an exception
7245 self.lu.op.live = None
7246 elif self.lu.op.mode is None:
7247 # read the default value from the hypervisor
7248 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7249 skip_globals=False)
7250 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7252 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7253 else:
7254 # Failover is never live
7255 self.live = False
7257 def _RunAllocator(self):
7258 """Run the allocator based on input opcode.
7261 ial = IAllocator(self.cfg, self.rpc,
7262 mode=constants.IALLOCATOR_MODE_RELOC,
7263 name=self.instance_name,
7264 # TODO See why hail breaks with a single node below
7265 relocate_from=[self.instance.primary_node,
7266 self.instance.primary_node],
7267 )
7269 ial.Run(self.lu.op.iallocator)
7271 if not ial.success:
7272 raise errors.OpPrereqError("Can't compute nodes using"
7273 " iallocator '%s': %s" %
7274 (self.lu.op.iallocator, ial.info),
7275 errors.ECODE_NORES)
7276 if len(ial.result) != ial.required_nodes:
7277 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7278 " of nodes (%s), required %s" %
7279 (self.lu.op.iallocator, len(ial.result),
7280 ial.required_nodes), errors.ECODE_FAULT)
7281 self.target_node = ial.result[0]
7282 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7283 self.instance_name, self.lu.op.iallocator,
7284 utils.CommaJoin(ial.result))
7286 def _WaitUntilSync(self):
7287 """Poll with custom rpc for disk sync.
7289 This uses our own step-based rpc call.
7291 """
7292 self.feedback_fn("* wait until resync is done")
7293 all_done = False
7294 while not all_done:
7295 all_done = True
7296 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7297 self.nodes_ip,
7298 self.instance.disks)
7299 min_percent = 100
7300 for node, nres in result.items():
7301 nres.Raise("Cannot resync disks on node %s" % node)
7302 node_done, node_percent = nres.payload
7303 all_done = all_done and node_done
7304 if node_percent is not None:
7305 min_percent = min(min_percent, node_percent)
7306 if not all_done:
7307 if min_percent < 100:
7308 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7309 time.sleep(2)
7311 def _EnsureSecondary(self, node):
7312 """Demote a node to secondary.
7315 self.feedback_fn("* switching node %s to secondary mode" % node)
7317 for dev in self.instance.disks:
7318 self.cfg.SetDiskID(dev, node)
7320 result = self.rpc.call_blockdev_close(node, self.instance.name,
7321 self.instance.disks)
7322 result.Raise("Cannot change disk to secondary on node %s" % node)
7324 def _GoStandalone(self):
7325 """Disconnect from the network.
7328 self.feedback_fn("* changing into standalone mode")
7329 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7330 self.instance.disks)
7331 for node, nres in result.items():
7332 nres.Raise("Cannot disconnect disks node %s" % node)
7334 def _GoReconnect(self, multimaster):
7335 """Reconnect to the network.
7341 msg = "single-master"
7342 self.feedback_fn("* changing disks into %s mode" % msg)
7343 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7344 self.instance.disks,
7345 self.instance.name, multimaster)
7346 for node, nres in result.items():
7347 nres.Raise("Cannot change disks config on node %s" % node)
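# Hedged summary sketch of how the helpers above compose during a live
# DRBD migration (it mirrors the sequence in _ExecMigration below):
#   self._EnsureSecondary(target_node)   # demote the target's disks
#   self._GoStandalone()                 # drop the DRBD network links
#   self._GoReconnect(True)              # reconnect in multimaster mode
#   self._WaitUntilSync()                # wait until resync completes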
7349 def _ExecCleanup(self):
7350 """Try to cleanup after a failed migration.
7352 The cleanup is done by:
7353 - check that the instance is running only on one node
7354 (and update the config if needed)
7355 - change disks on its secondary node to secondary
7356 - wait until disks are fully synchronized
7357 - disconnect from the network
7358 - change disks into single-master mode
7359 - wait again until disks are fully synchronized
7361 """
7362 instance = self.instance
7363 target_node = self.target_node
7364 source_node = self.source_node
7366 # check running on only one node
7367 self.feedback_fn("* checking where the instance actually runs"
7368 " (if this hangs, the hypervisor might be in"
7370 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7371 for node, result in ins_l.items():
7372 result.Raise("Can't contact node %s" % node)
7374 runningon_source = instance.name in ins_l[source_node].payload
7375 runningon_target = instance.name in ins_l[target_node].payload
7377 if runningon_source and runningon_target:
7378 raise errors.OpExecError("Instance seems to be running on two nodes,"
7379 " or the hypervisor is confused; you will have"
7380 " to ensure manually that it runs only on one"
7381 " and restart this operation")
7383 if not (runningon_source or runningon_target):
7384 raise errors.OpExecError("Instance does not seem to be running at all;"
7385 " in this case it's safer to repair by"
7386 " running 'gnt-instance stop' to ensure disk"
7387 " shutdown, and then restarting it")
7389 if runningon_target:
7390 # the migration has actually succeeded, we need to update the config
7391 self.feedback_fn("* instance running on secondary node (%s),"
7392 " updating config" % target_node)
7393 instance.primary_node = target_node
7394 self.cfg.Update(instance, self.feedback_fn)
7395 demoted_node = source_node
7396 else:
7397 self.feedback_fn("* instance confirmed to be running on its"
7398 " primary node (%s)" % source_node)
7399 demoted_node = target_node
7401 if instance.disk_template in constants.DTS_INT_MIRROR:
7402 self._EnsureSecondary(demoted_node)
7403 try:
7404 self._WaitUntilSync()
7405 except errors.OpExecError:
7406 # we ignore errors here, since if the device is standalone, it
7407 # won't be able to sync
7408 pass
7409 self._GoStandalone()
7410 self._GoReconnect(False)
7411 self._WaitUntilSync()
7413 self.feedback_fn("* done")
7415 def _RevertDiskStatus(self):
7416 """Try to revert the disk status after a failed migration.
7419 target_node = self.target_node
7420 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7421 return
7423 try:
7424 self._EnsureSecondary(target_node)
7425 self._GoStandalone()
7426 self._GoReconnect(False)
7427 self._WaitUntilSync()
7428 except errors.OpExecError, err:
7429 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7430 " please try to recover the instance manually;"
7431 " error '%s'" % str(err))
7433 def _AbortMigration(self):
7434 """Call the hypervisor code to abort a started migration.
7437 instance = self.instance
7438 target_node = self.target_node
7439 source_node = self.source_node
7440 migration_info = self.migration_info
7442 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
7443 instance,
7444 migration_info,
7445 False)
7446 abort_msg = abort_result.fail_msg
7447 if abort_msg:
7448 logging.error("Aborting migration failed on target node %s: %s",
7449 target_node, abort_msg)
7450 # Don't raise an exception here, as we still have to try to revert the
7451 # disk status, even if this step failed.
7453 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
7454 instance, False, self.live)
7455 abort_msg = abort_result.fail_msg
7456 if abort_msg:
7457 logging.error("Aborting migration failed on source node %s: %s",
7458 source_node, abort_msg)
7460 def _ExecMigration(self):
7461 """Migrate an instance.
7463 The migrate is done by:
7464 - change the disks into dual-master mode
7465 - wait until disks are fully synchronized again
7466 - migrate the instance
7467 - change disks on the new secondary node (the old primary) to secondary
7468 - wait until disks are fully synchronized
7469 - change disks into single-master mode
7471 """
7472 instance = self.instance
7473 target_node = self.target_node
7474 source_node = self.source_node
7476 # Check for hypervisor version mismatch and warn the user.
7477 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7478 None, self.instance.hypervisor)
7479 src_info = nodeinfo[source_node]
7480 dst_info = nodeinfo[target_node]
7482 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7483 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7484 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7485 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7486 if src_version != dst_version:
7487 self.feedback_fn("* warning: hypervisor version mismatch between"
7488 " source (%s) and target (%s) node" %
7489 (src_version, dst_version))
7491 self.feedback_fn("* checking disk consistency between source and target")
7492 for dev in instance.disks:
7493 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7494 raise errors.OpExecError("Disk %s is degraded or not fully"
7495 " synchronized on target node,"
7496 " aborting migration" % dev.iv_name)
7498 # First get the migration information from the remote node
7499 result = self.rpc.call_migration_info(source_node, instance)
7500 msg = result.fail_msg
7501 if msg:
7502 log_err = ("Failed fetching source migration information from %s: %s" %
7503 (source_node, msg))
7504 logging.error(log_err)
7505 raise errors.OpExecError(log_err)
7507 self.migration_info = migration_info = result.payload
7509 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7510 # Then switch the disks to master/master mode
7511 self._EnsureSecondary(target_node)
7512 self._GoStandalone()
7513 self._GoReconnect(True)
7514 self._WaitUntilSync()
7516 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7517 result = self.rpc.call_accept_instance(target_node,
7518 instance,
7519 migration_info,
7520 self.nodes_ip[target_node])
7522 msg = result.fail_msg
7523 if msg:
7524 logging.error("Instance pre-migration failed, trying to revert"
7525 " disk status: %s", msg)
7526 self.feedback_fn("Pre-migration failed, aborting")
7527 self._AbortMigration()
7528 self._RevertDiskStatus()
7529 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7530 (instance.name, msg))
7532 self.feedback_fn("* migrating instance to %s" % target_node)
7533 result = self.rpc.call_instance_migrate(source_node, instance,
7534 self.nodes_ip[target_node],
7535 self.live)
7536 msg = result.fail_msg
7537 if msg:
7538 logging.error("Instance migration failed, trying to revert"
7539 " disk status: %s", msg)
7540 self.feedback_fn("Migration failed, aborting")
7541 self._AbortMigration()
7542 self._RevertDiskStatus()
7543 raise errors.OpExecError("Could not migrate instance %s: %s" %
7544 (instance.name, msg))
7546 self.feedback_fn("* starting memory transfer")
7547 last_feedback = time.time()
7548 while True:
7549 result = self.rpc.call_instance_get_migration_status(source_node,
7550 instance)
7551 msg = result.fail_msg
7552 ms = result.payload # MigrationStatus instance
7553 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
7554 logging.error("Instance migration failed, trying to revert"
7555 " disk status: %s", msg)
7556 self.feedback_fn("Migration failed, aborting")
7557 self._AbortMigration()
7558 self._RevertDiskStatus()
7559 raise errors.OpExecError("Could not migrate instance %s: %s" %
7560 (instance.name, msg))
7562 if result.payload.status != constants.HV_MIGRATION_ACTIVE:
7563 self.feedback_fn("* memory transfer complete")
7564 break
7566 if (utils.TimeoutExpired(last_feedback,
7567 self._MIGRATION_FEEDBACK_INTERVAL) and
7568 ms.transferred_ram is not None):
7569 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
7570 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
7571 last_feedback = time.time()
7573 time.sleep(self._MIGRATION_POLL_INTERVAL)
7575 result = self.rpc.call_instance_finalize_migration_src(source_node,
7576 instance,
7577 True,
7578 self.live)
7579 msg = result.fail_msg
7580 if msg:
7581 logging.error("Instance migration succeeded, but finalization failed"
7582 " on the source node: %s", msg)
7583 raise errors.OpExecError("Could not finalize instance migration: %s" %
7584 msg)
7586 instance.primary_node = target_node
7588 # distribute new instance config to the other nodes
7589 self.cfg.Update(instance, self.feedback_fn)
7591 result = self.rpc.call_instance_finalize_migration_dst(target_node,
7592 instance,
7593 migration_info,
7594 True)
7595 msg = result.fail_msg
7596 if msg:
7597 logging.error("Instance migration succeeded, but finalization failed"
7598 " on the target node: %s", msg)
7599 raise errors.OpExecError("Could not finalize instance migration: %s" %
7600 msg)
7602 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7603 self._EnsureSecondary(source_node)
7604 self._WaitUntilSync()
7605 self._GoStandalone()
7606 self._GoReconnect(False)
7607 self._WaitUntilSync()
7609 self.feedback_fn("* done")
7611 def _ExecFailover(self):
7612 """Failover an instance.
7614 The failover is done by shutting it down on its present node and
7615 starting it on the secondary.
7618 instance = self.instance
7619 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7621 source_node = instance.primary_node
7622 target_node = self.target_node
7624 if instance.admin_up:
7625 self.feedback_fn("* checking disk consistency between source and target")
7626 for dev in instance.disks:
7627 # for drbd, these are drbd over lvm
7628 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7629 if primary_node.offline:
7630 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7631 " target node %s" %
7632 (primary_node.name, dev.iv_name, target_node))
7633 elif not self.ignore_consistency:
7634 raise errors.OpExecError("Disk %s is degraded on target node,"
7635 " aborting failover" % dev.iv_name)
7636 else:
7637 self.feedback_fn("* not checking disk consistency as instance is not"
7638 " running")
7640 self.feedback_fn("* shutting down instance on source node")
7641 logging.info("Shutting down instance %s on node %s",
7642 instance.name, source_node)
7644 result = self.rpc.call_instance_shutdown(source_node, instance,
7645 self.shutdown_timeout)
7646 msg = result.fail_msg
7647 if msg:
7648 if self.ignore_consistency or primary_node.offline:
7649 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7650 " proceeding anyway; please make sure node"
7651 " %s is down; error details: %s",
7652 instance.name, source_node, source_node, msg)
7653 else:
7654 raise errors.OpExecError("Could not shutdown instance %s on"
7655 " node %s: %s" %
7656 (instance.name, source_node, msg))
7658 self.feedback_fn("* deactivating the instance's disks on source node")
7659 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7660 raise errors.OpExecError("Can't shut down the instance's disks")
7662 instance.primary_node = target_node
7663 # distribute new instance config to the other nodes
7664 self.cfg.Update(instance, self.feedback_fn)
7666 # Only start the instance if it's marked as up
7667 if instance.admin_up:
7668 self.feedback_fn("* activating the instance's disks on target node %s" %
7669 target_node)
7670 logging.info("Starting instance %s on node %s",
7671 instance.name, target_node)
7673 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7674 ignore_secondaries=True)
7675 if not disks_ok:
7676 _ShutdownInstanceDisks(self.lu, instance)
7677 raise errors.OpExecError("Can't activate the instance's disks")
7679 self.feedback_fn("* starting the instance on the target node %s" %
7680 target_node)
7681 result = self.rpc.call_instance_start(target_node, instance, None, None,
7682 False)
7683 msg = result.fail_msg
7684 if msg:
7685 _ShutdownInstanceDisks(self.lu, instance)
7686 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7687 (instance.name, target_node, msg))
7689 def Exec(self, feedback_fn):
7690 """Perform the migration.
7693 self.feedback_fn = feedback_fn
7694 self.source_node = self.instance.primary_node
7696 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7697 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7698 self.target_node = self.instance.secondary_nodes[0]
7699 # Otherwise self.target_node has been populated either
7700 # directly, or through an iallocator.
7702 self.all_nodes = [self.source_node, self.target_node]
7703 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7704 in self.cfg.GetMultiNodeInfo(self.all_nodes))
7706 if self.failover:
7707 feedback_fn("Failover instance %s" % self.instance.name)
7708 self._ExecFailover()
7709 else:
7710 feedback_fn("Migrating instance %s" % self.instance.name)
7712 if self.cleanup:
7713 return self._ExecCleanup()
7714 else:
7715 return self._ExecMigration()
7718 def _CreateBlockDev(lu, node, instance, device, force_create,
7719 info, force_open):
7720 """Create a tree of block devices on a given node.
7722 If this device type has to be created on secondaries, create it and
7725 If not, just recurse to children keeping the same 'force' value.
7727 @param lu: the lu on whose behalf we execute
7728 @param node: the node on which to create the device
7729 @type instance: L{objects.Instance}
7730 @param instance: the instance which owns the device
7731 @type device: L{objects.Disk}
7732 @param device: the device to create
7733 @type force_create: boolean
7734 @param force_create: whether to force creation of this device; this
7735 will be changed to True whenever we find a device which has
7736 the CreateOnSecondary() attribute
7737 @param info: the extra 'metadata' we should attach to the device
7738 (this will be represented as a LVM tag)
7739 @type force_open: boolean
7740 @param force_open: this parameter will be passed to the
7741 L{backend.BlockdevCreate} function where it specifies
7742 whether we run on primary or not, and it affects both
7743 the child assembly and the device's own Open() execution
7746 if device.CreateOnSecondary():
7747 force_create = True
7749 if device.children:
7750 for child in device.children:
7751 _CreateBlockDev(lu, node, instance, child, force_create,
7752 info, force_open)
7754 if not force_create:
7755 return
7757 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
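# Illustrative note (not from the original source): for a DRBD8 disk the
# children are the data and metadata LVs, so a call such as
#   _CreateBlockDev(lu, node, instance, drbd_disk, False, info, False)
# first recurses into both LVs and only then creates the DRBD device on top;
# on the secondary node CreateOnSecondary() flips force_create to True so the
# mirror halves are created even though the device is not opened there.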
7760 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7761 """Create a single block device on a given node.
7763 This will not recurse over children of the device, so they must be
7764 created in advance.
7766 @param lu: the lu on whose behalf we execute
7767 @param node: the node on which to create the device
7768 @type instance: L{objects.Instance}
7769 @param instance: the instance which owns the device
7770 @type device: L{objects.Disk}
7771 @param device: the device to create
7772 @param info: the extra 'metadata' we should attach to the device
7773 (this will be represented as a LVM tag)
7774 @type force_open: boolean
7775 @param force_open: this parameter will be passed to the
7776 L{backend.BlockdevCreate} function where it specifies
7777 whether we run on primary or not, and it affects both
7778 the child assembly and the device's own Open() execution
7781 lu.cfg.SetDiskID(device, node)
7782 result = lu.rpc.call_blockdev_create(node, device, device.size,
7783 instance.name, force_open, info)
7784 result.Raise("Can't create block device %s on"
7785 " node %s for instance %s" % (device, node, instance.name))
7786 if device.physical_id is None:
7787 device.physical_id = result.payload
7790 def _GenerateUniqueNames(lu, exts):
7791 """Generate a suitable LV name.
7793 This will generate a logical volume name for the given instance.
7796 results = []
7797 for val in exts:
7798 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7799 results.append("%s%s" % (new_id, val))
7800 return results
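# Illustrative sketch: assuming GenerateUniqueID returns a fresh UUID per
# call, exts=[".disk0_data", ".disk0_meta"] yields something like
#   ["0f5c3a7e-....disk0_data", "9b1d2c44-....disk0_meta"]
# (the UUIDs here are made up; one new ID is reserved per extension).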
7803 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7804 iv_name, p_minor, s_minor):
7805 """Generate a drbd8 device complete with its children.
7808 assert len(vgnames) == len(names) == 2
7809 port = lu.cfg.AllocatePort()
7810 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7811 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7812 logical_id=(vgnames[0], names[0]))
7813 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
7814 logical_id=(vgnames[1], names[1]))
7815 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7816 logical_id=(primary, secondary, port,
7817 p_minor, s_minor,
7818 shared_secret),
7819 children=[dev_data, dev_meta],
7820 iv_name=iv_name)
7821 return drbd_dev
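# Illustrative sketch of the resulting layout: the top-level disk carries the
# full DRBD addressing tuple and the children address the backing LVs:
#   drbd_dev.logical_id == (primary, secondary, port, p_minor, s_minor,
#                           shared_secret)
#   drbd_dev.children[0].logical_id == (vgnames[0], names[0])  # data LV
#   drbd_dev.children[1].logical_id == (vgnames[1], names[1])  # 128 MB meta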
7824 def _GenerateDiskTemplate(lu, template_name,
7825 instance_name, primary_node,
7826 secondary_nodes, disk_info,
7827 file_storage_dir, file_driver,
7828 base_index, feedback_fn):
7829 """Generate the entire disk layout for a given template type.
7832 #TODO: compute space requirements
7834 vgname = lu.cfg.GetVGName()
7835 disk_count = len(disk_info)
7836 disks = []
7837 if template_name == constants.DT_DISKLESS:
7838 pass
7839 elif template_name == constants.DT_PLAIN:
7840 if len(secondary_nodes) != 0:
7841 raise errors.ProgrammerError("Wrong template configuration")
7843 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7844 for i in range(disk_count)])
7845 for idx, disk in enumerate(disk_info):
7846 disk_index = idx + base_index
7847 vg = disk.get(constants.IDISK_VG, vgname)
7848 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7849 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7850 size=disk[constants.IDISK_SIZE],
7851 logical_id=(vg, names[idx]),
7852 iv_name="disk/%d" % disk_index,
7853 mode=disk[constants.IDISK_MODE])
7854 disks.append(disk_dev)
7855 elif template_name == constants.DT_DRBD8:
7856 if len(secondary_nodes) != 1:
7857 raise errors.ProgrammerError("Wrong template configuration")
7858 remote_node = secondary_nodes[0]
7859 minors = lu.cfg.AllocateDRBDMinor(
7860 [primary_node, remote_node] * len(disk_info), instance_name)
7862 names = []
7863 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7864 for i in range(disk_count)]):
7865 names.append(lv_prefix + "_data")
7866 names.append(lv_prefix + "_meta")
7867 for idx, disk in enumerate(disk_info):
7868 disk_index = idx + base_index
7869 data_vg = disk.get(constants.IDISK_VG, vgname)
7870 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7871 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7872 disk[constants.IDISK_SIZE],
7873 [data_vg, meta_vg],
7874 names[idx * 2:idx * 2 + 2],
7875 "disk/%d" % disk_index,
7876 minors[idx * 2], minors[idx * 2 + 1])
7877 disk_dev.mode = disk[constants.IDISK_MODE]
7878 disks.append(disk_dev)
7879 elif template_name == constants.DT_FILE:
7880 if len(secondary_nodes) != 0:
7881 raise errors.ProgrammerError("Wrong template configuration")
7883 opcodes.RequireFileStorage()
7885 for idx, disk in enumerate(disk_info):
7886 disk_index = idx + base_index
7887 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7888 size=disk[constants.IDISK_SIZE],
7889 iv_name="disk/%d" % disk_index,
7890 logical_id=(file_driver,
7891 "%s/disk%d" % (file_storage_dir,
7893 mode=disk[constants.IDISK_MODE])
7894 disks.append(disk_dev)
7895 elif template_name == constants.DT_SHARED_FILE:
7896 if len(secondary_nodes) != 0:
7897 raise errors.ProgrammerError("Wrong template configuration")
7899 opcodes.RequireSharedFileStorage()
7901 for idx, disk in enumerate(disk_info):
7902 disk_index = idx + base_index
7903 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7904 size=disk[constants.IDISK_SIZE],
7905 iv_name="disk/%d" % disk_index,
7906 logical_id=(file_driver,
7907 "%s/disk%d" % (file_storage_dir,
7909 mode=disk[constants.IDISK_MODE])
7910 disks.append(disk_dev)
7911 elif template_name == constants.DT_BLOCK:
7912 if len(secondary_nodes) != 0:
7913 raise errors.ProgrammerError("Wrong template configuration")
7915 for idx, disk in enumerate(disk_info):
7916 disk_index = idx + base_index
7917 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7918 size=disk[constants.IDISK_SIZE],
7919 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7920 disk[constants.IDISK_ADOPT]),
7921 iv_name="disk/%d" % disk_index,
7922 mode=disk[constants.IDISK_MODE])
7923 disks.append(disk_dev)
7925 else:
7926 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7928 return disks
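# Illustrative sketch (made-up instance name): a plain-LVM request such as
#   _GenerateDiskTemplate(lu, constants.DT_PLAIN, "inst1.example.com", pnode,
#                         [], [{constants.IDISK_SIZE: 1024,
#                               constants.IDISK_MODE: constants.DISK_RDWR}],
#                         None, None, 0, feedback_fn)
# returns a single LD_LV disk with iv_name "disk/0" and a (vg, lv_name)
# logical_id generated via _GenerateUniqueNames.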
7930 def _GetInstanceInfoText(instance):
7931 """Compute that text that should be added to the disk's metadata.
7934 return "originstname+%s" % instance.name
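# Illustrative sketch: an instance named "inst1.example.com" (made up) gets
# the metadata text "originstname+inst1.example.com", which ends up as an LVM
# tag on LV-backed devices via _CreateSingleBlockDev.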
7937 def _CalcEta(time_taken, written, total_size):
7938 """Calculates the ETA based on size written and total size.
7940 @param time_taken: The time taken so far
7941 @param written: amount written so far
7942 @param total_size: The total size of data to be written
7943 @return: The remaining time in seconds
7946 avg_time = time_taken / float(written)
7947 return (total_size - written) * avg_time
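# Illustrative example of the linear extrapolation above: after 100 seconds
# spent writing 20 of 100 units, the average is 5 s/unit, so the remaining
# 80 units give
#   _CalcEta(100.0, 20, 100) == (100 - 20) * (100.0 / 20) == 400.0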
7950 def _WipeDisks(lu, instance):
7951 """Wipes instance disks.
7953 @type lu: L{LogicalUnit}
7954 @param lu: the logical unit on whose behalf we execute
7955 @type instance: L{objects.Instance}
7956 @param instance: the instance whose disks we should create
7957 @return: the success of the wipe
7960 node = instance.primary_node
7962 for device in instance.disks:
7963 lu.cfg.SetDiskID(device, node)
7965 logging.info("Pause sync of instance %s disks", instance.name)
7966 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
7968 for idx, success in enumerate(result.payload):
7969 if not success:
7970 logging.warn("pause-sync of instance %s for disk %d failed",
7971 instance.name, idx)
7974 for idx, device in enumerate(instance.disks):
7975 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
7976 # MAX_WIPE_CHUNK at max
7977 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
7978 constants.MIN_WIPE_CHUNK_PERCENT)
7979 # we _must_ make this an int, otherwise rounding errors will
7980 # occur
7981 wipe_chunk_size = int(wipe_chunk_size)
7983 lu.LogInfo("* Wiping disk %d", idx)
7984 logging.info("Wiping disk %d for instance %s, node %s using"
7985 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
7990 start_time = time.time()
7992 while offset < size:
7993 wipe_size = min(wipe_chunk_size, size - offset)
7994 logging.debug("Wiping disk %d, offset %s, chunk %s",
7995 idx, offset, wipe_size)
7996 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
7997 result.Raise("Could not wipe disk %d at offset %d for size %d" %
7998 (idx, offset, wipe_size))
7999 now = time.time()
8000 offset += wipe_size
8001 if now - last_output >= 60:
8002 eta = _CalcEta(now - start_time, offset, size)
8003 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8004 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8005 last_output = now
8007 logging.info("Resume sync of instance %s disks", instance.name)
8009 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8011 for idx, success in enumerate(result.payload):
8012 if not success:
8013 lu.LogWarning("Resume sync of disk %d failed, please have a"
8014 " look at the status and troubleshoot the issue", idx)
8015 logging.warn("resume-sync of instance %s for disk %d failed",
8016 instance.name, idx)
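# Illustrative arithmetic for the chunk size (constants' values assumed here:
# MAX_WIPE_CHUNK = 1024 MiB, MIN_WIPE_CHUNK_PERCENT = 10): a 50 GiB disk is
# 51200 MiB, so
#   min(1024, 51200 / 100.0 * 10) == 1024
# and the disk is wiped in 1 GiB chunks, with an ETA logged at most once
# a minute via _CalcEta.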
8019 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8020 """Create all disks for an instance.
8022 This abstracts away some work from AddInstance.
8024 @type lu: L{LogicalUnit}
8025 @param lu: the logical unit on whose behalf we execute
8026 @type instance: L{objects.Instance}
8027 @param instance: the instance whose disks we should create
8029 @param to_skip: list of indices to skip
8030 @type target_node: string
8031 @param target_node: if passed, overrides the target node for creation
8033 @return: the success of the creation
8036 info = _GetInstanceInfoText(instance)
8037 if target_node is None:
8038 pnode = instance.primary_node
8039 all_nodes = instance.all_nodes
8040 else:
8041 pnode = target_node
8042 all_nodes = [pnode]
8044 if instance.disk_template in constants.DTS_FILEBASED:
8045 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8046 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8048 result.Raise("Failed to create directory '%s' on"
8049 " node %s" % (file_storage_dir, pnode))
8051 # Note: this needs to be kept in sync with adding of disks in
8052 # LUInstanceSetParams
8053 for idx, device in enumerate(instance.disks):
8054 if to_skip and idx in to_skip:
8055 continue
8056 logging.info("Creating volume %s for instance %s",
8057 device.iv_name, instance.name)
8059 for node in all_nodes:
8060 f_create = node == pnode
8061 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
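# Illustrative note: for a DRBD8 instance all_nodes is (primary, secondary)
# and f_create is True only on the primary, so only the primary opens the
# device; the secondary still gets its mirror half created because
# _CreateBlockDev honours CreateOnSecondary() for the LV children.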
8064 def _RemoveDisks(lu, instance, target_node=None):
8065 """Remove all disks for an instance.
8067 This abstracts away some work from `AddInstance()` and
8068 `RemoveInstance()`. Note that in case some of the devices couldn't
8069 be removed, the removal will continue with the other ones (compare
8070 with `_CreateDisks()`).
8072 @type lu: L{LogicalUnit}
8073 @param lu: the logical unit on whose behalf we execute
8074 @type instance: L{objects.Instance}
8075 @param instance: the instance whose disks we should remove
8076 @type target_node: string
8077 @param target_node: used to override the node on which to remove the disks
8079 @return: the success of the removal
8082 logging.info("Removing block devices for instance %s", instance.name)
8084 all_result = True
8085 for device in instance.disks:
8086 if target_node:
8087 edata = [(target_node, device)]
8088 else:
8089 edata = device.ComputeNodeTree(instance.primary_node)
8090 for node, disk in edata:
8091 lu.cfg.SetDiskID(disk, node)
8092 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8093 if msg:
8094 lu.LogWarning("Could not remove block device %s on node %s,"
8095 " continuing anyway: %s", device.iv_name, node, msg)
8096 all_result = False
8098 if instance.disk_template == constants.DT_FILE:
8099 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8100 if target_node:
8101 tgt = target_node
8102 else:
8103 tgt = instance.primary_node
8104 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8105 if result.fail_msg:
8106 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8107 file_storage_dir, instance.primary_node, result.fail_msg)
8108 all_result = False
8110 return all_result
8113 def _ComputeDiskSizePerVG(disk_template, disks):
8114 """Compute disk size requirements in the volume group
8117 def _compute(disks, payload):
8118 """Universal algorithm.
8120 """
8121 vgs = {}
8122 for disk in disks:
8123 vgs[disk[constants.IDISK_VG]] = \
8124 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8126 return vgs
8128 # Required free disk space as a function of disk and swap space
8129 req_size_dict = {
8130 constants.DT_DISKLESS: {},
8131 constants.DT_PLAIN: _compute(disks, 0),
8132 # 128 MB are added for drbd metadata for each disk
8133 constants.DT_DRBD8: _compute(disks, 128),
8134 constants.DT_FILE: {},
8135 constants.DT_SHARED_FILE: {},
8136 }
8138 if disk_template not in req_size_dict:
8139 raise errors.ProgrammerError("Disk template '%s' size requirement"
8140 " is unknown" % disk_template)
8142 return req_size_dict[disk_template]
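# Illustrative example (made-up VG name): two DRBD8 disks of 1024 and
# 2048 MiB in "xenvg" need their sizes plus 128 MiB of metadata each:
#   _ComputeDiskSizePerVG(constants.DT_DRBD8,
#                         [{constants.IDISK_VG: "xenvg",
#                           constants.IDISK_SIZE: 1024},
#                          {constants.IDISK_VG: "xenvg",
#                           constants.IDISK_SIZE: 2048}]) == {"xenvg": 3328}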
8145 def _ComputeDiskSize(disk_template, disks):
8146 """Compute disk size requirements in the volume group
8149 # Required free disk space as a function of disk and swap space
8150 req_size_dict = {
8151 constants.DT_DISKLESS: None,
8152 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8153 # 128 MB are added for drbd metadata for each disk
8154 constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks),
8155 constants.DT_FILE: None,
8156 constants.DT_SHARED_FILE: 0,
8157 constants.DT_BLOCK: 0,
8158 }
8160 if disk_template not in req_size_dict:
8161 raise errors.ProgrammerError("Disk template '%s' size requirement"
8162 " is unknown" % disk_template)
8164 return req_size_dict[disk_template]
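# Illustrative example: the flat variant returns a single total, e.g.
#   _ComputeDiskSize(constants.DT_DRBD8, [{constants.IDISK_SIZE: 1024},
#                                         {constants.IDISK_SIZE: 2048}])
#   == (1024 + 128) + (2048 + 128) == 3328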
8167 def _FilterVmNodes(lu, nodenames):
8168 """Filters out non-vm_capable nodes from a list.
8170 @type lu: L{LogicalUnit}
8171 @param lu: the logical unit for which we check
8172 @type nodenames: list
8173 @param nodenames: the list of nodes on which we should check
8175 @return: the list of vm-capable nodes
8178 non_vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8179 return [name for name in nodenames if name not in non_vm_nodes]
8182 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8183 """Hypervisor parameter validation.
8185 This function abstracts the hypervisor parameter validation to be
8186 used in both instance create and instance modify.
8188 @type lu: L{LogicalUnit}
8189 @param lu: the logical unit for which we check
8190 @type nodenames: list
8191 @param nodenames: the list of nodes on which we should check
8192 @type hvname: string
8193 @param hvname: the name of the hypervisor we should use
8194 @type hvparams: dict
8195 @param hvparams: the parameters which we need to check
8196 @raise errors.OpPrereqError: if the parameters are not valid
8199 nodenames = _FilterVmNodes(lu, nodenames)
8200 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
8201 hvname,
8202 hvparams)
8203 for node in nodenames:
8204 info = hvinfo[node]
8205 if info.offline:
8206 continue
8207 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8210 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8211 """OS parameters validation.
8213 @type lu: L{LogicalUnit}
8214 @param lu: the logical unit for which we check
8215 @type required: boolean
8216 @param required: whether the validation should fail if the OS is not
8217 found
8218 @type nodenames: list
8219 @param nodenames: the list of nodes on which we should check
8220 @type osname: string
8221 @param osname: the name of the OS we should use
8222 @type osparams: dict
8223 @param osparams: the parameters which we need to check
8224 @raise errors.OpPrereqError: if the parameters are not valid
8227 nodenames = _FilterVmNodes(lu, nodenames)
8228 result = lu.rpc.call_os_validate(required, nodenames, osname,
8229 [constants.OS_VALIDATE_PARAMETERS],
8230 osparams)
8231 for node, nres in result.items():
8231 for node, nres in result.items():
8232 # we don't check for offline cases since this should be run only
8233 # against the master node and/or an instance's nodes
8234 nres.Raise("OS Parameters validation failed on node %s" % node)
8235 if not nres.payload:
8236 lu.LogInfo("OS %s not found on node %s, validation skipped",
8237 osname, node)
8240 class LUInstanceCreate(LogicalUnit):
8241 """Create an instance.
8244 HPATH = "instance-add"
8245 HTYPE = constants.HTYPE_INSTANCE
8248 def CheckArguments(self):
8249 """Check arguments.
8251 """
8252 # do not require name_check to ease forward/backward compatibility
8254 if self.op.no_install and self.op.start:
8255 self.LogInfo("No-installation mode selected, disabling startup")
8256 self.op.start = False
8257 # validate/normalize the instance name
8258 self.op.instance_name = \
8259 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8261 if self.op.ip_check and not self.op.name_check:
8262 # TODO: make the ip check more flexible and not depend on the name check
8263 raise errors.OpPrereqError("Cannot do IP address check without a name"
8264 " check", errors.ECODE_INVAL)
8266 # check nics' parameter names
8267 for nic in self.op.nics:
8268 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8270 # check disks. parameter names and consistent adopt/no-adopt strategy
8271 has_adopt = has_no_adopt = False
8272 for disk in self.op.disks:
8273 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8274 if constants.IDISK_ADOPT in disk:
8275 has_adopt = True
8276 else:
8277 has_no_adopt = True
8278 if has_adopt and has_no_adopt:
8279 raise errors.OpPrereqError("Either all disks are adopted or none is",
8280 errors.ECODE_INVAL)
8281 if has_adopt:
8282 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8283 raise errors.OpPrereqError("Disk adoption is not supported for the"
8284 " '%s' disk template" %
8285 self.op.disk_template,
8286 errors.ECODE_INVAL)
8287 if self.op.iallocator is not None:
8288 raise errors.OpPrereqError("Disk adoption not allowed with an"
8289 " iallocator script", errors.ECODE_INVAL)
8290 if self.op.mode == constants.INSTANCE_IMPORT:
8291 raise errors.OpPrereqError("Disk adoption not allowed for"
8292 " instance import", errors.ECODE_INVAL)
8294 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8295 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8296 " but no 'adopt' parameter given" %
8297 self.op.disk_template,
8298 errors.ECODE_INVAL)
8300 self.adopt_disks = has_adopt
8302 # instance name verification
8303 if self.op.name_check:
8304 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8305 self.op.instance_name = self.hostname1.name
8306 # used in CheckPrereq for ip ping check
8307 self.check_ip = self.hostname1.ip
8308 else:
8309 self.check_ip = None
8311 # file storage checks
8312 if (self.op.file_driver and
8313 self.op.file_driver not in constants.FILE_DRIVER):
8314 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8315 self.op.file_driver, errors.ECODE_INVAL)
8317 if self.op.disk_template == constants.DT_FILE:
8318 opcodes.RequireFileStorage()
8319 elif self.op.disk_template == constants.DT_SHARED_FILE:
8320 opcodes.RequireSharedFileStorage()
8322 ### Node/iallocator related checks
8323 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8325 if self.op.pnode is not None:
8326 if self.op.disk_template in constants.DTS_INT_MIRROR:
8327 if self.op.snode is None:
8328 raise errors.OpPrereqError("The networked disk templates need"
8329 " a mirror node", errors.ECODE_INVAL)
8330 elif self.op.snode:
8331 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8332 " template")
8333 self.op.snode = None
8335 self._cds = _GetClusterDomainSecret()
8337 if self.op.mode == constants.INSTANCE_IMPORT:
8338 # On import force_variant must be True, because if we forced it at
8339 # initial install, our only chance when importing it back is that it
8340 # works again!
8341 self.op.force_variant = True
8343 if self.op.no_install:
8344 self.LogInfo("No-installation mode has no effect during import")
8346 elif self.op.mode == constants.INSTANCE_CREATE:
8347 if self.op.os_type is None:
8348 raise errors.OpPrereqError("No guest OS specified",
8349 errors.ECODE_INVAL)
8350 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8351 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8352 " installation" % self.op.os_type,
8353 errors.ECODE_INVAL)
8354 if self.op.disk_template is None:
8355 raise errors.OpPrereqError("No disk template specified",
8356 errors.ECODE_INVAL)
8358 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8359 # Check handshake to ensure both clusters have the same domain secret
8360 src_handshake = self.op.source_handshake
8361 if not src_handshake:
8362 raise errors.OpPrereqError("Missing source handshake",
8363 errors.ECODE_INVAL)
8365 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8366 src_handshake)
8367 if errmsg:
8368 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8369 errors.ECODE_INVAL)
8371 # Load and check source CA
8372 self.source_x509_ca_pem = self.op.source_x509_ca
8373 if not self.source_x509_ca_pem:
8374 raise errors.OpPrereqError("Missing source X509 CA",
8375 errors.ECODE_INVAL)
8377 try:
8378 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8379 self._cds)
8380 except OpenSSL.crypto.Error, err:
8381 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8382 (err, ), errors.ECODE_INVAL)
8384 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8385 if errcode is not None:
8386 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8389 self.source_x509_ca = cert
8391 src_instance_name = self.op.source_instance_name
8392 if not src_instance_name:
8393 raise errors.OpPrereqError("Missing source instance name",
8394 errors.ECODE_INVAL)
8396 self.source_instance_name = \
8397 netutils.GetHostname(name=src_instance_name).name
8400 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8401 self.op.mode, errors.ECODE_INVAL)
8403 def ExpandNames(self):
8404 """ExpandNames for CreateInstance.
8406 Figure out the right locks for instance creation.
8409 self.needed_locks = {}
8411 instance_name = self.op.instance_name
8412 # this is just a preventive check, but someone might still add this
8413 # instance in the meantime, and creation will fail at lock-add time
8414 if instance_name in self.cfg.GetInstanceList():
8415 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8416 instance_name, errors.ECODE_EXISTS)
8418 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8420 if self.op.iallocator:
8421 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8422 else:
8423 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8424 nodelist = [self.op.pnode]
8425 if self.op.snode is not None:
8426 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8427 nodelist.append(self.op.snode)
8428 self.needed_locks[locking.LEVEL_NODE] = nodelist
8430 # in case of import lock the source node too
8431 if self.op.mode == constants.INSTANCE_IMPORT:
8432 src_node = self.op.src_node
8433 src_path = self.op.src_path
8435 if src_path is None:
8436 self.op.src_path = src_path = self.op.instance_name
8438 if src_node is None:
8439 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8440 self.op.src_node = None
8441 if os.path.isabs(src_path):
8442 raise errors.OpPrereqError("Importing an instance from a path"
8443 " requires a source node option",
8446 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8447 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8448 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8449 if not os.path.isabs(src_path):
8450 self.op.src_path = src_path = \
8451 utils.PathJoin(constants.EXPORT_DIR, src_path)
8453 def _RunAllocator(self):
8454 """Run the allocator based on input opcode.
8457 nics = [n.ToDict() for n in self.nics]
8458 ial = IAllocator(self.cfg, self.rpc,
8459 mode=constants.IALLOCATOR_MODE_ALLOC,
8460 name=self.op.instance_name,
8461 disk_template=self.op.disk_template,
8462 tags=self.op.tags,
8463 os=self.op.os_type,
8464 vcpus=self.be_full[constants.BE_VCPUS],
8465 memory=self.be_full[constants.BE_MEMORY],
8466 disks=self.disks,
8467 nics=nics,
8468 hypervisor=self.op.hypervisor,
8469 )
8471 ial.Run(self.op.iallocator)
8473 if not ial.success:
8474 raise errors.OpPrereqError("Can't compute nodes using"
8475 " iallocator '%s': %s" %
8476 (self.op.iallocator, ial.info),
8477 errors.ECODE_NORES)
8478 if len(ial.result) != ial.required_nodes:
8479 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8480 " of nodes (%s), required %s" %
8481 (self.op.iallocator, len(ial.result),
8482 ial.required_nodes), errors.ECODE_FAULT)
8483 self.op.pnode = ial.result[0]
8484 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8485 self.op.instance_name, self.op.iallocator,
8486 utils.CommaJoin(ial.result))
8487 if ial.required_nodes == 2:
8488 self.op.snode = ial.result[1]
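# Illustrative sketch (made-up node names): for a DRBD8 template
# ial.required_nodes is 2, so a successful run resembles
#   ial.result == ["node1.example.com", "node2.example.com"]
# with the first entry becoming self.op.pnode and the second self.op.snode.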
8490 def BuildHooksEnv(self):
8491 """Build hooks env.
8493 This runs on master, primary and secondary nodes of the instance.
8495 """
8496 env = {
8497 "ADD_MODE": self.op.mode,
8498 }
8499 if self.op.mode == constants.INSTANCE_IMPORT:
8500 env["SRC_NODE"] = self.op.src_node
8501 env["SRC_PATH"] = self.op.src_path
8502 env["SRC_IMAGES"] = self.src_images
8504 env.update(_BuildInstanceHookEnv(
8505 name=self.op.instance_name,
8506 primary_node=self.op.pnode,
8507 secondary_nodes=self.secondaries,
8508 status=self.op.start,
8509 os_type=self.op.os_type,
8510 memory=self.be_full[constants.BE_MEMORY],
8511 vcpus=self.be_full[constants.BE_VCPUS],
8512 nics=_NICListToTuple(self, self.nics),
8513 disk_template=self.op.disk_template,
8514 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8515 for d in self.disks],
8516 bep=self.be_full,
8517 hvp=self.hv_full,
8518 hypervisor_name=self.op.hypervisor,
8519 ))
8521 return env
8524 def BuildHooksNodes(self):
8525 """Build hooks nodes.
8528 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8529 return nl, nl
8531 def _ReadExportInfo(self):
8532 """Reads the export information from disk.
8534 It will override the opcode source node and path with the actual
8535 information, if these two were not specified before.
8537 @return: the export information
8540 assert self.op.mode == constants.INSTANCE_IMPORT
8542 src_node = self.op.src_node
8543 src_path = self.op.src_path
8545 if src_node is None:
8546 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8547 exp_list = self.rpc.call_export_list(locked_nodes)
8548 found = False
8549 for node in exp_list:
8550 if exp_list[node].fail_msg:
8551 continue
8552 if src_path in exp_list[node].payload:
8553 found = True
8554 self.op.src_node = src_node = node
8555 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8556 src_path)
8557 break
8558 if not found:
8559 raise errors.OpPrereqError("No export found for relative path %s" %
8560 src_path, errors.ECODE_INVAL)
8562 _CheckNodeOnline(self, src_node)
8563 result = self.rpc.call_export_info(src_node, src_path)
8564 result.Raise("No export or invalid export found in dir %s" % src_path)
8566 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8567 if not export_info.has_section(constants.INISECT_EXP):
8568 raise errors.ProgrammerError("Corrupted export config",
8569 errors.ECODE_ENVIRON)
8571 ei_version = export_info.get(constants.INISECT_EXP, "version")
8572 if (int(ei_version) != constants.EXPORT_VERSION):
8573 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8574 (ei_version, constants.EXPORT_VERSION),
8575 errors.ECODE_ENVIRON)
8577 return export_info
8578 def _ReadExportParams(self, einfo):
8579 """Use export parameters as defaults.
8581 In case the opcode doesn't specify (as in override) some instance
8582 parameters, then try to use them from the export information, if
8583 that declares them.
8585 """
8586 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8588 if self.op.disk_template is None:
8589 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8590 self.op.disk_template = einfo.get(constants.INISECT_INS,
8591 "disk_template")
8592 if self.op.disk_template not in constants.DISK_TEMPLATES:
8593 raise errors.OpPrereqError("Disk template specified in configuration"
8594 " file is not one of the allowed values:"
8595 " %s" % " ".join(constants.DISK_TEMPLATES))
8596 else:
8597 raise errors.OpPrereqError("No disk template specified and the export"
8598 " is missing the disk_template information",
8599 errors.ECODE_INVAL)
8601 if not self.op.disks:
8602 disks = []
8603 # TODO: import the disk iv_name too
8604 for idx in range(constants.MAX_DISKS):
8605 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
8606 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8607 disks.append({constants.IDISK_SIZE: disk_sz})
8608 self.op.disks = disks
8609 if not disks and self.op.disk_template != constants.DT_DISKLESS:
8610 raise errors.OpPrereqError("No disk info specified and the export"
8611 " is missing the disk information",
8614 if not self.op.nics:
8615 nics = []
8616 for idx in range(constants.MAX_NICS):
8617 if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
8618 ndict = {}
8619 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8620 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8621 ndict[name] = v
8622 nics.append(ndict)
8623 else:
8624 break
8625 self.op.nics = nics
8627 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8628 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8630 if (self.op.hypervisor is None and
8631 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8632 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8634 if einfo.has_section(constants.INISECT_HYP):
8635 # use the export parameters but do not override the ones
8636 # specified by the user
8637 for name, value in einfo.items(constants.INISECT_HYP):
8638 if name not in self.op.hvparams:
8639 self.op.hvparams[name] = value
8641 if einfo.has_section(constants.INISECT_BEP):
8642 # use the parameters, without overriding
8643 for name, value in einfo.items(constants.INISECT_BEP):
8644 if name not in self.op.beparams:
8645 self.op.beparams[name] = value
8647 # try to read the parameters old style, from the main section
8648 for name in constants.BES_PARAMETERS:
8649 if (name not in self.op.beparams and
8650 einfo.has_option(constants.INISECT_INS, name)):
8651 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8653 if einfo.has_section(constants.INISECT_OSP):
8654 # use the parameters, without overriding
8655 for name, value in einfo.items(constants.INISECT_OSP):
8656 if name not in self.op.osparams:
8657 self.op.osparams[name] = value
8659 def _RevertToDefaults(self, cluster):
8660 """Revert the instance parameters to the default values.
8664 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8665 for name in self.op.hvparams.keys():
8666 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8667 del self.op.hvparams[name]
8669 be_defs = cluster.SimpleFillBE({})
8670 for name in self.op.beparams.keys():
8671 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8672 del self.op.beparams[name]
8674 nic_defs = cluster.SimpleFillNIC({})
8675 for nic in self.op.nics:
8676 for name in constants.NICS_PARAMETERS:
8677 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8678 del nic[name]
8680 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8681 for name in self.op.osparams.keys():
8682 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8683 del self.op.osparams[name]
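# Illustrative example: if the cluster default for constants.BE_MEMORY is 128
# and the opcode also specifies beparams[constants.BE_MEMORY] == 128, the
# entry is dropped above so the instance keeps following the cluster default
# instead of pinning the current value.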
8685 def _CalculateFileStorageDir(self):
8686 """Calculate final instance file storage dir.
8689 # file storage dir calculation/check
8690 self.instance_file_storage_dir = None
8691 if self.op.disk_template in constants.DTS_FILEBASED:
8692 # build the full file storage dir path
8693 joinargs = []
8695 if self.op.disk_template == constants.DT_SHARED_FILE:
8696 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8697 else:
8698 get_fsd_fn = self.cfg.GetFileStorageDir
8700 cfg_storagedir = get_fsd_fn()
8701 if not cfg_storagedir:
8702 raise errors.OpPrereqError("Cluster file storage dir not defined")
8703 joinargs.append(cfg_storagedir)
8705 if self.op.file_storage_dir is not None:
8706 joinargs.append(self.op.file_storage_dir)
8708 joinargs.append(self.op.instance_name)
8710 # pylint: disable=W0142
8711 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
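# Illustrative example (made-up paths): with a cluster file storage dir of
# "/srv/ganeti/file-storage", an opcode file_storage_dir of "web" and an
# instance named "inst1.example.com", the joined result is
#   "/srv/ganeti/file-storage/web/inst1.example.com"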
8713 def CheckPrereq(self):
8714 """Check prerequisites.
8717 self._CalculateFileStorageDir()
8719 if self.op.mode == constants.INSTANCE_IMPORT:
8720 export_info = self._ReadExportInfo()
8721 self._ReadExportParams(export_info)
8723 if (not self.cfg.GetVGName() and
8724 self.op.disk_template not in constants.DTS_NOT_LVM):
8725 raise errors.OpPrereqError("Cluster does not support lvm-based"
8726 " instances", errors.ECODE_STATE)
8728 if (self.op.hypervisor is None or
8729 self.op.hypervisor == constants.VALUE_AUTO):
8730 self.op.hypervisor = self.cfg.GetHypervisorType()
8732 cluster = self.cfg.GetClusterInfo()
8733 enabled_hvs = cluster.enabled_hypervisors
8734 if self.op.hypervisor not in enabled_hvs:
8735 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8736 " cluster (%s)" % (self.op.hypervisor,
8737 ",".join(enabled_hvs)),
8740 # Check tag validity
8741 for tag in self.op.tags:
8742 objects.TaggableObject.ValidateTag(tag)
8744 # check hypervisor parameter syntax (locally)
8745 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8746 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8747 self.op.hvparams)
8748 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8749 hv_type.CheckParameterSyntax(filled_hvp)
8750 self.hv_full = filled_hvp
8751 # check that we don't specify global parameters on an instance
8752 _CheckGlobalHvParams(self.op.hvparams)
8754 # fill and remember the beparams dict
8755 default_beparams = cluster.beparams[constants.PP_DEFAULT]
8756 for param, value in self.op.beparams.iteritems():
8757 if value == constants.VALUE_AUTO:
8758 self.op.beparams[param] = default_beparams[param]
8759 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8760 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8762 # build os parameters
8763 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8765 # now that hvp/bep are in final format, let's reset to defaults,
8766 # if told to do so
8767 if self.op.identify_defaults:
8768 self._RevertToDefaults(cluster)
8770 # NIC buildup
8771 self.nics = []
8772 for idx, nic in enumerate(self.op.nics):
8773 nic_mode_req = nic.get(constants.INIC_MODE, None)
8774 nic_mode = nic_mode_req
8775 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
8776 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8778 # in routed mode, for the first nic, the default ip is 'auto'
8779 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8780 default_ip_mode = constants.VALUE_AUTO
8782 default_ip_mode = constants.VALUE_NONE
8784 # ip validity checks
8785 ip = nic.get(constants.INIC_IP, default_ip_mode)
8786 if ip is None or ip.lower() == constants.VALUE_NONE:
8787 nic_ip = None
8788 elif ip.lower() == constants.VALUE_AUTO:
8789 if not self.op.name_check:
8790 raise errors.OpPrereqError("IP address set to auto but name checks"
8791 " have been skipped",
8793 nic_ip = self.hostname1.ip
8795 if not netutils.IPAddress.IsValid(ip):
8796 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8800 # TODO: check the ip address for uniqueness
8801 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8802 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8805 # MAC address verification
8806 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8807 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8808 mac = utils.NormalizeAndValidateMac(mac)
8810 try:
8811 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8812 except errors.ReservationError:
8813 raise errors.OpPrereqError("MAC address %s already in use"
8814 " in cluster" % mac,
8815 errors.ECODE_NOTUNIQUE)
8817 # Build nic parameters
8818 link = nic.get(constants.INIC_LINK, None)
8819 if link == constants.VALUE_AUTO:
8820 link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
8821 nicparams = {}
8822 if nic_mode_req:
8823 nicparams[constants.NIC_MODE] = nic_mode
8824 if link:
8825 nicparams[constants.NIC_LINK] = link
8827 check_params = cluster.SimpleFillNIC(nicparams)
8828 objects.NIC.CheckParameterSyntax(check_params)
8829 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8831 # disk checks/pre-build
8832 default_vg = self.cfg.GetVGName()
8833 self.disks = []
8834 for disk in self.op.disks:
8835 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8836 if mode not in constants.DISK_ACCESS_SET:
8837 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8838 mode, errors.ECODE_INVAL)
8839 size = disk.get(constants.IDISK_SIZE, None)
8840 if size is None:
8841 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8842 try:
8843 size = int(size)
8844 except (TypeError, ValueError):
8845 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8846 errors.ECODE_INVAL)
8848 data_vg = disk.get(constants.IDISK_VG, default_vg)
8849 new_disk = {
8850 constants.IDISK_SIZE: size,
8851 constants.IDISK_MODE: mode,
8852 constants.IDISK_VG: data_vg,
8853 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8854 }
8855 if constants.IDISK_ADOPT in disk:
8856 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8857 self.disks.append(new_disk)
8859 if self.op.mode == constants.INSTANCE_IMPORT:
8860 disk_images = []
8861 for idx in range(len(self.disks)):
8862 option = "disk%d_dump" % idx
8863 if export_info.has_option(constants.INISECT_INS, option):
8864 # FIXME: are the old os-es, disk sizes, etc. useful?
8865 export_name = export_info.get(constants.INISECT_INS, option)
8866 image = utils.PathJoin(self.op.src_path, export_name)
8867 disk_images.append(image)
8868 else:
8869 disk_images.append(False)
8871 self.src_images = disk_images
8873 old_name = export_info.get(constants.INISECT_INS, "name")
8874 if self.op.instance_name == old_name:
8875 for idx, nic in enumerate(self.nics):
8876 if nic.mac == constants.VALUE_AUTO:
8877 nic_mac_ini = "nic%d_mac" % idx
8878 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8880 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8882 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8883 if self.op.ip_check:
8884 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8885 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8886 (self.check_ip, self.op.instance_name),
8887 errors.ECODE_NOTUNIQUE)
8889 #### mac address generation
8890 # By generating here the mac address both the allocator and the hooks get
8891 # the real final mac address rather than the 'auto' or 'generate' value.
8892 # There is a race condition between the generation and the instance object
8893 # creation, which means that we know the mac is valid now, but we're not
8894 # sure it will be when we actually add the instance. If things go bad
8895 # adding the instance will abort because of a duplicate mac, and the
8896 # creation job will fail.
8897 for nic in self.nics:
8898 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8899 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8903 if self.op.iallocator is not None:
8904 self._RunAllocator()
8906 #### node related checks
8908 # check primary node
8909 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8910 assert self.pnode is not None, \
8911 "Cannot retrieve locked node %s" % self.op.pnode
8912 if pnode.offline:
8913 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8914 pnode.name, errors.ECODE_STATE)
8915 if pnode.drained:
8916 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8917 pnode.name, errors.ECODE_STATE)
8918 if not pnode.vm_capable:
8919 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8920 " '%s'" % pnode.name, errors.ECODE_STATE)
8922 self.secondaries = []
8924 # mirror node verification
8925 if self.op.disk_template in constants.DTS_INT_MIRROR:
8926 if self.op.snode == pnode.name:
8927 raise errors.OpPrereqError("The secondary node cannot be the"
8928 " primary node", errors.ECODE_INVAL)
8929 _CheckNodeOnline(self, self.op.snode)
8930 _CheckNodeNotDrained(self, self.op.snode)
8931 _CheckNodeVmCapable(self, self.op.snode)
8932 self.secondaries.append(self.op.snode)
8934 nodenames = [pnode.name] + self.secondaries
8936 if not self.adopt_disks:
8937 # Check lv size requirements, if not adopting
8938 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8939 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8941 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8942 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8943 disk[constants.IDISK_ADOPT])
8944 for disk in self.disks])
8945 if len(all_lvs) != len(self.disks):
8946 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8947 errors.ECODE_INVAL)
8948 for lv_name in all_lvs:
8949 try:
8950 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
8951 # to ReserveLV uses the same syntax
8952 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8953 except errors.ReservationError:
8954 raise errors.OpPrereqError("LV named %s used by another instance" %
8955 lv_name, errors.ECODE_NOTUNIQUE)
8957 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8958 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8960 node_lvs = self.rpc.call_lv_list([pnode.name],
8961 vg_names.payload.keys())[pnode.name]
8962 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
8963 node_lvs = node_lvs.payload
8965 delta = all_lvs.difference(node_lvs.keys())
8966 if delta:
8967 raise errors.OpPrereqError("Missing logical volume(s): %s" %
8968 utils.CommaJoin(delta),
8969 errors.ECODE_INVAL)
8970 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
8971 if online_lvs:
8972 raise errors.OpPrereqError("Online logical volumes found, cannot"
8973 " adopt: %s" % utils.CommaJoin(online_lvs),
8974 errors.ECODE_STATE)
8975 # update the size of disk based on what is found
8976 for dsk in self.disks:
8977 dsk[constants.IDISK_SIZE] = \
8978 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
8979 dsk[constants.IDISK_ADOPT])][0]))
8981 elif self.op.disk_template == constants.DT_BLOCK:
8982 # Normalize and de-duplicate device paths
8983 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
8984 for disk in self.disks])
8985 if len(all_disks) != len(self.disks):
8986 raise errors.OpPrereqError("Duplicate disk names given for adoption",
8987 errors.ECODE_INVAL)
8988 baddisks = [d for d in all_disks
8989 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
8990 if baddisks:
8991 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
8992 " cannot be adopted" %
8993 (", ".join(baddisks),
8994 constants.ADOPTABLE_BLOCKDEV_ROOT),
8995 errors.ECODE_INVAL)
8997 node_disks = self.rpc.call_bdev_sizes([pnode.name],
8998 list(all_disks))[pnode.name]
8999 node_disks.Raise("Cannot get block device information from node %s" %
9000 pnode.name)
9001 node_disks = node_disks.payload
9002 delta = all_disks.difference(node_disks.keys())
9003 if delta:
9004 raise errors.OpPrereqError("Missing block device(s): %s" %
9005 utils.CommaJoin(delta),
9006 errors.ECODE_INVAL)
9007 for dsk in self.disks:
9008 dsk[constants.IDISK_SIZE] = \
9009 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9011 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9013 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9014 # check OS parameters (remotely)
9015 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9017 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9019 # memory check on primary node
9020 if self.op.start:
9021 _CheckNodeFreeMemory(self, self.pnode.name,
9022 "creating instance %s" % self.op.instance_name,
9023 self.be_full[constants.BE_MEMORY],
9024 self.op.hypervisor)
9026 self.dry_run_result = list(nodenames)
9028 def Exec(self, feedback_fn):
9029 """Create and add the instance to the cluster.
9032 instance = self.op.instance_name
9033 pnode_name = self.pnode.name
9035 ht_kind = self.op.hypervisor
9036 if ht_kind in constants.HTS_REQ_PORT:
9037 network_port = self.cfg.AllocatePort()
9038 else:
9039 network_port = None
9041 disks = _GenerateDiskTemplate(self,
9042 self.op.disk_template,
9043 instance, pnode_name,
9044 self.secondaries,
9045 self.disks,
9046 self.instance_file_storage_dir,
9047 self.op.file_driver,
9048 0,
9049 feedback_fn)
9051 iobj = objects.Instance(name=instance, os=self.op.os_type,
9052 primary_node=pnode_name,
9053 nics=self.nics, disks=disks,
9054 disk_template=self.op.disk_template,
9055 admin_up=False,
9056 network_port=network_port,
9057 beparams=self.op.beparams,
9058 hvparams=self.op.hvparams,
9059 hypervisor=self.op.hypervisor,
9060 osparams=self.op.osparams,
9061 )
9063 if self.op.tags:
9064 for tag in self.op.tags:
9065 iobj.AddTag(tag)
9067 if self.adopt_disks:
9068 if self.op.disk_template == constants.DT_PLAIN:
9069 # rename LVs to the newly-generated names; we need to construct
9070 # 'fake' LV disks with the old data, plus the new unique_id
9071 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9072 rename_to = []
9073 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9074 rename_to.append(t_dsk.logical_id)
9075 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9076 self.cfg.SetDiskID(t_dsk, pnode_name)
9077 result = self.rpc.call_blockdev_rename(pnode_name,
9078 zip(tmp_disks, rename_to))
9079 result.Raise("Failed to rename adopted LVs")
9080 else:
9081 feedback_fn("* creating instance disks...")
9082 try:
9083 _CreateDisks(self, iobj)
9084 except errors.OpExecError:
9085 self.LogWarning("Device creation failed, reverting...")
9086 try:
9087 _RemoveDisks(self, iobj)
9088 finally:
9089 self.cfg.ReleaseDRBDMinors(instance)
9090 raise
9092 feedback_fn("adding instance %s to cluster config" % instance)
9094 self.cfg.AddInstance(iobj, self.proc.GetECId())
9096 # Declare that we don't want to remove the instance lock anymore, as we've
9097 # added the instance to the config
9098 del self.remove_locks[locking.LEVEL_INSTANCE]
9100 if self.op.mode == constants.INSTANCE_IMPORT:
9101 # Release unused nodes
9102 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9105 _ReleaseLocks(self, locking.LEVEL_NODE)
9107 disk_abort = False
9108 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9109 feedback_fn("* wiping instance disks...")
9110 try:
9111 _WipeDisks(self, iobj)
9112 except errors.OpExecError, err:
9113 logging.exception("Wiping disks failed")
9114 self.LogWarning("Wiping instance disks failed (%s)", err)
9115 disk_abort = True
9117 if disk_abort:
9118 # Something is already wrong with the disks, don't do anything else
9119 pass
9120 elif self.op.wait_for_sync:
9121 disk_abort = not _WaitForSync(self, iobj)
9122 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9123 # make sure the disks are not degraded (still sync-ing is ok)
9124 feedback_fn("* checking mirrors status")
9125 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9126 else:
9127 disk_abort = False
9129 if disk_abort:
9130 _RemoveDisks(self, iobj)
9131 self.cfg.RemoveInstance(iobj.name)
9132 # Make sure the instance lock gets removed
9133 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9134 raise errors.OpExecError("There are some degraded disks for"
9135 " this instance")
9137 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9138 if self.op.mode == constants.INSTANCE_CREATE:
9139 if not self.op.no_install:
9140 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9141 not self.op.wait_for_sync)
9142 if pause_sync:
9143 feedback_fn("* pausing disk sync to install instance OS")
9144 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9145 iobj.disks, True)
9146 for idx, success in enumerate(result.payload):
9147 if not success:
9148 logging.warn("pause-sync of instance %s for disk %d failed",
9149 instance, idx)
9151 feedback_fn("* running the instance OS create scripts...")
9152 # FIXME: pass debug option from opcode to backend
9153 os_add_result = \
9154 self.rpc.call_instance_os_add(pnode_name, iobj, False,
9155 self.op.debug_level)
9156 if pause_sync:
9157 feedback_fn("* resuming disk sync")
9158 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9159 iobj.disks, False)
9160 for idx, success in enumerate(result.payload):
9161 if not success:
9162 logging.warn("resume-sync of instance %s for disk %d failed",
9163 instance, idx)
9165 os_add_result.Raise("Could not add os for instance %s"
9166 " on node %s" % (instance, pnode_name))
9168 elif self.op.mode == constants.INSTANCE_IMPORT:
9169 feedback_fn("* running the instance OS import scripts...")
9171 transfers = []
9173 for idx, image in enumerate(self.src_images):
9174 if not image:
9175 continue
9177 # FIXME: pass debug option from opcode to backend
9178 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9179 constants.IEIO_FILE, (image, ),
9180 constants.IEIO_SCRIPT,
9181 (iobj.disks[idx], idx),
9182 None)
9183 transfers.append(dt)
9185 import_result = \
9186 masterd.instance.TransferInstanceData(self, feedback_fn,
9187 self.op.src_node, pnode_name,
9188 self.pnode.secondary_ip,
9189 iobj, transfers)
9190 if not compat.all(import_result):
9191 self.LogWarning("Some disks for instance %s on node %s were not"
9192 " imported successfully" % (instance, pnode_name))
9194 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9195 feedback_fn("* preparing remote import...")
9196 # The source cluster will stop the instance before attempting to make a
9197 # connection. In some cases stopping an instance can take a long time,
9198 # hence the shutdown timeout is added to the connection timeout.
9199 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9200 self.op.source_shutdown_timeout)
9201 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9203 assert iobj.primary_node == self.pnode.name
9204 disk_results = \
9205 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9206 self.source_x509_ca,
9207 self._cds, timeouts)
9208 if not compat.all(disk_results):
9209 # TODO: Should the instance still be started, even if some disks
9210 # failed to import (valid for local imports, too)?
9211 self.LogWarning("Some disks for instance %s on node %s were not"
9212 " imported successfully" % (instance, pnode_name))
9214 # Run rename script on newly imported instance
9215 assert iobj.name == instance
9216 feedback_fn("Running rename script for %s" % instance)
9217 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9218 self.source_instance_name,
9219 self.op.debug_level)
9220 if result.fail_msg:
9221 self.LogWarning("Failed to run rename script for %s on node"
9222 " %s: %s" % (instance, pnode_name, result.fail_msg))
9224 else:
9225 # also checked in the prereq part
9226 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9227 % self.op.mode)
9229 if self.op.start:
9230 iobj.admin_up = True
9231 self.cfg.Update(iobj, feedback_fn)
9232 logging.info("Starting instance %s on node %s", instance, pnode_name)
9233 feedback_fn("* starting instance...")
9234 result = self.rpc.call_instance_start(pnode_name, iobj,
9235 None, None, False)
9236 result.Raise("Could not start instance")
9238 return list(iobj.all_nodes)
9241 class LUInstanceConsole(NoHooksLU):
9242 """Connect to an instance's console.
9244 This is somewhat special in that it returns the command line that
9245 you need to run on the master node in order to connect to the
9246 console.
9248 """
9249 REQ_BGL = False
9251 def ExpandNames(self):
9252 self._ExpandAndLockInstance()
9254 def CheckPrereq(self):
9255 """Check prerequisites.
9257 This checks that the instance is in the cluster.
9260 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9261 assert self.instance is not None, \
9262 "Cannot retrieve locked instance %s" % self.op.instance_name
9263 _CheckNodeOnline(self, self.instance.primary_node)
9265 def Exec(self, feedback_fn):
9266 """Connect to the console of an instance
9269 instance = self.instance
9270 node = instance.primary_node
9272 node_insts = self.rpc.call_instance_list([node],
9273 [instance.hypervisor])[node]
9274 node_insts.Raise("Can't get node information from %s" % node)
9276 if instance.name not in node_insts.payload:
9277 if instance.admin_up:
9278 state = constants.INSTST_ERRORDOWN
9280 state = constants.INSTST_ADMINDOWN
9281 raise errors.OpExecError("Instance %s is not running (state %s)" %
9282 (instance.name, state))
9284 logging.debug("Connecting to console of %s on %s", instance.name, node)
9286 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9289 def _GetInstanceConsole(cluster, instance):
9290 """Returns console information for an instance.
9292 @type cluster: L{objects.Cluster}
9293 @type instance: L{objects.Instance}
9294 @rtype: dict
9296 """
9297 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9298 # beparams and hvparams are passed separately, to avoid editing the
9299 # instance and then saving the defaults in the instance itself.
9300 hvparams = cluster.FillHV(instance)
9301 beparams = cluster.FillBE(instance)
9302 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9304 assert console.instance == instance.name
9305 assert console.Validate()
9307 return console.ToDict()
9310 class LUInstanceReplaceDisks(LogicalUnit):
9311 """Replace the disks of an instance.
9314 HPATH = "mirrors-replace"
9315 HTYPE = constants.HTYPE_INSTANCE
9318 def CheckArguments(self):
9319 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9320 self.op.iallocator)
9322 def ExpandNames(self):
9323 self._ExpandAndLockInstance()
9325 assert locking.LEVEL_NODE not in self.needed_locks
9326 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9328 assert self.op.iallocator is None or self.op.remote_node is None, \
9329 "Conflicting options"
9331 if self.op.remote_node is not None:
9332 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9334 # Warning: do not remove the locking of the new secondary here
9335 # unless DRBD8.AddChildren is changed to work in parallel;
9336 # currently it doesn't since parallel invocations of
9337 # FindUnusedMinor will conflict
9338 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9339 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9340 else:
9341 self.needed_locks[locking.LEVEL_NODE] = []
9342 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9344 if self.op.iallocator is not None:
9345 # iallocator will select a new node in the same group
9346 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9348 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9349 self.op.iallocator, self.op.remote_node,
9350 self.op.disks, False, self.op.early_release)
9352 self.tasklets = [self.replacer]
9354 def DeclareLocks(self, level):
9355 if level == locking.LEVEL_NODEGROUP:
9356 assert self.op.remote_node is None
9357 assert self.op.iallocator is not None
9358 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9360 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9361 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9362 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9364 elif level == locking.LEVEL_NODE:
9365 if self.op.iallocator is not None:
9366 assert self.op.remote_node is None
9367 assert not self.needed_locks[locking.LEVEL_NODE]
9369 # Lock member nodes of all locked groups
9370 self.needed_locks[locking.LEVEL_NODE] = [node_name
9371 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9372 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9373 else:
9374 self._LockInstancesNodes()
9376 def BuildHooksEnv(self):
9377 """Build hooks env.
9379 This runs on the master, the primary and all the secondaries.
9381 """
9382 instance = self.replacer.instance
9383 env = {
9384 "MODE": self.op.mode,
9385 "NEW_SECONDARY": self.op.remote_node,
9386 "OLD_SECONDARY": instance.secondary_nodes[0],
9387 }
9388 env.update(_BuildInstanceHookEnvByObject(self, instance))
9390 return env
9391 def BuildHooksNodes(self):
9392 """Build hooks nodes.
9395 instance = self.replacer.instance
9397 self.cfg.GetMasterNode(),
9398 instance.primary_node,
9400 if self.op.remote_node is not None:
9401 nl.append(self.op.remote_node)
9404 def CheckPrereq(self):
9405 """Check prerequisites.
9408 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9409 self.op.iallocator is None)
9411 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9412 if owned_groups:
9413 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9415 return LogicalUnit.CheckPrereq(self)
9418 class TLReplaceDisks(Tasklet):
9419 """Replaces disks for an instance.
9421 Note: Locking is not within the scope of this class.
9423 """
9424 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9425 disks, delay_iallocator, early_release):
9426 """Initializes this class.
9429 Tasklet.__init__(self, lu)
9432 self.instance_name = instance_name
9433 self.mode = mode
9434 self.iallocator_name = iallocator_name
9435 self.remote_node = remote_node
9436 self.disks = disks
9437 self.delay_iallocator = delay_iallocator
9438 self.early_release = early_release
9441 self.instance = None
9442 self.new_node = None
9443 self.target_node = None
9444 self.other_node = None
9445 self.remote_node_info = None
9446 self.node_secondary_ip = None
9448 @staticmethod
9449 def CheckArguments(mode, remote_node, iallocator):
9450 """Helper function for users of this class.
9453 # check for valid parameter combination
9454 if mode == constants.REPLACE_DISK_CHG:
9455 if remote_node is None and iallocator is None:
9456 raise errors.OpPrereqError("When changing the secondary either an"
9457 " iallocator script must be used or the"
9458 " new node given", errors.ECODE_INVAL)
9460 if remote_node is not None and iallocator is not None:
9461 raise errors.OpPrereqError("Give either the iallocator or the new"
9462 " secondary, not both", errors.ECODE_INVAL)
9464 elif remote_node is not None or iallocator is not None:
9465 # Not replacing the secondary
9466 raise errors.OpPrereqError("The iallocator and new node options can"
9467 " only be used when changing the"
9468 " secondary node", errors.ECODE_INVAL)
9470 @staticmethod
9471 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9472 """Compute a new secondary node using an IAllocator.
9475 ial = IAllocator(lu.cfg, lu.rpc,
9476 mode=constants.IALLOCATOR_MODE_RELOC,
9477 name=instance_name,
9478 relocate_from=list(relocate_from))
9480 ial.Run(iallocator_name)
9482 if not ial.success:
9483 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9484 " %s" % (iallocator_name, ial.info),
9485 errors.ECODE_NORES)
9487 if len(ial.result) != ial.required_nodes:
9488 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9489 " of nodes (%s), required %s" %
9491 len(ial.result), ial.required_nodes),
9494 remote_node_name = ial.result[0]
9496 lu.LogInfo("Selected new secondary for instance '%s': %s",
9497 instance_name, remote_node_name)
9499 return remote_node_name
9501 def _FindFaultyDisks(self, node_name):
9502 """Wrapper for L{_FindFaultyInstanceDisks}.
9505 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9508 def _CheckDisksActivated(self, instance):
9509 """Checks if the instance disks are activated.
9511 @param instance: The instance to check disks
9512 @return: True if they are activated, False otherwise
9514 """
9515 nodes = instance.all_nodes
9517 for idx, dev in enumerate(instance.disks):
9518 for node in nodes:
9519 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
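# SetDiskID fills in the node-specific physical_id that the RPC layer
# needs to look up the device on that node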
9520 self.cfg.SetDiskID(dev, node)
9522 result = self.rpc.call_blockdev_find(node, dev)
9524 if result.offline:
9525 continue
9526 elif result.fail_msg or not result.payload:
9527 return False
9529 return True
9531 def CheckPrereq(self):
9532 """Check prerequisites.
9534 This checks that the instance is in the cluster.
9536 """
9537 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9538 assert instance is not None, \
9539 "Cannot retrieve locked instance %s" % self.instance_name
9541 if instance.disk_template != constants.DT_DRBD8:
9542 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9543 " instances", errors.ECODE_INVAL)
9545 if len(instance.secondary_nodes) != 1:
9546 raise errors.OpPrereqError("The instance has a strange layout,"
9547 " expected one secondary but found %d" %
9548 len(instance.secondary_nodes),
9549 errors.ECODE_FAULT)
9551 if not self.delay_iallocator:
9552 self._CheckPrereq2()
9554 def _CheckPrereq2(self):
9555 """Check prerequisites, second part.
9557 This function should always be part of CheckPrereq. It was separated and is
9558 now called from Exec because during node evacuation iallocator was only
9559 called with an unmodified cluster model, not taking planned changes into
9560 account.
9562 """
9563 instance = self.instance
9564 secondary_node = instance.secondary_nodes[0]
9566 if self.iallocator_name is None:
9567 remote_node = self.remote_node
9568 else:
9569 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9570 instance.name, instance.secondary_nodes)
9572 if remote_node is None:
9573 self.remote_node_info = None
9574 else:
9575 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9576 "Remote node '%s' is not locked" % remote_node
9578 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9579 assert self.remote_node_info is not None, \
9580 "Cannot retrieve locked node %s" % remote_node
9582 if remote_node == self.instance.primary_node:
9583 raise errors.OpPrereqError("The specified node is the primary node of"
9584 " the instance", errors.ECODE_INVAL)
9586 if remote_node == secondary_node:
9587 raise errors.OpPrereqError("The specified node is already the"
9588 " secondary node of the instance",
9591 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9592 constants.REPLACE_DISK_CHG):
9593 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9594 errors.ECODE_INVAL)
9596 if self.mode == constants.REPLACE_DISK_AUTO:
9597 if not self._CheckDisksActivated(instance):
9598 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9599 " first" % self.instance_name,
9601 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9602 faulty_secondary = self._FindFaultyDisks(secondary_node)
9604 if faulty_primary and faulty_secondary:
9605 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9606 " one node and can not be repaired"
9607 " automatically" % self.instance_name,
9611 self.disks = faulty_primary
9612 self.target_node = instance.primary_node
9613 self.other_node = secondary_node
9614 check_nodes = [self.target_node, self.other_node]
9615 elif faulty_secondary:
9616 self.disks = faulty_secondary
9617 self.target_node = secondary_node
9618 self.other_node = instance.primary_node
9619 check_nodes = [self.target_node, self.other_node]
9620 else:
9621 self.disks = []
9622 check_nodes = []
9624 else:
9625 # Non-automatic modes
9626 if self.mode == constants.REPLACE_DISK_PRI:
9627 self.target_node = instance.primary_node
9628 self.other_node = secondary_node
9629 check_nodes = [self.target_node, self.other_node]
9631 elif self.mode == constants.REPLACE_DISK_SEC:
9632 self.target_node = secondary_node
9633 self.other_node = instance.primary_node
9634 check_nodes = [self.target_node, self.other_node]
9636 elif self.mode == constants.REPLACE_DISK_CHG:
9637 self.new_node = remote_node
9638 self.other_node = instance.primary_node
9639 self.target_node = secondary_node
9640 check_nodes = [self.new_node, self.other_node]
9642 _CheckNodeNotDrained(self.lu, remote_node)
9643 _CheckNodeVmCapable(self.lu, remote_node)
9645 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9646 assert old_node_info is not None
9647 if old_node_info.offline and not self.early_release:
9648 # doesn't make sense to delay the release
9649 self.early_release = True
9650 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9651 " early-release mode", secondary_node)
9654 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9657 # If not specified all disks should be replaced
9659 self.disks = range(len(self.instance.disks))
9661 for node in check_nodes:
9662 _CheckNodeOnline(self.lu, node)
9664 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9665 self.other_node,
9666 self.target_node]
9667 if node_name is not None)
9669 # Release unneeded node locks
9670 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9672 # Release any owned node group
9673 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9674 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9676 # Check whether disks are valid
9677 for disk_idx in self.disks:
9678 instance.FindDisk(disk_idx)
9680 # Get secondary node IP addresses
9681 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9682 in self.cfg.GetMultiNodeInfo(touched_nodes))
9684 def Exec(self, feedback_fn):
9685 """Execute disk replacement.
9687 This dispatches the disk replacement to the appropriate handler.
9689 """
9690 if self.delay_iallocator:
9691 self._CheckPrereq2()
9693 if __debug__:
9694 # Verify owned locks before starting operation
9695 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9696 assert set(owned_nodes) == set(self.node_secondary_ip), \
9697 ("Incorrect node locks, owning %s, expected %s" %
9698 (owned_nodes, self.node_secondary_ip.keys()))
9700 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9701 assert list(owned_instances) == [self.instance_name], \
9702 "Instance '%s' not locked" % self.instance_name
9704 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9705 "Should not own any node group lock at this point"
9708 feedback_fn("No disks need replacement")
9711 feedback_fn("Replacing disk(s) %s for %s" %
9712 (utils.CommaJoin(self.disks), self.instance.name))
9714 activate_disks = (not self.instance.admin_up)
9716 # Activate the instance disks if we're replacing them on a down instance
9717 if activate_disks:
9718 _StartInstanceDisks(self.lu, self.instance, True)
9720 try:
9721 # Should we replace the secondary node?
9722 if self.new_node is not None:
9723 fn = self._ExecDrbd8Secondary
9724 else:
9725 fn = self._ExecDrbd8DiskOnly
9727 result = fn(feedback_fn)
9728 finally:
9729 # Deactivate the instance disks if we're replacing them on a
9730 # down instance
9731 if activate_disks:
9732 _SafeShutdownInstanceDisks(self.lu, self.instance)
9734 if __debug__:
9735 # Verify owned locks
9736 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9737 nodes = frozenset(self.node_secondary_ip)
9738 assert ((self.early_release and not owned_nodes) or
9739 (not self.early_release and not (set(owned_nodes) - nodes))), \
9740 ("Not owning the correct locks, early_release=%s, owned=%r,"
9741 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9745 def _CheckVolumeGroup(self, nodes):
9746 self.lu.LogInfo("Checking volume groups")
9748 vgname = self.cfg.GetVGName()
9750 # Make sure volume group exists on all involved nodes
9751 results = self.rpc.call_vg_list(nodes)
9752 if not results:
9753 raise errors.OpExecError("Can't list volume groups on the nodes")
9755 for node in nodes:
9756 res = results[node]
9757 res.Raise("Error checking node %s" % node)
9758 if vgname not in res.payload:
9759 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9760 (vgname, node))
9762 def _CheckDisksExistence(self, nodes):
9763 # Check disk existence
9764 for idx, dev in enumerate(self.instance.disks):
9765 if idx not in self.disks:
9766 continue
9768 for node in nodes:
9769 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9770 self.cfg.SetDiskID(dev, node)
9772 result = self.rpc.call_blockdev_find(node, dev)
9774 msg = result.fail_msg
9775 if msg or not result.payload:
9776 if not msg:
9777 msg = "disk not found"
9778 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9779 (idx, node, msg))
9781 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9782 for idx, dev in enumerate(self.instance.disks):
9783 if idx not in self.disks:
9784 continue
9786 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9787 (idx, node_name))
9789 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9790 ldisk=ldisk):
9791 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9792 " replace disks for instance %s" %
9793 (node_name, self.instance.name))
9795 def _CreateNewStorage(self, node_name):
9796 """Create new storage on the primary or secondary node.
9798 This is only used for same-node replaces, not for changing the
9799 secondary node, hence we don't want to modify the existing disk.
9801 """
9802 iv_names = {}
9804 for idx, dev in enumerate(self.instance.disks):
9805 if idx not in self.disks:
9806 continue
9808 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9810 self.cfg.SetDiskID(dev, node_name)
9812 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9813 names = _GenerateUniqueNames(self.lu, lv_names)
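# reuse the volume groups of the existing data and metadata LVs for the
# replacement LVs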
9815 vg_data = dev.children[0].logical_id[0]
9816 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9817 logical_id=(vg_data, names[0]))
9818 vg_meta = dev.children[1].logical_id[0]
9819 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
9820 logical_id=(vg_meta, names[1]))
9822 new_lvs = [lv_data, lv_meta]
9823 old_lvs = [child.Copy() for child in dev.children]
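# work on copies of the old children: their logical_ids are updated after
# the rename below, so the final removal targets the renamed LVs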
9824 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9826 # we pass force_create=True to force the LVM creation
9827 for new_lv in new_lvs:
9828 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9829 _GetInstanceInfoText(self.instance), False)
9831 return iv_names
9833 def _CheckDevices(self, node_name, iv_names):
9834 for name, (dev, _, _) in iv_names.iteritems():
9835 self.cfg.SetDiskID(dev, node_name)
9837 result = self.rpc.call_blockdev_find(node_name, dev)
9839 msg = result.fail_msg
9840 if msg or not result.payload:
9842 msg = "disk not found"
9843 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9844 (name, msg))
9846 if result.payload.is_degraded:
9847 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9849 def _RemoveOldStorage(self, node_name, iv_names):
9850 for name, (_, old_lvs, _) in iv_names.iteritems():
9851 self.lu.LogInfo("Remove logical volumes for %s" % name)
9853 for lv in old_lvs:
9854 self.cfg.SetDiskID(lv, node_name)
9856 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9857 if msg:
9858 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9859 hint="remove unused LVs manually")
9861 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9862 """Replace a disk on the primary or secondary for DRBD 8.
9864 The algorithm for replace is quite complicated:
9866 1. for each disk to be replaced:
9868 1. create new LVs on the target node with unique names
9869 1. detach old LVs from the drbd device
9870 1. rename old LVs to name_replaced.<time_t>
9871 1. rename new LVs to old LVs
9872 1. attach the new LVs (with the old names now) to the drbd device
9874 1. wait for sync across all devices
9876 1. for each modified disk:
9878 1. remove old LVs (which have the name name_replaced.<time_t>)
9880 Failures are not very well handled.
9882 """
9883 steps_total = 6
9885 # Step: check device activation
9886 self.lu.LogStep(1, steps_total, "Check device existence")
9887 self._CheckDisksExistence([self.other_node, self.target_node])
9888 self._CheckVolumeGroup([self.target_node, self.other_node])
9890 # Step: check other node consistency
9891 self.lu.LogStep(2, steps_total, "Check peer consistency")
9892 self._CheckDisksConsistency(self.other_node,
9893 self.other_node == self.instance.primary_node,
9894 False)
9896 # Step: create new storage
9897 self.lu.LogStep(3, steps_total, "Allocate new storage")
9898 iv_names = self._CreateNewStorage(self.target_node)
9900 # Step: for each lv, detach+rename*2+attach
9901 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9902 for dev, old_lvs, new_lvs in iv_names.itervalues():
9903 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9905 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9906 old_lvs)
9907 result.Raise("Can't detach drbd from local storage on node"
9908 " %s for device %s" % (self.target_node, dev.iv_name))
9910 #cfg.Update(instance)
9912 # ok, we created the new LVs, so now we know we have the needed
9913 # storage; as such, we proceed on the target node to rename
9914 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9915 # using the assumption that logical_id == physical_id (which in
9916 # turn is the unique_id on that node)
9918 # FIXME(iustin): use a better name for the replaced LVs
9919 temp_suffix = int(time.time())
9920 ren_fn = lambda d, suff: (d.physical_id[0],
9921 d.physical_id[1] + "_replaced-%s" % suff)
9923 # Build the rename list based on what LVs exist on the node
9924 rename_old_to_new = []
9925 for to_ren in old_lvs:
9926 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9927 if not result.fail_msg and result.payload:
9928 # device exists
9929 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9931 self.lu.LogInfo("Renaming the old LVs on the target node")
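# call_blockdev_rename expects a list of (device, new_unique_id) pairs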
9932 result = self.rpc.call_blockdev_rename(self.target_node,
9933 rename_old_to_new)
9934 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9936 # Now we rename the new LVs to the old LVs
9937 self.lu.LogInfo("Renaming the new LVs on the target node")
9938 rename_new_to_old = [(new, old.physical_id)
9939 for old, new in zip(old_lvs, new_lvs)]
9940 result = self.rpc.call_blockdev_rename(self.target_node,
9941 rename_new_to_old)
9942 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9944 # Intermediate steps of in memory modifications
9945 for old, new in zip(old_lvs, new_lvs):
9946 new.logical_id = old.logical_id
9947 self.cfg.SetDiskID(new, self.target_node)
9949 # We need to modify old_lvs so that removal later removes the
9950 # right LVs, not the newly added ones; note that old_lvs is a
9951 # copy here
9952 for disk in old_lvs:
9953 disk.logical_id = ren_fn(disk, temp_suffix)
9954 self.cfg.SetDiskID(disk, self.target_node)
9956 # Now that the new lvs have the old name, we can add them to the device
9957 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9958 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9959 new_lvs)
9960 msg = result.fail_msg
9961 if msg:
9962 for new_lv in new_lvs:
9963 msg2 = self.rpc.call_blockdev_remove(self.target_node,
9964 new_lv).fail_msg
9965 if msg2:
9966 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
9967 hint=("cleanup manually the unused logical"
9968 " volumes"))
9969 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
9971 cstep = 5
9972 if self.early_release:
9973 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9974 cstep += 1
9975 self._RemoveOldStorage(self.target_node, iv_names)
9976 # WARNING: we release both node locks here, do not do other RPCs
9977 # than WaitForSync to the primary node
9978 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
9979 names=[self.target_node, self.other_node])
9981 # Wait for sync
9982 # This can fail as the old devices are degraded and _WaitForSync
9983 # does a combined result over all disks, so we don't check its return value
9984 self.lu.LogStep(cstep, steps_total, "Sync devices")
9985 cstep += 1
9986 _WaitForSync(self.lu, self.instance)
9988 # Check all devices manually
9989 self._CheckDevices(self.instance.primary_node, iv_names)
9991 # Step: remove old storage
9992 if not self.early_release:
9993 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9994 cstep += 1
9995 self._RemoveOldStorage(self.target_node, iv_names)
9997 def _ExecDrbd8Secondary(self, feedback_fn):
9998 """Replace the secondary node for DRBD 8.
10000 The algorithm for replace is quite complicated:
10001 - for all disks of the instance:
10002 - create new LVs on the new node with same names
10003 - shutdown the drbd device on the old secondary
10004 - disconnect the drbd network on the primary
10005 - create the drbd device on the new secondary
10006 - network attach the drbd on the primary, using an artifice:
10007 the drbd code for Attach() will connect to the network if it
10008 finds a device which is connected to the good local disks but
10009 not network enabled
10010 - wait for sync across all devices
10011 - remove all disks from the old secondary
10013 Failures are not very well handled.
10015 """
10016 steps_total = 6
10018 pnode = self.instance.primary_node
10020 # Step: check device activation
10021 self.lu.LogStep(1, steps_total, "Check device existence")
10022 self._CheckDisksExistence([self.instance.primary_node])
10023 self._CheckVolumeGroup([self.instance.primary_node])
10025 # Step: check other node consistency
10026 self.lu.LogStep(2, steps_total, "Check peer consistency")
10027 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10029 # Step: create new storage
10030 self.lu.LogStep(3, steps_total, "Allocate new storage")
10031 for idx, dev in enumerate(self.instance.disks):
10032 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10033 (self.new_node, idx))
10034 # we pass force_create=True to force LVM creation
10035 for new_lv in dev.children:
10036 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10037 _GetInstanceInfoText(self.instance), False)
10039 # Step 4: drbd minors and drbd setup changes
10040 # after this, we must manually remove the drbd minors on both the
10041 # error and the success paths
10042 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10043 minors = self.cfg.AllocateDRBDMinor([self.new_node
10044 for dev in self.instance.disks],
10045 self.instance.name)
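# one DRBD minor is reserved per instance disk on the new node; they are
# released again on the error paths below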
10046 logging.debug("Allocated minors %r", minors)
10048 iv_names = {}
10049 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10050 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10051 (self.new_node, idx))
10052 # create new devices on new_node; note that we create two IDs:
10053 # one without port, so the drbd will be activated without
10054 # networking information on the new node at this stage, and one
10055 # with network, for the latter activation in step 4
10056 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10057 if self.instance.primary_node == o_node1:
10058 p_minor = o_minor1
10059 else:
10060 assert self.instance.primary_node == o_node2, "Three-node instance?"
10061 p_minor = o_minor2
10063 new_alone_id = (self.instance.primary_node, self.new_node, None,
10064 p_minor, new_minor, o_secret)
10065 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10066 p_minor, new_minor, o_secret)
10068 iv_names[idx] = (dev, dev.children, new_net_id)
10069 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10070 new_net_id)
10071 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10072 logical_id=new_alone_id,
10073 children=dev.children,
10074 size=dev.size)
10075 try:
10076 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10077 _GetInstanceInfoText(self.instance), False)
10078 except errors.GenericError:
10079 self.cfg.ReleaseDRBDMinors(self.instance.name)
10080 raise
10082 # We have new devices, shutdown the drbd on the old secondary
10083 for idx, dev in enumerate(self.instance.disks):
10084 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10085 self.cfg.SetDiskID(dev, self.target_node)
10086 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10087 if msg:
10088 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10089 " node: %s" % (idx, msg),
10090 hint=("Please cleanup this device manually as"
10091 " soon as possible"))
10093 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10094 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10095 self.instance.disks)[pnode]
10097 msg = result.fail_msg
10098 if msg:
10099 # detaches didn't succeed (unlikely)
10100 self.cfg.ReleaseDRBDMinors(self.instance.name)
10101 raise errors.OpExecError("Can't detach the disks from the network on"
10102 " old node: %s" % (msg,))
10104 # if we managed to detach at least one, we update all the disks of
10105 # the instance to point to the new secondary
10106 self.lu.LogInfo("Updating instance configuration")
10107 for dev, _, new_logical_id in iv_names.itervalues():
10108 dev.logical_id = new_logical_id
10109 self.cfg.SetDiskID(dev, self.instance.primary_node)
10111 self.cfg.Update(self.instance, feedback_fn)
10113 # and now perform the drbd attach
10114 self.lu.LogInfo("Attaching primary drbds to new secondary"
10115 " (standalone => connected)")
10116 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10117 self.new_node],
10118 self.node_secondary_ip,
10119 self.instance.disks,
10120 self.instance.name,
10121 False)
10122 for to_node, to_result in result.items():
10123 msg = to_result.fail_msg
10124 if msg:
10125 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10126 to_node, msg,
10127 hint=("please do a gnt-instance info to see the"
10128 " status of disks"))
10129 cstep = 5
10130 if self.early_release:
10131 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10132 cstep += 1
10133 self._RemoveOldStorage(self.target_node, iv_names)
10134 # WARNING: we release all node locks here, do not do other RPCs
10135 # than WaitForSync to the primary node
10136 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10137 names=[self.instance.primary_node,
10138 self.target_node,
10139 self.new_node])
10141 # Wait for sync
10142 # This can fail as the old devices are degraded and _WaitForSync
10143 # does a combined result over all disks, so we don't check its return value
10144 self.lu.LogStep(cstep, steps_total, "Sync devices")
10145 cstep += 1
10146 _WaitForSync(self.lu, self.instance)
10148 # Check all devices manually
10149 self._CheckDevices(self.instance.primary_node, iv_names)
10151 # Step: remove old storage
10152 if not self.early_release:
10153 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10154 self._RemoveOldStorage(self.target_node, iv_names)
10157 class LURepairNodeStorage(NoHooksLU):
10158 """Repairs the volume group on a node.
10163 def CheckArguments(self):
10164 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10166 storage_type = self.op.storage_type
10168 if (constants.SO_FIX_CONSISTENCY not in
10169 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10170 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10171 " repaired" % storage_type,
10172 errors.ECODE_INVAL)
10174 def ExpandNames(self):
10175 self.needed_locks = {
10176 locking.LEVEL_NODE: [self.op.node_name],
10177 }
10179 def _CheckFaultyDisks(self, instance, node_name):
10180 """Ensure faulty disks abort the opcode or at least warn."""
10182 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10184 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10185 " node '%s'" % (instance.name, node_name),
10186 errors.ECODE_STATE)
10187 except errors.OpPrereqError, err:
10188 if self.op.ignore_consistency:
10189 self.proc.LogWarning(str(err.args[0]))
10190 else:
10191 raise
10193 def CheckPrereq(self):
10194 """Check prerequisites.
10197 # Check whether any instance on this node has faulty disks
10198 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10199 if not inst.admin_up:
10200 continue
10201 check_nodes = set(inst.all_nodes)
10202 check_nodes.discard(self.op.node_name)
10203 for inst_node_name in check_nodes:
10204 self._CheckFaultyDisks(inst, inst_node_name)
10206 def Exec(self, feedback_fn):
10207 feedback_fn("Repairing storage unit '%s' on %s ..." %
10208 (self.op.name, self.op.node_name))
10210 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10211 result = self.rpc.call_storage_execute(self.op.node_name,
10212 self.op.storage_type, st_args,
10213 self.op.name,
10214 constants.SO_FIX_CONSISTENCY)
10215 result.Raise("Failed to repair storage unit '%s' on %s" %
10216 (self.op.name, self.op.node_name))
10219 class LUNodeEvacuate(NoHooksLU):
10220 """Evacuates instances off a list of nodes.
10225 def CheckArguments(self):
10226 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10228 def ExpandNames(self):
10229 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10231 if self.op.remote_node is not None:
10232 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10233 assert self.op.remote_node
10235 if self.op.remote_node == self.op.node_name:
10236 raise errors.OpPrereqError("Can not use evacuated node as a new"
10237 " secondary node", errors.ECODE_INVAL)
10239 if self.op.mode != constants.IALLOCATOR_NEVAC_SEC:
10240 raise errors.OpPrereqError("Without the use of an iallocator only"
10241 " secondary instances can be evacuated",
10242 errors.ECODE_INVAL)
10245 self.share_locks = _ShareAll()
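# all locks are acquired in shared mode: this LU only computes an
# evacuation plan, the actual moves happen in the jobs submitted below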
10246 self.needed_locks = {
10247 locking.LEVEL_INSTANCE: [],
10248 locking.LEVEL_NODEGROUP: [],
10249 locking.LEVEL_NODE: [],
10250 }
10252 if self.op.remote_node is None:
10253 # Iallocator will choose any node(s) in the same group
10254 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10255 else:
10256 group_nodes = frozenset([self.op.remote_node])
10258 # Determine nodes to be locked
10259 self.lock_nodes = set([self.op.node_name]) | group_nodes
10261 def _DetermineInstances(self):
10262 """Builds list of instances to operate on.
10265 assert self.op.mode in constants.IALLOCATOR_NEVAC_MODES
10267 if self.op.mode == constants.IALLOCATOR_NEVAC_PRI:
10268 # Primary instances only
10269 inst_fn = _GetNodePrimaryInstances
10270 assert self.op.remote_node is None, \
10271 "Evacuating primary instances requires iallocator"
10272 elif self.op.mode == constants.IALLOCATOR_NEVAC_SEC:
10273 # Secondary instances only
10274 inst_fn = _GetNodeSecondaryInstances
10276 else:
10277 assert self.op.mode == constants.IALLOCATOR_NEVAC_ALL
10278 inst_fn = _GetNodeInstances
10280 return inst_fn(self.cfg, self.op.node_name)
10282 def DeclareLocks(self, level):
10283 if level == locking.LEVEL_INSTANCE:
10284 # Lock instances optimistically, needs verification once node and group
10285 # locks have been acquired
10286 self.needed_locks[locking.LEVEL_INSTANCE] = \
10287 set(i.name for i in self._DetermineInstances())
10289 elif level == locking.LEVEL_NODEGROUP:
10290 # Lock node groups optimistically, needs verification once nodes have
10291 # been acquired
10292 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10293 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10295 elif level == locking.LEVEL_NODE:
10296 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10298 def CheckPrereq(self):
10299 # Verify locks
10300 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10301 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10302 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10304 assert owned_nodes == self.lock_nodes
10306 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10307 if owned_groups != wanted_groups:
10308 raise errors.OpExecError("Node groups changed since locks were acquired,"
10309 " current groups are '%s', used to be '%s'" %
10310 (utils.CommaJoin(wanted_groups),
10311 utils.CommaJoin(owned_groups)))
10313 # Determine affected instances
10314 self.instances = self._DetermineInstances()
10315 self.instance_names = [i.name for i in self.instances]
10317 if set(self.instance_names) != owned_instances:
10318 raise errors.OpExecError("Instances on node '%s' changed since locks"
10319 " were acquired, current instances are '%s',"
10320 " used to be '%s'" %
10321 (self.op.node_name,
10322 utils.CommaJoin(self.instance_names),
10323 utils.CommaJoin(owned_instances)))
10325 if self.instance_names:
10326 self.LogInfo("Evacuating instances from node '%s': %s",
10327 self.op.node_name,
10328 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10329 else:
10330 self.LogInfo("No instances to evacuate from node '%s'",
10331 self.op.node_name)
10333 if self.op.remote_node is not None:
10334 for i in self.instances:
10335 if i.primary_node == self.op.remote_node:
10336 raise errors.OpPrereqError("Node %s is the primary node of"
10337 " instance %s, cannot use it as"
10339 (self.op.remote_node, i.name),
10340 errors.ECODE_INVAL)
10342 def Exec(self, feedback_fn):
10343 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10345 if not self.instance_names:
10346 # No instances to evacuate
10347 jobs = []
10349 elif self.op.iallocator is not None:
10350 # TODO: Implement relocation to other group
10351 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10352 evac_mode=self.op.mode,
10353 instances=list(self.instance_names))
10355 ial.Run(self.op.iallocator)
10357 if not ial.success:
10358 raise errors.OpPrereqError("Can't compute node evacuation using"
10359 " iallocator '%s': %s" %
10360 (self.op.iallocator, ial.info),
10361 errors.ECODE_NORES)
10363 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10365 elif self.op.remote_node is not None:
10366 assert self.op.mode == constants.IALLOCATOR_NEVAC_SEC
10367 jobs = [
10368 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10369 remote_node=self.op.remote_node,
10370 disks=[],
10371 mode=constants.REPLACE_DISK_CHG,
10372 early_release=self.op.early_release)]
10373 for instance_name in self.instance_names
10374 ]
10376 else:
10377 raise errors.ProgrammerError("No iallocator or remote node")
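# the jobs are not run here: returning them via ResultWithJobs makes the
# master daemon submit them once this LU finishes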
10379 return ResultWithJobs(jobs)
10382 def _SetOpEarlyRelease(early_release, op):
10383 """Sets C{early_release} flag on opcodes if available.
10385 """
10386 try:
10387 op.early_release = early_release
10388 except AttributeError:
10389 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10391 return op
10394 def _NodeEvacDest(use_nodes, group, nodes):
10395 """Returns group or nodes depending on caller's choice.
10399 return utils.CommaJoin(nodes)
10404 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10405 """Unpacks the result of change-group and node-evacuate iallocator requests.
10407 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10408 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10410 @type lu: L{LogicalUnit}
10411 @param lu: Logical unit instance
10412 @type alloc_result: tuple/list
10413 @param alloc_result: Result from iallocator
10414 @type early_release: bool
10415 @param early_release: Whether to release locks early if possible
10416 @type use_nodes: bool
10417 @param use_nodes: Whether to display node names instead of groups
10419 """
10420 (moved, failed, jobs) = alloc_result
10422 if failed:
10423 lu.LogWarning("Unable to evacuate instances %s",
10424 utils.CommaJoin("%s (%s)" % (name, reason)
10425 for (name, reason) in failed))
10427 if moved:
10428 lu.LogInfo("Instances to be moved: %s",
10429 utils.CommaJoin("%s (to %s)" %
10430 (name, _NodeEvacDest(use_nodes, group, nodes))
10431 for (name, group, nodes) in moved))
10433 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10434 map(opcodes.OpCode.LoadOpCode, ops))
10435 for ops in jobs]
10438 class LUInstanceGrowDisk(LogicalUnit):
10439 """Grow a disk of an instance.
10442 HPATH = "disk-grow"
10443 HTYPE = constants.HTYPE_INSTANCE
10444 REQ_BGL = False
10446 def ExpandNames(self):
10447 self._ExpandAndLockInstance()
10448 self.needed_locks[locking.LEVEL_NODE] = []
10449 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10451 def DeclareLocks(self, level):
10452 if level == locking.LEVEL_NODE:
10453 self._LockInstancesNodes()
10455 def BuildHooksEnv(self):
10456 """Build hooks env.
10458 This runs on the master, the primary and all the secondaries.
10460 """
10461 env = {
10462 "DISK": self.op.disk,
10463 "AMOUNT": self.op.amount,
10464 }
10465 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10466 return env
10468 def BuildHooksNodes(self):
10469 """Build hooks nodes.
10472 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10475 def CheckPrereq(self):
10476 """Check prerequisites.
10478 This checks that the instance is in the cluster.
10480 """
10481 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10482 assert instance is not None, \
10483 "Cannot retrieve locked instance %s" % self.op.instance_name
10484 nodenames = list(instance.all_nodes)
10485 for node in nodenames:
10486 _CheckNodeOnline(self, node)
10488 self.instance = instance
10490 if instance.disk_template not in constants.DTS_GROWABLE:
10491 raise errors.OpPrereqError("Instance's disk layout does not support"
10492 " growing", errors.ECODE_INVAL)
10494 self.disk = instance.FindDisk(self.op.disk)
10496 if instance.disk_template not in (constants.DT_FILE,
10497 constants.DT_SHARED_FILE):
10498 # TODO: check the free disk space for file, when that feature will be
10499 # implemented
10500 _CheckNodesFreeDiskPerVG(self, nodenames,
10501 self.disk.ComputeGrowth(self.op.amount))
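# ComputeGrowth reports the extra space needed per volume group, which is
# the format _CheckNodesFreeDiskPerVG expects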
10503 def Exec(self, feedback_fn):
10504 """Execute disk grow.
10507 instance = self.instance
10510 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10512 raise errors.OpExecError("Cannot activate block device to grow")
10514 # First run all grow ops in dry-run mode
10515 for node in instance.all_nodes:
10516 self.cfg.SetDiskID(disk, node)
10517 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10518 result.Raise("Grow request failed to node %s" % node)
10520 # We know that (as far as we can test) operations across different
10521 # nodes will succeed, time to run it for real
10522 for node in instance.all_nodes:
10523 self.cfg.SetDiskID(disk, node)
10524 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10525 result.Raise("Grow request failed to node %s" % node)
10527 # TODO: Rewrite code to work properly
10528 # DRBD goes into sync mode for a short amount of time after executing the
10529 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10530 # calling "resize" in sync mode fails. Sleeping for a short amount of
10531 # time is a work-around.
10533 time.sleep(5)
10534 disk.RecordGrow(self.op.amount)
10535 self.cfg.Update(instance, feedback_fn)
10536 if self.op.wait_for_sync:
10537 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10538 if disk_abort:
10539 self.proc.LogWarning("Disk sync-ing has not returned a good"
10540 " status; please check the instance")
10541 if not instance.admin_up:
10542 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10543 elif not instance.admin_up:
10544 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10545 " not supposed to be running because no wait for"
10546 " sync mode was requested")
10549 class LUInstanceQueryData(NoHooksLU):
10550 """Query runtime instance data.
10555 def ExpandNames(self):
10556 self.needed_locks = {}
10558 # Use locking if requested or when non-static information is wanted
10559 if not (self.op.static or self.op.use_locking):
10560 self.LogWarning("Non-static data requested, locks need to be acquired")
10561 self.op.use_locking = True
10563 if self.op.instances or not self.op.use_locking:
10564 # Expand instance names right here
10565 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10566 else:
10567 # Will use acquired locks
10568 self.wanted_names = None
10570 if self.op.use_locking:
10571 self.share_locks = _ShareAll()
10573 if self.wanted_names is None:
10574 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10575 else:
10576 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10578 self.needed_locks[locking.LEVEL_NODE] = []
10579 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10581 def DeclareLocks(self, level):
10582 if self.op.use_locking and level == locking.LEVEL_NODE:
10583 self._LockInstancesNodes()
10585 def CheckPrereq(self):
10586 """Check prerequisites.
10588 This only checks the optional instance list against the existing names.
10590 """
10591 if self.wanted_names is None:
10592 assert self.op.use_locking, "Locking was not used"
10593 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
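# GetMultiInstanceInfo returns (name, instance) pairs; compat.snd picks
# the instance objects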
10595 self.wanted_instances = \
10596 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
10598 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10599 """Returns the status of a block device
10602 if self.op.static or not node:
10603 return None
10605 self.cfg.SetDiskID(dev, node)
10607 result = self.rpc.call_blockdev_find(node, dev)
10608 if result.offline:
10609 return None
10611 result.Raise("Can't compute disk status for %s" % instance_name)
10613 status = result.payload
10614 if status is None:
10615 return None
10617 return (status.dev_path, status.major, status.minor,
10618 status.sync_percent, status.estimated_time,
10619 status.is_degraded, status.ldisk_status)
10621 def _ComputeDiskStatus(self, instance, snode, dev):
10622 """Compute block device status.
10625 if dev.dev_type in constants.LDS_DRBD:
10626 # we change the snode then (otherwise we use the one passed in)
10627 if dev.logical_id[0] == instance.primary_node:
10628 snode = dev.logical_id[1]
10629 else:
10630 snode = dev.logical_id[0]
10632 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10633 instance.name, dev)
10634 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10636 if dev.children:
10637 dev_children = map(compat.partial(self._ComputeDiskStatus,
10638 instance, snode),
10639 dev.children)
10640 else:
10641 dev_children = []
10643 return {
10644 "iv_name": dev.iv_name,
10645 "dev_type": dev.dev_type,
10646 "logical_id": dev.logical_id,
10647 "physical_id": dev.physical_id,
10648 "pstatus": dev_pstatus,
10649 "sstatus": dev_sstatus,
10650 "children": dev_children,
10655 def Exec(self, feedback_fn):
10656 """Gather and return data"""
10659 cluster = self.cfg.GetClusterInfo()
10661 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
10662 for i in self.wanted_instances)
10663 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
10664 if self.op.static or pnode.offline:
10665 remote_state = None
10666 if pnode.offline:
10667 self.LogWarning("Primary node %s is marked offline, returning static"
10668 " information only for instance %s" %
10669 (pnode.name, instance.name))
10670 else:
10671 remote_info = self.rpc.call_instance_info(instance.primary_node,
10672 instance.name,
10673 instance.hypervisor)
10674 remote_info.Raise("Error checking node %s" % instance.primary_node)
10675 remote_info = remote_info.payload
10676 if remote_info and "state" in remote_info:
10677 remote_state = "up"
10678 else:
10679 remote_state = "down"
10681 if instance.admin_up:
10682 config_state = "up"
10683 else:
10684 config_state = "down"
10686 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10687 instance.disks)
10689 result[instance.name] = {
10690 "name": instance.name,
10691 "config_state": config_state,
10692 "run_state": remote_state,
10693 "pnode": instance.primary_node,
10694 "snodes": instance.secondary_nodes,
10696 # this happens to be the same format used for hooks
10697 "nics": _NICListToTuple(self, instance.nics),
10698 "disk_template": instance.disk_template,
10700 "hypervisor": instance.hypervisor,
10701 "network_port": instance.network_port,
10702 "hv_instance": instance.hvparams,
10703 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10704 "be_instance": instance.beparams,
10705 "be_actual": cluster.FillBE(instance),
10706 "os_instance": instance.osparams,
10707 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10708 "serial_no": instance.serial_no,
10709 "mtime": instance.mtime,
10710 "ctime": instance.ctime,
10711 "uuid": instance.uuid,
10717 class LUInstanceSetParams(LogicalUnit):
10718 """Modifies an instances's parameters.
10721 HPATH = "instance-modify"
10722 HTYPE = constants.HTYPE_INSTANCE
10723 REQ_BGL = False
10725 def CheckArguments(self):
10726 if not (self.op.nics or self.op.disks or self.op.disk_template or
10727 self.op.hvparams or self.op.beparams or self.op.os_name):
10728 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10730 if self.op.hvparams:
10731 _CheckGlobalHvParams(self.op.hvparams)
10733 # Disk validation
10734 disk_addremove = 0
10735 for disk_op, disk_dict in self.op.disks:
10736 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10737 if disk_op == constants.DDM_REMOVE:
10738 disk_addremove += 1
10739 continue
10740 elif disk_op == constants.DDM_ADD:
10741 disk_addremove += 1
10742 else:
10743 if not isinstance(disk_op, int):
10744 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10745 if not isinstance(disk_dict, dict):
10746 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10747 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10749 if disk_op == constants.DDM_ADD:
10750 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10751 if mode not in constants.DISK_ACCESS_SET:
10752 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10753 errors.ECODE_INVAL)
10754 size = disk_dict.get(constants.IDISK_SIZE, None)
10755 if size is None:
10756 raise errors.OpPrereqError("Required disk parameter size missing",
10757 errors.ECODE_INVAL)
10758 try:
10759 size = int(size)
10760 except (TypeError, ValueError), err:
10761 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10762 str(err), errors.ECODE_INVAL)
10763 disk_dict[constants.IDISK_SIZE] = size
10764 else:
10765 # modification of disk
10766 if constants.IDISK_SIZE in disk_dict:
10767 raise errors.OpPrereqError("Disk size change not possible, use"
10768 " grow-disk", errors.ECODE_INVAL)
10770 if disk_addremove > 1:
10771 raise errors.OpPrereqError("Only one disk add or remove operation"
10772 " supported at a time", errors.ECODE_INVAL)
10774 if self.op.disks and self.op.disk_template is not None:
10775 raise errors.OpPrereqError("Disk template conversion and other disk"
10776 " changes not supported at the same time",
10777 errors.ECODE_INVAL)
10779 if (self.op.disk_template and
10780 self.op.disk_template in constants.DTS_INT_MIRROR and
10781 self.op.remote_node is None):
10782 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10783 " one requires specifying a secondary node",
10784 errors.ECODE_INVAL)
10786 # NIC validation
10787 nic_addremove = 0
10788 for nic_op, nic_dict in self.op.nics:
10789 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10790 if nic_op == constants.DDM_REMOVE:
10791 nic_addremove += 1
10792 continue
10793 elif nic_op == constants.DDM_ADD:
10794 nic_addremove += 1
10795 else:
10796 if not isinstance(nic_op, int):
10797 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10798 if not isinstance(nic_dict, dict):
10799 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10800 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10802 # nic_dict should be a dict
10803 nic_ip = nic_dict.get(constants.INIC_IP, None)
10804 if nic_ip is not None:
10805 if nic_ip.lower() == constants.VALUE_NONE:
10806 nic_dict[constants.INIC_IP] = None
10807 else:
10808 if not netutils.IPAddress.IsValid(nic_ip):
10809 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10810 errors.ECODE_INVAL)
10812 nic_bridge = nic_dict.get("bridge", None)
10813 nic_link = nic_dict.get(constants.INIC_LINK, None)
10814 if nic_bridge and nic_link:
10815 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10816 " at the same time", errors.ECODE_INVAL)
10817 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10818 nic_dict["bridge"] = None
10819 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10820 nic_dict[constants.INIC_LINK] = None
10822 if nic_op == constants.DDM_ADD:
10823 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10824 if nic_mac is None:
10825 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10827 if constants.INIC_MAC in nic_dict:
10828 nic_mac = nic_dict[constants.INIC_MAC]
10829 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10830 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10832 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10833 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10834 " modifying an existing nic",
10835 errors.ECODE_INVAL)
10837 if nic_addremove > 1:
10838 raise errors.OpPrereqError("Only one NIC add or remove operation"
10839 " supported at a time", errors.ECODE_INVAL)
10841 def ExpandNames(self):
10842 self._ExpandAndLockInstance()
10843 self.needed_locks[locking.LEVEL_NODE] = []
10844 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10846 def DeclareLocks(self, level):
10847 if level == locking.LEVEL_NODE:
10848 self._LockInstancesNodes()
10849 if self.op.disk_template and self.op.remote_node:
10850 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10851 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10853 def BuildHooksEnv(self):
10854 """Build hooks env.
10856 This runs on the master, primary and secondaries.
10858 """
10859 args = dict()
10860 if constants.BE_MEMORY in self.be_new:
10861 args["memory"] = self.be_new[constants.BE_MEMORY]
10862 if constants.BE_VCPUS in self.be_new:
10863 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10864 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10865 # information at all.
10866 if self.op.nics:
10867 args["nics"] = []
10868 nic_override = dict(self.op.nics)
10869 for idx, nic in enumerate(self.instance.nics):
10870 if idx in nic_override:
10871 this_nic_override = nic_override[idx]
10873 this_nic_override = {}
10874 if constants.INIC_IP in this_nic_override:
10875 ip = this_nic_override[constants.INIC_IP]
10876 else:
10877 ip = nic.ip
10878 if constants.INIC_MAC in this_nic_override:
10879 mac = this_nic_override[constants.INIC_MAC]
10880 else:
10881 mac = nic.mac
10882 if idx in self.nic_pnew:
10883 nicparams = self.nic_pnew[idx]
10884 else:
10885 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10886 mode = nicparams[constants.NIC_MODE]
10887 link = nicparams[constants.NIC_LINK]
10888 args["nics"].append((ip, mac, mode, link))
10889 if constants.DDM_ADD in nic_override:
10890 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10891 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
10892 nicparams = self.nic_pnew[constants.DDM_ADD]
10893 mode = nicparams[constants.NIC_MODE]
10894 link = nicparams[constants.NIC_LINK]
10895 args["nics"].append((ip, mac, mode, link))
10896 elif constants.DDM_REMOVE in nic_override:
10897 del args["nics"][-1]
10899 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10900 if self.op.disk_template:
10901 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10905 def BuildHooksNodes(self):
10906 """Build hooks nodes.
10909 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10912 def CheckPrereq(self):
10913 """Check prerequisites.
10915 This only checks the instance list against the existing names.
10917 """
10918 # checking the new params on the primary/secondary nodes
10920 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10921 cluster = self.cluster = self.cfg.GetClusterInfo()
10922 assert self.instance is not None, \
10923 "Cannot retrieve locked instance %s" % self.op.instance_name
10924 pnode = instance.primary_node
10925 nodelist = list(instance.all_nodes)
10927 # OS change
10928 if self.op.os_name and not self.op.force:
10929 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10930 self.op.force_variant)
10931 instance_os = self.op.os_name
10932 else:
10933 instance_os = instance.os
10935 if self.op.disk_template:
10936 if instance.disk_template == self.op.disk_template:
10937 raise errors.OpPrereqError("Instance already has disk template %s" %
10938 instance.disk_template, errors.ECODE_INVAL)
10940 if (instance.disk_template,
10941 self.op.disk_template) not in self._DISK_CONVERSIONS:
10942 raise errors.OpPrereqError("Unsupported disk template conversion from"
10943 " %s to %s" % (instance.disk_template,
10944 self.op.disk_template),
10945 errors.ECODE_INVAL)
10946 _CheckInstanceDown(self, instance, "cannot change disk template")
10947 if self.op.disk_template in constants.DTS_INT_MIRROR:
10948 if self.op.remote_node == pnode:
10949 raise errors.OpPrereqError("Given new secondary node %s is the same"
10950 " as the primary node of the instance" %
10951 self.op.remote_node, errors.ECODE_STATE)
10952 _CheckNodeOnline(self, self.op.remote_node)
10953 _CheckNodeNotDrained(self, self.op.remote_node)
10954 # FIXME: here we assume that the old instance type is DT_PLAIN
10955 assert instance.disk_template == constants.DT_PLAIN
10956 disks = [{constants.IDISK_SIZE: d.size,
10957 constants.IDISK_VG: d.logical_id[0]}
10958 for d in instance.disks]
10959 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10960 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
10962 # hvparams processing
10963 if self.op.hvparams:
10964 hv_type = instance.hypervisor
10965 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
10966 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
10967 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
10969 # local check
10970 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
10971 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
10972 self.hv_proposed = self.hv_new = hv_new # the new actual values
10973 self.hv_inst = i_hvdict # the new dict (without defaults)
10974 else:
10975 self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
10976 instance.hvparams)
10977 self.hv_new = self.hv_inst = {}
10979 # beparams processing
10980 if self.op.beparams:
10981 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
10982 use_none=True)
10983 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
10984 be_new = cluster.SimpleFillBE(i_bedict)
10985 self.be_proposed = self.be_new = be_new # the new actual values
10986 self.be_inst = i_bedict # the new dict (without defaults)
10987 else:
10988 self.be_new = self.be_inst = {}
10989 self.be_proposed = cluster.SimpleFillBE(instance.beparams)
10990 be_old = cluster.FillBE(instance)
10992 # CPU param validation -- checking every time a parameter is
10993 # changed to cover all cases where either CPU mask or vcpus have
10994 # been changed
10995 if (constants.BE_VCPUS in self.be_proposed and
10996 constants.HV_CPU_MASK in self.hv_proposed):
10997 cpu_list = \
10998 utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
10999 # Verify mask is consistent with number of vCPUs. Can skip this
11000 # test if only 1 entry in the CPU mask, which means same mask
11001 # is applied to all vCPUs.
11002 if (len(cpu_list) > 1 and
11003 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11004 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11005 " CPU mask [%s]" %
11006 (self.be_proposed[constants.BE_VCPUS],
11007 self.hv_proposed[constants.HV_CPU_MASK]),
11008 errors.ECODE_INVAL)
11010 # Only perform this test if a new CPU mask is given
11011 if constants.HV_CPU_MASK in self.hv_new:
11012 # Calculate the largest CPU number requested
11013 max_requested_cpu = max(map(max, cpu_list))
11014 # Check that all of the instance's nodes have enough physical CPUs to
11015 # satisfy the requested CPU mask
11016 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11017 max_requested_cpu + 1, instance.hypervisor)
11019 # osparams processing
11020 if self.op.osparams:
11021 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11022 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11023 self.os_inst = i_osdict # the new dict (without defaults)
11024 else:
11025 self.os_inst = {}
11027 self.warn = []
11029 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
11030 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
11031 mem_check_list = [pnode]
11032 if be_new[constants.BE_AUTO_BALANCE]:
11033 # either we changed auto_balance to yes or it was from before
11034 mem_check_list.extend(instance.secondary_nodes)
11035 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11036 instance.hypervisor)
11037 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11038 instance.hypervisor)
11039 pninfo = nodeinfo[pnode]
11040 msg = pninfo.fail_msg
11041 if msg:
11042 # Assume the primary node is unreachable and go ahead
11043 self.warn.append("Can't get info from primary node %s: %s" %
11044 (pnode, msg))
11045 elif not isinstance(pninfo.payload.get("memory_free", None), int):
11046 self.warn.append("Node data from primary node %s doesn't contain"
11047 " free memory information" % pnode)
11048 elif instance_info.fail_msg:
11049 self.warn.append("Can't get instance runtime information: %s" %
11050 instance_info.fail_msg)
11051 else:
11052 if instance_info.payload:
11053 current_mem = int(instance_info.payload["memory"])
11054 else:
11055 # Assume instance not running
11056 # (there is a slight race condition here, but it's not very probable,
11057 # and we have no other way to check)
11058 current_mem = 0
11059 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
11060 pninfo.payload["memory_free"])
11061 if miss_mem > 0:
11062 raise errors.OpPrereqError("This change will prevent the instance"
11063 " from starting, due to %d MB of memory"
11064 " missing on its primary node" % miss_mem,
11065 errors.ECODE_NORES)
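# Worked example of the check above (illustrative numbers): raising
# BE_MEMORY to 2048 MB while the instance currently uses 1024 MB and the
# primary node reports 512 MB free gives
#   miss_mem = 2048 - 1024 - 512 = 512
# which is positive, so the change is refused with ECODE_NORES.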
11067 if be_new[constants.BE_AUTO_BALANCE]:
11068 for node, nres in nodeinfo.items():
11069 if node not in instance.secondary_nodes:
11070   continue
11071 nres.Raise("Can't get info from secondary node %s" % node,
11072 prereq=True, ecode=errors.ECODE_STATE)
11073 if not isinstance(nres.payload.get("memory_free", None), int):
11074 raise errors.OpPrereqError("Secondary node %s didn't return free"
11075 " memory information" % node,
11076 errors.ECODE_STATE)
11077 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11078 raise errors.OpPrereqError("This change will prevent the instance"
11079 " from failover to its secondary node"
11080 " %s, due to not enough memory" % node,
11081 errors.ECODE_STATE)
11083 # NIC processing
11084 self.nic_pnew = {}
11085 self.nic_pinst = {}
11086 for nic_op, nic_dict in self.op.nics:
11087 if nic_op == constants.DDM_REMOVE:
11088 if not instance.nics:
11089 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11090 errors.ECODE_INVAL)
11091 continue
11092 if nic_op != constants.DDM_ADD:
11093   # an existing nic
11094   if not instance.nics:
11095 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11096 " no NICs" % nic_op,
11097 errors.ECODE_INVAL)
11098 if nic_op < 0 or nic_op >= len(instance.nics):
11099 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11101 (nic_op, len(instance.nics) - 1),
11102 errors.ECODE_INVAL)
11103 old_nic_params = instance.nics[nic_op].nicparams
11104 old_nic_ip = instance.nics[nic_op].ip
11105 else:
11106   old_nic_params = {}
11107   old_nic_ip = None
11109 update_params_dict = dict([(key, nic_dict[key])
11110 for key in constants.NICS_PARAMETERS
11111 if key in nic_dict])
11113 if "bridge" in nic_dict:
11114 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
11116 new_nic_params = _GetUpdatedParams(old_nic_params,
11117 update_params_dict)
11118 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11119 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11120 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11121 self.nic_pinst[nic_op] = new_nic_params
11122 self.nic_pnew[nic_op] = new_filled_nic_params
11123 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11125 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11126 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11127 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11128 if msg:
11129   msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11130   if self.op.force:
11131     self.warn.append(msg)
11132   else:
11133     raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11134 if new_nic_mode == constants.NIC_MODE_ROUTED:
11135 if constants.INIC_IP in nic_dict:
11136 nic_ip = nic_dict[constants.INIC_IP]
11137 else:
11138   nic_ip = old_nic_ip
11139 if nic_ip is None:
11140   raise errors.OpPrereqError("Cannot set the nic ip to None"
11141 " on a routed nic", errors.ECODE_INVAL)
11142 if constants.INIC_MAC in nic_dict:
11143 nic_mac = nic_dict[constants.INIC_MAC]
11144 if nic_mac is None:
11145 raise errors.OpPrereqError("Cannot set the nic mac to None",
11146 errors.ECODE_INVAL)
11147 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11148 # otherwise generate the mac
11149 nic_dict[constants.INIC_MAC] = \
11150 self.cfg.GenerateMAC(self.proc.GetECId())
11151 else:
11152   # or validate/reserve the current one
11153   try:
11154     self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11155 except errors.ReservationError:
11156 raise errors.OpPrereqError("MAC address %s already in use"
11157 " in cluster" % nic_mac,
11158 errors.ECODE_NOTUNIQUE)
11161 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11162 raise errors.OpPrereqError("Disk operations not supported for"
11163 " diskless instances",
11164 errors.ECODE_INVAL)
11165 for disk_op, _ in self.op.disks:
11166 if disk_op == constants.DDM_REMOVE:
11167 if len(instance.disks) == 1:
11168 raise errors.OpPrereqError("Cannot remove the last disk of"
11169 " an instance", errors.ECODE_INVAL)
11170 _CheckInstanceDown(self, instance, "cannot remove disks")
11172 if (disk_op == constants.DDM_ADD and
11173 len(instance.disks) >= constants.MAX_DISKS):
11174 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11175 " add more" % constants.MAX_DISKS,
11176 errors.ECODE_STATE)
11177 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11179 if disk_op < 0 or disk_op >= len(instance.disks):
11180 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11182 (disk_op, len(instance.disks)),
11183 errors.ECODE_INVAL)
11187 def _ConvertPlainToDrbd(self, feedback_fn):
11188 """Converts an instance from plain to drbd.
11191 feedback_fn("Converting template to drbd")
11192 instance = self.instance
11193 pnode = instance.primary_node
11194 snode = self.op.remote_node
11196 # create a fake disk info for _GenerateDiskTemplate
11197 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11198 constants.IDISK_VG: d.logical_id[0]}
11199 for d in instance.disks]
11200 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11201 instance.name, pnode, [snode],
11202 disk_info, None, None, 0, feedback_fn)
11203 info = _GetInstanceInfoText(instance)
11204 feedback_fn("Creating aditional volumes...")
11205 # first, create the missing data and meta devices
11206 for disk in new_disks:
11207 # unfortunately this is... not too nice
11208 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11209                       info, True)
11210 for child in disk.children:
11211 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11212 # at this stage, all new LVs have been created, we can rename the
11213 # old ones
11214 feedback_fn("Renaming original volumes...")
11215 rename_list = [(o, n.children[0].logical_id)
11216 for (o, n) in zip(instance.disks, new_disks)]
11217 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11218 result.Raise("Failed to rename original LVs")
11220 feedback_fn("Initializing DRBD devices...")
11221 # all child devices are in place, we can now create the DRBD devices
11222 for disk in new_disks:
11223 for node in [pnode, snode]:
11224 f_create = node == pnode
11225 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11227 # at this point, the instance has been modified
11228 instance.disk_template = constants.DT_DRBD8
11229 instance.disks = new_disks
11230 self.cfg.Update(instance, feedback_fn)
11232 # disks are created, waiting for sync
11233 disk_abort = not _WaitForSync(self, instance,
11234 oneshot=not self.op.wait_for_sync)
11235 if disk_abort:
11236   raise errors.OpExecError("There are some degraded disks for"
11237 " this instance, please cleanup manually")
11239 def _ConvertDrbdToPlain(self, feedback_fn):
11240 """Converts an instance from drbd to plain.
11243 instance = self.instance
11244 assert len(instance.secondary_nodes) == 1
11245 pnode = instance.primary_node
11246 snode = instance.secondary_nodes[0]
11247 feedback_fn("Converting template to plain")
11249 old_disks = instance.disks
11250 new_disks = [d.children[0] for d in old_disks]
11252 # copy over size and mode
11253 for parent, child in zip(old_disks, new_disks):
11254 child.size = parent.size
11255 child.mode = parent.mode
11257 # update instance structure
11258 instance.disks = new_disks
11259 instance.disk_template = constants.DT_PLAIN
11260 self.cfg.Update(instance, feedback_fn)
11262 feedback_fn("Removing volumes on the secondary node...")
11263 for disk in old_disks:
11264 self.cfg.SetDiskID(disk, snode)
11265 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11266 if msg:
11267   self.LogWarning("Could not remove block device %s on node %s,"
11268 " continuing anyway: %s", disk.iv_name, snode, msg)
11270 feedback_fn("Removing unneeded volumes on the primary node...")
11271 for idx, disk in enumerate(old_disks):
11272 meta = disk.children[1]
11273 self.cfg.SetDiskID(meta, pnode)
11274 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11275 if msg:
11276   self.LogWarning("Could not remove metadata for disk %d on node %s,"
11277 " continuing anyway: %s", idx, pnode, msg)
11279 def Exec(self, feedback_fn):
11280 """Modifies an instance.
11282 All parameters take effect only at the next restart of the instance.
11284 """
11285 # Process here the warnings from CheckPrereq, as we don't have a
11286 # feedback_fn there.
11287 for warn in self.warn:
11288 feedback_fn("WARNING: %s" % warn)
11290 result = []
11291 instance = self.instance
11293 for disk_op, disk_dict in self.op.disks:
11294 if disk_op == constants.DDM_REMOVE:
11295 # remove the last disk
11296 device = instance.disks.pop()
11297 device_idx = len(instance.disks)
11298 for node, disk in device.ComputeNodeTree(instance.primary_node):
11299 self.cfg.SetDiskID(disk, node)
11300 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11301 if msg:
11302   self.LogWarning("Could not remove disk/%d on node %s: %s,"
11303 " continuing anyway", device_idx, node, msg)
11304 result.append(("disk/%d" % device_idx, "remove"))
11305 elif disk_op == constants.DDM_ADD:
11307 if instance.disk_template in (constants.DT_FILE,
11308 constants.DT_SHARED_FILE):
11309 file_driver, file_path = instance.disks[0].logical_id
11310 file_path = os.path.dirname(file_path)
11311 else:
11312   file_driver = file_path = None
11313 disk_idx_base = len(instance.disks)
11314 new_disk = _GenerateDiskTemplate(self,
11315 instance.disk_template,
11316 instance.name, instance.primary_node,
11317 instance.secondary_nodes,
11318 [disk_dict],
11319 file_path,
11320 file_driver,
11321 disk_idx_base, feedback_fn)[0]
11322 instance.disks.append(new_disk)
11323 info = _GetInstanceInfoText(instance)
11325 logging.info("Creating volume %s for instance %s",
11326 new_disk.iv_name, instance.name)
11327 # Note: this needs to be kept in sync with _CreateDisks
11329 for node in instance.all_nodes:
11330 f_create = node == instance.primary_node
11331 try:
11332   _CreateBlockDev(self, node, instance, new_disk,
11333 f_create, info, f_create)
11334 except errors.OpExecError, err:
11335 self.LogWarning("Failed to create volume %s (%s) on"
11337 new_disk.iv_name, new_disk, node, err)
11338 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11339 (new_disk.size, new_disk.mode)))
11340 else:
11341   # change a given disk
11342 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11343 result.append(("disk.mode/%d" % disk_op,
11344 disk_dict[constants.IDISK_MODE]))
11346 if self.op.disk_template:
11347 r_shut = _ShutdownInstanceDisks(self, instance)
11348 if not r_shut:
11349   raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11350 " proceed with disk template conversion")
11351 mode = (instance.disk_template, self.op.disk_template)
11352 try:
11353   self._DISK_CONVERSIONS[mode](self, feedback_fn)
11354 except:
11355   self.cfg.ReleaseDRBDMinors(instance.name)
11356   raise
11357 result.append(("disk_template", self.op.disk_template))
11360 for nic_op, nic_dict in self.op.nics:
11361 if nic_op == constants.DDM_REMOVE:
11362 # remove the last nic
11363 del instance.nics[-1]
11364 result.append(("nic.%d" % len(instance.nics), "remove"))
11365 elif nic_op == constants.DDM_ADD:
11366 # mac and bridge should be set, by now
11367 mac = nic_dict[constants.INIC_MAC]
11368 ip = nic_dict.get(constants.INIC_IP, None)
11369 nicparams = self.nic_pinst[constants.DDM_ADD]
11370 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11371 instance.nics.append(new_nic)
11372 result.append(("nic.%d" % (len(instance.nics) - 1),
11373 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11374 (new_nic.mac, new_nic.ip,
11375 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11376 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11377 )))
11378 else:
11379   for key in (constants.INIC_MAC, constants.INIC_IP):
11380 if key in nic_dict:
11381 setattr(instance.nics[nic_op], key, nic_dict[key])
11382 if nic_op in self.nic_pinst:
11383 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11384 for key, val in nic_dict.iteritems():
11385 result.append(("nic.%s/%d" % (key, nic_op), val))
11388 if self.op.hvparams:
11389 instance.hvparams = self.hv_inst
11390 for key, val in self.op.hvparams.iteritems():
11391 result.append(("hv/%s" % key, val))
11394 if self.op.beparams:
11395 instance.beparams = self.be_inst
11396 for key, val in self.op.beparams.iteritems():
11397 result.append(("be/%s" % key, val))
11400 if self.op.os_name:
11401 instance.os = self.op.os_name
11404 if self.op.osparams:
11405 instance.osparams = self.os_inst
11406 for key, val in self.op.osparams.iteritems():
11407 result.append(("os/%s" % key, val))
11409 self.cfg.Update(instance, feedback_fn)
11411 return result
11413 _DISK_CONVERSIONS = {
11414 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11415 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
11416 }
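# The _DISK_CONVERSIONS table above dispatches on an (old_template,
# new_template) tuple. A minimal standalone sketch of the same lookup
# pattern (hypothetical helper, not part of the Ganeti API; the conversion
# callables are assumed to take no arguments here for brevity):
def _ExampleDispatchConversion(conversions, old_template, new_template):
  """Looks up and runs the conversion function for a template pair."""
  try:
    fn = conversions[(old_template, new_template)]
  except KeyError:
    raise errors.OpExecError("Unsupported disk template conversion %s -> %s" %
                             (old_template, new_template))
  return fn()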
11419 class LUInstanceChangeGroup(LogicalUnit):
11420 HPATH = "instance-change-group"
11421 HTYPE = constants.HTYPE_INSTANCE
11422 REQ_BGL = False
11424 def ExpandNames(self):
11425 self.share_locks = _ShareAll()
11426 self.needed_locks = {
11427 locking.LEVEL_NODEGROUP: [],
11428 locking.LEVEL_NODE: [],
11429 }
11431 self._ExpandAndLockInstance()
11433 if self.op.target_groups:
11434 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11435 self.op.target_groups)
11436 else:
11437   self.req_target_uuids = None
11439 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11441 def DeclareLocks(self, level):
11442 if level == locking.LEVEL_NODEGROUP:
11443 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11445 if self.req_target_uuids:
11446 lock_groups = set(self.req_target_uuids)
11448 # Lock all groups used by instance optimistically; this requires going
11449 # via the node before it's locked, requiring verification later on
11450 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11451 lock_groups.update(instance_groups)
11452 else:
11453   # No target groups, need to lock all of them
11454 lock_groups = locking.ALL_SET
11456 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11458 elif level == locking.LEVEL_NODE:
11459 if self.req_target_uuids:
11460 # Lock all nodes used by instances
11461 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11462 self._LockInstancesNodes()
11464 # Lock all nodes in all potential target groups
11465 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11466 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11467 member_nodes = [node_name
11468 for group in lock_groups
11469 for node_name in self.cfg.GetNodeGroup(group).members]
11470 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11471 else:
11472   # Lock all nodes as all groups are potential targets
11473 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11475 def CheckPrereq(self):
11476 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11477 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11478 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11480 assert (self.req_target_uuids is None or
11481 owned_groups.issuperset(self.req_target_uuids))
11482 assert owned_instances == set([self.op.instance_name])
11484 # Get instance information
11485 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11487 # Check if node groups for locked instance are still correct
11488 assert owned_nodes.issuperset(self.instance.all_nodes), \
11489 ("Instance %s's nodes changed while we kept the lock" %
11490 self.op.instance_name)
11492 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11493                                        owned_groups)
11495 if self.req_target_uuids:
11496 # User requested specific target groups
11497 self.target_uuids = self.req_target_uuids
11498 else:
11499   # All groups except those used by the instance are potential targets
11500 self.target_uuids = owned_groups - inst_groups
11502 conflicting_groups = self.target_uuids & inst_groups
11503 if conflicting_groups:
11504 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11505 " used by the instance '%s'" %
11506 (utils.CommaJoin(conflicting_groups),
11507 self.op.instance_name),
11508 errors.ECODE_INVAL)
11510 if not self.target_uuids:
11511 raise errors.OpPrereqError("There are no possible target groups",
11512 errors.ECODE_INVAL)
11514 def BuildHooksEnv(self):
11515 """Build hooks env.
11518 assert self.target_uuids
11521 "TARGET_GROUPS": " ".join(self.target_uuids),
11524 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11528 def BuildHooksNodes(self):
11529 """Build hooks nodes.
11532 mn = self.cfg.GetMasterNode()
11533 return ([mn], [mn])
11535 def Exec(self, feedback_fn):
11536 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11538 assert instances == [self.op.instance_name], "Instance not locked"
11540 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11541 instances=instances, target_groups=list(self.target_uuids))
11543 ial.Run(self.op.iallocator)
11545 if not ial.success:
11546 raise errors.OpPrereqError("Can't compute solution for changing group of"
11547 " instance '%s' using iallocator '%s': %s" %
11548 (self.op.instance_name, self.op.iallocator,
11549  ial.info),
11550 errors.ECODE_NORES)
11552 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
11554 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11555 " instance '%s'", len(jobs), self.op.instance_name)
11557 return ResultWithJobs(jobs)
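# A hedged sketch of what Exec hands back (hypothetical opcode instances;
# mcpu.Processor._ProcessResult submits each inner list as one separate job
# and reports the resulting job IDs in the opcode result):
#   jobs = [
#     [opcodes.OpInstanceMigrate(...)],   # first job
#     [opcodes.OpInstanceFailover(...)],  # second job
#   ]
#   return ResultWithJobs(jobs)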
11560 class LUBackupQuery(NoHooksLU):
11561 """Query the exports list
11566 def ExpandNames(self):
11567 self.needed_locks = {}
11568 self.share_locks[locking.LEVEL_NODE] = 1
11569 if not self.op.nodes:
11570 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11571 else:
11572   self.needed_locks[locking.LEVEL_NODE] = \
11573 _GetWantedNodes(self, self.op.nodes)
11575 def Exec(self, feedback_fn):
11576 """Compute the list of all the exported system images.
11579 @return: a dictionary with the structure node->(export-list)
11580 where export-list is a list of the instances exported on
11584 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11585 rpcresult = self.rpc.call_export_list(self.nodes)
11586 result = {}
11587 for node in rpcresult:
11588 if rpcresult[node].fail_msg:
11589 result[node] = False
11590 else:
11591   result[node] = rpcresult[node].payload
11593 return result
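# Example of the structure returned above (illustrative data): a reachable
# node maps to its export list, an unreachable one to False:
#   {
#     "node1.example.com": ["inst1.example.com", "inst2.example.com"],
#     "node2.example.com": False,
#   }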
11596 class LUBackupPrepare(NoHooksLU):
11597 """Prepares an instance for an export and returns useful information.
11602 def ExpandNames(self):
11603 self._ExpandAndLockInstance()
11605 def CheckPrereq(self):
11606 """Check prerequisites.
11609 instance_name = self.op.instance_name
11611 self.instance = self.cfg.GetInstanceInfo(instance_name)
11612 assert self.instance is not None, \
11613 "Cannot retrieve locked instance %s" % self.op.instance_name
11614 _CheckNodeOnline(self, self.instance.primary_node)
11616 self._cds = _GetClusterDomainSecret()
11618 def Exec(self, feedback_fn):
11619 """Prepares an instance for an export.
11622 instance = self.instance
11624 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11625 salt = utils.GenerateSecret(8)
11627 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11628 result = self.rpc.call_x509_cert_create(instance.primary_node,
11629 constants.RIE_CERT_VALIDITY)
11630 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11632 (name, cert_pem) = result.payload
11634 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11635                                        cert_pem)
11637 return {
11638   "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11639   "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
11640                     salt),
11641   "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11642   }
11644 return None
11647 class LUBackupExport(LogicalUnit):
11648 """Export an instance to an image in the cluster.
11651 HPATH = "instance-export"
11652 HTYPE = constants.HTYPE_INSTANCE
11653 REQ_BGL = False
11655 def CheckArguments(self):
11656 """Check the arguments.
11659 self.x509_key_name = self.op.x509_key_name
11660 self.dest_x509_ca_pem = self.op.destination_x509_ca
11662 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11663 if not self.x509_key_name:
11664 raise errors.OpPrereqError("Missing X509 key name for encryption",
11665 errors.ECODE_INVAL)
11667 if not self.dest_x509_ca_pem:
11668 raise errors.OpPrereqError("Missing destination X509 CA",
11669 errors.ECODE_INVAL)
11671 def ExpandNames(self):
11672 self._ExpandAndLockInstance()
11674 # Lock all nodes for local exports
11675 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11676 # FIXME: lock only instance primary and destination node
11678 # Sad but true, for now we have to lock all nodes, as we don't know where
11679 # the previous export might be, and in this LU we search for it and
11680 # remove it from its current node. In the future we could fix this by:
11681 # - making a tasklet to search (share-lock all), then create the
11682 # new one, then one to remove, after
11683 # - removing the removal operation altogether
11684 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11686 def DeclareLocks(self, level):
11687 """Last minute lock declaration."""
11688 # All nodes are locked anyway, so nothing to do here.
11690 def BuildHooksEnv(self):
11691 """Build hooks env.
11693 This will run on the master, primary node and target node.
11695 """
11696 env = {
11697 "EXPORT_MODE": self.op.mode,
11698 "EXPORT_NODE": self.op.target_node,
11699 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11700 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11701 # TODO: Generic function for boolean env variables
11702 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11705 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11707 return env
11709 def BuildHooksNodes(self):
11710 """Build hooks nodes.
11713 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11715 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11716 nl.append(self.op.target_node)
11718 return (nl, nl)
11720 def CheckPrereq(self):
11721 """Check prerequisites.
11723 This checks that the instance and node names are valid.
11725 """
11726 instance_name = self.op.instance_name
11728 self.instance = self.cfg.GetInstanceInfo(instance_name)
11729 assert self.instance is not None, \
11730 "Cannot retrieve locked instance %s" % self.op.instance_name
11731 _CheckNodeOnline(self, self.instance.primary_node)
11733 if (self.op.remove_instance and self.instance.admin_up and
11734 not self.op.shutdown):
11735 raise errors.OpPrereqError("Can not remove instance without shutting it"
11738 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11739 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11740 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11741 assert self.dst_node is not None
11743 _CheckNodeOnline(self, self.dst_node.name)
11744 _CheckNodeNotDrained(self, self.dst_node.name)
11747 self.dest_disk_info = None
11748 self.dest_x509_ca = None
11750 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11751 self.dst_node = None
11753 if len(self.op.target_node) != len(self.instance.disks):
11754 raise errors.OpPrereqError(("Received destination information for %s"
11755 " disks, but instance %s has %s disks") %
11756 (len(self.op.target_node), instance_name,
11757 len(self.instance.disks)),
11758 errors.ECODE_INVAL)
11760 cds = _GetClusterDomainSecret()
11762 # Check X509 key name
11763 try:
11764   (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11765 except (TypeError, ValueError), err:
11766 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11768 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11769 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11770 errors.ECODE_INVAL)
11772 # Load and verify CA
11773 try:
11774   (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11775 except OpenSSL.crypto.Error, err:
11776 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11777 (err, ), errors.ECODE_INVAL)
11779 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11780 if errcode is not None:
11781 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11782 (msg, ), errors.ECODE_INVAL)
11784 self.dest_x509_ca = cert
11786 # Verify target information
11787 disk_info = []
11788 for idx, disk_data in enumerate(self.op.target_node):
11789   try:
11790     (host, port, magic) = \
11791 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11792 except errors.GenericError, err:
11793 raise errors.OpPrereqError("Target info for disk %s: %s" %
11794 (idx, err), errors.ECODE_INVAL)
11796 disk_info.append((host, port, magic))
11798 assert len(disk_info) == len(self.op.target_node)
11799 self.dest_disk_info = disk_info
11801 else:
11802   raise errors.ProgrammerError("Unhandled export mode %r" %
11803                                self.op.mode)
11805 # instance disk type verification
11806 # TODO: Implement export support for file-based disks
11807 for disk in self.instance.disks:
11808 if disk.dev_type == constants.LD_FILE:
11809 raise errors.OpPrereqError("Export not supported for instances with"
11810 " file-based disks", errors.ECODE_INVAL)
11812 def _CleanupExports(self, feedback_fn):
11813 """Removes exports of current instance from all other nodes.
11815 If an instance in a cluster with nodes A..D was exported to node C, its
11816 exports will be removed from the nodes A, B and D.
11818 """
11819 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11821 nodelist = self.cfg.GetNodeList()
11822 nodelist.remove(self.dst_node.name)
11824 # on one-node clusters nodelist will be empty after the removal
11825 # if we proceed the backup would be removed because OpBackupQuery
11826 # substitutes an empty list with the full cluster node list.
11827 iname = self.instance.name
11829 feedback_fn("Removing old exports for instance %s" % iname)
11830 exportlist = self.rpc.call_export_list(nodelist)
11831 for node in exportlist:
11832 if exportlist[node].fail_msg:
11833   continue
11834 if iname in exportlist[node].payload:
11835 msg = self.rpc.call_export_remove(node, iname).fail_msg
11836 if msg:
11837   self.LogWarning("Could not remove older export for instance %s"
11838 " on node %s: %s", iname, node, msg)
11840 def Exec(self, feedback_fn):
11841 """Export an instance to an image in the cluster.
11844 assert self.op.mode in constants.EXPORT_MODES
11846 instance = self.instance
11847 src_node = instance.primary_node
11849 if self.op.shutdown:
11850 # shutdown the instance, but not the disks
11851 feedback_fn("Shutting down instance %s" % instance.name)
11852 result = self.rpc.call_instance_shutdown(src_node, instance,
11853 self.op.shutdown_timeout)
11854 # TODO: Maybe ignore failures if ignore_remove_failures is set
11855 result.Raise("Could not shutdown instance %s on"
11856 " node %s" % (instance.name, src_node))
11858 # set the disks ID correctly since call_instance_start needs the
11859 # correct drbd minor to create the symlinks
11860 for disk in instance.disks:
11861 self.cfg.SetDiskID(disk, src_node)
11863 activate_disks = (not instance.admin_up)
11865 if activate_disks:
11866   # Activate the instance disks if we're exporting a stopped instance
11867 feedback_fn("Activating disks for %s" % instance.name)
11868 _StartInstanceDisks(self, instance, None)
11870 try:
11871   helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11872                                                  instance)
11874 helper.CreateSnapshots()
11875 try:
11876   if (self.op.shutdown and instance.admin_up and
11877 not self.op.remove_instance):
11878 assert not activate_disks
11879 feedback_fn("Starting instance %s" % instance.name)
11880 result = self.rpc.call_instance_start(src_node, instance,
11881                                       None, None)
11882 msg = result.fail_msg
11883 if msg:
11884   feedback_fn("Failed to start instance: %s" % msg)
11885 _ShutdownInstanceDisks(self, instance)
11886 raise errors.OpExecError("Could not start instance: %s" % msg)
11888 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11889 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11890 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11891 connect_timeout = constants.RIE_CONNECT_TIMEOUT
11892 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
11894 (key_name, _, _) = self.x509_key_name
11896 dest_ca_pem = \
11897   OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
11898                                   self.dest_x509_ca)
11900 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
11901 key_name, dest_ca_pem,
11902 timeouts)
11903 finally:
11904   helper.Cleanup()
11906 # Check for backwards compatibility
11907 assert len(dresults) == len(instance.disks)
11908 assert compat.all(isinstance(i, bool) for i in dresults), \
11909 "Not all results are boolean: %r" % dresults
11913 feedback_fn("Deactivating disks for %s" % instance.name)
11914 _ShutdownInstanceDisks(self, instance)
11916 if not (compat.all(dresults) and fin_resu):
11919 failures.append("export finalization")
11920 if not compat.all(dresults):
11921 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
11922                        if not dsk)
11923 failures.append("disk export: disk(s) %s" % fdsk)
11925 raise errors.OpExecError("Export failed, errors in %s" %
11926 utils.CommaJoin(failures))
11928 # At this point, the export was successful, we can cleanup/finish
11930 # Remove instance if requested
11931 if self.op.remove_instance:
11932 feedback_fn("Removing instance %s" % instance.name)
11933 _RemoveInstance(self, feedback_fn, instance,
11934 self.op.ignore_remove_failures)
11936 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11937 self._CleanupExports(feedback_fn)
11939 return fin_resu, dresults
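# Illustrative return value: on full success Exec returns something like
#   (True, [True, True])
# (finalization status plus one boolean per disk); any False value would
# already have raised OpExecError in the failure check above.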
11942 class LUBackupRemove(NoHooksLU):
11943 """Remove exports related to the named instance.
11948 def ExpandNames(self):
11949 self.needed_locks = {}
11950 # We need all nodes to be locked in order for RemoveExport to work, but we
11951 # don't need to lock the instance itself, as nothing will happen to it (and
11952 # we can remove exports also for a removed instance)
11953 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11955 def Exec(self, feedback_fn):
11956 """Remove any export.
11959 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
11960 # If the instance was not found we'll try with the name that was passed in.
11961 # This will only work if it was an FQDN, though.
11962 fqdn_warn = False
11963 if not instance_name:
11964   fqdn_warn = True
11965   instance_name = self.op.instance_name
11967 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
11968 exportlist = self.rpc.call_export_list(locked_nodes)
11969 found = False
11970 for node in exportlist:
11971 msg = exportlist[node].fail_msg
11972 if msg:
11973   self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
11974   continue
11975 if instance_name in exportlist[node].payload:
11976 found = True
11977 result = self.rpc.call_export_remove(node, instance_name)
11978 msg = result.fail_msg
11979 if msg:
11980   logging.error("Could not remove export for instance %s"
11981 " on node %s: %s", instance_name, node, msg)
11983 if fqdn_warn and not found:
11984 feedback_fn("Export not found. If trying to remove an export belonging"
11985 " to a deleted instance please use its Fully Qualified"
11989 class LUGroupAdd(LogicalUnit):
11990 """Logical unit for creating node groups.
11993 HPATH = "group-add"
11994 HTYPE = constants.HTYPE_GROUP
11995 REQ_BGL = False
11997 def ExpandNames(self):
11998 # We need the new group's UUID here so that we can create and acquire the
11999 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12000 # that it should not check whether the UUID exists in the configuration.
12001 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12002 self.needed_locks = {}
12003 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12005 def CheckPrereq(self):
12006 """Check prerequisites.
12008 This checks that the given group name is not an existing node group
12009 already.
12011 """
12012 try:
12013 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12014 except errors.OpPrereqError:
12015   pass
12016 else:
12017 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12018 " node group (UUID: %s)" %
12019 (self.op.group_name, existing_uuid),
12020 errors.ECODE_EXISTS)
12022 if self.op.ndparams:
12023 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12025 def BuildHooksEnv(self):
12026 """Build hooks env.
12030 "GROUP_NAME": self.op.group_name,
12033 def BuildHooksNodes(self):
12034 """Build hooks nodes.
12037 mn = self.cfg.GetMasterNode()
12038 return ([mn], [mn])
12040 def Exec(self, feedback_fn):
12041 """Add the node group to the cluster.
12044 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12045 uuid=self.group_uuid,
12046 alloc_policy=self.op.alloc_policy,
12047 ndparams=self.op.ndparams)
12049 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12050 del self.remove_locks[locking.LEVEL_NODEGROUP]
12053 class LUGroupAssignNodes(NoHooksLU):
12054 """Logical unit for assigning nodes to groups.
12059 def ExpandNames(self):
12060 # These raise errors.OpPrereqError on their own:
12061 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12062 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12064 # We want to lock all the affected nodes and groups. We have readily
12065 # available the list of nodes, and the *destination* group. To gather the
12066 # list of "source" groups, we need to fetch node information later on.
12067 self.needed_locks = {
12068 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12069 locking.LEVEL_NODE: self.op.nodes,
12070 }
12072 def DeclareLocks(self, level):
12073 if level == locking.LEVEL_NODEGROUP:
12074 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12076 # Try to get all affected nodes' groups without having the group or node
12077 # lock yet. Needs verification later in the code flow.
12078 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12080 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12082 def CheckPrereq(self):
12083 """Check prerequisites.
12086 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12087 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12088 frozenset(self.op.nodes))
12090 expected_locks = (set([self.group_uuid]) |
12091 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12092 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12093 if actual_locks != expected_locks:
12094 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12095 " current groups are '%s', used to be '%s'" %
12096 (utils.CommaJoin(expected_locks),
12097 utils.CommaJoin(actual_locks)))
12099 self.node_data = self.cfg.GetAllNodesInfo()
12100 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12101 instance_data = self.cfg.GetAllInstancesInfo()
12103 if self.group is None:
12104 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12105 (self.op.group_name, self.group_uuid))
12107 (new_splits, previous_splits) = \
12108 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12109 for node in self.op.nodes],
12110 self.node_data, instance_data)
12112 if new_splits:
12113   fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12115 if not self.op.force:
12116 raise errors.OpExecError("The following instances get split by this"
12117 " change and --force was not given: %s" %
12120 self.LogWarning("This operation will split the following instances: %s",
12123 if previous_splits:
12124 self.LogWarning("In addition, these already-split instances continue"
12125 " to be split across groups: %s",
12126 utils.CommaJoin(utils.NiceSort(previous_splits)))
12128 def Exec(self, feedback_fn):
12129 """Assign nodes to a new group.
12132 for node in self.op.nodes:
12133 self.node_data[node].group = self.group_uuid
12135 # FIXME: Depends on side-effects of modifying the result of
12136 # C{cfg.GetAllNodesInfo}
12138 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
12140 @staticmethod
12141 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12142 """Check for split instances after a node assignment.
12144 This method considers a series of node assignments as an atomic operation,
12145 and returns information about split instances after applying the set of
12146 changes.
12148 In particular, it returns information about newly split instances, and
12149 instances that were already split, and remain so after the change.
12151 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12152 considered.
12154 @type changes: list of (node_name, new_group_uuid) pairs.
12155 @param changes: list of node assignments to consider.
12156 @param node_data: a dict with data for all nodes
12157 @param instance_data: a dict with all instances to consider
12158 @rtype: a two-tuple
12159 @return: a list of instances that were previously okay and result split as a
12160 consequence of this change, and a list of instances that were previously
12161 split and this change does not fix.
12163 """
12164 changed_nodes = dict((node, group) for node, group in changes
12165 if node_data[node].group != group)
12167 all_split_instances = set()
12168 previously_split_instances = set()
12170 def InstanceNodes(instance):
12171 return [instance.primary_node] + list(instance.secondary_nodes)
12173 for inst in instance_data.values():
12174 if inst.disk_template not in constants.DTS_INT_MIRROR:
12175   continue
12177 instance_nodes = InstanceNodes(inst)
12179 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12180 previously_split_instances.add(inst.name)
12182 if len(set(changed_nodes.get(node, node_data[node].group)
12183 for node in instance_nodes)) > 1:
12184 all_split_instances.add(inst.name)
12186 return (list(all_split_instances - previously_split_instances),
12187 list(previously_split_instances & all_split_instances))
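# Hedged usage sketch for CheckAssignmentForSplitInstances (hypothetical
# objects; node_data and instance_data normally come from the configuration):
#   changes = [("node2", "uuid-of-groupB")]   # move node2 into groupB
#   # inst1 is DRBD-mirrored over node1 (in groupA) and node2
#   (new, still_split) = \
#     LUGroupAssignNodes.CheckAssignmentForSplitInstances(changes, node_data,
#                                                         instance_data)
#   # "inst1" ends up in `new`: its nodes would now span two groups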
12190 class _GroupQuery(_QueryBase):
12191 FIELDS = query.GROUP_FIELDS
12193 def ExpandNames(self, lu):
12194 lu.needed_locks = {}
12196 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12197 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12199 if not self.names:
12200   self.wanted = [name_to_uuid[name]
12201 for name in utils.NiceSort(name_to_uuid.keys())]
12202 else:
12203   # Accept names to be either names or UUIDs.
12204   missing = []
12205   self.wanted = []
12206   all_uuid = frozenset(self._all_groups.keys())
12208 for name in self.names:
12209 if name in all_uuid:
12210 self.wanted.append(name)
12211 elif name in name_to_uuid:
12212 self.wanted.append(name_to_uuid[name])
12213 else:
12214   missing.append(name)
12217 raise errors.OpPrereqError("Some groups do not exist: %s" %
12218 utils.CommaJoin(missing),
12219 errors.ECODE_NOENT)
12221 def DeclareLocks(self, lu, level):
12222   pass
12224 def _GetQueryData(self, lu):
12225 """Computes the list of node groups and their attributes.
12228 do_nodes = query.GQ_NODE in self.requested_data
12229 do_instances = query.GQ_INST in self.requested_data
12231 group_to_nodes = None
12232 group_to_instances = None
12234 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12235 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12236 # latter GetAllInstancesInfo() is not enough, for we have to go through
12237 # instance->node. Hence, we will need to process nodes even if we only need
12238 # instance information.
12239 if do_nodes or do_instances:
12240 all_nodes = lu.cfg.GetAllNodesInfo()
12241 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12242 node_to_group = {}
12244 for node in all_nodes.values():
12245 if node.group in group_to_nodes:
12246 group_to_nodes[node.group].append(node.name)
12247 node_to_group[node.name] = node.group
12249 if do_instances:
12250   all_instances = lu.cfg.GetAllInstancesInfo()
12251 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12253 for instance in all_instances.values():
12254 node = instance.primary_node
12255 if node in node_to_group:
12256 group_to_instances[node_to_group[node]].append(instance.name)
12258 if not do_nodes:
12259   # Do not pass on node information if it was not requested.
12260 group_to_nodes = None
12262 return query.GroupQueryData([self._all_groups[uuid]
12263 for uuid in self.wanted],
12264 group_to_nodes, group_to_instances)
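# A standalone sketch of the reverse mappings built above (plain data
# instead of configuration objects; hypothetical helper):
def _ExampleGroupMappings(node_groups, wanted):
  """node_groups: dict node name -> group UUID; wanted: group UUIDs.

  Returns (group -> [node names], node name -> group) restricted to the
  wanted groups.

  """
  group_to_nodes = dict((uuid, []) for uuid in wanted)
  node_to_group = {}
  for (name, group) in node_groups.items():
    if group in group_to_nodes:
      group_to_nodes[group].append(name)
      node_to_group[name] = group
  return (group_to_nodes, node_to_group)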
12267 class LUGroupQuery(NoHooksLU):
12268 """Logical unit for querying node groups.
12273 def CheckArguments(self):
12274 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12275 self.op.output_fields, False)
12277 def ExpandNames(self):
12278 self.gq.ExpandNames(self)
12280 def DeclareLocks(self, level):
12281 self.gq.DeclareLocks(self, level)
12283 def Exec(self, feedback_fn):
12284 return self.gq.OldStyleQuery(self)
12287 class LUGroupSetParams(LogicalUnit):
12288 """Modifies the parameters of a node group.
12291 HPATH = "group-modify"
12292 HTYPE = constants.HTYPE_GROUP
12293 REQ_BGL = False
12295 def CheckArguments(self):
12296 all_changes = [
12297   self.op.ndparams,
12298   self.op.alloc_policy,
12299   ]
12301 if all_changes.count(None) == len(all_changes):
12302 raise errors.OpPrereqError("Please pass at least one modification",
12303 errors.ECODE_INVAL)
12305 def ExpandNames(self):
12306 # This raises errors.OpPrereqError on its own:
12307 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12309 self.needed_locks = {
12310 locking.LEVEL_NODEGROUP: [self.group_uuid],
12311 }
12313 def CheckPrereq(self):
12314 """Check prerequisites.
12317 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12319 if self.group is None:
12320 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12321 (self.op.group_name, self.group_uuid))
12323 if self.op.ndparams:
12324 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12325 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12326 self.new_ndparams = new_ndparams
12328 def BuildHooksEnv(self):
12329 """Build hooks env.
12333 "GROUP_NAME": self.op.group_name,
12334 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12337 def BuildHooksNodes(self):
12338 """Build hooks nodes.
12341 mn = self.cfg.GetMasterNode()
12342 return ([mn], [mn])
12344 def Exec(self, feedback_fn):
12345 """Modifies the node group.
12350 if self.op.ndparams:
12351 self.group.ndparams = self.new_ndparams
12352 result.append(("ndparams", str(self.group.ndparams)))
12354 if self.op.alloc_policy:
12355 self.group.alloc_policy = self.op.alloc_policy
12357 self.cfg.Update(self.group, feedback_fn)
12359 return result
12361 class LUGroupRemove(LogicalUnit):
12362 HPATH = "group-remove"
12363 HTYPE = constants.HTYPE_GROUP
12364 REQ_BGL = False
12366 def ExpandNames(self):
12367 # This raises errors.OpPrereqError on its own:
12368 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12369 self.needed_locks = {
12370 locking.LEVEL_NODEGROUP: [self.group_uuid],
12371 }
12373 def CheckPrereq(self):
12374 """Check prerequisites.
12376 This checks that the given group name exists as a node group, that is
12377 empty (i.e., contains no nodes), and that is not the last group of the
12378 cluster.
12380 """
12381 # Verify that the group is empty.
12382 group_nodes = [node.name
12383 for node in self.cfg.GetAllNodesInfo().values()
12384 if node.group == self.group_uuid]
12387 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12389 (self.op.group_name,
12390 utils.CommaJoin(utils.NiceSort(group_nodes))),
12391 errors.ECODE_STATE)
12393 # Verify the cluster would not be left group-less.
12394 if len(self.cfg.GetNodeGroupList()) == 1:
12395 raise errors.OpPrereqError("Group '%s' is the only group,"
12396 " cannot be removed" %
12397 self.op.group_name,
12398 errors.ECODE_STATE)
12400 def BuildHooksEnv(self):
12401 """Build hooks env.
12405 "GROUP_NAME": self.op.group_name,
12408 def BuildHooksNodes(self):
12409 """Build hooks nodes.
12412 mn = self.cfg.GetMasterNode()
12413 return ([mn], [mn])
12415 def Exec(self, feedback_fn):
12416 """Remove the node group.
12420 self.cfg.RemoveNodeGroup(self.group_uuid)
12421 except errors.ConfigurationError:
12422 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12423 (self.op.group_name, self.group_uuid))
12425 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12428 class LUGroupRename(LogicalUnit):
12429 HPATH = "group-rename"
12430 HTYPE = constants.HTYPE_GROUP
12431 REQ_BGL = False
12433 def ExpandNames(self):
12434 # This raises errors.OpPrereqError on its own:
12435 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12437 self.needed_locks = {
12438 locking.LEVEL_NODEGROUP: [self.group_uuid],
12439 }
12441 def CheckPrereq(self):
12442 """Check prerequisites.
12444 Ensures requested new name is not yet used.
12446 """
12447 try:
12448 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12449 except errors.OpPrereqError:
12450   pass
12451 else:
12452 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12453 " node group (UUID: %s)" %
12454 (self.op.new_name, new_name_uuid),
12455 errors.ECODE_EXISTS)
12457 def BuildHooksEnv(self):
12458 """Build hooks env.
12462 "OLD_NAME": self.op.group_name,
12463 "NEW_NAME": self.op.new_name,
12466 def BuildHooksNodes(self):
12467 """Build hooks nodes.
12470 mn = self.cfg.GetMasterNode()
12472 all_nodes = self.cfg.GetAllNodesInfo()
12473 all_nodes.pop(mn, None)
12475 run_nodes = [mn]
12476 run_nodes.extend(node.name for node in all_nodes.values()
12477 if node.group == self.group_uuid)
12479 return (run_nodes, run_nodes)
12481 def Exec(self, feedback_fn):
12482 """Rename the node group.
12485 group = self.cfg.GetNodeGroup(self.group_uuid)
12488 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12489 (self.op.group_name, self.group_uuid))
12491 group.name = self.op.new_name
12492 self.cfg.Update(group, feedback_fn)
12494 return self.op.new_name
12497 class LUGroupEvacuate(LogicalUnit):
12498 HPATH = "group-evacuate"
12499 HTYPE = constants.HTYPE_GROUP
12500 REQ_BGL = False
12502 def ExpandNames(self):
12503 # This raises errors.OpPrereqError on its own:
12504 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12506 if self.op.target_groups:
12507 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12508 self.op.target_groups)
12509 else:
12510   self.req_target_uuids = []
12512 if self.group_uuid in self.req_target_uuids:
12513 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12514 " as a target group (targets are %s)" %
12516 utils.CommaJoin(self.req_target_uuids)),
12517 errors.ECODE_INVAL)
12519 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12521 self.share_locks = _ShareAll()
12522 self.needed_locks = {
12523 locking.LEVEL_INSTANCE: [],
12524 locking.LEVEL_NODEGROUP: [],
12525 locking.LEVEL_NODE: [],
12526 }
12528 def DeclareLocks(self, level):
12529 if level == locking.LEVEL_INSTANCE:
12530 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12532 # Lock instances optimistically, needs verification once node and group
12533 # locks have been acquired
12534 self.needed_locks[locking.LEVEL_INSTANCE] = \
12535 self.cfg.GetNodeGroupInstances(self.group_uuid)
12537 elif level == locking.LEVEL_NODEGROUP:
12538 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12540 if self.req_target_uuids:
12541 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12542 else:
12543   # Lock all groups used by instances optimistically; this requires going
12544 # via the node before it's locked, requiring verification later on
12545 lock_groups.update(group_uuid
12546 for instance_name in
12547 self.owned_locks(locking.LEVEL_INSTANCE)
12548 for group_uuid in
12549 self.cfg.GetInstanceNodeGroups(instance_name))
12550 else:
12551   # No target groups, need to lock all of them
12552 lock_groups = locking.ALL_SET
12554 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12556 elif level == locking.LEVEL_NODE:
12557 # This will only lock the nodes in the group to be evacuated which
12558 # contain actual instances
12559 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12560 self._LockInstancesNodes()
12562 # Lock all nodes in group to be evacuated and target groups
12563 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12564 assert self.group_uuid in owned_groups
12565 member_nodes = [node_name
12566 for group in owned_groups
12567 for node_name in self.cfg.GetNodeGroup(group).members]
12568 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12570 def CheckPrereq(self):
12571 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12572 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12573 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12575 assert owned_groups.issuperset(self.req_target_uuids)
12576 assert self.group_uuid in owned_groups
12578 # Check if locked instances are still correct
12579 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12581 # Get instance information
12582 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12584 # Check if node groups for locked instances are still correct
12585 for instance_name in owned_instances:
12586 inst = self.instances[instance_name]
12587 assert owned_nodes.issuperset(inst.all_nodes), \
12588 "Instance %s's nodes changed while we kept the lock" % instance_name
12590 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
12591                                        owned_groups)
12593 assert self.group_uuid in inst_groups, \
12594 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
12596 if self.req_target_uuids:
12597 # User requested specific target groups
12598 self.target_uuids = self.req_target_uuids
12599 else:
12600   # All groups except the one to be evacuated are potential targets
12601 self.target_uuids = [group_uuid for group_uuid in owned_groups
12602 if group_uuid != self.group_uuid]
12604 if not self.target_uuids:
12605 raise errors.OpPrereqError("There are no possible target groups",
12606 errors.ECODE_INVAL)
12608 def BuildHooksEnv(self):
12609 """Build hooks env.
12613 "GROUP_NAME": self.op.group_name,
12614 "TARGET_GROUPS": " ".join(self.target_uuids),
12617 def BuildHooksNodes(self):
12618 """Build hooks nodes.
12621 mn = self.cfg.GetMasterNode()
12623 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12625 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12627 return (run_nodes, run_nodes)
12629 def Exec(self, feedback_fn):
12630 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12632 assert self.group_uuid not in self.target_uuids
12634 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12635 instances=instances, target_groups=self.target_uuids)
12637 ial.Run(self.op.iallocator)
12639 if not ial.success:
12640 raise errors.OpPrereqError("Can't compute group evacuation using"
12641 " iallocator '%s': %s" %
12642 (self.op.iallocator, ial.info),
12643 errors.ECODE_NORES)
12645 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12647 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12648 len(jobs), self.op.group_name)
12650 return ResultWithJobs(jobs)
12653 class TagsLU(NoHooksLU): # pylint: disable=W0223
12654 """Generic tags LU.
12656 This is an abstract class which is the parent of all the other tags LUs.
12658 """
12659 def ExpandNames(self):
12660 self.group_uuid = None
12661 self.needed_locks = {}
12662 if self.op.kind == constants.TAG_NODE:
12663 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12664 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12665 elif self.op.kind == constants.TAG_INSTANCE:
12666 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12667 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12668 elif self.op.kind == constants.TAG_NODEGROUP:
12669 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12671 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12672 # not possible to acquire the BGL based on opcode parameters)
12674 def CheckPrereq(self):
12675 """Check prerequisites.
12678 if self.op.kind == constants.TAG_CLUSTER:
12679 self.target = self.cfg.GetClusterInfo()
12680 elif self.op.kind == constants.TAG_NODE:
12681 self.target = self.cfg.GetNodeInfo(self.op.name)
12682 elif self.op.kind == constants.TAG_INSTANCE:
12683 self.target = self.cfg.GetInstanceInfo(self.op.name)
12684 elif self.op.kind == constants.TAG_NODEGROUP:
12685 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12687 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12688 str(self.op.kind), errors.ECODE_INVAL)
12691 class LUTagsGet(TagsLU):
12692 """Returns the tags of a given object.
12697 def ExpandNames(self):
12698 TagsLU.ExpandNames(self)
12700 # Share locks as this is only a read operation
12701 self.share_locks = _ShareAll()
12703 def Exec(self, feedback_fn):
12704 """Returns the tag list.
12707 return list(self.target.GetTags())
12710 class LUTagsSearch(NoHooksLU):
12711 """Searches the tags for a given pattern.
12716 def ExpandNames(self):
12717 self.needed_locks = {}
12719 def CheckPrereq(self):
12720 """Check prerequisites.
12722 This checks the pattern passed for validity by compiling it.
12724 """
12725 try:
12726 self.re = re.compile(self.op.pattern)
12727 except re.error, err:
12728 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12729 (self.op.pattern, err), errors.ECODE_INVAL)
12731 def Exec(self, feedback_fn):
12732 """Returns the tag list.
12736 tgts = [("/cluster", cfg.GetClusterInfo())]
12737 ilist = cfg.GetAllInstancesInfo().values()
12738 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12739 nlist = cfg.GetAllNodesInfo().values()
12740 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12741 tgts.extend(("/nodegroup/%s" % n.name, n)
12742 for n in cfg.GetAllNodeGroupsInfo().values())
12743 results = []
12744 for path, target in tgts:
12745 for tag in target.GetTags():
12746 if self.re.search(tag):
12747 results.append((path, tag))
12749 return results
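# A standalone sketch of the matching loop above (plain data; the pattern is
# used as an unanchored regular expression, so "db" matches the tag
# "mydb-prod"):
def _ExampleSearchTags(pattern, tagged_objects):
  """tagged_objects: dict path -> iterable of tags; returns (path, tag)."""
  regex = re.compile(pattern)
  return [(path, tag)
          for (path, tags) in tagged_objects.items()
          for tag in tags
          if regex.search(tag)]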
12751 class LUTagsSet(TagsLU):
12752 """Sets a tag on a given object.
12757 def CheckPrereq(self):
12758 """Check prerequisites.
12760 This checks the type and length of the tag name and value.
12762 """
12763 TagsLU.CheckPrereq(self)
12764 for tag in self.op.tags:
12765 objects.TaggableObject.ValidateTag(tag)
12767 def Exec(self, feedback_fn):
12768   """Sets the tag.
12770   """
12771   try:
12772 for tag in self.op.tags:
12773 self.target.AddTag(tag)
12774 except errors.TagError, err:
12775 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12776 self.cfg.Update(self.target, feedback_fn)
12779 class LUTagsDel(TagsLU):
12780 """Delete a list of tags from a given object.
12785 def CheckPrereq(self):
12786 """Check prerequisites.
12788 This checks that we have the given tag.
12790 """
12791 TagsLU.CheckPrereq(self)
12792 for tag in self.op.tags:
12793 objects.TaggableObject.ValidateTag(tag)
12794 del_tags = frozenset(self.op.tags)
12795 cur_tags = self.target.GetTags()
12797 diff_tags = del_tags - cur_tags
12798 if diff_tags:
12799   diff_names = ("'%s'" % i for i in sorted(diff_tags))
12800 raise errors.OpPrereqError("Tag(s) %s not found" %
12801 (utils.CommaJoin(diff_names), ),
12802 errors.ECODE_NOENT)
12804 def Exec(self, feedback_fn):
12805 """Remove the tag from the object.
12808 for tag in self.op.tags:
12809 self.target.RemoveTag(tag)
12810 self.cfg.Update(self.target, feedback_fn)
12813 class LUTestDelay(NoHooksLU):
12814 """Sleep for a specified amount of time.
12816 This LU sleeps on the master and/or nodes for a specified amount of
12817 time.
12819 """
12820 REQ_BGL = False
12822 def ExpandNames(self):
12823 """Expand names and set required locks.
12825 This expands the node list, if any.
12827 """
12828 self.needed_locks = {}
12829 if self.op.on_nodes:
12830 # _GetWantedNodes can be used here, but is not always appropriate to use
12831 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
12832 # more information.
12833 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
12834 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
12836 def _TestDelay(self):
12837 """Do the actual sleep.
12840 if self.op.on_master:
12841 if not utils.TestDelay(self.op.duration):
12842 raise errors.OpExecError("Error during master delay test")
12843 if self.op.on_nodes:
12844 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
12845 for node, node_result in result.items():
12846 node_result.Raise("Failure during rpc call to node %s" % node)
12848 def Exec(self, feedback_fn):
12849 """Execute the test delay opcode, with the wanted repetitions.
12852 if self.op.repeat == 0:
12853 self._TestDelay()
12854 else:
12855   top_value = self.op.repeat - 1
12856 for i in range(self.op.repeat):
12857 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
12861 class LUTestJqueue(NoHooksLU):
12862 """Utility LU to test some aspects of the job queue.
12867 # Must be lower than default timeout for WaitForJobChange to see whether it
12868 # notices changed jobs
12869 _CLIENT_CONNECT_TIMEOUT = 20.0
12870 _CLIENT_CONFIRM_TIMEOUT = 60.0
12872 @classmethod
12873 def _NotifyUsingSocket(cls, cb, errcls):
12874 """Opens a Unix socket and waits for another program to connect.
12876 @type cb: callable
12877 @param cb: Callback to send socket name to client
12878 @type errcls: class
12879 @param errcls: Exception class to use for errors
12881 """
12882 # Using a temporary directory as there's no easy way to create temporary
12883 # sockets without writing a custom loop around tempfile.mktemp and
12885 tmpdir = tempfile.mkdtemp()
12886 try:
12887   tmpsock = utils.PathJoin(tmpdir, "sock")
12889 logging.debug("Creating temporary socket at %s", tmpsock)
12890 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
12891 try:
12892   sock.bind(tmpsock)
12893   sock.listen(1)
12895   # Send details to client
12896   cb(tmpsock)
12898 # Wait for client to connect before continuing
12899 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
12900 try:
12901   (conn, _) = sock.accept()
12902 except socket.error, err:
12903 raise errcls("Client didn't connect in time (%s)" % err)
12904 finally:
12905   sock.close()
12906 finally:
12907   # Remove as soon as client is connected
12908 shutil.rmtree(tmpdir)
12910 # Wait for client to close
12911 try:
12912   try:
12913     # pylint: disable=E1101
12914 # Instance of '_socketobject' has no ... member
12915 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
12916 conn.recv(1)
12917 except socket.error, err:
12918 raise errcls("Client failed to confirm notification (%s)" % err)
12922 def _SendNotification(self, test, arg, sockname):
12923 """Sends a notification to the client.
12925 @type test: string
12926 @param test: Test name
12927 @param arg: Test argument (depends on test)
12928 @type sockname: string
12929 @param sockname: Socket path
12931 """
12932 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))

  def _Notify(self, prereq, test, arg):
    """Notifies the client of a test.

    @type prereq: bool
    @param prereq: Whether this is a prereq-phase test
    @type test: string
    @param test: Test name
    @param arg: Test argument (depends on test)

    """
    if prereq:
      errcls = errors.OpPrereqError
    else:
      errcls = errors.OpExecError

    return self._NotifyUsingSocket(compat.partial(self._SendNotification,
                                                  test, arg),
                                   errcls)

  def CheckArguments(self):
    self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
    self.expandnames_calls = 0

  def ExpandNames(self):
    checkargs_calls = getattr(self, "checkargs_calls", 0)
    if checkargs_calls < 1:
      raise errors.ProgrammerError("CheckArguments was not called")

    self.expandnames_calls += 1

    if self.op.notify_waitlock:
      self._Notify(True, constants.JQT_EXPANDNAMES, None)

    self.LogInfo("Expanding names")

    # Get lock on master node (just to get a lock, not for a particular reason)
    self.needed_locks = {
      locking.LEVEL_NODE: self.cfg.GetMasterNode(),
      }

  def Exec(self, feedback_fn):
    if self.expandnames_calls < 1:
      raise errors.ProgrammerError("ExpandNames was not called")

    if self.op.notify_exec:
      self._Notify(False, constants.JQT_EXEC, None)

    self.LogInfo("Executing")

    if self.op.log_messages:
      self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
      for idx, msg in enumerate(self.op.log_messages):
        self.LogInfo("Sending log message %s", idx + 1)
        feedback_fn(constants.JQT_MSGPREFIX + msg)
      # Report how many test messages have been sent
      self._Notify(False, constants.JQT_LOGMSG, idx + 1)

    if self.op.fail:
      raise errors.OpExecError("Opcode failure was requested")

    return True


class IAllocator(object):
  """IAllocator framework.

  An IAllocator instance has four sets of attributes:
    - cfg that is needed to query the cluster
    - input data (all members of the _KEYS class attribute are required)
    - four buffer attributes (in|out_data|text), that represent the
      input (to the external script) in text and data structure format,
      and the output from it, again in two formats
    - the result variables from the script (success, info, result) for
      easy usage

  """
  # pylint: disable=R0902
  # lots of instance attributes

  def __init__(self, cfg, rpc, mode, **kwargs):
    self.cfg = cfg
    self.rpc = rpc
    # init buffer variables
    self.in_text = self.out_text = self.in_data = self.out_data = None
    # init all input fields so that pylint is happy
    self.mode = mode
    self.memory = self.disks = self.disk_template = None
    self.os = self.tags = self.nics = self.vcpus = None
    self.hypervisor = None
    self.relocate_from = None
    self.name = None
    self.instances = None
    self.evac_mode = None
    self.target_groups = []
    # computed fields
    self.required_nodes = None
    # init result fields
    self.success = self.info = self.result = None

    try:
      (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
    except KeyError:
      raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                   " IAllocator" % self.mode)

    keyset = [n for (n, _) in keydata]

    for key in kwargs:
      if key not in keyset:
        raise errors.ProgrammerError("Invalid input parameter '%s' to"
                                     " IAllocator" % key)
      setattr(self, key, kwargs[key])

    for key in keyset:
      if key not in kwargs:
        raise errors.ProgrammerError("Missing input parameter '%s' to"
                                     " IAllocator" % key)
    self._BuildInputData(compat.partial(fn, self), keydata)
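
  # Illustrative construction (an assumption, mirroring the callers in
  # LUTestAllocator.Exec below): every key listed in _MODE_DATA for the
  # chosen mode must be passed as a keyword argument, e.g. for an
  # allocation request:
  #
  #   ial = IAllocator(cfg, rpc, mode=constants.IALLOCATOR_MODE_ALLOC,
  #                    name="inst1.example.com", memory=1024, disks=[...],
  #                    disk_template="drbd", os="debian-image", tags=[],
  #                    nics=[...], vcpus=1, hypervisor="xen-pvm")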

  def _ComputeClusterData(self):
    """Compute the generic allocator input data.

    This is the data that is independent of the actual operation.

    """
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    # cluster data
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
      "cluster_tags": list(cluster_info.GetTags()),
      "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
      # we don't have job IDs
      }
    ninfo = cfg.GetAllNodesInfo()
    iinfo = cfg.GetAllInstancesInfo().values()
    i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]

    # node data
    node_list = [n.name for n in ninfo.values() if n.vm_capable]

    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      hypervisor_name = self.hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
    else:
      hypervisor_name = cluster_info.enabled_hypervisors[0]

    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)

    data["nodegroups"] = self._ComputeNodeGroupData(cfg)

    config_ndata = self._ComputeBasicNodeData(ninfo)
    data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
                                                 i_list, config_ndata)
    assert len(data["nodes"]) == len(ninfo), \
        "Incomplete node data computed"

    data["instances"] = self._ComputeInstanceData(cluster_info, i_list)

    self.in_data = data

  @staticmethod
  def _ComputeNodeGroupData(cfg):
    """Compute node groups data.

    """
    ng = dict((guuid, {
      "name": gdata.name,
      "alloc_policy": gdata.alloc_policy,
      })
      for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())

    return ng
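
  # Shape of the returned mapping (values assumed for illustration):
  #
  #   {"group-uuid-1": {"name": "default", "alloc_policy": "preferred"}}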

  @staticmethod
  def _ComputeBasicNodeData(node_cfg):
    """Compute global node data.

    @rtype: dict
    @returns: a dict mapping node names to config-derived node dicts

    """
    # fill in static (config-based) values
    node_results = dict((ninfo.name, {
      "tags": list(ninfo.GetTags()),
      "primary_ip": ninfo.primary_ip,
      "secondary_ip": ninfo.secondary_ip,
      "offline": ninfo.offline,
      "drained": ninfo.drained,
      "master_candidate": ninfo.master_candidate,
      "group": ninfo.group,
      "master_capable": ninfo.master_capable,
      "vm_capable": ninfo.vm_capable,
      })
      for ninfo in node_cfg.values())

    return node_results
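
  # Example of one entry in the returned dict (illustrative values only):
  #
  #   "node1.example.com": {
  #     "tags": [], "primary_ip": "192.0.2.10", "secondary_ip": "192.0.2.11",
  #     "offline": False, "drained": False, "master_candidate": True,
  #     "group": "group-uuid-1", "master_capable": True, "vm_capable": True,
  #   }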

  @staticmethod
  def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
                              node_results):
    """Compute global node data.

    @param node_results: the basic node structures as filled from the config

    """
    # make a copy of the current dict
    node_results = dict(node_results)
    for nname, nresult in node_data.items():
      assert nname in node_results, "Missing basic data for node %s" % nname
      ninfo = node_cfg[nname]

      if not (ninfo.offline or ninfo.drained):
        nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
        remote_info = nresult.payload

        for attr in ["memory_total", "memory_free", "memory_dom0",
                     "vg_size", "vg_free", "cpu_total"]:
          if attr not in remote_info:
            raise errors.OpExecError("Node '%s' didn't return attribute"
                                     " '%s'" % (nname, attr))
          if not isinstance(remote_info[attr], int):
            raise errors.OpExecError("Node '%s' returned invalid value"
                                     " for '%s': %s" %
                                     (nname, attr, remote_info[attr]))
        # compute memory used by primary instances
        i_p_mem = i_p_up_mem = 0
        for iinfo, beinfo in i_list:
          if iinfo.primary_node == nname:
            i_p_mem += beinfo[constants.BE_MEMORY]
            if iinfo.name not in node_iinfo[nname].payload:
              i_used_mem = 0
            else:
              i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
            i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
            remote_info["memory_free"] -= max(0, i_mem_diff)

            if iinfo.admin_up:
              i_p_up_mem += beinfo[constants.BE_MEMORY]

        # compute memory used by instances
        pnr_dyn = {
          "total_memory": remote_info["memory_total"],
          "reserved_memory": remote_info["memory_dom0"],
          "free_memory": remote_info["memory_free"],
          "total_disk": remote_info["vg_size"],
          "free_disk": remote_info["vg_free"],
          "total_cpus": remote_info["cpu_total"],
          "i_pri_memory": i_p_mem,
          "i_pri_up_memory": i_p_up_mem,
          }
        pnr_dyn.update(node_results[nname])
        node_results[nname] = pnr_dyn

    return node_results
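
  # Worked example of the memory correction above (values assumed): if a
  # primary instance has BE_MEMORY=1024 but the hypervisor reports only
  # 896 MiB actually in use for it, i_mem_diff is 128 and "memory_free" is
  # lowered by 128, reserving the memory the instance could still grow into.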

  @staticmethod
  def _ComputeInstanceData(cluster_info, i_list):
    """Compute global instance data.

    """
    instance_data = {}
    for iinfo, beinfo in i_list:
      nic_data = []
      for nic in iinfo.nics:
        filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
        nic_dict = {
          "mac": nic.mac,
          "ip": nic.ip,
          "mode": filled_params[constants.NIC_MODE],
          "link": filled_params[constants.NIC_LINK],
          }
        if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          nic_dict["bridge"] = filled_params[constants.NIC_LINK]
        nic_data.append(nic_dict)
      pir = {
        "tags": list(iinfo.GetTags()),
        "admin_up": iinfo.admin_up,
        "vcpus": beinfo[constants.BE_VCPUS],
        "memory": beinfo[constants.BE_MEMORY],
        "os": iinfo.os,
        "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
        "nics": nic_data,
        "disks": [{constants.IDISK_SIZE: dsk.size,
                   constants.IDISK_MODE: dsk.mode}
                  for dsk in iinfo.disks],
        "disk_template": iinfo.disk_template,
        "hypervisor": iinfo.hypervisor,
        }
      pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
                                                 pir["disks"])
      instance_data[iinfo.name] = pir

    return instance_data
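
  # Example of one entry in the returned dict (illustrative values only):
  #
  #   "inst1.example.com": {
  #     "tags": [], "admin_up": True, "vcpus": 1, "memory": 1024,
  #     "os": "debian-image", "nodes": ["node1", "node2"],
  #     "nics": [{"mac": "aa:00:00:...", "ip": None,
  #               "mode": "bridged", "link": "xen-br0",
  #               "bridge": "xen-br0"}],
  #     "disks": [{"size": 10240, "mode": "rw"}],
  #     "disk_template": "drbd", "hypervisor": "xen-pvm",
  #     "disk_space_total": 10368,
  #   }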

  def _AddNewInstance(self):
    """Add new instance data to allocator structure.

    This in combination with L{_ComputeClusterData} will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    disk_space = _ComputeDiskSize(self.disk_template, self.disks)

    if self.disk_template in constants.DTS_INT_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1

    request = {
      "name": self.name,
      "disk_template": self.disk_template,
      "tags": self.tags,
      "os": self.os,
      "vcpus": self.vcpus,
      "memory": self.memory,
      "disks": self.disks,
      "disk_space_total": disk_space,
      "nics": self.nics,
      "required_nodes": self.required_nodes,
      "hypervisor": self.hypervisor,
      }

    return request

  def _AddRelocateInstance(self):
    """Add relocate instance data to allocator structure.

    This in combination with L{_ComputeClusterData} will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    instance = self.cfg.GetInstanceInfo(self.name)
    if instance is None:
      raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                   " IAllocator" % self.name)

    if instance.disk_template not in constants.DTS_MIRRORED:
      raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                                 errors.ECODE_INVAL)

    if instance.disk_template in constants.DTS_INT_MIRROR and \
        len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)

    self.required_nodes = 1
    disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
    disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)

    request = {
      "name": self.name,
      "disk_space_total": disk_space,
      "required_nodes": self.required_nodes,
      "relocate_from": self.relocate_from,
      }
    return request

  def _AddNodeEvacuate(self):
    """Get data for node-evacuate requests.

    """
    return {
      "instances": self.instances,
      "evac_mode": self.evac_mode,
      }

  def _AddChangeGroup(self):
    """Get data for change-group requests.

    """
    return {
      "instances": self.instances,
      "target_groups": self.target_groups,
      }

  def _BuildInputData(self, fn, keydata):
    """Build input data structures.

    """
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
    for keyname, keytype in keydata:
      if keyname not in request:
        raise errors.ProgrammerError("Request parameter %s is missing" %
                                     keyname)
      val = request[keyname]
      if not keytype(val):
        raise errors.ProgrammerError("Request parameter %s doesn't pass"
                                     " validation, value %s, expected"
                                     " type %s" % (keyname, val, keytype))
    self.in_data["request"] = request

    self.in_text = serializer.Dump(self.in_data)
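
  # Structure of the serialized input handed to the external script, per the
  # code above (values illustrative):
  #
  #   {"version": 2, "cluster_name": "...", "cluster_tags": [...],
  #    "enabled_hypervisors": [...], "nodegroups": {...}, "nodes": {...},
  #    "instances": {...}, "request": {"type": "allocate", ...}}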

  _STRING_LIST = ht.TListOf(ht.TString)
  _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
     # pylint: disable=E1101
     # Class '...' has no 'OP_ID' member
     "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
                          opcodes.OpInstanceMigrate.OP_ID,
                          opcodes.OpInstanceReplaceDisks.OP_ID])
     })))

  _NEVAC_MOVED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(3),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TNonEmptyString,
                                  ht.TListOf(ht.TNonEmptyString),
                                  ])))
  _NEVAC_FAILED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(2),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TMaybeString,
                                  ])))
  _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
                          ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
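
  # A value passing _NEVAC_RESULT (illustrative sketch; semantics assumed):
  # a triple of moved instances, failed instances and job sets, e.g.
  #
  #   ([("inst1", "group2", ["node3", "node4"])],    # moved
  #    [("inst2", "disk template not supported")],   # failed
  #    [[{"OP_ID": "OP_INSTANCE_MIGRATE", ...}]])    # opcodes to submit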

  _MODE_DATA = {
    constants.IALLOCATOR_MODE_ALLOC:
      (_AddNewInstance, [
        ("name", ht.TString),
        ("memory", ht.TInt),
        ("disks", ht.TListOf(ht.TDict)),
        ("disk_template", ht.TString),
        ("os", ht.TString),
        ("tags", _STRING_LIST),
        ("nics", ht.TListOf(ht.TDict)),
        ("vcpus", ht.TInt),
        ("hypervisor", ht.TString),
        ], ht.TList),
    constants.IALLOCATOR_MODE_RELOC:
      (_AddRelocateInstance,
       [("name", ht.TString), ("relocate_from", _STRING_LIST)],
       ht.TList),
    constants.IALLOCATOR_MODE_NODE_EVAC:
      (_AddNodeEvacuate, [
        ("instances", _STRING_LIST),
        ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
        ], _NEVAC_RESULT),
    constants.IALLOCATOR_MODE_CHG_GROUP:
      (_AddChangeGroup, [
        ("instances", _STRING_LIST),
        ("target_groups", _STRING_LIST),
        ], _NEVAC_RESULT),
    }

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()

  def _ValidateResult(self):
    """Process the allocator results.

    This will process and, if successful, save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not self._result_check(self.result):
      raise errors.OpExecError("Iallocator returned invalid result,"
                               " expected %s, got %s" %
                               (self._result_check, self.result),
                               errors.ECODE_INVAL)

    if self.mode == constants.IALLOCATOR_MODE_RELOC:
      assert self.relocate_from is not None
      assert self.required_nodes == 1

      node2group = dict((name, ndata["group"])
                        for (name, ndata) in self.in_data["nodes"].items())

      fn = compat.partial(self._NodesToGroups, node2group,
                          self.in_data["nodegroups"])

      instance = self.cfg.GetInstanceInfo(self.name)
      request_groups = fn(self.relocate_from + [instance.primary_node])
      result_groups = fn(rdict["result"] + [instance.primary_node])

      if self.success and not set(result_groups).issubset(request_groups):
        raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
                                 " differ from original groups (%s)" %
                                 (utils.CommaJoin(result_groups),
                                  utils.CommaJoin(request_groups)))

    elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES

    self.out_data = rdict
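
  # Minimal reply accepted by _ValidateResult for an allocation request
  # (illustrative): {"success": true, "info": "...", "result": ["node1"]},
  # where "result" must additionally satisfy the mode's result check
  # (ht.TList for allocations, _NEVAC_RESULT for evacuations).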

  @staticmethod
  def _NodesToGroups(node2group, groups, nodes):
    """Returns a list of unique group names for a list of nodes.

    @type node2group: dict
    @param node2group: Map from node name to group UUID
    @type groups: dict
    @param groups: Group information
    @type nodes: list
    @param nodes: Node names

    """
    result = set()

    for node in nodes:
      try:
        group_uuid = node2group[node]
      except KeyError:
        # Ignore unknown node
        pass
      else:
        try:
          group = groups[group_uuid]
        except KeyError:
          # Can't find group, let's use UUID
          group_name = group_uuid
        else:
          group_name = group["name"]

        result.add(group_name)

    return sorted(result)
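
  # Worked example (assumed data): with node2group = {"n1": "u1", "n2": "u2"}
  # and groups = {"u1": {"name": "g1"}}, _NodesToGroups(node2group, groups,
  # ["n1", "n2", "n3"]) returns ["g1", "u2"]: "n3" is ignored as unknown and
  # "u2" falls back to its UUID because the group data is missing.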


class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode
    of the test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["memory", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            constants.IDISK_SIZE not in row or
            not isinstance(row[constants.IDISK_SIZE], int) or
            constants.IDISK_MODE not in row or
            row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = \
        list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
    elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
                          constants.IALLOCATOR_MODE_NODE_EVAC):
      if not self.op.instances:
        raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
      self.op.instances = _GetWantedInstances(self, self.op.instances)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)

    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       memory=self.op.memory,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       target_groups=self.op.target_groups)
    elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       evac_mode=self.op.evac_mode)
    else:
      raise errors.ProgrammerError("Unhandled mode '%s' in"
                                   " LUTestAllocator.Exec" % self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result


#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  constants.QR_OS: _OsQuery,
  }

assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP


def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_VIA_OP}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)
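
# Illustrative use (an assumption; the opcode-side callers live elsewhere in
# this module): a query LU looks up the implementation for a resource and
# instantiates it to answer an OpQuery request, roughly:
#
#   impl = _GetQueryImplementation(constants.QR_NODE)   # -> _NodeQuery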