# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Module implementing the master-side code."""

# pylint: disable=W0201,C0302

# W0201 since most LU attributes are defined in CheckPrereq or similar
# functions

# C0302: since we have waaaay too many lines in this module

import OpenSSL

import copy
import itertools
import logging
import re

from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import hypervisor
from ganeti import locking
from ganeti import constants
from ganeti import objects
from ganeti import serializer
from ganeti import ssconf
from ganeti import uidpool
from ganeti import compat
from ganeti import masterd
from ganeti import netutils
from ganeti import query
from ganeti import qlang
from ganeti import opcodes

from ganeti import rpc

import ganeti.masterd.instance # pylint: disable=W0611


#: Size of DRBD meta block device
_DRBD_META_SIZE = 128

INSTANCE_UP = [constants.ADMINST_UP]
INSTANCE_DOWN = [constants.ADMINST_DOWN]
INSTANCE_OFFLINE = [constants.ADMINST_OFFLINE]
INSTANCE_ONLINE = [constants.ADMINST_DOWN, constants.ADMINST_UP]
INSTANCE_NOT_RUNNING = [constants.ADMINST_DOWN, constants.ADMINST_OFFLINE]


class ResultWithJobs:
  """Data container for LU results with jobs.

  Instances of this class returned from L{LogicalUnit.Exec} will be recognized
  by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
  contained in the C{jobs} attribute and include the job IDs in the opcode
  result.

  """
  def __init__(self, jobs, **kwargs):
    """Initializes this class.

    Additional return values can be specified as keyword arguments.

    @type jobs: list of lists of L{opcodes.OpCode}
    @param jobs: A list of lists of opcode objects

    """
    self.jobs = jobs
    self.other = kwargs
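

# Illustrative sketch (not part of the original module): how an LU's Exec
# method might hand follow-up jobs back to the processor via ResultWithJobs.
# The opcode and the "moved" keyword are hypothetical example values.
#
#   def Exec(self, feedback_fn):
#     jobs = [[opcodes.OpNodeSetParams(node_name="node1.example.com",
#                                      offline=True)]]
#     # "moved" ends up in the opcode result next to the submitted job IDs
#     return ResultWithJobs(jobs, moved=True)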


class LogicalUnit(object):
  """Logical Unit base class.

  Subclasses must follow these rules:
    - implement ExpandNames
    - implement CheckPrereq (except when tasklets are used)
    - implement Exec (except when tasklets are used)
    - implement BuildHooksEnv
    - implement BuildHooksNodes
    - redefine HPATH and HTYPE
    - optionally redefine their run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  Note that all commands require root permissions.

  @ivar dry_run_result: the value (if any) that will be returned to the caller
      in dry-run mode (signalled by opcode dry_run parameter)

  """
  HPATH = None
  HTYPE = None
  REQ_BGL = True

  def __init__(self, processor, op, context, rpc_runner):
    """Constructor for LogicalUnit.

    This needs to be overridden in derived classes in order to check op
    validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.glm = context.glm
    # readability alias
    self.owned_locks = context.glm.list_owned
    self.context = context
    self.rpc = rpc_runner
    # Dicts used to declare locking needs to mcpu
    self.needed_locks = None
    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
    self.add_locks = {}
    self.remove_locks = {}
    # Used to force good behavior when calling helper functions
    self.recalculate_locks = {}
    # logging
    self.Log = processor.Log # pylint: disable=C0103
    self.LogWarning = processor.LogWarning # pylint: disable=C0103
    self.LogInfo = processor.LogInfo # pylint: disable=C0103
    self.LogStep = processor.LogStep # pylint: disable=C0103
    # support for dry-run
    self.dry_run_result = None
    # support for generic debug attribute
    if (not hasattr(self.op, "debug_level") or
        not isinstance(self.op.debug_level, int)):
      self.op.debug_level = 0

    # Tasklets
    self.tasklets = None

    # Validate opcode parameters and set defaults
    self.op.Validate(True)

    self.CheckArguments()

  def CheckArguments(self):
    """Check syntactic validity for the opcode arguments.

    This method is for doing a simple syntactic check and ensure
    validity of opcode parameters, without any cluster-related
    checks. While the same can be accomplished in ExpandNames and/or
    CheckPrereq, doing these separately is better because:

      - ExpandNames is left as a purely lock-related function
      - CheckPrereq is run after we have acquired locks (and possibly
        waited for them)

    The function is allowed to change the self.op attribute so that
    later methods need no longer worry about missing parameters.

    """

  def ExpandNames(self):
    """Expand names for this LU.

    This method is called before starting to execute the opcode, and it should
    update all the parameters of the opcode to their canonical form (e.g. a
    short node name must be fully expanded after this method has successfully
    completed). This way locking, hooks, logging, etc. can work correctly.

    LUs which implement this method must also populate the self.needed_locks
    member, as a dict with lock levels as keys, and a list of needed lock names
    as values. Rules:

      - use an empty dict if you don't need any lock
      - if you don't need any lock at a particular level omit that level
      - don't put anything for the BGL level
      - if you want all locks at a level use locking.ALL_SET as a value

    If you need to share locks (rather than acquire them exclusively) at one
    level you can modify self.share_locks, setting a true value (usually 1) for
    that level. By default locks are not shared.

    This function can also define a list of tasklets, which then will be
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.com'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # The implementation of this method is mandatory only if the new LU is
    # concurrent, so that old LUs don't need to be changed all at the same
    # time.
    if self.REQ_BGL:
      self.needed_locks = {} # Exclusive LUs don't need locks.
    else:
      raise NotImplementedError
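
  # Illustrative sketch (not part of the original module): an ExpandNames
  # implementation for a hypothetical concurrent LU that only reads node
  # data and therefore acquires all node locks in shared mode.
  #
  #   def ExpandNames(self):
  #     self.needed_locks = {
  #       locking.LEVEL_NODE: locking.ALL_SET,
  #       }
  #     self.share_locks[locking.LEVEL_NODE] = 1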

  def DeclareLocks(self, level):
    """Declare LU locking needs for a level

    While most LUs can just declare their locking needs at ExpandNames time,
    sometimes there's the need to calculate some locks after having acquired
    the ones before. This function is called just before acquiring locks at a
    particular level, but after acquiring the ones at lower levels, and permits
    such calculations. It can be used to modify self.needed_locks, and by
    default it does nothing.

    This function is only called if you have something already set in
    self.needed_locks for the level.

    @param level: Locking level which is going to be locked
    @type level: member of ganeti.locking.LEVELS

    """

  def CheckPrereq(self):
    """Check prerequisites for this LU.

    This method should check that the prerequisites for the execution
    of this LU are fulfilled. It can do internode communication, but
    it should be idempotent - no cluster or system changes are
    allowed.

    The method should raise errors.OpPrereqError in case something is
    not fulfilled. Its return value is ignored.

    This method should also update all the parameters of the opcode to
    their canonical form if it hasn't been done by ExpandNames before.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Checking prerequisites for tasklet %s/%s",
                      idx + 1, len(self.tasklets))
        tl.CheckPrereq()
    else:
      pass

  def Exec(self, feedback_fn):
    """Execute the LU.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in
    code, or expected.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
        tl.Exec(feedback_fn)
    else:
      raise NotImplementedError

  def BuildHooksEnv(self):
    """Build hooks environment for this LU.

    @rtype: dict
    @return: Dictionary containing the environment that will be used for
      running the hooks for this LU. The keys of the dict must not be prefixed
      with "GANETI_"--that'll be added by the hooks runner. The hooks runner
      will extend the environment with additional variables. If no environment
      should be defined, an empty dictionary should be returned (not C{None}).
    @note: If the C{HPATH} attribute of the LU class is C{None}, this function
      will not be called.

    """
    raise NotImplementedError

  def BuildHooksNodes(self):
    """Build list of nodes to run LU's hooks.

    @rtype: tuple; (list, list)
    @return: Tuple containing a list of node names on which the hook
      should run before the execution and a list of node names on which the
      hook should run after the execution. No nodes should be returned as an
      empty list (and not None).
    @note: If the C{HPATH} attribute of the LU class is C{None}, this function
      will not be called.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    This method is called every time a hooks phase is executed, and notifies
    the Logical Unit about the hooks' result. The LU can then use it to alter
    its result based on the hooks. By default the method does nothing and the
    previous result is passed back unchanged but any LU can define it if it
    wants to use the local cluster hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None
    @return: the new Exec result, based on the previous result

    """
    # API must be kept, thus we ignore the unused argument and "could
    # be a function" warnings
    # pylint: disable=W0613,R0201
    return lu_result

  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs that work on an instance take its name in self.op.instance_name
    and need to expand it and then declare the expanded name for locking. This
    function does it, and then updates self.op.instance_name to the expanded
    name. It also initializes needed_locks as a dict, if this hasn't been done
    before.

    """
    if self.needed_locks is None:
      self.needed_locks = {}
    else:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    self.op.instance_name = _ExpandInstanceName(self.cfg,
                                                self.op.instance_name)
    self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name

  def _LockInstancesNodes(self, primary_only=False,
                          level=locking.LEVEL_NODE):
    """Helper function to declare instances' nodes for locking.

    This function should be called after locking one or more instances to lock
    their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
    with all primary or secondary nodes for instances already locked and
    present in self.needed_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to just lock some instance's nodes, or
    to just lock primaries or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances
    @param level: Which lock level to use for locking nodes

    """
    assert level in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we really have been called with the instance locks held

    # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
    # future we might want to have different behaviors depending on the value
    # of self.recalculate_locks[locking.LEVEL_NODE]
    wanted_nodes = []
    locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
    for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
      wanted_nodes.append(instance.primary_node)
      if not primary_only:
        wanted_nodes.extend(instance.secondary_nodes)

    if self.recalculate_locks[level] == constants.LOCKS_REPLACE:
      self.needed_locks[level] = wanted_nodes
    elif self.recalculate_locks[level] == constants.LOCKS_APPEND:
      self.needed_locks[level].extend(wanted_nodes)
    else:
      raise errors.ProgrammerError("Unknown recalculation mode")

    del self.recalculate_locks[level]
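
  # Illustrative sketch (not part of the original module): how a derived LU
  # might combine ExpandNames and DeclareLocks with _LockInstancesNodes. The
  # LOCKS_REPLACE mode shown is an example choice.
  #
  #   def ExpandNames(self):
  #     self._ExpandAndLockInstance()
  #     self.needed_locks[locking.LEVEL_NODE] = []
  #     self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
  #
  #   def DeclareLocks(self, level):
  #     if level == locking.LEVEL_NODE:
  #       self._LockInstancesNodes()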


class NoHooksLU(LogicalUnit): # pylint: disable=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLU.

    This just raises an error.

    """
    raise AssertionError("BuildHooksEnv called for NoHooksLUs")

  def BuildHooksNodes(self):
    """Empty BuildHooksNodes for NoHooksLU.

    """
    raise AssertionError("BuildHooksNodes called for NoHooksLU")


class Tasklet(object):
  """Tasklet base class.

  Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
  they can mix legacy code with tasklets. Locking needs to be done in the LU,
  tasklets know nothing about locks.

  Subclasses must follow these rules:
    - Implement CheckPrereq
    - Implement Exec

  """
  def __init__(self, lu):
    """Initializes this class.

    """
    self.lu = lu

    # Shortcuts
    self.cfg = lu.cfg
    self.rpc = lu.rpc

  def CheckPrereq(self):
    """Check prerequisites for this tasklet.

    This method should check whether the prerequisites for the execution of
    this tasklet are fulfilled. It can do internode communication, but it
    should be idempotent - no cluster or system changes are allowed.

    The method should raise errors.OpPrereqError in case something is not
    fulfilled. Its return value is ignored.

    This method should also update all parameters to their canonical form if it
    hasn't been done before.

    """
    pass

  def Exec(self, feedback_fn):
    """Execute the tasklet.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in code, or
    expected.

    """
    raise NotImplementedError
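

# Illustrative sketch (not part of the original module): a minimal tasklet.
# The RPC call and LU wiring below are example assumptions; locking would
# still be declared by the owning LU, not the tasklet.
#
#   class _ExamplePowercycleTasklet(Tasklet):
#     def __init__(self, lu, node_name):
#       Tasklet.__init__(self, lu)
#       self.node_name = node_name
#
#     def CheckPrereq(self):
#       _CheckNodeOnline(self.lu, self.node_name)
#
#     def Exec(self, feedback_fn):
#       result = self.rpc.call_node_powercycle(self.node_name,
#                                              self.cfg.GetHypervisorType())
#       result.Raise("Failed to powercycle node %s" % self.node_name)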


class _QueryBase(object):
  """Base for query utility classes.

  """
  #: Attribute holding field definitions
  FIELDS = None

  def __init__(self, qfilter, fields, use_locking):
    """Initializes this class.

    """
    self.use_locking = use_locking

    self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
                             namefield="name")
    self.requested_data = self.query.RequestedData()
    self.names = self.query.RequestedNames()

    # Sort only if no names were requested
    self.sort_by_name = not self.names

    self.do_locking = None
    self.wanted = None

  def _GetNames(self, lu, all_names, lock_level):
    """Helper function to determine names asked for in the query.

    """
    if self.do_locking:
      names = lu.owned_locks(lock_level)
    else:
      names = all_names

    if self.wanted == locking.ALL_SET:
      assert not self.names
      # caller didn't specify names, so ordering is not important
      return utils.NiceSort(names)

    # caller specified names and we must keep the same order
    assert self.names
    assert not self.do_locking or lu.glm.is_owned(lock_level)

    missing = set(self.wanted).difference(names)
    if missing:
      raise errors.OpExecError("Some items were removed before retrieving"
                               " their data: %s" % missing)

    # Return expanded names
    return self.wanted

  def ExpandNames(self, lu):
    """Expand names for this query.

    See L{LogicalUnit.ExpandNames}.

    """
    raise NotImplementedError()

  def DeclareLocks(self, lu, level):
    """Declare locks for this query.

    See L{LogicalUnit.DeclareLocks}.

    """
    raise NotImplementedError()

  def _GetQueryData(self, lu):
    """Collects all data for this query.

    @return: Query data object

    """
    raise NotImplementedError()

  def NewStyleQuery(self, lu):
    """Collect data and execute query.

    """
    return query.GetQueryResponse(self.query, self._GetQueryData(lu),
                                  sort_by_name=self.sort_by_name)

  def OldStyleQuery(self, lu):
    """Collect data and execute query.

    """
    return self.query.OldStyleQuery(self._GetQueryData(lu),
                                    sort_by_name=self.sort_by_name)


def _ShareAll():
  """Returns a dict declaring all lock levels shared.

  """
  return dict.fromkeys(locking.LEVELS, 1)


def _MakeLegacyNodeInfo(data):
  """Formats the data returned by L{rpc.RpcRunner.call_node_info}.

  Converts the data into a single dictionary. This is fine for most use cases,
  but some require information from more than one volume group or hypervisor.

  """
  (bootid, (vg_info, ), (hv_info, )) = data

  return utils.JoinDisjointDicts(utils.JoinDisjointDicts(vg_info, hv_info), {
    "bootid": bootid,
    })


def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
  """Checks if the owned node groups are still correct for an instance.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type instance_name: string
  @param instance_name: Instance name
  @type owned_groups: set or frozenset
  @param owned_groups: List of currently owned node groups

  """
  inst_groups = cfg.GetInstanceNodeGroups(instance_name)

  if not owned_groups.issuperset(inst_groups):
    raise errors.OpPrereqError("Instance %s's node groups changed since"
                               " locks were acquired, current groups are"
                               " '%s', owning groups '%s'; retry the"
                               " operation" %
                               (instance_name,
                                utils.CommaJoin(inst_groups),
                                utils.CommaJoin(owned_groups)),
                               errors.ECODE_STATE)

  return inst_groups


def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
  """Checks if the instances in a node group are still correct.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type group_uuid: string
  @param group_uuid: Node group UUID
  @type owned_instances: set or frozenset
  @param owned_instances: List of currently owned instances

  """
  wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
  if owned_instances != wanted_instances:
    raise errors.OpPrereqError("Instances in node group '%s' changed since"
                               " locks were acquired, wanted '%s', have '%s';"
                               " retry the operation" %
                               (group_uuid,
                                utils.CommaJoin(wanted_instances),
                                utils.CommaJoin(owned_instances)),
                               errors.ECODE_STATE)

  return wanted_instances


def _SupportsOob(cfg, node):
  """Tells if node supports OOB.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type node: L{objects.Node}
  @param node: The node
  @return: The OOB script if supported or an empty string otherwise

  """
  return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]


def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: list of node names or None for all nodes
  @rtype: list
  @return: the list of nodes, sorted
  @raise errors.ProgrammerError: if the nodes parameter is wrong type

  """
  if nodes:
    return [_ExpandNodeName(lu.cfg, name) for name in nodes]

  return utils.NiceSort(lu.cfg.GetNodeList())


def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names or None for all instances
  @rtype: list
  @return: the list of instances, sorted
  @raise errors.OpPrereqError: if the instances parameter is wrong type
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if instances:
    wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
  else:
    wanted = utils.NiceSort(lu.cfg.GetInstanceList())
  return wanted


def _GetUpdatedParams(old_params, update_dict,
                      use_default=True, use_none=False):
  """Return the new version of a parameter dictionary.

  @type old_params: dict
  @param old_params: old parameters
  @type update_dict: dict
  @param update_dict: dict containing new parameter values, or
      constants.VALUE_DEFAULT to reset the parameter to its default
      value
  @type use_default: boolean
  @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
      values as 'to be deleted' values
  @type use_none: boolean
  @param use_none: whether to recognise C{None} values as 'to be
      deleted' values
  @rtype: dict
  @return: the new parameter dictionary

  """
  params_copy = copy.deepcopy(old_params)
  for key, val in update_dict.iteritems():
    if ((use_default and val == constants.VALUE_DEFAULT) or
        (use_none and val is None)):
      try:
        del params_copy[key]
      except KeyError:
        pass
    else:
      params_copy[key] = val
  return params_copy
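

# Illustrative sketch (not part of the original module): what
# _GetUpdatedParams does with the default flags. The parameter names and
# values are made up.
#
#   old = {"kernel_path": "/boot/vmlinuz", "root_path": "/dev/sda1"}
#   upd = {"root_path": constants.VALUE_DEFAULT, "serial_console": True}
#   _GetUpdatedParams(old, upd)
#   # => {"kernel_path": "/boot/vmlinuz", "serial_console": True}
#   # "root_path" was reset to its default (removed), the new key was added.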


def _GetUpdatedIPolicy(old_ipolicy, new_ipolicy, group_policy=False):
  """Return the new version of an instance policy.

  @param group_policy: whether this policy applies to a group and thus
    we should support removal of policy entries

  """
  use_none = use_default = group_policy
  ipolicy = copy.deepcopy(old_ipolicy)
  for key, value in new_ipolicy.items():
    if key not in constants.IPOLICY_ALL_KEYS:
      raise errors.OpPrereqError("Invalid key in new ipolicy: %s" % key,
                                 errors.ECODE_INVAL)
    if key in constants.IPOLICY_PARAMETERS:
      utils.ForceDictType(value, constants.ISPECS_PARAMETER_TYPES)
      ipolicy[key] = _GetUpdatedParams(old_ipolicy.get(key, {}), value,
                                       use_none=use_none,
                                       use_default=use_default)
    else:
      # FIXME: we assume all others are lists; this should be redone
      # in a nicer way
      if not value or value == [constants.VALUE_DEFAULT]:
        if group_policy:
          del ipolicy[key]
        else:
          raise errors.OpPrereqError("Can't unset ipolicy attribute '%s'"
                                     " on the cluster" % key,
                                     errors.ECODE_INVAL)
      else:
        ipolicy[key] = list(value)
  try:
    objects.InstancePolicy.CheckParameterSyntax(ipolicy)
  except errors.ConfigurationError, err:
    raise errors.OpPrereqError("Invalid instance policy: %s" % err,
                               errors.ECODE_INVAL)
  return ipolicy


def _UpdateAndVerifySubDict(base, updates, type_check):
  """Updates and verifies a dict with sub dicts of the same type.

  @param base: The dict with the old data
  @param updates: The dict with the new data
  @param type_check: Dict suitable to ForceDictType to verify correct types
  @returns: A new dict with updated and verified values

  """
  def fn(old, value):
    new = _GetUpdatedParams(old, value)
    utils.ForceDictType(new, type_check)
    return new

  ret = copy.deepcopy(base)
  ret.update(dict((key, fn(base.get(key, {}), value))
                  for key, value in updates.items()))
  return ret


def _MergeAndVerifyHvState(op_input, obj_input):
  """Combines the hv state from an opcode with the one of the object.

  @param op_input: The input dict from the opcode
  @param obj_input: The input dict from the objects
  @return: The verified and updated dict

  """
  if op_input:
    invalid_hvs = set(op_input) - constants.HYPER_TYPES
    if invalid_hvs:
      raise errors.OpPrereqError("Invalid hypervisor(s) in hypervisor state:"
                                 " %s" % utils.CommaJoin(invalid_hvs),
                                 errors.ECODE_INVAL)
    if obj_input is None:
      obj_input = {}
    type_check = constants.HVSTS_PARAMETER_TYPES
    return _UpdateAndVerifySubDict(obj_input, op_input, type_check)

  return None


def _MergeAndVerifyDiskState(op_input, obj_input):
  """Combines the disk state from an opcode with the one of the object.

  @param op_input: The input dict from the opcode
  @param obj_input: The input dict from the objects
  @return: The verified and updated dict

  """
  if op_input:
    invalid_dst = set(op_input) - constants.DS_VALID_TYPES
    if invalid_dst:
      raise errors.OpPrereqError("Invalid storage type(s) in disk state: %s" %
                                 utils.CommaJoin(invalid_dst),
                                 errors.ECODE_INVAL)
    type_check = constants.DSS_PARAMETER_TYPES
    if obj_input is None:
      obj_input = {}
    return dict((key, _UpdateAndVerifySubDict(obj_input.get(key, {}), value,
                                              type_check))
                for key, value in op_input.items())

  return None


def _ReleaseLocks(lu, level, names=None, keep=None):
  """Releases locks owned by an LU.

  @type lu: L{LogicalUnit}
  @param level: Lock level
  @type names: list or None
  @param names: Names of locks to release
  @type keep: list or None
  @param keep: Names of locks to retain

  """
  assert not (keep is not None and names is not None), \
    "Only one of the 'names' and the 'keep' parameters can be given"

  if names is not None:
    should_release = names.__contains__
  elif keep:
    should_release = lambda name: name not in keep
  else:
    should_release = None

  owned = lu.owned_locks(level)
  if not owned:
    # Not owning any lock at this level, do nothing
    pass

  elif should_release:
    retain = []
    release = []

    # Determine which locks to release
    for name in owned:
      if should_release(name):
        release.append(name)
      else:
        retain.append(name)

    assert len(lu.owned_locks(level)) == (len(retain) + len(release))

    # Release just some locks
    lu.glm.release(level, names=release)

    assert frozenset(lu.owned_locks(level)) == frozenset(retain)
  else:
    # Release everything
    lu.glm.release(level)

    assert not lu.glm.is_owned(level), "No locks should be owned"
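

# Illustrative sketch (not part of the original module): releasing all node
# locks except those of the instance's own nodes once they are known. The
# variable names are example values.
#
#   _ReleaseLocks(self, locking.LEVEL_NODE,
#                 keep=[instance.primary_node] +
#                      list(instance.secondary_nodes))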


def _MapInstanceDisksToNodes(instances):
  """Creates a map from (node, volume) to instance name.

  @type instances: list of L{objects.Instance}
  @rtype: dict; tuple of (node name, volume name) as key, instance name as value

  """
  return dict(((node, vol), inst.name)
              for inst in instances
              for (node, vols) in inst.MapLVsByNode().items()
              for vol in vols)
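

# Illustrative sketch (not part of the original module): the shape of the
# result, with made-up node, volume and instance names.
#
#   {("node1.example.com", "xenvg/disk0"): "inst1.example.com",
#    ("node2.example.com", "xenvg/disk0"): "inst1.example.com"}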


def _RunPostHook(lu, node_name):
  """Runs the post-hook for an opcode on a single node.

  """
  hm = lu.proc.BuildHooksManager(lu)
  try:
    hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
  except:
    # pylint: disable=W0702
    lu.LogWarning("Errors occurred running hooks on %s" % node_name)


def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set

  """
  f = utils.FieldSet()
  f.Extend(static)
  f.Extend(dynamic)

  delta = f.NonMatching(selected)
  if delta:
    raise errors.OpPrereqError("Unknown output fields selected: %s"
                               % ",".join(delta), errors.ECODE_INVAL)


def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.

  This will ensure that instances don't get customised versions of
  global params.

  """
  used_globals = constants.HVC_GLOBALS.intersection(params)
  if used_globals:
    msg = ("The following hypervisor parameters are global and cannot"
           " be customized at instance level, please modify them at"
           " cluster level: %s" % utils.CommaJoin(used_globals))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)


def _CheckNodeOnline(lu, node, msg=None):
  """Ensure that a given node is online.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param msg: if passed, should be a message to replace the default one
  @raise errors.OpPrereqError: if the node is offline

  """
  if msg is None:
    msg = "Can't use offline node"
  if lu.cfg.GetNodeInfo(node).offline:
    raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)


def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  if lu.cfg.GetNodeInfo(node).drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_STATE)


def _CheckNodeVmCapable(lu, node):
  """Ensure that a given node is vm capable.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is not vm capable

  """
  if not lu.cfg.GetNodeInfo(node).vm_capable:
    raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
                               errors.ECODE_STATE)


def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node is not supporting the OS

  """
  result = lu.rpc.call_os_get(node, os_name)
  result.Raise("OS '%s' not in supported OS list for node %s" %
               (os_name, node),
               prereq=True, ecode=errors.ECODE_INVAL)
  if not force_variant:
    _CheckOSVariant(result.payload, os_name)


def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
  """Ensure that a node has the given secondary ip.

  @type lu: L{LogicalUnit}
  @param lu: the LU on behalf of which we make the check
  @type node: string
  @param node: the node to check
  @type secondary_ip: string
  @param secondary_ip: the ip to check
  @type prereq: boolean
  @param prereq: whether to throw a prerequisite or an execute error
  @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
  @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False

  """
  result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
  result.Raise("Failure checking secondary ip on node %s" % node,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)
  if not result.payload:
    msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
           " please fix and re-run this command" % secondary_ip)
    if prereq:
      raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
    else:
      raise errors.OpExecError(msg)


def _GetClusterDomainSecret():
  """Reads the cluster domain secret.

  """
  return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
                               strict=True)


def _CheckInstanceState(lu, instance, req_states, msg=None):
  """Ensure that an instance is in one of the required states.

  @param lu: the LU on behalf of which we make the check
  @param instance: the instance to check
  @param msg: if passed, should be a message to replace the default one
  @raise errors.OpPrereqError: if the instance is not in the required state

  """
  if msg is None:
    msg = "can't use instance from outside %s states" % ", ".join(req_states)
  if instance.admin_state not in req_states:
    raise errors.OpPrereqError("Instance '%s' is marked to be %s, %s" %
                               (instance.name, instance.admin_state, msg),
                               errors.ECODE_STATE)

  if constants.ADMINST_UP not in req_states:
    pnode = instance.primary_node
    ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
    ins_l.Raise("Can't contact node %s for instance information" % pnode,
                prereq=True, ecode=errors.ECODE_ENVIRON)

    if instance.name in ins_l.payload:
      raise errors.OpPrereqError("Instance %s is running, %s" %
                                 (instance.name, msg), errors.ECODE_STATE)


def _ComputeMinMaxSpec(name, ipolicy, value):
  """Computes if value is in the desired range.

  @param name: name of the parameter for which we perform the check
  @param ipolicy: dictionary containing min, max and std values
  @param value: actual value that we want to use
  @return: None or element not meeting the criteria

  """
  if value in [None, constants.VALUE_AUTO]:
    return None
  max_v = ipolicy[constants.ISPECS_MAX].get(name, value)
  min_v = ipolicy[constants.ISPECS_MIN].get(name, value)
  if value > max_v or min_v > value:
    return ("%s value %s is not in range [%s, %s]" %
            (name, value, min_v, max_v))
  return None
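

# Illustrative sketch (not part of the original module): a worked example of
# the range check above, with made-up policy numbers.
#
#   ipolicy = {constants.ISPECS_MIN: {constants.ISPEC_MEM_SIZE: 128},
#              constants.ISPECS_MAX: {constants.ISPEC_MEM_SIZE: 32768}}
#   _ComputeMinMaxSpec(constants.ISPEC_MEM_SIZE, ipolicy, 64)
#   # => "<name> value 64 is not in range [128, 32768]"
#   _ComputeMinMaxSpec(constants.ISPEC_MEM_SIZE, ipolicy, 256)
#   # => None (the value is acceptable)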


def _ComputeIPolicySpecViolation(ipolicy, mem_size, cpu_count, disk_count,
                                 nic_count, disk_sizes,
                                 _compute_fn=_ComputeMinMaxSpec):
  """Verifies ipolicy against provided specs.

  @type ipolicy: dict
  @param ipolicy: The ipolicy
  @type mem_size: int
  @param mem_size: The memory size
  @type cpu_count: int
  @param cpu_count: Used cpu cores
  @type disk_count: int
  @param disk_count: Number of disks used
  @type nic_count: int
  @param nic_count: Number of nics used
  @type disk_sizes: list of ints
  @param disk_sizes: Disk sizes of used disk (len must match C{disk_count})
  @param _compute_fn: The compute function (unittest only)
  @return: A list of violations, or an empty list if no violations are found

  """
  assert disk_count == len(disk_sizes)

  test_settings = [
    (constants.ISPEC_MEM_SIZE, mem_size),
    (constants.ISPEC_CPU_COUNT, cpu_count),
    (constants.ISPEC_DISK_COUNT, disk_count),
    (constants.ISPEC_NIC_COUNT, nic_count),
    ] + map((lambda d: (constants.ISPEC_DISK_SIZE, d)), disk_sizes)

  return filter(None,
                (_compute_fn(name, ipolicy, value)
                 for (name, value) in test_settings))
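

# Illustrative sketch (not part of the original module): checking an explicit
# spec against a policy. The argument values are made up.
#
#   violations = _ComputeIPolicySpecViolation(ipolicy,
#                                             mem_size=512, cpu_count=1,
#                                             disk_count=2, nic_count=1,
#                                             disk_sizes=[1024, 2048])
#   if violations:
#     raise errors.OpPrereqError("Instance violates policy: %s" %
#                                utils.CommaJoin(violations),
#                                errors.ECODE_INVAL)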


def _ComputeIPolicyInstanceViolation(ipolicy, instance,
                                     _compute_fn=_ComputeIPolicySpecViolation):
  """Compute if instance meets the specs of ipolicy.

  @type ipolicy: dict
  @param ipolicy: The ipolicy to verify against
  @type instance: L{objects.Instance}
  @param instance: The instance to verify
  @param _compute_fn: The function to verify ipolicy (unittest only)
  @see: L{_ComputeIPolicySpecViolation}

  """
  mem_size = instance.beparams.get(constants.BE_MAXMEM, None)
  cpu_count = instance.beparams.get(constants.BE_VCPUS, None)
  disk_count = len(instance.disks)
  disk_sizes = [disk.size for disk in instance.disks]
  nic_count = len(instance.nics)

  return _compute_fn(ipolicy, mem_size, cpu_count, disk_count, nic_count,
                     disk_sizes)


def _ComputeIPolicyInstanceSpecViolation(ipolicy, instance_spec,
                                         _compute_fn=_ComputeIPolicySpecViolation):
  """Compute if instance specs meets the specs of ipolicy.

  @type ipolicy: dict
  @param ipolicy: The ipolicy to verify against
  @type instance_spec: dict
  @param instance_spec: The instance spec to verify
  @param _compute_fn: The function to verify ipolicy (unittest only)
  @see: L{_ComputeIPolicySpecViolation}

  """
  mem_size = instance_spec.get(constants.ISPEC_MEM_SIZE, None)
  cpu_count = instance_spec.get(constants.ISPEC_CPU_COUNT, None)
  disk_count = instance_spec.get(constants.ISPEC_DISK_COUNT, 0)
  disk_sizes = instance_spec.get(constants.ISPEC_DISK_SIZE, [])
  nic_count = instance_spec.get(constants.ISPEC_NIC_COUNT, 0)

  return _compute_fn(ipolicy, mem_size, cpu_count, disk_count, nic_count,
                     disk_sizes)


def _ComputeIPolicyNodeViolation(ipolicy, instance, current_group,
                                 target_group,
                                 _compute_fn=_ComputeIPolicyInstanceViolation):
  """Compute if instance meets the specs of the new target group.

  @param ipolicy: The ipolicy to verify
  @param instance: The instance object to verify
  @param current_group: The current group of the instance
  @param target_group: The new group of the instance
  @param _compute_fn: The function to verify ipolicy (unittest only)
  @see: L{_ComputeIPolicySpecViolation}

  """
  if current_group == target_group:
    return []
  else:
    return _compute_fn(ipolicy, instance)


def _CheckTargetNodeIPolicy(lu, ipolicy, instance, node, ignore=False,
                            _compute_fn=_ComputeIPolicyNodeViolation):
  """Checks that the target node is correct in terms of instance policy.

  @param ipolicy: The ipolicy to verify
  @param instance: The instance object to verify
  @param node: The new node to relocate
  @param ignore: Ignore violations of the ipolicy
  @param _compute_fn: The function to verify ipolicy (unittest only)
  @see: L{_ComputeIPolicySpecViolation}

  """
  primary_node = lu.cfg.GetNodeInfo(instance.primary_node)
  res = _compute_fn(ipolicy, instance, primary_node.group, node.group)

  if res:
    msg = ("Instance does not meet target node group's (%s) instance"
           " policy: %s") % (node.group, utils.CommaJoin(res))
    if ignore:
      lu.LogWarning(msg)
    else:
      raise errors.OpPrereqError(msg, errors.ECODE_INVAL)


def _ComputeNewInstanceViolations(old_ipolicy, new_ipolicy, instances):
  """Computes a set of any instances that would violate the new ipolicy.

  @param old_ipolicy: The current (still in-place) ipolicy
  @param new_ipolicy: The new (to become) ipolicy
  @param instances: List of instances to verify
  @return: A list of instances which violate the new ipolicy but did
      not before

  """
  return (_ComputeViolatingInstances(new_ipolicy, instances) -
          _ComputeViolatingInstances(old_ipolicy, instances))


def _ExpandItemName(fn, name, kind):
  """Expand an item name.

  @param fn: the function to use for expansion
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  full_name = fn(name)
  if full_name is None:
    raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                               errors.ECODE_NOENT)
  return full_name


def _ExpandNodeName(cfg, name):
  """Wrapper over L{_ExpandItemName} for nodes."""
  return _ExpandItemName(cfg.ExpandNodeName, name, "Node")


def _ExpandInstanceName(cfg, name):
  """Wrapper over L{_ExpandItemName} for instance."""
  return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")


def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          minmem, maxmem, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name, tags):
  """Builds instance related env variables for hooks

  This builds the hook environment from individual variables.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: string
  @param status: the desired status of the instance
  @type minmem: string
  @param minmem: the minimum memory size of the instance
  @type maxmem: string
  @param maxmem: the maximum memory size of the instance
  @type vcpus: string
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @type tags: list
  @param tags: list of instance tags as strings
  @rtype: dict
  @return: the hook environment for this instance

  """
  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": status,
    "INSTANCE_MINMEM": minmem,
    "INSTANCE_MAXMEM": maxmem,
    # TODO(2.7) remove deprecated "memory" value
    "INSTANCE_MEMORY": maxmem,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
  }
  if nics:
    nic_count = len(nics)
    for idx, (ip, mac, mode, link) in enumerate(nics):
      if ip is None:
        ip = ""
      env["INSTANCE_NIC%d_IP" % idx] = ip
      env["INSTANCE_NIC%d_MAC" % idx] = mac
      env["INSTANCE_NIC%d_MODE" % idx] = mode
      env["INSTANCE_NIC%d_LINK" % idx] = link
      if mode == constants.NIC_MODE_BRIDGED:
        env["INSTANCE_NIC%d_BRIDGE" % idx] = link
  else:
    nic_count = 0

  env["INSTANCE_NIC_COUNT"] = nic_count

  if disks:
    disk_count = len(disks)
    for idx, (size, mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % idx] = size
      env["INSTANCE_DISK%d_MODE" % idx] = mode
  else:
    disk_count = 0

  env["INSTANCE_DISK_COUNT"] = disk_count

  if not tags:
    tags = []

  env["INSTANCE_TAGS"] = " ".join(tags)

  for source, kind in [(bep, "BE"), (hvp, "HV")]:
    for key, value in source.items():
      env["INSTANCE_%s_%s" % (kind, key)] = value

  return env
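

# Illustrative sketch (not part of the original module): a few of the keys
# produced for a single-NIC, single-disk instance (values are made up).
#
#   {"OP_TARGET": "inst1.example.com",
#    "INSTANCE_NAME": "inst1.example.com",
#    "INSTANCE_PRIMARY": "node1.example.com",
#    "INSTANCE_NIC_COUNT": 1,
#    "INSTANCE_NIC0_MAC": "aa:00:00:35:2f:77",
#    "INSTANCE_DISK_COUNT": 1,
#    "INSTANCE_DISK0_SIZE": 1024}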


def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.

  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
  value in LUInstanceQueryData.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples

  """
  hooks_nics = []
  cluster = lu.cfg.GetClusterInfo()
  for nic in nics:
    ip = nic.ip
    mac = nic.mac
    filled_params = cluster.SimpleFillNIC(nic.nicparams)
    mode = filled_params[constants.NIC_MODE]
    link = filled_params[constants.NIC_LINK]
    hooks_nics.append((ip, mac, mode, link))
  return hooks_nics


def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)
  args = {
    "name": instance.name,
    "primary_node": instance.primary_node,
    "secondary_nodes": instance.secondary_nodes,
    "os_type": instance.os,
    "status": instance.admin_state,
    "maxmem": bep[constants.BE_MAXMEM],
    "minmem": bep[constants.BE_MINMEM],
    "vcpus": bep[constants.BE_VCPUS],
    "nics": _NICListToTuple(lu, instance.nics),
    "disk_template": instance.disk_template,
    "disks": [(disk.size, disk.mode) for disk in instance.disks],
    "bep": bep,
    "hvp": hvp,
    "hypervisor_name": instance.hypervisor,
    "tags": instance.tags,
  }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args) # pylint: disable=W0142


def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  """
  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
  if mod_list:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in mod_list))
    for name in mod_list:
      lu.context.ReaddNode(name)
  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))


def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  """
  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  # the new node will increase mc_max by one, so:
  mc_should = min(mc_should + 1, cp_size)
  return mc_now < mc_should


def _CalculateGroupIPolicy(cluster, group):
  """Calculate instance policy for group.

  """
  return cluster.SimpleFillIPolicy(group.ipolicy)


def _ComputeViolatingInstances(ipolicy, instances):
  """Computes a set of instances that violate the given ipolicy.

  @param ipolicy: The ipolicy to verify
  @type instances: list of L{objects.Instance}
  @param instances: List of instances to verify
  @return: A frozenset of instance names violating the ipolicy

  """
  return frozenset([inst.name for inst in instances
                    if _ComputeIPolicyInstanceViolation(ipolicy, inst)])


def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Check that the bridges needed by a list of nics exist.

  """
  cluster = lu.cfg.GetClusterInfo()
  paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
  brlist = [params[constants.NIC_LINK] for params in paramslist
            if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
  if brlist:
    result = lu.rpc.call_bridges_exist(target_node, brlist)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)


def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  """
  if node is None:
    node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, node)


def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity

  """
  variant = objects.OS.GetVariant(name)
  if not os_obj.supported_variants:
    if variant:
      raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
                                 " passed)" % (os_obj.name, variant),
                                 errors.ECODE_INVAL)
    return
  if not variant:
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if variant not in os_obj.supported_variants:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)


def _GetNodeInstancesInner(cfg, fn):
  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]


def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)


def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name == inst.primary_node)


def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name in inst.secondary_nodes)


def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  """
  # Special case for file storage
  if storage_type == constants.ST_FILE:
    # storage.FileStorage wants a list of storage directories
    return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]

  return []


def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq):
  faulty = []

  for dev in instance.disks:
    cfg.SetDiskID(dev, node_name)

  result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks)
  result.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  for idx, bdev_status in enumerate(result.payload):
    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
      faulty.append(idx)

  return faulty


def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
  """Check the sanity of iallocator and node arguments and use the
  cluster-wide iallocator if appropriate.

  Check that at most one of (iallocator, node) is specified. If none is
  specified, then the LU's opcode's iallocator slot is filled with the
  cluster-wide default iallocator.

  @type iallocator_slot: string
  @param iallocator_slot: the name of the opcode iallocator slot
  @type node_slot: string
  @param node_slot: the name of the opcode target node slot

  """
  node = getattr(lu.op, node_slot, None)
  iallocator = getattr(lu.op, iallocator_slot, None)

  if node is not None and iallocator is not None:
    raise errors.OpPrereqError("Do not specify both an iallocator and a node",
                               errors.ECODE_INVAL)
  elif node is None and iallocator is None:
    default_iallocator = lu.cfg.GetDefaultIAllocator()
    if default_iallocator:
      setattr(lu.op, iallocator_slot, default_iallocator)
    else:
      raise errors.OpPrereqError("No iallocator or node given and no"
                                 " cluster-wide default iallocator found;"
                                 " please specify either an iallocator or a"
                                 " node, or set a cluster-wide default"
                                 " iallocator", errors.ECODE_INVAL)
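

# Illustrative sketch (not part of the original module): typical use from an
# LU's CheckArguments, with hypothetical opcode slot names.
#
#   def CheckArguments(self):
#     # After this call, at most one of self.op.iallocator and
#     # self.op.remote_node is set (or an OpPrereqError was raised).
#     _CheckIAllocatorOrNode(self, "iallocator", "remote_node")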


def _GetDefaultIAllocator(cfg, iallocator):
  """Decides on which iallocator to use.

  @type cfg: L{config.ConfigWriter}
  @param cfg: Cluster configuration object
  @type iallocator: string or None
  @param iallocator: Iallocator specified in opcode
  @rtype: string
  @return: Iallocator name

  """
  if not iallocator:
    # Use default iallocator
    iallocator = cfg.GetDefaultIAllocator()

  if not iallocator:
    raise errors.OpPrereqError("No iallocator was specified, neither in the"
                               " opcode nor as a cluster-wide default",
                               errors.ECODE_INVAL)

  return iallocator


class LUClusterPostInit(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    return ([], [self.cfg.GetMasterNode()])

  def Exec(self, feedback_fn):
    """Nothing to do.

    """
    return True


class LUClusterDestroy(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    return ([], [])

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodelist = self.cfg.GetNodeList()
    if len(nodelist) != 1 or nodelist[0] != master:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodelist) - 1),
                                 errors.ECODE_INVAL)
    instancelist = self.cfg.GetInstanceList()
    if instancelist:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instancelist),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    """
    master_params = self.cfg.GetMasterNetworkParameters()

    # Run post hooks on master node before it's removed
    _RunPostHook(self, master_params.name)

    ems = self.cfg.GetUseExternalMipScript()
    result = self.rpc.call_node_deactivate_master_ip(master_params.name,
                                                     master_params, ems)
    if result.fail_msg:
      self.LogWarning("Error disabling the master IP address: %s",
                      result.fail_msg)

    return master_params.name


def _VerifyCertificate(filename):
  """Verifies a certificate for L{LUClusterVerifyConfig}.

  @type filename: string
  @param filename: Path to PEM file

  """
  try:
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                           utils.ReadFile(filename))
  except Exception, err: # pylint: disable=W0703
    return (LUClusterVerifyConfig.ETYPE_ERROR,
            "Failed to load X509 certificate %s: %s" % (filename, err))

  (errcode, msg) = \
    utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
                                constants.SSL_CERT_EXPIRATION_ERROR)

  if msg:
    fnamemsg = "While verifying %s: %s" % (filename, msg)
  else:
    fnamemsg = None

  if errcode is None:
    return (None, fnamemsg)
  elif errcode == utils.CERT_WARNING:
    return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
  elif errcode == utils.CERT_ERROR:
    return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)

  raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)


def _GetAllHypervisorParameters(cluster, instances):
  """Compute the set of all hypervisor parameters.

  @type cluster: L{objects.Cluster}
  @param cluster: the cluster object
  @type instances: list of L{objects.Instance}
  @param instances: additional instances from which to obtain parameters
  @rtype: list of (origin, hypervisor, parameters)
  @return: a list with all parameters found, indicating the hypervisor they
       apply to, and the origin (can be "cluster", "os X", or "instance Y")

  """
  hvp_data = []

  for hv_name in cluster.enabled_hypervisors:
    hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))

  for os_name, os_hvp in cluster.os_hvp.items():
    for hv_name, hv_params in os_hvp.items():
      if hv_params:
        full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
        hvp_data.append(("os %s" % os_name, hv_name, full_params))

  # TODO: collapse identical parameter values in a single one
  for instance in instances:
    if instance.hvparams:
      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
                       cluster.FillHV(instance)))

  return hvp_data


class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt, _ = ecode
    # first complete the msg
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      if item:
        item = " " + str(item)
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # and finally report it via the feedback_fn
    self._feedback_fn("  - %s" % msg) # Mix-in. pylint: disable=E1101

  def _ErrorIf(self, cond, ecode, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    cond = (bool(cond)
            or self.op.debug_simulate_errors) # pylint: disable=E1101

    # If the error code is in the list of ignored errors, demote the error to
    # a warning
    (_, etxt, _) = ecode
    if etxt in self.op.ignore_errors: # pylint: disable=E1101
      kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING

    if cond:
      self._Error(ecode, *args, **kwargs)

    # do not mark the operation as failed for WARN cases only
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
      self.bad = self.bad or cond
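
  # Illustrative sketch (not part of the original module): how a verify LU
  # typically calls _ErrorIf. The condition, error-code constant and node
  # name are example values.
  #
  #   self._ErrorIf(test, constants.CV_ENODESSH, node,
  #                 "ssh communication with node '%s' failed", node)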


class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    jobs = []

    if self.op.group_name:
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
        ])

      # Always depend on global verification
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
                                              ignore_errors=self.op.ignore_errors,
                                              depends=depends_fn())]
                for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs): # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)


class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    """
    for item, hv_name, hv_params in hvp_data:
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisor(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError, err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    # Information can be safely retrieved as the BGL is acquired in exclusive
    # mode
    assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various tests on nodes.

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in constants.ALL_CERT_FILES:
      (errcode, msg) = _VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node.name for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_nodes:
        dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst.name)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(dangling_instances.get(node.name,
                                                ["no instances"])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(no_node_instances))

    return not self.bad
1951 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1952 """Verifies the status of a node group.
1955 HPATH = "cluster-verify"
1956 HTYPE = constants.HTYPE_CLUSTER
1959 _HOOKS_INDENT_RE = re.compile("^", re.M)
1961 class NodeImage(object):
1962 """A class representing the logical and physical status of a node.
1965 @ivar name: the node name to which this object refers
1966 @ivar volumes: a structure as returned from
1967 L{ganeti.backend.GetVolumeList} (runtime)
1968 @ivar instances: a list of running instances (runtime)
1969 @ivar pinst: list of configured primary instances (config)
1970 @ivar sinst: list of configured secondary instances (config)
1971 @ivar sbp: dictionary of {primary-node: list of instances} for all
1972 instances for which this node is secondary (config)
1973 @ivar mfree: free memory, as reported by hypervisor (runtime)
1974 @ivar dfree: free disk, as reported by the node (runtime)
1975 @ivar offline: the offline status (config)
1976 @type rpc_fail: boolean
1977     @ivar rpc_fail: whether the RPC verify call was successful (overall,
1978 not whether the individual keys were correct) (runtime)
1979 @type lvm_fail: boolean
1980 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1981 @type hyp_fail: boolean
1982 @ivar hyp_fail: whether the RPC call didn't return the instance list
1983 @type ghost: boolean
1984 @ivar ghost: whether this is a known node or not (config)
1985 @type os_fail: boolean
1986 @ivar os_fail: whether the RPC call didn't return valid OS data
1988 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1989 @type vm_capable: boolean
1990     @ivar vm_capable: whether the node can host instances
1992     """
1993     def __init__(self, offline=False, name=None, vm_capable=True):
1994       self.name = name
1995       self.volumes = {}
1996       self.instances = []
1997       self.pinst = []
1998       self.sinst = []
1999       self.sbp = {}
2000       self.mfree = 0
2001       self.dfree = 0
2002       self.offline = offline
2003       self.vm_capable = vm_capable
2004       self.rpc_fail = False
2005       self.lvm_fail = False
2006       self.hyp_fail = False
2007       self.ghost = False
2008       self.os_fail = False
2009       self.oslist = {}
2011 def ExpandNames(self):
2012 # This raises errors.OpPrereqError on its own:
2013 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
2015 # Get instances in node group; this is unsafe and needs verification later
2016 inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
2018 self.needed_locks = {
2019 locking.LEVEL_INSTANCE: inst_names,
2020 locking.LEVEL_NODEGROUP: [self.group_uuid],
2021       locking.LEVEL_NODE: [],
2022       }
2024 self.share_locks = _ShareAll()
2026 def DeclareLocks(self, level):
2027 if level == locking.LEVEL_NODE:
2028 # Get members of node group; this is unsafe and needs verification later
2029 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
2031 all_inst_info = self.cfg.GetAllInstancesInfo()
2033 # In Exec(), we warn about mirrored instances that have primary and
2034 # secondary living in separate node groups. To fully verify that
2035 # volumes for these instances are healthy, we will need to do an
2036     # extra call to their secondaries. We ensure here those nodes will
2037     # be locked.
2038 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
2039 # Important: access only the instances whose lock is owned
2040 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
2041 nodes.update(all_inst_info[inst].secondary_nodes)
2043 self.needed_locks[locking.LEVEL_NODE] = nodes
2045 def CheckPrereq(self):
2046 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
2047 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
2049 group_nodes = set(self.group_info.members)
2050 group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
2052     unlocked_nodes = \
2053       group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
2055     unlocked_instances = \
2056       group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
2058     if unlocked_nodes:
2059       raise errors.OpPrereqError("Missing lock for nodes: %s" %
2060                                  utils.CommaJoin(unlocked_nodes))
2062 if unlocked_instances:
2063 raise errors.OpPrereqError("Missing lock for instances: %s" %
2064 utils.CommaJoin(unlocked_instances))
2066 self.all_node_info = self.cfg.GetAllNodesInfo()
2067 self.all_inst_info = self.cfg.GetAllInstancesInfo()
2069 self.my_node_names = utils.NiceSort(group_nodes)
2070 self.my_inst_names = utils.NiceSort(group_instances)
2072 self.my_node_info = dict((name, self.all_node_info[name])
2073 for name in self.my_node_names)
2075 self.my_inst_info = dict((name, self.all_inst_info[name])
2076 for name in self.my_inst_names)
2078 # We detect here the nodes that will need the extra RPC calls for verifying
2079 # split LV volumes; they should be locked.
2080 extra_lv_nodes = set()
2082 for inst in self.my_inst_info.values():
2083 if inst.disk_template in constants.DTS_INT_MIRROR:
2084 group = self.my_node_info[inst.primary_node].group
2085 for nname in inst.secondary_nodes:
2086 if self.all_node_info[nname].group != group:
2087 extra_lv_nodes.add(nname)
2089 unlocked_lv_nodes = \
2090 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
2092 if unlocked_lv_nodes:
2093       raise errors.OpPrereqError("Missing lock for LV check on nodes: %s" %
2094                                  utils.CommaJoin(unlocked_lv_nodes))
2095 self.extra_lv_nodes = list(extra_lv_nodes)
2097 def _VerifyNode(self, ninfo, nresult):
2098 """Perform some basic validation on data returned from a node.
2100     - check the result data structure is well formed and has all the
2101       mandatory fields
2102     - check ganeti version
2104     @type ninfo: L{objects.Node}
2105     @param ninfo: the node to check
2106     @param nresult: the results from the node
2108     @return: whether overall this call was successful (and we can expect
2109          reasonable values in the response)
2111     """
2112     node = ninfo.name
2113     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2115     # main result, nresult should be a non-empty dict
2116     test = not nresult or not isinstance(nresult, dict)
2117     _ErrorIf(test, constants.CV_ENODERPC, node,
2118              "unable to verify node: no data returned")
2119     if test:
2120       return False
2122 # compares ganeti version
2123 local_version = constants.PROTOCOL_VERSION
2124 remote_version = nresult.get("version", None)
2125 test = not (remote_version and
2126 isinstance(remote_version, (list, tuple)) and
2127 len(remote_version) == 2)
2128 _ErrorIf(test, constants.CV_ENODERPC, node,
2129 "connection to node returned invalid data")
2133 test = local_version != remote_version[0]
2134 _ErrorIf(test, constants.CV_ENODEVERSION, node,
2135 "incompatible protocol versions: master %s,"
2136 " node %s", local_version, remote_version[0])
2140 # node seems compatible, we can actually try to look into its results
2142 # full package version
2143 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
2144 constants.CV_ENODEVERSION, node,
2145 "software version mismatch: master %s, node %s",
2146 constants.RELEASE_VERSION, remote_version[1],
2147 code=self.ETYPE_WARNING)
2149 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
2150 if ninfo.vm_capable and isinstance(hyp_result, dict):
2151 for hv_name, hv_result in hyp_result.iteritems():
2152 test = hv_result is not None
2153 _ErrorIf(test, constants.CV_ENODEHV, node,
2154 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
2156 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
2157 if ninfo.vm_capable and isinstance(hvp_result, list):
2158 for item, hv_name, hv_result in hvp_result:
2159 _ErrorIf(True, constants.CV_ENODEHV, node,
2160 "hypervisor %s parameter verify failure (source %s): %s",
2161 hv_name, item, hv_result)
2163 test = nresult.get(constants.NV_NODESETUP,
2164 ["Missing NODESETUP results"])
2165     _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
2166              "; ".join(test))
2168     return True
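  # Illustrative note (added sketch): the "version" entry checked above is
  # expected to be a two-element sequence of (protocol version, release
  # version), e.g. something like (40, "2.7.1"); the protocol number must
  # match the master exactly, while a differing release version only
  # produces a warning.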
2170 def _VerifyNodeTime(self, ninfo, nresult,
2171 nvinfo_starttime, nvinfo_endtime):
2172 """Check the node time.
2174 @type ninfo: L{objects.Node}
2175 @param ninfo: the node to check
2176 @param nresult: the remote results for the node
2177 @param nvinfo_starttime: the start time of the RPC call
2178 @param nvinfo_endtime: the end time of the RPC call
2182 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2184 ntime = nresult.get(constants.NV_TIME, None)
2186 ntime_merged = utils.MergeTime(ntime)
2187 except (ValueError, TypeError):
2188 _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
2191 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
2192 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
2193 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
2194 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
2198 _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
2199 "Node time diverges by at least %s from master node time",
2202 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
2203 """Check the node LVM results.
2205 @type ninfo: L{objects.Node}
2206 @param ninfo: the node to check
2207 @param nresult: the remote results for the node
2208     @param vg_name: the configured VG name
2210     """
2211     if vg_name is None:
2212       return
2214     node = ninfo.name
2215     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2217     # checks vg existence and size > 20G
2218     vglist = nresult.get(constants.NV_VGLIST, None)
2219     test = (vglist is None)
2220     _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
2221     if not test:
2222       vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
2223                                             constants.MIN_VG_SIZE)
2224       _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)
2226     # checks PV
2227     pvlist = nresult.get(constants.NV_PVLIST, None)
2228     test = pvlist is None
2229     _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
2230     if not test:
2231       # check that ':' is not present in PV names, since it's a
2232       # special character for lvcreate (denotes the range of PEs to
2233       # use on the PV)
2234       for _, pvname, owner_vg in pvlist:
2235         test = ":" in pvname
2236         _ErrorIf(test, constants.CV_ENODELVM, node,
2237                  "Invalid character ':' in PV '%s' of VG '%s'",
2238                  pvname, owner_vg)
2240 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
2241 """Check the node bridges.
2243 @type ninfo: L{objects.Node}
2244 @param ninfo: the node to check
2245 @param nresult: the remote results for the node
2246     @param bridges: the expected list of bridges
2248     """
2249     if not bridges:
2250       return
2252     node = ninfo.name
2253     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2255     missing = nresult.get(constants.NV_BRIDGES, None)
2256     test = not isinstance(missing, list)
2257     _ErrorIf(test, constants.CV_ENODENET, node,
2258              "did not return valid bridge information")
2259     if not test:
2260       _ErrorIf(bool(missing), constants.CV_ENODENET, node,
2261                "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
2263 def _VerifyNodeUserScripts(self, ninfo, nresult):
2264 """Check the results of user scripts presence and executability on the node
2266 @type ninfo: L{objects.Node}
2267 @param ninfo: the node to check
2268 @param nresult: the remote results for the node
2273 test = not constants.NV_USERSCRIPTS in nresult
2274 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, node,
2275 "did not return user scripts information")
2277 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
2279 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, node,
2280 "user scripts not present or not executable: %s" %
2281 utils.CommaJoin(sorted(broken_scripts)))
2283 def _VerifyNodeNetwork(self, ninfo, nresult):
2284 """Check the node network connectivity results.
2286 @type ninfo: L{objects.Node}
2287 @param ninfo: the node to check
2288 @param nresult: the remote results for the node
2292 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2294 test = constants.NV_NODELIST not in nresult
2295 _ErrorIf(test, constants.CV_ENODESSH, node,
2296 "node hasn't returned node ssh connectivity data")
2298 if nresult[constants.NV_NODELIST]:
2299 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
2300 _ErrorIf(True, constants.CV_ENODESSH, node,
2301 "ssh communication with node '%s': %s", a_node, a_msg)
2303 test = constants.NV_NODENETTEST not in nresult
2304 _ErrorIf(test, constants.CV_ENODENET, node,
2305 "node hasn't returned node tcp connectivity data")
2307 if nresult[constants.NV_NODENETTEST]:
2308 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
2310 _ErrorIf(True, constants.CV_ENODENET, node,
2311 "tcp communication with node '%s': %s",
2312 anode, nresult[constants.NV_NODENETTEST][anode])
2314 test = constants.NV_MASTERIP not in nresult
2315 _ErrorIf(test, constants.CV_ENODENET, node,
2316 "node hasn't returned node master IP reachability data")
2318 if not nresult[constants.NV_MASTERIP]:
2319 if node == self.master_node:
2320 msg = "the master node cannot reach the master IP (not configured?)"
2322 msg = "cannot reach the master IP"
2323 _ErrorIf(True, constants.CV_ENODENET, node, msg)
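  # Illustrative example (added sketch with hypothetical payloads): the
  # NV_NODELIST and NV_NODENETTEST results are assumed to map unreachable
  # peers to error messages, so an empty dict means full connectivity:
  #
  #   nresult[constants.NV_NODENETTEST] = {}                      # all fine
  #   nresult[constants.NV_NODENETTEST] = {"node2": "timed out"}  # one error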
2325   def _VerifyInstance(self, instance, instanceconfig, node_image,
2326                       diskstatus):
2327     """Verify an instance.
2329     This function checks to see if the required block devices are
2330     available on the instance's node.
2332     """
2333     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2334 node_current = instanceconfig.primary_node
2336 node_vol_should = {}
2337 instanceconfig.MapLVsByNode(node_vol_should)
2339 ipolicy = _CalculateGroupIPolicy(self.cfg.GetClusterInfo(), self.group_info)
2340 err = _ComputeIPolicyInstanceViolation(ipolicy, instanceconfig)
2341 _ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance, err)
2343 for node in node_vol_should:
2344 n_img = node_image[node]
2345       if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2346         # ignore missing volumes on offline or broken nodes
2347         continue
2348       for volume in node_vol_should[node]:
2349 test = volume not in n_img.volumes
2350 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
2351 "volume %s missing on node %s", volume, node)
2353 if instanceconfig.admin_state == constants.ADMINST_UP:
2354 pri_img = node_image[node_current]
2355 test = instance not in pri_img.instances and not pri_img.offline
2356 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
2357 "instance not running on its primary node %s",
2360 diskdata = [(nname, success, status, idx)
2361 for (nname, disks) in diskstatus.items()
2362 for idx, (success, status) in enumerate(disks)]
2364 for nname, success, bdev_status, idx in diskdata:
2365       # the 'ghost node' construction in Exec() ensures that we have a
2366       # node here
2367 snode = node_image[nname]
2368 bad_snode = snode.ghost or snode.offline
2369 _ErrorIf(instanceconfig.admin_state == constants.ADMINST_UP and
2370 not success and not bad_snode,
2371 constants.CV_EINSTANCEFAULTYDISK, instance,
2372 "couldn't retrieve status for disk/%s on %s: %s",
2373 idx, nname, bdev_status)
2374 _ErrorIf((instanceconfig.admin_state == constants.ADMINST_UP and
2375 success and bdev_status.ldisk_status == constants.LDS_FAULTY),
2376 constants.CV_EINSTANCEFAULTYDISK, instance,
2377 "disk/%s on %s is faulty", idx, nname)
2379 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2380 """Verify if there are any unknown volumes in the cluster.
2382 The .os, .swap and backup volumes are ignored. All other volumes are
2383 reported as unknown.
2385 @type reserved: L{ganeti.utils.FieldSet}
2386     @param reserved: a FieldSet of reserved volume names
2388     """
2389     for node, n_img in node_image.items():
2390       if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2391         # skip non-healthy nodes
2392         continue
2393       for volume in n_img.volumes:
2394 test = ((node not in node_vol_should or
2395 volume not in node_vol_should[node]) and
2396 not reserved.Matches(volume))
2397 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2398 "volume %s is unknown", volume)
2400 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2401 """Verify N+1 Memory Resilience.
2403 Check that if one single node dies we can still start all the
2404     instances it was primary for.
2406     """
2407 cluster_info = self.cfg.GetClusterInfo()
2408 for node, n_img in node_image.items():
2409 # This code checks that every node which is now listed as
2410 # secondary has enough memory to host all instances it is
2411 # supposed to should a single other node in the cluster fail.
2412 # FIXME: not ready for failover to an arbitrary node
2413 # FIXME: does not support file-backed instances
2414 # WARNING: we currently take into account down instances as well
2415 # as up ones, considering that even if they're down someone
2416 # might want to start them even in the event of a node failure.
2418 # we're skipping offline nodes from the N+1 warning, since
2419         # most likely we don't have good memory information from them;
2420         # we already list instances living on such nodes, and that's
2421         # enough warning
2423 #TODO(dynmem): use MINMEM for checking
2424 #TODO(dynmem): also consider ballooning out other instances
2425       for prinode, instances in n_img.sbp.items():
2426         needed_mem = 0
2427         for instance in instances:
2428           bep = cluster_info.FillBE(instance_cfg[instance])
2429           if bep[constants.BE_AUTO_BALANCE]:
2430             needed_mem += bep[constants.BE_MAXMEM]
2431 test = n_img.mfree < needed_mem
2432 self._ErrorIf(test, constants.CV_ENODEN1, node,
2433 "not enough memory to accomodate instance failovers"
2434 " should node %s fail (%dMiB needed, %dMiB available)",
2435 prinode, needed_mem, n_img.mfree)
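  # Illustrative example (added sketch with hypothetical sizes): if node A
  # reports mfree = 4096 MiB and is secondary for two auto-balanced
  # instances whose primary is node B, with BE_MAXMEM of 2048 and 3072 MiB,
  # a failover of B would need 2048 + 3072 = 5120 MiB on A; since
  # 5120 > 4096, node A is reported as not N+1 compliant (CV_ENODEN1).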
2437   @classmethod
2438   def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2439                    (files_all, files_opt, files_mc, files_vm)):
2440     """Verifies file checksums collected from all nodes.
2442     @param errorif: Callback for reporting errors
2443     @param nodeinfo: List of L{objects.Node} objects
2444     @param master_node: Name of master node
2445     @param all_nvinfo: RPC results
2447     """
2448     # Define functions determining which nodes to consider for a file
2449     files2nodefn = [
2450       (files_all, None),
2451       (files_mc, lambda node: (node.master_candidate or
2452                                node.name == master_node)),
2453       (files_vm, lambda node: node.vm_capable),
2454       ]
2456     # Build mapping from filename to list of nodes which should have the file
2457     nodefiles = {}
2458     for (files, fn) in files2nodefn:
2459       if fn is None:
2460         filenodes = nodeinfo
2461       else:
2462         filenodes = filter(fn, nodeinfo)
2463       nodefiles.update((filename,
2464                         frozenset(map(operator.attrgetter("name"), filenodes)))
2465                        for filename in files)
2467 assert set(nodefiles) == (files_all | files_mc | files_vm)
2469 fileinfo = dict((filename, {}) for filename in nodefiles)
2470 ignore_nodes = set()
2472     for node in nodeinfo:
2473       if node.offline:
2474         ignore_nodes.add(node.name)
2475         continue
2477       nresult = all_nvinfo[node.name]
2479       if nresult.fail_msg or not nresult.payload:
2480         node_files = None
2481       else:
2482         node_files = nresult.payload.get(constants.NV_FILELIST, None)
2484       test = not (node_files and isinstance(node_files, dict))
2485       errorif(test, constants.CV_ENODEFILECHECK, node.name,
2486               "Node did not return file checksum data")
2487       if test:
2488         ignore_nodes.add(node.name)
2489         continue
2491 # Build per-checksum mapping from filename to nodes having it
2492 for (filename, checksum) in node_files.items():
2493 assert filename in nodefiles
2494 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2496 for (filename, checksums) in fileinfo.items():
2497 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2499 # Nodes having the file
2500 with_file = frozenset(node_name
2501 for nodes in fileinfo[filename].values()
2502 for node_name in nodes) - ignore_nodes
2504 expected_nodes = nodefiles[filename] - ignore_nodes
2506 # Nodes missing file
2507 missing_file = expected_nodes - with_file
2509       if filename in files_opt:
2510         # All or no nodes
2511         errorif(missing_file and missing_file != expected_nodes,
2512                 constants.CV_ECLUSTERFILECHECK, None,
2513                 "File %s is optional, but it must exist on all or no"
2514                 " nodes (not found on %s)",
2515                 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2516       else:
2517         errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2518                 "File %s is missing from node(s) %s", filename,
2519                 utils.CommaJoin(utils.NiceSort(missing_file)))
2521 # Warn if a node has a file it shouldn't
2522       unexpected = with_file - expected_nodes
2523       errorif(unexpected,
2524               constants.CV_ECLUSTERFILECHECK, None,
2525               "File %s should not exist on node(s) %s",
2526               filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2528 # See if there are multiple versions of the file
2529       test = len(checksums) > 1
2530       if test:
2531         variants = ["variant %s on %s" %
2532                     (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2533                     for (idx, (checksum, nodes)) in
2534                     enumerate(sorted(checksums.items()))]
2535       else:
2536         variants = []
2538 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2539 "File %s found with %s different checksums (%s)",
2540 filename, len(checksums), "; ".join(variants))
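  # Illustrative example (added sketch with hypothetical checksums):
  # fileinfo maps each file to {checksum: set(nodes having it)}, e.g.
  #
  #   fileinfo["/var/lib/ganeti/config.data"] = {
  #     "0123...": set(["node1", "node2"]),
  #     "4567...": set(["node3"]),
  #   }
  #
  # Two keys for the same file mean the cluster has diverging copies, which
  # is reported above as one "variant" per checksum.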
2542   def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2543                       drbd_map):
2544     """Verifies the node DRBD status.
2546     @type ninfo: L{objects.Node}
2547     @param ninfo: the node to check
2548     @param nresult: the remote results for the node
2549     @param instanceinfo: the dict of instances
2550     @param drbd_helper: the configured DRBD usermode helper
2551     @param drbd_map: the DRBD map as returned by
2552         L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2554     """
2555     node = ninfo.name
2556     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2558     if drbd_helper:
2559       helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2560       test = (helper_result is None)
2561       _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2562                "no drbd usermode helper returned")
2563       if helper_result:
2564         status, payload = helper_result
2565         test = not status
2566         _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2567                  "drbd usermode helper check unsuccessful: %s", payload)
2568         test = status and (payload != drbd_helper)
2569         _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2570                  "wrong drbd usermode helper: %s", payload)
2572     # compute the DRBD minors
2573     node_drbd = {}
2574     for minor, instance in drbd_map[node].items():
2575       test = instance not in instanceinfo
2576       _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2577                "ghost instance '%s' in temporary DRBD map", instance)
2578       # ghost instance should not be running, but otherwise we
2579       # don't give double warnings (both ghost instance and
2580       # unallocated minor in use)
2581       if test:
2582         node_drbd[minor] = (instance, False)
2583       else:
2584         instance = instanceinfo[instance]
2585         node_drbd[minor] = (instance.name,
2586                             instance.admin_state == constants.ADMINST_UP)
2588 # and now check them
2589 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2590 test = not isinstance(used_minors, (tuple, list))
2591 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2592 "cannot parse drbd status file: %s", str(used_minors))
2594 # we cannot check drbd status
2597 for minor, (iname, must_exist) in node_drbd.items():
2598 test = minor not in used_minors and must_exist
2599 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2600 "drbd minor %d of instance %s is not active", minor, iname)
2601 for minor in used_minors:
2602 test = minor not in node_drbd
2603 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2604 "unallocated drbd minor %d is in use", minor)
2606 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2607 """Builds the node OS structures.
2609 @type ninfo: L{objects.Node}
2610 @param ninfo: the node to check
2611 @param nresult: the remote results for the node
2612     @param nimg: the node image object
2614     """
2615     node = ninfo.name
2616     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2618 remote_os = nresult.get(constants.NV_OSLIST, None)
2619 test = (not isinstance(remote_os, list) or
2620 not compat.all(isinstance(v, list) and len(v) == 7
2621 for v in remote_os))
2623     _ErrorIf(test, constants.CV_ENODEOS, node,
2624              "node hasn't returned valid OS data")
2626     nimg.os_fail = test
2627     if test:
2628       return
2630     os_dict = {}
2633 for (name, os_path, status, diagnose,
2634 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2636       if name not in os_dict:
2637         os_dict[name] = []
2639 # parameters is a list of lists instead of list of tuples due to
2640 # JSON lacking a real tuple type, fix it:
2641 parameters = [tuple(v) for v in parameters]
2642 os_dict[name].append((os_path, status, diagnose,
2643 set(variants), set(parameters), set(api_ver)))
2645 nimg.oslist = os_dict
2647 def _VerifyNodeOS(self, ninfo, nimg, base):
2648 """Verifies the node OS list.
2650 @type ninfo: L{objects.Node}
2651 @param ninfo: the node to check
2652 @param nimg: the node image object
2653     @param base: the 'template' node we match against (e.g. from the master)
2655     """
2656     node = ninfo.name
2657     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2659 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2661 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2662 for os_name, os_data in nimg.oslist.items():
2663 assert os_data, "Empty OS status for OS %s?!" % os_name
2664 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2665 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2666 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2667 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2668 "OS '%s' has multiple entries (first one shadows the rest): %s",
2669 os_name, utils.CommaJoin([v[0] for v in os_data]))
2670 # comparisons with the 'base' image
2671 test = os_name not in base.oslist
2672 _ErrorIf(test, constants.CV_ENODEOS, node,
2673 "Extra OS %s not present on reference node (%s)",
2677 assert base.oslist[os_name], "Base node has empty OS status?"
2678 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2679       if not b_status:
2680         # base OS is invalid, skipping
2681         continue
2682 for kind, a, b in [("API version", f_api, b_api),
2683 ("variants list", f_var, b_var),
2684 ("parameters", beautify_params(f_param),
2685 beautify_params(b_param))]:
2686 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2687 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2688 kind, os_name, base.name,
2689 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2691 # check any missing OSes
2692 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2693 _ErrorIf(missing, constants.CV_ENODEOS, node,
2694 "OSes present on reference node %s but missing on this node: %s",
2695 base.name, utils.CommaJoin(missing))
2697 def _VerifyOob(self, ninfo, nresult):
2698 """Verifies out of band functionality of a node.
2700 @type ninfo: L{objects.Node}
2701 @param ninfo: the node to check
2702     @param nresult: the remote results for the node
2704     """
2705     node = ninfo.name
2706 # We just have to verify the paths on master and/or master candidates
2707 # as the oob helper is invoked on the master
2708 if ((ninfo.master_candidate or ninfo.master_capable) and
2709 constants.NV_OOB_PATHS in nresult):
2710 for path_result in nresult[constants.NV_OOB_PATHS]:
2711 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2713 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2714 """Verifies and updates the node volume data.
2716 This function will update a L{NodeImage}'s internal structures
2717 with data from the remote call.
2719 @type ninfo: L{objects.Node}
2720 @param ninfo: the node to check
2721 @param nresult: the remote results for the node
2722 @param nimg: the node image object
2723     @param vg_name: the configured VG name
2725     """
2726     node = ninfo.name
2727     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2729     nimg.lvm_fail = True
2730     lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2731     if vg_name is None:
2732       pass
2733     elif isinstance(lvdata, basestring):
2734 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2735 utils.SafeEncode(lvdata))
2736 elif not isinstance(lvdata, dict):
2737 _ErrorIf(True, constants.CV_ENODELVM, node,
2738 "rpc call to node failed (lvlist)")
2740 nimg.volumes = lvdata
2741 nimg.lvm_fail = False
2743 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2744 """Verifies and updates the node instance list.
2746 If the listing was successful, then updates this node's instance
2747     list. Otherwise, it marks the RPC call as failed for the instance
2748     list.
2750 @type ninfo: L{objects.Node}
2751 @param ninfo: the node to check
2752 @param nresult: the remote results for the node
2753     @param nimg: the node image object
2755     """
2756     idata = nresult.get(constants.NV_INSTANCELIST, None)
2757     test = not isinstance(idata, list)
2758     self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2759                   "rpc call to node failed (instancelist): %s",
2760                   utils.SafeEncode(str(idata)))
2761     if test:
2762       nimg.hyp_fail = True
2763     else:
2764       nimg.instances = idata
2766 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2767 """Verifies and computes a node information map
2769 @type ninfo: L{objects.Node}
2770 @param ninfo: the node to check
2771 @param nresult: the remote results for the node
2772 @param nimg: the node image object
2773     @param vg_name: the configured VG name
2775     """
2776     node = ninfo.name
2777     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2779     # try to read free memory (from the hypervisor)
2780     hv_info = nresult.get(constants.NV_HVINFO, None)
2781     test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2782     _ErrorIf(test, constants.CV_ENODEHV, node,
2783              "rpc call to node failed (hvinfo)")
2784     if not test:
2785       try:
2786         nimg.mfree = int(hv_info["memory_free"])
2787       except (ValueError, TypeError):
2788         _ErrorIf(True, constants.CV_ENODERPC, node,
2789                  "node returned invalid nodeinfo, check hypervisor")
2791 # FIXME: devise a free space model for file based instances as well
2792 if vg_name is not None:
2793 test = (constants.NV_VGLIST not in nresult or
2794 vg_name not in nresult[constants.NV_VGLIST])
2795 _ErrorIf(test, constants.CV_ENODELVM, node,
2796 "node didn't return data for the volume group '%s'"
2797 " - it is either missing or broken", vg_name)
2800 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2801 except (ValueError, TypeError):
2802 _ErrorIf(True, constants.CV_ENODERPC, node,
2803 "node returned invalid LVM info, check LVM status")
2805 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2806 """Gets per-disk status information for all instances.
2808 @type nodelist: list of strings
2809 @param nodelist: Node names
2810 @type node_image: dict of (name, L{objects.Node})
2811 @param node_image: Node objects
2812 @type instanceinfo: dict of (name, L{objects.Instance})
2813 @param instanceinfo: Instance objects
2814     @rtype: {instance: {node: [(success, payload)]}}
2815 @return: a dictionary of per-instance dictionaries with nodes as
2816 keys and disk information as values; the disk information is a
2817         list of tuples (success, payload)
2819     """
2820     _ErrorIf = self._ErrorIf # pylint: disable=C0103
2822     node_disks = {}
2823     node_disks_devonly = {}
2824 diskless_instances = set()
2825 diskless = constants.DT_DISKLESS
2827 for nname in nodelist:
2828 node_instances = list(itertools.chain(node_image[nname].pinst,
2829 node_image[nname].sinst))
2830 diskless_instances.update(inst for inst in node_instances
2831 if instanceinfo[inst].disk_template == diskless)
2832 disks = [(inst, disk)
2833 for inst in node_instances
2834                for disk in instanceinfo[inst].disks]
2836       if not disks:
2837         # No need to collect data
2838         continue
2840       node_disks[nname] = disks
2842       # Creating copies as SetDiskID below will modify the objects and that can
2843       # lead to incorrect data returned from nodes
2844       devonly = [dev.Copy() for (_, dev) in disks]
2846       for dev in devonly:
2847         self.cfg.SetDiskID(dev, nname)
2849       node_disks_devonly[nname] = devonly
2851 assert len(node_disks) == len(node_disks_devonly)
2853 # Collect data from all nodes with disks
2854     result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2855                                                           node_disks_devonly)
2857     assert len(result) == len(node_disks)
2859     instdisk = {}
2861 for (nname, nres) in result.items():
2862       disks = node_disks[nname]
2864       if nres.offline:
2865         # No data from this node
2866         data = len(disks) * [(False, "node offline")]
2867       else:
2868         msg = nres.fail_msg
2869         _ErrorIf(msg, constants.CV_ENODERPC, nname,
2870                  "while getting disk information: %s", msg)
2871         if msg:
2872           # No data from this node
2873           data = len(disks) * [(False, msg)]
2874         else:
2875           data = []
2876           for idx, i in enumerate(nres.payload):
2877             if isinstance(i, (tuple, list)) and len(i) == 2:
2878               data.append(i)
2879             else:
2880               logging.warning("Invalid result from node %s, entry %d: %s",
2881                               nname, idx, i)
2882               data.append((False, "Invalid result from the remote node"))
2884 for ((inst, _), status) in zip(disks, data):
2885 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2887 # Add empty entries for diskless instances.
2888     for inst in diskless_instances:
2889       assert inst not in instdisk
2890       instdisk[inst] = {}
2892 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2893 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2894 compat.all(isinstance(s, (tuple, list)) and
2895 len(s) == 2 for s in statuses)
2896 for inst, nnames in instdisk.items()
2897 for nname, statuses in nnames.items())
2898 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
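  # Illustrative example (added sketch with a hypothetical instance): as
  # asserted above, instdisk is keyed by instance, then by node, with one
  # (success, payload) pair per disk, e.g. for a two-disk mirrored instance:
  #
  #   instdisk = {
  #     "inst1": {"node1": [(True, status0), (True, status1)],
  #               "node2": [(False, "node offline"), (False, "node offline")]},
  #   }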
2902   @staticmethod
2903   def _SshNodeSelector(group_uuid, all_nodes):
2904     """Create endless iterators for all potential SSH check hosts.
2906     """
2907     nodes = [node for node in all_nodes
2908              if (node.group != group_uuid and
2909                  not node.offline)]
2910     keyfunc = operator.attrgetter("group")
2912     return map(itertools.cycle,
2913                [sorted(map(operator.attrgetter("name"), names))
2914                 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2915                                                   keyfunc)])
2917   @classmethod
2918   def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2919     """Choose which nodes should talk to which other nodes.
2921     We will make nodes contact all nodes in their group, and one node from
2922     every other group.
2924     @warning: This algorithm has a known issue if one node group is much
2925       smaller than others (e.g. just one node). In such a case all other
2926       nodes will talk to the single node.
2928     """
2929     online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2930 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2932 return (online_nodes,
2933 dict((name, sorted([i.next() for i in sel]))
2934 for name in online_nodes))
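  # Illustrative example (added sketch with hypothetical groups): when
  # verifying a group G1 = [n1, n2] in a cluster that also has
  # G2 = [n3, n4], the returned value could be
  #
  #   (["n1", "n2"], {"n1": ["n3"], "n2": ["n4"]})
  #
  # i.e. every online node of G1 additionally checks one cycled pick from
  # each foreign group, spreading the cross-group SSH checks round-robin.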
2936   def BuildHooksEnv(self):
2937     """Build hooks env.
2939     Cluster-Verify hooks are run in the post phase only; a hook failure is
2940     logged in the verify output and makes the verification fail.
2942     """
2943     env = {
2944       "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2945       }
2947     env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2948                for node in self.my_node_info.values())
2950     return env
2952 def BuildHooksNodes(self):
2953 """Build hooks nodes.
2956 return ([], self.my_node_names)
2958 def Exec(self, feedback_fn):
2959 """Verify integrity of the node group, performing various test on nodes.
2962 # This method has too many local variables. pylint: disable=R0914
2963 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2965 if not self.my_node_names:
2967 feedback_fn("* Empty node group, skipping verification")
2971 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2972 verbose = self.op.verbose
2973 self._feedback_fn = feedback_fn
2975 vg_name = self.cfg.GetVGName()
2976 drbd_helper = self.cfg.GetDRBDHelper()
2977 cluster = self.cfg.GetClusterInfo()
2978 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2979 hypervisors = cluster.enabled_hypervisors
2980 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2982 i_non_redundant = [] # Non redundant instances
2983 i_non_a_balanced = [] # Non auto-balanced instances
2984 i_offline = 0 # Count of offline instances
2985 n_offline = 0 # Count of offline nodes
2986 n_drained = 0 # Count of nodes being drained
2987 node_vol_should = {}
2989 # FIXME: verify OS list
2992 filemap = _ComputeAncillaryFiles(cluster, False)
2994 # do local checksums
2995 master_node = self.master_node = self.cfg.GetMasterNode()
2996 master_ip = self.cfg.GetMasterIP()
2998 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
3001 if self.cfg.GetUseExternalMipScript():
3002 user_scripts.append(constants.EXTERNAL_MASTER_SETUP_SCRIPT)
3004 node_verify_param = {
3005 constants.NV_FILELIST:
3006 utils.UniqueSequence(filename
3007 for files in filemap
3008 for filename in files),
3009 constants.NV_NODELIST:
3010 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
3011 self.all_node_info.values()),
3012 constants.NV_HYPERVISOR: hypervisors,
3013 constants.NV_HVPARAMS:
3014 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
3015 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
3016 for node in node_data_list
3017 if not node.offline],
3018 constants.NV_INSTANCELIST: hypervisors,
3019 constants.NV_VERSION: None,
3020 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
3021 constants.NV_NODESETUP: None,
3022 constants.NV_TIME: None,
3023 constants.NV_MASTERIP: (master_node, master_ip),
3024 constants.NV_OSLIST: None,
3025 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
3026       constants.NV_USERSCRIPTS: user_scripts,
3027       }
3029 if vg_name is not None:
3030 node_verify_param[constants.NV_VGLIST] = None
3031 node_verify_param[constants.NV_LVLIST] = vg_name
3032 node_verify_param[constants.NV_PVLIST] = [vg_name]
3033       node_verify_param[constants.NV_DRBDLIST] = None
3035     if drbd_helper:
3036       node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
3039     # FIXME: this needs to be changed per node-group, not cluster-wide
3040     bridges = set()
3041 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
3042 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
3043 bridges.add(default_nicpp[constants.NIC_LINK])
3044 for instance in self.my_inst_info.values():
3045 for nic in instance.nics:
3046 full_nic = cluster.SimpleFillNIC(nic.nicparams)
3047 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
3048 bridges.add(full_nic[constants.NIC_LINK])
3050     if bridges:
3051       node_verify_param[constants.NV_BRIDGES] = list(bridges)
3053 # Build our expected cluster state
3054     node_image = dict((node.name, self.NodeImage(offline=node.offline,
3055                                                  name=node.name,
3056                                                  vm_capable=node.vm_capable))
3057                       for node in node_data_list)
3059     oob_paths = []
3061 for node in self.all_node_info.values():
3062 path = _SupportsOob(self.cfg, node)
3063 if path and path not in oob_paths:
3064 oob_paths.append(path)
3066     if oob_paths:
3067       node_verify_param[constants.NV_OOB_PATHS] = oob_paths
3069 for instance in self.my_inst_names:
3070 inst_config = self.my_inst_info[instance]
3072 for nname in inst_config.all_nodes:
3073 if nname not in node_image:
3074 gnode = self.NodeImage(name=nname)
3075 gnode.ghost = (nname not in self.all_node_info)
3076 node_image[nname] = gnode
3078 inst_config.MapLVsByNode(node_vol_should)
3080 pnode = inst_config.primary_node
3081 node_image[pnode].pinst.append(instance)
3083 for snode in inst_config.secondary_nodes:
3084 nimg = node_image[snode]
3085 nimg.sinst.append(instance)
3086 if pnode not in nimg.sbp:
3087 nimg.sbp[pnode] = []
3088 nimg.sbp[pnode].append(instance)
3090 # At this point, we have the in-memory data structures complete,
3091 # except for the runtime information, which we'll gather next
3093 # Due to the way our RPC system works, exact response times cannot be
3094 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
3095     # time before and after executing the request, we can at least have a time
3096     # window.
3097     nvinfo_starttime = time.time()
3098     all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
3099                                            node_verify_param,
3100                                            self.cfg.GetClusterName())
3101 nvinfo_endtime = time.time()
3103     if self.extra_lv_nodes and vg_name is not None:
3104       extra_lv_nvinfo = \
3105           self.rpc.call_node_verify(self.extra_lv_nodes,
3106                                     {constants.NV_LVLIST: vg_name},
3107                                     self.cfg.GetClusterName())
3108     else:
3109       extra_lv_nvinfo = {}
3111 all_drbd_map = self.cfg.ComputeDRBDMap()
3113 feedback_fn("* Gathering disk information (%s nodes)" %
3114 len(self.my_node_names))
3115     instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
3116                                      self.my_inst_info)
3118 feedback_fn("* Verifying configuration file consistency")
3120 # If not all nodes are being checked, we need to make sure the master node
3121 # and a non-checked vm_capable node are in the list.
3122     absent_nodes = set(self.all_node_info).difference(self.my_node_info)
3123     if absent_nodes:
3124       vf_nvinfo = all_nvinfo.copy()
3125       vf_node_info = list(self.my_node_info.values())
3126       additional_nodes = []
3127       if master_node not in self.my_node_info:
3128         additional_nodes.append(master_node)
3129         vf_node_info.append(self.all_node_info[master_node])
3130       # Add the first vm_capable node we find which is not included
3131       for node in absent_nodes:
3132         nodeinfo = self.all_node_info[node]
3133         if nodeinfo.vm_capable and not nodeinfo.offline:
3134           additional_nodes.append(node)
3135           vf_node_info.append(self.all_node_info[node])
3136           break
3137       key = constants.NV_FILELIST
3138       vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
3139                                                  {key: node_verify_param[key]},
3140                                                  self.cfg.GetClusterName()))
3141     else:
3142       vf_nvinfo = all_nvinfo
3143       vf_node_info = self.my_node_info.values()
3145 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
3147 feedback_fn("* Verifying node status")
3151 for node_i in node_data_list:
3153 nimg = node_image[node]
3157 feedback_fn("* Skipping offline node %s" % (node,))
3161 if node == master_node:
3163 elif node_i.master_candidate:
3164 ntype = "master candidate"
3165 elif node_i.drained:
3171 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
3173 msg = all_nvinfo[node].fail_msg
3174 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
3177 nimg.rpc_fail = True
3180 nresult = all_nvinfo[node].payload
3182 nimg.call_ok = self._VerifyNode(node_i, nresult)
3183 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
3184 self._VerifyNodeNetwork(node_i, nresult)
3185 self._VerifyNodeUserScripts(node_i, nresult)
3186       self._VerifyOob(node_i, nresult)
3188       if nimg.vm_capable:
3189         self._VerifyNodeLVM(node_i, nresult, vg_name)
3190         self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
3191                              all_drbd_map)
3193         self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
3194         self._UpdateNodeInstances(node_i, nresult, nimg)
3195         self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
3196         self._UpdateNodeOS(node_i, nresult, nimg)
3198         if not nimg.os_fail:
3199           if refos_img is None:
3200             refos_img = nimg
3201           self._VerifyNodeOS(node_i, nimg, refos_img)
3202         self._VerifyNodeBridges(node_i, nresult, bridges)
3204       # Check whether all running instances are primary for the node. (This
3205 # can no longer be done from _VerifyInstance below, since some of the
3206 # wrong instances could be from other node groups.)
3207 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
3209 for inst in non_primary_inst:
3210 # FIXME: investigate best way to handle offline insts
3211         if inst.admin_state == constants.ADMINST_OFFLINE:
3212           if verbose:
3213             feedback_fn("* Skipping offline instance %s" % inst.name)
3214           i_offline += 1
3215           continue
3216 test = inst in self.all_inst_info
3217 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
3218 "instance should not run on node %s", node_i.name)
3219 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
3220 "node is running unknown instance %s", inst)
3222 for node, result in extra_lv_nvinfo.items():
3223 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
3224 node_image[node], vg_name)
3226 feedback_fn("* Verifying instance status")
3227     for instance in self.my_inst_names:
3228       if verbose:
3229         feedback_fn("* Verifying instance %s" % instance)
3230       inst_config = self.my_inst_info[instance]
3231       self._VerifyInstance(instance, inst_config, node_image,
3232                            instdisk[instance])
3233 inst_nodes_offline = []
3235 pnode = inst_config.primary_node
3236 pnode_img = node_image[pnode]
3237 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
3238 constants.CV_ENODERPC, pnode, "instance %s, connection to"
3239 " primary node failed", instance)
3241       _ErrorIf(inst_config.admin_state == constants.ADMINST_UP and
3242                pnode_img.offline,
3243                constants.CV_EINSTANCEBADNODE, instance,
3244                "instance is marked as running and lives on offline node %s",
3245                inst_config.primary_node)
3247 # If the instance is non-redundant we cannot survive losing its primary
3248 # node, so we are not N+1 compliant. On the other hand we have no disk
3249       # templates with more than one secondary so that situation is not well
3250       # supported
3251 # FIXME: does not support file-backed instances
3252 if not inst_config.secondary_nodes:
3253 i_non_redundant.append(instance)
3255 _ErrorIf(len(inst_config.secondary_nodes) > 1,
3256 constants.CV_EINSTANCELAYOUT,
3257 instance, "instance has multiple secondary nodes: %s",
3258 utils.CommaJoin(inst_config.secondary_nodes),
3259 code=self.ETYPE_WARNING)
3261 if inst_config.disk_template in constants.DTS_INT_MIRROR:
3262 pnode = inst_config.primary_node
3263 instance_nodes = utils.NiceSort(inst_config.all_nodes)
3264 instance_groups = {}
3266         for node in instance_nodes:
3267           instance_groups.setdefault(self.all_node_info[node].group,
3268                                      []).append(node)
3270         pretty_list = [
3271           "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
3272           # Sort so that we always list the primary node first.
3273           for group, nodes in sorted(instance_groups.items(),
3274                                      key=lambda (_, nodes): pnode in nodes,
3275                                      reverse=True)]
3277 self._ErrorIf(len(instance_groups) > 1,
3278 constants.CV_EINSTANCESPLITGROUPS,
3279 instance, "instance has primary and secondary nodes in"
3280 " different groups: %s", utils.CommaJoin(pretty_list),
3281 code=self.ETYPE_WARNING)
3283 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
3284 i_non_a_balanced.append(instance)
3286 for snode in inst_config.secondary_nodes:
3287 s_img = node_image[snode]
3288 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
3289 snode, "instance %s, connection to secondary node failed",
3293 inst_nodes_offline.append(snode)
3295 # warn that the instance lives on offline nodes
3296 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
3297 "instance has offline secondary node(s) %s",
3298 utils.CommaJoin(inst_nodes_offline))
3299 # ... or ghost/non-vm_capable nodes
3300 for node in inst_config.all_nodes:
3301 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
3302 instance, "instance lives on ghost node %s", node)
3303 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
3304 instance, "instance lives on non-vm_capable node %s", node)
3306 feedback_fn("* Verifying orphan volumes")
3307 reserved = utils.FieldSet(*cluster.reserved_lvs)
3309 # We will get spurious "unknown volume" warnings if any node of this group
3310 # is secondary for an instance whose primary is in another group. To avoid
3311 # them, we find these instances and add their volumes to node_vol_should.
3312 for inst in self.all_inst_info.values():
3313 for secondary in inst.secondary_nodes:
3314 if (secondary in self.my_node_info
3315 and inst.name not in self.my_inst_info):
3316 inst.MapLVsByNode(node_vol_should)
3319 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
3321 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
3322 feedback_fn("* Verifying N+1 Memory redundancy")
3323 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
3325 feedback_fn("* Other Notes")
3327 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
3328 % len(i_non_redundant))
3330 if i_non_a_balanced:
3331 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
3332 % len(i_non_a_balanced))
3335 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
3338 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
3341 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
3345 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
3346 """Analyze the post-hooks' result
3348 This method analyses the hook result, handles it, and sends some
3349 nicely-formatted feedback back to the user.
3351 @param phase: one of L{constants.HOOKS_PHASE_POST} or
3352 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
3353 @param hooks_results: the results of the multi-node hooks rpc call
3354     @param feedback_fn: function used to send feedback back to the caller
3355 @param lu_result: previous Exec result
3356     @return: the new Exec result, based on the previous result
3357         and hook results
3359     """
3360 # We only really run POST phase hooks, only for non-empty groups,
3361 # and are only interested in their results
3362     if not self.my_node_names:
3363       # empty node group
3364       pass
3365 elif phase == constants.HOOKS_PHASE_POST:
3366 # Used to change hooks' output to proper indentation
3367 feedback_fn("* Hooks Results")
3368 assert hooks_results, "invalid result from hooks"
3370 for node_name in hooks_results:
3371         res = hooks_results[node_name]
3372         msg = res.fail_msg
3373 test = msg and not res.offline
3374 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3375 "Communication failure in hooks execution: %s", msg)
3376         if res.offline or msg:
3377           # No need to investigate payload if node is offline or gave
3378           # an error.
3379           continue
3380         for script, hkr, output in res.payload:
3381           test = hkr == constants.HKR_FAIL
3382           self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3383                         "Script %s failed, output:", script)
3384           if test:
3385             output = self._HOOKS_INDENT_RE.sub("      ", output)
3386             feedback_fn("%s" % output)
3387             lu_result = False
3389     return lu_result
3392 class LUClusterVerifyDisks(NoHooksLU):
3393 """Verifies the cluster disks status.
3398 def ExpandNames(self):
3399 self.share_locks = _ShareAll()
3400 self.needed_locks = {
3401       locking.LEVEL_NODEGROUP: locking.ALL_SET,
3402       }
3404 def Exec(self, feedback_fn):
3405 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3407 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3408 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3409 for group in group_names])
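# Illustrative note (added sketch): the cluster-level disk verification
# merely fans out. E.g. a cluster with hypothetical groups "default" and
# "ssd" would submit
#
#   [[opcodes.OpGroupVerifyDisks(group_name="default")],
#    [opcodes.OpGroupVerifyDisks(group_name="ssd")]]
#
# as separate jobs, one per node group.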
3412 class LUGroupVerifyDisks(NoHooksLU):
3413 """Verifies the status of all disks in a node group.
3418 def ExpandNames(self):
3419 # Raises errors.OpPrereqError on its own if group can't be found
3420 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3422 self.share_locks = _ShareAll()
3423 self.needed_locks = {
3424 locking.LEVEL_INSTANCE: [],
3425 locking.LEVEL_NODEGROUP: [],
3426       locking.LEVEL_NODE: [],
3427       }
3429 def DeclareLocks(self, level):
3430 if level == locking.LEVEL_INSTANCE:
3431 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3433 # Lock instances optimistically, needs verification once node and group
3434 # locks have been acquired
3435 self.needed_locks[locking.LEVEL_INSTANCE] = \
3436 self.cfg.GetNodeGroupInstances(self.group_uuid)
3438 elif level == locking.LEVEL_NODEGROUP:
3439 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3441 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3442 set([self.group_uuid] +
3443 # Lock all groups used by instances optimistically; this requires
3444           # going via the node before it's locked, requiring verification
3445           # later on
3446           [group_uuid
3447 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3448 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3450 elif level == locking.LEVEL_NODE:
3451       # This will only lock the nodes in the group to be verified which contain
3452       # actual instances
3453 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3454 self._LockInstancesNodes()
3456 # Lock all nodes in group to be verified
3457 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3458 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3459 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3461 def CheckPrereq(self):
3462 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3463 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3464 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3466 assert self.group_uuid in owned_groups
3468 # Check if locked instances are still correct
3469 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3471 # Get instance information
3472 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3474 # Check if node groups for locked instances are still correct
3475 for (instance_name, inst) in self.instances.items():
3476 assert owned_nodes.issuperset(inst.all_nodes), \
3477 "Instance %s's nodes changed while we kept the lock" % instance_name
3479       inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3480                                              owned_groups)
3482 assert self.group_uuid in inst_groups, \
3483 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3485 def Exec(self, feedback_fn):
3486 """Verify integrity of cluster disks.
3488 @rtype: tuple of three items
3489 @return: a tuple of (dict of node-to-node_error, list of instances
3490         which need activate-disks, dict of instance: (node, volume) for
3491         missing volumes
3493     """
3494     res_nodes = {}
3495     res_instances = set()
3496     res_missing = {}
3498 nv_dict = _MapInstanceDisksToNodes([inst
3499 for inst in self.instances.values()
3500 if inst.admin_state == constants.ADMINST_UP])
3503 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3504 set(self.cfg.GetVmCapableNodeList()))
3506 node_lvs = self.rpc.call_lv_list(nodes, [])
3508 for (node, node_res) in node_lvs.items():
3509       if node_res.offline:
3510         continue
3512       msg = node_res.fail_msg
3513       if msg:
3514         logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3515         res_nodes[node] = msg
3516         continue
3518 for lv_name, (_, _, lv_online) in node_res.payload.items():
3519 inst = nv_dict.pop((node, lv_name), None)
3520 if not (lv_online or inst is None):
3521 res_instances.add(inst)
3523     # any leftover items in nv_dict are missing LVs, let's arrange the data
3524     # better
3525 for key, inst in nv_dict.iteritems():
3526 res_missing.setdefault(inst, []).append(list(key))
3528 return (res_nodes, list(res_instances), res_missing)
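# Illustrative example (added sketch with hypothetical names): the Exec()
# result above could look like
#
#   ({"node2": "Error enumerating LVs"},       # unreachable/broken nodes
#    ["inst1"],                                # instances needing activate-disks
#    {"inst2": [["node1", "xenvg/disk0"]]})    # missing (node, volume) pairs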
3531 class LUClusterRepairDiskSizes(NoHooksLU):
3532 """Verifies the cluster disks sizes.
3537 def ExpandNames(self):
3538 if self.op.instances:
3539 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3540 self.needed_locks = {
3541 locking.LEVEL_NODE_RES: [],
3542         locking.LEVEL_INSTANCE: self.wanted_names,
3543         }
3544       self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE
3545     else:
3546       self.wanted_names = None
3547       self.needed_locks = {
3548         locking.LEVEL_NODE_RES: locking.ALL_SET,
3549         locking.LEVEL_INSTANCE: locking.ALL_SET,
3550         }
3551     self.share_locks = {
3552       locking.LEVEL_NODE_RES: 1,
3553       locking.LEVEL_INSTANCE: 0,
3554       }
3556 def DeclareLocks(self, level):
3557 if level == locking.LEVEL_NODE_RES and self.wanted_names is not None:
3558 self._LockInstancesNodes(primary_only=True, level=level)
3560 def CheckPrereq(self):
3561 """Check prerequisites.
3563     This only checks the optional instance list against the existing names.
3565     """
3566 if self.wanted_names is None:
3567 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3569 self.wanted_instances = \
3570 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3572 def _EnsureChildSizes(self, disk):
3573 """Ensure children of the disk have the needed disk size.
3575 This is valid mainly for DRBD8 and fixes an issue where the
3576 children have smaller disk size.
3578     @param disk: an L{ganeti.objects.Disk} object
3580     """
3581 if disk.dev_type == constants.LD_DRBD8:
3582 assert disk.children, "Empty children for DRBD8?"
3583 fchild = disk.children[0]
3584       mismatch = fchild.size < disk.size
3585       if mismatch:
3586         self.LogInfo("Child disk has size %d, parent %d, fixing",
3587                      fchild.size, disk.size)
3588         fchild.size = disk.size
3590       # and we recurse on this child only, not on the metadev
3591       return self._EnsureChildSizes(fchild) or mismatch
3592     else:
3593       return False
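  # Illustrative example (added sketch with hypothetical sizes): for a DRBD8
  # disk whose data child was left at 9216 MiB after the parent grew to
  # 10240 MiB, the method above bumps the child to 10240 and returns True,
  # so the caller knows the configuration needs to be written out.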
3595 def Exec(self, feedback_fn):
3596 """Verify the size of cluster disks.
3599 # TODO: check child disks too
3600     # TODO: check differences in size between primary/secondary nodes
3601     per_node_disks = {}
3602 for instance in self.wanted_instances:
3603 pnode = instance.primary_node
3604 if pnode not in per_node_disks:
3605 per_node_disks[pnode] = []
3606 for idx, disk in enumerate(instance.disks):
3607 per_node_disks[pnode].append((instance, idx, disk))
3609 assert not (frozenset(per_node_disks.keys()) -
3610 self.owned_locks(locking.LEVEL_NODE_RES)), \
3611 "Not owning correct locks"
3612     assert not self.owned_locks(locking.LEVEL_NODE)
3614     changed = []
3615 for node, dskl in per_node_disks.items():
3616       newl = [v[2].Copy() for v in dskl]
3617       for dsk in newl:
3618         self.cfg.SetDiskID(dsk, node)
3619       result = self.rpc.call_blockdev_getsize(node, newl)
3620       if result.fail_msg:
3621         self.LogWarning("Failure in blockdev_getsize call to node"
3622                         " %s, ignoring", node)
3623         continue
3624       if len(result.payload) != len(dskl):
3625         logging.warning("Invalid result from node %s: len(dskl)=%d,"
3626                         " result.payload=%s", node, len(dskl), result.payload)
3627         self.LogWarning("Invalid result from node %s, ignoring node results",
3628                         node)
3629         continue
3630       for ((instance, idx, disk), size) in zip(dskl, result.payload):
3631         if size is None:
3632           self.LogWarning("Disk %d of instance %s did not return size"
3633                           " information, ignoring", idx, instance.name)
3634           continue
3635         if not isinstance(size, (int, long)):
3636           self.LogWarning("Disk %d of instance %s did not return valid"
3637                           " size information, ignoring", idx, instance.name)
3638           continue
3640         if size != disk.size:
3641           self.LogInfo("Disk %d of instance %s has mismatched size,"
3642                        " correcting: recorded %d, actual %d", idx,
3643                        instance.name, disk.size, size)
3644           disk.size = size
3645           self.cfg.Update(instance, feedback_fn)
3646           changed.append((instance.name, idx, size))
3647         if self._EnsureChildSizes(disk):
3648           self.cfg.Update(instance, feedback_fn)
3649           changed.append((instance.name, idx, disk.size))
3651     return changed
3653 class LUClusterRename(LogicalUnit):
3654 """Rename the cluster.
3657 HPATH = "cluster-rename"
3658 HTYPE = constants.HTYPE_CLUSTER
3660   def BuildHooksEnv(self):
3661     """Build hooks env.
3663     """
3664     return {
3665       "OP_TARGET": self.cfg.GetClusterName(),
3666       "NEW_NAME": self.op.name,
3667       }
3669 def BuildHooksNodes(self):
3670 """Build hooks nodes.
3673 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3675 def CheckPrereq(self):
3676 """Verify that the passed name is a valid one.
3679 hostname = netutils.GetHostname(name=self.op.name,
3680 family=self.cfg.GetPrimaryIPFamily())
3682 new_name = hostname.name
3683 self.ip = new_ip = hostname.ip
3684 old_name = self.cfg.GetClusterName()
3685 old_ip = self.cfg.GetMasterIP()
3686 if new_name == old_name and new_ip == old_ip:
3687 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3688 " cluster has changed",
3690 if new_ip != old_ip:
3691 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3692 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3693 " reachable on the network" %
3694 new_ip, errors.ECODE_NOTUNIQUE)
3696 self.op.name = new_name
3698 def Exec(self, feedback_fn):
3699 """Rename the cluster.
3702 clustername = self.op.name
3705 # shutdown the master IP
3706 master_params = self.cfg.GetMasterNetworkParameters()
3707 ems = self.cfg.GetUseExternalMipScript()
3708     result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3709                                                      master_params, ems)
3710     result.Raise("Could not disable the master role")
3712     try:
3713       cluster = self.cfg.GetClusterInfo()
3714       cluster.cluster_name = clustername
3715       cluster.master_ip = new_ip
3716       self.cfg.Update(cluster, feedback_fn)
3718       # update the known hosts file
3719       ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3720       node_list = self.cfg.GetOnlineNodeList()
3721       try:
3722         node_list.remove(master_params.name)
3723       except ValueError:
3724         pass
3725       _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3726     finally:
3727       master_params.ip = new_ip
3728       result = self.rpc.call_node_activate_master_ip(master_params.name,
3729                                                      master_params, ems)
3730       msg = result.fail_msg
3731       if msg:
3732         self.LogWarning("Could not re-enable the master role on"
3733                         " the master, please restart manually: %s", msg)
3735     return clustername
3738 def _ValidateNetmask(cfg, netmask):
3739 """Checks if a netmask is valid.
3741 @type cfg: L{config.ConfigWriter}
3742 @param cfg: The cluster configuration
3743 @type netmask: int
3744 @param netmask: the netmask to be verified
3745 @raise errors.OpPrereqError: if the validation fails
3747 """
3748 ip_family = cfg.GetPrimaryIPFamily()
3749 try:
3750 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3751 except errors.ProgrammerError:
3752 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3753 ip_family, errors.ECODE_INVAL)
3754 if not ipcls.ValidateNetmask(netmask):
3755 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3756 netmask, errors.ECODE_INVAL)
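# A hedged usage sketch of the check above (hypothetical helper; netmask
# values are plain CIDR prefix lengths, e.g. 24 for IPv4):
def _ExampleNetmaskCheck(cfg):
  """Illustrative only; logs which of two sample netmasks is rejected."""
  for netmask in (24, 99):
    try:
      _ValidateNetmask(cfg, netmask)
    except errors.OpPrereqError, err:
      logging.debug("Netmask %s rejected: %s", netmask, err)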
3759 class LUClusterSetParams(LogicalUnit):
3760 """Change the parameters of the cluster.
3763 HPATH = "cluster-modify"
3764 HTYPE = constants.HTYPE_CLUSTER
3767 def CheckArguments(self):
3768 """Check parameters
3770 """
3771 if self.op.uid_pool:
3772 uidpool.CheckUidPool(self.op.uid_pool)
3774 if self.op.add_uids:
3775 uidpool.CheckUidPool(self.op.add_uids)
3777 if self.op.remove_uids:
3778 uidpool.CheckUidPool(self.op.remove_uids)
3780 if self.op.master_netmask is not None:
3781 _ValidateNetmask(self.cfg, self.op.master_netmask)
3783 if self.op.diskparams:
3784 for dt_params in self.op.diskparams.values():
3785 utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
3787 def ExpandNames(self):
3788 # FIXME: in the future maybe other cluster params won't require checking on
3789 # all nodes to be modified.
3790 self.needed_locks = {
3791 locking.LEVEL_NODE: locking.ALL_SET,
3792 }
3793 self.share_locks[locking.LEVEL_NODE] = 1
3795 def BuildHooksEnv(self):
3796 """Build hooks env.
3798 """
3799 return {
3800 "OP_TARGET": self.cfg.GetClusterName(),
3801 "NEW_VG_NAME": self.op.vg_name,
3802 }
3804 def BuildHooksNodes(self):
3805 """Build hooks nodes.
3808 mn = self.cfg.GetMasterNode()
3811 def CheckPrereq(self):
3812 """Check prerequisites.
3814 This checks whether the given params don't conflict and
3815 if the given volume group is valid.
3817 """
3818 if self.op.vg_name is not None and not self.op.vg_name:
3819 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3820 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3821 " instances exist", errors.ECODE_INVAL)
3823 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3824 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3825 raise errors.OpPrereqError("Cannot disable drbd helper while"
3826 " drbd-based instances exist",
3829 node_list = self.owned_locks(locking.LEVEL_NODE)
3831 # if vg_name not None, checks given volume group on all nodes
3832 if self.op.vg_name:
3833 vglist = self.rpc.call_vg_list(node_list)
3834 for node in node_list:
3835 msg = vglist[node].fail_msg
3836 if msg:
3837 # ignoring down node
3838 self.LogWarning("Error while gathering data on node %s"
3839 " (ignoring node): %s", node, msg)
3840 continue
3841 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3842 self.op.vg_name,
3843 constants.MIN_VG_SIZE)
3844 if vgstatus:
3845 raise errors.OpPrereqError("Error on node '%s': %s" %
3846 (node, vgstatus), errors.ECODE_ENVIRON)
3848 if self.op.drbd_helper:
3849 # checks given drbd helper on all nodes
3850 helpers = self.rpc.call_drbd_helper(node_list)
3851 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3852 if ninfo.offline:
3853 self.LogInfo("Not checking drbd helper on offline node %s", node)
3854 continue
3855 msg = helpers[node].fail_msg
3856 if msg:
3857 raise errors.OpPrereqError("Error checking drbd helper on node"
3858 " '%s': %s" % (node, msg),
3859 errors.ECODE_ENVIRON)
3860 node_helper = helpers[node].payload
3861 if node_helper != self.op.drbd_helper:
3862 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3863 (node, node_helper), errors.ECODE_ENVIRON)
3865 self.cluster = cluster = self.cfg.GetClusterInfo()
3866 # validate params changes
3867 if self.op.beparams:
3868 objects.UpgradeBeParams(self.op.beparams)
3869 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3870 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3872 if self.op.ndparams:
3873 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3874 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3876 # TODO: we need a more general way to handle resetting
3877 # cluster-level parameters to default values
3878 if self.new_ndparams["oob_program"] == "":
3879 self.new_ndparams["oob_program"] = \
3880 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3882 if self.op.hv_state:
3883 new_hv_state = _MergeAndVerifyHvState(self.op.hv_state,
3884 self.cluster.hv_state_static)
3885 self.new_hv_state = dict((hv, cluster.SimpleFillHvState(values))
3886 for hv, values in new_hv_state.items())
3888 if self.op.disk_state:
3889 new_disk_state = _MergeAndVerifyDiskState(self.op.disk_state,
3890 self.cluster.disk_state_static)
3891 self.new_disk_state = \
3892 dict((storage, dict((name, cluster.SimpleFillDiskState(values))
3893 for name, values in svalues.items()))
3894 for storage, svalues in new_disk_state.items())
3896 if self.op.ipolicy:
3897 self.new_ipolicy = _GetUpdatedIPolicy(cluster.ipolicy, self.op.ipolicy,
3898 group_policy=False)
3900 if self.op.nicparams:
3901 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3902 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3903 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3905 nic_errors = []
3906 # check all instances for consistency
3907 for instance in self.cfg.GetAllInstancesInfo().values():
3908 for nic_idx, nic in enumerate(instance.nics):
3909 params_copy = copy.deepcopy(nic.nicparams)
3910 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3912 # check parameter syntax
3913 try:
3914 objects.NIC.CheckParameterSyntax(params_filled)
3915 except errors.ConfigurationError, err:
3916 nic_errors.append("Instance %s, nic/%d: %s" %
3917 (instance.name, nic_idx, err))
3919 # if we're moving instances to routed, check that they have an ip
3920 target_mode = params_filled[constants.NIC_MODE]
3921 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3922 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3923 " address" % (instance.name, nic_idx))
3924 if nic_errors:
3925 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3926 "\n".join(nic_errors))
3928 # hypervisor list/parameters
3929 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3930 if self.op.hvparams:
3931 for hv_name, hv_dict in self.op.hvparams.items():
3932 if hv_name not in self.new_hvparams:
3933 self.new_hvparams[hv_name] = hv_dict
3934 else:
3935 self.new_hvparams[hv_name].update(hv_dict)
3937 # disk template parameters
3938 self.new_diskparams = objects.FillDict(cluster.diskparams, {})
3939 if self.op.diskparams:
3940 for dt_name, dt_params in self.op.diskparams.items():
3941 if dt_name not in self.new_diskparams:
3942 self.new_diskparams[dt_name] = dt_params
3943 else:
3944 self.new_diskparams[dt_name].update(dt_params)
3946 # os hypervisor parameters
3947 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3948 if self.op.os_hvp:
3949 for os_name, hvs in self.op.os_hvp.items():
3950 if os_name not in self.new_os_hvp:
3951 self.new_os_hvp[os_name] = hvs
3952 else:
3953 for hv_name, hv_dict in hvs.items():
3954 if hv_name not in self.new_os_hvp[os_name]:
3955 self.new_os_hvp[os_name][hv_name] = hv_dict
3956 else:
3957 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3960 self.new_osp = objects.FillDict(cluster.osparams, {})
3961 if self.op.osparams:
3962 for os_name, osp in self.op.osparams.items():
3963 if os_name not in self.new_osp:
3964 self.new_osp[os_name] = {}
3966 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3967 use_none=True)
3969 if not self.new_osp[os_name]:
3970 # we removed all parameters
3971 del self.new_osp[os_name]
3972 else:
3973 # check the parameter validity (remote check)
3974 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3975 os_name, self.new_osp[os_name])
3977 # changes to the hypervisor list
3978 if self.op.enabled_hypervisors is not None:
3979 self.hv_list = self.op.enabled_hypervisors
3980 for hv in self.hv_list:
3981 # if the hypervisor doesn't already exist in the cluster
3982 # hvparams, we initialize it to empty, and then (in both
3983 # cases) we make sure to fill the defaults, as we might not
3984 # have a complete defaults list if the hypervisor wasn't
3985 # enabled before
3986 if hv not in new_hvp:
3987 new_hvp[hv] = {}
3988 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3989 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3990 else:
3991 self.hv_list = cluster.enabled_hypervisors
3993 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3994 # either the enabled list has changed, or the parameters have, validate
3995 for hv_name, hv_params in self.new_hvparams.items():
3996 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3997 (self.op.enabled_hypervisors and
3998 hv_name in self.op.enabled_hypervisors)):
3999 # either this is a new hypervisor, or its parameters have changed
4000 hv_class = hypervisor.GetHypervisor(hv_name)
4001 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
4002 hv_class.CheckParameterSyntax(hv_params)
4003 _CheckHVParams(self, node_list, hv_name, hv_params)
4005 if self.op.os_hvp:
4006 # no need to check any newly-enabled hypervisors, since the
4007 # defaults have already been checked in the above code-block
4008 for os_name, os_hvp in self.new_os_hvp.items():
4009 for hv_name, hv_params in os_hvp.items():
4010 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
4011 # we need to fill in the new os_hvp on top of the actual hv_p
4012 cluster_defaults = self.new_hvparams.get(hv_name, {})
4013 new_osp = objects.FillDict(cluster_defaults, hv_params)
4014 hv_class = hypervisor.GetHypervisor(hv_name)
4015 hv_class.CheckParameterSyntax(new_osp)
4016 _CheckHVParams(self, node_list, hv_name, new_osp)
4018 if self.op.default_iallocator:
4019 alloc_script = utils.FindFile(self.op.default_iallocator,
4020 constants.IALLOCATOR_SEARCH_PATH,
4021 os.path.isfile)
4022 if alloc_script is None:
4023 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
4024 " specified" % self.op.default_iallocator,
4027 def Exec(self, feedback_fn):
4028 """Change the parameters of the cluster.
4031 if self.op.vg_name is not None:
4032 new_volume = self.op.vg_name
4033 if not new_volume:
4034 new_volume = None
4035 if new_volume != self.cfg.GetVGName():
4036 self.cfg.SetVGName(new_volume)
4037 else:
4038 feedback_fn("Cluster LVM configuration already in desired"
4039 " state, not changing")
4040 if self.op.drbd_helper is not None:
4041 new_helper = self.op.drbd_helper
4042 if not new_helper:
4043 new_helper = None
4044 if new_helper != self.cfg.GetDRBDHelper():
4045 self.cfg.SetDRBDHelper(new_helper)
4046 else:
4047 feedback_fn("Cluster DRBD helper already in desired state,"
4048 " not changing")
4049 if self.op.hvparams:
4050 self.cluster.hvparams = self.new_hvparams
4051 if self.op.os_hvp:
4052 self.cluster.os_hvp = self.new_os_hvp
4053 if self.op.enabled_hypervisors is not None:
4054 self.cluster.hvparams = self.new_hvparams
4055 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
4056 if self.op.beparams:
4057 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
4058 if self.op.nicparams:
4059 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
4060 if self.op.ipolicy:
4061 self.cluster.ipolicy = self.new_ipolicy
4062 if self.op.osparams:
4063 self.cluster.osparams = self.new_osp
4064 if self.op.ndparams:
4065 self.cluster.ndparams = self.new_ndparams
4066 if self.op.diskparams:
4067 self.cluster.diskparams = self.new_diskparams
4068 if self.op.hv_state:
4069 self.cluster.hv_state_static = self.new_hv_state
4070 if self.op.disk_state:
4071 self.cluster.disk_state_static = self.new_disk_state
4073 if self.op.candidate_pool_size is not None:
4074 self.cluster.candidate_pool_size = self.op.candidate_pool_size
4075 # we need to update the pool size here, otherwise the save will fail
4076 _AdjustCandidatePool(self, [])
4078 if self.op.maintain_node_health is not None:
4079 if self.op.maintain_node_health and not constants.ENABLE_CONFD:
4080 feedback_fn("Note: CONFD was disabled at build time, node health"
4081 " maintenance is not useful (still enabling it)")
4082 self.cluster.maintain_node_health = self.op.maintain_node_health
4084 if self.op.prealloc_wipe_disks is not None:
4085 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
4087 if self.op.add_uids is not None:
4088 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
4090 if self.op.remove_uids is not None:
4091 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
4093 if self.op.uid_pool is not None:
4094 self.cluster.uid_pool = self.op.uid_pool
4096 if self.op.default_iallocator is not None:
4097 self.cluster.default_iallocator = self.op.default_iallocator
4099 if self.op.reserved_lvs is not None:
4100 self.cluster.reserved_lvs = self.op.reserved_lvs
4102 if self.op.use_external_mip_script is not None:
4103 self.cluster.use_external_mip_script = self.op.use_external_mip_script
4105 def helper_os(aname, mods, desc):
4106 desc += " OS list"
4107 lst = getattr(self.cluster, aname)
4108 for key, val in mods:
4109 if key == constants.DDM_ADD:
4110 if val in lst:
4111 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
4112 else:
4113 lst.append(val)
4114 elif key == constants.DDM_REMOVE:
4115 if val in lst:
4116 lst.remove(val)
4117 else:
4118 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
4119 else:
4120 raise errors.ProgrammerError("Invalid modification '%s'" % key)
4122 if self.op.hidden_os:
4123 helper_os("hidden_os", self.op.hidden_os, "hidden")
4125 if self.op.blacklisted_os:
4126 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
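# For reference, a hedged sketch of the modification lists consumed by
# helper_os above: pairs of (DDM_* action, OS name); the values here are
# made up for illustration. For example,
# self.op.hidden_os = [(constants.DDM_ADD, "debian-image"),
#                      (constants.DDM_REMOVE, "old-os")]
# would hide one OS and unhide another, with feedback_fn noting no-ops.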
4128 if self.op.master_netdev:
4129 master_params = self.cfg.GetMasterNetworkParameters()
4130 ems = self.cfg.GetUseExternalMipScript()
4131 feedback_fn("Shutting down master ip on the current netdev (%s)" %
4132 self.cluster.master_netdev)
4133 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
4134 master_params, ems)
4135 result.Raise("Could not disable the master ip")
4136 feedback_fn("Changing master_netdev from %s to %s" %
4137 (master_params.netdev, self.op.master_netdev))
4138 self.cluster.master_netdev = self.op.master_netdev
4140 if self.op.master_netmask:
4141 master_params = self.cfg.GetMasterNetworkParameters()
4142 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
4143 result = self.rpc.call_node_change_master_netmask(master_params.name,
4144 master_params.netmask,
4145 self.op.master_netmask,
4146 master_params.ip,
4147 master_params.netdev)
4148 if result.fail_msg:
4149 msg = "Could not change the master IP netmask: %s" % result.fail_msg
4150 feedback_fn(msg)
4152 self.cluster.master_netmask = self.op.master_netmask
4154 self.cfg.Update(self.cluster, feedback_fn)
4156 if self.op.master_netdev:
4157 master_params = self.cfg.GetMasterNetworkParameters()
4158 feedback_fn("Starting the master ip on the new master netdev (%s)" %
4159 self.op.master_netdev)
4160 ems = self.cfg.GetUseExternalMipScript()
4161 result = self.rpc.call_node_activate_master_ip(master_params.name,
4162 master_params, ems)
4163 if result.fail_msg:
4164 self.LogWarning("Could not re-enable the master ip on"
4165 " the master, please restart manually: %s",
4166 result.fail_msg)
4169 def _UploadHelper(lu, nodes, fname):
4170 """Helper for uploading a file and showing warnings.
4173 if os.path.exists(fname):
4174 result = lu.rpc.call_upload_file(nodes, fname)
4175 for to_node, to_result in result.items():
4176 msg = to_result.fail_msg
4178 msg = ("Copy of file %s to node %s failed: %s" %
4179 (fname, to_node, msg))
4180 lu.proc.LogWarning(msg)
4183 def _ComputeAncillaryFiles(cluster, redist):
4184 """Compute files external to Ganeti which need to be consistent.
4186 @type redist: boolean
4187 @param redist: Whether to include files which need to be redistributed
4189 """
4190 # Compute files for all nodes
4191 files_all = set([
4192 constants.SSH_KNOWN_HOSTS_FILE,
4193 constants.CONFD_HMAC_KEY,
4194 constants.CLUSTER_DOMAIN_SECRET_FILE,
4195 constants.SPICE_CERT_FILE,
4196 constants.SPICE_CACERT_FILE,
4197 constants.RAPI_USERS_FILE,
4198 ])
4200 if not redist:
4201 files_all.update(constants.ALL_CERT_FILES)
4202 files_all.update(ssconf.SimpleStore().GetFileList())
4203 else:
4204 # we need to ship at least the RAPI certificate
4205 files_all.add(constants.RAPI_CERT_FILE)
4207 if cluster.modify_etc_hosts:
4208 files_all.add(constants.ETC_HOSTS)
4210 # Files which are optional, these must:
4211 # - be present in one other category as well
4212 # - either exist or not exist on all nodes of that category (mc, vm all)
4213 files_opt = set([
4214 constants.RAPI_USERS_FILE,
4215 ])
4217 # Files which should only be on master candidates
4218 files_mc = set()
4220 if not redist:
4221 files_mc.add(constants.CLUSTER_CONF_FILE)
4223 # FIXME: this should also be replicated but Ganeti doesn't support files_mc
4224 # replication
4225 files_mc.add(constants.DEFAULT_MASTER_SETUP_SCRIPT)
4227 # Files which should only be on VM-capable nodes
4228 files_vm = set(filename
4229 for hv_name in cluster.enabled_hypervisors
4230 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
4232 files_opt |= set(filename
4233 for hv_name in cluster.enabled_hypervisors
4234 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
4236 # Filenames in each category must be unique
4237 all_files_set = files_all | files_mc | files_vm
4238 assert (len(all_files_set) ==
4239 sum(map(len, [files_all, files_mc, files_vm]))), \
4240 "Found file listed in more than one file list"
4242 # Optional files must be present in one other category
4243 assert all_files_set.issuperset(files_opt), \
4244 "Optional file not in a different required list"
4246 return (files_all, files_opt, files_mc, files_vm)
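# Illustrative sketch (not part of the original module) of consuming the
# tuple returned by _ComputeAncillaryFiles; the helper name is hypothetical.
def _ExampleLogAncillaryFiles(cluster):
  """Logs each ancillary file category for a cluster object."""
  (files_all, files_opt, files_mc, files_vm) = \
    _ComputeAncillaryFiles(cluster, True)
  for (desc, files) in [("all nodes", files_all),
                        ("optional", files_opt),
                        ("master candidates", files_mc),
                        ("vm-capable nodes", files_vm)]:
    logging.debug("Ancillary files for %s: %s",
                  desc, utils.CommaJoin(sorted(files)))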
4249 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
4250 """Distribute additional files which are part of the cluster configuration.
4252 ConfigWriter takes care of distributing the config and ssconf files, but
4253 there are more files which should be distributed to all nodes. This function
4254 makes sure those are copied.
4256 @param lu: calling logical unit
4257 @param additional_nodes: list of nodes not in the config to distribute to
4258 @type additional_vm: boolean
4259 @param additional_vm: whether the additional nodes are vm-capable or not
4261 """
4262 # Gather target nodes
4263 cluster = lu.cfg.GetClusterInfo()
4264 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
4266 online_nodes = lu.cfg.GetOnlineNodeList()
4267 vm_nodes = lu.cfg.GetVmCapableNodeList()
4269 if additional_nodes is not None:
4270 online_nodes.extend(additional_nodes)
4271 if additional_vm:
4272 vm_nodes.extend(additional_nodes)
4274 # Never distribute to master node
4275 for nodelist in [online_nodes, vm_nodes]:
4276 if master_info.name in nodelist:
4277 nodelist.remove(master_info.name)
4280 (files_all, _, files_mc, files_vm) = \
4281 _ComputeAncillaryFiles(cluster, True)
4283 # Never re-distribute configuration file from here
4284 assert not (constants.CLUSTER_CONF_FILE in files_all or
4285 constants.CLUSTER_CONF_FILE in files_vm)
4286 assert not files_mc, "Master candidates not handled in this function"
4288 filemap = [
4289 (online_nodes, files_all),
4290 (vm_nodes, files_vm),
4291 ]
4294 for (node_list, files) in filemap:
4295 for fname in files:
4296 _UploadHelper(lu, node_list, fname)
4299 class LUClusterRedistConf(NoHooksLU):
4300 """Force the redistribution of cluster configuration.
4302 This is a very simple LU.
4304 """
4307 def ExpandNames(self):
4308 self.needed_locks = {
4309 locking.LEVEL_NODE: locking.ALL_SET,
4310 }
4311 self.share_locks[locking.LEVEL_NODE] = 1
4313 def Exec(self, feedback_fn):
4314 """Redistribute the configuration.
4317 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
4318 _RedistributeAncillaryFiles(self)
4321 class LUClusterActivateMasterIp(NoHooksLU):
4322 """Activate the master IP on the master node.
4325 def Exec(self, feedback_fn):
4326 """Activate the master IP.
4329 master_params = self.cfg.GetMasterNetworkParameters()
4330 ems = self.cfg.GetUseExternalMipScript()
4331 result = self.rpc.call_node_activate_master_ip(master_params.name,
4332 master_params, ems)
4333 result.Raise("Could not activate the master IP")
4336 class LUClusterDeactivateMasterIp(NoHooksLU):
4337 """Deactivate the master IP on the master node.
4340 def Exec(self, feedback_fn):
4341 """Deactivate the master IP.
4344 master_params = self.cfg.GetMasterNetworkParameters()
4345 ems = self.cfg.GetUseExternalMipScript()
4346 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
4347 master_params, ems)
4348 result.Raise("Could not deactivate the master IP")
4351 def _WaitForSync(lu, instance, disks=None, oneshot=False):
4352 """Sleep and poll for an instance's disk to sync.
4355 if not instance.disks or disks is not None and not disks:
4356 return True
4358 disks = _ExpandCheckDisks(instance, disks)
4360 if not oneshot:
4361 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
4363 node = instance.primary_node
4365 for dev in disks:
4366 lu.cfg.SetDiskID(dev, node)
4368 # TODO: Convert to utils.Retry
4370 retries = 0
4371 degr_retries = 10 # in seconds, as we sleep 1 second each time
4372 while True:
4373 max_time = 0
4374 done = True
4375 cumul_degraded = False
4376 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
4377 msg = rstats.fail_msg
4378 if msg:
4379 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
4380 retries += 1
4381 if retries >= 10:
4382 raise errors.RemoteError("Can't contact node %s for mirror data,"
4383 " aborting." % node)
4384 time.sleep(6)
4385 continue
4386 rstats = rstats.payload
4388 for i, mstat in enumerate(rstats):
4389 if mstat is None:
4390 lu.LogWarning("Can't compute data for node %s/%s",
4391 node, disks[i].iv_name)
4392 continue
4394 cumul_degraded = (cumul_degraded or
4395 (mstat.is_degraded and mstat.sync_percent is None))
4396 if mstat.sync_percent is not None:
4397 done = False
4398 if mstat.estimated_time is not None:
4399 rem_time = ("%s remaining (estimated)" %
4400 utils.FormatSeconds(mstat.estimated_time))
4401 max_time = mstat.estimated_time
4402 else:
4403 rem_time = "no time estimate"
4404 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
4405 (disks[i].iv_name, mstat.sync_percent, rem_time))
4407 # if we're done but degraded, let's do a few small retries, to
4408 # make sure we see a stable and not transient situation; therefore
4409 # we force restart of the loop
4410 if (done or oneshot) and cumul_degraded and degr_retries > 0:
4411 logging.info("Degraded disks found, %d retries left", degr_retries)
4412 degr_retries -= 1
4413 time.sleep(1)
4414 continue
4416 if done or oneshot:
4417 break
4419 time.sleep(min(60, max_time))
4421 if not oneshot:
4422 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
4423 return not cumul_degraded
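# The TODO above asks for a utils.Retry conversion; as a hedged sketch of
# the general shape (hypothetical helper, not the actual conversion), the
# oneshot mode can be wrapped in a simple bounded poll:
def _ExampleWaitUntilSynced(lu, instance, attempts=10, delay=6):
  """Illustrative only: polls _WaitForSync in oneshot mode."""
  for _ in range(attempts):
    if _WaitForSync(lu, instance, oneshot=True):
      return True
    time.sleep(delay)
  return False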
4426 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
4427 """Check that mirrors are not degraded.
4429 The ldisk parameter, if True, will change the test from the
4430 is_degraded attribute (which represents overall non-ok status for
4431 the device(s)) to the ldisk (representing the local storage status).
4433 """
4434 lu.cfg.SetDiskID(dev, node)
4436 result = True
4438 if on_primary or dev.AssembleOnSecondary():
4439 rstats = lu.rpc.call_blockdev_find(node, dev)
4440 msg = rstats.fail_msg
4441 if msg:
4442 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
4443 result = False
4444 elif not rstats.payload:
4445 lu.LogWarning("Can't find disk on node %s", node)
4446 result = False
4447 else:
4448 if ldisk:
4449 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4450 else:
4451 result = result and not rstats.payload.is_degraded
4453 if dev.children:
4454 for child in dev.children:
4455 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
4457 return result
4460 class LUOobCommand(NoHooksLU):
4461 """Logical unit for OOB handling.
4465 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4467 def ExpandNames(self):
4468 """Gather locks we need.
4471 if self.op.node_names:
4472 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4473 lock_names = self.op.node_names
4474 else:
4475 lock_names = locking.ALL_SET
4477 self.needed_locks = {
4478 locking.LEVEL_NODE: lock_names,
4479 }
4481 def CheckPrereq(self):
4482 """Check prerequisites.
4484 This checks:
4485 - the node exists in the configuration
4486 - OOB is supported
4488 Any errors are signaled by raising errors.OpPrereqError.
4490 """
4491 self.nodes = []
4492 self.master_node = self.cfg.GetMasterNode()
4494 assert self.op.power_delay >= 0.0
4496 if self.op.node_names:
4497 if (self.op.command in self._SKIP_MASTER and
4498 self.master_node in self.op.node_names):
4499 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4500 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4502 if master_oob_handler:
4503 additional_text = ("run '%s %s %s' if you want to operate on the"
4504 " master regardless") % (master_oob_handler,
4508 additional_text = "it does not support out-of-band operations"
4510 raise errors.OpPrereqError(("Operating on the master node %s is not"
4511 " allowed for %s; %s") %
4512 (self.master_node, self.op.command,
4513 additional_text), errors.ECODE_INVAL)
4514 else:
4515 self.op.node_names = self.cfg.GetNodeList()
4516 if self.op.command in self._SKIP_MASTER:
4517 self.op.node_names.remove(self.master_node)
4519 if self.op.command in self._SKIP_MASTER:
4520 assert self.master_node not in self.op.node_names
4522 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4523 if node is None:
4524 raise errors.OpPrereqError("Node %s not found" % node_name,
4525 errors.ECODE_NOENT)
4526 else:
4527 self.nodes.append(node)
4529 if (not self.op.ignore_status and
4530 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4531 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4532 " not marked offline") % node_name,
4535 def Exec(self, feedback_fn):
4536 """Execute OOB and return result if we expect any.
4539 master_node = self.master_node
4540 ret = []
4542 for idx, node in enumerate(utils.NiceSort(self.nodes,
4543 key=lambda node: node.name)):
4544 node_entry = [(constants.RS_NORMAL, node.name)]
4545 ret.append(node_entry)
4547 oob_program = _SupportsOob(self.cfg, node)
4549 if not oob_program:
4550 node_entry.append((constants.RS_UNAVAIL, None))
4551 continue
4553 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4554 self.op.command, oob_program, node.name)
4555 result = self.rpc.call_run_oob(master_node, oob_program,
4556 self.op.command, node.name,
4557 self.op.timeout)
4559 if result.fail_msg:
4560 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4561 node.name, result.fail_msg)
4562 node_entry.append((constants.RS_NODATA, None))
4563 else:
4564 try:
4565 self._CheckPayload(result)
4566 except errors.OpExecError, err:
4567 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4568 node.name, err)
4569 node_entry.append((constants.RS_NODATA, None))
4570 else:
4571 if self.op.command == constants.OOB_HEALTH:
4572 # For health we should log important events
4573 for item, status in result.payload:
4574 if status in [constants.OOB_STATUS_WARNING,
4575 constants.OOB_STATUS_CRITICAL]:
4576 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4577 item, node.name, status)
4579 if self.op.command == constants.OOB_POWER_ON:
4580 node.powered = True
4581 elif self.op.command == constants.OOB_POWER_OFF:
4582 node.powered = False
4583 elif self.op.command == constants.OOB_POWER_STATUS:
4584 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4585 if powered != node.powered:
4586 logging.warning(("Recorded power state (%s) of node '%s' does not"
4587 " match actual power state (%s)"), node.powered,
4590 # For configuration changing commands we should update the node
4591 if self.op.command in (constants.OOB_POWER_ON,
4592 constants.OOB_POWER_OFF):
4593 self.cfg.Update(node, feedback_fn)
4595 node_entry.append((constants.RS_NORMAL, result.payload))
4597 if (self.op.command == constants.OOB_POWER_ON and
4598 idx < len(self.nodes) - 1):
4599 time.sleep(self.op.power_delay)
4601 return ret
4603 def _CheckPayload(self, result):
4604 """Checks if the payload is valid.
4606 @param result: RPC result
4607 @raises errors.OpExecError: If payload is not valid
4609 """
4610 errs = []
4611 if self.op.command == constants.OOB_HEALTH:
4612 if not isinstance(result.payload, list):
4613 errs.append("command 'health' is expected to return a list but got %s" %
4614 type(result.payload))
4615 else:
4616 for item, status in result.payload:
4617 if status not in constants.OOB_STATUSES:
4618 errs.append("health item '%s' has invalid status '%s'" %
4621 if self.op.command == constants.OOB_POWER_STATUS:
4622 if not isinstance(result.payload, dict):
4623 errs.append("power-status is expected to return a dict but got %s" %
4624 type(result.payload))
4626 if self.op.command in [
4627 constants.OOB_POWER_ON,
4628 constants.OOB_POWER_OFF,
4629 constants.OOB_POWER_CYCLE,
4630 ]:
4631 if result.payload is not None:
4632 errs.append("%s is expected to not return payload but got '%s'" %
4633 (self.op.command, result.payload))
4635 if errs:
4636 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4637 utils.CommaJoin(errs))
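# For reference, a hedged illustration (made-up sample values) of payload
# shapes _CheckPayload accepts per command:
def _ExampleOobPayloads():
  """Illustrative only; maps OOB commands to sample valid payloads."""
  return {
    constants.OOB_HEALTH: [("disk0", constants.OOB_STATUS_OK)],
    constants.OOB_POWER_STATUS: {constants.OOB_POWER_STATUS_POWERED: True},
    constants.OOB_POWER_ON: None,
    constants.OOB_POWER_OFF: None,
    constants.OOB_POWER_CYCLE: None,
    }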
4640 class _OsQuery(_QueryBase):
4641 FIELDS = query.OS_FIELDS
4643 def ExpandNames(self, lu):
4644 # Lock all nodes in shared mode
4645 # Temporary removal of locks, should be reverted later
4646 # TODO: reintroduce locks when they are lighter-weight
4647 lu.needed_locks = {}
4648 #self.share_locks[locking.LEVEL_NODE] = 1
4649 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4651 # The following variables interact with _QueryBase._GetNames
4652 if self.names:
4653 self.wanted = self.names
4654 else:
4655 self.wanted = locking.ALL_SET
4657 self.do_locking = self.use_locking
4659 def DeclareLocks(self, lu, level):
4660 pass
4662 @staticmethod
4663 def _DiagnoseByOS(rlist):
4664 """Remaps a per-node return list into an a per-os per-node dictionary
4666 @param rlist: a map with node names as keys and OS objects as values
4669 @return: a dictionary with osnames as keys and as value another
4670 map, with nodes as keys and tuples of (path, status, diagnose,
4671 variants, parameters, api_versions) as values, eg::
4673 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4674 (/srv/..., False, "invalid api")],
4675 "node2": [(/srv/..., True, "", [], [])]}
4680 # we build here the list of nodes that didn't fail the RPC (at RPC
4681 # level), so that nodes with a non-responding node daemon don't
4682 # make all OSes invalid
4683 good_nodes = [node_name for node_name in rlist
4684 if not rlist[node_name].fail_msg]
4685 for node_name, nr in rlist.items():
4686 if nr.fail_msg or not nr.payload:
4687 continue
4688 for (name, path, status, diagnose, variants,
4689 params, api_versions) in nr.payload:
4690 if name not in all_os:
4691 # build a list of nodes for this os containing empty lists
4692 # for each node in node_list
4693 all_os[name] = {}
4694 for nname in good_nodes:
4695 all_os[name][nname] = []
4696 # convert params from [name, help] to (name, help)
4697 params = [tuple(v) for v in params]
4698 all_os[name][node_name].append((path, status, diagnose,
4699 variants, params, api_versions))
4700 return all_os
4702 def _GetQueryData(self, lu):
4703 """Computes the list of nodes and their attributes.
4706 # Locking is not used
4707 assert not (compat.any(lu.glm.is_owned(level)
4708 for level in locking.LEVELS
4709 if level != locking.LEVEL_CLUSTER) or
4710 self.do_locking or self.use_locking)
4712 valid_nodes = [node.name
4713 for node in lu.cfg.GetAllNodesInfo().values()
4714 if not node.offline and node.vm_capable]
4715 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4716 cluster = lu.cfg.GetClusterInfo()
4718 data = {}
4720 for (os_name, os_data) in pol.items():
4721 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4722 hidden=(os_name in cluster.hidden_os),
4723 blacklisted=(os_name in cluster.blacklisted_os))
4725 variants = set()
4726 parameters = set()
4727 api_versions = set()
4729 for idx, osl in enumerate(os_data.values()):
4730 info.valid = bool(info.valid and osl and osl[0][1])
4731 if not info.valid:
4732 break
4734 (node_variants, node_params, node_api) = osl[0][3:6]
4735 if idx == 0:
4736 # first entry
4737 variants.update(node_variants)
4738 parameters.update(node_params)
4739 api_versions.update(node_api)
4740 else:
4741 # Filter out inconsistent values
4742 variants.intersection_update(node_variants)
4743 parameters.intersection_update(node_params)
4744 api_versions.intersection_update(node_api)
4746 info.variants = list(variants)
4747 info.parameters = list(parameters)
4748 info.api_versions = list(api_versions)
4750 data[os_name] = info
4752 # Prepare data in requested order
4753 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4754 if name in data]
4757 class LUOsDiagnose(NoHooksLU):
4758 """Logical unit for OS diagnose/query.
4764 def _BuildFilter(fields, names):
4765 """Builds a filter for querying OSes.
4768 name_filter = qlang.MakeSimpleFilter("name", names)
4770 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4771 # respective field is not requested
4772 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4773 for fname in ["hidden", "blacklisted"]
4774 if fname not in fields]
4775 if "valid" not in fields:
4776 status_filter.append([qlang.OP_TRUE, "valid"])
4778 if status_filter:
4779 status_filter.insert(0, qlang.OP_AND)
4780 else:
4781 status_filter = None
4783 if name_filter and status_filter:
4784 return [qlang.OP_AND, name_filter, status_filter]
4785 elif name_filter:
4786 return name_filter
4787 else:
4788 return status_filter
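# Hedged illustration: for fields=["name"] and names=["debian-image"], the
# filter built above would resemble (assuming qlang.MakeSimpleFilter emits
# the [OP_OR, [OP_EQUAL, ...]] shape):
# [qlang.OP_AND,
#  [qlang.OP_OR, [qlang.OP_EQUAL, "name", "debian-image"]],
#  [qlang.OP_AND,
#   [qlang.OP_NOT, [qlang.OP_TRUE, "hidden"]],
#   [qlang.OP_NOT, [qlang.OP_TRUE, "blacklisted"]],
#   [qlang.OP_TRUE, "valid"]]]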
4790 def CheckArguments(self):
4791 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4792 self.op.output_fields, False)
4794 def ExpandNames(self):
4795 self.oq.ExpandNames(self)
4797 def Exec(self, feedback_fn):
4798 return self.oq.OldStyleQuery(self)
4801 class LUNodeRemove(LogicalUnit):
4802 """Logical unit for removing a node.
4805 HPATH = "node-remove"
4806 HTYPE = constants.HTYPE_NODE
4808 def BuildHooksEnv(self):
4809 """Build hooks env.
4811 This doesn't run on the target node in the pre phase as a failed
4812 node would then be impossible to remove.
4814 """
4815 return {
4816 "OP_TARGET": self.op.node_name,
4817 "NODE_NAME": self.op.node_name,
4818 }
4820 def BuildHooksNodes(self):
4821 """Build hooks nodes.
4824 all_nodes = self.cfg.GetNodeList()
4825 try:
4826 all_nodes.remove(self.op.node_name)
4827 except ValueError:
4828 logging.warning("Node '%s', which is about to be removed, was not found"
4829 " in the list of all nodes", self.op.node_name)
4830 return (all_nodes, all_nodes)
4832 def CheckPrereq(self):
4833 """Check prerequisites.
4835 This checks:
4836 - the node exists in the configuration
4837 - it does not have primary or secondary instances
4838 - it's not the master
4840 Any errors are signaled by raising errors.OpPrereqError.
4842 """
4843 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4844 node = self.cfg.GetNodeInfo(self.op.node_name)
4845 assert node is not None
4847 masternode = self.cfg.GetMasterNode()
4848 if node.name == masternode:
4849 raise errors.OpPrereqError("Node is the master node, failover to another"
4850 " node is required", errors.ECODE_INVAL)
4852 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4853 if node.name in instance.all_nodes:
4854 raise errors.OpPrereqError("Instance %s is still running on the node,"
4855 " please remove first" % instance_name,
4857 self.op.node_name = node.name
4860 def Exec(self, feedback_fn):
4861 """Removes the node from the cluster.
4865 logging.info("Stopping the node daemon and removing configs from node %s",
4868 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4870 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
4871 "Not owning BGL"
4873 # Promote nodes to master candidate as needed
4874 _AdjustCandidatePool(self, exceptions=[node.name])
4875 self.context.RemoveNode(node.name)
4877 # Run post hooks on the node before it's removed
4878 _RunPostHook(self, node.name)
4880 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4881 msg = result.fail_msg
4882 if msg:
4883 self.LogWarning("Errors encountered on the remote node while leaving"
4884 " the cluster: %s", msg)
4886 # Remove node from our /etc/hosts
4887 if self.cfg.GetClusterInfo().modify_etc_hosts:
4888 master_node = self.cfg.GetMasterNode()
4889 result = self.rpc.call_etc_hosts_modify(master_node,
4890 constants.ETC_HOSTS_REMOVE,
4891 node.name, None)
4892 result.Raise("Can't update hosts file with new host data")
4893 _RedistributeAncillaryFiles(self)
4896 class _NodeQuery(_QueryBase):
4897 FIELDS = query.NODE_FIELDS
4899 def ExpandNames(self, lu):
4900 lu.needed_locks = {}
4901 lu.share_locks = _ShareAll()
4903 if self.names:
4904 self.wanted = _GetWantedNodes(lu, self.names)
4905 else:
4906 self.wanted = locking.ALL_SET
4908 self.do_locking = (self.use_locking and
4909 query.NQ_LIVE in self.requested_data)
4910 if self.do_locking:
4912 # If any non-static field is requested we need to lock the nodes
4913 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4915 def DeclareLocks(self, lu, level):
4916 pass
4918 def _GetQueryData(self, lu):
4919 """Computes the list of nodes and their attributes.
4922 all_info = lu.cfg.GetAllNodesInfo()
4924 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4926 # Gather data as requested
4927 if query.NQ_LIVE in self.requested_data:
4928 # filter out non-vm_capable nodes
4929 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4931 node_data = lu.rpc.call_node_info(toquery_nodes, [lu.cfg.GetVGName()],
4932 [lu.cfg.GetHypervisorType()])
4933 live_data = dict((name, _MakeLegacyNodeInfo(nresult.payload))
4934 for (name, nresult) in node_data.items()
4935 if not nresult.fail_msg and nresult.payload)
4936 else:
4937 live_data = None
4939 if query.NQ_INST in self.requested_data:
4940 node_to_primary = dict([(name, set()) for name in nodenames])
4941 node_to_secondary = dict([(name, set()) for name in nodenames])
4943 inst_data = lu.cfg.GetAllInstancesInfo()
4945 for inst in inst_data.values():
4946 if inst.primary_node in node_to_primary:
4947 node_to_primary[inst.primary_node].add(inst.name)
4948 for secnode in inst.secondary_nodes:
4949 if secnode in node_to_secondary:
4950 node_to_secondary[secnode].add(inst.name)
4951 else:
4952 node_to_primary = None
4953 node_to_secondary = None
4955 if query.NQ_OOB in self.requested_data:
4956 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4957 for name, node in all_info.iteritems())
4958 else:
4959 oob_support = None
4961 if query.NQ_GROUP in self.requested_data:
4962 groups = lu.cfg.GetAllNodeGroupsInfo()
4963 else:
4964 groups = {}
4966 return query.NodeQueryData([all_info[name] for name in nodenames],
4967 live_data, lu.cfg.GetMasterNode(),
4968 node_to_primary, node_to_secondary, groups,
4969 oob_support, lu.cfg.GetClusterInfo())
4972 class LUNodeQuery(NoHooksLU):
4973 """Logical unit for querying nodes.
4976 # pylint: disable=W0142
4979 def CheckArguments(self):
4980 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4981 self.op.output_fields, self.op.use_locking)
4983 def ExpandNames(self):
4984 self.nq.ExpandNames(self)
4986 def DeclareLocks(self, level):
4987 self.nq.DeclareLocks(self, level)
4989 def Exec(self, feedback_fn):
4990 return self.nq.OldStyleQuery(self)
4993 class LUNodeQueryvols(NoHooksLU):
4994 """Logical unit for getting volumes on node(s).
4998 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4999 _FIELDS_STATIC = utils.FieldSet("node")
5001 def CheckArguments(self):
5002 _CheckOutputFields(static=self._FIELDS_STATIC,
5003 dynamic=self._FIELDS_DYNAMIC,
5004 selected=self.op.output_fields)
5006 def ExpandNames(self):
5007 self.share_locks = _ShareAll()
5008 self.needed_locks = {}
5010 if not self.op.nodes:
5011 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
5012 else:
5013 self.needed_locks[locking.LEVEL_NODE] = \
5014 _GetWantedNodes(self, self.op.nodes)
5016 def Exec(self, feedback_fn):
5017 """Computes the list of nodes and their attributes.
5020 nodenames = self.owned_locks(locking.LEVEL_NODE)
5021 volumes = self.rpc.call_node_volumes(nodenames)
5023 ilist = self.cfg.GetAllInstancesInfo()
5024 vol2inst = _MapInstanceDisksToNodes(ilist.values())
5026 output = []
5027 for node in nodenames:
5028 nresult = volumes[node]
5029 if nresult.offline:
5030 continue
5031 msg = nresult.fail_msg
5032 if msg:
5033 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
5034 continue
5036 node_vols = sorted(nresult.payload,
5037 key=operator.itemgetter("dev"))
5039 for vol in node_vols:
5040 node_output = []
5041 for field in self.op.output_fields:
5042 if field == "node":
5043 val = node
5044 elif field == "phys":
5045 val = vol["dev"]
5046 elif field == "vg":
5047 val = vol["vg"]
5048 elif field == "name":
5049 val = vol["name"]
5050 elif field == "size":
5051 val = int(float(vol["size"]))
5052 elif field == "instance":
5053 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
5054 else:
5055 raise errors.ParameterError(field)
5056 node_output.append(str(val))
5058 output.append(node_output)
5060 return output
5063 class LUNodeQueryStorage(NoHooksLU):
5064 """Logical unit for getting information on storage units on node(s).
5067 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
5070 def CheckArguments(self):
5071 _CheckOutputFields(static=self._FIELDS_STATIC,
5072 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
5073 selected=self.op.output_fields)
5075 def ExpandNames(self):
5076 self.share_locks = _ShareAll()
5077 self.needed_locks = {}
5079 if self.op.nodes:
5080 self.needed_locks[locking.LEVEL_NODE] = \
5081 _GetWantedNodes(self, self.op.nodes)
5082 else:
5083 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
5085 def Exec(self, feedback_fn):
5086 """Computes the list of nodes and their attributes.
5089 self.nodes = self.owned_locks(locking.LEVEL_NODE)
5091 # Always get name to sort by
5092 if constants.SF_NAME in self.op.output_fields:
5093 fields = self.op.output_fields[:]
5094 else:
5095 fields = [constants.SF_NAME] + self.op.output_fields
5097 # Never ask for node or type as it's only known to the LU
5098 for extra in [constants.SF_NODE, constants.SF_TYPE]:
5099 while extra in fields:
5100 fields.remove(extra)
5102 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
5103 name_idx = field_idx[constants.SF_NAME]
5105 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
5106 data = self.rpc.call_storage_list(self.nodes,
5107 self.op.storage_type, st_args,
5108 self.op.name, fields)
5110 result = []
5112 for node in utils.NiceSort(self.nodes):
5113 nresult = data[node]
5114 if nresult.offline:
5115 continue
5117 msg = nresult.fail_msg
5118 if msg:
5119 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
5120 continue
5122 rows = dict([(row[name_idx], row) for row in nresult.payload])
5124 for name in utils.NiceSort(rows.keys()):
5125 row = rows[name]
5127 out = []
5129 for field in self.op.output_fields:
5130 if field == constants.SF_NODE:
5131 val = node
5132 elif field == constants.SF_TYPE:
5133 val = self.op.storage_type
5134 elif field in field_idx:
5135 val = row[field_idx[field]]
5136 else:
5137 raise errors.ParameterError(field)
5139 out.append(val)
5141 result.append(out)
5143 return result
5146 class _InstanceQuery(_QueryBase):
5147 FIELDS = query.INSTANCE_FIELDS
5149 def ExpandNames(self, lu):
5150 lu.needed_locks = {}
5151 lu.share_locks = _ShareAll()
5153 if self.names:
5154 self.wanted = _GetWantedInstances(lu, self.names)
5155 else:
5156 self.wanted = locking.ALL_SET
5158 self.do_locking = (self.use_locking and
5159 query.IQ_LIVE in self.requested_data)
5160 if self.do_locking:
5161 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
5162 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
5163 lu.needed_locks[locking.LEVEL_NODE] = []
5164 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5166 self.do_grouplocks = (self.do_locking and
5167 query.IQ_NODES in self.requested_data)
5169 def DeclareLocks(self, lu, level):
5170 if self.do_locking:
5171 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
5172 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
5174 # Lock all groups used by instances optimistically; this requires going
5175 # via the node before it's locked, requiring verification later on
5176 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
5177 set(group_uuid
5178 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
5179 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
5180 elif level == locking.LEVEL_NODE:
5181 lu._LockInstancesNodes() # pylint: disable=W0212
5183 @staticmethod
5184 def _CheckGroupLocks(lu):
5185 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
5186 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
5188 # Check if node groups for locked instances are still correct
5189 for instance_name in owned_instances:
5190 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
5192 def _GetQueryData(self, lu):
5193 """Computes the list of instances and their attributes.
5196 if self.do_grouplocks:
5197 self._CheckGroupLocks(lu)
5199 cluster = lu.cfg.GetClusterInfo()
5200 all_info = lu.cfg.GetAllInstancesInfo()
5202 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
5204 instance_list = [all_info[name] for name in instance_names]
5205 nodes = frozenset(itertools.chain(*(inst.all_nodes
5206 for inst in instance_list)))
5207 hv_list = list(set([inst.hypervisor for inst in instance_list]))
5208 bad_nodes = []
5209 offline_nodes = []
5210 wrongnode_inst = set()
5212 # Gather data as requested
5213 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
5214 live_data = {}
5215 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
5216 for name in nodes:
5217 result = node_data[name]
5218 if result.offline:
5219 # offline nodes will be in both lists
5220 assert result.fail_msg
5221 offline_nodes.append(name)
5222 if result.fail_msg:
5223 bad_nodes.append(name)
5224 elif result.payload:
5225 for inst in result.payload:
5226 if inst in all_info:
5227 if all_info[inst].primary_node == name:
5228 live_data.update(result.payload)
5229 else:
5230 wrongnode_inst.add(inst)
5231 else:
5232 # orphan instance; we don't list it here as we don't
5233 # handle this case yet in the output of instance listing
5234 logging.warning("Orphan instance '%s' found on node %s",
5235 inst, name)
5236 # else no instance is alive
5237 else:
5238 live_data = None
5240 if query.IQ_DISKUSAGE in self.requested_data:
5241 disk_usage = dict((inst.name,
5242 _ComputeDiskSize(inst.disk_template,
5243 [{constants.IDISK_SIZE: disk.size}
5244 for disk in inst.disks]))
5245 for inst in instance_list)
5246 else:
5247 disk_usage = None
5249 if query.IQ_CONSOLE in self.requested_data:
5250 consinfo = {}
5251 for inst in instance_list:
5252 if inst.name in live_data:
5253 # Instance is running
5254 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
5255 else:
5256 consinfo[inst.name] = None
5257 assert set(consinfo.keys()) == set(instance_names)
5258 else:
5259 consinfo = None
5261 if query.IQ_NODES in self.requested_data:
5262 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
5263 instance_list)))
5264 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
5265 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
5266 for uuid in set(map(operator.attrgetter("group"),
5267 nodes.values()))))
5268 else:
5269 nodes = None
5270 groups = None
5272 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
5273 disk_usage, offline_nodes, bad_nodes,
5274 live_data, wrongnode_inst, consinfo,
5275 nodes, groups)
5278 class LUQuery(NoHooksLU):
5279 """Query for resources/items of a certain kind.
5282 # pylint: disable=W0142
5285 def CheckArguments(self):
5286 qcls = _GetQueryImplementation(self.op.what)
5288 self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
5290 def ExpandNames(self):
5291 self.impl.ExpandNames(self)
5293 def DeclareLocks(self, level):
5294 self.impl.DeclareLocks(self, level)
5296 def Exec(self, feedback_fn):
5297 return self.impl.NewStyleQuery(self)
5300 class LUQueryFields(NoHooksLU):
5301 """Query for resources/items of a certain kind.
5304 # pylint: disable=W0142
5307 def CheckArguments(self):
5308 self.qcls = _GetQueryImplementation(self.op.what)
5310 def ExpandNames(self):
5311 self.needed_locks = {}
5313 def Exec(self, feedback_fn):
5314 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
5317 class LUNodeModifyStorage(NoHooksLU):
5318 """Logical unit for modifying a storage volume on a node.
5323 def CheckArguments(self):
5324 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5326 storage_type = self.op.storage_type
5328 try:
5329 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
5330 except KeyError:
5331 raise errors.OpPrereqError("Storage units of type '%s' can not be"
5332 " modified" % storage_type,
5333 errors.ECODE_INVAL)
5335 diff = set(self.op.changes.keys()) - modifiable
5336 if diff:
5337 raise errors.OpPrereqError("The following fields can not be modified for"
5338 " storage units of type '%s': %r" %
5339 (storage_type, list(diff)),
5340 errors.ECODE_INVAL)
5342 def ExpandNames(self):
5343 self.needed_locks = {
5344 locking.LEVEL_NODE: self.op.node_name,
5345 }
5347 def Exec(self, feedback_fn):
5348 """Computes the list of nodes and their attributes.
5351 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
5352 result = self.rpc.call_storage_modify(self.op.node_name,
5353 self.op.storage_type, st_args,
5354 self.op.name, self.op.changes)
5355 result.Raise("Failed to modify storage unit '%s' on %s" %
5356 (self.op.name, self.op.node_name))
5359 class LUNodeAdd(LogicalUnit):
5360 """Logical unit for adding node to the cluster.
5364 HTYPE = constants.HTYPE_NODE
5365 _NFLAGS = ["master_capable", "vm_capable"]
5367 def CheckArguments(self):
5368 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
5369 # validate/normalize the node name
5370 self.hostname = netutils.GetHostname(name=self.op.node_name,
5371 family=self.primary_ip_family)
5372 self.op.node_name = self.hostname.name
5374 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
5375 raise errors.OpPrereqError("Cannot readd the master node",
5376 errors.ECODE_STATE)
5378 if self.op.readd and self.op.group:
5379 raise errors.OpPrereqError("Cannot pass a node group when a node is"
5380 " being readded", errors.ECODE_INVAL)
5382 def BuildHooksEnv(self):
5383 """Build hooks env.
5385 This will run on all nodes before, and on all nodes + the new node after.
5387 """
5388 return {
5389 "OP_TARGET": self.op.node_name,
5390 "NODE_NAME": self.op.node_name,
5391 "NODE_PIP": self.op.primary_ip,
5392 "NODE_SIP": self.op.secondary_ip,
5393 "MASTER_CAPABLE": str(self.op.master_capable),
5394 "VM_CAPABLE": str(self.op.vm_capable),
5397 def BuildHooksNodes(self):
5398 """Build hooks nodes.
5401 # Exclude added node
5402 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
5403 post_nodes = pre_nodes + [self.op.node_name, ]
5405 return (pre_nodes, post_nodes)
5407 def CheckPrereq(self):
5408 """Check prerequisites.
5411 - the new node is not already in the config
5413 - its parameters (single/dual homed) matches the cluster
5415 Any errors are signaled by raising errors.OpPrereqError.
5419 hostname = self.hostname
5420 node = hostname.name
5421 primary_ip = self.op.primary_ip = hostname.ip
5422 if self.op.secondary_ip is None:
5423 if self.primary_ip_family == netutils.IP6Address.family:
5424 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
5425 " IPv4 address must be given as secondary",
5426 errors.ECODE_INVAL)
5427 self.op.secondary_ip = primary_ip
5429 secondary_ip = self.op.secondary_ip
5430 if not netutils.IP4Address.IsValid(secondary_ip):
5431 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5432 " address" % secondary_ip, errors.ECODE_INVAL)
5434 node_list = cfg.GetNodeList()
5435 if not self.op.readd and node in node_list:
5436 raise errors.OpPrereqError("Node %s is already in the configuration" %
5437 node, errors.ECODE_EXISTS)
5438 elif self.op.readd and node not in node_list:
5439 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
5440 errors.ECODE_NOENT)
5442 self.changed_primary_ip = False
5444 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
5445 if self.op.readd and node == existing_node_name:
5446 if existing_node.secondary_ip != secondary_ip:
5447 raise errors.OpPrereqError("Readded node doesn't have the same IP"
5448 " address configuration as before",
5450 if existing_node.primary_ip != primary_ip:
5451 self.changed_primary_ip = True
5453 continue
5455 if (existing_node.primary_ip == primary_ip or
5456 existing_node.secondary_ip == primary_ip or
5457 existing_node.primary_ip == secondary_ip or
5458 existing_node.secondary_ip == secondary_ip):
5459 raise errors.OpPrereqError("New node ip address(es) conflict with"
5460 " existing node %s" % existing_node.name,
5461 errors.ECODE_NOTUNIQUE)
5463 # After this 'if' block, None is no longer a valid value for the
5464 # _capable op attributes
5465 if self.op.readd:
5466 old_node = self.cfg.GetNodeInfo(node)
5467 assert old_node is not None, "Can't retrieve locked node %s" % node
5468 for attr in self._NFLAGS:
5469 if getattr(self.op, attr) is None:
5470 setattr(self.op, attr, getattr(old_node, attr))
5471 else:
5472 for attr in self._NFLAGS:
5473 if getattr(self.op, attr) is None:
5474 setattr(self.op, attr, True)
5476 if self.op.readd and not self.op.vm_capable:
5477 pri, sec = cfg.GetNodeInstances(node)
5478 if pri or sec:
5479 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5480 " flag set to false, but it already holds"
5481 " instances" % node,
5484 # check that the type of the node (single versus dual homed) is the
5485 # same as for the master
5486 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5487 master_singlehomed = myself.secondary_ip == myself.primary_ip
5488 newbie_singlehomed = secondary_ip == primary_ip
5489 if master_singlehomed != newbie_singlehomed:
5490 if master_singlehomed:
5491 raise errors.OpPrereqError("The master has no secondary ip but the"
5492 " new node has one",
5495 raise errors.OpPrereqError("The master has a secondary ip but the"
5496 " new node doesn't have one",
5499 # checks reachability
5500 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5501 raise errors.OpPrereqError("Node not reachable by ping",
5502 errors.ECODE_ENVIRON)
5504 if not newbie_singlehomed:
5505 # check reachability from my secondary ip to newbie's secondary ip
5506 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5507 source=myself.secondary_ip):
5508 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5509 " based ping to node daemon port",
5510 errors.ECODE_ENVIRON)
5512 if self.op.readd:
5513 exceptions = [node]
5514 else:
5515 exceptions = []
5517 if self.op.master_capable:
5518 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5519 else:
5520 self.master_candidate = False
5522 if self.op.readd:
5523 self.new_node = old_node
5524 else:
5525 node_group = cfg.LookupNodeGroup(self.op.group)
5526 self.new_node = objects.Node(name=node,
5527 primary_ip=primary_ip,
5528 secondary_ip=secondary_ip,
5529 master_candidate=self.master_candidate,
5530 offline=False, drained=False,
5531 group=node_group)
5533 if self.op.ndparams:
5534 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5536 if self.op.hv_state:
5537 self.new_hv_state = _MergeAndVerifyHvState(self.op.hv_state, None)
5539 if self.op.disk_state:
5540 self.new_disk_state = _MergeAndVerifyDiskState(self.op.disk_state, None)
5542 def Exec(self, feedback_fn):
5543 """Adds the new node to the cluster.
5546 new_node = self.new_node
5547 node = new_node.name
5549 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
5550 "Not owning BGL"
5552 # We are adding a new node, so we assume it is powered
5553 new_node.powered = True
5555 # for re-adds, reset the offline/drained/master-candidate flags;
5556 # we need to reset here, otherwise offline would prevent RPC calls
5557 # later in the procedure; this also means that if the re-add
5558 # fails, we are left with a non-offlined, broken node
5559 if self.op.readd:
5560 new_node.drained = new_node.offline = False # pylint: disable=W0201
5561 self.LogInfo("Readding a node, the offline/drained flags were reset")
5562 # if we demote the node, we do cleanup later in the procedure
5563 new_node.master_candidate = self.master_candidate
5564 if self.changed_primary_ip:
5565 new_node.primary_ip = self.op.primary_ip
5567 # copy the master/vm_capable flags
5568 for attr in self._NFLAGS:
5569 setattr(new_node, attr, getattr(self.op, attr))
5571 # notify the user about any possible mc promotion
5572 if new_node.master_candidate:
5573 self.LogInfo("Node will be a master candidate")
5575 if self.op.ndparams:
5576 new_node.ndparams = self.op.ndparams
5577 else:
5578 new_node.ndparams = {}
5580 if self.op.hv_state:
5581 new_node.hv_state_static = self.new_hv_state
5583 if self.op.disk_state:
5584 new_node.disk_state_static = self.new_disk_state
5586 # check connectivity
5587 result = self.rpc.call_version([node])[node]
5588 result.Raise("Can't get version information from node %s" % node)
5589 if constants.PROTOCOL_VERSION == result.payload:
5590 logging.info("Communication to node %s fine, sw version %s match",
5591 node, result.payload)
5592 else:
5593 raise errors.OpExecError("Version mismatch master version %s,"
5594 " node version %s" %
5595 (constants.PROTOCOL_VERSION, result.payload))
5597 # Add node to our /etc/hosts, and add key to known_hosts
5598 if self.cfg.GetClusterInfo().modify_etc_hosts:
5599 master_node = self.cfg.GetMasterNode()
5600 result = self.rpc.call_etc_hosts_modify(master_node,
5601 constants.ETC_HOSTS_ADD,
5602 self.hostname.name,
5603 self.hostname.ip)
5604 result.Raise("Can't update hosts file with new host data")
5606 if new_node.secondary_ip != new_node.primary_ip:
5607 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5608 False)
5610 node_verify_list = [self.cfg.GetMasterNode()]
5611 node_verify_param = {
5612 constants.NV_NODELIST: ([node], {}),
5613 # TODO: do a node-net-test as well?
5614 }
5616 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5617 self.cfg.GetClusterName())
5618 for verifier in node_verify_list:
5619 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5620 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5621 if nl_payload:
5622 for failed in nl_payload:
5623 feedback_fn("ssh/hostname verification failed"
5624 " (checking from %s): %s" %
5625 (verifier, nl_payload[failed]))
5626 raise errors.OpExecError("ssh/hostname verification failed")
5628 if self.op.readd:
5629 _RedistributeAncillaryFiles(self)
5630 self.context.ReaddNode(new_node)
5631 # make sure we redistribute the config
5632 self.cfg.Update(new_node, feedback_fn)
5633 # and make sure the new node will not have old files around
5634 if not new_node.master_candidate:
5635 result = self.rpc.call_node_demote_from_mc(new_node.name)
5636 msg = result.fail_msg
5637 if msg:
5638 self.LogWarning("Node failed to demote itself from master"
5639 " candidate status: %s" % msg)
5640 else:
5641 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5642 additional_vm=self.op.vm_capable)
5643 self.context.AddNode(new_node, self.proc.GetECId())
5646 class LUNodeSetParams(LogicalUnit):
5647 """Modifies the parameters of a node.
5649 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5650 to the node role (as _ROLE_*)
5651 @cvar _R2F: a dictionary from node role to tuples of flags
5652 @cvar _FLAGS: a list of attribute names corresponding to the flags
5654 """
5655 HPATH = "node-modify"
5656 HTYPE = constants.HTYPE_NODE
5658 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5659 _F2R = {
5660 (True, False, False): _ROLE_CANDIDATE,
5661 (False, True, False): _ROLE_DRAINED,
5662 (False, False, True): _ROLE_OFFLINE,
5663 (False, False, False): _ROLE_REGULAR,
5664 }
5665 _R2F = dict((v, k) for k, v in _F2R.items())
5666 _FLAGS = ["master_candidate", "drained", "offline"]
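# Hedged illustration of the role mapping above: _F2R[(True, False, False)]
# yields _ROLE_CANDIDATE, while _R2F[_ROLE_OFFLINE] gives back
# (False, False, True); flag combinations with more than one True are
# rejected in CheckArguments below.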
5668 def CheckArguments(self):
5669 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5670 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5671 self.op.master_capable, self.op.vm_capable,
5672 self.op.secondary_ip, self.op.ndparams, self.op.hv_state,
5673 self.op.disk_state]
5674 if all_mods.count(None) == len(all_mods):
5675 raise errors.OpPrereqError("Please pass at least one modification",
5676 errors.ECODE_INVAL)
5677 if all_mods.count(True) > 1:
5678 raise errors.OpPrereqError("Can't set the node into more than one"
5679 " state at the same time",
5682 # Boolean value that tells us whether we might be demoting from MC
5683 self.might_demote = (self.op.master_candidate == False or
5684 self.op.offline == True or
5685 self.op.drained == True or
5686 self.op.master_capable == False)
5688 if self.op.secondary_ip:
5689 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5690 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5691 " address" % self.op.secondary_ip,
5694 self.lock_all = self.op.auto_promote and self.might_demote
5695 self.lock_instances = self.op.secondary_ip is not None
5697 def _InstanceFilter(self, instance):
5698 """Filter for getting affected instances.
5701 return (instance.disk_template in constants.DTS_INT_MIRROR and
5702 self.op.node_name in instance.all_nodes)
5704 def ExpandNames(self):
5705 if self.lock_all:
5706 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5707 else:
5708 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5710 # Since modifying a node can have severe effects on currently running
5711 # operations the resource lock is at least acquired in shared mode
5712 self.needed_locks[locking.LEVEL_NODE_RES] = \
5713 self.needed_locks[locking.LEVEL_NODE]
5715 # Get node resource and instance locks in shared mode; they are not used
5716 # for anything but read-only access
5717 self.share_locks[locking.LEVEL_NODE_RES] = 1
5718 self.share_locks[locking.LEVEL_INSTANCE] = 1
5720 if self.lock_instances:
5721 self.needed_locks[locking.LEVEL_INSTANCE] = \
5722 frozenset(self.cfg.GetInstancesInfoByFilter(self._InstanceFilter))
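# Note on the locking above: instance locks are requested only when the
# secondary IP is being changed (self.lock_instances), since _InstanceFilter
# selects exactly the internally mirrored (DRBD) instances that use this node
# and would be affected by the address change.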
5724 def BuildHooksEnv(self):
5725 """Build hooks env.
5727 This runs on the master node.
5729 """
5730 return {
5731 "OP_TARGET": self.op.node_name,
5732 "MASTER_CANDIDATE": str(self.op.master_candidate),
5733 "OFFLINE": str(self.op.offline),
5734 "DRAINED": str(self.op.drained),
5735 "MASTER_CAPABLE": str(self.op.master_capable),
5736 "VM_CAPABLE": str(self.op.vm_capable),
5739 def BuildHooksNodes(self):
5740 """Build hooks nodes.
5743 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5746 def CheckPrereq(self):
5747 """Check prerequisites.
5749 This only checks the instance list against the existing names.
5751 """
5752 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5754 if self.lock_instances:
5755 affected_instances = \
5756 self.cfg.GetInstancesInfoByFilter(self._InstanceFilter)
5758 # Verify instance locks
5759 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
5760 wanted_instances = frozenset(affected_instances.keys())
5761 if wanted_instances - owned_instances:
5762 raise errors.OpPrereqError("Instances affected by changing node %s's"
5763 " secondary IP address have changed since"
5764 " locks were acquired, wanted '%s', have"
5765 " '%s'; retry the operation" %
5767 utils.CommaJoin(wanted_instances),
5768 utils.CommaJoin(owned_instances)),
5771 affected_instances = None
5773 if (self.op.master_candidate is not None or
5774 self.op.drained is not None or
5775 self.op.offline is not None):
5776 # we can't change the master's node flags
5777 if self.op.node_name == self.cfg.GetMasterNode():
5778 raise errors.OpPrereqError("The master role can be changed"
5779 " only via master-failover",
5782 if self.op.master_candidate and not node.master_capable:
5783 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5784 " it a master candidate" % node.name,
5787 if self.op.vm_capable == False:
5788 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5789 if ipri or isec:
5790 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5791 " the vm_capable flag" % node.name,
5792 errors.ECODE_STATE)
5794 if node.master_candidate and self.might_demote and not self.lock_all:
5795 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5796 # check if after removing the current node, we're missing master
5797 # candidates
5798 (mc_remaining, mc_should, _) = \
5799 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5800 if mc_remaining < mc_should:
5801 raise errors.OpPrereqError("Not enough master candidates, please"
5802 " pass auto promote option to allow"
5803 " promotion", errors.ECODE_STATE)
5805 self.old_flags = old_flags = (node.master_candidate,
5806 node.drained, node.offline)
5807 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5808 self.old_role = old_role = self._F2R[old_flags]
5810 # Check for ineffective changes
5811 for attr in self._FLAGS:
5812 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5813 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5814 setattr(self.op, attr, None)
5816 # Past this point, any flag change to False means a transition
5817 # away from the respective state, as only real changes are kept
5819 # TODO: We might query the real power state if it supports OOB
5820 if _SupportsOob(self.cfg, node):
5821 if self.op.offline is False and not (node.powered or
5822 self.op.powered == True):
5823 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5824 " offline status can be reset") %
5826 elif self.op.powered is not None:
5827 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5828 " as it does not support out-of-band"
5829 " handling") % self.op.node_name)
5831 # If we're being deofflined/drained, we'll MC ourself if needed
5832 if (self.op.drained == False or self.op.offline == False or
5833 (self.op.master_capable and not node.master_capable)):
5834 if _DecideSelfPromotion(self):
5835 self.op.master_candidate = True
5836 self.LogInfo("Auto-promoting node to master candidate")
5838 # If we're no longer master capable, we'll demote ourselves from MC
5839 if self.op.master_capable == False and node.master_candidate:
5840 self.LogInfo("Demoting from master candidate")
5841 self.op.master_candidate = False
5843 # Compute new role
5844 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5845 if self.op.master_candidate:
5846 new_role = self._ROLE_CANDIDATE
5847 elif self.op.drained:
5848 new_role = self._ROLE_DRAINED
5849 elif self.op.offline:
5850 new_role = self._ROLE_OFFLINE
5851 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5852 # False is still in new flags, which means we're un-setting (the
5853 # offline/drained) state
5854 new_role = self._ROLE_REGULAR
5855 else: # no new flags, nothing, keep old role
5856 new_role = old_role
5858 self.new_role = new_role
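# Worked example of the role computation above (a sketch with typical inputs):
# passing offline=True for a master candidate yields new_role=_ROLE_OFFLINE,
# while passing offline=False for an offline node leaves False in the new
# flags and therefore new_role=_ROLE_REGULAR; with no flags at all the old
# role is kept.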
5860 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5861 # Trying to transition out of offline status
5862 # TODO: Use standard RPC runner, but make sure it works when the node is
5863 # still marked offline
5864 result = rpc.BootstrapRunner().call_version([node.name])[node.name]
5865 if result.fail_msg:
5866 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5867 " to report its version: %s" %
5868 (node.name, result.fail_msg),
5869 errors.ECODE_STATE)
5870 else:
5871 self.LogWarning("Transitioning node from offline to online state"
5872 " without using re-add. Please make sure the node"
5873 " is healthy!")
5875 if self.op.secondary_ip:
5876 # Ok even without locking, because this can't be changed by any LU
5877 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5878 master_singlehomed = master.secondary_ip == master.primary_ip
5879 if master_singlehomed and self.op.secondary_ip:
5880 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5881 " homed cluster", errors.ECODE_INVAL)
5883 assert not (frozenset(affected_instances) -
5884 self.owned_locks(locking.LEVEL_INSTANCE))
5886 if node.offline:
5887 if affected_instances:
5888 raise errors.OpPrereqError("Cannot change secondary IP address:"
5889 " offline node has instances (%s)"
5890 " configured to use it" %
5891 utils.CommaJoin(affected_instances.keys()))
5892 else:
5893 # On online nodes, check that no instances are running, and that
5894 # the node has the new ip and we can reach it.
5895 for instance in affected_instances.values():
5896 _CheckInstanceState(self, instance, INSTANCE_DOWN,
5897 msg="cannot change secondary ip")
5899 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5900 if master.name != node.name:
5901 # check reachability from master secondary ip to new secondary ip
5902 if not netutils.TcpPing(self.op.secondary_ip,
5903 constants.DEFAULT_NODED_PORT,
5904 source=master.secondary_ip):
5905 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5906 " based ping to node daemon port",
5907 errors.ECODE_ENVIRON)
5909 if self.op.ndparams:
5910 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5911 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5912 self.new_ndparams = new_ndparams
5914 if self.op.hv_state:
5915 self.new_hv_state = _MergeAndVerifyHvState(self.op.hv_state,
5916 self.node.hv_state_static)
5918 if self.op.disk_state:
5919 self.new_disk_state = \
5920 _MergeAndVerifyDiskState(self.op.disk_state,
5921 self.node.disk_state_static)
5923 def Exec(self, feedback_fn):
5924 """Modifies a node.
5926 """
5927 node = self.node
5928 old_role = self.old_role
5929 new_role = self.new_role
5931 result = []
5933 if self.op.ndparams:
5934 node.ndparams = self.new_ndparams
5936 if self.op.powered is not None:
5937 node.powered = self.op.powered
5939 if self.op.hv_state:
5940 node.hv_state_static = self.new_hv_state
5942 if self.op.disk_state:
5943 node.disk_state_static = self.new_disk_state
5945 for attr in ["master_capable", "vm_capable"]:
5946 val = getattr(self.op, attr)
5947 if val is not None:
5948 setattr(node, attr, val)
5949 result.append((attr, str(val)))
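# The `result` list built here collects (parameter, new value) pairs such as
# ("master_capable", "False"); it is returned at the end of Exec and is what
# the caller sees as the summary of modified node parameters.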
5951 if new_role != old_role:
5952 # Tell the node to demote itself, if no longer MC and not offline
5953 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5954 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5955 if msg:
5956 self.LogWarning("Node failed to demote itself: %s", msg)
5958 new_flags = self._R2F[new_role]
5959 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5960 if of != nf:
5961 result.append((desc, str(nf)))
5962 (node.master_candidate, node.drained, node.offline) = new_flags
5964 # we locked all nodes, we adjust the CP before updating this node
5965 if self.lock_all:
5966 _AdjustCandidatePool(self, [node.name])
5968 if self.op.secondary_ip:
5969 node.secondary_ip = self.op.secondary_ip
5970 result.append(("secondary_ip", self.op.secondary_ip))
5972 # this will trigger configuration file update, if needed
5973 self.cfg.Update(node, feedback_fn)
5975 # this will trigger job queue propagation or cleanup if the mc
5976 # flag is modified
5977 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5978 self.context.ReaddNode(node)
5980 return result
5983 class LUNodePowercycle(NoHooksLU):
5984 """Powercycles a node.
5989 def CheckArguments(self):
5990 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5991 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5992 raise errors.OpPrereqError("The node is the master and the force"
5993 " parameter was not set",
5996 def ExpandNames(self):
5997 """Locking for PowercycleNode.
5999 This is a last-resort option and shouldn't block on other
6000 jobs. Therefore, we grab no locks.
6002 """
6003 self.needed_locks = {}
6005 def Exec(self, feedback_fn):
6006 """Reboots a node.
6008 """
6009 result = self.rpc.call_node_powercycle(self.op.node_name,
6010 self.cfg.GetHypervisorType())
6011 result.Raise("Failed to schedule the reboot")
6012 return result.payload
6015 class LUClusterQuery(NoHooksLU):
6016 """Query cluster configuration.
6021 def ExpandNames(self):
6022 self.needed_locks = {}
6024 def Exec(self, feedback_fn):
6025 """Return cluster config.
6028 cluster = self.cfg.GetClusterInfo()
6031 # Filter just for enabled hypervisors
6032 for os_name, hv_dict in cluster.os_hvp.items():
6033 os_hvp[os_name] = {}
6034 for hv_name, hv_params in hv_dict.items():
6035 if hv_name in cluster.enabled_hypervisors:
6036 os_hvp[os_name][hv_name] = hv_params
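# Illustrative shape of os_hvp after the filtering above (names are examples
# only): {"debian-image": {"xen-pvm": {...}}}, i.e. per-OS hypervisor
# parameter overrides restricted to the enabled hypervisors.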
6038 # Convert ip_family to ip_version
6039 primary_ip_version = constants.IP4_VERSION
6040 if cluster.primary_ip_family == netutils.IP6Address.family:
6041 primary_ip_version = constants.IP6_VERSION
6044 "software_version": constants.RELEASE_VERSION,
6045 "protocol_version": constants.PROTOCOL_VERSION,
6046 "config_version": constants.CONFIG_VERSION,
6047 "os_api_version": max(constants.OS_API_VERSIONS),
6048 "export_version": constants.EXPORT_VERSION,
6049 "architecture": (platform.architecture()[0], platform.machine()),
6050 "name": cluster.cluster_name,
6051 "master": cluster.master_node,
6052 "default_hypervisor": cluster.primary_hypervisor,
6053 "enabled_hypervisors": cluster.enabled_hypervisors,
6054 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
6055 for hypervisor_name in cluster.enabled_hypervisors]),
6057 "beparams": cluster.beparams,
6058 "osparams": cluster.osparams,
6059 "ipolicy": cluster.ipolicy,
6060 "nicparams": cluster.nicparams,
6061 "ndparams": cluster.ndparams,
6062 "candidate_pool_size": cluster.candidate_pool_size,
6063 "master_netdev": cluster.master_netdev,
6064 "master_netmask": cluster.master_netmask,
6065 "use_external_mip_script": cluster.use_external_mip_script,
6066 "volume_group_name": cluster.volume_group_name,
6067 "drbd_usermode_helper": cluster.drbd_usermode_helper,
6068 "file_storage_dir": cluster.file_storage_dir,
6069 "shared_file_storage_dir": cluster.shared_file_storage_dir,
6070 "maintain_node_health": cluster.maintain_node_health,
6071 "ctime": cluster.ctime,
6072 "mtime": cluster.mtime,
6073 "uuid": cluster.uuid,
6074 "tags": list(cluster.GetTags()),
6075 "uid_pool": cluster.uid_pool,
6076 "default_iallocator": cluster.default_iallocator,
6077 "reserved_lvs": cluster.reserved_lvs,
6078 "primary_ip_version": primary_ip_version,
6079 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
6080 "hidden_os": cluster.hidden_os,
6081 "blacklisted_os": cluster.blacklisted_os,
6087 class LUClusterConfigQuery(NoHooksLU):
6088 """Return configuration values.
6092 _FIELDS_DYNAMIC = utils.FieldSet()
6093 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
6094 "watcher_pause", "volume_group_name")
6096 def CheckArguments(self):
6097 _CheckOutputFields(static=self._FIELDS_STATIC,
6098 dynamic=self._FIELDS_DYNAMIC,
6099 selected=self.op.output_fields)
6101 def ExpandNames(self):
6102 self.needed_locks = {}
6104 def Exec(self, feedback_fn):
6105 """Dump a representation of the cluster config to the standard output.
6109 for field in self.op.output_fields:
6110 if field == "cluster_name":
6111 entry = self.cfg.GetClusterName()
6112 elif field == "master_node":
6113 entry = self.cfg.GetMasterNode()
6114 elif field == "drain_flag":
6115 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
6116 elif field == "watcher_pause":
6117 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
6118 elif field == "volume_group_name":
6119 entry = self.cfg.GetVGName()
6120 else:
6121 raise errors.ParameterError(field)
6122 values.append(entry)
6124 return values
6126 class LUInstanceActivateDisks(NoHooksLU):
6127 """Bring up an instance's disks.
6132 def ExpandNames(self):
6133 self._ExpandAndLockInstance()
6134 self.needed_locks[locking.LEVEL_NODE] = []
6135 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6137 def DeclareLocks(self, level):
6138 if level == locking.LEVEL_NODE:
6139 self._LockInstancesNodes()
6141 def CheckPrereq(self):
6142 """Check prerequisites.
6144 This checks that the instance is in the cluster.
6146 """
6147 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6148 assert self.instance is not None, \
6149 "Cannot retrieve locked instance %s" % self.op.instance_name
6150 _CheckNodeOnline(self, self.instance.primary_node)
6152 def Exec(self, feedback_fn):
6153 """Activate the disks.
6156 disks_ok, disks_info = \
6157 _AssembleInstanceDisks(self, self.instance,
6158 ignore_size=self.op.ignore_size)
6160 raise errors.OpExecError("Cannot activate block devices")
6165 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
6166 ignore_size=False):
6167 """Prepare the block devices for an instance.
6169 This sets up the block devices on all nodes.
6171 @type lu: L{LogicalUnit}
6172 @param lu: the logical unit on whose behalf we execute
6173 @type instance: L{objects.Instance}
6174 @param instance: the instance for whose disks we assemble
6175 @type disks: list of L{objects.Disk} or None
6176 @param disks: which disks to assemble (or all, if None)
6177 @type ignore_secondaries: boolean
6178 @param ignore_secondaries: if true, errors on secondary nodes
6179 won't result in an error return from the function
6180 @type ignore_size: boolean
6181 @param ignore_size: if true, the current known size of the disk
6182 will not be used during the disk activation, useful for cases
6183 when the size is wrong
6184 @return: False if the operation failed, otherwise a list of
6185 (host, instance_visible_name, node_visible_name)
6186 with the mapping from node devices to instance devices
6188 """
6189 device_info = []
6190 disks_ok = True
6191 iname = instance.name
6192 disks = _ExpandCheckDisks(instance, disks)
6194 # With the two passes mechanism we try to reduce the window of
6195 # opportunity for the race condition of switching DRBD to primary
6196 # before handshaking occured, but we do not eliminate it
6198 # The proper fix would be to wait (with some limits) until the
6199 # connection has been made and drbd transitions from WFConnection
6200 # into any other network-connected state (Connected, SyncTarget,
6201 # SyncSource, etc.)
6203 # 1st pass, assemble on all nodes in secondary mode
6204 for idx, inst_disk in enumerate(disks):
6205 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
6206 if ignore_size:
6207 node_disk = node_disk.Copy()
6208 node_disk.UnsetSize()
6209 lu.cfg.SetDiskID(node_disk, node)
6210 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
6211 msg = result.fail_msg
6212 if msg:
6213 lu.proc.LogWarning("Could not prepare block device %s on node %s"
6214 " (is_primary=False, pass=1): %s",
6215 inst_disk.iv_name, node, msg)
6216 if not ignore_secondaries:
6217 disks_ok = False
6219 # FIXME: race condition on drbd migration to primary
6221 # 2nd pass, do only the primary node
6222 for idx, inst_disk in enumerate(disks):
6223 dev_path = None
6225 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
6226 if node != instance.primary_node:
6227 continue
6228 if ignore_size:
6229 node_disk = node_disk.Copy()
6230 node_disk.UnsetSize()
6231 lu.cfg.SetDiskID(node_disk, node)
6232 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
6233 msg = result.fail_msg
6234 if msg:
6235 lu.proc.LogWarning("Could not prepare block device %s on node %s"
6236 " (is_primary=True, pass=2): %s",
6237 inst_disk.iv_name, node, msg)
6238 disks_ok = False
6239 else:
6240 dev_path = result.payload
6242 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
6244 # leave the disks configured for the primary node
6245 # this is a workaround that would be fixed better by
6246 # improving the logical/physical id handling
6247 for disk in disks:
6248 lu.cfg.SetDiskID(disk, instance.primary_node)
6250 return disks_ok, device_info
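# Return value sketch: on full success this is (True, device_info) with
# device_info a list of (primary_node, iv_name, dev_path) triples, e.g.
# ("node1.example.com", "disk/0", "/dev/drbd0"); dev_path stays None for a
# disk whose primary-pass assembly failed (example hostname/path only).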
6253 def _StartInstanceDisks(lu, instance, force):
6254 """Start the disks of an instance.
6257 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
6258 ignore_secondaries=force)
6259 if not disks_ok:
6260 _ShutdownInstanceDisks(lu, instance)
6261 if force is not None and not force:
6262 lu.proc.LogWarning("", hint="If the message above refers to a"
6264 " you can retry the operation using '--force'.")
6265 raise errors.OpExecError("Disk consistency error")
6268 class LUInstanceDeactivateDisks(NoHooksLU):
6269 """Shutdown an instance's disks.
6274 def ExpandNames(self):
6275 self._ExpandAndLockInstance()
6276 self.needed_locks[locking.LEVEL_NODE] = []
6277 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6279 def DeclareLocks(self, level):
6280 if level == locking.LEVEL_NODE:
6281 self._LockInstancesNodes()
6283 def CheckPrereq(self):
6284 """Check prerequisites.
6286 This checks that the instance is in the cluster.
6288 """
6289 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6290 assert self.instance is not None, \
6291 "Cannot retrieve locked instance %s" % self.op.instance_name
6293 def Exec(self, feedback_fn):
6294 """Deactivate the disks
6297 instance = self.instance
6299 _ShutdownInstanceDisks(self, instance)
6301 _SafeShutdownInstanceDisks(self, instance)
6304 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
6305 """Shutdown block devices of an instance.
6307 This function checks if an instance is running, before calling
6308 _ShutdownInstanceDisks.
6310 """
6311 _CheckInstanceState(lu, instance, INSTANCE_DOWN, msg="cannot shutdown disks")
6312 _ShutdownInstanceDisks(lu, instance, disks=disks)
6315 def _ExpandCheckDisks(instance, disks):
6316 """Return the instance disks selected by the disks list
6318 @type disks: list of L{objects.Disk} or None
6319 @param disks: selected disks
6320 @rtype: list of L{objects.Disk}
6321 @return: selected instance disks to act on
6323 """
6324 if disks is None:
6325 return instance.disks
6326 else:
6327 if not set(disks).issubset(instance.disks):
6328 raise errors.ProgrammerError("Can only act on disks belonging to the"
6329 " target instance")
6330 return disks
6333 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
6334 """Shutdown block devices of an instance.
6336 This does the shutdown on all nodes of the instance.
6338 If the ignore_primary is false, errors on the primary node are
6339 ignored.
6341 """
6342 all_result = True
6343 disks = _ExpandCheckDisks(instance, disks)
6345 for disk in disks:
6346 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
6347 lu.cfg.SetDiskID(top_disk, node)
6348 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
6349 msg = result.fail_msg
6350 if msg:
6351 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
6352 disk.iv_name, node, msg)
6353 if ((node == instance.primary_node and not ignore_primary) or
6354 (node != instance.primary_node and not result.offline)):
6355 all_result = False
6357 return all_result
6359 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
6360 """Checks if a node has enough free memory.
6362 This function checks if a given node has the needed amount of free
6363 memory. In case the node has less memory or we cannot get the
6364 information from the node, this function raises an OpPrereqError
6365 exception.
6367 @type lu: C{LogicalUnit}
6368 @param lu: a logical unit from which we get configuration data
6369 @type node: C{str}
6370 @param node: the node to check
6371 @type reason: C{str}
6372 @param reason: string to use in the error message
6373 @type requested: C{int}
6374 @param requested: the amount of memory in MiB to check for
6375 @type hypervisor_name: C{str}
6376 @param hypervisor_name: the hypervisor to ask for memory stats
6377 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
6378 we cannot check the node
6380 """
6381 nodeinfo = lu.rpc.call_node_info([node], None, [hypervisor_name])
6382 nodeinfo[node].Raise("Can't get data from node %s" % node,
6383 prereq=True, ecode=errors.ECODE_ENVIRON)
6384 (_, _, (hv_info, )) = nodeinfo[node].payload
6386 free_mem = hv_info.get("memory_free", None)
6387 if not isinstance(free_mem, int):
6388 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
6389 " was '%s'" % (node, free_mem),
6390 errors.ECODE_ENVIRON)
6391 if requested > free_mem:
6392 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
6393 " needed %s MiB, available %s MiB" %
6394 (node, reason, requested, free_mem),
6395 errors.ECODE_NORES)
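# Usage sketch, mirroring the real call made in LUInstanceStartup.CheckPrereq
# further below:
#   _CheckNodeFreeMemory(self, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        bep[constants.BE_MINMEM], instance.hypervisor)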
6398 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
6399 """Checks if nodes have enough free disk space in the all VGs.
6401 This function checks if all given nodes have the needed amount of
6402 free disk. In case any node has less disk or we cannot get the
6403 information from the node, this function raises an OpPrereqError
6404 exception.
6406 @type lu: C{LogicalUnit}
6407 @param lu: a logical unit from which we get configuration data
6408 @type nodenames: C{list}
6409 @param nodenames: the list of node names to check
6410 @type req_sizes: C{dict}
6411 @param req_sizes: the hash of vg and corresponding amount of disk in
6412 MiB to check for
6413 @raise errors.OpPrereqError: if the node doesn't have enough disk,
6414 or we cannot check the node
6416 """
6417 for vg, req_size in req_sizes.items():
6418 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
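# Example req_sizes value (VG name is hypothetical): {"xenvg": 10240} requires
# 10 GiB free in volume group "xenvg" on every node in nodenames; each entry
# is checked by _CheckNodesFreeDiskOnVG below.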
6421 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
6422 """Checks if nodes have enough free disk space in the specified VG.
6424 This function checks if all given nodes have the needed amount of
6425 free disk. In case any node has less disk or we cannot get the
6426 information from the node, this function raises an OpPrereqError
6427 exception.
6429 @type lu: C{LogicalUnit}
6430 @param lu: a logical unit from which we get configuration data
6431 @type nodenames: C{list}
6432 @param nodenames: the list of node names to check
6433 @type vg: C{str}
6434 @param vg: the volume group to check
6435 @type requested: C{int}
6436 @param requested: the amount of disk in MiB to check for
6437 @raise errors.OpPrereqError: if the node doesn't have enough disk,
6438 or we cannot check the node
6440 """
6441 nodeinfo = lu.rpc.call_node_info(nodenames, [vg], None)
6442 for node in nodenames:
6443 info = nodeinfo[node]
6444 info.Raise("Cannot get current information from node %s" % node,
6445 prereq=True, ecode=errors.ECODE_ENVIRON)
6446 (_, (vg_info, ), _) = info.payload
6447 vg_free = vg_info.get("vg_free", None)
6448 if not isinstance(vg_free, int):
6449 raise errors.OpPrereqError("Can't compute free disk space on node"
6450 " %s for vg %s, result was '%s'" %
6451 (node, vg, vg_free), errors.ECODE_ENVIRON)
6452 if requested > vg_free:
6453 raise errors.OpPrereqError("Not enough disk space on target node %s"
6454 " vg %s: required %d MiB, available %d MiB" %
6455 (node, vg, requested, vg_free),
6456 errors.ECODE_NORES)
6459 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
6460 """Checks if nodes have enough physical CPUs
6462 This function checks if all given nodes have the needed number of
6463 physical CPUs. In case any node has fewer CPUs or we cannot get the
6464 information from the node, this function raises an OpPrereqError
6465 exception.
6467 @type lu: C{LogicalUnit}
6468 @param lu: a logical unit from which we get configuration data
6469 @type nodenames: C{list}
6470 @param nodenames: the list of node names to check
6471 @type requested: C{int}
6472 @param requested: the minimum acceptable number of physical CPUs
6473 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
6474 or we cannot check the node
6476 """
6477 nodeinfo = lu.rpc.call_node_info(nodenames, None, [hypervisor_name])
6478 for node in nodenames:
6479 info = nodeinfo[node]
6480 info.Raise("Cannot get current information from node %s" % node,
6481 prereq=True, ecode=errors.ECODE_ENVIRON)
6482 (_, _, (hv_info, )) = info.payload
6483 num_cpus = hv_info.get("cpu_total", None)
6484 if not isinstance(num_cpus, int):
6485 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
6486 " on node %s, result was '%s'" %
6487 (node, num_cpus), errors.ECODE_ENVIRON)
6488 if requested > num_cpus:
6489 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
6490 "required" % (node, num_cpus, requested),
6494 class LUInstanceStartup(LogicalUnit):
6495 """Starts an instance.
6498 HPATH = "instance-start"
6499 HTYPE = constants.HTYPE_INSTANCE
6500 REQ_BGL = False
6502 def CheckArguments(self):
6503 # extra beparams
6504 if self.op.beparams:
6505 # fill the beparams dict
6506 objects.UpgradeBeParams(self.op.beparams)
6507 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
6509 def ExpandNames(self):
6510 self._ExpandAndLockInstance()
6511 self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE
6513 def DeclareLocks(self, level):
6514 if level == locking.LEVEL_NODE_RES:
6515 self._LockInstancesNodes(primary_only=True, level=locking.LEVEL_NODE_RES)
6517 def BuildHooksEnv(self):
6518 """Build hooks env.
6520 This runs on master, primary and secondary nodes of the instance.
6522 """
6523 env = {
6524 "FORCE": self.op.force,
6525 }
6527 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6529 return env
6531 def BuildHooksNodes(self):
6532 """Build hooks nodes.
6535 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6538 def CheckPrereq(self):
6539 """Check prerequisites.
6541 This checks that the instance is in the cluster.
6543 """
6544 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6545 assert self.instance is not None, \
6546 "Cannot retrieve locked instance %s" % self.op.instance_name
6548 # extra hvparams
6549 if self.op.hvparams:
6550 # check hypervisor parameter syntax (locally)
6551 cluster = self.cfg.GetClusterInfo()
6552 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6553 filled_hvp = cluster.FillHV(instance)
6554 filled_hvp.update(self.op.hvparams)
6555 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6556 hv_type.CheckParameterSyntax(filled_hvp)
6557 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6559 _CheckInstanceState(self, instance, INSTANCE_ONLINE)
6561 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6563 if self.primary_offline and self.op.ignore_offline_nodes:
6564 self.proc.LogWarning("Ignoring offline primary node")
6566 if self.op.hvparams or self.op.beparams:
6567 self.proc.LogWarning("Overridden parameters are ignored")
6568 else:
6569 _CheckNodeOnline(self, instance.primary_node)
6571 bep = self.cfg.GetClusterInfo().FillBE(instance)
6572 bep.update(self.op.beparams)
6574 # check bridges existence
6575 _CheckInstanceBridgesExist(self, instance)
6577 remote_info = self.rpc.call_instance_info(instance.primary_node,
6578 instance.name,
6579 instance.hypervisor)
6580 remote_info.Raise("Error checking node %s" % instance.primary_node,
6581 prereq=True, ecode=errors.ECODE_ENVIRON)
6582 if not remote_info.payload: # not running already
6583 _CheckNodeFreeMemory(self, instance.primary_node,
6584 "starting instance %s" % instance.name,
6585 bep[constants.BE_MINMEM], instance.hypervisor)
6587 def Exec(self, feedback_fn):
6588 """Start the instance.
6591 instance = self.instance
6592 force = self.op.force
6594 if not self.op.no_remember:
6595 self.cfg.MarkInstanceUp(instance.name)
6597 if self.primary_offline:
6598 assert self.op.ignore_offline_nodes
6599 self.proc.LogInfo("Primary node offline, marked instance as started")
6600 else:
6601 node_current = instance.primary_node
6603 _StartInstanceDisks(self, instance, force)
6605 result = \
6606 self.rpc.call_instance_start(node_current,
6607 (instance, self.op.hvparams,
6608 self.op.beparams),
6609 self.op.startup_paused)
6610 msg = result.fail_msg
6611 if msg:
6612 _ShutdownInstanceDisks(self, instance)
6613 raise errors.OpExecError("Could not start instance: %s" % msg)
6616 class LUInstanceReboot(LogicalUnit):
6617 """Reboot an instance.
6620 HPATH = "instance-reboot"
6621 HTYPE = constants.HTYPE_INSTANCE
6622 REQ_BGL = False
6624 def ExpandNames(self):
6625 self._ExpandAndLockInstance()
6627 def BuildHooksEnv(self):
6628 """Build hooks env.
6630 This runs on master, primary and secondary nodes of the instance.
6632 """
6633 env = {
6634 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6635 "REBOOT_TYPE": self.op.reboot_type,
6636 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6637 }
6639 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6641 return env
6643 def BuildHooksNodes(self):
6644 """Build hooks nodes.
6647 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6650 def CheckPrereq(self):
6651 """Check prerequisites.
6653 This checks that the instance is in the cluster.
6655 """
6656 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6657 assert self.instance is not None, \
6658 "Cannot retrieve locked instance %s" % self.op.instance_name
6659 _CheckInstanceState(self, instance, INSTANCE_ONLINE)
6660 _CheckNodeOnline(self, instance.primary_node)
6662 # check bridges existence
6663 _CheckInstanceBridgesExist(self, instance)
6665 def Exec(self, feedback_fn):
6666 """Reboot the instance.
6669 instance = self.instance
6670 ignore_secondaries = self.op.ignore_secondaries
6671 reboot_type = self.op.reboot_type
6673 remote_info = self.rpc.call_instance_info(instance.primary_node,
6674 instance.name,
6675 instance.hypervisor)
6676 remote_info.Raise("Error checking node %s" % instance.primary_node)
6677 instance_running = bool(remote_info.payload)
6679 node_current = instance.primary_node
6681 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6682 constants.INSTANCE_REBOOT_HARD]:
6683 for disk in instance.disks:
6684 self.cfg.SetDiskID(disk, node_current)
6685 result = self.rpc.call_instance_reboot(node_current, instance,
6686 reboot_type,
6687 self.op.shutdown_timeout)
6688 result.Raise("Could not reboot instance")
6689 else:
6690 if instance_running:
6691 result = self.rpc.call_instance_shutdown(node_current, instance,
6692 self.op.shutdown_timeout)
6693 result.Raise("Could not shutdown instance for full reboot")
6694 _ShutdownInstanceDisks(self, instance)
6696 self.LogInfo("Instance %s was already stopped, starting now",
6698 _StartInstanceDisks(self, instance, ignore_secondaries)
6699 result = self.rpc.call_instance_start(node_current,
6700 (instance, None, None), False)
6701 msg = result.fail_msg
6702 if msg:
6703 _ShutdownInstanceDisks(self, instance)
6704 raise errors.OpExecError("Could not start instance for"
6705 " full reboot: %s" % msg)
6707 self.cfg.MarkInstanceUp(instance.name)
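# Recap of the branch above (editorial summary): soft/hard reboots are
# delegated to the hypervisor on the primary node, while a "full" reboot is
# emulated as shutdown + disk deactivation + disk activation + start; both
# paths finish by marking the instance administratively up.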
6710 class LUInstanceShutdown(LogicalUnit):
6711 """Shutdown an instance.
6714 HPATH = "instance-stop"
6715 HTYPE = constants.HTYPE_INSTANCE
6716 REQ_BGL = False
6718 def ExpandNames(self):
6719 self._ExpandAndLockInstance()
6721 def BuildHooksEnv(self):
6722 """Build hooks env.
6724 This runs on master, primary and secondary nodes of the instance.
6726 """
6727 env = _BuildInstanceHookEnvByObject(self, self.instance)
6728 env["TIMEOUT"] = self.op.timeout
6731 def BuildHooksNodes(self):
6732 """Build hooks nodes.
6735 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6738 def CheckPrereq(self):
6739 """Check prerequisites.
6741 This checks that the instance is in the cluster.
6743 """
6744 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6745 assert self.instance is not None, \
6746 "Cannot retrieve locked instance %s" % self.op.instance_name
6748 _CheckInstanceState(self, self.instance, INSTANCE_ONLINE)
6750 self.primary_offline = \
6751 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6753 if self.primary_offline and self.op.ignore_offline_nodes:
6754 self.proc.LogWarning("Ignoring offline primary node")
6755 else:
6756 _CheckNodeOnline(self, self.instance.primary_node)
6758 def Exec(self, feedback_fn):
6759 """Shutdown the instance.
6762 instance = self.instance
6763 node_current = instance.primary_node
6764 timeout = self.op.timeout
6766 if not self.op.no_remember:
6767 self.cfg.MarkInstanceDown(instance.name)
6769 if self.primary_offline:
6770 assert self.op.ignore_offline_nodes
6771 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6772 else:
6773 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6774 msg = result.fail_msg
6775 if msg:
6776 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6778 _ShutdownInstanceDisks(self, instance)
6781 class LUInstanceReinstall(LogicalUnit):
6782 """Reinstall an instance.
6785 HPATH = "instance-reinstall"
6786 HTYPE = constants.HTYPE_INSTANCE
6787 REQ_BGL = False
6789 def ExpandNames(self):
6790 self._ExpandAndLockInstance()
6792 def BuildHooksEnv(self):
6793 """Build hooks env.
6795 This runs on master, primary and secondary nodes of the instance.
6797 """
6798 return _BuildInstanceHookEnvByObject(self, self.instance)
6800 def BuildHooksNodes(self):
6801 """Build hooks nodes.
6804 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6807 def CheckPrereq(self):
6808 """Check prerequisites.
6810 This checks that the instance is in the cluster and is not running.
6812 """
6813 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6814 assert instance is not None, \
6815 "Cannot retrieve locked instance %s" % self.op.instance_name
6816 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6817 " offline, cannot reinstall")
6818 for node in instance.secondary_nodes:
6819 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6820 " cannot reinstall")
6822 if instance.disk_template == constants.DT_DISKLESS:
6823 raise errors.OpPrereqError("Instance '%s' has no disks" %
6824 self.op.instance_name,
6825 errors.ECODE_INVAL)
6826 _CheckInstanceState(self, instance, INSTANCE_DOWN, msg="cannot reinstall")
6828 if self.op.os_type is not None:
6829 # OS verification
6830 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6831 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6832 instance_os = self.op.os_type
6833 else:
6834 instance_os = instance.os
6836 nodelist = list(instance.all_nodes)
6838 if self.op.osparams:
6839 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6840 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6841 self.os_inst = i_osdict # the new dict (without defaults)
6842 else:
6843 self.os_inst = {}
6845 self.instance = instance
6847 def Exec(self, feedback_fn):
6848 """Reinstall the instance.
6851 inst = self.instance
6853 if self.op.os_type is not None:
6854 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6855 inst.os = self.op.os_type
6856 # Write to configuration
6857 self.cfg.Update(inst, feedback_fn)
6859 _StartInstanceDisks(self, inst, None)
6860 try:
6861 feedback_fn("Running the instance OS create scripts...")
6862 # FIXME: pass debug option from opcode to backend
6863 result = self.rpc.call_instance_os_add(inst.primary_node,
6864 (inst, self.os_inst), True,
6865 self.op.debug_level)
6866 result.Raise("Could not install OS for instance %s on node %s" %
6867 (inst.name, inst.primary_node))
6868 finally:
6869 _ShutdownInstanceDisks(self, inst)
6872 class LUInstanceRecreateDisks(LogicalUnit):
6873 """Recreate an instance's missing disks.
6876 HPATH = "instance-recreate-disks"
6877 HTYPE = constants.HTYPE_INSTANCE
6878 REQ_BGL = False
6880 def CheckArguments(self):
6881 # normalise the disk list
6882 self.op.disks = sorted(frozenset(self.op.disks))
6884 def ExpandNames(self):
6885 self._ExpandAndLockInstance()
6886 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6887 if self.op.nodes:
6888 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6889 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6890 else:
6891 self.needed_locks[locking.LEVEL_NODE] = []
6892 self.needed_locks[locking.LEVEL_NODE_RES] = []
6894 def DeclareLocks(self, level):
6895 if level == locking.LEVEL_NODE:
6896 # if we replace the nodes, we only need to lock the old primary,
6897 # otherwise we need to lock all nodes for disk re-creation
6898 primary_only = bool(self.op.nodes)
6899 self._LockInstancesNodes(primary_only=primary_only)
6900 elif level == locking.LEVEL_NODE_RES:
6901 # Copy node locks
6902 self.needed_locks[locking.LEVEL_NODE_RES] = \
6903 self.needed_locks[locking.LEVEL_NODE][:]
6905 def BuildHooksEnv(self):
6906 """Build hooks env.
6908 This runs on master, primary and secondary nodes of the instance.
6910 """
6911 return _BuildInstanceHookEnvByObject(self, self.instance)
6913 def BuildHooksNodes(self):
6914 """Build hooks nodes.
6917 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6920 def CheckPrereq(self):
6921 """Check prerequisites.
6923 This checks that the instance is in the cluster and is not running.
6925 """
6926 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6927 assert instance is not None, \
6928 "Cannot retrieve locked instance %s" % self.op.instance_name
6929 if self.op.nodes:
6930 if len(self.op.nodes) != len(instance.all_nodes):
6931 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6932 " %d replacement nodes were specified" %
6933 (instance.name, len(instance.all_nodes),
6934 len(self.op.nodes)),
6935 errors.ECODE_INVAL)
6936 assert instance.disk_template != constants.DT_DRBD8 or \
6937 len(self.op.nodes) == 2
6938 assert instance.disk_template != constants.DT_PLAIN or \
6939 len(self.op.nodes) == 1
6940 primary_node = self.op.nodes[0]
6941 else:
6942 primary_node = instance.primary_node
6943 _CheckNodeOnline(self, primary_node)
6945 if instance.disk_template == constants.DT_DISKLESS:
6946 raise errors.OpPrereqError("Instance '%s' has no disks" %
6947 self.op.instance_name, errors.ECODE_INVAL)
6948 # if we replace nodes *and* the old primary is offline, we don't
6949 # check
6950 assert instance.primary_node in self.owned_locks(locking.LEVEL_NODE)
6951 assert instance.primary_node in self.owned_locks(locking.LEVEL_NODE_RES)
6952 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6953 if not (self.op.nodes and old_pnode.offline):
6954 _CheckInstanceState(self, instance, INSTANCE_NOT_RUNNING,
6955 msg="cannot recreate disks")
6957 if not self.op.disks:
6958 self.op.disks = range(len(instance.disks))
6959 else:
6960 for idx in self.op.disks:
6961 if idx >= len(instance.disks):
6962 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6964 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6965 raise errors.OpPrereqError("Can't recreate disks partially and"
6966 " change the nodes at the same time",
6968 self.instance = instance
6970 def Exec(self, feedback_fn):
6971 """Recreate the disks.
6974 instance = self.instance
6976 assert (self.owned_locks(locking.LEVEL_NODE) ==
6977 self.owned_locks(locking.LEVEL_NODE_RES))
6979 to_skip = []
6980 mods = [] # keeps track of needed logical_id changes
6982 for idx, disk in enumerate(instance.disks):
6983 if idx not in self.op.disks: # disk idx has not been passed in
6984 to_skip.append(idx)
6985 continue
6986 # update secondaries for disks, if needed
6987 if self.op.nodes:
6988 if disk.dev_type == constants.LD_DRBD8:
6989 # need to update the nodes and minors
6990 assert len(self.op.nodes) == 2
6991 assert len(disk.logical_id) == 6 # otherwise disk internals
6992 # have changed
6993 (_, _, old_port, _, _, old_secret) = disk.logical_id
6994 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6995 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6996 new_minors[0], new_minors[1], old_secret)
6997 assert len(disk.logical_id) == len(new_id)
6998 mods.append((idx, new_id))
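# For DRBD8 disks the logical_id is the 6-tuple
# (node_a, node_b, port, minor_a, minor_b, secret); the code above swaps in
# the new nodes and freshly allocated minors while keeping the old port and
# shared secret.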
7000 # now that we have passed all asserts above, we can apply the mods
7001 # in a single run (to avoid partial changes)
7002 for idx, new_id in mods:
7003 instance.disks[idx].logical_id = new_id
7005 # change primary node, if needed
7006 if self.op.nodes:
7007 instance.primary_node = self.op.nodes[0]
7008 self.LogWarning("Changing the instance's nodes, you will have to"
7009 " remove any disks left on the older nodes manually")
7011 if self.op.nodes:
7012 self.cfg.Update(instance, feedback_fn)
7014 _CreateDisks(self, instance, to_skip=to_skip)
7017 class LUInstanceRename(LogicalUnit):
7018 """Rename an instance.
7021 HPATH = "instance-rename"
7022 HTYPE = constants.HTYPE_INSTANCE
7024 def CheckArguments(self):
7025 """Check arguments.
7027 """
7028 if self.op.ip_check and not self.op.name_check:
7029 # TODO: make the ip check more flexible and not depend on the name check
7030 raise errors.OpPrereqError("IP address check requires a name check",
7033 def BuildHooksEnv(self):
7034 """Build hooks env.
7036 This runs on master, primary and secondary nodes of the instance.
7038 """
7039 env = _BuildInstanceHookEnvByObject(self, self.instance)
7040 env["INSTANCE_NEW_NAME"] = self.op.new_name
7043 def BuildHooksNodes(self):
7044 """Build hooks nodes.
7047 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
7050 def CheckPrereq(self):
7051 """Check prerequisites.
7053 This checks that the instance is in the cluster and is not running.
7055 """
7056 self.op.instance_name = _ExpandInstanceName(self.cfg,
7057 self.op.instance_name)
7058 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7059 assert instance is not None
7060 _CheckNodeOnline(self, instance.primary_node)
7061 _CheckInstanceState(self, instance, INSTANCE_NOT_RUNNING,
7062 msg="cannot rename")
7063 self.instance = instance
7065 new_name = self.op.new_name
7066 if self.op.name_check:
7067 hostname = netutils.GetHostname(name=new_name)
7068 if hostname.name != new_name:
7069 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
7070 hostname.name)
7071 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
7072 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
7073 " same as given hostname '%s'") %
7074 (hostname.name, self.op.new_name),
7075 errors.ECODE_INVAL)
7076 new_name = self.op.new_name = hostname.name
7077 if (self.op.ip_check and
7078 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
7079 raise errors.OpPrereqError("IP %s of instance %s already in use" %
7080 (hostname.ip, new_name),
7081 errors.ECODE_NOTUNIQUE)
7083 instance_list = self.cfg.GetInstanceList()
7084 if new_name in instance_list and new_name != instance.name:
7085 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
7086 new_name, errors.ECODE_EXISTS)
7088 def Exec(self, feedback_fn):
7089 """Rename the instance.
7092 inst = self.instance
7093 old_name = inst.name
7095 rename_file_storage = False
7096 if (inst.disk_template in constants.DTS_FILEBASED and
7097 self.op.new_name != inst.name):
7098 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
7099 rename_file_storage = True
7101 self.cfg.RenameInstance(inst.name, self.op.new_name)
7102 # Change the instance lock. This is definitely safe while we hold the BGL.
7103 # Otherwise the new lock would have to be added in acquired mode.
7104 assert self.REQ_BGL
7105 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
7106 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
7108 # re-read the instance from the configuration after rename
7109 inst = self.cfg.GetInstanceInfo(self.op.new_name)
7111 if rename_file_storage:
7112 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
7113 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
7114 old_file_storage_dir,
7115 new_file_storage_dir)
7116 result.Raise("Could not rename on node %s directory '%s' to '%s'"
7117 " (but the instance has been renamed in Ganeti)" %
7118 (inst.primary_node, old_file_storage_dir,
7119 new_file_storage_dir))
7121 _StartInstanceDisks(self, inst, None)
7122 try:
7123 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
7124 old_name, self.op.debug_level)
7125 msg = result.fail_msg
7127 msg = ("Could not run OS rename script for instance %s on node %s"
7128 " (but the instance has been renamed in Ganeti): %s" %
7129 (inst.name, inst.primary_node, msg))
7130 self.proc.LogWarning(msg)
7131 finally:
7132 _ShutdownInstanceDisks(self, inst)
7134 return inst.name
7137 class LUInstanceRemove(LogicalUnit):
7138 """Remove an instance.
7141 HPATH = "instance-remove"
7142 HTYPE = constants.HTYPE_INSTANCE
7143 REQ_BGL = False
7145 def ExpandNames(self):
7146 self._ExpandAndLockInstance()
7147 self.needed_locks[locking.LEVEL_NODE] = []
7148 self.needed_locks[locking.LEVEL_NODE_RES] = []
7149 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7151 def DeclareLocks(self, level):
7152 if level == locking.LEVEL_NODE:
7153 self._LockInstancesNodes()
7154 elif level == locking.LEVEL_NODE_RES:
7155 # Copy node locks
7156 self.needed_locks[locking.LEVEL_NODE_RES] = \
7157 self.needed_locks[locking.LEVEL_NODE][:]
7159 def BuildHooksEnv(self):
7160 """Build hooks env.
7162 This runs on master, primary and secondary nodes of the instance.
7164 """
7165 env = _BuildInstanceHookEnvByObject(self, self.instance)
7166 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
7169 def BuildHooksNodes(self):
7170 """Build hooks nodes.
7173 nl = [self.cfg.GetMasterNode()]
7174 nl_post = list(self.instance.all_nodes) + nl
7175 return (nl, nl_post)
7177 def CheckPrereq(self):
7178 """Check prerequisites.
7180 This checks that the instance is in the cluster.
7182 """
7183 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7184 assert self.instance is not None, \
7185 "Cannot retrieve locked instance %s" % self.op.instance_name
7187 def Exec(self, feedback_fn):
7188 """Remove the instance.
7191 instance = self.instance
7192 logging.info("Shutting down instance %s on node %s",
7193 instance.name, instance.primary_node)
7195 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
7196 self.op.shutdown_timeout)
7197 msg = result.fail_msg
7198 if msg:
7199 if self.op.ignore_failures:
7200 feedback_fn("Warning: can't shutdown instance: %s" % msg)
7202 raise errors.OpExecError("Could not shutdown instance %s on"
7204 (instance.name, instance.primary_node, msg))
7206 assert (self.owned_locks(locking.LEVEL_NODE) ==
7207 self.owned_locks(locking.LEVEL_NODE_RES))
7208 assert not (set(instance.all_nodes) -
7209 self.owned_locks(locking.LEVEL_NODE)), \
7210 "Not owning correct locks"
7212 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
7215 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
7216 """Utility function to remove an instance.
7219 logging.info("Removing block devices for instance %s", instance.name)
7221 if not _RemoveDisks(lu, instance):
7222 if not ignore_failures:
7223 raise errors.OpExecError("Can't remove instance's disks")
7224 feedback_fn("Warning: can't remove instance's disks")
7226 logging.info("Removing instance %s out of cluster config", instance.name)
7228 lu.cfg.RemoveInstance(instance.name)
7230 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
7231 "Instance lock removal conflict"
7233 # Remove lock for the instance
7234 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
7237 class LUInstanceQuery(NoHooksLU):
7238 """Logical unit for querying instances.
7241 # pylint: disable=W0142
7244 def CheckArguments(self):
7245 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
7246 self.op.output_fields, self.op.use_locking)
7248 def ExpandNames(self):
7249 self.iq.ExpandNames(self)
7251 def DeclareLocks(self, level):
7252 self.iq.DeclareLocks(self, level)
7254 def Exec(self, feedback_fn):
7255 return self.iq.OldStyleQuery(self)
7258 class LUInstanceFailover(LogicalUnit):
7259 """Failover an instance.
7262 HPATH = "instance-failover"
7263 HTYPE = constants.HTYPE_INSTANCE
7264 REQ_BGL = False
7266 def CheckArguments(self):
7267 """Check the arguments.
7270 self.iallocator = getattr(self.op, "iallocator", None)
7271 self.target_node = getattr(self.op, "target_node", None)
7273 def ExpandNames(self):
7274 self._ExpandAndLockInstance()
7276 if self.op.target_node is not None:
7277 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
7279 self.needed_locks[locking.LEVEL_NODE] = []
7280 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7282 ignore_consistency = self.op.ignore_consistency
7283 shutdown_timeout = self.op.shutdown_timeout
7284 self._migrater = TLMigrateInstance(self, self.op.instance_name,
7285 cleanup=False,
7286 failover=True,
7287 ignore_consistency=ignore_consistency,
7288 shutdown_timeout=shutdown_timeout,
7289 ignore_ipolicy=self.op.ignore_ipolicy)
7290 self.tasklets = [self._migrater]
7292 def DeclareLocks(self, level):
7293 if level == locking.LEVEL_NODE:
7294 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
7295 if instance.disk_template in constants.DTS_EXT_MIRROR:
7296 if self.op.target_node is None:
7297 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7298 else:
7299 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
7300 self.op.target_node]
7301 del self.recalculate_locks[locking.LEVEL_NODE]
7302 else:
7303 self._LockInstancesNodes()
7305 def BuildHooksEnv(self):
7306 """Build hooks env.
7308 This runs on master, primary and secondary nodes of the instance.
7310 """
7311 instance = self._migrater.instance
7312 source_node = instance.primary_node
7313 target_node = self.op.target_node
7315 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
7316 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
7317 "OLD_PRIMARY": source_node,
7318 "NEW_PRIMARY": target_node,
7319 }
7321 if instance.disk_template in constants.DTS_INT_MIRROR:
7322 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
7323 env["NEW_SECONDARY"] = source_node
7325 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
7327 env.update(_BuildInstanceHookEnvByObject(self, instance))
7329 return env
7331 def BuildHooksNodes(self):
7332 """Build hooks nodes.
7335 instance = self._migrater.instance
7336 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
7337 return (nl, nl + [instance.primary_node])
7340 class LUInstanceMigrate(LogicalUnit):
7341 """Migrate an instance.
7343 This is migration without shutting down, compared to the failover,
7344 which is done with shutdown.
7346 """
7347 HPATH = "instance-migrate"
7348 HTYPE = constants.HTYPE_INSTANCE
7349 REQ_BGL = False
7351 def ExpandNames(self):
7352 self._ExpandAndLockInstance()
7354 if self.op.target_node is not None:
7355 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
7357 self.needed_locks[locking.LEVEL_NODE] = []
7358 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7360 self._migrater = TLMigrateInstance(self, self.op.instance_name,
7361 cleanup=self.op.cleanup,
7362 failover=False,
7363 fallback=self.op.allow_failover,
7364 ignore_ipolicy=self.op.ignore_ipolicy)
7365 self.tasklets = [self._migrater]
7367 def DeclareLocks(self, level):
7368 if level == locking.LEVEL_NODE:
7369 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
7370 if instance.disk_template in constants.DTS_EXT_MIRROR:
7371 if self.op.target_node is None:
7372 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7373 else:
7374 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
7375 self.op.target_node]
7376 del self.recalculate_locks[locking.LEVEL_NODE]
7377 else:
7378 self._LockInstancesNodes()
7380 def BuildHooksEnv(self):
7381 """Build hooks env.
7383 This runs on master, primary and secondary nodes of the instance.
7385 """
7386 instance = self._migrater.instance
7387 source_node = instance.primary_node
7388 target_node = self.op.target_node
7389 env = _BuildInstanceHookEnvByObject(self, instance)
7391 "MIGRATE_LIVE": self._migrater.live,
7392 "MIGRATE_CLEANUP": self.op.cleanup,
7393 "OLD_PRIMARY": source_node,
7394 "NEW_PRIMARY": target_node,
7395 })
7397 if instance.disk_template in constants.DTS_INT_MIRROR:
7398 env["OLD_SECONDARY"] = target_node
7399 env["NEW_SECONDARY"] = source_node
7401 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
7405 def BuildHooksNodes(self):
7406 """Build hooks nodes.
7409 instance = self._migrater.instance
7410 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
7411 return (nl, nl + [instance.primary_node])
7414 class LUInstanceMove(LogicalUnit):
7415 """Move an instance by data-copying.
7418 HPATH = "instance-move"
7419 HTYPE = constants.HTYPE_INSTANCE
7420 REQ_BGL = False
7422 def ExpandNames(self):
7423 self._ExpandAndLockInstance()
7424 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
7425 self.op.target_node = target_node
7426 self.needed_locks[locking.LEVEL_NODE] = [target_node]
7427 self.needed_locks[locking.LEVEL_NODE_RES] = []
7428 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
7430 def DeclareLocks(self, level):
7431 if level == locking.LEVEL_NODE:
7432 self._LockInstancesNodes(primary_only=True)
7433 elif level == locking.LEVEL_NODE_RES:
7434 # Copy node locks
7435 self.needed_locks[locking.LEVEL_NODE_RES] = \
7436 self.needed_locks[locking.LEVEL_NODE][:]
7438 def BuildHooksEnv(self):
7439 """Build hooks env.
7441 This runs on master, primary and secondary nodes of the instance.
7443 """
7444 env = {
7445 "TARGET_NODE": self.op.target_node,
7446 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
7447 }
7448 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
7450 return env
7451 def BuildHooksNodes(self):
7452 """Build hooks nodes.
7456 self.cfg.GetMasterNode(),
7457 self.instance.primary_node,
7458 self.op.target_node,
7459 ]
7460 return (nl, nl)
7462 def CheckPrereq(self):
7463 """Check prerequisites.
7465 This checks that the instance is in the cluster.
7467 """
7468 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7469 assert self.instance is not None, \
7470 "Cannot retrieve locked instance %s" % self.op.instance_name
7472 node = self.cfg.GetNodeInfo(self.op.target_node)
7473 assert node is not None, \
7474 "Cannot retrieve locked node %s" % self.op.target_node
7476 self.target_node = target_node = node.name
7478 if target_node == instance.primary_node:
7479 raise errors.OpPrereqError("Instance %s is already on the node %s" %
7480 (instance.name, target_node),
7481 errors.ECODE_STATE)
7483 bep = self.cfg.GetClusterInfo().FillBE(instance)
7485 for idx, dsk in enumerate(instance.disks):
7486 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
7487 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
7488 " cannot copy" % idx, errors.ECODE_STATE)
7490 _CheckNodeOnline(self, target_node)
7491 _CheckNodeNotDrained(self, target_node)
7492 _CheckNodeVmCapable(self, target_node)
7493 ipolicy = _CalculateGroupIPolicy(self.cfg.GetClusterInfo(),
7494 self.cfg.GetNodeGroup(node.group))
7495 _CheckTargetNodeIPolicy(self, ipolicy, instance, node,
7496 ignore=self.op.ignore_ipolicy)
7498 if instance.admin_state == constants.ADMINST_UP:
7499 # check memory requirements on the secondary node
7500 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
7501 instance.name, bep[constants.BE_MAXMEM],
7502 instance.hypervisor)
7504 self.LogInfo("Not checking memory on the secondary node as"
7505 " instance will not be started")
7507 # check bridge existence
7508 _CheckInstanceBridgesExist(self, instance, node=target_node)
7510 def Exec(self, feedback_fn):
7511 """Move an instance.
7513 The move is done by shutting it down on its present node, copying
7514 the data over (slow) and starting it on the new node.
7516 """
7517 instance = self.instance
7519 source_node = instance.primary_node
7520 target_node = self.target_node
7522 self.LogInfo("Shutting down instance %s on source node %s",
7523 instance.name, source_node)
7525 assert (self.owned_locks(locking.LEVEL_NODE) ==
7526 self.owned_locks(locking.LEVEL_NODE_RES))
7528 result = self.rpc.call_instance_shutdown(source_node, instance,
7529 self.op.shutdown_timeout)
7530 msg = result.fail_msg
7532 if self.op.ignore_consistency:
7533 self.proc.LogWarning("Could not shut down instance %s on node %s."
7534 " Proceeding anyway. Please make sure node"
7535 " %s is down. Error details: %s",
7536 instance.name, source_node, source_node, msg)
7538 raise errors.OpExecError("Could not shut down instance %s on"
7540 (instance.name, source_node, msg))
7542 # create the target disks
7544 _CreateDisks(self, instance, target_node=target_node)
7545 except errors.OpExecError:
7546 self.LogWarning("Device creation failed, reverting...")
7548 _RemoveDisks(self, instance, target_node=target_node)
7550 self.cfg.ReleaseDRBDMinors(instance.name)
7553 cluster_name = self.cfg.GetClusterInfo().cluster_name
7556 # activate, get path, copy the data over
7557 for idx, disk in enumerate(instance.disks):
7558 self.LogInfo("Copying data for disk %d", idx)
7559 result = self.rpc.call_blockdev_assemble(target_node, disk,
7560 instance.name, True, idx)
7562 self.LogWarning("Can't assemble newly created disk %d: %s",
7563 idx, result.fail_msg)
7564 errs.append(result.fail_msg)
7566 dev_path = result.payload
7567 result = self.rpc.call_blockdev_export(source_node, disk,
7568 target_node, dev_path,
7571 self.LogWarning("Can't copy data over for disk %d: %s",
7572 idx, result.fail_msg)
7573 errs.append(result.fail_msg)
7577 self.LogWarning("Some disks failed to copy, aborting")
7579 _RemoveDisks(self, instance, target_node=target_node)
7581 self.cfg.ReleaseDRBDMinors(instance.name)
7582 raise errors.OpExecError("Errors during disk copy: %s" %
7585 instance.primary_node = target_node
7586 self.cfg.Update(instance, feedback_fn)
7588 self.LogInfo("Removing the disks on the original node")
7589 _RemoveDisks(self, instance, target_node=source_node)
7591 # Only start the instance if it's marked as up
7592 if instance.admin_state == constants.ADMINST_UP:
7593 self.LogInfo("Starting instance %s on node %s",
7594 instance.name, target_node)
7596 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7597 ignore_secondaries=True)
7599 _ShutdownInstanceDisks(self, instance)
7600 raise errors.OpExecError("Can't activate the instance's disks")
7602 result = self.rpc.call_instance_start(target_node,
7603 (instance, None, None), False)
7604 msg = result.fail_msg
7606 _ShutdownInstanceDisks(self, instance)
7607 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7608 (instance.name, target_node, msg))
7611 class LUNodeMigrate(LogicalUnit):
7612 """Migrate all instances from a node.
7615 HPATH = "node-migrate"
7616 HTYPE = constants.HTYPE_NODE
7619 def CheckArguments(self):
7622 def ExpandNames(self):
7623 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7625 self.share_locks = _ShareAll()
7626 self.needed_locks = {
7627 locking.LEVEL_NODE: [self.op.node_name],
7630 def BuildHooksEnv(self):
7633 This runs on the master node only.
7637 "NODE_NAME": self.op.node_name,
7640 def BuildHooksNodes(self):
7641 """Build hooks nodes.
7644 nl = [self.cfg.GetMasterNode()]
7647 def CheckPrereq(self):
7650 def Exec(self, feedback_fn):
7651 # Prepare jobs for migration instances
7653 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7656 iallocator=self.op.iallocator,
7657 target_node=self.op.target_node,
7658 ignore_ipolicy=self.op.ignore_ipolicy)]
7659 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7662 # TODO: Run iallocator in this opcode and pass correct placement options to
7663 # OpInstanceMigrate. Since other jobs can modify the cluster between
7664 # running the iallocator and the actual migration, a good consistency model
7665 # will have to be found.
7667 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7668 frozenset([self.op.node_name]))
7670 return ResultWithJobs(jobs)
7673 class TLMigrateInstance(Tasklet):
7674 """Tasklet class for instance migration.
7677 @ivar live: whether the migration will be done live or non-live;
7678 this variable is initialized only after CheckPrereq has run
7679 @type cleanup: boolean
7680 @ivar cleanup: Whether we clean up after a failed migration
7681 @type iallocator: string
7682 @ivar iallocator: The iallocator used to determine target_node
7683 @type target_node: string
7684 @ivar target_node: If given, the target_node to reallocate the instance to
7685 @type failover: boolean
7686 @ivar failover: Whether operation results in failover or migration
7687 @type fallback: boolean
7688 @ivar fallback: Whether fallback to failover is allowed if migration not
7690 @type ignore_consistency: boolean
7691 @ivar ignore_consistency: Whether we should ignore consistency between source
7693 @type shutdown_timeout: int
7694 @ivar shutdown_timeout: In case of failover, the timeout to use for the shutdown
7695 @type ignore_ipolicy: bool
7696 @ivar ignore_ipolicy: If true, we can ignore instance policy when migrating
7701 _MIGRATION_POLL_INTERVAL = 1 # seconds
7702 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7704 def __init__(self, lu, instance_name, cleanup=False,
7705 failover=False, fallback=False,
7706 ignore_consistency=False,
7707 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT,
7708 ignore_ipolicy=False):
7709 """Initializes this class.
7712 Tasklet.__init__(self, lu)
7715 self.instance_name = instance_name
7716 self.cleanup = cleanup
7717 self.live = False # will be overridden later
7718 self.failover = failover
7719 self.fallback = fallback
7720 self.ignore_consistency = ignore_consistency
7721 self.shutdown_timeout = shutdown_timeout
7722 self.ignore_ipolicy = ignore_ipolicy
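# Illustrative sketch (hypothetical caller): an owning LU would typically
# create this tasklet in its ExpandNames and register it, e.g.
#   self._migrater = TLMigrateInstance(self, self.op.instance_name,
#                                      cleanup=self.op.cleanup,
#                                      failover=False, fallback=True)
#   self.tasklets = [self._migrater]
# with the exact keyword values depending on the opcode being served.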
7724 def CheckPrereq(self):
7725 """Check prerequisites.
7727 This checks that the instance is in the cluster.
7730 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7731 instance = self.cfg.GetInstanceInfo(instance_name)
7732 assert instance is not None
7733 self.instance = instance
7734 cluster = self.cfg.GetClusterInfo()
7736 if (not self.cleanup and
7737 not instance.admin_state == constants.ADMINST_UP and
7738 not self.failover and self.fallback):
7739 self.lu.LogInfo("Instance is marked down or offline, fallback allowed,"
7740 " switching to failover")
7741 self.failover = True
7743 if instance.disk_template not in constants.DTS_MIRRORED:
7748 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7749 " %s" % (instance.disk_template, text),
7752 if instance.disk_template in constants.DTS_EXT_MIRROR:
7753 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7755 if self.lu.op.iallocator:
7756 self._RunAllocator()
7758 # We set self.target_node as it is required by
7760 self.target_node = self.lu.op.target_node
7762 # Check that the target node is correct in terms of instance policy
7763 nodeinfo = self.cfg.GetNodeInfo(self.target_node)
7764 group_info = self.cfg.GetNodeGroup(nodeinfo.group)
7765 ipolicy = _CalculateGroupIPolicy(cluster, group_info)
7766 _CheckTargetNodeIPolicy(self.lu, ipolicy, instance, nodeinfo,
7767 ignore=self.ignore_ipolicy)
7769 # self.target_node is already populated, either directly or by the
7771 target_node = self.target_node
7772 if self.target_node == instance.primary_node:
7773 raise errors.OpPrereqError("Cannot migrate instance %s"
7774 " to its primary (%s)" %
7775 (instance.name, instance.primary_node))
7777 if len(self.lu.tasklets) == 1:
7778 # It is safe to release locks only when we're the only tasklet
7780 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7781 keep=[instance.primary_node, self.target_node])
7784 secondary_nodes = instance.secondary_nodes
7785 if not secondary_nodes:
7786 raise errors.ConfigurationError("No secondary node but using"
7787 " %s disk template" %
7788 instance.disk_template)
7789 target_node = secondary_nodes[0]
7790 if self.lu.op.iallocator or (self.lu.op.target_node and
7791 self.lu.op.target_node != target_node):
7793 text = "failed over"
7796 raise errors.OpPrereqError("Instances with disk template %s cannot"
7797 " be %s to arbitrary nodes"
7798 " (neither an iallocator nor a target"
7799 " node can be passed)" %
7800 (instance.disk_template, text),
7802 nodeinfo = self.cfg.GetNodeInfo(target_node)
7803 group_info = self.cfg.GetNodeGroup(nodeinfo.group)
7804 ipolicy = _CalculateGroupIPolicy(cluster, group_info)
7805 _CheckTargetNodeIPolicy(self.lu, ipolicy, instance, nodeinfo,
7806 ignore=self.ignore_ipolicy)
7808 i_be = cluster.FillBE(instance)
7810 # check memory requirements on the secondary node
7811 if (not self.cleanup and
7812 (not self.failover or instance.admin_state == constants.ADMINST_UP)):
7813 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7814 instance.name, i_be[constants.BE_MAXMEM],
7815 instance.hypervisor)
7817 self.lu.LogInfo("Not checking memory on the secondary node as"
7818 " instance will not be started")
7820 # check if failover must be forced instead of migration
7821 if (not self.cleanup and not self.failover and
7822 i_be[constants.BE_ALWAYS_FAILOVER]):
7824 self.lu.LogInfo("Instance configured to always fail over; fallback"
7826 self.failover = True
7828 raise errors.OpPrereqError("This instance has been configured to"
7829 " always failover, please allow failover",
7832 # check bridge existence
7833 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7835 if not self.cleanup:
7836 _CheckNodeNotDrained(self.lu, target_node)
7837 if not self.failover:
7838 result = self.rpc.call_instance_migratable(instance.primary_node,
7840 if result.fail_msg and self.fallback:
7841 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7843 self.failover = True
7845 result.Raise("Can't migrate, please use failover",
7846 prereq=True, ecode=errors.ECODE_STATE)
7848 assert not (self.failover and self.cleanup)
7850 if not self.failover:
7851 if self.lu.op.live is not None and self.lu.op.mode is not None:
7852 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7853 " parameters are accepted",
7855 if self.lu.op.live is not None:
7857 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7859 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7860 # reset the 'live' parameter to None so that repeated
7861 # invocations of CheckPrereq do not raise an exception
7862 self.lu.op.live = None
7863 elif self.lu.op.mode is None:
7864 # read the default value from the hypervisor
7865 i_hv = cluster.FillHV(self.instance, skip_globals=False)
7866 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7868 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7870 # Failover is never live
7873 def _RunAllocator(self):
7874 """Run the allocator based on input opcode.
7877 # FIXME: add a self.ignore_ipolicy option
7878 ial = IAllocator(self.cfg, self.rpc,
7879 mode=constants.IALLOCATOR_MODE_RELOC,
7880 name=self.instance_name,
7881 # TODO See why hail breaks with a single node below
7882 relocate_from=[self.instance.primary_node,
7883 self.instance.primary_node],
7886 ial.Run(self.lu.op.iallocator)
7889 raise errors.OpPrereqError("Can't compute nodes using"
7890 " iallocator '%s': %s" %
7891 (self.lu.op.iallocator, ial.info),
7893 if len(ial.result) != ial.required_nodes:
7894 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7895 " of nodes (%s), required %s" %
7896 (self.lu.op.iallocator, len(ial.result),
7897 ial.required_nodes), errors.ECODE_FAULT)
7898 self.target_node = ial.result[0]
7899 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7900 self.instance_name, self.lu.op.iallocator,
7901 utils.CommaJoin(ial.result))
7903 def _WaitUntilSync(self):
7904 """Poll with custom rpc for disk sync.
7906 This uses our own step-based rpc call.
7909 self.feedback_fn("* wait until resync is done")
7913 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7915 self.instance.disks)
7917 for node, nres in result.items():
7918 nres.Raise("Cannot resync disks on node %s" % node)
7919 node_done, node_percent = nres.payload
7920 all_done = all_done and node_done
7921 if node_percent is not None:
7922 min_percent = min(min_percent, node_percent)
7924 if min_percent < 100:
7925 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7928 def _EnsureSecondary(self, node):
7929 """Demote a node to secondary.
7932 self.feedback_fn("* switching node %s to secondary mode" % node)
7934 for dev in self.instance.disks:
7935 self.cfg.SetDiskID(dev, node)
7937 result = self.rpc.call_blockdev_close(node, self.instance.name,
7938 self.instance.disks)
7939 result.Raise("Cannot change disk to secondary on node %s" % node)
7941 def _GoStandalone(self):
7942 """Disconnect from the network.
7945 self.feedback_fn("* changing into standalone mode")
7946 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7947 self.instance.disks)
7948 for node, nres in result.items():
7949 nres.Raise("Cannot disconnect disks on node %s" % node)
7951 def _GoReconnect(self, multimaster):
7952 """Reconnect to the network.
7958 msg = "single-master"
7959 self.feedback_fn("* changing disks into %s mode" % msg)
7960 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7961 self.instance.disks,
7962 self.instance.name, multimaster)
7963 for node, nres in result.items():
7964 nres.Raise("Cannot change disks config on node %s" % node)
7966 def _ExecCleanup(self):
7967 """Try to cleanup after a failed migration.
7969 The cleanup is done by:
7970 - check that the instance is running only on one node
7971 (and update the config if needed)
7972 - change disks on its secondary node to secondary
7973 - wait until disks are fully synchronized
7974 - disconnect from the network
7975 - change disks into single-master mode
7976 - wait again until disks are fully synchronized
7979 instance = self.instance
7980 target_node = self.target_node
7981 source_node = self.source_node
7983 # check running on only one node
7984 self.feedback_fn("* checking where the instance actually runs"
7985 " (if this hangs, the hypervisor might be in"
7987 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7988 for node, result in ins_l.items():
7989 result.Raise("Can't contact node %s" % node)
7991 runningon_source = instance.name in ins_l[source_node].payload
7992 runningon_target = instance.name in ins_l[target_node].payload
7994 if runningon_source and runningon_target:
7995 raise errors.OpExecError("Instance seems to be running on two nodes,"
7996 " or the hypervisor is confused; you will have"
7997 " to ensure manually that it runs only on one"
7998 " and restart this operation")
8000 if not (runningon_source or runningon_target):
8001 raise errors.OpExecError("Instance does not seem to be running at all;"
8002 " in this case it's safer to repair by"
8003 " running 'gnt-instance stop' to ensure disk"
8004 " shutdown, and then restarting it")
8006 if runningon_target:
8007 # the migration has actually succeeded, we need to update the config
8008 self.feedback_fn("* instance running on secondary node (%s),"
8009 " updating config" % target_node)
8010 instance.primary_node = target_node
8011 self.cfg.Update(instance, self.feedback_fn)
8012 demoted_node = source_node
8014 self.feedback_fn("* instance confirmed to be running on its"
8015 " primary node (%s)" % source_node)
8016 demoted_node = target_node
8018 if instance.disk_template in constants.DTS_INT_MIRROR:
8019 self._EnsureSecondary(demoted_node)
8021 self._WaitUntilSync()
8022 except errors.OpExecError:
8023 # we ignore errors here, since if the device is standalone, it
8024 # won't be able to sync
8026 self._GoStandalone()
8027 self._GoReconnect(False)
8028 self._WaitUntilSync()
8030 self.feedback_fn("* done")
8032 def _RevertDiskStatus(self):
8033 """Try to revert the disk status after a failed migration.
8036 target_node = self.target_node
8037 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
8041 self._EnsureSecondary(target_node)
8042 self._GoStandalone()
8043 self._GoReconnect(False)
8044 self._WaitUntilSync()
8045 except errors.OpExecError, err:
8046 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
8047 " please try to recover the instance manually;"
8048 " error '%s'" % str(err))
8050 def _AbortMigration(self):
8051 """Call the hypervisor code to abort a started migration.
8054 instance = self.instance
8055 target_node = self.target_node
8056 source_node = self.source_node
8057 migration_info = self.migration_info
8059 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
8063 abort_msg = abort_result.fail_msg
8065 logging.error("Aborting migration failed on target node %s: %s",
8066 target_node, abort_msg)
8067 # Don't raise an exception here, as we still have to try to revert the
8068 # disk status, even if this step failed.
8070 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
8071 instance, False, self.live)
8072 abort_msg = abort_result.fail_msg
8074 logging.error("Aborting migration failed on source node %s: %s",
8075 source_node, abort_msg)
8077 def _ExecMigration(self):
8078 """Migrate an instance.
8080 The migrate is done by:
8081 - change the disks into dual-master mode
8082 - wait until disks are fully synchronized again
8083 - migrate the instance
8084 - change disks on the new secondary node (the old primary) to secondary
8085 - wait until disks are fully synchronized
8086 - change disks into single-master mode
8089 instance = self.instance
8090 target_node = self.target_node
8091 source_node = self.source_node
8093 # Check for hypervisor version mismatch and warn the user.
8094 nodeinfo = self.rpc.call_node_info([source_node, target_node],
8095 None, [self.instance.hypervisor])
8096 for ninfo in nodeinfo.values():
8097 ninfo.Raise("Unable to retrieve node information from node '%s'" %
8099 (_, _, (src_info, )) = nodeinfo[source_node].payload
8100 (_, _, (dst_info, )) = nodeinfo[target_node].payload
8102 if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
8103 (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
8104 src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
8105 dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
8106 if src_version != dst_version:
8107 self.feedback_fn("* warning: hypervisor version mismatch between"
8108 " source (%s) and target (%s) node" %
8109 (src_version, dst_version))
8111 self.feedback_fn("* checking disk consistency between source and target")
8112 for dev in instance.disks:
8113 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
8114 raise errors.OpExecError("Disk %s is degraded or not fully"
8115 " synchronized on target node,"
8116 " aborting migration" % dev.iv_name)
8118 # First get the migration information from the remote node
8119 result = self.rpc.call_migration_info(source_node, instance)
8120 msg = result.fail_msg
8122 log_err = ("Failed fetching source migration information from %s: %s" %
8124 logging.error(log_err)
8125 raise errors.OpExecError(log_err)
8127 self.migration_info = migration_info = result.payload
8129 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
8130 # Then switch the disks to master/master mode
8131 self._EnsureSecondary(target_node)
8132 self._GoStandalone()
8133 self._GoReconnect(True)
8134 self._WaitUntilSync()
8136 self.feedback_fn("* preparing %s to accept the instance" % target_node)
8137 result = self.rpc.call_accept_instance(target_node,
8140 self.nodes_ip[target_node])
8142 msg = result.fail_msg
8144 logging.error("Instance pre-migration failed, trying to revert"
8145 " disk status: %s", msg)
8146 self.feedback_fn("Pre-migration failed, aborting")
8147 self._AbortMigration()
8148 self._RevertDiskStatus()
8149 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
8150 (instance.name, msg))
8152 self.feedback_fn("* migrating instance to %s" % target_node)
8153 result = self.rpc.call_instance_migrate(source_node, instance,
8154 self.nodes_ip[target_node],
8156 msg = result.fail_msg
8158 logging.error("Instance migration failed, trying to revert"
8159 " disk status: %s", msg)
8160 self.feedback_fn("Migration failed, aborting")
8161 self._AbortMigration()
8162 self._RevertDiskStatus()
8163 raise errors.OpExecError("Could not migrate instance %s: %s" %
8164 (instance.name, msg))
8166 self.feedback_fn("* starting memory transfer")
8167 last_feedback = time.time()
8169 result = self.rpc.call_instance_get_migration_status(source_node,
8171 msg = result.fail_msg
8172 ms = result.payload # MigrationStatus instance
8173 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
8174 logging.error("Instance migration failed, trying to revert"
8175 " disk status: %s", msg)
8176 self.feedback_fn("Migration failed, aborting")
8177 self._AbortMigration()
8178 self._RevertDiskStatus()
8179 raise errors.OpExecError("Could not migrate instance %s: %s" %
8180 (instance.name, msg))
8182 if ms.status != constants.HV_MIGRATION_ACTIVE:
8183 self.feedback_fn("* memory transfer complete")
8186 if (utils.TimeoutExpired(last_feedback,
8187 self._MIGRATION_FEEDBACK_INTERVAL) and
8188 ms.transferred_ram is not None):
8189 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
8190 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
8191 last_feedback = time.time()
8193 time.sleep(self._MIGRATION_POLL_INTERVAL)
8195 result = self.rpc.call_instance_finalize_migration_src(source_node,
8199 msg = result.fail_msg
8201 logging.error("Instance migration succeeded, but finalization failed"
8202 " on the source node: %s", msg)
8203 raise errors.OpExecError("Could not finalize instance migration: %s" %
8206 instance.primary_node = target_node
8208 # distribute new instance config to the other nodes
8209 self.cfg.Update(instance, self.feedback_fn)
8211 result = self.rpc.call_instance_finalize_migration_dst(target_node,
8215 msg = result.fail_msg
8217 logging.error("Instance migration succeeded, but finalization failed"
8218 " on the target node: %s", msg)
8219 raise errors.OpExecError("Could not finalize instance migration: %s" %
8222 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
8223 self._EnsureSecondary(source_node)
8224 self._WaitUntilSync()
8225 self._GoStandalone()
8226 self._GoReconnect(False)
8227 self._WaitUntilSync()
8229 # If the instance's disk template is `rbd' and there was a successful
8230 # migration, unmap the device from the source node.
8231 if self.instance.disk_template == constants.DT_RBD:
8232 disks = _ExpandCheckDisks(instance, instance.disks)
8233 self.feedback_fn("* unmapping instance's disks from %s" % source_node)
8235 result = self.rpc.call_blockdev_shutdown(source_node, disk)
8236 msg = result.fail_msg
8238 logging.error("Migration was successful, but couldn't unmap the"
8239 " block device %s on source node %s: %s",
8240 disk.iv_name, source_node, msg)
8241 logging.error("You need to unmap the device %s manually on %s",
8242 disk.iv_name, source_node)
8244 self.feedback_fn("* done")
8246 def _ExecFailover(self):
8247 """Failover an instance.
8249 The failover is done by shutting it down on its present node and
8250 starting it on the secondary.
8253 instance = self.instance
8254 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
8256 source_node = instance.primary_node
8257 target_node = self.target_node
8259 if instance.admin_state == constants.ADMINST_UP:
8260 self.feedback_fn("* checking disk consistency between source and target")
8261 for dev in instance.disks:
8262 # for drbd, these are drbd over lvm
8263 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
8264 if primary_node.offline:
8265 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
8267 (primary_node.name, dev.iv_name, target_node))
8268 elif not self.ignore_consistency:
8269 raise errors.OpExecError("Disk %s is degraded on target node,"
8270 " aborting failover" % dev.iv_name)
8272 self.feedback_fn("* not checking disk consistency as instance is not"
8275 self.feedback_fn("* shutting down instance on source node")
8276 logging.info("Shutting down instance %s on node %s",
8277 instance.name, source_node)
8279 result = self.rpc.call_instance_shutdown(source_node, instance,
8280 self.shutdown_timeout)
8281 msg = result.fail_msg
8283 if self.ignore_consistency or primary_node.offline:
8284 self.lu.LogWarning("Could not shut down instance %s on node %s,"
8285 " proceeding anyway; please make sure node"
8286 " %s is down; error details: %s",
8287 instance.name, source_node, source_node, msg)
8289 raise errors.OpExecError("Could not shut down instance %s on"
8291 (instance.name, source_node, msg))
8293 self.feedback_fn("* deactivating the instance's disks on source node")
8294 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
8295 raise errors.OpExecError("Can't shut down the instance's disks")
8297 instance.primary_node = target_node
8298 # distribute new instance config to the other nodes
8299 self.cfg.Update(instance, self.feedback_fn)
8301 # Only start the instance if it's marked as up
8302 if instance.admin_state == constants.ADMINST_UP:
8303 self.feedback_fn("* activating the instance's disks on target node %s" %
8305 logging.info("Starting instance %s on node %s",
8306 instance.name, target_node)
8308 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
8309 ignore_secondaries=True)
8311 _ShutdownInstanceDisks(self.lu, instance)
8312 raise errors.OpExecError("Can't activate the instance's disks")
8314 self.feedback_fn("* starting the instance on the target node %s" %
8316 result = self.rpc.call_instance_start(target_node, (instance, None, None),
8318 msg = result.fail_msg
8320 _ShutdownInstanceDisks(self.lu, instance)
8321 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
8322 (instance.name, target_node, msg))
8324 def Exec(self, feedback_fn):
8325 """Perform the migration.
8328 self.feedback_fn = feedback_fn
8329 self.source_node = self.instance.primary_node
8331 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
8332 if self.instance.disk_template in constants.DTS_INT_MIRROR:
8333 self.target_node = self.instance.secondary_nodes[0]
8334 # Otherwise self.target_node has been populated either
8335 # directly, or through an iallocator.
8337 self.all_nodes = [self.source_node, self.target_node]
8338 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
8339 in self.cfg.GetMultiNodeInfo(self.all_nodes))
8342 feedback_fn("Failover instance %s" % self.instance.name)
8343 self._ExecFailover()
8345 feedback_fn("Migrating instance %s" % self.instance.name)
8348 return self._ExecCleanup()
8350 return self._ExecMigration()
8353 def _CreateBlockDev(lu, node, instance, device, force_create,
8355 """Create a tree of block devices on a given node.
8357 If this device type has to be created on secondaries, create it and
8360 If not, just recurse to children keeping the same 'force' value.
8362 @param lu: the lu on whose behalf we execute
8363 @param node: the node on which to create the device
8364 @type instance: L{objects.Instance}
8365 @param instance: the instance which owns the device
8366 @type device: L{objects.Disk}
8367 @param device: the device to create
8368 @type force_create: boolean
8369 @param force_create: whether to force creation of this device; this
8370 will be changed to True whenever we find a device which has
8371 the CreateOnSecondary() attribute
8372 @param info: the extra 'metadata' we should attach to the device
8373 (this will be represented as a LVM tag)
8374 @type force_open: boolean
8375 @param force_open: this parameter will be passed to the
8376 L{backend.BlockdevCreate} function where it specifies
8377 whether we run on primary or not, and it affects both
8378 the child assembly and the device's own Open() execution
8381 if device.CreateOnSecondary():
8385 for child in device.children:
8386 _CreateBlockDev(lu, node, instance, child, force_create,
8389 if not force_create:
8392 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
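# Illustrative sketch (assuming, as for DRBD8, that the top-level device
# reports CreateOnSecondary()): on the secondary node force_create starts
# out False but flips to True at the DRBD8 device, so its backing data and
# metadata LVs are created there as well; force_open then only controls
# whether the assembled device is also opened (primary node) or not.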
8395 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
8396 """Create a single block device on a given node.
8398 This will not recurse over children of the device, so they must be
8401 @param lu: the lu on whose behalf we execute
8402 @param node: the node on which to create the device
8403 @type instance: L{objects.Instance}
8404 @param instance: the instance which owns the device
8405 @type device: L{objects.Disk}
8406 @param device: the device to create
8407 @param info: the extra 'metadata' we should attach to the device
8408 (this will be represented as a LVM tag)
8409 @type force_open: boolean
8410 @param force_open: this parameter will be passed to the
8411 L{backend.BlockdevCreate} function where it specifies
8412 whether we run on primary or not, and it affects both
8413 the child assembly and the device's own Open() execution
8416 lu.cfg.SetDiskID(device, node)
8417 result = lu.rpc.call_blockdev_create(node, device, device.size,
8418 instance.name, force_open, info)
8419 result.Raise("Can't create block device %s on"
8420 " node %s for instance %s" % (device, node, instance.name))
8421 if device.physical_id is None:
8422 device.physical_id = result.payload
8425 def _GenerateUniqueNames(lu, exts):
8426 """Generate a suitable LV name.
8428 This will generate a logical volume name for the given instance.
8433 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
8434 results.append("%s%s" % (new_id, val))
8438 def _ComputeLDParams(disk_template, disk_params):
8439 """Computes Logical Disk parameters from Disk Template parameters.
8441 @type disk_template: string
8442 @param disk_template: disk template, one of L{constants.DISK_TEMPLATES}
8443 @type disk_params: dict
8444 @param disk_params: disk template parameters; dict(template_name -> parameters)
8446 @return: a list of dicts, one for each node of the disk hierarchy. Each dict
8447 contains the LD parameters of the node. The tree is flattened in-order.
8450 if disk_template not in constants.DISK_TEMPLATES:
8451 raise errors.ProgrammerError("Unknown disk template %s" % disk_template)
8454 dt_params = disk_params[disk_template]
8455 if disk_template == constants.DT_DRBD8:
8457 constants.LDP_RESYNC_RATE: dt_params[constants.DRBD_RESYNC_RATE],
8458 constants.LDP_BARRIERS: dt_params[constants.DRBD_DISK_BARRIERS],
8459 constants.LDP_NO_META_FLUSH: dt_params[constants.DRBD_META_BARRIERS],
8460 constants.LDP_DEFAULT_METAVG: dt_params[constants.DRBD_DEFAULT_METAVG],
8461 constants.LDP_DISK_CUSTOM: dt_params[constants.DRBD_DISK_CUSTOM],
8462 constants.LDP_NET_CUSTOM: dt_params[constants.DRBD_NET_CUSTOM],
8463 constants.LDP_DYNAMIC_RESYNC: dt_params[constants.DRBD_DYNAMIC_RESYNC],
8464 constants.LDP_PLAN_AHEAD: dt_params[constants.DRBD_PLAN_AHEAD],
8465 constants.LDP_FILL_TARGET: dt_params[constants.DRBD_FILL_TARGET],
8466 constants.LDP_DELAY_TARGET: dt_params[constants.DRBD_DELAY_TARGET],
8467 constants.LDP_MAX_RATE: dt_params[constants.DRBD_MAX_RATE],
8468 constants.LDP_MIN_RATE: dt_params[constants.DRBD_MIN_RATE],
8472 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_DRBD8],
8475 result.append(drbd_params)
8479 constants.LDP_STRIPES: dt_params[constants.DRBD_DATA_STRIPES],
8482 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_LV],
8484 result.append(data_params)
8488 constants.LDP_STRIPES: dt_params[constants.DRBD_META_STRIPES],
8491 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_LV],
8493 result.append(meta_params)
8495 elif (disk_template == constants.DT_FILE or
8496 disk_template == constants.DT_SHARED_FILE):
8497 result.append(constants.DISK_LD_DEFAULTS[constants.LD_FILE])
8499 elif disk_template == constants.DT_PLAIN:
8501 constants.LDP_STRIPES: dt_params[constants.LV_STRIPES],
8504 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_LV],
8506 result.append(params)
8508 elif disk_template == constants.DT_BLOCK:
8509 result.append(constants.DISK_LD_DEFAULTS[constants.LD_BLOCKDEV])
8511 elif disk_template == constants.DT_RBD:
8513 constants.LDP_POOL: dt_params[constants.RBD_POOL]
8516 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_RBD],
8518 result.append(params)
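# Illustrative note: the flattened, in-order result means DT_DRBD8 yields a
# three-element list ([drbd_params, data_lv_params, meta_lv_params], one
# dict per node of the disk tree), while the single-level templates above
# (file, plain, blockdev, rbd) each yield a one-element list.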
8523 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
8524 iv_name, p_minor, s_minor, drbd_params, data_params,
8526 """Generate a drbd8 device complete with its children.
8529 assert len(vgnames) == len(names) == 2
8530 port = lu.cfg.AllocatePort()
8531 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
8533 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
8534 logical_id=(vgnames[0], names[0]),
8536 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
8537 logical_id=(vgnames[1], names[1]),
8539 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
8540 logical_id=(primary, secondary, port,
8543 children=[dev_data, dev_meta],
8544 iv_name=iv_name, params=drbd_params)
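# Illustrative sketch of the resulting tree: the LD_DRBD8 device built
# above has two LD_LV children (a data volume of the requested size plus a
# DRBD_META_SIZE metadata volume), and its logical_id ties together the two
# node names, the allocated port, the per-node minors and the shared secret.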
8548 def _GenerateDiskTemplate(lu, template_name,
8549 instance_name, primary_node,
8550 secondary_nodes, disk_info,
8551 file_storage_dir, file_driver,
8552 base_index, feedback_fn, disk_params):
8553 """Generate the entire disk layout for a given template type.
8556 # TODO: compute space requirements
8558 vgname = lu.cfg.GetVGName()
8559 disk_count = len(disk_info)
8561 ld_params = _ComputeLDParams(template_name, disk_params)
8562 if template_name == constants.DT_DISKLESS:
8564 elif template_name == constants.DT_PLAIN:
8566 raise errors.ProgrammerError("Wrong template configuration")
8568 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
8569 for i in range(disk_count)])
8570 for idx, disk in enumerate(disk_info):
8571 disk_index = idx + base_index
8572 vg = disk.get(constants.IDISK_VG, vgname)
8573 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
8574 disk_dev = objects.Disk(dev_type=constants.LD_LV,
8575 size=disk[constants.IDISK_SIZE],
8576 logical_id=(vg, names[idx]),
8577 iv_name="disk/%d" % disk_index,
8578 mode=disk[constants.IDISK_MODE],
8579 params=ld_params[0])
8580 disks.append(disk_dev)
8581 elif template_name == constants.DT_DRBD8:
8582 drbd_params, data_params, meta_params = ld_params
8583 if len(secondary_nodes) != 1:
8584 raise errors.ProgrammerError("Wrong template configuration")
8585 remote_node = secondary_nodes[0]
8586 minors = lu.cfg.AllocateDRBDMinor(
8587 [primary_node, remote_node] * len(disk_info), instance_name)
8590 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
8591 for i in range(disk_count)]):
8592 names.append(lv_prefix + "_data")
8593 names.append(lv_prefix + "_meta")
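# Illustrative example: with base_index 0 and two disks the loop above
# produces names of the form "<unique_id>.disk0_data",
# "<unique_id>.disk0_meta", "<unique_id>.disk1_data",
# "<unique_id>.disk1_meta", consumed pairwise in the per-disk loop below
# ("<unique_id>" standing in for the generated ID).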
8594 for idx, disk in enumerate(disk_info):
8595 disk_index = idx + base_index
8596 drbd_default_metavg = drbd_params[constants.LDP_DEFAULT_METAVG]
8597 data_vg = disk.get(constants.IDISK_VG, vgname)
8598 meta_vg = disk.get(constants.IDISK_METAVG, drbd_default_metavg)
8599 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
8600 disk[constants.IDISK_SIZE],
8602 names[idx * 2:idx * 2 + 2],
8603 "disk/%d" % disk_index,
8604 minors[idx * 2], minors[idx * 2 + 1],
8605 drbd_params, data_params, meta_params)
8606 disk_dev.mode = disk[constants.IDISK_MODE]
8607 disks.append(disk_dev)
8608 elif template_name == constants.DT_FILE:
8610 raise errors.ProgrammerError("Wrong template configuration")
8612 opcodes.RequireFileStorage()
8614 for idx, disk in enumerate(disk_info):
8615 disk_index = idx + base_index
8616 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
8617 size=disk[constants.IDISK_SIZE],
8618 iv_name="disk/%d" % disk_index,
8619 logical_id=(file_driver,
8620 "%s/disk%d" % (file_storage_dir,
8622 mode=disk[constants.IDISK_MODE],
8623 params=ld_params[0])
8624 disks.append(disk_dev)
8625 elif template_name == constants.DT_SHARED_FILE:
8627 raise errors.ProgrammerError("Wrong template configuration")
8629 opcodes.RequireSharedFileStorage()
8631 for idx, disk in enumerate(disk_info):
8632 disk_index = idx + base_index
8633 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
8634 size=disk[constants.IDISK_SIZE],
8635 iv_name="disk/%d" % disk_index,
8636 logical_id=(file_driver,
8637 "%s/disk%d" % (file_storage_dir,
8639 mode=disk[constants.IDISK_MODE],
8640 params=ld_params[0])
8641 disks.append(disk_dev)
8642 elif template_name == constants.DT_BLOCK:
8644 raise errors.ProgrammerError("Wrong template configuration")
8646 for idx, disk in enumerate(disk_info):
8647 disk_index = idx + base_index
8648 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
8649 size=disk[constants.IDISK_SIZE],
8650 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
8651 disk[constants.IDISK_ADOPT]),
8652 iv_name="disk/%d" % disk_index,
8653 mode=disk[constants.IDISK_MODE],
8654 params=ld_params[0])
8655 disks.append(disk_dev)
8656 elif template_name == constants.DT_RBD:
8658 raise errors.ProgrammerError("Wrong template configuration")
8660 names = _GenerateUniqueNames(lu, [".rbd.disk%d" % (base_index + i)
8661 for i in range(disk_count)])
8663 for idx, disk in enumerate(disk_info):
8664 disk_index = idx + base_index
8665 disk_dev = objects.Disk(dev_type=constants.LD_RBD,
8666 size=disk[constants.IDISK_SIZE],
8667 logical_id=("rbd", names[idx]),
8668 iv_name="disk/%d" % disk_index,
8669 mode=disk[constants.IDISK_MODE],
8670 params=ld_params[0])
8671 disks.append(disk_dev)
8674 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
8678 def _GetInstanceInfoText(instance):
8679 """Compute that text that should be added to the disk's metadata.
8682 return "originstname+%s" % instance.name
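# Illustrative example: for an instance named "web1.example.com" this
# returns "originstname+web1.example.com", which ends up as an LVM tag on
# the instance's volumes.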
8685 def _CalcEta(time_taken, written, total_size):
8686 """Calculates the ETA based on size written and total size.
8688 @param time_taken: The time taken so far
8689 @param written: amount written so far
8690 @param total_size: The total size of data to be written
8691 @return: The remaining time in seconds
8694 avg_time = time_taken / float(written)
8695 return (total_size - written) * avg_time
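# Illustrative example: if 512 MiB of a 2048 MiB wipe completed in 60
# seconds, the average is 60/512 s per MiB, so the ETA is
# (2048 - 512) * (60 / 512.0) = 180.0 seconds. Callers must ensure
# written > 0, since the average above divides by it.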
8698 def _WipeDisks(lu, instance):
8699 """Wipes instance disks.
8701 @type lu: L{LogicalUnit}
8702 @param lu: the logical unit on whose behalf we execute
8703 @type instance: L{objects.Instance}
8704 @param instance: the instance whose disks we should wipe
8705 @return: the success of the wipe
8708 node = instance.primary_node
8710 for device in instance.disks:
8711 lu.cfg.SetDiskID(device, node)
8713 logging.info("Pause sync of instance %s disks", instance.name)
8714 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
8716 for idx, success in enumerate(result.payload):
8718 logging.warn("pause-sync of instance %s for disk %d failed",
8722 for idx, device in enumerate(instance.disks):
8723 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
8724 # MAX_WIPE_CHUNK at max
8725 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
8726 constants.MIN_WIPE_CHUNK_PERCENT)
8727 # we _must_ make this an int, otherwise rounding errors will
8729 wipe_chunk_size = int(wipe_chunk_size)
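# Illustrative example (assuming MIN_WIPE_CHUNK_PERCENT is 10 and
# MAX_WIPE_CHUNK is 1024 MiB): a 2048 MiB disk is wiped in int(204.8) ==
# 204 MiB chunks, while a 20 GiB disk hits the cap and proceeds in
# 1024 MiB chunks.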
8731 lu.LogInfo("* Wiping disk %d", idx)
8732 logging.info("Wiping disk %d for instance %s, node %s using"
8733 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
8738 start_time = time.time()
8740 while offset < size:
8741 wipe_size = min(wipe_chunk_size, size - offset)
8742 logging.debug("Wiping disk %d, offset %s, chunk %s",
8743 idx, offset, wipe_size)
8744 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
8745 result.Raise("Could not wipe disk %d at offset %d for size %d" %
8746 (idx, offset, wipe_size))
8749 if now - last_output >= 60:
8750 eta = _CalcEta(now - start_time, offset, size)
8751 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8752 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8755 logging.info("Resume sync of instance %s disks", instance.name)
8757 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8759 for idx, success in enumerate(result.payload):
8761 lu.LogWarning("Resume sync of disk %d failed, please have a"
8762 " look at the status and troubleshoot the issue", idx)
8763 logging.warn("resume-sync of instance %s for disk %d failed",
8767 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8768 """Create all disks for an instance.
8770 This abstracts away some work from AddInstance.
8772 @type lu: L{LogicalUnit}
8773 @param lu: the logical unit on whose behalf we execute
8774 @type instance: L{objects.Instance}
8775 @param instance: the instance whose disks we should create
8777 @param to_skip: list of indices to skip
8778 @type target_node: string
8779 @param target_node: if passed, overrides the target node for creation
8781 @return: the success of the creation
8784 info = _GetInstanceInfoText(instance)
8785 if target_node is None:
8786 pnode = instance.primary_node
8787 all_nodes = instance.all_nodes
8792 if instance.disk_template in constants.DTS_FILEBASED:
8793 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8794 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8796 result.Raise("Failed to create directory '%s' on"
8797 " node %s" % (file_storage_dir, pnode))
8799 # Note: this needs to be kept in sync with adding of disks in
8800 # LUInstanceSetParams
8801 for idx, device in enumerate(instance.disks):
8802 if to_skip and idx in to_skip:
8804 logging.info("Creating volume %s for instance %s",
8805 device.iv_name, instance.name)
8807 for node in all_nodes:
8808 f_create = node == pnode
8809 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
8812 def _RemoveDisks(lu, instance, target_node=None):
8813 """Remove all disks for an instance.
8815 This abstracts away some work from `AddInstance()` and
8816 `RemoveInstance()`. Note that in case some of the devices couldn't
8817 be removed, the removal will continue with the other ones (compare
8818 with `_CreateDisks()`).
8820 @type lu: L{LogicalUnit}
8821 @param lu: the logical unit on whose behalf we execute
8822 @type instance: L{objects.Instance}
8823 @param instance: the instance whose disks we should remove
8824 @type target_node: string
8825 @param target_node: used to override the node on which to remove the disks
8827 @return: the success of the removal
8830 logging.info("Removing block devices for instance %s", instance.name)
8833 for device in instance.disks:
8835 edata = [(target_node, device)]
8837 edata = device.ComputeNodeTree(instance.primary_node)
8838 for node, disk in edata:
8839 lu.cfg.SetDiskID(disk, node)
8840 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8842 lu.LogWarning("Could not remove block device %s on node %s,"
8843 " continuing anyway: %s", device.iv_name, node, msg)
8846 # if this is a DRBD disk, return its port to the pool
8847 if device.dev_type in constants.LDS_DRBD:
8848 tcp_port = device.logical_id[2]
8849 lu.cfg.AddTcpUdpPort(tcp_port)
8851 if instance.disk_template == constants.DT_FILE:
8852 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8856 tgt = instance.primary_node
8857 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8859 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8860 file_storage_dir, instance.primary_node, result.fail_msg)
8866 def _ComputeDiskSizePerVG(disk_template, disks):
8867 """Compute disk size requirements in the volume group
8870 def _compute(disks, payload):
8871 """Universal algorithm.
8876 vgs[disk[constants.IDISK_VG]] = \
8877 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8881 # Required free disk space as a function of disk and swap space
8883 constants.DT_DISKLESS: {},
8884 constants.DT_PLAIN: _compute(disks, 0),
8885 # 128 MB is added for DRBD metadata for each disk
8886 constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
8887 constants.DT_FILE: {},
8888 constants.DT_SHARED_FILE: {},
8891 if disk_template not in req_size_dict:
8892 raise errors.ProgrammerError("Disk template '%s' size requirement"
8893 " is unknown" % disk_template)
8895 return req_size_dict[disk_template]
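# Illustrative example: for DT_PLAIN with disks such as
#   [{IDISK_VG: "xenvg", IDISK_SIZE: 1024}, {IDISK_VG: "xenvg", IDISK_SIZE: 512}]
# the result is {"xenvg": 1536}; for DT_DRBD8 every disk additionally books
# DRBD_META_SIZE in its volume group to cover the metadata device.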
8898 def _ComputeDiskSize(disk_template, disks):
8899 """Compute disk size requirements in the volume group
8902 # Required free disk space as a function of disk and swap space
8904 constants.DT_DISKLESS: None,
8905 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8906 # 128 MB is added for DRBD metadata for each disk
8908 sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
8909 constants.DT_FILE: None,
8910 constants.DT_SHARED_FILE: 0,
8911 constants.DT_BLOCK: 0,
8912 constants.DT_RBD: 0,
8915 if disk_template not in req_size_dict:
8916 raise errors.ProgrammerError("Disk template '%s' size requirement"
8917 " is unknown" % disk_template)
8919 return req_size_dict[disk_template]
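# Illustrative example: two DRBD8 disks of 1024 MiB and 512 MiB require
# 1024 + 512 + 2 * DRBD_META_SIZE = 1792 MiB in total (with the 128 MiB
# metadata device per disk), while file-based templates report no volume
# group requirement at all.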
8922 def _FilterVmNodes(lu, nodenames):
8923 """Filters out non-vm_capable nodes from a list.
8925 @type lu: L{LogicalUnit}
8926 @param lu: the logical unit for which we check
8927 @type nodenames: list
8928 @param nodenames: the list of nodes on which we should check
8930 @return: the list of vm-capable nodes
8933 non_vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8934 return [name for name in nodenames if name not in non_vm_nodes]
8937 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8938 """Hypervisor parameter validation.
8940 This function abstracts the hypervisor parameter validation to be
8941 used in both instance create and instance modify.
8943 @type lu: L{LogicalUnit}
8944 @param lu: the logical unit for which we check
8945 @type nodenames: list
8946 @param nodenames: the list of nodes on which we should check
8947 @type hvname: string
8948 @param hvname: the name of the hypervisor we should use
8949 @type hvparams: dict
8950 @param hvparams: the parameters which we need to check
8951 @raise errors.OpPrereqError: if the parameters are not valid
8954 nodenames = _FilterVmNodes(lu, nodenames)
8956 cluster = lu.cfg.GetClusterInfo()
8957 hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
8959 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
8960 for node in nodenames:
8964 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8967 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8968 """OS parameters validation.
8970 @type lu: L{LogicalUnit}
8971 @param lu: the logical unit for which we check
8972 @type required: boolean
8973 @param required: whether the validation should fail if the OS is not
8975 @type nodenames: list
8976 @param nodenames: the list of nodes on which we should check
8977 @type osname: string
8978 @param osname: the name of the OS we should validate
8979 @type osparams: dict
8980 @param osparams: the parameters which we need to check
8981 @raise errors.OpPrereqError: if the parameters are not valid
8984 nodenames = _FilterVmNodes(lu, nodenames)
8985 result = lu.rpc.call_os_validate(nodenames, required, osname,
8986 [constants.OS_VALIDATE_PARAMETERS],
8988 for node, nres in result.items():
8989 # we don't check for offline cases since this should be run only
8990 # against the master node and/or an instance's nodes
8991 nres.Raise("OS Parameters validation failed on node %s" % node)
8992 if not nres.payload:
8993 lu.LogInfo("OS %s not found on node %s, validation skipped",
8997 class LUInstanceCreate(LogicalUnit):
8998 """Create an instance.
9001 HPATH = "instance-add"
9002 HTYPE = constants.HTYPE_INSTANCE
9005 def CheckArguments(self):
9009 # do not require name_check to ease forward/backward compatibility
9011 if self.op.no_install and self.op.start:
9012 self.LogInfo("No-installation mode selected, disabling startup")
9013 self.op.start = False
9014 # validate/normalize the instance name
9015 self.op.instance_name = \
9016 netutils.Hostname.GetNormalizedName(self.op.instance_name)
9018 if self.op.ip_check and not self.op.name_check:
9019 # TODO: make the ip check more flexible and not depend on the name check
9020 raise errors.OpPrereqError("Cannot do IP address check without a name"
9021 " check", errors.ECODE_INVAL)
9023 # check nics' parameter names
9024 for nic in self.op.nics:
9025 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
9027 # check disks. parameter names and consistent adopt/no-adopt strategy
9028 has_adopt = has_no_adopt = False
9029 for disk in self.op.disks:
9030 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
9031 if constants.IDISK_ADOPT in disk:
9035 if has_adopt and has_no_adopt:
9036 raise errors.OpPrereqError("Either all disks are adopted or none is",
9039 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
9040 raise errors.OpPrereqError("Disk adoption is not supported for the"
9041 " '%s' disk template" %
9042 self.op.disk_template,
9044 if self.op.iallocator is not None:
9045 raise errors.OpPrereqError("Disk adoption not allowed with an"
9046 " iallocator script", errors.ECODE_INVAL)
9047 if self.op.mode == constants.INSTANCE_IMPORT:
9048 raise errors.OpPrereqError("Disk adoption not allowed for"
9049 " instance import", errors.ECODE_INVAL)
9051 if self.op.disk_template in constants.DTS_MUST_ADOPT:
9052 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
9053 " but no 'adopt' parameter given" %
9054 self.op.disk_template,
9057 self.adopt_disks = has_adopt
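# Illustrative example (hypothetical values): an adoption request for the
# plain template would pass disks such as
#   {constants.IDISK_SIZE: 1024, constants.IDISK_ADOPT: "some_existing_lv"}
# i.e. every disk names an existing volume to take over instead of having
# a fresh one created.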
9059 # instance name verification
9060 if self.op.name_check:
9061 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
9062 self.op.instance_name = self.hostname1.name
9063 # used in CheckPrereq for ip ping check
9064 self.check_ip = self.hostname1.ip
9066 self.check_ip = None
9068 # file storage checks
9069 if (self.op.file_driver and
9070 self.op.file_driver not in constants.FILE_DRIVER):
9071 raise errors.OpPrereqError("Invalid file driver name '%s'" %
9072 self.op.file_driver, errors.ECODE_INVAL)
9074 if self.op.disk_template == constants.DT_FILE:
9075 opcodes.RequireFileStorage()
9076 elif self.op.disk_template == constants.DT_SHARED_FILE:
9077 opcodes.RequireSharedFileStorage()
9079 ### Node/iallocator related checks
9080 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
9082 if self.op.pnode is not None:
9083 if self.op.disk_template in constants.DTS_INT_MIRROR:
9084 if self.op.snode is None:
9085 raise errors.OpPrereqError("The networked disk templates need"
9086 " a mirror node", errors.ECODE_INVAL)
9088 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
9090 self.op.snode = None
9092 self._cds = _GetClusterDomainSecret()
9094 if self.op.mode == constants.INSTANCE_IMPORT:
9095 # On import force_variant must be True, because if we forced it at
9096 # initial install, our only chance when importing it back is that it
9098 self.op.force_variant = True
9100 if self.op.no_install:
9101 self.LogInfo("No-installation mode has no effect during import")
9103 elif self.op.mode == constants.INSTANCE_CREATE:
9104 if self.op.os_type is None:
9105 raise errors.OpPrereqError("No guest OS specified",
9107 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
9108 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
9109 " installation" % self.op.os_type,
9111 if self.op.disk_template is None:
9112 raise errors.OpPrereqError("No disk template specified",
9115 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9116 # Check handshake to ensure both clusters have the same domain secret
9117 src_handshake = self.op.source_handshake
9118 if not src_handshake:
9119 raise errors.OpPrereqError("Missing source handshake",
9122 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
9125 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
9128 # Load and check source CA
9129 self.source_x509_ca_pem = self.op.source_x509_ca
9130 if not self.source_x509_ca_pem:
9131 raise errors.OpPrereqError("Missing source X509 CA",
9135 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
9137 except OpenSSL.crypto.Error, err:
9138 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
9139 (err, ), errors.ECODE_INVAL)
9141 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
9142 if errcode is not None:
9143 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
9146 self.source_x509_ca = cert
9148 src_instance_name = self.op.source_instance_name
9149 if not src_instance_name:
9150 raise errors.OpPrereqError("Missing source instance name",
9153 self.source_instance_name = \
9154 netutils.GetHostname(name=src_instance_name).name
9157 raise errors.OpPrereqError("Invalid instance creation mode %r" %
9158 self.op.mode, errors.ECODE_INVAL)
9160 def ExpandNames(self):
9161 """ExpandNames for CreateInstance.
9163 Figure out the right locks for instance creation.
9166 self.needed_locks = {}
9168 instance_name = self.op.instance_name
9169 # this is just a preventive check, but someone might still add this
9170 # instance in the meantime, and creation will fail at lock-add time
9171 if instance_name in self.cfg.GetInstanceList():
9172 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
9173 instance_name, errors.ECODE_EXISTS)
9175 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
9177 if self.op.iallocator:
9178 # TODO: Find a solution to not lock all nodes in the cluster, e.g. by
9179 # specifying a group on instance creation and then selecting nodes from
9181 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9182 self.needed_locks[locking.LEVEL_NODE_RES] = locking.ALL_SET
9184 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
9185 nodelist = [self.op.pnode]
9186 if self.op.snode is not None:
9187 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
9188 nodelist.append(self.op.snode)
9189 self.needed_locks[locking.LEVEL_NODE] = nodelist
9190 # Lock resources of instance's primary and secondary nodes (copy to
9191 # prevent accidental modification)
9192 self.needed_locks[locking.LEVEL_NODE_RES] = list(nodelist)
9194 # in case of import lock the source node too
9195 if self.op.mode == constants.INSTANCE_IMPORT:
9196 src_node = self.op.src_node
9197 src_path = self.op.src_path
9199 if src_path is None:
9200 self.op.src_path = src_path = self.op.instance_name
9202 if src_node is None:
9203 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9204 self.op.src_node = None
9205 if os.path.isabs(src_path):
9206 raise errors.OpPrereqError("Importing an instance from a path"
9207 " requires a source node option",
9210 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
9211 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
9212 self.needed_locks[locking.LEVEL_NODE].append(src_node)
9213 if not os.path.isabs(src_path):
9214 self.op.src_path = src_path = \
9215 utils.PathJoin(constants.EXPORT_DIR, src_path)
9217 def _RunAllocator(self):
9218 """Run the allocator based on input opcode.
9221 nics = [n.ToDict() for n in self.nics]
9222 ial = IAllocator(self.cfg, self.rpc,
9223 mode=constants.IALLOCATOR_MODE_ALLOC,
9224 name=self.op.instance_name,
9225 disk_template=self.op.disk_template,
9228 vcpus=self.be_full[constants.BE_VCPUS],
9229 memory=self.be_full[constants.BE_MAXMEM],
9232 hypervisor=self.op.hypervisor,
9235 ial.Run(self.op.iallocator)
9238 raise errors.OpPrereqError("Can't compute nodes using"
9239 " iallocator '%s': %s" %
9240 (self.op.iallocator, ial.info),
9242 if len(ial.result) != ial.required_nodes:
9243 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9244 " of nodes (%s), required %s" %
9245 (self.op.iallocator, len(ial.result),
9246 ial.required_nodes), errors.ECODE_FAULT)
9247 self.op.pnode = ial.result[0]
9248 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
9249 self.op.instance_name, self.op.iallocator,
9250 utils.CommaJoin(ial.result))
9251 if ial.required_nodes == 2:
9252 self.op.snode = ial.result[1]
9254 def BuildHooksEnv(self):
9257 This runs on master, primary and secondary nodes of the instance.
9261 "ADD_MODE": self.op.mode,
9263 if self.op.mode == constants.INSTANCE_IMPORT:
9264 env["SRC_NODE"] = self.op.src_node
9265 env["SRC_PATH"] = self.op.src_path
9266 env["SRC_IMAGES"] = self.src_images
9268 env.update(_BuildInstanceHookEnv(
9269 name=self.op.instance_name,
9270 primary_node=self.op.pnode,
9271 secondary_nodes=self.secondaries,
9272 status=self.op.start,
9273 os_type=self.op.os_type,
9274 minmem=self.be_full[constants.BE_MINMEM],
9275 maxmem=self.be_full[constants.BE_MAXMEM],
9276 vcpus=self.be_full[constants.BE_VCPUS],
9277 nics=_NICListToTuple(self, self.nics),
9278 disk_template=self.op.disk_template,
9279 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
9280 for d in self.disks],
9283 hypervisor_name=self.op.hypervisor,
9289 def BuildHooksNodes(self):
9290 """Build hooks nodes.
9293 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
9296 def _ReadExportInfo(self):
9297 """Reads the export information from disk.
9299 It will override the opcode source node and path with the actual
9300 information, if these two were not specified before.
9302     @return: the export information
9304     """
9305 assert self.op.mode == constants.INSTANCE_IMPORT
9307 src_node = self.op.src_node
9308 src_path = self.op.src_path
9310 if src_node is None:
9311 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
9312       exp_list = self.rpc.call_export_list(locked_nodes)
9313       found = False
9314       for node in exp_list:
9315         if exp_list[node].fail_msg:
9316           continue
9317         if src_path in exp_list[node].payload:
9318           found = True
9319           self.op.src_node = src_node = node
9320           self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
9321                                                        src_path)
9322           break
9323       if not found:
9324         raise errors.OpPrereqError("No export found for relative path %s" %
9325                                    src_path, errors.ECODE_INVAL)
9327 _CheckNodeOnline(self, src_node)
9328 result = self.rpc.call_export_info(src_node, src_path)
9329 result.Raise("No export or invalid export found in dir %s" % src_path)
9331 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
9332 if not export_info.has_section(constants.INISECT_EXP):
9333 raise errors.ProgrammerError("Corrupted export config",
9334 errors.ECODE_ENVIRON)
9336 ei_version = export_info.get(constants.INISECT_EXP, "version")
9337 if (int(ei_version) != constants.EXPORT_VERSION):
9338 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
9339 (ei_version, constants.EXPORT_VERSION),
9340                                  errors.ECODE_ENVIRON)
9342     return export_info
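    # For reference, the parsed export info is an INI-style file roughly of
    # the following (assumed, abridged) shape:
    #
    #   [export]
    #   version = 0
    #   os = debootstrap
    #
    #   [instance]
    #   name = inst1.example.com
    #   disk0_size = 1024
    #   nic0_mac = aa:00:00:12:34:56
    #
    # _ReadExportParams below pulls instance defaults from the [instance]
    # section. All values shown are illustrative only.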
9343 def _ReadExportParams(self, einfo):
9344 """Use export parameters as defaults.
9346 In case the opcode doesn't specify (as in override) some instance
9347     parameters, then try to use them from the export information, if
9348     that declares them.
9350     """
9351 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
9353 if self.op.disk_template is None:
9354 if einfo.has_option(constants.INISECT_INS, "disk_template"):
9355         self.op.disk_template = einfo.get(constants.INISECT_INS,
9356                                           "disk_template")
9357         if self.op.disk_template not in constants.DISK_TEMPLATES:
9358 raise errors.OpPrereqError("Disk template specified in configuration"
9359 " file is not one of the allowed values:"
9360 " %s" % " ".join(constants.DISK_TEMPLATES))
9362 raise errors.OpPrereqError("No disk template specified and the export"
9363 " is missing the disk_template information",
9366     if not self.op.disks:
9367       disks = []
9368 # TODO: import the disk iv_name too
9369 for idx in range(constants.MAX_DISKS):
9370 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
9371 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
9372 disks.append({constants.IDISK_SIZE: disk_sz})
9373 self.op.disks = disks
9374 if not disks and self.op.disk_template != constants.DT_DISKLESS:
9375 raise errors.OpPrereqError("No disk info specified and the export"
9376 " is missing the disk information",
9379     if not self.op.nics:
9380       nics = []
9381 for idx in range(constants.MAX_NICS):
9382         if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
9383           ndict = {}
9384           for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
9385             v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
9386             ndict[name] = v
9387           nics.append(ndict)
9388         else:
9389           break
9390       self.op.nics = nics
9392 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
9393 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
9395 if (self.op.hypervisor is None and
9396 einfo.has_option(constants.INISECT_INS, "hypervisor")):
9397 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
9399 if einfo.has_section(constants.INISECT_HYP):
9400 # use the export parameters but do not override the ones
9401 # specified by the user
9402 for name, value in einfo.items(constants.INISECT_HYP):
9403 if name not in self.op.hvparams:
9404 self.op.hvparams[name] = value
9406 if einfo.has_section(constants.INISECT_BEP):
9407 # use the parameters, without overriding
9408 for name, value in einfo.items(constants.INISECT_BEP):
9409 if name not in self.op.beparams:
9410 self.op.beparams[name] = value
9411 # Compatibility for the old "memory" be param
9412 if name == constants.BE_MEMORY:
9413 if constants.BE_MAXMEM not in self.op.beparams:
9414 self.op.beparams[constants.BE_MAXMEM] = value
9415 if constants.BE_MINMEM not in self.op.beparams:
9416             self.op.beparams[constants.BE_MINMEM] = value
9417     else:
9418       # try to read the parameters old style, from the main section
9419 for name in constants.BES_PARAMETERS:
9420 if (name not in self.op.beparams and
9421 einfo.has_option(constants.INISECT_INS, name)):
9422 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
9424 if einfo.has_section(constants.INISECT_OSP):
9425 # use the parameters, without overriding
9426 for name, value in einfo.items(constants.INISECT_OSP):
9427 if name not in self.op.osparams:
9428 self.op.osparams[name] = value
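    # Precedence throughout this method is opcode over export: e.g. if the
    # export's backend section carries "maxmem = 1024" while the opcode
    # passes beparams={"maxmem": 512}, the opcode's 512 wins (the values are
    # illustrative only).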
9430 def _RevertToDefaults(self, cluster):
9431 """Revert the instance parameters to the default values.
9435 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
9436 for name in self.op.hvparams.keys():
9437 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
9438 del self.op.hvparams[name]
9440 be_defs = cluster.SimpleFillBE({})
9441 for name in self.op.beparams.keys():
9442 if name in be_defs and be_defs[name] == self.op.beparams[name]:
9443 del self.op.beparams[name]
9445 nic_defs = cluster.SimpleFillNIC({})
9446 for nic in self.op.nics:
9447 for name in constants.NICS_PARAMETERS:
9448         if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
9449           del nic[name]
9451 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
9452 for name in self.op.osparams.keys():
9453 if name in os_defs and os_defs[name] == self.op.osparams[name]:
9454 del self.op.osparams[name]
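    # Illustrative effect: with identify_defaults set, an opcode whose
    # hvparams exactly match the current cluster defaults ends up with an
    # empty per-instance override, so the instance keeps tracking future
    # changes to the cluster-wide defaults.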
9456 def _CalculateFileStorageDir(self):
9457 """Calculate final instance file storage dir.
9460 # file storage dir calculation/check
9461 self.instance_file_storage_dir = None
9462 if self.op.disk_template in constants.DTS_FILEBASED:
9463       # build the full file storage dir path
9464       joinargs = []
9466 if self.op.disk_template == constants.DT_SHARED_FILE:
9467         get_fsd_fn = self.cfg.GetSharedFileStorageDir
9468       else:
9469         get_fsd_fn = self.cfg.GetFileStorageDir
9471 cfg_storagedir = get_fsd_fn()
9472 if not cfg_storagedir:
9473 raise errors.OpPrereqError("Cluster file storage dir not defined")
9474 joinargs.append(cfg_storagedir)
9476 if self.op.file_storage_dir is not None:
9477 joinargs.append(self.op.file_storage_dir)
9479 joinargs.append(self.op.instance_name)
9481 # pylint: disable=W0142
9482 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
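    # Illustrative example (assumed cluster dir /srv/ganeti/file-storage):
    # with file_storage_dir="mydir" and instance "inst1.example.com", the
    # result is /srv/ganeti/file-storage/mydir/inst1.example.com.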
9484 def CheckPrereq(self): # pylint: disable=R0914
9485 """Check prerequisites.
9488 self._CalculateFileStorageDir()
9490 if self.op.mode == constants.INSTANCE_IMPORT:
9491 export_info = self._ReadExportInfo()
9492 self._ReadExportParams(export_info)
9494 if (not self.cfg.GetVGName() and
9495 self.op.disk_template not in constants.DTS_NOT_LVM):
9496 raise errors.OpPrereqError("Cluster does not support lvm-based"
9497 " instances", errors.ECODE_STATE)
9499 if (self.op.hypervisor is None or
9500 self.op.hypervisor == constants.VALUE_AUTO):
9501 self.op.hypervisor = self.cfg.GetHypervisorType()
9503 cluster = self.cfg.GetClusterInfo()
9504 enabled_hvs = cluster.enabled_hypervisors
9505 if self.op.hypervisor not in enabled_hvs:
9506 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
9507 " cluster (%s)" % (self.op.hypervisor,
9508 ",".join(enabled_hvs)),
9511 # Check tag validity
9512 for tag in self.op.tags:
9513 objects.TaggableObject.ValidateTag(tag)
9515 # check hypervisor parameter syntax (locally)
9516 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
9517     filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
9518                                       self.op.hvparams)
9519 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
9520 hv_type.CheckParameterSyntax(filled_hvp)
9521 self.hv_full = filled_hvp
9522 # check that we don't specify global parameters on an instance
9523 _CheckGlobalHvParams(self.op.hvparams)
9525 # fill and remember the beparams dict
9526 default_beparams = cluster.beparams[constants.PP_DEFAULT]
9527 for param, value in self.op.beparams.iteritems():
9528 if value == constants.VALUE_AUTO:
9529 self.op.beparams[param] = default_beparams[param]
9530 objects.UpgradeBeParams(self.op.beparams)
9531 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
9532 self.be_full = cluster.SimpleFillBE(self.op.beparams)
9534 # build os parameters
9535 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
9537     # now that hvp/bep are in final format, let's reset to defaults,
9538     # if told to do so
9539 if self.op.identify_defaults:
9540 self._RevertToDefaults(cluster)
9542     # NIC buildup
9543     self.nics = []
9544     for idx, nic in enumerate(self.op.nics):
9545 nic_mode_req = nic.get(constants.INIC_MODE, None)
9546 nic_mode = nic_mode_req
9547 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
9548 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
9550 # in routed mode, for the first nic, the default ip is 'auto'
9551 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
9552 default_ip_mode = constants.VALUE_AUTO
9554 default_ip_mode = constants.VALUE_NONE
9556 # ip validity checks
9557 ip = nic.get(constants.INIC_IP, default_ip_mode)
9558       if ip is None or ip.lower() == constants.VALUE_NONE:
9559         nic_ip = None
9560       elif ip.lower() == constants.VALUE_AUTO:
9561         if not self.op.name_check:
9562           raise errors.OpPrereqError("IP address set to auto but name checks"
9563                                      " have been skipped",
9564                                      errors.ECODE_INVAL)
9565         nic_ip = self.hostname1.ip
9566       else:
9567         if not netutils.IPAddress.IsValid(ip):
9568           raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
9569                                      errors.ECODE_INVAL)
9571         nic_ip = ip
9572 # TODO: check the ip address for uniqueness
9573 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
9574 raise errors.OpPrereqError("Routed nic mode requires an ip address",
9577 # MAC address verification
9578 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
9579 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9580         mac = utils.NormalizeAndValidateMac(mac)
9582         try:
9583           self.cfg.ReserveMAC(mac, self.proc.GetECId())
9584 except errors.ReservationError:
9585 raise errors.OpPrereqError("MAC address %s already in use"
9586 " in cluster" % mac,
9587 errors.ECODE_NOTUNIQUE)
9589 # Build nic parameters
9590 link = nic.get(constants.INIC_LINK, None)
9591 if link == constants.VALUE_AUTO:
9592         link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
9593       nicparams = {}
9594       if nic_mode_req:
9595         nicparams[constants.NIC_MODE] = nic_mode
9596       if link:
9597         nicparams[constants.NIC_LINK] = link
9599 check_params = cluster.SimpleFillNIC(nicparams)
9600 objects.NIC.CheckParameterSyntax(check_params)
9601 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
9603 # disk checks/pre-build
9604     default_vg = self.cfg.GetVGName()
9605     self.disks = []
9606 for disk in self.op.disks:
9607 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
9608 if mode not in constants.DISK_ACCESS_SET:
9609 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
9610 mode, errors.ECODE_INVAL)
9611       size = disk.get(constants.IDISK_SIZE, None)
9612       if size is None:
9613         raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
9614       try:
9615         size = int(size)
9616       except (TypeError, ValueError):
9617         raise errors.OpPrereqError("Invalid disk size '%s'" % size,
9618                                    errors.ECODE_INVAL)
9620       data_vg = disk.get(constants.IDISK_VG, default_vg)
9621       new_disk = {
9622         constants.IDISK_SIZE: size,
9623         constants.IDISK_MODE: mode,
9624         constants.IDISK_VG: data_vg,
9625         }
9626 if constants.IDISK_METAVG in disk:
9627 new_disk[constants.IDISK_METAVG] = disk[constants.IDISK_METAVG]
9628 if constants.IDISK_ADOPT in disk:
9629 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
9630 self.disks.append(new_disk)
9632     if self.op.mode == constants.INSTANCE_IMPORT:
9633       disk_images = []
9634 for idx in range(len(self.disks)):
9635 option = "disk%d_dump" % idx
9636 if export_info.has_option(constants.INISECT_INS, option):
9637 # FIXME: are the old os-es, disk sizes, etc. useful?
9638 export_name = export_info.get(constants.INISECT_INS, option)
9639 image = utils.PathJoin(self.op.src_path, export_name)
9640           disk_images.append(image)
9641         else:
9642           disk_images.append(False)
9644 self.src_images = disk_images
9646 old_name = export_info.get(constants.INISECT_INS, "name")
9647 if self.op.instance_name == old_name:
9648 for idx, nic in enumerate(self.nics):
9649 if nic.mac == constants.VALUE_AUTO:
9650 nic_mac_ini = "nic%d_mac" % idx
9651 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
9653 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
9655 # ip ping checks (we use the same ip that was resolved in ExpandNames)
9656 if self.op.ip_check:
9657 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
9658 raise errors.OpPrereqError("IP %s of instance %s already in use" %
9659 (self.check_ip, self.op.instance_name),
9660 errors.ECODE_NOTUNIQUE)
9662 #### mac address generation
9663 # By generating here the mac address both the allocator and the hooks get
9664 # the real final mac address rather than the 'auto' or 'generate' value.
9665 # There is a race condition between the generation and the instance object
9666 # creation, which means that we know the mac is valid now, but we're not
9667 # sure it will be when we actually add the instance. If things go bad
9668 # adding the instance will abort because of a duplicate mac, and the
9669 # creation job will fail.
9670 for nic in self.nics:
9671 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9672 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
9676 if self.op.iallocator is not None:
9677 self._RunAllocator()
9679 # Release all unneeded node locks
9680 _ReleaseLocks(self, locking.LEVEL_NODE,
9681                   keep=filter(None, [self.op.pnode, self.op.snode,
9682                                      self.op.src_node]))
9683 _ReleaseLocks(self, locking.LEVEL_NODE_RES,
9684                   keep=filter(None, [self.op.pnode, self.op.snode,
9685                                      self.op.src_node]))
9687 #### node related checks
9689 # check primary node
9690 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
9691 assert self.pnode is not None, \
9692 "Cannot retrieve locked node %s" % self.op.pnode
9694 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
9695 pnode.name, errors.ECODE_STATE)
9697 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
9698 pnode.name, errors.ECODE_STATE)
9699 if not pnode.vm_capable:
9700 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
9701 " '%s'" % pnode.name, errors.ECODE_STATE)
9703 self.secondaries = []
9705 # mirror node verification
9706 if self.op.disk_template in constants.DTS_INT_MIRROR:
9707 if self.op.snode == pnode.name:
9708 raise errors.OpPrereqError("The secondary node cannot be the"
9709 " primary node", errors.ECODE_INVAL)
9710 _CheckNodeOnline(self, self.op.snode)
9711 _CheckNodeNotDrained(self, self.op.snode)
9712 _CheckNodeVmCapable(self, self.op.snode)
9713 self.secondaries.append(self.op.snode)
9715 snode = self.cfg.GetNodeInfo(self.op.snode)
9716 if pnode.group != snode.group:
9717 self.LogWarning("The primary and secondary nodes are in two"
9718 " different node groups; the disk parameters"
9719 " from the first disk's node group will be"
9722 nodenames = [pnode.name] + self.secondaries
9724 # Verify instance specs
9725     ispec = {
9726       constants.ISPEC_MEM_SIZE: self.be_full.get(constants.BE_MAXMEM, None),
9727 constants.ISPEC_CPU_COUNT: self.be_full.get(constants.BE_VCPUS, None),
9728 constants.ISPEC_DISK_COUNT: len(self.disks),
9729 constants.ISPEC_DISK_SIZE: [disk["size"] for disk in self.disks],
9730       constants.ISPEC_NIC_COUNT: len(self.nics),
9731       }
9733 group_info = self.cfg.GetNodeGroup(pnode.group)
9734 ipolicy = _CalculateGroupIPolicy(cluster, group_info)
9735 res = _ComputeIPolicyInstanceSpecViolation(ipolicy, ispec)
9736 if not self.op.ignore_ipolicy and res:
9737 raise errors.OpPrereqError(("Instance allocation to group %s violates"
9738 " policy: %s") % (pnode.group,
9739                                                     utils.CommaJoin(res)),
9740                                  errors.ECODE_INVAL)
9742 # disk parameters (not customizable at instance or node level)
9743 # just use the primary node parameters, ignoring the secondary.
9744 self.diskparams = group_info.diskparams
9746 if not self.adopt_disks:
9747 if self.op.disk_template == constants.DT_RBD:
9748 # _CheckRADOSFreeSpace() is just a placeholder.
9749 # Any function that checks prerequisites can be placed here.
9750 # Check if there is enough space on the RADOS cluster.
9751         _CheckRADOSFreeSpace()
9752       else:
9753         # Check lv size requirements, if not adopting
9754 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
9755 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
9757 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
9758 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
9759 disk[constants.IDISK_ADOPT])
9760 for disk in self.disks])
9761 if len(all_lvs) != len(self.disks):
9762 raise errors.OpPrereqError("Duplicate volume names given for adoption",
9764 for lv_name in all_lvs:
9766 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
9767 # to ReserveLV uses the same syntax
9768 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
9769 except errors.ReservationError:
9770 raise errors.OpPrereqError("LV named %s used by another instance" %
9771 lv_name, errors.ECODE_NOTUNIQUE)
9773 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
9774 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
9776 node_lvs = self.rpc.call_lv_list([pnode.name],
9777 vg_names.payload.keys())[pnode.name]
9778 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
9779 node_lvs = node_lvs.payload
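      # Each lv_list payload entry maps "vg/lv" to a tuple whose first field
      # is the size in MiB and whose third field marks the LV as online (in
      # use); both fields are consumed below.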
9781       delta = all_lvs.difference(node_lvs.keys())
9782       if delta:
9783         raise errors.OpPrereqError("Missing logical volume(s): %s" %
9784                                    utils.CommaJoin(delta),
9785                                    errors.ECODE_INVAL)
9786       online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
9787       if online_lvs:
9788         raise errors.OpPrereqError("Online logical volumes found, cannot"
9789                                    " adopt: %s" % utils.CommaJoin(online_lvs),
9790                                    errors.ECODE_STATE)
9791 # update the size of disk based on what is found
9792 for dsk in self.disks:
9793 dsk[constants.IDISK_SIZE] = \
9794 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
9795 dsk[constants.IDISK_ADOPT])][0]))
9797 elif self.op.disk_template == constants.DT_BLOCK:
9798 # Normalize and de-duplicate device paths
9799 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
9800 for disk in self.disks])
9801 if len(all_disks) != len(self.disks):
9802 raise errors.OpPrereqError("Duplicate disk names given for adoption",
9804 baddisks = [d for d in all_disks
9805 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
9807 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
9808 " cannot be adopted" %
9809 (", ".join(baddisks),
9810 constants.ADOPTABLE_BLOCKDEV_ROOT),
9813 node_disks = self.rpc.call_bdev_sizes([pnode.name],
9814 list(all_disks))[pnode.name]
9815       node_disks.Raise("Cannot get block device information from node %s" %
9816                        pnode.name)
9817       node_disks = node_disks.payload
9818       delta = all_disks.difference(node_disks.keys())
9819       if delta:
9820         raise errors.OpPrereqError("Missing block device(s): %s" %
9821                                    utils.CommaJoin(delta),
9822                                    errors.ECODE_INVAL)
9823 for dsk in self.disks:
9824 dsk[constants.IDISK_SIZE] = \
9825 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9827 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9829 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9830 # check OS parameters (remotely)
9831 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9833 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9835 # memory check on primary node
9836     #TODO(dynmem): use MINMEM for checking
9837     if self.op.start:
9838       _CheckNodeFreeMemory(self, self.pnode.name,
9839                            "creating instance %s" % self.op.instance_name,
9840                            self.be_full[constants.BE_MAXMEM],
9841                            self.op.hypervisor)
9843 self.dry_run_result = list(nodenames)
9845 def Exec(self, feedback_fn):
9846 """Create and add the instance to the cluster.
9849 instance = self.op.instance_name
9850 pnode_name = self.pnode.name
9852 assert not (self.owned_locks(locking.LEVEL_NODE_RES) -
9853 self.owned_locks(locking.LEVEL_NODE)), \
9854 "Node locks differ from node resource locks"
9856 ht_kind = self.op.hypervisor
9857 if ht_kind in constants.HTS_REQ_PORT:
9858       network_port = self.cfg.AllocatePort()
9859     else:
9860       network_port = None
9862 disks = _GenerateDiskTemplate(self,
9863 self.op.disk_template,
9864                                   instance, pnode_name,
9865                                   self.secondaries,
9866                                   self.disks,
9867                                   self.instance_file_storage_dir,
9868                                   self.op.file_driver,
9869                                   0,
9870                                   feedback_fn,
9871                                   self.diskparams)
9873 iobj = objects.Instance(name=instance, os=self.op.os_type,
9874 primary_node=pnode_name,
9875 nics=self.nics, disks=disks,
9876 disk_template=self.op.disk_template,
9877 admin_state=constants.ADMINST_DOWN,
9878 network_port=network_port,
9879 beparams=self.op.beparams,
9880 hvparams=self.op.hvparams,
9881 hypervisor=self.op.hypervisor,
9882                             osparams=self.op.osparams,
9883                             )
9885     if self.op.tags:
9886       for tag in self.op.tags:
9887         iobj.AddTag(tag)
9889 if self.adopt_disks:
9890 if self.op.disk_template == constants.DT_PLAIN:
9891 # rename LVs to the newly-generated names; we need to construct
9892 # 'fake' LV disks with the old data, plus the new unique_id
9893         tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9894         rename_to = []
9895 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9896 rename_to.append(t_dsk.logical_id)
9897 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9898 self.cfg.SetDiskID(t_dsk, pnode_name)
9899 result = self.rpc.call_blockdev_rename(pnode_name,
9900 zip(tmp_disks, rename_to))
9901 result.Raise("Failed to rename adoped LVs")
9903 feedback_fn("* creating instance disks...")
9905 _CreateDisks(self, iobj)
9906 except errors.OpExecError:
9907 self.LogWarning("Device creation failed, reverting...")
9909 _RemoveDisks(self, iobj)
9911 self.cfg.ReleaseDRBDMinors(instance)
9914 feedback_fn("adding instance %s to cluster config" % instance)
9916 self.cfg.AddInstance(iobj, self.proc.GetECId())
9918 # Declare that we don't want to remove the instance lock anymore, as we've
9919 # added the instance to the config
9920 del self.remove_locks[locking.LEVEL_INSTANCE]
9922 if self.op.mode == constants.INSTANCE_IMPORT:
9923 # Release unused nodes
9924       _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9925     else:
9926       # Release all nodes
9927       _ReleaseLocks(self, locking.LEVEL_NODE)
9929     disk_abort = False
9930 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9931 feedback_fn("* wiping instance disks...")
9933 _WipeDisks(self, iobj)
9934 except errors.OpExecError, err:
9935 logging.exception("Wiping disks failed")
9936 self.LogWarning("Wiping instance disks failed (%s)", err)
9940 # Something is already wrong with the disks, don't do anything else
9942 elif self.op.wait_for_sync:
9943 disk_abort = not _WaitForSync(self, iobj)
9944 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9945 # make sure the disks are not degraded (still sync-ing is ok)
9946 feedback_fn("* checking mirrors status")
9947       disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9948     else:
9949       disk_abort = False
9951     if disk_abort:
9952       _RemoveDisks(self, iobj)
9953 self.cfg.RemoveInstance(iobj.name)
9954 # Make sure the instance lock gets removed
9955 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9956 raise errors.OpExecError("There are some degraded disks for"
9959 # Release all node resource locks
9960 _ReleaseLocks(self, locking.LEVEL_NODE_RES)
9962 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9963 if self.op.mode == constants.INSTANCE_CREATE:
9964 if not self.op.no_install:
9965 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9966                         not self.op.wait_for_sync)
9967         if pause_sync:
9968           feedback_fn("* pausing disk sync to install instance OS")
9969           result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9970                                                             (iobj.disks, iobj), True)
9971           for idx, success in enumerate(result.payload):
9972             if not success:
9973               logging.warn("pause-sync of instance %s for disk %d failed",
9974                            instance, idx)
9976         feedback_fn("* running the instance OS create scripts...")
9977         # FIXME: pass debug option from opcode to backend
9978         os_add_result = \
9979           self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
9980                                         self.op.debug_level)
9981         if pause_sync:
9982           feedback_fn("* resuming disk sync")
9983           result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9984                                                             (iobj.disks, iobj), False)
9985           for idx, success in enumerate(result.payload):
9986             if not success:
9987               logging.warn("resume-sync of instance %s for disk %d failed",
9988                            instance, idx)
9990 os_add_result.Raise("Could not add os for instance %s"
9991 " on node %s" % (instance, pnode_name))
9993 elif self.op.mode == constants.INSTANCE_IMPORT:
9994 feedback_fn("* running the instance OS import scripts...")
9998 for idx, image in enumerate(self.src_images):
10002 # FIXME: pass debug option from opcode to backend
10003 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
10004 constants.IEIO_FILE, (image, ),
10005 constants.IEIO_SCRIPT,
10006                                              (iobj.disks[idx], idx),
10007                                              None)
10008           transfers.append(dt)
10010         import_result = \
10011           masterd.instance.TransferInstanceData(self, feedback_fn,
10012                                                 self.op.src_node, pnode_name,
10013                                                 self.pnode.secondary_ip,
10014                                                 iobj, transfers)
10015 if not compat.all(import_result):
10016 self.LogWarning("Some disks for instance %s on node %s were not"
10017 " imported successfully" % (instance, pnode_name))
10019 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
10020 feedback_fn("* preparing remote import...")
10021 # The source cluster will stop the instance before attempting to make a
10022 # connection. In some cases stopping an instance can take a long time,
10023 # hence the shutdown timeout is added to the connection timeout.
10024 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
10025 self.op.source_shutdown_timeout)
10026 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
10028         assert iobj.primary_node == self.pnode.name
10029         disk_results = \
10030           masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
10031                                         self.source_x509_ca,
10032                                         self._cds, timeouts)
10033 if not compat.all(disk_results):
10034 # TODO: Should the instance still be started, even if some disks
10035 # failed to import (valid for local imports, too)?
10036 self.LogWarning("Some disks for instance %s on node %s were not"
10037 " imported successfully" % (instance, pnode_name))
10039 # Run rename script on newly imported instance
10040 assert iobj.name == instance
10041 feedback_fn("Running rename script for %s" % instance)
10042 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
10043 self.source_instance_name,
10044 self.op.debug_level)
10045 if result.fail_msg:
10046 self.LogWarning("Failed to run rename script for %s on node"
10047 " %s: %s" % (instance, pnode_name, result.fail_msg))
10049       else:
10050         # also checked in the prereq part
10051         raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
10052                                      % self.op.mode)
10054 assert not self.owned_locks(locking.LEVEL_NODE_RES)
10056     if self.op.start:
10057       iobj.admin_state = constants.ADMINST_UP
10058 self.cfg.Update(iobj, feedback_fn)
10059 logging.info("Starting instance %s on node %s", instance, pnode_name)
10060 feedback_fn("* starting instance...")
10061       result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
10062                                             False)
10063 result.Raise("Could not start instance")
10065 return list(iobj.all_nodes)
10068 def _CheckRADOSFreeSpace():
10069 """Compute disk size requirements inside the RADOS cluster.
10072 # For the RADOS cluster we assume there is always enough space.
10076 class LUInstanceConsole(NoHooksLU):
10077 """Connect to an instance's console.
10079 This is somewhat special in that it returns the command line that
10080 you need to run on the master node in order to connect to the
10086 def ExpandNames(self):
10087 self.share_locks = _ShareAll()
10088 self._ExpandAndLockInstance()
10090 def CheckPrereq(self):
10091 """Check prerequisites.
10093     This checks that the instance is in the cluster.
10095     """
10096 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10097 assert self.instance is not None, \
10098 "Cannot retrieve locked instance %s" % self.op.instance_name
10099 _CheckNodeOnline(self, self.instance.primary_node)
10101 def Exec(self, feedback_fn):
10102 """Connect to the console of an instance
10105 instance = self.instance
10106 node = instance.primary_node
10108 node_insts = self.rpc.call_instance_list([node],
10109 [instance.hypervisor])[node]
10110 node_insts.Raise("Can't get node information from %s" % node)
10112 if instance.name not in node_insts.payload:
10113 if instance.admin_state == constants.ADMINST_UP:
10114 state = constants.INSTST_ERRORDOWN
10115 elif instance.admin_state == constants.ADMINST_DOWN:
10116 state = constants.INSTST_ADMINDOWN
10118 state = constants.INSTST_ADMINOFFLINE
10119 raise errors.OpExecError("Instance %s is not running (state %s)" %
10120 (instance.name, state))
10122 logging.debug("Connecting to console of %s on %s", instance.name, node)
10124 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
10127 def _GetInstanceConsole(cluster, instance):
10128 """Returns console information for an instance.
10130 @type cluster: L{objects.Cluster}
10131   @type instance: L{objects.Instance}
10134   """
10135 hyper = hypervisor.GetHypervisor(instance.hypervisor)
10136 # beparams and hvparams are passed separately, to avoid editing the
10137 # instance and then saving the defaults in the instance itself.
10138 hvparams = cluster.FillHV(instance)
10139 beparams = cluster.FillBE(instance)
10140 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
10142 assert console.instance == instance.name
10143 assert console.Validate()
10145 return console.ToDict()
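# Sketch of the assumed caller behaviour (illustrative, not part of this
# module): the CLI/RAPI layer rebuilds the console object from this dict and
# runs the resulting command on the master node, roughly:
#
#   console = objects.InstanceConsole.FromDict(lu_result)
#   # ... then spawn an SSH/VNC client depending on console.kind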
10148 class LUInstanceReplaceDisks(LogicalUnit):
10149 """Replace the disks of an instance.
10152 HPATH = "mirrors-replace"
10153   HTYPE = constants.HTYPE_INSTANCE
10154   REQ_BGL = False
10156 def CheckArguments(self):
10157 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
10158 self.op.iallocator)
10160 def ExpandNames(self):
10161 self._ExpandAndLockInstance()
10163 assert locking.LEVEL_NODE not in self.needed_locks
10164 assert locking.LEVEL_NODE_RES not in self.needed_locks
10165 assert locking.LEVEL_NODEGROUP not in self.needed_locks
10167 assert self.op.iallocator is None or self.op.remote_node is None, \
10168 "Conflicting options"
10170 if self.op.remote_node is not None:
10171 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10173 # Warning: do not remove the locking of the new secondary here
10174 # unless DRBD8.AddChildren is changed to work in parallel;
10175 # currently it doesn't since parallel invocations of
10176 # FindUnusedMinor will conflict
10177 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
10178       self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
10179     else:
10180       self.needed_locks[locking.LEVEL_NODE] = []
10181 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10183 if self.op.iallocator is not None:
10184 # iallocator will select a new node in the same group
10185 self.needed_locks[locking.LEVEL_NODEGROUP] = []
10187 self.needed_locks[locking.LEVEL_NODE_RES] = []
10189 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
10190 self.op.iallocator, self.op.remote_node,
10191 self.op.disks, False, self.op.early_release,
10192 self.op.ignore_ipolicy)
10194 self.tasklets = [self.replacer]
10196 def DeclareLocks(self, level):
10197 if level == locking.LEVEL_NODEGROUP:
10198 assert self.op.remote_node is None
10199 assert self.op.iallocator is not None
10200 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
10202 self.share_locks[locking.LEVEL_NODEGROUP] = 1
10203 # Lock all groups used by instance optimistically; this requires going
10204 # via the node before it's locked, requiring verification later on
10205 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10206 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
10208 elif level == locking.LEVEL_NODE:
10209 if self.op.iallocator is not None:
10210 assert self.op.remote_node is None
10211 assert not self.needed_locks[locking.LEVEL_NODE]
10213 # Lock member nodes of all locked groups
10214 self.needed_locks[locking.LEVEL_NODE] = [node_name
10215 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
10216           for node_name in self.cfg.GetNodeGroup(group_uuid).members]
10217       else:
10218         self._LockInstancesNodes()
10219 elif level == locking.LEVEL_NODE_RES:
10220       # Reuse node locks
10221       self.needed_locks[locking.LEVEL_NODE_RES] = \
10222 self.needed_locks[locking.LEVEL_NODE]
10224 def BuildHooksEnv(self):
10225 """Build hooks env.
10227     This runs on the master, the primary and all the secondaries.
10229     """
10230     instance = self.replacer.instance
10231     env = {
10232       "MODE": self.op.mode,
10233       "NEW_SECONDARY": self.op.remote_node,
10234       "OLD_SECONDARY": instance.secondary_nodes[0],
10235       }
10236     env.update(_BuildInstanceHookEnvByObject(self, instance))
10237     return env
10239 def BuildHooksNodes(self):
10240 """Build hooks nodes.
10243 instance = self.replacer.instance
10245 self.cfg.GetMasterNode(),
10246 instance.primary_node,
10248 if self.op.remote_node is not None:
10249 nl.append(self.op.remote_node)
10252 def CheckPrereq(self):
10253 """Check prerequisites.
10256 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
10257 self.op.iallocator is None)
10259 # Verify if node group locks are still correct
10260     owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10261     if owned_groups:
10262       _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
10264 return LogicalUnit.CheckPrereq(self)
10267 class TLReplaceDisks(Tasklet):
10268 """Replaces disks for an instance.
10270   Note: Locking is not within the scope of this class.
10272   """
10273 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
10274 disks, delay_iallocator, early_release, ignore_ipolicy):
10275 """Initializes this class.
10278 Tasklet.__init__(self, lu)
10280     # Parameters
10281     self.instance_name = instance_name
10282     self.mode = mode
10283     self.iallocator_name = iallocator_name
10284     self.remote_node = remote_node
10285     self.disks = disks
10286 self.delay_iallocator = delay_iallocator
10287 self.early_release = early_release
10288 self.ignore_ipolicy = ignore_ipolicy
10291 self.instance = None
10292 self.new_node = None
10293 self.target_node = None
10294 self.other_node = None
10295 self.remote_node_info = None
10296 self.node_secondary_ip = None
10298   @staticmethod
10299   def CheckArguments(mode, remote_node, iallocator):
10300 """Helper function for users of this class.
10303 # check for valid parameter combination
10304 if mode == constants.REPLACE_DISK_CHG:
10305 if remote_node is None and iallocator is None:
10306 raise errors.OpPrereqError("When changing the secondary either an"
10307 " iallocator script must be used or the"
10308 " new node given", errors.ECODE_INVAL)
10310 if remote_node is not None and iallocator is not None:
10311 raise errors.OpPrereqError("Give either the iallocator or the new"
10312 " secondary, not both", errors.ECODE_INVAL)
10314 elif remote_node is not None or iallocator is not None:
10315 # Not replacing the secondary
10316 raise errors.OpPrereqError("The iallocator and new node options can"
10317 " only be used when changing the"
10318 " secondary node", errors.ECODE_INVAL)
10320   @staticmethod
10321   def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
10322 """Compute a new secondary node using an IAllocator.
10325 ial = IAllocator(lu.cfg, lu.rpc,
10326 mode=constants.IALLOCATOR_MODE_RELOC,
10327 name=instance_name,
10328 relocate_from=list(relocate_from))
10330 ial.Run(iallocator_name)
10332 if not ial.success:
10333 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
10334 " %s" % (iallocator_name, ial.info),
10335 errors.ECODE_NORES)
10337 if len(ial.result) != ial.required_nodes:
10338 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
10339 " of nodes (%s), required %s" %
10341 len(ial.result), ial.required_nodes),
10342 errors.ECODE_FAULT)
10344 remote_node_name = ial.result[0]
10346 lu.LogInfo("Selected new secondary for instance '%s': %s",
10347 instance_name, remote_node_name)
10349 return remote_node_name
10351 def _FindFaultyDisks(self, node_name):
10352 """Wrapper for L{_FindFaultyInstanceDisks}.
10355 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
10358 def _CheckDisksActivated(self, instance):
10359 """Checks if the instance disks are activated.
10361 @param instance: The instance to check disks
10362     @return: True if they are activated, False otherwise
10364     """
10365     nodes = instance.all_nodes
10367     for idx, dev in enumerate(instance.disks):
10368       for node in nodes:
10369         self.lu.LogInfo("Checking disk/%d on %s", idx, node)
10370         self.cfg.SetDiskID(dev, node)
10372         result = self.rpc.call_blockdev_find(node, dev)
10374         if result.offline:
10375           continue
10376         elif result.fail_msg or not result.payload:
10377           return False
10379     return True
10381 def CheckPrereq(self):
10382 """Check prerequisites.
10384     This checks that the instance is in the cluster.
10386     """
10387 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
10388 assert instance is not None, \
10389 "Cannot retrieve locked instance %s" % self.instance_name
10391 if instance.disk_template != constants.DT_DRBD8:
10392 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
10393 " instances", errors.ECODE_INVAL)
10395 if len(instance.secondary_nodes) != 1:
10396 raise errors.OpPrereqError("The instance has a strange layout,"
10397 " expected one secondary but found %d" %
10398 len(instance.secondary_nodes),
10399 errors.ECODE_FAULT)
10401 if not self.delay_iallocator:
10402 self._CheckPrereq2()
10404 def _CheckPrereq2(self):
10405 """Check prerequisites, second part.
10407 This function should always be part of CheckPrereq. It was separated and is
10408 now called from Exec because during node evacuation iallocator was only
10409     called with an unmodified cluster model, not taking planned changes into
10410     account.
10412     """
10413 instance = self.instance
10414 secondary_node = instance.secondary_nodes[0]
10416 if self.iallocator_name is None:
10417       remote_node = self.remote_node
10418     else:
10419       remote_node = self._RunAllocator(self.lu, self.iallocator_name,
10420                                        instance.name, instance.secondary_nodes)
10420 instance.name, instance.secondary_nodes)
10422 if remote_node is None:
10423       self.remote_node_info = None
10424     else:
10425       assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
10426 "Remote node '%s' is not locked" % remote_node
10428 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
10429 assert self.remote_node_info is not None, \
10430 "Cannot retrieve locked node %s" % remote_node
10432 if remote_node == self.instance.primary_node:
10433 raise errors.OpPrereqError("The specified node is the primary node of"
10434 " the instance", errors.ECODE_INVAL)
10436 if remote_node == secondary_node:
10437 raise errors.OpPrereqError("The specified node is already the"
10438 " secondary node of the instance",
10439 errors.ECODE_INVAL)
10441 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
10442 constants.REPLACE_DISK_CHG):
10443 raise errors.OpPrereqError("Cannot specify disks to be replaced",
10444 errors.ECODE_INVAL)
10446 if self.mode == constants.REPLACE_DISK_AUTO:
10447 if not self._CheckDisksActivated(instance):
10448 raise errors.OpPrereqError("Please run activate-disks on instance %s"
10449 " first" % self.instance_name,
10450 errors.ECODE_STATE)
10451 faulty_primary = self._FindFaultyDisks(instance.primary_node)
10452 faulty_secondary = self._FindFaultyDisks(secondary_node)
10454 if faulty_primary and faulty_secondary:
10455 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
10456 " one node and can not be repaired"
10457 " automatically" % self.instance_name,
10458 errors.ECODE_STATE)
10460       if faulty_primary:
10461         self.disks = faulty_primary
10462 self.target_node = instance.primary_node
10463 self.other_node = secondary_node
10464 check_nodes = [self.target_node, self.other_node]
10465 elif faulty_secondary:
10466 self.disks = faulty_secondary
10467 self.target_node = secondary_node
10468 self.other_node = instance.primary_node
10469         check_nodes = [self.target_node, self.other_node]
10470       else:
10471         self.disks = []
10472         check_nodes = []
10474     else:
10475       # Non-automatic modes
10476 if self.mode == constants.REPLACE_DISK_PRI:
10477 self.target_node = instance.primary_node
10478 self.other_node = secondary_node
10479 check_nodes = [self.target_node, self.other_node]
10481 elif self.mode == constants.REPLACE_DISK_SEC:
10482 self.target_node = secondary_node
10483 self.other_node = instance.primary_node
10484 check_nodes = [self.target_node, self.other_node]
10486 elif self.mode == constants.REPLACE_DISK_CHG:
10487 self.new_node = remote_node
10488 self.other_node = instance.primary_node
10489 self.target_node = secondary_node
10490 check_nodes = [self.new_node, self.other_node]
10492 _CheckNodeNotDrained(self.lu, remote_node)
10493 _CheckNodeVmCapable(self.lu, remote_node)
10495 old_node_info = self.cfg.GetNodeInfo(secondary_node)
10496 assert old_node_info is not None
10497 if old_node_info.offline and not self.early_release:
10498 # doesn't make sense to delay the release
10499 self.early_release = True
10500 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
10501 " early-release mode", secondary_node)
10504 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
10507 # If not specified all disks should be replaced
10509 self.disks = range(len(self.instance.disks))
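    # Summary of the dispatch above (for readers):
    #   REPLACE_DISK_PRI: target=primary,   other=secondary, new_node=None
    #   REPLACE_DISK_SEC: target=secondary, other=primary,   new_node=None
    #   REPLACE_DISK_CHG: target=secondary, other=primary,   new_node=remote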
10511 # TODO: This is ugly, but right now we can't distinguish between internal
10512 # submitted opcode and external one. We should fix that.
10513 if self.remote_node_info:
10514       # We change the node, let's verify it still meets instance policy
10515       new_group_info = self.cfg.GetNodeGroup(self.remote_node_info.group)
10516       ipolicy = _CalculateGroupIPolicy(self.cfg.GetClusterInfo(),
10517                                        new_group_info)
10518       _CheckTargetNodeIPolicy(self, ipolicy, instance, self.remote_node_info,
10519 ignore=self.ignore_ipolicy)
10521 # TODO: compute disk parameters
10522 primary_node_info = self.cfg.GetNodeInfo(instance.primary_node)
10523 secondary_node_info = self.cfg.GetNodeInfo(secondary_node)
10524 if primary_node_info.group != secondary_node_info.group:
10525 self.lu.LogInfo("The instance primary and secondary nodes are in two"
10526 " different node groups; the disk parameters of the"
10527 " primary node's group will be applied.")
10529 self.diskparams = self.cfg.GetNodeGroup(primary_node_info.group).diskparams
10531 for node in check_nodes:
10532 _CheckNodeOnline(self.lu, node)
10534     touched_nodes = frozenset(node_name for node_name in [self.new_node,
10535                                                           self.other_node,
10536                                                           self.target_node]
10537                               if node_name is not None)
10539 # Release unneeded node and node resource locks
10540 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
10541 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES, keep=touched_nodes)
10543 # Release any owned node group
10544 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
10545 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
10547 # Check whether disks are valid
10548 for disk_idx in self.disks:
10549 instance.FindDisk(disk_idx)
10551 # Get secondary node IP addresses
10552 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
10553 in self.cfg.GetMultiNodeInfo(touched_nodes))
10555 def Exec(self, feedback_fn):
10556 """Execute disk replacement.
10558     This dispatches the disk replacement to the appropriate handler.
10560     """
10561 if self.delay_iallocator:
10562 self._CheckPrereq2()
10564     if __debug__:
10565       # Verify owned locks before starting operation
10566 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
10567 assert set(owned_nodes) == set(self.node_secondary_ip), \
10568 ("Incorrect node locks, owning %s, expected %s" %
10569 (owned_nodes, self.node_secondary_ip.keys()))
10570 assert (self.lu.owned_locks(locking.LEVEL_NODE) ==
10571 self.lu.owned_locks(locking.LEVEL_NODE_RES))
10573 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
10574 assert list(owned_instances) == [self.instance_name], \
10575 "Instance '%s' not locked" % self.instance_name
10577 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
10578 "Should not own any node group lock at this point"
10581 feedback_fn("No disks need replacement")
10584 feedback_fn("Replacing disk(s) %s for %s" %
10585 (utils.CommaJoin(self.disks), self.instance.name))
10587 activate_disks = (self.instance.admin_state != constants.ADMINST_UP)
10589     # Activate the instance disks if we're replacing them on a down instance
10590     if activate_disks:
10591       _StartInstanceDisks(self.lu, self.instance, True)
10593     try:
10594       # Should we replace the secondary node?
10595       if self.new_node is not None:
10596         fn = self._ExecDrbd8Secondary
10597       else:
10598         fn = self._ExecDrbd8DiskOnly
10600       result = fn(feedback_fn)
10601     finally:
10602       # Deactivate the instance disks if we're replacing them on a
10603       # down instance
10604       if activate_disks:
10605         _SafeShutdownInstanceDisks(self.lu, self.instance)
10607 assert not self.lu.owned_locks(locking.LEVEL_NODE)
10609     if __debug__:
10610       # Verify owned locks
10611 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE_RES)
10612 nodes = frozenset(self.node_secondary_ip)
10613 assert ((self.early_release and not owned_nodes) or
10614 (not self.early_release and not (set(owned_nodes) - nodes))), \
10615 ("Not owning the correct locks, early_release=%s, owned=%r,"
10616 " nodes=%r" % (self.early_release, owned_nodes, nodes))
10620 def _CheckVolumeGroup(self, nodes):
10621 self.lu.LogInfo("Checking volume groups")
10623 vgname = self.cfg.GetVGName()
10625 # Make sure volume group exists on all involved nodes
10626     results = self.rpc.call_vg_list(nodes)
10627     if not results:
10628       raise errors.OpExecError("Can't list volume groups on the nodes")
10630     for node in nodes:
10631       res = results[node]
10632 res.Raise("Error checking node %s" % node)
10633 if vgname not in res.payload:
10634 raise errors.OpExecError("Volume group '%s' not found on node %s" %
10637 def _CheckDisksExistence(self, nodes):
10638 # Check disk existence
10639 for idx, dev in enumerate(self.instance.disks):
10640       if idx not in self.disks:
10641         continue
10643       for node in nodes:
10644         self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
10645 self.cfg.SetDiskID(dev, node)
10647 result = self.rpc.call_blockdev_find(node, dev)
10649 msg = result.fail_msg
10650         if msg or not result.payload:
10651           if not msg:
10652             msg = "disk not found"
10653           raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
10654                                    (idx, node, msg))
10656 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
10657 for idx, dev in enumerate(self.instance.disks):
10658       if idx not in self.disks:
10659         continue
10661       self.lu.LogInfo("Checking disk/%d consistency on node %s" %
10662                       (idx, node_name))
10664       if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
10665                                    ldisk=ldisk):
10666 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
10667 " replace disks for instance %s" %
10668 (node_name, self.instance.name))
10670 def _CreateNewStorage(self, node_name):
10671 """Create new storage on the primary or secondary node.
10673 This is only used for same-node replaces, not for changing the
10674     secondary node, hence we don't want to modify the existing disk.
10676     """
10677     iv_names = {}
10679     for idx, dev in enumerate(self.instance.disks):
10680       if idx not in self.disks:
10681         continue
10683 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
10685 self.cfg.SetDiskID(dev, node_name)
10687 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
10688 names = _GenerateUniqueNames(self.lu, lv_names)
10690 _, data_p, meta_p = _ComputeLDParams(constants.DT_DRBD8, self.diskparams)
10692 vg_data = dev.children[0].logical_id[0]
10693 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
10694 logical_id=(vg_data, names[0]), params=data_p)
10695 vg_meta = dev.children[1].logical_id[0]
10696 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
10697 logical_id=(vg_meta, names[1]), params=meta_p)
10699 new_lvs = [lv_data, lv_meta]
10700 old_lvs = [child.Copy() for child in dev.children]
10701 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
10703 # we pass force_create=True to force the LVM creation
10704 for new_lv in new_lvs:
10705 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
10706                         _GetInstanceInfoText(self.instance), False)
10708     return iv_names
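    # iv_names maps each DRBD's iv_name (e.g. "disk/0") to a tuple of
    # (drbd_dev, old_lvs, new_lvs); _CheckDevices and _RemoveOldStorage
    # consume this mapping to verify the rebuilt device and to drop the
    # renamed LVs.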
10710 def _CheckDevices(self, node_name, iv_names):
10711 for name, (dev, _, _) in iv_names.iteritems():
10712 self.cfg.SetDiskID(dev, node_name)
10714 result = self.rpc.call_blockdev_find(node_name, dev)
10716 msg = result.fail_msg
10717       if msg or not result.payload:
10718         if not msg:
10719           msg = "disk not found"
10720         raise errors.OpExecError("Can't find DRBD device %s: %s" %
10721                                  (name, msg))
10723 if result.payload.is_degraded:
10724 raise errors.OpExecError("DRBD device %s is degraded!" % name)
10726 def _RemoveOldStorage(self, node_name, iv_names):
10727 for name, (_, old_lvs, _) in iv_names.iteritems():
10728 self.lu.LogInfo("Remove logical volumes for %s" % name)
10731 self.cfg.SetDiskID(lv, node_name)
10733 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
10735 self.lu.LogWarning("Can't remove old LV: %s" % msg,
10736 hint="remove unused LVs manually")
10738 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
10739 """Replace a disk on the primary or secondary for DRBD 8.
10741 The algorithm for replace is quite complicated:
10743 1. for each disk to be replaced:
10745 1. create new LVs on the target node with unique names
10746 1. detach old LVs from the drbd device
10747 1. rename old LVs to name_replaced.<time_t>
10748 1. rename new LVs to old LVs
10749 1. attach the new LVs (with the old names now) to the drbd device
10751 1. wait for sync across all devices
10753 1. for each modified disk:
10755 1. remove old LVs (which have the name name_replaces.<time_t>)
10757     Failures are not very well handled.
10759     """
10760     steps_total = 6
10762 # Step: check device activation
10763 self.lu.LogStep(1, steps_total, "Check device existence")
10764 self._CheckDisksExistence([self.other_node, self.target_node])
10765 self._CheckVolumeGroup([self.target_node, self.other_node])
10767 # Step: check other node consistency
10768 self.lu.LogStep(2, steps_total, "Check peer consistency")
10769 self._CheckDisksConsistency(self.other_node,
10770                                 self.other_node == self.instance.primary_node,
10771                                 False)
10773 # Step: create new storage
10774 self.lu.LogStep(3, steps_total, "Allocate new storage")
10775 iv_names = self._CreateNewStorage(self.target_node)
10777 # Step: for each lv, detach+rename*2+attach
10778 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10779 for dev, old_lvs, new_lvs in iv_names.itervalues():
10780 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
10782       result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
10783                                                      old_lvs)
10784 result.Raise("Can't detach drbd from local storage on node"
10785 " %s for device %s" % (self.target_node, dev.iv_name))
10787 #cfg.Update(instance)
10789 # ok, we created the new LVs, so now we know we have the needed
10790 # storage; as such, we proceed on the target node to rename
10791 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
10792 # using the assumption that logical_id == physical_id (which in
10793 # turn is the unique_id on that node)
10795 # FIXME(iustin): use a better name for the replaced LVs
10796 temp_suffix = int(time.time())
10797 ren_fn = lambda d, suff: (d.physical_id[0],
10798 d.physical_id[1] + "_replaced-%s" % suff)
10800 # Build the rename list based on what LVs exist on the node
10801 rename_old_to_new = []
10802 for to_ren in old_lvs:
10803 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
10804 if not result.fail_msg and result.payload:
10806 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
10808 self.lu.LogInfo("Renaming the old LVs on the target node")
10809       result = self.rpc.call_blockdev_rename(self.target_node,
10810                                              rename_old_to_new)
10811 result.Raise("Can't rename old LVs on node %s" % self.target_node)
10813 # Now we rename the new LVs to the old LVs
10814 self.lu.LogInfo("Renaming the new LVs on the target node")
10815 rename_new_to_old = [(new, old.physical_id)
10816 for old, new in zip(old_lvs, new_lvs)]
10817       result = self.rpc.call_blockdev_rename(self.target_node,
10818                                              rename_new_to_old)
10819 result.Raise("Can't rename new LVs on node %s" % self.target_node)
10821 # Intermediate steps of in memory modifications
10822 for old, new in zip(old_lvs, new_lvs):
10823 new.logical_id = old.logical_id
10824 self.cfg.SetDiskID(new, self.target_node)
10826 # We need to modify old_lvs so that removal later removes the
10827       # right LVs, not the newly added ones; note that old_lvs is a
10828       # copy here
10829 for disk in old_lvs:
10830 disk.logical_id = ren_fn(disk, temp_suffix)
10831 self.cfg.SetDiskID(disk, self.target_node)
10833 # Now that the new lvs have the old name, we can add them to the device
10834 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
10835       result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
10836                                                   new_lvs)
10837       msg = result.fail_msg
10838       if msg:
10839         for new_lv in new_lvs:
10840           msg2 = self.rpc.call_blockdev_remove(self.target_node,
10841                                                new_lv).fail_msg
10842           if msg2:
10843             self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
10844                                hint=("cleanup manually the unused logical"
10845                                      " volumes"))
10846         raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
10848 cstep = itertools.count(5)
10850 if self.early_release:
10851 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
10852 self._RemoveOldStorage(self.target_node, iv_names)
10853 # TODO: Check if releasing locks early still makes sense
10854 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES)
10856 # Release all resource locks except those used by the instance
10857 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES,
10858 keep=self.node_secondary_ip.keys())
10860 # Release all node locks while waiting for sync
10861 _ReleaseLocks(self.lu, locking.LEVEL_NODE)
10863 # TODO: Can the instance lock be downgraded here? Take the optional disk
10864 # shutdown in the caller into consideration.
10867 # This can fail as the old devices are degraded and _WaitForSync
10868 # does a combined result over all disks, so we don't check its return value
10869 self.lu.LogStep(cstep.next(), steps_total, "Sync devices")
10870 _WaitForSync(self.lu, self.instance)
10872 # Check all devices manually
10873 self._CheckDevices(self.instance.primary_node, iv_names)
10875 # Step: remove old storage
10876 if not self.early_release:
10877 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
10878 self._RemoveOldStorage(self.target_node, iv_names)
10880 def _ExecDrbd8Secondary(self, feedback_fn):
10881 """Replace the secondary node for DRBD 8.
10883 The algorithm for replace is quite complicated:
10884 - for all disks of the instance:
10885 - create new LVs on the new node with same names
10886 - shutdown the drbd device on the old secondary
10887 - disconnect the drbd network on the primary
10888 - create the drbd device on the new secondary
10889 - network attach the drbd on the primary, using an artifice:
10890 the drbd code for Attach() will connect to the network if it
10891 finds a device which is connected to the good local disks but
10892 not network enabled
10893 - wait for sync across all devices
10894 - remove all disks from the old secondary
10896     Failures are not very well handled.
10898     """
10899     steps_total = 6
10901 pnode = self.instance.primary_node
10903 # Step: check device activation
10904 self.lu.LogStep(1, steps_total, "Check device existence")
10905 self._CheckDisksExistence([self.instance.primary_node])
10906 self._CheckVolumeGroup([self.instance.primary_node])
10908 # Step: check other node consistency
10909 self.lu.LogStep(2, steps_total, "Check peer consistency")
10910 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10912 # Step: create new storage
10913 self.lu.LogStep(3, steps_total, "Allocate new storage")
10914 for idx, dev in enumerate(self.instance.disks):
10915 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10916 (self.new_node, idx))
10917 # we pass force_create=True to force LVM creation
10918 for new_lv in dev.children:
10919 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10920 _GetInstanceInfoText(self.instance), False)
10922     # Step 4: drbd minors and drbd setup changes
10923 # after this, we must manually remove the drbd minors on both the
10924 # error and the success paths
10925 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10926 minors = self.cfg.AllocateDRBDMinor([self.new_node
10927 for dev in self.instance.disks],
10928 self.instance.name)
10929 logging.debug("Allocated minors %r", minors)
10932 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10933 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10934 (self.new_node, idx))
10935 # create new devices on new_node; note that we create two IDs:
10936 # one without port, so the drbd will be activated without
10937 # networking information on the new node at this stage, and one
10938 # with network, for the latter activation in step 4
10939 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10940       if self.instance.primary_node == o_node1:
10941         p_minor = o_minor1
10942       else:
10943         assert self.instance.primary_node == o_node2, "Three-node instance?"
10944         p_minor = o_minor2
10946 new_alone_id = (self.instance.primary_node, self.new_node, None,
10947 p_minor, new_minor, o_secret)
10948 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10949 p_minor, new_minor, o_secret)
10951 iv_names[idx] = (dev, dev.children, new_net_id)
10952 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10954 drbd_params, _, _ = _ComputeLDParams(constants.DT_DRBD8, self.diskparams)
10955 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10956 logical_id=new_alone_id,
10957                               children=dev.children,
10958                               size=dev.size,
10959                               params=drbd_params)
10960       try:
10961         _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10962                               _GetInstanceInfoText(self.instance), False)
10963       except errors.GenericError:
10964         self.cfg.ReleaseDRBDMinors(self.instance.name)
10965         raise
10967 # We have new devices, shutdown the drbd on the old secondary
10968 for idx, dev in enumerate(self.instance.disks):
10969 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10970 self.cfg.SetDiskID(dev, self.target_node)
msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
if msg:
  self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
                     " node: %s" % (idx, msg),
                     hint=("Please clean up this device manually as"
                           " soon as possible"))
10978 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10979 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10980 self.instance.disks)[pnode]
msg = result.fail_msg
if msg:
  # detaches didn't succeed (unlikely)
  self.cfg.ReleaseDRBDMinors(self.instance.name)
  raise errors.OpExecError("Can't detach the disks from the network on"
                           " old node: %s" % (msg,))
10989 # if we managed to detach at least one, we update all the disks of
10990 # the instance to point to the new secondary
10991 self.lu.LogInfo("Updating instance configuration")
10992 for dev, _, new_logical_id in iv_names.itervalues():
10993 dev.logical_id = new_logical_id
10994 self.cfg.SetDiskID(dev, self.instance.primary_node)
10996 self.cfg.Update(self.instance, feedback_fn)
10998 # Release all node locks (the configuration has been updated)
10999 _ReleaseLocks(self.lu, locking.LEVEL_NODE)
11001 # and now perform the drbd attach
11002 self.lu.LogInfo("Attaching primary drbds to new secondary"
11003 " (standalone => connected)")
result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
                                        self.new_node],
                                       self.node_secondary_ip,
                                       self.instance.disks,
                                       self.instance.name,
                                       False)
for to_node, to_result in result.items():
  msg = to_result.fail_msg
  if msg:
    self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
                       to_node, msg,
                       hint=("please do a gnt-instance info to see the"
                             " status of disks"))
11018 cstep = itertools.count(5)
11020 if self.early_release:
11021 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
11022 self._RemoveOldStorage(self.target_node, iv_names)
11023 # TODO: Check if releasing locks early still makes sense
11024 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES)
11026 # Release all resource locks except those used by the instance
11027 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES,
11028 keep=self.node_secondary_ip.keys())
11030 # TODO: Can the instance lock be downgraded here? Take the optional disk
11031 # shutdown in the caller into consideration.
# This can fail as the old devices are degraded and _WaitForSync
# only reports a combined result over all disks, so we don't check
# its return value
11036 self.lu.LogStep(cstep.next(), steps_total, "Sync devices")
11037 _WaitForSync(self.lu, self.instance)
11039 # Check all devices manually
11040 self._CheckDevices(self.instance.primary_node, iv_names)
11042 # Step: remove old storage
11043 if not self.early_release:
11044 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
11045 self._RemoveOldStorage(self.target_node, iv_names)
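# Illustrative only: this path is normally reached from the command line
# via something like "gnt-instance replace-disks --new-secondary node4
# inst1" (exact option spelling depends on the installed version).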
11048 class LURepairNodeStorage(NoHooksLU):
11049 """Repairs the volume group on a node.
11054 def CheckArguments(self):
11055 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
11057 storage_type = self.op.storage_type
11059 if (constants.SO_FIX_CONSISTENCY not in
11060 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
11061 raise errors.OpPrereqError("Storage units of type '%s' can not be"
11062 " repaired" % storage_type,
11063 errors.ECODE_INVAL)
11065 def ExpandNames(self):
self.needed_locks = {
  locking.LEVEL_NODE: [self.op.node_name],
  }
11070 def _CheckFaultyDisks(self, instance, node_name):
11071 """Ensure faulty disks abort the opcode or at least warn."""
try:
  if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
                              node_name, True):
    raise errors.OpPrereqError("Instance '%s' has faulty disks on"
                               " node '%s'" % (instance.name, node_name),
                               errors.ECODE_STATE)
except errors.OpPrereqError, err:
  if self.op.ignore_consistency:
    self.proc.LogWarning(str(err.args[0]))
  else:
    raise
11084 def CheckPrereq(self):
11085 """Check prerequisites.
11088 # Check whether any instance on this node has faulty disks
11089 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
if inst.admin_state != constants.ADMINST_UP:
  continue

check_nodes = set(inst.all_nodes)
11093 check_nodes.discard(self.op.node_name)
11094 for inst_node_name in check_nodes:
11095 self._CheckFaultyDisks(inst, inst_node_name)
11097 def Exec(self, feedback_fn):
11098 feedback_fn("Repairing storage unit '%s' on %s ..." %
11099 (self.op.name, self.op.node_name))
11101 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
result = self.rpc.call_storage_execute(self.op.node_name,
                                       self.op.storage_type, st_args,
                                       self.op.name,
                                       constants.SO_FIX_CONSISTENCY)
11106 result.Raise("Failed to repair storage unit '%s' on %s" %
11107 (self.op.name, self.op.node_name))
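# Illustrative only: the corresponding CLI invocation is along the lines
# of "gnt-node repair-storage node1.example.com lvm-vg xenvg" (the node,
# storage type and volume group names are examples, not defaults).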
11110 class LUNodeEvacuate(NoHooksLU):
11111 """Evacuates instances off a list of nodes.
11116 _MODE2IALLOCATOR = {
11117 constants.NODE_EVAC_PRI: constants.IALLOCATOR_NEVAC_PRI,
11118 constants.NODE_EVAC_SEC: constants.IALLOCATOR_NEVAC_SEC,
constants.NODE_EVAC_ALL: constants.IALLOCATOR_NEVAC_ALL,
}
11121 assert frozenset(_MODE2IALLOCATOR.keys()) == constants.NODE_EVAC_MODES
11122 assert (frozenset(_MODE2IALLOCATOR.values()) ==
11123 constants.IALLOCATOR_NEVAC_MODES)
11125 def CheckArguments(self):
11126 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
11128 def ExpandNames(self):
11129 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
11131 if self.op.remote_node is not None:
11132 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
11133 assert self.op.remote_node
11135 if self.op.remote_node == self.op.node_name:
11136 raise errors.OpPrereqError("Can not use evacuated node as a new"
11137 " secondary node", errors.ECODE_INVAL)
11139 if self.op.mode != constants.NODE_EVAC_SEC:
11140 raise errors.OpPrereqError("Without the use of an iallocator only"
11141 " secondary instances can be evacuated",
11142 errors.ECODE_INVAL)
11145 self.share_locks = _ShareAll()
self.needed_locks = {
  locking.LEVEL_INSTANCE: [],
  locking.LEVEL_NODEGROUP: [],
  locking.LEVEL_NODE: [],
  }
11152 # Determine nodes (via group) optimistically, needs verification once locks
11153 # have been acquired
11154 self.lock_nodes = self._DetermineNodes()
11156 def _DetermineNodes(self):
11157 """Gets the list of nodes to operate on.
if self.op.remote_node is None:
  # Iallocator will choose any node(s) in the same group
  group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
else:
  group_nodes = frozenset([self.op.remote_node])
11166 # Determine nodes to be locked
11167 return set([self.op.node_name]) | group_nodes
11169 def _DetermineInstances(self):
11170 """Builds list of instances to operate on.
11173 assert self.op.mode in constants.NODE_EVAC_MODES
11175 if self.op.mode == constants.NODE_EVAC_PRI:
11176 # Primary instances only
11177 inst_fn = _GetNodePrimaryInstances
11178 assert self.op.remote_node is None, \
11179 "Evacuating primary instances requires iallocator"
elif self.op.mode == constants.NODE_EVAC_SEC:
  # Secondary instances only
  inst_fn = _GetNodeSecondaryInstances
else:
  # All instances
  assert self.op.mode == constants.NODE_EVAC_ALL
  inst_fn = _GetNodeInstances
  # TODO: In 2.6, change the iallocator interface to take an evacuation mode
  raise errors.OpPrereqError("Due to an issue with the iallocator"
                             " interface it is not possible to evacuate"
                             " all instances at once; specify explicitly"
                             " whether to evacuate primary or secondary"
                             " instances",
                             errors.ECODE_INVAL)
11196 return inst_fn(self.cfg, self.op.node_name)
11198 def DeclareLocks(self, level):
11199 if level == locking.LEVEL_INSTANCE:
11200 # Lock instances optimistically, needs verification once node and group
11201 # locks have been acquired
11202 self.needed_locks[locking.LEVEL_INSTANCE] = \
11203 set(i.name for i in self._DetermineInstances())
11205 elif level == locking.LEVEL_NODEGROUP:
11206 # Lock node groups for all potential target nodes optimistically, needs
11207 # verification once nodes have been acquired
11208 self.needed_locks[locking.LEVEL_NODEGROUP] = \
11209 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
11211 elif level == locking.LEVEL_NODE:
11212 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
11214 def CheckPrereq(self):
11216 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
11217 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
11218 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
11220 need_nodes = self._DetermineNodes()
11222 if not owned_nodes.issuperset(need_nodes):
11223 raise errors.OpPrereqError("Nodes in same group as '%s' changed since"
11224 " locks were acquired, current nodes are"
11225 " are '%s', used to be '%s'; retry the"
11227 (self.op.node_name,
11228 utils.CommaJoin(need_nodes),
11229 utils.CommaJoin(owned_nodes)),
11230 errors.ECODE_STATE)
11232 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
11233 if owned_groups != wanted_groups:
11234 raise errors.OpExecError("Node groups changed since locks were acquired,"
11235 " current groups are '%s', used to be '%s';"
11236 " retry the operation" %
11237 (utils.CommaJoin(wanted_groups),
11238 utils.CommaJoin(owned_groups)))
11240 # Determine affected instances
11241 self.instances = self._DetermineInstances()
11242 self.instance_names = [i.name for i in self.instances]
11244 if set(self.instance_names) != owned_instances:
11245 raise errors.OpExecError("Instances on node '%s' changed since locks"
11246 " were acquired, current instances are '%s',"
11247 " used to be '%s'; retry the operation" %
11248 (self.op.node_name,
11249 utils.CommaJoin(self.instance_names),
11250 utils.CommaJoin(owned_instances)))
11252 if self.instance_names:
11253 self.LogInfo("Evacuating instances from node '%s': %s",
11255 utils.CommaJoin(utils.NiceSort(self.instance_names)))
11257 self.LogInfo("No instances to evacuate from node '%s'",
11260 if self.op.remote_node is not None:
11261 for i in self.instances:
11262 if i.primary_node == self.op.remote_node:
11263 raise errors.OpPrereqError("Node %s is the primary node of"
11264 " instance %s, cannot use it as"
11266 (self.op.remote_node, i.name),
11267 errors.ECODE_INVAL)
11269 def Exec(self, feedback_fn):
11270 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
11272 if not self.instance_names:
# No instances to evacuate
jobs = []

elif self.op.iallocator is not None:
11277 # TODO: Implement relocation to other group
11278 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
11279 evac_mode=self._MODE2IALLOCATOR[self.op.mode],
11280 instances=list(self.instance_names))
11282 ial.Run(self.op.iallocator)
11284 if not ial.success:
11285 raise errors.OpPrereqError("Can't compute node evacuation using"
11286 " iallocator '%s': %s" %
11287 (self.op.iallocator, ial.info),
11288 errors.ECODE_NORES)
11290 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
11292 elif self.op.remote_node is not None:
assert self.op.mode == constants.NODE_EVAC_SEC
jobs = [
  [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
                                  remote_node=self.op.remote_node,
                                  disks=[],
                                  mode=constants.REPLACE_DISK_CHG,
                                  early_release=self.op.early_release)]
  for instance_name in self.instance_names
  ]

else:
  raise errors.ProgrammerError("No iallocator or remote node")
11306 return ResultWithJobs(jobs)
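# Illustrative only: secondary-only evacuation as built above corresponds
# roughly to "gnt-node evacuate -n node4 node3" (new secondary given
# explicitly), while the iallocator branch backs "gnt-node evacuate
# -I hail node3" (option names may vary between versions).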
11309 def _SetOpEarlyRelease(early_release, op):
11310 """Sets C{early_release} flag on opcodes if available.
11314 op.early_release = early_release
11315 except AttributeError:
11316 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
11321 def _NodeEvacDest(use_nodes, group, nodes):
11322 """Returns group or nodes depending on caller's choice.
11326 return utils.CommaJoin(nodes)
11331 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
11332 """Unpacks the result of change-group and node-evacuate iallocator requests.
11334 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
11335 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
11337 @type lu: L{LogicalUnit}
11338 @param lu: Logical unit instance
11339 @type alloc_result: tuple/list
11340 @param alloc_result: Result from iallocator
11341 @type early_release: bool
11342 @param early_release: Whether to release locks early if possible
11343 @type use_nodes: bool
11344 @param use_nodes: Whether to display node names instead of groups
(moved, failed, jobs) = alloc_result

if failed:
  failreason = utils.CommaJoin("%s (%s)" % (name, reason)
                               for (name, reason) in failed)
  lu.LogWarning("Unable to evacuate instances %s", failreason)
  raise errors.OpExecError("Unable to evacuate instances %s" % failreason)
11356 lu.LogInfo("Instances to be moved: %s",
11357 utils.CommaJoin("%s (to %s)" %
11358 (name, _NodeEvacDest(use_nodes, group, nodes))
11359 for (name, group, nodes) in moved))
return [map(compat.partial(_SetOpEarlyRelease, early_release),
            map(opcodes.OpCode.LoadOpCode, ops))
        for ops in jobs]
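# Note: each entry of the returned list is one job, itself a list of
# opcodes deserialized with LoadOpCode; the early_release flag is applied
# wherever the opcode supports it.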
11366 class LUInstanceGrowDisk(LogicalUnit):
11367 """Grow a disk of an instance.
11370 HPATH = "disk-grow"
HTYPE = constants.HTYPE_INSTANCE
REQ_BGL = False
11374 def ExpandNames(self):
11375 self._ExpandAndLockInstance()
11376 self.needed_locks[locking.LEVEL_NODE] = []
11377 self.needed_locks[locking.LEVEL_NODE_RES] = []
11378 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
11379 self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE
11381 def DeclareLocks(self, level):
11382 if level == locking.LEVEL_NODE:
11383 self._LockInstancesNodes()
11384 elif level == locking.LEVEL_NODE_RES:
11386 self.needed_locks[locking.LEVEL_NODE_RES] = \
11387 self.needed_locks[locking.LEVEL_NODE][:]
11389 def BuildHooksEnv(self):
11390 """Build hooks env.
This runs on the master, the primary and all the secondaries.

"""
env = {
  "DISK": self.op.disk,
  "AMOUNT": self.op.amount,
  }
env.update(_BuildInstanceHookEnvByObject(self, self.instance))
return env
11402 def BuildHooksNodes(self):
11403 """Build hooks nodes.
11406 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
11409 def CheckPrereq(self):
11410 """Check prerequisites.
This checks that the instance is in the cluster.

"""
11415 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11416 assert instance is not None, \
11417 "Cannot retrieve locked instance %s" % self.op.instance_name
11418 nodenames = list(instance.all_nodes)
11419 for node in nodenames:
11420 _CheckNodeOnline(self, node)
11422 self.instance = instance
11424 if instance.disk_template not in constants.DTS_GROWABLE:
11425 raise errors.OpPrereqError("Instance's disk layout does not support"
11426 " growing", errors.ECODE_INVAL)
11428 self.disk = instance.FindDisk(self.op.disk)
if instance.disk_template not in (constants.DT_FILE,
                                  constants.DT_SHARED_FILE,
                                  constants.DT_RBD):
  # TODO: check the free disk space for file, when that feature will be
  # supported
  _CheckNodesFreeDiskPerVG(self, nodenames,
                           self.disk.ComputeGrowth(self.op.amount))
11438 def Exec(self, feedback_fn):
11439 """Execute disk grow.
11442 instance = self.instance
11445 assert set([instance.name]) == self.owned_locks(locking.LEVEL_INSTANCE)
11446 assert (self.owned_locks(locking.LEVEL_NODE) ==
11447 self.owned_locks(locking.LEVEL_NODE_RES))
11449 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
11451 raise errors.OpExecError("Cannot activate block device to grow")
11453 feedback_fn("Growing disk %s of instance '%s' by %s" %
11454 (self.op.disk, instance.name,
11455 utils.FormatUnit(self.op.amount, "h")))
11457 # First run all grow ops in dry-run mode
11458 for node in instance.all_nodes:
11459 self.cfg.SetDiskID(disk, node)
11460 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
11461 result.Raise("Grow request failed to node %s" % node)
11463 # We know that (as far as we can test) operations across different
11464 # nodes will succeed, time to run it for real
11465 for node in instance.all_nodes:
11466 self.cfg.SetDiskID(disk, node)
11467 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
11468 result.Raise("Grow request failed to node %s" % node)
11470 # TODO: Rewrite code to work properly
11471 # DRBD goes into sync mode for a short amount of time after executing the
11472 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
11473 # calling "resize" in sync mode fails. Sleeping for a short amount of
# time is a work-around.
time.sleep(5)

disk.RecordGrow(self.op.amount)
11478 self.cfg.Update(instance, feedback_fn)
11480 # Changes have been recorded, release node lock
11481 _ReleaseLocks(self, locking.LEVEL_NODE)
11483 # Downgrade lock while waiting for sync
11484 self.glm.downgrade(locking.LEVEL_INSTANCE)
11486 if self.op.wait_for_sync:
disk_abort = not _WaitForSync(self, instance, disks=[disk])
if disk_abort:
  self.proc.LogWarning("Disk sync-ing has not returned a good"
                       " status; please check the instance")
11491 if instance.admin_state != constants.ADMINST_UP:
11492 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
11493 elif instance.admin_state != constants.ADMINST_UP:
11494 self.proc.LogWarning("Not shutting down the disk even if the instance is"
11495 " not supposed to be running because no wait for"
11496 " sync mode was requested")
11498 assert self.owned_locks(locking.LEVEL_NODE_RES)
11499 assert set([instance.name]) == self.owned_locks(locking.LEVEL_INSTANCE)
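# Illustrative only: a grow of disk 0 by 2 GB would be requested with
# something like "gnt-instance grow-disk inst1.example.com 0 2g".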
11502 class LUInstanceQueryData(NoHooksLU):
11503 """Query runtime instance data.
11508 def ExpandNames(self):
11509 self.needed_locks = {}
11511 # Use locking if requested or when non-static information is wanted
11512 if not (self.op.static or self.op.use_locking):
11513 self.LogWarning("Non-static data requested, locks need to be acquired")
11514 self.op.use_locking = True
11516 if self.op.instances or not self.op.use_locking:
11517 # Expand instance names right here
11518 self.wanted_names = _GetWantedInstances(self, self.op.instances)
else:
  # Will use acquired locks
  self.wanted_names = None
11523 if self.op.use_locking:
11524 self.share_locks = _ShareAll()
11526 if self.wanted_names is None:
  self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
else:
  self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
11531 self.needed_locks[locking.LEVEL_NODE] = []
11532 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
11534 def DeclareLocks(self, level):
11535 if self.op.use_locking and level == locking.LEVEL_NODE:
11536 self._LockInstancesNodes()
11538 def CheckPrereq(self):
11539 """Check prerequisites.
This only checks the optional instance list against the existing names.

"""
11544 if self.wanted_names is None:
11545 assert self.op.use_locking, "Locking was not used"
11546 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
11548 self.wanted_instances = \
11549 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
11551 def _ComputeBlockdevStatus(self, node, instance_name, dev):
11552 """Returns the status of a block device
11555 if self.op.static or not node:
11558 self.cfg.SetDiskID(dev, node)
11560 result = self.rpc.call_blockdev_find(node, dev)
11564 result.Raise("Can't compute disk status for %s" % instance_name)
11566 status = result.payload
11570 return (status.dev_path, status.major, status.minor,
11571 status.sync_percent, status.estimated_time,
11572 status.is_degraded, status.ldisk_status)
11574 def _ComputeDiskStatus(self, instance, snode, dev):
11575 """Compute block device status.
11578 if dev.dev_type in constants.LDS_DRBD:
11579 # we change the snode then (otherwise we use the one passed in)
if dev.logical_id[0] == instance.primary_node:
  snode = dev.logical_id[1]
else:
  snode = dev.logical_id[0]
11585 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
11586 instance.name, dev)
11587 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
if dev.children:
  dev_children = map(compat.partial(self._ComputeDiskStatus,
                                    instance, snode),
                     dev.children)
else:
  dev_children = []

return {
  "iv_name": dev.iv_name,
11598 "dev_type": dev.dev_type,
11599 "logical_id": dev.logical_id,
11600 "physical_id": dev.physical_id,
11601 "pstatus": dev_pstatus,
11602 "sstatus": dev_sstatus,
11603 "children": dev_children,
11608 def Exec(self, feedback_fn):
11609 """Gather and return data"""
11612 cluster = self.cfg.GetClusterInfo()
11614 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
11615 for i in self.wanted_instances)
for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
  if self.op.static or pnode.offline:
    remote_state = None
    if pnode.offline:
      self.LogWarning("Primary node %s is marked offline, returning static"
                      " information only for instance %s" %
                      (pnode.name, instance.name))
  else:
    remote_info = self.rpc.call_instance_info(instance.primary_node,
                                              instance.name,
                                              instance.hypervisor)
11627 remote_info.Raise("Error checking node %s" % instance.primary_node)
11628 remote_info = remote_info.payload
if remote_info and "state" in remote_info:
  remote_state = "up"
else:
  if instance.admin_state == constants.ADMINST_UP:
    remote_state = "down"
  else:
    remote_state = instance.admin_state
disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
            instance.disks)
11640 result[instance.name] = {
11641 "name": instance.name,
11642 "config_state": instance.admin_state,
11643 "run_state": remote_state,
11644 "pnode": instance.primary_node,
11645 "snodes": instance.secondary_nodes,
11647 # this happens to be the same format used for hooks
11648 "nics": _NICListToTuple(self, instance.nics),
11649 "disk_template": instance.disk_template,
11651 "hypervisor": instance.hypervisor,
11652 "network_port": instance.network_port,
11653 "hv_instance": instance.hvparams,
11654 "hv_actual": cluster.FillHV(instance, skip_globals=True),
11655 "be_instance": instance.beparams,
11656 "be_actual": cluster.FillBE(instance),
11657 "os_instance": instance.osparams,
11658 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
11659 "serial_no": instance.serial_no,
11660 "mtime": instance.mtime,
11661 "ctime": instance.ctime,
11662 "uuid": instance.uuid,
11668 class LUInstanceSetParams(LogicalUnit):
11669 """Modifies an instances's parameters.
11672 HPATH = "instance-modify"
HTYPE = constants.HTYPE_INSTANCE
REQ_BGL = False
11676 def CheckArguments(self):
11677 if not (self.op.nics or self.op.disks or self.op.disk_template or
11678 self.op.hvparams or self.op.beparams or self.op.os_name or
11679 self.op.online_inst or self.op.offline_inst or
11680 self.op.runtime_mem):
11681 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
11683 if self.op.hvparams:
_CheckGlobalHvParams(self.op.hvparams)

disk_addremove = 0
for disk_op, disk_dict in self.op.disks:
11689 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
if disk_op == constants.DDM_REMOVE:
  disk_addremove += 1
  continue
elif disk_op == constants.DDM_ADD:
  disk_addremove += 1
else:
  if not isinstance(disk_op, int):
11697 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
11698 if not isinstance(disk_dict, dict):
11699 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
11700 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
11702 if disk_op == constants.DDM_ADD:
11703 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
11704 if mode not in constants.DISK_ACCESS_SET:
11705 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
11706 errors.ECODE_INVAL)
size = disk_dict.get(constants.IDISK_SIZE, None)
if size is None:
  raise errors.OpPrereqError("Required disk parameter size missing",
                             errors.ECODE_INVAL)
try:
  size = int(size)
except (TypeError, ValueError), err:
11714 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
11715 str(err), errors.ECODE_INVAL)
11716 disk_dict[constants.IDISK_SIZE] = size
else:
  # modification of disk
  if constants.IDISK_SIZE in disk_dict:
11720 raise errors.OpPrereqError("Disk size change not possible, use"
11721 " grow-disk", errors.ECODE_INVAL)
11723 if disk_addremove > 1:
11724 raise errors.OpPrereqError("Only one disk add or remove operation"
11725 " supported at a time", errors.ECODE_INVAL)
11727 if self.op.disks and self.op.disk_template is not None:
11728 raise errors.OpPrereqError("Disk template conversion and other disk"
11729 " changes not supported at the same time",
11730 errors.ECODE_INVAL)
11732 if (self.op.disk_template and
11733 self.op.disk_template in constants.DTS_INT_MIRROR and
11734 self.op.remote_node is None):
11735 raise errors.OpPrereqError("Changing the disk template to a mirrored"
11736 " one requires specifying a secondary node",
11737 errors.ECODE_INVAL)
nic_addremove = 0
for nic_op, nic_dict in self.op.nics:
  utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
  if nic_op == constants.DDM_REMOVE:
    nic_addremove += 1
    continue
  elif nic_op == constants.DDM_ADD:
    nic_addremove += 1
  else:
    if not isinstance(nic_op, int):
11750 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
11751 if not isinstance(nic_dict, dict):
11752 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
11753 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
11755 # nic_dict should be a dict
11756 nic_ip = nic_dict.get(constants.INIC_IP, None)
11757 if nic_ip is not None:
if nic_ip.lower() == constants.VALUE_NONE:
  nic_dict[constants.INIC_IP] = None
else:
  if not netutils.IPAddress.IsValid(nic_ip):
11762 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
11763 errors.ECODE_INVAL)
11765 nic_bridge = nic_dict.get("bridge", None)
11766 nic_link = nic_dict.get(constants.INIC_LINK, None)
11767 if nic_bridge and nic_link:
11768 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
11769 " at the same time", errors.ECODE_INVAL)
11770 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
11771 nic_dict["bridge"] = None
11772 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
11773 nic_dict[constants.INIC_LINK] = None
if nic_op == constants.DDM_ADD:
  nic_mac = nic_dict.get(constants.INIC_MAC, None)
  if nic_mac is None:
    nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
else:
  if constants.INIC_MAC in nic_dict:
    nic_mac = nic_dict[constants.INIC_MAC]
11782 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11783 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
11785 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
11786 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
11787 " modifying an existing nic",
11788 errors.ECODE_INVAL)
11790 if nic_addremove > 1:
11791 raise errors.OpPrereqError("Only one NIC add or remove operation"
11792 " supported at a time", errors.ECODE_INVAL)
11794 def ExpandNames(self):
11795 self._ExpandAndLockInstance()
11796 # Can't even acquire node locks in shared mode as upcoming changes in
11797 # Ganeti 2.6 will start to modify the node object on disk conversion
11798 self.needed_locks[locking.LEVEL_NODE] = []
11799 self.needed_locks[locking.LEVEL_NODE_RES] = []
11800 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
11802 def DeclareLocks(self, level):
11803 if level == locking.LEVEL_NODE:
11804 self._LockInstancesNodes()
11805 if self.op.disk_template and self.op.remote_node:
11806 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
11807 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
11808 elif level == locking.LEVEL_NODE_RES and self.op.disk_template:
11810 self.needed_locks[locking.LEVEL_NODE_RES] = \
11811 self.needed_locks[locking.LEVEL_NODE][:]
11813 def BuildHooksEnv(self):
11814 """Build hooks env.
This runs on the master, primary and secondaries.

"""
args = {}
if constants.BE_MINMEM in self.be_new:
11821 args["minmem"] = self.be_new[constants.BE_MINMEM]
11822 if constants.BE_MAXMEM in self.be_new:
11823 args["maxmem"] = self.be_new[constants.BE_MAXMEM]
11824 if constants.BE_VCPUS in self.be_new:
11825 args["vcpus"] = self.be_new[constants.BE_VCPUS]
11826 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
11827 # information at all.
args["nics"] = []
nic_override = dict(self.op.nics)
11831 for idx, nic in enumerate(self.instance.nics):
11832 if idx in nic_override:
11833 this_nic_override = nic_override[idx]
11835 this_nic_override = {}
11836 if constants.INIC_IP in this_nic_override:
  ip = this_nic_override[constants.INIC_IP]
else:
  ip = nic.ip
if constants.INIC_MAC in this_nic_override:
  mac = this_nic_override[constants.INIC_MAC]
else:
  mac = nic.mac
if idx in self.nic_pnew:
  nicparams = self.nic_pnew[idx]
else:
  nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
11848 mode = nicparams[constants.NIC_MODE]
11849 link = nicparams[constants.NIC_LINK]
11850 args["nics"].append((ip, mac, mode, link))
11851 if constants.DDM_ADD in nic_override:
11852 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
11853 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
11854 nicparams = self.nic_pnew[constants.DDM_ADD]
11855 mode = nicparams[constants.NIC_MODE]
11856 link = nicparams[constants.NIC_LINK]
11857 args["nics"].append((ip, mac, mode, link))
11858 elif constants.DDM_REMOVE in nic_override:
11859 del args["nics"][-1]
11861 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
11862 if self.op.disk_template:
11863 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
11864 if self.op.runtime_mem:
11865 env["RUNTIME_MEMORY"] = self.op.runtime_mem
11869 def BuildHooksNodes(self):
11870 """Build hooks nodes.
11873 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
11876 def CheckPrereq(self):
11877 """Check prerequisites.
This only checks the instance list against the existing names.

"""
# checking the new params on the primary/secondary nodes
11884 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11885 cluster = self.cluster = self.cfg.GetClusterInfo()
11886 assert self.instance is not None, \
11887 "Cannot retrieve locked instance %s" % self.op.instance_name
11888 pnode = instance.primary_node
11889 nodelist = list(instance.all_nodes)
11890 pnode_info = self.cfg.GetNodeInfo(pnode)
11891 self.diskparams = self.cfg.GetNodeGroup(pnode_info.group).diskparams
11894 if self.op.os_name and not self.op.force:
11895 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
11896 self.op.force_variant)
11897 instance_os = self.op.os_name
11899 instance_os = instance.os
11901 if self.op.disk_template:
11902 if instance.disk_template == self.op.disk_template:
11903 raise errors.OpPrereqError("Instance already has disk template %s" %
11904 instance.disk_template, errors.ECODE_INVAL)
11906 if (instance.disk_template,
11907 self.op.disk_template) not in self._DISK_CONVERSIONS:
11908 raise errors.OpPrereqError("Unsupported disk template conversion from"
11909 " %s to %s" % (instance.disk_template,
11910 self.op.disk_template),
11911 errors.ECODE_INVAL)
11912 _CheckInstanceState(self, instance, INSTANCE_DOWN,
11913 msg="cannot change disk template")
11914 if self.op.disk_template in constants.DTS_INT_MIRROR:
11915 if self.op.remote_node == pnode:
11916 raise errors.OpPrereqError("Given new secondary node %s is the same"
11917 " as the primary node of the instance" %
11918 self.op.remote_node, errors.ECODE_STATE)
11919 _CheckNodeOnline(self, self.op.remote_node)
11920 _CheckNodeNotDrained(self, self.op.remote_node)
11921 # FIXME: here we assume that the old instance type is DT_PLAIN
11922 assert instance.disk_template == constants.DT_PLAIN
11923 disks = [{constants.IDISK_SIZE: d.size,
11924 constants.IDISK_VG: d.logical_id[0]}
11925 for d in instance.disks]
11926 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
11927 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
11929 snode_info = self.cfg.GetNodeInfo(self.op.remote_node)
11930 snode_group = self.cfg.GetNodeGroup(snode_info.group)
11931 ipolicy = _CalculateGroupIPolicy(cluster, snode_group)
11932 _CheckTargetNodeIPolicy(self, ipolicy, instance, snode_info,
11933 ignore=self.op.ignore_ipolicy)
11934 if pnode_info.group != snode_info.group:
11935 self.LogWarning("The primary and secondary nodes are in two"
11936 " different node groups; the disk parameters"
11937 " from the first disk's node group will be"
11940 # hvparams processing
11941 if self.op.hvparams:
11942 hv_type = instance.hypervisor
11943 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
11944 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
11945 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
11948 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
11949 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
11950 self.hv_proposed = self.hv_new = hv_new # the new actual values
11951 self.hv_inst = i_hvdict # the new dict (without defaults)
else:
  self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
                                          instance.hvparams)
  self.hv_new = self.hv_inst = {}
11957 # beparams processing
11958 if self.op.beparams:
i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
                             use_none=True)
objects.UpgradeBeParams(i_bedict)
11962 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11963 be_new = cluster.SimpleFillBE(i_bedict)
11964 self.be_proposed = self.be_new = be_new # the new actual values
11965 self.be_inst = i_bedict # the new dict (without defaults)
else:
  self.be_new = self.be_inst = {}
  self.be_proposed = cluster.SimpleFillBE(instance.beparams)
11969 be_old = cluster.FillBE(instance)
# CPU param validation -- checking every time a parameter is
# changed to cover all cases where either CPU mask or vcpus have
# changed
11974 if (constants.BE_VCPUS in self.be_proposed and
11975 constants.HV_CPU_MASK in self.hv_proposed):
cpu_list = \
  utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
11978 # Verify mask is consistent with number of vCPUs. Can skip this
11979 # test if only 1 entry in the CPU mask, which means same mask
11980 # is applied to all vCPUs.
11981 if (len(cpu_list) > 1 and
11982 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11983 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11985 (self.be_proposed[constants.BE_VCPUS],
11986 self.hv_proposed[constants.HV_CPU_MASK]),
11987 errors.ECODE_INVAL)
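# (Illustrative: a multi-CPU mask such as "0-1:3:all" carries one entry
# per vCPU, which is why its length must equal BE_VCPUS unless a single
# mask is given that applies to all vCPUs.)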
11989 # Only perform this test if a new CPU mask is given
11990 if constants.HV_CPU_MASK in self.hv_new:
11991 # Calculate the largest CPU number requested
11992 max_requested_cpu = max(map(max, cpu_list))
11993 # Check that all of the instance's nodes have enough physical CPUs to
11994 # satisfy the requested CPU mask
11995 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11996 max_requested_cpu + 1, instance.hypervisor)
11998 # osparams processing
11999 if self.op.osparams:
12000 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
12001 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
self.os_inst = i_osdict # the new dict (without defaults)

self.warn = []

#TODO(dynmem): do the appropriate check involving MINMEM
12009 if (constants.BE_MAXMEM in self.op.beparams and not self.op.force and
12010 be_new[constants.BE_MAXMEM] > be_old[constants.BE_MAXMEM]):
12011 mem_check_list = [pnode]
12012 if be_new[constants.BE_AUTO_BALANCE]:
12013 # either we changed auto_balance to yes or it was from before
12014 mem_check_list.extend(instance.secondary_nodes)
12015 instance_info = self.rpc.call_instance_info(pnode, instance.name,
12016 instance.hypervisor)
12017 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
12018 [instance.hypervisor])
12019 pninfo = nodeinfo[pnode]
msg = pninfo.fail_msg
if msg:
  # Assume the primary node is unreachable and go ahead
  self.warn.append("Can't get info from primary node %s: %s" %
                   (pnode, msg))
else:
  (_, _, (pnhvinfo, )) = pninfo.payload
12027 if not isinstance(pnhvinfo.get("memory_free", None), int):
12028 self.warn.append("Node data from primary node %s doesn't contain"
12029 " free memory information" % pnode)
12030 elif instance_info.fail_msg:
12031 self.warn.append("Can't get instance runtime information: %s" %
12032 instance_info.fail_msg)
else:
  if instance_info.payload:
    current_mem = int(instance_info.payload["memory"])
  else:
    # Assume instance not running
    # (there is a slight race condition here, but it's not very
    # probable, and we have no other way to check)
    # TODO: Describe race condition
    current_mem = 0

#TODO(dynmem): do the appropriate check involving MINMEM
miss_mem = (be_new[constants.BE_MAXMEM] - current_mem -
            pnhvinfo["memory_free"])
if miss_mem > 0:
  raise errors.OpPrereqError("This change will prevent the instance"
                             " from starting, due to %d MB of memory"
                             " missing on its primary node" %
                             miss_mem,
                             errors.ECODE_NORES)
12052 if be_new[constants.BE_AUTO_BALANCE]:
12053 for node, nres in nodeinfo.items():
if node not in instance.secondary_nodes:
  continue
nres.Raise("Can't get info from secondary node %s" % node,
12057 prereq=True, ecode=errors.ECODE_STATE)
12058 (_, _, (nhvinfo, )) = nres.payload
12059 if not isinstance(nhvinfo.get("memory_free", None), int):
12060 raise errors.OpPrereqError("Secondary node %s didn't return free"
12061 " memory information" % node,
12062 errors.ECODE_STATE)
12063 #TODO(dynmem): do the appropriate check involving MINMEM
12064 elif be_new[constants.BE_MAXMEM] > nhvinfo["memory_free"]:
12065 raise errors.OpPrereqError("This change will prevent the instance"
12066 " from failover to its secondary node"
12067 " %s, due to not enough memory" % node,
12068 errors.ECODE_STATE)
12070 if self.op.runtime_mem:
remote_info = self.rpc.call_instance_info(instance.primary_node,
                                          instance.name,
                                          instance.hypervisor)
12074 remote_info.Raise("Error checking node %s" % instance.primary_node)
12075 if not remote_info.payload: # not running already
12076 raise errors.OpPrereqError("Instance %s is not running" % instance.name,
12077 errors.ECODE_STATE)
12079 current_memory = remote_info.payload["memory"]
12080 if (not self.op.force and
12081 (self.op.runtime_mem > self.be_proposed[constants.BE_MAXMEM] or
12082 self.op.runtime_mem < self.be_proposed[constants.BE_MINMEM])):
12083 raise errors.OpPrereqError("Instance %s must have memory between %d"
12084 " and %d MB of memory unless --force is"
12085 " given" % (instance.name,
12086 self.be_proposed[constants.BE_MINMEM],
12087 self.be_proposed[constants.BE_MAXMEM]),
12088 errors.ECODE_INVAL)
12090 if self.op.runtime_mem > current_memory:
_CheckNodeFreeMemory(self, instance.primary_node,
                     "ballooning memory for instance %s" %
                     instance.name,
                     self.op.runtime_mem - current_memory,
                     instance.hypervisor)
# NIC processing
self.nic_pnew = {}
self.nic_pinst = {}
12100 for nic_op, nic_dict in self.op.nics:
if nic_op == constants.DDM_REMOVE:
  if not instance.nics:
    raise errors.OpPrereqError("Instance has no NICs, cannot remove",
                               errors.ECODE_INVAL)
  continue
if nic_op != constants.DDM_ADD:
  # an existing nic
  if not instance.nics:
12109 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
12110 " no NICs" % nic_op,
12111 errors.ECODE_INVAL)
12112 if nic_op < 0 or nic_op >= len(instance.nics):
12113 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
12115 (nic_op, len(instance.nics) - 1),
12116 errors.ECODE_INVAL)
  old_nic_params = instance.nics[nic_op].nicparams
  old_nic_ip = instance.nics[nic_op].ip
else:
  old_nic_params = {}
  old_nic_ip = None
12123 update_params_dict = dict([(key, nic_dict[key])
12124 for key in constants.NICS_PARAMETERS
12125 if key in nic_dict])
12127 if "bridge" in nic_dict:
12128 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
12130 new_nic_params = _GetUpdatedParams(old_nic_params,
12131 update_params_dict)
12132 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
12133 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
12134 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
12135 self.nic_pinst[nic_op] = new_nic_params
12136 self.nic_pnew[nic_op] = new_filled_nic_params
12137 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
12139 if new_nic_mode == constants.NIC_MODE_BRIDGED:
12140 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
if msg:
  msg = "Error checking bridges on node %s: %s" % (pnode, msg)
  if self.op.force:
    self.warn.append(msg)
  else:
    raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
12148 if new_nic_mode == constants.NIC_MODE_ROUTED:
12149 if constants.INIC_IP in nic_dict:
  nic_ip = nic_dict[constants.INIC_IP]
else:
  nic_ip = old_nic_ip
if nic_ip is None:
  raise errors.OpPrereqError("Cannot set the nic ip to None"
                             " on a routed nic", errors.ECODE_INVAL)
12156 if constants.INIC_MAC in nic_dict:
12157 nic_mac = nic_dict[constants.INIC_MAC]
12158 if nic_mac is None:
12159 raise errors.OpPrereqError("Cannot set the nic mac to None",
12160 errors.ECODE_INVAL)
12161 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
12162 # otherwise generate the mac
  nic_dict[constants.INIC_MAC] = \
    self.cfg.GenerateMAC(self.proc.GetECId())
else:
  # or validate/reserve the current one
  try:
    self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
12169 except errors.ReservationError:
12170 raise errors.OpPrereqError("MAC address %s already in use"
12171 " in cluster" % nic_mac,
12172 errors.ECODE_NOTUNIQUE)
12175 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
12176 raise errors.OpPrereqError("Disk operations not supported for"
12177 " diskless instances",
12178 errors.ECODE_INVAL)
12179 for disk_op, _ in self.op.disks:
12180 if disk_op == constants.DDM_REMOVE:
12181 if len(instance.disks) == 1:
12182 raise errors.OpPrereqError("Cannot remove the last disk of"
12183 " an instance", errors.ECODE_INVAL)
12184 _CheckInstanceState(self, instance, INSTANCE_DOWN,
12185 msg="cannot remove disks")
12187 if (disk_op == constants.DDM_ADD and
12188 len(instance.disks) >= constants.MAX_DISKS):
12189 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
12190 " add more" % constants.MAX_DISKS,
12191 errors.ECODE_STATE)
if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
  # an existing disk
  if disk_op < 0 or disk_op >= len(instance.disks):
    raise errors.OpPrereqError("Invalid disk index %s, valid values"
                               " are 0 to %s" %
                               (disk_op, len(instance.disks)),
12198 errors.ECODE_INVAL)
12200 # disabling the instance
12201 if self.op.offline_inst:
12202 _CheckInstanceState(self, instance, INSTANCE_DOWN,
12203 msg="cannot change instance state to offline")
12205 # enabling the instance
12206 if self.op.online_inst:
12207 _CheckInstanceState(self, instance, INSTANCE_OFFLINE,
12208 msg="cannot make instance go online")
12210 def _ConvertPlainToDrbd(self, feedback_fn):
12211 """Converts an instance from plain to drbd.
12214 feedback_fn("Converting template to drbd")
12215 instance = self.instance
12216 pnode = instance.primary_node
12217 snode = self.op.remote_node
12219 assert instance.disk_template == constants.DT_PLAIN
12221 # create a fake disk info for _GenerateDiskTemplate
12222 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
12223 constants.IDISK_VG: d.logical_id[0]}
12224 for d in instance.disks]
12225 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
12226 instance.name, pnode, [snode],
disk_info, None, None, 0, feedback_fn,
self.diskparams)
12229 info = _GetInstanceInfoText(instance)
12230 feedback_fn("Creating aditional volumes...")
12231 # first, create the missing data and meta devices
12232 for disk in new_disks:
12233 # unfortunately this is... not too nice
_CreateSingleBlockDev(self, pnode, instance, disk.children[1],
                      info, True)
for child in disk.children:
12237 _CreateSingleBlockDev(self, snode, instance, child, info, True)
# at this stage, all new LVs have been created, we can rename the
# old ones
feedback_fn("Renaming original volumes...")
12241 rename_list = [(o, n.children[0].logical_id)
12242 for (o, n) in zip(instance.disks, new_disks)]
12243 result = self.rpc.call_blockdev_rename(pnode, rename_list)
12244 result.Raise("Failed to rename original LVs")
12246 feedback_fn("Initializing DRBD devices...")
12247 # all child devices are in place, we can now create the DRBD devices
12248 for disk in new_disks:
12249 for node in [pnode, snode]:
12250 f_create = node == pnode
12251 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
12253 # at this point, the instance has been modified
12254 instance.disk_template = constants.DT_DRBD8
12255 instance.disks = new_disks
12256 self.cfg.Update(instance, feedback_fn)
12258 # Release node locks while waiting for sync
12259 _ReleaseLocks(self, locking.LEVEL_NODE)
12261 # disks are created, waiting for sync
12262 disk_abort = not _WaitForSync(self, instance,
                              oneshot=not self.op.wait_for_sync)
if disk_abort:
  raise errors.OpExecError("There are some degraded disks for"
                           " this instance, please clean up manually")
12268 # Node resource locks will be released by caller
12270 def _ConvertDrbdToPlain(self, feedback_fn):
12271 """Converts an instance from drbd to plain.
12274 instance = self.instance
12276 assert len(instance.secondary_nodes) == 1
12277 assert instance.disk_template == constants.DT_DRBD8
12279 pnode = instance.primary_node
12280 snode = instance.secondary_nodes[0]
12281 feedback_fn("Converting template to plain")
12283 old_disks = instance.disks
12284 new_disks = [d.children[0] for d in old_disks]
12286 # copy over size and mode
12287 for parent, child in zip(old_disks, new_disks):
12288 child.size = parent.size
12289 child.mode = parent.mode
12291 # update instance structure
12292 instance.disks = new_disks
12293 instance.disk_template = constants.DT_PLAIN
12294 self.cfg.Update(instance, feedback_fn)
12296 # Release locks in case removing disks takes a while
12297 _ReleaseLocks(self, locking.LEVEL_NODE)
12299 feedback_fn("Removing volumes on the secondary node...")
12300 for disk in old_disks:
12301 self.cfg.SetDiskID(disk, snode)
msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
if msg:
  self.LogWarning("Could not remove block device %s on node %s,"
                  " continuing anyway: %s", disk.iv_name, snode, msg)
12307 feedback_fn("Removing unneeded volumes on the primary node...")
12308 for idx, disk in enumerate(old_disks):
12309 meta = disk.children[1]
12310 self.cfg.SetDiskID(meta, pnode)
msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
if msg:
  self.LogWarning("Could not remove metadata for disk %d on node %s,"
                  " continuing anyway: %s", idx, pnode, msg)
12316 # this is a DRBD disk, return its port to the pool
12317 for disk in old_disks:
12318 tcp_port = disk.logical_id[2]
12319 self.cfg.AddTcpUdpPort(tcp_port)
12321 # Node resource locks will be released by caller
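# Only the two conversions registered in _DISK_CONVERSIONS below are
# supported; anything else is rejected in CheckPrereq.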
12323 def Exec(self, feedback_fn):
12324 """Modifies an instance.
All parameters take effect only at the next restart of the instance.

"""
12329 # Process here the warnings from CheckPrereq, as we don't have a
12330 # feedback_fn there.
12331 for warn in self.warn:
12332 feedback_fn("WARNING: %s" % warn)
12334 assert ((self.op.disk_template is None) ^
12335 bool(self.owned_locks(locking.LEVEL_NODE_RES))), \
12336 "Not owning any node resource locks"
result = []
instance = self.instance
12342 if self.op.runtime_mem:
12343 rpcres = self.rpc.call_instance_balloon_memory(instance.primary_node,
12345 self.op.runtime_mem)
12346 rpcres.Raise("Cannot modify instance runtime memory")
12347 result.append(("runtime_memory", self.op.runtime_mem))
12350 for disk_op, disk_dict in self.op.disks:
12351 if disk_op == constants.DDM_REMOVE:
12352 # remove the last disk
12353 device = instance.disks.pop()
12354 device_idx = len(instance.disks)
12355 for node, disk in device.ComputeNodeTree(instance.primary_node):
12356 self.cfg.SetDiskID(disk, node)
msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
if msg:
  self.LogWarning("Could not remove disk/%d on node %s: %s,"
                  " continuing anyway", device_idx, node, msg)
12361 result.append(("disk/%d" % device_idx, "remove"))
12363 # if this is a DRBD disk, return its port to the pool
12364 if device.dev_type in constants.LDS_DRBD:
12365 tcp_port = device.logical_id[2]
12366 self.cfg.AddTcpUdpPort(tcp_port)
12367 elif disk_op == constants.DDM_ADD:
12369 if instance.disk_template in (constants.DT_FILE,
12370 constants.DT_SHARED_FILE):
12371 file_driver, file_path = instance.disks[0].logical_id
  file_path = os.path.dirname(file_path)
else:
  file_driver = file_path = None
12375 disk_idx_base = len(instance.disks)
12376 new_disk = _GenerateDiskTemplate(self,
12377 instance.disk_template,
12378 instance.name, instance.primary_node,
instance.secondary_nodes,
[disk_dict],
file_path,
file_driver,
disk_idx_base,
feedback_fn,
self.diskparams)[0]
12386 instance.disks.append(new_disk)
12387 info = _GetInstanceInfoText(instance)
12389 logging.info("Creating volume %s for instance %s",
12390 new_disk.iv_name, instance.name)
12391 # Note: this needs to be kept in sync with _CreateDisks
12393 for node in instance.all_nodes:
f_create = node == instance.primary_node
try:
  _CreateBlockDev(self, node, instance, new_disk,
                  f_create, info, f_create)
12398 except errors.OpExecError, err:
12399 self.LogWarning("Failed to create volume %s (%s) on"
12401 new_disk.iv_name, new_disk, node, err)
12402 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
12403 (new_disk.size, new_disk.mode)))
else:
  # change a given disk
  instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
  result.append(("disk.mode/%d" % disk_op,
                 disk_dict[constants.IDISK_MODE]))
12410 if self.op.disk_template:
12412 check_nodes = set(instance.all_nodes)
12413 if self.op.remote_node:
12414 check_nodes.add(self.op.remote_node)
12415 for level in [locking.LEVEL_NODE, locking.LEVEL_NODE_RES]:
12416 owned = self.owned_locks(level)
12417 assert not (check_nodes - owned), \
12418 ("Not owning the correct locks, owning %r, expected at least %r" %
12419 (owned, check_nodes))
r_shut = _ShutdownInstanceDisks(self, instance)
if not r_shut:
  raise errors.OpExecError("Cannot shutdown instance disks, unable to"
12424 " proceed with disk template conversion")
mode = (instance.disk_template, self.op.disk_template)
try:
  self._DISK_CONVERSIONS[mode](self, feedback_fn)
except:
  self.cfg.ReleaseDRBDMinors(instance.name)
  raise
12431 result.append(("disk_template", self.op.disk_template))
12433 assert instance.disk_template == self.op.disk_template, \
12434 ("Expected disk template '%s', found '%s'" %
12435 (self.op.disk_template, instance.disk_template))
12437 # Release node and resource locks if there are any (they might already have
12438 # been released during disk conversion)
12439 _ReleaseLocks(self, locking.LEVEL_NODE)
12440 _ReleaseLocks(self, locking.LEVEL_NODE_RES)
12443 for nic_op, nic_dict in self.op.nics:
12444 if nic_op == constants.DDM_REMOVE:
12445 # remove the last nic
12446 del instance.nics[-1]
12447 result.append(("nic.%d" % len(instance.nics), "remove"))
12448 elif nic_op == constants.DDM_ADD:
12449 # mac and bridge should be set, by now
12450 mac = nic_dict[constants.INIC_MAC]
12451 ip = nic_dict.get(constants.INIC_IP, None)
12452 nicparams = self.nic_pinst[constants.DDM_ADD]
12453 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
12454 instance.nics.append(new_nic)
12455 result.append(("nic.%d" % (len(instance.nics) - 1),
12456 "add:mac=%s,ip=%s,mode=%s,link=%s" %
12457 (new_nic.mac, new_nic.ip,
12458 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
                self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
               )))
else:
  for key in (constants.INIC_MAC, constants.INIC_IP):
12463 if key in nic_dict:
12464 setattr(instance.nics[nic_op], key, nic_dict[key])
12465 if nic_op in self.nic_pinst:
12466 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
12467 for key, val in nic_dict.iteritems():
12468 result.append(("nic.%s/%d" % (key, nic_op), val))
12471 if self.op.hvparams:
12472 instance.hvparams = self.hv_inst
12473 for key, val in self.op.hvparams.iteritems():
12474 result.append(("hv/%s" % key, val))
12477 if self.op.beparams:
12478 instance.beparams = self.be_inst
12479 for key, val in self.op.beparams.iteritems():
12480 result.append(("be/%s" % key, val))
12483 if self.op.os_name:
12484 instance.os = self.op.os_name
12487 if self.op.osparams:
12488 instance.osparams = self.os_inst
12489 for key, val in self.op.osparams.iteritems():
12490 result.append(("os/%s" % key, val))
12492 # online/offline instance
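# Note: "online" here only returns an offline instance to cluster
# control; the resulting state is ADMINST_DOWN (configured but not
# started), which is why MarkInstanceDown is the correct call below.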
12493 if self.op.online_inst:
12494 self.cfg.MarkInstanceDown(instance.name)
12495 result.append(("admin_state", constants.ADMINST_DOWN))
12496 if self.op.offline_inst:
12497 self.cfg.MarkInstanceOffline(instance.name)
12498 result.append(("admin_state", constants.ADMINST_OFFLINE))
12500 self.cfg.Update(instance, feedback_fn)
12502 assert not (self.owned_locks(locking.LEVEL_NODE_RES) or
12503 self.owned_locks(locking.LEVEL_NODE)), \
12504 "All node locks should have been released by now"
12508 _DISK_CONVERSIONS = {
(constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
(constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
}
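# Illustrative only: these conversions are driven from the CLI with
# something like "gnt-instance modify -t drbd -n node2 inst1" or
# "gnt-instance modify -t plain inst1".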
12514 class LUInstanceChangeGroup(LogicalUnit):
12515 HPATH = "instance-change-group"
HTYPE = constants.HTYPE_INSTANCE
REQ_BGL = False
12519 def ExpandNames(self):
12520 self.share_locks = _ShareAll()
12521 self.needed_locks = {
12522 locking.LEVEL_NODEGROUP: [],
locking.LEVEL_NODE: [],
}

self._ExpandAndLockInstance()
12528 if self.op.target_groups:
12529 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
                                  self.op.target_groups)
else:
  self.req_target_uuids = None
12534 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12536 def DeclareLocks(self, level):
12537 if level == locking.LEVEL_NODEGROUP:
12538 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12540 if self.req_target_uuids:
12541 lock_groups = set(self.req_target_uuids)
12543 # Lock all groups used by instance optimistically; this requires going
12544 # via the node before it's locked, requiring verification later on
12545 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
12546 lock_groups.update(instance_groups)
else:
  # No target groups, need to lock all of them
  lock_groups = locking.ALL_SET
12551 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12553 elif level == locking.LEVEL_NODE:
12554 if self.req_target_uuids:
12555 # Lock all nodes used by instances
12556 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12557 self._LockInstancesNodes()
12559 # Lock all nodes in all potential target groups
12560 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
12561 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
12562 member_nodes = [node_name
12563 for group in lock_groups
12564 for node_name in self.cfg.GetNodeGroup(group).members]
12565 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
else:
  # Lock all nodes as all groups are potential targets
  self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12570 def CheckPrereq(self):
12571 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12572 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12573 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12575 assert (self.req_target_uuids is None or
12576 owned_groups.issuperset(self.req_target_uuids))
12577 assert owned_instances == set([self.op.instance_name])
12579 # Get instance information
12580 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
12582 # Check if node groups for locked instance are still correct
12583 assert owned_nodes.issuperset(self.instance.all_nodes), \
12584 ("Instance %s's nodes changed while we kept the lock" %
12585 self.op.instance_name)
inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
                                       owned_groups)
12590 if self.req_target_uuids:
12591 # User requested specific target groups
12592 self.target_uuids = self.req_target_uuids
else:
  # All groups except those used by the instance are potential targets
  self.target_uuids = owned_groups - inst_groups
12597 conflicting_groups = self.target_uuids & inst_groups
12598 if conflicting_groups:
12599 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
12600 " used by the instance '%s'" %
12601 (utils.CommaJoin(conflicting_groups),
12602 self.op.instance_name),
12603 errors.ECODE_INVAL)
12605 if not self.target_uuids:
12606 raise errors.OpPrereqError("There are no possible target groups",
12607 errors.ECODE_INVAL)
12609 def BuildHooksEnv(self):
12610 """Build hooks env.
12613 assert self.target_uuids
12616 "TARGET_GROUPS": " ".join(self.target_uuids),
12619 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
12623 def BuildHooksNodes(self):
12624 """Build hooks nodes.
12627 mn = self.cfg.GetMasterNode()
12628 return ([mn], [mn])
12630 def Exec(self, feedback_fn):
12631 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12633 assert instances == [self.op.instance_name], "Instance not locked"
12635 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12636 instances=instances, target_groups=list(self.target_uuids))
12638 ial.Run(self.op.iallocator)
12640 if not ial.success:
12641 raise errors.OpPrereqError("Can't compute solution for changing group of"
12642 " instance '%s' using iallocator '%s': %s" %
12643 (self.op.instance_name, self.op.iallocator,
12644 ial.info),
12645 errors.ECODE_NORES)
12647 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12649 self.LogInfo("Iallocator returned %s job(s) for changing group of"
12650 " instance '%s'", len(jobs), self.op.instance_name)
12652 return ResultWithJobs(jobs)
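# NOTE (editor's illustration, not part of the original source): the jobs
# handed to ResultWithJobs are a list of lists of opcodes, one inner list per
# submitted job. Matching the _JOB_LIST result check in the IAllocator class
# below, a change-group result might look roughly like this (instance and
# node names are hypothetical):
#   jobs = [
#     [opcodes.OpInstanceMigrate(instance_name="inst1.example.com")],
#     [opcodes.OpInstanceReplaceDisks(instance_name="inst2.example.com",
#                                     mode=constants.REPLACE_DISK_CHG,
#                                     remote_node="node3.example.com")],
#     ]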
12655 class LUBackupQuery(NoHooksLU):
12656 """Query the exports list
12661 def ExpandNames(self):
12662 self.needed_locks = {}
12663 self.share_locks[locking.LEVEL_NODE] = 1
12664 if not self.op.nodes:
12665 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12666 else:
12667 self.needed_locks[locking.LEVEL_NODE] = \
12668 _GetWantedNodes(self, self.op.nodes)
12670 def Exec(self, feedback_fn):
12671 """Compute the list of all the exported system images.
12674 @return: a dictionary with the structure node->(export-list)
12675 where export-list is a list of the instances exported on
12676 that node.
12679 self.nodes = self.owned_locks(locking.LEVEL_NODE)
12680 rpcresult = self.rpc.call_export_list(self.nodes)
12681 result = {}
12682 for node in rpcresult:
12683 if rpcresult[node].fail_msg:
12684 result[node] = False
12685 else:
12686 result[node] = rpcresult[node].payload
12688 return result
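# Example of the returned structure (editor's illustration, hypothetical
# names): nodes whose RPC failed map to False, all others map to the list of
# exports they hold:
#   {"node1.example.com": False,
#    "node2.example.com": ["inst1.example.com", "inst2.example.com"]}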
12691 class LUBackupPrepare(NoHooksLU):
12692 """Prepares an instance for an export and returns useful information.
12697 def ExpandNames(self):
12698 self._ExpandAndLockInstance()
12700 def CheckPrereq(self):
12701 """Check prerequisites.
12704 instance_name = self.op.instance_name
12706 self.instance = self.cfg.GetInstanceInfo(instance_name)
12707 assert self.instance is not None, \
12708 "Cannot retrieve locked instance %s" % self.op.instance_name
12709 _CheckNodeOnline(self, self.instance.primary_node)
12711 self._cds = _GetClusterDomainSecret()
12713 def Exec(self, feedback_fn):
12714 """Prepares an instance for an export.
12717 instance = self.instance
12719 if self.op.mode == constants.EXPORT_MODE_REMOTE:
12720 salt = utils.GenerateSecret(8)
12722 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
12723 result = self.rpc.call_x509_cert_create(instance.primary_node,
12724 constants.RIE_CERT_VALIDITY)
12725 result.Raise("Can't create X509 key and certificate on %s" % result.node)
12727 (name, cert_pem) = result.payload
12729 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
12730 cert_pem)
12733 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
12734 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
12736 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
12742 class LUBackupExport(LogicalUnit):
12743 """Export an instance to an image in the cluster.
12746 HPATH = "instance-export"
12747 HTYPE = constants.HTYPE_INSTANCE
12750 def CheckArguments(self):
12751 """Check the arguments.
12754 self.x509_key_name = self.op.x509_key_name
12755 self.dest_x509_ca_pem = self.op.destination_x509_ca
12757 if self.op.mode == constants.EXPORT_MODE_REMOTE:
12758 if not self.x509_key_name:
12759 raise errors.OpPrereqError("Missing X509 key name for encryption",
12760 errors.ECODE_INVAL)
12762 if not self.dest_x509_ca_pem:
12763 raise errors.OpPrereqError("Missing destination X509 CA",
12764 errors.ECODE_INVAL)
12766 def ExpandNames(self):
12767 self._ExpandAndLockInstance()
12769 # Lock all nodes for local exports
12770 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12771 # FIXME: lock only instance primary and destination node
12773 # Sad but true, for now we have to lock all nodes, as we don't know where
12774 # the previous export might be, and in this LU we search for it and
12775 # remove it from its current node. In the future we could fix this by:
12776 # - making a tasklet to search (share-lock all), then create the
12777 # new one, then one to remove, after
12778 # - removing the removal operation altogether
12779 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12781 def DeclareLocks(self, level):
12782 """Last minute lock declaration."""
12783 # All nodes are locked anyway, so nothing to do here.
12785 def BuildHooksEnv(self):
12786 """Build hooks env.
12788 This will run on the master, primary node and target node.
12792 "EXPORT_MODE": self.op.mode,
12793 "EXPORT_NODE": self.op.target_node,
12794 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
12795 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
12796 # TODO: Generic function for boolean env variables
12797 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
12800 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
12804 def BuildHooksNodes(self):
12805 """Build hooks nodes.
12808 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
12810 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12811 nl.append(self.op.target_node)
12813 return (nl, nl)
12815 def CheckPrereq(self):
12816 """Check prerequisites.
12818 This checks that the instance and node names are valid.
12821 instance_name = self.op.instance_name
12823 self.instance = self.cfg.GetInstanceInfo(instance_name)
12824 assert self.instance is not None, \
12825 "Cannot retrieve locked instance %s" % self.op.instance_name
12826 _CheckNodeOnline(self, self.instance.primary_node)
12828 if (self.op.remove_instance and
12829 self.instance.admin_state == constants.ADMINST_UP and
12830 not self.op.shutdown):
12831 raise errors.OpPrereqError("Cannot remove instance without shutting it"
12832 " down before")
12834 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12835 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
12836 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
12837 assert self.dst_node is not None
12839 _CheckNodeOnline(self, self.dst_node.name)
12840 _CheckNodeNotDrained(self, self.dst_node.name)
12843 self.dest_disk_info = None
12844 self.dest_x509_ca = None
12846 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
12847 self.dst_node = None
12849 if len(self.op.target_node) != len(self.instance.disks):
12850 raise errors.OpPrereqError(("Received destination information for %s"
12851 " disks, but instance %s has %s disks") %
12852 (len(self.op.target_node), instance_name,
12853 len(self.instance.disks)),
12854 errors.ECODE_INVAL)
12856 cds = _GetClusterDomainSecret()
12858 # Check X509 key name
12859 try:
12860 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
12861 except (TypeError, ValueError), err:
12862 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
12864 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
12865 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
12866 errors.ECODE_INVAL)
12868 # Load and verify CA
12869 try:
12870 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
12871 except OpenSSL.crypto.Error, err:
12872 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
12873 (err, ), errors.ECODE_INVAL)
12875 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
12876 if errcode is not None:
12877 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
12878 (msg, ), errors.ECODE_INVAL)
12880 self.dest_x509_ca = cert
12882 # Verify target information
12883 disk_info = []
12884 for idx, disk_data in enumerate(self.op.target_node):
12885 try:
12886 (host, port, magic) = \
12887 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
12888 except errors.GenericError, err:
12889 raise errors.OpPrereqError("Target info for disk %s: %s" %
12890 (idx, err), errors.ECODE_INVAL)
12892 disk_info.append((host, port, magic))
12894 assert len(disk_info) == len(self.op.target_node)
12895 self.dest_disk_info = disk_info
12897 else:
12898 raise errors.ProgrammerError("Unhandled export mode %r" %
12899 self.op.mode)
12901 # instance disk type verification
12902 # TODO: Implement export support for file-based disks
12903 for disk in self.instance.disks:
12904 if disk.dev_type == constants.LD_FILE:
12905 raise errors.OpPrereqError("Export not supported for instances with"
12906 " file-based disks", errors.ECODE_INVAL)
12908 def _CleanupExports(self, feedback_fn):
12909 """Removes exports of current instance from all other nodes.
12911 If an instance in a cluster with nodes A..D was exported to node C, its
12912 exports will be removed from the nodes A, B and D.
12915 assert self.op.mode != constants.EXPORT_MODE_REMOTE
12917 nodelist = self.cfg.GetNodeList()
12918 nodelist.remove(self.dst_node.name)
12920 # on one-node clusters nodelist will be empty after the removal;
12921 # if we proceeded, the backup would be removed because OpBackupQuery
12922 # substitutes an empty list with the full cluster node list.
12923 iname = self.instance.name
12924 if nodelist:
12925 feedback_fn("Removing old exports for instance %s" % iname)
12926 exportlist = self.rpc.call_export_list(nodelist)
12927 for node in exportlist:
12928 if exportlist[node].fail_msg:
12929 continue
12930 if iname in exportlist[node].payload:
12931 msg = self.rpc.call_export_remove(node, iname).fail_msg
12932 if msg:
12933 self.LogWarning("Could not remove older export for instance %s"
12934 " on node %s: %s", iname, node, msg)
12936 def Exec(self, feedback_fn):
12937 """Export an instance to an image in the cluster.
12940 assert self.op.mode in constants.EXPORT_MODES
12942 instance = self.instance
12943 src_node = instance.primary_node
12945 if self.op.shutdown:
12946 # shutdown the instance, but not the disks
12947 feedback_fn("Shutting down instance %s" % instance.name)
12948 result = self.rpc.call_instance_shutdown(src_node, instance,
12949 self.op.shutdown_timeout)
12950 # TODO: Maybe ignore failures if ignore_remove_failures is set
12951 result.Raise("Could not shut down instance %s on"
12952 " node %s" % (instance.name, src_node))
12954 # set the disks ID correctly since call_instance_start needs the
12955 # correct drbd minor to create the symlinks
12956 for disk in instance.disks:
12957 self.cfg.SetDiskID(disk, src_node)
12959 activate_disks = (instance.admin_state != constants.ADMINST_UP)
12961 if activate_disks:
12962 # Activate the instance disks if we're exporting a stopped instance
12963 feedback_fn("Activating disks for %s" % instance.name)
12964 _StartInstanceDisks(self, instance, None)
12966 try:
12967 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
12968 instance)
12970 helper.CreateSnapshots()
12971 try:
12972 if (self.op.shutdown and
12973 instance.admin_state == constants.ADMINST_UP and
12974 not self.op.remove_instance):
12975 assert not activate_disks
12976 feedback_fn("Starting instance %s" % instance.name)
12977 result = self.rpc.call_instance_start(src_node,
12978 (instance, None, None), False)
12979 msg = result.fail_msg
12980 if msg:
12981 feedback_fn("Failed to start instance: %s" % msg)
12982 _ShutdownInstanceDisks(self, instance)
12983 raise errors.OpExecError("Could not start instance: %s" % msg)
12985 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12986 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
12987 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
12988 connect_timeout = constants.RIE_CONNECT_TIMEOUT
12989 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
12991 (key_name, _, _) = self.x509_key_name
12993 dest_ca_pem = \
12994 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
12995 self.dest_x509_ca)
12997 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
12998 key_name, dest_ca_pem,
12999 timeouts)
13000 finally:
13001 helper.Cleanup()
13003 # Check for backwards compatibility
13004 assert len(dresults) == len(instance.disks)
13005 assert compat.all(isinstance(i, bool) for i in dresults), \
13006 "Not all results are boolean: %r" % dresults
13008 finally:
13009 if activate_disks:
13010 feedback_fn("Deactivating disks for %s" % instance.name)
13011 _ShutdownInstanceDisks(self, instance)
13013 if not (compat.all(dresults) and fin_resu):
13014 failures = []
13015 if not fin_resu:
13016 failures.append("export finalization")
13017 if not compat.all(dresults):
13018 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
13019 if not dsk)
13020 failures.append("disk export: disk(s) %s" % fdsk)
13022 raise errors.OpExecError("Export failed, errors in %s" %
13023 utils.CommaJoin(failures))
13025 # At this point, the export was successful, we can cleanup/finish
13027 # Remove instance if requested
13028 if self.op.remove_instance:
13029 feedback_fn("Removing instance %s" % instance.name)
13030 _RemoveInstance(self, feedback_fn, instance,
13031 self.op.ignore_remove_failures)
13033 if self.op.mode == constants.EXPORT_MODE_LOCAL:
13034 self._CleanupExports(feedback_fn)
13036 return fin_resu, dresults
13039 class LUBackupRemove(NoHooksLU):
13040 """Remove exports related to the named instance.
13045 def ExpandNames(self):
13046 self.needed_locks = {}
13047 # We need all nodes to be locked in order for RemoveExport to work, but we
13048 # don't need to lock the instance itself, as nothing will happen to it (and
13049 # we can remove exports also for a removed instance)
13050 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
13052 def Exec(self, feedback_fn):
13053 """Remove any export.
13056 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
13057 # If the instance was not found we'll try with the name that was passed in.
13058 # This will only work if it was an FQDN, though.
13059 fqdn_warn = False
13060 if not instance_name:
13061 fqdn_warn = True
13062 instance_name = self.op.instance_name
13064 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
13065 exportlist = self.rpc.call_export_list(locked_nodes)
13066 found = False
13067 for node in exportlist:
13068 msg = exportlist[node].fail_msg
13069 if msg:
13070 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
13071 continue
13072 if instance_name in exportlist[node].payload:
13073 found = True
13074 result = self.rpc.call_export_remove(node, instance_name)
13075 msg = result.fail_msg
13076 if msg:
13077 logging.error("Could not remove export for instance %s"
13078 " on node %s: %s", instance_name, node, msg)
13080 if fqdn_warn and not found:
13081 feedback_fn("Export not found. If trying to remove an export belonging"
13082 " to a deleted instance please use its Fully Qualified"
13086 class LUGroupAdd(LogicalUnit):
13087 """Logical unit for creating node groups.
13090 HPATH = "group-add"
13091 HTYPE = constants.HTYPE_GROUP
13094 def ExpandNames(self):
13095 # We need the new group's UUID here so that we can create and acquire the
13096 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
13097 # that it should not check whether the UUID exists in the configuration.
13098 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
13099 self.needed_locks = {}
13100 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
13102 def CheckPrereq(self):
13103 """Check prerequisites.
13105 This checks that the given group name is not an existing node group
13106 already.
13109 try:
13110 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13111 except errors.OpPrereqError:
13112 pass
13113 else:
13114 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
13115 " node group (UUID: %s)" %
13116 (self.op.group_name, existing_uuid),
13117 errors.ECODE_EXISTS)
13119 if self.op.ndparams:
13120 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
13122 if self.op.hv_state:
13123 self.new_hv_state = _MergeAndVerifyHvState(self.op.hv_state, None)
13124 else:
13125 self.new_hv_state = None
13127 if self.op.disk_state:
13128 self.new_disk_state = _MergeAndVerifyDiskState(self.op.disk_state, None)
13129 else:
13130 self.new_disk_state = None
13132 if self.op.diskparams:
13133 for templ in constants.DISK_TEMPLATES:
13134 if templ not in self.op.diskparams:
13135 self.op.diskparams[templ] = {}
13136 utils.ForceDictType(self.op.diskparams[templ], constants.DISK_DT_TYPES)
13137 else:
13138 self.op.diskparams = self.cfg.GetClusterInfo().diskparams
13140 if self.op.ipolicy:
13141 cluster = self.cfg.GetClusterInfo()
13142 full_ipolicy = cluster.SimpleFillIPolicy(self.op.ipolicy)
13143 try:
13144 objects.InstancePolicy.CheckParameterSyntax(full_ipolicy)
13145 except errors.ConfigurationError, err:
13146 raise errors.OpPrereqError("Invalid instance policy: %s" % err,
13147 errors.ECODE_INVAL)
13149 def BuildHooksEnv(self):
13150 """Build hooks env.
13154 "GROUP_NAME": self.op.group_name,
13157 def BuildHooksNodes(self):
13158 """Build hooks nodes.
13161 mn = self.cfg.GetMasterNode()
13162 return ([mn], [mn])
13164 def Exec(self, feedback_fn):
13165 """Add the node group to the cluster.
13168 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
13169 uuid=self.group_uuid,
13170 alloc_policy=self.op.alloc_policy,
13171 ndparams=self.op.ndparams,
13172 diskparams=self.op.diskparams,
13173 ipolicy=self.op.ipolicy,
13174 hv_state_static=self.new_hv_state,
13175 disk_state_static=self.new_disk_state)
13177 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
13178 del self.remove_locks[locking.LEVEL_NODEGROUP]
13181 class LUGroupAssignNodes(NoHooksLU):
13182 """Logical unit for assigning nodes to groups.
13187 def ExpandNames(self):
13188 # These raise errors.OpPrereqError on their own:
13189 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13190 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
13192 # We want to lock all the affected nodes and groups. We have readily
13193 # available the list of nodes, and the *destination* group. To gather the
13194 # list of "source" groups, we need to fetch node information later on.
13195 self.needed_locks = {
13196 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
13197 locking.LEVEL_NODE: self.op.nodes,
13198 }
13200 def DeclareLocks(self, level):
13201 if level == locking.LEVEL_NODEGROUP:
13202 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
13204 # Try to get all affected nodes' groups without having the group or node
13205 # lock yet. Needs verification later in the code flow.
13206 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
13208 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
13210 def CheckPrereq(self):
13211 """Check prerequisites.
13214 assert self.needed_locks[locking.LEVEL_NODEGROUP]
13215 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
13216 frozenset(self.op.nodes))
13218 expected_locks = (set([self.group_uuid]) |
13219 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
13220 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
13221 if actual_locks != expected_locks:
13222 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
13223 " current groups are '%s', used to be '%s'" %
13224 (utils.CommaJoin(expected_locks),
13225 utils.CommaJoin(actual_locks)))
13227 self.node_data = self.cfg.GetAllNodesInfo()
13228 self.group = self.cfg.GetNodeGroup(self.group_uuid)
13229 instance_data = self.cfg.GetAllInstancesInfo()
13231 if self.group is None:
13232 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
13233 (self.op.group_name, self.group_uuid))
13235 (new_splits, previous_splits) = \
13236 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
13237 for node in self.op.nodes],
13238 self.node_data, instance_data)
13240 if new_splits:
13241 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
13243 if not self.op.force:
13244 raise errors.OpExecError("The following instances get split by this"
13245 " change and --force was not given: %s" %
13248 self.LogWarning("This operation will split the following instances: %s",
13251 if previous_splits:
13252 self.LogWarning("In addition, these already-split instances continue"
13253 " to be split across groups: %s",
13254 utils.CommaJoin(utils.NiceSort(previous_splits)))
13256 def Exec(self, feedback_fn):
13257 """Assign nodes to a new group.
13260 mods = [(node_name, self.group_uuid) for node_name in self.op.nodes]
13262 self.cfg.AssignGroupNodes(mods)
13264 @staticmethod
13265 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
13266 """Check for split instances after a node assignment.
13268 This method considers a series of node assignments as an atomic operation,
13269 and returns information about split instances after applying the set of
13272 In particular, it returns information about newly split instances, and
13273 instances that were already split, and remain so after the change.
13275 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
13276 considered.
13278 @type changes: list of (node_name, new_group_uuid) pairs.
13279 @param changes: list of node assignments to consider.
13280 @param node_data: a dict with data for all nodes
13281 @param instance_data: a dict with all instances to consider
13282 @rtype: a two-tuple
13283 @return: a list of instances that were previously okay and result split as a
13284 consequence of this change, and a list of instances that were previously
13285 split and this change does not fix.
13288 changed_nodes = dict((node, group) for node, group in changes
13289 if node_data[node].group != group)
13291 all_split_instances = set()
13292 previously_split_instances = set()
13294 def InstanceNodes(instance):
13295 return [instance.primary_node] + list(instance.secondary_nodes)
13297 for inst in instance_data.values():
13298 if inst.disk_template not in constants.DTS_INT_MIRROR:
13299 continue
13301 instance_nodes = InstanceNodes(inst)
13303 if len(set(node_data[node].group for node in instance_nodes)) > 1:
13304 previously_split_instances.add(inst.name)
13306 if len(set(changed_nodes.get(node, node_data[node].group)
13307 for node in instance_nodes)) > 1:
13308 all_split_instances.add(inst.name)
13310 return (list(all_split_instances - previously_split_instances),
13311 list(previously_split_instances & all_split_instances))
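# Worked example (editor's illustration, hypothetical names): given a DRBD
# instance "inst1" with primary "node1" and secondary "node2", both currently
# in group "g1", the assignment changes = [("node2", "g2")] leaves the
# instance spanning two groups, so "inst1" is returned in the first list
# (newly split). Had "node2" already been in "g2" before the call, "inst1"
# would instead show up in the second list (already split, not fixed here).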
13314 class _GroupQuery(_QueryBase):
13315 FIELDS = query.GROUP_FIELDS
13317 def ExpandNames(self, lu):
13318 lu.needed_locks = {}
13320 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
13321 self._cluster = lu.cfg.GetClusterInfo()
13322 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
13324 if not self.names:
13325 self.wanted = [name_to_uuid[name]
13326 for name in utils.NiceSort(name_to_uuid.keys())]
13327 else:
13328 # Accept names to be either names or UUIDs.
13329 missing = []
13330 self.wanted = []
13331 all_uuid = frozenset(self._all_groups.keys())
13333 for name in self.names:
13334 if name in all_uuid:
13335 self.wanted.append(name)
13336 elif name in name_to_uuid:
13337 self.wanted.append(name_to_uuid[name])
13338 else:
13339 missing.append(name)
13341 if missing:
13342 raise errors.OpPrereqError("Some groups do not exist: %s" %
13343 utils.CommaJoin(missing),
13344 errors.ECODE_NOENT)
13346 def DeclareLocks(self, lu, level):
13347 pass
13349 def _GetQueryData(self, lu):
13350 """Computes the list of node groups and their attributes.
13353 do_nodes = query.GQ_NODE in self.requested_data
13354 do_instances = query.GQ_INST in self.requested_data
13356 group_to_nodes = None
13357 group_to_instances = None
13359 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
13360 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
13361 # latter GetAllInstancesInfo() is not enough, for we have to go through
13362 # instance->node. Hence, we will need to process nodes even if we only need
13363 # instance information.
13364 if do_nodes or do_instances:
13365 all_nodes = lu.cfg.GetAllNodesInfo()
13366 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
13367 node_to_group = {}
13369 for node in all_nodes.values():
13370 if node.group in group_to_nodes:
13371 group_to_nodes[node.group].append(node.name)
13372 node_to_group[node.name] = node.group
13374 if do_instances:
13375 all_instances = lu.cfg.GetAllInstancesInfo()
13376 group_to_instances = dict((uuid, []) for uuid in self.wanted)
13378 for instance in all_instances.values():
13379 node = instance.primary_node
13380 if node in node_to_group:
13381 group_to_instances[node_to_group[node]].append(instance.name)
13383 if not do_nodes:
13384 # Do not pass on node information if it was not requested.
13385 group_to_nodes = None
13387 return query.GroupQueryData(self._cluster,
13388 [self._all_groups[uuid]
13389 for uuid in self.wanted],
13390 group_to_nodes, group_to_instances)
13393 class LUGroupQuery(NoHooksLU):
13394 """Logical unit for querying node groups.
13399 def CheckArguments(self):
13400 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
13401 self.op.output_fields, False)
13403 def ExpandNames(self):
13404 self.gq.ExpandNames(self)
13406 def DeclareLocks(self, level):
13407 self.gq.DeclareLocks(self, level)
13409 def Exec(self, feedback_fn):
13410 return self.gq.OldStyleQuery(self)
13413 class LUGroupSetParams(LogicalUnit):
13414 """Modifies the parameters of a node group.
13417 HPATH = "group-modify"
13418 HTYPE = constants.HTYPE_GROUP
13421 def CheckArguments(self):
13422 all_changes = [
13423 self.op.ndparams,
13424 self.op.diskparams,
13425 self.op.alloc_policy,
13426 self.op.hv_state,
13427 self.op.disk_state,
13428 self.op.ipolicy,
13429 ]
13431 if all_changes.count(None) == len(all_changes):
13432 raise errors.OpPrereqError("Please pass at least one modification",
13433 errors.ECODE_INVAL)
13435 def ExpandNames(self):
13436 # This raises errors.OpPrereqError on its own:
13437 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13439 self.needed_locks = {
13440 locking.LEVEL_INSTANCE: [],
13441 locking.LEVEL_NODEGROUP: [self.group_uuid],
13442 }
13444 self.share_locks[locking.LEVEL_INSTANCE] = 1
13446 def DeclareLocks(self, level):
13447 if level == locking.LEVEL_INSTANCE:
13448 assert not self.needed_locks[locking.LEVEL_INSTANCE]
13450 # Lock instances optimistically, needs verification once group lock has
13451 # been acquired
13452 self.needed_locks[locking.LEVEL_INSTANCE] = \
13453 self.cfg.GetNodeGroupInstances(self.group_uuid)
13455 def CheckPrereq(self):
13456 """Check prerequisites.
13459 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
13461 # Check if locked instances are still correct
13462 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
13464 self.group = self.cfg.GetNodeGroup(self.group_uuid)
13465 cluster = self.cfg.GetClusterInfo()
13467 if self.group is None:
13468 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
13469 (self.op.group_name, self.group_uuid))
13471 if self.op.ndparams:
13472 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
13473 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
13474 self.new_ndparams = new_ndparams
13476 if self.op.diskparams:
13477 self.new_diskparams = dict()
13478 for templ in constants.DISK_TEMPLATES:
13479 if templ not in self.op.diskparams:
13480 self.op.diskparams[templ] = {}
13481 new_templ_params = _GetUpdatedParams(self.group.diskparams[templ],
13482 self.op.diskparams[templ])
13483 utils.ForceDictType(new_templ_params, constants.DISK_DT_TYPES)
13484 self.new_diskparams[templ] = new_templ_params
13486 if self.op.hv_state:
13487 self.new_hv_state = _MergeAndVerifyHvState(self.op.hv_state,
13488 self.group.hv_state_static)
13490 if self.op.disk_state:
13491 self.new_disk_state = \
13492 _MergeAndVerifyDiskState(self.op.disk_state,
13493 self.group.disk_state_static)
13495 if self.op.ipolicy:
13496 self.new_ipolicy = _GetUpdatedIPolicy(self.group.ipolicy,
13497 self.op.ipolicy,
13498 group_policy=True)
13500 new_ipolicy = cluster.SimpleFillIPolicy(self.new_ipolicy)
13501 inst_filter = lambda inst: inst.name in owned_instances
13502 instances = self.cfg.GetInstancesInfoByFilter(inst_filter).values()
13503 violations = \
13504 _ComputeNewInstanceViolations(_CalculateGroupIPolicy(cluster,
13505 self.group),
13506 new_ipolicy, instances)
13508 if violations:
13509 self.LogWarning("After the ipolicy change the following instances"
13510 " violate them: %s",
13511 utils.CommaJoin(violations))
13513 def BuildHooksEnv(self):
13514 """Build hooks env.
13518 "GROUP_NAME": self.op.group_name,
13519 "NEW_ALLOC_POLICY": self.op.alloc_policy,
13522 def BuildHooksNodes(self):
13523 """Build hooks nodes.
13526 mn = self.cfg.GetMasterNode()
13527 return ([mn], [mn])
13529 def Exec(self, feedback_fn):
13530 """Modifies the node group.
13533 result = []
13535 if self.op.ndparams:
13536 self.group.ndparams = self.new_ndparams
13537 result.append(("ndparams", str(self.group.ndparams)))
13539 if self.op.diskparams:
13540 self.group.diskparams = self.new_diskparams
13541 result.append(("diskparams", str(self.group.diskparams)))
13543 if self.op.alloc_policy:
13544 self.group.alloc_policy = self.op.alloc_policy
13546 if self.op.hv_state:
13547 self.group.hv_state_static = self.new_hv_state
13549 if self.op.disk_state:
13550 self.group.disk_state_static = self.new_disk_state
13552 if self.op.ipolicy:
13553 self.group.ipolicy = self.new_ipolicy
13555 self.cfg.Update(self.group, feedback_fn)
13557 return result
13559 class LUGroupRemove(LogicalUnit):
13560 HPATH = "group-remove"
13561 HTYPE = constants.HTYPE_GROUP
13564 def ExpandNames(self):
13565 # This will raise errors.OpPrereqError on its own:
13566 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13567 self.needed_locks = {
13568 locking.LEVEL_NODEGROUP: [self.group_uuid],
13569 }
13571 def CheckPrereq(self):
13572 """Check prerequisites.
13574 This checks that the given group name exists as a node group, that it is
13575 empty (i.e., contains no nodes), and that it is not the last group of the
13576 cluster.
13579 # Verify that the group is empty.
13580 group_nodes = [node.name
13581 for node in self.cfg.GetAllNodesInfo().values()
13582 if node.group == self.group_uuid]
13584 if group_nodes:
13585 raise errors.OpPrereqError("Group '%s' not empty, has the following"
13586 " nodes: %s" %
13587 (self.op.group_name,
13588 utils.CommaJoin(utils.NiceSort(group_nodes))),
13589 errors.ECODE_STATE)
13591 # Verify the cluster would not be left group-less.
13592 if len(self.cfg.GetNodeGroupList()) == 1:
13593 raise errors.OpPrereqError("Group '%s' is the only group,"
13594 " cannot be removed" %
13595 self.op.group_name,
13596 errors.ECODE_STATE)
13598 def BuildHooksEnv(self):
13599 """Build hooks env.
13603 "GROUP_NAME": self.op.group_name,
13606 def BuildHooksNodes(self):
13607 """Build hooks nodes.
13610 mn = self.cfg.GetMasterNode()
13611 return ([mn], [mn])
13613 def Exec(self, feedback_fn):
13614 """Remove the node group.
13617 try:
13618 self.cfg.RemoveNodeGroup(self.group_uuid)
13619 except errors.ConfigurationError:
13620 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
13621 (self.op.group_name, self.group_uuid))
13623 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
13626 class LUGroupRename(LogicalUnit):
13627 HPATH = "group-rename"
13628 HTYPE = constants.HTYPE_GROUP
13631 def ExpandNames(self):
13632 # This raises errors.OpPrereqError on its own:
13633 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13635 self.needed_locks = {
13636 locking.LEVEL_NODEGROUP: [self.group_uuid],
13637 }
13639 def CheckPrereq(self):
13640 """Check prerequisites.
13642 Ensures requested new name is not yet used.
13645 try:
13646 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
13647 except errors.OpPrereqError:
13648 pass
13649 else:
13650 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
13651 " node group (UUID: %s)" %
13652 (self.op.new_name, new_name_uuid),
13653 errors.ECODE_EXISTS)
13655 def BuildHooksEnv(self):
13656 """Build hooks env.
13660 "OLD_NAME": self.op.group_name,
13661 "NEW_NAME": self.op.new_name,
13664 def BuildHooksNodes(self):
13665 """Build hooks nodes.
13668 mn = self.cfg.GetMasterNode()
13670 all_nodes = self.cfg.GetAllNodesInfo()
13671 all_nodes.pop(mn, None)
13673 run_nodes = [mn]
13674 run_nodes.extend(node.name for node in all_nodes.values()
13675 if node.group == self.group_uuid)
13677 return (run_nodes, run_nodes)
13679 def Exec(self, feedback_fn):
13680 """Rename the node group.
13683 group = self.cfg.GetNodeGroup(self.group_uuid)
13685 if group is None:
13686 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
13687 (self.op.group_name, self.group_uuid))
13689 group.name = self.op.new_name
13690 self.cfg.Update(group, feedback_fn)
13692 return self.op.new_name
13695 class LUGroupEvacuate(LogicalUnit):
13696 HPATH = "group-evacuate"
13697 HTYPE = constants.HTYPE_GROUP
13700 def ExpandNames(self):
13701 # This raises errors.OpPrereqError on its own:
13702 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13704 if self.op.target_groups:
13705 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
13706 self.op.target_groups)
13707 else:
13708 self.req_target_uuids = []
13710 if self.group_uuid in self.req_target_uuids:
13711 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
13712 " as a target group (targets are %s)" %
13713 (self.op.group_name,
13714 utils.CommaJoin(self.req_target_uuids)),
13715 errors.ECODE_INVAL)
13717 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
13719 self.share_locks = _ShareAll()
13720 self.needed_locks = {
13721 locking.LEVEL_INSTANCE: [],
13722 locking.LEVEL_NODEGROUP: [],
13723 locking.LEVEL_NODE: [],
13724 }
13726 def DeclareLocks(self, level):
13727 if level == locking.LEVEL_INSTANCE:
13728 assert not self.needed_locks[locking.LEVEL_INSTANCE]
13730 # Lock instances optimistically, needs verification once node and group
13731 # locks have been acquired
13732 self.needed_locks[locking.LEVEL_INSTANCE] = \
13733 self.cfg.GetNodeGroupInstances(self.group_uuid)
13735 elif level == locking.LEVEL_NODEGROUP:
13736 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
13738 if self.req_target_uuids:
13739 lock_groups = set([self.group_uuid] + self.req_target_uuids)
13741 # Lock all groups used by instances optimistically; this requires going
13742 # via the node before it's locked, requiring verification later on
13743 lock_groups.update(group_uuid
13744 for instance_name in
13745 self.owned_locks(locking.LEVEL_INSTANCE)
13746 for group_uuid in
13747 self.cfg.GetInstanceNodeGroups(instance_name))
13748 else:
13749 # No target groups, need to lock all of them
13750 lock_groups = locking.ALL_SET
13752 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
13754 elif level == locking.LEVEL_NODE:
13755 # This will only lock the nodes in the group to be evacuated which
13756 # contain actual instances
13757 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
13758 self._LockInstancesNodes()
13760 # Lock all nodes in group to be evacuated and target groups
13761 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
13762 assert self.group_uuid in owned_groups
13763 member_nodes = [node_name
13764 for group in owned_groups
13765 for node_name in self.cfg.GetNodeGroup(group).members]
13766 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
13768 def CheckPrereq(self):
13769 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
13770 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
13771 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
13773 assert owned_groups.issuperset(self.req_target_uuids)
13774 assert self.group_uuid in owned_groups
13776 # Check if locked instances are still correct
13777 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
13779 # Get instance information
13780 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
13782 # Check if node groups for locked instances are still correct
13783 for instance_name in owned_instances:
13784 inst = self.instances[instance_name]
13785 assert owned_nodes.issuperset(inst.all_nodes), \
13786 "Instance %s's nodes changed while we kept the lock" % instance_name
13788 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
13789 owned_groups)
13791 assert self.group_uuid in inst_groups, \
13792 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
13794 if self.req_target_uuids:
13795 # User requested specific target groups
13796 self.target_uuids = self.req_target_uuids
13797 else:
13798 # All groups except the one to be evacuated are potential targets
13799 self.target_uuids = [group_uuid for group_uuid in owned_groups
13800 if group_uuid != self.group_uuid]
13802 if not self.target_uuids:
13803 raise errors.OpPrereqError("There are no possible target groups",
13804 errors.ECODE_INVAL)
13806 def BuildHooksEnv(self):
13807 """Build hooks env.
13811 "GROUP_NAME": self.op.group_name,
13812 "TARGET_GROUPS": " ".join(self.target_uuids),
13815 def BuildHooksNodes(self):
13816 """Build hooks nodes.
13819 mn = self.cfg.GetMasterNode()
13821 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
13823 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
13825 return (run_nodes, run_nodes)
13827 def Exec(self, feedback_fn):
13828 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
13830 assert self.group_uuid not in self.target_uuids
13832 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
13833 instances=instances, target_groups=self.target_uuids)
13835 ial.Run(self.op.iallocator)
13837 if not ial.success:
13838 raise errors.OpPrereqError("Can't compute group evacuation using"
13839 " iallocator '%s': %s" %
13840 (self.op.iallocator, ial.info),
13841 errors.ECODE_NORES)
13843 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
13845 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
13846 len(jobs), self.op.group_name)
13848 return ResultWithJobs(jobs)
13851 class TagsLU(NoHooksLU): # pylint: disable=W0223
13852 """Generic tags LU.
13854 This is an abstract class which is the parent of all the other tags LUs.
13857 def ExpandNames(self):
13858 self.group_uuid = None
13859 self.needed_locks = {}
13860 if self.op.kind == constants.TAG_NODE:
13861 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
13862 self.needed_locks[locking.LEVEL_NODE] = self.op.name
13863 elif self.op.kind == constants.TAG_INSTANCE:
13864 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
13865 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
13866 elif self.op.kind == constants.TAG_NODEGROUP:
13867 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
13869 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
13870 # not possible to acquire the BGL based on opcode parameters)
13872 def CheckPrereq(self):
13873 """Check prerequisites.
13876 if self.op.kind == constants.TAG_CLUSTER:
13877 self.target = self.cfg.GetClusterInfo()
13878 elif self.op.kind == constants.TAG_NODE:
13879 self.target = self.cfg.GetNodeInfo(self.op.name)
13880 elif self.op.kind == constants.TAG_INSTANCE:
13881 self.target = self.cfg.GetInstanceInfo(self.op.name)
13882 elif self.op.kind == constants.TAG_NODEGROUP:
13883 self.target = self.cfg.GetNodeGroup(self.group_uuid)
13884 else:
13885 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
13886 str(self.op.kind), errors.ECODE_INVAL)
13889 class LUTagsGet(TagsLU):
13890 """Returns the tags of a given object.
13895 def ExpandNames(self):
13896 TagsLU.ExpandNames(self)
13898 # Share locks as this is only a read operation
13899 self.share_locks = _ShareAll()
13901 def Exec(self, feedback_fn):
13902 """Returns the tag list.
13905 return list(self.target.GetTags())
13908 class LUTagsSearch(NoHooksLU):
13909 """Searches the tags for a given pattern.
13914 def ExpandNames(self):
13915 self.needed_locks = {}
13917 def CheckPrereq(self):
13918 """Check prerequisites.
13920 This checks the pattern passed for validity by compiling it.
13923 try:
13924 self.re = re.compile(self.op.pattern)
13925 except re.error, err:
13926 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
13927 (self.op.pattern, err), errors.ECODE_INVAL)
13929 def Exec(self, feedback_fn):
13930 """Returns the tag list.
13933 cfg = self.cfg
13934 tgts = [("/cluster", cfg.GetClusterInfo())]
13935 ilist = cfg.GetAllInstancesInfo().values()
13936 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
13937 nlist = cfg.GetAllNodesInfo().values()
13938 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
13939 tgts.extend(("/nodegroup/%s" % n.name, n)
13940 for n in cfg.GetAllNodeGroupsInfo().values())
13941 results = []
13942 for path, target in tgts:
13943 for tag in target.GetTags():
13944 if self.re.search(tag):
13945 results.append((path, tag))
13946 return results
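# Example (editor's illustration, hypothetical data): searching for the
# pattern "web" could return
#   [("/instances/inst1.example.com", "webserver"), ("/nodes/node1", "web")]
# i.e. one (path, tag) pair per matching tag.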
13949 class LUTagsSet(TagsLU):
13950 """Sets a tag on a given object.
13955 def CheckPrereq(self):
13956 """Check prerequisites.
13958 This checks the type and length of the tag name and value.
13961 TagsLU.CheckPrereq(self)
13962 for tag in self.op.tags:
13963 objects.TaggableObject.ValidateTag(tag)
13965 def Exec(self, feedback_fn):
13969 try:
13970 for tag in self.op.tags:
13971 self.target.AddTag(tag)
13972 except errors.TagError, err:
13973 raise errors.OpExecError("Error while setting tag: %s" % str(err))
13974 self.cfg.Update(self.target, feedback_fn)
13977 class LUTagsDel(TagsLU):
13978 """Delete a list of tags from a given object.
13983 def CheckPrereq(self):
13984 """Check prerequisites.
13986 This checks that we have the given tag.
13989 TagsLU.CheckPrereq(self)
13990 for tag in self.op.tags:
13991 objects.TaggableObject.ValidateTag(tag)
13992 del_tags = frozenset(self.op.tags)
13993 cur_tags = self.target.GetTags()
13995 diff_tags = del_tags - cur_tags
13996 if diff_tags:
13997 diff_names = ("'%s'" % i for i in sorted(diff_tags))
13998 raise errors.OpPrereqError("Tag(s) %s not found" %
13999 (utils.CommaJoin(diff_names), ),
14000 errors.ECODE_NOENT)
14002 def Exec(self, feedback_fn):
14003 """Remove the tag from the object.
14006 for tag in self.op.tags:
14007 self.target.RemoveTag(tag)
14008 self.cfg.Update(self.target, feedback_fn)
14011 class LUTestDelay(NoHooksLU):
14012 """Sleep for a specified amount of time.
14014 This LU sleeps on the master and/or nodes for a specified amount of
14015 time.
14020 def ExpandNames(self):
14021 """Expand names and set required locks.
14023 This expands the node list, if any.
14026 self.needed_locks = {}
14027 if self.op.on_nodes:
14028 # _GetWantedNodes can be used here, but is not always appropriate to use
14029 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
14030 # more information.
14031 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
14032 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
14034 def _TestDelay(self):
14035 """Do the actual sleep.
14038 if self.op.on_master:
14039 if not utils.TestDelay(self.op.duration):
14040 raise errors.OpExecError("Error during master delay test")
14041 if self.op.on_nodes:
14042 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
14043 for node, node_result in result.items():
14044 node_result.Raise("Failure during rpc call to node %s" % node)
14046 def Exec(self, feedback_fn):
14047 """Execute the test delay opcode, with the wanted repetitions.
14050 if self.op.repeat == 0:
14051 self._TestDelay()
14052 else:
14053 top_value = self.op.repeat - 1
14054 for i in range(self.op.repeat):
14055 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
14056 self._TestDelay()
14059 class LUTestJqueue(NoHooksLU):
14060 """Utility LU to test some aspects of the job queue.
14065 # Must be lower than default timeout for WaitForJobChange to see whether it
14066 # notices changed jobs
14067 _CLIENT_CONNECT_TIMEOUT = 20.0
14068 _CLIENT_CONFIRM_TIMEOUT = 60.0
14070 @classmethod
14071 def _NotifyUsingSocket(cls, cb, errcls):
14072 """Opens a Unix socket and waits for another program to connect.
14075 @param cb: Callback to send socket name to client
14076 @type errcls: class
14077 @param errcls: Exception class to use for errors
14080 # Using a temporary directory as there's no easy way to create temporary
14081 # sockets without writing a custom loop around tempfile.mktemp and
14082 # socket.bind
14083 tmpdir = tempfile.mkdtemp()
14084 try:
14085 tmpsock = utils.PathJoin(tmpdir, "sock")
14087 logging.debug("Creating temporary socket at %s", tmpsock)
14088 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
14089 try:
14090 sock.bind(tmpsock)
14091 sock.listen(1)
14093 # Send details to client
14094 cb(tmpsock)
14096 # Wait for client to connect before continuing
14097 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
14098 try:
14099 (conn, _) = sock.accept()
14100 except socket.error, err:
14101 raise errcls("Client didn't connect in time (%s)" % err)
14102 finally:
14103 sock.close()
14104 finally:
14105 # Remove as soon as client is connected
14106 shutil.rmtree(tmpdir)
14108 # Wait for client to close
14109 try:
14110 try:
14111 # pylint: disable=E1101
14112 # Instance of '_socketobject' has no ... member
14113 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
14114 conn.recv(1)
14115 except socket.error, err:
14116 raise errcls("Client failed to confirm notification (%s)" % err)
14117 finally:
14118 conn.close()
14120 def _SendNotification(self, test, arg, sockname):
14121 """Sends a notification to the client.
14124 @param test: Test name
14125 @param arg: Test argument (depends on test)
14126 @type sockname: string
14127 @param sockname: Socket path
14130 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
14132 def _Notify(self, prereq, test, arg):
14133 """Notifies the client of a test.
14136 @param prereq: Whether this is a prereq-phase test
14138 @param test: Test name
14139 @param arg: Test argument (depends on test)
14142 if prereq:
14143 errcls = errors.OpPrereqError
14144 else:
14145 errcls = errors.OpExecError
14147 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
14148 test, arg),
14149 errcls)
14151 def CheckArguments(self):
14152 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
14153 self.expandnames_calls = 0
14155 def ExpandNames(self):
14156 checkargs_calls = getattr(self, "checkargs_calls", 0)
14157 if checkargs_calls < 1:
14158 raise errors.ProgrammerError("CheckArguments was not called")
14160 self.expandnames_calls += 1
14162 if self.op.notify_waitlock:
14163 self._Notify(True, constants.JQT_EXPANDNAMES, None)
14165 self.LogInfo("Expanding names")
14167 # Get lock on master node (just to get a lock, not for a particular reason)
14168 self.needed_locks = {
14169 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
14170 }
14172 def Exec(self, feedback_fn):
14173 if self.expandnames_calls < 1:
14174 raise errors.ProgrammerError("ExpandNames was not called")
14176 if self.op.notify_exec:
14177 self._Notify(False, constants.JQT_EXEC, None)
14179 self.LogInfo("Executing")
14181 if self.op.log_messages:
14182 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
14183 for idx, msg in enumerate(self.op.log_messages):
14184 self.LogInfo("Sending log message %s", idx + 1)
14185 feedback_fn(constants.JQT_MSGPREFIX + msg)
14186 # Report how many test messages have been sent
14187 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
14189 if self.op.fail:
14190 raise errors.OpExecError("Opcode failure was requested")
14192 return True
14195 class IAllocator(object):
14196 """IAllocator framework.
14198 An IAllocator instance has four sets of attributes:
14199 - cfg that is needed to query the cluster
14200 - input data (all members of the _KEYS class attribute are required)
14201 - four buffer attributes (in|out_data|text), that represent the
14202 input (to the external script) in text and data structure format,
14203 and the output from it, again in two formats
14204 - the result variables from the script (success, info, result) for
14205 easy usage
14208 # pylint: disable=R0902
14209 # lots of instance attributes
14211 def __init__(self, cfg, rpc_runner, mode, **kwargs):
14212 self.cfg = cfg
14213 self.rpc = rpc_runner
14214 # init buffer variables
14215 self.in_text = self.out_text = self.in_data = self.out_data = None
14216 # init all input fields so that pylint is happy
14217 self.mode = mode
14218 self.memory = self.disks = self.disk_template = None
14219 self.os = self.tags = self.nics = self.vcpus = None
14220 self.hypervisor = None
14221 self.relocate_from = None
14222 self.name = None
14223 self.instances = None
14224 self.evac_mode = None
14225 self.target_groups = []
14227 self.required_nodes = None
14228 # init result fields
14229 self.success = self.info = self.result = None
14231 try:
14232 (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
14233 except KeyError:
14234 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
14235 " IAllocator" % self.mode)
14237 keyset = [n for (n, _) in keydata]
14239 for key in kwargs:
14240 if key not in keyset:
14241 raise errors.ProgrammerError("Invalid input parameter '%s' to"
14242 " IAllocator" % key)
14243 setattr(self, key, kwargs[key])
14245 for key in keyset:
14246 if key not in kwargs:
14247 raise errors.ProgrammerError("Missing input parameter '%s' to"
14248 " IAllocator" % key)
14249 self._BuildInputData(compat.partial(fn, self), keydata)
14251 def _ComputeClusterData(self):
14252 """Compute the generic allocator input data.
14254 This is the data that is independent of the actual operation.
14257 cfg = self.cfg
14258 cluster_info = cfg.GetClusterInfo()
14260 data = {
14261 "version": constants.IALLOCATOR_VERSION,
14262 "cluster_name": cfg.GetClusterName(),
14263 "cluster_tags": list(cluster_info.GetTags()),
14264 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
14265 # we don't have job IDs
14266 }
14267 ninfo = cfg.GetAllNodesInfo()
14268 iinfo = cfg.GetAllInstancesInfo().values()
14269 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
14272 node_list = [n.name for n in ninfo.values() if n.vm_capable]
14274 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
14275 hypervisor_name = self.hypervisor
14276 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
14277 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
14278 else:
14279 hypervisor_name = cluster_info.primary_hypervisor
14281 node_data = self.rpc.call_node_info(node_list, [cfg.GetVGName()],
14282 [hypervisor_name])
14283 node_iinfo = \
14284 self.rpc.call_all_instances_info(node_list,
14285 cluster_info.enabled_hypervisors)
14287 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
14289 config_ndata = self._ComputeBasicNodeData(ninfo)
14290 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
14291 i_list, config_ndata)
14292 assert len(data["nodes"]) == len(ninfo), \
14293 "Incomplete node data computed"
14295 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
14297 self.in_data = data
14299 @staticmethod
14300 def _ComputeNodeGroupData(cfg):
14301 """Compute node groups data.
14304 cluster = cfg.GetClusterInfo()
14305 ng = dict((guuid, {
14306 "name": gdata.name,
14307 "alloc_policy": gdata.alloc_policy,
14308 "ipolicy": _CalculateGroupIPolicy(cluster, gdata),
14309 })
14310 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())
14312 return ng
14314 @staticmethod
14315 def _ComputeBasicNodeData(node_cfg):
14316 """Compute global node data.
14319 @returns: a dict of name: (node dict, node config)
14322 # fill in static (config-based) values
14323 node_results = dict((ninfo.name, {
14324 "tags": list(ninfo.GetTags()),
14325 "primary_ip": ninfo.primary_ip,
14326 "secondary_ip": ninfo.secondary_ip,
14327 "offline": ninfo.offline,
14328 "drained": ninfo.drained,
14329 "master_candidate": ninfo.master_candidate,
14330 "group": ninfo.group,
14331 "master_capable": ninfo.master_capable,
14332 "vm_capable": ninfo.vm_capable,
14333 })
14334 for ninfo in node_cfg.values())
14336 return node_results
14338 @staticmethod
14339 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
14340 node_results):
14341 """Compute global node data.
14343 @param node_results: the basic node structures as filled from the config
14346 #TODO(dynmem): compute the right data on MAX and MIN memory
14347 # make a copy of the current dict
14348 node_results = dict(node_results)
14349 for nname, nresult in node_data.items():
14350 assert nname in node_results, "Missing basic data for node %s" % nname
14351 ninfo = node_cfg[nname]
14353 if not (ninfo.offline or ninfo.drained):
14354 nresult.Raise("Can't get data for node %s" % nname)
14355 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
14356 nname)
14357 remote_info = _MakeLegacyNodeInfo(nresult.payload)
14359 for attr in ["memory_total", "memory_free", "memory_dom0",
14360 "vg_size", "vg_free", "cpu_total"]:
14361 if attr not in remote_info:
14362 raise errors.OpExecError("Node '%s' didn't return attribute"
14363 " '%s'" % (nname, attr))
14364 if not isinstance(remote_info[attr], int):
14365 raise errors.OpExecError("Node '%s' returned invalid value"
14366 " for '%s': %s" %
14367 (nname, attr, remote_info[attr]))
14368 # compute memory used by primary instances
14369 i_p_mem = i_p_up_mem = 0
14370 for iinfo, beinfo in i_list:
14371 if iinfo.primary_node == nname:
14372 i_p_mem += beinfo[constants.BE_MAXMEM]
14373 if iinfo.name not in node_iinfo[nname].payload:
14374 i_used_mem = 0
14375 else:
14376 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
14377 i_mem_diff = beinfo[constants.BE_MAXMEM] - i_used_mem
14378 remote_info["memory_free"] -= max(0, i_mem_diff)
14380 if iinfo.admin_state == constants.ADMINST_UP:
14381 i_p_up_mem += beinfo[constants.BE_MAXMEM]
14383 # compute memory used by instances
14385 "total_memory": remote_info["memory_total"],
14386 "reserved_memory": remote_info["memory_dom0"],
14387 "free_memory": remote_info["memory_free"],
14388 "total_disk": remote_info["vg_size"],
14389 "free_disk": remote_info["vg_free"],
14390 "total_cpus": remote_info["cpu_total"],
14391 "i_pri_memory": i_p_mem,
14392 "i_pri_up_memory": i_p_up_mem,
14393 }
14394 pnr_dyn.update(node_results[nname])
14395 node_results[nname] = pnr_dyn
14397 return node_results
14399 @staticmethod
14400 def _ComputeInstanceData(cluster_info, i_list):
14401 """Compute global instance data.
14404 instance_data = {}
14405 for iinfo, beinfo in i_list:
14406 nic_data = []
14407 for nic in iinfo.nics:
14408 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
14412 "mode": filled_params[constants.NIC_MODE],
14413 "link": filled_params[constants.NIC_LINK],
14414 }
14415 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
14416 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
14417 nic_data.append(nic_dict)
14419 "tags": list(iinfo.GetTags()),
14420 "admin_state": iinfo.admin_state,
14421 "vcpus": beinfo[constants.BE_VCPUS],
14422 "memory": beinfo[constants.BE_MAXMEM],
14424 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
14426 "disks": [{constants.IDISK_SIZE: dsk.size,
14427 constants.IDISK_MODE: dsk.mode}
14428 for dsk in iinfo.disks],
14429 "disk_template": iinfo.disk_template,
14430 "hypervisor": iinfo.hypervisor,
14432 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
14434 instance_data[iinfo.name] = pir
14436 return instance_data
14438 def _AddNewInstance(self):
14439 """Add new instance data to allocator structure.
14441 This in combination with _AllocatorGetClusterData will create the
14442 correct structure needed as input for the allocator.
14444 The checks for the completeness of the opcode must have already been
14445 done.
14448 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
14450 if self.disk_template in constants.DTS_INT_MIRROR:
14451 self.required_nodes = 2
14452 else:
14453 self.required_nodes = 1
14455 request = {
14456 "name": self.name,
14457 "disk_template": self.disk_template,
14458 "tags": self.tags,
14459 "os": self.os,
14460 "vcpus": self.vcpus,
14461 "memory": self.memory,
14462 "disks": self.disks,
14463 "disk_space_total": disk_space,
14465 "required_nodes": self.required_nodes,
14466 "hypervisor": self.hypervisor,
14471 def _AddRelocateInstance(self):
14472 """Add relocate instance data to allocator structure.
14474 This in combination with _IAllocatorGetClusterData will create the
14475 correct structure needed as input for the allocator.
14477 The checks for the completeness of the opcode must have already been
14478 done.
14481 instance = self.cfg.GetInstanceInfo(self.name)
14482 if instance is None:
14483 raise errors.ProgrammerError("Unknown instance '%s' passed to"
14484 " IAllocator" % self.name)
14486 if instance.disk_template not in constants.DTS_MIRRORED:
14487 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
14488 errors.ECODE_INVAL)
14490 if instance.disk_template in constants.DTS_INT_MIRROR and \
14491 len(instance.secondary_nodes) != 1:
14492 raise errors.OpPrereqError("Instance has not exactly one secondary node",
14493 errors.ECODE_STATE)
14495 self.required_nodes = 1
14496 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
14497 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
14501 "disk_space_total": disk_space,
14502 "required_nodes": self.required_nodes,
14503 "relocate_from": self.relocate_from,
14507 def _AddNodeEvacuate(self):
14508 """Get data for node-evacuate requests.
14512 "instances": self.instances,
14513 "evac_mode": self.evac_mode,
14516 def _AddChangeGroup(self):
14517 """Get data for node-evacuate requests.
14521 "instances": self.instances,
14522 "target_groups": self.target_groups,
14525 def _BuildInputData(self, fn, keydata):
14526 """Build input data structures.
14529 self._ComputeClusterData()
14532 request["type"] = self.mode
14533 for keyname, keytype in keydata:
14534 if keyname not in request:
14535 raise errors.ProgrammerError("Request parameter %s is missing" %
14536 keyname)
14537 val = request[keyname]
14538 if not keytype(val):
14539 raise errors.ProgrammerError("Request parameter %s doesn't pass"
14540 " validation, value %s, expected"
14541 " type %s" % (keyname, val, keytype))
14542 self.in_data["request"] = request
14544 self.in_text = serializer.Dump(self.in_data)
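# Editor's sketch (not in the original source): for a change-group request
# the "request" part of in_text would serialize to roughly the following,
# with hypothetical instance names and group UUIDs:
#   "request": {
#     "type": "change-group",
#     "instances": ["inst1.example.com"],
#     "target_groups": ["uuid-of-target-group"],
#     }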
14546 _STRING_LIST = ht.TListOf(ht.TString)
14547 _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
14548 # pylint: disable=E1101
14549 # Class '...' has no 'OP_ID' member
14550 "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
14551 opcodes.OpInstanceMigrate.OP_ID,
14552 opcodes.OpInstanceReplaceDisks.OP_ID])
14553 })))
14555 _NEVAC_MOVED = \
14556 ht.TListOf(ht.TAnd(ht.TIsLength(3),
14557 ht.TItems([ht.TNonEmptyString,
14558 ht.TNonEmptyString,
14559 ht.TListOf(ht.TNonEmptyString),
14560 ])))
14561 _NEVAC_FAILED = \
14562 ht.TListOf(ht.TAnd(ht.TIsLength(2),
14563 ht.TItems([ht.TNonEmptyString,
14564 ht.TMaybeString,
14565 ])))
14566 _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
14567 ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
14569 _MODE_DATA = {
14570 constants.IALLOCATOR_MODE_ALLOC:
14571 (_AddNewInstance,
14572 [
14573 ("name", ht.TString),
14574 ("memory", ht.TInt),
14575 ("disks", ht.TListOf(ht.TDict)),
14576 ("disk_template", ht.TString),
14577 ("os", ht.TString),
14578 ("tags", _STRING_LIST),
14579 ("nics", ht.TListOf(ht.TDict)),
14580 ("vcpus", ht.TInt),
14581 ("hypervisor", ht.TString),
14583 constants.IALLOCATOR_MODE_RELOC:
14584 (_AddRelocateInstance,
14585 [("name", ht.TString), ("relocate_from", _STRING_LIST)],
14587 constants.IALLOCATOR_MODE_NODE_EVAC:
14588 (_AddNodeEvacuate, [
14589 ("instances", _STRING_LIST),
14590 ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
14592 constants.IALLOCATOR_MODE_CHG_GROUP:
14593 (_AddChangeGroup, [
14594 ("instances", _STRING_LIST),
14595 ("target_groups", _STRING_LIST),

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()
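
  # Illustrative sketch (hypothetical test double, not part of this module):
  # call_fn exists so tests can bypass the RPC layer with a stub that mimics
  # the rpc result object's Raise() and payload attributes, e.g.
  #
  #   class _FakeResult:
  #     payload = '{"success": true, "info": "", "result": []}'
  #     def Raise(self, msg):
  #       pass
  #   ial.Run("hail", validate=False,
  #           call_fn=lambda node, name, in_text: _FakeResult())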

  def _ValidateResult(self):
    """Process the allocator results.

    This will process and, if successful, save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not self._result_check(self.result):
      raise errors.OpExecError("Iallocator returned invalid result,"
                               " expected %s, got %s" %
                               (self._result_check, self.result),
                               errors.ECODE_INVAL)

    if self.mode == constants.IALLOCATOR_MODE_RELOC:
      assert self.relocate_from is not None
      assert self.required_nodes == 1

      node2group = dict((name, ndata["group"])
                        for (name, ndata) in self.in_data["nodes"].items())

      fn = compat.partial(self._NodesToGroups, node2group,
                          self.in_data["nodegroups"])

      instance = self.cfg.GetInstanceInfo(self.name)
      request_groups = fn(self.relocate_from + [instance.primary_node])
      result_groups = fn(rdict["result"] + [instance.primary_node])

      if self.success and not set(result_groups).issubset(request_groups):
        raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
                                 " differ from original groups (%s)" %
                                 (utils.CommaJoin(result_groups),
                                  utils.CommaJoin(request_groups)))

    elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES

    self.out_data = rdict

  @staticmethod
  def _NodesToGroups(node2group, groups, nodes):
    """Returns a list of unique group names for a list of nodes.

    @type node2group: dict
    @param node2group: Map from node name to group UUID
    @type groups: dict
    @param groups: Group information
    @type nodes: list
    @param nodes: Node names

    """
    result = set()

    for node in nodes:
      try:
        group_uuid = node2group[node]
      except KeyError:
        # Ignore unknown node
        pass
      else:
        try:
          group = groups[group_uuid]
        except KeyError:
          # Can't find group, let's use UUID
          group_name = group_uuid
        else:
          group_name = group["name"]

        result.add(group_name)

    return sorted(result)
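
  # Illustrative sketch (made-up UUIDs and names): given
  #
  #   node2group = {"node1": "uuid-a", "node2": "uuid-b"}
  #   groups = {"uuid-a": {"name": "group1"}}
  #
  # _NodesToGroups(node2group, groups, ["node1", "node2", "node9"]) returns
  # ["group1", "uuid-b"]: node9 has no group mapping and is skipped, and
  # uuid-b has no entry in groups, so the UUID itself stands in for the name.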


class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode of
    the test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["memory", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            constants.IDISK_SIZE not in row or
            not isinstance(row[constants.IDISK_SIZE], int) or
            constants.IDISK_MODE not in row or
            row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = \
        list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
    elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
                          constants.IALLOCATOR_MODE_NODE_EVAC):
      if not self.op.instances:
        raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
      self.op.instances = _GetWantedInstances(self, self.op.instances)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)

    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       memory=self.op.memory,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       target_groups=self.op.target_groups)
    elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       evac_mode=self.op.evac_mode)
    else:
      raise errors.ProgrammerError("Uncaught mode %s in"
                                   " LUTestAllocator.Exec" % self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result


#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  constants.QR_OS: _OsQuery,
  }

assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP


def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_VIA_OP}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)
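
# Illustrative usage sketch (hedged; the query classes' constructor
# signature is assumed, not shown in this excerpt): query opcodes are
# expected to look up the implementation class and instantiate it with a
# filter, a field list and a locking flag, roughly:
#
#   qcls = _GetQueryImplementation(constants.QR_NODE)
#   impl = qcls(qlang.MakeSimpleFilter("name", ["node1.example.com"]),
#               ["name", "pinst_cnt"], False)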