4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
45 from ganeti import ssh
46 from ganeti import utils
47 from ganeti import errors
48 from ganeti import hypervisor
49 from ganeti import locking
50 from ganeti import constants
51 from ganeti import objects
52 from ganeti import serializer
53 from ganeti import ssconf
54 from ganeti import uidpool
55 from ganeti import compat
56 from ganeti import masterd
57 from ganeti import netutils
58 from ganeti import query
59 from ganeti import qlang
60 from ganeti import opcodes
62 from ganeti import rpc
64 import ganeti.masterd.instance # pylint: disable=W0611
67 #: Size of DRBD meta block device
71 INSTANCE_UP = [constants.ADMINST_UP]
72 INSTANCE_DOWN = [constants.ADMINST_DOWN]
73 INSTANCE_OFFLINE = [constants.ADMINST_OFFLINE]
74 INSTANCE_ONLINE = [constants.ADMINST_DOWN, constants.ADMINST_UP]
75 INSTANCE_NOT_RUNNING = [constants.ADMINST_DOWN, constants.ADMINST_OFFLINE]
79 """Data container for LU results with jobs.
81 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
82 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
83 contained in the C{jobs} attribute and include the job IDs in the opcode
87 def __init__(self, jobs, **kwargs):
88 """Initializes this class.
90 Additional return values can be specified as keyword arguments.
92 @type jobs: list of lists of L{opcode.OpCode}
93 @param jobs: A list of lists of opcode objects
100 class LogicalUnit(object):
101 """Logical Unit base class.
103 Subclasses must follow these rules:
104 - implement ExpandNames
105 - implement CheckPrereq (except when tasklets are used)
106 - implement Exec (except when tasklets are used)
107 - implement BuildHooksEnv
108 - implement BuildHooksNodes
109 - redefine HPATH and HTYPE
110 - optionally redefine their run requirements:
111 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
113 Note that all commands require root permissions.
115 @ivar dry_run_result: the value (if any) that will be returned to the caller
116 in dry-run mode (signalled by opcode dry_run parameter)
123 def __init__(self, processor, op, context, rpc_runner):
124 """Constructor for LogicalUnit.
126 This needs to be overridden in derived classes in order to check op
130 self.proc = processor
132 self.cfg = context.cfg
133 self.glm = context.glm
135 self.owned_locks = context.glm.list_owned
136 self.context = context
137 self.rpc = rpc_runner
138 # Dicts used to declare locking needs to mcpu
139 self.needed_locks = None
140 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
142 self.remove_locks = {}
143 # Used to force good behavior when calling helper functions
144 self.recalculate_locks = {}
146 self.Log = processor.Log # pylint: disable=C0103
147 self.LogWarning = processor.LogWarning # pylint: disable=C0103
148 self.LogInfo = processor.LogInfo # pylint: disable=C0103
149 self.LogStep = processor.LogStep # pylint: disable=C0103
150 # support for dry-run
151 self.dry_run_result = None
152 # support for generic debug attribute
153 if (not hasattr(self.op, "debug_level") or
154 not isinstance(self.op.debug_level, int)):
155 self.op.debug_level = 0
160 # Validate opcode parameters and set defaults
161 self.op.Validate(True)
163 self.CheckArguments()
165 def CheckArguments(self):
166 """Check syntactic validity for the opcode arguments.
168 This method is for doing a simple syntactic check and ensure
169 validity of opcode parameters, without any cluster-related
170 checks. While the same can be accomplished in ExpandNames and/or
171 CheckPrereq, doing these separate is better because:
173 - ExpandNames is left as as purely a lock-related function
174 - CheckPrereq is run after we have acquired locks (and possible
177 The function is allowed to change the self.op attribute so that
178 later methods can no longer worry about missing parameters.
183 def ExpandNames(self):
184 """Expand names for this LU.
186 This method is called before starting to execute the opcode, and it should
187 update all the parameters of the opcode to their canonical form (e.g. a
188 short node name must be fully expanded after this method has successfully
189 completed). This way locking, hooks, logging, etc. can work correctly.
191 LUs which implement this method must also populate the self.needed_locks
192 member, as a dict with lock levels as keys, and a list of needed lock names
195 - use an empty dict if you don't need any lock
196 - if you don't need any lock at a particular level omit that level
197 - don't put anything for the BGL level
198 - if you want all locks at a level use locking.ALL_SET as a value
200 If you need to share locks (rather than acquire them exclusively) at one
201 level you can modify self.share_locks, setting a true value (usually 1) for
202 that level. By default locks are not shared.
204 This function can also define a list of tasklets, which then will be
205 executed in order instead of the usual LU-level CheckPrereq and Exec
206 functions, if those are not defined by the LU.
210 # Acquire all nodes and one instance
211 self.needed_locks = {
212 locking.LEVEL_NODE: locking.ALL_SET,
213 locking.LEVEL_INSTANCE: ['instance1.example.com'],
215 # Acquire just two nodes
216 self.needed_locks = {
217 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
220 self.needed_locks = {} # No, you can't leave it to the default value None
223 # The implementation of this method is mandatory only if the new LU is
224 # concurrent, so that old LUs don't need to be changed all at the same
227 self.needed_locks = {} # Exclusive LUs don't need locks.
229 raise NotImplementedError
231 def DeclareLocks(self, level):
232 """Declare LU locking needs for a level
234 While most LUs can just declare their locking needs at ExpandNames time,
235 sometimes there's the need to calculate some locks after having acquired
236 the ones before. This function is called just before acquiring locks at a
237 particular level, but after acquiring the ones at lower levels, and permits
238 such calculations. It can be used to modify self.needed_locks, and by
239 default it does nothing.
241 This function is only called if you have something already set in
242 self.needed_locks for the level.
244 @param level: Locking level which is going to be locked
245 @type level: member of ganeti.locking.LEVELS
249 def CheckPrereq(self):
250 """Check prerequisites for this LU.
252 This method should check that the prerequisites for the execution
253 of this LU are fulfilled. It can do internode communication, but
254 it should be idempotent - no cluster or system changes are
257 The method should raise errors.OpPrereqError in case something is
258 not fulfilled. Its return value is ignored.
260 This method should also update all the parameters of the opcode to
261 their canonical form if it hasn't been done by ExpandNames before.
264 if self.tasklets is not None:
265 for (idx, tl) in enumerate(self.tasklets):
266 logging.debug("Checking prerequisites for tasklet %s/%s",
267 idx + 1, len(self.tasklets))
272 def Exec(self, feedback_fn):
275 This method should implement the actual work. It should raise
276 errors.OpExecError for failures that are somewhat dealt with in
280 if self.tasklets is not None:
281 for (idx, tl) in enumerate(self.tasklets):
282 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
285 raise NotImplementedError
287 def BuildHooksEnv(self):
288 """Build hooks environment for this LU.
291 @return: Dictionary containing the environment that will be used for
292 running the hooks for this LU. The keys of the dict must not be prefixed
293 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
294 will extend the environment with additional variables. If no environment
295 should be defined, an empty dictionary should be returned (not C{None}).
296 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
300 raise NotImplementedError
302 def BuildHooksNodes(self):
303 """Build list of nodes to run LU's hooks.
305 @rtype: tuple; (list, list)
306 @return: Tuple containing a list of node names on which the hook
307 should run before the execution and a list of node names on which the
308 hook should run after the execution. No nodes should be returned as an
309 empty list (and not None).
310 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
314 raise NotImplementedError
316 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
317 """Notify the LU about the results of its hooks.
319 This method is called every time a hooks phase is executed, and notifies
320 the Logical Unit about the hooks' result. The LU can then use it to alter
321 its result based on the hooks. By default the method does nothing and the
322 previous result is passed back unchanged but any LU can define it if it
323 wants to use the local cluster hook-scripts somehow.
325 @param phase: one of L{constants.HOOKS_PHASE_POST} or
326 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
327 @param hook_results: the results of the multi-node hooks rpc call
328 @param feedback_fn: function used send feedback back to the caller
329 @param lu_result: the previous Exec result this LU had, or None
331 @return: the new Exec result, based on the previous result
335 # API must be kept, thus we ignore the unused argument and could
336 # be a function warnings
337 # pylint: disable=W0613,R0201
340 def _ExpandAndLockInstance(self):
341 """Helper function to expand and lock an instance.
343 Many LUs that work on an instance take its name in self.op.instance_name
344 and need to expand it and then declare the expanded name for locking. This
345 function does it, and then updates self.op.instance_name to the expanded
346 name. It also initializes needed_locks as a dict, if this hasn't been done
350 if self.needed_locks is None:
351 self.needed_locks = {}
353 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
354 "_ExpandAndLockInstance called with instance-level locks set"
355 self.op.instance_name = _ExpandInstanceName(self.cfg,
356 self.op.instance_name)
357 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
359 def _LockInstancesNodes(self, primary_only=False,
360 level=locking.LEVEL_NODE):
361 """Helper function to declare instances' nodes for locking.
363 This function should be called after locking one or more instances to lock
364 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
365 with all primary or secondary nodes for instances already locked and
366 present in self.needed_locks[locking.LEVEL_INSTANCE].
368 It should be called from DeclareLocks, and for safety only works if
369 self.recalculate_locks[locking.LEVEL_NODE] is set.
371 In the future it may grow parameters to just lock some instance's nodes, or
372 to just lock primaries or secondary nodes, if needed.
374 If should be called in DeclareLocks in a way similar to::
376 if level == locking.LEVEL_NODE:
377 self._LockInstancesNodes()
379 @type primary_only: boolean
380 @param primary_only: only lock primary nodes of locked instances
381 @param level: Which lock level to use for locking nodes
384 assert level in self.recalculate_locks, \
385 "_LockInstancesNodes helper function called with no nodes to recalculate"
387 # TODO: check if we're really been called with the instance locks held
389 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
390 # future we might want to have different behaviors depending on the value
391 # of self.recalculate_locks[locking.LEVEL_NODE]
393 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
394 for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
395 wanted_nodes.append(instance.primary_node)
397 wanted_nodes.extend(instance.secondary_nodes)
399 if self.recalculate_locks[level] == constants.LOCKS_REPLACE:
400 self.needed_locks[level] = wanted_nodes
401 elif self.recalculate_locks[level] == constants.LOCKS_APPEND:
402 self.needed_locks[level].extend(wanted_nodes)
404 raise errors.ProgrammerError("Unknown recalculation mode")
406 del self.recalculate_locks[level]
409 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
410 """Simple LU which runs no hooks.
412 This LU is intended as a parent for other LogicalUnits which will
413 run no hooks, in order to reduce duplicate code.
419 def BuildHooksEnv(self):
420 """Empty BuildHooksEnv for NoHooksLu.
422 This just raises an error.
425 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
427 def BuildHooksNodes(self):
428 """Empty BuildHooksNodes for NoHooksLU.
431 raise AssertionError("BuildHooksNodes called for NoHooksLU")
435 """Tasklet base class.
437 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
438 they can mix legacy code with tasklets. Locking needs to be done in the LU,
439 tasklets know nothing about locks.
441 Subclasses must follow these rules:
442 - Implement CheckPrereq
446 def __init__(self, lu):
453 def CheckPrereq(self):
454 """Check prerequisites for this tasklets.
456 This method should check whether the prerequisites for the execution of
457 this tasklet are fulfilled. It can do internode communication, but it
458 should be idempotent - no cluster or system changes are allowed.
460 The method should raise errors.OpPrereqError in case something is not
461 fulfilled. Its return value is ignored.
463 This method should also update all parameters to their canonical form if it
464 hasn't been done before.
469 def Exec(self, feedback_fn):
470 """Execute the tasklet.
472 This method should implement the actual work. It should raise
473 errors.OpExecError for failures that are somewhat dealt with in code, or
477 raise NotImplementedError
481 """Base for query utility classes.
484 #: Attribute holding field definitions
487 def __init__(self, qfilter, fields, use_locking):
488 """Initializes this class.
491 self.use_locking = use_locking
493 self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
495 self.requested_data = self.query.RequestedData()
496 self.names = self.query.RequestedNames()
498 # Sort only if no names were requested
499 self.sort_by_name = not self.names
501 self.do_locking = None
504 def _GetNames(self, lu, all_names, lock_level):
505 """Helper function to determine names asked for in the query.
509 names = lu.owned_locks(lock_level)
513 if self.wanted == locking.ALL_SET:
514 assert not self.names
515 # caller didn't specify names, so ordering is not important
516 return utils.NiceSort(names)
518 # caller specified names and we must keep the same order
520 assert not self.do_locking or lu.glm.is_owned(lock_level)
522 missing = set(self.wanted).difference(names)
524 raise errors.OpExecError("Some items were removed before retrieving"
525 " their data: %s" % missing)
527 # Return expanded names
530 def ExpandNames(self, lu):
531 """Expand names for this query.
533 See L{LogicalUnit.ExpandNames}.
536 raise NotImplementedError()
538 def DeclareLocks(self, lu, level):
539 """Declare locks for this query.
541 See L{LogicalUnit.DeclareLocks}.
544 raise NotImplementedError()
546 def _GetQueryData(self, lu):
547 """Collects all data for this query.
549 @return: Query data object
552 raise NotImplementedError()
554 def NewStyleQuery(self, lu):
555 """Collect data and execute query.
558 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
559 sort_by_name=self.sort_by_name)
561 def OldStyleQuery(self, lu):
562 """Collect data and execute query.
565 return self.query.OldStyleQuery(self._GetQueryData(lu),
566 sort_by_name=self.sort_by_name)
570 """Returns a dict declaring all lock levels shared.
573 return dict.fromkeys(locking.LEVELS, 1)
576 def _MakeLegacyNodeInfo(data):
577 """Formats the data returned by L{rpc.RpcRunner.call_node_info}.
579 Converts the data into a single dictionary. This is fine for most use cases,
580 but some require information from more than one volume group or hypervisor.
583 (bootid, (vg_info, ), (hv_info, )) = data
585 return utils.JoinDisjointDicts(utils.JoinDisjointDicts(vg_info, hv_info), {
590 def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
591 """Checks if the owned node groups are still correct for an instance.
593 @type cfg: L{config.ConfigWriter}
594 @param cfg: The cluster configuration
595 @type instance_name: string
596 @param instance_name: Instance name
597 @type owned_groups: set or frozenset
598 @param owned_groups: List of currently owned node groups
601 inst_groups = cfg.GetInstanceNodeGroups(instance_name)
603 if not owned_groups.issuperset(inst_groups):
604 raise errors.OpPrereqError("Instance %s's node groups changed since"
605 " locks were acquired, current groups are"
606 " are '%s', owning groups '%s'; retry the"
609 utils.CommaJoin(inst_groups),
610 utils.CommaJoin(owned_groups)),
616 def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
617 """Checks if the instances in a node group are still correct.
619 @type cfg: L{config.ConfigWriter}
620 @param cfg: The cluster configuration
621 @type group_uuid: string
622 @param group_uuid: Node group UUID
623 @type owned_instances: set or frozenset
624 @param owned_instances: List of currently owned instances
627 wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
628 if owned_instances != wanted_instances:
629 raise errors.OpPrereqError("Instances in node group '%s' changed since"
630 " locks were acquired, wanted '%s', have '%s';"
631 " retry the operation" %
633 utils.CommaJoin(wanted_instances),
634 utils.CommaJoin(owned_instances)),
637 return wanted_instances
640 def _SupportsOob(cfg, node):
641 """Tells if node supports OOB.
643 @type cfg: L{config.ConfigWriter}
644 @param cfg: The cluster configuration
645 @type node: L{objects.Node}
646 @param node: The node
647 @return: The OOB script if supported or an empty string otherwise
650 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
653 def _GetWantedNodes(lu, nodes):
654 """Returns list of checked and expanded node names.
656 @type lu: L{LogicalUnit}
657 @param lu: the logical unit on whose behalf we execute
659 @param nodes: list of node names or None for all nodes
661 @return: the list of nodes, sorted
662 @raise errors.ProgrammerError: if the nodes parameter is wrong type
666 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
668 return utils.NiceSort(lu.cfg.GetNodeList())
671 def _GetWantedInstances(lu, instances):
672 """Returns list of checked and expanded instance names.
674 @type lu: L{LogicalUnit}
675 @param lu: the logical unit on whose behalf we execute
676 @type instances: list
677 @param instances: list of instance names or None for all instances
679 @return: the list of instances, sorted
680 @raise errors.OpPrereqError: if the instances parameter is wrong type
681 @raise errors.OpPrereqError: if any of the passed instances is not found
685 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
687 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
691 def _GetUpdatedParams(old_params, update_dict,
692 use_default=True, use_none=False):
693 """Return the new version of a parameter dictionary.
695 @type old_params: dict
696 @param old_params: old parameters
697 @type update_dict: dict
698 @param update_dict: dict containing new parameter values, or
699 constants.VALUE_DEFAULT to reset the parameter to its default
701 @param use_default: boolean
702 @type use_default: whether to recognise L{constants.VALUE_DEFAULT}
703 values as 'to be deleted' values
704 @param use_none: boolean
705 @type use_none: whether to recognise C{None} values as 'to be
708 @return: the new parameter dictionary
711 params_copy = copy.deepcopy(old_params)
712 for key, val in update_dict.iteritems():
713 if ((use_default and val == constants.VALUE_DEFAULT) or
714 (use_none and val is None)):
720 params_copy[key] = val
724 def _UpdateAndVerifySubDict(base, updates, type_check):
725 """Updates and verifies a dict with sub dicts of the same type.
727 @param base: The dict with the old data
728 @param updates: The dict with the new data
729 @param type_check: Dict suitable to ForceDictType to verify correct types
730 @returns: A new dict with updated and verified values
734 new = _GetUpdatedParams(old, value)
735 utils.ForceDictType(new, type_check)
738 ret = copy.deepcopy(base)
739 ret.update(dict((key, fn(base.get(key, {}), value))
740 for key, value in updates.items()))
744 def _MergeAndVerifyHvState(op_input, obj_input):
745 """Combines the hv state from an opcode with the one of the object
747 @param op_input: The input dict from the opcode
748 @param obj_input: The input dict from the objects
749 @return: The verified and updated dict
753 invalid_hvs = set(op_input) - constants.HYPER_TYPES
755 raise errors.OpPrereqError("Invalid hypervisor(s) in hypervisor state:"
756 " %s" % utils.CommaJoin(invalid_hvs),
758 if obj_input is None:
760 type_check = constants.HVSTS_PARAMETER_TYPES
761 return _UpdateAndVerifySubDict(obj_input, op_input, type_check)
766 def _MergeAndVerifyDiskState(op_input, obj_input):
767 """Combines the disk state from an opcode with the one of the object
769 @param op_input: The input dict from the opcode
770 @param obj_input: The input dict from the objects
771 @return: The verified and updated dict
774 invalid_dst = set(op_input) - constants.DS_VALID_TYPES
776 raise errors.OpPrereqError("Invalid storage type(s) in disk state: %s" %
777 utils.CommaJoin(invalid_dst),
779 type_check = constants.DSS_PARAMETER_TYPES
780 if obj_input is None:
782 return dict((key, _UpdateAndVerifySubDict(obj_input.get(key, {}), value,
784 for key, value in op_input.items())
789 def _ReleaseLocks(lu, level, names=None, keep=None):
790 """Releases locks owned by an LU.
792 @type lu: L{LogicalUnit}
793 @param level: Lock level
794 @type names: list or None
795 @param names: Names of locks to release
796 @type keep: list or None
797 @param keep: Names of locks to retain
800 assert not (keep is not None and names is not None), \
801 "Only one of the 'names' and the 'keep' parameters can be given"
803 if names is not None:
804 should_release = names.__contains__
806 should_release = lambda name: name not in keep
808 should_release = None
810 owned = lu.owned_locks(level)
812 # Not owning any lock at this level, do nothing
819 # Determine which locks to release
821 if should_release(name):
826 assert len(lu.owned_locks(level)) == (len(retain) + len(release))
828 # Release just some locks
829 lu.glm.release(level, names=release)
831 assert frozenset(lu.owned_locks(level)) == frozenset(retain)
834 lu.glm.release(level)
836 assert not lu.glm.is_owned(level), "No locks should be owned"
839 def _MapInstanceDisksToNodes(instances):
840 """Creates a map from (node, volume) to instance name.
842 @type instances: list of L{objects.Instance}
843 @rtype: dict; tuple of (node name, volume name) as key, instance name as value
846 return dict(((node, vol), inst.name)
847 for inst in instances
848 for (node, vols) in inst.MapLVsByNode().items()
852 def _RunPostHook(lu, node_name):
853 """Runs the post-hook for an opcode on a single node.
856 hm = lu.proc.BuildHooksManager(lu)
858 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
860 # pylint: disable=W0702
861 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
864 def _CheckOutputFields(static, dynamic, selected):
865 """Checks whether all selected fields are valid.
867 @type static: L{utils.FieldSet}
868 @param static: static fields set
869 @type dynamic: L{utils.FieldSet}
870 @param dynamic: dynamic fields set
877 delta = f.NonMatching(selected)
879 raise errors.OpPrereqError("Unknown output fields selected: %s"
880 % ",".join(delta), errors.ECODE_INVAL)
883 def _CheckGlobalHvParams(params):
884 """Validates that given hypervisor params are not global ones.
886 This will ensure that instances don't get customised versions of
890 used_globals = constants.HVC_GLOBALS.intersection(params)
892 msg = ("The following hypervisor parameters are global and cannot"
893 " be customized at instance level, please modify them at"
894 " cluster level: %s" % utils.CommaJoin(used_globals))
895 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
898 def _CheckNodeOnline(lu, node, msg=None):
899 """Ensure that a given node is online.
901 @param lu: the LU on behalf of which we make the check
902 @param node: the node to check
903 @param msg: if passed, should be a message to replace the default one
904 @raise errors.OpPrereqError: if the node is offline
908 msg = "Can't use offline node"
909 if lu.cfg.GetNodeInfo(node).offline:
910 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
913 def _CheckNodeNotDrained(lu, node):
914 """Ensure that a given node is not drained.
916 @param lu: the LU on behalf of which we make the check
917 @param node: the node to check
918 @raise errors.OpPrereqError: if the node is drained
921 if lu.cfg.GetNodeInfo(node).drained:
922 raise errors.OpPrereqError("Can't use drained node %s" % node,
926 def _CheckNodeVmCapable(lu, node):
927 """Ensure that a given node is vm capable.
929 @param lu: the LU on behalf of which we make the check
930 @param node: the node to check
931 @raise errors.OpPrereqError: if the node is not vm capable
934 if not lu.cfg.GetNodeInfo(node).vm_capable:
935 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
939 def _CheckNodeHasOS(lu, node, os_name, force_variant):
940 """Ensure that a node supports a given OS.
942 @param lu: the LU on behalf of which we make the check
943 @param node: the node to check
944 @param os_name: the OS to query about
945 @param force_variant: whether to ignore variant errors
946 @raise errors.OpPrereqError: if the node is not supporting the OS
949 result = lu.rpc.call_os_get(node, os_name)
950 result.Raise("OS '%s' not in supported OS list for node %s" %
952 prereq=True, ecode=errors.ECODE_INVAL)
953 if not force_variant:
954 _CheckOSVariant(result.payload, os_name)
957 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
958 """Ensure that a node has the given secondary ip.
960 @type lu: L{LogicalUnit}
961 @param lu: the LU on behalf of which we make the check
963 @param node: the node to check
964 @type secondary_ip: string
965 @param secondary_ip: the ip to check
966 @type prereq: boolean
967 @param prereq: whether to throw a prerequisite or an execute error
968 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
969 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
972 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
973 result.Raise("Failure checking secondary ip on node %s" % node,
974 prereq=prereq, ecode=errors.ECODE_ENVIRON)
975 if not result.payload:
976 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
977 " please fix and re-run this command" % secondary_ip)
979 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
981 raise errors.OpExecError(msg)
984 def _GetClusterDomainSecret():
985 """Reads the cluster domain secret.
988 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
992 def _CheckInstanceState(lu, instance, req_states, msg=None):
993 """Ensure that an instance is in one of the required states.
995 @param lu: the LU on behalf of which we make the check
996 @param instance: the instance to check
997 @param msg: if passed, should be a message to replace the default one
998 @raise errors.OpPrereqError: if the instance is not in the required state
1002 msg = "can't use instance from outside %s states" % ", ".join(req_states)
1003 if instance.admin_state not in req_states:
1004 raise errors.OpPrereqError("Instance %s is marked to be %s, %s" %
1005 (instance, instance.admin_state, msg),
1008 if constants.ADMINST_UP not in req_states:
1009 pnode = instance.primary_node
1010 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
1011 ins_l.Raise("Can't contact node %s for instance information" % pnode,
1012 prereq=True, ecode=errors.ECODE_ENVIRON)
1014 if instance.name in ins_l.payload:
1015 raise errors.OpPrereqError("Instance %s is running, %s" %
1016 (instance.name, msg), errors.ECODE_STATE)
1019 def _ExpandItemName(fn, name, kind):
1020 """Expand an item name.
1022 @param fn: the function to use for expansion
1023 @param name: requested item name
1024 @param kind: text description ('Node' or 'Instance')
1025 @return: the resolved (full) name
1026 @raise errors.OpPrereqError: if the item is not found
1029 full_name = fn(name)
1030 if full_name is None:
1031 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
1036 def _ExpandNodeName(cfg, name):
1037 """Wrapper over L{_ExpandItemName} for nodes."""
1038 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
1041 def _ExpandInstanceName(cfg, name):
1042 """Wrapper over L{_ExpandItemName} for instance."""
1043 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
1046 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
1047 minmem, maxmem, vcpus, nics, disk_template, disks,
1048 bep, hvp, hypervisor_name, tags):
1049 """Builds instance related env variables for hooks
1051 This builds the hook environment from individual variables.
1054 @param name: the name of the instance
1055 @type primary_node: string
1056 @param primary_node: the name of the instance's primary node
1057 @type secondary_nodes: list
1058 @param secondary_nodes: list of secondary nodes as strings
1059 @type os_type: string
1060 @param os_type: the name of the instance's OS
1061 @type status: string
1062 @param status: the desired status of the instance
1063 @type minmem: string
1064 @param minmem: the minimum memory size of the instance
1065 @type maxmem: string
1066 @param maxmem: the maximum memory size of the instance
1068 @param vcpus: the count of VCPUs the instance has
1070 @param nics: list of tuples (ip, mac, mode, link) representing
1071 the NICs the instance has
1072 @type disk_template: string
1073 @param disk_template: the disk template of the instance
1075 @param disks: the list of (size, mode) pairs
1077 @param bep: the backend parameters for the instance
1079 @param hvp: the hypervisor parameters for the instance
1080 @type hypervisor_name: string
1081 @param hypervisor_name: the hypervisor for the instance
1083 @param tags: list of instance tags as strings
1085 @return: the hook environment for this instance
1090 "INSTANCE_NAME": name,
1091 "INSTANCE_PRIMARY": primary_node,
1092 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
1093 "INSTANCE_OS_TYPE": os_type,
1094 "INSTANCE_STATUS": status,
1095 "INSTANCE_MINMEM": minmem,
1096 "INSTANCE_MAXMEM": maxmem,
1097 # TODO(2.7) remove deprecated "memory" value
1098 "INSTANCE_MEMORY": maxmem,
1099 "INSTANCE_VCPUS": vcpus,
1100 "INSTANCE_DISK_TEMPLATE": disk_template,
1101 "INSTANCE_HYPERVISOR": hypervisor_name,
1104 nic_count = len(nics)
1105 for idx, (ip, mac, mode, link) in enumerate(nics):
1108 env["INSTANCE_NIC%d_IP" % idx] = ip
1109 env["INSTANCE_NIC%d_MAC" % idx] = mac
1110 env["INSTANCE_NIC%d_MODE" % idx] = mode
1111 env["INSTANCE_NIC%d_LINK" % idx] = link
1112 if mode == constants.NIC_MODE_BRIDGED:
1113 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
1117 env["INSTANCE_NIC_COUNT"] = nic_count
1120 disk_count = len(disks)
1121 for idx, (size, mode) in enumerate(disks):
1122 env["INSTANCE_DISK%d_SIZE" % idx] = size
1123 env["INSTANCE_DISK%d_MODE" % idx] = mode
1127 env["INSTANCE_DISK_COUNT"] = disk_count
1132 env["INSTANCE_TAGS"] = " ".join(tags)
1134 for source, kind in [(bep, "BE"), (hvp, "HV")]:
1135 for key, value in source.items():
1136 env["INSTANCE_%s_%s" % (kind, key)] = value
1141 def _NICListToTuple(lu, nics):
1142 """Build a list of nic information tuples.
1144 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
1145 value in LUInstanceQueryData.
1147 @type lu: L{LogicalUnit}
1148 @param lu: the logical unit on whose behalf we execute
1149 @type nics: list of L{objects.NIC}
1150 @param nics: list of nics to convert to hooks tuples
1154 cluster = lu.cfg.GetClusterInfo()
1158 filled_params = cluster.SimpleFillNIC(nic.nicparams)
1159 mode = filled_params[constants.NIC_MODE]
1160 link = filled_params[constants.NIC_LINK]
1161 hooks_nics.append((ip, mac, mode, link))
1165 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
1166 """Builds instance related env variables for hooks from an object.
1168 @type lu: L{LogicalUnit}
1169 @param lu: the logical unit on whose behalf we execute
1170 @type instance: L{objects.Instance}
1171 @param instance: the instance for which we should build the
1173 @type override: dict
1174 @param override: dictionary with key/values that will override
1177 @return: the hook environment dictionary
1180 cluster = lu.cfg.GetClusterInfo()
1181 bep = cluster.FillBE(instance)
1182 hvp = cluster.FillHV(instance)
1184 "name": instance.name,
1185 "primary_node": instance.primary_node,
1186 "secondary_nodes": instance.secondary_nodes,
1187 "os_type": instance.os,
1188 "status": instance.admin_state,
1189 "maxmem": bep[constants.BE_MAXMEM],
1190 "minmem": bep[constants.BE_MINMEM],
1191 "vcpus": bep[constants.BE_VCPUS],
1192 "nics": _NICListToTuple(lu, instance.nics),
1193 "disk_template": instance.disk_template,
1194 "disks": [(disk.size, disk.mode) for disk in instance.disks],
1197 "hypervisor_name": instance.hypervisor,
1198 "tags": instance.tags,
1201 args.update(override)
1202 return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
1205 def _AdjustCandidatePool(lu, exceptions):
1206 """Adjust the candidate pool after node operations.
1209 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1211 lu.LogInfo("Promoted nodes to master candidate role: %s",
1212 utils.CommaJoin(node.name for node in mod_list))
1213 for name in mod_list:
1214 lu.context.ReaddNode(name)
1215 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1217 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1221 def _DecideSelfPromotion(lu, exceptions=None):
1222 """Decide whether I should promote myself as a master candidate.
1225 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1226 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1227 # the new node will increase mc_max with one, so:
1228 mc_should = min(mc_should + 1, cp_size)
1229 return mc_now < mc_should
1232 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1233 """Check that the brigdes needed by a list of nics exist.
1236 cluster = lu.cfg.GetClusterInfo()
1237 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1238 brlist = [params[constants.NIC_LINK] for params in paramslist
1239 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1241 result = lu.rpc.call_bridges_exist(target_node, brlist)
1242 result.Raise("Error checking bridges on destination node '%s'" %
1243 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1246 def _CheckInstanceBridgesExist(lu, instance, node=None):
1247 """Check that the brigdes needed by an instance exist.
1251 node = instance.primary_node
1252 _CheckNicsBridgesExist(lu, instance.nics, node)
1255 def _CheckOSVariant(os_obj, name):
1256 """Check whether an OS name conforms to the os variants specification.
1258 @type os_obj: L{objects.OS}
1259 @param os_obj: OS object to check
1261 @param name: OS name passed by the user, to check for validity
1264 variant = objects.OS.GetVariant(name)
1265 if not os_obj.supported_variants:
1267 raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
1268 " passed)" % (os_obj.name, variant),
1272 raise errors.OpPrereqError("OS name must include a variant",
1275 if variant not in os_obj.supported_variants:
1276 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1279 def _GetNodeInstancesInner(cfg, fn):
1280 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1283 def _GetNodeInstances(cfg, node_name):
1284 """Returns a list of all primary and secondary instances on a node.
1288 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1291 def _GetNodePrimaryInstances(cfg, node_name):
1292 """Returns primary instances on a node.
1295 return _GetNodeInstancesInner(cfg,
1296 lambda inst: node_name == inst.primary_node)
1299 def _GetNodeSecondaryInstances(cfg, node_name):
1300 """Returns secondary instances on a node.
1303 return _GetNodeInstancesInner(cfg,
1304 lambda inst: node_name in inst.secondary_nodes)
1307 def _GetStorageTypeArgs(cfg, storage_type):
1308 """Returns the arguments for a storage type.
1311 # Special case for file storage
1312 if storage_type == constants.ST_FILE:
1313 # storage.FileStorage wants a list of storage directories
1314 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1319 def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq):
1322 for dev in instance.disks:
1323 cfg.SetDiskID(dev, node_name)
1325 result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks)
1326 result.Raise("Failed to get disk status from node %s" % node_name,
1327 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1329 for idx, bdev_status in enumerate(result.payload):
1330 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1336 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1337 """Check the sanity of iallocator and node arguments and use the
1338 cluster-wide iallocator if appropriate.
1340 Check that at most one of (iallocator, node) is specified. If none is
1341 specified, then the LU's opcode's iallocator slot is filled with the
1342 cluster-wide default iallocator.
1344 @type iallocator_slot: string
1345 @param iallocator_slot: the name of the opcode iallocator slot
1346 @type node_slot: string
1347 @param node_slot: the name of the opcode target node slot
1350 node = getattr(lu.op, node_slot, None)
1351 iallocator = getattr(lu.op, iallocator_slot, None)
1353 if node is not None and iallocator is not None:
1354 raise errors.OpPrereqError("Do not specify both, iallocator and node",
1356 elif node is None and iallocator is None:
1357 default_iallocator = lu.cfg.GetDefaultIAllocator()
1358 if default_iallocator:
1359 setattr(lu.op, iallocator_slot, default_iallocator)
1361 raise errors.OpPrereqError("No iallocator or node given and no"
1362 " cluster-wide default iallocator found;"
1363 " please specify either an iallocator or a"
1364 " node, or set a cluster-wide default"
1368 def _GetDefaultIAllocator(cfg, iallocator):
1369 """Decides on which iallocator to use.
1371 @type cfg: L{config.ConfigWriter}
1372 @param cfg: Cluster configuration object
1373 @type iallocator: string or None
1374 @param iallocator: Iallocator specified in opcode
1376 @return: Iallocator name
1380 # Use default iallocator
1381 iallocator = cfg.GetDefaultIAllocator()
1384 raise errors.OpPrereqError("No iallocator was specified, neither in the"
1385 " opcode nor as a cluster-wide default",
1391 class LUClusterPostInit(LogicalUnit):
1392 """Logical unit for running hooks after cluster initialization.
1395 HPATH = "cluster-init"
1396 HTYPE = constants.HTYPE_CLUSTER
1398 def BuildHooksEnv(self):
1403 "OP_TARGET": self.cfg.GetClusterName(),
1406 def BuildHooksNodes(self):
1407 """Build hooks nodes.
1410 return ([], [self.cfg.GetMasterNode()])
1412 def Exec(self, feedback_fn):
1419 class LUClusterDestroy(LogicalUnit):
1420 """Logical unit for destroying the cluster.
1423 HPATH = "cluster-destroy"
1424 HTYPE = constants.HTYPE_CLUSTER
1426 def BuildHooksEnv(self):
1431 "OP_TARGET": self.cfg.GetClusterName(),
1434 def BuildHooksNodes(self):
1435 """Build hooks nodes.
1440 def CheckPrereq(self):
1441 """Check prerequisites.
1443 This checks whether the cluster is empty.
1445 Any errors are signaled by raising errors.OpPrereqError.
1448 master = self.cfg.GetMasterNode()
1450 nodelist = self.cfg.GetNodeList()
1451 if len(nodelist) != 1 or nodelist[0] != master:
1452 raise errors.OpPrereqError("There are still %d node(s) in"
1453 " this cluster." % (len(nodelist) - 1),
1455 instancelist = self.cfg.GetInstanceList()
1457 raise errors.OpPrereqError("There are still %d instance(s) in"
1458 " this cluster." % len(instancelist),
1461 def Exec(self, feedback_fn):
1462 """Destroys the cluster.
1465 master_params = self.cfg.GetMasterNetworkParameters()
1467 # Run post hooks on master node before it's removed
1468 _RunPostHook(self, master_params.name)
1470 ems = self.cfg.GetUseExternalMipScript()
1471 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
1473 result.Raise("Could not disable the master role")
1475 return master_params.name
1478 def _VerifyCertificate(filename):
1479 """Verifies a certificate for L{LUClusterVerifyConfig}.
1481 @type filename: string
1482 @param filename: Path to PEM file
1486 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1487 utils.ReadFile(filename))
1488 except Exception, err: # pylint: disable=W0703
1489 return (LUClusterVerifyConfig.ETYPE_ERROR,
1490 "Failed to load X509 certificate %s: %s" % (filename, err))
1493 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1494 constants.SSL_CERT_EXPIRATION_ERROR)
1497 fnamemsg = "While verifying %s: %s" % (filename, msg)
1502 return (None, fnamemsg)
1503 elif errcode == utils.CERT_WARNING:
1504 return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
1505 elif errcode == utils.CERT_ERROR:
1506 return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
1508 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1511 def _GetAllHypervisorParameters(cluster, instances):
1512 """Compute the set of all hypervisor parameters.
1514 @type cluster: L{objects.Cluster}
1515 @param cluster: the cluster object
1516 @param instances: list of L{objects.Instance}
1517 @param instances: additional instances from which to obtain parameters
1518 @rtype: list of (origin, hypervisor, parameters)
1519 @return: a list with all parameters found, indicating the hypervisor they
1520 apply to, and the origin (can be "cluster", "os X", or "instance Y")
1525 for hv_name in cluster.enabled_hypervisors:
1526 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1528 for os_name, os_hvp in cluster.os_hvp.items():
1529 for hv_name, hv_params in os_hvp.items():
1531 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1532 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1534 # TODO: collapse identical parameter values in a single one
1535 for instance in instances:
1536 if instance.hvparams:
1537 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1538 cluster.FillHV(instance)))
1543 class _VerifyErrors(object):
1544 """Mix-in for cluster/group verify LUs.
1546 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1547 self.op and self._feedback_fn to be available.)
1551 ETYPE_FIELD = "code"
1552 ETYPE_ERROR = "ERROR"
1553 ETYPE_WARNING = "WARNING"
1555 def _Error(self, ecode, item, msg, *args, **kwargs):
1556 """Format an error message.
1558 Based on the opcode's error_codes parameter, either format a
1559 parseable error code, or a simpler error string.
1561 This must be called only from Exec and functions called from Exec.
1564 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1565 itype, etxt, _ = ecode
1566 # first complete the msg
1569 # then format the whole message
1570 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
1571 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1577 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1578 # and finally report it via the feedback_fn
1579 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
1581 def _ErrorIf(self, cond, ecode, *args, **kwargs):
1582 """Log an error message if the passed condition is True.
1586 or self.op.debug_simulate_errors) # pylint: disable=E1101
1588 # If the error code is in the list of ignored errors, demote the error to a
1590 (_, etxt, _) = ecode
1591 if etxt in self.op.ignore_errors: # pylint: disable=E1101
1592 kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING
1595 self._Error(ecode, *args, **kwargs)
1597 # do not mark the operation as failed for WARN cases only
1598 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1599 self.bad = self.bad or cond
1602 class LUClusterVerify(NoHooksLU):
1603 """Submits all jobs necessary to verify the cluster.
1608 def ExpandNames(self):
1609 self.needed_locks = {}
1611 def Exec(self, feedback_fn):
1614 if self.op.group_name:
1615 groups = [self.op.group_name]
1616 depends_fn = lambda: None
1618 groups = self.cfg.GetNodeGroupList()
1620 # Verify global configuration
1622 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
1625 # Always depend on global verification
1626 depends_fn = lambda: [(-len(jobs), [])]
1628 jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
1629 ignore_errors=self.op.ignore_errors,
1630 depends=depends_fn())]
1631 for group in groups)
1633 # Fix up all parameters
1634 for op in itertools.chain(*jobs): # pylint: disable=W0142
1635 op.debug_simulate_errors = self.op.debug_simulate_errors
1636 op.verbose = self.op.verbose
1637 op.error_codes = self.op.error_codes
1639 op.skip_checks = self.op.skip_checks
1640 except AttributeError:
1641 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1643 return ResultWithJobs(jobs)
1646 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1647 """Verifies the cluster config.
1652 def _VerifyHVP(self, hvp_data):
1653 """Verifies locally the syntax of the hypervisor parameters.
1656 for item, hv_name, hv_params in hvp_data:
1657 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
1660 hv_class = hypervisor.GetHypervisor(hv_name)
1661 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1662 hv_class.CheckParameterSyntax(hv_params)
1663 except errors.GenericError, err:
1664 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
1666 def ExpandNames(self):
1667 # Information can be safely retrieved as the BGL is acquired in exclusive
1669 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
1670 self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
1671 self.all_node_info = self.cfg.GetAllNodesInfo()
1672 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1673 self.needed_locks = {}
1675 def Exec(self, feedback_fn):
1676 """Verify integrity of cluster, performing various test on nodes.
1680 self._feedback_fn = feedback_fn
1682 feedback_fn("* Verifying cluster config")
1684 for msg in self.cfg.VerifyConfig():
1685 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
1687 feedback_fn("* Verifying cluster certificate files")
1689 for cert_filename in constants.ALL_CERT_FILES:
1690 (errcode, msg) = _VerifyCertificate(cert_filename)
1691 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)
1693 feedback_fn("* Verifying hypervisor parameters")
1695 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1696 self.all_inst_info.values()))
1698 feedback_fn("* Verifying all nodes belong to an existing group")
1700 # We do this verification here because, should this bogus circumstance
1701 # occur, it would never be caught by VerifyGroup, which only acts on
1702 # nodes/instances reachable from existing node groups.
1704 dangling_nodes = set(node.name for node in self.all_node_info.values()
1705 if node.group not in self.all_group_info)
1707 dangling_instances = {}
1708 no_node_instances = []
1710 for inst in self.all_inst_info.values():
1711 if inst.primary_node in dangling_nodes:
1712 dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
1713 elif inst.primary_node not in self.all_node_info:
1714 no_node_instances.append(inst.name)
1719 utils.CommaJoin(dangling_instances.get(node.name,
1721 for node in dangling_nodes]
1723 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
1725 "the following nodes (and their instances) belong to a non"
1726 " existing group: %s", utils.CommaJoin(pretty_dangling))
1728 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
1730 "the following instances have a non-existing primary-node:"
1731 " %s", utils.CommaJoin(no_node_instances))
1736 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1737 """Verifies the status of a node group.
1740 HPATH = "cluster-verify"
1741 HTYPE = constants.HTYPE_CLUSTER
1744 _HOOKS_INDENT_RE = re.compile("^", re.M)
1746 class NodeImage(object):
1747 """A class representing the logical and physical status of a node.
1750 @ivar name: the node name to which this object refers
1751 @ivar volumes: a structure as returned from
1752 L{ganeti.backend.GetVolumeList} (runtime)
1753 @ivar instances: a list of running instances (runtime)
1754 @ivar pinst: list of configured primary instances (config)
1755 @ivar sinst: list of configured secondary instances (config)
1756 @ivar sbp: dictionary of {primary-node: list of instances} for all
1757 instances for which this node is secondary (config)
1758 @ivar mfree: free memory, as reported by hypervisor (runtime)
1759 @ivar dfree: free disk, as reported by the node (runtime)
1760 @ivar offline: the offline status (config)
1761 @type rpc_fail: boolean
1762 @ivar rpc_fail: whether the RPC verify call was successfull (overall,
1763 not whether the individual keys were correct) (runtime)
1764 @type lvm_fail: boolean
1765 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1766 @type hyp_fail: boolean
1767 @ivar hyp_fail: whether the RPC call didn't return the instance list
1768 @type ghost: boolean
1769 @ivar ghost: whether this is a known node or not (config)
1770 @type os_fail: boolean
1771 @ivar os_fail: whether the RPC call didn't return valid OS data
1773 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1774 @type vm_capable: boolean
1775 @ivar vm_capable: whether the node can host instances
1778 def __init__(self, offline=False, name=None, vm_capable=True):
1787 self.offline = offline
1788 self.vm_capable = vm_capable
1789 self.rpc_fail = False
1790 self.lvm_fail = False
1791 self.hyp_fail = False
1793 self.os_fail = False
1796 def ExpandNames(self):
1797 # This raises errors.OpPrereqError on its own:
1798 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
1800 # Get instances in node group; this is unsafe and needs verification later
1801 inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
1803 self.needed_locks = {
1804 locking.LEVEL_INSTANCE: inst_names,
1805 locking.LEVEL_NODEGROUP: [self.group_uuid],
1806 locking.LEVEL_NODE: [],
1809 self.share_locks = _ShareAll()
1811 def DeclareLocks(self, level):
1812 if level == locking.LEVEL_NODE:
1813 # Get members of node group; this is unsafe and needs verification later
1814 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
1816 all_inst_info = self.cfg.GetAllInstancesInfo()
1818 # In Exec(), we warn about mirrored instances that have primary and
1819 # secondary living in separate node groups. To fully verify that
1820 # volumes for these instances are healthy, we will need to do an
1821 # extra call to their secondaries. We ensure here those nodes will
1823 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
1824 # Important: access only the instances whose lock is owned
1825 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
1826 nodes.update(all_inst_info[inst].secondary_nodes)
1828 self.needed_locks[locking.LEVEL_NODE] = nodes
1830 def CheckPrereq(self):
1831 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1832 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1834 group_nodes = set(self.group_info.members)
1835 group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
1838 group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1840 unlocked_instances = \
1841 group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
1844 raise errors.OpPrereqError("Missing lock for nodes: %s" %
1845 utils.CommaJoin(unlocked_nodes))
1847 if unlocked_instances:
1848 raise errors.OpPrereqError("Missing lock for instances: %s" %
1849 utils.CommaJoin(unlocked_instances))
1851 self.all_node_info = self.cfg.GetAllNodesInfo()
1852 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1854 self.my_node_names = utils.NiceSort(group_nodes)
1855 self.my_inst_names = utils.NiceSort(group_instances)
1857 self.my_node_info = dict((name, self.all_node_info[name])
1858 for name in self.my_node_names)
1860 self.my_inst_info = dict((name, self.all_inst_info[name])
1861 for name in self.my_inst_names)
1863 # We detect here the nodes that will need the extra RPC calls for verifying
1864 # split LV volumes; they should be locked.
1865 extra_lv_nodes = set()
1867 for inst in self.my_inst_info.values():
1868 if inst.disk_template in constants.DTS_INT_MIRROR:
1869 group = self.my_node_info[inst.primary_node].group
1870 for nname in inst.secondary_nodes:
1871 if self.all_node_info[nname].group != group:
1872 extra_lv_nodes.add(nname)
1874 unlocked_lv_nodes = \
1875 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1877 if unlocked_lv_nodes:
1878 raise errors.OpPrereqError("these nodes could be locked: %s" %
1879 utils.CommaJoin(unlocked_lv_nodes))
1880 self.extra_lv_nodes = list(extra_lv_nodes)
1882 def _VerifyNode(self, ninfo, nresult):
1883 """Perform some basic validation on data returned from a node.
1885 - check the result data structure is well formed and has all the
1887 - check ganeti version
1889 @type ninfo: L{objects.Node}
1890 @param ninfo: the node to check
1891 @param nresult: the results from the node
1893 @return: whether overall this call was successful (and we can expect
1894 reasonable values in the respose)
1898 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1900 # main result, nresult should be a non-empty dict
1901 test = not nresult or not isinstance(nresult, dict)
1902 _ErrorIf(test, constants.CV_ENODERPC, node,
1903 "unable to verify node: no data returned")
1907 # compares ganeti version
1908 local_version = constants.PROTOCOL_VERSION
1909 remote_version = nresult.get("version", None)
1910 test = not (remote_version and
1911 isinstance(remote_version, (list, tuple)) and
1912 len(remote_version) == 2)
1913 _ErrorIf(test, constants.CV_ENODERPC, node,
1914 "connection to node returned invalid data")
1918 test = local_version != remote_version[0]
1919 _ErrorIf(test, constants.CV_ENODEVERSION, node,
1920 "incompatible protocol versions: master %s,"
1921 " node %s", local_version, remote_version[0])
1925 # node seems compatible, we can actually try to look into its results
1927 # full package version
1928 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1929 constants.CV_ENODEVERSION, node,
1930 "software version mismatch: master %s, node %s",
1931 constants.RELEASE_VERSION, remote_version[1],
1932 code=self.ETYPE_WARNING)
1934 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1935 if ninfo.vm_capable and isinstance(hyp_result, dict):
1936 for hv_name, hv_result in hyp_result.iteritems():
1937 test = hv_result is not None
1938 _ErrorIf(test, constants.CV_ENODEHV, node,
1939 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1941 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1942 if ninfo.vm_capable and isinstance(hvp_result, list):
1943 for item, hv_name, hv_result in hvp_result:
1944 _ErrorIf(True, constants.CV_ENODEHV, node,
1945 "hypervisor %s parameter verify failure (source %s): %s",
1946 hv_name, item, hv_result)
1948 test = nresult.get(constants.NV_NODESETUP,
1949 ["Missing NODESETUP results"])
1950 _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
1955 def _VerifyNodeTime(self, ninfo, nresult,
1956 nvinfo_starttime, nvinfo_endtime):
1957 """Check the node time.
1959 @type ninfo: L{objects.Node}
1960 @param ninfo: the node to check
1961 @param nresult: the remote results for the node
1962 @param nvinfo_starttime: the start time of the RPC call
1963 @param nvinfo_endtime: the end time of the RPC call
1967 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1969 ntime = nresult.get(constants.NV_TIME, None)
1971 ntime_merged = utils.MergeTime(ntime)
1972 except (ValueError, TypeError):
1973 _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
1976 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1977 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1978 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1979 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1983 _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
1984 "Node time diverges by at least %s from master node time",
1987 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1988 """Check the node LVM results.
1990 @type ninfo: L{objects.Node}
1991 @param ninfo: the node to check
1992 @param nresult: the remote results for the node
1993 @param vg_name: the configured VG name
2000 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2002 # checks vg existence and size > 20G
2003 vglist = nresult.get(constants.NV_VGLIST, None)
2005 _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
2007 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
2008 constants.MIN_VG_SIZE)
2009 _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)
2012 pvlist = nresult.get(constants.NV_PVLIST, None)
2013 test = pvlist is None
2014 _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
2016 # check that ':' is not present in PV names, since it's a
2017 # special character for lvcreate (denotes the range of PEs to
2019 for _, pvname, owner_vg in pvlist:
2020 test = ":" in pvname
2021 _ErrorIf(test, constants.CV_ENODELVM, node,
2022 "Invalid character ':' in PV '%s' of VG '%s'",
2025 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
2026 """Check the node bridges.
2028 @type ninfo: L{objects.Node}
2029 @param ninfo: the node to check
2030 @param nresult: the remote results for the node
2031 @param bridges: the expected list of bridges
2038 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2040 missing = nresult.get(constants.NV_BRIDGES, None)
2041 test = not isinstance(missing, list)
2042 _ErrorIf(test, constants.CV_ENODENET, node,
2043 "did not return valid bridge information")
2045 _ErrorIf(bool(missing), constants.CV_ENODENET, node,
2046 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
2048 def _VerifyNodeUserScripts(self, ninfo, nresult):
2049 """Check the results of user scripts presence and executability on the node
2051 @type ninfo: L{objects.Node}
2052 @param ninfo: the node to check
2053 @param nresult: the remote results for the node
2058 test = not constants.NV_USERSCRIPTS in nresult
2059 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, node,
2060 "did not return user scripts information")
2062 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
2064 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, node,
2065 "user scripts not present or not executable: %s" %
2066 utils.CommaJoin(sorted(broken_scripts)))
2068 def _VerifyNodeNetwork(self, ninfo, nresult):
2069 """Check the node network connectivity results.
2071 @type ninfo: L{objects.Node}
2072 @param ninfo: the node to check
2073 @param nresult: the remote results for the node
2077 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2079 test = constants.NV_NODELIST not in nresult
2080 _ErrorIf(test, constants.CV_ENODESSH, node,
2081 "node hasn't returned node ssh connectivity data")
2083 if nresult[constants.NV_NODELIST]:
2084 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
2085 _ErrorIf(True, constants.CV_ENODESSH, node,
2086 "ssh communication with node '%s': %s", a_node, a_msg)
2088 test = constants.NV_NODENETTEST not in nresult
2089 _ErrorIf(test, constants.CV_ENODENET, node,
2090 "node hasn't returned node tcp connectivity data")
2092 if nresult[constants.NV_NODENETTEST]:
2093 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
2095 _ErrorIf(True, constants.CV_ENODENET, node,
2096 "tcp communication with node '%s': %s",
2097 anode, nresult[constants.NV_NODENETTEST][anode])
2099 test = constants.NV_MASTERIP not in nresult
2100 _ErrorIf(test, constants.CV_ENODENET, node,
2101 "node hasn't returned node master IP reachability data")
2103 if not nresult[constants.NV_MASTERIP]:
2104 if node == self.master_node:
2105 msg = "the master node cannot reach the master IP (not configured?)"
2107 msg = "cannot reach the master IP"
2108 _ErrorIf(True, constants.CV_ENODENET, node, msg)
2110 def _VerifyInstance(self, instance, instanceconfig, node_image,
2112 """Verify an instance.
2114 This function checks to see if the required block devices are
2115 available on the instance's node.
2118 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2119 node_current = instanceconfig.primary_node
2121 node_vol_should = {}
2122 instanceconfig.MapLVsByNode(node_vol_should)
2124 for node in node_vol_should:
2125 n_img = node_image[node]
2126 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2127 # ignore missing volumes on offline or broken nodes
2129 for volume in node_vol_should[node]:
2130 test = volume not in n_img.volumes
2131 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
2132 "volume %s missing on node %s", volume, node)
2134 if instanceconfig.admin_state == constants.ADMINST_UP:
2135 pri_img = node_image[node_current]
2136 test = instance not in pri_img.instances and not pri_img.offline
2137 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
2138 "instance not running on its primary node %s",
2141 diskdata = [(nname, success, status, idx)
2142 for (nname, disks) in diskstatus.items()
2143 for idx, (success, status) in enumerate(disks)]
2145 for nname, success, bdev_status, idx in diskdata:
2146 # the 'ghost node' construction in Exec() ensures that we have a
2148 snode = node_image[nname]
2149 bad_snode = snode.ghost or snode.offline
2150 _ErrorIf(instanceconfig.admin_state == constants.ADMINST_UP and
2151 not success and not bad_snode,
2152 constants.CV_EINSTANCEFAULTYDISK, instance,
2153 "couldn't retrieve status for disk/%s on %s: %s",
2154 idx, nname, bdev_status)
2155 _ErrorIf((instanceconfig.admin_state == constants.ADMINST_UP and
2156 success and bdev_status.ldisk_status == constants.LDS_FAULTY),
2157 constants.CV_EINSTANCEFAULTYDISK, instance,
2158 "disk/%s on %s is faulty", idx, nname)
2160 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2161 """Verify if there are any unknown volumes in the cluster.
2163 The .os, .swap and backup volumes are ignored. All other volumes are
2164 reported as unknown.
2166 @type reserved: L{ganeti.utils.FieldSet}
2167 @param reserved: a FieldSet of reserved volume names
2170 for node, n_img in node_image.items():
2171 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2172 # skip non-healthy nodes
2174 for volume in n_img.volumes:
2175 test = ((node not in node_vol_should or
2176 volume not in node_vol_should[node]) and
2177 not reserved.Matches(volume))
2178 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2179 "volume %s is unknown", volume)
2181 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2182 """Verify N+1 Memory Resilience.
2184 Check that if one single node dies we can still start all the
2185 instances it was primary for.
2188 cluster_info = self.cfg.GetClusterInfo()
2189 for node, n_img in node_image.items():
2190 # This code checks that every node which is now listed as
2191 # secondary has enough memory to host all instances it is
2192 # supposed to should a single other node in the cluster fail.
2193 # FIXME: not ready for failover to an arbitrary node
2194 # FIXME: does not support file-backed instances
2195 # WARNING: we currently take into account down instances as well
2196 # as up ones, considering that even if they're down someone
2197 # might want to start them even in the event of a node failure.
2199 # we're skipping offline nodes from the N+1 warning, since
2200 # most likely we don't have good memory infromation from them;
2201 # we already list instances living on such nodes, and that's
2204 #TODO(dynmem): use MINMEM for checking
2205 #TODO(dynmem): also consider ballooning out other instances
2206 for prinode, instances in n_img.sbp.items():
2208 for instance in instances:
2209 bep = cluster_info.FillBE(instance_cfg[instance])
2210 if bep[constants.BE_AUTO_BALANCE]:
2211 needed_mem += bep[constants.BE_MAXMEM]
2212 test = n_img.mfree < needed_mem
2213 self._ErrorIf(test, constants.CV_ENODEN1, node,
2214 "not enough memory to accomodate instance failovers"
2215 " should node %s fail (%dMiB needed, %dMiB available)",
2216 prinode, needed_mem, n_img.mfree)
2219 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2220 (files_all, files_opt, files_mc, files_vm)):
2221 """Verifies file checksums collected from all nodes.
2223 @param errorif: Callback for reporting errors
2224 @param nodeinfo: List of L{objects.Node} objects
2225 @param master_node: Name of master node
2226 @param all_nvinfo: RPC results
2229 # Define functions determining which nodes to consider for a file
2232 (files_mc, lambda node: (node.master_candidate or
2233 node.name == master_node)),
2234 (files_vm, lambda node: node.vm_capable),
2237 # Build mapping from filename to list of nodes which should have the file
2239 for (files, fn) in files2nodefn:
2241 filenodes = nodeinfo
2243 filenodes = filter(fn, nodeinfo)
2244 nodefiles.update((filename,
2245 frozenset(map(operator.attrgetter("name"), filenodes)))
2246 for filename in files)
2248 assert set(nodefiles) == (files_all | files_mc | files_vm)
2250 fileinfo = dict((filename, {}) for filename in nodefiles)
2251 ignore_nodes = set()
2253 for node in nodeinfo:
2255 ignore_nodes.add(node.name)
2258 nresult = all_nvinfo[node.name]
2260 if nresult.fail_msg or not nresult.payload:
2263 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2265 test = not (node_files and isinstance(node_files, dict))
2266 errorif(test, constants.CV_ENODEFILECHECK, node.name,
2267 "Node did not return file checksum data")
2269 ignore_nodes.add(node.name)
2272 # Build per-checksum mapping from filename to nodes having it
2273 for (filename, checksum) in node_files.items():
2274 assert filename in nodefiles
2275 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2277 for (filename, checksums) in fileinfo.items():
2278 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2280 # Nodes having the file
2281 with_file = frozenset(node_name
2282 for nodes in fileinfo[filename].values()
2283 for node_name in nodes) - ignore_nodes
2285 expected_nodes = nodefiles[filename] - ignore_nodes
2287 # Nodes missing file
2288 missing_file = expected_nodes - with_file
2290 if filename in files_opt:
2292 errorif(missing_file and missing_file != expected_nodes,
2293 constants.CV_ECLUSTERFILECHECK, None,
2294 "File %s is optional, but it must exist on all or no"
2295 " nodes (not found on %s)",
2296 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2298 errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2299 "File %s is missing from node(s) %s", filename,
2300 utils.CommaJoin(utils.NiceSort(missing_file)))
2302 # Warn if a node has a file it shouldn't
2303 unexpected = with_file - expected_nodes
2305 constants.CV_ECLUSTERFILECHECK, None,
2306 "File %s should not exist on node(s) %s",
2307 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2309 # See if there are multiple versions of the file
2310 test = len(checksums) > 1
2312 variants = ["variant %s on %s" %
2313 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2314 for (idx, (checksum, nodes)) in
2315 enumerate(sorted(checksums.items()))]
2319 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2320 "File %s found with %s different checksums (%s)",
2321 filename, len(checksums), "; ".join(variants))
2323 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2325 """Verifies and the node DRBD status.
2327 @type ninfo: L{objects.Node}
2328 @param ninfo: the node to check
2329 @param nresult: the remote results for the node
2330 @param instanceinfo: the dict of instances
2331 @param drbd_helper: the configured DRBD usermode helper
2332 @param drbd_map: the DRBD map as returned by
2333 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2337 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2340 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2341 test = (helper_result == None)
2342 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2343 "no drbd usermode helper returned")
2345 status, payload = helper_result
2347 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2348 "drbd usermode helper check unsuccessful: %s", payload)
2349 test = status and (payload != drbd_helper)
2350 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2351 "wrong drbd usermode helper: %s", payload)
2353 # compute the DRBD minors
2355 for minor, instance in drbd_map[node].items():
2356 test = instance not in instanceinfo
2357 _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2358 "ghost instance '%s' in temporary DRBD map", instance)
2359 # ghost instance should not be running, but otherwise we
2360 # don't give double warnings (both ghost instance and
2361 # unallocated minor in use)
2363 node_drbd[minor] = (instance, False)
2365 instance = instanceinfo[instance]
2366 node_drbd[minor] = (instance.name,
2367 instance.admin_state == constants.ADMINST_UP)
2369 # and now check them
2370 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2371 test = not isinstance(used_minors, (tuple, list))
2372 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2373 "cannot parse drbd status file: %s", str(used_minors))
2375 # we cannot check drbd status
2378 for minor, (iname, must_exist) in node_drbd.items():
2379 test = minor not in used_minors and must_exist
2380 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2381 "drbd minor %d of instance %s is not active", minor, iname)
2382 for minor in used_minors:
2383 test = minor not in node_drbd
2384 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2385 "unallocated drbd minor %d is in use", minor)
2387 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2388 """Builds the node OS structures.
2390 @type ninfo: L{objects.Node}
2391 @param ninfo: the node to check
2392 @param nresult: the remote results for the node
2393 @param nimg: the node image object
2397 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2399 remote_os = nresult.get(constants.NV_OSLIST, None)
2400 test = (not isinstance(remote_os, list) or
2401 not compat.all(isinstance(v, list) and len(v) == 7
2402 for v in remote_os))
2404 _ErrorIf(test, constants.CV_ENODEOS, node,
2405 "node hasn't returned valid OS data")
2414 for (name, os_path, status, diagnose,
2415 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2417 if name not in os_dict:
2420 # parameters is a list of lists instead of list of tuples due to
2421 # JSON lacking a real tuple type, fix it:
2422 parameters = [tuple(v) for v in parameters]
2423 os_dict[name].append((os_path, status, diagnose,
2424 set(variants), set(parameters), set(api_ver)))
2426 nimg.oslist = os_dict
2428 def _VerifyNodeOS(self, ninfo, nimg, base):
2429 """Verifies the node OS list.
2431 @type ninfo: L{objects.Node}
2432 @param ninfo: the node to check
2433 @param nimg: the node image object
2434 @param base: the 'template' node we match against (e.g. from the master)
2438 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2440 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2442 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2443 for os_name, os_data in nimg.oslist.items():
2444 assert os_data, "Empty OS status for OS %s?!" % os_name
2445 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2446 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2447 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2448 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2449 "OS '%s' has multiple entries (first one shadows the rest): %s",
2450 os_name, utils.CommaJoin([v[0] for v in os_data]))
2451 # comparisons with the 'base' image
2452 test = os_name not in base.oslist
2453 _ErrorIf(test, constants.CV_ENODEOS, node,
2454 "Extra OS %s not present on reference node (%s)",
2458 assert base.oslist[os_name], "Base node has empty OS status?"
2459 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2461 # base OS is invalid, skipping
2463 for kind, a, b in [("API version", f_api, b_api),
2464 ("variants list", f_var, b_var),
2465 ("parameters", beautify_params(f_param),
2466 beautify_params(b_param))]:
2467 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2468 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2469 kind, os_name, base.name,
2470 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2472 # check any missing OSes
2473 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2474 _ErrorIf(missing, constants.CV_ENODEOS, node,
2475 "OSes present on reference node %s but missing on this node: %s",
2476 base.name, utils.CommaJoin(missing))
2478 def _VerifyOob(self, ninfo, nresult):
2479 """Verifies out of band functionality of a node.
2481 @type ninfo: L{objects.Node}
2482 @param ninfo: the node to check
2483 @param nresult: the remote results for the node
2487 # We just have to verify the paths on master and/or master candidates
2488 # as the oob helper is invoked on the master
2489 if ((ninfo.master_candidate or ninfo.master_capable) and
2490 constants.NV_OOB_PATHS in nresult):
2491 for path_result in nresult[constants.NV_OOB_PATHS]:
2492 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2494 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2495 """Verifies and updates the node volume data.
2497 This function will update a L{NodeImage}'s internal structures
2498 with data from the remote call.
2500 @type ninfo: L{objects.Node}
2501 @param ninfo: the node to check
2502 @param nresult: the remote results for the node
2503 @param nimg: the node image object
2504 @param vg_name: the configured VG name
2508 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2510 nimg.lvm_fail = True
2511 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2514 elif isinstance(lvdata, basestring):
2515 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2516 utils.SafeEncode(lvdata))
2517 elif not isinstance(lvdata, dict):
2518 _ErrorIf(True, constants.CV_ENODELVM, node,
2519 "rpc call to node failed (lvlist)")
2521 nimg.volumes = lvdata
2522 nimg.lvm_fail = False
2524 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2525 """Verifies and updates the node instance list.
2527 If the listing was successful, then updates this node's instance
2528 list. Otherwise, it marks the RPC call as failed for the instance
2531 @type ninfo: L{objects.Node}
2532 @param ninfo: the node to check
2533 @param nresult: the remote results for the node
2534 @param nimg: the node image object
2537 idata = nresult.get(constants.NV_INSTANCELIST, None)
2538 test = not isinstance(idata, list)
2539 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2540 "rpc call to node failed (instancelist): %s",
2541 utils.SafeEncode(str(idata)))
2543 nimg.hyp_fail = True
2545 nimg.instances = idata
2547 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2548 """Verifies and computes a node information map
2550 @type ninfo: L{objects.Node}
2551 @param ninfo: the node to check
2552 @param nresult: the remote results for the node
2553 @param nimg: the node image object
2554 @param vg_name: the configured VG name
2558 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2560 # try to read free memory (from the hypervisor)
2561 hv_info = nresult.get(constants.NV_HVINFO, None)
2562 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2563 _ErrorIf(test, constants.CV_ENODEHV, node,
2564 "rpc call to node failed (hvinfo)")
2567 nimg.mfree = int(hv_info["memory_free"])
2568 except (ValueError, TypeError):
2569 _ErrorIf(True, constants.CV_ENODERPC, node,
2570 "node returned invalid nodeinfo, check hypervisor")
2572 # FIXME: devise a free space model for file based instances as well
2573 if vg_name is not None:
2574 test = (constants.NV_VGLIST not in nresult or
2575 vg_name not in nresult[constants.NV_VGLIST])
2576 _ErrorIf(test, constants.CV_ENODELVM, node,
2577 "node didn't return data for the volume group '%s'"
2578 " - it is either missing or broken", vg_name)
2581 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2582 except (ValueError, TypeError):
2583 _ErrorIf(True, constants.CV_ENODERPC, node,
2584 "node returned invalid LVM info, check LVM status")
2586 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2587 """Gets per-disk status information for all instances.
2589 @type nodelist: list of strings
2590 @param nodelist: Node names
2591 @type node_image: dict of (name, L{objects.Node})
2592 @param node_image: Node objects
2593 @type instanceinfo: dict of (name, L{objects.Instance})
2594 @param instanceinfo: Instance objects
2595 @rtype: {instance: {node: [(succes, payload)]}}
2596 @return: a dictionary of per-instance dictionaries with nodes as
2597 keys and disk information as values; the disk information is a
2598 list of tuples (success, payload)
2601 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2604 node_disks_devonly = {}
2605 diskless_instances = set()
2606 diskless = constants.DT_DISKLESS
2608 for nname in nodelist:
2609 node_instances = list(itertools.chain(node_image[nname].pinst,
2610 node_image[nname].sinst))
2611 diskless_instances.update(inst for inst in node_instances
2612 if instanceinfo[inst].disk_template == diskless)
2613 disks = [(inst, disk)
2614 for inst in node_instances
2615 for disk in instanceinfo[inst].disks]
2618 # No need to collect data
2621 node_disks[nname] = disks
2623 # Creating copies as SetDiskID below will modify the objects and that can
2624 # lead to incorrect data returned from nodes
2625 devonly = [dev.Copy() for (_, dev) in disks]
2628 self.cfg.SetDiskID(dev, nname)
2630 node_disks_devonly[nname] = devonly
2632 assert len(node_disks) == len(node_disks_devonly)
2634 # Collect data from all nodes with disks
2635 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2638 assert len(result) == len(node_disks)
2642 for (nname, nres) in result.items():
2643 disks = node_disks[nname]
2646 # No data from this node
2647 data = len(disks) * [(False, "node offline")]
2650 _ErrorIf(msg, constants.CV_ENODERPC, nname,
2651 "while getting disk information: %s", msg)
2653 # No data from this node
2654 data = len(disks) * [(False, msg)]
2657 for idx, i in enumerate(nres.payload):
2658 if isinstance(i, (tuple, list)) and len(i) == 2:
2661 logging.warning("Invalid result from node %s, entry %d: %s",
2663 data.append((False, "Invalid result from the remote node"))
2665 for ((inst, _), status) in zip(disks, data):
2666 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2668 # Add empty entries for diskless instances.
2669 for inst in diskless_instances:
2670 assert inst not in instdisk
2673 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2674 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2675 compat.all(isinstance(s, (tuple, list)) and
2676 len(s) == 2 for s in statuses)
2677 for inst, nnames in instdisk.items()
2678 for nname, statuses in nnames.items())
2679 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
2684 def _SshNodeSelector(group_uuid, all_nodes):
2685 """Create endless iterators for all potential SSH check hosts.
2688 nodes = [node for node in all_nodes
2689 if (node.group != group_uuid and
2691 keyfunc = operator.attrgetter("group")
2693 return map(itertools.cycle,
2694 [sorted(map(operator.attrgetter("name"), names))
2695 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2699 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2700 """Choose which nodes should talk to which other nodes.
2702 We will make nodes contact all nodes in their group, and one node from
2705 @warning: This algorithm has a known issue if one node group is much
2706 smaller than others (e.g. just one node). In such a case all other
2707 nodes will talk to the single node.
2710 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2711 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2713 return (online_nodes,
2714 dict((name, sorted([i.next() for i in sel]))
2715 for name in online_nodes))
2717 def BuildHooksEnv(self):
2720 Cluster-Verify hooks just ran in the post phase and their failure makes
2721 the output be logged in the verify output and the verification to fail.
2725 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2728 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2729 for node in self.my_node_info.values())
2733 def BuildHooksNodes(self):
2734 """Build hooks nodes.
2737 return ([], self.my_node_names)
2739 def Exec(self, feedback_fn):
2740 """Verify integrity of the node group, performing various test on nodes.
2743 # This method has too many local variables. pylint: disable=R0914
2744 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2746 if not self.my_node_names:
2748 feedback_fn("* Empty node group, skipping verification")
2752 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2753 verbose = self.op.verbose
2754 self._feedback_fn = feedback_fn
2756 vg_name = self.cfg.GetVGName()
2757 drbd_helper = self.cfg.GetDRBDHelper()
2758 cluster = self.cfg.GetClusterInfo()
2759 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2760 hypervisors = cluster.enabled_hypervisors
2761 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2763 i_non_redundant = [] # Non redundant instances
2764 i_non_a_balanced = [] # Non auto-balanced instances
2765 i_offline = 0 # Count of offline instances
2766 n_offline = 0 # Count of offline nodes
2767 n_drained = 0 # Count of nodes being drained
2768 node_vol_should = {}
2770 # FIXME: verify OS list
2773 filemap = _ComputeAncillaryFiles(cluster, False)
2775 # do local checksums
2776 master_node = self.master_node = self.cfg.GetMasterNode()
2777 master_ip = self.cfg.GetMasterIP()
2779 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2782 if self.cfg.GetUseExternalMipScript():
2783 user_scripts.append(constants.EXTERNAL_MASTER_SETUP_SCRIPT)
2785 node_verify_param = {
2786 constants.NV_FILELIST:
2787 utils.UniqueSequence(filename
2788 for files in filemap
2789 for filename in files),
2790 constants.NV_NODELIST:
2791 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2792 self.all_node_info.values()),
2793 constants.NV_HYPERVISOR: hypervisors,
2794 constants.NV_HVPARAMS:
2795 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2796 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2797 for node in node_data_list
2798 if not node.offline],
2799 constants.NV_INSTANCELIST: hypervisors,
2800 constants.NV_VERSION: None,
2801 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2802 constants.NV_NODESETUP: None,
2803 constants.NV_TIME: None,
2804 constants.NV_MASTERIP: (master_node, master_ip),
2805 constants.NV_OSLIST: None,
2806 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2807 constants.NV_USERSCRIPTS: user_scripts,
2810 if vg_name is not None:
2811 node_verify_param[constants.NV_VGLIST] = None
2812 node_verify_param[constants.NV_LVLIST] = vg_name
2813 node_verify_param[constants.NV_PVLIST] = [vg_name]
2814 node_verify_param[constants.NV_DRBDLIST] = None
2817 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2820 # FIXME: this needs to be changed per node-group, not cluster-wide
2822 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2823 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2824 bridges.add(default_nicpp[constants.NIC_LINK])
2825 for instance in self.my_inst_info.values():
2826 for nic in instance.nics:
2827 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2828 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2829 bridges.add(full_nic[constants.NIC_LINK])
2832 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2834 # Build our expected cluster state
2835 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2837 vm_capable=node.vm_capable))
2838 for node in node_data_list)
2842 for node in self.all_node_info.values():
2843 path = _SupportsOob(self.cfg, node)
2844 if path and path not in oob_paths:
2845 oob_paths.append(path)
2848 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2850 for instance in self.my_inst_names:
2851 inst_config = self.my_inst_info[instance]
2853 for nname in inst_config.all_nodes:
2854 if nname not in node_image:
2855 gnode = self.NodeImage(name=nname)
2856 gnode.ghost = (nname not in self.all_node_info)
2857 node_image[nname] = gnode
2859 inst_config.MapLVsByNode(node_vol_should)
2861 pnode = inst_config.primary_node
2862 node_image[pnode].pinst.append(instance)
2864 for snode in inst_config.secondary_nodes:
2865 nimg = node_image[snode]
2866 nimg.sinst.append(instance)
2867 if pnode not in nimg.sbp:
2868 nimg.sbp[pnode] = []
2869 nimg.sbp[pnode].append(instance)
2871 # At this point, we have the in-memory data structures complete,
2872 # except for the runtime information, which we'll gather next
2874 # Due to the way our RPC system works, exact response times cannot be
2875 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2876 # time before and after executing the request, we can at least have a time
2878 nvinfo_starttime = time.time()
2879 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2881 self.cfg.GetClusterName())
2882 nvinfo_endtime = time.time()
2884 if self.extra_lv_nodes and vg_name is not None:
2886 self.rpc.call_node_verify(self.extra_lv_nodes,
2887 {constants.NV_LVLIST: vg_name},
2888 self.cfg.GetClusterName())
2890 extra_lv_nvinfo = {}
2892 all_drbd_map = self.cfg.ComputeDRBDMap()
2894 feedback_fn("* Gathering disk information (%s nodes)" %
2895 len(self.my_node_names))
2896 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2899 feedback_fn("* Verifying configuration file consistency")
2901 # If not all nodes are being checked, we need to make sure the master node
2902 # and a non-checked vm_capable node are in the list.
2903 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2905 vf_nvinfo = all_nvinfo.copy()
2906 vf_node_info = list(self.my_node_info.values())
2907 additional_nodes = []
2908 if master_node not in self.my_node_info:
2909 additional_nodes.append(master_node)
2910 vf_node_info.append(self.all_node_info[master_node])
2911 # Add the first vm_capable node we find which is not included
2912 for node in absent_nodes:
2913 nodeinfo = self.all_node_info[node]
2914 if nodeinfo.vm_capable and not nodeinfo.offline:
2915 additional_nodes.append(node)
2916 vf_node_info.append(self.all_node_info[node])
2918 key = constants.NV_FILELIST
2919 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2920 {key: node_verify_param[key]},
2921 self.cfg.GetClusterName()))
2923 vf_nvinfo = all_nvinfo
2924 vf_node_info = self.my_node_info.values()
2926 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2928 feedback_fn("* Verifying node status")
2932 for node_i in node_data_list:
2934 nimg = node_image[node]
2938 feedback_fn("* Skipping offline node %s" % (node,))
2942 if node == master_node:
2944 elif node_i.master_candidate:
2945 ntype = "master candidate"
2946 elif node_i.drained:
2952 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2954 msg = all_nvinfo[node].fail_msg
2955 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
2958 nimg.rpc_fail = True
2961 nresult = all_nvinfo[node].payload
2963 nimg.call_ok = self._VerifyNode(node_i, nresult)
2964 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2965 self._VerifyNodeNetwork(node_i, nresult)
2966 self._VerifyNodeUserScripts(node_i, nresult)
2967 self._VerifyOob(node_i, nresult)
2970 self._VerifyNodeLVM(node_i, nresult, vg_name)
2971 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2974 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2975 self._UpdateNodeInstances(node_i, nresult, nimg)
2976 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2977 self._UpdateNodeOS(node_i, nresult, nimg)
2979 if not nimg.os_fail:
2980 if refos_img is None:
2982 self._VerifyNodeOS(node_i, nimg, refos_img)
2983 self._VerifyNodeBridges(node_i, nresult, bridges)
2985 # Check whether all running instancies are primary for the node. (This
2986 # can no longer be done from _VerifyInstance below, since some of the
2987 # wrong instances could be from other node groups.)
2988 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2990 for inst in non_primary_inst:
2991 # FIXME: investigate best way to handle offline insts
2992 if inst.admin_state == constants.ADMINST_OFFLINE:
2994 feedback_fn("* Skipping offline instance %s" % inst.name)
2997 test = inst in self.all_inst_info
2998 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
2999 "instance should not run on node %s", node_i.name)
3000 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
3001 "node is running unknown instance %s", inst)
3003 for node, result in extra_lv_nvinfo.items():
3004 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
3005 node_image[node], vg_name)
3007 feedback_fn("* Verifying instance status")
3008 for instance in self.my_inst_names:
3010 feedback_fn("* Verifying instance %s" % instance)
3011 inst_config = self.my_inst_info[instance]
3012 self._VerifyInstance(instance, inst_config, node_image,
3014 inst_nodes_offline = []
3016 pnode = inst_config.primary_node
3017 pnode_img = node_image[pnode]
3018 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
3019 constants.CV_ENODERPC, pnode, "instance %s, connection to"
3020 " primary node failed", instance)
3022 _ErrorIf(inst_config.admin_state == constants.ADMINST_UP and
3024 constants.CV_EINSTANCEBADNODE, instance,
3025 "instance is marked as running and lives on offline node %s",
3026 inst_config.primary_node)
3028 # If the instance is non-redundant we cannot survive losing its primary
3029 # node, so we are not N+1 compliant. On the other hand we have no disk
3030 # templates with more than one secondary so that situation is not well
3032 # FIXME: does not support file-backed instances
3033 if not inst_config.secondary_nodes:
3034 i_non_redundant.append(instance)
3036 _ErrorIf(len(inst_config.secondary_nodes) > 1,
3037 constants.CV_EINSTANCELAYOUT,
3038 instance, "instance has multiple secondary nodes: %s",
3039 utils.CommaJoin(inst_config.secondary_nodes),
3040 code=self.ETYPE_WARNING)
3042 if inst_config.disk_template in constants.DTS_INT_MIRROR:
3043 pnode = inst_config.primary_node
3044 instance_nodes = utils.NiceSort(inst_config.all_nodes)
3045 instance_groups = {}
3047 for node in instance_nodes:
3048 instance_groups.setdefault(self.all_node_info[node].group,
3052 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
3053 # Sort so that we always list the primary node first.
3054 for group, nodes in sorted(instance_groups.items(),
3055 key=lambda (_, nodes): pnode in nodes,
3058 self._ErrorIf(len(instance_groups) > 1,
3059 constants.CV_EINSTANCESPLITGROUPS,
3060 instance, "instance has primary and secondary nodes in"
3061 " different groups: %s", utils.CommaJoin(pretty_list),
3062 code=self.ETYPE_WARNING)
3064 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
3065 i_non_a_balanced.append(instance)
3067 for snode in inst_config.secondary_nodes:
3068 s_img = node_image[snode]
3069 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
3070 snode, "instance %s, connection to secondary node failed",
3074 inst_nodes_offline.append(snode)
3076 # warn that the instance lives on offline nodes
3077 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
3078 "instance has offline secondary node(s) %s",
3079 utils.CommaJoin(inst_nodes_offline))
3080 # ... or ghost/non-vm_capable nodes
3081 for node in inst_config.all_nodes:
3082 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
3083 instance, "instance lives on ghost node %s", node)
3084 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
3085 instance, "instance lives on non-vm_capable node %s", node)
3087 feedback_fn("* Verifying orphan volumes")
3088 reserved = utils.FieldSet(*cluster.reserved_lvs)
3090 # We will get spurious "unknown volume" warnings if any node of this group
3091 # is secondary for an instance whose primary is in another group. To avoid
3092 # them, we find these instances and add their volumes to node_vol_should.
3093 for inst in self.all_inst_info.values():
3094 for secondary in inst.secondary_nodes:
3095 if (secondary in self.my_node_info
3096 and inst.name not in self.my_inst_info):
3097 inst.MapLVsByNode(node_vol_should)
3100 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
3102 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
3103 feedback_fn("* Verifying N+1 Memory redundancy")
3104 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
3106 feedback_fn("* Other Notes")
3108 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
3109 % len(i_non_redundant))
3111 if i_non_a_balanced:
3112 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
3113 % len(i_non_a_balanced))
3116 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
3119 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
3122 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
3126 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
3127 """Analyze the post-hooks' result
3129 This method analyses the hook result, handles it, and sends some
3130 nicely-formatted feedback back to the user.
3132 @param phase: one of L{constants.HOOKS_PHASE_POST} or
3133 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
3134 @param hooks_results: the results of the multi-node hooks rpc call
3135 @param feedback_fn: function used send feedback back to the caller
3136 @param lu_result: previous Exec result
3137 @return: the new Exec result, based on the previous result
3141 # We only really run POST phase hooks, only for non-empty groups,
3142 # and are only interested in their results
3143 if not self.my_node_names:
3146 elif phase == constants.HOOKS_PHASE_POST:
3147 # Used to change hooks' output to proper indentation
3148 feedback_fn("* Hooks Results")
3149 assert hooks_results, "invalid result from hooks"
3151 for node_name in hooks_results:
3152 res = hooks_results[node_name]
3154 test = msg and not res.offline
3155 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3156 "Communication failure in hooks execution: %s", msg)
3157 if res.offline or msg:
3158 # No need to investigate payload if node is offline or gave
3161 for script, hkr, output in res.payload:
3162 test = hkr == constants.HKR_FAIL
3163 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3164 "Script %s failed, output:", script)
3166 output = self._HOOKS_INDENT_RE.sub(" ", output)
3167 feedback_fn("%s" % output)
3173 class LUClusterVerifyDisks(NoHooksLU):
3174 """Verifies the cluster disks status.
3179 def ExpandNames(self):
3180 self.share_locks = _ShareAll()
3181 self.needed_locks = {
3182 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3185 def Exec(self, feedback_fn):
3186 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3188 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3189 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3190 for group in group_names])
3193 class LUGroupVerifyDisks(NoHooksLU):
3194 """Verifies the status of all disks in a node group.
3199 def ExpandNames(self):
3200 # Raises errors.OpPrereqError on its own if group can't be found
3201 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3203 self.share_locks = _ShareAll()
3204 self.needed_locks = {
3205 locking.LEVEL_INSTANCE: [],
3206 locking.LEVEL_NODEGROUP: [],
3207 locking.LEVEL_NODE: [],
3210 def DeclareLocks(self, level):
3211 if level == locking.LEVEL_INSTANCE:
3212 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3214 # Lock instances optimistically, needs verification once node and group
3215 # locks have been acquired
3216 self.needed_locks[locking.LEVEL_INSTANCE] = \
3217 self.cfg.GetNodeGroupInstances(self.group_uuid)
3219 elif level == locking.LEVEL_NODEGROUP:
3220 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3222 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3223 set([self.group_uuid] +
3224 # Lock all groups used by instances optimistically; this requires
3225 # going via the node before it's locked, requiring verification
3228 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3229 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3231 elif level == locking.LEVEL_NODE:
3232 # This will only lock the nodes in the group to be verified which contain
3234 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3235 self._LockInstancesNodes()
3237 # Lock all nodes in group to be verified
3238 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3239 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3240 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3242 def CheckPrereq(self):
3243 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3244 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3245 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3247 assert self.group_uuid in owned_groups
3249 # Check if locked instances are still correct
3250 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3252 # Get instance information
3253 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3255 # Check if node groups for locked instances are still correct
3256 for (instance_name, inst) in self.instances.items():
3257 assert owned_nodes.issuperset(inst.all_nodes), \
3258 "Instance %s's nodes changed while we kept the lock" % instance_name
3260 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3263 assert self.group_uuid in inst_groups, \
3264 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3266 def Exec(self, feedback_fn):
3267 """Verify integrity of cluster disks.
3269 @rtype: tuple of three items
3270 @return: a tuple of (dict of node-to-node_error, list of instances
3271 which need activate-disks, dict of instance: (node, volume) for
3276 res_instances = set()
3279 nv_dict = _MapInstanceDisksToNodes([inst
3280 for inst in self.instances.values()
3281 if inst.admin_state == constants.ADMINST_UP])
3284 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3285 set(self.cfg.GetVmCapableNodeList()))
3287 node_lvs = self.rpc.call_lv_list(nodes, [])
3289 for (node, node_res) in node_lvs.items():
3290 if node_res.offline:
3293 msg = node_res.fail_msg
3295 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3296 res_nodes[node] = msg
3299 for lv_name, (_, _, lv_online) in node_res.payload.items():
3300 inst = nv_dict.pop((node, lv_name), None)
3301 if not (lv_online or inst is None):
3302 res_instances.add(inst)
3304 # any leftover items in nv_dict are missing LVs, let's arrange the data
3306 for key, inst in nv_dict.iteritems():
3307 res_missing.setdefault(inst, []).append(list(key))
3309 return (res_nodes, list(res_instances), res_missing)
3312 class LUClusterRepairDiskSizes(NoHooksLU):
3313 """Verifies the cluster disks sizes.
3318 def ExpandNames(self):
3319 if self.op.instances:
3320 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3321 self.needed_locks = {
3322 locking.LEVEL_NODE_RES: [],
3323 locking.LEVEL_INSTANCE: self.wanted_names,
3325 self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE
3327 self.wanted_names = None
3328 self.needed_locks = {
3329 locking.LEVEL_NODE_RES: locking.ALL_SET,
3330 locking.LEVEL_INSTANCE: locking.ALL_SET,
3332 self.share_locks = {
3333 locking.LEVEL_NODE_RES: 1,
3334 locking.LEVEL_INSTANCE: 0,
3337 def DeclareLocks(self, level):
3338 if level == locking.LEVEL_NODE_RES and self.wanted_names is not None:
3339 self._LockInstancesNodes(primary_only=True, level=level)
3341 def CheckPrereq(self):
3342 """Check prerequisites.
3344 This only checks the optional instance list against the existing names.
3347 if self.wanted_names is None:
3348 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3350 self.wanted_instances = \
3351 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3353 def _EnsureChildSizes(self, disk):
3354 """Ensure children of the disk have the needed disk size.
3356 This is valid mainly for DRBD8 and fixes an issue where the
3357 children have smaller disk size.
3359 @param disk: an L{ganeti.objects.Disk} object
3362 if disk.dev_type == constants.LD_DRBD8:
3363 assert disk.children, "Empty children for DRBD8?"
3364 fchild = disk.children[0]
3365 mismatch = fchild.size < disk.size
3367 self.LogInfo("Child disk has size %d, parent %d, fixing",
3368 fchild.size, disk.size)
3369 fchild.size = disk.size
3371 # and we recurse on this child only, not on the metadev
3372 return self._EnsureChildSizes(fchild) or mismatch
3376 def Exec(self, feedback_fn):
3377 """Verify the size of cluster disks.
3380 # TODO: check child disks too
3381 # TODO: check differences in size between primary/secondary nodes
3383 for instance in self.wanted_instances:
3384 pnode = instance.primary_node
3385 if pnode not in per_node_disks:
3386 per_node_disks[pnode] = []
3387 for idx, disk in enumerate(instance.disks):
3388 per_node_disks[pnode].append((instance, idx, disk))
3390 assert not (frozenset(per_node_disks.keys()) -
3391 self.owned_locks(locking.LEVEL_NODE_RES)), \
3392 "Not owning correct locks"
3393 assert not self.owned_locks(locking.LEVEL_NODE)
3396 for node, dskl in per_node_disks.items():
3397 newl = [v[2].Copy() for v in dskl]
3399 self.cfg.SetDiskID(dsk, node)
3400 result = self.rpc.call_blockdev_getsize(node, newl)
3402 self.LogWarning("Failure in blockdev_getsize call to node"
3403 " %s, ignoring", node)
3405 if len(result.payload) != len(dskl):
3406 logging.warning("Invalid result from node %s: len(dksl)=%d,"
3407 " result.payload=%s", node, len(dskl), result.payload)
3408 self.LogWarning("Invalid result from node %s, ignoring node results",
3411 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3413 self.LogWarning("Disk %d of instance %s did not return size"
3414 " information, ignoring", idx, instance.name)
3416 if not isinstance(size, (int, long)):
3417 self.LogWarning("Disk %d of instance %s did not return valid"
3418 " size information, ignoring", idx, instance.name)
3421 if size != disk.size:
3422 self.LogInfo("Disk %d of instance %s has mismatched size,"
3423 " correcting: recorded %d, actual %d", idx,
3424 instance.name, disk.size, size)
3426 self.cfg.Update(instance, feedback_fn)
3427 changed.append((instance.name, idx, size))
3428 if self._EnsureChildSizes(disk):
3429 self.cfg.Update(instance, feedback_fn)
3430 changed.append((instance.name, idx, disk.size))
3434 class LUClusterRename(LogicalUnit):
3435 """Rename the cluster.
3438 HPATH = "cluster-rename"
3439 HTYPE = constants.HTYPE_CLUSTER
3441 def BuildHooksEnv(self):
3446 "OP_TARGET": self.cfg.GetClusterName(),
3447 "NEW_NAME": self.op.name,
3450 def BuildHooksNodes(self):
3451 """Build hooks nodes.
3454 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3456 def CheckPrereq(self):
3457 """Verify that the passed name is a valid one.
3460 hostname = netutils.GetHostname(name=self.op.name,
3461 family=self.cfg.GetPrimaryIPFamily())
3463 new_name = hostname.name
3464 self.ip = new_ip = hostname.ip
3465 old_name = self.cfg.GetClusterName()
3466 old_ip = self.cfg.GetMasterIP()
3467 if new_name == old_name and new_ip == old_ip:
3468 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3469 " cluster has changed",
3471 if new_ip != old_ip:
3472 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3473 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3474 " reachable on the network" %
3475 new_ip, errors.ECODE_NOTUNIQUE)
3477 self.op.name = new_name
3479 def Exec(self, feedback_fn):
3480 """Rename the cluster.
3483 clustername = self.op.name
3486 # shutdown the master IP
3487 master_params = self.cfg.GetMasterNetworkParameters()
3488 ems = self.cfg.GetUseExternalMipScript()
3489 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3491 result.Raise("Could not disable the master role")
3494 cluster = self.cfg.GetClusterInfo()
3495 cluster.cluster_name = clustername
3496 cluster.master_ip = new_ip
3497 self.cfg.Update(cluster, feedback_fn)
3499 # update the known hosts file
3500 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3501 node_list = self.cfg.GetOnlineNodeList()
3503 node_list.remove(master_params.name)
3506 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3508 master_params.ip = new_ip
3509 result = self.rpc.call_node_activate_master_ip(master_params.name,
3511 msg = result.fail_msg
3513 self.LogWarning("Could not re-enable the master role on"
3514 " the master, please restart manually: %s", msg)
3519 def _ValidateNetmask(cfg, netmask):
3520 """Checks if a netmask is valid.
3522 @type cfg: L{config.ConfigWriter}
3523 @param cfg: The cluster configuration
3525 @param netmask: the netmask to be verified
3526 @raise errors.OpPrereqError: if the validation fails
3529 ip_family = cfg.GetPrimaryIPFamily()
3531 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3532 except errors.ProgrammerError:
3533 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3535 if not ipcls.ValidateNetmask(netmask):
3536 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3540 class LUClusterSetParams(LogicalUnit):
3541 """Change the parameters of the cluster.
3544 HPATH = "cluster-modify"
3545 HTYPE = constants.HTYPE_CLUSTER
3548 def CheckArguments(self):
3552 if self.op.uid_pool:
3553 uidpool.CheckUidPool(self.op.uid_pool)
3555 if self.op.add_uids:
3556 uidpool.CheckUidPool(self.op.add_uids)
3558 if self.op.remove_uids:
3559 uidpool.CheckUidPool(self.op.remove_uids)
3561 if self.op.master_netmask is not None:
3562 _ValidateNetmask(self.cfg, self.op.master_netmask)
3564 if self.op.diskparams:
3565 for dt_params in self.op.diskparams.values():
3566 utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
3568 def ExpandNames(self):
3569 # FIXME: in the future maybe other cluster params won't require checking on
3570 # all nodes to be modified.
3571 self.needed_locks = {
3572 locking.LEVEL_NODE: locking.ALL_SET,
3574 self.share_locks[locking.LEVEL_NODE] = 1
3576 def BuildHooksEnv(self):
3581 "OP_TARGET": self.cfg.GetClusterName(),
3582 "NEW_VG_NAME": self.op.vg_name,
3585 def BuildHooksNodes(self):
3586 """Build hooks nodes.
3589 mn = self.cfg.GetMasterNode()
3592 def CheckPrereq(self):
3593 """Check prerequisites.
3595 This checks whether the given params don't conflict and
3596 if the given volume group is valid.
3599 if self.op.vg_name is not None and not self.op.vg_name:
3600 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3601 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3602 " instances exist", errors.ECODE_INVAL)
3604 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3605 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3606 raise errors.OpPrereqError("Cannot disable drbd helper while"
3607 " drbd-based instances exist",
3610 node_list = self.owned_locks(locking.LEVEL_NODE)
3612 # if vg_name not None, checks given volume group on all nodes
3614 vglist = self.rpc.call_vg_list(node_list)
3615 for node in node_list:
3616 msg = vglist[node].fail_msg
3618 # ignoring down node
3619 self.LogWarning("Error while gathering data on node %s"
3620 " (ignoring node): %s", node, msg)
3622 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3624 constants.MIN_VG_SIZE)
3626 raise errors.OpPrereqError("Error on node '%s': %s" %
3627 (node, vgstatus), errors.ECODE_ENVIRON)
3629 if self.op.drbd_helper:
3630 # checks given drbd helper on all nodes
3631 helpers = self.rpc.call_drbd_helper(node_list)
3632 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3634 self.LogInfo("Not checking drbd helper on offline node %s", node)
3636 msg = helpers[node].fail_msg
3638 raise errors.OpPrereqError("Error checking drbd helper on node"
3639 " '%s': %s" % (node, msg),
3640 errors.ECODE_ENVIRON)
3641 node_helper = helpers[node].payload
3642 if node_helper != self.op.drbd_helper:
3643 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3644 (node, node_helper), errors.ECODE_ENVIRON)
3646 self.cluster = cluster = self.cfg.GetClusterInfo()
3647 # validate params changes
3648 if self.op.beparams:
3649 objects.UpgradeBeParams(self.op.beparams)
3650 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3651 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3653 if self.op.ndparams:
3654 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3655 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3657 # TODO: we need a more general way to handle resetting
3658 # cluster-level parameters to default values
3659 if self.new_ndparams["oob_program"] == "":
3660 self.new_ndparams["oob_program"] = \
3661 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3663 if self.op.nicparams:
3664 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3665 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3666 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3669 # check all instances for consistency
3670 for instance in self.cfg.GetAllInstancesInfo().values():
3671 for nic_idx, nic in enumerate(instance.nics):
3672 params_copy = copy.deepcopy(nic.nicparams)
3673 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3675 # check parameter syntax
3677 objects.NIC.CheckParameterSyntax(params_filled)
3678 except errors.ConfigurationError, err:
3679 nic_errors.append("Instance %s, nic/%d: %s" %
3680 (instance.name, nic_idx, err))
3682 # if we're moving instances to routed, check that they have an ip
3683 target_mode = params_filled[constants.NIC_MODE]
3684 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3685 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3686 " address" % (instance.name, nic_idx))
3688 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3689 "\n".join(nic_errors))
3691 # hypervisor list/parameters
3692 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3693 if self.op.hvparams:
3694 for hv_name, hv_dict in self.op.hvparams.items():
3695 if hv_name not in self.new_hvparams:
3696 self.new_hvparams[hv_name] = hv_dict
3698 self.new_hvparams[hv_name].update(hv_dict)
3700 # disk template parameters
3701 self.new_diskparams = objects.FillDict(cluster.diskparams, {})
3702 if self.op.diskparams:
3703 for dt_name, dt_params in self.op.diskparams.items():
3704 if dt_name not in self.op.diskparams:
3705 self.new_diskparams[dt_name] = dt_params
3707 self.new_diskparams[dt_name].update(dt_params)
3709 # os hypervisor parameters
3710 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3712 for os_name, hvs in self.op.os_hvp.items():
3713 if os_name not in self.new_os_hvp:
3714 self.new_os_hvp[os_name] = hvs
3716 for hv_name, hv_dict in hvs.items():
3717 if hv_name not in self.new_os_hvp[os_name]:
3718 self.new_os_hvp[os_name][hv_name] = hv_dict
3720 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3723 self.new_osp = objects.FillDict(cluster.osparams, {})
3724 if self.op.osparams:
3725 for os_name, osp in self.op.osparams.items():
3726 if os_name not in self.new_osp:
3727 self.new_osp[os_name] = {}
3729 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3732 if not self.new_osp[os_name]:
3733 # we removed all parameters
3734 del self.new_osp[os_name]
3736 # check the parameter validity (remote check)
3737 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3738 os_name, self.new_osp[os_name])
3740 # changes to the hypervisor list
3741 if self.op.enabled_hypervisors is not None:
3742 self.hv_list = self.op.enabled_hypervisors
3743 for hv in self.hv_list:
3744 # if the hypervisor doesn't already exist in the cluster
3745 # hvparams, we initialize it to empty, and then (in both
3746 # cases) we make sure to fill the defaults, as we might not
3747 # have a complete defaults list if the hypervisor wasn't
3749 if hv not in new_hvp:
3751 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3752 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3754 self.hv_list = cluster.enabled_hypervisors
3756 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3757 # either the enabled list has changed, or the parameters have, validate
3758 for hv_name, hv_params in self.new_hvparams.items():
3759 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3760 (self.op.enabled_hypervisors and
3761 hv_name in self.op.enabled_hypervisors)):
3762 # either this is a new hypervisor, or its parameters have changed
3763 hv_class = hypervisor.GetHypervisor(hv_name)
3764 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3765 hv_class.CheckParameterSyntax(hv_params)
3766 _CheckHVParams(self, node_list, hv_name, hv_params)
3769 # no need to check any newly-enabled hypervisors, since the
3770 # defaults have already been checked in the above code-block
3771 for os_name, os_hvp in self.new_os_hvp.items():
3772 for hv_name, hv_params in os_hvp.items():
3773 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3774 # we need to fill in the new os_hvp on top of the actual hv_p
3775 cluster_defaults = self.new_hvparams.get(hv_name, {})
3776 new_osp = objects.FillDict(cluster_defaults, hv_params)
3777 hv_class = hypervisor.GetHypervisor(hv_name)
3778 hv_class.CheckParameterSyntax(new_osp)
3779 _CheckHVParams(self, node_list, hv_name, new_osp)
3781 if self.op.default_iallocator:
3782 alloc_script = utils.FindFile(self.op.default_iallocator,
3783 constants.IALLOCATOR_SEARCH_PATH,
3785 if alloc_script is None:
3786 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3787 " specified" % self.op.default_iallocator,
3790 def Exec(self, feedback_fn):
3791 """Change the parameters of the cluster.
3794 if self.op.vg_name is not None:
3795 new_volume = self.op.vg_name
3798 if new_volume != self.cfg.GetVGName():
3799 self.cfg.SetVGName(new_volume)
3801 feedback_fn("Cluster LVM configuration already in desired"
3802 " state, not changing")
3803 if self.op.drbd_helper is not None:
3804 new_helper = self.op.drbd_helper
3807 if new_helper != self.cfg.GetDRBDHelper():
3808 self.cfg.SetDRBDHelper(new_helper)
3810 feedback_fn("Cluster DRBD helper already in desired state,"
3812 if self.op.hvparams:
3813 self.cluster.hvparams = self.new_hvparams
3815 self.cluster.os_hvp = self.new_os_hvp
3816 if self.op.enabled_hypervisors is not None:
3817 self.cluster.hvparams = self.new_hvparams
3818 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3819 if self.op.beparams:
3820 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3821 if self.op.nicparams:
3822 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3823 if self.op.osparams:
3824 self.cluster.osparams = self.new_osp
3825 if self.op.ndparams:
3826 self.cluster.ndparams = self.new_ndparams
3827 if self.op.diskparams:
3828 self.cluster.diskparams = self.new_diskparams
3830 if self.op.candidate_pool_size is not None:
3831 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3832 # we need to update the pool size here, otherwise the save will fail
3833 _AdjustCandidatePool(self, [])
3835 if self.op.maintain_node_health is not None:
3836 if self.op.maintain_node_health and not constants.ENABLE_CONFD:
3837 feedback_fn("Note: CONFD was disabled at build time, node health"
3838 " maintenance is not useful (still enabling it)")
3839 self.cluster.maintain_node_health = self.op.maintain_node_health
3841 if self.op.prealloc_wipe_disks is not None:
3842 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3844 if self.op.add_uids is not None:
3845 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3847 if self.op.remove_uids is not None:
3848 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3850 if self.op.uid_pool is not None:
3851 self.cluster.uid_pool = self.op.uid_pool
3853 if self.op.default_iallocator is not None:
3854 self.cluster.default_iallocator = self.op.default_iallocator
3856 if self.op.reserved_lvs is not None:
3857 self.cluster.reserved_lvs = self.op.reserved_lvs
3859 if self.op.use_external_mip_script is not None:
3860 self.cluster.use_external_mip_script = self.op.use_external_mip_script
3862 def helper_os(aname, mods, desc):
3864 lst = getattr(self.cluster, aname)
3865 for key, val in mods:
3866 if key == constants.DDM_ADD:
3868 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3871 elif key == constants.DDM_REMOVE:
3875 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3877 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3879 if self.op.hidden_os:
3880 helper_os("hidden_os", self.op.hidden_os, "hidden")
3882 if self.op.blacklisted_os:
3883 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3885 if self.op.master_netdev:
3886 master_params = self.cfg.GetMasterNetworkParameters()
3887 ems = self.cfg.GetUseExternalMipScript()
3888 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3889 self.cluster.master_netdev)
3890 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3892 result.Raise("Could not disable the master ip")
3893 feedback_fn("Changing master_netdev from %s to %s" %
3894 (master_params.netdev, self.op.master_netdev))
3895 self.cluster.master_netdev = self.op.master_netdev
3897 if self.op.master_netmask:
3898 master_params = self.cfg.GetMasterNetworkParameters()
3899 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
3900 result = self.rpc.call_node_change_master_netmask(master_params.name,
3901 master_params.netmask,
3902 self.op.master_netmask,
3904 master_params.netdev)
3906 msg = "Could not change the master IP netmask: %s" % result.fail_msg
3909 self.cluster.master_netmask = self.op.master_netmask
3911 self.cfg.Update(self.cluster, feedback_fn)
3913 if self.op.master_netdev:
3914 master_params = self.cfg.GetMasterNetworkParameters()
3915 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3916 self.op.master_netdev)
3917 ems = self.cfg.GetUseExternalMipScript()
3918 result = self.rpc.call_node_activate_master_ip(master_params.name,
3921 self.LogWarning("Could not re-enable the master ip on"
3922 " the master, please restart manually: %s",
3926 def _UploadHelper(lu, nodes, fname):
3927 """Helper for uploading a file and showing warnings.
3930 if os.path.exists(fname):
3931 result = lu.rpc.call_upload_file(nodes, fname)
3932 for to_node, to_result in result.items():
3933 msg = to_result.fail_msg
3935 msg = ("Copy of file %s to node %s failed: %s" %
3936 (fname, to_node, msg))
3937 lu.proc.LogWarning(msg)
3940 def _ComputeAncillaryFiles(cluster, redist):
3941 """Compute files external to Ganeti which need to be consistent.
3943 @type redist: boolean
3944 @param redist: Whether to include files which need to be redistributed
3947 # Compute files for all nodes
3949 constants.SSH_KNOWN_HOSTS_FILE,
3950 constants.CONFD_HMAC_KEY,
3951 constants.CLUSTER_DOMAIN_SECRET_FILE,
3952 constants.SPICE_CERT_FILE,
3953 constants.SPICE_CACERT_FILE,
3954 constants.RAPI_USERS_FILE,
3958 files_all.update(constants.ALL_CERT_FILES)
3959 files_all.update(ssconf.SimpleStore().GetFileList())
3961 # we need to ship at least the RAPI certificate
3962 files_all.add(constants.RAPI_CERT_FILE)
3964 if cluster.modify_etc_hosts:
3965 files_all.add(constants.ETC_HOSTS)
3967 # Files which are optional, these must:
3968 # - be present in one other category as well
3969 # - either exist or not exist on all nodes of that category (mc, vm all)
3971 constants.RAPI_USERS_FILE,
3974 # Files which should only be on master candidates
3978 files_mc.add(constants.CLUSTER_CONF_FILE)
3980 # FIXME: this should also be replicated but Ganeti doesn't support files_mc
3982 files_mc.add(constants.DEFAULT_MASTER_SETUP_SCRIPT)
3984 # Files which should only be on VM-capable nodes
3985 files_vm = set(filename
3986 for hv_name in cluster.enabled_hypervisors
3987 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3989 files_opt |= set(filename
3990 for hv_name in cluster.enabled_hypervisors
3991 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3993 # Filenames in each category must be unique
3994 all_files_set = files_all | files_mc | files_vm
3995 assert (len(all_files_set) ==
3996 sum(map(len, [files_all, files_mc, files_vm]))), \
3997 "Found file listed in more than one file list"
3999 # Optional files must be present in one other category
4000 assert all_files_set.issuperset(files_opt), \
4001 "Optional file not in a different required list"
4003 return (files_all, files_opt, files_mc, files_vm)
4006 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
4007 """Distribute additional files which are part of the cluster configuration.
4009 ConfigWriter takes care of distributing the config and ssconf files, but
4010 there are more files which should be distributed to all nodes. This function
4011 makes sure those are copied.
4013 @param lu: calling logical unit
4014 @param additional_nodes: list of nodes not in the config to distribute to
4015 @type additional_vm: boolean
4016 @param additional_vm: whether the additional nodes are vm-capable or not
4019 # Gather target nodes
4020 cluster = lu.cfg.GetClusterInfo()
4021 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
4023 online_nodes = lu.cfg.GetOnlineNodeList()
4024 vm_nodes = lu.cfg.GetVmCapableNodeList()
4026 if additional_nodes is not None:
4027 online_nodes.extend(additional_nodes)
4029 vm_nodes.extend(additional_nodes)
4031 # Never distribute to master node
4032 for nodelist in [online_nodes, vm_nodes]:
4033 if master_info.name in nodelist:
4034 nodelist.remove(master_info.name)
4037 (files_all, _, files_mc, files_vm) = \
4038 _ComputeAncillaryFiles(cluster, True)
4040 # Never re-distribute configuration file from here
4041 assert not (constants.CLUSTER_CONF_FILE in files_all or
4042 constants.CLUSTER_CONF_FILE in files_vm)
4043 assert not files_mc, "Master candidates not handled in this function"
4046 (online_nodes, files_all),
4047 (vm_nodes, files_vm),
4051 for (node_list, files) in filemap:
4053 _UploadHelper(lu, node_list, fname)
4056 class LUClusterRedistConf(NoHooksLU):
4057 """Force the redistribution of cluster configuration.
4059 This is a very simple LU.
4064 def ExpandNames(self):
4065 self.needed_locks = {
4066 locking.LEVEL_NODE: locking.ALL_SET,
4068 self.share_locks[locking.LEVEL_NODE] = 1
4070 def Exec(self, feedback_fn):
4071 """Redistribute the configuration.
4074 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
4075 _RedistributeAncillaryFiles(self)
4078 class LUClusterActivateMasterIp(NoHooksLU):
4079 """Activate the master IP on the master node.
4082 def Exec(self, feedback_fn):
4083 """Activate the master IP.
4086 master_params = self.cfg.GetMasterNetworkParameters()
4087 ems = self.cfg.GetUseExternalMipScript()
4088 result = self.rpc.call_node_activate_master_ip(master_params.name,
4090 result.Raise("Could not activate the master IP")
4093 class LUClusterDeactivateMasterIp(NoHooksLU):
4094 """Deactivate the master IP on the master node.
4097 def Exec(self, feedback_fn):
4098 """Deactivate the master IP.
4101 master_params = self.cfg.GetMasterNetworkParameters()
4102 ems = self.cfg.GetUseExternalMipScript()
4103 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
4105 result.Raise("Could not deactivate the master IP")
4108 def _WaitForSync(lu, instance, disks=None, oneshot=False):
4109 """Sleep and poll for an instance's disk to sync.
4112 if not instance.disks or disks is not None and not disks:
4115 disks = _ExpandCheckDisks(instance, disks)
4118 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
4120 node = instance.primary_node
4123 lu.cfg.SetDiskID(dev, node)
4125 # TODO: Convert to utils.Retry
4128 degr_retries = 10 # in seconds, as we sleep 1 second each time
4132 cumul_degraded = False
4133 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
4134 msg = rstats.fail_msg
4136 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
4139 raise errors.RemoteError("Can't contact node %s for mirror data,"
4140 " aborting." % node)
4143 rstats = rstats.payload
4145 for i, mstat in enumerate(rstats):
4147 lu.LogWarning("Can't compute data for node %s/%s",
4148 node, disks[i].iv_name)
4151 cumul_degraded = (cumul_degraded or
4152 (mstat.is_degraded and mstat.sync_percent is None))
4153 if mstat.sync_percent is not None:
4155 if mstat.estimated_time is not None:
4156 rem_time = ("%s remaining (estimated)" %
4157 utils.FormatSeconds(mstat.estimated_time))
4158 max_time = mstat.estimated_time
4160 rem_time = "no time estimate"
4161 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
4162 (disks[i].iv_name, mstat.sync_percent, rem_time))
4164 # if we're done but degraded, let's do a few small retries, to
4165 # make sure we see a stable and not transient situation; therefore
4166 # we force restart of the loop
4167 if (done or oneshot) and cumul_degraded and degr_retries > 0:
4168 logging.info("Degraded disks found, %d retries left", degr_retries)
4176 time.sleep(min(60, max_time))
4179 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
4180 return not cumul_degraded
4183 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
4184 """Check that mirrors are not degraded.
4186 The ldisk parameter, if True, will change the test from the
4187 is_degraded attribute (which represents overall non-ok status for
4188 the device(s)) to the ldisk (representing the local storage status).
4191 lu.cfg.SetDiskID(dev, node)
4195 if on_primary or dev.AssembleOnSecondary():
4196 rstats = lu.rpc.call_blockdev_find(node, dev)
4197 msg = rstats.fail_msg
4199 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
4201 elif not rstats.payload:
4202 lu.LogWarning("Can't find disk on node %s", node)
4206 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4208 result = result and not rstats.payload.is_degraded
4211 for child in dev.children:
4212 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
4217 class LUOobCommand(NoHooksLU):
4218 """Logical unit for OOB handling.
4222 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4224 def ExpandNames(self):
4225 """Gather locks we need.
4228 if self.op.node_names:
4229 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4230 lock_names = self.op.node_names
4232 lock_names = locking.ALL_SET
4234 self.needed_locks = {
4235 locking.LEVEL_NODE: lock_names,
4238 def CheckPrereq(self):
4239 """Check prerequisites.
4242 - the node exists in the configuration
4245 Any errors are signaled by raising errors.OpPrereqError.
4249 self.master_node = self.cfg.GetMasterNode()
4251 assert self.op.power_delay >= 0.0
4253 if self.op.node_names:
4254 if (self.op.command in self._SKIP_MASTER and
4255 self.master_node in self.op.node_names):
4256 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4257 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4259 if master_oob_handler:
4260 additional_text = ("run '%s %s %s' if you want to operate on the"
4261 " master regardless") % (master_oob_handler,
4265 additional_text = "it does not support out-of-band operations"
4267 raise errors.OpPrereqError(("Operating on the master node %s is not"
4268 " allowed for %s; %s") %
4269 (self.master_node, self.op.command,
4270 additional_text), errors.ECODE_INVAL)
4272 self.op.node_names = self.cfg.GetNodeList()
4273 if self.op.command in self._SKIP_MASTER:
4274 self.op.node_names.remove(self.master_node)
4276 if self.op.command in self._SKIP_MASTER:
4277 assert self.master_node not in self.op.node_names
4279 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4281 raise errors.OpPrereqError("Node %s not found" % node_name,
4284 self.nodes.append(node)
4286 if (not self.op.ignore_status and
4287 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4288 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4289 " not marked offline") % node_name,
4292 def Exec(self, feedback_fn):
4293 """Execute OOB and return result if we expect any.
4296 master_node = self.master_node
4299 for idx, node in enumerate(utils.NiceSort(self.nodes,
4300 key=lambda node: node.name)):
4301 node_entry = [(constants.RS_NORMAL, node.name)]
4302 ret.append(node_entry)
4304 oob_program = _SupportsOob(self.cfg, node)
4307 node_entry.append((constants.RS_UNAVAIL, None))
4310 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4311 self.op.command, oob_program, node.name)
4312 result = self.rpc.call_run_oob(master_node, oob_program,
4313 self.op.command, node.name,
4317 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4318 node.name, result.fail_msg)
4319 node_entry.append((constants.RS_NODATA, None))
4322 self._CheckPayload(result)
4323 except errors.OpExecError, err:
4324 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4326 node_entry.append((constants.RS_NODATA, None))
4328 if self.op.command == constants.OOB_HEALTH:
4329 # For health we should log important events
4330 for item, status in result.payload:
4331 if status in [constants.OOB_STATUS_WARNING,
4332 constants.OOB_STATUS_CRITICAL]:
4333 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4334 item, node.name, status)
4336 if self.op.command == constants.OOB_POWER_ON:
4338 elif self.op.command == constants.OOB_POWER_OFF:
4339 node.powered = False
4340 elif self.op.command == constants.OOB_POWER_STATUS:
4341 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4342 if powered != node.powered:
4343 logging.warning(("Recorded power state (%s) of node '%s' does not"
4344 " match actual power state (%s)"), node.powered,
4347 # For configuration changing commands we should update the node
4348 if self.op.command in (constants.OOB_POWER_ON,
4349 constants.OOB_POWER_OFF):
4350 self.cfg.Update(node, feedback_fn)
4352 node_entry.append((constants.RS_NORMAL, result.payload))
4354 if (self.op.command == constants.OOB_POWER_ON and
4355 idx < len(self.nodes) - 1):
4356 time.sleep(self.op.power_delay)
4360 def _CheckPayload(self, result):
4361 """Checks if the payload is valid.
4363 @param result: RPC result
4364 @raises errors.OpExecError: If payload is not valid
4368 if self.op.command == constants.OOB_HEALTH:
4369 if not isinstance(result.payload, list):
4370 errs.append("command 'health' is expected to return a list but got %s" %
4371 type(result.payload))
4373 for item, status in result.payload:
4374 if status not in constants.OOB_STATUSES:
4375 errs.append("health item '%s' has invalid status '%s'" %
4378 if self.op.command == constants.OOB_POWER_STATUS:
4379 if not isinstance(result.payload, dict):
4380 errs.append("power-status is expected to return a dict but got %s" %
4381 type(result.payload))
4383 if self.op.command in [
4384 constants.OOB_POWER_ON,
4385 constants.OOB_POWER_OFF,
4386 constants.OOB_POWER_CYCLE,
4388 if result.payload is not None:
4389 errs.append("%s is expected to not return payload but got '%s'" %
4390 (self.op.command, result.payload))
4393 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4394 utils.CommaJoin(errs))
4397 class _OsQuery(_QueryBase):
4398 FIELDS = query.OS_FIELDS
4400 def ExpandNames(self, lu):
4401 # Lock all nodes in shared mode
4402 # Temporary removal of locks, should be reverted later
4403 # TODO: reintroduce locks when they are lighter-weight
4404 lu.needed_locks = {}
4405 #self.share_locks[locking.LEVEL_NODE] = 1
4406 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4408 # The following variables interact with _QueryBase._GetNames
4410 self.wanted = self.names
4412 self.wanted = locking.ALL_SET
4414 self.do_locking = self.use_locking
4416 def DeclareLocks(self, lu, level):
4420 def _DiagnoseByOS(rlist):
4421 """Remaps a per-node return list into an a per-os per-node dictionary
4423 @param rlist: a map with node names as keys and OS objects as values
4426 @return: a dictionary with osnames as keys and as value another
4427 map, with nodes as keys and tuples of (path, status, diagnose,
4428 variants, parameters, api_versions) as values, eg::
4430 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4431 (/srv/..., False, "invalid api")],
4432 "node2": [(/srv/..., True, "", [], [])]}
4437 # we build here the list of nodes that didn't fail the RPC (at RPC
4438 # level), so that nodes with a non-responding node daemon don't
4439 # make all OSes invalid
4440 good_nodes = [node_name for node_name in rlist
4441 if not rlist[node_name].fail_msg]
4442 for node_name, nr in rlist.items():
4443 if nr.fail_msg or not nr.payload:
4445 for (name, path, status, diagnose, variants,
4446 params, api_versions) in nr.payload:
4447 if name not in all_os:
4448 # build a list of nodes for this os containing empty lists
4449 # for each node in node_list
4451 for nname in good_nodes:
4452 all_os[name][nname] = []
4453 # convert params from [name, help] to (name, help)
4454 params = [tuple(v) for v in params]
4455 all_os[name][node_name].append((path, status, diagnose,
4456 variants, params, api_versions))
4459 def _GetQueryData(self, lu):
4460 """Computes the list of nodes and their attributes.
4463 # Locking is not used
4464 assert not (compat.any(lu.glm.is_owned(level)
4465 for level in locking.LEVELS
4466 if level != locking.LEVEL_CLUSTER) or
4467 self.do_locking or self.use_locking)
4469 valid_nodes = [node.name
4470 for node in lu.cfg.GetAllNodesInfo().values()
4471 if not node.offline and node.vm_capable]
4472 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4473 cluster = lu.cfg.GetClusterInfo()
4477 for (os_name, os_data) in pol.items():
4478 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4479 hidden=(os_name in cluster.hidden_os),
4480 blacklisted=(os_name in cluster.blacklisted_os))
4484 api_versions = set()
4486 for idx, osl in enumerate(os_data.values()):
4487 info.valid = bool(info.valid and osl and osl[0][1])
4491 (node_variants, node_params, node_api) = osl[0][3:6]
4494 variants.update(node_variants)
4495 parameters.update(node_params)
4496 api_versions.update(node_api)
4498 # Filter out inconsistent values
4499 variants.intersection_update(node_variants)
4500 parameters.intersection_update(node_params)
4501 api_versions.intersection_update(node_api)
4503 info.variants = list(variants)
4504 info.parameters = list(parameters)
4505 info.api_versions = list(api_versions)
4507 data[os_name] = info
4509 # Prepare data in requested order
4510 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4514 class LUOsDiagnose(NoHooksLU):
4515 """Logical unit for OS diagnose/query.
4521 def _BuildFilter(fields, names):
4522 """Builds a filter for querying OSes.
4525 name_filter = qlang.MakeSimpleFilter("name", names)
4527 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4528 # respective field is not requested
4529 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4530 for fname in ["hidden", "blacklisted"]
4531 if fname not in fields]
4532 if "valid" not in fields:
4533 status_filter.append([qlang.OP_TRUE, "valid"])
4536 status_filter.insert(0, qlang.OP_AND)
4538 status_filter = None
4540 if name_filter and status_filter:
4541 return [qlang.OP_AND, name_filter, status_filter]
4545 return status_filter
4547 def CheckArguments(self):
4548 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4549 self.op.output_fields, False)
4551 def ExpandNames(self):
4552 self.oq.ExpandNames(self)
4554 def Exec(self, feedback_fn):
4555 return self.oq.OldStyleQuery(self)
4558 class LUNodeRemove(LogicalUnit):
4559 """Logical unit for removing a node.
4562 HPATH = "node-remove"
4563 HTYPE = constants.HTYPE_NODE
4565 def BuildHooksEnv(self):
4568 This doesn't run on the target node in the pre phase as a failed
4569 node would then be impossible to remove.
4573 "OP_TARGET": self.op.node_name,
4574 "NODE_NAME": self.op.node_name,
4577 def BuildHooksNodes(self):
4578 """Build hooks nodes.
4581 all_nodes = self.cfg.GetNodeList()
4583 all_nodes.remove(self.op.node_name)
4585 logging.warning("Node '%s', which is about to be removed, was not found"
4586 " in the list of all nodes", self.op.node_name)
4587 return (all_nodes, all_nodes)
4589 def CheckPrereq(self):
4590 """Check prerequisites.
4593 - the node exists in the configuration
4594 - it does not have primary or secondary instances
4595 - it's not the master
4597 Any errors are signaled by raising errors.OpPrereqError.
4600 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4601 node = self.cfg.GetNodeInfo(self.op.node_name)
4602 assert node is not None
4604 masternode = self.cfg.GetMasterNode()
4605 if node.name == masternode:
4606 raise errors.OpPrereqError("Node is the master node, failover to another"
4607 " node is required", errors.ECODE_INVAL)
4609 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4610 if node.name in instance.all_nodes:
4611 raise errors.OpPrereqError("Instance %s is still running on the node,"
4612 " please remove first" % instance_name,
4614 self.op.node_name = node.name
4617 def Exec(self, feedback_fn):
4618 """Removes the node from the cluster.
4622 logging.info("Stopping the node daemon and removing configs from node %s",
4625 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4627 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
4630 # Promote nodes to master candidate as needed
4631 _AdjustCandidatePool(self, exceptions=[node.name])
4632 self.context.RemoveNode(node.name)
4634 # Run post hooks on the node before it's removed
4635 _RunPostHook(self, node.name)
4637 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4638 msg = result.fail_msg
4640 self.LogWarning("Errors encountered on the remote node while leaving"
4641 " the cluster: %s", msg)
4643 # Remove node from our /etc/hosts
4644 if self.cfg.GetClusterInfo().modify_etc_hosts:
4645 master_node = self.cfg.GetMasterNode()
4646 result = self.rpc.call_etc_hosts_modify(master_node,
4647 constants.ETC_HOSTS_REMOVE,
4649 result.Raise("Can't update hosts file with new host data")
4650 _RedistributeAncillaryFiles(self)
4653 class _NodeQuery(_QueryBase):
4654 FIELDS = query.NODE_FIELDS
4656 def ExpandNames(self, lu):
4657 lu.needed_locks = {}
4658 lu.share_locks = _ShareAll()
4661 self.wanted = _GetWantedNodes(lu, self.names)
4663 self.wanted = locking.ALL_SET
4665 self.do_locking = (self.use_locking and
4666 query.NQ_LIVE in self.requested_data)
4669 # If any non-static field is requested we need to lock the nodes
4670 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4672 def DeclareLocks(self, lu, level):
4675 def _GetQueryData(self, lu):
4676 """Computes the list of nodes and their attributes.
4679 all_info = lu.cfg.GetAllNodesInfo()
4681 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4683 # Gather data as requested
4684 if query.NQ_LIVE in self.requested_data:
4685 # filter out non-vm_capable nodes
4686 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4688 node_data = lu.rpc.call_node_info(toquery_nodes, [lu.cfg.GetVGName()],
4689 [lu.cfg.GetHypervisorType()])
4690 live_data = dict((name, _MakeLegacyNodeInfo(nresult.payload))
4691 for (name, nresult) in node_data.items()
4692 if not nresult.fail_msg and nresult.payload)
4696 if query.NQ_INST in self.requested_data:
4697 node_to_primary = dict([(name, set()) for name in nodenames])
4698 node_to_secondary = dict([(name, set()) for name in nodenames])
4700 inst_data = lu.cfg.GetAllInstancesInfo()
4702 for inst in inst_data.values():
4703 if inst.primary_node in node_to_primary:
4704 node_to_primary[inst.primary_node].add(inst.name)
4705 for secnode in inst.secondary_nodes:
4706 if secnode in node_to_secondary:
4707 node_to_secondary[secnode].add(inst.name)
4709 node_to_primary = None
4710 node_to_secondary = None
4712 if query.NQ_OOB in self.requested_data:
4713 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4714 for name, node in all_info.iteritems())
4718 if query.NQ_GROUP in self.requested_data:
4719 groups = lu.cfg.GetAllNodeGroupsInfo()
4723 return query.NodeQueryData([all_info[name] for name in nodenames],
4724 live_data, lu.cfg.GetMasterNode(),
4725 node_to_primary, node_to_secondary, groups,
4726 oob_support, lu.cfg.GetClusterInfo())
4729 class LUNodeQuery(NoHooksLU):
4730 """Logical unit for querying nodes.
4733 # pylint: disable=W0142
4736 def CheckArguments(self):
4737 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4738 self.op.output_fields, self.op.use_locking)
4740 def ExpandNames(self):
4741 self.nq.ExpandNames(self)
4743 def DeclareLocks(self, level):
4744 self.nq.DeclareLocks(self, level)
4746 def Exec(self, feedback_fn):
4747 return self.nq.OldStyleQuery(self)
4750 class LUNodeQueryvols(NoHooksLU):
4751 """Logical unit for getting volumes on node(s).
4755 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4756 _FIELDS_STATIC = utils.FieldSet("node")
4758 def CheckArguments(self):
4759 _CheckOutputFields(static=self._FIELDS_STATIC,
4760 dynamic=self._FIELDS_DYNAMIC,
4761 selected=self.op.output_fields)
4763 def ExpandNames(self):
4764 self.share_locks = _ShareAll()
4765 self.needed_locks = {}
4767 if not self.op.nodes:
4768 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4770 self.needed_locks[locking.LEVEL_NODE] = \
4771 _GetWantedNodes(self, self.op.nodes)
4773 def Exec(self, feedback_fn):
4774 """Computes the list of nodes and their attributes.
4777 nodenames = self.owned_locks(locking.LEVEL_NODE)
4778 volumes = self.rpc.call_node_volumes(nodenames)
4780 ilist = self.cfg.GetAllInstancesInfo()
4781 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4784 for node in nodenames:
4785 nresult = volumes[node]
4788 msg = nresult.fail_msg
4790 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4793 node_vols = sorted(nresult.payload,
4794 key=operator.itemgetter("dev"))
4796 for vol in node_vols:
4798 for field in self.op.output_fields:
4801 elif field == "phys":
4805 elif field == "name":
4807 elif field == "size":
4808 val = int(float(vol["size"]))
4809 elif field == "instance":
4810 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4812 raise errors.ParameterError(field)
4813 node_output.append(str(val))
4815 output.append(node_output)
4820 class LUNodeQueryStorage(NoHooksLU):
4821 """Logical unit for getting information on storage units on node(s).
4824 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4827 def CheckArguments(self):
4828 _CheckOutputFields(static=self._FIELDS_STATIC,
4829 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4830 selected=self.op.output_fields)
4832 def ExpandNames(self):
4833 self.share_locks = _ShareAll()
4834 self.needed_locks = {}
4837 self.needed_locks[locking.LEVEL_NODE] = \
4838 _GetWantedNodes(self, self.op.nodes)
4840 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4842 def Exec(self, feedback_fn):
4843 """Computes the list of nodes and their attributes.
4846 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4848 # Always get name to sort by
4849 if constants.SF_NAME in self.op.output_fields:
4850 fields = self.op.output_fields[:]
4852 fields = [constants.SF_NAME] + self.op.output_fields
4854 # Never ask for node or type as it's only known to the LU
4855 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4856 while extra in fields:
4857 fields.remove(extra)
4859 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4860 name_idx = field_idx[constants.SF_NAME]
4862 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4863 data = self.rpc.call_storage_list(self.nodes,
4864 self.op.storage_type, st_args,
4865 self.op.name, fields)
4869 for node in utils.NiceSort(self.nodes):
4870 nresult = data[node]
4874 msg = nresult.fail_msg
4876 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4879 rows = dict([(row[name_idx], row) for row in nresult.payload])
4881 for name in utils.NiceSort(rows.keys()):
4886 for field in self.op.output_fields:
4887 if field == constants.SF_NODE:
4889 elif field == constants.SF_TYPE:
4890 val = self.op.storage_type
4891 elif field in field_idx:
4892 val = row[field_idx[field]]
4894 raise errors.ParameterError(field)
4903 class _InstanceQuery(_QueryBase):
4904 FIELDS = query.INSTANCE_FIELDS
4906 def ExpandNames(self, lu):
4907 lu.needed_locks = {}
4908 lu.share_locks = _ShareAll()
4911 self.wanted = _GetWantedInstances(lu, self.names)
4913 self.wanted = locking.ALL_SET
4915 self.do_locking = (self.use_locking and
4916 query.IQ_LIVE in self.requested_data)
4918 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4919 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4920 lu.needed_locks[locking.LEVEL_NODE] = []
4921 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4923 self.do_grouplocks = (self.do_locking and
4924 query.IQ_NODES in self.requested_data)
4926 def DeclareLocks(self, lu, level):
4928 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4929 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4931 # Lock all groups used by instances optimistically; this requires going
4932 # via the node before it's locked, requiring verification later on
4933 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4935 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4936 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4937 elif level == locking.LEVEL_NODE:
4938 lu._LockInstancesNodes() # pylint: disable=W0212
4941 def _CheckGroupLocks(lu):
4942 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4943 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4945 # Check if node groups for locked instances are still correct
4946 for instance_name in owned_instances:
4947 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4949 def _GetQueryData(self, lu):
4950 """Computes the list of instances and their attributes.
4953 if self.do_grouplocks:
4954 self._CheckGroupLocks(lu)
4956 cluster = lu.cfg.GetClusterInfo()
4957 all_info = lu.cfg.GetAllInstancesInfo()
4959 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4961 instance_list = [all_info[name] for name in instance_names]
4962 nodes = frozenset(itertools.chain(*(inst.all_nodes
4963 for inst in instance_list)))
4964 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4967 wrongnode_inst = set()
4969 # Gather data as requested
4970 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4972 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4974 result = node_data[name]
4976 # offline nodes will be in both lists
4977 assert result.fail_msg
4978 offline_nodes.append(name)
4980 bad_nodes.append(name)
4981 elif result.payload:
4982 for inst in result.payload:
4983 if inst in all_info:
4984 if all_info[inst].primary_node == name:
4985 live_data.update(result.payload)
4987 wrongnode_inst.add(inst)
4989 # orphan instance; we don't list it here as we don't
4990 # handle this case yet in the output of instance listing
4991 logging.warning("Orphan instance '%s' found on node %s",
4993 # else no instance is alive
4997 if query.IQ_DISKUSAGE in self.requested_data:
4998 disk_usage = dict((inst.name,
4999 _ComputeDiskSize(inst.disk_template,
5000 [{constants.IDISK_SIZE: disk.size}
5001 for disk in inst.disks]))
5002 for inst in instance_list)
5006 if query.IQ_CONSOLE in self.requested_data:
5008 for inst in instance_list:
5009 if inst.name in live_data:
5010 # Instance is running
5011 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
5013 consinfo[inst.name] = None
5014 assert set(consinfo.keys()) == set(instance_names)
5018 if query.IQ_NODES in self.requested_data:
5019 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
5021 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
5022 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
5023 for uuid in set(map(operator.attrgetter("group"),
5029 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
5030 disk_usage, offline_nodes, bad_nodes,
5031 live_data, wrongnode_inst, consinfo,
5035 class LUQuery(NoHooksLU):
5036 """Query for resources/items of a certain kind.
5039 # pylint: disable=W0142
5042 def CheckArguments(self):
5043 qcls = _GetQueryImplementation(self.op.what)
5045 self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
5047 def ExpandNames(self):
5048 self.impl.ExpandNames(self)
5050 def DeclareLocks(self, level):
5051 self.impl.DeclareLocks(self, level)
5053 def Exec(self, feedback_fn):
5054 return self.impl.NewStyleQuery(self)
5057 class LUQueryFields(NoHooksLU):
5058 """Query for resources/items of a certain kind.
5061 # pylint: disable=W0142
5064 def CheckArguments(self):
5065 self.qcls = _GetQueryImplementation(self.op.what)
5067 def ExpandNames(self):
5068 self.needed_locks = {}
5070 def Exec(self, feedback_fn):
5071 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
5074 class LUNodeModifyStorage(NoHooksLU):
5075 """Logical unit for modifying a storage volume on a node.
5080 def CheckArguments(self):
5081 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5083 storage_type = self.op.storage_type
5086 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
5088 raise errors.OpPrereqError("Storage units of type '%s' can not be"
5089 " modified" % storage_type,
5092 diff = set(self.op.changes.keys()) - modifiable
5094 raise errors.OpPrereqError("The following fields can not be modified for"
5095 " storage units of type '%s': %r" %
5096 (storage_type, list(diff)),
5099 def ExpandNames(self):
5100 self.needed_locks = {
5101 locking.LEVEL_NODE: self.op.node_name,
5104 def Exec(self, feedback_fn):
5105 """Computes the list of nodes and their attributes.
5108 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
5109 result = self.rpc.call_storage_modify(self.op.node_name,
5110 self.op.storage_type, st_args,
5111 self.op.name, self.op.changes)
5112 result.Raise("Failed to modify storage unit '%s' on %s" %
5113 (self.op.name, self.op.node_name))
5116 class LUNodeAdd(LogicalUnit):
5117 """Logical unit for adding node to the cluster.
5121 HTYPE = constants.HTYPE_NODE
5122 _NFLAGS = ["master_capable", "vm_capable"]
5124 def CheckArguments(self):
5125 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
5126 # validate/normalize the node name
5127 self.hostname = netutils.GetHostname(name=self.op.node_name,
5128 family=self.primary_ip_family)
5129 self.op.node_name = self.hostname.name
5131 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
5132 raise errors.OpPrereqError("Cannot readd the master node",
5135 if self.op.readd and self.op.group:
5136 raise errors.OpPrereqError("Cannot pass a node group when a node is"
5137 " being readded", errors.ECODE_INVAL)
5139 def BuildHooksEnv(self):
5142 This will run on all nodes before, and on all nodes + the new node after.
5146 "OP_TARGET": self.op.node_name,
5147 "NODE_NAME": self.op.node_name,
5148 "NODE_PIP": self.op.primary_ip,
5149 "NODE_SIP": self.op.secondary_ip,
5150 "MASTER_CAPABLE": str(self.op.master_capable),
5151 "VM_CAPABLE": str(self.op.vm_capable),
5154 def BuildHooksNodes(self):
5155 """Build hooks nodes.
5158 # Exclude added node
5159 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
5160 post_nodes = pre_nodes + [self.op.node_name, ]
5162 return (pre_nodes, post_nodes)
5164 def CheckPrereq(self):
5165 """Check prerequisites.
5168 - the new node is not already in the config
5170 - its parameters (single/dual homed) matches the cluster
5172 Any errors are signaled by raising errors.OpPrereqError.
5176 hostname = self.hostname
5177 node = hostname.name
5178 primary_ip = self.op.primary_ip = hostname.ip
5179 if self.op.secondary_ip is None:
5180 if self.primary_ip_family == netutils.IP6Address.family:
5181 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
5182 " IPv4 address must be given as secondary",
5184 self.op.secondary_ip = primary_ip
5186 secondary_ip = self.op.secondary_ip
5187 if not netutils.IP4Address.IsValid(secondary_ip):
5188 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5189 " address" % secondary_ip, errors.ECODE_INVAL)
5191 node_list = cfg.GetNodeList()
5192 if not self.op.readd and node in node_list:
5193 raise errors.OpPrereqError("Node %s is already in the configuration" %
5194 node, errors.ECODE_EXISTS)
5195 elif self.op.readd and node not in node_list:
5196 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
5199 self.changed_primary_ip = False
5201 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
5202 if self.op.readd and node == existing_node_name:
5203 if existing_node.secondary_ip != secondary_ip:
5204 raise errors.OpPrereqError("Readded node doesn't have the same IP"
5205 " address configuration as before",
5207 if existing_node.primary_ip != primary_ip:
5208 self.changed_primary_ip = True
5212 if (existing_node.primary_ip == primary_ip or
5213 existing_node.secondary_ip == primary_ip or
5214 existing_node.primary_ip == secondary_ip or
5215 existing_node.secondary_ip == secondary_ip):
5216 raise errors.OpPrereqError("New node ip address(es) conflict with"
5217 " existing node %s" % existing_node.name,
5218 errors.ECODE_NOTUNIQUE)
5220 # After this 'if' block, None is no longer a valid value for the
5221 # _capable op attributes
5223 old_node = self.cfg.GetNodeInfo(node)
5224 assert old_node is not None, "Can't retrieve locked node %s" % node
5225 for attr in self._NFLAGS:
5226 if getattr(self.op, attr) is None:
5227 setattr(self.op, attr, getattr(old_node, attr))
5229 for attr in self._NFLAGS:
5230 if getattr(self.op, attr) is None:
5231 setattr(self.op, attr, True)
5233 if self.op.readd and not self.op.vm_capable:
5234 pri, sec = cfg.GetNodeInstances(node)
5236 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5237 " flag set to false, but it already holds"
5238 " instances" % node,
5241 # check that the type of the node (single versus dual homed) is the
5242 # same as for the master
5243 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5244 master_singlehomed = myself.secondary_ip == myself.primary_ip
5245 newbie_singlehomed = secondary_ip == primary_ip
5246 if master_singlehomed != newbie_singlehomed:
5247 if master_singlehomed:
5248 raise errors.OpPrereqError("The master has no secondary ip but the"
5249 " new node has one",
5252 raise errors.OpPrereqError("The master has a secondary ip but the"
5253 " new node doesn't have one",
5256 # checks reachability
5257 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5258 raise errors.OpPrereqError("Node not reachable by ping",
5259 errors.ECODE_ENVIRON)
5261 if not newbie_singlehomed:
5262 # check reachability from my secondary ip to newbie's secondary ip
5263 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5264 source=myself.secondary_ip):
5265 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5266 " based ping to node daemon port",
5267 errors.ECODE_ENVIRON)
5274 if self.op.master_capable:
5275 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5277 self.master_candidate = False
5280 self.new_node = old_node
5282 node_group = cfg.LookupNodeGroup(self.op.group)
5283 self.new_node = objects.Node(name=node,
5284 primary_ip=primary_ip,
5285 secondary_ip=secondary_ip,
5286 master_candidate=self.master_candidate,
5287 offline=False, drained=False,
5290 if self.op.ndparams:
5291 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5293 def Exec(self, feedback_fn):
5294 """Adds the new node to the cluster.
5297 new_node = self.new_node
5298 node = new_node.name
5300 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
5303 # We adding a new node so we assume it's powered
5304 new_node.powered = True
5306 # for re-adds, reset the offline/drained/master-candidate flags;
5307 # we need to reset here, otherwise offline would prevent RPC calls
5308 # later in the procedure; this also means that if the re-add
5309 # fails, we are left with a non-offlined, broken node
5311 new_node.drained = new_node.offline = False # pylint: disable=W0201
5312 self.LogInfo("Readding a node, the offline/drained flags were reset")
5313 # if we demote the node, we do cleanup later in the procedure
5314 new_node.master_candidate = self.master_candidate
5315 if self.changed_primary_ip:
5316 new_node.primary_ip = self.op.primary_ip
5318 # copy the master/vm_capable flags
5319 for attr in self._NFLAGS:
5320 setattr(new_node, attr, getattr(self.op, attr))
5322 # notify the user about any possible mc promotion
5323 if new_node.master_candidate:
5324 self.LogInfo("Node will be a master candidate")
5326 if self.op.ndparams:
5327 new_node.ndparams = self.op.ndparams
5329 new_node.ndparams = {}
5331 # check connectivity
5332 result = self.rpc.call_version([node])[node]
5333 result.Raise("Can't get version information from node %s" % node)
5334 if constants.PROTOCOL_VERSION == result.payload:
5335 logging.info("Communication to node %s fine, sw version %s match",
5336 node, result.payload)
5338 raise errors.OpExecError("Version mismatch master version %s,"
5339 " node version %s" %
5340 (constants.PROTOCOL_VERSION, result.payload))
5342 # Add node to our /etc/hosts, and add key to known_hosts
5343 if self.cfg.GetClusterInfo().modify_etc_hosts:
5344 master_node = self.cfg.GetMasterNode()
5345 result = self.rpc.call_etc_hosts_modify(master_node,
5346 constants.ETC_HOSTS_ADD,
5349 result.Raise("Can't update hosts file with new host data")
5351 if new_node.secondary_ip != new_node.primary_ip:
5352 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5355 node_verify_list = [self.cfg.GetMasterNode()]
5356 node_verify_param = {
5357 constants.NV_NODELIST: ([node], {}),
5358 # TODO: do a node-net-test as well?
5361 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5362 self.cfg.GetClusterName())
5363 for verifier in node_verify_list:
5364 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5365 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5367 for failed in nl_payload:
5368 feedback_fn("ssh/hostname verification failed"
5369 " (checking from %s): %s" %
5370 (verifier, nl_payload[failed]))
5371 raise errors.OpExecError("ssh/hostname verification failed")
5374 _RedistributeAncillaryFiles(self)
5375 self.context.ReaddNode(new_node)
5376 # make sure we redistribute the config
5377 self.cfg.Update(new_node, feedback_fn)
5378 # and make sure the new node will not have old files around
5379 if not new_node.master_candidate:
5380 result = self.rpc.call_node_demote_from_mc(new_node.name)
5381 msg = result.fail_msg
5383 self.LogWarning("Node failed to demote itself from master"
5384 " candidate status: %s" % msg)
5386 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5387 additional_vm=self.op.vm_capable)
5388 self.context.AddNode(new_node, self.proc.GetECId())
5391 class LUNodeSetParams(LogicalUnit):
5392 """Modifies the parameters of a node.
5394 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5395 to the node role (as _ROLE_*)
5396 @cvar _R2F: a dictionary from node role to tuples of flags
5397 @cvar _FLAGS: a list of attribute names corresponding to the flags
5400 HPATH = "node-modify"
5401 HTYPE = constants.HTYPE_NODE
5403 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5405 (True, False, False): _ROLE_CANDIDATE,
5406 (False, True, False): _ROLE_DRAINED,
5407 (False, False, True): _ROLE_OFFLINE,
5408 (False, False, False): _ROLE_REGULAR,
5410 _R2F = dict((v, k) for k, v in _F2R.items())
5411 _FLAGS = ["master_candidate", "drained", "offline"]
5413 def CheckArguments(self):
5414 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5415 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5416 self.op.master_capable, self.op.vm_capable,
5417 self.op.secondary_ip, self.op.ndparams, self.op.hv_state,
5419 if all_mods.count(None) == len(all_mods):
5420 raise errors.OpPrereqError("Please pass at least one modification",
5422 if all_mods.count(True) > 1:
5423 raise errors.OpPrereqError("Can't set the node into more than one"
5424 " state at the same time",
5427 # Boolean value that tells us whether we might be demoting from MC
5428 self.might_demote = (self.op.master_candidate == False or
5429 self.op.offline == True or
5430 self.op.drained == True or
5431 self.op.master_capable == False)
5433 if self.op.secondary_ip:
5434 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5435 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5436 " address" % self.op.secondary_ip,
5439 self.lock_all = self.op.auto_promote and self.might_demote
5440 self.lock_instances = self.op.secondary_ip is not None
5442 def _InstanceFilter(self, instance):
5443 """Filter for getting affected instances.
5446 return (instance.disk_template in constants.DTS_INT_MIRROR and
5447 self.op.node_name in instance.all_nodes)
5449 def ExpandNames(self):
5451 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5453 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5455 # Since modifying a node can have severe effects on currently running
5456 # operations the resource lock is at least acquired in shared mode
5457 self.needed_locks[locking.LEVEL_NODE_RES] = \
5458 self.needed_locks[locking.LEVEL_NODE]
5460 # Get node resource and instance locks in shared mode; they are not used
5461 # for anything but read-only access
5462 self.share_locks[locking.LEVEL_NODE_RES] = 1
5463 self.share_locks[locking.LEVEL_INSTANCE] = 1
5465 if self.lock_instances:
5466 self.needed_locks[locking.LEVEL_INSTANCE] = \
5467 frozenset(self.cfg.GetInstancesInfoByFilter(self._InstanceFilter))
5469 def BuildHooksEnv(self):
5472 This runs on the master node.
5476 "OP_TARGET": self.op.node_name,
5477 "MASTER_CANDIDATE": str(self.op.master_candidate),
5478 "OFFLINE": str(self.op.offline),
5479 "DRAINED": str(self.op.drained),
5480 "MASTER_CAPABLE": str(self.op.master_capable),
5481 "VM_CAPABLE": str(self.op.vm_capable),
5484 def BuildHooksNodes(self):
5485 """Build hooks nodes.
5488 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5491 def CheckPrereq(self):
5492 """Check prerequisites.
5494 This only checks the instance list against the existing names.
5497 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5499 if self.lock_instances:
5500 affected_instances = \
5501 self.cfg.GetInstancesInfoByFilter(self._InstanceFilter)
5503 # Verify instance locks
5504 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
5505 wanted_instances = frozenset(affected_instances.keys())
5506 if wanted_instances - owned_instances:
5507 raise errors.OpPrereqError("Instances affected by changing node %s's"
5508 " secondary IP address have changed since"
5509 " locks were acquired, wanted '%s', have"
5510 " '%s'; retry the operation" %
5512 utils.CommaJoin(wanted_instances),
5513 utils.CommaJoin(owned_instances)),
5516 affected_instances = None
5518 if (self.op.master_candidate is not None or
5519 self.op.drained is not None or
5520 self.op.offline is not None):
5521 # we can't change the master's node flags
5522 if self.op.node_name == self.cfg.GetMasterNode():
5523 raise errors.OpPrereqError("The master role can be changed"
5524 " only via master-failover",
5527 if self.op.master_candidate and not node.master_capable:
5528 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5529 " it a master candidate" % node.name,
5532 if self.op.vm_capable == False:
5533 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5535 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5536 " the vm_capable flag" % node.name,
5539 if node.master_candidate and self.might_demote and not self.lock_all:
5540 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5541 # check if after removing the current node, we're missing master
5543 (mc_remaining, mc_should, _) = \
5544 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5545 if mc_remaining < mc_should:
5546 raise errors.OpPrereqError("Not enough master candidates, please"
5547 " pass auto promote option to allow"
5548 " promotion", errors.ECODE_STATE)
5550 self.old_flags = old_flags = (node.master_candidate,
5551 node.drained, node.offline)
5552 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5553 self.old_role = old_role = self._F2R[old_flags]
5555 # Check for ineffective changes
5556 for attr in self._FLAGS:
5557 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5558 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5559 setattr(self.op, attr, None)
5561 # Past this point, any flag change to False means a transition
5562 # away from the respective state, as only real changes are kept
5564 # TODO: We might query the real power state if it supports OOB
5565 if _SupportsOob(self.cfg, node):
5566 if self.op.offline is False and not (node.powered or
5567 self.op.powered == True):
5568 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5569 " offline status can be reset") %
5571 elif self.op.powered is not None:
5572 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5573 " as it does not support out-of-band"
5574 " handling") % self.op.node_name)
5576 # If we're being deofflined/drained, we'll MC ourself if needed
5577 if (self.op.drained == False or self.op.offline == False or
5578 (self.op.master_capable and not node.master_capable)):
5579 if _DecideSelfPromotion(self):
5580 self.op.master_candidate = True
5581 self.LogInfo("Auto-promoting node to master candidate")
5583 # If we're no longer master capable, we'll demote ourselves from MC
5584 if self.op.master_capable == False and node.master_candidate:
5585 self.LogInfo("Demoting from master candidate")
5586 self.op.master_candidate = False
5589 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5590 if self.op.master_candidate:
5591 new_role = self._ROLE_CANDIDATE
5592 elif self.op.drained:
5593 new_role = self._ROLE_DRAINED
5594 elif self.op.offline:
5595 new_role = self._ROLE_OFFLINE
5596 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5597 # False is still in new flags, which means we're un-setting (the
5599 new_role = self._ROLE_REGULAR
5600 else: # no new flags, nothing, keep old role
5603 self.new_role = new_role
5605 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5606 # Trying to transition out of offline status
5607 # TODO: Use standard RPC runner, but make sure it works when the node is
5608 # still marked offline
5609 result = rpc.BootstrapRunner().call_version([node.name])[node.name]
5611 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5612 " to report its version: %s" %
5613 (node.name, result.fail_msg),
5616 self.LogWarning("Transitioning node from offline to online state"
5617 " without using re-add. Please make sure the node"
5620 if self.op.secondary_ip:
5621 # Ok even without locking, because this can't be changed by any LU
5622 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5623 master_singlehomed = master.secondary_ip == master.primary_ip
5624 if master_singlehomed and self.op.secondary_ip:
5625 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5626 " homed cluster", errors.ECODE_INVAL)
5628 assert not (frozenset(affected_instances) -
5629 self.owned_locks(locking.LEVEL_INSTANCE))
5632 if affected_instances:
5633 raise errors.OpPrereqError("Cannot change secondary IP address:"
5634 " offline node has instances (%s)"
5635 " configured to use it" %
5636 utils.CommaJoin(affected_instances.keys()))
5638 # On online nodes, check that no instances are running, and that
5639 # the node has the new ip and we can reach it.
5640 for instance in affected_instances.values():
5641 _CheckInstanceState(self, instance, INSTANCE_DOWN,
5642 msg="cannot change secondary ip")
5644 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5645 if master.name != node.name:
5646 # check reachability from master secondary ip to new secondary ip
5647 if not netutils.TcpPing(self.op.secondary_ip,
5648 constants.DEFAULT_NODED_PORT,
5649 source=master.secondary_ip):
5650 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5651 " based ping to node daemon port",
5652 errors.ECODE_ENVIRON)
5654 if self.op.ndparams:
5655 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5656 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5657 self.new_ndparams = new_ndparams
5659 if self.op.hv_state:
5660 self.new_hv_state = _MergeAndVerifyHvState(self.op.hv_state,
5661 self.node.hv_state_static)
5663 if self.op.disk_state:
5664 self.new_disk_state = \
5665 _MergeAndVerifyDiskState(self.op.disk_state,
5666 self.node.disk_state_static)
5668 def Exec(self, feedback_fn):
5673 old_role = self.old_role
5674 new_role = self.new_role
5678 if self.op.ndparams:
5679 node.ndparams = self.new_ndparams
5681 if self.op.powered is not None:
5682 node.powered = self.op.powered
5684 if self.op.hv_state:
5685 node.hv_state_static = self.new_hv_state
5687 if self.op.disk_state:
5688 node.disk_state_static = self.new_disk_state
5690 for attr in ["master_capable", "vm_capable"]:
5691 val = getattr(self.op, attr)
5693 setattr(node, attr, val)
5694 result.append((attr, str(val)))
5696 if new_role != old_role:
5697 # Tell the node to demote itself, if no longer MC and not offline
5698 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5699 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5701 self.LogWarning("Node failed to demote itself: %s", msg)
5703 new_flags = self._R2F[new_role]
5704 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5706 result.append((desc, str(nf)))
5707 (node.master_candidate, node.drained, node.offline) = new_flags
5709 # we locked all nodes, we adjust the CP before updating this node
5711 _AdjustCandidatePool(self, [node.name])
5713 if self.op.secondary_ip:
5714 node.secondary_ip = self.op.secondary_ip
5715 result.append(("secondary_ip", self.op.secondary_ip))
5717 # this will trigger configuration file update, if needed
5718 self.cfg.Update(node, feedback_fn)
5720 # this will trigger job queue propagation or cleanup if the mc
5722 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5723 self.context.ReaddNode(node)
5728 class LUNodePowercycle(NoHooksLU):
5729 """Powercycles a node.
5734 def CheckArguments(self):
5735 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5736 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5737 raise errors.OpPrereqError("The node is the master and the force"
5738 " parameter was not set",
5741 def ExpandNames(self):
5742 """Locking for PowercycleNode.
5744 This is a last-resort option and shouldn't block on other
5745 jobs. Therefore, we grab no locks.
5748 self.needed_locks = {}
5750 def Exec(self, feedback_fn):
5754 result = self.rpc.call_node_powercycle(self.op.node_name,
5755 self.cfg.GetHypervisorType())
5756 result.Raise("Failed to schedule the reboot")
5757 return result.payload
5760 class LUClusterQuery(NoHooksLU):
5761 """Query cluster configuration.
5766 def ExpandNames(self):
5767 self.needed_locks = {}
5769 def Exec(self, feedback_fn):
5770 """Return cluster config.
5773 cluster = self.cfg.GetClusterInfo()
5776 # Filter just for enabled hypervisors
5777 for os_name, hv_dict in cluster.os_hvp.items():
5778 os_hvp[os_name] = {}
5779 for hv_name, hv_params in hv_dict.items():
5780 if hv_name in cluster.enabled_hypervisors:
5781 os_hvp[os_name][hv_name] = hv_params
5783 # Convert ip_family to ip_version
5784 primary_ip_version = constants.IP4_VERSION
5785 if cluster.primary_ip_family == netutils.IP6Address.family:
5786 primary_ip_version = constants.IP6_VERSION
5789 "software_version": constants.RELEASE_VERSION,
5790 "protocol_version": constants.PROTOCOL_VERSION,
5791 "config_version": constants.CONFIG_VERSION,
5792 "os_api_version": max(constants.OS_API_VERSIONS),
5793 "export_version": constants.EXPORT_VERSION,
5794 "architecture": (platform.architecture()[0], platform.machine()),
5795 "name": cluster.cluster_name,
5796 "master": cluster.master_node,
5797 "default_hypervisor": cluster.primary_hypervisor,
5798 "enabled_hypervisors": cluster.enabled_hypervisors,
5799 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5800 for hypervisor_name in cluster.enabled_hypervisors]),
5802 "beparams": cluster.beparams,
5803 "osparams": cluster.osparams,
5804 "nicparams": cluster.nicparams,
5805 "ndparams": cluster.ndparams,
5806 "candidate_pool_size": cluster.candidate_pool_size,
5807 "master_netdev": cluster.master_netdev,
5808 "master_netmask": cluster.master_netmask,
5809 "use_external_mip_script": cluster.use_external_mip_script,
5810 "volume_group_name": cluster.volume_group_name,
5811 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5812 "file_storage_dir": cluster.file_storage_dir,
5813 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5814 "maintain_node_health": cluster.maintain_node_health,
5815 "ctime": cluster.ctime,
5816 "mtime": cluster.mtime,
5817 "uuid": cluster.uuid,
5818 "tags": list(cluster.GetTags()),
5819 "uid_pool": cluster.uid_pool,
5820 "default_iallocator": cluster.default_iallocator,
5821 "reserved_lvs": cluster.reserved_lvs,
5822 "primary_ip_version": primary_ip_version,
5823 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5824 "hidden_os": cluster.hidden_os,
5825 "blacklisted_os": cluster.blacklisted_os,
5831 class LUClusterConfigQuery(NoHooksLU):
5832 """Return configuration values.
5836 _FIELDS_DYNAMIC = utils.FieldSet()
5837 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5838 "watcher_pause", "volume_group_name")
5840 def CheckArguments(self):
5841 _CheckOutputFields(static=self._FIELDS_STATIC,
5842 dynamic=self._FIELDS_DYNAMIC,
5843 selected=self.op.output_fields)
5845 def ExpandNames(self):
5846 self.needed_locks = {}
5848 def Exec(self, feedback_fn):
5849 """Dump a representation of the cluster config to the standard output.
5853 for field in self.op.output_fields:
5854 if field == "cluster_name":
5855 entry = self.cfg.GetClusterName()
5856 elif field == "master_node":
5857 entry = self.cfg.GetMasterNode()
5858 elif field == "drain_flag":
5859 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5860 elif field == "watcher_pause":
5861 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5862 elif field == "volume_group_name":
5863 entry = self.cfg.GetVGName()
5865 raise errors.ParameterError(field)
5866 values.append(entry)
5870 class LUInstanceActivateDisks(NoHooksLU):
5871 """Bring up an instance's disks.
5876 def ExpandNames(self):
5877 self._ExpandAndLockInstance()
5878 self.needed_locks[locking.LEVEL_NODE] = []
5879 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5881 def DeclareLocks(self, level):
5882 if level == locking.LEVEL_NODE:
5883 self._LockInstancesNodes()
5885 def CheckPrereq(self):
5886 """Check prerequisites.
5888 This checks that the instance is in the cluster.
5891 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5892 assert self.instance is not None, \
5893 "Cannot retrieve locked instance %s" % self.op.instance_name
5894 _CheckNodeOnline(self, self.instance.primary_node)
5896 def Exec(self, feedback_fn):
5897 """Activate the disks.
5900 disks_ok, disks_info = \
5901 _AssembleInstanceDisks(self, self.instance,
5902 ignore_size=self.op.ignore_size)
5904 raise errors.OpExecError("Cannot activate block devices")
5909 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5911 """Prepare the block devices for an instance.
5913 This sets up the block devices on all nodes.
5915 @type lu: L{LogicalUnit}
5916 @param lu: the logical unit on whose behalf we execute
5917 @type instance: L{objects.Instance}
5918 @param instance: the instance for whose disks we assemble
5919 @type disks: list of L{objects.Disk} or None
5920 @param disks: which disks to assemble (or all, if None)
5921 @type ignore_secondaries: boolean
5922 @param ignore_secondaries: if true, errors on secondary nodes
5923 won't result in an error return from the function
5924 @type ignore_size: boolean
5925 @param ignore_size: if true, the current known size of the disk
5926 will not be used during the disk activation, useful for cases
5927 when the size is wrong
5928 @return: False if the operation failed, otherwise a list of
5929 (host, instance_visible_name, node_visible_name)
5930 with the mapping from node devices to instance devices
5935 iname = instance.name
5936 disks = _ExpandCheckDisks(instance, disks)
5938 # With the two passes mechanism we try to reduce the window of
5939 # opportunity for the race condition of switching DRBD to primary
5940 # before handshaking occured, but we do not eliminate it
5942 # The proper fix would be to wait (with some limits) until the
5943 # connection has been made and drbd transitions from WFConnection
5944 # into any other network-connected state (Connected, SyncTarget,
5947 # 1st pass, assemble on all nodes in secondary mode
5948 for idx, inst_disk in enumerate(disks):
5949 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5951 node_disk = node_disk.Copy()
5952 node_disk.UnsetSize()
5953 lu.cfg.SetDiskID(node_disk, node)
5954 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5955 msg = result.fail_msg
5957 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5958 " (is_primary=False, pass=1): %s",
5959 inst_disk.iv_name, node, msg)
5960 if not ignore_secondaries:
5963 # FIXME: race condition on drbd migration to primary
5965 # 2nd pass, do only the primary node
5966 for idx, inst_disk in enumerate(disks):
5969 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5970 if node != instance.primary_node:
5973 node_disk = node_disk.Copy()
5974 node_disk.UnsetSize()
5975 lu.cfg.SetDiskID(node_disk, node)
5976 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5977 msg = result.fail_msg
5979 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5980 " (is_primary=True, pass=2): %s",
5981 inst_disk.iv_name, node, msg)
5984 dev_path = result.payload
5986 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5988 # leave the disks configured for the primary node
5989 # this is a workaround that would be fixed better by
5990 # improving the logical/physical id handling
5992 lu.cfg.SetDiskID(disk, instance.primary_node)
5994 return disks_ok, device_info
5997 def _StartInstanceDisks(lu, instance, force):
5998 """Start the disks of an instance.
6001 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
6002 ignore_secondaries=force)
6004 _ShutdownInstanceDisks(lu, instance)
6005 if force is not None and not force:
6006 lu.proc.LogWarning("", hint="If the message above refers to a"
6008 " you can retry the operation using '--force'.")
6009 raise errors.OpExecError("Disk consistency error")
6012 class LUInstanceDeactivateDisks(NoHooksLU):
6013 """Shutdown an instance's disks.
6018 def ExpandNames(self):
6019 self._ExpandAndLockInstance()
6020 self.needed_locks[locking.LEVEL_NODE] = []
6021 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6023 def DeclareLocks(self, level):
6024 if level == locking.LEVEL_NODE:
6025 self._LockInstancesNodes()
6027 def CheckPrereq(self):
6028 """Check prerequisites.
6030 This checks that the instance is in the cluster.
6033 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6034 assert self.instance is not None, \
6035 "Cannot retrieve locked instance %s" % self.op.instance_name
6037 def Exec(self, feedback_fn):
6038 """Deactivate the disks
6041 instance = self.instance
6043 _ShutdownInstanceDisks(self, instance)
6045 _SafeShutdownInstanceDisks(self, instance)
6048 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
6049 """Shutdown block devices of an instance.
6051 This function checks if an instance is running, before calling
6052 _ShutdownInstanceDisks.
6055 _CheckInstanceState(lu, instance, INSTANCE_DOWN, msg="cannot shutdown disks")
6056 _ShutdownInstanceDisks(lu, instance, disks=disks)
6059 def _ExpandCheckDisks(instance, disks):
6060 """Return the instance disks selected by the disks list
6062 @type disks: list of L{objects.Disk} or None
6063 @param disks: selected disks
6064 @rtype: list of L{objects.Disk}
6065 @return: selected instance disks to act on
6069 return instance.disks
6071 if not set(disks).issubset(instance.disks):
6072 raise errors.ProgrammerError("Can only act on disks belonging to the"
6077 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
6078 """Shutdown block devices of an instance.
6080 This does the shutdown on all nodes of the instance.
6082 If the ignore_primary is false, errors on the primary node are
6087 disks = _ExpandCheckDisks(instance, disks)
6090 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
6091 lu.cfg.SetDiskID(top_disk, node)
6092 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
6093 msg = result.fail_msg
6095 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
6096 disk.iv_name, node, msg)
6097 if ((node == instance.primary_node and not ignore_primary) or
6098 (node != instance.primary_node and not result.offline)):
6103 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
6104 """Checks if a node has enough free memory.
6106 This function check if a given node has the needed amount of free
6107 memory. In case the node has less memory or we cannot get the
6108 information from the node, this function raise an OpPrereqError
6111 @type lu: C{LogicalUnit}
6112 @param lu: a logical unit from which we get configuration data
6114 @param node: the node to check
6115 @type reason: C{str}
6116 @param reason: string to use in the error message
6117 @type requested: C{int}
6118 @param requested: the amount of memory in MiB to check for
6119 @type hypervisor_name: C{str}
6120 @param hypervisor_name: the hypervisor to ask for memory stats
6121 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
6122 we cannot check the node
6125 nodeinfo = lu.rpc.call_node_info([node], None, [hypervisor_name])
6126 nodeinfo[node].Raise("Can't get data from node %s" % node,
6127 prereq=True, ecode=errors.ECODE_ENVIRON)
6128 (_, _, (hv_info, )) = nodeinfo[node].payload
6130 free_mem = hv_info.get("memory_free", None)
6131 if not isinstance(free_mem, int):
6132 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
6133 " was '%s'" % (node, free_mem),
6134 errors.ECODE_ENVIRON)
6135 if requested > free_mem:
6136 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
6137 " needed %s MiB, available %s MiB" %
6138 (node, reason, requested, free_mem),
6142 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
6143 """Checks if nodes have enough free disk space in the all VGs.
6145 This function check if all given nodes have the needed amount of
6146 free disk. In case any node has less disk or we cannot get the
6147 information from the node, this function raise an OpPrereqError
6150 @type lu: C{LogicalUnit}
6151 @param lu: a logical unit from which we get configuration data
6152 @type nodenames: C{list}
6153 @param nodenames: the list of node names to check
6154 @type req_sizes: C{dict}
6155 @param req_sizes: the hash of vg and corresponding amount of disk in
6157 @raise errors.OpPrereqError: if the node doesn't have enough disk,
6158 or we cannot check the node
6161 for vg, req_size in req_sizes.items():
6162 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
6165 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
6166 """Checks if nodes have enough free disk space in the specified VG.
6168 This function check if all given nodes have the needed amount of
6169 free disk. In case any node has less disk or we cannot get the
6170 information from the node, this function raise an OpPrereqError
6173 @type lu: C{LogicalUnit}
6174 @param lu: a logical unit from which we get configuration data
6175 @type nodenames: C{list}
6176 @param nodenames: the list of node names to check
6178 @param vg: the volume group to check
6179 @type requested: C{int}
6180 @param requested: the amount of disk in MiB to check for
6181 @raise errors.OpPrereqError: if the node doesn't have enough disk,
6182 or we cannot check the node
6185 nodeinfo = lu.rpc.call_node_info(nodenames, [vg], None)
6186 for node in nodenames:
6187 info = nodeinfo[node]
6188 info.Raise("Cannot get current information from node %s" % node,
6189 prereq=True, ecode=errors.ECODE_ENVIRON)
6190 (_, (vg_info, ), _) = info.payload
6191 vg_free = vg_info.get("vg_free", None)
6192 if not isinstance(vg_free, int):
6193 raise errors.OpPrereqError("Can't compute free disk space on node"
6194 " %s for vg %s, result was '%s'" %
6195 (node, vg, vg_free), errors.ECODE_ENVIRON)
6196 if requested > vg_free:
6197 raise errors.OpPrereqError("Not enough disk space on target node %s"
6198 " vg %s: required %d MiB, available %d MiB" %
6199 (node, vg, requested, vg_free),
6203 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
6204 """Checks if nodes have enough physical CPUs
6206 This function checks if all given nodes have the needed number of
6207 physical CPUs. In case any node has less CPUs or we cannot get the
6208 information from the node, this function raises an OpPrereqError
6211 @type lu: C{LogicalUnit}
6212 @param lu: a logical unit from which we get configuration data
6213 @type nodenames: C{list}
6214 @param nodenames: the list of node names to check
6215 @type requested: C{int}
6216 @param requested: the minimum acceptable number of physical CPUs
6217 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
6218 or we cannot check the node
6221 nodeinfo = lu.rpc.call_node_info(nodenames, None, [hypervisor_name])
6222 for node in nodenames:
6223 info = nodeinfo[node]
6224 info.Raise("Cannot get current information from node %s" % node,
6225 prereq=True, ecode=errors.ECODE_ENVIRON)
6226 (_, _, (hv_info, )) = info.payload
6227 num_cpus = hv_info.get("cpu_total", None)
6228 if not isinstance(num_cpus, int):
6229 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
6230 " on node %s, result was '%s'" %
6231 (node, num_cpus), errors.ECODE_ENVIRON)
6232 if requested > num_cpus:
6233 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
6234 "required" % (node, num_cpus, requested),
6238 class LUInstanceStartup(LogicalUnit):
6239 """Starts an instance.
6242 HPATH = "instance-start"
6243 HTYPE = constants.HTYPE_INSTANCE
6246 def CheckArguments(self):
6248 if self.op.beparams:
6249 # fill the beparams dict
6250 objects.UpgradeBeParams(self.op.beparams)
6251 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
6253 def ExpandNames(self):
6254 self._ExpandAndLockInstance()
6256 def BuildHooksEnv(self):
6259 This runs on master, primary and secondary nodes of the instance.
6263 "FORCE": self.op.force,
6266 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6270 def BuildHooksNodes(self):
6271 """Build hooks nodes.
6274 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6277 def CheckPrereq(self):
6278 """Check prerequisites.
6280 This checks that the instance is in the cluster.
6283 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6284 assert self.instance is not None, \
6285 "Cannot retrieve locked instance %s" % self.op.instance_name
6288 if self.op.hvparams:
6289 # check hypervisor parameter syntax (locally)
6290 cluster = self.cfg.GetClusterInfo()
6291 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6292 filled_hvp = cluster.FillHV(instance)
6293 filled_hvp.update(self.op.hvparams)
6294 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6295 hv_type.CheckParameterSyntax(filled_hvp)
6296 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6298 _CheckInstanceState(self, instance, INSTANCE_ONLINE)
6300 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6302 if self.primary_offline and self.op.ignore_offline_nodes:
6303 self.proc.LogWarning("Ignoring offline primary node")
6305 if self.op.hvparams or self.op.beparams:
6306 self.proc.LogWarning("Overridden parameters are ignored")
6308 _CheckNodeOnline(self, instance.primary_node)
6310 bep = self.cfg.GetClusterInfo().FillBE(instance)
6312 # check bridges existence
6313 _CheckInstanceBridgesExist(self, instance)
6315 remote_info = self.rpc.call_instance_info(instance.primary_node,
6317 instance.hypervisor)
6318 remote_info.Raise("Error checking node %s" % instance.primary_node,
6319 prereq=True, ecode=errors.ECODE_ENVIRON)
6320 if not remote_info.payload: # not running already
6321 _CheckNodeFreeMemory(self, instance.primary_node,
6322 "starting instance %s" % instance.name,
6323 bep[constants.BE_MAXMEM], instance.hypervisor)
6325 def Exec(self, feedback_fn):
6326 """Start the instance.
6329 instance = self.instance
6330 force = self.op.force
6332 if not self.op.no_remember:
6333 self.cfg.MarkInstanceUp(instance.name)
6335 if self.primary_offline:
6336 assert self.op.ignore_offline_nodes
6337 self.proc.LogInfo("Primary node offline, marked instance as started")
6339 node_current = instance.primary_node
6341 _StartInstanceDisks(self, instance, force)
6344 self.rpc.call_instance_start(node_current,
6345 (instance, self.op.hvparams,
6347 self.op.startup_paused)
6348 msg = result.fail_msg
6350 _ShutdownInstanceDisks(self, instance)
6351 raise errors.OpExecError("Could not start instance: %s" % msg)
6354 class LUInstanceReboot(LogicalUnit):
6355 """Reboot an instance.
6358 HPATH = "instance-reboot"
6359 HTYPE = constants.HTYPE_INSTANCE
6362 def ExpandNames(self):
6363 self._ExpandAndLockInstance()
6365 def BuildHooksEnv(self):
6368 This runs on master, primary and secondary nodes of the instance.
6372 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6373 "REBOOT_TYPE": self.op.reboot_type,
6374 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6377 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6381 def BuildHooksNodes(self):
6382 """Build hooks nodes.
6385 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6388 def CheckPrereq(self):
6389 """Check prerequisites.
6391 This checks that the instance is in the cluster.
6394 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6395 assert self.instance is not None, \
6396 "Cannot retrieve locked instance %s" % self.op.instance_name
6397 _CheckInstanceState(self, instance, INSTANCE_ONLINE)
6398 _CheckNodeOnline(self, instance.primary_node)
6400 # check bridges existence
6401 _CheckInstanceBridgesExist(self, instance)
6403 def Exec(self, feedback_fn):
6404 """Reboot the instance.
6407 instance = self.instance
6408 ignore_secondaries = self.op.ignore_secondaries
6409 reboot_type = self.op.reboot_type
6411 remote_info = self.rpc.call_instance_info(instance.primary_node,
6413 instance.hypervisor)
6414 remote_info.Raise("Error checking node %s" % instance.primary_node)
6415 instance_running = bool(remote_info.payload)
6417 node_current = instance.primary_node
6419 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6420 constants.INSTANCE_REBOOT_HARD]:
6421 for disk in instance.disks:
6422 self.cfg.SetDiskID(disk, node_current)
6423 result = self.rpc.call_instance_reboot(node_current, instance,
6425 self.op.shutdown_timeout)
6426 result.Raise("Could not reboot instance")
6428 if instance_running:
6429 result = self.rpc.call_instance_shutdown(node_current, instance,
6430 self.op.shutdown_timeout)
6431 result.Raise("Could not shutdown instance for full reboot")
6432 _ShutdownInstanceDisks(self, instance)
6434 self.LogInfo("Instance %s was already stopped, starting now",
6436 _StartInstanceDisks(self, instance, ignore_secondaries)
6437 result = self.rpc.call_instance_start(node_current,
6438 (instance, None, None), False)
6439 msg = result.fail_msg
6441 _ShutdownInstanceDisks(self, instance)
6442 raise errors.OpExecError("Could not start instance for"
6443 " full reboot: %s" % msg)
6445 self.cfg.MarkInstanceUp(instance.name)
6448 class LUInstanceShutdown(LogicalUnit):
6449 """Shutdown an instance.
6452 HPATH = "instance-stop"
6453 HTYPE = constants.HTYPE_INSTANCE
6456 def ExpandNames(self):
6457 self._ExpandAndLockInstance()
6459 def BuildHooksEnv(self):
6462 This runs on master, primary and secondary nodes of the instance.
6465 env = _BuildInstanceHookEnvByObject(self, self.instance)
6466 env["TIMEOUT"] = self.op.timeout
6469 def BuildHooksNodes(self):
6470 """Build hooks nodes.
6473 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6476 def CheckPrereq(self):
6477 """Check prerequisites.
6479 This checks that the instance is in the cluster.
6482 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6483 assert self.instance is not None, \
6484 "Cannot retrieve locked instance %s" % self.op.instance_name
6486 _CheckInstanceState(self, self.instance, INSTANCE_ONLINE)
6488 self.primary_offline = \
6489 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6491 if self.primary_offline and self.op.ignore_offline_nodes:
6492 self.proc.LogWarning("Ignoring offline primary node")
6494 _CheckNodeOnline(self, self.instance.primary_node)
6496 def Exec(self, feedback_fn):
6497 """Shutdown the instance.
6500 instance = self.instance
6501 node_current = instance.primary_node
6502 timeout = self.op.timeout
6504 if not self.op.no_remember:
6505 self.cfg.MarkInstanceDown(instance.name)
6507 if self.primary_offline:
6508 assert self.op.ignore_offline_nodes
6509 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6511 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6512 msg = result.fail_msg
6514 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6516 _ShutdownInstanceDisks(self, instance)
6519 class LUInstanceReinstall(LogicalUnit):
6520 """Reinstall an instance.
6523 HPATH = "instance-reinstall"
6524 HTYPE = constants.HTYPE_INSTANCE
6527 def ExpandNames(self):
6528 self._ExpandAndLockInstance()
6530 def BuildHooksEnv(self):
6533 This runs on master, primary and secondary nodes of the instance.
6536 return _BuildInstanceHookEnvByObject(self, self.instance)
6538 def BuildHooksNodes(self):
6539 """Build hooks nodes.
6542 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6545 def CheckPrereq(self):
6546 """Check prerequisites.
6548 This checks that the instance is in the cluster and is not running.
6551 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6552 assert instance is not None, \
6553 "Cannot retrieve locked instance %s" % self.op.instance_name
6554 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6555 " offline, cannot reinstall")
6556 for node in instance.secondary_nodes:
6557 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6558 " cannot reinstall")
6560 if instance.disk_template == constants.DT_DISKLESS:
6561 raise errors.OpPrereqError("Instance '%s' has no disks" %
6562 self.op.instance_name,
6564 _CheckInstanceState(self, instance, INSTANCE_DOWN, msg="cannot reinstall")
6566 if self.op.os_type is not None:
6568 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6569 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6570 instance_os = self.op.os_type
6572 instance_os = instance.os
6574 nodelist = list(instance.all_nodes)
6576 if self.op.osparams:
6577 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6578 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6579 self.os_inst = i_osdict # the new dict (without defaults)
6583 self.instance = instance
6585 def Exec(self, feedback_fn):
6586 """Reinstall the instance.
6589 inst = self.instance
6591 if self.op.os_type is not None:
6592 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6593 inst.os = self.op.os_type
6594 # Write to configuration
6595 self.cfg.Update(inst, feedback_fn)
6597 _StartInstanceDisks(self, inst, None)
6599 feedback_fn("Running the instance OS create scripts...")
6600 # FIXME: pass debug option from opcode to backend
6601 result = self.rpc.call_instance_os_add(inst.primary_node,
6602 (inst, self.os_inst), True,
6603 self.op.debug_level)
6604 result.Raise("Could not install OS for instance %s on node %s" %
6605 (inst.name, inst.primary_node))
6607 _ShutdownInstanceDisks(self, inst)
6610 class LUInstanceRecreateDisks(LogicalUnit):
6611 """Recreate an instance's missing disks.
6614 HPATH = "instance-recreate-disks"
6615 HTYPE = constants.HTYPE_INSTANCE
6618 def CheckArguments(self):
6619 # normalise the disk list
6620 self.op.disks = sorted(frozenset(self.op.disks))
6622 def ExpandNames(self):
6623 self._ExpandAndLockInstance()
6624 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6626 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6627 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6629 self.needed_locks[locking.LEVEL_NODE] = []
6631 def DeclareLocks(self, level):
6632 if level == locking.LEVEL_NODE:
6633 # if we replace the nodes, we only need to lock the old primary,
6634 # otherwise we need to lock all nodes for disk re-creation
6635 primary_only = bool(self.op.nodes)
6636 self._LockInstancesNodes(primary_only=primary_only)
6637 elif level == locking.LEVEL_NODE_RES:
6639 self.needed_locks[locking.LEVEL_NODE_RES] = \
6640 self.needed_locks[locking.LEVEL_NODE][:]
6642 def BuildHooksEnv(self):
6645 This runs on master, primary and secondary nodes of the instance.
6648 return _BuildInstanceHookEnvByObject(self, self.instance)
6650 def BuildHooksNodes(self):
6651 """Build hooks nodes.
6654 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6657 def CheckPrereq(self):
6658 """Check prerequisites.
6660 This checks that the instance is in the cluster and is not running.
6663 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6664 assert instance is not None, \
6665 "Cannot retrieve locked instance %s" % self.op.instance_name
6667 if len(self.op.nodes) != len(instance.all_nodes):
6668 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6669 " %d replacement nodes were specified" %
6670 (instance.name, len(instance.all_nodes),
6671 len(self.op.nodes)),
6673 assert instance.disk_template != constants.DT_DRBD8 or \
6674 len(self.op.nodes) == 2
6675 assert instance.disk_template != constants.DT_PLAIN or \
6676 len(self.op.nodes) == 1
6677 primary_node = self.op.nodes[0]
6679 primary_node = instance.primary_node
6680 _CheckNodeOnline(self, primary_node)
6682 if instance.disk_template == constants.DT_DISKLESS:
6683 raise errors.OpPrereqError("Instance '%s' has no disks" %
6684 self.op.instance_name, errors.ECODE_INVAL)
6685 # if we replace nodes *and* the old primary is offline, we don't
6687 assert instance.primary_node in self.owned_locks(locking.LEVEL_NODE)
6688 assert instance.primary_node in self.owned_locks(locking.LEVEL_NODE_RES)
6689 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6690 if not (self.op.nodes and old_pnode.offline):
6691 _CheckInstanceState(self, instance, INSTANCE_NOT_RUNNING,
6692 msg="cannot recreate disks")
6694 if not self.op.disks:
6695 self.op.disks = range(len(instance.disks))
6697 for idx in self.op.disks:
6698 if idx >= len(instance.disks):
6699 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6701 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6702 raise errors.OpPrereqError("Can't recreate disks partially and"
6703 " change the nodes at the same time",
6705 self.instance = instance
6707 def Exec(self, feedback_fn):
6708 """Recreate the disks.
6711 instance = self.instance
6713 assert (self.owned_locks(locking.LEVEL_NODE) ==
6714 self.owned_locks(locking.LEVEL_NODE_RES))
6717 mods = [] # keeps track of needed logical_id changes
6719 for idx, disk in enumerate(instance.disks):
6720 if idx not in self.op.disks: # disk idx has not been passed in
6723 # update secondaries for disks, if needed
6725 if disk.dev_type == constants.LD_DRBD8:
6726 # need to update the nodes and minors
6727 assert len(self.op.nodes) == 2
6728 assert len(disk.logical_id) == 6 # otherwise disk internals
6730 (_, _, old_port, _, _, old_secret) = disk.logical_id
6731 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6732 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6733 new_minors[0], new_minors[1], old_secret)
6734 assert len(disk.logical_id) == len(new_id)
6735 mods.append((idx, new_id))
6737 # now that we have passed all asserts above, we can apply the mods
6738 # in a single run (to avoid partial changes)
6739 for idx, new_id in mods:
6740 instance.disks[idx].logical_id = new_id
6742 # change primary node, if needed
6744 instance.primary_node = self.op.nodes[0]
6745 self.LogWarning("Changing the instance's nodes, you will have to"
6746 " remove any disks left on the older nodes manually")
6749 self.cfg.Update(instance, feedback_fn)
6751 _CreateDisks(self, instance, to_skip=to_skip)
6754 class LUInstanceRename(LogicalUnit):
6755 """Rename an instance.
6758 HPATH = "instance-rename"
6759 HTYPE = constants.HTYPE_INSTANCE
6761 def CheckArguments(self):
6765 if self.op.ip_check and not self.op.name_check:
6766 # TODO: make the ip check more flexible and not depend on the name check
6767 raise errors.OpPrereqError("IP address check requires a name check",
6770 def BuildHooksEnv(self):
6773 This runs on master, primary and secondary nodes of the instance.
6776 env = _BuildInstanceHookEnvByObject(self, self.instance)
6777 env["INSTANCE_NEW_NAME"] = self.op.new_name
6780 def BuildHooksNodes(self):
6781 """Build hooks nodes.
6784 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6787 def CheckPrereq(self):
6788 """Check prerequisites.
6790 This checks that the instance is in the cluster and is not running.
6793 self.op.instance_name = _ExpandInstanceName(self.cfg,
6794 self.op.instance_name)
6795 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6796 assert instance is not None
6797 _CheckNodeOnline(self, instance.primary_node)
6798 _CheckInstanceState(self, instance, INSTANCE_NOT_RUNNING,
6799 msg="cannot rename")
6800 self.instance = instance
6802 new_name = self.op.new_name
6803 if self.op.name_check:
6804 hostname = netutils.GetHostname(name=new_name)
6805 if hostname.name != new_name:
6806 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6808 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6809 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6810 " same as given hostname '%s'") %
6811 (hostname.name, self.op.new_name),
6813 new_name = self.op.new_name = hostname.name
6814 if (self.op.ip_check and
6815 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6816 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6817 (hostname.ip, new_name),
6818 errors.ECODE_NOTUNIQUE)
6820 instance_list = self.cfg.GetInstanceList()
6821 if new_name in instance_list and new_name != instance.name:
6822 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6823 new_name, errors.ECODE_EXISTS)
6825 def Exec(self, feedback_fn):
6826 """Rename the instance.
6829 inst = self.instance
6830 old_name = inst.name
6832 rename_file_storage = False
6833 if (inst.disk_template in constants.DTS_FILEBASED and
6834 self.op.new_name != inst.name):
6835 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6836 rename_file_storage = True
6838 self.cfg.RenameInstance(inst.name, self.op.new_name)
6839 # Change the instance lock. This is definitely safe while we hold the BGL.
6840 # Otherwise the new lock would have to be added in acquired mode.
6842 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6843 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6845 # re-read the instance from the configuration after rename
6846 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6848 if rename_file_storage:
6849 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6850 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6851 old_file_storage_dir,
6852 new_file_storage_dir)
6853 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6854 " (but the instance has been renamed in Ganeti)" %
6855 (inst.primary_node, old_file_storage_dir,
6856 new_file_storage_dir))
6858 _StartInstanceDisks(self, inst, None)
6860 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6861 old_name, self.op.debug_level)
6862 msg = result.fail_msg
6864 msg = ("Could not run OS rename script for instance %s on node %s"
6865 " (but the instance has been renamed in Ganeti): %s" %
6866 (inst.name, inst.primary_node, msg))
6867 self.proc.LogWarning(msg)
6869 _ShutdownInstanceDisks(self, inst)
6874 class LUInstanceRemove(LogicalUnit):
6875 """Remove an instance.
6878 HPATH = "instance-remove"
6879 HTYPE = constants.HTYPE_INSTANCE
6882 def ExpandNames(self):
6883 self._ExpandAndLockInstance()
6884 self.needed_locks[locking.LEVEL_NODE] = []
6885 self.needed_locks[locking.LEVEL_NODE_RES] = []
6886 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6888 def DeclareLocks(self, level):
6889 if level == locking.LEVEL_NODE:
6890 self._LockInstancesNodes()
6891 elif level == locking.LEVEL_NODE_RES:
6893 self.needed_locks[locking.LEVEL_NODE_RES] = \
6894 self.needed_locks[locking.LEVEL_NODE][:]
6896 def BuildHooksEnv(self):
6899 This runs on master, primary and secondary nodes of the instance.
6902 env = _BuildInstanceHookEnvByObject(self, self.instance)
6903 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6906 def BuildHooksNodes(self):
6907 """Build hooks nodes.
6910 nl = [self.cfg.GetMasterNode()]
6911 nl_post = list(self.instance.all_nodes) + nl
6912 return (nl, nl_post)
6914 def CheckPrereq(self):
6915 """Check prerequisites.
6917 This checks that the instance is in the cluster.
6920 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6921 assert self.instance is not None, \
6922 "Cannot retrieve locked instance %s" % self.op.instance_name
6924 def Exec(self, feedback_fn):
6925 """Remove the instance.
6928 instance = self.instance
6929 logging.info("Shutting down instance %s on node %s",
6930 instance.name, instance.primary_node)
6932 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6933 self.op.shutdown_timeout)
6934 msg = result.fail_msg
6936 if self.op.ignore_failures:
6937 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6939 raise errors.OpExecError("Could not shutdown instance %s on"
6941 (instance.name, instance.primary_node, msg))
6943 assert (self.owned_locks(locking.LEVEL_NODE) ==
6944 self.owned_locks(locking.LEVEL_NODE_RES))
6945 assert not (set(instance.all_nodes) -
6946 self.owned_locks(locking.LEVEL_NODE)), \
6947 "Not owning correct locks"
6949 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6952 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6953 """Utility function to remove an instance.
6956 logging.info("Removing block devices for instance %s", instance.name)
6958 if not _RemoveDisks(lu, instance):
6959 if not ignore_failures:
6960 raise errors.OpExecError("Can't remove instance's disks")
6961 feedback_fn("Warning: can't remove instance's disks")
6963 logging.info("Removing instance %s out of cluster config", instance.name)
6965 lu.cfg.RemoveInstance(instance.name)
6967 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6968 "Instance lock removal conflict"
6970 # Remove lock for the instance
6971 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6974 class LUInstanceQuery(NoHooksLU):
6975 """Logical unit for querying instances.
6978 # pylint: disable=W0142
6981 def CheckArguments(self):
6982 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6983 self.op.output_fields, self.op.use_locking)
6985 def ExpandNames(self):
6986 self.iq.ExpandNames(self)
6988 def DeclareLocks(self, level):
6989 self.iq.DeclareLocks(self, level)
6991 def Exec(self, feedback_fn):
6992 return self.iq.OldStyleQuery(self)
6995 class LUInstanceFailover(LogicalUnit):
6996 """Failover an instance.
6999 HPATH = "instance-failover"
7000 HTYPE = constants.HTYPE_INSTANCE
7003 def CheckArguments(self):
7004 """Check the arguments.
7007 self.iallocator = getattr(self.op, "iallocator", None)
7008 self.target_node = getattr(self.op, "target_node", None)
7010 def ExpandNames(self):
7011 self._ExpandAndLockInstance()
7013 if self.op.target_node is not None:
7014 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
7016 self.needed_locks[locking.LEVEL_NODE] = []
7017 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7019 ignore_consistency = self.op.ignore_consistency
7020 shutdown_timeout = self.op.shutdown_timeout
7021 self._migrater = TLMigrateInstance(self, self.op.instance_name,
7024 ignore_consistency=ignore_consistency,
7025 shutdown_timeout=shutdown_timeout)
7026 self.tasklets = [self._migrater]
7028 def DeclareLocks(self, level):
7029 if level == locking.LEVEL_NODE:
7030 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
7031 if instance.disk_template in constants.DTS_EXT_MIRROR:
7032 if self.op.target_node is None:
7033 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7035 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
7036 self.op.target_node]
7037 del self.recalculate_locks[locking.LEVEL_NODE]
7039 self._LockInstancesNodes()
7041 def BuildHooksEnv(self):
7044 This runs on master, primary and secondary nodes of the instance.
7047 instance = self._migrater.instance
7048 source_node = instance.primary_node
7049 target_node = self.op.target_node
7051 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
7052 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
7053 "OLD_PRIMARY": source_node,
7054 "NEW_PRIMARY": target_node,
7057 if instance.disk_template in constants.DTS_INT_MIRROR:
7058 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
7059 env["NEW_SECONDARY"] = source_node
7061 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
7063 env.update(_BuildInstanceHookEnvByObject(self, instance))
7067 def BuildHooksNodes(self):
7068 """Build hooks nodes.
7071 instance = self._migrater.instance
7072 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
7073 return (nl, nl + [instance.primary_node])
7076 class LUInstanceMigrate(LogicalUnit):
7077 """Migrate an instance.
7079 This is migration without shutting down, compared to the failover,
7080 which is done with shutdown.
7083 HPATH = "instance-migrate"
7084 HTYPE = constants.HTYPE_INSTANCE
7087 def ExpandNames(self):
7088 self._ExpandAndLockInstance()
7090 if self.op.target_node is not None:
7091 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
7093 self.needed_locks[locking.LEVEL_NODE] = []
7094 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7096 self._migrater = TLMigrateInstance(self, self.op.instance_name,
7097 cleanup=self.op.cleanup,
7099 fallback=self.op.allow_failover)
7100 self.tasklets = [self._migrater]
7102 def DeclareLocks(self, level):
7103 if level == locking.LEVEL_NODE:
7104 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
7105 if instance.disk_template in constants.DTS_EXT_MIRROR:
7106 if self.op.target_node is None:
7107 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7109 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
7110 self.op.target_node]
7111 del self.recalculate_locks[locking.LEVEL_NODE]
7113 self._LockInstancesNodes()
7115 def BuildHooksEnv(self):
7118 This runs on master, primary and secondary nodes of the instance.
7121 instance = self._migrater.instance
7122 source_node = instance.primary_node
7123 target_node = self.op.target_node
7124 env = _BuildInstanceHookEnvByObject(self, instance)
7126 "MIGRATE_LIVE": self._migrater.live,
7127 "MIGRATE_CLEANUP": self.op.cleanup,
7128 "OLD_PRIMARY": source_node,
7129 "NEW_PRIMARY": target_node,
7132 if instance.disk_template in constants.DTS_INT_MIRROR:
7133 env["OLD_SECONDARY"] = target_node
7134 env["NEW_SECONDARY"] = source_node
7136 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
7140 def BuildHooksNodes(self):
7141 """Build hooks nodes.
7144 instance = self._migrater.instance
7145 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
7146 return (nl, nl + [instance.primary_node])
7149 class LUInstanceMove(LogicalUnit):
7150 """Move an instance by data-copying.
7153 HPATH = "instance-move"
7154 HTYPE = constants.HTYPE_INSTANCE
7157 def ExpandNames(self):
7158 self._ExpandAndLockInstance()
7159 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
7160 self.op.target_node = target_node
7161 self.needed_locks[locking.LEVEL_NODE] = [target_node]
7162 self.needed_locks[locking.LEVEL_NODE_RES] = []
7163 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
7165 def DeclareLocks(self, level):
7166 if level == locking.LEVEL_NODE:
7167 self._LockInstancesNodes(primary_only=True)
7168 elif level == locking.LEVEL_NODE_RES:
7170 self.needed_locks[locking.LEVEL_NODE_RES] = \
7171 self.needed_locks[locking.LEVEL_NODE][:]
7173 def BuildHooksEnv(self):
7176 This runs on master, primary and secondary nodes of the instance.
7180 "TARGET_NODE": self.op.target_node,
7181 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
7183 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
7186 def BuildHooksNodes(self):
7187 """Build hooks nodes.
7191 self.cfg.GetMasterNode(),
7192 self.instance.primary_node,
7193 self.op.target_node,
7197 def CheckPrereq(self):
7198 """Check prerequisites.
7200 This checks that the instance is in the cluster.
7203 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7204 assert self.instance is not None, \
7205 "Cannot retrieve locked instance %s" % self.op.instance_name
7207 node = self.cfg.GetNodeInfo(self.op.target_node)
7208 assert node is not None, \
7209 "Cannot retrieve locked node %s" % self.op.target_node
7211 self.target_node = target_node = node.name
7213 if target_node == instance.primary_node:
7214 raise errors.OpPrereqError("Instance %s is already on the node %s" %
7215 (instance.name, target_node),
7218 bep = self.cfg.GetClusterInfo().FillBE(instance)
7220 for idx, dsk in enumerate(instance.disks):
7221 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
7222 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
7223 " cannot copy" % idx, errors.ECODE_STATE)
7225 _CheckNodeOnline(self, target_node)
7226 _CheckNodeNotDrained(self, target_node)
7227 _CheckNodeVmCapable(self, target_node)
7229 if instance.admin_state == constants.ADMINST_UP:
7230 # check memory requirements on the secondary node
7231 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
7232 instance.name, bep[constants.BE_MAXMEM],
7233 instance.hypervisor)
7235 self.LogInfo("Not checking memory on the secondary node as"
7236 " instance will not be started")
7238 # check bridge existance
7239 _CheckInstanceBridgesExist(self, instance, node=target_node)
7241 def Exec(self, feedback_fn):
7242 """Move an instance.
7244 The move is done by shutting it down on its present node, copying
7245 the data over (slow) and starting it on the new node.
7248 instance = self.instance
7250 source_node = instance.primary_node
7251 target_node = self.target_node
7253 self.LogInfo("Shutting down instance %s on source node %s",
7254 instance.name, source_node)
7256 assert (self.owned_locks(locking.LEVEL_NODE) ==
7257 self.owned_locks(locking.LEVEL_NODE_RES))
7259 result = self.rpc.call_instance_shutdown(source_node, instance,
7260 self.op.shutdown_timeout)
7261 msg = result.fail_msg
7263 if self.op.ignore_consistency:
7264 self.proc.LogWarning("Could not shutdown instance %s on node %s."
7265 " Proceeding anyway. Please make sure node"
7266 " %s is down. Error details: %s",
7267 instance.name, source_node, source_node, msg)
7269 raise errors.OpExecError("Could not shutdown instance %s on"
7271 (instance.name, source_node, msg))
7273 # create the target disks
7275 _CreateDisks(self, instance, target_node=target_node)
7276 except errors.OpExecError:
7277 self.LogWarning("Device creation failed, reverting...")
7279 _RemoveDisks(self, instance, target_node=target_node)
7281 self.cfg.ReleaseDRBDMinors(instance.name)
7284 cluster_name = self.cfg.GetClusterInfo().cluster_name
7287 # activate, get path, copy the data over
7288 for idx, disk in enumerate(instance.disks):
7289 self.LogInfo("Copying data for disk %d", idx)
7290 result = self.rpc.call_blockdev_assemble(target_node, disk,
7291 instance.name, True, idx)
7293 self.LogWarning("Can't assemble newly created disk %d: %s",
7294 idx, result.fail_msg)
7295 errs.append(result.fail_msg)
7297 dev_path = result.payload
7298 result = self.rpc.call_blockdev_export(source_node, disk,
7299 target_node, dev_path,
7302 self.LogWarning("Can't copy data over for disk %d: %s",
7303 idx, result.fail_msg)
7304 errs.append(result.fail_msg)
7308 self.LogWarning("Some disks failed to copy, aborting")
7310 _RemoveDisks(self, instance, target_node=target_node)
7312 self.cfg.ReleaseDRBDMinors(instance.name)
7313 raise errors.OpExecError("Errors during disk copy: %s" %
7316 instance.primary_node = target_node
7317 self.cfg.Update(instance, feedback_fn)
7319 self.LogInfo("Removing the disks on the original node")
7320 _RemoveDisks(self, instance, target_node=source_node)
7322 # Only start the instance if it's marked as up
7323 if instance.admin_state == constants.ADMINST_UP:
7324 self.LogInfo("Starting instance %s on node %s",
7325 instance.name, target_node)
7327 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7328 ignore_secondaries=True)
7330 _ShutdownInstanceDisks(self, instance)
7331 raise errors.OpExecError("Can't activate the instance's disks")
7333 result = self.rpc.call_instance_start(target_node,
7334 (instance, None, None), False)
7335 msg = result.fail_msg
7337 _ShutdownInstanceDisks(self, instance)
7338 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7339 (instance.name, target_node, msg))
7342 class LUNodeMigrate(LogicalUnit):
7343 """Migrate all instances from a node.
7346 HPATH = "node-migrate"
7347 HTYPE = constants.HTYPE_NODE
7350 def CheckArguments(self):
7353 def ExpandNames(self):
7354 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7356 self.share_locks = _ShareAll()
7357 self.needed_locks = {
7358 locking.LEVEL_NODE: [self.op.node_name],
7361 def BuildHooksEnv(self):
7364 This runs on the master, the primary and all the secondaries.
7368 "NODE_NAME": self.op.node_name,
7371 def BuildHooksNodes(self):
7372 """Build hooks nodes.
7375 nl = [self.cfg.GetMasterNode()]
7378 def CheckPrereq(self):
7381 def Exec(self, feedback_fn):
7382 # Prepare jobs for migration instances
7384 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7387 iallocator=self.op.iallocator,
7388 target_node=self.op.target_node)]
7389 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7392 # TODO: Run iallocator in this opcode and pass correct placement options to
7393 # OpInstanceMigrate. Since other jobs can modify the cluster between
7394 # running the iallocator and the actual migration, a good consistency model
7395 # will have to be found.
7397 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7398 frozenset([self.op.node_name]))
7400 return ResultWithJobs(jobs)
7403 class TLMigrateInstance(Tasklet):
7404 """Tasklet class for instance migration.
7407 @ivar live: whether the migration will be done live or non-live;
7408 this variable is initalized only after CheckPrereq has run
7409 @type cleanup: boolean
7410 @ivar cleanup: Wheater we cleanup from a failed migration
7411 @type iallocator: string
7412 @ivar iallocator: The iallocator used to determine target_node
7413 @type target_node: string
7414 @ivar target_node: If given, the target_node to reallocate the instance to
7415 @type failover: boolean
7416 @ivar failover: Whether operation results in failover or migration
7417 @type fallback: boolean
7418 @ivar fallback: Whether fallback to failover is allowed if migration not
7420 @type ignore_consistency: boolean
7421 @ivar ignore_consistency: Wheter we should ignore consistency between source
7423 @type shutdown_timeout: int
7424 @ivar shutdown_timeout: In case of failover timeout of the shutdown
7429 _MIGRATION_POLL_INTERVAL = 1 # seconds
7430 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7432 def __init__(self, lu, instance_name, cleanup=False,
7433 failover=False, fallback=False,
7434 ignore_consistency=False,
7435 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7436 """Initializes this class.
7439 Tasklet.__init__(self, lu)
7442 self.instance_name = instance_name
7443 self.cleanup = cleanup
7444 self.live = False # will be overridden later
7445 self.failover = failover
7446 self.fallback = fallback
7447 self.ignore_consistency = ignore_consistency
7448 self.shutdown_timeout = shutdown_timeout
7450 def CheckPrereq(self):
7451 """Check prerequisites.
7453 This checks that the instance is in the cluster.
7456 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7457 instance = self.cfg.GetInstanceInfo(instance_name)
7458 assert instance is not None
7459 self.instance = instance
7461 if (not self.cleanup and
7462 not instance.admin_state == constants.ADMINST_UP and
7463 not self.failover and self.fallback):
7464 self.lu.LogInfo("Instance is marked down or offline, fallback allowed,"
7465 " switching to failover")
7466 self.failover = True
7468 if instance.disk_template not in constants.DTS_MIRRORED:
7473 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7474 " %s" % (instance.disk_template, text),
7477 if instance.disk_template in constants.DTS_EXT_MIRROR:
7478 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7480 if self.lu.op.iallocator:
7481 self._RunAllocator()
7483 # We set set self.target_node as it is required by
7485 self.target_node = self.lu.op.target_node
7487 # self.target_node is already populated, either directly or by the
7489 target_node = self.target_node
7490 if self.target_node == instance.primary_node:
7491 raise errors.OpPrereqError("Cannot migrate instance %s"
7492 " to its primary (%s)" %
7493 (instance.name, instance.primary_node))
7495 if len(self.lu.tasklets) == 1:
7496 # It is safe to release locks only when we're the only tasklet
7498 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7499 keep=[instance.primary_node, self.target_node])
7502 secondary_nodes = instance.secondary_nodes
7503 if not secondary_nodes:
7504 raise errors.ConfigurationError("No secondary node but using"
7505 " %s disk template" %
7506 instance.disk_template)
7507 target_node = secondary_nodes[0]
7508 if self.lu.op.iallocator or (self.lu.op.target_node and
7509 self.lu.op.target_node != target_node):
7511 text = "failed over"
7514 raise errors.OpPrereqError("Instances with disk template %s cannot"
7515 " be %s to arbitrary nodes"
7516 " (neither an iallocator nor a target"
7517 " node can be passed)" %
7518 (instance.disk_template, text),
7521 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7523 # check memory requirements on the secondary node
7524 if not self.failover or instance.admin_state == constants.ADMINST_UP:
7525 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7526 instance.name, i_be[constants.BE_MAXMEM],
7527 instance.hypervisor)
7529 self.lu.LogInfo("Not checking memory on the secondary node as"
7530 " instance will not be started")
7532 # check bridge existance
7533 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7535 if not self.cleanup:
7536 _CheckNodeNotDrained(self.lu, target_node)
7537 if not self.failover:
7538 result = self.rpc.call_instance_migratable(instance.primary_node,
7540 if result.fail_msg and self.fallback:
7541 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7543 self.failover = True
7545 result.Raise("Can't migrate, please use failover",
7546 prereq=True, ecode=errors.ECODE_STATE)
7548 assert not (self.failover and self.cleanup)
7550 if not self.failover:
7551 if self.lu.op.live is not None and self.lu.op.mode is not None:
7552 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7553 " parameters are accepted",
7555 if self.lu.op.live is not None:
7557 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7559 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7560 # reset the 'live' parameter to None so that repeated
7561 # invocations of CheckPrereq do not raise an exception
7562 self.lu.op.live = None
7563 elif self.lu.op.mode is None:
7564 # read the default value from the hypervisor
7565 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7567 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7569 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7571 # Failover is never live
7574 def _RunAllocator(self):
7575 """Run the allocator based on input opcode.
7578 ial = IAllocator(self.cfg, self.rpc,
7579 mode=constants.IALLOCATOR_MODE_RELOC,
7580 name=self.instance_name,
7581 # TODO See why hail breaks with a single node below
7582 relocate_from=[self.instance.primary_node,
7583 self.instance.primary_node],
7586 ial.Run(self.lu.op.iallocator)
7589 raise errors.OpPrereqError("Can't compute nodes using"
7590 " iallocator '%s': %s" %
7591 (self.lu.op.iallocator, ial.info),
7593 if len(ial.result) != ial.required_nodes:
7594 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7595 " of nodes (%s), required %s" %
7596 (self.lu.op.iallocator, len(ial.result),
7597 ial.required_nodes), errors.ECODE_FAULT)
7598 self.target_node = ial.result[0]
7599 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7600 self.instance_name, self.lu.op.iallocator,
7601 utils.CommaJoin(ial.result))
7603 def _WaitUntilSync(self):
7604 """Poll with custom rpc for disk sync.
7606 This uses our own step-based rpc call.
7609 self.feedback_fn("* wait until resync is done")
7613 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7615 self.instance.disks)
7617 for node, nres in result.items():
7618 nres.Raise("Cannot resync disks on node %s" % node)
7619 node_done, node_percent = nres.payload
7620 all_done = all_done and node_done
7621 if node_percent is not None:
7622 min_percent = min(min_percent, node_percent)
7624 if min_percent < 100:
7625 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7628 def _EnsureSecondary(self, node):
7629 """Demote a node to secondary.
7632 self.feedback_fn("* switching node %s to secondary mode" % node)
7634 for dev in self.instance.disks:
7635 self.cfg.SetDiskID(dev, node)
7637 result = self.rpc.call_blockdev_close(node, self.instance.name,
7638 self.instance.disks)
7639 result.Raise("Cannot change disk to secondary on node %s" % node)
7641 def _GoStandalone(self):
7642 """Disconnect from the network.
7645 self.feedback_fn("* changing into standalone mode")
7646 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7647 self.instance.disks)
7648 for node, nres in result.items():
7649 nres.Raise("Cannot disconnect disks node %s" % node)
7651 def _GoReconnect(self, multimaster):
7652 """Reconnect to the network.
7658 msg = "single-master"
7659 self.feedback_fn("* changing disks into %s mode" % msg)
7660 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7661 self.instance.disks,
7662 self.instance.name, multimaster)
7663 for node, nres in result.items():
7664 nres.Raise("Cannot change disks config on node %s" % node)
7666 def _ExecCleanup(self):
7667 """Try to cleanup after a failed migration.
7669 The cleanup is done by:
7670 - check that the instance is running only on one node
7671 (and update the config if needed)
7672 - change disks on its secondary node to secondary
7673 - wait until disks are fully synchronized
7674 - disconnect from the network
7675 - change disks into single-master mode
7676 - wait again until disks are fully synchronized
7679 instance = self.instance
7680 target_node = self.target_node
7681 source_node = self.source_node
7683 # check running on only one node
7684 self.feedback_fn("* checking where the instance actually runs"
7685 " (if this hangs, the hypervisor might be in"
7687 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7688 for node, result in ins_l.items():
7689 result.Raise("Can't contact node %s" % node)
7691 runningon_source = instance.name in ins_l[source_node].payload
7692 runningon_target = instance.name in ins_l[target_node].payload
7694 if runningon_source and runningon_target:
7695 raise errors.OpExecError("Instance seems to be running on two nodes,"
7696 " or the hypervisor is confused; you will have"
7697 " to ensure manually that it runs only on one"
7698 " and restart this operation")
7700 if not (runningon_source or runningon_target):
7701 raise errors.OpExecError("Instance does not seem to be running at all;"
7702 " in this case it's safer to repair by"
7703 " running 'gnt-instance stop' to ensure disk"
7704 " shutdown, and then restarting it")
7706 if runningon_target:
7707 # the migration has actually succeeded, we need to update the config
7708 self.feedback_fn("* instance running on secondary node (%s),"
7709 " updating config" % target_node)
7710 instance.primary_node = target_node
7711 self.cfg.Update(instance, self.feedback_fn)
7712 demoted_node = source_node
7714 self.feedback_fn("* instance confirmed to be running on its"
7715 " primary node (%s)" % source_node)
7716 demoted_node = target_node
7718 if instance.disk_template in constants.DTS_INT_MIRROR:
7719 self._EnsureSecondary(demoted_node)
7721 self._WaitUntilSync()
7722 except errors.OpExecError:
7723 # we ignore here errors, since if the device is standalone, it
7724 # won't be able to sync
7726 self._GoStandalone()
7727 self._GoReconnect(False)
7728 self._WaitUntilSync()
7730 self.feedback_fn("* done")
7732 def _RevertDiskStatus(self):
7733 """Try to revert the disk status after a failed migration.
7736 target_node = self.target_node
7737 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7741 self._EnsureSecondary(target_node)
7742 self._GoStandalone()
7743 self._GoReconnect(False)
7744 self._WaitUntilSync()
7745 except errors.OpExecError, err:
7746 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7747 " please try to recover the instance manually;"
7748 " error '%s'" % str(err))
7750 def _AbortMigration(self):
7751 """Call the hypervisor code to abort a started migration.
7754 instance = self.instance
7755 target_node = self.target_node
7756 source_node = self.source_node
7757 migration_info = self.migration_info
7759 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
7763 abort_msg = abort_result.fail_msg
7765 logging.error("Aborting migration failed on target node %s: %s",
7766 target_node, abort_msg)
7767 # Don't raise an exception here, as we stil have to try to revert the
7768 # disk status, even if this step failed.
7770 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
7771 instance, False, self.live)
7772 abort_msg = abort_result.fail_msg
7774 logging.error("Aborting migration failed on source node %s: %s",
7775 source_node, abort_msg)
7777 def _ExecMigration(self):
7778 """Migrate an instance.
7780 The migrate is done by:
7781 - change the disks into dual-master mode
7782 - wait until disks are fully synchronized again
7783 - migrate the instance
7784 - change disks on the new secondary node (the old primary) to secondary
7785 - wait until disks are fully synchronized
7786 - change disks into single-master mode
7789 instance = self.instance
7790 target_node = self.target_node
7791 source_node = self.source_node
7793 # Check for hypervisor version mismatch and warn the user.
7794 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7795 None, [self.instance.hypervisor])
7796 for ninfo in nodeinfo.values():
7797 ninfo.Raise("Unable to retrieve node information from node '%s'" %
7799 (_, _, (src_info, )) = nodeinfo[source_node].payload
7800 (_, _, (dst_info, )) = nodeinfo[target_node].payload
7802 if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
7803 (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
7804 src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
7805 dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
7806 if src_version != dst_version:
7807 self.feedback_fn("* warning: hypervisor version mismatch between"
7808 " source (%s) and target (%s) node" %
7809 (src_version, dst_version))
7811 self.feedback_fn("* checking disk consistency between source and target")
7812 for dev in instance.disks:
7813 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7814 raise errors.OpExecError("Disk %s is degraded or not fully"
7815 " synchronized on target node,"
7816 " aborting migration" % dev.iv_name)
7818 # First get the migration information from the remote node
7819 result = self.rpc.call_migration_info(source_node, instance)
7820 msg = result.fail_msg
7822 log_err = ("Failed fetching source migration information from %s: %s" %
7824 logging.error(log_err)
7825 raise errors.OpExecError(log_err)
7827 self.migration_info = migration_info = result.payload
7829 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7830 # Then switch the disks to master/master mode
7831 self._EnsureSecondary(target_node)
7832 self._GoStandalone()
7833 self._GoReconnect(True)
7834 self._WaitUntilSync()
7836 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7837 result = self.rpc.call_accept_instance(target_node,
7840 self.nodes_ip[target_node])
7842 msg = result.fail_msg
7844 logging.error("Instance pre-migration failed, trying to revert"
7845 " disk status: %s", msg)
7846 self.feedback_fn("Pre-migration failed, aborting")
7847 self._AbortMigration()
7848 self._RevertDiskStatus()
7849 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7850 (instance.name, msg))
7852 self.feedback_fn("* migrating instance to %s" % target_node)
7853 result = self.rpc.call_instance_migrate(source_node, instance,
7854 self.nodes_ip[target_node],
7856 msg = result.fail_msg
7858 logging.error("Instance migration failed, trying to revert"
7859 " disk status: %s", msg)
7860 self.feedback_fn("Migration failed, aborting")
7861 self._AbortMigration()
7862 self._RevertDiskStatus()
7863 raise errors.OpExecError("Could not migrate instance %s: %s" %
7864 (instance.name, msg))
7866 self.feedback_fn("* starting memory transfer")
7867 last_feedback = time.time()
7869 result = self.rpc.call_instance_get_migration_status(source_node,
7871 msg = result.fail_msg
7872 ms = result.payload # MigrationStatus instance
7873 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
7874 logging.error("Instance migration failed, trying to revert"
7875 " disk status: %s", msg)
7876 self.feedback_fn("Migration failed, aborting")
7877 self._AbortMigration()
7878 self._RevertDiskStatus()
7879 raise errors.OpExecError("Could not migrate instance %s: %s" %
7880 (instance.name, msg))
7882 if result.payload.status != constants.HV_MIGRATION_ACTIVE:
7883 self.feedback_fn("* memory transfer complete")
7886 if (utils.TimeoutExpired(last_feedback,
7887 self._MIGRATION_FEEDBACK_INTERVAL) and
7888 ms.transferred_ram is not None):
7889 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
7890 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
7891 last_feedback = time.time()
7893 time.sleep(self._MIGRATION_POLL_INTERVAL)
7895 result = self.rpc.call_instance_finalize_migration_src(source_node,
7899 msg = result.fail_msg
7901 logging.error("Instance migration succeeded, but finalization failed"
7902 " on the source node: %s", msg)
7903 raise errors.OpExecError("Could not finalize instance migration: %s" %
7906 instance.primary_node = target_node
7908 # distribute new instance config to the other nodes
7909 self.cfg.Update(instance, self.feedback_fn)
7911 result = self.rpc.call_instance_finalize_migration_dst(target_node,
7915 msg = result.fail_msg
7917 logging.error("Instance migration succeeded, but finalization failed"
7918 " on the target node: %s", msg)
7919 raise errors.OpExecError("Could not finalize instance migration: %s" %
7922 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7923 self._EnsureSecondary(source_node)
7924 self._WaitUntilSync()
7925 self._GoStandalone()
7926 self._GoReconnect(False)
7927 self._WaitUntilSync()
7929 self.feedback_fn("* done")
7931 def _ExecFailover(self):
7932 """Failover an instance.
7934 The failover is done by shutting it down on its present node and
7935 starting it on the secondary.
7938 instance = self.instance
7939 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7941 source_node = instance.primary_node
7942 target_node = self.target_node
7944 if instance.admin_state == constants.ADMINST_UP:
7945 self.feedback_fn("* checking disk consistency between source and target")
7946 for dev in instance.disks:
7947 # for drbd, these are drbd over lvm
7948 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7949 if primary_node.offline:
7950 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7952 (primary_node.name, dev.iv_name, target_node))
7953 elif not self.ignore_consistency:
7954 raise errors.OpExecError("Disk %s is degraded on target node,"
7955 " aborting failover" % dev.iv_name)
7957 self.feedback_fn("* not checking disk consistency as instance is not"
7960 self.feedback_fn("* shutting down instance on source node")
7961 logging.info("Shutting down instance %s on node %s",
7962 instance.name, source_node)
7964 result = self.rpc.call_instance_shutdown(source_node, instance,
7965 self.shutdown_timeout)
7966 msg = result.fail_msg
7968 if self.ignore_consistency or primary_node.offline:
7969 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7970 " proceeding anyway; please make sure node"
7971 " %s is down; error details: %s",
7972 instance.name, source_node, source_node, msg)
7974 raise errors.OpExecError("Could not shutdown instance %s on"
7976 (instance.name, source_node, msg))
7978 self.feedback_fn("* deactivating the instance's disks on source node")
7979 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7980 raise errors.OpExecError("Can't shut down the instance's disks")
7982 instance.primary_node = target_node
7983 # distribute new instance config to the other nodes
7984 self.cfg.Update(instance, self.feedback_fn)
7986 # Only start the instance if it's marked as up
7987 if instance.admin_state == constants.ADMINST_UP:
7988 self.feedback_fn("* activating the instance's disks on target node %s" %
7990 logging.info("Starting instance %s on node %s",
7991 instance.name, target_node)
7993 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7994 ignore_secondaries=True)
7996 _ShutdownInstanceDisks(self.lu, instance)
7997 raise errors.OpExecError("Can't activate the instance's disks")
7999 self.feedback_fn("* starting the instance on the target node %s" %
8001 result = self.rpc.call_instance_start(target_node, (instance, None, None),
8003 msg = result.fail_msg
8005 _ShutdownInstanceDisks(self.lu, instance)
8006 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
8007 (instance.name, target_node, msg))
8009 def Exec(self, feedback_fn):
8010 """Perform the migration.
8013 self.feedback_fn = feedback_fn
8014 self.source_node = self.instance.primary_node
8016 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
8017 if self.instance.disk_template in constants.DTS_INT_MIRROR:
8018 self.target_node = self.instance.secondary_nodes[0]
8019 # Otherwise self.target_node has been populated either
8020 # directly, or through an iallocator.
8022 self.all_nodes = [self.source_node, self.target_node]
8023 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
8024 in self.cfg.GetMultiNodeInfo(self.all_nodes))
8027 feedback_fn("Failover instance %s" % self.instance.name)
8028 self._ExecFailover()
8030 feedback_fn("Migrating instance %s" % self.instance.name)
8033 return self._ExecCleanup()
8035 return self._ExecMigration()
8038 def _CreateBlockDev(lu, node, instance, device, force_create,
8040 """Create a tree of block devices on a given node.
8042 If this device type has to be created on secondaries, create it and
8045 If not, just recurse to children keeping the same 'force' value.
8047 @param lu: the lu on whose behalf we execute
8048 @param node: the node on which to create the device
8049 @type instance: L{objects.Instance}
8050 @param instance: the instance which owns the device
8051 @type device: L{objects.Disk}
8052 @param device: the device to create
8053 @type force_create: boolean
8054 @param force_create: whether to force creation of this device; this
8055 will be change to True whenever we find a device which has
8056 CreateOnSecondary() attribute
8057 @param info: the extra 'metadata' we should attach to the device
8058 (this will be represented as a LVM tag)
8059 @type force_open: boolean
8060 @param force_open: this parameter will be passes to the
8061 L{backend.BlockdevCreate} function where it specifies
8062 whether we run on primary or not, and it affects both
8063 the child assembly and the device own Open() execution
8066 if device.CreateOnSecondary():
8070 for child in device.children:
8071 _CreateBlockDev(lu, node, instance, child, force_create,
8074 if not force_create:
8077 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
8080 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
8081 """Create a single block device on a given node.
8083 This will not recurse over children of the device, so they must be
8086 @param lu: the lu on whose behalf we execute
8087 @param node: the node on which to create the device
8088 @type instance: L{objects.Instance}
8089 @param instance: the instance which owns the device
8090 @type device: L{objects.Disk}
8091 @param device: the device to create
8092 @param info: the extra 'metadata' we should attach to the device
8093 (this will be represented as a LVM tag)
8094 @type force_open: boolean
8095 @param force_open: this parameter will be passes to the
8096 L{backend.BlockdevCreate} function where it specifies
8097 whether we run on primary or not, and it affects both
8098 the child assembly and the device own Open() execution
8101 lu.cfg.SetDiskID(device, node)
8102 result = lu.rpc.call_blockdev_create(node, device, device.size,
8103 instance.name, force_open, info)
8104 result.Raise("Can't create block device %s on"
8105 " node %s for instance %s" % (device, node, instance.name))
8106 if device.physical_id is None:
8107 device.physical_id = result.payload
8110 def _GenerateUniqueNames(lu, exts):
8111 """Generate a suitable LV name.
8113 This will generate a logical volume name for the given instance.
8118 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
8119 results.append("%s%s" % (new_id, val))
8123 def _ComputeLDParams(disk_template, disk_params):
8124 """Computes Logical Disk parameters from Disk Template parameters.
8126 @type disk_template: string
8127 @param disk_template: disk template, one of L{constants.DISK_TEMPLATES}
8128 @type disk_params: dict
8129 @param disk_params: disk template parameters; dict(template_name -> parameters
8131 @return: a list of dicts, one for each node of the disk hierarchy. Each dict
8132 contains the LD parameters of the node. The tree is flattened in-order.
8135 if disk_template not in constants.DISK_TEMPLATES:
8136 raise errors.ProgrammerError("Unknown disk template %s" % disk_template)
8139 dt_params = disk_params[disk_template]
8140 if disk_template == constants.DT_DRBD8:
8142 constants.RESYNC_RATE: dt_params[constants.DRBD_RESYNC_RATE],
8143 constants.BARRIERS: dt_params[constants.DRBD_DISK_BARRIERS],
8144 constants.NO_META_FLUSH: dt_params[constants.DRBD_META_BARRIERS],
8148 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_DRBD8],
8151 result.append(drbd_params)
8155 constants.STRIPES: dt_params[constants.DRBD_DATA_STRIPES],
8158 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_LV],
8160 result.append(data_params)
8164 constants.STRIPES: dt_params[constants.DRBD_META_STRIPES],
8167 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_LV],
8169 result.append(meta_params)
8171 elif (disk_template == constants.DT_FILE or
8172 disk_template == constants.DT_SHARED_FILE):
8173 result.append(constants.DISK_LD_DEFAULTS[constants.LD_FILE])
8175 elif disk_template == constants.DT_PLAIN:
8177 constants.STRIPES: dt_params[constants.LV_STRIPES],
8180 objects.FillDict(constants.DISK_LD_DEFAULTS[constants.LD_LV],
8182 result.append(params)
8184 elif disk_template == constants.DT_BLOCK:
8185 result.append(constants.DISK_LD_DEFAULTS[constants.LD_BLOCKDEV])
8190 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
8191 iv_name, p_minor, s_minor, drbd_params, data_params,
8193 """Generate a drbd8 device complete with its children.
8196 assert len(vgnames) == len(names) == 2
8197 port = lu.cfg.AllocatePort()
8198 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
8200 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
8201 logical_id=(vgnames[0], names[0]),
8203 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
8204 logical_id=(vgnames[1], names[1]),
8206 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
8207 logical_id=(primary, secondary, port,
8210 children=[dev_data, dev_meta],
8211 iv_name=iv_name, params=drbd_params)
8215 def _GenerateDiskTemplate(lu, template_name,
8216 instance_name, primary_node,
8217 secondary_nodes, disk_info,
8218 file_storage_dir, file_driver,
8219 base_index, feedback_fn, disk_params):
8220 """Generate the entire disk layout for a given template type.
8223 #TODO: compute space requirements
8225 vgname = lu.cfg.GetVGName()
8226 disk_count = len(disk_info)
8228 ld_params = _ComputeLDParams(template_name, disk_params)
8229 if template_name == constants.DT_DISKLESS:
8231 elif template_name == constants.DT_PLAIN:
8232 if len(secondary_nodes) != 0:
8233 raise errors.ProgrammerError("Wrong template configuration")
8235 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
8236 for i in range(disk_count)])
8237 for idx, disk in enumerate(disk_info):
8238 disk_index = idx + base_index
8239 vg = disk.get(constants.IDISK_VG, vgname)
8240 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
8241 disk_dev = objects.Disk(dev_type=constants.LD_LV,
8242 size=disk[constants.IDISK_SIZE],
8243 logical_id=(vg, names[idx]),
8244 iv_name="disk/%d" % disk_index,
8245 mode=disk[constants.IDISK_MODE],
8246 params=ld_params[0])
8247 disks.append(disk_dev)
8248 elif template_name == constants.DT_DRBD8:
8249 drbd_params, data_params, meta_params = ld_params
8250 if len(secondary_nodes) != 1:
8251 raise errors.ProgrammerError("Wrong template configuration")
8252 remote_node = secondary_nodes[0]
8253 minors = lu.cfg.AllocateDRBDMinor(
8254 [primary_node, remote_node] * len(disk_info), instance_name)
8257 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
8258 for i in range(disk_count)]):
8259 names.append(lv_prefix + "_data")
8260 names.append(lv_prefix + "_meta")
8261 for idx, disk in enumerate(disk_info):
8262 disk_index = idx + base_index
8263 data_vg = disk.get(constants.IDISK_VG, vgname)
8264 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
8265 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
8266 disk[constants.IDISK_SIZE],
8268 names[idx * 2:idx * 2 + 2],
8269 "disk/%d" % disk_index,
8270 minors[idx * 2], minors[idx * 2 + 1],
8271 drbd_params, data_params, meta_params)
8272 disk_dev.mode = disk[constants.IDISK_MODE]
8273 disks.append(disk_dev)
8274 elif template_name == constants.DT_FILE:
8275 if len(secondary_nodes) != 0:
8276 raise errors.ProgrammerError("Wrong template configuration")
8278 opcodes.RequireFileStorage()
8280 for idx, disk in enumerate(disk_info):
8281 disk_index = idx + base_index
8282 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
8283 size=disk[constants.IDISK_SIZE],
8284 iv_name="disk/%d" % disk_index,
8285 logical_id=(file_driver,
8286 "%s/disk%d" % (file_storage_dir,
8288 mode=disk[constants.IDISK_MODE],
8289 params=ld_params[0])
8290 disks.append(disk_dev)
8291 elif template_name == constants.DT_SHARED_FILE:
8292 if len(secondary_nodes) != 0:
8293 raise errors.ProgrammerError("Wrong template configuration")
8295 opcodes.RequireSharedFileStorage()
8297 for idx, disk in enumerate(disk_info):
8298 disk_index = idx + base_index
8299 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
8300 size=disk[constants.IDISK_SIZE],
8301 iv_name="disk/%d" % disk_index,
8302 logical_id=(file_driver,
8303 "%s/disk%d" % (file_storage_dir,
8305 mode=disk[constants.IDISK_MODE],
8306 params=ld_params[0])
8307 disks.append(disk_dev)
8308 elif template_name == constants.DT_BLOCK:
8309 if len(secondary_nodes) != 0:
8310 raise errors.ProgrammerError("Wrong template configuration")
8312 for idx, disk in enumerate(disk_info):
8313 disk_index = idx + base_index
8314 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
8315 size=disk[constants.IDISK_SIZE],
8316 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
8317 disk[constants.IDISK_ADOPT]),
8318 iv_name="disk/%d" % disk_index,
8319 mode=disk[constants.IDISK_MODE],
8320 params=ld_params[0])
8321 disks.append(disk_dev)
8324 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
8328 def _GetInstanceInfoText(instance):
8329 """Compute that text that should be added to the disk's metadata.
8332 return "originstname+%s" % instance.name
8335 def _CalcEta(time_taken, written, total_size):
8336 """Calculates the ETA based on size written and total size.
8338 @param time_taken: The time taken so far
8339 @param written: amount written so far
8340 @param total_size: The total size of data to be written
8341 @return: The remaining time in seconds
8344 avg_time = time_taken / float(written)
8345 return (total_size - written) * avg_time
8348 def _WipeDisks(lu, instance):
8349 """Wipes instance disks.
8351 @type lu: L{LogicalUnit}
8352 @param lu: the logical unit on whose behalf we execute
8353 @type instance: L{objects.Instance}
8354 @param instance: the instance whose disks we should create
8355 @return: the success of the wipe
8358 node = instance.primary_node
8360 for device in instance.disks:
8361 lu.cfg.SetDiskID(device, node)
8363 logging.info("Pause sync of instance %s disks", instance.name)
8364 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
8366 for idx, success in enumerate(result.payload):
8368 logging.warn("pause-sync of instance %s for disks %d failed",
8372 for idx, device in enumerate(instance.disks):
8373 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
8374 # MAX_WIPE_CHUNK at max
8375 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
8376 constants.MIN_WIPE_CHUNK_PERCENT)
8377 # we _must_ make this an int, otherwise rounding errors will
8379 wipe_chunk_size = int(wipe_chunk_size)
8381 lu.LogInfo("* Wiping disk %d", idx)
8382 logging.info("Wiping disk %d for instance %s, node %s using"
8383 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
8388 start_time = time.time()
8390 while offset < size:
8391 wipe_size = min(wipe_chunk_size, size - offset)
8392 logging.debug("Wiping disk %d, offset %s, chunk %s",
8393 idx, offset, wipe_size)
8394 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
8395 result.Raise("Could not wipe disk %d at offset %d for size %d" %
8396 (idx, offset, wipe_size))
8399 if now - last_output >= 60:
8400 eta = _CalcEta(now - start_time, offset, size)
8401 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8402 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8405 logging.info("Resume sync of instance %s disks", instance.name)
8407 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8409 for idx, success in enumerate(result.payload):
8411 lu.LogWarning("Resume sync of disk %d failed, please have a"
8412 " look at the status and troubleshoot the issue", idx)
8413 logging.warn("resume-sync of instance %s for disks %d failed",
8417 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8418 """Create all disks for an instance.
8420 This abstracts away some work from AddInstance.
8422 @type lu: L{LogicalUnit}
8423 @param lu: the logical unit on whose behalf we execute
8424 @type instance: L{objects.Instance}
8425 @param instance: the instance whose disks we should create
8427 @param to_skip: list of indices to skip
8428 @type target_node: string
8429 @param target_node: if passed, overrides the target node for creation
8431 @return: the success of the creation
8434 info = _GetInstanceInfoText(instance)
8435 if target_node is None:
8436 pnode = instance.primary_node
8437 all_nodes = instance.all_nodes
8442 if instance.disk_template in constants.DTS_FILEBASED:
8443 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8444 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8446 result.Raise("Failed to create directory '%s' on"
8447 " node %s" % (file_storage_dir, pnode))
8449 # Note: this needs to be kept in sync with adding of disks in
8450 # LUInstanceSetParams
8451 for idx, device in enumerate(instance.disks):
8452 if to_skip and idx in to_skip:
8454 logging.info("Creating volume %s for instance %s",
8455 device.iv_name, instance.name)
8457 for node in all_nodes:
8458 f_create = node == pnode
8459 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
8462 def _RemoveDisks(lu, instance, target_node=None):
8463 """Remove all disks for an instance.
8465 This abstracts away some work from `AddInstance()` and
8466 `RemoveInstance()`. Note that in case some of the devices couldn't
8467 be removed, the removal will continue with the other ones (compare
8468 with `_CreateDisks()`).
8470 @type lu: L{LogicalUnit}
8471 @param lu: the logical unit on whose behalf we execute
8472 @type instance: L{objects.Instance}
8473 @param instance: the instance whose disks we should remove
8474 @type target_node: string
8475 @param target_node: used to override the node on which to remove the disks
8477 @return: the success of the removal
8480 logging.info("Removing block devices for instance %s", instance.name)
8483 for device in instance.disks:
8485 edata = [(target_node, device)]
8487 edata = device.ComputeNodeTree(instance.primary_node)
8488 for node, disk in edata:
8489 lu.cfg.SetDiskID(disk, node)
8490 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8492 lu.LogWarning("Could not remove block device %s on node %s,"
8493 " continuing anyway: %s", device.iv_name, node, msg)
8496 # if this is a DRBD disk, return its port to the pool
8497 if device.dev_type in constants.LDS_DRBD:
8498 tcp_port = device.logical_id[2]
8499 lu.cfg.AddTcpUdpPort(tcp_port)
8501 if instance.disk_template == constants.DT_FILE:
8502 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8506 tgt = instance.primary_node
8507 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8509 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8510 file_storage_dir, instance.primary_node, result.fail_msg)
8516 def _ComputeDiskSizePerVG(disk_template, disks):
8517 """Compute disk size requirements in the volume group
8520 def _compute(disks, payload):
8521 """Universal algorithm.
8526 vgs[disk[constants.IDISK_VG]] = \
8527 vgs.get(constants.IDISK_VG, 0) + disk[constants.IDISK_SIZE] + payload
8531 # Required free disk space as a function of disk and swap space
8533 constants.DT_DISKLESS: {},
8534 constants.DT_PLAIN: _compute(disks, 0),
8535 # 128 MB are added for drbd metadata for each disk
8536 constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
8537 constants.DT_FILE: {},
8538 constants.DT_SHARED_FILE: {},
8541 if disk_template not in req_size_dict:
8542 raise errors.ProgrammerError("Disk template '%s' size requirement"
8543 " is unknown" % disk_template)
8545 return req_size_dict[disk_template]
8548 def _ComputeDiskSize(disk_template, disks):
8549 """Compute disk size requirements in the volume group
8552 # Required free disk space as a function of disk and swap space
8554 constants.DT_DISKLESS: None,
8555 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8556 # 128 MB are added for drbd metadata for each disk
8558 sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
8559 constants.DT_FILE: None,
8560 constants.DT_SHARED_FILE: 0,
8561 constants.DT_BLOCK: 0,
8564 if disk_template not in req_size_dict:
8565 raise errors.ProgrammerError("Disk template '%s' size requirement"
8566 " is unknown" % disk_template)
8568 return req_size_dict[disk_template]
8571 def _FilterVmNodes(lu, nodenames):
8572 """Filters out non-vm_capable nodes from a list.
8574 @type lu: L{LogicalUnit}
8575 @param lu: the logical unit for which we check
8576 @type nodenames: list
8577 @param nodenames: the list of nodes on which we should check
8579 @return: the list of vm-capable nodes
8582 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8583 return [name for name in nodenames if name not in vm_nodes]
8586 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8587 """Hypervisor parameter validation.
8589 This function abstract the hypervisor parameter validation to be
8590 used in both instance create and instance modify.
8592 @type lu: L{LogicalUnit}
8593 @param lu: the logical unit for which we check
8594 @type nodenames: list
8595 @param nodenames: the list of nodes on which we should check
8596 @type hvname: string
8597 @param hvname: the name of the hypervisor we should use
8598 @type hvparams: dict
8599 @param hvparams: the parameters which we need to check
8600 @raise errors.OpPrereqError: if the parameters are not valid
8603 nodenames = _FilterVmNodes(lu, nodenames)
8605 cluster = lu.cfg.GetClusterInfo()
8606 hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
8608 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
8609 for node in nodenames:
8613 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8616 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8617 """OS parameters validation.
8619 @type lu: L{LogicalUnit}
8620 @param lu: the logical unit for which we check
8621 @type required: boolean
8622 @param required: whether the validation should fail if the OS is not
8624 @type nodenames: list
8625 @param nodenames: the list of nodes on which we should check
8626 @type osname: string
8627 @param osname: the name of the hypervisor we should use
8628 @type osparams: dict
8629 @param osparams: the parameters which we need to check
8630 @raise errors.OpPrereqError: if the parameters are not valid
8633 nodenames = _FilterVmNodes(lu, nodenames)
8634 result = lu.rpc.call_os_validate(nodenames, required, osname,
8635 [constants.OS_VALIDATE_PARAMETERS],
8637 for node, nres in result.items():
8638 # we don't check for offline cases since this should be run only
8639 # against the master node and/or an instance's nodes
8640 nres.Raise("OS Parameters validation failed on node %s" % node)
8641 if not nres.payload:
8642 lu.LogInfo("OS %s not found on node %s, validation skipped",
8646 class LUInstanceCreate(LogicalUnit):
8647 """Create an instance.
8650 HPATH = "instance-add"
8651 HTYPE = constants.HTYPE_INSTANCE
8654 def CheckArguments(self):
8658 # do not require name_check to ease forward/backward compatibility
8660 if self.op.no_install and self.op.start:
8661 self.LogInfo("No-installation mode selected, disabling startup")
8662 self.op.start = False
8663 # validate/normalize the instance name
8664 self.op.instance_name = \
8665 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8667 if self.op.ip_check and not self.op.name_check:
8668 # TODO: make the ip check more flexible and not depend on the name check
8669 raise errors.OpPrereqError("Cannot do IP address check without a name"
8670 " check", errors.ECODE_INVAL)
8672 # check nics' parameter names
8673 for nic in self.op.nics:
8674 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8676 # check disks. parameter names and consistent adopt/no-adopt strategy
8677 has_adopt = has_no_adopt = False
8678 for disk in self.op.disks:
8679 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8680 if constants.IDISK_ADOPT in disk:
8684 if has_adopt and has_no_adopt:
8685 raise errors.OpPrereqError("Either all disks are adopted or none is",
8688 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8689 raise errors.OpPrereqError("Disk adoption is not supported for the"
8690 " '%s' disk template" %
8691 self.op.disk_template,
8693 if self.op.iallocator is not None:
8694 raise errors.OpPrereqError("Disk adoption not allowed with an"
8695 " iallocator script", errors.ECODE_INVAL)
8696 if self.op.mode == constants.INSTANCE_IMPORT:
8697 raise errors.OpPrereqError("Disk adoption not allowed for"
8698 " instance import", errors.ECODE_INVAL)
8700 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8701 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8702 " but no 'adopt' parameter given" %
8703 self.op.disk_template,
8706 self.adopt_disks = has_adopt
8708 # instance name verification
8709 if self.op.name_check:
8710 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8711 self.op.instance_name = self.hostname1.name
8712 # used in CheckPrereq for ip ping check
8713 self.check_ip = self.hostname1.ip
8715 self.check_ip = None
8717 # file storage checks
8718 if (self.op.file_driver and
8719 not self.op.file_driver in constants.FILE_DRIVER):
8720 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8721 self.op.file_driver, errors.ECODE_INVAL)
8723 if self.op.disk_template == constants.DT_FILE:
8724 opcodes.RequireFileStorage()
8725 elif self.op.disk_template == constants.DT_SHARED_FILE:
8726 opcodes.RequireSharedFileStorage()
8728 ### Node/iallocator related checks
8729 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8731 if self.op.pnode is not None:
8732 if self.op.disk_template in constants.DTS_INT_MIRROR:
8733 if self.op.snode is None:
8734 raise errors.OpPrereqError("The networked disk templates need"
8735 " a mirror node", errors.ECODE_INVAL)
8737 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8739 self.op.snode = None
8741 self._cds = _GetClusterDomainSecret()
8743 if self.op.mode == constants.INSTANCE_IMPORT:
8744 # On import force_variant must be True, because if we forced it at
8745 # initial install, our only chance when importing it back is that it
8747 self.op.force_variant = True
8749 if self.op.no_install:
8750 self.LogInfo("No-installation mode has no effect during import")
8752 elif self.op.mode == constants.INSTANCE_CREATE:
8753 if self.op.os_type is None:
8754 raise errors.OpPrereqError("No guest OS specified",
8756 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8757 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8758 " installation" % self.op.os_type,
8760 if self.op.disk_template is None:
8761 raise errors.OpPrereqError("No disk template specified",
8764 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8765 # Check handshake to ensure both clusters have the same domain secret
8766 src_handshake = self.op.source_handshake
8767 if not src_handshake:
8768 raise errors.OpPrereqError("Missing source handshake",
8771 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8774 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8777 # Load and check source CA
8778 self.source_x509_ca_pem = self.op.source_x509_ca
8779 if not self.source_x509_ca_pem:
8780 raise errors.OpPrereqError("Missing source X509 CA",
8784 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8786 except OpenSSL.crypto.Error, err:
8787 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8788 (err, ), errors.ECODE_INVAL)
8790 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8791 if errcode is not None:
8792 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8795 self.source_x509_ca = cert
8797 src_instance_name = self.op.source_instance_name
8798 if not src_instance_name:
8799 raise errors.OpPrereqError("Missing source instance name",
8802 self.source_instance_name = \
8803 netutils.GetHostname(name=src_instance_name).name
8806 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8807 self.op.mode, errors.ECODE_INVAL)
8809 def ExpandNames(self):
8810 """ExpandNames for CreateInstance.
8812 Figure out the right locks for instance creation.
8815 self.needed_locks = {}
8817 instance_name = self.op.instance_name
8818 # this is just a preventive check, but someone might still add this
8819 # instance in the meantime, and creation will fail at lock-add time
8820 if instance_name in self.cfg.GetInstanceList():
8821 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8822 instance_name, errors.ECODE_EXISTS)
8824 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8826 if self.op.iallocator:
8827 # TODO: Find a solution to not lock all nodes in the cluster, e.g. by
8828 # specifying a group on instance creation and then selecting nodes from
8830 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8831 self.needed_locks[locking.LEVEL_NODE_RES] = locking.ALL_SET
8833 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8834 nodelist = [self.op.pnode]
8835 if self.op.snode is not None:
8836 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8837 nodelist.append(self.op.snode)
8838 self.needed_locks[locking.LEVEL_NODE] = nodelist
8839 # Lock resources of instance's primary and secondary nodes (copy to
8840 # prevent accidential modification)
8841 self.needed_locks[locking.LEVEL_NODE_RES] = list(nodelist)
8843 # in case of import lock the source node too
8844 if self.op.mode == constants.INSTANCE_IMPORT:
8845 src_node = self.op.src_node
8846 src_path = self.op.src_path
8848 if src_path is None:
8849 self.op.src_path = src_path = self.op.instance_name
8851 if src_node is None:
8852 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8853 self.op.src_node = None
8854 if os.path.isabs(src_path):
8855 raise errors.OpPrereqError("Importing an instance from a path"
8856 " requires a source node option",
8859 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8860 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8861 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8862 if not os.path.isabs(src_path):
8863 self.op.src_path = src_path = \
8864 utils.PathJoin(constants.EXPORT_DIR, src_path)
8866 def _RunAllocator(self):
8867 """Run the allocator based on input opcode.
8870 nics = [n.ToDict() for n in self.nics]
8871 ial = IAllocator(self.cfg, self.rpc,
8872 mode=constants.IALLOCATOR_MODE_ALLOC,
8873 name=self.op.instance_name,
8874 disk_template=self.op.disk_template,
8877 vcpus=self.be_full[constants.BE_VCPUS],
8878 memory=self.be_full[constants.BE_MAXMEM],
8881 hypervisor=self.op.hypervisor,
8884 ial.Run(self.op.iallocator)
8887 raise errors.OpPrereqError("Can't compute nodes using"
8888 " iallocator '%s': %s" %
8889 (self.op.iallocator, ial.info),
8891 if len(ial.result) != ial.required_nodes:
8892 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8893 " of nodes (%s), required %s" %
8894 (self.op.iallocator, len(ial.result),
8895 ial.required_nodes), errors.ECODE_FAULT)
8896 self.op.pnode = ial.result[0]
8897 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8898 self.op.instance_name, self.op.iallocator,
8899 utils.CommaJoin(ial.result))
8900 if ial.required_nodes == 2:
8901 self.op.snode = ial.result[1]
8903 def BuildHooksEnv(self):
8906 This runs on master, primary and secondary nodes of the instance.
8910 "ADD_MODE": self.op.mode,
8912 if self.op.mode == constants.INSTANCE_IMPORT:
8913 env["SRC_NODE"] = self.op.src_node
8914 env["SRC_PATH"] = self.op.src_path
8915 env["SRC_IMAGES"] = self.src_images
8917 env.update(_BuildInstanceHookEnv(
8918 name=self.op.instance_name,
8919 primary_node=self.op.pnode,
8920 secondary_nodes=self.secondaries,
8921 status=self.op.start,
8922 os_type=self.op.os_type,
8923 minmem=self.be_full[constants.BE_MINMEM],
8924 maxmem=self.be_full[constants.BE_MAXMEM],
8925 vcpus=self.be_full[constants.BE_VCPUS],
8926 nics=_NICListToTuple(self, self.nics),
8927 disk_template=self.op.disk_template,
8928 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8929 for d in self.disks],
8932 hypervisor_name=self.op.hypervisor,
8938 def BuildHooksNodes(self):
8939 """Build hooks nodes.
8942 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8945 def _ReadExportInfo(self):
8946 """Reads the export information from disk.
8948 It will override the opcode source node and path with the actual
8949 information, if these two were not specified before.
8951 @return: the export information
8954 assert self.op.mode == constants.INSTANCE_IMPORT
8956 src_node = self.op.src_node
8957 src_path = self.op.src_path
8959 if src_node is None:
8960 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8961 exp_list = self.rpc.call_export_list(locked_nodes)
8963 for node in exp_list:
8964 if exp_list[node].fail_msg:
8966 if src_path in exp_list[node].payload:
8968 self.op.src_node = src_node = node
8969 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8973 raise errors.OpPrereqError("No export found for relative path %s" %
8974 src_path, errors.ECODE_INVAL)
8976 _CheckNodeOnline(self, src_node)
8977 result = self.rpc.call_export_info(src_node, src_path)
8978 result.Raise("No export or invalid export found in dir %s" % src_path)
8980 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8981 if not export_info.has_section(constants.INISECT_EXP):
8982 raise errors.ProgrammerError("Corrupted export config",
8983 errors.ECODE_ENVIRON)
8985 ei_version = export_info.get(constants.INISECT_EXP, "version")
8986 if (int(ei_version) != constants.EXPORT_VERSION):
8987 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8988 (ei_version, constants.EXPORT_VERSION),
8989 errors.ECODE_ENVIRON)
8992 def _ReadExportParams(self, einfo):
8993 """Use export parameters as defaults.
8995 In case the opcode doesn't specify (as in override) some instance
8996 parameters, then try to use them from the export information, if
9000 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
9002 if self.op.disk_template is None:
9003 if einfo.has_option(constants.INISECT_INS, "disk_template"):
9004 self.op.disk_template = einfo.get(constants.INISECT_INS,
9006 if self.op.disk_template not in constants.DISK_TEMPLATES:
9007 raise errors.OpPrereqError("Disk template specified in configuration"
9008 " file is not one of the allowed values:"
9009 " %s" % " ".join(constants.DISK_TEMPLATES))
9011 raise errors.OpPrereqError("No disk template specified and the export"
9012 " is missing the disk_template information",
9015 if not self.op.disks:
9017 # TODO: import the disk iv_name too
9018 for idx in range(constants.MAX_DISKS):
9019 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
9020 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
9021 disks.append({constants.IDISK_SIZE: disk_sz})
9022 self.op.disks = disks
9023 if not disks and self.op.disk_template != constants.DT_DISKLESS:
9024 raise errors.OpPrereqError("No disk info specified and the export"
9025 " is missing the disk information",
9028 if not self.op.nics:
9030 for idx in range(constants.MAX_NICS):
9031 if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
9033 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
9034 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
9041 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
9042 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
9044 if (self.op.hypervisor is None and
9045 einfo.has_option(constants.INISECT_INS, "hypervisor")):
9046 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
9048 if einfo.has_section(constants.INISECT_HYP):
9049 # use the export parameters but do not override the ones
9050 # specified by the user
9051 for name, value in einfo.items(constants.INISECT_HYP):
9052 if name not in self.op.hvparams:
9053 self.op.hvparams[name] = value
9055 if einfo.has_section(constants.INISECT_BEP):
9056 # use the parameters, without overriding
9057 for name, value in einfo.items(constants.INISECT_BEP):
9058 if name not in self.op.beparams:
9059 self.op.beparams[name] = value
9060 # Compatibility for the old "memory" be param
9061 if name == constants.BE_MEMORY:
9062 if constants.BE_MAXMEM not in self.op.beparams:
9063 self.op.beparams[constants.BE_MAXMEM] = value
9064 if constants.BE_MINMEM not in self.op.beparams:
9065 self.op.beparams[constants.BE_MINMEM] = value
9067 # try to read the parameters old style, from the main section
9068 for name in constants.BES_PARAMETERS:
9069 if (name not in self.op.beparams and
9070 einfo.has_option(constants.INISECT_INS, name)):
9071 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
9073 if einfo.has_section(constants.INISECT_OSP):
9074 # use the parameters, without overriding
9075 for name, value in einfo.items(constants.INISECT_OSP):
9076 if name not in self.op.osparams:
9077 self.op.osparams[name] = value
9079 def _RevertToDefaults(self, cluster):
9080 """Revert the instance parameters to the default values.
9084 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
9085 for name in self.op.hvparams.keys():
9086 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
9087 del self.op.hvparams[name]
9089 be_defs = cluster.SimpleFillBE({})
9090 for name in self.op.beparams.keys():
9091 if name in be_defs and be_defs[name] == self.op.beparams[name]:
9092 del self.op.beparams[name]
9094 nic_defs = cluster.SimpleFillNIC({})
9095 for nic in self.op.nics:
9096 for name in constants.NICS_PARAMETERS:
9097 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
9100 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
9101 for name in self.op.osparams.keys():
9102 if name in os_defs and os_defs[name] == self.op.osparams[name]:
9103 del self.op.osparams[name]
9105 def _CalculateFileStorageDir(self):
9106 """Calculate final instance file storage dir.
9109 # file storage dir calculation/check
9110 self.instance_file_storage_dir = None
9111 if self.op.disk_template in constants.DTS_FILEBASED:
9112 # build the full file storage dir path
9115 if self.op.disk_template == constants.DT_SHARED_FILE:
9116 get_fsd_fn = self.cfg.GetSharedFileStorageDir
9118 get_fsd_fn = self.cfg.GetFileStorageDir
9120 cfg_storagedir = get_fsd_fn()
9121 if not cfg_storagedir:
9122 raise errors.OpPrereqError("Cluster file storage dir not defined")
9123 joinargs.append(cfg_storagedir)
9125 if self.op.file_storage_dir is not None:
9126 joinargs.append(self.op.file_storage_dir)
9128 joinargs.append(self.op.instance_name)
9130 # pylint: disable=W0142
9131 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
9133 def CheckPrereq(self):
9134 """Check prerequisites.
9137 self._CalculateFileStorageDir()
9139 if self.op.mode == constants.INSTANCE_IMPORT:
9140 export_info = self._ReadExportInfo()
9141 self._ReadExportParams(export_info)
9143 if (not self.cfg.GetVGName() and
9144 self.op.disk_template not in constants.DTS_NOT_LVM):
9145 raise errors.OpPrereqError("Cluster does not support lvm-based"
9146 " instances", errors.ECODE_STATE)
9148 if (self.op.hypervisor is None or
9149 self.op.hypervisor == constants.VALUE_AUTO):
9150 self.op.hypervisor = self.cfg.GetHypervisorType()
9152 cluster = self.cfg.GetClusterInfo()
9153 enabled_hvs = cluster.enabled_hypervisors
9154 if self.op.hypervisor not in enabled_hvs:
9155 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
9156 " cluster (%s)" % (self.op.hypervisor,
9157 ",".join(enabled_hvs)),
9160 # Check tag validity
9161 for tag in self.op.tags:
9162 objects.TaggableObject.ValidateTag(tag)
9164 # check hypervisor parameter syntax (locally)
9165 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
9166 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
9168 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
9169 hv_type.CheckParameterSyntax(filled_hvp)
9170 self.hv_full = filled_hvp
9171 # check that we don't specify global parameters on an instance
9172 _CheckGlobalHvParams(self.op.hvparams)
9174 # fill and remember the beparams dict
9175 default_beparams = cluster.beparams[constants.PP_DEFAULT]
9176 for param, value in self.op.beparams.iteritems():
9177 if value == constants.VALUE_AUTO:
9178 self.op.beparams[param] = default_beparams[param]
9179 objects.UpgradeBeParams(self.op.beparams)
9180 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
9181 self.be_full = cluster.SimpleFillBE(self.op.beparams)
9183 # build os parameters
9184 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
9186 # now that hvp/bep are in final format, let's reset to defaults,
9188 if self.op.identify_defaults:
9189 self._RevertToDefaults(cluster)
9193 for idx, nic in enumerate(self.op.nics):
9194 nic_mode_req = nic.get(constants.INIC_MODE, None)
9195 nic_mode = nic_mode_req
9196 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
9197 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
9199 # in routed mode, for the first nic, the default ip is 'auto'
9200 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
9201 default_ip_mode = constants.VALUE_AUTO
9203 default_ip_mode = constants.VALUE_NONE
9205 # ip validity checks
9206 ip = nic.get(constants.INIC_IP, default_ip_mode)
9207 if ip is None or ip.lower() == constants.VALUE_NONE:
9209 elif ip.lower() == constants.VALUE_AUTO:
9210 if not self.op.name_check:
9211 raise errors.OpPrereqError("IP address set to auto but name checks"
9212 " have been skipped",
9214 nic_ip = self.hostname1.ip
9216 if not netutils.IPAddress.IsValid(ip):
9217 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
9221 # TODO: check the ip address for uniqueness
9222 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
9223 raise errors.OpPrereqError("Routed nic mode requires an ip address",
9226 # MAC address verification
9227 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
9228 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9229 mac = utils.NormalizeAndValidateMac(mac)
9232 self.cfg.ReserveMAC(mac, self.proc.GetECId())
9233 except errors.ReservationError:
9234 raise errors.OpPrereqError("MAC address %s already in use"
9235 " in cluster" % mac,
9236 errors.ECODE_NOTUNIQUE)
9238 # Build nic parameters
9239 link = nic.get(constants.INIC_LINK, None)
9240 if link == constants.VALUE_AUTO:
9241 link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
9244 nicparams[constants.NIC_MODE] = nic_mode
9246 nicparams[constants.NIC_LINK] = link
9248 check_params = cluster.SimpleFillNIC(nicparams)
9249 objects.NIC.CheckParameterSyntax(check_params)
9250 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
9252 # disk checks/pre-build
9253 default_vg = self.cfg.GetVGName()
9255 for disk in self.op.disks:
9256 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
9257 if mode not in constants.DISK_ACCESS_SET:
9258 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
9259 mode, errors.ECODE_INVAL)
9260 size = disk.get(constants.IDISK_SIZE, None)
9262 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
9265 except (TypeError, ValueError):
9266 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
9269 data_vg = disk.get(constants.IDISK_VG, default_vg)
9271 constants.IDISK_SIZE: size,
9272 constants.IDISK_MODE: mode,
9273 constants.IDISK_VG: data_vg,
9274 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
9276 if constants.IDISK_ADOPT in disk:
9277 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
9278 self.disks.append(new_disk)
9280 if self.op.mode == constants.INSTANCE_IMPORT:
9282 for idx in range(len(self.disks)):
9283 option = "disk%d_dump" % idx
9284 if export_info.has_option(constants.INISECT_INS, option):
9285 # FIXME: are the old os-es, disk sizes, etc. useful?
9286 export_name = export_info.get(constants.INISECT_INS, option)
9287 image = utils.PathJoin(self.op.src_path, export_name)
9288 disk_images.append(image)
9290 disk_images.append(False)
9292 self.src_images = disk_images
9294 old_name = export_info.get(constants.INISECT_INS, "name")
9295 if self.op.instance_name == old_name:
9296 for idx, nic in enumerate(self.nics):
9297 if nic.mac == constants.VALUE_AUTO:
9298 nic_mac_ini = "nic%d_mac" % idx
9299 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
9301 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
9303 # ip ping checks (we use the same ip that was resolved in ExpandNames)
9304 if self.op.ip_check:
9305 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
9306 raise errors.OpPrereqError("IP %s of instance %s already in use" %
9307 (self.check_ip, self.op.instance_name),
9308 errors.ECODE_NOTUNIQUE)
9310 #### mac address generation
9311 # By generating here the mac address both the allocator and the hooks get
9312 # the real final mac address rather than the 'auto' or 'generate' value.
9313 # There is a race condition between the generation and the instance object
9314 # creation, which means that we know the mac is valid now, but we're not
9315 # sure it will be when we actually add the instance. If things go bad
9316 # adding the instance will abort because of a duplicate mac, and the
9317 # creation job will fail.
9318 for nic in self.nics:
9319 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9320 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
9324 if self.op.iallocator is not None:
9325 self._RunAllocator()
9327 # Release all unneeded node locks
9328 _ReleaseLocks(self, locking.LEVEL_NODE,
9329 keep=filter(None, [self.op.pnode, self.op.snode,
9332 #### node related checks
9334 # check primary node
9335 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
9336 assert self.pnode is not None, \
9337 "Cannot retrieve locked node %s" % self.op.pnode
9339 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
9340 pnode.name, errors.ECODE_STATE)
9342 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
9343 pnode.name, errors.ECODE_STATE)
9344 if not pnode.vm_capable:
9345 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
9346 " '%s'" % pnode.name, errors.ECODE_STATE)
9348 self.secondaries = []
9350 # mirror node verification
9351 if self.op.disk_template in constants.DTS_INT_MIRROR:
9352 if self.op.snode == pnode.name:
9353 raise errors.OpPrereqError("The secondary node cannot be the"
9354 " primary node", errors.ECODE_INVAL)
9355 _CheckNodeOnline(self, self.op.snode)
9356 _CheckNodeNotDrained(self, self.op.snode)
9357 _CheckNodeVmCapable(self, self.op.snode)
9358 self.secondaries.append(self.op.snode)
9360 snode = self.cfg.GetNodeInfo(self.op.snode)
9361 if pnode.group != snode.group:
9362 self.LogWarning("The primary and secondary nodes are in two"
9363 " different node groups; the disk parameters"
9364 " from the first disk's node group will be"
9367 nodenames = [pnode.name] + self.secondaries
9369 # disk parameters (not customizable at instance or node level)
9370 # just use the primary node parameters, ignoring the secondary.
9371 self.diskparams = self.cfg.GetNodeGroup(pnode.group).diskparams
9373 if not self.adopt_disks:
9374 # Check lv size requirements, if not adopting
9375 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
9376 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
9378 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
9379 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
9380 disk[constants.IDISK_ADOPT])
9381 for disk in self.disks])
9382 if len(all_lvs) != len(self.disks):
9383 raise errors.OpPrereqError("Duplicate volume names given for adoption",
9385 for lv_name in all_lvs:
9387 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
9388 # to ReserveLV uses the same syntax
9389 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
9390 except errors.ReservationError:
9391 raise errors.OpPrereqError("LV named %s used by another instance" %
9392 lv_name, errors.ECODE_NOTUNIQUE)
9394 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
9395 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
9397 node_lvs = self.rpc.call_lv_list([pnode.name],
9398 vg_names.payload.keys())[pnode.name]
9399 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
9400 node_lvs = node_lvs.payload
9402 delta = all_lvs.difference(node_lvs.keys())
9404 raise errors.OpPrereqError("Missing logical volume(s): %s" %
9405 utils.CommaJoin(delta),
9407 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
9409 raise errors.OpPrereqError("Online logical volumes found, cannot"
9410 " adopt: %s" % utils.CommaJoin(online_lvs),
9412 # update the size of disk based on what is found
9413 for dsk in self.disks:
9414 dsk[constants.IDISK_SIZE] = \
9415 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
9416 dsk[constants.IDISK_ADOPT])][0]))
9418 elif self.op.disk_template == constants.DT_BLOCK:
9419 # Normalize and de-duplicate device paths
9420 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
9421 for disk in self.disks])
9422 if len(all_disks) != len(self.disks):
9423 raise errors.OpPrereqError("Duplicate disk names given for adoption",
9425 baddisks = [d for d in all_disks
9426 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
9428 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
9429 " cannot be adopted" %
9430 (", ".join(baddisks),
9431 constants.ADOPTABLE_BLOCKDEV_ROOT),
9434 node_disks = self.rpc.call_bdev_sizes([pnode.name],
9435 list(all_disks))[pnode.name]
9436 node_disks.Raise("Cannot get block device information from node %s" %
9438 node_disks = node_disks.payload
9439 delta = all_disks.difference(node_disks.keys())
9441 raise errors.OpPrereqError("Missing block device(s): %s" %
9442 utils.CommaJoin(delta),
9444 for dsk in self.disks:
9445 dsk[constants.IDISK_SIZE] = \
9446 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9448 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9450 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9451 # check OS parameters (remotely)
9452 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9454 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9456 # memory check on primary node
9457 #TODO(dynmem): use MINMEM for checking
9459 _CheckNodeFreeMemory(self, self.pnode.name,
9460 "creating instance %s" % self.op.instance_name,
9461 self.be_full[constants.BE_MAXMEM],
9464 self.dry_run_result = list(nodenames)
9466 def Exec(self, feedback_fn):
9467 """Create and add the instance to the cluster.
9470 instance = self.op.instance_name
9471 pnode_name = self.pnode.name
9473 assert not (self.owned_locks(locking.LEVEL_NODE_RES) -
9474 self.owned_locks(locking.LEVEL_NODE)), \
9475 "Node locks differ from node resource locks"
9477 ht_kind = self.op.hypervisor
9478 if ht_kind in constants.HTS_REQ_PORT:
9479 network_port = self.cfg.AllocatePort()
9483 disks = _GenerateDiskTemplate(self,
9484 self.op.disk_template,
9485 instance, pnode_name,
9488 self.instance_file_storage_dir,
9489 self.op.file_driver,
9494 iobj = objects.Instance(name=instance, os=self.op.os_type,
9495 primary_node=pnode_name,
9496 nics=self.nics, disks=disks,
9497 disk_template=self.op.disk_template,
9498 admin_state=constants.ADMINST_DOWN,
9499 network_port=network_port,
9500 beparams=self.op.beparams,
9501 hvparams=self.op.hvparams,
9502 hypervisor=self.op.hypervisor,
9503 osparams=self.op.osparams,
9507 for tag in self.op.tags:
9510 if self.adopt_disks:
9511 if self.op.disk_template == constants.DT_PLAIN:
9512 # rename LVs to the newly-generated names; we need to construct
9513 # 'fake' LV disks with the old data, plus the new unique_id
9514 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9516 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9517 rename_to.append(t_dsk.logical_id)
9518 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9519 self.cfg.SetDiskID(t_dsk, pnode_name)
9520 result = self.rpc.call_blockdev_rename(pnode_name,
9521 zip(tmp_disks, rename_to))
9522 result.Raise("Failed to rename adoped LVs")
9524 feedback_fn("* creating instance disks...")
9526 _CreateDisks(self, iobj)
9527 except errors.OpExecError:
9528 self.LogWarning("Device creation failed, reverting...")
9530 _RemoveDisks(self, iobj)
9532 self.cfg.ReleaseDRBDMinors(instance)
9535 feedback_fn("adding instance %s to cluster config" % instance)
9537 self.cfg.AddInstance(iobj, self.proc.GetECId())
9539 # Declare that we don't want to remove the instance lock anymore, as we've
9540 # added the instance to the config
9541 del self.remove_locks[locking.LEVEL_INSTANCE]
9543 if self.op.mode == constants.INSTANCE_IMPORT:
9544 # Release unused nodes
9545 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9548 _ReleaseLocks(self, locking.LEVEL_NODE)
9551 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9552 feedback_fn("* wiping instance disks...")
9554 _WipeDisks(self, iobj)
9555 except errors.OpExecError, err:
9556 logging.exception("Wiping disks failed")
9557 self.LogWarning("Wiping instance disks failed (%s)", err)
9561 # Something is already wrong with the disks, don't do anything else
9563 elif self.op.wait_for_sync:
9564 disk_abort = not _WaitForSync(self, iobj)
9565 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9566 # make sure the disks are not degraded (still sync-ing is ok)
9567 feedback_fn("* checking mirrors status")
9568 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9573 _RemoveDisks(self, iobj)
9574 self.cfg.RemoveInstance(iobj.name)
9575 # Make sure the instance lock gets removed
9576 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9577 raise errors.OpExecError("There are some degraded disks for"
9580 # Release all node resource locks
9581 _ReleaseLocks(self, locking.LEVEL_NODE_RES)
9583 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9584 if self.op.mode == constants.INSTANCE_CREATE:
9585 if not self.op.no_install:
9586 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9587 not self.op.wait_for_sync)
9589 feedback_fn("* pausing disk sync to install instance OS")
9590 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9592 for idx, success in enumerate(result.payload):
9594 logging.warn("pause-sync of instance %s for disk %d failed",
9597 feedback_fn("* running the instance OS create scripts...")
9598 # FIXME: pass debug option from opcode to backend
9600 self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
9601 self.op.debug_level)
9603 feedback_fn("* resuming disk sync")
9604 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9606 for idx, success in enumerate(result.payload):
9608 logging.warn("resume-sync of instance %s for disk %d failed",
9611 os_add_result.Raise("Could not add os for instance %s"
9612 " on node %s" % (instance, pnode_name))
9614 elif self.op.mode == constants.INSTANCE_IMPORT:
9615 feedback_fn("* running the instance OS import scripts...")
9619 for idx, image in enumerate(self.src_images):
9623 # FIXME: pass debug option from opcode to backend
9624 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9625 constants.IEIO_FILE, (image, ),
9626 constants.IEIO_SCRIPT,
9627 (iobj.disks[idx], idx),
9629 transfers.append(dt)
9632 masterd.instance.TransferInstanceData(self, feedback_fn,
9633 self.op.src_node, pnode_name,
9634 self.pnode.secondary_ip,
9636 if not compat.all(import_result):
9637 self.LogWarning("Some disks for instance %s on node %s were not"
9638 " imported successfully" % (instance, pnode_name))
9640 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9641 feedback_fn("* preparing remote import...")
9642 # The source cluster will stop the instance before attempting to make a
9643 # connection. In some cases stopping an instance can take a long time,
9644 # hence the shutdown timeout is added to the connection timeout.
9645 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9646 self.op.source_shutdown_timeout)
9647 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9649 assert iobj.primary_node == self.pnode.name
9651 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9652 self.source_x509_ca,
9653 self._cds, timeouts)
9654 if not compat.all(disk_results):
9655 # TODO: Should the instance still be started, even if some disks
9656 # failed to import (valid for local imports, too)?
9657 self.LogWarning("Some disks for instance %s on node %s were not"
9658 " imported successfully" % (instance, pnode_name))
9660 # Run rename script on newly imported instance
9661 assert iobj.name == instance
9662 feedback_fn("Running rename script for %s" % instance)
9663 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9664 self.source_instance_name,
9665 self.op.debug_level)
9667 self.LogWarning("Failed to run rename script for %s on node"
9668 " %s: %s" % (instance, pnode_name, result.fail_msg))
9671 # also checked in the prereq part
9672 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9675 assert not self.owned_locks(locking.LEVEL_NODE_RES)
9678 iobj.admin_state = constants.ADMINST_UP
9679 self.cfg.Update(iobj, feedback_fn)
9680 logging.info("Starting instance %s on node %s", instance, pnode_name)
9681 feedback_fn("* starting instance...")
9682 result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
9684 result.Raise("Could not start instance")
9686 return list(iobj.all_nodes)
9689 class LUInstanceConsole(NoHooksLU):
9690 """Connect to an instance's console.
9692 This is somewhat special in that it returns the command line that
9693 you need to run on the master node in order to connect to the
9699 def ExpandNames(self):
9700 self.share_locks = _ShareAll()
9701 self._ExpandAndLockInstance()
9703 def CheckPrereq(self):
9704 """Check prerequisites.
9706 This checks that the instance is in the cluster.
9709 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9710 assert self.instance is not None, \
9711 "Cannot retrieve locked instance %s" % self.op.instance_name
9712 _CheckNodeOnline(self, self.instance.primary_node)
9714 def Exec(self, feedback_fn):
9715 """Connect to the console of an instance
9718 instance = self.instance
9719 node = instance.primary_node
9721 node_insts = self.rpc.call_instance_list([node],
9722 [instance.hypervisor])[node]
9723 node_insts.Raise("Can't get node information from %s" % node)
9725 if instance.name not in node_insts.payload:
9726 if instance.admin_state == constants.ADMINST_UP:
9727 state = constants.INSTST_ERRORDOWN
9728 elif instance.admin_state == constants.ADMINST_DOWN:
9729 state = constants.INSTST_ADMINDOWN
9731 state = constants.INSTST_ADMINOFFLINE
9732 raise errors.OpExecError("Instance %s is not running (state %s)" %
9733 (instance.name, state))
9735 logging.debug("Connecting to console of %s on %s", instance.name, node)
9737 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9740 def _GetInstanceConsole(cluster, instance):
9741 """Returns console information for an instance.
9743 @type cluster: L{objects.Cluster}
9744 @type instance: L{objects.Instance}
9748 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9749 # beparams and hvparams are passed separately, to avoid editing the
9750 # instance and then saving the defaults in the instance itself.
9751 hvparams = cluster.FillHV(instance)
9752 beparams = cluster.FillBE(instance)
9753 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9755 assert console.instance == instance.name
9756 assert console.Validate()
9758 return console.ToDict()
9761 class LUInstanceReplaceDisks(LogicalUnit):
9762 """Replace the disks of an instance.
9765 HPATH = "mirrors-replace"
9766 HTYPE = constants.HTYPE_INSTANCE
9769 def CheckArguments(self):
9770 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9773 def ExpandNames(self):
9774 self._ExpandAndLockInstance()
9776 assert locking.LEVEL_NODE not in self.needed_locks
9777 assert locking.LEVEL_NODE_RES not in self.needed_locks
9778 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9780 assert self.op.iallocator is None or self.op.remote_node is None, \
9781 "Conflicting options"
9783 if self.op.remote_node is not None:
9784 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9786 # Warning: do not remove the locking of the new secondary here
9787 # unless DRBD8.AddChildren is changed to work in parallel;
9788 # currently it doesn't since parallel invocations of
9789 # FindUnusedMinor will conflict
9790 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9791 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9793 self.needed_locks[locking.LEVEL_NODE] = []
9794 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9796 if self.op.iallocator is not None:
9797 # iallocator will select a new node in the same group
9798 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9800 self.needed_locks[locking.LEVEL_NODE_RES] = []
9802 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9803 self.op.iallocator, self.op.remote_node,
9804 self.op.disks, False, self.op.early_release)
9806 self.tasklets = [self.replacer]
9808 def DeclareLocks(self, level):
9809 if level == locking.LEVEL_NODEGROUP:
9810 assert self.op.remote_node is None
9811 assert self.op.iallocator is not None
9812 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9814 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9815 # Lock all groups used by instance optimistically; this requires going
9816 # via the node before it's locked, requiring verification later on
9817 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9818 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9820 elif level == locking.LEVEL_NODE:
9821 if self.op.iallocator is not None:
9822 assert self.op.remote_node is None
9823 assert not self.needed_locks[locking.LEVEL_NODE]
9825 # Lock member nodes of all locked groups
9826 self.needed_locks[locking.LEVEL_NODE] = [node_name
9827 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9828 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9830 self._LockInstancesNodes()
9831 elif level == locking.LEVEL_NODE_RES:
9833 self.needed_locks[locking.LEVEL_NODE_RES] = \
9834 self.needed_locks[locking.LEVEL_NODE]
9836 def BuildHooksEnv(self):
9839 This runs on the master, the primary and all the secondaries.
9842 instance = self.replacer.instance
9844 "MODE": self.op.mode,
9845 "NEW_SECONDARY": self.op.remote_node,
9846 "OLD_SECONDARY": instance.secondary_nodes[0],
9848 env.update(_BuildInstanceHookEnvByObject(self, instance))
9851 def BuildHooksNodes(self):
9852 """Build hooks nodes.
9855 instance = self.replacer.instance
9857 self.cfg.GetMasterNode(),
9858 instance.primary_node,
9860 if self.op.remote_node is not None:
9861 nl.append(self.op.remote_node)
9864 def CheckPrereq(self):
9865 """Check prerequisites.
9868 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9869 self.op.iallocator is None)
9871 # Verify if node group locks are still correct
9872 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9874 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9876 return LogicalUnit.CheckPrereq(self)
9879 class TLReplaceDisks(Tasklet):
9880 """Replaces disks for an instance.
9882 Note: Locking is not within the scope of this class.
9885 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9886 disks, delay_iallocator, early_release):
9887 """Initializes this class.
9890 Tasklet.__init__(self, lu)
9893 self.instance_name = instance_name
9895 self.iallocator_name = iallocator_name
9896 self.remote_node = remote_node
9898 self.delay_iallocator = delay_iallocator
9899 self.early_release = early_release
9902 self.instance = None
9903 self.new_node = None
9904 self.target_node = None
9905 self.other_node = None
9906 self.remote_node_info = None
9907 self.node_secondary_ip = None
9910 def CheckArguments(mode, remote_node, iallocator):
9911 """Helper function for users of this class.
9914 # check for valid parameter combination
9915 if mode == constants.REPLACE_DISK_CHG:
9916 if remote_node is None and iallocator is None:
9917 raise errors.OpPrereqError("When changing the secondary either an"
9918 " iallocator script must be used or the"
9919 " new node given", errors.ECODE_INVAL)
9921 if remote_node is not None and iallocator is not None:
9922 raise errors.OpPrereqError("Give either the iallocator or the new"
9923 " secondary, not both", errors.ECODE_INVAL)
9925 elif remote_node is not None or iallocator is not None:
9926 # Not replacing the secondary
9927 raise errors.OpPrereqError("The iallocator and new node options can"
9928 " only be used when changing the"
9929 " secondary node", errors.ECODE_INVAL)
9932 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9933 """Compute a new secondary node using an IAllocator.
9936 ial = IAllocator(lu.cfg, lu.rpc,
9937 mode=constants.IALLOCATOR_MODE_RELOC,
9939 relocate_from=list(relocate_from))
9941 ial.Run(iallocator_name)
9944 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9945 " %s" % (iallocator_name, ial.info),
9948 if len(ial.result) != ial.required_nodes:
9949 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9950 " of nodes (%s), required %s" %
9952 len(ial.result), ial.required_nodes),
9955 remote_node_name = ial.result[0]
9957 lu.LogInfo("Selected new secondary for instance '%s': %s",
9958 instance_name, remote_node_name)
9960 return remote_node_name
9962 def _FindFaultyDisks(self, node_name):
9963 """Wrapper for L{_FindFaultyInstanceDisks}.
9966 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9969 def _CheckDisksActivated(self, instance):
9970 """Checks if the instance disks are activated.
9972 @param instance: The instance to check disks
9973 @return: True if they are activated, False otherwise
9976 nodes = instance.all_nodes
9978 for idx, dev in enumerate(instance.disks):
9980 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9981 self.cfg.SetDiskID(dev, node)
9983 result = self.rpc.call_blockdev_find(node, dev)
9987 elif result.fail_msg or not result.payload:
9992 def CheckPrereq(self):
9993 """Check prerequisites.
9995 This checks that the instance is in the cluster.
9998 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9999 assert instance is not None, \
10000 "Cannot retrieve locked instance %s" % self.instance_name
10002 if instance.disk_template != constants.DT_DRBD8:
10003 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
10004 " instances", errors.ECODE_INVAL)
10006 if len(instance.secondary_nodes) != 1:
10007 raise errors.OpPrereqError("The instance has a strange layout,"
10008 " expected one secondary but found %d" %
10009 len(instance.secondary_nodes),
10010 errors.ECODE_FAULT)
10012 if not self.delay_iallocator:
10013 self._CheckPrereq2()
10015 def _CheckPrereq2(self):
10016 """Check prerequisites, second part.
10018 This function should always be part of CheckPrereq. It was separated and is
10019 now called from Exec because during node evacuation iallocator was only
10020 called with an unmodified cluster model, not taking planned changes into
10024 instance = self.instance
10025 secondary_node = instance.secondary_nodes[0]
10027 if self.iallocator_name is None:
10028 remote_node = self.remote_node
10030 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
10031 instance.name, instance.secondary_nodes)
10033 if remote_node is None:
10034 self.remote_node_info = None
10036 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
10037 "Remote node '%s' is not locked" % remote_node
10039 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
10040 assert self.remote_node_info is not None, \
10041 "Cannot retrieve locked node %s" % remote_node
10043 if remote_node == self.instance.primary_node:
10044 raise errors.OpPrereqError("The specified node is the primary node of"
10045 " the instance", errors.ECODE_INVAL)
10047 if remote_node == secondary_node:
10048 raise errors.OpPrereqError("The specified node is already the"
10049 " secondary node of the instance",
10050 errors.ECODE_INVAL)
10052 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
10053 constants.REPLACE_DISK_CHG):
10054 raise errors.OpPrereqError("Cannot specify disks to be replaced",
10055 errors.ECODE_INVAL)
10057 if self.mode == constants.REPLACE_DISK_AUTO:
10058 if not self._CheckDisksActivated(instance):
10059 raise errors.OpPrereqError("Please run activate-disks on instance %s"
10060 " first" % self.instance_name,
10061 errors.ECODE_STATE)
10062 faulty_primary = self._FindFaultyDisks(instance.primary_node)
10063 faulty_secondary = self._FindFaultyDisks(secondary_node)
10065 if faulty_primary and faulty_secondary:
10066 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
10067 " one node and can not be repaired"
10068 " automatically" % self.instance_name,
10069 errors.ECODE_STATE)
10072 self.disks = faulty_primary
10073 self.target_node = instance.primary_node
10074 self.other_node = secondary_node
10075 check_nodes = [self.target_node, self.other_node]
10076 elif faulty_secondary:
10077 self.disks = faulty_secondary
10078 self.target_node = secondary_node
10079 self.other_node = instance.primary_node
10080 check_nodes = [self.target_node, self.other_node]
10086 # Non-automatic modes
10087 if self.mode == constants.REPLACE_DISK_PRI:
10088 self.target_node = instance.primary_node
10089 self.other_node = secondary_node
10090 check_nodes = [self.target_node, self.other_node]
10092 elif self.mode == constants.REPLACE_DISK_SEC:
10093 self.target_node = secondary_node
10094 self.other_node = instance.primary_node
10095 check_nodes = [self.target_node, self.other_node]
10097 elif self.mode == constants.REPLACE_DISK_CHG:
10098 self.new_node = remote_node
10099 self.other_node = instance.primary_node
10100 self.target_node = secondary_node
10101 check_nodes = [self.new_node, self.other_node]
10103 _CheckNodeNotDrained(self.lu, remote_node)
10104 _CheckNodeVmCapable(self.lu, remote_node)
10106 old_node_info = self.cfg.GetNodeInfo(secondary_node)
10107 assert old_node_info is not None
10108 if old_node_info.offline and not self.early_release:
10109 # doesn't make sense to delay the release
10110 self.early_release = True
10111 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
10112 " early-release mode", secondary_node)
10115 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
10118 # If not specified all disks should be replaced
10120 self.disks = range(len(self.instance.disks))
10122 # TODO: compute disk parameters
10123 primary_node_info = self.cfg.GetNodeInfo(instance.primary_node)
10124 secondary_node_info = self.cfg.GetNodeInfo(secondary_node)
10125 if primary_node_info.group != secondary_node_info.group:
10126 self.lu.LogInfo("The instance primary and secondary nodes are in two"
10127 " different node groups; the disk parameters of the"
10128 " primary node's group will be applied.")
10130 self.diskparams = self.cfg.GetNodeGroup(primary_node_info.group).diskparams
10132 for node in check_nodes:
10133 _CheckNodeOnline(self.lu, node)
10135 touched_nodes = frozenset(node_name for node_name in [self.new_node,
10138 if node_name is not None)
10140 # Release unneeded node and node resource locks
10141 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
10142 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES, keep=touched_nodes)
10144 # Release any owned node group
10145 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
10146 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
10148 # Check whether disks are valid
10149 for disk_idx in self.disks:
10150 instance.FindDisk(disk_idx)
10152 # Get secondary node IP addresses
10153 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
10154 in self.cfg.GetMultiNodeInfo(touched_nodes))
10156 def Exec(self, feedback_fn):
10157 """Execute disk replacement.
10159 This dispatches the disk replacement to the appropriate handler.
10162 if self.delay_iallocator:
10163 self._CheckPrereq2()
10166 # Verify owned locks before starting operation
10167 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
10168 assert set(owned_nodes) == set(self.node_secondary_ip), \
10169 ("Incorrect node locks, owning %s, expected %s" %
10170 (owned_nodes, self.node_secondary_ip.keys()))
10171 assert (self.lu.owned_locks(locking.LEVEL_NODE) ==
10172 self.lu.owned_locks(locking.LEVEL_NODE_RES))
10174 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
10175 assert list(owned_instances) == [self.instance_name], \
10176 "Instance '%s' not locked" % self.instance_name
10178 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
10179 "Should not own any node group lock at this point"
10182 feedback_fn("No disks need replacement")
10185 feedback_fn("Replacing disk(s) %s for %s" %
10186 (utils.CommaJoin(self.disks), self.instance.name))
10188 activate_disks = (self.instance.admin_state != constants.ADMINST_UP)
10190 # Activate the instance disks if we're replacing them on a down instance
10192 _StartInstanceDisks(self.lu, self.instance, True)
10195 # Should we replace the secondary node?
10196 if self.new_node is not None:
10197 fn = self._ExecDrbd8Secondary
10199 fn = self._ExecDrbd8DiskOnly
10201 result = fn(feedback_fn)
10203 # Deactivate the instance disks if we're replacing them on a
10206 _SafeShutdownInstanceDisks(self.lu, self.instance)
10208 assert not self.lu.owned_locks(locking.LEVEL_NODE)
10211 # Verify owned locks
10212 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE_RES)
10213 nodes = frozenset(self.node_secondary_ip)
10214 assert ((self.early_release and not owned_nodes) or
10215 (not self.early_release and not (set(owned_nodes) - nodes))), \
10216 ("Not owning the correct locks, early_release=%s, owned=%r,"
10217 " nodes=%r" % (self.early_release, owned_nodes, nodes))
10221 def _CheckVolumeGroup(self, nodes):
10222 self.lu.LogInfo("Checking volume groups")
10224 vgname = self.cfg.GetVGName()
10226 # Make sure volume group exists on all involved nodes
10227 results = self.rpc.call_vg_list(nodes)
10229 raise errors.OpExecError("Can't list volume groups on the nodes")
10232 res = results[node]
10233 res.Raise("Error checking node %s" % node)
10234 if vgname not in res.payload:
10235 raise errors.OpExecError("Volume group '%s' not found on node %s" %
10238 def _CheckDisksExistence(self, nodes):
10239 # Check disk existence
10240 for idx, dev in enumerate(self.instance.disks):
10241 if idx not in self.disks:
10245 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
10246 self.cfg.SetDiskID(dev, node)
10248 result = self.rpc.call_blockdev_find(node, dev)
10250 msg = result.fail_msg
10251 if msg or not result.payload:
10253 msg = "disk not found"
10254 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
10257 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
10258 for idx, dev in enumerate(self.instance.disks):
10259 if idx not in self.disks:
10262 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
10265 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
10267 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
10268 " replace disks for instance %s" %
10269 (node_name, self.instance.name))
10271 def _CreateNewStorage(self, node_name):
10272 """Create new storage on the primary or secondary node.
10274 This is only used for same-node replaces, not for changing the
10275 secondary node, hence we don't want to modify the existing disk.
10280 for idx, dev in enumerate(self.instance.disks):
10281 if idx not in self.disks:
10284 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
10286 self.cfg.SetDiskID(dev, node_name)
10288 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
10289 names = _GenerateUniqueNames(self.lu, lv_names)
10291 _, data_p, meta_p = _ComputeLDParams(constants.DT_DRBD8, self.diskparams)
10293 vg_data = dev.children[0].logical_id[0]
10294 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
10295 logical_id=(vg_data, names[0]), params=data_p)
10296 vg_meta = dev.children[1].logical_id[0]
10297 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
10298 logical_id=(vg_meta, names[1]), params=meta_p)
10300 new_lvs = [lv_data, lv_meta]
10301 old_lvs = [child.Copy() for child in dev.children]
10302 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
10304 # we pass force_create=True to force the LVM creation
10305 for new_lv in new_lvs:
10306 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
10307 _GetInstanceInfoText(self.instance), False)
10311 def _CheckDevices(self, node_name, iv_names):
10312 for name, (dev, _, _) in iv_names.iteritems():
10313 self.cfg.SetDiskID(dev, node_name)
10315 result = self.rpc.call_blockdev_find(node_name, dev)
10317 msg = result.fail_msg
10318 if msg or not result.payload:
10320 msg = "disk not found"
10321 raise errors.OpExecError("Can't find DRBD device %s: %s" %
10324 if result.payload.is_degraded:
10325 raise errors.OpExecError("DRBD device %s is degraded!" % name)
10327 def _RemoveOldStorage(self, node_name, iv_names):
10328 for name, (_, old_lvs, _) in iv_names.iteritems():
10329 self.lu.LogInfo("Remove logical volumes for %s" % name)
10332 self.cfg.SetDiskID(lv, node_name)
10334 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
10336 self.lu.LogWarning("Can't remove old LV: %s" % msg,
10337 hint="remove unused LVs manually")
10339 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
10340 """Replace a disk on the primary or secondary for DRBD 8.
10342 The algorithm for replace is quite complicated:
10344 1. for each disk to be replaced:
10346 1. create new LVs on the target node with unique names
10347 1. detach old LVs from the drbd device
10348 1. rename old LVs to name_replaced.<time_t>
10349 1. rename new LVs to old LVs
10350 1. attach the new LVs (with the old names now) to the drbd device
10352 1. wait for sync across all devices
10354 1. for each modified disk:
10356 1. remove old LVs (which have the name name_replaces.<time_t>)
10358 Failures are not very well handled.
10363 # Step: check device activation
10364 self.lu.LogStep(1, steps_total, "Check device existence")
10365 self._CheckDisksExistence([self.other_node, self.target_node])
10366 self._CheckVolumeGroup([self.target_node, self.other_node])
10368 # Step: check other node consistency
10369 self.lu.LogStep(2, steps_total, "Check peer consistency")
10370 self._CheckDisksConsistency(self.other_node,
10371 self.other_node == self.instance.primary_node,
10374 # Step: create new storage
10375 self.lu.LogStep(3, steps_total, "Allocate new storage")
10376 iv_names = self._CreateNewStorage(self.target_node)
10378 # Step: for each lv, detach+rename*2+attach
10379 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10380 for dev, old_lvs, new_lvs in iv_names.itervalues():
10381 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
10383 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
10385 result.Raise("Can't detach drbd from local storage on node"
10386 " %s for device %s" % (self.target_node, dev.iv_name))
10388 #cfg.Update(instance)
10390 # ok, we created the new LVs, so now we know we have the needed
10391 # storage; as such, we proceed on the target node to rename
10392 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
10393 # using the assumption that logical_id == physical_id (which in
10394 # turn is the unique_id on that node)
10396 # FIXME(iustin): use a better name for the replaced LVs
10397 temp_suffix = int(time.time())
10398 ren_fn = lambda d, suff: (d.physical_id[0],
10399 d.physical_id[1] + "_replaced-%s" % suff)
10401 # Build the rename list based on what LVs exist on the node
10402 rename_old_to_new = []
10403 for to_ren in old_lvs:
10404 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
10405 if not result.fail_msg and result.payload:
10407 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
10409 self.lu.LogInfo("Renaming the old LVs on the target node")
10410 result = self.rpc.call_blockdev_rename(self.target_node,
10412 result.Raise("Can't rename old LVs on node %s" % self.target_node)
10414 # Now we rename the new LVs to the old LVs
10415 self.lu.LogInfo("Renaming the new LVs on the target node")
10416 rename_new_to_old = [(new, old.physical_id)
10417 for old, new in zip(old_lvs, new_lvs)]
10418 result = self.rpc.call_blockdev_rename(self.target_node,
10420 result.Raise("Can't rename new LVs on node %s" % self.target_node)
10422 # Intermediate steps of in memory modifications
10423 for old, new in zip(old_lvs, new_lvs):
10424 new.logical_id = old.logical_id
10425 self.cfg.SetDiskID(new, self.target_node)
10427 # We need to modify old_lvs so that removal later removes the
10428 # right LVs, not the newly added ones; note that old_lvs is a
10430 for disk in old_lvs:
10431 disk.logical_id = ren_fn(disk, temp_suffix)
10432 self.cfg.SetDiskID(disk, self.target_node)
10434 # Now that the new lvs have the old name, we can add them to the device
10435 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
10436 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
10438 msg = result.fail_msg
10440 for new_lv in new_lvs:
10441 msg2 = self.rpc.call_blockdev_remove(self.target_node,
10444 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
10445 hint=("cleanup manually the unused logical"
10447 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
10449 cstep = itertools.count(5)
10451 if self.early_release:
10452 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
10453 self._RemoveOldStorage(self.target_node, iv_names)
10454 # TODO: Check if releasing locks early still makes sense
10455 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES)
10457 # Release all resource locks except those used by the instance
10458 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES,
10459 keep=self.node_secondary_ip.keys())
10461 # Release all node locks while waiting for sync
10462 _ReleaseLocks(self.lu, locking.LEVEL_NODE)
10464 # TODO: Can the instance lock be downgraded here? Take the optional disk
10465 # shutdown in the caller into consideration.
10468 # This can fail as the old devices are degraded and _WaitForSync
10469 # does a combined result over all disks, so we don't check its return value
10470 self.lu.LogStep(cstep.next(), steps_total, "Sync devices")
10471 _WaitForSync(self.lu, self.instance)
10473 # Check all devices manually
10474 self._CheckDevices(self.instance.primary_node, iv_names)
10476 # Step: remove old storage
10477 if not self.early_release:
10478 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
10479 self._RemoveOldStorage(self.target_node, iv_names)
10481 def _ExecDrbd8Secondary(self, feedback_fn):
10482 """Replace the secondary node for DRBD 8.
10484 The algorithm for replace is quite complicated:
10485 - for all disks of the instance:
10486 - create new LVs on the new node with same names
10487 - shutdown the drbd device on the old secondary
10488 - disconnect the drbd network on the primary
10489 - create the drbd device on the new secondary
10490 - network attach the drbd on the primary, using an artifice:
10491 the drbd code for Attach() will connect to the network if it
10492 finds a device which is connected to the good local disks but
10493 not network enabled
10494 - wait for sync across all devices
10495 - remove all disks from the old secondary
10497 Failures are not very well handled.
10502 pnode = self.instance.primary_node
10504 # Step: check device activation
10505 self.lu.LogStep(1, steps_total, "Check device existence")
10506 self._CheckDisksExistence([self.instance.primary_node])
10507 self._CheckVolumeGroup([self.instance.primary_node])
10509 # Step: check other node consistency
10510 self.lu.LogStep(2, steps_total, "Check peer consistency")
10511 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10513 # Step: create new storage
10514 self.lu.LogStep(3, steps_total, "Allocate new storage")
10515 for idx, dev in enumerate(self.instance.disks):
10516 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10517 (self.new_node, idx))
10518 # we pass force_create=True to force LVM creation
10519 for new_lv in dev.children:
10520 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10521 _GetInstanceInfoText(self.instance), False)
10523 # Step 4: dbrd minors and drbd setups changes
10524 # after this, we must manually remove the drbd minors on both the
10525 # error and the success paths
10526 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10527 minors = self.cfg.AllocateDRBDMinor([self.new_node
10528 for dev in self.instance.disks],
10529 self.instance.name)
10530 logging.debug("Allocated minors %r", minors)
10533 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10534 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10535 (self.new_node, idx))
10536 # create new devices on new_node; note that we create two IDs:
10537 # one without port, so the drbd will be activated without
10538 # networking information on the new node at this stage, and one
10539 # with network, for the latter activation in step 4
10540 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10541 if self.instance.primary_node == o_node1:
10544 assert self.instance.primary_node == o_node2, "Three-node instance?"
10547 new_alone_id = (self.instance.primary_node, self.new_node, None,
10548 p_minor, new_minor, o_secret)
10549 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10550 p_minor, new_minor, o_secret)
10552 iv_names[idx] = (dev, dev.children, new_net_id)
10553 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10555 drbd_params, _, _ = _ComputeLDParams(constants.DT_DRBD8, self.diskparams)
10556 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10557 logical_id=new_alone_id,
10558 children=dev.children,
10560 params=drbd_params)
10562 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10563 _GetInstanceInfoText(self.instance), False)
10564 except errors.GenericError:
10565 self.cfg.ReleaseDRBDMinors(self.instance.name)
10568 # We have new devices, shutdown the drbd on the old secondary
10569 for idx, dev in enumerate(self.instance.disks):
10570 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10571 self.cfg.SetDiskID(dev, self.target_node)
10572 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10574 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10575 "node: %s" % (idx, msg),
10576 hint=("Please cleanup this device manually as"
10577 " soon as possible"))
10579 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10580 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10581 self.instance.disks)[pnode]
10583 msg = result.fail_msg
10585 # detaches didn't succeed (unlikely)
10586 self.cfg.ReleaseDRBDMinors(self.instance.name)
10587 raise errors.OpExecError("Can't detach the disks from the network on"
10588 " old node: %s" % (msg,))
10590 # if we managed to detach at least one, we update all the disks of
10591 # the instance to point to the new secondary
10592 self.lu.LogInfo("Updating instance configuration")
10593 for dev, _, new_logical_id in iv_names.itervalues():
10594 dev.logical_id = new_logical_id
10595 self.cfg.SetDiskID(dev, self.instance.primary_node)
10597 self.cfg.Update(self.instance, feedback_fn)
10599 # Release all node locks (the configuration has been updated)
10600 _ReleaseLocks(self.lu, locking.LEVEL_NODE)
10602 # and now perform the drbd attach
10603 self.lu.LogInfo("Attaching primary drbds to new secondary"
10604 " (standalone => connected)")
10605 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10607 self.node_secondary_ip,
10608 self.instance.disks,
10609 self.instance.name,
10611 for to_node, to_result in result.items():
10612 msg = to_result.fail_msg
10614 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10616 hint=("please do a gnt-instance info to see the"
10617 " status of disks"))
10619 cstep = itertools.count(5)
10621 if self.early_release:
10622 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
10623 self._RemoveOldStorage(self.target_node, iv_names)
10624 # TODO: Check if releasing locks early still makes sense
10625 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES)
10627 # Release all resource locks except those used by the instance
10628 _ReleaseLocks(self.lu, locking.LEVEL_NODE_RES,
10629 keep=self.node_secondary_ip.keys())
10631 # TODO: Can the instance lock be downgraded here? Take the optional disk
10632 # shutdown in the caller into consideration.
10635 # This can fail as the old devices are degraded and _WaitForSync
10636 # does a combined result over all disks, so we don't check its return value
10637 self.lu.LogStep(cstep.next(), steps_total, "Sync devices")
10638 _WaitForSync(self.lu, self.instance)
10640 # Check all devices manually
10641 self._CheckDevices(self.instance.primary_node, iv_names)
10643 # Step: remove old storage
10644 if not self.early_release:
10645 self.lu.LogStep(cstep.next(), steps_total, "Removing old storage")
10646 self._RemoveOldStorage(self.target_node, iv_names)
10649 class LURepairNodeStorage(NoHooksLU):
10650 """Repairs the volume group on a node.
10655 def CheckArguments(self):
10656 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10658 storage_type = self.op.storage_type
10660 if (constants.SO_FIX_CONSISTENCY not in
10661 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10662 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10663 " repaired" % storage_type,
10664 errors.ECODE_INVAL)
10666 def ExpandNames(self):
10667 self.needed_locks = {
10668 locking.LEVEL_NODE: [self.op.node_name],
10671 def _CheckFaultyDisks(self, instance, node_name):
10672 """Ensure faulty disks abort the opcode or at least warn."""
10674 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10676 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10677 " node '%s'" % (instance.name, node_name),
10678 errors.ECODE_STATE)
10679 except errors.OpPrereqError, err:
10680 if self.op.ignore_consistency:
10681 self.proc.LogWarning(str(err.args[0]))
10685 def CheckPrereq(self):
10686 """Check prerequisites.
10689 # Check whether any instance on this node has faulty disks
10690 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10691 if inst.admin_state != constants.ADMINST_UP:
10693 check_nodes = set(inst.all_nodes)
10694 check_nodes.discard(self.op.node_name)
10695 for inst_node_name in check_nodes:
10696 self._CheckFaultyDisks(inst, inst_node_name)
10698 def Exec(self, feedback_fn):
10699 feedback_fn("Repairing storage unit '%s' on %s ..." %
10700 (self.op.name, self.op.node_name))
10702 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10703 result = self.rpc.call_storage_execute(self.op.node_name,
10704 self.op.storage_type, st_args,
10706 constants.SO_FIX_CONSISTENCY)
10707 result.Raise("Failed to repair storage unit '%s' on %s" %
10708 (self.op.name, self.op.node_name))
10711 class LUNodeEvacuate(NoHooksLU):
10712 """Evacuates instances off a list of nodes.
10717 _MODE2IALLOCATOR = {
10718 constants.NODE_EVAC_PRI: constants.IALLOCATOR_NEVAC_PRI,
10719 constants.NODE_EVAC_SEC: constants.IALLOCATOR_NEVAC_SEC,
10720 constants.NODE_EVAC_ALL: constants.IALLOCATOR_NEVAC_ALL,
10722 assert frozenset(_MODE2IALLOCATOR.keys()) == constants.NODE_EVAC_MODES
10723 assert (frozenset(_MODE2IALLOCATOR.values()) ==
10724 constants.IALLOCATOR_NEVAC_MODES)
10726 def CheckArguments(self):
10727 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10729 def ExpandNames(self):
10730 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10732 if self.op.remote_node is not None:
10733 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10734 assert self.op.remote_node
10736 if self.op.remote_node == self.op.node_name:
10737 raise errors.OpPrereqError("Can not use evacuated node as a new"
10738 " secondary node", errors.ECODE_INVAL)
10740 if self.op.mode != constants.NODE_EVAC_SEC:
10741 raise errors.OpPrereqError("Without the use of an iallocator only"
10742 " secondary instances can be evacuated",
10743 errors.ECODE_INVAL)
10746 self.share_locks = _ShareAll()
10747 self.needed_locks = {
10748 locking.LEVEL_INSTANCE: [],
10749 locking.LEVEL_NODEGROUP: [],
10750 locking.LEVEL_NODE: [],
10753 # Determine nodes (via group) optimistically, needs verification once locks
10754 # have been acquired
10755 self.lock_nodes = self._DetermineNodes()
10757 def _DetermineNodes(self):
10758 """Gets the list of nodes to operate on.
10761 if self.op.remote_node is None:
10762 # Iallocator will choose any node(s) in the same group
10763 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10765 group_nodes = frozenset([self.op.remote_node])
10767 # Determine nodes to be locked
10768 return set([self.op.node_name]) | group_nodes
10770 def _DetermineInstances(self):
10771 """Builds list of instances to operate on.
10774 assert self.op.mode in constants.NODE_EVAC_MODES
10776 if self.op.mode == constants.NODE_EVAC_PRI:
10777 # Primary instances only
10778 inst_fn = _GetNodePrimaryInstances
10779 assert self.op.remote_node is None, \
10780 "Evacuating primary instances requires iallocator"
10781 elif self.op.mode == constants.NODE_EVAC_SEC:
10782 # Secondary instances only
10783 inst_fn = _GetNodeSecondaryInstances
10786 assert self.op.mode == constants.NODE_EVAC_ALL
10787 inst_fn = _GetNodeInstances
10788 # TODO: In 2.6, change the iallocator interface to take an evacuation mode
10790 raise errors.OpPrereqError("Due to an issue with the iallocator"
10791 " interface it is not possible to evacuate"
10792 " all instances at once; specify explicitly"
10793 " whether to evacuate primary or secondary"
10795 errors.ECODE_INVAL)
10797 return inst_fn(self.cfg, self.op.node_name)
10799 def DeclareLocks(self, level):
10800 if level == locking.LEVEL_INSTANCE:
10801 # Lock instances optimistically, needs verification once node and group
10802 # locks have been acquired
10803 self.needed_locks[locking.LEVEL_INSTANCE] = \
10804 set(i.name for i in self._DetermineInstances())
10806 elif level == locking.LEVEL_NODEGROUP:
10807 # Lock node groups for all potential target nodes optimistically, needs
10808 # verification once nodes have been acquired
10809 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10810 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10812 elif level == locking.LEVEL_NODE:
10813 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10815 def CheckPrereq(self):
10817 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10818 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10819 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10821 need_nodes = self._DetermineNodes()
10823 if not owned_nodes.issuperset(need_nodes):
10824 raise errors.OpPrereqError("Nodes in same group as '%s' changed since"
10825 " locks were acquired, current nodes are"
10826 " are '%s', used to be '%s'; retry the"
10828 (self.op.node_name,
10829 utils.CommaJoin(need_nodes),
10830 utils.CommaJoin(owned_nodes)),
10831 errors.ECODE_STATE)
10833 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10834 if owned_groups != wanted_groups:
10835 raise errors.OpExecError("Node groups changed since locks were acquired,"
10836 " current groups are '%s', used to be '%s';"
10837 " retry the operation" %
10838 (utils.CommaJoin(wanted_groups),
10839 utils.CommaJoin(owned_groups)))
10841 # Determine affected instances
10842 self.instances = self._DetermineInstances()
10843 self.instance_names = [i.name for i in self.instances]
10845 if set(self.instance_names) != owned_instances:
10846 raise errors.OpExecError("Instances on node '%s' changed since locks"
10847 " were acquired, current instances are '%s',"
10848 " used to be '%s'; retry the operation" %
10849 (self.op.node_name,
10850 utils.CommaJoin(self.instance_names),
10851 utils.CommaJoin(owned_instances)))
10853 if self.instance_names:
10854 self.LogInfo("Evacuating instances from node '%s': %s",
10856 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10858 self.LogInfo("No instances to evacuate from node '%s'",
10861 if self.op.remote_node is not None:
10862 for i in self.instances:
10863 if i.primary_node == self.op.remote_node:
10864 raise errors.OpPrereqError("Node %s is the primary node of"
10865 " instance %s, cannot use it as"
10867 (self.op.remote_node, i.name),
10868 errors.ECODE_INVAL)
10870 def Exec(self, feedback_fn):
10871 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10873 if not self.instance_names:
10874 # No instances to evacuate
10877 elif self.op.iallocator is not None:
10878 # TODO: Implement relocation to other group
10879 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10880 evac_mode=self._MODE2IALLOCATOR[self.op.mode],
10881 instances=list(self.instance_names))
10883 ial.Run(self.op.iallocator)
10885 if not ial.success:
10886 raise errors.OpPrereqError("Can't compute node evacuation using"
10887 " iallocator '%s': %s" %
10888 (self.op.iallocator, ial.info),
10889 errors.ECODE_NORES)
10891 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10893 elif self.op.remote_node is not None:
10894 assert self.op.mode == constants.NODE_EVAC_SEC
10896 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10897 remote_node=self.op.remote_node,
10899 mode=constants.REPLACE_DISK_CHG,
10900 early_release=self.op.early_release)]
10901 for instance_name in self.instance_names
10905 raise errors.ProgrammerError("No iallocator or remote node")
10907 return ResultWithJobs(jobs)
10910 def _SetOpEarlyRelease(early_release, op):
10911 """Sets C{early_release} flag on opcodes if available.
10915 op.early_release = early_release
10916 except AttributeError:
10917 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10922 def _NodeEvacDest(use_nodes, group, nodes):
10923 """Returns group or nodes depending on caller's choice.
10927 return utils.CommaJoin(nodes)
10932 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10933 """Unpacks the result of change-group and node-evacuate iallocator requests.
10935 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10936 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10938 @type lu: L{LogicalUnit}
10939 @param lu: Logical unit instance
10940 @type alloc_result: tuple/list
10941 @param alloc_result: Result from iallocator
10942 @type early_release: bool
10943 @param early_release: Whether to release locks early if possible
10944 @type use_nodes: bool
10945 @param use_nodes: Whether to display node names instead of groups
10948 (moved, failed, jobs) = alloc_result
10951 failreason = utils.CommaJoin("%s (%s)" % (name, reason)
10952 for (name, reason) in failed)
10953 lu.LogWarning("Unable to evacuate instances %s", failreason)
10954 raise errors.OpExecError("Unable to evacuate instances %s" % failreason)
10957 lu.LogInfo("Instances to be moved: %s",
10958 utils.CommaJoin("%s (to %s)" %
10959 (name, _NodeEvacDest(use_nodes, group, nodes))
10960 for (name, group, nodes) in moved))
10962 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10963 map(opcodes.OpCode.LoadOpCode, ops))
10967 class LUInstanceGrowDisk(LogicalUnit):
10968 """Grow a disk of an instance.
10971 HPATH = "disk-grow"
10972 HTYPE = constants.HTYPE_INSTANCE
10975 def ExpandNames(self):
10976 self._ExpandAndLockInstance()
10977 self.needed_locks[locking.LEVEL_NODE] = []
10978 self.needed_locks[locking.LEVEL_NODE_RES] = []
10979 self.recalculate_locks[locking.LEVEL_NODE_RES] = constants.LOCKS_REPLACE
10981 def DeclareLocks(self, level):
10982 if level == locking.LEVEL_NODE:
10983 self._LockInstancesNodes()
10984 elif level == locking.LEVEL_NODE_RES:
10986 self.needed_locks[locking.LEVEL_NODE_RES] = \
10987 self.needed_locks[locking.LEVEL_NODE][:]
10989 def BuildHooksEnv(self):
10990 """Build hooks env.
10992 This runs on the master, the primary and all the secondaries.
10996 "DISK": self.op.disk,
10997 "AMOUNT": self.op.amount,
10999 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11002 def BuildHooksNodes(self):
11003 """Build hooks nodes.
11006 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
11009 def CheckPrereq(self):
11010 """Check prerequisites.
11012 This checks that the instance is in the cluster.
11015 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11016 assert instance is not None, \
11017 "Cannot retrieve locked instance %s" % self.op.instance_name
11018 nodenames = list(instance.all_nodes)
11019 for node in nodenames:
11020 _CheckNodeOnline(self, node)
11022 self.instance = instance
11024 if instance.disk_template not in constants.DTS_GROWABLE:
11025 raise errors.OpPrereqError("Instance's disk layout does not support"
11026 " growing", errors.ECODE_INVAL)
11028 self.disk = instance.FindDisk(self.op.disk)
11030 if instance.disk_template not in (constants.DT_FILE,
11031 constants.DT_SHARED_FILE):
11032 # TODO: check the free disk space for file, when that feature will be
11034 _CheckNodesFreeDiskPerVG(self, nodenames,
11035 self.disk.ComputeGrowth(self.op.amount))
11037 def Exec(self, feedback_fn):
11038 """Execute disk grow.
11041 instance = self.instance
11044 assert set([instance.name]) == self.owned_locks(locking.LEVEL_INSTANCE)
11045 assert (self.owned_locks(locking.LEVEL_NODE) ==
11046 self.owned_locks(locking.LEVEL_NODE_RES))
11048 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
11050 raise errors.OpExecError("Cannot activate block device to grow")
11052 feedback_fn("Growing disk %s of instance '%s' by %s" %
11053 (self.op.disk, instance.name,
11054 utils.FormatUnit(self.op.amount, "h")))
11056 # First run all grow ops in dry-run mode
11057 for node in instance.all_nodes:
11058 self.cfg.SetDiskID(disk, node)
11059 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
11060 result.Raise("Grow request failed to node %s" % node)
11062 # We know that (as far as we can test) operations across different
11063 # nodes will succeed, time to run it for real
11064 for node in instance.all_nodes:
11065 self.cfg.SetDiskID(disk, node)
11066 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
11067 result.Raise("Grow request failed to node %s" % node)
11069 # TODO: Rewrite code to work properly
11070 # DRBD goes into sync mode for a short amount of time after executing the
11071 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
11072 # calling "resize" in sync mode fails. Sleeping for a short amount of
11073 # time is a work-around.
11076 disk.RecordGrow(self.op.amount)
11077 self.cfg.Update(instance, feedback_fn)
11079 # Changes have been recorded, release node lock
11080 _ReleaseLocks(self, locking.LEVEL_NODE)
11082 # Downgrade lock while waiting for sync
11083 self.glm.downgrade(locking.LEVEL_INSTANCE)
11085 if self.op.wait_for_sync:
11086 disk_abort = not _WaitForSync(self, instance, disks=[disk])
11088 self.proc.LogWarning("Disk sync-ing has not returned a good"
11089 " status; please check the instance")
11090 if instance.admin_state != constants.ADMINST_UP:
11091 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
11092 elif instance.admin_state != constants.ADMINST_UP:
11093 self.proc.LogWarning("Not shutting down the disk even if the instance is"
11094 " not supposed to be running because no wait for"
11095 " sync mode was requested")
11097 assert self.owned_locks(locking.LEVEL_NODE_RES)
11098 assert set([instance.name]) == self.owned_locks(locking.LEVEL_INSTANCE)
11101 class LUInstanceQueryData(NoHooksLU):
11102 """Query runtime instance data.
11107 def ExpandNames(self):
11108 self.needed_locks = {}
11110 # Use locking if requested or when non-static information is wanted
11111 if not (self.op.static or self.op.use_locking):
11112 self.LogWarning("Non-static data requested, locks need to be acquired")
11113 self.op.use_locking = True
11115 if self.op.instances or not self.op.use_locking:
11116 # Expand instance names right here
11117 self.wanted_names = _GetWantedInstances(self, self.op.instances)
11119 # Will use acquired locks
11120 self.wanted_names = None
11122 if self.op.use_locking:
11123 self.share_locks = _ShareAll()
11125 if self.wanted_names is None:
11126 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
11128 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
11130 self.needed_locks[locking.LEVEL_NODE] = []
11131 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
11133 def DeclareLocks(self, level):
11134 if self.op.use_locking and level == locking.LEVEL_NODE:
11135 self._LockInstancesNodes()
11137 def CheckPrereq(self):
11138 """Check prerequisites.
11140 This only checks the optional instance list against the existing names.
11143 if self.wanted_names is None:
11144 assert self.op.use_locking, "Locking was not used"
11145 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
11147 self.wanted_instances = \
11148 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
11150 def _ComputeBlockdevStatus(self, node, instance_name, dev):
11151 """Returns the status of a block device
11154 if self.op.static or not node:
11157 self.cfg.SetDiskID(dev, node)
11159 result = self.rpc.call_blockdev_find(node, dev)
11163 result.Raise("Can't compute disk status for %s" % instance_name)
11165 status = result.payload
11169 return (status.dev_path, status.major, status.minor,
11170 status.sync_percent, status.estimated_time,
11171 status.is_degraded, status.ldisk_status)
11173 def _ComputeDiskStatus(self, instance, snode, dev):
11174 """Compute block device status.
11177 if dev.dev_type in constants.LDS_DRBD:
11178 # we change the snode then (otherwise we use the one passed in)
11179 if dev.logical_id[0] == instance.primary_node:
11180 snode = dev.logical_id[1]
11182 snode = dev.logical_id[0]
11184 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
11185 instance.name, dev)
11186 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
11189 dev_children = map(compat.partial(self._ComputeDiskStatus,
11196 "iv_name": dev.iv_name,
11197 "dev_type": dev.dev_type,
11198 "logical_id": dev.logical_id,
11199 "physical_id": dev.physical_id,
11200 "pstatus": dev_pstatus,
11201 "sstatus": dev_sstatus,
11202 "children": dev_children,
11207 def Exec(self, feedback_fn):
11208 """Gather and return data"""
11211 cluster = self.cfg.GetClusterInfo()
11213 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
11214 for i in self.wanted_instances)
11215 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
11216 if self.op.static or pnode.offline:
11217 remote_state = None
11219 self.LogWarning("Primary node %s is marked offline, returning static"
11220 " information only for instance %s" %
11221 (pnode.name, instance.name))
11223 remote_info = self.rpc.call_instance_info(instance.primary_node,
11225 instance.hypervisor)
11226 remote_info.Raise("Error checking node %s" % instance.primary_node)
11227 remote_info = remote_info.payload
11228 if remote_info and "state" in remote_info:
11229 remote_state = "up"
11231 if instance.admin_state == constants.ADMINST_UP:
11232 remote_state = "down"
11234 remote_state = instance.admin_state
11236 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
11239 result[instance.name] = {
11240 "name": instance.name,
11241 "config_state": instance.admin_state,
11242 "run_state": remote_state,
11243 "pnode": instance.primary_node,
11244 "snodes": instance.secondary_nodes,
11246 # this happens to be the same format used for hooks
11247 "nics": _NICListToTuple(self, instance.nics),
11248 "disk_template": instance.disk_template,
11250 "hypervisor": instance.hypervisor,
11251 "network_port": instance.network_port,
11252 "hv_instance": instance.hvparams,
11253 "hv_actual": cluster.FillHV(instance, skip_globals=True),
11254 "be_instance": instance.beparams,
11255 "be_actual": cluster.FillBE(instance),
11256 "os_instance": instance.osparams,
11257 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
11258 "serial_no": instance.serial_no,
11259 "mtime": instance.mtime,
11260 "ctime": instance.ctime,
11261 "uuid": instance.uuid,
11267 class LUInstanceSetParams(LogicalUnit):
11268 """Modifies an instances's parameters.
11271 HPATH = "instance-modify"
11272 HTYPE = constants.HTYPE_INSTANCE
11275 def CheckArguments(self):
11276 if not (self.op.nics or self.op.disks or self.op.disk_template or
11277 self.op.hvparams or self.op.beparams or self.op.os_name or
11278 self.op.online_inst or self.op.offline_inst):
11279 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
11281 if self.op.hvparams:
11282 _CheckGlobalHvParams(self.op.hvparams)
11286 for disk_op, disk_dict in self.op.disks:
11287 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
11288 if disk_op == constants.DDM_REMOVE:
11289 disk_addremove += 1
11291 elif disk_op == constants.DDM_ADD:
11292 disk_addremove += 1
11294 if not isinstance(disk_op, int):
11295 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
11296 if not isinstance(disk_dict, dict):
11297 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
11298 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
11300 if disk_op == constants.DDM_ADD:
11301 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
11302 if mode not in constants.DISK_ACCESS_SET:
11303 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
11304 errors.ECODE_INVAL)
11305 size = disk_dict.get(constants.IDISK_SIZE, None)
11307 raise errors.OpPrereqError("Required disk parameter size missing",
11308 errors.ECODE_INVAL)
11311 except (TypeError, ValueError), err:
11312 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
11313 str(err), errors.ECODE_INVAL)
11314 disk_dict[constants.IDISK_SIZE] = size
11316 # modification of disk
11317 if constants.IDISK_SIZE in disk_dict:
11318 raise errors.OpPrereqError("Disk size change not possible, use"
11319 " grow-disk", errors.ECODE_INVAL)
11321 if disk_addremove > 1:
11322 raise errors.OpPrereqError("Only one disk add or remove operation"
11323 " supported at a time", errors.ECODE_INVAL)
11325 if self.op.disks and self.op.disk_template is not None:
11326 raise errors.OpPrereqError("Disk template conversion and other disk"
11327 " changes not supported at the same time",
11328 errors.ECODE_INVAL)
11330 if (self.op.disk_template and
11331 self.op.disk_template in constants.DTS_INT_MIRROR and
11332 self.op.remote_node is None):
11333 raise errors.OpPrereqError("Changing the disk template to a mirrored"
11334 " one requires specifying a secondary node",
11335 errors.ECODE_INVAL)
11339 for nic_op, nic_dict in self.op.nics:
11340 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
11341 if nic_op == constants.DDM_REMOVE:
11344 elif nic_op == constants.DDM_ADD:
11347 if not isinstance(nic_op, int):
11348 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
11349 if not isinstance(nic_dict, dict):
11350 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
11351 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
11353 # nic_dict should be a dict
11354 nic_ip = nic_dict.get(constants.INIC_IP, None)
11355 if nic_ip is not None:
11356 if nic_ip.lower() == constants.VALUE_NONE:
11357 nic_dict[constants.INIC_IP] = None
11359 if not netutils.IPAddress.IsValid(nic_ip):
11360 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
11361 errors.ECODE_INVAL)
11363 nic_bridge = nic_dict.get("bridge", None)
11364 nic_link = nic_dict.get(constants.INIC_LINK, None)
11365 if nic_bridge and nic_link:
11366 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
11367 " at the same time", errors.ECODE_INVAL)
11368 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
11369 nic_dict["bridge"] = None
11370 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
11371 nic_dict[constants.INIC_LINK] = None
11373 if nic_op == constants.DDM_ADD:
11374 nic_mac = nic_dict.get(constants.INIC_MAC, None)
11375 if nic_mac is None:
11376 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
11378 if constants.INIC_MAC in nic_dict:
11379 nic_mac = nic_dict[constants.INIC_MAC]
11380 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11381 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
11383 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
11384 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
11385 " modifying an existing nic",
11386 errors.ECODE_INVAL)
11388 if nic_addremove > 1:
11389 raise errors.OpPrereqError("Only one NIC add or remove operation"
11390 " supported at a time", errors.ECODE_INVAL)
11392 def ExpandNames(self):
11393 self._ExpandAndLockInstance()
11394 # Can't even acquire node locks in shared mode as upcoming changes in
11395 # Ganeti 2.6 will start to modify the node object on disk conversion
11396 self.needed_locks[locking.LEVEL_NODE] = []
11397 self.needed_locks[locking.LEVEL_NODE_RES] = []
11398 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
11400 def DeclareLocks(self, level):
11401 if level == locking.LEVEL_NODE:
11402 self._LockInstancesNodes()
11403 if self.op.disk_template and self.op.remote_node:
11404 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
11405 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
11406 elif level == locking.LEVEL_NODE_RES and self.op.disk_template:
11408 self.needed_locks[locking.LEVEL_NODE_RES] = \
11409 self.needed_locks[locking.LEVEL_NODE][:]
11411 def BuildHooksEnv(self):
11412 """Build hooks env.
11414 This runs on the master, primary and secondaries.
11418 if constants.BE_MINMEM in self.be_new:
11419 args["minmem"] = self.be_new[constants.BE_MINMEM]
11420 if constants.BE_MAXMEM in self.be_new:
11421 args["maxmem"] = self.be_new[constants.BE_MAXMEM]
11422 if constants.BE_VCPUS in self.be_new:
11423 args["vcpus"] = self.be_new[constants.BE_VCPUS]
11424 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
11425 # information at all.
11428 nic_override = dict(self.op.nics)
11429 for idx, nic in enumerate(self.instance.nics):
11430 if idx in nic_override:
11431 this_nic_override = nic_override[idx]
11433 this_nic_override = {}
11434 if constants.INIC_IP in this_nic_override:
11435 ip = this_nic_override[constants.INIC_IP]
11438 if constants.INIC_MAC in this_nic_override:
11439 mac = this_nic_override[constants.INIC_MAC]
11442 if idx in self.nic_pnew:
11443 nicparams = self.nic_pnew[idx]
11445 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
11446 mode = nicparams[constants.NIC_MODE]
11447 link = nicparams[constants.NIC_LINK]
11448 args["nics"].append((ip, mac, mode, link))
11449 if constants.DDM_ADD in nic_override:
11450 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
11451 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
11452 nicparams = self.nic_pnew[constants.DDM_ADD]
11453 mode = nicparams[constants.NIC_MODE]
11454 link = nicparams[constants.NIC_LINK]
11455 args["nics"].append((ip, mac, mode, link))
11456 elif constants.DDM_REMOVE in nic_override:
11457 del args["nics"][-1]
11459 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
11460 if self.op.disk_template:
11461 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
11465 def BuildHooksNodes(self):
11466 """Build hooks nodes.
11469 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
11472 def CheckPrereq(self):
11473 """Check prerequisites.
11475 This only checks the instance list against the existing names.
11478 # checking the new params on the primary/secondary nodes
11480 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11481 cluster = self.cluster = self.cfg.GetClusterInfo()
11482 assert self.instance is not None, \
11483 "Cannot retrieve locked instance %s" % self.op.instance_name
11484 pnode = instance.primary_node
11485 nodelist = list(instance.all_nodes)
11486 pnode_info = self.cfg.GetNodeInfo(pnode)
11487 self.diskparams = self.cfg.GetNodeGroup(pnode_info.group).diskparams
11490 if self.op.os_name and not self.op.force:
11491 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
11492 self.op.force_variant)
11493 instance_os = self.op.os_name
11495 instance_os = instance.os
11497 if self.op.disk_template:
11498 if instance.disk_template == self.op.disk_template:
11499 raise errors.OpPrereqError("Instance already has disk template %s" %
11500 instance.disk_template, errors.ECODE_INVAL)
11502 if (instance.disk_template,
11503 self.op.disk_template) not in self._DISK_CONVERSIONS:
11504 raise errors.OpPrereqError("Unsupported disk template conversion from"
11505 " %s to %s" % (instance.disk_template,
11506 self.op.disk_template),
11507 errors.ECODE_INVAL)
11508 _CheckInstanceState(self, instance, INSTANCE_DOWN,
11509 msg="cannot change disk template")
11510 if self.op.disk_template in constants.DTS_INT_MIRROR:
11511 if self.op.remote_node == pnode:
11512 raise errors.OpPrereqError("Given new secondary node %s is the same"
11513 " as the primary node of the instance" %
11514 self.op.remote_node, errors.ECODE_STATE)
11515 _CheckNodeOnline(self, self.op.remote_node)
11516 _CheckNodeNotDrained(self, self.op.remote_node)
11517 # FIXME: here we assume that the old instance type is DT_PLAIN
11518 assert instance.disk_template == constants.DT_PLAIN
11519 disks = [{constants.IDISK_SIZE: d.size,
11520 constants.IDISK_VG: d.logical_id[0]}
11521 for d in instance.disks]
11522 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
11523 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
11525 snode_info = self.cfg.GetNodeInfo(self.op.remote_node)
11526 if pnode_info.group != snode_info.group:
11527 self.LogWarning("The primary and secondary nodes are in two"
11528 " different node groups; the disk parameters"
11529 " from the first disk's node group will be"
11532 # hvparams processing
11533 if self.op.hvparams:
11534 hv_type = instance.hypervisor
11535 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
11536 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
11537 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
11540 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
11541 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
11542 self.hv_proposed = self.hv_new = hv_new # the new actual values
11543 self.hv_inst = i_hvdict # the new dict (without defaults)
11545 self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
11547 self.hv_new = self.hv_inst = {}
11549 # beparams processing
11550 if self.op.beparams:
11551 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
11553 objects.UpgradeBeParams(i_bedict)
11554 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11555 be_new = cluster.SimpleFillBE(i_bedict)
11556 self.be_proposed = self.be_new = be_new # the new actual values
11557 self.be_inst = i_bedict # the new dict (without defaults)
11559 self.be_new = self.be_inst = {}
11560 self.be_proposed = cluster.SimpleFillBE(instance.beparams)
11561 be_old = cluster.FillBE(instance)
11563 # CPU param validation -- checking every time a paramtere is
11564 # changed to cover all cases where either CPU mask or vcpus have
11566 if (constants.BE_VCPUS in self.be_proposed and
11567 constants.HV_CPU_MASK in self.hv_proposed):
11569 utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
11570 # Verify mask is consistent with number of vCPUs. Can skip this
11571 # test if only 1 entry in the CPU mask, which means same mask
11572 # is applied to all vCPUs.
11573 if (len(cpu_list) > 1 and
11574 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11575 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11577 (self.be_proposed[constants.BE_VCPUS],
11578 self.hv_proposed[constants.HV_CPU_MASK]),
11579 errors.ECODE_INVAL)
11581 # Only perform this test if a new CPU mask is given
11582 if constants.HV_CPU_MASK in self.hv_new:
11583 # Calculate the largest CPU number requested
11584 max_requested_cpu = max(map(max, cpu_list))
11585 # Check that all of the instance's nodes have enough physical CPUs to
11586 # satisfy the requested CPU mask
11587 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11588 max_requested_cpu + 1, instance.hypervisor)
11590 # osparams processing
11591 if self.op.osparams:
11592 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11593 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11594 self.os_inst = i_osdict # the new dict (without defaults)
11600 #TODO(dynmem): do the appropriate check involving MINMEM
11601 if (constants.BE_MAXMEM in self.op.beparams and not self.op.force and
11602 be_new[constants.BE_MAXMEM] > be_old[constants.BE_MAXMEM]):
11603 mem_check_list = [pnode]
11604 if be_new[constants.BE_AUTO_BALANCE]:
11605 # either we changed auto_balance to yes or it was from before
11606 mem_check_list.extend(instance.secondary_nodes)
11607 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11608 instance.hypervisor)
11609 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11610 [instance.hypervisor])
11611 pninfo = nodeinfo[pnode]
11612 msg = pninfo.fail_msg
11614 # Assume the primary node is unreachable and go ahead
11615 self.warn.append("Can't get info from primary node %s: %s" %
11618 (_, _, (pnhvinfo, )) = pninfo.payload
11619 if not isinstance(pnhvinfo.get("memory_free", None), int):
11620 self.warn.append("Node data from primary node %s doesn't contain"
11621 " free memory information" % pnode)
11622 elif instance_info.fail_msg:
11623 self.warn.append("Can't get instance runtime information: %s" %
11624 instance_info.fail_msg)
11626 if instance_info.payload:
11627 current_mem = int(instance_info.payload["memory"])
11629 # Assume instance not running
11630 # (there is a slight race condition here, but it's not very
11631 # probable, and we have no other way to check)
11632 # TODO: Describe race condition
11634 #TODO(dynmem): do the appropriate check involving MINMEM
11635 miss_mem = (be_new[constants.BE_MAXMEM] - current_mem -
11636 pnhvinfo["memory_free"])
11638 raise errors.OpPrereqError("This change will prevent the instance"
11639 " from starting, due to %d MB of memory"
11640 " missing on its primary node" %
11642 errors.ECODE_NORES)
11644 if be_new[constants.BE_AUTO_BALANCE]:
11645 for node, nres in nodeinfo.items():
11646 if node not in instance.secondary_nodes:
11648 nres.Raise("Can't get info from secondary node %s" % node,
11649 prereq=True, ecode=errors.ECODE_STATE)
11650 (_, _, (nhvinfo, )) = nres.payload
11651 if not isinstance(nhvinfo.get("memory_free", None), int):
11652 raise errors.OpPrereqError("Secondary node %s didn't return free"
11653 " memory information" % node,
11654 errors.ECODE_STATE)
11655 #TODO(dynmem): do the appropriate check involving MINMEM
11656 elif be_new[constants.BE_MAXMEM] > nhvinfo["memory_free"]:
11657 raise errors.OpPrereqError("This change will prevent the instance"
11658 " from failover to its secondary node"
11659 " %s, due to not enough memory" % node,
11660 errors.ECODE_STATE)
11664 self.nic_pinst = {}
11665 for nic_op, nic_dict in self.op.nics:
11666 if nic_op == constants.DDM_REMOVE:
11667 if not instance.nics:
11668 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11669 errors.ECODE_INVAL)
11671 if nic_op != constants.DDM_ADD:
11673 if not instance.nics:
11674 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11675 " no NICs" % nic_op,
11676 errors.ECODE_INVAL)
11677 if nic_op < 0 or nic_op >= len(instance.nics):
11678 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11680 (nic_op, len(instance.nics) - 1),
11681 errors.ECODE_INVAL)
11682 old_nic_params = instance.nics[nic_op].nicparams
11683 old_nic_ip = instance.nics[nic_op].ip
11685 old_nic_params = {}
11688 update_params_dict = dict([(key, nic_dict[key])
11689 for key in constants.NICS_PARAMETERS
11690 if key in nic_dict])
11692 if "bridge" in nic_dict:
11693 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
11695 new_nic_params = _GetUpdatedParams(old_nic_params,
11696 update_params_dict)
11697 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11698 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11699 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11700 self.nic_pinst[nic_op] = new_nic_params
11701 self.nic_pnew[nic_op] = new_filled_nic_params
11702 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11704 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11705 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11706 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11708 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11710 self.warn.append(msg)
11712 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11713 if new_nic_mode == constants.NIC_MODE_ROUTED:
11714 if constants.INIC_IP in nic_dict:
11715 nic_ip = nic_dict[constants.INIC_IP]
11717 nic_ip = old_nic_ip
11719 raise errors.OpPrereqError("Cannot set the nic ip to None"
11720 " on a routed nic", errors.ECODE_INVAL)
11721 if constants.INIC_MAC in nic_dict:
11722 nic_mac = nic_dict[constants.INIC_MAC]
11723 if nic_mac is None:
11724 raise errors.OpPrereqError("Cannot set the nic mac to None",
11725 errors.ECODE_INVAL)
11726 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11727 # otherwise generate the mac
11728 nic_dict[constants.INIC_MAC] = \
11729 self.cfg.GenerateMAC(self.proc.GetECId())
11731 # or validate/reserve the current one
11733 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11734 except errors.ReservationError:
11735 raise errors.OpPrereqError("MAC address %s already in use"
11736 " in cluster" % nic_mac,
11737 errors.ECODE_NOTUNIQUE)
11740 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11741 raise errors.OpPrereqError("Disk operations not supported for"
11742 " diskless instances",
11743 errors.ECODE_INVAL)
11744 for disk_op, _ in self.op.disks:
11745 if disk_op == constants.DDM_REMOVE:
11746 if len(instance.disks) == 1:
11747 raise errors.OpPrereqError("Cannot remove the last disk of"
11748 " an instance", errors.ECODE_INVAL)
11749 _CheckInstanceState(self, instance, INSTANCE_DOWN,
11750 msg="cannot remove disks")
11752 if (disk_op == constants.DDM_ADD and
11753 len(instance.disks) >= constants.MAX_DISKS):
11754 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11755 " add more" % constants.MAX_DISKS,
11756 errors.ECODE_STATE)
11757 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11759 if disk_op < 0 or disk_op >= len(instance.disks):
11760 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11762 (disk_op, len(instance.disks)),
11763 errors.ECODE_INVAL)
11765 # disabling the instance
11766 if self.op.offline_inst:
11767 _CheckInstanceState(self, instance, INSTANCE_DOWN,
11768 msg="cannot change instance state to offline")
11770 # enabling the instance
11771 if self.op.online_inst:
11772 _CheckInstanceState(self, instance, INSTANCE_OFFLINE,
11773 msg="cannot make instance go online")
11775 def _ConvertPlainToDrbd(self, feedback_fn):
11776 """Converts an instance from plain to drbd.
11779 feedback_fn("Converting template to drbd")
11780 instance = self.instance
11781 pnode = instance.primary_node
11782 snode = self.op.remote_node
11784 assert instance.disk_template == constants.DT_PLAIN
11786 # create a fake disk info for _GenerateDiskTemplate
11787 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11788 constants.IDISK_VG: d.logical_id[0]}
11789 for d in instance.disks]
11790 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11791 instance.name, pnode, [snode],
11792 disk_info, None, None, 0, feedback_fn,
11794 info = _GetInstanceInfoText(instance)
11795 feedback_fn("Creating aditional volumes...")
11796 # first, create the missing data and meta devices
11797 for disk in new_disks:
11798 # unfortunately this is... not too nice
11799 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11801 for child in disk.children:
11802 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11803 # at this stage, all new LVs have been created, we can rename the
11805 feedback_fn("Renaming original volumes...")
11806 rename_list = [(o, n.children[0].logical_id)
11807 for (o, n) in zip(instance.disks, new_disks)]
11808 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11809 result.Raise("Failed to rename original LVs")
11811 feedback_fn("Initializing DRBD devices...")
11812 # all child devices are in place, we can now create the DRBD devices
11813 for disk in new_disks:
11814 for node in [pnode, snode]:
11815 f_create = node == pnode
11816 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11818 # at this point, the instance has been modified
11819 instance.disk_template = constants.DT_DRBD8
11820 instance.disks = new_disks
11821 self.cfg.Update(instance, feedback_fn)
11823 # Release node locks while waiting for sync
11824 _ReleaseLocks(self, locking.LEVEL_NODE)
11826 # disks are created, waiting for sync
11827 disk_abort = not _WaitForSync(self, instance,
11828 oneshot=not self.op.wait_for_sync)
11830 raise errors.OpExecError("There are some degraded disks for"
11831 " this instance, please cleanup manually")
11833 # Node resource locks will be released by caller
11835 def _ConvertDrbdToPlain(self, feedback_fn):
11836 """Converts an instance from drbd to plain.
11839 instance = self.instance
11841 assert len(instance.secondary_nodes) == 1
11842 assert instance.disk_template == constants.DT_DRBD8
11844 pnode = instance.primary_node
11845 snode = instance.secondary_nodes[0]
11846 feedback_fn("Converting template to plain")
11848 old_disks = instance.disks
11849 new_disks = [d.children[0] for d in old_disks]
11851 # copy over size and mode
11852 for parent, child in zip(old_disks, new_disks):
11853 child.size = parent.size
11854 child.mode = parent.mode
11856 # update instance structure
11857 instance.disks = new_disks
11858 instance.disk_template = constants.DT_PLAIN
11859 self.cfg.Update(instance, feedback_fn)
11861 # Release locks in case removing disks takes a while
11862 _ReleaseLocks(self, locking.LEVEL_NODE)
11864 feedback_fn("Removing volumes on the secondary node...")
11865 for disk in old_disks:
11866 self.cfg.SetDiskID(disk, snode)
11867 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11869 self.LogWarning("Could not remove block device %s on node %s,"
11870 " continuing anyway: %s", disk.iv_name, snode, msg)
11872 feedback_fn("Removing unneeded volumes on the primary node...")
11873 for idx, disk in enumerate(old_disks):
11874 meta = disk.children[1]
11875 self.cfg.SetDiskID(meta, pnode)
11876 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11878 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11879 " continuing anyway: %s", idx, pnode, msg)
11881 # this is a DRBD disk, return its port to the pool
11882 for disk in old_disks:
11883 tcp_port = disk.logical_id[2]
11884 self.cfg.AddTcpUdpPort(tcp_port)
11886 # Node resource locks will be released by caller
11888 def Exec(self, feedback_fn):
11889 """Modifies an instance.
11891 All parameters take effect only at the next restart of the instance.
11894 # Process here the warnings from CheckPrereq, as we don't have a
11895 # feedback_fn there.
11896 for warn in self.warn:
11897 feedback_fn("WARNING: %s" % warn)
11899 assert ((self.op.disk_template is None) ^
11900 bool(self.owned_locks(locking.LEVEL_NODE_RES))), \
11901 "Not owning any node resource locks"
11904 instance = self.instance
11906 for disk_op, disk_dict in self.op.disks:
11907 if disk_op == constants.DDM_REMOVE:
11908 # remove the last disk
11909 device = instance.disks.pop()
11910 device_idx = len(instance.disks)
11911 for node, disk in device.ComputeNodeTree(instance.primary_node):
11912 self.cfg.SetDiskID(disk, node)
11913 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11915 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11916 " continuing anyway", device_idx, node, msg)
11917 result.append(("disk/%d" % device_idx, "remove"))
11919 # if this is a DRBD disk, return its port to the pool
11920 if device.dev_type in constants.LDS_DRBD:
11921 tcp_port = device.logical_id[2]
11922 self.cfg.AddTcpUdpPort(tcp_port)
11923 elif disk_op == constants.DDM_ADD:
11925 if instance.disk_template in (constants.DT_FILE,
11926 constants.DT_SHARED_FILE):
11927 file_driver, file_path = instance.disks[0].logical_id
11928 file_path = os.path.dirname(file_path)
11930 file_driver = file_path = None
11931 disk_idx_base = len(instance.disks)
11932 new_disk = _GenerateDiskTemplate(self,
11933 instance.disk_template,
11934 instance.name, instance.primary_node,
11935 instance.secondary_nodes,
11941 self.diskparams)[0]
11942 instance.disks.append(new_disk)
11943 info = _GetInstanceInfoText(instance)
11945 logging.info("Creating volume %s for instance %s",
11946 new_disk.iv_name, instance.name)
11947 # Note: this needs to be kept in sync with _CreateDisks
11949 for node in instance.all_nodes:
11950 f_create = node == instance.primary_node
11952 _CreateBlockDev(self, node, instance, new_disk,
11953 f_create, info, f_create)
11954 except errors.OpExecError, err:
11955 self.LogWarning("Failed to create volume %s (%s) on"
11957 new_disk.iv_name, new_disk, node, err)
11958 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11959 (new_disk.size, new_disk.mode)))
11961 # change a given disk
11962 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11963 result.append(("disk.mode/%d" % disk_op,
11964 disk_dict[constants.IDISK_MODE]))
11966 if self.op.disk_template:
11968 check_nodes = set(instance.all_nodes)
11969 if self.op.remote_node:
11970 check_nodes.add(self.op.remote_node)
11971 for level in [locking.LEVEL_NODE, locking.LEVEL_NODE_RES]:
11972 owned = self.owned_locks(level)
11973 assert not (check_nodes - owned), \
11974 ("Not owning the correct locks, owning %r, expected at least %r" %
11975 (owned, check_nodes))
11977 r_shut = _ShutdownInstanceDisks(self, instance)
11979 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11980 " proceed with disk template conversion")
11981 mode = (instance.disk_template, self.op.disk_template)
11983 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11985 self.cfg.ReleaseDRBDMinors(instance.name)
11987 result.append(("disk_template", self.op.disk_template))
11989 assert instance.disk_template == self.op.disk_template, \
11990 ("Expected disk template '%s', found '%s'" %
11991 (self.op.disk_template, instance.disk_template))
11993 # Release node and resource locks if there are any (they might already have
11994 # been released during disk conversion)
11995 _ReleaseLocks(self, locking.LEVEL_NODE)
11996 _ReleaseLocks(self, locking.LEVEL_NODE_RES)
11999 for nic_op, nic_dict in self.op.nics:
12000 if nic_op == constants.DDM_REMOVE:
12001 # remove the last nic
12002 del instance.nics[-1]
12003 result.append(("nic.%d" % len(instance.nics), "remove"))
12004 elif nic_op == constants.DDM_ADD:
12005 # mac and bridge should be set, by now
12006 mac = nic_dict[constants.INIC_MAC]
12007 ip = nic_dict.get(constants.INIC_IP, None)
12008 nicparams = self.nic_pinst[constants.DDM_ADD]
12009 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
12010 instance.nics.append(new_nic)
12011 result.append(("nic.%d" % (len(instance.nics) - 1),
12012 "add:mac=%s,ip=%s,mode=%s,link=%s" %
12013 (new_nic.mac, new_nic.ip,
12014 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
12015 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
12018 for key in (constants.INIC_MAC, constants.INIC_IP):
12019 if key in nic_dict:
12020 setattr(instance.nics[nic_op], key, nic_dict[key])
12021 if nic_op in self.nic_pinst:
12022 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
12023 for key, val in nic_dict.iteritems():
12024 result.append(("nic.%s/%d" % (key, nic_op), val))
12027 if self.op.hvparams:
12028 instance.hvparams = self.hv_inst
12029 for key, val in self.op.hvparams.iteritems():
12030 result.append(("hv/%s" % key, val))
12033 if self.op.beparams:
12034 instance.beparams = self.be_inst
12035 for key, val in self.op.beparams.iteritems():
12036 result.append(("be/%s" % key, val))
12039 if self.op.os_name:
12040 instance.os = self.op.os_name
12043 if self.op.osparams:
12044 instance.osparams = self.os_inst
12045 for key, val in self.op.osparams.iteritems():
12046 result.append(("os/%s" % key, val))
12048 # online/offline instance
12049 if self.op.online_inst:
12050 self.cfg.MarkInstanceDown(instance.name)
12051 result.append(("admin_state", constants.ADMINST_DOWN))
12052 if self.op.offline_inst:
12053 self.cfg.MarkInstanceOffline(instance.name)
12054 result.append(("admin_state", constants.ADMINST_OFFLINE))
12056 self.cfg.Update(instance, feedback_fn)
12058 assert not (self.owned_locks(locking.LEVEL_NODE_RES) or
12059 self.owned_locks(locking.LEVEL_NODE)), \
12060 "All node locks should have been released by now"
12064 _DISK_CONVERSIONS = {
12065 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
12066 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
12070 class LUInstanceChangeGroup(LogicalUnit):
12071 HPATH = "instance-change-group"
12072 HTYPE = constants.HTYPE_INSTANCE
12075 def ExpandNames(self):
12076 self.share_locks = _ShareAll()
12077 self.needed_locks = {
12078 locking.LEVEL_NODEGROUP: [],
12079 locking.LEVEL_NODE: [],
12082 self._ExpandAndLockInstance()
12084 if self.op.target_groups:
12085 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12086 self.op.target_groups)
12088 self.req_target_uuids = None
12090 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12092 def DeclareLocks(self, level):
12093 if level == locking.LEVEL_NODEGROUP:
12094 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12096 if self.req_target_uuids:
12097 lock_groups = set(self.req_target_uuids)
12099 # Lock all groups used by instance optimistically; this requires going
12100 # via the node before it's locked, requiring verification later on
12101 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
12102 lock_groups.update(instance_groups)
12104 # No target groups, need to lock all of them
12105 lock_groups = locking.ALL_SET
12107 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12109 elif level == locking.LEVEL_NODE:
12110 if self.req_target_uuids:
12111 # Lock all nodes used by instances
12112 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12113 self._LockInstancesNodes()
12115 # Lock all nodes in all potential target groups
12116 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
12117 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
12118 member_nodes = [node_name
12119 for group in lock_groups
12120 for node_name in self.cfg.GetNodeGroup(group).members]
12121 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12123 # Lock all nodes as all groups are potential targets
12124 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12126 def CheckPrereq(self):
12127 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12128 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12129 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12131 assert (self.req_target_uuids is None or
12132 owned_groups.issuperset(self.req_target_uuids))
12133 assert owned_instances == set([self.op.instance_name])
12135 # Get instance information
12136 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
12138 # Check if node groups for locked instance are still correct
12139 assert owned_nodes.issuperset(self.instance.all_nodes), \
12140 ("Instance %s's nodes changed while we kept the lock" %
12141 self.op.instance_name)
12143 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
12146 if self.req_target_uuids:
12147 # User requested specific target groups
12148 self.target_uuids = self.req_target_uuids
12150 # All groups except those used by the instance are potential targets
12151 self.target_uuids = owned_groups - inst_groups
12153 conflicting_groups = self.target_uuids & inst_groups
12154 if conflicting_groups:
12155 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
12156 " used by the instance '%s'" %
12157 (utils.CommaJoin(conflicting_groups),
12158 self.op.instance_name),
12159 errors.ECODE_INVAL)
12161 if not self.target_uuids:
12162 raise errors.OpPrereqError("There are no possible target groups",
12163 errors.ECODE_INVAL)
12165 def BuildHooksEnv(self):
12166 """Build hooks env.
12169 assert self.target_uuids
12172 "TARGET_GROUPS": " ".join(self.target_uuids),
12175 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
12179 def BuildHooksNodes(self):
12180 """Build hooks nodes.
12183 mn = self.cfg.GetMasterNode()
12184 return ([mn], [mn])
12186 def Exec(self, feedback_fn):
12187 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12189 assert instances == [self.op.instance_name], "Instance not locked"
12191 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12192 instances=instances, target_groups=list(self.target_uuids))
12194 ial.Run(self.op.iallocator)
12196 if not ial.success:
12197 raise errors.OpPrereqError("Can't compute solution for changing group of"
12198 " instance '%s' using iallocator '%s': %s" %
12199 (self.op.instance_name, self.op.iallocator,
12201 errors.ECODE_NORES)
12203 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12205 self.LogInfo("Iallocator returned %s job(s) for changing group of"
12206 " instance '%s'", len(jobs), self.op.instance_name)
12208 return ResultWithJobs(jobs)
12211 class LUBackupQuery(NoHooksLU):
12212 """Query the exports list
12217 def ExpandNames(self):
12218 self.needed_locks = {}
12219 self.share_locks[locking.LEVEL_NODE] = 1
12220 if not self.op.nodes:
12221 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12223 self.needed_locks[locking.LEVEL_NODE] = \
12224 _GetWantedNodes(self, self.op.nodes)
12226 def Exec(self, feedback_fn):
12227 """Compute the list of all the exported system images.
12230 @return: a dictionary with the structure node->(export-list)
12231 where export-list is a list of the instances exported on
12235 self.nodes = self.owned_locks(locking.LEVEL_NODE)
12236 rpcresult = self.rpc.call_export_list(self.nodes)
12238 for node in rpcresult:
12239 if rpcresult[node].fail_msg:
12240 result[node] = False
12242 result[node] = rpcresult[node].payload
12247 class LUBackupPrepare(NoHooksLU):
12248 """Prepares an instance for an export and returns useful information.
12253 def ExpandNames(self):
12254 self._ExpandAndLockInstance()
12256 def CheckPrereq(self):
12257 """Check prerequisites.
12260 instance_name = self.op.instance_name
12262 self.instance = self.cfg.GetInstanceInfo(instance_name)
12263 assert self.instance is not None, \
12264 "Cannot retrieve locked instance %s" % self.op.instance_name
12265 _CheckNodeOnline(self, self.instance.primary_node)
12267 self._cds = _GetClusterDomainSecret()
12269 def Exec(self, feedback_fn):
12270 """Prepares an instance for an export.
12273 instance = self.instance
12275 if self.op.mode == constants.EXPORT_MODE_REMOTE:
12276 salt = utils.GenerateSecret(8)
12278 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
12279 result = self.rpc.call_x509_cert_create(instance.primary_node,
12280 constants.RIE_CERT_VALIDITY)
12281 result.Raise("Can't create X509 key and certificate on %s" % result.node)
12283 (name, cert_pem) = result.payload
12285 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
12289 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
12290 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
12292 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
12298 class LUBackupExport(LogicalUnit):
12299 """Export an instance to an image in the cluster.
12302 HPATH = "instance-export"
12303 HTYPE = constants.HTYPE_INSTANCE
12306 def CheckArguments(self):
12307 """Check the arguments.
12310 self.x509_key_name = self.op.x509_key_name
12311 self.dest_x509_ca_pem = self.op.destination_x509_ca
12313 if self.op.mode == constants.EXPORT_MODE_REMOTE:
12314 if not self.x509_key_name:
12315 raise errors.OpPrereqError("Missing X509 key name for encryption",
12316 errors.ECODE_INVAL)
12318 if not self.dest_x509_ca_pem:
12319 raise errors.OpPrereqError("Missing destination X509 CA",
12320 errors.ECODE_INVAL)
12322 def ExpandNames(self):
12323 self._ExpandAndLockInstance()
12325 # Lock all nodes for local exports
12326 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12327 # FIXME: lock only instance primary and destination node
12329 # Sad but true, for now we have do lock all nodes, as we don't know where
12330 # the previous export might be, and in this LU we search for it and
12331 # remove it from its current node. In the future we could fix this by:
12332 # - making a tasklet to search (share-lock all), then create the
12333 # new one, then one to remove, after
12334 # - removing the removal operation altogether
12335 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12337 def DeclareLocks(self, level):
12338 """Last minute lock declaration."""
12339 # All nodes are locked anyway, so nothing to do here.
12341 def BuildHooksEnv(self):
12342 """Build hooks env.
12344 This will run on the master, primary node and target node.
12348 "EXPORT_MODE": self.op.mode,
12349 "EXPORT_NODE": self.op.target_node,
12350 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
12351 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
12352 # TODO: Generic function for boolean env variables
12353 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
12356 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
12360 def BuildHooksNodes(self):
12361 """Build hooks nodes.
12364 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
12366 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12367 nl.append(self.op.target_node)
12371 def CheckPrereq(self):
12372 """Check prerequisites.
12374 This checks that the instance and node names are valid.
12377 instance_name = self.op.instance_name
12379 self.instance = self.cfg.GetInstanceInfo(instance_name)
12380 assert self.instance is not None, \
12381 "Cannot retrieve locked instance %s" % self.op.instance_name
12382 _CheckNodeOnline(self, self.instance.primary_node)
12384 if (self.op.remove_instance and
12385 self.instance.admin_state == constants.ADMINST_UP and
12386 not self.op.shutdown):
12387 raise errors.OpPrereqError("Can not remove instance without shutting it"
12390 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12391 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
12392 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
12393 assert self.dst_node is not None
12395 _CheckNodeOnline(self, self.dst_node.name)
12396 _CheckNodeNotDrained(self, self.dst_node.name)
12399 self.dest_disk_info = None
12400 self.dest_x509_ca = None
12402 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
12403 self.dst_node = None
12405 if len(self.op.target_node) != len(self.instance.disks):
12406 raise errors.OpPrereqError(("Received destination information for %s"
12407 " disks, but instance %s has %s disks") %
12408 (len(self.op.target_node), instance_name,
12409 len(self.instance.disks)),
12410 errors.ECODE_INVAL)
12412 cds = _GetClusterDomainSecret()
12414 # Check X509 key name
12416 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
12417 except (TypeError, ValueError), err:
12418 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
12420 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
12421 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
12422 errors.ECODE_INVAL)
12424 # Load and verify CA
12426 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
12427 except OpenSSL.crypto.Error, err:
12428 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
12429 (err, ), errors.ECODE_INVAL)
12431 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
12432 if errcode is not None:
12433 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
12434 (msg, ), errors.ECODE_INVAL)
12436 self.dest_x509_ca = cert
12438 # Verify target information
12440 for idx, disk_data in enumerate(self.op.target_node):
12442 (host, port, magic) = \
12443 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
12444 except errors.GenericError, err:
12445 raise errors.OpPrereqError("Target info for disk %s: %s" %
12446 (idx, err), errors.ECODE_INVAL)
12448 disk_info.append((host, port, magic))
12450 assert len(disk_info) == len(self.op.target_node)
12451 self.dest_disk_info = disk_info
12454 raise errors.ProgrammerError("Unhandled export mode %r" %
12457 # instance disk type verification
12458 # TODO: Implement export support for file-based disks
12459 for disk in self.instance.disks:
12460 if disk.dev_type == constants.LD_FILE:
12461 raise errors.OpPrereqError("Export not supported for instances with"
12462 " file-based disks", errors.ECODE_INVAL)
12464 def _CleanupExports(self, feedback_fn):
12465 """Removes exports of current instance from all other nodes.
12467 If an instance in a cluster with nodes A..D was exported to node C, its
12468 exports will be removed from the nodes A, B and D.
12471 assert self.op.mode != constants.EXPORT_MODE_REMOTE
12473 nodelist = self.cfg.GetNodeList()
12474 nodelist.remove(self.dst_node.name)
12476 # on one-node clusters nodelist will be empty after the removal
12477 # if we proceed the backup would be removed because OpBackupQuery
12478 # substitutes an empty list with the full cluster node list.
12479 iname = self.instance.name
12481 feedback_fn("Removing old exports for instance %s" % iname)
12482 exportlist = self.rpc.call_export_list(nodelist)
12483 for node in exportlist:
12484 if exportlist[node].fail_msg:
12486 if iname in exportlist[node].payload:
12487 msg = self.rpc.call_export_remove(node, iname).fail_msg
12489 self.LogWarning("Could not remove older export for instance %s"
12490 " on node %s: %s", iname, node, msg)
12492 def Exec(self, feedback_fn):
12493 """Export an instance to an image in the cluster.
12496 assert self.op.mode in constants.EXPORT_MODES
12498 instance = self.instance
12499 src_node = instance.primary_node
12501 if self.op.shutdown:
12502 # shutdown the instance, but not the disks
12503 feedback_fn("Shutting down instance %s" % instance.name)
12504 result = self.rpc.call_instance_shutdown(src_node, instance,
12505 self.op.shutdown_timeout)
12506 # TODO: Maybe ignore failures if ignore_remove_failures is set
12507 result.Raise("Could not shutdown instance %s on"
12508 " node %s" % (instance.name, src_node))
12510 # set the disks ID correctly since call_instance_start needs the
12511 # correct drbd minor to create the symlinks
12512 for disk in instance.disks:
12513 self.cfg.SetDiskID(disk, src_node)
12515 activate_disks = (instance.admin_state != constants.ADMINST_UP)
12518 # Activate the instance disks if we'exporting a stopped instance
12519 feedback_fn("Activating disks for %s" % instance.name)
12520 _StartInstanceDisks(self, instance, None)
12523 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
12526 helper.CreateSnapshots()
12528 if (self.op.shutdown and
12529 instance.admin_state == constants.ADMINST_UP and
12530 not self.op.remove_instance):
12531 assert not activate_disks
12532 feedback_fn("Starting instance %s" % instance.name)
12533 result = self.rpc.call_instance_start(src_node,
12534 (instance, None, None), False)
12535 msg = result.fail_msg
12537 feedback_fn("Failed to start instance: %s" % msg)
12538 _ShutdownInstanceDisks(self, instance)
12539 raise errors.OpExecError("Could not start instance: %s" % msg)
12541 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12542 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
12543 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
12544 connect_timeout = constants.RIE_CONNECT_TIMEOUT
12545 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
12547 (key_name, _, _) = self.x509_key_name
12550 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
12553 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
12554 key_name, dest_ca_pem,
12559 # Check for backwards compatibility
12560 assert len(dresults) == len(instance.disks)
12561 assert compat.all(isinstance(i, bool) for i in dresults), \
12562 "Not all results are boolean: %r" % dresults
12566 feedback_fn("Deactivating disks for %s" % instance.name)
12567 _ShutdownInstanceDisks(self, instance)
12569 if not (compat.all(dresults) and fin_resu):
12572 failures.append("export finalization")
12573 if not compat.all(dresults):
12574 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
12576 failures.append("disk export: disk(s) %s" % fdsk)
12578 raise errors.OpExecError("Export failed, errors in %s" %
12579 utils.CommaJoin(failures))
12581 # At this point, the export was successful, we can cleanup/finish
12583 # Remove instance if requested
12584 if self.op.remove_instance:
12585 feedback_fn("Removing instance %s" % instance.name)
12586 _RemoveInstance(self, feedback_fn, instance,
12587 self.op.ignore_remove_failures)
12589 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12590 self._CleanupExports(feedback_fn)
12592 return fin_resu, dresults
12595 class LUBackupRemove(NoHooksLU):
12596 """Remove exports related to the named instance.
12601 def ExpandNames(self):
12602 self.needed_locks = {}
12603 # We need all nodes to be locked in order for RemoveExport to work, but we
12604 # don't need to lock the instance itself, as nothing will happen to it (and
12605 # we can remove exports also for a removed instance)
12606 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12608 def Exec(self, feedback_fn):
12609 """Remove any export.
12612 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
12613 # If the instance was not found we'll try with the name that was passed in.
12614 # This will only work if it was an FQDN, though.
12616 if not instance_name:
12618 instance_name = self.op.instance_name
12620 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
12621 exportlist = self.rpc.call_export_list(locked_nodes)
12623 for node in exportlist:
12624 msg = exportlist[node].fail_msg
12626 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
12628 if instance_name in exportlist[node].payload:
12630 result = self.rpc.call_export_remove(node, instance_name)
12631 msg = result.fail_msg
12633 logging.error("Could not remove export for instance %s"
12634 " on node %s: %s", instance_name, node, msg)
12636 if fqdn_warn and not found:
12637 feedback_fn("Export not found. If trying to remove an export belonging"
12638 " to a deleted instance please use its Fully Qualified"
12642 class LUGroupAdd(LogicalUnit):
12643 """Logical unit for creating node groups.
12646 HPATH = "group-add"
12647 HTYPE = constants.HTYPE_GROUP
12650 def ExpandNames(self):
12651 # We need the new group's UUID here so that we can create and acquire the
12652 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12653 # that it should not check whether the UUID exists in the configuration.
12654 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12655 self.needed_locks = {}
12656 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12658 def CheckPrereq(self):
12659 """Check prerequisites.
12661 This checks that the given group name is not an existing node group
12666 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12667 except errors.OpPrereqError:
12670 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12671 " node group (UUID: %s)" %
12672 (self.op.group_name, existing_uuid),
12673 errors.ECODE_EXISTS)
12675 if self.op.ndparams:
12676 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12678 if self.op.diskparams:
12679 for templ in constants.DISK_TEMPLATES:
12680 if templ not in self.op.diskparams:
12681 self.op.diskparams[templ] = {}
12682 utils.ForceDictType(self.op.diskparams[templ], constants.DISK_DT_TYPES)
12684 self.op.diskparams = self.cfg.GetClusterInfo().diskparams
12686 def BuildHooksEnv(self):
12687 """Build hooks env.
12691 "GROUP_NAME": self.op.group_name,
12694 def BuildHooksNodes(self):
12695 """Build hooks nodes.
12698 mn = self.cfg.GetMasterNode()
12699 return ([mn], [mn])
12701 def Exec(self, feedback_fn):
12702 """Add the node group to the cluster.
12705 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12706 uuid=self.group_uuid,
12707 alloc_policy=self.op.alloc_policy,
12708 ndparams=self.op.ndparams,
12709 diskparams=self.op.diskparams)
12711 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12712 del self.remove_locks[locking.LEVEL_NODEGROUP]
12715 class LUGroupAssignNodes(NoHooksLU):
12716 """Logical unit for assigning nodes to groups.
12721 def ExpandNames(self):
12722 # These raise errors.OpPrereqError on their own:
12723 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12724 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12726 # We want to lock all the affected nodes and groups. We have readily
12727 # available the list of nodes, and the *destination* group. To gather the
12728 # list of "source" groups, we need to fetch node information later on.
12729 self.needed_locks = {
12730 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12731 locking.LEVEL_NODE: self.op.nodes,
12734 def DeclareLocks(self, level):
12735 if level == locking.LEVEL_NODEGROUP:
12736 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12738 # Try to get all affected nodes' groups without having the group or node
12739 # lock yet. Needs verification later in the code flow.
12740 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12742 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12744 def CheckPrereq(self):
12745 """Check prerequisites.
12748 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12749 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12750 frozenset(self.op.nodes))
12752 expected_locks = (set([self.group_uuid]) |
12753 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12754 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12755 if actual_locks != expected_locks:
12756 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12757 " current groups are '%s', used to be '%s'" %
12758 (utils.CommaJoin(expected_locks),
12759 utils.CommaJoin(actual_locks)))
12761 self.node_data = self.cfg.GetAllNodesInfo()
12762 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12763 instance_data = self.cfg.GetAllInstancesInfo()
12765 if self.group is None:
12766 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12767 (self.op.group_name, self.group_uuid))
12769 (new_splits, previous_splits) = \
12770 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12771 for node in self.op.nodes],
12772 self.node_data, instance_data)
12775 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12777 if not self.op.force:
12778 raise errors.OpExecError("The following instances get split by this"
12779 " change and --force was not given: %s" %
12782 self.LogWarning("This operation will split the following instances: %s",
12785 if previous_splits:
12786 self.LogWarning("In addition, these already-split instances continue"
12787 " to be split across groups: %s",
12788 utils.CommaJoin(utils.NiceSort(previous_splits)))
12790 def Exec(self, feedback_fn):
12791 """Assign nodes to a new group.
12794 mods = [(node_name, self.group_uuid) for node_name in self.op.nodes]
12796 self.cfg.AssignGroupNodes(mods)
12799 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12800 """Check for split instances after a node assignment.
12802 This method considers a series of node assignments as an atomic operation,
12803 and returns information about split instances after applying the set of
12806 In particular, it returns information about newly split instances, and
12807 instances that were already split, and remain so after the change.
12809 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12812 @type changes: list of (node_name, new_group_uuid) pairs.
12813 @param changes: list of node assignments to consider.
12814 @param node_data: a dict with data for all nodes
12815 @param instance_data: a dict with all instances to consider
12816 @rtype: a two-tuple
12817 @return: a list of instances that were previously okay and result split as a
12818 consequence of this change, and a list of instances that were previously
12819 split and this change does not fix.
12822 changed_nodes = dict((node, group) for node, group in changes
12823 if node_data[node].group != group)
12825 all_split_instances = set()
12826 previously_split_instances = set()
12828 def InstanceNodes(instance):
12829 return [instance.primary_node] + list(instance.secondary_nodes)
12831 for inst in instance_data.values():
12832 if inst.disk_template not in constants.DTS_INT_MIRROR:
12835 instance_nodes = InstanceNodes(inst)
12837 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12838 previously_split_instances.add(inst.name)
12840 if len(set(changed_nodes.get(node, node_data[node].group)
12841 for node in instance_nodes)) > 1:
12842 all_split_instances.add(inst.name)
12844 return (list(all_split_instances - previously_split_instances),
12845 list(previously_split_instances & all_split_instances))
12848 class _GroupQuery(_QueryBase):
12849 FIELDS = query.GROUP_FIELDS
12851 def ExpandNames(self, lu):
12852 lu.needed_locks = {}
12854 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12855 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12858 self.wanted = [name_to_uuid[name]
12859 for name in utils.NiceSort(name_to_uuid.keys())]
12861 # Accept names to be either names or UUIDs.
12864 all_uuid = frozenset(self._all_groups.keys())
12866 for name in self.names:
12867 if name in all_uuid:
12868 self.wanted.append(name)
12869 elif name in name_to_uuid:
12870 self.wanted.append(name_to_uuid[name])
12872 missing.append(name)
12875 raise errors.OpPrereqError("Some groups do not exist: %s" %
12876 utils.CommaJoin(missing),
12877 errors.ECODE_NOENT)
12879 def DeclareLocks(self, lu, level):
12882 def _GetQueryData(self, lu):
12883 """Computes the list of node groups and their attributes.
12886 do_nodes = query.GQ_NODE in self.requested_data
12887 do_instances = query.GQ_INST in self.requested_data
12889 group_to_nodes = None
12890 group_to_instances = None
12892 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12893 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12894 # latter GetAllInstancesInfo() is not enough, for we have to go through
12895 # instance->node. Hence, we will need to process nodes even if we only need
12896 # instance information.
12897 if do_nodes or do_instances:
12898 all_nodes = lu.cfg.GetAllNodesInfo()
12899 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12902 for node in all_nodes.values():
12903 if node.group in group_to_nodes:
12904 group_to_nodes[node.group].append(node.name)
12905 node_to_group[node.name] = node.group
12908 all_instances = lu.cfg.GetAllInstancesInfo()
12909 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12911 for instance in all_instances.values():
12912 node = instance.primary_node
12913 if node in node_to_group:
12914 group_to_instances[node_to_group[node]].append(instance.name)
12917 # Do not pass on node information if it was not requested.
12918 group_to_nodes = None
12920 return query.GroupQueryData([self._all_groups[uuid]
12921 for uuid in self.wanted],
12922 group_to_nodes, group_to_instances)
12925 class LUGroupQuery(NoHooksLU):
12926 """Logical unit for querying node groups.
12931 def CheckArguments(self):
12932 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12933 self.op.output_fields, False)
12935 def ExpandNames(self):
12936 self.gq.ExpandNames(self)
12938 def DeclareLocks(self, level):
12939 self.gq.DeclareLocks(self, level)
12941 def Exec(self, feedback_fn):
12942 return self.gq.OldStyleQuery(self)
12945 class LUGroupSetParams(LogicalUnit):
12946 """Modifies the parameters of a node group.
12949 HPATH = "group-modify"
12950 HTYPE = constants.HTYPE_GROUP
12953 def CheckArguments(self):
12956 self.op.diskparams,
12957 self.op.alloc_policy,
12960 if all_changes.count(None) == len(all_changes):
12961 raise errors.OpPrereqError("Please pass at least one modification",
12962 errors.ECODE_INVAL)
12964 def ExpandNames(self):
12965 # This raises errors.OpPrereqError on its own:
12966 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12968 self.needed_locks = {
12969 locking.LEVEL_NODEGROUP: [self.group_uuid],
12972 def CheckPrereq(self):
12973 """Check prerequisites.
12976 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12978 if self.group is None:
12979 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12980 (self.op.group_name, self.group_uuid))
12982 if self.op.ndparams:
12983 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12984 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12985 self.new_ndparams = new_ndparams
12987 if self.op.diskparams:
12988 self.new_diskparams = dict()
12989 for templ in constants.DISK_TEMPLATES:
12990 if templ not in self.op.diskparams:
12991 self.op.diskparams[templ] = {}
12992 new_templ_params = _GetUpdatedParams(self.group.diskparams[templ],
12993 self.op.diskparams[templ])
12994 utils.ForceDictType(new_templ_params, constants.DISK_DT_TYPES)
12995 self.new_diskparams[templ] = new_templ_params
12997 def BuildHooksEnv(self):
12998 """Build hooks env.
13002 "GROUP_NAME": self.op.group_name,
13003 "NEW_ALLOC_POLICY": self.op.alloc_policy,
13006 def BuildHooksNodes(self):
13007 """Build hooks nodes.
13010 mn = self.cfg.GetMasterNode()
13011 return ([mn], [mn])
13013 def Exec(self, feedback_fn):
13014 """Modifies the node group.
13019 if self.op.ndparams:
13020 self.group.ndparams = self.new_ndparams
13021 result.append(("ndparams", str(self.group.ndparams)))
13023 if self.op.diskparams:
13024 self.group.diskparams = self.new_diskparams
13025 result.append(("diskparams", str(self.group.diskparams)))
13027 if self.op.alloc_policy:
13028 self.group.alloc_policy = self.op.alloc_policy
13030 self.cfg.Update(self.group, feedback_fn)
13034 class LUGroupRemove(LogicalUnit):
13035 HPATH = "group-remove"
13036 HTYPE = constants.HTYPE_GROUP
13039 def ExpandNames(self):
13040 # This will raises errors.OpPrereqError on its own:
13041 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13042 self.needed_locks = {
13043 locking.LEVEL_NODEGROUP: [self.group_uuid],
13046 def CheckPrereq(self):
13047 """Check prerequisites.
13049 This checks that the given group name exists as a node group, that is
13050 empty (i.e., contains no nodes), and that is not the last group of the
13054 # Verify that the group is empty.
13055 group_nodes = [node.name
13056 for node in self.cfg.GetAllNodesInfo().values()
13057 if node.group == self.group_uuid]
13060 raise errors.OpPrereqError("Group '%s' not empty, has the following"
13062 (self.op.group_name,
13063 utils.CommaJoin(utils.NiceSort(group_nodes))),
13064 errors.ECODE_STATE)
13066 # Verify the cluster would not be left group-less.
13067 if len(self.cfg.GetNodeGroupList()) == 1:
13068 raise errors.OpPrereqError("Group '%s' is the only group,"
13069 " cannot be removed" %
13070 self.op.group_name,
13071 errors.ECODE_STATE)
13073 def BuildHooksEnv(self):
13074 """Build hooks env.
13078 "GROUP_NAME": self.op.group_name,
13081 def BuildHooksNodes(self):
13082 """Build hooks nodes.
13085 mn = self.cfg.GetMasterNode()
13086 return ([mn], [mn])
13088 def Exec(self, feedback_fn):
13089 """Remove the node group.
13093 self.cfg.RemoveNodeGroup(self.group_uuid)
13094 except errors.ConfigurationError:
13095 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
13096 (self.op.group_name, self.group_uuid))
13098 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
13101 class LUGroupRename(LogicalUnit):
13102 HPATH = "group-rename"
13103 HTYPE = constants.HTYPE_GROUP
13106 def ExpandNames(self):
13107 # This raises errors.OpPrereqError on its own:
13108 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13110 self.needed_locks = {
13111 locking.LEVEL_NODEGROUP: [self.group_uuid],
13114 def CheckPrereq(self):
13115 """Check prerequisites.
13117 Ensures requested new name is not yet used.
13121 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
13122 except errors.OpPrereqError:
13125 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
13126 " node group (UUID: %s)" %
13127 (self.op.new_name, new_name_uuid),
13128 errors.ECODE_EXISTS)
13130 def BuildHooksEnv(self):
13131 """Build hooks env.
13135 "OLD_NAME": self.op.group_name,
13136 "NEW_NAME": self.op.new_name,
13139 def BuildHooksNodes(self):
13140 """Build hooks nodes.
13143 mn = self.cfg.GetMasterNode()
13145 all_nodes = self.cfg.GetAllNodesInfo()
13146 all_nodes.pop(mn, None)
13149 run_nodes.extend(node.name for node in all_nodes.values()
13150 if node.group == self.group_uuid)
13152 return (run_nodes, run_nodes)
13154 def Exec(self, feedback_fn):
13155 """Rename the node group.
13158 group = self.cfg.GetNodeGroup(self.group_uuid)
13161 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
13162 (self.op.group_name, self.group_uuid))
13164 group.name = self.op.new_name
13165 self.cfg.Update(group, feedback_fn)
13167 return self.op.new_name
13170 class LUGroupEvacuate(LogicalUnit):
13171 HPATH = "group-evacuate"
13172 HTYPE = constants.HTYPE_GROUP
13175 def ExpandNames(self):
13176 # This raises errors.OpPrereqError on its own:
13177 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
13179 if self.op.target_groups:
13180 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
13181 self.op.target_groups)
13183 self.req_target_uuids = []
13185 if self.group_uuid in self.req_target_uuids:
13186 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
13187 " as a target group (targets are %s)" %
13189 utils.CommaJoin(self.req_target_uuids)),
13190 errors.ECODE_INVAL)
13192 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
13194 self.share_locks = _ShareAll()
13195 self.needed_locks = {
13196 locking.LEVEL_INSTANCE: [],
13197 locking.LEVEL_NODEGROUP: [],
13198 locking.LEVEL_NODE: [],
13201 def DeclareLocks(self, level):
13202 if level == locking.LEVEL_INSTANCE:
13203 assert not self.needed_locks[locking.LEVEL_INSTANCE]
13205 # Lock instances optimistically, needs verification once node and group
13206 # locks have been acquired
13207 self.needed_locks[locking.LEVEL_INSTANCE] = \
13208 self.cfg.GetNodeGroupInstances(self.group_uuid)
13210 elif level == locking.LEVEL_NODEGROUP:
13211 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
13213 if self.req_target_uuids:
13214 lock_groups = set([self.group_uuid] + self.req_target_uuids)
13216 # Lock all groups used by instances optimistically; this requires going
13217 # via the node before it's locked, requiring verification later on
13218 lock_groups.update(group_uuid
13219 for instance_name in
13220 self.owned_locks(locking.LEVEL_INSTANCE)
13222 self.cfg.GetInstanceNodeGroups(instance_name))
13224 # No target groups, need to lock all of them
13225 lock_groups = locking.ALL_SET
13227 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
13229 elif level == locking.LEVEL_NODE:
13230 # This will only lock the nodes in the group to be evacuated which
13231 # contain actual instances
13232 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
13233 self._LockInstancesNodes()
13235 # Lock all nodes in group to be evacuated and target groups
13236 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
13237 assert self.group_uuid in owned_groups
13238 member_nodes = [node_name
13239 for group in owned_groups
13240 for node_name in self.cfg.GetNodeGroup(group).members]
13241 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
13243 def CheckPrereq(self):
13244 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
13245 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
13246 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
13248 assert owned_groups.issuperset(self.req_target_uuids)
13249 assert self.group_uuid in owned_groups
13251 # Check if locked instances are still correct
13252 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
13254 # Get instance information
13255 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
13257 # Check if node groups for locked instances are still correct
13258 for instance_name in owned_instances:
13259 inst = self.instances[instance_name]
13260 assert owned_nodes.issuperset(inst.all_nodes), \
13261 "Instance %s's nodes changed while we kept the lock" % instance_name
13263 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
13266 assert self.group_uuid in inst_groups, \
13267 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
13269 if self.req_target_uuids:
13270 # User requested specific target groups
13271 self.target_uuids = self.req_target_uuids
13273 # All groups except the one to be evacuated are potential targets
13274 self.target_uuids = [group_uuid for group_uuid in owned_groups
13275 if group_uuid != self.group_uuid]
13277 if not self.target_uuids:
13278 raise errors.OpPrereqError("There are no possible target groups",
13279 errors.ECODE_INVAL)
13281 def BuildHooksEnv(self):
13282 """Build hooks env.
13286 "GROUP_NAME": self.op.group_name,
13287 "TARGET_GROUPS": " ".join(self.target_uuids),
13290 def BuildHooksNodes(self):
13291 """Build hooks nodes.
13294 mn = self.cfg.GetMasterNode()
13296 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
13298 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
13300 return (run_nodes, run_nodes)
13302 def Exec(self, feedback_fn):
13303 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
13305 assert self.group_uuid not in self.target_uuids
13307 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
13308 instances=instances, target_groups=self.target_uuids)
13310 ial.Run(self.op.iallocator)
13312 if not ial.success:
13313 raise errors.OpPrereqError("Can't compute group evacuation using"
13314 " iallocator '%s': %s" %
13315 (self.op.iallocator, ial.info),
13316 errors.ECODE_NORES)
13318 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
13320 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
13321 len(jobs), self.op.group_name)
13323 return ResultWithJobs(jobs)
13326 class TagsLU(NoHooksLU): # pylint: disable=W0223
13327 """Generic tags LU.
13329 This is an abstract class which is the parent of all the other tags LUs.
13332 def ExpandNames(self):
13333 self.group_uuid = None
13334 self.needed_locks = {}
13335 if self.op.kind == constants.TAG_NODE:
13336 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
13337 self.needed_locks[locking.LEVEL_NODE] = self.op.name
13338 elif self.op.kind == constants.TAG_INSTANCE:
13339 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
13340 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
13341 elif self.op.kind == constants.TAG_NODEGROUP:
13342 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
13344 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
13345 # not possible to acquire the BGL based on opcode parameters)
13347 def CheckPrereq(self):
13348 """Check prerequisites.
13351 if self.op.kind == constants.TAG_CLUSTER:
13352 self.target = self.cfg.GetClusterInfo()
13353 elif self.op.kind == constants.TAG_NODE:
13354 self.target = self.cfg.GetNodeInfo(self.op.name)
13355 elif self.op.kind == constants.TAG_INSTANCE:
13356 self.target = self.cfg.GetInstanceInfo(self.op.name)
13357 elif self.op.kind == constants.TAG_NODEGROUP:
13358 self.target = self.cfg.GetNodeGroup(self.group_uuid)
13360 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
13361 str(self.op.kind), errors.ECODE_INVAL)
13364 class LUTagsGet(TagsLU):
13365 """Returns the tags of a given object.
13370 def ExpandNames(self):
13371 TagsLU.ExpandNames(self)
13373 # Share locks as this is only a read operation
13374 self.share_locks = _ShareAll()
13376 def Exec(self, feedback_fn):
13377 """Returns the tag list.
13380 return list(self.target.GetTags())
13383 class LUTagsSearch(NoHooksLU):
13384 """Searches the tags for a given pattern.
13389 def ExpandNames(self):
13390 self.needed_locks = {}
13392 def CheckPrereq(self):
13393 """Check prerequisites.
13395 This checks the pattern passed for validity by compiling it.
13399 self.re = re.compile(self.op.pattern)
13400 except re.error, err:
13401 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
13402 (self.op.pattern, err), errors.ECODE_INVAL)
13404 def Exec(self, feedback_fn):
13405 """Returns the tag list.
13409 tgts = [("/cluster", cfg.GetClusterInfo())]
13410 ilist = cfg.GetAllInstancesInfo().values()
13411 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
13412 nlist = cfg.GetAllNodesInfo().values()
13413 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
13414 tgts.extend(("/nodegroup/%s" % n.name, n)
13415 for n in cfg.GetAllNodeGroupsInfo().values())
13417 for path, target in tgts:
13418 for tag in target.GetTags():
13419 if self.re.search(tag):
13420 results.append((path, tag))
13424 class LUTagsSet(TagsLU):
13425 """Sets a tag on a given object.
13430 def CheckPrereq(self):
13431 """Check prerequisites.
13433 This checks the type and length of the tag name and value.
13436 TagsLU.CheckPrereq(self)
13437 for tag in self.op.tags:
13438 objects.TaggableObject.ValidateTag(tag)
13440 def Exec(self, feedback_fn):
13445 for tag in self.op.tags:
13446 self.target.AddTag(tag)
13447 except errors.TagError, err:
13448 raise errors.OpExecError("Error while setting tag: %s" % str(err))
13449 self.cfg.Update(self.target, feedback_fn)
13452 class LUTagsDel(TagsLU):
13453 """Delete a list of tags from a given object.
13458 def CheckPrereq(self):
13459 """Check prerequisites.
13461 This checks that we have the given tag.
13464 TagsLU.CheckPrereq(self)
13465 for tag in self.op.tags:
13466 objects.TaggableObject.ValidateTag(tag)
13467 del_tags = frozenset(self.op.tags)
13468 cur_tags = self.target.GetTags()
13470 diff_tags = del_tags - cur_tags
13472 diff_names = ("'%s'" % i for i in sorted(diff_tags))
13473 raise errors.OpPrereqError("Tag(s) %s not found" %
13474 (utils.CommaJoin(diff_names), ),
13475 errors.ECODE_NOENT)
13477 def Exec(self, feedback_fn):
13478 """Remove the tag from the object.
13481 for tag in self.op.tags:
13482 self.target.RemoveTag(tag)
13483 self.cfg.Update(self.target, feedback_fn)
13486 class LUTestDelay(NoHooksLU):
13487 """Sleep for a specified amount of time.
13489 This LU sleeps on the master and/or nodes for a specified amount of
13495 def ExpandNames(self):
13496 """Expand names and set required locks.
13498 This expands the node list, if any.
13501 self.needed_locks = {}
13502 if self.op.on_nodes:
13503 # _GetWantedNodes can be used here, but is not always appropriate to use
13504 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
13505 # more information.
13506 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
13507 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
13509 def _TestDelay(self):
13510 """Do the actual sleep.
13513 if self.op.on_master:
13514 if not utils.TestDelay(self.op.duration):
13515 raise errors.OpExecError("Error during master delay test")
13516 if self.op.on_nodes:
13517 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
13518 for node, node_result in result.items():
13519 node_result.Raise("Failure during rpc call to node %s" % node)
13521 def Exec(self, feedback_fn):
13522 """Execute the test delay opcode, with the wanted repetitions.
13525 if self.op.repeat == 0:
13528 top_value = self.op.repeat - 1
13529 for i in range(self.op.repeat):
13530 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
13534 class LUTestJqueue(NoHooksLU):
13535 """Utility LU to test some aspects of the job queue.
13540 # Must be lower than default timeout for WaitForJobChange to see whether it
13541 # notices changed jobs
13542 _CLIENT_CONNECT_TIMEOUT = 20.0
13543 _CLIENT_CONFIRM_TIMEOUT = 60.0
13546 def _NotifyUsingSocket(cls, cb, errcls):
13547 """Opens a Unix socket and waits for another program to connect.
13550 @param cb: Callback to send socket name to client
13551 @type errcls: class
13552 @param errcls: Exception class to use for errors
13555 # Using a temporary directory as there's no easy way to create temporary
13556 # sockets without writing a custom loop around tempfile.mktemp and
13558 tmpdir = tempfile.mkdtemp()
13560 tmpsock = utils.PathJoin(tmpdir, "sock")
13562 logging.debug("Creating temporary socket at %s", tmpsock)
13563 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
13568 # Send details to client
13571 # Wait for client to connect before continuing
13572 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
13574 (conn, _) = sock.accept()
13575 except socket.error, err:
13576 raise errcls("Client didn't connect in time (%s)" % err)
13580 # Remove as soon as client is connected
13581 shutil.rmtree(tmpdir)
13583 # Wait for client to close
13586 # pylint: disable=E1101
13587 # Instance of '_socketobject' has no ... member
13588 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
13590 except socket.error, err:
13591 raise errcls("Client failed to confirm notification (%s)" % err)
13595 def _SendNotification(self, test, arg, sockname):
13596 """Sends a notification to the client.
13599 @param test: Test name
13600 @param arg: Test argument (depends on test)
13601 @type sockname: string
13602 @param sockname: Socket path
13605 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
13607 def _Notify(self, prereq, test, arg):
13608 """Notifies the client of a test.
13611 @param prereq: Whether this is a prereq-phase test
13613 @param test: Test name
13614 @param arg: Test argument (depends on test)
13618 errcls = errors.OpPrereqError
13620 errcls = errors.OpExecError
13622 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
13626 def CheckArguments(self):
13627 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
13628 self.expandnames_calls = 0
13630 def ExpandNames(self):
13631 checkargs_calls = getattr(self, "checkargs_calls", 0)
13632 if checkargs_calls < 1:
13633 raise errors.ProgrammerError("CheckArguments was not called")
13635 self.expandnames_calls += 1
13637 if self.op.notify_waitlock:
13638 self._Notify(True, constants.JQT_EXPANDNAMES, None)
13640 self.LogInfo("Expanding names")
13642 # Get lock on master node (just to get a lock, not for a particular reason)
13643 self.needed_locks = {
13644 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
13647 def Exec(self, feedback_fn):
13648 if self.expandnames_calls < 1:
13649 raise errors.ProgrammerError("ExpandNames was not called")
13651 if self.op.notify_exec:
13652 self._Notify(False, constants.JQT_EXEC, None)
13654 self.LogInfo("Executing")
13656 if self.op.log_messages:
13657 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
13658 for idx, msg in enumerate(self.op.log_messages):
13659 self.LogInfo("Sending log message %s", idx + 1)
13660 feedback_fn(constants.JQT_MSGPREFIX + msg)
13661 # Report how many test messages have been sent
13662 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
13665 raise errors.OpExecError("Opcode failure was requested")
13670 class IAllocator(object):
13671 """IAllocator framework.
13673 An IAllocator instance has three sets of attributes:
13674 - cfg that is needed to query the cluster
13675 - input data (all members of the _KEYS class attribute are required)
13676 - four buffer attributes (in|out_data|text), that represent the
13677 input (to the external script) in text and data structure format,
13678 and the output from it, again in two formats
13679 - the result variables from the script (success, info, nodes) for
13683 # pylint: disable=R0902
13684 # lots of instance attributes
13686 def __init__(self, cfg, rpc_runner, mode, **kwargs):
13688 self.rpc = rpc_runner
13689 # init buffer variables
13690 self.in_text = self.out_text = self.in_data = self.out_data = None
13691 # init all input fields so that pylint is happy
13693 self.memory = self.disks = self.disk_template = None
13694 self.os = self.tags = self.nics = self.vcpus = None
13695 self.hypervisor = None
13696 self.relocate_from = None
13698 self.instances = None
13699 self.evac_mode = None
13700 self.target_groups = []
13702 self.required_nodes = None
13703 # init result fields
13704 self.success = self.info = self.result = None
13707 (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
13709 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
13710 " IAllocator" % self.mode)
13712 keyset = [n for (n, _) in keydata]
13715 if key not in keyset:
13716 raise errors.ProgrammerError("Invalid input parameter '%s' to"
13717 " IAllocator" % key)
13718 setattr(self, key, kwargs[key])
13721 if key not in kwargs:
13722 raise errors.ProgrammerError("Missing input parameter '%s' to"
13723 " IAllocator" % key)
13724 self._BuildInputData(compat.partial(fn, self), keydata)
13726 def _ComputeClusterData(self):
13727 """Compute the generic allocator input data.
13729 This is the data that is independent of the actual operation.
13733 cluster_info = cfg.GetClusterInfo()
13736 "version": constants.IALLOCATOR_VERSION,
13737 "cluster_name": cfg.GetClusterName(),
13738 "cluster_tags": list(cluster_info.GetTags()),
13739 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
13740 # we don't have job IDs
13742 ninfo = cfg.GetAllNodesInfo()
13743 iinfo = cfg.GetAllInstancesInfo().values()
13744 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
13747 node_list = [n.name for n in ninfo.values() if n.vm_capable]
13749 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
13750 hypervisor_name = self.hypervisor
13751 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
13752 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
13754 hypervisor_name = cluster_info.primary_hypervisor
13756 node_data = self.rpc.call_node_info(node_list, [cfg.GetVGName()],
13759 self.rpc.call_all_instances_info(node_list,
13760 cluster_info.enabled_hypervisors)
13762 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
13764 config_ndata = self._ComputeBasicNodeData(ninfo)
13765 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
13766 i_list, config_ndata)
13767 assert len(data["nodes"]) == len(ninfo), \
13768 "Incomplete node data computed"
13770 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
13772 self.in_data = data
13775 def _ComputeNodeGroupData(cfg):
13776 """Compute node groups data.
13779 ng = dict((guuid, {
13780 "name": gdata.name,
13781 "alloc_policy": gdata.alloc_policy,
13783 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())
13788 def _ComputeBasicNodeData(node_cfg):
13789 """Compute global node data.
13792 @returns: a dict of name: (node dict, node config)
13795 # fill in static (config-based) values
13796 node_results = dict((ninfo.name, {
13797 "tags": list(ninfo.GetTags()),
13798 "primary_ip": ninfo.primary_ip,
13799 "secondary_ip": ninfo.secondary_ip,
13800 "offline": ninfo.offline,
13801 "drained": ninfo.drained,
13802 "master_candidate": ninfo.master_candidate,
13803 "group": ninfo.group,
13804 "master_capable": ninfo.master_capable,
13805 "vm_capable": ninfo.vm_capable,
13807 for ninfo in node_cfg.values())
13809 return node_results
13812 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
13814 """Compute global node data.
13816 @param node_results: the basic node structures as filled from the config
13819 #TODO(dynmem): compute the right data on MAX and MIN memory
13820 # make a copy of the current dict
13821 node_results = dict(node_results)
13822 for nname, nresult in node_data.items():
13823 assert nname in node_results, "Missing basic data for node %s" % nname
13824 ninfo = node_cfg[nname]
13826 if not (ninfo.offline or ninfo.drained):
13827 nresult.Raise("Can't get data for node %s" % nname)
13828 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
13830 remote_info = _MakeLegacyNodeInfo(nresult.payload)
13832 for attr in ["memory_total", "memory_free", "memory_dom0",
13833 "vg_size", "vg_free", "cpu_total"]:
13834 if attr not in remote_info:
13835 raise errors.OpExecError("Node '%s' didn't return attribute"
13836 " '%s'" % (nname, attr))
13837 if not isinstance(remote_info[attr], int):
13838 raise errors.OpExecError("Node '%s' returned invalid value"
13840 (nname, attr, remote_info[attr]))
13841 # compute memory used by primary instances
13842 i_p_mem = i_p_up_mem = 0
13843 for iinfo, beinfo in i_list:
13844 if iinfo.primary_node == nname:
13845 i_p_mem += beinfo[constants.BE_MAXMEM]
13846 if iinfo.name not in node_iinfo[nname].payload:
13849 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
13850 i_mem_diff = beinfo[constants.BE_MAXMEM] - i_used_mem
13851 remote_info["memory_free"] -= max(0, i_mem_diff)
13853 if iinfo.admin_state == constants.ADMINST_UP:
13854 i_p_up_mem += beinfo[constants.BE_MAXMEM]
13856 # compute memory used by instances
13858 "total_memory": remote_info["memory_total"],
13859 "reserved_memory": remote_info["memory_dom0"],
13860 "free_memory": remote_info["memory_free"],
13861 "total_disk": remote_info["vg_size"],
13862 "free_disk": remote_info["vg_free"],
13863 "total_cpus": remote_info["cpu_total"],
13864 "i_pri_memory": i_p_mem,
13865 "i_pri_up_memory": i_p_up_mem,
13867 pnr_dyn.update(node_results[nname])
13868 node_results[nname] = pnr_dyn
13870 return node_results
13873 def _ComputeInstanceData(cluster_info, i_list):
13874 """Compute global instance data.
13878 for iinfo, beinfo in i_list:
13880 for nic in iinfo.nics:
13881 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
13885 "mode": filled_params[constants.NIC_MODE],
13886 "link": filled_params[constants.NIC_LINK],
13888 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
13889 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
13890 nic_data.append(nic_dict)
13892 "tags": list(iinfo.GetTags()),
13893 "admin_state": iinfo.admin_state,
13894 "vcpus": beinfo[constants.BE_VCPUS],
13895 "memory": beinfo[constants.BE_MAXMEM],
13897 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
13899 "disks": [{constants.IDISK_SIZE: dsk.size,
13900 constants.IDISK_MODE: dsk.mode}
13901 for dsk in iinfo.disks],
13902 "disk_template": iinfo.disk_template,
13903 "hypervisor": iinfo.hypervisor,
13905 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
13907 instance_data[iinfo.name] = pir
13909 return instance_data
13911 def _AddNewInstance(self):
13912 """Add new instance data to allocator structure.
13914 This in combination with _AllocatorGetClusterData will create the
13915 correct structure needed as input for the allocator.
13917 The checks for the completeness of the opcode must have already been
13921 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
13923 if self.disk_template in constants.DTS_INT_MIRROR:
13924 self.required_nodes = 2
13926 self.required_nodes = 1
13930 "disk_template": self.disk_template,
13933 "vcpus": self.vcpus,
13934 "memory": self.memory,
13935 "disks": self.disks,
13936 "disk_space_total": disk_space,
13938 "required_nodes": self.required_nodes,
13939 "hypervisor": self.hypervisor,
13944 def _AddRelocateInstance(self):
13945 """Add relocate instance data to allocator structure.
13947 This in combination with _IAllocatorGetClusterData will create the
13948 correct structure needed as input for the allocator.
13950 The checks for the completeness of the opcode must have already been
13954 instance = self.cfg.GetInstanceInfo(self.name)
13955 if instance is None:
13956 raise errors.ProgrammerError("Unknown instance '%s' passed to"
13957 " IAllocator" % self.name)
13959 if instance.disk_template not in constants.DTS_MIRRORED:
13960 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
13961 errors.ECODE_INVAL)
13963 if instance.disk_template in constants.DTS_INT_MIRROR and \
13964 len(instance.secondary_nodes) != 1:
13965 raise errors.OpPrereqError("Instance has not exactly one secondary node",
13966 errors.ECODE_STATE)
13968 self.required_nodes = 1
13969 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
13970 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
13974 "disk_space_total": disk_space,
13975 "required_nodes": self.required_nodes,
13976 "relocate_from": self.relocate_from,
13980 def _AddNodeEvacuate(self):
13981 """Get data for node-evacuate requests.
13985 "instances": self.instances,
13986 "evac_mode": self.evac_mode,
13989 def _AddChangeGroup(self):
13990 """Get data for node-evacuate requests.
13994 "instances": self.instances,
13995 "target_groups": self.target_groups,
13998 def _BuildInputData(self, fn, keydata):
13999 """Build input data structures.
14002 self._ComputeClusterData()
14005 request["type"] = self.mode
14006 for keyname, keytype in keydata:
14007 if keyname not in request:
14008 raise errors.ProgrammerError("Request parameter %s is missing" %
14010 val = request[keyname]
14011 if not keytype(val):
14012 raise errors.ProgrammerError("Request parameter %s doesn't pass"
14013 " validation, value %s, expected"
14014 " type %s" % (keyname, val, keytype))
14015 self.in_data["request"] = request
14017 self.in_text = serializer.Dump(self.in_data)
14019 _STRING_LIST = ht.TListOf(ht.TString)
14020 _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
14021 # pylint: disable=E1101
14022 # Class '...' has no 'OP_ID' member
14023 "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
14024 opcodes.OpInstanceMigrate.OP_ID,
14025 opcodes.OpInstanceReplaceDisks.OP_ID])
14029 ht.TListOf(ht.TAnd(ht.TIsLength(3),
14030 ht.TItems([ht.TNonEmptyString,
14031 ht.TNonEmptyString,
14032 ht.TListOf(ht.TNonEmptyString),
14035 ht.TListOf(ht.TAnd(ht.TIsLength(2),
14036 ht.TItems([ht.TNonEmptyString,
14039 _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
14040 ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
14043 constants.IALLOCATOR_MODE_ALLOC:
14046 ("name", ht.TString),
14047 ("memory", ht.TInt),
14048 ("disks", ht.TListOf(ht.TDict)),
14049 ("disk_template", ht.TString),
14050 ("os", ht.TString),
14051 ("tags", _STRING_LIST),
14052 ("nics", ht.TListOf(ht.TDict)),
14053 ("vcpus", ht.TInt),
14054 ("hypervisor", ht.TString),
14056 constants.IALLOCATOR_MODE_RELOC:
14057 (_AddRelocateInstance,
14058 [("name", ht.TString), ("relocate_from", _STRING_LIST)],
14060 constants.IALLOCATOR_MODE_NODE_EVAC:
14061 (_AddNodeEvacuate, [
14062 ("instances", _STRING_LIST),
14063 ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
14065 constants.IALLOCATOR_MODE_CHG_GROUP:
14066 (_AddChangeGroup, [
14067 ("instances", _STRING_LIST),
14068 ("target_groups", _STRING_LIST),
14072 def Run(self, name, validate=True, call_fn=None):
14073 """Run an instance allocator and return the results.
14076 if call_fn is None:
14077 call_fn = self.rpc.call_iallocator_runner
14079 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
14080 result.Raise("Failure while running the iallocator script")
14082 self.out_text = result.payload
14084 self._ValidateResult()
14086 def _ValidateResult(self):
14087 """Process the allocator results.
14089 This will process and if successful save the result in
14090 self.out_data and the other parameters.
14094 rdict = serializer.Load(self.out_text)
14095 except Exception, err:
14096 raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
14098 if not isinstance(rdict, dict):
14099 raise errors.OpExecError("Can't parse iallocator results: not a dict")
14101 # TODO: remove backwards compatiblity in later versions
14102 if "nodes" in rdict and "result" not in rdict:
14103 rdict["result"] = rdict["nodes"]
14106 for key in "success", "info", "result":
14107 if key not in rdict:
14108 raise errors.OpExecError("Can't parse iallocator results:"
14109 " missing key '%s'" % key)
14110 setattr(self, key, rdict[key])
14112 if not self._result_check(self.result):
14113 raise errors.OpExecError("Iallocator returned invalid result,"
14114 " expected %s, got %s" %
14115 (self._result_check, self.result),
14116 errors.ECODE_INVAL)
14118 if self.mode == constants.IALLOCATOR_MODE_RELOC:
14119 assert self.relocate_from is not None
14120 assert self.required_nodes == 1
14122 node2group = dict((name, ndata["group"])
14123 for (name, ndata) in self.in_data["nodes"].items())
14125 fn = compat.partial(self._NodesToGroups, node2group,
14126 self.in_data["nodegroups"])
14128 instance = self.cfg.GetInstanceInfo(self.name)
14129 request_groups = fn(self.relocate_from + [instance.primary_node])
14130 result_groups = fn(rdict["result"] + [instance.primary_node])
14132 if self.success and not set(result_groups).issubset(request_groups):
14133 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
14134 " differ from original groups (%s)" %
14135 (utils.CommaJoin(result_groups),
14136 utils.CommaJoin(request_groups)))
14138 elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
14139 assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES
14141 self.out_data = rdict
14144 def _NodesToGroups(node2group, groups, nodes):
14145 """Returns a list of unique group names for a list of nodes.
14147 @type node2group: dict
14148 @param node2group: Map from node name to group UUID
14150 @param groups: Group information
14152 @param nodes: Node names
14159 group_uuid = node2group[node]
14161 # Ignore unknown node
14165 group = groups[group_uuid]
14167 # Can't find group, let's use UUID
14168 group_name = group_uuid
14170 group_name = group["name"]
14172 result.add(group_name)
14174 return sorted(result)
14177 class LUTestAllocator(NoHooksLU):
14178 """Run allocator tests.
14180 This LU runs the allocator tests
14183 def CheckPrereq(self):
14184 """Check prerequisites.
14186 This checks the opcode parameters depending on the director and mode test.
14189 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
14190 for attr in ["memory", "disks", "disk_template",
14191 "os", "tags", "nics", "vcpus"]:
14192 if not hasattr(self.op, attr):
14193 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
14194 attr, errors.ECODE_INVAL)
14195 iname = self.cfg.ExpandInstanceName(self.op.name)
14196 if iname is not None:
14197 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
14198 iname, errors.ECODE_EXISTS)
14199 if not isinstance(self.op.nics, list):
14200 raise errors.OpPrereqError("Invalid parameter 'nics'",
14201 errors.ECODE_INVAL)
14202 if not isinstance(self.op.disks, list):
14203 raise errors.OpPrereqError("Invalid parameter 'disks'",
14204 errors.ECODE_INVAL)
14205 for row in self.op.disks:
14206 if (not isinstance(row, dict) or
14207 constants.IDISK_SIZE not in row or
14208 not isinstance(row[constants.IDISK_SIZE], int) or
14209 constants.IDISK_MODE not in row or
14210 row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
14211 raise errors.OpPrereqError("Invalid contents of the 'disks'"
14212 " parameter", errors.ECODE_INVAL)
14213 if self.op.hypervisor is None:
14214 self.op.hypervisor = self.cfg.GetHypervisorType()
14215 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
14216 fname = _ExpandInstanceName(self.cfg, self.op.name)
14217 self.op.name = fname
14218 self.relocate_from = \
14219 list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
14220 elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
14221 constants.IALLOCATOR_MODE_NODE_EVAC):
14222 if not self.op.instances:
14223 raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
14224 self.op.instances = _GetWantedInstances(self, self.op.instances)
14226 raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
14227 self.op.mode, errors.ECODE_INVAL)
14229 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
14230 if self.op.allocator is None:
14231 raise errors.OpPrereqError("Missing allocator name",
14232 errors.ECODE_INVAL)
14233 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
14234 raise errors.OpPrereqError("Wrong allocator test '%s'" %
14235 self.op.direction, errors.ECODE_INVAL)
14237 def Exec(self, feedback_fn):
14238 """Run the allocator test.
14241 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
14242 ial = IAllocator(self.cfg, self.rpc,
14245 memory=self.op.memory,
14246 disks=self.op.disks,
14247 disk_template=self.op.disk_template,
14251 vcpus=self.op.vcpus,
14252 hypervisor=self.op.hypervisor,
14254 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
14255 ial = IAllocator(self.cfg, self.rpc,
14258 relocate_from=list(self.relocate_from),
14260 elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
14261 ial = IAllocator(self.cfg, self.rpc,
14263 instances=self.op.instances,
14264 target_groups=self.op.target_groups)
14265 elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
14266 ial = IAllocator(self.cfg, self.rpc,
14268 instances=self.op.instances,
14269 evac_mode=self.op.evac_mode)
14271 raise errors.ProgrammerError("Uncatched mode %s in"
14272 " LUTestAllocator.Exec", self.op.mode)
14274 if self.op.direction == constants.IALLOCATOR_DIR_IN:
14275 result = ial.in_text
14277 ial.Run(self.op.allocator, validate=False)
14278 result = ial.out_text
14282 #: Query type implementations
14284 constants.QR_INSTANCE: _InstanceQuery,
14285 constants.QR_NODE: _NodeQuery,
14286 constants.QR_GROUP: _GroupQuery,
14287 constants.QR_OS: _OsQuery,
14290 assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
14293 def _GetQueryImplementation(name):
14294 """Returns the implemtnation for a query type.
14296 @param name: Query type, must be one of L{constants.QR_VIA_OP}
14300 return _QUERY_IMPL[name]
14302 raise errors.OpPrereqError("Unknown query resource '%s'" % name,
14303 errors.ECODE_INVAL)