4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
45 from ganeti import ssh
46 from ganeti import utils
47 from ganeti import errors
48 from ganeti import hypervisor
49 from ganeti import locking
50 from ganeti import constants
51 from ganeti import objects
52 from ganeti import serializer
53 from ganeti import ssconf
54 from ganeti import uidpool
55 from ganeti import compat
56 from ganeti import masterd
57 from ganeti import netutils
58 from ganeti import query
59 from ganeti import qlang
60 from ganeti import opcodes
62 from ganeti import rpc
64 import ganeti.masterd.instance # pylint: disable=W0611
67 #: Size of DRBD meta block device
72 """Data container for LU results with jobs.
74 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
75 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
76 contained in the C{jobs} attribute and include the job IDs in the opcode
80 def __init__(self, jobs, **kwargs):
81 """Initializes this class.
83 Additional return values can be specified as keyword arguments.
85 @type jobs: list of lists of L{opcode.OpCode}
86 @param jobs: A list of lists of opcode objects
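Illustrative usage from an LU's Exec method (a sketch, not taken from
this module's actual LUs; the keyword argument name is hypothetical)::

  # submit one follow-up job containing a single opcode and also
  # return an extra value to the caller
  return ResultWithJobs([[opcodes.OpClusterVerifyConfig()]],
                        custom_result="done")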
93 class LogicalUnit(object):
94 """Logical Unit base class.
96 Subclasses must follow these rules:
97 - implement ExpandNames
98 - implement CheckPrereq (except when tasklets are used)
99 - implement Exec (except when tasklets are used)
100 - implement BuildHooksEnv
101 - implement BuildHooksNodes
102 - redefine HPATH and HTYPE
103 - optionally redefine their run requirements:
104 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
106 Note that all commands require root permissions.
108 @ivar dry_run_result: the value (if any) that will be returned to the caller
109 in dry-run mode (signalled by opcode dry_run parameter)
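A minimal, purely illustrative subclass sketch (the class name and hook
path are hypothetical)::

  class LUExampleNoop(LogicalUnit):
    HPATH = "example-noop"
    HTYPE = constants.HTYPE_CLUSTER

    def ExpandNames(self):
      self.needed_locks = {}

    def BuildHooksEnv(self):
      return {"OP_TARGET": self.cfg.GetClusterName()}

    def BuildHooksNodes(self):
      return ([], [self.cfg.GetMasterNode()])

    def CheckPrereq(self):
      pass

    def Exec(self, feedback_fn):
      feedback_fn("Nothing to do")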
116 def __init__(self, processor, op, context, rpc_runner):
117 """Constructor for LogicalUnit.
119 This needs to be overridden in derived classes in order to check op
123 self.proc = processor
125 self.cfg = context.cfg
126 self.glm = context.glm
128 self.owned_locks = context.glm.list_owned
129 self.context = context
130 self.rpc = rpc_runner
131 # Dicts used to declare locking needs to mcpu
132 self.needed_locks = None
133 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
135 self.remove_locks = {}
136 # Used to force good behavior when calling helper functions
137 self.recalculate_locks = {}
139 self.Log = processor.Log # pylint: disable=C0103
140 self.LogWarning = processor.LogWarning # pylint: disable=C0103
141 self.LogInfo = processor.LogInfo # pylint: disable=C0103
142 self.LogStep = processor.LogStep # pylint: disable=C0103
143 # support for dry-run
144 self.dry_run_result = None
145 # support for generic debug attribute
146 if (not hasattr(self.op, "debug_level") or
147 not isinstance(self.op.debug_level, int)):
148 self.op.debug_level = 0
153 # Validate opcode parameters and set defaults
154 self.op.Validate(True)
156 self.CheckArguments()
158 def CheckArguments(self):
159 """Check syntactic validity for the opcode arguments.
161 This method is for doing a simple syntactic check and ensuring the
162 validity of opcode parameters, without any cluster-related
163 checks. While the same can be accomplished in ExpandNames and/or
164 CheckPrereq, doing these separately is better because:
166 - ExpandNames is left as a purely lock-related function
167 - CheckPrereq is run after we have acquired locks (and possible
170 The function is allowed to change the self.op attribute so that
171 later methods need no longer worry about missing parameters.
176 def ExpandNames(self):
177 """Expand names for this LU.
179 This method is called before starting to execute the opcode, and it should
180 update all the parameters of the opcode to their canonical form (e.g. a
181 short node name must be fully expanded after this method has successfully
182 completed). This way locking, hooks, logging, etc. can work correctly.
184 LUs which implement this method must also populate the self.needed_locks
185 member, as a dict with lock levels as keys, and a list of needed lock names
188 - use an empty dict if you don't need any lock
189 - if you don't need any lock at a particular level omit that level
190 - don't put anything for the BGL level
191 - if you want all locks at a level use locking.ALL_SET as a value
193 If you need to share locks (rather than acquire them exclusively) at one
194 level you can modify self.share_locks, setting a true value (usually 1) for
195 that level. By default locks are not shared.
197 This function can also define a list of tasklets, which then will be
198 executed in order instead of the usual LU-level CheckPrereq and Exec
199 functions, if those are not defined by the LU.
203 # Acquire all nodes and one instance
204 self.needed_locks = {
205 locking.LEVEL_NODE: locking.ALL_SET,
206 locking.LEVEL_INSTANCE: ['instance1.example.com'],
208 # Acquire just two nodes
209 self.needed_locks = {
210 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
213 self.needed_locks = {} # No, you can't leave it to the default value None
216 # The implementation of this method is mandatory only if the new LU is
217 # concurrent, so that old LUs don't need to be changed all at the same
220 self.needed_locks = {} # Exclusive LUs don't need locks.
222 raise NotImplementedError
224 def DeclareLocks(self, level):
225 """Declare LU locking needs for a level
227 While most LUs can just declare their locking needs at ExpandNames time,
228 sometimes there's the need to calculate some locks after having acquired
229 the ones before. This function is called just before acquiring locks at a
230 particular level, but after acquiring the ones at lower levels, and permits
231 such calculations. It can be used to modify self.needed_locks, and by
232 default it does nothing.
234 This function is only called if you have something already set in
235 self.needed_locks for the level.
237 @param level: Locking level which is going to be locked
238 @type level: member of ganeti.locking.LEVELS
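A typical override looks similar to (illustrative only)::

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes(primary_only=True)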
242 def CheckPrereq(self):
243 """Check prerequisites for this LU.
245 This method should check that the prerequisites for the execution
246 of this LU are fulfilled. It can do internode communication, but
247 it should be idempotent - no cluster or system changes are
250 The method should raise errors.OpPrereqError in case something is
251 not fulfilled. Its return value is ignored.
253 This method should also update all the parameters of the opcode to
254 their canonical form if it hasn't been done by ExpandNames before.
257 if self.tasklets is not None:
258 for (idx, tl) in enumerate(self.tasklets):
259 logging.debug("Checking prerequisites for tasklet %s/%s",
260 idx + 1, len(self.tasklets))
265 def Exec(self, feedback_fn):
268 This method should implement the actual work. It should raise
269 errors.OpExecError for failures that are somewhat dealt with in
273 if self.tasklets is not None:
274 for (idx, tl) in enumerate(self.tasklets):
275 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
278 raise NotImplementedError
280 def BuildHooksEnv(self):
281 """Build hooks environment for this LU.
284 @return: Dictionary containing the environment that will be used for
285 running the hooks for this LU. The keys of the dict must not be prefixed
286 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
287 will extend the environment with additional variables. If no environment
288 should be defined, an empty dictionary should be returned (not C{None}).
289 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
293 raise NotImplementedError
295 def BuildHooksNodes(self):
296 """Build list of nodes to run LU's hooks.
298 @rtype: tuple; (list, list)
299 @return: Tuple containing a list of node names on which the hook
300 should run before the execution and a list of node names on which the
301 hook should run after the execution. No nodes should be returned as an
302 empty list (and not None).
303 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
307 raise NotImplementedError
309 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
310 """Notify the LU about the results of its hooks.
312 This method is called every time a hooks phase is executed, and notifies
313 the Logical Unit about the hooks' result. The LU can then use it to alter
314 its result based on the hooks. By default the method does nothing and the
315 previous result is passed back unchanged but any LU can define it if it
316 wants to use the local cluster hook-scripts somehow.
318 @param phase: one of L{constants.HOOKS_PHASE_POST} or
319 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
320 @param hook_results: the results of the multi-node hooks rpc call
321 @param feedback_fn: function used to send feedback back to the caller
322 @param lu_result: the previous Exec result this LU had, or None
324 @return: the new Exec result, based on the previous result
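An illustrative override that only reacts to post-phase hooks (a sketch,
not used by any LU in this excerpt)::

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    if phase == constants.HOOKS_PHASE_POST:
      feedback_fn("Post hooks have run")
    return lu_result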
328 # API must be kept, thus we ignore the 'unused argument' and 'could
329 # be a function' pylint warnings
330 # pylint: disable=W0613,R0201
333 def _ExpandAndLockInstance(self):
334 """Helper function to expand and lock an instance.
336 Many LUs that work on an instance take its name in self.op.instance_name
337 and need to expand it and then declare the expanded name for locking. This
338 function does it, and then updates self.op.instance_name to the expanded
339 name. It also initializes needed_locks as a dict, if this hasn't been done
343 if self.needed_locks is None:
344 self.needed_locks = {}
346 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
347 "_ExpandAndLockInstance called with instance-level locks set"
348 self.op.instance_name = _ExpandInstanceName(self.cfg,
349 self.op.instance_name)
350 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
352 def _LockInstancesNodes(self, primary_only=False):
353 """Helper function to declare instances' nodes for locking.
355 This function should be called after locking one or more instances to lock
356 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
357 with all primary or secondary nodes for instances already locked and
358 present in self.needed_locks[locking.LEVEL_INSTANCE].
360 It should be called from DeclareLocks, and for safety only works if
361 self.recalculate_locks[locking.LEVEL_NODE] is set.
363 In the future it may grow parameters to just lock some instance's nodes, or
364 to just lock primaries or secondary nodes, if needed.
366 It should be called from DeclareLocks in a way similar to::
368 if level == locking.LEVEL_NODE:
369 self._LockInstancesNodes()
371 @type primary_only: boolean
372 @param primary_only: only lock primary nodes of locked instances
375 assert locking.LEVEL_NODE in self.recalculate_locks, \
376 "_LockInstancesNodes helper function called with no nodes to recalculate"
378 # TODO: check if we've really been called with the instance locks held
380 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
381 # future we might want to have different behaviors depending on the value
382 # of self.recalculate_locks[locking.LEVEL_NODE]
384 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
385 for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
386 wanted_nodes.append(instance.primary_node)
388 wanted_nodes.extend(instance.secondary_nodes)
390 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
391 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
392 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
393 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
395 del self.recalculate_locks[locking.LEVEL_NODE]
398 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
399 """Simple LU which runs no hooks.
401 This LU is intended as a parent for other LogicalUnits which will
402 run no hooks, in order to reduce duplicate code.
408 def BuildHooksEnv(self):
409 """Empty BuildHooksEnv for NoHooksLu.
411 This just raises an error.
414 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
416 def BuildHooksNodes(self):
417 """Empty BuildHooksNodes for NoHooksLU.
420 raise AssertionError("BuildHooksNodes called for NoHooksLU")
424 """Tasklet base class.
426 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
427 they can mix legacy code with tasklets. Locking needs to be done in the LU,
428 tasklets know nothing about locks.
430 Subclasses must follow these rules:
431 - Implement CheckPrereq
435 def __init__(self, lu):
442 def CheckPrereq(self):
443 """Check prerequisites for this tasklets.
445 This method should check whether the prerequisites for the execution of
446 this tasklet are fulfilled. It can do internode communication, but it
447 should be idempotent - no cluster or system changes are allowed.
449 The method should raise errors.OpPrereqError in case something is not
450 fulfilled. Its return value is ignored.
452 This method should also update all parameters to their canonical form if it
453 hasn't been done before.
458 def Exec(self, feedback_fn):
459 """Execute the tasklet.
461 This method should implement the actual work. It should raise
462 errors.OpExecError for failures that are somewhat dealt with in code, or
466 raise NotImplementedError
470 """Base for query utility classes.
473 #: Attribute holding field definitions
476 def __init__(self, qfilter, fields, use_locking):
477 """Initializes this class.
480 self.use_locking = use_locking
482 self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
484 self.requested_data = self.query.RequestedData()
485 self.names = self.query.RequestedNames()
487 # Sort only if no names were requested
488 self.sort_by_name = not self.names
490 self.do_locking = None
493 def _GetNames(self, lu, all_names, lock_level):
494 """Helper function to determine names asked for in the query.
498 names = lu.owned_locks(lock_level)
502 if self.wanted == locking.ALL_SET:
503 assert not self.names
504 # caller didn't specify names, so ordering is not important
505 return utils.NiceSort(names)
507 # caller specified names and we must keep the same order
509 assert not self.do_locking or lu.glm.is_owned(lock_level)
511 missing = set(self.wanted).difference(names)
513 raise errors.OpExecError("Some items were removed before retrieving"
514 " their data: %s" % missing)
516 # Return expanded names
519 def ExpandNames(self, lu):
520 """Expand names for this query.
522 See L{LogicalUnit.ExpandNames}.
525 raise NotImplementedError()
527 def DeclareLocks(self, lu, level):
528 """Declare locks for this query.
530 See L{LogicalUnit.DeclareLocks}.
533 raise NotImplementedError()
535 def _GetQueryData(self, lu):
536 """Collects all data for this query.
538 @return: Query data object
541 raise NotImplementedError()
543 def NewStyleQuery(self, lu):
544 """Collect data and execute query.
547 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
548 sort_by_name=self.sort_by_name)
550 def OldStyleQuery(self, lu):
551 """Collect data and execute query.
554 return self.query.OldStyleQuery(self._GetQueryData(lu),
555 sort_by_name=self.sort_by_name)
559 """Returns a dict declaring all lock levels shared.
562 return dict.fromkeys(locking.LEVELS, 1)
565 def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
566 """Checks if the owned node groups are still correct for an instance.
568 @type cfg: L{config.ConfigWriter}
569 @param cfg: The cluster configuration
570 @type instance_name: string
571 @param instance_name: Instance name
572 @type owned_groups: set or frozenset
573 @param owned_groups: List of currently owned node groups
576 inst_groups = cfg.GetInstanceNodeGroups(instance_name)
578 if not owned_groups.issuperset(inst_groups):
579 raise errors.OpPrereqError("Instance %s's node groups changed since"
580 " locks were acquired, current groups are"
581 " are '%s', owning groups '%s'; retry the"
584 utils.CommaJoin(inst_groups),
585 utils.CommaJoin(owned_groups)),
591 def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
592 """Checks if the instances in a node group are still correct.
594 @type cfg: L{config.ConfigWriter}
595 @param cfg: The cluster configuration
596 @type group_uuid: string
597 @param group_uuid: Node group UUID
598 @type owned_instances: set or frozenset
599 @param owned_instances: List of currently owned instances
602 wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
603 if owned_instances != wanted_instances:
604 raise errors.OpPrereqError("Instances in node group '%s' changed since"
605 " locks were acquired, wanted '%s', have '%s';"
606 " retry the operation" %
608 utils.CommaJoin(wanted_instances),
609 utils.CommaJoin(owned_instances)),
612 return wanted_instances
615 def _SupportsOob(cfg, node):
616 """Tells if node supports OOB.
618 @type cfg: L{config.ConfigWriter}
619 @param cfg: The cluster configuration
620 @type node: L{objects.Node}
621 @param node: The node
622 @return: The OOB script if supported or an empty string otherwise
625 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
628 def _GetWantedNodes(lu, nodes):
629 """Returns list of checked and expanded node names.
631 @type lu: L{LogicalUnit}
632 @param lu: the logical unit on whose behalf we execute
634 @param nodes: list of node names or None for all nodes
636 @return: the list of nodes, sorted
637 @raise errors.ProgrammerError: if the nodes parameter is wrong type
641 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
643 return utils.NiceSort(lu.cfg.GetNodeList())
646 def _GetWantedInstances(lu, instances):
647 """Returns list of checked and expanded instance names.
649 @type lu: L{LogicalUnit}
650 @param lu: the logical unit on whose behalf we execute
651 @type instances: list
652 @param instances: list of instance names or None for all instances
654 @return: the list of instances, sorted
655 @raise errors.OpPrereqError: if the instances parameter is wrong type
656 @raise errors.OpPrereqError: if any of the passed instances is not found
660 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
662 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
666 def _GetUpdatedParams(old_params, update_dict,
667 use_default=True, use_none=False):
668 """Return the new version of a parameter dictionary.
670 @type old_params: dict
671 @param old_params: old parameters
672 @type update_dict: dict
673 @param update_dict: dict containing new parameter values, or
674 constants.VALUE_DEFAULT to reset the parameter to its default
676 @type use_default: boolean
677 @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
678 values as 'to be deleted' values
679 @type use_none: boolean
680 @param use_none: whether to recognise C{None} values as 'to be
683 @return: the new parameter dictionary
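Example (illustrative keys and values)::

  old = {"mem": 128, "vcpus": 1}
  new = _GetUpdatedParams(old, {"mem": constants.VALUE_DEFAULT,
                                "nics": 2})
  # new == {"vcpus": 1, "nics": 2}: "mem" reverts to its default
  # (i.e. is removed from the dict), "nics" is added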
686 params_copy = copy.deepcopy(old_params)
687 for key, val in update_dict.iteritems():
688 if ((use_default and val == constants.VALUE_DEFAULT) or
689 (use_none and val is None)):
695 params_copy[key] = val
699 def _ReleaseLocks(lu, level, names=None, keep=None):
700 """Releases locks owned by an LU.
702 @type lu: L{LogicalUnit}
703 @param level: Lock level
704 @type names: list or None
705 @param names: Names of locks to release
706 @type keep: list or None
707 @param keep: Names of locks to retain
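Example (illustrative; C{node_name} is a hypothetical variable)::

  # release all node locks except the one for node_name
  _ReleaseLocks(lu, locking.LEVEL_NODE, keep=[node_name])

  # release every node lock owned by the LU
  _ReleaseLocks(lu, locking.LEVEL_NODE)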
710 assert not (keep is not None and names is not None), \
711 "Only one of the 'names' and the 'keep' parameters can be given"
713 if names is not None:
714 should_release = names.__contains__
716 should_release = lambda name: name not in keep
718 should_release = None
724 # Determine which locks to release
725 for name in lu.owned_locks(level):
726 if should_release(name):
731 assert len(lu.owned_locks(level)) == (len(retain) + len(release))
733 # Release just some locks
734 lu.glm.release(level, names=release)
736 assert frozenset(lu.owned_locks(level)) == frozenset(retain)
739 lu.glm.release(level)
741 assert not lu.glm.is_owned(level), "No locks should be owned"
744 def _MapInstanceDisksToNodes(instances):
745 """Creates a map from (node, volume) to instance name.
747 @type instances: list of L{objects.Instance}
748 @rtype: dict; tuple of (node name, volume name) as key, instance name as value
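The result looks similar to (illustrative names and volumes)::

  {("node1.example.com", "xenvg/disk0"): "inst1.example.com",
   ("node2.example.com", "xenvg/disk0"): "inst1.example.com"}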
751 return dict(((node, vol), inst.name)
752 for inst in instances
753 for (node, vols) in inst.MapLVsByNode().items()
757 def _RunPostHook(lu, node_name):
758 """Runs the post-hook for an opcode on a single node.
761 hm = lu.proc.BuildHooksManager(lu)
763 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
765 # pylint: disable=W0702
766 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
769 def _CheckOutputFields(static, dynamic, selected):
770 """Checks whether all selected fields are valid.
772 @type static: L{utils.FieldSet}
773 @param static: static fields set
774 @type dynamic: L{utils.FieldSet}
775 @param dynamic: dynamic fields set
782 delta = f.NonMatching(selected)
784 raise errors.OpPrereqError("Unknown output fields selected: %s"
785 % ",".join(delta), errors.ECODE_INVAL)
788 def _CheckGlobalHvParams(params):
789 """Validates that given hypervisor params are not global ones.
791 This will ensure that instances don't get customised versions of
795 used_globals = constants.HVC_GLOBALS.intersection(params)
797 msg = ("The following hypervisor parameters are global and cannot"
798 " be customized at instance level, please modify them at"
799 " cluster level: %s" % utils.CommaJoin(used_globals))
800 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
803 def _CheckNodeOnline(lu, node, msg=None):
804 """Ensure that a given node is online.
806 @param lu: the LU on behalf of which we make the check
807 @param node: the node to check
808 @param msg: if passed, should be a message to replace the default one
809 @raise errors.OpPrereqError: if the node is offline
813 msg = "Can't use offline node"
814 if lu.cfg.GetNodeInfo(node).offline:
815 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
818 def _CheckNodeNotDrained(lu, node):
819 """Ensure that a given node is not drained.
821 @param lu: the LU on behalf of which we make the check
822 @param node: the node to check
823 @raise errors.OpPrereqError: if the node is drained
826 if lu.cfg.GetNodeInfo(node).drained:
827 raise errors.OpPrereqError("Can't use drained node %s" % node,
831 def _CheckNodeVmCapable(lu, node):
832 """Ensure that a given node is vm capable.
834 @param lu: the LU on behalf of which we make the check
835 @param node: the node to check
836 @raise errors.OpPrereqError: if the node is not vm capable
839 if not lu.cfg.GetNodeInfo(node).vm_capable:
840 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
844 def _CheckNodeHasOS(lu, node, os_name, force_variant):
845 """Ensure that a node supports a given OS.
847 @param lu: the LU on behalf of which we make the check
848 @param node: the node to check
849 @param os_name: the OS to query about
850 @param force_variant: whether to ignore variant errors
851 @raise errors.OpPrereqError: if the node does not support the OS
854 result = lu.rpc.call_os_get(node, os_name)
855 result.Raise("OS '%s' not in supported OS list for node %s" %
857 prereq=True, ecode=errors.ECODE_INVAL)
858 if not force_variant:
859 _CheckOSVariant(result.payload, os_name)
862 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
863 """Ensure that a node has the given secondary ip.
865 @type lu: L{LogicalUnit}
866 @param lu: the LU on behalf of which we make the check
868 @param node: the node to check
869 @type secondary_ip: string
870 @param secondary_ip: the ip to check
871 @type prereq: boolean
872 @param prereq: whether to throw a prerequisite or an execute error
873 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
874 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
877 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
878 result.Raise("Failure checking secondary ip on node %s" % node,
879 prereq=prereq, ecode=errors.ECODE_ENVIRON)
880 if not result.payload:
881 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
882 " please fix and re-run this command" % secondary_ip)
884 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
886 raise errors.OpExecError(msg)
889 def _GetClusterDomainSecret():
890 """Reads the cluster domain secret.
893 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
897 def _CheckInstanceDown(lu, instance, reason):
898 """Ensure that an instance is not running."""
899 if instance.admin_up:
900 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
901 (instance.name, reason), errors.ECODE_STATE)
903 pnode = instance.primary_node
904 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
905 ins_l.Raise("Can't contact node %s for instance information" % pnode,
906 prereq=True, ecode=errors.ECODE_ENVIRON)
908 if instance.name in ins_l.payload:
909 raise errors.OpPrereqError("Instance %s is running, %s" %
910 (instance.name, reason), errors.ECODE_STATE)
913 def _ExpandItemName(fn, name, kind):
914 """Expand an item name.
916 @param fn: the function to use for expansion
917 @param name: requested item name
918 @param kind: text description ('Node' or 'Instance')
919 @return: the resolved (full) name
920 @raise errors.OpPrereqError: if the item is not found
924 if full_name is None:
925 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
930 def _ExpandNodeName(cfg, name):
931 """Wrapper over L{_ExpandItemName} for nodes."""
932 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
935 def _ExpandInstanceName(cfg, name):
936 """Wrapper over L{_ExpandItemName} for instance."""
937 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
940 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
941 memory, vcpus, nics, disk_template, disks,
942 bep, hvp, hypervisor_name, tags):
943 """Builds instance related env variables for hooks
945 This builds the hook environment from individual variables.
948 @param name: the name of the instance
949 @type primary_node: string
950 @param primary_node: the name of the instance's primary node
951 @type secondary_nodes: list
952 @param secondary_nodes: list of secondary nodes as strings
953 @type os_type: string
954 @param os_type: the name of the instance's OS
955 @type status: boolean
956 @param status: the should_run status of the instance
958 @param memory: the memory size of the instance
960 @param vcpus: the count of VCPUs the instance has
962 @param nics: list of tuples (ip, mac, mode, link) representing
963 the NICs the instance has
964 @type disk_template: string
965 @param disk_template: the disk template of the instance
967 @param disks: the list of (size, mode) pairs
969 @param bep: the backend parameters for the instance
971 @param hvp: the hypervisor parameters for the instance
972 @type hypervisor_name: string
973 @param hypervisor_name: the hypervisor for the instance
975 @param tags: list of instance tags as strings
977 @return: the hook environment for this instance
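For a single-NIC, single-disk instance the resulting dictionary contains,
among others, entries like (illustrative values)::

  "INSTANCE_NAME": "inst1.example.com"
  "INSTANCE_PRIMARY": "node1.example.com"
  "INSTANCE_NIC_COUNT": 1
  "INSTANCE_NIC0_MODE": "bridged"
  "INSTANCE_DISK_COUNT": 1
  "INSTANCE_DISK0_SIZE": 1024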
986 "INSTANCE_NAME": name,
987 "INSTANCE_PRIMARY": primary_node,
988 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
989 "INSTANCE_OS_TYPE": os_type,
990 "INSTANCE_STATUS": str_status,
991 "INSTANCE_MEMORY": memory,
992 "INSTANCE_VCPUS": vcpus,
993 "INSTANCE_DISK_TEMPLATE": disk_template,
994 "INSTANCE_HYPERVISOR": hypervisor_name,
998 nic_count = len(nics)
999 for idx, (ip, mac, mode, link) in enumerate(nics):
1002 env["INSTANCE_NIC%d_IP" % idx] = ip
1003 env["INSTANCE_NIC%d_MAC" % idx] = mac
1004 env["INSTANCE_NIC%d_MODE" % idx] = mode
1005 env["INSTANCE_NIC%d_LINK" % idx] = link
1006 if mode == constants.NIC_MODE_BRIDGED:
1007 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
1011 env["INSTANCE_NIC_COUNT"] = nic_count
1014 disk_count = len(disks)
1015 for idx, (size, mode) in enumerate(disks):
1016 env["INSTANCE_DISK%d_SIZE" % idx] = size
1017 env["INSTANCE_DISK%d_MODE" % idx] = mode
1021 env["INSTANCE_DISK_COUNT"] = disk_count
1026 env["INSTANCE_TAGS"] = " ".join(tags)
1028 for source, kind in [(bep, "BE"), (hvp, "HV")]:
1029 for key, value in source.items():
1030 env["INSTANCE_%s_%s" % (kind, key)] = value
1035 def _NICListToTuple(lu, nics):
1036 """Build a list of nic information tuples.
1038 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
1039 value in LUInstanceQueryData.
1041 @type lu: L{LogicalUnit}
1042 @param lu: the logical unit on whose behalf we execute
1043 @type nics: list of L{objects.NIC}
1044 @param nics: list of nics to convert to hooks tuples
1048 cluster = lu.cfg.GetClusterInfo()
1052 filled_params = cluster.SimpleFillNIC(nic.nicparams)
1053 mode = filled_params[constants.NIC_MODE]
1054 link = filled_params[constants.NIC_LINK]
1055 hooks_nics.append((ip, mac, mode, link))
1059 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
1060 """Builds instance related env variables for hooks from an object.
1062 @type lu: L{LogicalUnit}
1063 @param lu: the logical unit on whose behalf we execute
1064 @type instance: L{objects.Instance}
1065 @param instance: the instance for which we should build the
1067 @type override: dict
1068 @param override: dictionary with key/values that will override
1071 @return: the hook environment dictionary
1074 cluster = lu.cfg.GetClusterInfo()
1075 bep = cluster.FillBE(instance)
1076 hvp = cluster.FillHV(instance)
1078 "name": instance.name,
1079 "primary_node": instance.primary_node,
1080 "secondary_nodes": instance.secondary_nodes,
1081 "os_type": instance.os,
1082 "status": instance.admin_up,
1083 "memory": bep[constants.BE_MEMORY],
1084 "vcpus": bep[constants.BE_VCPUS],
1085 "nics": _NICListToTuple(lu, instance.nics),
1086 "disk_template": instance.disk_template,
1087 "disks": [(disk.size, disk.mode) for disk in instance.disks],
1090 "hypervisor_name": instance.hypervisor,
1091 "tags": instance.tags,
1094 args.update(override)
1095 return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
1098 def _AdjustCandidatePool(lu, exceptions):
1099 """Adjust the candidate pool after node operations.
1102 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1104 lu.LogInfo("Promoted nodes to master candidate role: %s",
1105 utils.CommaJoin(node.name for node in mod_list))
1106 for name in mod_list:
1107 lu.context.ReaddNode(name)
1108 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1110 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1114 def _DecideSelfPromotion(lu, exceptions=None):
1115 """Decide whether I should promote myself as a master candidate.
1118 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1119 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1120 # the new node will increase mc_max with one, so:
1121 mc_should = min(mc_should + 1, cp_size)
1122 return mc_now < mc_should
1125 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1126 """Check that the brigdes needed by a list of nics exist.
1129 cluster = lu.cfg.GetClusterInfo()
1130 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1131 brlist = [params[constants.NIC_LINK] for params in paramslist
1132 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1134 result = lu.rpc.call_bridges_exist(target_node, brlist)
1135 result.Raise("Error checking bridges on destination node '%s'" %
1136 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1139 def _CheckInstanceBridgesExist(lu, instance, node=None):
1140 """Check that the brigdes needed by an instance exist.
1144 node = instance.primary_node
1145 _CheckNicsBridgesExist(lu, instance.nics, node)
1148 def _CheckOSVariant(os_obj, name):
1149 """Check whether an OS name conforms to the os variants specification.
1151 @type os_obj: L{objects.OS}
1152 @param os_obj: OS object to check
1154 @param name: OS name passed by the user, to check for validity
1157 variant = objects.OS.GetVariant(name)
1158 if not os_obj.supported_variants:
1160 raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
1161 " passed)" % (os_obj.name, variant),
1165 raise errors.OpPrereqError("OS name must include a variant",
1168 if variant not in os_obj.supported_variants:
1169 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1172 def _GetNodeInstancesInner(cfg, fn):
1173 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1176 def _GetNodeInstances(cfg, node_name):
1177 """Returns a list of all primary and secondary instances on a node.
1181 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1184 def _GetNodePrimaryInstances(cfg, node_name):
1185 """Returns primary instances on a node.
1188 return _GetNodeInstancesInner(cfg,
1189 lambda inst: node_name == inst.primary_node)
1192 def _GetNodeSecondaryInstances(cfg, node_name):
1193 """Returns secondary instances on a node.
1196 return _GetNodeInstancesInner(cfg,
1197 lambda inst: node_name in inst.secondary_nodes)
1200 def _GetStorageTypeArgs(cfg, storage_type):
1201 """Returns the arguments for a storage type.
1204 # Special case for file storage
1205 if storage_type == constants.ST_FILE:
1206 # storage.FileStorage wants a list of storage directories
1207 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1212 def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq):
1215 for dev in instance.disks:
1216 cfg.SetDiskID(dev, node_name)
1218 result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks)
1219 result.Raise("Failed to get disk status from node %s" % node_name,
1220 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1222 for idx, bdev_status in enumerate(result.payload):
1223 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1229 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1230 """Check the sanity of iallocator and node arguments and use the
1231 cluster-wide iallocator if appropriate.
1233 Check that at most one of (iallocator, node) is specified. If none is
1234 specified, then the LU's opcode's iallocator slot is filled with the
1235 cluster-wide default iallocator.
1237 @type iallocator_slot: string
1238 @param iallocator_slot: the name of the opcode iallocator slot
1239 @type node_slot: string
1240 @param node_slot: the name of the opcode target node slot
1243 node = getattr(lu.op, node_slot, None)
1244 iallocator = getattr(lu.op, iallocator_slot, None)
1246 if node is not None and iallocator is not None:
1247 raise errors.OpPrereqError("Do not specify both an iallocator and a node",
1249 elif node is None and iallocator is None:
1250 default_iallocator = lu.cfg.GetDefaultIAllocator()
1251 if default_iallocator:
1252 setattr(lu.op, iallocator_slot, default_iallocator)
1254 raise errors.OpPrereqError("No iallocator or node given and no"
1255 " cluster-wide default iallocator found;"
1256 " please specify either an iallocator or a"
1257 " node, or set a cluster-wide default"
1261 def _GetDefaultIAllocator(cfg, iallocator):
1262 """Decides on which iallocator to use.
1264 @type cfg: L{config.ConfigWriter}
1265 @param cfg: Cluster configuration object
1266 @type iallocator: string or None
1267 @param iallocator: Iallocator specified in opcode
1269 @return: Iallocator name
1273 # Use default iallocator
1274 iallocator = cfg.GetDefaultIAllocator()
1277 raise errors.OpPrereqError("No iallocator was specified, neither in the"
1278 " opcode nor as a cluster-wide default",
1284 class LUClusterPostInit(LogicalUnit):
1285 """Logical unit for running hooks after cluster initialization.
1288 HPATH = "cluster-init"
1289 HTYPE = constants.HTYPE_CLUSTER
1291 def BuildHooksEnv(self):
1296 "OP_TARGET": self.cfg.GetClusterName(),
1299 def BuildHooksNodes(self):
1300 """Build hooks nodes.
1303 return ([], [self.cfg.GetMasterNode()])
1305 def Exec(self, feedback_fn):
1312 class LUClusterDestroy(LogicalUnit):
1313 """Logical unit for destroying the cluster.
1316 HPATH = "cluster-destroy"
1317 HTYPE = constants.HTYPE_CLUSTER
1319 def BuildHooksEnv(self):
1324 "OP_TARGET": self.cfg.GetClusterName(),
1327 def BuildHooksNodes(self):
1328 """Build hooks nodes.
1333 def CheckPrereq(self):
1334 """Check prerequisites.
1336 This checks whether the cluster is empty.
1338 Any errors are signaled by raising errors.OpPrereqError.
1341 master = self.cfg.GetMasterNode()
1343 nodelist = self.cfg.GetNodeList()
1344 if len(nodelist) != 1 or nodelist[0] != master:
1345 raise errors.OpPrereqError("There are still %d node(s) in"
1346 " this cluster." % (len(nodelist) - 1),
1348 instancelist = self.cfg.GetInstanceList()
1350 raise errors.OpPrereqError("There are still %d instance(s) in"
1351 " this cluster." % len(instancelist),
1354 def Exec(self, feedback_fn):
1355 """Destroys the cluster.
1358 master_params = self.cfg.GetMasterNetworkParameters()
1360 # Run post hooks on master node before it's removed
1361 _RunPostHook(self, master_params.name)
1363 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
1365 master_params.netmask,
1366 master_params.netdev,
1367 master_params.ip_family)
1368 result.Raise("Could not disable the master role")
1370 return master_params.name
1373 def _VerifyCertificate(filename):
1374 """Verifies a certificate for L{LUClusterVerifyConfig}.
1376 @type filename: string
1377 @param filename: Path to PEM file
1381 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1382 utils.ReadFile(filename))
1383 except Exception, err: # pylint: disable=W0703
1384 return (LUClusterVerifyConfig.ETYPE_ERROR,
1385 "Failed to load X509 certificate %s: %s" % (filename, err))
1388 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1389 constants.SSL_CERT_EXPIRATION_ERROR)
1392 fnamemsg = "While verifying %s: %s" % (filename, msg)
1397 return (None, fnamemsg)
1398 elif errcode == utils.CERT_WARNING:
1399 return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
1400 elif errcode == utils.CERT_ERROR:
1401 return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
1403 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1406 def _GetAllHypervisorParameters(cluster, instances):
1407 """Compute the set of all hypervisor parameters.
1409 @type cluster: L{objects.Cluster}
1410 @param cluster: the cluster object
1411 @type instances: list of L{objects.Instance}
1412 @param instances: additional instances from which to obtain parameters
1413 @rtype: list of (origin, hypervisor, parameters)
1414 @return: a list with all parameters found, indicating the hypervisor they
1415 apply to, and the origin (can be "cluster", "os X", or "instance Y")
1420 for hv_name in cluster.enabled_hypervisors:
1421 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1423 for os_name, os_hvp in cluster.os_hvp.items():
1424 for hv_name, hv_params in os_hvp.items():
1426 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1427 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1429 # TODO: collapse identical parameter values in a single one
1430 for instance in instances:
1431 if instance.hvparams:
1432 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1433 cluster.FillHV(instance)))
1438 class _VerifyErrors(object):
1439 """Mix-in for cluster/group verify LUs.
1441 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1442 self.op and self._feedback_fn to be available.)
1446 ETYPE_FIELD = "code"
1447 ETYPE_ERROR = "ERROR"
1448 ETYPE_WARNING = "WARNING"
1450 def _Error(self, ecode, item, msg, *args, **kwargs):
1451 """Format an error message.
1453 Based on the opcode's error_codes parameter, either format a
1454 parseable error code, or a simpler error string.
1456 This must be called only from Exec and functions called from Exec.
1459 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1460 itype, etxt, _ = ecode
1461 # first complete the msg
1464 # then format the whole message
1465 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
1466 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1472 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1473 # and finally report it via the feedback_fn
1474 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
1476 def _ErrorIf(self, cond, ecode, *args, **kwargs):
1477 """Log an error message if the passed condition is True.
1481 or self.op.debug_simulate_errors) # pylint: disable=E1101
1483 # If the error code is in the list of ignored errors, demote the error to a
1485 (_, etxt, _) = ecode
1486 if etxt in self.op.ignore_errors: # pylint: disable=E1101
1487 kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING
1490 self._Error(ecode, *args, **kwargs)
1492 # do not mark the operation as failed for WARN cases only
1493 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1494 self.bad = self.bad or cond
1497 class LUClusterVerify(NoHooksLU):
1498 """Submits all jobs necessary to verify the cluster.
1503 def ExpandNames(self):
1504 self.needed_locks = {}
1506 def Exec(self, feedback_fn):
1509 if self.op.group_name:
1510 groups = [self.op.group_name]
1511 depends_fn = lambda: None
1513 groups = self.cfg.GetNodeGroupList()
1515 # Verify global configuration
1517 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
1520 # Always depend on global verification
1521 depends_fn = lambda: [(-len(jobs), [])]
1523 jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
1524 ignore_errors=self.op.ignore_errors,
1525 depends=depends_fn())]
1526 for group in groups)
1528 # Fix up all parameters
1529 for op in itertools.chain(*jobs): # pylint: disable=W0142
1530 op.debug_simulate_errors = self.op.debug_simulate_errors
1531 op.verbose = self.op.verbose
1532 op.error_codes = self.op.error_codes
1534 op.skip_checks = self.op.skip_checks
1535 except AttributeError:
1536 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1538 return ResultWithJobs(jobs)
1541 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1542 """Verifies the cluster config.
1547 def _VerifyHVP(self, hvp_data):
1548 """Verifies locally the syntax of the hypervisor parameters.
1551 for item, hv_name, hv_params in hvp_data:
1552 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
1555 hv_class = hypervisor.GetHypervisor(hv_name)
1556 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1557 hv_class.CheckParameterSyntax(hv_params)
1558 except errors.GenericError, err:
1559 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
1561 def ExpandNames(self):
1562 # Information can be safely retrieved as the BGL is acquired in exclusive
1564 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
1565 self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
1566 self.all_node_info = self.cfg.GetAllNodesInfo()
1567 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1568 self.needed_locks = {}
1570 def Exec(self, feedback_fn):
1571 """Verify integrity of cluster, performing various test on nodes.
1575 self._feedback_fn = feedback_fn
1577 feedback_fn("* Verifying cluster config")
1579 for msg in self.cfg.VerifyConfig():
1580 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
1582 feedback_fn("* Verifying cluster certificate files")
1584 for cert_filename in constants.ALL_CERT_FILES:
1585 (errcode, msg) = _VerifyCertificate(cert_filename)
1586 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)
1588 feedback_fn("* Verifying hypervisor parameters")
1590 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1591 self.all_inst_info.values()))
1593 feedback_fn("* Verifying all nodes belong to an existing group")
1595 # We do this verification here because, should this bogus circumstance
1596 # occur, it would never be caught by VerifyGroup, which only acts on
1597 # nodes/instances reachable from existing node groups.
1599 dangling_nodes = set(node.name for node in self.all_node_info.values()
1600 if node.group not in self.all_group_info)
1602 dangling_instances = {}
1603 no_node_instances = []
1605 for inst in self.all_inst_info.values():
1606 if inst.primary_node in dangling_nodes:
1607 dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
1608 elif inst.primary_node not in self.all_node_info:
1609 no_node_instances.append(inst.name)
1614 utils.CommaJoin(dangling_instances.get(node.name,
1616 for node in dangling_nodes]
1618 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
1620 "the following nodes (and their instances) belong to a non"
1621 " existing group: %s", utils.CommaJoin(pretty_dangling))
1623 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
1625 "the following instances have a non-existing primary-node:"
1626 " %s", utils.CommaJoin(no_node_instances))
1631 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1632 """Verifies the status of a node group.
1635 HPATH = "cluster-verify"
1636 HTYPE = constants.HTYPE_CLUSTER
1639 _HOOKS_INDENT_RE = re.compile("^", re.M)
1641 class NodeImage(object):
1642 """A class representing the logical and physical status of a node.
1645 @ivar name: the node name to which this object refers
1646 @ivar volumes: a structure as returned from
1647 L{ganeti.backend.GetVolumeList} (runtime)
1648 @ivar instances: a list of running instances (runtime)
1649 @ivar pinst: list of configured primary instances (config)
1650 @ivar sinst: list of configured secondary instances (config)
1651 @ivar sbp: dictionary of {primary-node: list of instances} for all
1652 instances for which this node is secondary (config)
1653 @ivar mfree: free memory, as reported by hypervisor (runtime)
1654 @ivar dfree: free disk, as reported by the node (runtime)
1655 @ivar offline: the offline status (config)
1656 @type rpc_fail: boolean
1657 @ivar rpc_fail: whether the RPC verify call was successful (overall,
1658 not whether the individual keys were correct) (runtime)
1659 @type lvm_fail: boolean
1660 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1661 @type hyp_fail: boolean
1662 @ivar hyp_fail: whether the RPC call didn't return the instance list
1663 @type ghost: boolean
1664 @ivar ghost: whether this is a known node or not (config)
1665 @type os_fail: boolean
1666 @ivar os_fail: whether the RPC call didn't return valid OS data
1668 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1669 @type vm_capable: boolean
1670 @ivar vm_capable: whether the node can host instances
1673 def __init__(self, offline=False, name=None, vm_capable=True):
1682 self.offline = offline
1683 self.vm_capable = vm_capable
1684 self.rpc_fail = False
1685 self.lvm_fail = False
1686 self.hyp_fail = False
1688 self.os_fail = False
1691 def ExpandNames(self):
1692 # This raises errors.OpPrereqError on its own:
1693 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
1695 # Get instances in node group; this is unsafe and needs verification later
1696 inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
1698 self.needed_locks = {
1699 locking.LEVEL_INSTANCE: inst_names,
1700 locking.LEVEL_NODEGROUP: [self.group_uuid],
1701 locking.LEVEL_NODE: [],
1704 self.share_locks = _ShareAll()
1706 def DeclareLocks(self, level):
1707 if level == locking.LEVEL_NODE:
1708 # Get members of node group; this is unsafe and needs verification later
1709 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
1711 all_inst_info = self.cfg.GetAllInstancesInfo()
1713 # In Exec(), we warn about mirrored instances that have primary and
1714 # secondary living in separate node groups. To fully verify that
1715 # volumes for these instances are healthy, we will need to do an
1716 # extra call to their secondaries. We ensure here those nodes will
1718 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
1719 # Important: access only the instances whose lock is owned
1720 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
1721 nodes.update(all_inst_info[inst].secondary_nodes)
1723 self.needed_locks[locking.LEVEL_NODE] = nodes
1725 def CheckPrereq(self):
1726 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1727 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1729 group_nodes = set(self.group_info.members)
1730 group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
1733 group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1735 unlocked_instances = \
1736 group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
1739 raise errors.OpPrereqError("Missing lock for nodes: %s" %
1740 utils.CommaJoin(unlocked_nodes))
1742 if unlocked_instances:
1743 raise errors.OpPrereqError("Missing lock for instances: %s" %
1744 utils.CommaJoin(unlocked_instances))
1746 self.all_node_info = self.cfg.GetAllNodesInfo()
1747 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1749 self.my_node_names = utils.NiceSort(group_nodes)
1750 self.my_inst_names = utils.NiceSort(group_instances)
1752 self.my_node_info = dict((name, self.all_node_info[name])
1753 for name in self.my_node_names)
1755 self.my_inst_info = dict((name, self.all_inst_info[name])
1756 for name in self.my_inst_names)
1758 # We detect here the nodes that will need the extra RPC calls for verifying
1759 # split LV volumes; they should be locked.
1760 extra_lv_nodes = set()
1762 for inst in self.my_inst_info.values():
1763 if inst.disk_template in constants.DTS_INT_MIRROR:
1764 group = self.my_node_info[inst.primary_node].group
1765 for nname in inst.secondary_nodes:
1766 if self.all_node_info[nname].group != group:
1767 extra_lv_nodes.add(nname)
1769 unlocked_lv_nodes = \
1770 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1772 if unlocked_lv_nodes:
1773 raise errors.OpPrereqError("these nodes could be locked: %s" %
1774 utils.CommaJoin(unlocked_lv_nodes))
1775 self.extra_lv_nodes = list(extra_lv_nodes)
1777 def _VerifyNode(self, ninfo, nresult):
1778 """Perform some basic validation on data returned from a node.
1780 - check the result data structure is well formed and has all the
1782 - check ganeti version
1784 @type ninfo: L{objects.Node}
1785 @param ninfo: the node to check
1786 @param nresult: the results from the node
1788 @return: whether overall this call was successful (and we can expect
1789 reasonable values in the response)
1793 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1795 # main result, nresult should be a non-empty dict
1796 test = not nresult or not isinstance(nresult, dict)
1797 _ErrorIf(test, constants.CV_ENODERPC, node,
1798 "unable to verify node: no data returned")
1802 # compares ganeti version
1803 local_version = constants.PROTOCOL_VERSION
1804 remote_version = nresult.get("version", None)
1805 test = not (remote_version and
1806 isinstance(remote_version, (list, tuple)) and
1807 len(remote_version) == 2)
1808 _ErrorIf(test, constants.CV_ENODERPC, node,
1809 "connection to node returned invalid data")
1813 test = local_version != remote_version[0]
1814 _ErrorIf(test, constants.CV_ENODEVERSION, node,
1815 "incompatible protocol versions: master %s,"
1816 " node %s", local_version, remote_version[0])
1820 # node seems compatible, we can actually try to look into its results
1822 # full package version
1823 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1824 constants.CV_ENODEVERSION, node,
1825 "software version mismatch: master %s, node %s",
1826 constants.RELEASE_VERSION, remote_version[1],
1827 code=self.ETYPE_WARNING)
1829 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1830 if ninfo.vm_capable and isinstance(hyp_result, dict):
1831 for hv_name, hv_result in hyp_result.iteritems():
1832 test = hv_result is not None
1833 _ErrorIf(test, constants.CV_ENODEHV, node,
1834 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1836 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1837 if ninfo.vm_capable and isinstance(hvp_result, list):
1838 for item, hv_name, hv_result in hvp_result:
1839 _ErrorIf(True, constants.CV_ENODEHV, node,
1840 "hypervisor %s parameter verify failure (source %s): %s",
1841 hv_name, item, hv_result)
1843 test = nresult.get(constants.NV_NODESETUP,
1844 ["Missing NODESETUP results"])
1845 _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
1850 def _VerifyNodeTime(self, ninfo, nresult,
1851 nvinfo_starttime, nvinfo_endtime):
1852 """Check the node time.
1854 @type ninfo: L{objects.Node}
1855 @param ninfo: the node to check
1856 @param nresult: the remote results for the node
1857 @param nvinfo_starttime: the start time of the RPC call
1858 @param nvinfo_endtime: the end time of the RPC call
1862 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1864 ntime = nresult.get(constants.NV_TIME, None)
1866 ntime_merged = utils.MergeTime(ntime)
1867 except (ValueError, TypeError):
1868 _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
1871 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1872 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1873 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1874 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1878 _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
1879 "Node time diverges by at least %s from master node time",
1882 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1883 """Check the node LVM results.
1885 @type ninfo: L{objects.Node}
1886 @param ninfo: the node to check
1887 @param nresult: the remote results for the node
1888 @param vg_name: the configured VG name
1895 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1897 # checks vg existence and size > 20G
1898 vglist = nresult.get(constants.NV_VGLIST, None)
1900 _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
1902 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1903 constants.MIN_VG_SIZE)
1904 _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)
1907 pvlist = nresult.get(constants.NV_PVLIST, None)
1908 test = pvlist is None
1909 _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
1911 # check that ':' is not present in PV names, since it's a
1912 # special character for lvcreate (denotes the range of PEs to
1914 for _, pvname, owner_vg in pvlist:
1915 test = ":" in pvname
1916 _ErrorIf(test, constants.CV_ENODELVM, node,
1917 "Invalid character ':' in PV '%s' of VG '%s'",
1920 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1921 """Check the node bridges.
1923 @type ninfo: L{objects.Node}
1924 @param ninfo: the node to check
1925 @param nresult: the remote results for the node
1926 @param bridges: the expected list of bridges
1933 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1935 missing = nresult.get(constants.NV_BRIDGES, None)
1936 test = not isinstance(missing, list)
1937 _ErrorIf(test, constants.CV_ENODENET, node,
1938 "did not return valid bridge information")
1940 _ErrorIf(bool(missing), constants.CV_ENODENET, node,
1941 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
1943 def _VerifyNodeNetwork(self, ninfo, nresult):
1944 """Check the node network connectivity results.
1946 @type ninfo: L{objects.Node}
1947 @param ninfo: the node to check
1948 @param nresult: the remote results for the node
1952 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1954 test = constants.NV_NODELIST not in nresult
1955 _ErrorIf(test, constants.CV_ENODESSH, node,
1956 "node hasn't returned node ssh connectivity data")
1958 if nresult[constants.NV_NODELIST]:
1959 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1960 _ErrorIf(True, constants.CV_ENODESSH, node,
1961 "ssh communication with node '%s': %s", a_node, a_msg)
1963 test = constants.NV_NODENETTEST not in nresult
1964 _ErrorIf(test, constants.CV_ENODENET, node,
1965 "node hasn't returned node tcp connectivity data")
1967 if nresult[constants.NV_NODENETTEST]:
1968 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1970 _ErrorIf(True, constants.CV_ENODENET, node,
1971 "tcp communication with node '%s': %s",
1972 anode, nresult[constants.NV_NODENETTEST][anode])
1974 test = constants.NV_MASTERIP not in nresult
1975 _ErrorIf(test, constants.CV_ENODENET, node,
1976 "node hasn't returned node master IP reachability data")
1978 if not nresult[constants.NV_MASTERIP]:
1979 if node == self.master_node:
1980 msg = "the master node cannot reach the master IP (not configured?)"
1982 msg = "cannot reach the master IP"
1983 _ErrorIf(True, constants.CV_ENODENET, node, msg)
1985 def _VerifyInstance(self, instance, instanceconfig, node_image,
1987 """Verify an instance.
1989 This function checks to see if the required block devices are
1990 available on the instance's node.
1993 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1994 node_current = instanceconfig.primary_node
1996 node_vol_should = {}
1997 instanceconfig.MapLVsByNode(node_vol_should)
1999 for node in node_vol_should:
2000 n_img = node_image[node]
2001 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2002 # ignore missing volumes on offline or broken nodes
2004 for volume in node_vol_should[node]:
2005 test = volume not in n_img.volumes
2006 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
2007 "volume %s missing on node %s", volume, node)
2009 if instanceconfig.admin_up:
2010 pri_img = node_image[node_current]
2011 test = instance not in pri_img.instances and not pri_img.offline
2012 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
2013 "instance not running on its primary node %s",
2016 diskdata = [(nname, success, status, idx)
2017 for (nname, disks) in diskstatus.items()
2018 for idx, (success, status) in enumerate(disks)]
2020 for nname, success, bdev_status, idx in diskdata:
2021 # the 'ghost node' construction in Exec() ensures that we have a
2022 # node here
2023 snode = node_image[nname]
2024 bad_snode = snode.ghost or snode.offline
2025 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2026 constants.CV_EINSTANCEFAULTYDISK, instance,
2027 "couldn't retrieve status for disk/%s on %s: %s",
2028 idx, nname, bdev_status)
2029 _ErrorIf((instanceconfig.admin_up and success and
2030 bdev_status.ldisk_status == constants.LDS_FAULTY),
2031 constants.CV_EINSTANCEFAULTYDISK, instance,
2032 "disk/%s on %s is faulty", idx, nname)
2034 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2035 """Verify if there are any unknown volumes in the cluster.
2037 The .os, .swap and backup volumes are ignored. All other volumes are
2038 reported as unknown.
2040 @type reserved: L{ganeti.utils.FieldSet}
2041 @param reserved: a FieldSet of reserved volume names
2044 for node, n_img in node_image.items():
2045 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2046 # skip non-healthy nodes
2048 for volume in n_img.volumes:
2049 test = ((node not in node_vol_should or
2050 volume not in node_vol_should[node]) and
2051 not reserved.Matches(volume))
2052 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2053 "volume %s is unknown", volume)
2055 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2056 """Verify N+1 Memory Resilience.
2058 Check that if one single node dies we can still start all the
2059 instances it was primary for.
2062 cluster_info = self.cfg.GetClusterInfo()
2063 for node, n_img in node_image.items():
2064 # This code checks that every node which is now listed as
2065 # secondary has enough memory to host all instances it is
2066 # supposed to, should a single other node in the cluster fail.
2067 # FIXME: not ready for failover to an arbitrary node
2068 # FIXME: does not support file-backed instances
2069 # WARNING: we currently take into account down instances as well
2070 # as up ones, considering that even if they're down someone
2071 # might want to start them even in the event of a node failure.
2073 # we're skipping offline nodes from the N+1 warning, since
2074 # most likely we don't have good memory information from them;
2075 # we already list instances living on such nodes, and that's
2076 # enough warning
2078 for prinode, instances in n_img.sbp.items():
2080 for instance in instances:
2081 bep = cluster_info.FillBE(instance_cfg[instance])
2082 if bep[constants.BE_AUTO_BALANCE]:
2083 needed_mem += bep[constants.BE_MEMORY]
2084 test = n_img.mfree < needed_mem
2085 self._ErrorIf(test, constants.CV_ENODEN1, node,
2086 "not enough memory to accomodate instance failovers"
2087 " should node %s fail (%dMiB needed, %dMiB available)",
2088 prinode, needed_mem, n_img.mfree)
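# Worked example (made-up numbers): if this node is secondary for two
# auto-balanced instances whose primary is node "node2", with BE_MEMORY of
# 1024 and 2048 MiB, the node must report at least 3072 MiB free via mfree,
# otherwise CV_ENODEN1 is raised naming "node2" as the failing primary.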
2091 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2092 (files_all, files_opt, files_mc, files_vm)):
2093 """Verifies file checksums collected from all nodes.
2095 @param errorif: Callback for reporting errors
2096 @param nodeinfo: List of L{objects.Node} objects
2097 @param master_node: Name of master node
2098 @param all_nvinfo: RPC results
2101 # Define functions determining which nodes to consider for a file
2104 (files_mc, lambda node: (node.master_candidate or
2105 node.name == master_node)),
2106 (files_vm, lambda node: node.vm_capable),
2109 # Build mapping from filename to list of nodes which should have the file
2111 for (files, fn) in files2nodefn:
2113 filenodes = nodeinfo
2115 filenodes = filter(fn, nodeinfo)
2116 nodefiles.update((filename,
2117 frozenset(map(operator.attrgetter("name"), filenodes)))
2118 for filename in files)
2120 assert set(nodefiles) == (files_all | files_mc | files_vm)
2122 fileinfo = dict((filename, {}) for filename in nodefiles)
2123 ignore_nodes = set()
2125 for node in nodeinfo:
2127 ignore_nodes.add(node.name)
2130 nresult = all_nvinfo[node.name]
2132 if nresult.fail_msg or not nresult.payload:
2135 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2137 test = not (node_files and isinstance(node_files, dict))
2138 errorif(test, constants.CV_ENODEFILECHECK, node.name,
2139 "Node did not return file checksum data")
2141 ignore_nodes.add(node.name)
2144 # Build per-checksum mapping from filename to nodes having it
2145 for (filename, checksum) in node_files.items():
2146 assert filename in nodefiles
2147 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2149 for (filename, checksums) in fileinfo.items():
2150 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2152 # Nodes having the file
2153 with_file = frozenset(node_name
2154 for nodes in fileinfo[filename].values()
2155 for node_name in nodes) - ignore_nodes
2157 expected_nodes = nodefiles[filename] - ignore_nodes
2159 # Nodes missing file
2160 missing_file = expected_nodes - with_file
2162 if filename in files_opt:
2164 errorif(missing_file and missing_file != expected_nodes,
2165 constants.CV_ECLUSTERFILECHECK, None,
2166 "File %s is optional, but it must exist on all or no"
2167 " nodes (not found on %s)",
2168 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2170 errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2171 "File %s is missing from node(s) %s", filename,
2172 utils.CommaJoin(utils.NiceSort(missing_file)))
2174 # Warn if a node has a file it shouldn't
2175 unexpected = with_file - expected_nodes
2177 constants.CV_ECLUSTERFILECHECK, None,
2178 "File %s should not exist on node(s) %s",
2179 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2181 # See if there are multiple versions of the file
2182 test = len(checksums) > 1
2184 variants = ["variant %s on %s" %
2185 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2186 for (idx, (checksum, nodes)) in
2187 enumerate(sorted(checksums.items()))]
2191 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2192 "File %s found with %s different checksums (%s)",
2193 filename, len(checksums), "; ".join(variants))
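# Illustrative shape of the structures built above (digest and node names are
# made up): nodefiles maps each filename to the frozenset of node names that
# should hold it, and fileinfo maps it to a per-checksum node set, e.g.
#   fileinfo[constants.SSH_KNOWN_HOSTS_FILE] == {"8d3b1f0a9c4e": {"node1",
#                                                                 "node2"}}
# More than one checksum key for a file yields the "different checksums" error.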
2195 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2197 """Verifies and the node DRBD status.
2199 @type ninfo: L{objects.Node}
2200 @param ninfo: the node to check
2201 @param nresult: the remote results for the node
2202 @param instanceinfo: the dict of instances
2203 @param drbd_helper: the configured DRBD usermode helper
2204 @param drbd_map: the DRBD map as returned by
2205 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2209 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2212 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2213 test = (helper_result is None)
2214 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2215 "no drbd usermode helper returned")
2217 status, payload = helper_result
2218 test = not status
2219 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2220 "drbd usermode helper check unsuccessful: %s", payload)
2221 test = status and (payload != drbd_helper)
2222 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2223 "wrong drbd usermode helper: %s", payload)
2225 # compute the DRBD minors
2226 node_drbd = {}
2227 for minor, instance in drbd_map[node].items():
2228 test = instance not in instanceinfo
2229 _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2230 "ghost instance '%s' in temporary DRBD map", instance)
2231 # ghost instance should not be running, but otherwise we
2232 # don't give double warnings (both ghost instance and
2233 # unallocated minor in use)
2235 node_drbd[minor] = (instance, False)
2237 instance = instanceinfo[instance]
2238 node_drbd[minor] = (instance.name, instance.admin_up)
2240 # and now check them
2241 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2242 test = not isinstance(used_minors, (tuple, list))
2243 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2244 "cannot parse drbd status file: %s", str(used_minors))
2246 # we cannot check drbd status
2249 for minor, (iname, must_exist) in node_drbd.items():
2250 test = minor not in used_minors and must_exist
2251 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2252 "drbd minor %d of instance %s is not active", minor, iname)
2253 for minor in used_minors:
2254 test = minor not in node_drbd
2255 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2256 "unallocated drbd minor %d is in use", minor)
2258 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2259 """Builds the node OS structures.
2261 @type ninfo: L{objects.Node}
2262 @param ninfo: the node to check
2263 @param nresult: the remote results for the node
2264 @param nimg: the node image object
2268 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2270 remote_os = nresult.get(constants.NV_OSLIST, None)
2271 test = (not isinstance(remote_os, list) or
2272 not compat.all(isinstance(v, list) and len(v) == 7
2273 for v in remote_os))
2275 _ErrorIf(test, constants.CV_ENODEOS, node,
2276 "node hasn't returned valid OS data")
2285 for (name, os_path, status, diagnose,
2286 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2288 if name not in os_dict:
2291 # parameters is a list of lists instead of list of tuples due to
2292 # JSON lacking a real tuple type, fix it:
2293 parameters = [tuple(v) for v in parameters]
2294 os_dict[name].append((os_path, status, diagnose,
2295 set(variants), set(parameters), set(api_ver)))
2297 nimg.oslist = os_dict
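# Illustrative NV_OSLIST entry (values made up): a 7-element list such as
#   ["debootstrap", "/srv/ganeti/os/debootstrap", True, "", ["default"],
#    [["kernel_path", "path to the kernel"]], [20]]
# so nimg.oslist maps each OS name to the entries found for it in the
# different search-path directories, with variants, parameters and API
# versions normalized into sets above.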
2299 def _VerifyNodeOS(self, ninfo, nimg, base):
2300 """Verifies the node OS list.
2302 @type ninfo: L{objects.Node}
2303 @param ninfo: the node to check
2304 @param nimg: the node image object
2305 @param base: the 'template' node we match against (e.g. from the master)
2309 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2311 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2313 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2314 for os_name, os_data in nimg.oslist.items():
2315 assert os_data, "Empty OS status for OS %s?!" % os_name
2316 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2317 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2318 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2319 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2320 "OS '%s' has multiple entries (first one shadows the rest): %s",
2321 os_name, utils.CommaJoin([v[0] for v in os_data]))
2322 # comparisons with the 'base' image
2323 test = os_name not in base.oslist
2324 _ErrorIf(test, constants.CV_ENODEOS, node,
2325 "Extra OS %s not present on reference node (%s)",
2329 assert base.oslist[os_name], "Base node has empty OS status?"
2330 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2332 # base OS is invalid, skipping
2334 for kind, a, b in [("API version", f_api, b_api),
2335 ("variants list", f_var, b_var),
2336 ("parameters", beautify_params(f_param),
2337 beautify_params(b_param))]:
2338 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2339 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2340 kind, os_name, base.name,
2341 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2343 # check any missing OSes
2344 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2345 _ErrorIf(missing, constants.CV_ENODEOS, node,
2346 "OSes present on reference node %s but missing on this node: %s",
2347 base.name, utils.CommaJoin(missing))
2349 def _VerifyOob(self, ninfo, nresult):
2350 """Verifies out of band functionality of a node.
2352 @type ninfo: L{objects.Node}
2353 @param ninfo: the node to check
2354 @param nresult: the remote results for the node
2358 # We just have to verify the paths on master and/or master candidates
2359 # as the oob helper is invoked on the master
2360 if ((ninfo.master_candidate or ninfo.master_capable) and
2361 constants.NV_OOB_PATHS in nresult):
2362 for path_result in nresult[constants.NV_OOB_PATHS]:
2363 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2365 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2366 """Verifies and updates the node volume data.
2368 This function will update a L{NodeImage}'s internal structures
2369 with data from the remote call.
2371 @type ninfo: L{objects.Node}
2372 @param ninfo: the node to check
2373 @param nresult: the remote results for the node
2374 @param nimg: the node image object
2375 @param vg_name: the configured VG name
2379 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2381 nimg.lvm_fail = True
2382 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2385 elif isinstance(lvdata, basestring):
2386 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2387 utils.SafeEncode(lvdata))
2388 elif not isinstance(lvdata, dict):
2389 _ErrorIf(True, constants.CV_ENODELVM, node,
2390 "rpc call to node failed (lvlist)")
2392 nimg.volumes = lvdata
2393 nimg.lvm_fail = False
2395 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2396 """Verifies and updates the node instance list.
2398 If the listing was successful, then updates this node's instance
2399 list. Otherwise, it marks the RPC call as failed for the instance
2400 list.
2402 @type ninfo: L{objects.Node}
2403 @param ninfo: the node to check
2404 @param nresult: the remote results for the node
2405 @param nimg: the node image object
2408 idata = nresult.get(constants.NV_INSTANCELIST, None)
2409 test = not isinstance(idata, list)
2410 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2411 "rpc call to node failed (instancelist): %s",
2412 utils.SafeEncode(str(idata)))
2414 nimg.hyp_fail = True
2416 nimg.instances = idata
2418 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2419 """Verifies and computes a node information map
2421 @type ninfo: L{objects.Node}
2422 @param ninfo: the node to check
2423 @param nresult: the remote results for the node
2424 @param nimg: the node image object
2425 @param vg_name: the configured VG name
2429 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2431 # try to read free memory (from the hypervisor)
2432 hv_info = nresult.get(constants.NV_HVINFO, None)
2433 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2434 _ErrorIf(test, constants.CV_ENODEHV, node,
2435 "rpc call to node failed (hvinfo)")
2438 nimg.mfree = int(hv_info["memory_free"])
2439 except (ValueError, TypeError):
2440 _ErrorIf(True, constants.CV_ENODERPC, node,
2441 "node returned invalid nodeinfo, check hypervisor")
2443 # FIXME: devise a free space model for file based instances as well
2444 if vg_name is not None:
2445 test = (constants.NV_VGLIST not in nresult or
2446 vg_name not in nresult[constants.NV_VGLIST])
2447 _ErrorIf(test, constants.CV_ENODELVM, node,
2448 "node didn't return data for the volume group '%s'"
2449 " - it is either missing or broken", vg_name)
2452 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2453 except (ValueError, TypeError):
2454 _ErrorIf(True, constants.CV_ENODERPC, node,
2455 "node returned invalid LVM info, check LVM status")
2457 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2458 """Gets per-disk status information for all instances.
2460 @type nodelist: list of strings
2461 @param nodelist: Node names
2462 @type node_image: dict of (name, L{objects.Node})
2463 @param node_image: Node objects
2464 @type instanceinfo: dict of (name, L{objects.Instance})
2465 @param instanceinfo: Instance objects
2466 @rtype: {instance: {node: [(success, payload)]}}
2467 @return: a dictionary of per-instance dictionaries with nodes as
2468 keys and disk information as values; the disk information is a
2469 list of tuples (success, payload)
2472 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2475 node_disks_devonly = {}
2476 diskless_instances = set()
2477 diskless = constants.DT_DISKLESS
2479 for nname in nodelist:
2480 node_instances = list(itertools.chain(node_image[nname].pinst,
2481 node_image[nname].sinst))
2482 diskless_instances.update(inst for inst in node_instances
2483 if instanceinfo[inst].disk_template == diskless)
2484 disks = [(inst, disk)
2485 for inst in node_instances
2486 for disk in instanceinfo[inst].disks]
2489 # No need to collect data
2492 node_disks[nname] = disks
2494 # Creating copies as SetDiskID below will modify the objects and that can
2495 # lead to incorrect data returned from nodes
2496 devonly = [dev.Copy() for (_, dev) in disks]
2499 self.cfg.SetDiskID(dev, nname)
2501 node_disks_devonly[nname] = devonly
2503 assert len(node_disks) == len(node_disks_devonly)
2505 # Collect data from all nodes with disks
2506 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2509 assert len(result) == len(node_disks)
2513 for (nname, nres) in result.items():
2514 disks = node_disks[nname]
2517 # No data from this node
2518 data = len(disks) * [(False, "node offline")]
2521 _ErrorIf(msg, constants.CV_ENODERPC, nname,
2522 "while getting disk information: %s", msg)
2524 # No data from this node
2525 data = len(disks) * [(False, msg)]
2528 for idx, i in enumerate(nres.payload):
2529 if isinstance(i, (tuple, list)) and len(i) == 2:
2532 logging.warning("Invalid result from node %s, entry %d: %s",
2534 data.append((False, "Invalid result from the remote node"))
2536 for ((inst, _), status) in zip(disks, data):
2537 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2539 # Add empty entries for diskless instances.
2540 for inst in diskless_instances:
2541 assert inst not in instdisk
2544 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2545 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2546 compat.all(isinstance(s, (tuple, list)) and
2547 len(s) == 2 for s in statuses)
2548 for inst, nnames in instdisk.items()
2549 for nname, statuses in nnames.items())
2550 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
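# Illustrative shape of the result (names and statuses made up):
#   instdisk == {"instance1": {"node1": [(True, status0), (True, status1)],
#                              "node2": [(False, "node offline"),
#                                        (False, "node offline")]}}
# i.e. one (success, payload) pair per disk, per node, per instance.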
2555 def _SshNodeSelector(group_uuid, all_nodes):
2556 """Create endless iterators for all potential SSH check hosts.
2559 nodes = [node for node in all_nodes
2560 if (node.group != group_uuid and
2562 keyfunc = operator.attrgetter("group")
2564 return map(itertools.cycle,
2565 [sorted(map(operator.attrgetter("name"), names))
2566 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2570 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2571 """Choose which nodes should talk to which other nodes.
2573 We will make nodes contact all nodes in their group, and one node from
2574 every other group.
2576 @warning: This algorithm has a known issue if one node group is much
2577 smaller than others (e.g. just one node). In such a case all other
2578 nodes will talk to the single node.
2581 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2582 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2584 return (online_nodes,
2585 dict((name, sorted([i.next() for i in sel]))
2586 for name in online_nodes))
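# Illustrative example (group and node names made up): when verifying group
# "g1" = {A, B} in a cluster that also has "g2" = {C, D}, the returned dict
# asks every online node of "g1" to contact one node drawn from each other
# group, with the per-group itertools.cycle iterators spreading the choice of
# remote targets across consecutive callers.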
2588 def BuildHooksEnv(self):
2591 Cluster-Verify hooks are run only in the post phase; if they fail, their
2592 output is logged in the verify output and the verification fails.
2596 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2599 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2600 for node in self.my_node_info.values())
2604 def BuildHooksNodes(self):
2605 """Build hooks nodes.
2608 return ([], self.my_node_names)
2610 def Exec(self, feedback_fn):
2611 """Verify integrity of the node group, performing various test on nodes.
2614 # This method has too many local variables. pylint: disable=R0914
2615 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2617 if not self.my_node_names:
2619 feedback_fn("* Empty node group, skipping verification")
2623 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2624 verbose = self.op.verbose
2625 self._feedback_fn = feedback_fn
2627 vg_name = self.cfg.GetVGName()
2628 drbd_helper = self.cfg.GetDRBDHelper()
2629 cluster = self.cfg.GetClusterInfo()
2630 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2631 hypervisors = cluster.enabled_hypervisors
2632 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2634 i_non_redundant = [] # Non redundant instances
2635 i_non_a_balanced = [] # Non auto-balanced instances
2636 n_offline = 0 # Count of offline nodes
2637 n_drained = 0 # Count of nodes being drained
2638 node_vol_should = {}
2640 # FIXME: verify OS list
2643 filemap = _ComputeAncillaryFiles(cluster, False)
2645 # do local checksums
2646 master_node = self.master_node = self.cfg.GetMasterNode()
2647 master_ip = self.cfg.GetMasterIP()
2649 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2651 node_verify_param = {
2652 constants.NV_FILELIST:
2653 utils.UniqueSequence(filename
2654 for files in filemap
2655 for filename in files),
2656 constants.NV_NODELIST:
2657 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2658 self.all_node_info.values()),
2659 constants.NV_HYPERVISOR: hypervisors,
2660 constants.NV_HVPARAMS:
2661 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2662 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2663 for node in node_data_list
2664 if not node.offline],
2665 constants.NV_INSTANCELIST: hypervisors,
2666 constants.NV_VERSION: None,
2667 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2668 constants.NV_NODESETUP: None,
2669 constants.NV_TIME: None,
2670 constants.NV_MASTERIP: (master_node, master_ip),
2671 constants.NV_OSLIST: None,
2672 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2675 if vg_name is not None:
2676 node_verify_param[constants.NV_VGLIST] = None
2677 node_verify_param[constants.NV_LVLIST] = vg_name
2678 node_verify_param[constants.NV_PVLIST] = [vg_name]
2679 node_verify_param[constants.NV_DRBDLIST] = None
2682 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2685 # FIXME: this needs to be changed per node-group, not cluster-wide
2687 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2688 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2689 bridges.add(default_nicpp[constants.NIC_LINK])
2690 for instance in self.my_inst_info.values():
2691 for nic in instance.nics:
2692 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2693 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2694 bridges.add(full_nic[constants.NIC_LINK])
2697 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2699 # Build our expected cluster state
2700 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2702 vm_capable=node.vm_capable))
2703 for node in node_data_list)
2707 for node in self.all_node_info.values():
2708 path = _SupportsOob(self.cfg, node)
2709 if path and path not in oob_paths:
2710 oob_paths.append(path)
2713 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2715 for instance in self.my_inst_names:
2716 inst_config = self.my_inst_info[instance]
2718 for nname in inst_config.all_nodes:
2719 if nname not in node_image:
2720 gnode = self.NodeImage(name=nname)
2721 gnode.ghost = (nname not in self.all_node_info)
2722 node_image[nname] = gnode
2724 inst_config.MapLVsByNode(node_vol_should)
2726 pnode = inst_config.primary_node
2727 node_image[pnode].pinst.append(instance)
2729 for snode in inst_config.secondary_nodes:
2730 nimg = node_image[snode]
2731 nimg.sinst.append(instance)
2732 if pnode not in nimg.sbp:
2733 nimg.sbp[pnode] = []
2734 nimg.sbp[pnode].append(instance)
2736 # At this point, we have the in-memory data structures complete,
2737 # except for the runtime information, which we'll gather next
2739 # Due to the way our RPC system works, exact response times cannot be
2740 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2741 # time before and after executing the request, we can at least have a time
2742 # window.
2743 nvinfo_starttime = time.time()
2744 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2746 self.cfg.GetClusterName())
2747 nvinfo_endtime = time.time()
2749 if self.extra_lv_nodes and vg_name is not None:
2751 self.rpc.call_node_verify(self.extra_lv_nodes,
2752 {constants.NV_LVLIST: vg_name},
2753 self.cfg.GetClusterName())
2755 extra_lv_nvinfo = {}
2757 all_drbd_map = self.cfg.ComputeDRBDMap()
2759 feedback_fn("* Gathering disk information (%s nodes)" %
2760 len(self.my_node_names))
2761 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2764 feedback_fn("* Verifying configuration file consistency")
2766 # If not all nodes are being checked, we need to make sure the master node
2767 # and a non-checked vm_capable node are in the list.
2768 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2770 vf_nvinfo = all_nvinfo.copy()
2771 vf_node_info = list(self.my_node_info.values())
2772 additional_nodes = []
2773 if master_node not in self.my_node_info:
2774 additional_nodes.append(master_node)
2775 vf_node_info.append(self.all_node_info[master_node])
2776 # Add the first vm_capable node we find which is not included
2777 for node in absent_nodes:
2778 nodeinfo = self.all_node_info[node]
2779 if nodeinfo.vm_capable and not nodeinfo.offline:
2780 additional_nodes.append(node)
2781 vf_node_info.append(self.all_node_info[node])
2783 key = constants.NV_FILELIST
2784 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2785 {key: node_verify_param[key]},
2786 self.cfg.GetClusterName()))
2788 vf_nvinfo = all_nvinfo
2789 vf_node_info = self.my_node_info.values()
2791 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2793 feedback_fn("* Verifying node status")
2797 for node_i in node_data_list:
2799 nimg = node_image[node]
2803 feedback_fn("* Skipping offline node %s" % (node,))
2807 if node == master_node:
2809 elif node_i.master_candidate:
2810 ntype = "master candidate"
2811 elif node_i.drained:
2817 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2819 msg = all_nvinfo[node].fail_msg
2820 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
2823 nimg.rpc_fail = True
2826 nresult = all_nvinfo[node].payload
2828 nimg.call_ok = self._VerifyNode(node_i, nresult)
2829 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2830 self._VerifyNodeNetwork(node_i, nresult)
2831 self._VerifyOob(node_i, nresult)
2834 self._VerifyNodeLVM(node_i, nresult, vg_name)
2835 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2838 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2839 self._UpdateNodeInstances(node_i, nresult, nimg)
2840 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2841 self._UpdateNodeOS(node_i, nresult, nimg)
2843 if not nimg.os_fail:
2844 if refos_img is None:
2846 self._VerifyNodeOS(node_i, nimg, refos_img)
2847 self._VerifyNodeBridges(node_i, nresult, bridges)
2849 # Check whether all running instances are primary for the node. (This
2850 # can no longer be done from _VerifyInstance below, since some of the
2851 # wrong instances could be from other node groups.)
2852 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2854 for inst in non_primary_inst:
2855 test = inst in self.all_inst_info
2856 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
2857 "instance should not run on node %s", node_i.name)
2858 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2859 "node is running unknown instance %s", inst)
2861 for node, result in extra_lv_nvinfo.items():
2862 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2863 node_image[node], vg_name)
2865 feedback_fn("* Verifying instance status")
2866 for instance in self.my_inst_names:
2868 feedback_fn("* Verifying instance %s" % instance)
2869 inst_config = self.my_inst_info[instance]
2870 self._VerifyInstance(instance, inst_config, node_image,
2872 inst_nodes_offline = []
2874 pnode = inst_config.primary_node
2875 pnode_img = node_image[pnode]
2876 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2877 constants.CV_ENODERPC, pnode, "instance %s, connection to"
2878 " primary node failed", instance)
2880 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2881 constants.CV_EINSTANCEBADNODE, instance,
2882 "instance is marked as running and lives on offline node %s",
2883 inst_config.primary_node)
2885 # If the instance is non-redundant we cannot survive losing its primary
2886 # node, so we are not N+1 compliant. On the other hand we have no disk
2887 # templates with more than one secondary so that situation is not well
2888 # supported either.
2889 # FIXME: does not support file-backed instances
2890 if not inst_config.secondary_nodes:
2891 i_non_redundant.append(instance)
2893 _ErrorIf(len(inst_config.secondary_nodes) > 1,
2894 constants.CV_EINSTANCELAYOUT,
2895 instance, "instance has multiple secondary nodes: %s",
2896 utils.CommaJoin(inst_config.secondary_nodes),
2897 code=self.ETYPE_WARNING)
2899 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2900 pnode = inst_config.primary_node
2901 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2902 instance_groups = {}
2904 for node in instance_nodes:
2905 instance_groups.setdefault(self.all_node_info[node].group,
2909 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2910 # Sort so that we always list the primary node first.
2911 for group, nodes in sorted(instance_groups.items(),
2912 key=lambda (_, nodes): pnode in nodes,
2915 self._ErrorIf(len(instance_groups) > 1,
2916 constants.CV_EINSTANCESPLITGROUPS,
2917 instance, "instance has primary and secondary nodes in"
2918 " different groups: %s", utils.CommaJoin(pretty_list),
2919 code=self.ETYPE_WARNING)
2921 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2922 i_non_a_balanced.append(instance)
2924 for snode in inst_config.secondary_nodes:
2925 s_img = node_image[snode]
2926 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
2927 snode, "instance %s, connection to secondary node failed",
2931 inst_nodes_offline.append(snode)
2933 # warn that the instance lives on offline nodes
2934 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
2935 "instance has offline secondary node(s) %s",
2936 utils.CommaJoin(inst_nodes_offline))
2937 # ... or ghost/non-vm_capable nodes
2938 for node in inst_config.all_nodes:
2939 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
2940 instance, "instance lives on ghost node %s", node)
2941 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
2942 instance, "instance lives on non-vm_capable node %s", node)
2944 feedback_fn("* Verifying orphan volumes")
2945 reserved = utils.FieldSet(*cluster.reserved_lvs)
2947 # We will get spurious "unknown volume" warnings if any node of this group
2948 # is secondary for an instance whose primary is in another group. To avoid
2949 # them, we find these instances and add their volumes to node_vol_should.
2950 for inst in self.all_inst_info.values():
2951 for secondary in inst.secondary_nodes:
2952 if (secondary in self.my_node_info
2953 and inst.name not in self.my_inst_info):
2954 inst.MapLVsByNode(node_vol_should)
2957 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2959 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2960 feedback_fn("* Verifying N+1 Memory redundancy")
2961 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2963 feedback_fn("* Other Notes")
2965 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2966 % len(i_non_redundant))
2968 if i_non_a_balanced:
2969 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2970 % len(i_non_a_balanced))
2973 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2976 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2980 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2981 """Analyze the post-hooks' result
2983 This method analyses the hook result, handles it, and sends some
2984 nicely-formatted feedback back to the user.
2986 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2987 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2988 @param hooks_results: the results of the multi-node hooks rpc call
2989 @param feedback_fn: function used to send feedback back to the caller
2990 @param lu_result: previous Exec result
2991 @return: the new Exec result, based on the previous result
2995 # We only really run POST phase hooks, only for non-empty groups,
2996 # and are only interested in their results
2997 if not self.my_node_names:
3000 elif phase == constants.HOOKS_PHASE_POST:
3001 # Used to change hooks' output to proper indentation
3002 feedback_fn("* Hooks Results")
3003 assert hooks_results, "invalid result from hooks"
3005 for node_name in hooks_results:
3006 res = hooks_results[node_name]
3008 test = msg and not res.offline
3009 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3010 "Communication failure in hooks execution: %s", msg)
3011 if res.offline or msg:
3012 # No need to investigate payload if node is offline or gave
3013 # an error.
3015 for script, hkr, output in res.payload:
3016 test = hkr == constants.HKR_FAIL
3017 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3018 "Script %s failed, output:", script)
3020 output = self._HOOKS_INDENT_RE.sub(" ", output)
3021 feedback_fn("%s" % output)
3027 class LUClusterVerifyDisks(NoHooksLU):
3028 """Verifies the cluster disks status.
3033 def ExpandNames(self):
3034 self.share_locks = _ShareAll()
3035 self.needed_locks = {
3036 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3039 def Exec(self, feedback_fn):
3040 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3042 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3043 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3044 for group in group_names])
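# Illustrative example (group names made up): on a cluster with node groups
# "default" and "ssd" this submits two independent jobs, each wrapping a
# single OpGroupVerifyDisks opcode for one group.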
3047 class LUGroupVerifyDisks(NoHooksLU):
3048 """Verifies the status of all disks in a node group.
3053 def ExpandNames(self):
3054 # Raises errors.OpPrereqError on its own if group can't be found
3055 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3057 self.share_locks = _ShareAll()
3058 self.needed_locks = {
3059 locking.LEVEL_INSTANCE: [],
3060 locking.LEVEL_NODEGROUP: [],
3061 locking.LEVEL_NODE: [],
3064 def DeclareLocks(self, level):
3065 if level == locking.LEVEL_INSTANCE:
3066 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3068 # Lock instances optimistically, needs verification once node and group
3069 # locks have been acquired
3070 self.needed_locks[locking.LEVEL_INSTANCE] = \
3071 self.cfg.GetNodeGroupInstances(self.group_uuid)
3073 elif level == locking.LEVEL_NODEGROUP:
3074 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3076 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3077 set([self.group_uuid] +
3078 # Lock all groups used by instances optimistically; this requires
3079 # going via the node before it's locked, requiring verification
3082 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3083 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3085 elif level == locking.LEVEL_NODE:
3086 # This will only lock the nodes in the group to be verified which contain
3087 # actual instances
3088 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3089 self._LockInstancesNodes()
3091 # Lock all nodes in group to be verified
3092 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3093 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3094 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3096 def CheckPrereq(self):
3097 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3098 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3099 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3101 assert self.group_uuid in owned_groups
3103 # Check if locked instances are still correct
3104 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3106 # Get instance information
3107 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3109 # Check if node groups for locked instances are still correct
3110 for (instance_name, inst) in self.instances.items():
3111 assert owned_nodes.issuperset(inst.all_nodes), \
3112 "Instance %s's nodes changed while we kept the lock" % instance_name
3114 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3117 assert self.group_uuid in inst_groups, \
3118 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3120 def Exec(self, feedback_fn):
3121 """Verify integrity of cluster disks.
3123 @rtype: tuple of three items
3124 @return: a tuple of (dict of node-to-node_error, list of instances
3125 which need activate-disks, dict of instance: (node, volume) for
3126 missing volumes
3130 res_instances = set()
3133 nv_dict = _MapInstanceDisksToNodes([inst
3134 for inst in self.instances.values()
3138 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3139 set(self.cfg.GetVmCapableNodeList()))
3141 node_lvs = self.rpc.call_lv_list(nodes, [])
3143 for (node, node_res) in node_lvs.items():
3144 if node_res.offline:
3147 msg = node_res.fail_msg
3149 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3150 res_nodes[node] = msg
3153 for lv_name, (_, _, lv_online) in node_res.payload.items():
3154 inst = nv_dict.pop((node, lv_name), None)
3155 if not (lv_online or inst is None):
3156 res_instances.add(inst)
3158 # any leftover items in nv_dict are missing LVs, let's arrange the data
3160 for key, inst in nv_dict.iteritems():
3161 res_missing.setdefault(inst, []).append(list(key))
3163 return (res_nodes, list(res_instances), res_missing)
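# Illustrative return value (node, instance and volume names made up):
#   ({"node3": "Connection failed"}, ["instance2"],
#    {"instance5": [["node1", "xenvg/disk0"]]})
# i.e. per-node errors, instances needing activate-disks, and missing LVs.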
3166 class LUClusterRepairDiskSizes(NoHooksLU):
3167 """Verifies the cluster disks sizes.
3172 def ExpandNames(self):
3173 if self.op.instances:
3174 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3175 self.needed_locks = {
3176 locking.LEVEL_NODE: [],
3177 locking.LEVEL_INSTANCE: self.wanted_names,
3179 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3181 self.wanted_names = None
3182 self.needed_locks = {
3183 locking.LEVEL_NODE: locking.ALL_SET,
3184 locking.LEVEL_INSTANCE: locking.ALL_SET,
3186 self.share_locks = _ShareAll()
3188 def DeclareLocks(self, level):
3189 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3190 self._LockInstancesNodes(primary_only=True)
3192 def CheckPrereq(self):
3193 """Check prerequisites.
3195 This only checks the optional instance list against the existing names.
3198 if self.wanted_names is None:
3199 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3201 self.wanted_instances = \
3202 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3204 def _EnsureChildSizes(self, disk):
3205 """Ensure children of the disk have the needed disk size.
3207 This is valid mainly for DRBD8 and fixes an issue where the
3208 children have smaller disk size.
3210 @param disk: an L{ganeti.objects.Disk} object
3213 if disk.dev_type == constants.LD_DRBD8:
3214 assert disk.children, "Empty children for DRBD8?"
3215 fchild = disk.children[0]
3216 mismatch = fchild.size < disk.size
3218 self.LogInfo("Child disk has size %d, parent %d, fixing",
3219 fchild.size, disk.size)
3220 fchild.size = disk.size
3222 # and we recurse on this child only, not on the metadev
3223 return self._EnsureChildSizes(fchild) or mismatch
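# Illustrative example (sizes made up): a DRBD8 disk recorded at 10240 whose
# data child reports 10176 gets the child grown to 10240 and True is returned,
# signalling to Exec below that the configuration needs to be written out.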
3227 def Exec(self, feedback_fn):
3228 """Verify the size of cluster disks.
3231 # TODO: check child disks too
3232 # TODO: check differences in size between primary/secondary nodes
3234 for instance in self.wanted_instances:
3235 pnode = instance.primary_node
3236 if pnode not in per_node_disks:
3237 per_node_disks[pnode] = []
3238 for idx, disk in enumerate(instance.disks):
3239 per_node_disks[pnode].append((instance, idx, disk))
3242 for node, dskl in per_node_disks.items():
3243 newl = [v[2].Copy() for v in dskl]
3245 self.cfg.SetDiskID(dsk, node)
3246 result = self.rpc.call_blockdev_getsize(node, newl)
3248 self.LogWarning("Failure in blockdev_getsize call to node"
3249 " %s, ignoring", node)
3251 if len(result.payload) != len(dskl):
3252 logging.warning("Invalid result from node %s: len(dskl)=%d,"
3253 " result.payload=%s", node, len(dskl), result.payload)
3254 self.LogWarning("Invalid result from node %s, ignoring node results",
3257 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3259 self.LogWarning("Disk %d of instance %s did not return size"
3260 " information, ignoring", idx, instance.name)
3262 if not isinstance(size, (int, long)):
3263 self.LogWarning("Disk %d of instance %s did not return valid"
3264 " size information, ignoring", idx, instance.name)
3267 if size != disk.size:
3268 self.LogInfo("Disk %d of instance %s has mismatched size,"
3269 " correcting: recorded %d, actual %d", idx,
3270 instance.name, disk.size, size)
3272 self.cfg.Update(instance, feedback_fn)
3273 changed.append((instance.name, idx, size))
3274 if self._EnsureChildSizes(disk):
3275 self.cfg.Update(instance, feedback_fn)
3276 changed.append((instance.name, idx, disk.size))
3280 class LUClusterRename(LogicalUnit):
3281 """Rename the cluster.
3284 HPATH = "cluster-rename"
3285 HTYPE = constants.HTYPE_CLUSTER
3287 def BuildHooksEnv(self):
3292 "OP_TARGET": self.cfg.GetClusterName(),
3293 "NEW_NAME": self.op.name,
3296 def BuildHooksNodes(self):
3297 """Build hooks nodes.
3300 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3302 def CheckPrereq(self):
3303 """Verify that the passed name is a valid one.
3306 hostname = netutils.GetHostname(name=self.op.name,
3307 family=self.cfg.GetPrimaryIPFamily())
3309 new_name = hostname.name
3310 self.ip = new_ip = hostname.ip
3311 old_name = self.cfg.GetClusterName()
3312 old_ip = self.cfg.GetMasterIP()
3313 if new_name == old_name and new_ip == old_ip:
3314 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3315 " cluster has changed",
3317 if new_ip != old_ip:
3318 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3319 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3320 " reachable on the network" %
3321 new_ip, errors.ECODE_NOTUNIQUE)
3323 self.op.name = new_name
3325 def Exec(self, feedback_fn):
3326 """Rename the cluster.
3329 clustername = self.op.name
3332 # shutdown the master IP
3333 master_params = self.cfg.GetMasterNetworkParameters()
3334 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3336 master_params.netmask,
3337 master_params.netdev,
3338 master_params.ip_family)
3339 result.Raise("Could not disable the master role")
3342 cluster = self.cfg.GetClusterInfo()
3343 cluster.cluster_name = clustername
3344 cluster.master_ip = new_ip
3345 self.cfg.Update(cluster, feedback_fn)
3347 # update the known hosts file
3348 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3349 node_list = self.cfg.GetOnlineNodeList()
3351 node_list.remove(master_params.name)
3354 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3356 result = self.rpc.call_node_activate_master_ip(master_params.name,
3358 master_params.netmask,
3359 master_params.netdev,
3360 master_params.ip_family)
3361 msg = result.fail_msg
3363 self.LogWarning("Could not re-enable the master role on"
3364 " the master, please restart manually: %s", msg)
3369 def _ValidateNetmask(cfg, netmask):
3370 """Checks if a netmask is valid.
3372 @type cfg: L{config.ConfigWriter}
3373 @param cfg: The cluster configuration
3375 @param netmask: the netmask to be verified
3376 @raise errors.OpPrereqError: if the validation fails
3379 ip_family = cfg.GetPrimaryIPFamily()
3381 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3382 except errors.ProgrammerError:
3383 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3385 if not ipcls.ValidateNetmask(netmask):
3386 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3390 class LUClusterSetParams(LogicalUnit):
3391 """Change the parameters of the cluster.
3394 HPATH = "cluster-modify"
3395 HTYPE = constants.HTYPE_CLUSTER
3398 def CheckArguments(self):
3402 if self.op.uid_pool:
3403 uidpool.CheckUidPool(self.op.uid_pool)
3405 if self.op.add_uids:
3406 uidpool.CheckUidPool(self.op.add_uids)
3408 if self.op.remove_uids:
3409 uidpool.CheckUidPool(self.op.remove_uids)
3411 if self.op.master_netmask is not None:
3412 _ValidateNetmask(self.cfg, self.op.master_netmask)
3414 def ExpandNames(self):
3415 # FIXME: in the future maybe other cluster params won't require checking on
3416 # all nodes to be modified.
3417 self.needed_locks = {
3418 locking.LEVEL_NODE: locking.ALL_SET,
3420 self.share_locks[locking.LEVEL_NODE] = 1
3422 def BuildHooksEnv(self):
3427 "OP_TARGET": self.cfg.GetClusterName(),
3428 "NEW_VG_NAME": self.op.vg_name,
3431 def BuildHooksNodes(self):
3432 """Build hooks nodes.
3435 mn = self.cfg.GetMasterNode()
3438 def CheckPrereq(self):
3439 """Check prerequisites.
3441 This checks that the given params don't conflict and
3442 that the given volume group is valid.
3445 if self.op.vg_name is not None and not self.op.vg_name:
3446 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3447 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3448 " instances exist", errors.ECODE_INVAL)
3450 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3451 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3452 raise errors.OpPrereqError("Cannot disable drbd helper while"
3453 " drbd-based instances exist",
3456 node_list = self.owned_locks(locking.LEVEL_NODE)
3458 # if vg_name not None, checks given volume group on all nodes
3460 vglist = self.rpc.call_vg_list(node_list)
3461 for node in node_list:
3462 msg = vglist[node].fail_msg
3464 # ignoring down node
3465 self.LogWarning("Error while gathering data on node %s"
3466 " (ignoring node): %s", node, msg)
3468 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3470 constants.MIN_VG_SIZE)
3472 raise errors.OpPrereqError("Error on node '%s': %s" %
3473 (node, vgstatus), errors.ECODE_ENVIRON)
3475 if self.op.drbd_helper:
3476 # checks given drbd helper on all nodes
3477 helpers = self.rpc.call_drbd_helper(node_list)
3478 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3480 self.LogInfo("Not checking drbd helper on offline node %s", node)
3482 msg = helpers[node].fail_msg
3484 raise errors.OpPrereqError("Error checking drbd helper on node"
3485 " '%s': %s" % (node, msg),
3486 errors.ECODE_ENVIRON)
3487 node_helper = helpers[node].payload
3488 if node_helper != self.op.drbd_helper:
3489 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3490 (node, node_helper), errors.ECODE_ENVIRON)
3492 self.cluster = cluster = self.cfg.GetClusterInfo()
3493 # validate params changes
3494 if self.op.beparams:
3495 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3496 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3498 if self.op.ndparams:
3499 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3500 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3502 # TODO: we need a more general way to handle resetting
3503 # cluster-level parameters to default values
3504 if self.new_ndparams["oob_program"] == "":
3505 self.new_ndparams["oob_program"] = \
3506 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3508 if self.op.nicparams:
3509 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3510 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3511 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3514 # check all instances for consistency
3515 for instance in self.cfg.GetAllInstancesInfo().values():
3516 for nic_idx, nic in enumerate(instance.nics):
3517 params_copy = copy.deepcopy(nic.nicparams)
3518 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3520 # check parameter syntax
3522 objects.NIC.CheckParameterSyntax(params_filled)
3523 except errors.ConfigurationError, err:
3524 nic_errors.append("Instance %s, nic/%d: %s" %
3525 (instance.name, nic_idx, err))
3527 # if we're moving instances to routed, check that they have an ip
3528 target_mode = params_filled[constants.NIC_MODE]
3529 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3530 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3531 " address" % (instance.name, nic_idx))
3533 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3534 "\n".join(nic_errors))
3536 # hypervisor list/parameters
3537 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3538 if self.op.hvparams:
3539 for hv_name, hv_dict in self.op.hvparams.items():
3540 if hv_name not in self.new_hvparams:
3541 self.new_hvparams[hv_name] = hv_dict
3543 self.new_hvparams[hv_name].update(hv_dict)
3545 # os hypervisor parameters
3546 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3548 for os_name, hvs in self.op.os_hvp.items():
3549 if os_name not in self.new_os_hvp:
3550 self.new_os_hvp[os_name] = hvs
3552 for hv_name, hv_dict in hvs.items():
3553 if hv_name not in self.new_os_hvp[os_name]:
3554 self.new_os_hvp[os_name][hv_name] = hv_dict
3556 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3559 self.new_osp = objects.FillDict(cluster.osparams, {})
3560 if self.op.osparams:
3561 for os_name, osp in self.op.osparams.items():
3562 if os_name not in self.new_osp:
3563 self.new_osp[os_name] = {}
3565 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3568 if not self.new_osp[os_name]:
3569 # we removed all parameters
3570 del self.new_osp[os_name]
3572 # check the parameter validity (remote check)
3573 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3574 os_name, self.new_osp[os_name])
3576 # changes to the hypervisor list
3577 if self.op.enabled_hypervisors is not None:
3578 self.hv_list = self.op.enabled_hypervisors
3579 for hv in self.hv_list:
3580 # if the hypervisor doesn't already exist in the cluster
3581 # hvparams, we initialize it to empty, and then (in both
3582 # cases) we make sure to fill the defaults, as we might not
3583 # have a complete defaults list if the hypervisor wasn't
3584 # enabled before
3585 if hv not in new_hvp:
3587 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3588 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3590 self.hv_list = cluster.enabled_hypervisors
3592 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3593 # either the enabled list has changed, or the parameters have, validate
3594 for hv_name, hv_params in self.new_hvparams.items():
3595 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3596 (self.op.enabled_hypervisors and
3597 hv_name in self.op.enabled_hypervisors)):
3598 # either this is a new hypervisor, or its parameters have changed
3599 hv_class = hypervisor.GetHypervisor(hv_name)
3600 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3601 hv_class.CheckParameterSyntax(hv_params)
3602 _CheckHVParams(self, node_list, hv_name, hv_params)
3605 # no need to check any newly-enabled hypervisors, since the
3606 # defaults have already been checked in the above code-block
3607 for os_name, os_hvp in self.new_os_hvp.items():
3608 for hv_name, hv_params in os_hvp.items():
3609 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3610 # we need to fill in the new os_hvp on top of the actual hv_p
3611 cluster_defaults = self.new_hvparams.get(hv_name, {})
3612 new_osp = objects.FillDict(cluster_defaults, hv_params)
3613 hv_class = hypervisor.GetHypervisor(hv_name)
3614 hv_class.CheckParameterSyntax(new_osp)
3615 _CheckHVParams(self, node_list, hv_name, new_osp)
3617 if self.op.default_iallocator:
3618 alloc_script = utils.FindFile(self.op.default_iallocator,
3619 constants.IALLOCATOR_SEARCH_PATH,
3621 if alloc_script is None:
3622 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3623 " specified" % self.op.default_iallocator,
3626 def Exec(self, feedback_fn):
3627 """Change the parameters of the cluster.
3630 if self.op.vg_name is not None:
3631 new_volume = self.op.vg_name
3634 if new_volume != self.cfg.GetVGName():
3635 self.cfg.SetVGName(new_volume)
3637 feedback_fn("Cluster LVM configuration already in desired"
3638 " state, not changing")
3639 if self.op.drbd_helper is not None:
3640 new_helper = self.op.drbd_helper
3643 if new_helper != self.cfg.GetDRBDHelper():
3644 self.cfg.SetDRBDHelper(new_helper)
3646 feedback_fn("Cluster DRBD helper already in desired state,"
3648 if self.op.hvparams:
3649 self.cluster.hvparams = self.new_hvparams
3651 self.cluster.os_hvp = self.new_os_hvp
3652 if self.op.enabled_hypervisors is not None:
3653 self.cluster.hvparams = self.new_hvparams
3654 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3655 if self.op.beparams:
3656 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3657 if self.op.nicparams:
3658 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3659 if self.op.osparams:
3660 self.cluster.osparams = self.new_osp
3661 if self.op.ndparams:
3662 self.cluster.ndparams = self.new_ndparams
3664 if self.op.candidate_pool_size is not None:
3665 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3666 # we need to update the pool size here, otherwise the save will fail
3667 _AdjustCandidatePool(self, [])
3669 if self.op.maintain_node_health is not None:
3670 self.cluster.maintain_node_health = self.op.maintain_node_health
3672 if self.op.prealloc_wipe_disks is not None:
3673 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3675 if self.op.add_uids is not None:
3676 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3678 if self.op.remove_uids is not None:
3679 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3681 if self.op.uid_pool is not None:
3682 self.cluster.uid_pool = self.op.uid_pool
3684 if self.op.default_iallocator is not None:
3685 self.cluster.default_iallocator = self.op.default_iallocator
3687 if self.op.reserved_lvs is not None:
3688 self.cluster.reserved_lvs = self.op.reserved_lvs
3690 def helper_os(aname, mods, desc):
3692 lst = getattr(self.cluster, aname)
3693 for key, val in mods:
3694 if key == constants.DDM_ADD:
3696 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3699 elif key == constants.DDM_REMOVE:
3703 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3705 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3707 if self.op.hidden_os:
3708 helper_os("hidden_os", self.op.hidden_os, "hidden")
3710 if self.op.blacklisted_os:
3711 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3713 if self.op.master_netdev:
3714 master_params = self.cfg.GetMasterNetworkParameters()
3715 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3716 self.cluster.master_netdev)
3717 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3719 master_params.netmask,
3720 master_params.netdev,
3721 master_params.ip_family)
3722 result.Raise("Could not disable the master ip")
3723 feedback_fn("Changing master_netdev from %s to %s" %
3724 (master_params.netdev, self.op.master_netdev))
3725 self.cluster.master_netdev = self.op.master_netdev
3727 if self.op.master_netmask:
3728 master_params = self.cfg.GetMasterNetworkParameters()
3729 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
3730 result = self.rpc.call_node_change_master_netmask(master_params.name,
3731 master_params.netmask,
3732 self.op.master_netmask,
3734 master_params.netdev)
3736 msg = "Could not change the master IP netmask: %s" % result.fail_msg
3737 self.LogWarning(msg)
3740 self.cluster.master_netmask = self.op.master_netmask
3742 self.cfg.Update(self.cluster, feedback_fn)
3744 if self.op.master_netdev:
3745 master_params = self.cfg.GetMasterNetworkParameters()
3746 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3747 self.op.master_netdev)
3748 result = self.rpc.call_node_activate_master_ip(master_params.name,
3750 master_params.netmask,
3751 master_params.netdev,
3752 master_params.ip_family)
3754 self.LogWarning("Could not re-enable the master ip on"
3755 " the master, please restart manually: %s",
3759 def _UploadHelper(lu, nodes, fname):
3760 """Helper for uploading a file and showing warnings.
3763 if os.path.exists(fname):
3764 result = lu.rpc.call_upload_file(nodes, fname)
3765 for to_node, to_result in result.items():
3766 msg = to_result.fail_msg
3768 msg = ("Copy of file %s to node %s failed: %s" %
3769 (fname, to_node, msg))
3770 lu.proc.LogWarning(msg)
3773 def _ComputeAncillaryFiles(cluster, redist):
3774 """Compute files external to Ganeti which need to be consistent.
3776 @type redist: boolean
3777 @param redist: Whether to include files which need to be redistributed
3780 # Compute files for all nodes
3782 constants.SSH_KNOWN_HOSTS_FILE,
3783 constants.CONFD_HMAC_KEY,
3784 constants.CLUSTER_DOMAIN_SECRET_FILE,
3785 constants.SPICE_CERT_FILE,
3786 constants.SPICE_CACERT_FILE,
3787 constants.RAPI_USERS_FILE,
3791 files_all.update(constants.ALL_CERT_FILES)
3792 files_all.update(ssconf.SimpleStore().GetFileList())
3794 # we need to ship at least the RAPI certificate
3795 files_all.add(constants.RAPI_CERT_FILE)
3797 if cluster.modify_etc_hosts:
3798 files_all.add(constants.ETC_HOSTS)
3800 # Files which are optional; these must:
3801 # - be present in one other category as well
3802 # - either exist or not exist on all nodes of that category (mc, vm all)
3804 constants.RAPI_USERS_FILE,
3807 # Files which should only be on master candidates
3810 files_mc.add(constants.CLUSTER_CONF_FILE)
3812 # Files which should only be on VM-capable nodes
3813 files_vm = set(filename
3814 for hv_name in cluster.enabled_hypervisors
3815 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3817 files_opt |= set(filename
3818 for hv_name in cluster.enabled_hypervisors
3819 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3821 # Filenames in each category must be unique
3822 all_files_set = files_all | files_mc | files_vm
3823 assert (len(all_files_set) ==
3824 sum(map(len, [files_all, files_mc, files_vm]))), \
3825 "Found file listed in more than one file list"
3827 # Optional files must be present in one other category
3828 assert all_files_set.issuperset(files_opt), \
3829 "Optional file not in a different required list"
3831 return (files_all, files_opt, files_mc, files_vm)
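# The two assertions above are plain set arithmetic: the categories must be
# disjoint and optional files must also appear in some category. A minimal
# self-contained sketch of the same invariants, using made-up file names
# rather than the real constants:
#
#   files_all = set(["known_hosts", "rapi.pem", "rapi_users"])
#   files_mc = set(["config.data"])
#   files_vm = set(["kvm-ifup"])
#   files_opt = set(["rapi_users"])      # optional, but also in files_all
#   combined = files_all | files_mc | files_vm
#   assert len(combined) == sum(map(len, [files_all, files_mc, files_vm]))
#   assert combined.issuperset(files_opt)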
3834 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3835 """Distribute additional files which are part of the cluster configuration.
3837 ConfigWriter takes care of distributing the config and ssconf files, but
3838 there are more files which should be distributed to all nodes. This function
3839 makes sure those are copied.
3841 @param lu: calling logical unit
3842 @param additional_nodes: list of nodes not in the config to distribute to
3843 @type additional_vm: boolean
3844 @param additional_vm: whether the additional nodes are vm-capable or not
3847 # Gather target nodes
3848 cluster = lu.cfg.GetClusterInfo()
3849 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3851 online_nodes = lu.cfg.GetOnlineNodeList()
3852 vm_nodes = lu.cfg.GetVmCapableNodeList()
3854 if additional_nodes is not None:
3855 online_nodes.extend(additional_nodes)
3857 vm_nodes.extend(additional_nodes)
3859 # Never distribute to master node
3860 for nodelist in [online_nodes, vm_nodes]:
3861 if master_info.name in nodelist:
3862 nodelist.remove(master_info.name)
3865 (files_all, _, files_mc, files_vm) = \
3866 _ComputeAncillaryFiles(cluster, True)
3868 # Never re-distribute configuration file from here
3869 assert not (constants.CLUSTER_CONF_FILE in files_all or
3870 constants.CLUSTER_CONF_FILE in files_vm)
3871 assert not files_mc, "Master candidates not handled in this function"
3874 (online_nodes, files_all),
3875 (vm_nodes, files_vm),
3879 for (node_list, files) in filemap:
3881 _UploadHelper(lu, node_list, fname)
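# Callers simply pass themselves in: LUClusterRedistConf below calls
# _RedistributeAncillaryFiles(self) after updating the configuration, and
# LUNodeAdd additionally names the node being added. A hedged usage sketch
# (the node name is made up):
#
#   _RedistributeAncillaryFiles(lu)
#   _RedistributeAncillaryFiles(lu, additional_nodes=["node9.example.com"],
#                               additional_vm=False)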
3884 class LUClusterRedistConf(NoHooksLU):
3885 """Force the redistribution of cluster configuration.
3887 This is a very simple LU.
3892 def ExpandNames(self):
3893 self.needed_locks = {
3894 locking.LEVEL_NODE: locking.ALL_SET,
3896 self.share_locks[locking.LEVEL_NODE] = 1
3898 def Exec(self, feedback_fn):
3899 """Redistribute the configuration.
3902 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3903 _RedistributeAncillaryFiles(self)
3906 class LUClusterActivateMasterIp(NoHooksLU):
3907 """Activate the master IP on the master node.
3910 def Exec(self, feedback_fn):
3911 """Activate the master IP.
3914 master_params = self.cfg.GetMasterNetworkParameters()
3915 self.rpc.call_node_activate_master_ip(master_params.name,
3917 master_params.netmask,
3918 master_params.netdev,
3919 master_params.ip_family)
3922 class LUClusterDeactivateMasterIp(NoHooksLU):
3923 """Deactivate the master IP on the master node.
3926 def Exec(self, feedback_fn):
3927 """Deactivate the master IP.
3930 master_params = self.cfg.GetMasterNetworkParameters()
3931 self.rpc.call_node_deactivate_master_ip(master_params.name,
3933 master_params.netmask,
3934 master_params.netdev,
3935 master_params.ip_family)
3938 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3939 """Sleep and poll for an instance's disk to sync.
3942 if not instance.disks or disks is not None and not disks:
3945 disks = _ExpandCheckDisks(instance, disks)
3948 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3950 node = instance.primary_node
3953 lu.cfg.SetDiskID(dev, node)
3955 # TODO: Convert to utils.Retry
3958 degr_retries = 10 # in seconds, as we sleep 1 second each time
3962 cumul_degraded = False
3963 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3964 msg = rstats.fail_msg
3966 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3969 raise errors.RemoteError("Can't contact node %s for mirror data,"
3970 " aborting." % node)
3973 rstats = rstats.payload
3975 for i, mstat in enumerate(rstats):
3977 lu.LogWarning("Can't compute data for node %s/%s",
3978 node, disks[i].iv_name)
3981 cumul_degraded = (cumul_degraded or
3982 (mstat.is_degraded and mstat.sync_percent is None))
3983 if mstat.sync_percent is not None:
3985 if mstat.estimated_time is not None:
3986 rem_time = ("%s remaining (estimated)" %
3987 utils.FormatSeconds(mstat.estimated_time))
3988 max_time = mstat.estimated_time
3990 rem_time = "no time estimate"
3991 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3992 (disks[i].iv_name, mstat.sync_percent, rem_time))
3994 # if we're done but degraded, let's do a few small retries, to
3995 # make sure we see a stable and not transient situation; therefore
3996 # we force restart of the loop
3997 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3998 logging.info("Degraded disks found, %d retries left", degr_retries)
4006 time.sleep(min(60, max_time))
4009 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
4010 return not cumul_degraded
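# Stripped of the RPC details, the waiting loop above is a poll-and-retry
# pattern in which a "done but degraded" answer is given a short grace
# period before being trusted. A simplified sketch (poll() and estimate are
# placeholders, not real helpers):
#
#   retries = 10
#   while True:
#     done, degraded = poll()
#     if done and degraded and retries > 0:
#       retries -= 1                  # transient degradation, look again
#       time.sleep(1)
#       continue
#     if done:
#       return not degraded
#     time.sleep(min(60, estimate))   # back off while the sync progresses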
4013 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
4014 """Check that mirrors are not degraded.
4016 The ldisk parameter, if True, will change the test from the
4017 is_degraded attribute (which represents overall non-ok status for
4018 the device(s)) to the ldisk (representing the local storage status).
4021 lu.cfg.SetDiskID(dev, node)
4025 if on_primary or dev.AssembleOnSecondary():
4026 rstats = lu.rpc.call_blockdev_find(node, dev)
4027 msg = rstats.fail_msg
4029 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
4031 elif not rstats.payload:
4032 lu.LogWarning("Can't find disk on node %s", node)
4036 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4038 result = result and not rstats.payload.is_degraded
4041 for child in dev.children:
4042 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
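# For a DRBD disk backed by two logical volumes the check recurses through
# the device tree, e.g. (illustrative call sequence only):
#
#   _CheckDiskConsistency(lu, drbd_dev, node, on_primary=True)
#     -> call_blockdev_find(node, drbd_dev), check is_degraded
#        (or ldisk_status == constants.LDS_OKAY when ldisk=True)
#     -> _CheckDiskConsistency(lu, lv_data, node, True)
#     -> _CheckDiskConsistency(lu, lv_meta, node, True)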
4047 class LUOobCommand(NoHooksLU):
4048 """Logical unit for OOB handling.
4052 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4054 def ExpandNames(self):
4055 """Gather locks we need.
4058 if self.op.node_names:
4059 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4060 lock_names = self.op.node_names
4062 lock_names = locking.ALL_SET
4064 self.needed_locks = {
4065 locking.LEVEL_NODE: lock_names,
4068 def CheckPrereq(self):
4069 """Check prerequisites.
4072 - the node exists in the configuration
4075 Any errors are signaled by raising errors.OpPrereqError.
4079 self.master_node = self.cfg.GetMasterNode()
4081 assert self.op.power_delay >= 0.0
4083 if self.op.node_names:
4084 if (self.op.command in self._SKIP_MASTER and
4085 self.master_node in self.op.node_names):
4086 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4087 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4089 if master_oob_handler:
4090 additional_text = ("run '%s %s %s' if you want to operate on the"
4091 " master regardless") % (master_oob_handler,
4095 additional_text = "it does not support out-of-band operations"
4097 raise errors.OpPrereqError(("Operating on the master node %s is not"
4098 " allowed for %s; %s") %
4099 (self.master_node, self.op.command,
4100 additional_text), errors.ECODE_INVAL)
4102 self.op.node_names = self.cfg.GetNodeList()
4103 if self.op.command in self._SKIP_MASTER:
4104 self.op.node_names.remove(self.master_node)
4106 if self.op.command in self._SKIP_MASTER:
4107 assert self.master_node not in self.op.node_names
4109 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4111 raise errors.OpPrereqError("Node %s not found" % node_name,
4114 self.nodes.append(node)
4116 if (not self.op.ignore_status and
4117 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4118 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4119 " not marked offline") % node_name,
4122 def Exec(self, feedback_fn):
4123 """Execute OOB and return result if we expect any.
4126 master_node = self.master_node
4129 for idx, node in enumerate(utils.NiceSort(self.nodes,
4130 key=lambda node: node.name)):
4131 node_entry = [(constants.RS_NORMAL, node.name)]
4132 ret.append(node_entry)
4134 oob_program = _SupportsOob(self.cfg, node)
4137 node_entry.append((constants.RS_UNAVAIL, None))
4140 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4141 self.op.command, oob_program, node.name)
4142 result = self.rpc.call_run_oob(master_node, oob_program,
4143 self.op.command, node.name,
4147 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4148 node.name, result.fail_msg)
4149 node_entry.append((constants.RS_NODATA, None))
4152 self._CheckPayload(result)
4153 except errors.OpExecError, err:
4154 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4156 node_entry.append((constants.RS_NODATA, None))
4158 if self.op.command == constants.OOB_HEALTH:
4159 # For health we should log important events
4160 for item, status in result.payload:
4161 if status in [constants.OOB_STATUS_WARNING,
4162 constants.OOB_STATUS_CRITICAL]:
4163 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4164 item, node.name, status)
4166 if self.op.command == constants.OOB_POWER_ON:
4168 elif self.op.command == constants.OOB_POWER_OFF:
4169 node.powered = False
4170 elif self.op.command == constants.OOB_POWER_STATUS:
4171 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4172 if powered != node.powered:
4173 logging.warning(("Recorded power state (%s) of node '%s' does not"
4174 " match actual power state (%s)"), node.powered,
4177 # For configuration changing commands we should update the node
4178 if self.op.command in (constants.OOB_POWER_ON,
4179 constants.OOB_POWER_OFF):
4180 self.cfg.Update(node, feedback_fn)
4182 node_entry.append((constants.RS_NORMAL, result.payload))
4184 if (self.op.command == constants.OOB_POWER_ON and
4185 idx < len(self.nodes) - 1):
4186 time.sleep(self.op.power_delay)
4190 def _CheckPayload(self, result):
4191 """Checks if the payload is valid.
4193 @param result: RPC result
4194 @raises errors.OpExecError: If payload is not valid
4198 if self.op.command == constants.OOB_HEALTH:
4199 if not isinstance(result.payload, list):
4200 errs.append("command 'health' is expected to return a list but got %s" %
4201 type(result.payload))
4203 for item, status in result.payload:
4204 if status not in constants.OOB_STATUSES:
4205 errs.append("health item '%s' has invalid status '%s'" %
4208 if self.op.command == constants.OOB_POWER_STATUS:
4209 if not isinstance(result.payload, dict):
4210 errs.append("power-status is expected to return a dict but got %s" %
4211 type(result.payload))
4213 if self.op.command in [
4214 constants.OOB_POWER_ON,
4215 constants.OOB_POWER_OFF,
4216 constants.OOB_POWER_CYCLE,
4218 if result.payload is not None:
4219 errs.append("%s is expected to not return payload but got '%s'" %
4220 (self.op.command, result.payload))
4223 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4224 utils.CommaJoin(errs))
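# The payload shapes being validated are, for illustration (the item names
# and values below are hypothetical; keys and statuses come from constants):
#
#   OOB_HEALTH              -> [["psu0", "OK"], ["fan1", "WARNING"]]  # list
#   OOB_POWER_STATUS        -> {constants.OOB_POWER_STATUS_POWERED: True}
#   OOB_POWER_ON/OFF/CYCLE  -> None                             # no payload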
4227 class _OsQuery(_QueryBase):
4228 FIELDS = query.OS_FIELDS
4230 def ExpandNames(self, lu):
4231 # Lock all nodes in shared mode
4232 # Temporary removal of locks, should be reverted later
4233 # TODO: reintroduce locks when they are lighter-weight
4234 lu.needed_locks = {}
4235 #self.share_locks[locking.LEVEL_NODE] = 1
4236 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4238 # The following variables interact with _QueryBase._GetNames
4240 self.wanted = self.names
4242 self.wanted = locking.ALL_SET
4244 self.do_locking = self.use_locking
4246 def DeclareLocks(self, lu, level):
4250 def _DiagnoseByOS(rlist):
4251 """Remaps a per-node return list into an a per-os per-node dictionary
4253 @param rlist: a map with node names as keys and OS objects as values
4256 @return: a dictionary with osnames as keys and as value another
4257 map, with nodes as keys and tuples of (path, status, diagnose,
4258 variants, parameters, api_versions) as values, eg::
4260 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4261 (/srv/..., False, "invalid api")],
4262 "node2": [(/srv/..., True, "", [], [])]}
4267 # we build here the list of nodes that didn't fail the RPC (at RPC
4268 # level), so that nodes with a non-responding node daemon don't
4269 # make all OSes invalid
4270 good_nodes = [node_name for node_name in rlist
4271 if not rlist[node_name].fail_msg]
4272 for node_name, nr in rlist.items():
4273 if nr.fail_msg or not nr.payload:
4275 for (name, path, status, diagnose, variants,
4276 params, api_versions) in nr.payload:
4277 if name not in all_os:
4278 # build a list of nodes for this os containing empty lists
4279 # for each node in node_list
4281 for nname in good_nodes:
4282 all_os[name][nname] = []
4283 # convert params from [name, help] to (name, help)
4284 params = [tuple(v) for v in params]
4285 all_os[name][node_name].append((path, status, diagnose,
4286 variants, params, api_versions))
4289 def _GetQueryData(self, lu):
4290 """Computes the list of nodes and their attributes.
4293 # Locking is not used
4294 assert not (compat.any(lu.glm.is_owned(level)
4295 for level in locking.LEVELS
4296 if level != locking.LEVEL_CLUSTER) or
4297 self.do_locking or self.use_locking)
4299 valid_nodes = [node.name
4300 for node in lu.cfg.GetAllNodesInfo().values()
4301 if not node.offline and node.vm_capable]
4302 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4303 cluster = lu.cfg.GetClusterInfo()
4307 for (os_name, os_data) in pol.items():
4308 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4309 hidden=(os_name in cluster.hidden_os),
4310 blacklisted=(os_name in cluster.blacklisted_os))
4314 api_versions = set()
4316 for idx, osl in enumerate(os_data.values()):
4317 info.valid = bool(info.valid and osl and osl[0][1])
4321 (node_variants, node_params, node_api) = osl[0][3:6]
4324 variants.update(node_variants)
4325 parameters.update(node_params)
4326 api_versions.update(node_api)
4328 # Filter out inconsistent values
4329 variants.intersection_update(node_variants)
4330 parameters.intersection_update(node_params)
4331 api_versions.intersection_update(node_api)
4333 info.variants = list(variants)
4334 info.parameters = list(parameters)
4335 info.api_versions = list(api_versions)
4337 data[os_name] = info
4339 # Prepare data in requested order
4340 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4344 class LUOsDiagnose(NoHooksLU):
4345 """Logical unit for OS diagnose/query.
4351 def _BuildFilter(fields, names):
4352 """Builds a filter for querying OSes.
4355 name_filter = qlang.MakeSimpleFilter("name", names)
4357 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4358 # respective field is not requested
4359 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4360 for fname in ["hidden", "blacklisted"]
4361 if fname not in fields]
4362 if "valid" not in fields:
4363 status_filter.append([qlang.OP_TRUE, "valid"])
4366 status_filter.insert(0, qlang.OP_AND)
4368 status_filter = None
4370 if name_filter and status_filter:
4371 return [qlang.OP_AND, name_filter, status_filter]
4375 return status_filter
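# For example, _BuildFilter(["name", "variants"], ["lenny-image"]) yields
# roughly the following qlang expression (sketch; the exact shape of the
# name part depends on qlang.MakeSimpleFilter):
#
#   [OP_AND,
#    <name filter matching "lenny-image">,
#    [OP_AND,
#     [OP_NOT, [OP_TRUE, "hidden"]],
#     [OP_NOT, [OP_TRUE, "blacklisted"]],
#     [OP_TRUE, "valid"]]]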
4377 def CheckArguments(self):
4378 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4379 self.op.output_fields, False)
4381 def ExpandNames(self):
4382 self.oq.ExpandNames(self)
4384 def Exec(self, feedback_fn):
4385 return self.oq.OldStyleQuery(self)
4388 class LUNodeRemove(LogicalUnit):
4389 """Logical unit for removing a node.
4392 HPATH = "node-remove"
4393 HTYPE = constants.HTYPE_NODE
4395 def BuildHooksEnv(self):
4398 This doesn't run on the target node in the pre phase as a failed
4399 node would then be impossible to remove.
4403 "OP_TARGET": self.op.node_name,
4404 "NODE_NAME": self.op.node_name,
4407 def BuildHooksNodes(self):
4408 """Build hooks nodes.
4411 all_nodes = self.cfg.GetNodeList()
4413 all_nodes.remove(self.op.node_name)
4415 logging.warning("Node '%s', which is about to be removed, was not found"
4416 " in the list of all nodes", self.op.node_name)
4417 return (all_nodes, all_nodes)
4419 def CheckPrereq(self):
4420 """Check prerequisites.
4423 - the node exists in the configuration
4424 - it does not have primary or secondary instances
4425 - it's not the master
4427 Any errors are signaled by raising errors.OpPrereqError.
4430 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4431 node = self.cfg.GetNodeInfo(self.op.node_name)
4432 assert node is not None
4434 masternode = self.cfg.GetMasterNode()
4435 if node.name == masternode:
4436 raise errors.OpPrereqError("Node is the master node, failover to another"
4437 " node is required", errors.ECODE_INVAL)
4439 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4440 if node.name in instance.all_nodes:
4441 raise errors.OpPrereqError("Instance %s is still running on the node,"
4442 " please remove first" % instance_name,
4444 self.op.node_name = node.name
4447 def Exec(self, feedback_fn):
4448 """Removes the node from the cluster.
4452 logging.info("Stopping the node daemon and removing configs from node %s",
4455 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4457 # Promote nodes to master candidate as needed
4458 _AdjustCandidatePool(self, exceptions=[node.name])
4459 self.context.RemoveNode(node.name)
4461 # Run post hooks on the node before it's removed
4462 _RunPostHook(self, node.name)
4464 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4465 msg = result.fail_msg
4467 self.LogWarning("Errors encountered on the remote node while leaving"
4468 " the cluster: %s", msg)
4470 # Remove node from our /etc/hosts
4471 if self.cfg.GetClusterInfo().modify_etc_hosts:
4472 master_node = self.cfg.GetMasterNode()
4473 result = self.rpc.call_etc_hosts_modify(master_node,
4474 constants.ETC_HOSTS_REMOVE,
4476 result.Raise("Can't update hosts file with new host data")
4477 _RedistributeAncillaryFiles(self)
4480 class _NodeQuery(_QueryBase):
4481 FIELDS = query.NODE_FIELDS
4483 def ExpandNames(self, lu):
4484 lu.needed_locks = {}
4485 lu.share_locks = _ShareAll()
4488 self.wanted = _GetWantedNodes(lu, self.names)
4490 self.wanted = locking.ALL_SET
4492 self.do_locking = (self.use_locking and
4493 query.NQ_LIVE in self.requested_data)
4496 # If any non-static field is requested we need to lock the nodes
4497 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4499 def DeclareLocks(self, lu, level):
4502 def _GetQueryData(self, lu):
4503 """Computes the list of nodes and their attributes.
4506 all_info = lu.cfg.GetAllNodesInfo()
4508 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4510 # Gather data as requested
4511 if query.NQ_LIVE in self.requested_data:
4512 # filter out non-vm_capable nodes
4513 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4515 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4516 lu.cfg.GetHypervisorType())
4517 live_data = dict((name, nresult.payload)
4518 for (name, nresult) in node_data.items()
4519 if not nresult.fail_msg and nresult.payload)
4523 if query.NQ_INST in self.requested_data:
4524 node_to_primary = dict([(name, set()) for name in nodenames])
4525 node_to_secondary = dict([(name, set()) for name in nodenames])
4527 inst_data = lu.cfg.GetAllInstancesInfo()
4529 for inst in inst_data.values():
4530 if inst.primary_node in node_to_primary:
4531 node_to_primary[inst.primary_node].add(inst.name)
4532 for secnode in inst.secondary_nodes:
4533 if secnode in node_to_secondary:
4534 node_to_secondary[secnode].add(inst.name)
4536 node_to_primary = None
4537 node_to_secondary = None
4539 if query.NQ_OOB in self.requested_data:
4540 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4541 for name, node in all_info.iteritems())
4545 if query.NQ_GROUP in self.requested_data:
4546 groups = lu.cfg.GetAllNodeGroupsInfo()
4550 return query.NodeQueryData([all_info[name] for name in nodenames],
4551 live_data, lu.cfg.GetMasterNode(),
4552 node_to_primary, node_to_secondary, groups,
4553 oob_support, lu.cfg.GetClusterInfo())
4556 class LUNodeQuery(NoHooksLU):
4557 """Logical unit for querying nodes.
4560 # pylint: disable=W0142
4563 def CheckArguments(self):
4564 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4565 self.op.output_fields, self.op.use_locking)
4567 def ExpandNames(self):
4568 self.nq.ExpandNames(self)
4570 def Exec(self, feedback_fn):
4571 return self.nq.OldStyleQuery(self)
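# LUOsDiagnose above and LUNodeQuery here are thin wrappers around a
# _QueryBase subclass: build the query object in CheckArguments, delegate
# lock handling in ExpandNames and run it in Exec. A hedged outline of such
# a wrapper (LUFooQuery/_FooQuery are hypothetical names):
#
#   class LUFooQuery(NoHooksLU):
#     def CheckArguments(self):
#       self.fq = _FooQuery(qlang.MakeSimpleFilter("name", self.op.names),
#                           self.op.output_fields, self.op.use_locking)
#     def ExpandNames(self):
#       self.fq.ExpandNames(self)
#     def Exec(self, feedback_fn):
#       return self.fq.OldStyleQuery(self)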
4574 class LUNodeQueryvols(NoHooksLU):
4575 """Logical unit for getting volumes on node(s).
4579 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4580 _FIELDS_STATIC = utils.FieldSet("node")
4582 def CheckArguments(self):
4583 _CheckOutputFields(static=self._FIELDS_STATIC,
4584 dynamic=self._FIELDS_DYNAMIC,
4585 selected=self.op.output_fields)
4587 def ExpandNames(self):
4588 self.needed_locks = {}
4589 self.share_locks[locking.LEVEL_NODE] = 1
4590 if not self.op.nodes:
4591 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4593 self.needed_locks[locking.LEVEL_NODE] = \
4594 _GetWantedNodes(self, self.op.nodes)
4596 def Exec(self, feedback_fn):
4597 """Computes the list of nodes and their attributes.
4600 nodenames = self.owned_locks(locking.LEVEL_NODE)
4601 volumes = self.rpc.call_node_volumes(nodenames)
4603 ilist = self.cfg.GetAllInstancesInfo()
4604 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4607 for node in nodenames:
4608 nresult = volumes[node]
4611 msg = nresult.fail_msg
4613 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4616 node_vols = sorted(nresult.payload,
4617 key=operator.itemgetter("dev"))
4619 for vol in node_vols:
4621 for field in self.op.output_fields:
4624 elif field == "phys":
4628 elif field == "name":
4630 elif field == "size":
4631 val = int(float(vol["size"]))
4632 elif field == "instance":
4633 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4635 raise errors.ParameterError(field)
4636 node_output.append(str(val))
4638 output.append(node_output)
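# The instance lookup key is the (node, "vg/lv") pair built above; e.g. an
# LV "xenvg/disk0" on node1 maps back to its owning instance roughly as
# (illustrative names):
#
#   vol2inst[("node1.example.com", "xenvg/disk0")] == "instance1.example.com"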
4643 class LUNodeQueryStorage(NoHooksLU):
4644 """Logical unit for getting information on storage units on node(s).
4647 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4650 def CheckArguments(self):
4651 _CheckOutputFields(static=self._FIELDS_STATIC,
4652 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4653 selected=self.op.output_fields)
4655 def ExpandNames(self):
4656 self.needed_locks = {}
4657 self.share_locks[locking.LEVEL_NODE] = 1
4660 self.needed_locks[locking.LEVEL_NODE] = \
4661 _GetWantedNodes(self, self.op.nodes)
4663 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4665 def Exec(self, feedback_fn):
4666 """Computes the list of nodes and their attributes.
4669 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4671 # Always get name to sort by
4672 if constants.SF_NAME in self.op.output_fields:
4673 fields = self.op.output_fields[:]
4675 fields = [constants.SF_NAME] + self.op.output_fields
4677 # Never ask for node or type as it's only known to the LU
4678 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4679 while extra in fields:
4680 fields.remove(extra)
4682 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4683 name_idx = field_idx[constants.SF_NAME]
4685 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4686 data = self.rpc.call_storage_list(self.nodes,
4687 self.op.storage_type, st_args,
4688 self.op.name, fields)
4692 for node in utils.NiceSort(self.nodes):
4693 nresult = data[node]
4697 msg = nresult.fail_msg
4699 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4702 rows = dict([(row[name_idx], row) for row in nresult.payload])
4704 for name in utils.NiceSort(rows.keys()):
4709 for field in self.op.output_fields:
4710 if field == constants.SF_NODE:
4712 elif field == constants.SF_TYPE:
4713 val = self.op.storage_type
4714 elif field in field_idx:
4715 val = row[field_idx[field]]
4717 raise errors.ParameterError(field)
4726 class _InstanceQuery(_QueryBase):
4727 FIELDS = query.INSTANCE_FIELDS
4729 def ExpandNames(self, lu):
4730 lu.needed_locks = {}
4731 lu.share_locks = _ShareAll()
4734 self.wanted = _GetWantedInstances(lu, self.names)
4736 self.wanted = locking.ALL_SET
4738 self.do_locking = (self.use_locking and
4739 query.IQ_LIVE in self.requested_data)
4741 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4742 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4743 lu.needed_locks[locking.LEVEL_NODE] = []
4744 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4746 self.do_grouplocks = (self.do_locking and
4747 query.IQ_NODES in self.requested_data)
4749 def DeclareLocks(self, lu, level):
4751 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4752 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4754 # Lock all groups used by instances optimistically; this requires going
4755 # via the node before it's locked, requiring verification later on
4756 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4758 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4759 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4760 elif level == locking.LEVEL_NODE:
4761 lu._LockInstancesNodes() # pylint: disable=W0212
4764 def _CheckGroupLocks(lu):
4765 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4766 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4768 # Check if node groups for locked instances are still correct
4769 for instance_name in owned_instances:
4770 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4772 def _GetQueryData(self, lu):
4773 """Computes the list of instances and their attributes.
4776 if self.do_grouplocks:
4777 self._CheckGroupLocks(lu)
4779 cluster = lu.cfg.GetClusterInfo()
4780 all_info = lu.cfg.GetAllInstancesInfo()
4782 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4784 instance_list = [all_info[name] for name in instance_names]
4785 nodes = frozenset(itertools.chain(*(inst.all_nodes
4786 for inst in instance_list)))
4787 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4790 wrongnode_inst = set()
4792 # Gather data as requested
4793 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4795 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4797 result = node_data[name]
4799 # offline nodes will be in both lists
4800 assert result.fail_msg
4801 offline_nodes.append(name)
4803 bad_nodes.append(name)
4804 elif result.payload:
4805 for inst in result.payload:
4806 if inst in all_info:
4807 if all_info[inst].primary_node == name:
4808 live_data.update(result.payload)
4810 wrongnode_inst.add(inst)
4812 # orphan instance; we don't list it here as we don't
4813 # handle this case yet in the output of instance listing
4814 logging.warning("Orphan instance '%s' found on node %s",
4816 # else no instance is alive
4820 if query.IQ_DISKUSAGE in self.requested_data:
4821 disk_usage = dict((inst.name,
4822 _ComputeDiskSize(inst.disk_template,
4823 [{constants.IDISK_SIZE: disk.size}
4824 for disk in inst.disks]))
4825 for inst in instance_list)
4829 if query.IQ_CONSOLE in self.requested_data:
4831 for inst in instance_list:
4832 if inst.name in live_data:
4833 # Instance is running
4834 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4836 consinfo[inst.name] = None
4837 assert set(consinfo.keys()) == set(instance_names)
4841 if query.IQ_NODES in self.requested_data:
4842 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4844 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4845 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4846 for uuid in set(map(operator.attrgetter("group"),
4852 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4853 disk_usage, offline_nodes, bad_nodes,
4854 live_data, wrongnode_inst, consinfo,
4858 class LUQuery(NoHooksLU):
4859 """Query for resources/items of a certain kind.
4862 # pylint: disable=W0142
4865 def CheckArguments(self):
4866 qcls = _GetQueryImplementation(self.op.what)
4868 self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
4870 def ExpandNames(self):
4871 self.impl.ExpandNames(self)
4873 def DeclareLocks(self, level):
4874 self.impl.DeclareLocks(self, level)
4876 def Exec(self, feedback_fn):
4877 return self.impl.NewStyleQuery(self)
4880 class LUQueryFields(NoHooksLU):
4881 """Query for resources/items of a certain kind.
4884 # pylint: disable=W0142
4887 def CheckArguments(self):
4888 self.qcls = _GetQueryImplementation(self.op.what)
4890 def ExpandNames(self):
4891 self.needed_locks = {}
4893 def Exec(self, feedback_fn):
4894 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4897 class LUNodeModifyStorage(NoHooksLU):
4898 """Logical unit for modifying a storage volume on a node.
4903 def CheckArguments(self):
4904 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4906 storage_type = self.op.storage_type
4909 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4911 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4912 " modified" % storage_type,
4915 diff = set(self.op.changes.keys()) - modifiable
4917 raise errors.OpPrereqError("The following fields can not be modified for"
4918 " storage units of type '%s': %r" %
4919 (storage_type, list(diff)),
4922 def ExpandNames(self):
4923 self.needed_locks = {
4924 locking.LEVEL_NODE: self.op.node_name,
4927 def Exec(self, feedback_fn):
4928 """Computes the list of nodes and their attributes.
4931 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4932 result = self.rpc.call_storage_modify(self.op.node_name,
4933 self.op.storage_type, st_args,
4934 self.op.name, self.op.changes)
4935 result.Raise("Failed to modify storage unit '%s' on %s" %
4936 (self.op.name, self.op.node_name))
4939 class LUNodeAdd(LogicalUnit):
4940 """Logical unit for adding node to the cluster.
4944 HTYPE = constants.HTYPE_NODE
4945 _NFLAGS = ["master_capable", "vm_capable"]
4947 def CheckArguments(self):
4948 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4949 # validate/normalize the node name
4950 self.hostname = netutils.GetHostname(name=self.op.node_name,
4951 family=self.primary_ip_family)
4952 self.op.node_name = self.hostname.name
4954 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4955 raise errors.OpPrereqError("Cannot readd the master node",
4958 if self.op.readd and self.op.group:
4959 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4960 " being readded", errors.ECODE_INVAL)
4962 def BuildHooksEnv(self):
4965 This will run on all nodes before, and on all nodes + the new node after.
4969 "OP_TARGET": self.op.node_name,
4970 "NODE_NAME": self.op.node_name,
4971 "NODE_PIP": self.op.primary_ip,
4972 "NODE_SIP": self.op.secondary_ip,
4973 "MASTER_CAPABLE": str(self.op.master_capable),
4974 "VM_CAPABLE": str(self.op.vm_capable),
4977 def BuildHooksNodes(self):
4978 """Build hooks nodes.
4981 # Exclude added node
4982 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4983 post_nodes = pre_nodes + [self.op.node_name, ]
4985 return (pre_nodes, post_nodes)
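# For example, when adding "node4" to a cluster of node1..node3 (names are
# illustrative), the hook node lists come out as (ignoring ordering):
#
#   pre_nodes  == ["node1", "node2", "node3"]
#   post_nodes == ["node1", "node2", "node3", "node4"]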
4987 def CheckPrereq(self):
4988 """Check prerequisites.
4991 - the new node is not already in the config
4993 - its parameters (single/dual homed) match the cluster
4995 Any errors are signaled by raising errors.OpPrereqError.
4999 hostname = self.hostname
5000 node = hostname.name
5001 primary_ip = self.op.primary_ip = hostname.ip
5002 if self.op.secondary_ip is None:
5003 if self.primary_ip_family == netutils.IP6Address.family:
5004 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
5005 " IPv4 address must be given as secondary",
5007 self.op.secondary_ip = primary_ip
5009 secondary_ip = self.op.secondary_ip
5010 if not netutils.IP4Address.IsValid(secondary_ip):
5011 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5012 " address" % secondary_ip, errors.ECODE_INVAL)
5014 node_list = cfg.GetNodeList()
5015 if not self.op.readd and node in node_list:
5016 raise errors.OpPrereqError("Node %s is already in the configuration" %
5017 node, errors.ECODE_EXISTS)
5018 elif self.op.readd and node not in node_list:
5019 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
5022 self.changed_primary_ip = False
5024 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
5025 if self.op.readd and node == existing_node_name:
5026 if existing_node.secondary_ip != secondary_ip:
5027 raise errors.OpPrereqError("Readded node doesn't have the same IP"
5028 " address configuration as before",
5030 if existing_node.primary_ip != primary_ip:
5031 self.changed_primary_ip = True
5035 if (existing_node.primary_ip == primary_ip or
5036 existing_node.secondary_ip == primary_ip or
5037 existing_node.primary_ip == secondary_ip or
5038 existing_node.secondary_ip == secondary_ip):
5039 raise errors.OpPrereqError("New node ip address(es) conflict with"
5040 " existing node %s" % existing_node.name,
5041 errors.ECODE_NOTUNIQUE)
5043 # After this 'if' block, None is no longer a valid value for the
5044 # _capable op attributes
5046 old_node = self.cfg.GetNodeInfo(node)
5047 assert old_node is not None, "Can't retrieve locked node %s" % node
5048 for attr in self._NFLAGS:
5049 if getattr(self.op, attr) is None:
5050 setattr(self.op, attr, getattr(old_node, attr))
5052 for attr in self._NFLAGS:
5053 if getattr(self.op, attr) is None:
5054 setattr(self.op, attr, True)
5056 if self.op.readd and not self.op.vm_capable:
5057 pri, sec = cfg.GetNodeInstances(node)
5059 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5060 " flag set to false, but it already holds"
5061 " instances" % node,
5064 # check that the type of the node (single versus dual homed) is the
5065 # same as for the master
5066 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5067 master_singlehomed = myself.secondary_ip == myself.primary_ip
5068 newbie_singlehomed = secondary_ip == primary_ip
5069 if master_singlehomed != newbie_singlehomed:
5070 if master_singlehomed:
5071 raise errors.OpPrereqError("The master has no secondary ip but the"
5072 " new node has one",
5075 raise errors.OpPrereqError("The master has a secondary ip but the"
5076 " new node doesn't have one",
5079 # checks reachability
5080 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5081 raise errors.OpPrereqError("Node not reachable by ping",
5082 errors.ECODE_ENVIRON)
5084 if not newbie_singlehomed:
5085 # check reachability from my secondary ip to newbie's secondary ip
5086 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5087 source=myself.secondary_ip):
5088 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5089 " based ping to node daemon port",
5090 errors.ECODE_ENVIRON)
5097 if self.op.master_capable:
5098 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5100 self.master_candidate = False
5103 self.new_node = old_node
5105 node_group = cfg.LookupNodeGroup(self.op.group)
5106 self.new_node = objects.Node(name=node,
5107 primary_ip=primary_ip,
5108 secondary_ip=secondary_ip,
5109 master_candidate=self.master_candidate,
5110 offline=False, drained=False,
5113 if self.op.ndparams:
5114 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5116 def Exec(self, feedback_fn):
5117 """Adds the new node to the cluster.
5120 new_node = self.new_node
5121 node = new_node.name
5123 # We are adding a new node, so we assume it is powered
5124 new_node.powered = True
5126 # for re-adds, reset the offline/drained/master-candidate flags;
5127 # we need to reset here, otherwise offline would prevent RPC calls
5128 # later in the procedure; this also means that if the re-add
5129 # fails, we are left with a non-offlined, broken node
5131 new_node.drained = new_node.offline = False # pylint: disable=W0201
5132 self.LogInfo("Readding a node, the offline/drained flags were reset")
5133 # if we demote the node, we do cleanup later in the procedure
5134 new_node.master_candidate = self.master_candidate
5135 if self.changed_primary_ip:
5136 new_node.primary_ip = self.op.primary_ip
5138 # copy the master/vm_capable flags
5139 for attr in self._NFLAGS:
5140 setattr(new_node, attr, getattr(self.op, attr))
5142 # notify the user about any possible mc promotion
5143 if new_node.master_candidate:
5144 self.LogInfo("Node will be a master candidate")
5146 if self.op.ndparams:
5147 new_node.ndparams = self.op.ndparams
5149 new_node.ndparams = {}
5151 # check connectivity
5152 result = self.rpc.call_version([node])[node]
5153 result.Raise("Can't get version information from node %s" % node)
5154 if constants.PROTOCOL_VERSION == result.payload:
5155 logging.info("Communication to node %s fine, sw version %s match",
5156 node, result.payload)
5158 raise errors.OpExecError("Version mismatch master version %s,"
5159 " node version %s" %
5160 (constants.PROTOCOL_VERSION, result.payload))
5162 # Add node to our /etc/hosts, and add key to known_hosts
5163 if self.cfg.GetClusterInfo().modify_etc_hosts:
5164 master_node = self.cfg.GetMasterNode()
5165 result = self.rpc.call_etc_hosts_modify(master_node,
5166 constants.ETC_HOSTS_ADD,
5169 result.Raise("Can't update hosts file with new host data")
5171 if new_node.secondary_ip != new_node.primary_ip:
5172 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5175 node_verify_list = [self.cfg.GetMasterNode()]
5176 node_verify_param = {
5177 constants.NV_NODELIST: ([node], {}),
5178 # TODO: do a node-net-test as well?
5181 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5182 self.cfg.GetClusterName())
5183 for verifier in node_verify_list:
5184 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5185 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5187 for failed in nl_payload:
5188 feedback_fn("ssh/hostname verification failed"
5189 " (checking from %s): %s" %
5190 (verifier, nl_payload[failed]))
5191 raise errors.OpExecError("ssh/hostname verification failed")
5194 _RedistributeAncillaryFiles(self)
5195 self.context.ReaddNode(new_node)
5196 # make sure we redistribute the config
5197 self.cfg.Update(new_node, feedback_fn)
5198 # and make sure the new node will not have old files around
5199 if not new_node.master_candidate:
5200 result = self.rpc.call_node_demote_from_mc(new_node.name)
5201 msg = result.fail_msg
5203 self.LogWarning("Node failed to demote itself from master"
5204 " candidate status: %s" % msg)
5206 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5207 additional_vm=self.op.vm_capable)
5208 self.context.AddNode(new_node, self.proc.GetECId())
5211 class LUNodeSetParams(LogicalUnit):
5212 """Modifies the parameters of a node.
5214 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5215 to the node role (as _ROLE_*)
5216 @cvar _R2F: a dictionary from node role to tuples of flags
5217 @cvar _FLAGS: a list of attribute names corresponding to the flags
5220 HPATH = "node-modify"
5221 HTYPE = constants.HTYPE_NODE
5223 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5225 (True, False, False): _ROLE_CANDIDATE,
5226 (False, True, False): _ROLE_DRAINED,
5227 (False, False, True): _ROLE_OFFLINE,
5228 (False, False, False): _ROLE_REGULAR,
5230 _R2F = dict((v, k) for k, v in _F2R.items())
5231 _FLAGS = ["master_candidate", "drained", "offline"]
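# A worked example of the role mapping: a node with
# (master_candidate=True, drained=False, offline=False) maps via _F2R to
# _ROLE_CANDIDATE, and _R2F[_ROLE_DRAINED] gives back (False, True, False).
# Combinations with more than one flag set are deliberately absent from
# _F2R, which is what the "at most one state" checks below rely on.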
5233 def CheckArguments(self):
5234 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5235 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5236 self.op.master_capable, self.op.vm_capable,
5237 self.op.secondary_ip, self.op.ndparams]
5238 if all_mods.count(None) == len(all_mods):
5239 raise errors.OpPrereqError("Please pass at least one modification",
5241 if all_mods.count(True) > 1:
5242 raise errors.OpPrereqError("Can't set the node into more than one"
5243 " state at the same time",
5246 # Boolean value that tells us whether we might be demoting from MC
5247 self.might_demote = (self.op.master_candidate == False or
5248 self.op.offline == True or
5249 self.op.drained == True or
5250 self.op.master_capable == False)
5252 if self.op.secondary_ip:
5253 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5254 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5255 " address" % self.op.secondary_ip,
5258 self.lock_all = self.op.auto_promote and self.might_demote
5259 self.lock_instances = self.op.secondary_ip is not None
5261 def ExpandNames(self):
5263 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5265 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5267 if self.lock_instances:
5268 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
5270 def DeclareLocks(self, level):
5271 # If we have locked all instances, before waiting to lock nodes, release
5272 # all the ones living on nodes unrelated to the current operation.
5273 if level == locking.LEVEL_NODE and self.lock_instances:
5274 self.affected_instances = []
5275 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
5278 # Build list of instances to release
5279 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
5280 for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
5281 if (instance.disk_template in constants.DTS_INT_MIRROR and
5282 self.op.node_name in instance.all_nodes):
5283 instances_keep.append(instance_name)
5284 self.affected_instances.append(instance)
5286 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
5288 assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
5289 set(instances_keep))
5291 def BuildHooksEnv(self):
5294 This runs on the master node.
5298 "OP_TARGET": self.op.node_name,
5299 "MASTER_CANDIDATE": str(self.op.master_candidate),
5300 "OFFLINE": str(self.op.offline),
5301 "DRAINED": str(self.op.drained),
5302 "MASTER_CAPABLE": str(self.op.master_capable),
5303 "VM_CAPABLE": str(self.op.vm_capable),
5306 def BuildHooksNodes(self):
5307 """Build hooks nodes.
5310 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5313 def CheckPrereq(self):
5314 """Check prerequisites.
5316 This only checks the instance list against the existing names.
5319 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5321 if (self.op.master_candidate is not None or
5322 self.op.drained is not None or
5323 self.op.offline is not None):
5324 # we can't change the master's node flags
5325 if self.op.node_name == self.cfg.GetMasterNode():
5326 raise errors.OpPrereqError("The master role can be changed"
5327 " only via master-failover",
5330 if self.op.master_candidate and not node.master_capable:
5331 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5332 " it a master candidate" % node.name,
5335 if self.op.vm_capable == False:
5336 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5338 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5339 " the vm_capable flag" % node.name,
5342 if node.master_candidate and self.might_demote and not self.lock_all:
5343 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5344 # check if after removing the current node, we're missing master
5346 (mc_remaining, mc_should, _) = \
5347 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5348 if mc_remaining < mc_should:
5349 raise errors.OpPrereqError("Not enough master candidates, please"
5350 " pass auto promote option to allow"
5351 " promotion", errors.ECODE_STATE)
5353 self.old_flags = old_flags = (node.master_candidate,
5354 node.drained, node.offline)
5355 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5356 self.old_role = old_role = self._F2R[old_flags]
5358 # Check for ineffective changes
5359 for attr in self._FLAGS:
5360 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5361 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5362 setattr(self.op, attr, None)
5364 # Past this point, any flag change to False means a transition
5365 # away from the respective state, as only real changes are kept
5367 # TODO: We might query the real power state if it supports OOB
5368 if _SupportsOob(self.cfg, node):
5369 if self.op.offline is False and not (node.powered or
5370 self.op.powered == True):
5371 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5372 " offline status can be reset") %
5374 elif self.op.powered is not None:
5375 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5376 " as it does not support out-of-band"
5377 " handling") % self.op.node_name)
5379 # If we're being de-offlined or un-drained, we'll promote ourselves to master candidate if needed
5380 if (self.op.drained == False or self.op.offline == False or
5381 (self.op.master_capable and not node.master_capable)):
5382 if _DecideSelfPromotion(self):
5383 self.op.master_candidate = True
5384 self.LogInfo("Auto-promoting node to master candidate")
5386 # If we're no longer master capable, we'll demote ourselves from MC
5387 if self.op.master_capable == False and node.master_candidate:
5388 self.LogInfo("Demoting from master candidate")
5389 self.op.master_candidate = False
5392 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5393 if self.op.master_candidate:
5394 new_role = self._ROLE_CANDIDATE
5395 elif self.op.drained:
5396 new_role = self._ROLE_DRAINED
5397 elif self.op.offline:
5398 new_role = self._ROLE_OFFLINE
5399 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5400 # False is still in new flags, which means we're un-setting (the
5402 new_role = self._ROLE_REGULAR
5403 else: # no new flags, nothing, keep old role
5406 self.new_role = new_role
5408 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5409 # Trying to transition out of offline status
5410 # TODO: Use standard RPC runner, but make sure it works when the node is
5411 # still marked offline
5412 result = rpc.BootstrapRunner().call_version([node.name])[node.name]
5414 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5415 " to report its version: %s" %
5416 (node.name, result.fail_msg),
5419 self.LogWarning("Transitioning node from offline to online state"
5420 " without using re-add. Please make sure the node"
5423 if self.op.secondary_ip:
5424 # Ok even without locking, because this can't be changed by any LU
5425 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5426 master_singlehomed = master.secondary_ip == master.primary_ip
5427 if master_singlehomed and self.op.secondary_ip:
5428 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5429 " homed cluster", errors.ECODE_INVAL)
5432 if self.affected_instances:
5433 raise errors.OpPrereqError("Cannot change secondary ip: offline"
5434 " node has instances (%s) configured"
5435 " to use it" % self.affected_instances)
5437 # On online nodes, check that no instances are running, and that
5438 # the node has the new ip and we can reach it.
5439 for instance in self.affected_instances:
5440 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5442 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5443 if master.name != node.name:
5444 # check reachability from master secondary ip to new secondary ip
5445 if not netutils.TcpPing(self.op.secondary_ip,
5446 constants.DEFAULT_NODED_PORT,
5447 source=master.secondary_ip):
5448 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5449 " based ping to node daemon port",
5450 errors.ECODE_ENVIRON)
5452 if self.op.ndparams:
5453 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5454 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5455 self.new_ndparams = new_ndparams
5457 def Exec(self, feedback_fn):
5462 old_role = self.old_role
5463 new_role = self.new_role
5467 if self.op.ndparams:
5468 node.ndparams = self.new_ndparams
5470 if self.op.powered is not None:
5471 node.powered = self.op.powered
5473 for attr in ["master_capable", "vm_capable"]:
5474 val = getattr(self.op, attr)
5476 setattr(node, attr, val)
5477 result.append((attr, str(val)))
5479 if new_role != old_role:
5480 # Tell the node to demote itself, if no longer MC and not offline
5481 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5482 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5484 self.LogWarning("Node failed to demote itself: %s", msg)
5486 new_flags = self._R2F[new_role]
5487 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5489 result.append((desc, str(nf)))
5490 (node.master_candidate, node.drained, node.offline) = new_flags
5492 # we locked all nodes, so we adjust the candidate pool before updating this node
5494 _AdjustCandidatePool(self, [node.name])
5496 if self.op.secondary_ip:
5497 node.secondary_ip = self.op.secondary_ip
5498 result.append(("secondary_ip", self.op.secondary_ip))
5500 # this will trigger configuration file update, if needed
5501 self.cfg.Update(node, feedback_fn)
5503 # this will trigger job queue propagation or cleanup if the mc
5505 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5506 self.context.ReaddNode(node)
5511 class LUNodePowercycle(NoHooksLU):
5512 """Powercycles a node.
5517 def CheckArguments(self):
5518 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5519 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5520 raise errors.OpPrereqError("The node is the master and the force"
5521 " parameter was not set",
5524 def ExpandNames(self):
5525 """Locking for PowercycleNode.
5527 This is a last-resort option and shouldn't block on other
5528 jobs. Therefore, we grab no locks.
5531 self.needed_locks = {}
5533 def Exec(self, feedback_fn):
5537 result = self.rpc.call_node_powercycle(self.op.node_name,
5538 self.cfg.GetHypervisorType())
5539 result.Raise("Failed to schedule the reboot")
5540 return result.payload
5543 class LUClusterQuery(NoHooksLU):
5544 """Query cluster configuration.
5549 def ExpandNames(self):
5550 self.needed_locks = {}
5552 def Exec(self, feedback_fn):
5553 """Return cluster config.
5556 cluster = self.cfg.GetClusterInfo()
5559 # Filter just for enabled hypervisors
5560 for os_name, hv_dict in cluster.os_hvp.items():
5561 os_hvp[os_name] = {}
5562 for hv_name, hv_params in hv_dict.items():
5563 if hv_name in cluster.enabled_hypervisors:
5564 os_hvp[os_name][hv_name] = hv_params
5566 # Convert ip_family to ip_version
5567 primary_ip_version = constants.IP4_VERSION
5568 if cluster.primary_ip_family == netutils.IP6Address.family:
5569 primary_ip_version = constants.IP6_VERSION
5572 "software_version": constants.RELEASE_VERSION,
5573 "protocol_version": constants.PROTOCOL_VERSION,
5574 "config_version": constants.CONFIG_VERSION,
5575 "os_api_version": max(constants.OS_API_VERSIONS),
5576 "export_version": constants.EXPORT_VERSION,
5577 "architecture": (platform.architecture()[0], platform.machine()),
5578 "name": cluster.cluster_name,
5579 "master": cluster.master_node,
5580 "default_hypervisor": cluster.enabled_hypervisors[0],
5581 "enabled_hypervisors": cluster.enabled_hypervisors,
5582 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5583 for hypervisor_name in cluster.enabled_hypervisors]),
5585 "beparams": cluster.beparams,
5586 "osparams": cluster.osparams,
5587 "nicparams": cluster.nicparams,
5588 "ndparams": cluster.ndparams,
5589 "candidate_pool_size": cluster.candidate_pool_size,
5590 "master_netdev": cluster.master_netdev,
5591 "master_netmask": cluster.master_netmask,
5592 "volume_group_name": cluster.volume_group_name,
5593 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5594 "file_storage_dir": cluster.file_storage_dir,
5595 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5596 "maintain_node_health": cluster.maintain_node_health,
5597 "ctime": cluster.ctime,
5598 "mtime": cluster.mtime,
5599 "uuid": cluster.uuid,
5600 "tags": list(cluster.GetTags()),
5601 "uid_pool": cluster.uid_pool,
5602 "default_iallocator": cluster.default_iallocator,
5603 "reserved_lvs": cluster.reserved_lvs,
5604 "primary_ip_version": primary_ip_version,
5605 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5606 "hidden_os": cluster.hidden_os,
5607 "blacklisted_os": cluster.blacklisted_os,
5613 class LUClusterConfigQuery(NoHooksLU):
5614 """Return configuration values.
5618 _FIELDS_DYNAMIC = utils.FieldSet()
5619 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5620 "watcher_pause", "volume_group_name")
5622 def CheckArguments(self):
5623 _CheckOutputFields(static=self._FIELDS_STATIC,
5624 dynamic=self._FIELDS_DYNAMIC,
5625 selected=self.op.output_fields)
5627 def ExpandNames(self):
5628 self.needed_locks = {}
5630 def Exec(self, feedback_fn):
5631 """Dump a representation of the cluster config to the standard output.
5635 for field in self.op.output_fields:
5636 if field == "cluster_name":
5637 entry = self.cfg.GetClusterName()
5638 elif field == "master_node":
5639 entry = self.cfg.GetMasterNode()
5640 elif field == "drain_flag":
5641 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5642 elif field == "watcher_pause":
5643 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5644 elif field == "volume_group_name":
5645 entry = self.cfg.GetVGName()
5647 raise errors.ParameterError(field)
5648 values.append(entry)
5652 class LUInstanceActivateDisks(NoHooksLU):
5653 """Bring up an instance's disks.
5658 def ExpandNames(self):
5659 self._ExpandAndLockInstance()
5660 self.needed_locks[locking.LEVEL_NODE] = []
5661 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5663 def DeclareLocks(self, level):
5664 if level == locking.LEVEL_NODE:
5665 self._LockInstancesNodes()
5667 def CheckPrereq(self):
5668 """Check prerequisites.
5670 This checks that the instance is in the cluster.
5673 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5674 assert self.instance is not None, \
5675 "Cannot retrieve locked instance %s" % self.op.instance_name
5676 _CheckNodeOnline(self, self.instance.primary_node)
5678 def Exec(self, feedback_fn):
5679 """Activate the disks.
5682 disks_ok, disks_info = \
5683 _AssembleInstanceDisks(self, self.instance,
5684 ignore_size=self.op.ignore_size)
5686 raise errors.OpExecError("Cannot activate block devices")
5691 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5693 """Prepare the block devices for an instance.
5695 This sets up the block devices on all nodes.
5697 @type lu: L{LogicalUnit}
5698 @param lu: the logical unit on whose behalf we execute
5699 @type instance: L{objects.Instance}
5700 @param instance: the instance for whose disks we assemble
5701 @type disks: list of L{objects.Disk} or None
5702 @param disks: which disks to assemble (or all, if None)
5703 @type ignore_secondaries: boolean
5704 @param ignore_secondaries: if true, errors on secondary nodes
5705 won't result in an error return from the function
5706 @type ignore_size: boolean
5707 @param ignore_size: if true, the current known size of the disk
5708 will not be used during the disk activation, useful for cases
5709 when the size is wrong
5710 @return: False if the operation failed, otherwise a list of
5711 (host, instance_visible_name, node_visible_name)
5712 with the mapping from node devices to instance devices
5717 iname = instance.name
5718 disks = _ExpandCheckDisks(instance, disks)
5720 # With the two-pass mechanism we try to reduce the window of
5721 # opportunity for the race condition of switching DRBD to primary
5722 # before handshaking occurred, but we do not eliminate it
5724 # The proper fix would be to wait (with some limits) until the
5725 # connection has been made and drbd transitions from WFConnection
5726 # into any other network-connected state (Connected, SyncTarget,
5729 # 1st pass, assemble on all nodes in secondary mode
5730 for idx, inst_disk in enumerate(disks):
5731 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5733 node_disk = node_disk.Copy()
5734 node_disk.UnsetSize()
5735 lu.cfg.SetDiskID(node_disk, node)
5736 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5737 msg = result.fail_msg
5739 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5740 " (is_primary=False, pass=1): %s",
5741 inst_disk.iv_name, node, msg)
5742 if not ignore_secondaries:
5745 # FIXME: race condition on drbd migration to primary
5747 # 2nd pass, do only the primary node
5748 for idx, inst_disk in enumerate(disks):
5751 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5752 if node != instance.primary_node:
5755 node_disk = node_disk.Copy()
5756 node_disk.UnsetSize()
5757 lu.cfg.SetDiskID(node_disk, node)
5758 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5759 msg = result.fail_msg
5761 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5762 " (is_primary=True, pass=2): %s",
5763 inst_disk.iv_name, node, msg)
5766 dev_path = result.payload
5768 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5770 # leave the disks configured for the primary node
5771 # this is a workaround that would be fixed better by
5772 # improving the logical/physical id handling
5774 lu.cfg.SetDiskID(disk, instance.primary_node)
5776 return disks_ok, device_info
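# Illustrative sketch only (hypothetical helper, not part of the original
# module): the usual calling pattern, mirroring LUInstanceActivateDisks
# above -- check disks_ok, then treat device_info as a list of
# (node, iv_name, device_path) tuples.
def _ExampleUseAssembledDisks(lu, instance, feedback_fn):
  """Hypothetical helper demonstrating _AssembleInstanceDisks usage."""
  disks_ok, device_info = _AssembleInstanceDisks(lu, instance)
  if not disks_ok:
    raise errors.OpExecError("Cannot activate block devices")
  for node, iv_name, dev_path in device_info:
    feedback_fn("Disk %s visible on node %s as %s" % (iv_name, node, dev_path))
  return device_info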
5779 def _StartInstanceDisks(lu, instance, force):
5780 """Start the disks of an instance.
5783 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5784 ignore_secondaries=force)
5786 _ShutdownInstanceDisks(lu, instance)
5787 if force is not None and not force:
5788 lu.proc.LogWarning("", hint="If the message above refers to a"
5790 " you can retry the operation using '--force'.")
5791 raise errors.OpExecError("Disk consistency error")
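# Illustrative call (as used by LUInstanceStartup below): a true force value
# makes failures on secondary nodes non-fatal, while force=None merely
# suppresses the "--force" retry hint printed on a disk consistency error.
#   _StartInstanceDisks(self, instance, self.op.force)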
5794 class LUInstanceDeactivateDisks(NoHooksLU):
5795 """Shutdown an instance's disks.
5800 def ExpandNames(self):
5801 self._ExpandAndLockInstance()
5802 self.needed_locks[locking.LEVEL_NODE] = []
5803 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5805 def DeclareLocks(self, level):
5806 if level == locking.LEVEL_NODE:
5807 self._LockInstancesNodes()
5809 def CheckPrereq(self):
5810 """Check prerequisites.
5812 This checks that the instance is in the cluster.
5815 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5816 assert self.instance is not None, \
5817 "Cannot retrieve locked instance %s" % self.op.instance_name
5819 def Exec(self, feedback_fn):
5820 """Deactivate the disks
5823 instance = self.instance
5825 _ShutdownInstanceDisks(self, instance)
5827 _SafeShutdownInstanceDisks(self, instance)
5830 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5831 """Shutdown block devices of an instance.
5833 This function checks if an instance is running before calling
5834 _ShutdownInstanceDisks.
5837 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5838 _ShutdownInstanceDisks(lu, instance, disks=disks)
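# Illustrative sketch (hypothetical helper, not in the original module):
# LUInstanceDeactivateDisks.Exec above picks between the forced and the safe
# variant; the safe one first checks that the instance is down.
def _ExampleDeactivateDisks(lu, instance, force=False):
  """Hypothetical helper mirroring LUInstanceDeactivateDisks.Exec."""
  if force:
    _ShutdownInstanceDisks(lu, instance)
  else:
    _SafeShutdownInstanceDisks(lu, instance)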
5841 def _ExpandCheckDisks(instance, disks):
5842 """Return the instance disks selected by the disks list
5844 @type disks: list of L{objects.Disk} or None
5845 @param disks: selected disks
5846 @rtype: list of L{objects.Disk}
5847 @return: selected instance disks to act on
5851 return instance.disks
5853 if not set(disks).issubset(instance.disks):
5854 raise errors.ProgrammerError("Can only act on disks belonging to the"
5859 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5860 """Shutdown block devices of an instance.
5862 This does the shutdown on all nodes of the instance.
5864 If ignore_primary is false, errors on the primary node are
5869 disks = _ExpandCheckDisks(instance, disks)
5872 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5873 lu.cfg.SetDiskID(top_disk, node)
5874 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5875 msg = result.fail_msg
5877 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5878 disk.iv_name, node, msg)
5879 if ((node == instance.primary_node and not ignore_primary) or
5880 (node != instance.primary_node and not result.offline)):
5885 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5886 """Checks if a node has enough free memory.
5888 This function checks if a given node has the needed amount of free
5889 memory. In case the node has less memory or we cannot get the
5890 information from the node, this function raises an OpPrereqError
5893 @type lu: C{LogicalUnit}
5894 @param lu: a logical unit from which we get configuration data
5896 @param node: the node to check
5897 @type reason: C{str}
5898 @param reason: string to use in the error message
5899 @type requested: C{int}
5900 @param requested: the amount of memory in MiB to check for
5901 @type hypervisor_name: C{str}
5902 @param hypervisor_name: the hypervisor to ask for memory stats
5903 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5904 we cannot check the node
5907 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5908 nodeinfo[node].Raise("Can't get data from node %s" % node,
5909 prereq=True, ecode=errors.ECODE_ENVIRON)
5910 free_mem = nodeinfo[node].payload.get("memory_free", None)
5911 if not isinstance(free_mem, int):
5912 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5913 " was '%s'" % (node, free_mem),
5914 errors.ECODE_ENVIRON)
5915 if requested > free_mem:
5916 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5917 " needed %s MiB, available %s MiB" %
5918 (node, reason, requested, free_mem),
5922 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5923 """Checks if nodes have enough free disk space in the all VGs.
5925 This function check if all given nodes have the needed amount of
5926 free disk. In case any node has less disk or we cannot get the
5927 information from the node, this function raise an OpPrereqError
5930 @type lu: C{LogicalUnit}
5931 @param lu: a logical unit from which we get configuration data
5932 @type nodenames: C{list}
5933 @param nodenames: the list of node names to check
5934 @type req_sizes: C{dict}
5935 @param req_sizes: the hash of vg and corresponding amount of disk in
5937 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5938 or we cannot check the node
5941 for vg, req_size in req_sizes.items():
5942 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
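# Illustrative example (node and VG names are made up): req_sizes maps each
# volume group to the free space, in MiB, that must be available on every
# listed node.
#   _CheckNodesFreeDiskPerVG(self, ["node1.example.com", "node2.example.com"],
#                            {"xenvg": 2 * 10240, "datavg": 512})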
5945 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5946 """Checks if nodes have enough free disk space in the specified VG.
5948 This function checks if all given nodes have the needed amount of
5949 free disk. In case any node has less disk or we cannot get the
5950 information from the node, this function raises an OpPrereqError
5953 @type lu: C{LogicalUnit}
5954 @param lu: a logical unit from which we get configuration data
5955 @type nodenames: C{list}
5956 @param nodenames: the list of node names to check
5958 @param vg: the volume group to check
5959 @type requested: C{int}
5960 @param requested: the amount of disk in MiB to check for
5961 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5962 or we cannot check the node
5965 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5966 for node in nodenames:
5967 info = nodeinfo[node]
5968 info.Raise("Cannot get current information from node %s" % node,
5969 prereq=True, ecode=errors.ECODE_ENVIRON)
5970 vg_free = info.payload.get("vg_free", None)
5971 if not isinstance(vg_free, int):
5972 raise errors.OpPrereqError("Can't compute free disk space on node"
5973 " %s for vg %s, result was '%s'" %
5974 (node, vg, vg_free), errors.ECODE_ENVIRON)
5975 if requested > vg_free:
5976 raise errors.OpPrereqError("Not enough disk space on target node %s"
5977 " vg %s: required %d MiB, available %d MiB" %
5978 (node, vg, requested, vg_free),
5982 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
5983 """Checks if nodes have enough physical CPUs
5985 This function checks if all given nodes have the needed number of
5986 physical CPUs. In case any node has less CPUs or we cannot get the
5987 information from the node, this function raises an OpPrereqError
5990 @type lu: C{LogicalUnit}
5991 @param lu: a logical unit from which we get configuration data
5992 @type nodenames: C{list}
5993 @param nodenames: the list of node names to check
5994 @type requested: C{int}
5995 @param requested: the minimum acceptable number of physical CPUs
5996 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
5997 or we cannot check the node
6000 nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name)
6001 for node in nodenames:
6002 info = nodeinfo[node]
6003 info.Raise("Cannot get current information from node %s" % node,
6004 prereq=True, ecode=errors.ECODE_ENVIRON)
6005 num_cpus = info.payload.get("cpu_total", None)
6006 if not isinstance(num_cpus, int):
6007 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
6008 " on node %s, result was '%s'" %
6009 (node, num_cpus), errors.ECODE_ENVIRON)
6010 if requested > num_cpus:
6011 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
6012 "required" % (node, num_cpus, requested),
6016 class LUInstanceStartup(LogicalUnit):
6017 """Starts an instance.
6020 HPATH = "instance-start"
6021 HTYPE = constants.HTYPE_INSTANCE
6024 def CheckArguments(self):
6026 if self.op.beparams:
6027 # fill the beparams dict
6028 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
6030 def ExpandNames(self):
6031 self._ExpandAndLockInstance()
6033 def BuildHooksEnv(self):
6036 This runs on master, primary and secondary nodes of the instance.
6040 "FORCE": self.op.force,
6043 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6047 def BuildHooksNodes(self):
6048 """Build hooks nodes.
6051 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6054 def CheckPrereq(self):
6055 """Check prerequisites.
6057 This checks that the instance is in the cluster.
6060 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6061 assert self.instance is not None, \
6062 "Cannot retrieve locked instance %s" % self.op.instance_name
6065 if self.op.hvparams:
6066 # check hypervisor parameter syntax (locally)
6067 cluster = self.cfg.GetClusterInfo()
6068 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6069 filled_hvp = cluster.FillHV(instance)
6070 filled_hvp.update(self.op.hvparams)
6071 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6072 hv_type.CheckParameterSyntax(filled_hvp)
6073 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6075 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6077 if self.primary_offline and self.op.ignore_offline_nodes:
6078 self.proc.LogWarning("Ignoring offline primary node")
6080 if self.op.hvparams or self.op.beparams:
6081 self.proc.LogWarning("Overridden parameters are ignored")
6083 _CheckNodeOnline(self, instance.primary_node)
6085 bep = self.cfg.GetClusterInfo().FillBE(instance)
6087 # check bridges existence
6088 _CheckInstanceBridgesExist(self, instance)
6090 remote_info = self.rpc.call_instance_info(instance.primary_node,
6092 instance.hypervisor)
6093 remote_info.Raise("Error checking node %s" % instance.primary_node,
6094 prereq=True, ecode=errors.ECODE_ENVIRON)
6095 if not remote_info.payload: # not running already
6096 _CheckNodeFreeMemory(self, instance.primary_node,
6097 "starting instance %s" % instance.name,
6098 bep[constants.BE_MEMORY], instance.hypervisor)
6100 def Exec(self, feedback_fn):
6101 """Start the instance.
6104 instance = self.instance
6105 force = self.op.force
6107 if not self.op.no_remember:
6108 self.cfg.MarkInstanceUp(instance.name)
6110 if self.primary_offline:
6111 assert self.op.ignore_offline_nodes
6112 self.proc.LogInfo("Primary node offline, marked instance as started")
6114 node_current = instance.primary_node
6116 _StartInstanceDisks(self, instance, force)
6119 self.rpc.call_instance_start(node_current,
6120 (instance, self.op.hvparams,
6122 self.op.startup_paused)
6123 msg = result.fail_msg
6125 _ShutdownInstanceDisks(self, instance)
6126 raise errors.OpExecError("Could not start instance: %s" % msg)
6129 class LUInstanceReboot(LogicalUnit):
6130 """Reboot an instance.
6133 HPATH = "instance-reboot"
6134 HTYPE = constants.HTYPE_INSTANCE
6137 def ExpandNames(self):
6138 self._ExpandAndLockInstance()
6140 def BuildHooksEnv(self):
6143 This runs on master, primary and secondary nodes of the instance.
6147 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6148 "REBOOT_TYPE": self.op.reboot_type,
6149 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6152 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6156 def BuildHooksNodes(self):
6157 """Build hooks nodes.
6160 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6163 def CheckPrereq(self):
6164 """Check prerequisites.
6166 This checks that the instance is in the cluster.
6169 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6170 assert self.instance is not None, \
6171 "Cannot retrieve locked instance %s" % self.op.instance_name
6173 _CheckNodeOnline(self, instance.primary_node)
6175 # check bridges existence
6176 _CheckInstanceBridgesExist(self, instance)
6178 def Exec(self, feedback_fn):
6179 """Reboot the instance.
6182 instance = self.instance
6183 ignore_secondaries = self.op.ignore_secondaries
6184 reboot_type = self.op.reboot_type
6186 remote_info = self.rpc.call_instance_info(instance.primary_node,
6188 instance.hypervisor)
6189 remote_info.Raise("Error checking node %s" % instance.primary_node)
6190 instance_running = bool(remote_info.payload)
6192 node_current = instance.primary_node
6194 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6195 constants.INSTANCE_REBOOT_HARD]:
6196 for disk in instance.disks:
6197 self.cfg.SetDiskID(disk, node_current)
6198 result = self.rpc.call_instance_reboot(node_current, instance,
6200 self.op.shutdown_timeout)
6201 result.Raise("Could not reboot instance")
6203 if instance_running:
6204 result = self.rpc.call_instance_shutdown(node_current, instance,
6205 self.op.shutdown_timeout)
6206 result.Raise("Could not shutdown instance for full reboot")
6207 _ShutdownInstanceDisks(self, instance)
6209 self.LogInfo("Instance %s was already stopped, starting now",
6211 _StartInstanceDisks(self, instance, ignore_secondaries)
6212 result = self.rpc.call_instance_start(node_current,
6213 (instance, None, None), False)
6214 msg = result.fail_msg
6216 _ShutdownInstanceDisks(self, instance)
6217 raise errors.OpExecError("Could not start instance for"
6218 " full reboot: %s" % msg)
6220 self.cfg.MarkInstanceUp(instance.name)
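  # Illustrative note (opcode parameters are example values only): soft and
  # hard reboots are delegated to the hypervisor via call_instance_reboot;
  # any other reboot type takes the full stop/start path above, e.g.
  #   opcodes.OpInstanceReboot(instance_name="inst1.example.com",
  #                            reboot_type=constants.INSTANCE_REBOOT_FULL)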
6223 class LUInstanceShutdown(LogicalUnit):
6224 """Shutdown an instance.
6227 HPATH = "instance-stop"
6228 HTYPE = constants.HTYPE_INSTANCE
6231 def ExpandNames(self):
6232 self._ExpandAndLockInstance()
6234 def BuildHooksEnv(self):
6237 This runs on master, primary and secondary nodes of the instance.
6240 env = _BuildInstanceHookEnvByObject(self, self.instance)
6241 env["TIMEOUT"] = self.op.timeout
6244 def BuildHooksNodes(self):
6245 """Build hooks nodes.
6248 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6251 def CheckPrereq(self):
6252 """Check prerequisites.
6254 This checks that the instance is in the cluster.
6257 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6258 assert self.instance is not None, \
6259 "Cannot retrieve locked instance %s" % self.op.instance_name
6261 self.primary_offline = \
6262 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6264 if self.primary_offline and self.op.ignore_offline_nodes:
6265 self.proc.LogWarning("Ignoring offline primary node")
6267 _CheckNodeOnline(self, self.instance.primary_node)
6269 def Exec(self, feedback_fn):
6270 """Shutdown the instance.
6273 instance = self.instance
6274 node_current = instance.primary_node
6275 timeout = self.op.timeout
6277 if not self.op.no_remember:
6278 self.cfg.MarkInstanceDown(instance.name)
6280 if self.primary_offline:
6281 assert self.op.ignore_offline_nodes
6282 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6284 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6285 msg = result.fail_msg
6287 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6289 _ShutdownInstanceDisks(self, instance)
6292 class LUInstanceReinstall(LogicalUnit):
6293 """Reinstall an instance.
6296 HPATH = "instance-reinstall"
6297 HTYPE = constants.HTYPE_INSTANCE
6300 def ExpandNames(self):
6301 self._ExpandAndLockInstance()
6303 def BuildHooksEnv(self):
6306 This runs on master, primary and secondary nodes of the instance.
6309 return _BuildInstanceHookEnvByObject(self, self.instance)
6311 def BuildHooksNodes(self):
6312 """Build hooks nodes.
6315 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6318 def CheckPrereq(self):
6319 """Check prerequisites.
6321 This checks that the instance is in the cluster and is not running.
6324 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6325 assert instance is not None, \
6326 "Cannot retrieve locked instance %s" % self.op.instance_name
6327 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6328 " offline, cannot reinstall")
6329 for node in instance.secondary_nodes:
6330 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6331 " cannot reinstall")
6333 if instance.disk_template == constants.DT_DISKLESS:
6334 raise errors.OpPrereqError("Instance '%s' has no disks" %
6335 self.op.instance_name,
6337 _CheckInstanceDown(self, instance, "cannot reinstall")
6339 if self.op.os_type is not None:
6341 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6342 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6343 instance_os = self.op.os_type
6345 instance_os = instance.os
6347 nodelist = list(instance.all_nodes)
6349 if self.op.osparams:
6350 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6351 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6352 self.os_inst = i_osdict # the new dict (without defaults)
6356 self.instance = instance
6358 def Exec(self, feedback_fn):
6359 """Reinstall the instance.
6362 inst = self.instance
6364 if self.op.os_type is not None:
6365 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6366 inst.os = self.op.os_type
6367 # Write to configuration
6368 self.cfg.Update(inst, feedback_fn)
6370 _StartInstanceDisks(self, inst, None)
6372 feedback_fn("Running the instance OS create scripts...")
6373 # FIXME: pass debug option from opcode to backend
6374 result = self.rpc.call_instance_os_add(inst.primary_node,
6375 (inst, self.os_inst), True,
6376 self.op.debug_level)
6377 result.Raise("Could not install OS for instance %s on node %s" %
6378 (inst.name, inst.primary_node))
6380 _ShutdownInstanceDisks(self, inst)
6383 class LUInstanceRecreateDisks(LogicalUnit):
6384 """Recreate an instance's missing disks.
6387 HPATH = "instance-recreate-disks"
6388 HTYPE = constants.HTYPE_INSTANCE
6391 def CheckArguments(self):
6392 # normalise the disk list
6393 self.op.disks = sorted(frozenset(self.op.disks))
6395 def ExpandNames(self):
6396 self._ExpandAndLockInstance()
6397 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6399 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6400 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6402 self.needed_locks[locking.LEVEL_NODE] = []
6404 def DeclareLocks(self, level):
6405 if level == locking.LEVEL_NODE:
6406 # if we replace the nodes, we only need to lock the old primary,
6407 # otherwise we need to lock all nodes for disk re-creation
6408 primary_only = bool(self.op.nodes)
6409 self._LockInstancesNodes(primary_only=primary_only)
6411 def BuildHooksEnv(self):
6414 This runs on master, primary and secondary nodes of the instance.
6417 return _BuildInstanceHookEnvByObject(self, self.instance)
6419 def BuildHooksNodes(self):
6420 """Build hooks nodes.
6423 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6426 def CheckPrereq(self):
6427 """Check prerequisites.
6429 This checks that the instance is in the cluster and is not running.
6432 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6433 assert instance is not None, \
6434 "Cannot retrieve locked instance %s" % self.op.instance_name
6436 if len(self.op.nodes) != len(instance.all_nodes):
6437 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6438 " %d replacement nodes were specified" %
6439 (instance.name, len(instance.all_nodes),
6440 len(self.op.nodes)),
6442 assert instance.disk_template != constants.DT_DRBD8 or \
6443 len(self.op.nodes) == 2
6444 assert instance.disk_template != constants.DT_PLAIN or \
6445 len(self.op.nodes) == 1
6446 primary_node = self.op.nodes[0]
6448 primary_node = instance.primary_node
6449 _CheckNodeOnline(self, primary_node)
6451 if instance.disk_template == constants.DT_DISKLESS:
6452 raise errors.OpPrereqError("Instance '%s' has no disks" %
6453 self.op.instance_name, errors.ECODE_INVAL)
6454 # if we replace nodes *and* the old primary is offline, we don't
6456 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6457 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6458 if not (self.op.nodes and old_pnode.offline):
6459 _CheckInstanceDown(self, instance, "cannot recreate disks")
6461 if not self.op.disks:
6462 self.op.disks = range(len(instance.disks))
6464 for idx in self.op.disks:
6465 if idx >= len(instance.disks):
6466 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6468 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6469 raise errors.OpPrereqError("Can't recreate disks partially and"
6470 " change the nodes at the same time",
6472 self.instance = instance
6474 def Exec(self, feedback_fn):
6475 """Recreate the disks.
6478 instance = self.instance
6481 mods = [] # keeps track of needed logical_id changes
6483 for idx, disk in enumerate(instance.disks):
6484 if idx not in self.op.disks: # disk idx has not been passed in
6487 # update secondaries for disks, if needed
6489 if disk.dev_type == constants.LD_DRBD8:
6490 # need to update the nodes and minors
6491 assert len(self.op.nodes) == 2
6492 assert len(disk.logical_id) == 6 # otherwise disk internals
6494 (_, _, old_port, _, _, old_secret) = disk.logical_id
6495 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6496 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6497 new_minors[0], new_minors[1], old_secret)
6498 assert len(disk.logical_id) == len(new_id)
6499 mods.append((idx, new_id))
6501 # now that we have passed all asserts above, we can apply the mods
6502 # in a single run (to avoid partial changes)
6503 for idx, new_id in mods:
6504 instance.disks[idx].logical_id = new_id
6506 # change primary node, if needed
6508 instance.primary_node = self.op.nodes[0]
6509 self.LogWarning("Changing the instance's nodes, you will have to"
6510 " remove any disks left on the older nodes manually")
6513 self.cfg.Update(instance, feedback_fn)
6515 _CreateDisks(self, instance, to_skip=to_skip)
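# Illustrative note (field names follow the tuple unpacked above): for DRBD8
# disks the logical_id rewritten here is the 6-tuple
#   (node_a, node_b, port, minor_a, minor_b, secret)
# and recreating on new nodes keeps port and secret while allocating fresh
# minors through AllocateDRBDMinor.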
6518 class LUInstanceRename(LogicalUnit):
6519 """Rename an instance.
6522 HPATH = "instance-rename"
6523 HTYPE = constants.HTYPE_INSTANCE
6525 def CheckArguments(self):
6529 if self.op.ip_check and not self.op.name_check:
6530 # TODO: make the ip check more flexible and not depend on the name check
6531 raise errors.OpPrereqError("IP address check requires a name check",
6534 def BuildHooksEnv(self):
6537 This runs on master, primary and secondary nodes of the instance.
6540 env = _BuildInstanceHookEnvByObject(self, self.instance)
6541 env["INSTANCE_NEW_NAME"] = self.op.new_name
6544 def BuildHooksNodes(self):
6545 """Build hooks nodes.
6548 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6551 def CheckPrereq(self):
6552 """Check prerequisites.
6554 This checks that the instance is in the cluster and is not running.
6557 self.op.instance_name = _ExpandInstanceName(self.cfg,
6558 self.op.instance_name)
6559 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6560 assert instance is not None
6561 _CheckNodeOnline(self, instance.primary_node)
6562 _CheckInstanceDown(self, instance, "cannot rename")
6563 self.instance = instance
6565 new_name = self.op.new_name
6566 if self.op.name_check:
6567 hostname = netutils.GetHostname(name=new_name)
6568 if hostname != new_name:
6569 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6571 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6572 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6573 " same as given hostname '%s'") %
6574 (hostname.name, self.op.new_name),
6576 new_name = self.op.new_name = hostname.name
6577 if (self.op.ip_check and
6578 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6579 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6580 (hostname.ip, new_name),
6581 errors.ECODE_NOTUNIQUE)
6583 instance_list = self.cfg.GetInstanceList()
6584 if new_name in instance_list and new_name != instance.name:
6585 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6586 new_name, errors.ECODE_EXISTS)
6588 def Exec(self, feedback_fn):
6589 """Rename the instance.
6592 inst = self.instance
6593 old_name = inst.name
6595 rename_file_storage = False
6596 if (inst.disk_template in constants.DTS_FILEBASED and
6597 self.op.new_name != inst.name):
6598 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6599 rename_file_storage = True
6601 self.cfg.RenameInstance(inst.name, self.op.new_name)
6602 # Change the instance lock. This is definitely safe while we hold the BGL.
6603 # Otherwise the new lock would have to be added in acquired mode.
6605 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6606 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6608 # re-read the instance from the configuration after rename
6609 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6611 if rename_file_storage:
6612 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6613 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6614 old_file_storage_dir,
6615 new_file_storage_dir)
6616 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6617 " (but the instance has been renamed in Ganeti)" %
6618 (inst.primary_node, old_file_storage_dir,
6619 new_file_storage_dir))
6621 _StartInstanceDisks(self, inst, None)
6623 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6624 old_name, self.op.debug_level)
6625 msg = result.fail_msg
6627 msg = ("Could not run OS rename script for instance %s on node %s"
6628 " (but the instance has been renamed in Ganeti): %s" %
6629 (inst.name, inst.primary_node, msg))
6630 self.proc.LogWarning(msg)
6632 _ShutdownInstanceDisks(self, inst)
6637 class LUInstanceRemove(LogicalUnit):
6638 """Remove an instance.
6641 HPATH = "instance-remove"
6642 HTYPE = constants.HTYPE_INSTANCE
6645 def ExpandNames(self):
6646 self._ExpandAndLockInstance()
6647 self.needed_locks[locking.LEVEL_NODE] = []
6648 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6650 def DeclareLocks(self, level):
6651 if level == locking.LEVEL_NODE:
6652 self._LockInstancesNodes()
6654 def BuildHooksEnv(self):
6657 This runs on master, primary and secondary nodes of the instance.
6660 env = _BuildInstanceHookEnvByObject(self, self.instance)
6661 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6664 def BuildHooksNodes(self):
6665 """Build hooks nodes.
6668 nl = [self.cfg.GetMasterNode()]
6669 nl_post = list(self.instance.all_nodes) + nl
6670 return (nl, nl_post)
6672 def CheckPrereq(self):
6673 """Check prerequisites.
6675 This checks that the instance is in the cluster.
6678 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6679 assert self.instance is not None, \
6680 "Cannot retrieve locked instance %s" % self.op.instance_name
6682 def Exec(self, feedback_fn):
6683 """Remove the instance.
6686 instance = self.instance
6687 logging.info("Shutting down instance %s on node %s",
6688 instance.name, instance.primary_node)
6690 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6691 self.op.shutdown_timeout)
6692 msg = result.fail_msg
6694 if self.op.ignore_failures:
6695 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6697 raise errors.OpExecError("Could not shutdown instance %s on"
6699 (instance.name, instance.primary_node, msg))
6701 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6704 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6705 """Utility function to remove an instance.
6708 logging.info("Removing block devices for instance %s", instance.name)
6710 if not _RemoveDisks(lu, instance):
6711 if not ignore_failures:
6712 raise errors.OpExecError("Can't remove instance's disks")
6713 feedback_fn("Warning: can't remove instance's disks")
6715 logging.info("Removing instance %s out of cluster config", instance.name)
6717 lu.cfg.RemoveInstance(instance.name)
6719 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6720 "Instance lock removal conflict"
6722 # Remove lock for the instance
6723 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6726 class LUInstanceQuery(NoHooksLU):
6727 """Logical unit for querying instances.
6730 # pylint: disable=W0142
6733 def CheckArguments(self):
6734 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6735 self.op.output_fields, self.op.use_locking)
6737 def ExpandNames(self):
6738 self.iq.ExpandNames(self)
6740 def DeclareLocks(self, level):
6741 self.iq.DeclareLocks(self, level)
6743 def Exec(self, feedback_fn):
6744 return self.iq.OldStyleQuery(self)
6747 class LUInstanceFailover(LogicalUnit):
6748 """Failover an instance.
6751 HPATH = "instance-failover"
6752 HTYPE = constants.HTYPE_INSTANCE
6755 def CheckArguments(self):
6756 """Check the arguments.
6759 self.iallocator = getattr(self.op, "iallocator", None)
6760 self.target_node = getattr(self.op, "target_node", None)
6762 def ExpandNames(self):
6763 self._ExpandAndLockInstance()
6765 if self.op.target_node is not None:
6766 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6768 self.needed_locks[locking.LEVEL_NODE] = []
6769 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6771 ignore_consistency = self.op.ignore_consistency
6772 shutdown_timeout = self.op.shutdown_timeout
6773 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6776 ignore_consistency=ignore_consistency,
6777 shutdown_timeout=shutdown_timeout)
6778 self.tasklets = [self._migrater]
6780 def DeclareLocks(self, level):
6781 if level == locking.LEVEL_NODE:
6782 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6783 if instance.disk_template in constants.DTS_EXT_MIRROR:
6784 if self.op.target_node is None:
6785 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6787 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6788 self.op.target_node]
6789 del self.recalculate_locks[locking.LEVEL_NODE]
6791 self._LockInstancesNodes()
6793 def BuildHooksEnv(self):
6796 This runs on master, primary and secondary nodes of the instance.
6799 instance = self._migrater.instance
6800 source_node = instance.primary_node
6801 target_node = self.op.target_node
6803 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6804 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6805 "OLD_PRIMARY": source_node,
6806 "NEW_PRIMARY": target_node,
6809 if instance.disk_template in constants.DTS_INT_MIRROR:
6810 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6811 env["NEW_SECONDARY"] = source_node
6813 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6815 env.update(_BuildInstanceHookEnvByObject(self, instance))
6819 def BuildHooksNodes(self):
6820 """Build hooks nodes.
6823 instance = self._migrater.instance
6824 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6825 return (nl, nl + [instance.primary_node])
6828 class LUInstanceMigrate(LogicalUnit):
6829 """Migrate an instance.
6831 This is migration without shutting down the instance, as opposed to
6832 failover, which requires the instance to be shut down first.
6835 HPATH = "instance-migrate"
6836 HTYPE = constants.HTYPE_INSTANCE
6839 def ExpandNames(self):
6840 self._ExpandAndLockInstance()
6842 if self.op.target_node is not None:
6843 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6845 self.needed_locks[locking.LEVEL_NODE] = []
6846 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6848 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6849 cleanup=self.op.cleanup,
6851 fallback=self.op.allow_failover)
6852 self.tasklets = [self._migrater]
6854 def DeclareLocks(self, level):
6855 if level == locking.LEVEL_NODE:
6856 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6857 if instance.disk_template in constants.DTS_EXT_MIRROR:
6858 if self.op.target_node is None:
6859 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6861 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6862 self.op.target_node]
6863 del self.recalculate_locks[locking.LEVEL_NODE]
6865 self._LockInstancesNodes()
6867 def BuildHooksEnv(self):
6870 This runs on master, primary and secondary nodes of the instance.
6873 instance = self._migrater.instance
6874 source_node = instance.primary_node
6875 target_node = self.op.target_node
6876 env = _BuildInstanceHookEnvByObject(self, instance)
6878 "MIGRATE_LIVE": self._migrater.live,
6879 "MIGRATE_CLEANUP": self.op.cleanup,
6880 "OLD_PRIMARY": source_node,
6881 "NEW_PRIMARY": target_node,
6884 if instance.disk_template in constants.DTS_INT_MIRROR:
6885 env["OLD_SECONDARY"] = target_node
6886 env["NEW_SECONDARY"] = source_node
6888 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6892 def BuildHooksNodes(self):
6893 """Build hooks nodes.
6896 instance = self._migrater.instance
6897 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6898 return (nl, nl + [instance.primary_node])
6901 class LUInstanceMove(LogicalUnit):
6902 """Move an instance by data-copying.
6905 HPATH = "instance-move"
6906 HTYPE = constants.HTYPE_INSTANCE
6909 def ExpandNames(self):
6910 self._ExpandAndLockInstance()
6911 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6912 self.op.target_node = target_node
6913 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6914 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6916 def DeclareLocks(self, level):
6917 if level == locking.LEVEL_NODE:
6918 self._LockInstancesNodes(primary_only=True)
6920 def BuildHooksEnv(self):
6923 This runs on master, primary and secondary nodes of the instance.
6927 "TARGET_NODE": self.op.target_node,
6928 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6930 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6933 def BuildHooksNodes(self):
6934 """Build hooks nodes.
6938 self.cfg.GetMasterNode(),
6939 self.instance.primary_node,
6940 self.op.target_node,
6944 def CheckPrereq(self):
6945 """Check prerequisites.
6947 This checks that the instance is in the cluster.
6950 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6951 assert self.instance is not None, \
6952 "Cannot retrieve locked instance %s" % self.op.instance_name
6954 node = self.cfg.GetNodeInfo(self.op.target_node)
6955 assert node is not None, \
6956 "Cannot retrieve locked node %s" % self.op.target_node
6958 self.target_node = target_node = node.name
6960 if target_node == instance.primary_node:
6961 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6962 (instance.name, target_node),
6965 bep = self.cfg.GetClusterInfo().FillBE(instance)
6967 for idx, dsk in enumerate(instance.disks):
6968 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6969 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6970 " cannot copy" % idx, errors.ECODE_STATE)
6972 _CheckNodeOnline(self, target_node)
6973 _CheckNodeNotDrained(self, target_node)
6974 _CheckNodeVmCapable(self, target_node)
6976 if instance.admin_up:
6977 # check memory requirements on the secondary node
6978 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6979 instance.name, bep[constants.BE_MEMORY],
6980 instance.hypervisor)
6982 self.LogInfo("Not checking memory on the secondary node as"
6983 " instance will not be started")
6985 # check bridge existence
6986 _CheckInstanceBridgesExist(self, instance, node=target_node)
6988 def Exec(self, feedback_fn):
6989 """Move an instance.
6991 The move is done by shutting it down on its present node, copying
6992 the data over (slow) and starting it on the new node.
6995 instance = self.instance
6997 source_node = instance.primary_node
6998 target_node = self.target_node
7000 self.LogInfo("Shutting down instance %s on source node %s",
7001 instance.name, source_node)
7003 result = self.rpc.call_instance_shutdown(source_node, instance,
7004 self.op.shutdown_timeout)
7005 msg = result.fail_msg
7007 if self.op.ignore_consistency:
7008 self.proc.LogWarning("Could not shutdown instance %s on node %s."
7009 " Proceeding anyway. Please make sure node"
7010 " %s is down. Error details: %s",
7011 instance.name, source_node, source_node, msg)
7013 raise errors.OpExecError("Could not shutdown instance %s on"
7015 (instance.name, source_node, msg))
7017 # create the target disks
7019 _CreateDisks(self, instance, target_node=target_node)
7020 except errors.OpExecError:
7021 self.LogWarning("Device creation failed, reverting...")
7023 _RemoveDisks(self, instance, target_node=target_node)
7025 self.cfg.ReleaseDRBDMinors(instance.name)
7028 cluster_name = self.cfg.GetClusterInfo().cluster_name
7031 # activate, get path, copy the data over
7032 for idx, disk in enumerate(instance.disks):
7033 self.LogInfo("Copying data for disk %d", idx)
7034 result = self.rpc.call_blockdev_assemble(target_node, disk,
7035 instance.name, True, idx)
7037 self.LogWarning("Can't assemble newly created disk %d: %s",
7038 idx, result.fail_msg)
7039 errs.append(result.fail_msg)
7041 dev_path = result.payload
7042 result = self.rpc.call_blockdev_export(source_node, disk,
7043 target_node, dev_path,
7046 self.LogWarning("Can't copy data over for disk %d: %s",
7047 idx, result.fail_msg)
7048 errs.append(result.fail_msg)
7052 self.LogWarning("Some disks failed to copy, aborting")
7054 _RemoveDisks(self, instance, target_node=target_node)
7056 self.cfg.ReleaseDRBDMinors(instance.name)
7057 raise errors.OpExecError("Errors during disk copy: %s" %
7060 instance.primary_node = target_node
7061 self.cfg.Update(instance, feedback_fn)
7063 self.LogInfo("Removing the disks on the original node")
7064 _RemoveDisks(self, instance, target_node=source_node)
7066 # Only start the instance if it's marked as up
7067 if instance.admin_up:
7068 self.LogInfo("Starting instance %s on node %s",
7069 instance.name, target_node)
7071 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7072 ignore_secondaries=True)
7074 _ShutdownInstanceDisks(self, instance)
7075 raise errors.OpExecError("Can't activate the instance's disks")
7077 result = self.rpc.call_instance_start(target_node,
7078 (instance, None, None), False)
7079 msg = result.fail_msg
7081 _ShutdownInstanceDisks(self, instance)
7082 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7083 (instance.name, target_node, msg))
7086 class LUNodeMigrate(LogicalUnit):
7087 """Migrate all instances from a node.
7090 HPATH = "node-migrate"
7091 HTYPE = constants.HTYPE_NODE
7094 def CheckArguments(self):
7097 def ExpandNames(self):
7098 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7100 self.share_locks = _ShareAll()
7101 self.needed_locks = {
7102 locking.LEVEL_NODE: [self.op.node_name],
7105 def BuildHooksEnv(self):
7108 This runs on the master, the primary and all the secondaries.
7112 "NODE_NAME": self.op.node_name,
7115 def BuildHooksNodes(self):
7116 """Build hooks nodes.
7119 nl = [self.cfg.GetMasterNode()]
7122 def CheckPrereq(self):
7125 def Exec(self, feedback_fn):
7126 # Prepare jobs for migration instances
7128 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7131 iallocator=self.op.iallocator,
7132 target_node=self.op.target_node)]
7133 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7136 # TODO: Run iallocator in this opcode and pass correct placement options to
7137 # OpInstanceMigrate. Since other jobs can modify the cluster between
7138 # running the iallocator and the actual migration, a good consistency model
7139 # will have to be found.
7141 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7142 frozenset([self.op.node_name]))
7144 return ResultWithJobs(jobs)
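  # Illustrative sketch (instance names are made up): jobs is a list of
  # lists of opcodes, one inner list per job to submit, mirroring the list
  # comprehension above.
  #   jobs = [[opcodes.OpInstanceMigrate(instance_name="inst1.example.com")],
  #           [opcodes.OpInstanceMigrate(instance_name="inst2.example.com")]]
  #   return ResultWithJobs(jobs)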
7147 class TLMigrateInstance(Tasklet):
7148 """Tasklet class for instance migration.
7151 @ivar live: whether the migration will be done live or non-live;
7152 this variable is initialized only after CheckPrereq has run
7153 @type cleanup: boolean
7154 @ivar cleanup: Whether we clean up from a failed migration
7155 @type iallocator: string
7156 @ivar iallocator: The iallocator used to determine target_node
7157 @type target_node: string
7158 @ivar target_node: If given, the target_node to reallocate the instance to
7159 @type failover: boolean
7160 @ivar failover: Whether operation results in failover or migration
7161 @type fallback: boolean
7162 @ivar fallback: Whether fallback to failover is allowed if migration not
7164 @type ignore_consistency: boolean
7165 @ivar ignore_consistency: Whether we should ignore consistency between source
7167 @type shutdown_timeout: int
7168 @ivar shutdown_timeout: In case of failover, timeout of the shutdown
7173 _MIGRATION_POLL_INTERVAL = 1 # seconds
7174 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7176 def __init__(self, lu, instance_name, cleanup=False,
7177 failover=False, fallback=False,
7178 ignore_consistency=False,
7179 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7180 """Initializes this class.
7183 Tasklet.__init__(self, lu)
7186 self.instance_name = instance_name
7187 self.cleanup = cleanup
7188 self.live = False # will be overridden later
7189 self.failover = failover
7190 self.fallback = fallback
7191 self.ignore_consistency = ignore_consistency
7192 self.shutdown_timeout = shutdown_timeout
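  # Illustrative construction (argument values are examples only):
  # LUInstanceFailover and LUInstanceMigrate above build this tasklet with
  # different flags -- a failover implies a stop/start, while a migration
  # may only fall back to one if allowed.
  #   TLMigrateInstance(lu, instance_name, failover=True,
  #                     ignore_consistency=False, shutdown_timeout=120)
  #   TLMigrateInstance(lu, instance_name, cleanup=False, fallback=True)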
7194 def CheckPrereq(self):
7195 """Check prerequisites.
7197 This checks that the instance is in the cluster.
7200 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7201 instance = self.cfg.GetInstanceInfo(instance_name)
7202 assert instance is not None
7203 self.instance = instance
7205 if (not self.cleanup and not instance.admin_up and not self.failover and
7207 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7209 self.failover = True
7211 if instance.disk_template not in constants.DTS_MIRRORED:
7216 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7217 " %s" % (instance.disk_template, text),
7220 if instance.disk_template in constants.DTS_EXT_MIRROR:
7221 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7223 if self.lu.op.iallocator:
7224 self._RunAllocator()
7226 # We set self.target_node as it is required by
7228 self.target_node = self.lu.op.target_node
7230 # self.target_node is already populated, either directly or by the
7232 target_node = self.target_node
7233 if self.target_node == instance.primary_node:
7234 raise errors.OpPrereqError("Cannot migrate instance %s"
7235 " to its primary (%s)" %
7236 (instance.name, instance.primary_node))
7238 if len(self.lu.tasklets) == 1:
7239 # It is safe to release locks only when we're the only tasklet
7241 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7242 keep=[instance.primary_node, self.target_node])
7245 secondary_nodes = instance.secondary_nodes
7246 if not secondary_nodes:
7247 raise errors.ConfigurationError("No secondary node but using"
7248 " %s disk template" %
7249 instance.disk_template)
7250 target_node = secondary_nodes[0]
7251 if self.lu.op.iallocator or (self.lu.op.target_node and
7252 self.lu.op.target_node != target_node):
7254 text = "failed over"
7257 raise errors.OpPrereqError("Instances with disk template %s cannot"
7258 " be %s to arbitrary nodes"
7259 " (neither an iallocator nor a target"
7260 " node can be passed)" %
7261 (instance.disk_template, text),
7264 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7266 # check memory requirements on the secondary node
7267 if not self.failover or instance.admin_up:
7268 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7269 instance.name, i_be[constants.BE_MEMORY],
7270 instance.hypervisor)
7272 self.lu.LogInfo("Not checking memory on the secondary node as"
7273 " instance will not be started")
7275 # check bridge existence
7276 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7278 if not self.cleanup:
7279 _CheckNodeNotDrained(self.lu, target_node)
7280 if not self.failover:
7281 result = self.rpc.call_instance_migratable(instance.primary_node,
7283 if result.fail_msg and self.fallback:
7284 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7286 self.failover = True
7288 result.Raise("Can't migrate, please use failover",
7289 prereq=True, ecode=errors.ECODE_STATE)
7291 assert not (self.failover and self.cleanup)
7293 if not self.failover:
7294 if self.lu.op.live is not None and self.lu.op.mode is not None:
7295 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7296 " parameters are accepted",
7298 if self.lu.op.live is not None:
7300 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7302 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7303 # reset the 'live' parameter to None so that repeated
7304 # invocations of CheckPrereq do not raise an exception
7305 self.lu.op.live = None
7306 elif self.lu.op.mode is None:
7307 # read the default value from the hypervisor
7308 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7310 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7312 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7314 # Failover is never live
7317 def _RunAllocator(self):
7318 """Run the allocator based on input opcode.
7321 ial = IAllocator(self.cfg, self.rpc,
7322 mode=constants.IALLOCATOR_MODE_RELOC,
7323 name=self.instance_name,
7324 # TODO See why hail breaks with a single node below
7325 relocate_from=[self.instance.primary_node,
7326 self.instance.primary_node],
7329 ial.Run(self.lu.op.iallocator)
7332 raise errors.OpPrereqError("Can't compute nodes using"
7333 " iallocator '%s': %s" %
7334 (self.lu.op.iallocator, ial.info),
7336 if len(ial.result) != ial.required_nodes:
7337 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7338 " of nodes (%s), required %s" %
7339 (self.lu.op.iallocator, len(ial.result),
7340 ial.required_nodes), errors.ECODE_FAULT)
7341 self.target_node = ial.result[0]
7342 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7343 self.instance_name, self.lu.op.iallocator,
7344 utils.CommaJoin(ial.result))
7346 def _WaitUntilSync(self):
7347 """Poll with custom rpc for disk sync.
7349 This uses our own step-based rpc call.
7352 self.feedback_fn("* wait until resync is done")
7356 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7358 self.instance.disks)
7360 for node, nres in result.items():
7361 nres.Raise("Cannot resync disks on node %s" % node)
7362 node_done, node_percent = nres.payload
7363 all_done = all_done and node_done
7364 if node_percent is not None:
7365 min_percent = min(min_percent, node_percent)
7367 if min_percent < 100:
7368 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7371 def _EnsureSecondary(self, node):
7372 """Demote a node to secondary.
7375 self.feedback_fn("* switching node %s to secondary mode" % node)
7377 for dev in self.instance.disks:
7378 self.cfg.SetDiskID(dev, node)
7380 result = self.rpc.call_blockdev_close(node, self.instance.name,
7381 self.instance.disks)
7382 result.Raise("Cannot change disk to secondary on node %s" % node)
7384 def _GoStandalone(self):
7385 """Disconnect from the network.
7388 self.feedback_fn("* changing into standalone mode")
7389 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7390 self.instance.disks)
7391 for node, nres in result.items():
7392 nres.Raise("Cannot disconnect disks on node %s" % node)
7394 def _GoReconnect(self, multimaster):
7395 """Reconnect to the network.
7401 msg = "single-master"
7402 self.feedback_fn("* changing disks into %s mode" % msg)
7403 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7404 self.instance.disks,
7405 self.instance.name, multimaster)
7406 for node, nres in result.items():
7407 nres.Raise("Cannot change disks config on node %s" % node)
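  # Illustrative sequence (see _ExecMigration and _ExecCleanup below): a DRBD
  # migration reconnects the disks in dual-master mode for the transfer and
  # returns them to single-master once the new primary is active.
  #   self._GoStandalone()
  #   self._GoReconnect(True)   # multimaster during the live migration
  #   ...
  #   self._GoReconnect(False)  # back to single-master afterwards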
7409 def _ExecCleanup(self):
7410 """Try to cleanup after a failed migration.
7412 The cleanup is done by:
7413 - check that the instance is running only on one node
7414 (and update the config if needed)
7415 - change disks on its secondary node to secondary
7416 - wait until disks are fully synchronized
7417 - disconnect from the network
7418 - change disks into single-master mode
7419 - wait again until disks are fully synchronized
7422 instance = self.instance
7423 target_node = self.target_node
7424 source_node = self.source_node
7426 # check running on only one node
7427 self.feedback_fn("* checking where the instance actually runs"
7428 " (if this hangs, the hypervisor might be in"
7430 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7431 for node, result in ins_l.items():
7432 result.Raise("Can't contact node %s" % node)
7434 runningon_source = instance.name in ins_l[source_node].payload
7435 runningon_target = instance.name in ins_l[target_node].payload
7437 if runningon_source and runningon_target:
7438 raise errors.OpExecError("Instance seems to be running on two nodes,"
7439 " or the hypervisor is confused; you will have"
7440 " to ensure manually that it runs only on one"
7441 " and restart this operation")
7443 if not (runningon_source or runningon_target):
7444 raise errors.OpExecError("Instance does not seem to be running at all;"
7445 " in this case it's safer to repair by"
7446 " running 'gnt-instance stop' to ensure disk"
7447 " shutdown, and then restarting it")
7449 if runningon_target:
7450 # the migration has actually succeeded, we need to update the config
7451 self.feedback_fn("* instance running on secondary node (%s),"
7452 " updating config" % target_node)
7453 instance.primary_node = target_node
7454 self.cfg.Update(instance, self.feedback_fn)
7455 demoted_node = source_node
7457 self.feedback_fn("* instance confirmed to be running on its"
7458 " primary node (%s)" % source_node)
7459 demoted_node = target_node
7461 if instance.disk_template in constants.DTS_INT_MIRROR:
7462 self._EnsureSecondary(demoted_node)
7464 self._WaitUntilSync()
7465 except errors.OpExecError:
7466 # we ignore here errors, since if the device is standalone, it
7467 # won't be able to sync
7469 self._GoStandalone()
7470 self._GoReconnect(False)
7471 self._WaitUntilSync()
7473 self.feedback_fn("* done")
7475 def _RevertDiskStatus(self):
7476 """Try to revert the disk status after a failed migration.
7479 target_node = self.target_node
7480 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7484 self._EnsureSecondary(target_node)
7485 self._GoStandalone()
7486 self._GoReconnect(False)
7487 self._WaitUntilSync()
7488 except errors.OpExecError, err:
7489 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7490 " please try to recover the instance manually;"
7491 " error '%s'" % str(err))
7493 def _AbortMigration(self):
7494 """Call the hypervisor code to abort a started migration.
7497 instance = self.instance
7498 target_node = self.target_node
7499 source_node = self.source_node
7500 migration_info = self.migration_info
7502 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
7506 abort_msg = abort_result.fail_msg
7508 logging.error("Aborting migration failed on target node %s: %s",
7509 target_node, abort_msg)
7510 # Don't raise an exception here, as we still have to try to revert the
7511 # disk status, even if this step failed.
7513 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
7514 instance, False, self.live)
7515 abort_msg = abort_result.fail_msg
7517 logging.error("Aborting migration failed on source node %s: %s",
7518 source_node, abort_msg)
7520 def _ExecMigration(self):
7521 """Migrate an instance.
7523 The migration is done by:
7524 - change the disks into dual-master mode
7525 - wait until disks are fully synchronized again
7526 - migrate the instance
7527 - change disks on the new secondary node (the old primary) to secondary
7528 - wait until disks are fully synchronized
7529 - change disks into single-master mode
7532 instance = self.instance
7533 target_node = self.target_node
7534 source_node = self.source_node
7536 # Check for hypervisor version mismatch and warn the user.
7537 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7538 None, self.instance.hypervisor)
7539 src_info = nodeinfo[source_node]
7540 dst_info = nodeinfo[target_node]
7542 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7543 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7544 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7545 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7546 if src_version != dst_version:
7547 self.feedback_fn("* warning: hypervisor version mismatch between"
7548 " source (%s) and target (%s) node" %
7549 (src_version, dst_version))
7551 self.feedback_fn("* checking disk consistency between source and target")
7552 for dev in instance.disks:
7553 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7554 raise errors.OpExecError("Disk %s is degraded or not fully"
7555 " synchronized on target node,"
7556 " aborting migration" % dev.iv_name)
7558 # First get the migration information from the remote node
7559 result = self.rpc.call_migration_info(source_node, instance)
7560 msg = result.fail_msg
7562 log_err = ("Failed fetching source migration information from %s: %s" %
7564 logging.error(log_err)
7565 raise errors.OpExecError(log_err)
7567 self.migration_info = migration_info = result.payload
7569 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7570 # Then switch the disks to master/master mode
7571 self._EnsureSecondary(target_node)
7572 self._GoStandalone()
7573 self._GoReconnect(True)
7574 self._WaitUntilSync()
7576 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7577 result = self.rpc.call_accept_instance(target_node,
7580 self.nodes_ip[target_node])
7582 msg = result.fail_msg
7584 logging.error("Instance pre-migration failed, trying to revert"
7585 " disk status: %s", msg)
7586 self.feedback_fn("Pre-migration failed, aborting")
7587 self._AbortMigration()
7588 self._RevertDiskStatus()
7589 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7590 (instance.name, msg))
7592 self.feedback_fn("* migrating instance to %s" % target_node)
7593 result = self.rpc.call_instance_migrate(source_node, instance,
7594 self.nodes_ip[target_node],
7596 msg = result.fail_msg
7598 logging.error("Instance migration failed, trying to revert"
7599 " disk status: %s", msg)
7600 self.feedback_fn("Migration failed, aborting")
7601 self._AbortMigration()
7602 self._RevertDiskStatus()
7603 raise errors.OpExecError("Could not migrate instance %s: %s" %
7604 (instance.name, msg))
7606 self.feedback_fn("* starting memory transfer")
7607 last_feedback = time.time()
7609 result = self.rpc.call_instance_get_migration_status(source_node,
7611 msg = result.fail_msg
7612 ms = result.payload # MigrationStatus instance
7613 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
7614 logging.error("Instance migration failed, trying to revert"
7615 " disk status: %s", msg)
7616 self.feedback_fn("Migration failed, aborting")
7617 self._AbortMigration()
7618 self._RevertDiskStatus()
7619 raise errors.OpExecError("Could not migrate instance %s: %s" %
7620 (instance.name, msg))
7622 if result.payload.status != constants.HV_MIGRATION_ACTIVE:
7623 self.feedback_fn("* memory transfer complete")
7626 if (utils.TimeoutExpired(last_feedback,
7627 self._MIGRATION_FEEDBACK_INTERVAL) and
7628 ms.transferred_ram is not None):
7629 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
7630 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
7631 last_feedback = time.time()
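# Illustrative arithmetic (hypothetical values): with ms.transferred_ram of
# 1536 MiB out of ms.total_ram of 2048 MiB, the feedback above would read
# "* memory transfer progress: 75.00 %" (100 * 1536 / 2048).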
7633 time.sleep(self._MIGRATION_POLL_INTERVAL)
7635 result = self.rpc.call_instance_finalize_migration_src(source_node,
7639 msg = result.fail_msg
7641 logging.error("Instance migration succeeded, but finalization failed"
7642 " on the source node: %s", msg)
7643 raise errors.OpExecError("Could not finalize instance migration: %s" %
7646 instance.primary_node = target_node
7648 # distribute new instance config to the other nodes
7649 self.cfg.Update(instance, self.feedback_fn)
7651 result = self.rpc.call_instance_finalize_migration_dst(target_node,
7655 msg = result.fail_msg
7657 logging.error("Instance migration succeeded, but finalization failed"
7658 " on the target node: %s", msg)
7659 raise errors.OpExecError("Could not finalize instance migration: %s" %
7662 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7663 self._EnsureSecondary(source_node)
7664 self._WaitUntilSync()
7665 self._GoStandalone()
7666 self._GoReconnect(False)
7667 self._WaitUntilSync()
7669 self.feedback_fn("* done")
7671 def _ExecFailover(self):
7672 """Failover an instance.
7674 The failover is done by shutting it down on its present node and
7675 starting it on the secondary.
7678 instance = self.instance
7679 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7681 source_node = instance.primary_node
7682 target_node = self.target_node
7684 if instance.admin_up:
7685 self.feedback_fn("* checking disk consistency between source and target")
7686 for dev in instance.disks:
7687 # for drbd, these are drbd over lvm
7688 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7689 if primary_node.offline:
7690 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7692 (primary_node.name, dev.iv_name, target_node))
7693 elif not self.ignore_consistency:
7694 raise errors.OpExecError("Disk %s is degraded on target node,"
7695 " aborting failover" % dev.iv_name)
7697 self.feedback_fn("* not checking disk consistency as instance is not"
7700 self.feedback_fn("* shutting down instance on source node")
7701 logging.info("Shutting down instance %s on node %s",
7702 instance.name, source_node)
7704 result = self.rpc.call_instance_shutdown(source_node, instance,
7705 self.shutdown_timeout)
7706 msg = result.fail_msg
7708 if self.ignore_consistency or primary_node.offline:
7709 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7710 " proceeding anyway; please make sure node"
7711 " %s is down; error details: %s",
7712 instance.name, source_node, source_node, msg)
7714 raise errors.OpExecError("Could not shutdown instance %s on"
7716 (instance.name, source_node, msg))
7718 self.feedback_fn("* deactivating the instance's disks on source node")
7719 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7720 raise errors.OpExecError("Can't shut down the instance's disks")
7722 instance.primary_node = target_node
7723 # distribute new instance config to the other nodes
7724 self.cfg.Update(instance, self.feedback_fn)
7726 # Only start the instance if it's marked as up
7727 if instance.admin_up:
7728 self.feedback_fn("* activating the instance's disks on target node %s" %
7730 logging.info("Starting instance %s on node %s",
7731 instance.name, target_node)
7733 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7734 ignore_secondaries=True)
7736 _ShutdownInstanceDisks(self.lu, instance)
7737 raise errors.OpExecError("Can't activate the instance's disks")
7739 self.feedback_fn("* starting the instance on the target node %s" %
7741 result = self.rpc.call_instance_start(target_node, (instance, None, None),
7743 msg = result.fail_msg
7745 _ShutdownInstanceDisks(self.lu, instance)
7746 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7747 (instance.name, target_node, msg))
7749 def Exec(self, feedback_fn):
7750 """Perform the migration.
7753 self.feedback_fn = feedback_fn
7754 self.source_node = self.instance.primary_node
7756 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7757 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7758 self.target_node = self.instance.secondary_nodes[0]
7759 # Otherwise self.target_node has been populated either
7760 # directly, or through an iallocator.
7762 self.all_nodes = [self.source_node, self.target_node]
7763 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7764 in self.cfg.GetMultiNodeInfo(self.all_nodes))
7767 feedback_fn("Failover instance %s" % self.instance.name)
7768 self._ExecFailover()
7770 feedback_fn("Migrating instance %s" % self.instance.name)
7773 return self._ExecCleanup()
7775 return self._ExecMigration()
7778 def _CreateBlockDev(lu, node, instance, device, force_create,
7780 """Create a tree of block devices on a given node.
7782 If this device type has to be created on secondaries, create it and all its children.
7785 If not, just recurse to children keeping the same 'force' value.
7787 @param lu: the lu on whose behalf we execute
7788 @param node: the node on which to create the device
7789 @type instance: L{objects.Instance}
7790 @param instance: the instance which owns the device
7791 @type device: L{objects.Disk}
7792 @param device: the device to create
7793 @type force_create: boolean
7794 @param force_create: whether to force creation of this device; this
7795 will be changed to True whenever we find a device which has the
7796 CreateOnSecondary() attribute set
7797 @param info: the extra 'metadata' we should attach to the device
7798 (this will be represented as an LVM tag)
7799 @type force_open: boolean
7800 @param force_open: this parameter will be passed to the
7801 L{backend.BlockdevCreate} function where it specifies
7802 whether we run on primary or not, and it affects both
7803 the child assembly and the device's own Open() execution
7806 if device.CreateOnSecondary():
7810 for child in device.children:
7811 _CreateBlockDev(lu, node, instance, child, force_create,
7814 if not force_create:
7817 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
7820 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7821 """Create a single block device on a given node.
7823 This will not recurse over children of the device, so they must be
7826 @param lu: the lu on whose behalf we execute
7827 @param node: the node on which to create the device
7828 @type instance: L{objects.Instance}
7829 @param instance: the instance which owns the device
7830 @type device: L{objects.Disk}
7831 @param device: the device to create
7832 @param info: the extra 'metadata' we should attach to the device
7833 (this will be represented as an LVM tag)
7834 @type force_open: boolean
7835 @param force_open: this parameter will be passed to the
7836 L{backend.BlockdevCreate} function where it specifies
7837 whether we run on primary or not, and it affects both
7838 the child assembly and the device's own Open() execution
7841 lu.cfg.SetDiskID(device, node)
7842 result = lu.rpc.call_blockdev_create(node, device, device.size,
7843 instance.name, force_open, info)
7844 result.Raise("Can't create block device %s on"
7845 " node %s for instance %s" % (device, node, instance.name))
7846 if device.physical_id is None:
7847 device.physical_id = result.payload
7850 def _GenerateUniqueNames(lu, exts):
7851 """Generate a suitable LV name.
7853 This will generate a logical volume name for the given instance.
7858 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7859 results.append("%s%s" % (new_id, val))
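# Illustrative result (hypothetical IDs): _GenerateUniqueNames(lu, [".disk0",
# ".disk1"]) could return ["<uuid-a>.disk0", "<uuid-b>.disk1"], i.e. a freshly
# generated unique ID prefixed to each of the given extensions.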
7863 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7864 iv_name, p_minor, s_minor):
7865 """Generate a drbd8 device complete with its children.
7868 assert len(vgnames) == len(names) == 2
7869 port = lu.cfg.AllocatePort()
7870 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7871 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7872 logical_id=(vgnames[0], names[0]))
7873 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
7874 logical_id=(vgnames[1], names[1]))
7875 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7876 logical_id=(primary, secondary, port,
7879 children=[dev_data, dev_meta],
7884 def _GenerateDiskTemplate(lu, template_name,
7885 instance_name, primary_node,
7886 secondary_nodes, disk_info,
7887 file_storage_dir, file_driver,
7888 base_index, feedback_fn):
7889 """Generate the entire disk layout for a given template type.
7892 #TODO: compute space requirements
7894 vgname = lu.cfg.GetVGName()
7895 disk_count = len(disk_info)
7897 if template_name == constants.DT_DISKLESS:
7899 elif template_name == constants.DT_PLAIN:
7900 if len(secondary_nodes) != 0:
7901 raise errors.ProgrammerError("Wrong template configuration")
7903 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7904 for i in range(disk_count)])
7905 for idx, disk in enumerate(disk_info):
7906 disk_index = idx + base_index
7907 vg = disk.get(constants.IDISK_VG, vgname)
7908 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7909 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7910 size=disk[constants.IDISK_SIZE],
7911 logical_id=(vg, names[idx]),
7912 iv_name="disk/%d" % disk_index,
7913 mode=disk[constants.IDISK_MODE])
7914 disks.append(disk_dev)
7915 elif template_name == constants.DT_DRBD8:
7916 if len(secondary_nodes) != 1:
7917 raise errors.ProgrammerError("Wrong template configuration")
7918 remote_node = secondary_nodes[0]
7919 minors = lu.cfg.AllocateDRBDMinor(
7920 [primary_node, remote_node] * len(disk_info), instance_name)
7923 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7924 for i in range(disk_count)]):
7925 names.append(lv_prefix + "_data")
7926 names.append(lv_prefix + "_meta")
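# e.g. (hypothetical prefix): a generated prefix "<uuid>.disk0" yields the two
# LV names "<uuid>.disk0_data" and "<uuid>.disk0_meta" consumed below.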
7927 for idx, disk in enumerate(disk_info):
7928 disk_index = idx + base_index
7929 data_vg = disk.get(constants.IDISK_VG, vgname)
7930 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7931 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7932 disk[constants.IDISK_SIZE],
7934 names[idx * 2:idx * 2 + 2],
7935 "disk/%d" % disk_index,
7936 minors[idx * 2], minors[idx * 2 + 1])
7937 disk_dev.mode = disk[constants.IDISK_MODE]
7938 disks.append(disk_dev)
7939 elif template_name == constants.DT_FILE:
7940 if len(secondary_nodes) != 0:
7941 raise errors.ProgrammerError("Wrong template configuration")
7943 opcodes.RequireFileStorage()
7945 for idx, disk in enumerate(disk_info):
7946 disk_index = idx + base_index
7947 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7948 size=disk[constants.IDISK_SIZE],
7949 iv_name="disk/%d" % disk_index,
7950 logical_id=(file_driver,
7951 "%s/disk%d" % (file_storage_dir,
7953 mode=disk[constants.IDISK_MODE])
7954 disks.append(disk_dev)
7955 elif template_name == constants.DT_SHARED_FILE:
7956 if len(secondary_nodes) != 0:
7957 raise errors.ProgrammerError("Wrong template configuration")
7959 opcodes.RequireSharedFileStorage()
7961 for idx, disk in enumerate(disk_info):
7962 disk_index = idx + base_index
7963 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7964 size=disk[constants.IDISK_SIZE],
7965 iv_name="disk/%d" % disk_index,
7966 logical_id=(file_driver,
7967 "%s/disk%d" % (file_storage_dir,
7969 mode=disk[constants.IDISK_MODE])
7970 disks.append(disk_dev)
7971 elif template_name == constants.DT_BLOCK:
7972 if len(secondary_nodes) != 0:
7973 raise errors.ProgrammerError("Wrong template configuration")
7975 for idx, disk in enumerate(disk_info):
7976 disk_index = idx + base_index
7977 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7978 size=disk[constants.IDISK_SIZE],
7979 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7980 disk[constants.IDISK_ADOPT]),
7981 iv_name="disk/%d" % disk_index,
7982 mode=disk[constants.IDISK_MODE])
7983 disks.append(disk_dev)
7986 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7990 def _GetInstanceInfoText(instance):
7991 """Compute that text that should be added to the disk's metadata.
7994 return "originstname+%s" % instance.name
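# e.g. for an instance named "web1.example.com" (hypothetical), the text
# attached to its disks (as an LVM tag) is "originstname+web1.example.com".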
7997 def _CalcEta(time_taken, written, total_size):
7998 """Calculates the ETA based on size written and total size.
8000 @param time_taken: The time taken so far
8001 @param written: amount written so far
8002 @param total_size: The total size of data to be written
8003 @return: The remaining time in seconds
8006 avg_time = time_taken / float(written)
8007 return (total_size - written) * avg_time
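# Worked example (hypothetical numbers): _CalcEta(30, 512, 2048) computes
# avg_time = 30 / 512.0 and returns (2048 - 512) * (30 / 512.0) == 90.0,
# i.e. roughly 90 seconds left after writing 512 of 2048 units in 30 seconds.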
8010 def _WipeDisks(lu, instance):
8011 """Wipes instance disks.
8013 @type lu: L{LogicalUnit}
8014 @param lu: the logical unit on whose behalf we execute
8015 @type instance: L{objects.Instance}
8016 @param instance: the instance whose disks we should wipe
8017 @return: the success of the wipe
8020 node = instance.primary_node
8022 for device in instance.disks:
8023 lu.cfg.SetDiskID(device, node)
8025 logging.info("Pause sync of instance %s disks", instance.name)
8026 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
8028 for idx, success in enumerate(result.payload):
8030 logging.warn("pause-sync of instance %s for disk %d failed",
8034 for idx, device in enumerate(instance.disks):
8035 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk size,
8036 # but at most MAX_WIPE_CHUNK
8037 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
8038 constants.MIN_WIPE_CHUNK_PERCENT)
8039 # we _must_ make this an int, otherwise rounding errors will
8041 wipe_chunk_size = int(wipe_chunk_size)
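# Illustrative sizing (assuming, e.g., MIN_WIPE_CHUNK_PERCENT = 10 and
# MAX_WIPE_CHUNK = 1024 MiB): a 20480 MiB disk yields 10% == 2048 MiB, capped
# to 1024 MiB, while a 5120 MiB disk yields 512 MiB, which is used as-is.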
8043 lu.LogInfo("* Wiping disk %d", idx)
8044 logging.info("Wiping disk %d for instance %s, node %s using"
8045 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
8050 start_time = time.time()
8052 while offset < size:
8053 wipe_size = min(wipe_chunk_size, size - offset)
8054 logging.debug("Wiping disk %d, offset %s, chunk %s",
8055 idx, offset, wipe_size)
8056 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
8057 result.Raise("Could not wipe disk %d at offset %d for size %d" %
8058 (idx, offset, wipe_size))
8061 if now - last_output >= 60:
8062 eta = _CalcEta(now - start_time, offset, size)
8063 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8064 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8067 logging.info("Resume sync of instance %s disks", instance.name)
8069 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8071 for idx, success in enumerate(result.payload):
8073 lu.LogWarning("Resume sync of disk %d failed, please have a"
8074 " look at the status and troubleshoot the issue", idx)
8075 logging.warn("resume-sync of instance %s for disk %d failed",
8079 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8080 """Create all disks for an instance.
8082 This abstracts away some work from AddInstance.
8084 @type lu: L{LogicalUnit}
8085 @param lu: the logical unit on whose behalf we execute
8086 @type instance: L{objects.Instance}
8087 @param instance: the instance whose disks we should create
8089 @param to_skip: list of indices to skip
8090 @type target_node: string
8091 @param target_node: if passed, overrides the target node for creation
8093 @return: the success of the creation
8096 info = _GetInstanceInfoText(instance)
8097 if target_node is None:
8098 pnode = instance.primary_node
8099 all_nodes = instance.all_nodes
8104 if instance.disk_template in constants.DTS_FILEBASED:
8105 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8106 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8108 result.Raise("Failed to create directory '%s' on"
8109 " node %s" % (file_storage_dir, pnode))
8111 # Note: this needs to be kept in sync with adding of disks in
8112 # LUInstanceSetParams
8113 for idx, device in enumerate(instance.disks):
8114 if to_skip and idx in to_skip:
8116 logging.info("Creating volume %s for instance %s",
8117 device.iv_name, instance.name)
8119 for node in all_nodes:
8120 f_create = node == pnode
8121 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
8124 def _RemoveDisks(lu, instance, target_node=None):
8125 """Remove all disks for an instance.
8127 This abstracts away some work from `AddInstance()` and
8128 `RemoveInstance()`. Note that in case some of the devices couldn't
8129 be removed, the removal will continue with the other ones (compare
8130 with `_CreateDisks()`).
8132 @type lu: L{LogicalUnit}
8133 @param lu: the logical unit on whose behalf we execute
8134 @type instance: L{objects.Instance}
8135 @param instance: the instance whose disks we should remove
8136 @type target_node: string
8137 @param target_node: used to override the node on which to remove the disks
8139 @return: the success of the removal
8142 logging.info("Removing block devices for instance %s", instance.name)
8145 for device in instance.disks:
8147 edata = [(target_node, device)]
8149 edata = device.ComputeNodeTree(instance.primary_node)
8150 for node, disk in edata:
8151 lu.cfg.SetDiskID(disk, node)
8152 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8154 lu.LogWarning("Could not remove block device %s on node %s,"
8155 " continuing anyway: %s", device.iv_name, node, msg)
8158 if instance.disk_template == constants.DT_FILE:
8159 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8163 tgt = instance.primary_node
8164 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8166 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8167 file_storage_dir, instance.primary_node, result.fail_msg)
8173 def _ComputeDiskSizePerVG(disk_template, disks):
8174 """Compute disk size requirements in the volume group
8177 def _compute(disks, payload):
8178 """Universal algorithm.
8183 vgs[disk[constants.IDISK_VG]] = \
8184 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8188 # Required free disk space as a function of disk and swap space
8190 constants.DT_DISKLESS: {},
8191 constants.DT_PLAIN: _compute(disks, 0),
8192 # 128 MB are added for drbd metadata for each disk
8193 constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
8194 constants.DT_FILE: {},
8195 constants.DT_SHARED_FILE: {},
8198 if disk_template not in req_size_dict:
8199 raise errors.ProgrammerError("Disk template '%s' size requirement"
8200 " is unknown" % disk_template)
8202 return req_size_dict[disk_template]
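# Illustrative result (hypothetical disks): for DT_DRBD8 with one 1024 MiB disk
# on VG "xenvg" and one 2048 MiB disk on VG "fastvg", _ComputeDiskSizePerVG
# returns {"xenvg": 1024 + DRBD_META_SIZE, "fastvg": 2048 + DRBD_META_SIZE}.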
8205 def _ComputeDiskSize(disk_template, disks):
8206 """Compute disk size requirements in the volume group
8209 # Required free disk space as a function of disk and swap space
8211 constants.DT_DISKLESS: None,
8212 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8213 # 128 MB are added for drbd metadata for each disk
8215 sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
8216 constants.DT_FILE: None,
8217 constants.DT_SHARED_FILE: 0,
8218 constants.DT_BLOCK: 0,
8221 if disk_template not in req_size_dict:
8222 raise errors.ProgrammerError("Disk template '%s' size requirement"
8223 " is unknown" % disk_template)
8225 return req_size_dict[disk_template]
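# Illustrative result (same hypothetical disks as above): for DT_DRBD8 the
# aggregate requirement is (1024 + DRBD_META_SIZE) + (2048 + DRBD_META_SIZE),
# whereas DT_PLAIN would simply need 1024 + 2048 MiB.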
8228 def _FilterVmNodes(lu, nodenames):
8229 """Filters out non-vm_capable nodes from a list.
8231 @type lu: L{LogicalUnit}
8232 @param lu: the logical unit for which we check
8233 @type nodenames: list
8234 @param nodenames: the list of nodes on which we should check
8236 @return: the list of vm-capable nodes
8239 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8240 return [name for name in nodenames if name not in vm_nodes]
8243 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8244 """Hypervisor parameter validation.
8246 This function abstracts the hypervisor parameter validation to be
8247 used in both instance create and instance modify.
8249 @type lu: L{LogicalUnit}
8250 @param lu: the logical unit for which we check
8251 @type nodenames: list
8252 @param nodenames: the list of nodes on which we should check
8253 @type hvname: string
8254 @param hvname: the name of the hypervisor we should use
8255 @type hvparams: dict
8256 @param hvparams: the parameters which we need to check
8257 @raise errors.OpPrereqError: if the parameters are not valid
8260 nodenames = _FilterVmNodes(lu, nodenames)
8262 cluster = lu.cfg.GetClusterInfo()
8263 hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
8265 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
8266 for node in nodenames:
8270 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8273 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8274 """OS parameters validation.
8276 @type lu: L{LogicalUnit}
8277 @param lu: the logical unit for which we check
8278 @type required: boolean
8279 @param required: whether the validation should fail if the OS is not
8281 @type nodenames: list
8282 @param nodenames: the list of nodes on which we should check
8283 @type osname: string
8284 @param osname: the name of the OS we should use
8285 @type osparams: dict
8286 @param osparams: the parameters which we need to check
8287 @raise errors.OpPrereqError: if the parameters are not valid
8290 nodenames = _FilterVmNodes(lu, nodenames)
8291 result = lu.rpc.call_os_validate(nodenames, required, osname,
8292 [constants.OS_VALIDATE_PARAMETERS],
8294 for node, nres in result.items():
8295 # we don't check for offline cases since this should be run only
8296 # against the master node and/or an instance's nodes
8297 nres.Raise("OS Parameters validation failed on node %s" % node)
8298 if not nres.payload:
8299 lu.LogInfo("OS %s not found on node %s, validation skipped",
8303 class LUInstanceCreate(LogicalUnit):
8304 """Create an instance.
8307 HPATH = "instance-add"
8308 HTYPE = constants.HTYPE_INSTANCE
8311 def CheckArguments(self):
8315 # do not require name_check to ease forward/backward compatibility
8317 if self.op.no_install and self.op.start:
8318 self.LogInfo("No-installation mode selected, disabling startup")
8319 self.op.start = False
8320 # validate/normalize the instance name
8321 self.op.instance_name = \
8322 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8324 if self.op.ip_check and not self.op.name_check:
8325 # TODO: make the ip check more flexible and not depend on the name check
8326 raise errors.OpPrereqError("Cannot do IP address check without a name"
8327 " check", errors.ECODE_INVAL)
8329 # check nics' parameter names
8330 for nic in self.op.nics:
8331 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8333 # check disks' parameter names and consistent adopt/no-adopt strategy
8334 has_adopt = has_no_adopt = False
8335 for disk in self.op.disks:
8336 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8337 if constants.IDISK_ADOPT in disk:
8341 if has_adopt and has_no_adopt:
8342 raise errors.OpPrereqError("Either all disks are adopted or none is",
8345 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8346 raise errors.OpPrereqError("Disk adoption is not supported for the"
8347 " '%s' disk template" %
8348 self.op.disk_template,
8350 if self.op.iallocator is not None:
8351 raise errors.OpPrereqError("Disk adoption not allowed with an"
8352 " iallocator script", errors.ECODE_INVAL)
8353 if self.op.mode == constants.INSTANCE_IMPORT:
8354 raise errors.OpPrereqError("Disk adoption not allowed for"
8355 " instance import", errors.ECODE_INVAL)
8357 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8358 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8359 " but no 'adopt' parameter given" %
8360 self.op.disk_template,
8363 self.adopt_disks = has_adopt
8365 # instance name verification
8366 if self.op.name_check:
8367 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8368 self.op.instance_name = self.hostname1.name
8369 # used in CheckPrereq for ip ping check
8370 self.check_ip = self.hostname1.ip
8372 self.check_ip = None
8374 # file storage checks
8375 if (self.op.file_driver and
8376 not self.op.file_driver in constants.FILE_DRIVER):
8377 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8378 self.op.file_driver, errors.ECODE_INVAL)
8380 if self.op.disk_template == constants.DT_FILE:
8381 opcodes.RequireFileStorage()
8382 elif self.op.disk_template == constants.DT_SHARED_FILE:
8383 opcodes.RequireSharedFileStorage()
8385 ### Node/iallocator related checks
8386 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8388 if self.op.pnode is not None:
8389 if self.op.disk_template in constants.DTS_INT_MIRROR:
8390 if self.op.snode is None:
8391 raise errors.OpPrereqError("The networked disk templates need"
8392 " a mirror node", errors.ECODE_INVAL)
8394 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8396 self.op.snode = None
8398 self._cds = _GetClusterDomainSecret()
8400 if self.op.mode == constants.INSTANCE_IMPORT:
8401 # On import force_variant must be True, because if we forced it at
8402 # initial install, our only chance when importing it back is that it
8404 self.op.force_variant = True
8406 if self.op.no_install:
8407 self.LogInfo("No-installation mode has no effect during import")
8409 elif self.op.mode == constants.INSTANCE_CREATE:
8410 if self.op.os_type is None:
8411 raise errors.OpPrereqError("No guest OS specified",
8413 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8414 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8415 " installation" % self.op.os_type,
8417 if self.op.disk_template is None:
8418 raise errors.OpPrereqError("No disk template specified",
8421 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8422 # Check handshake to ensure both clusters have the same domain secret
8423 src_handshake = self.op.source_handshake
8424 if not src_handshake:
8425 raise errors.OpPrereqError("Missing source handshake",
8428 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8431 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8434 # Load and check source CA
8435 self.source_x509_ca_pem = self.op.source_x509_ca
8436 if not self.source_x509_ca_pem:
8437 raise errors.OpPrereqError("Missing source X509 CA",
8441 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8443 except OpenSSL.crypto.Error, err:
8444 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8445 (err, ), errors.ECODE_INVAL)
8447 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8448 if errcode is not None:
8449 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8452 self.source_x509_ca = cert
8454 src_instance_name = self.op.source_instance_name
8455 if not src_instance_name:
8456 raise errors.OpPrereqError("Missing source instance name",
8459 self.source_instance_name = \
8460 netutils.GetHostname(name=src_instance_name).name
8463 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8464 self.op.mode, errors.ECODE_INVAL)
8466 def ExpandNames(self):
8467 """ExpandNames for CreateInstance.
8469 Figure out the right locks for instance creation.
8472 self.needed_locks = {}
8474 instance_name = self.op.instance_name
8475 # this is just a preventive check, but someone might still add this
8476 # instance in the meantime, and creation will fail at lock-add time
8477 if instance_name in self.cfg.GetInstanceList():
8478 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8479 instance_name, errors.ECODE_EXISTS)
8481 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8483 if self.op.iallocator:
8484 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8486 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8487 nodelist = [self.op.pnode]
8488 if self.op.snode is not None:
8489 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8490 nodelist.append(self.op.snode)
8491 self.needed_locks[locking.LEVEL_NODE] = nodelist
8493 # in case of import lock the source node too
8494 if self.op.mode == constants.INSTANCE_IMPORT:
8495 src_node = self.op.src_node
8496 src_path = self.op.src_path
8498 if src_path is None:
8499 self.op.src_path = src_path = self.op.instance_name
8501 if src_node is None:
8502 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8503 self.op.src_node = None
8504 if os.path.isabs(src_path):
8505 raise errors.OpPrereqError("Importing an instance from a path"
8506 " requires a source node option",
8509 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8510 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8511 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8512 if not os.path.isabs(src_path):
8513 self.op.src_path = src_path = \
8514 utils.PathJoin(constants.EXPORT_DIR, src_path)
8516 def _RunAllocator(self):
8517 """Run the allocator based on input opcode.
8520 nics = [n.ToDict() for n in self.nics]
8521 ial = IAllocator(self.cfg, self.rpc,
8522 mode=constants.IALLOCATOR_MODE_ALLOC,
8523 name=self.op.instance_name,
8524 disk_template=self.op.disk_template,
8527 vcpus=self.be_full[constants.BE_VCPUS],
8528 memory=self.be_full[constants.BE_MEMORY],
8531 hypervisor=self.op.hypervisor,
8534 ial.Run(self.op.iallocator)
8537 raise errors.OpPrereqError("Can't compute nodes using"
8538 " iallocator '%s': %s" %
8539 (self.op.iallocator, ial.info),
8541 if len(ial.result) != ial.required_nodes:
8542 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8543 " of nodes (%s), required %s" %
8544 (self.op.iallocator, len(ial.result),
8545 ial.required_nodes), errors.ECODE_FAULT)
8546 self.op.pnode = ial.result[0]
8547 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8548 self.op.instance_name, self.op.iallocator,
8549 utils.CommaJoin(ial.result))
8550 if ial.required_nodes == 2:
8551 self.op.snode = ial.result[1]
8553 def BuildHooksEnv(self):
8556 This runs on master, primary and secondary nodes of the instance.
8560 "ADD_MODE": self.op.mode,
8562 if self.op.mode == constants.INSTANCE_IMPORT:
8563 env["SRC_NODE"] = self.op.src_node
8564 env["SRC_PATH"] = self.op.src_path
8565 env["SRC_IMAGES"] = self.src_images
8567 env.update(_BuildInstanceHookEnv(
8568 name=self.op.instance_name,
8569 primary_node=self.op.pnode,
8570 secondary_nodes=self.secondaries,
8571 status=self.op.start,
8572 os_type=self.op.os_type,
8573 memory=self.be_full[constants.BE_MEMORY],
8574 vcpus=self.be_full[constants.BE_VCPUS],
8575 nics=_NICListToTuple(self, self.nics),
8576 disk_template=self.op.disk_template,
8577 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8578 for d in self.disks],
8581 hypervisor_name=self.op.hypervisor,
8587 def BuildHooksNodes(self):
8588 """Build hooks nodes.
8591 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8594 def _ReadExportInfo(self):
8595 """Reads the export information from disk.
8597 It will override the opcode source node and path with the actual
8598 information, if these two were not specified before.
8600 @return: the export information
8603 assert self.op.mode == constants.INSTANCE_IMPORT
8605 src_node = self.op.src_node
8606 src_path = self.op.src_path
8608 if src_node is None:
8609 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8610 exp_list = self.rpc.call_export_list(locked_nodes)
8612 for node in exp_list:
8613 if exp_list[node].fail_msg:
8615 if src_path in exp_list[node].payload:
8617 self.op.src_node = src_node = node
8618 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8622 raise errors.OpPrereqError("No export found for relative path %s" %
8623 src_path, errors.ECODE_INVAL)
8625 _CheckNodeOnline(self, src_node)
8626 result = self.rpc.call_export_info(src_node, src_path)
8627 result.Raise("No export or invalid export found in dir %s" % src_path)
8629 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8630 if not export_info.has_section(constants.INISECT_EXP):
8631 raise errors.ProgrammerError("Corrupted export config",
8632 errors.ECODE_ENVIRON)
8634 ei_version = export_info.get(constants.INISECT_EXP, "version")
8635 if (int(ei_version) != constants.EXPORT_VERSION):
8636 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8637 (ei_version, constants.EXPORT_VERSION),
8638 errors.ECODE_ENVIRON)
8641 def _ReadExportParams(self, einfo):
8642 """Use export parameters as defaults.
8644 In case the opcode doesn't specify (as in override) some instance
8645 parameters, then try to use them from the export information, if
8649 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8651 if self.op.disk_template is None:
8652 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8653 self.op.disk_template = einfo.get(constants.INISECT_INS,
8655 if self.op.disk_template not in constants.DISK_TEMPLATES:
8656 raise errors.OpPrereqError("Disk template specified in configuration"
8657 " file is not one of the allowed values:"
8658 " %s" % " ".join(constants.DISK_TEMPLATES))
8660 raise errors.OpPrereqError("No disk template specified and the export"
8661 " is missing the disk_template information",
8664 if not self.op.disks:
8666 # TODO: import the disk iv_name too
8667 for idx in range(constants.MAX_DISKS):
8668 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
8669 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8670 disks.append({constants.IDISK_SIZE: disk_sz})
8671 self.op.disks = disks
8672 if not disks and self.op.disk_template != constants.DT_DISKLESS:
8673 raise errors.OpPrereqError("No disk info specified and the export"
8674 " is missing the disk information",
8677 if not self.op.nics:
8679 for idx in range(constants.MAX_NICS):
8680 if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
8682 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8683 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8690 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8691 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8693 if (self.op.hypervisor is None and
8694 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8695 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8697 if einfo.has_section(constants.INISECT_HYP):
8698 # use the export parameters but do not override the ones
8699 # specified by the user
8700 for name, value in einfo.items(constants.INISECT_HYP):
8701 if name not in self.op.hvparams:
8702 self.op.hvparams[name] = value
8704 if einfo.has_section(constants.INISECT_BEP):
8705 # use the parameters, without overriding
8706 for name, value in einfo.items(constants.INISECT_BEP):
8707 if name not in self.op.beparams:
8708 self.op.beparams[name] = value
8710 # try to read the parameters old style, from the main section
8711 for name in constants.BES_PARAMETERS:
8712 if (name not in self.op.beparams and
8713 einfo.has_option(constants.INISECT_INS, name)):
8714 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8716 if einfo.has_section(constants.INISECT_OSP):
8717 # use the parameters, without overriding
8718 for name, value in einfo.items(constants.INISECT_OSP):
8719 if name not in self.op.osparams:
8720 self.op.osparams[name] = value
8722 def _RevertToDefaults(self, cluster):
8723 """Revert the instance parameters to the default values.
8727 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8728 for name in self.op.hvparams.keys():
8729 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8730 del self.op.hvparams[name]
8732 be_defs = cluster.SimpleFillBE({})
8733 for name in self.op.beparams.keys():
8734 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8735 del self.op.beparams[name]
8737 nic_defs = cluster.SimpleFillNIC({})
8738 for nic in self.op.nics:
8739 for name in constants.NICS_PARAMETERS:
8740 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8743 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8744 for name in self.op.osparams.keys():
8745 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8746 del self.op.osparams[name]
8748 def _CalculateFileStorageDir(self):
8749 """Calculate final instance file storage dir.
8752 # file storage dir calculation/check
8753 self.instance_file_storage_dir = None
8754 if self.op.disk_template in constants.DTS_FILEBASED:
8755 # build the full file storage dir path
8758 if self.op.disk_template == constants.DT_SHARED_FILE:
8759 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8761 get_fsd_fn = self.cfg.GetFileStorageDir
8763 cfg_storagedir = get_fsd_fn()
8764 if not cfg_storagedir:
8765 raise errors.OpPrereqError("Cluster file storage dir not defined")
8766 joinargs.append(cfg_storagedir)
8768 if self.op.file_storage_dir is not None:
8769 joinargs.append(self.op.file_storage_dir)
8771 joinargs.append(self.op.instance_name)
8773 # pylint: disable=W0142
8774 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
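# Illustrative result (hypothetical paths): with a cluster file storage dir of
# "/srv/ganeti/file-storage", an op.file_storage_dir of "group1" and instance
# name "web1", the final dir is "/srv/ganeti/file-storage/group1/web1".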
8776 def CheckPrereq(self):
8777 """Check prerequisites.
8780 self._CalculateFileStorageDir()
8782 if self.op.mode == constants.INSTANCE_IMPORT:
8783 export_info = self._ReadExportInfo()
8784 self._ReadExportParams(export_info)
8786 if (not self.cfg.GetVGName() and
8787 self.op.disk_template not in constants.DTS_NOT_LVM):
8788 raise errors.OpPrereqError("Cluster does not support lvm-based"
8789 " instances", errors.ECODE_STATE)
8791 if (self.op.hypervisor is None or
8792 self.op.hypervisor == constants.VALUE_AUTO):
8793 self.op.hypervisor = self.cfg.GetHypervisorType()
8795 cluster = self.cfg.GetClusterInfo()
8796 enabled_hvs = cluster.enabled_hypervisors
8797 if self.op.hypervisor not in enabled_hvs:
8798 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8799 " cluster (%s)" % (self.op.hypervisor,
8800 ",".join(enabled_hvs)),
8803 # Check tag validity
8804 for tag in self.op.tags:
8805 objects.TaggableObject.ValidateTag(tag)
8807 # check hypervisor parameter syntax (locally)
8808 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8809 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8811 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8812 hv_type.CheckParameterSyntax(filled_hvp)
8813 self.hv_full = filled_hvp
8814 # check that we don't specify global parameters on an instance
8815 _CheckGlobalHvParams(self.op.hvparams)
8817 # fill and remember the beparams dict
8818 default_beparams = cluster.beparams[constants.PP_DEFAULT]
8819 for param, value in self.op.beparams.iteritems():
8820 if value == constants.VALUE_AUTO:
8821 self.op.beparams[param] = default_beparams[param]
8822 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8823 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8825 # build os parameters
8826 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8828 # now that hvp/bep are in final format, let's reset to defaults,
8830 if self.op.identify_defaults:
8831 self._RevertToDefaults(cluster)
8835 for idx, nic in enumerate(self.op.nics):
8836 nic_mode_req = nic.get(constants.INIC_MODE, None)
8837 nic_mode = nic_mode_req
8838 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
8839 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8841 # in routed mode, for the first nic, the default ip is 'auto'
8842 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8843 default_ip_mode = constants.VALUE_AUTO
8845 default_ip_mode = constants.VALUE_NONE
8847 # ip validity checks
8848 ip = nic.get(constants.INIC_IP, default_ip_mode)
8849 if ip is None or ip.lower() == constants.VALUE_NONE:
8851 elif ip.lower() == constants.VALUE_AUTO:
8852 if not self.op.name_check:
8853 raise errors.OpPrereqError("IP address set to auto but name checks"
8854 " have been skipped",
8856 nic_ip = self.hostname1.ip
8858 if not netutils.IPAddress.IsValid(ip):
8859 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8863 # TODO: check the ip address for uniqueness
8864 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8865 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8868 # MAC address verification
8869 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8870 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8871 mac = utils.NormalizeAndValidateMac(mac)
8874 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8875 except errors.ReservationError:
8876 raise errors.OpPrereqError("MAC address %s already in use"
8877 " in cluster" % mac,
8878 errors.ECODE_NOTUNIQUE)
8880 # Build nic parameters
8881 link = nic.get(constants.INIC_LINK, None)
8882 if link == constants.VALUE_AUTO:
8883 link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
8886 nicparams[constants.NIC_MODE] = nic_mode
8888 nicparams[constants.NIC_LINK] = link
8890 check_params = cluster.SimpleFillNIC(nicparams)
8891 objects.NIC.CheckParameterSyntax(check_params)
8892 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
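# Illustrative outcome (hypothetical values): requesting a NIC with
# {INIC_MODE: "bridged", INIC_LINK: "br0"} stores just those two overrides in
# the NIC's nicparams; SimpleFillNIC above merges them with the cluster
# defaults only for the syntax check.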
8894 # disk checks/pre-build
8895 default_vg = self.cfg.GetVGName()
8897 for disk in self.op.disks:
8898 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8899 if mode not in constants.DISK_ACCESS_SET:
8900 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8901 mode, errors.ECODE_INVAL)
8902 size = disk.get(constants.IDISK_SIZE, None)
8904 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8907 except (TypeError, ValueError):
8908 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8911 data_vg = disk.get(constants.IDISK_VG, default_vg)
8913 constants.IDISK_SIZE: size,
8914 constants.IDISK_MODE: mode,
8915 constants.IDISK_VG: data_vg,
8916 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8918 if constants.IDISK_ADOPT in disk:
8919 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8920 self.disks.append(new_disk)
8922 if self.op.mode == constants.INSTANCE_IMPORT:
8924 for idx in range(len(self.disks)):
8925 option = "disk%d_dump" % idx
8926 if export_info.has_option(constants.INISECT_INS, option):
8927 # FIXME: are the old os-es, disk sizes, etc. useful?
8928 export_name = export_info.get(constants.INISECT_INS, option)
8929 image = utils.PathJoin(self.op.src_path, export_name)
8930 disk_images.append(image)
8932 disk_images.append(False)
8934 self.src_images = disk_images
8936 old_name = export_info.get(constants.INISECT_INS, "name")
8937 if self.op.instance_name == old_name:
8938 for idx, nic in enumerate(self.nics):
8939 if nic.mac == constants.VALUE_AUTO:
8940 nic_mac_ini = "nic%d_mac" % idx
8941 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8943 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8945 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8946 if self.op.ip_check:
8947 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8948 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8949 (self.check_ip, self.op.instance_name),
8950 errors.ECODE_NOTUNIQUE)
8952 #### mac address generation
8953 # By generating here the mac address both the allocator and the hooks get
8954 # the real final mac address rather than the 'auto' or 'generate' value.
8955 # There is a race condition between the generation and the instance object
8956 # creation, which means that we know the mac is valid now, but we're not
8957 # sure it will be when we actually add the instance. If things go bad
8958 # adding the instance will abort because of a duplicate mac, and the
8959 # creation job will fail.
8960 for nic in self.nics:
8961 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8962 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8966 if self.op.iallocator is not None:
8967 self._RunAllocator()
8969 #### node related checks
8971 # check primary node
8972 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8973 assert self.pnode is not None, \
8974 "Cannot retrieve locked node %s" % self.op.pnode
8976 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8977 pnode.name, errors.ECODE_STATE)
8979 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8980 pnode.name, errors.ECODE_STATE)
8981 if not pnode.vm_capable:
8982 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8983 " '%s'" % pnode.name, errors.ECODE_STATE)
8985 self.secondaries = []
8987 # mirror node verification
8988 if self.op.disk_template in constants.DTS_INT_MIRROR:
8989 if self.op.snode == pnode.name:
8990 raise errors.OpPrereqError("The secondary node cannot be the"
8991 " primary node", errors.ECODE_INVAL)
8992 _CheckNodeOnline(self, self.op.snode)
8993 _CheckNodeNotDrained(self, self.op.snode)
8994 _CheckNodeVmCapable(self, self.op.snode)
8995 self.secondaries.append(self.op.snode)
8997 nodenames = [pnode.name] + self.secondaries
8999 if not self.adopt_disks:
9000 # Check lv size requirements, if not adopting
9001 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
9002 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
9004 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
9005 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
9006 disk[constants.IDISK_ADOPT])
9007 for disk in self.disks])
9008 if len(all_lvs) != len(self.disks):
9009 raise errors.OpPrereqError("Duplicate volume names given for adoption",
9011 for lv_name in all_lvs:
9013 # FIXME: lv_name here is "vg/lv"; we need to ensure that other calls
9014 # to ReserveLV use the same syntax
9015 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
9016 except errors.ReservationError:
9017 raise errors.OpPrereqError("LV named %s used by another instance" %
9018 lv_name, errors.ECODE_NOTUNIQUE)
9020 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
9021 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
9023 node_lvs = self.rpc.call_lv_list([pnode.name],
9024 vg_names.payload.keys())[pnode.name]
9025 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
9026 node_lvs = node_lvs.payload
9028 delta = all_lvs.difference(node_lvs.keys())
9030 raise errors.OpPrereqError("Missing logical volume(s): %s" %
9031 utils.CommaJoin(delta),
9033 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
9035 raise errors.OpPrereqError("Online logical volumes found, cannot"
9036 " adopt: %s" % utils.CommaJoin(online_lvs),
9038 # update the size of disk based on what is found
9039 for dsk in self.disks:
9040 dsk[constants.IDISK_SIZE] = \
9041 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
9042 dsk[constants.IDISK_ADOPT])][0]))
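# e.g. (illustrative): an adopted LV whose reported size is 10240.25 MiB ends
# up with IDISK_SIZE == 10240, since int(float(...)) truncates the value.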
9044 elif self.op.disk_template == constants.DT_BLOCK:
9045 # Normalize and de-duplicate device paths
9046 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
9047 for disk in self.disks])
9048 if len(all_disks) != len(self.disks):
9049 raise errors.OpPrereqError("Duplicate disk names given for adoption",
9051 baddisks = [d for d in all_disks
9052 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
9054 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
9055 " cannot be adopted" %
9056 (", ".join(baddisks),
9057 constants.ADOPTABLE_BLOCKDEV_ROOT),
9060 node_disks = self.rpc.call_bdev_sizes([pnode.name],
9061 list(all_disks))[pnode.name]
9062 node_disks.Raise("Cannot get block device information from node %s" %
9064 node_disks = node_disks.payload
9065 delta = all_disks.difference(node_disks.keys())
9067 raise errors.OpPrereqError("Missing block device(s): %s" %
9068 utils.CommaJoin(delta),
9070 for dsk in self.disks:
9071 dsk[constants.IDISK_SIZE] = \
9072 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9074 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9076 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9077 # check OS parameters (remotely)
9078 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9080 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9082 # memory check on primary node
9084 _CheckNodeFreeMemory(self, self.pnode.name,
9085 "creating instance %s" % self.op.instance_name,
9086 self.be_full[constants.BE_MEMORY],
9089 self.dry_run_result = list(nodenames)
9091 def Exec(self, feedback_fn):
9092 """Create and add the instance to the cluster.
9095 instance = self.op.instance_name
9096 pnode_name = self.pnode.name
9098 ht_kind = self.op.hypervisor
9099 if ht_kind in constants.HTS_REQ_PORT:
9100 network_port = self.cfg.AllocatePort()
9104 disks = _GenerateDiskTemplate(self,
9105 self.op.disk_template,
9106 instance, pnode_name,
9109 self.instance_file_storage_dir,
9110 self.op.file_driver,
9114 iobj = objects.Instance(name=instance, os=self.op.os_type,
9115 primary_node=pnode_name,
9116 nics=self.nics, disks=disks,
9117 disk_template=self.op.disk_template,
9119 network_port=network_port,
9120 beparams=self.op.beparams,
9121 hvparams=self.op.hvparams,
9122 hypervisor=self.op.hypervisor,
9123 osparams=self.op.osparams,
9127 for tag in self.op.tags:
9130 if self.adopt_disks:
9131 if self.op.disk_template == constants.DT_PLAIN:
9132 # rename LVs to the newly-generated names; we need to construct
9133 # 'fake' LV disks with the old data, plus the new unique_id
9134 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9136 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9137 rename_to.append(t_dsk.logical_id)
9138 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9139 self.cfg.SetDiskID(t_dsk, pnode_name)
9140 result = self.rpc.call_blockdev_rename(pnode_name,
9141 zip(tmp_disks, rename_to))
9142 result.Raise("Failed to rename adopted LVs")
9144 feedback_fn("* creating instance disks...")
9146 _CreateDisks(self, iobj)
9147 except errors.OpExecError:
9148 self.LogWarning("Device creation failed, reverting...")
9150 _RemoveDisks(self, iobj)
9152 self.cfg.ReleaseDRBDMinors(instance)
9155 feedback_fn("adding instance %s to cluster config" % instance)
9157 self.cfg.AddInstance(iobj, self.proc.GetECId())
9159 # Declare that we don't want to remove the instance lock anymore, as we've
9160 # added the instance to the config
9161 del self.remove_locks[locking.LEVEL_INSTANCE]
9163 if self.op.mode == constants.INSTANCE_IMPORT:
9164 # Release unused nodes
9165 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9168 _ReleaseLocks(self, locking.LEVEL_NODE)
9171 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9172 feedback_fn("* wiping instance disks...")
9174 _WipeDisks(self, iobj)
9175 except errors.OpExecError, err:
9176 logging.exception("Wiping disks failed")
9177 self.LogWarning("Wiping instance disks failed (%s)", err)
9181 # Something is already wrong with the disks, don't do anything else
9183 elif self.op.wait_for_sync:
9184 disk_abort = not _WaitForSync(self, iobj)
9185 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9186 # make sure the disks are not degraded (still sync-ing is ok)
9187 feedback_fn("* checking mirrors status")
9188 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9193 _RemoveDisks(self, iobj)
9194 self.cfg.RemoveInstance(iobj.name)
9195 # Make sure the instance lock gets removed
9196 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9197 raise errors.OpExecError("There are some degraded disks for"
9200 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9201 if self.op.mode == constants.INSTANCE_CREATE:
9202 if not self.op.no_install:
9203 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9204 not self.op.wait_for_sync)
9206 feedback_fn("* pausing disk sync to install instance OS")
9207 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9209 for idx, success in enumerate(result.payload):
9211 logging.warn("pause-sync of instance %s for disk %d failed",
9214 feedback_fn("* running the instance OS create scripts...")
9215 # FIXME: pass debug option from opcode to backend
9217 self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
9218 self.op.debug_level)
9220 feedback_fn("* resuming disk sync")
9221 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9223 for idx, success in enumerate(result.payload):
9225 logging.warn("resume-sync of instance %s for disk %d failed",
9228 os_add_result.Raise("Could not add os for instance %s"
9229 " on node %s" % (instance, pnode_name))
9231 elif self.op.mode == constants.INSTANCE_IMPORT:
9232 feedback_fn("* running the instance OS import scripts...")
9236 for idx, image in enumerate(self.src_images):
9240 # FIXME: pass debug option from opcode to backend
9241 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9242 constants.IEIO_FILE, (image, ),
9243 constants.IEIO_SCRIPT,
9244 (iobj.disks[idx], idx),
9246 transfers.append(dt)
9249 masterd.instance.TransferInstanceData(self, feedback_fn,
9250 self.op.src_node, pnode_name,
9251 self.pnode.secondary_ip,
9253 if not compat.all(import_result):
9254 self.LogWarning("Some disks for instance %s on node %s were not"
9255 " imported successfully" % (instance, pnode_name))
9257 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9258 feedback_fn("* preparing remote import...")
9259 # The source cluster will stop the instance before attempting to make a
9260 # connection. In some cases stopping an instance can take a long time,
9261 # hence the shutdown timeout is added to the connection timeout.
9262 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9263 self.op.source_shutdown_timeout)
9264 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9266 assert iobj.primary_node == self.pnode.name
9268 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9269 self.source_x509_ca,
9270 self._cds, timeouts)
9271 if not compat.all(disk_results):
9272 # TODO: Should the instance still be started, even if some disks
9273 # failed to import (valid for local imports, too)?
9274 self.LogWarning("Some disks for instance %s on node %s were not"
9275 " imported successfully" % (instance, pnode_name))
9277 # Run rename script on newly imported instance
9278 assert iobj.name == instance
9279 feedback_fn("Running rename script for %s" % instance)
9280 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9281 self.source_instance_name,
9282 self.op.debug_level)
9284 self.LogWarning("Failed to run rename script for %s on node"
9285 " %s: %s" % (instance, pnode_name, result.fail_msg))
9288 # also checked in the prereq part
9289 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9293 iobj.admin_up = True
9294 self.cfg.Update(iobj, feedback_fn)
9295 logging.info("Starting instance %s on node %s", instance, pnode_name)
9296 feedback_fn("* starting instance...")
9297 result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
9299 result.Raise("Could not start instance")
9301 return list(iobj.all_nodes)
9304 class LUInstanceConsole(NoHooksLU):
9305 """Connect to an instance's console.
9307 This is somewhat special in that it returns the command line that
9308 you need to run on the master node in order to connect to the
9314 def ExpandNames(self):
9315 self._ExpandAndLockInstance()
9317 def CheckPrereq(self):
9318 """Check prerequisites.
9320 This checks that the instance is in the cluster.
9323 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9324 assert self.instance is not None, \
9325 "Cannot retrieve locked instance %s" % self.op.instance_name
9326 _CheckNodeOnline(self, self.instance.primary_node)
9328 def Exec(self, feedback_fn):
9329 """Connect to the console of an instance
9332 instance = self.instance
9333 node = instance.primary_node
9335 node_insts = self.rpc.call_instance_list([node],
9336 [instance.hypervisor])[node]
9337 node_insts.Raise("Can't get node information from %s" % node)
9339 if instance.name not in node_insts.payload:
9340 if instance.admin_up:
9341 state = constants.INSTST_ERRORDOWN
9343 state = constants.INSTST_ADMINDOWN
9344 raise errors.OpExecError("Instance %s is not running (state %s)" %
9345 (instance.name, state))
9347 logging.debug("Connecting to console of %s on %s", instance.name, node)
9349 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9352 def _GetInstanceConsole(cluster, instance):
9353 """Returns console information for an instance.
9355 @type cluster: L{objects.Cluster}
9356 @type instance: L{objects.Instance}
9360 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9361 # beparams and hvparams are passed separately, to avoid editing the
9362 # instance and then saving the defaults in the instance itself.
9363 hvparams = cluster.FillHV(instance)
9364 beparams = cluster.FillBE(instance)
9365 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9367 assert console.instance == instance.name
9368 assert console.Validate()
9370 return console.ToDict()
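# Note for readers (illustrative only, not part of the original module): the
# dict returned above is the serialized console object; for an SSH-based
# console it would look roughly like
#   {"instance": "inst1.example.com", "kind": "ssh", "host": "node1", ...}
# The exact keys depend on the hypervisor's GetInstanceConsole implementation.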
9373 class LUInstanceReplaceDisks(LogicalUnit):
9374 """Replace the disks of an instance.
9377 HPATH = "mirrors-replace"
9378 HTYPE = constants.HTYPE_INSTANCE
9381 def CheckArguments(self):
9382 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9385 def ExpandNames(self):
9386 self._ExpandAndLockInstance()
9388 assert locking.LEVEL_NODE not in self.needed_locks
9389 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9391 assert self.op.iallocator is None or self.op.remote_node is None, \
9392 "Conflicting options"
9394 if self.op.remote_node is not None:
9395 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9397 # Warning: do not remove the locking of the new secondary here
9398 # unless DRBD8.AddChildren is changed to work in parallel;
9399 # currently it doesn't since parallel invocations of
9400 # FindUnusedMinor will conflict
9401 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9402 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9404 self.needed_locks[locking.LEVEL_NODE] = []
9405 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9407 if self.op.iallocator is not None:
9408 # iallocator will select a new node in the same group
9409 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9411 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9412 self.op.iallocator, self.op.remote_node,
9413 self.op.disks, False, self.op.early_release)
9415 self.tasklets = [self.replacer]
9417 def DeclareLocks(self, level):
9418 if level == locking.LEVEL_NODEGROUP:
9419 assert self.op.remote_node is None
9420 assert self.op.iallocator is not None
9421 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9423 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9424 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9425 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9427 elif level == locking.LEVEL_NODE:
9428 if self.op.iallocator is not None:
9429 assert self.op.remote_node is None
9430 assert not self.needed_locks[locking.LEVEL_NODE]
9432 # Lock member nodes of all locked groups
9433 self.needed_locks[locking.LEVEL_NODE] = [node_name
9434 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9435 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9437 self._LockInstancesNodes()
9439 def BuildHooksEnv(self):
9442 This runs on the master, the primary and all the secondaries.
9445 instance = self.replacer.instance
9447 "MODE": self.op.mode,
9448 "NEW_SECONDARY": self.op.remote_node,
9449 "OLD_SECONDARY": instance.secondary_nodes[0],
9451 env.update(_BuildInstanceHookEnvByObject(self, instance))
9454 def BuildHooksNodes(self):
9455 """Build hooks nodes.
9458 instance = self.replacer.instance
9460 self.cfg.GetMasterNode(),
9461 instance.primary_node,
9463 if self.op.remote_node is not None:
9464 nl.append(self.op.remote_node)
9467 def CheckPrereq(self):
9468 """Check prerequisites.
9471 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9472 self.op.iallocator is None)
9474 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9476 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9478 return LogicalUnit.CheckPrereq(self)
9481 class TLReplaceDisks(Tasklet):
9482 """Replaces disks for an instance.
9484 Note: Locking is not within the scope of this class.
9487 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9488 disks, delay_iallocator, early_release):
9489 """Initializes this class.
9492 Tasklet.__init__(self, lu)
9495 self.instance_name = instance_name
9497 self.iallocator_name = iallocator_name
9498 self.remote_node = remote_node
9500 self.delay_iallocator = delay_iallocator
9501 self.early_release = early_release
9504 self.instance = None
9505 self.new_node = None
9506 self.target_node = None
9507 self.other_node = None
9508 self.remote_node_info = None
9509 self.node_secondary_ip = None
9512 def CheckArguments(mode, remote_node, iallocator):
9513 """Helper function for users of this class.
9516 # check for valid parameter combination
9517 if mode == constants.REPLACE_DISK_CHG:
9518 if remote_node is None and iallocator is None:
9519 raise errors.OpPrereqError("When changing the secondary either an"
9520 " iallocator script must be used or the"
9521 " new node given", errors.ECODE_INVAL)
9523 if remote_node is not None and iallocator is not None:
9524 raise errors.OpPrereqError("Give either the iallocator or the new"
9525 " secondary, not both", errors.ECODE_INVAL)
9527 elif remote_node is not None or iallocator is not None:
9528 # Not replacing the secondary
9529 raise errors.OpPrereqError("The iallocator and new node options can"
9530 " only be used when changing the"
9531 " secondary node", errors.ECODE_INVAL)
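# Illustration of the accepted combinations (hypothetical node/script names):
# when changing the secondary, exactly one of the two options must be given,
#   TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG, "node3", None)
#   TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG, None, "hail")
# while passing both, or neither, raises OpPrereqError; for the other modes
# both options must be left unset.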
9534 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9535 """Compute a new secondary node using an IAllocator.
9538 ial = IAllocator(lu.cfg, lu.rpc,
9539 mode=constants.IALLOCATOR_MODE_RELOC,
9541 relocate_from=list(relocate_from))
9543 ial.Run(iallocator_name)
9546 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9547 " %s" % (iallocator_name, ial.info),
9550 if len(ial.result) != ial.required_nodes:
9551 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9552 " of nodes (%s), required %s" %
9554 len(ial.result), ial.required_nodes),
9557 remote_node_name = ial.result[0]
9559 lu.LogInfo("Selected new secondary for instance '%s': %s",
9560 instance_name, remote_node_name)
9562 return remote_node_name
9564 def _FindFaultyDisks(self, node_name):
9565 """Wrapper for L{_FindFaultyInstanceDisks}.
9568 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9571 def _CheckDisksActivated(self, instance):
9572 """Checks if the instance disks are activated.
9574 @param instance: The instance to check disks
9575 @return: True if they are activated, False otherwise
9578 nodes = instance.all_nodes
9580 for idx, dev in enumerate(instance.disks):
9582 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9583 self.cfg.SetDiskID(dev, node)
9585 result = self.rpc.call_blockdev_find(node, dev)
9589 elif result.fail_msg or not result.payload:
9594 def CheckPrereq(self):
9595 """Check prerequisites.
9597 This checks that the instance is in the cluster.
9600 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9601 assert instance is not None, \
9602 "Cannot retrieve locked instance %s" % self.instance_name
9604 if instance.disk_template != constants.DT_DRBD8:
9605 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9606 " instances", errors.ECODE_INVAL)
9608 if len(instance.secondary_nodes) != 1:
9609 raise errors.OpPrereqError("The instance has a strange layout,"
9610 " expected one secondary but found %d" %
9611 len(instance.secondary_nodes),
9614 if not self.delay_iallocator:
9615 self._CheckPrereq2()
9617 def _CheckPrereq2(self):
9618 """Check prerequisites, second part.
9620 This function should always be part of CheckPrereq. It was separated and is
9621 now called from Exec because during node evacuation iallocator was only
9622 called with an unmodified cluster model, not taking planned changes into account.
9626 instance = self.instance
9627 secondary_node = instance.secondary_nodes[0]
9629 if self.iallocator_name is None:
9630 remote_node = self.remote_node
9632 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9633 instance.name, instance.secondary_nodes)
9635 if remote_node is None:
9636 self.remote_node_info = None
9638 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9639 "Remote node '%s' is not locked" % remote_node
9641 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9642 assert self.remote_node_info is not None, \
9643 "Cannot retrieve locked node %s" % remote_node
9645 if remote_node == self.instance.primary_node:
9646 raise errors.OpPrereqError("The specified node is the primary node of"
9647 " the instance", errors.ECODE_INVAL)
9649 if remote_node == secondary_node:
9650 raise errors.OpPrereqError("The specified node is already the"
9651 " secondary node of the instance",
9654 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9655 constants.REPLACE_DISK_CHG):
9656 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9659 if self.mode == constants.REPLACE_DISK_AUTO:
9660 if not self._CheckDisksActivated(instance):
9661 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9662 " first" % self.instance_name,
9664 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9665 faulty_secondary = self._FindFaultyDisks(secondary_node)
9667 if faulty_primary and faulty_secondary:
9668 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9669 " one node and can not be repaired"
9670 " automatically" % self.instance_name,
9674 self.disks = faulty_primary
9675 self.target_node = instance.primary_node
9676 self.other_node = secondary_node
9677 check_nodes = [self.target_node, self.other_node]
9678 elif faulty_secondary:
9679 self.disks = faulty_secondary
9680 self.target_node = secondary_node
9681 self.other_node = instance.primary_node
9682 check_nodes = [self.target_node, self.other_node]
9688 # Non-automatic modes
9689 if self.mode == constants.REPLACE_DISK_PRI:
9690 self.target_node = instance.primary_node
9691 self.other_node = secondary_node
9692 check_nodes = [self.target_node, self.other_node]
9694 elif self.mode == constants.REPLACE_DISK_SEC:
9695 self.target_node = secondary_node
9696 self.other_node = instance.primary_node
9697 check_nodes = [self.target_node, self.other_node]
9699 elif self.mode == constants.REPLACE_DISK_CHG:
9700 self.new_node = remote_node
9701 self.other_node = instance.primary_node
9702 self.target_node = secondary_node
9703 check_nodes = [self.new_node, self.other_node]
9705 _CheckNodeNotDrained(self.lu, remote_node)
9706 _CheckNodeVmCapable(self.lu, remote_node)
9708 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9709 assert old_node_info is not None
9710 if old_node_info.offline and not self.early_release:
9711 # doesn't make sense to delay the release
9712 self.early_release = True
9713 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9714 " early-release mode", secondary_node)
9717 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9720 # If not specified all disks should be replaced
9722 self.disks = range(len(self.instance.disks))
9724 for node in check_nodes:
9725 _CheckNodeOnline(self.lu, node)
9727 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9730 if node_name is not None)
9732 # Release unneeded node locks
9733 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9735 # Release any owned node group
9736 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9737 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9739 # Check whether disks are valid
9740 for disk_idx in self.disks:
9741 instance.FindDisk(disk_idx)
9743 # Get secondary node IP addresses
9744 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9745 in self.cfg.GetMultiNodeInfo(touched_nodes))
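# Sketch of the resulting mapping (addresses hypothetical):
#   self.node_secondary_ip == {"node1": "192.0.2.1", "node4": "192.0.2.4"}
# i.e. the secondary (replication) IP of every node touched by the replace.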
9747 def Exec(self, feedback_fn):
9748 """Execute disk replacement.
9750 This dispatches the disk replacement to the appropriate handler.
9753 if self.delay_iallocator:
9754 self._CheckPrereq2()
9757 # Verify owned locks before starting operation
9758 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9759 assert set(owned_nodes) == set(self.node_secondary_ip), \
9760 ("Incorrect node locks, owning %s, expected %s" %
9761 (owned_nodes, self.node_secondary_ip.keys()))
9763 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9764 assert list(owned_instances) == [self.instance_name], \
9765 "Instance '%s' not locked" % self.instance_name
9767 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9768 "Should not own any node group lock at this point"
9771 feedback_fn("No disks need replacement")
9774 feedback_fn("Replacing disk(s) %s for %s" %
9775 (utils.CommaJoin(self.disks), self.instance.name))
9777 activate_disks = (not self.instance.admin_up)
9779 # Activate the instance disks if we're replacing them on a down instance
9781 _StartInstanceDisks(self.lu, self.instance, True)
9784 # Should we replace the secondary node?
9785 if self.new_node is not None:
9786 fn = self._ExecDrbd8Secondary
9788 fn = self._ExecDrbd8DiskOnly
9790 result = fn(feedback_fn)
9792 # Deactivate the instance disks if we're replacing them on a
9795 _SafeShutdownInstanceDisks(self.lu, self.instance)
9798 # Verify owned locks
9799 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9800 nodes = frozenset(self.node_secondary_ip)
9801 assert ((self.early_release and not owned_nodes) or
9802 (not self.early_release and not (set(owned_nodes) - nodes))), \
9803 ("Not owning the correct locks, early_release=%s, owned=%r,"
9804 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9808 def _CheckVolumeGroup(self, nodes):
9809 self.lu.LogInfo("Checking volume groups")
9811 vgname = self.cfg.GetVGName()
9813 # Make sure volume group exists on all involved nodes
9814 results = self.rpc.call_vg_list(nodes)
9816 raise errors.OpExecError("Can't list volume groups on the nodes")
9820 res.Raise("Error checking node %s" % node)
9821 if vgname not in res.payload:
9822 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9825 def _CheckDisksExistence(self, nodes):
9826 # Check disk existence
9827 for idx, dev in enumerate(self.instance.disks):
9828 if idx not in self.disks:
9832 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9833 self.cfg.SetDiskID(dev, node)
9835 result = self.rpc.call_blockdev_find(node, dev)
9837 msg = result.fail_msg
9838 if msg or not result.payload:
9840 msg = "disk not found"
9841 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9844 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9845 for idx, dev in enumerate(self.instance.disks):
9846 if idx not in self.disks:
9849 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9852 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9854 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9855 " replace disks for instance %s" %
9856 (node_name, self.instance.name))
9858 def _CreateNewStorage(self, node_name):
9859 """Create new storage on the primary or secondary node.
9861 This is only used for same-node replaces, not for changing the
9862 secondary node, hence we don't want to modify the existing disk.
9867 for idx, dev in enumerate(self.instance.disks):
9868 if idx not in self.disks:
9871 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9873 self.cfg.SetDiskID(dev, node_name)
9875 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9876 names = _GenerateUniqueNames(self.lu, lv_names)
9878 vg_data = dev.children[0].logical_id[0]
9879 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9880 logical_id=(vg_data, names[0]))
9881 vg_meta = dev.children[1].logical_id[0]
9882 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
9883 logical_id=(vg_meta, names[1]))
9885 new_lvs = [lv_data, lv_meta]
9886 old_lvs = [child.Copy() for child in dev.children]
9887 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9889 # we pass force_create=True to force the LVM creation
9890 for new_lv in new_lvs:
9891 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9892 _GetInstanceInfoText(self.instance), False)
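# Rough sketch of the mapping built above (values illustrative): per DRBD
# device we remember the device object plus its old and new backing LV pairs,
#   iv_names == {"disk/0": (drbd_dev, [old_data_lv, old_meta_lv],
#                           [new_data_lv, new_meta_lv]), ...}
# which _CheckDevices and _RemoveOldStorage consume later.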
9896 def _CheckDevices(self, node_name, iv_names):
9897 for name, (dev, _, _) in iv_names.iteritems():
9898 self.cfg.SetDiskID(dev, node_name)
9900 result = self.rpc.call_blockdev_find(node_name, dev)
9902 msg = result.fail_msg
9903 if msg or not result.payload:
9905 msg = "disk not found"
9906 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9909 if result.payload.is_degraded:
9910 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9912 def _RemoveOldStorage(self, node_name, iv_names):
9913 for name, (_, old_lvs, _) in iv_names.iteritems():
9914 self.lu.LogInfo("Remove logical volumes for %s" % name)
9917 self.cfg.SetDiskID(lv, node_name)
9919 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9921 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9922 hint="remove unused LVs manually")
9924 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9925 """Replace a disk on the primary or secondary for DRBD 8.
9927 The algorithm for replace is quite complicated:
9929 1. for each disk to be replaced:
9931 1. create new LVs on the target node with unique names
9932 1. detach old LVs from the drbd device
9933 1. rename old LVs to name_replaced.<time_t>
9934 1. rename new LVs to old LVs
9935 1. attach the new LVs (with the old names now) to the drbd device
9937 1. wait for sync across all devices
9939 1. for each modified disk:
9941 1. remove old LVs (which have the name name_replaced.<time_t>)
9943 Failures are not very well handled.
9948 # Step: check device activation
9949 self.lu.LogStep(1, steps_total, "Check device existence")
9950 self._CheckDisksExistence([self.other_node, self.target_node])
9951 self._CheckVolumeGroup([self.target_node, self.other_node])
9953 # Step: check other node consistency
9954 self.lu.LogStep(2, steps_total, "Check peer consistency")
9955 self._CheckDisksConsistency(self.other_node,
9956 self.other_node == self.instance.primary_node,
9959 # Step: create new storage
9960 self.lu.LogStep(3, steps_total, "Allocate new storage")
9961 iv_names = self._CreateNewStorage(self.target_node)
9963 # Step: for each lv, detach+rename*2+attach
9964 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9965 for dev, old_lvs, new_lvs in iv_names.itervalues():
9966 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9968 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9970 result.Raise("Can't detach drbd from local storage on node"
9971 " %s for device %s" % (self.target_node, dev.iv_name))
9973 #cfg.Update(instance)
9975 # ok, we created the new LVs, so now we know we have the needed
9976 # storage; as such, we proceed on the target node to rename
9977 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9978 # using the assumption that logical_id == physical_id (which in
9979 # turn is the unique_id on that node)
9981 # FIXME(iustin): use a better name for the replaced LVs
9982 temp_suffix = int(time.time())
9983 ren_fn = lambda d, suff: (d.physical_id[0],
9984 d.physical_id[1] + "_replaced-%s" % suff)
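# For illustration (hypothetical values): with temp_suffix == 1400000000 and
# an LV whose physical_id is ("xenvg", ".disk0_data"), ren_fn yields
#   ("xenvg", ".disk0_data_replaced-1400000000")
# i.e. the old LV keeps its volume group but gets a unique, timestamped name.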
9986 # Build the rename list based on what LVs exist on the node
9987 rename_old_to_new = []
9988 for to_ren in old_lvs:
9989 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9990 if not result.fail_msg and result.payload:
9992 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9994 self.lu.LogInfo("Renaming the old LVs on the target node")
9995 result = self.rpc.call_blockdev_rename(self.target_node,
9997 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9999 # Now we rename the new LVs to the old LVs
10000 self.lu.LogInfo("Renaming the new LVs on the target node")
10001 rename_new_to_old = [(new, old.physical_id)
10002 for old, new in zip(old_lvs, new_lvs)]
10003 result = self.rpc.call_blockdev_rename(self.target_node,
10005 result.Raise("Can't rename new LVs on node %s" % self.target_node)
10007 # Intermediate steps of in memory modifications
10008 for old, new in zip(old_lvs, new_lvs):
10009 new.logical_id = old.logical_id
10010 self.cfg.SetDiskID(new, self.target_node)
10012 # We need to modify old_lvs so that removal later removes the
10013 # right LVs, not the newly added ones; note that old_lvs is a
10015 for disk in old_lvs:
10016 disk.logical_id = ren_fn(disk, temp_suffix)
10017 self.cfg.SetDiskID(disk, self.target_node)
10019 # Now that the new lvs have the old name, we can add them to the device
10020 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
10021 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
10023 msg = result.fail_msg
10025 for new_lv in new_lvs:
10026 msg2 = self.rpc.call_blockdev_remove(self.target_node,
10029 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
10030 hint=("cleanup manually the unused logical"
10032 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
10035 if self.early_release:
10036 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10038 self._RemoveOldStorage(self.target_node, iv_names)
10039 # WARNING: we release both node locks here, do not do other RPCs
10040 # than WaitForSync to the primary node
10041 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10042 names=[self.target_node, self.other_node])
10045 # This can fail as the old devices are degraded and _WaitForSync
10046 # does a combined result over all disks, so we don't check its return value
10047 self.lu.LogStep(cstep, steps_total, "Sync devices")
10049 _WaitForSync(self.lu, self.instance)
10051 # Check all devices manually
10052 self._CheckDevices(self.instance.primary_node, iv_names)
10054 # Step: remove old storage
10055 if not self.early_release:
10056 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10058 self._RemoveOldStorage(self.target_node, iv_names)
10060 def _ExecDrbd8Secondary(self, feedback_fn):
10061 """Replace the secondary node for DRBD 8.
10063 The algorithm for replace is quite complicated:
10064 - for all disks of the instance:
10065 - create new LVs on the new node with same names
10066 - shutdown the drbd device on the old secondary
10067 - disconnect the drbd network on the primary
10068 - create the drbd device on the new secondary
10069 - network attach the drbd on the primary, using an artifice:
10070 the drbd code for Attach() will connect to the network if it
10071 finds a device which is connected to the good local disks but
10072 not network enabled
10073 - wait for sync across all devices
10074 - remove all disks from the old secondary
10076 Failures are not very well handled.
10081 pnode = self.instance.primary_node
10083 # Step: check device activation
10084 self.lu.LogStep(1, steps_total, "Check device existence")
10085 self._CheckDisksExistence([self.instance.primary_node])
10086 self._CheckVolumeGroup([self.instance.primary_node])
10088 # Step: check other node consistency
10089 self.lu.LogStep(2, steps_total, "Check peer consistency")
10090 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10092 # Step: create new storage
10093 self.lu.LogStep(3, steps_total, "Allocate new storage")
10094 for idx, dev in enumerate(self.instance.disks):
10095 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10096 (self.new_node, idx))
10097 # we pass force_create=True to force LVM creation
10098 for new_lv in dev.children:
10099 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10100 _GetInstanceInfoText(self.instance), False)
10102 # Step 4: drbd minors and drbd setup changes
10103 # after this, we must manually remove the drbd minors on both the
10104 # error and the success paths
10105 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10106 minors = self.cfg.AllocateDRBDMinor([self.new_node
10107 for dev in self.instance.disks],
10108 self.instance.name)
10109 logging.debug("Allocated minors %r", minors)
10112 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10113 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10114 (self.new_node, idx))
10115 # create new devices on new_node; note that we create two IDs:
10116 # one without port, so the drbd will be activated without
10117 # networking information on the new node at this stage, and one
10118 # with network, for the latter activation in step 4
10119 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10120 if self.instance.primary_node == o_node1:
10123 assert self.instance.primary_node == o_node2, "Three-node instance?"
10126 new_alone_id = (self.instance.primary_node, self.new_node, None,
10127 p_minor, new_minor, o_secret)
10128 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10129 p_minor, new_minor, o_secret)
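# Illustrative example (hypothetical values): for a primary with minor 0, a
# newly allocated minor 3 on the new node, DRBD port 11000 and shared secret
# "s3cr3t", the two IDs would be
#   new_alone_id == ("node1", "node4", None,  0, 3, "s3cr3t")
#   new_net_id   == ("node1", "node4", 11000, 0, 3, "s3cr3t")
# the "alone" variant omits the port so the device is first brought up
# without any networking configuration.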
10131 iv_names[idx] = (dev, dev.children, new_net_id)
10132 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10134 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10135 logical_id=new_alone_id,
10136 children=dev.children,
10139 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10140 _GetInstanceInfoText(self.instance), False)
10141 except errors.GenericError:
10142 self.cfg.ReleaseDRBDMinors(self.instance.name)
10145 # We have new devices, shutdown the drbd on the old secondary
10146 for idx, dev in enumerate(self.instance.disks):
10147 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10148 self.cfg.SetDiskID(dev, self.target_node)
10149 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10151 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10152 " node: %s" % (idx, msg),
10153 hint=("Please cleanup this device manually as"
10154 " soon as possible"))
10156 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10157 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10158 self.instance.disks)[pnode]
10160 msg = result.fail_msg
10162 # detaches didn't succeed (unlikely)
10163 self.cfg.ReleaseDRBDMinors(self.instance.name)
10164 raise errors.OpExecError("Can't detach the disks from the network on"
10165 " old node: %s" % (msg,))
10167 # if we managed to detach at least one, we update all the disks of
10168 # the instance to point to the new secondary
10169 self.lu.LogInfo("Updating instance configuration")
10170 for dev, _, new_logical_id in iv_names.itervalues():
10171 dev.logical_id = new_logical_id
10172 self.cfg.SetDiskID(dev, self.instance.primary_node)
10174 self.cfg.Update(self.instance, feedback_fn)
10176 # and now perform the drbd attach
10177 self.lu.LogInfo("Attaching primary drbds to new secondary"
10178 " (standalone => connected)")
10179 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10181 self.node_secondary_ip,
10182 self.instance.disks,
10183 self.instance.name,
10185 for to_node, to_result in result.items():
10186 msg = to_result.fail_msg
10188 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10190 hint=("please do a gnt-instance info to see the"
10191 " status of disks"))
10193 if self.early_release:
10194 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10196 self._RemoveOldStorage(self.target_node, iv_names)
10197 # WARNING: we release all node locks here, do not do other RPCs
10198 # than WaitForSync to the primary node
10199 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10200 names=[self.instance.primary_node,
10205 # This can fail as the old devices are degraded and _WaitForSync
10206 # does a combined result over all disks, so we don't check its return value
10207 self.lu.LogStep(cstep, steps_total, "Sync devices")
10209 _WaitForSync(self.lu, self.instance)
10211 # Check all devices manually
10212 self._CheckDevices(self.instance.primary_node, iv_names)
10214 # Step: remove old storage
10215 if not self.early_release:
10216 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10217 self._RemoveOldStorage(self.target_node, iv_names)
10220 class LURepairNodeStorage(NoHooksLU):
10221 """Repairs the volume group on a node.
10226 def CheckArguments(self):
10227 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10229 storage_type = self.op.storage_type
10231 if (constants.SO_FIX_CONSISTENCY not in
10232 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10233 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10234 " repaired" % storage_type,
10235 errors.ECODE_INVAL)
10237 def ExpandNames(self):
10238 self.needed_locks = {
10239 locking.LEVEL_NODE: [self.op.node_name],
10242 def _CheckFaultyDisks(self, instance, node_name):
10243 """Ensure faulty disks abort the opcode or at least warn."""
10245 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10247 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10248 " node '%s'" % (instance.name, node_name),
10249 errors.ECODE_STATE)
10250 except errors.OpPrereqError, err:
10251 if self.op.ignore_consistency:
10252 self.proc.LogWarning(str(err.args[0]))
10256 def CheckPrereq(self):
10257 """Check prerequisites.
10260 # Check whether any instance on this node has faulty disks
10261 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10262 if not inst.admin_up:
10264 check_nodes = set(inst.all_nodes)
10265 check_nodes.discard(self.op.node_name)
10266 for inst_node_name in check_nodes:
10267 self._CheckFaultyDisks(inst, inst_node_name)
10269 def Exec(self, feedback_fn):
10270 feedback_fn("Repairing storage unit '%s' on %s ..." %
10271 (self.op.name, self.op.node_name))
10273 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10274 result = self.rpc.call_storage_execute(self.op.node_name,
10275 self.op.storage_type, st_args,
10277 constants.SO_FIX_CONSISTENCY)
10278 result.Raise("Failed to repair storage unit '%s' on %s" %
10279 (self.op.name, self.op.node_name))
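# As a rough illustration (assuming the usual CLI wrapper around this LU),
# the operation corresponds to something like
#   gnt-node repair-storage node1.example.com lvm-vg xenvg
# which re-runs the consistency fix-up for the given storage unit.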
10282 class LUNodeEvacuate(NoHooksLU):
10283 """Evacuates instances off a list of nodes.
10288 def CheckArguments(self):
10289 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10291 def ExpandNames(self):
10292 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10294 if self.op.remote_node is not None:
10295 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10296 assert self.op.remote_node
10298 if self.op.remote_node == self.op.node_name:
10299 raise errors.OpPrereqError("Can not use evacuated node as a new"
10300 " secondary node", errors.ECODE_INVAL)
10302 if self.op.mode != constants.IALLOCATOR_NEVAC_SEC:
10303 raise errors.OpPrereqError("Without the use of an iallocator only"
10304 " secondary instances can be evacuated",
10305 errors.ECODE_INVAL)
10308 self.share_locks = _ShareAll()
10309 self.needed_locks = {
10310 locking.LEVEL_INSTANCE: [],
10311 locking.LEVEL_NODEGROUP: [],
10312 locking.LEVEL_NODE: [],
10315 if self.op.remote_node is None:
10316 # Iallocator will choose any node(s) in the same group
10317 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10319 group_nodes = frozenset([self.op.remote_node])
10321 # Determine nodes to be locked
10322 self.lock_nodes = set([self.op.node_name]) | group_nodes
10324 def _DetermineInstances(self):
10325 """Builds list of instances to operate on.
10328 assert self.op.mode in constants.IALLOCATOR_NEVAC_MODES
10330 if self.op.mode == constants.IALLOCATOR_NEVAC_PRI:
10331 # Primary instances only
10332 inst_fn = _GetNodePrimaryInstances
10333 assert self.op.remote_node is None, \
10334 "Evacuating primary instances requires iallocator"
10335 elif self.op.mode == constants.IALLOCATOR_NEVAC_SEC:
10336 # Secondary instances only
10337 inst_fn = _GetNodeSecondaryInstances
10340 assert self.op.mode == constants.IALLOCATOR_NEVAC_ALL
10341 inst_fn = _GetNodeInstances
10343 return inst_fn(self.cfg, self.op.node_name)
10345 def DeclareLocks(self, level):
10346 if level == locking.LEVEL_INSTANCE:
10347 # Lock instances optimistically, needs verification once node and group
10348 # locks have been acquired
10349 self.needed_locks[locking.LEVEL_INSTANCE] = \
10350 set(i.name for i in self._DetermineInstances())
10352 elif level == locking.LEVEL_NODEGROUP:
10353 # Lock node groups optimistically, needs verification once nodes have
10355 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10356 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10358 elif level == locking.LEVEL_NODE:
10359 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10361 def CheckPrereq(self):
10363 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10364 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10365 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10367 assert owned_nodes == self.lock_nodes
10369 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10370 if owned_groups != wanted_groups:
10371 raise errors.OpExecError("Node groups changed since locks were acquired,"
10372 " current groups are '%s', used to be '%s'" %
10373 (utils.CommaJoin(wanted_groups),
10374 utils.CommaJoin(owned_groups)))
10376 # Determine affected instances
10377 self.instances = self._DetermineInstances()
10378 self.instance_names = [i.name for i in self.instances]
10380 if set(self.instance_names) != owned_instances:
10381 raise errors.OpExecError("Instances on node '%s' changed since locks"
10382 " were acquired, current instances are '%s',"
10383 " used to be '%s'" %
10384 (self.op.node_name,
10385 utils.CommaJoin(self.instance_names),
10386 utils.CommaJoin(owned_instances)))
10388 if self.instance_names:
10389 self.LogInfo("Evacuating instances from node '%s': %s",
10391 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10393 self.LogInfo("No instances to evacuate from node '%s'",
10396 if self.op.remote_node is not None:
10397 for i in self.instances:
10398 if i.primary_node == self.op.remote_node:
10399 raise errors.OpPrereqError("Node %s is the primary node of"
10400 " instance %s, cannot use it as"
10402 (self.op.remote_node, i.name),
10403 errors.ECODE_INVAL)
10405 def Exec(self, feedback_fn):
10406 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10408 if not self.instance_names:
10409 # No instances to evacuate
10412 elif self.op.iallocator is not None:
10413 # TODO: Implement relocation to other group
10414 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10415 evac_mode=self.op.mode,
10416 instances=list(self.instance_names))
10418 ial.Run(self.op.iallocator)
10420 if not ial.success:
10421 raise errors.OpPrereqError("Can't compute node evacuation using"
10422 " iallocator '%s': %s" %
10423 (self.op.iallocator, ial.info),
10424 errors.ECODE_NORES)
10426 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10428 elif self.op.remote_node is not None:
10429 assert self.op.mode == constants.IALLOCATOR_NEVAC_SEC
10431 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10432 remote_node=self.op.remote_node,
10434 mode=constants.REPLACE_DISK_CHG,
10435 early_release=self.op.early_release)]
10436 for instance_name in self.instance_names
10440 raise errors.ProgrammerError("No iallocator or remote node")
10442 return ResultWithJobs(jobs)
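# For the remote_node branch above, the submitted jobs are simply one
# single-opcode job per affected instance, roughly (names hypothetical):
#   [[OpInstanceReplaceDisks(instance_name="inst1", remote_node="node9",
#                            mode=constants.REPLACE_DISK_CHG,
#                            early_release=False)],
#    [OpInstanceReplaceDisks(instance_name="inst2", ...)]]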
10445 def _SetOpEarlyRelease(early_release, op):
10446 """Sets C{early_release} flag on opcodes if available.
10450 op.early_release = early_release
10451 except AttributeError:
10452 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10457 def _NodeEvacDest(use_nodes, group, nodes):
10458 """Returns group or nodes depending on caller's choice.
10462 return utils.CommaJoin(nodes)
10467 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10468 """Unpacks the result of change-group and node-evacuate iallocator requests.
10470 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10471 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10473 @type lu: L{LogicalUnit}
10474 @param lu: Logical unit instance
10475 @type alloc_result: tuple/list
10476 @param alloc_result: Result from iallocator
10477 @type early_release: bool
10478 @param early_release: Whether to release locks early if possible
10479 @type use_nodes: bool
10480 @param use_nodes: Whether to display node names instead of groups
10483 (moved, failed, jobs) = alloc_result
10486 lu.LogWarning("Unable to evacuate instances %s",
10487 utils.CommaJoin("%s (%s)" % (name, reason)
10488 for (name, reason) in failed))
10491 lu.LogInfo("Instances to be moved: %s",
10492 utils.CommaJoin("%s (to %s)" %
10493 (name, _NodeEvacDest(use_nodes, group, nodes))
10494 for (name, group, nodes) in moved))
10496 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10497 map(opcodes.OpCode.LoadOpCode, ops))
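# For orientation (illustrative values only), an alloc_result as unpacked
# above has the shape
#   ([("inst1", "group1", ["node2"])],        # moved: (name, group, nodes)
#    [("inst2", "disks are not activated")],  # failed: (name, reason)
#    [[{"OP_ID": "OP_INSTANCE_REPLACE_DISKS", ...}]])  # jobs: serialized ops
# each inner job list is deserialized via opcodes.OpCode.LoadOpCode and gets
# the early_release flag applied where the opcode supports it.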
10501 class LUInstanceGrowDisk(LogicalUnit):
10502 """Grow a disk of an instance.
10505 HPATH = "disk-grow"
10506 HTYPE = constants.HTYPE_INSTANCE
10509 def ExpandNames(self):
10510 self._ExpandAndLockInstance()
10511 self.needed_locks[locking.LEVEL_NODE] = []
10512 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10514 def DeclareLocks(self, level):
10515 if level == locking.LEVEL_NODE:
10516 self._LockInstancesNodes()
10518 def BuildHooksEnv(self):
10519 """Build hooks env.
10521 This runs on the master, the primary and all the secondaries.
10525 "DISK": self.op.disk,
10526 "AMOUNT": self.op.amount,
10528 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10531 def BuildHooksNodes(self):
10532 """Build hooks nodes.
10535 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10538 def CheckPrereq(self):
10539 """Check prerequisites.
10541 This checks that the instance is in the cluster.
10544 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10545 assert instance is not None, \
10546 "Cannot retrieve locked instance %s" % self.op.instance_name
10547 nodenames = list(instance.all_nodes)
10548 for node in nodenames:
10549 _CheckNodeOnline(self, node)
10551 self.instance = instance
10553 if instance.disk_template not in constants.DTS_GROWABLE:
10554 raise errors.OpPrereqError("Instance's disk layout does not support"
10555 " growing", errors.ECODE_INVAL)
10557 self.disk = instance.FindDisk(self.op.disk)
10559 if instance.disk_template not in (constants.DT_FILE,
10560 constants.DT_SHARED_FILE):
10561 # TODO: check the free disk space for file, when that feature will be
10563 _CheckNodesFreeDiskPerVG(self, nodenames,
10564 self.disk.ComputeGrowth(self.op.amount))
10566 def Exec(self, feedback_fn):
10567 """Execute disk grow.
10570 instance = self.instance
10573 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10575 raise errors.OpExecError("Cannot activate block device to grow")
10577 # First run all grow ops in dry-run mode
10578 for node in instance.all_nodes:
10579 self.cfg.SetDiskID(disk, node)
10580 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10581 result.Raise("Grow request failed to node %s" % node)
10583 # We know that (as far as we can test) operations across different
10584 # nodes will succeed, time to run it for real
10585 for node in instance.all_nodes:
10586 self.cfg.SetDiskID(disk, node)
10587 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10588 result.Raise("Grow request failed to node %s" % node)
10590 # TODO: Rewrite code to work properly
10591 # DRBD goes into sync mode for a short amount of time after executing the
10592 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10593 # calling "resize" in sync mode fails. Sleeping for a short amount of
10594 # time is a work-around.
10597 disk.RecordGrow(self.op.amount)
10598 self.cfg.Update(instance, feedback_fn)
10599 if self.op.wait_for_sync:
10600 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10602 self.proc.LogWarning("Disk sync-ing has not returned a good"
10603 " status; please check the instance")
10604 if not instance.admin_up:
10605 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10606 elif not instance.admin_up:
10607 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10608 " not supposed to be running because no wait for"
10609 " sync mode was requested")
10612 class LUInstanceQueryData(NoHooksLU):
10613 """Query runtime instance data.
10618 def ExpandNames(self):
10619 self.needed_locks = {}
10621 # Use locking if requested or when non-static information is wanted
10622 if not (self.op.static or self.op.use_locking):
10623 self.LogWarning("Non-static data requested, locks need to be acquired")
10624 self.op.use_locking = True
10626 if self.op.instances or not self.op.use_locking:
10627 # Expand instance names right here
10628 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10630 # Will use acquired locks
10631 self.wanted_names = None
10633 if self.op.use_locking:
10634 self.share_locks = _ShareAll()
10636 if self.wanted_names is None:
10637 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10639 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10641 self.needed_locks[locking.LEVEL_NODE] = []
10642 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10644 def DeclareLocks(self, level):
10645 if self.op.use_locking and level == locking.LEVEL_NODE:
10646 self._LockInstancesNodes()
10648 def CheckPrereq(self):
10649 """Check prerequisites.
10651 This only checks the optional instance list against the existing names.
10654 if self.wanted_names is None:
10655 assert self.op.use_locking, "Locking was not used"
10656 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
10658 self.wanted_instances = \
10659 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
10661 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10662 """Returns the status of a block device
10665 if self.op.static or not node:
10668 self.cfg.SetDiskID(dev, node)
10670 result = self.rpc.call_blockdev_find(node, dev)
10674 result.Raise("Can't compute disk status for %s" % instance_name)
10676 status = result.payload
10680 return (status.dev_path, status.major, status.minor,
10681 status.sync_percent, status.estimated_time,
10682 status.is_degraded, status.ldisk_status)
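# The tuple returned above is, for example (values hypothetical):
#   ("/dev/drbd0", 147, 0, 99.5, 3, False, None)
# i.e. (dev_path, major, minor, sync_percent, estimated_time, is_degraded,
# ldisk_status); 147 is merely the usual DRBD device major number.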
10684 def _ComputeDiskStatus(self, instance, snode, dev):
10685 """Compute block device status.
10688 if dev.dev_type in constants.LDS_DRBD:
10689 # we change the snode then (otherwise we use the one passed in)
10690 if dev.logical_id[0] == instance.primary_node:
10691 snode = dev.logical_id[1]
10693 snode = dev.logical_id[0]
10695 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10696 instance.name, dev)
10697 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10700 dev_children = map(compat.partial(self._ComputeDiskStatus,
10707 "iv_name": dev.iv_name,
10708 "dev_type": dev.dev_type,
10709 "logical_id": dev.logical_id,
10710 "physical_id": dev.physical_id,
10711 "pstatus": dev_pstatus,
10712 "sstatus": dev_sstatus,
10713 "children": dev_children,
10718 def Exec(self, feedback_fn):
10719 """Gather and return data"""
10722 cluster = self.cfg.GetClusterInfo()
10724 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
10725 for i in self.wanted_instances)
10726 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
10727 if self.op.static or pnode.offline:
10728 remote_state = None
10730 self.LogWarning("Primary node %s is marked offline, returning static"
10731 " information only for instance %s" %
10732 (pnode.name, instance.name))
10734 remote_info = self.rpc.call_instance_info(instance.primary_node,
10736 instance.hypervisor)
10737 remote_info.Raise("Error checking node %s" % instance.primary_node)
10738 remote_info = remote_info.payload
10739 if remote_info and "state" in remote_info:
10740 remote_state = "up"
10742 remote_state = "down"
10744 if instance.admin_up:
10745 config_state = "up"
10747 config_state = "down"
10749 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10752 result[instance.name] = {
10753 "name": instance.name,
10754 "config_state": config_state,
10755 "run_state": remote_state,
10756 "pnode": instance.primary_node,
10757 "snodes": instance.secondary_nodes,
10759 # this happens to be the same format used for hooks
10760 "nics": _NICListToTuple(self, instance.nics),
10761 "disk_template": instance.disk_template,
10763 "hypervisor": instance.hypervisor,
10764 "network_port": instance.network_port,
10765 "hv_instance": instance.hvparams,
10766 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10767 "be_instance": instance.beparams,
10768 "be_actual": cluster.FillBE(instance),
10769 "os_instance": instance.osparams,
10770 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10771 "serial_no": instance.serial_no,
10772 "mtime": instance.mtime,
10773 "ctime": instance.ctime,
10774 "uuid": instance.uuid,
10780 class LUInstanceSetParams(LogicalUnit):
10781 """Modifies an instances's parameters.
10784 HPATH = "instance-modify"
10785 HTYPE = constants.HTYPE_INSTANCE
10788 def CheckArguments(self):
10789 if not (self.op.nics or self.op.disks or self.op.disk_template or
10790 self.op.hvparams or self.op.beparams or self.op.os_name):
10791 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10793 if self.op.hvparams:
10794 _CheckGlobalHvParams(self.op.hvparams)
10798 for disk_op, disk_dict in self.op.disks:
10799 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10800 if disk_op == constants.DDM_REMOVE:
10801 disk_addremove += 1
10803 elif disk_op == constants.DDM_ADD:
10804 disk_addremove += 1
10806 if not isinstance(disk_op, int):
10807 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10808 if not isinstance(disk_dict, dict):
10809 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10810 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10812 if disk_op == constants.DDM_ADD:
10813 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10814 if mode not in constants.DISK_ACCESS_SET:
10815 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10816 errors.ECODE_INVAL)
10817 size = disk_dict.get(constants.IDISK_SIZE, None)
10819 raise errors.OpPrereqError("Required disk parameter size missing",
10820 errors.ECODE_INVAL)
10823 except (TypeError, ValueError), err:
10824 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10825 str(err), errors.ECODE_INVAL)
10826 disk_dict[constants.IDISK_SIZE] = size
10828 # modification of disk
10829 if constants.IDISK_SIZE in disk_dict:
10830 raise errors.OpPrereqError("Disk size change not possible, use"
10831 " grow-disk", errors.ECODE_INVAL)
10833 if disk_addremove > 1:
10834 raise errors.OpPrereqError("Only one disk add or remove operation"
10835 " supported at a time", errors.ECODE_INVAL)
10837 if self.op.disks and self.op.disk_template is not None:
10838 raise errors.OpPrereqError("Disk template conversion and other disk"
10839 " changes not supported at the same time",
10840 errors.ECODE_INVAL)
10842 if (self.op.disk_template and
10843 self.op.disk_template in constants.DTS_INT_MIRROR and
10844 self.op.remote_node is None):
10845 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10846 " one requires specifying a secondary node",
10847 errors.ECODE_INVAL)
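# As a reminder (hypothetical values), self.op.disks is a list of
# (operation, parameters) pairs, e.g.
#   [(constants.DDM_ADD, {constants.IDISK_SIZE: 1024,
#                         constants.IDISK_MODE: constants.DISK_RDWR})]
# or, for modifying an existing disk, an integer index instead of DDM_*.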
10851 for nic_op, nic_dict in self.op.nics:
10852 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10853 if nic_op == constants.DDM_REMOVE:
10856 elif nic_op == constants.DDM_ADD:
10859 if not isinstance(nic_op, int):
10860 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10861 if not isinstance(nic_dict, dict):
10862 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10863 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10865 # nic_dict should be a dict
10866 nic_ip = nic_dict.get(constants.INIC_IP, None)
10867 if nic_ip is not None:
10868 if nic_ip.lower() == constants.VALUE_NONE:
10869 nic_dict[constants.INIC_IP] = None
10871 if not netutils.IPAddress.IsValid(nic_ip):
10872 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10873 errors.ECODE_INVAL)
10875 nic_bridge = nic_dict.get("bridge", None)
10876 nic_link = nic_dict.get(constants.INIC_LINK, None)
10877 if nic_bridge and nic_link:
10878 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10879 " at the same time", errors.ECODE_INVAL)
10880 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10881 nic_dict["bridge"] = None
10882 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10883 nic_dict[constants.INIC_LINK] = None
10885 if nic_op == constants.DDM_ADD:
10886 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10887 if nic_mac is None:
10888 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10890 if constants.INIC_MAC in nic_dict:
10891 nic_mac = nic_dict[constants.INIC_MAC]
10892 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10893 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10895 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10896 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10897 " modifying an existing nic",
10898 errors.ECODE_INVAL)
10900 if nic_addremove > 1:
10901 raise errors.OpPrereqError("Only one NIC add or remove operation"
10902 " supported at a time", errors.ECODE_INVAL)
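# Likewise (hypothetical values), self.op.nics is a list of
# (operation, parameters) pairs, e.g.
#   [(constants.DDM_ADD, {constants.INIC_MAC: constants.VALUE_AUTO}),
#    (0, {constants.INIC_LINK: "br0"})]
# where an integer index selects an existing NIC for modification.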
10904 def ExpandNames(self):
10905 self._ExpandAndLockInstance()
10906 self.needed_locks[locking.LEVEL_NODE] = []
10907 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10909 def DeclareLocks(self, level):
10910 if level == locking.LEVEL_NODE:
10911 self._LockInstancesNodes()
10912 if self.op.disk_template and self.op.remote_node:
10913 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10914 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10916 def BuildHooksEnv(self):
10917 """Build hooks env.
10919 This runs on the master, primary and secondaries.
10923 if constants.BE_MEMORY in self.be_new:
10924 args["memory"] = self.be_new[constants.BE_MEMORY]
10925 if constants.BE_VCPUS in self.be_new:
10926 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10927 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10928 # information at all.
10931 nic_override = dict(self.op.nics)
10932 for idx, nic in enumerate(self.instance.nics):
10933 if idx in nic_override:
10934 this_nic_override = nic_override[idx]
10936 this_nic_override = {}
10937 if constants.INIC_IP in this_nic_override:
10938 ip = this_nic_override[constants.INIC_IP]
10941 if constants.INIC_MAC in this_nic_override:
10942 mac = this_nic_override[constants.INIC_MAC]
10945 if idx in self.nic_pnew:
10946 nicparams = self.nic_pnew[idx]
10948 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10949 mode = nicparams[constants.NIC_MODE]
10950 link = nicparams[constants.NIC_LINK]
10951 args["nics"].append((ip, mac, mode, link))
10952 if constants.DDM_ADD in nic_override:
10953 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10954 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
10955 nicparams = self.nic_pnew[constants.DDM_ADD]
10956 mode = nicparams[constants.NIC_MODE]
10957 link = nicparams[constants.NIC_LINK]
10958 args["nics"].append((ip, mac, mode, link))
10959 elif constants.DDM_REMOVE in nic_override:
10960 del args["nics"][-1]
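# Each entry appended above is an (ip, mac, mode, link) tuple, for example
# (values hypothetical):
#   ("198.51.100.10", "aa:00:00:12:34:56", "bridged", "xen-br0")
# hooks receive these via the environment built below.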
10962 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10963 if self.op.disk_template:
10964 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10968 def BuildHooksNodes(self):
10969 """Build hooks nodes.
10972 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10975 def CheckPrereq(self):
10976 """Check prerequisites.
10978 This only checks the instance list against the existing names.
10981 # checking the new params on the primary/secondary nodes
10983 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10984 cluster = self.cluster = self.cfg.GetClusterInfo()
10985 assert self.instance is not None, \
10986 "Cannot retrieve locked instance %s" % self.op.instance_name
10987 pnode = instance.primary_node
10988 nodelist = list(instance.all_nodes)
10991 if self.op.os_name and not self.op.force:
10992 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10993 self.op.force_variant)
10994 instance_os = self.op.os_name
10996 instance_os = instance.os
10998 if self.op.disk_template:
10999 if instance.disk_template == self.op.disk_template:
11000 raise errors.OpPrereqError("Instance already has disk template %s" %
11001 instance.disk_template, errors.ECODE_INVAL)
11003 if (instance.disk_template,
11004 self.op.disk_template) not in self._DISK_CONVERSIONS:
11005 raise errors.OpPrereqError("Unsupported disk template conversion from"
11006 " %s to %s" % (instance.disk_template,
11007 self.op.disk_template),
11008 errors.ECODE_INVAL)
11009 _CheckInstanceDown(self, instance, "cannot change disk template")
11010 if self.op.disk_template in constants.DTS_INT_MIRROR:
11011 if self.op.remote_node == pnode:
11012 raise errors.OpPrereqError("Given new secondary node %s is the same"
11013 " as the primary node of the instance" %
11014 self.op.remote_node, errors.ECODE_STATE)
11015 _CheckNodeOnline(self, self.op.remote_node)
11016 _CheckNodeNotDrained(self, self.op.remote_node)
11017 # FIXME: here we assume that the old instance type is DT_PLAIN
11018 assert instance.disk_template == constants.DT_PLAIN
11019 disks = [{constants.IDISK_SIZE: d.size,
11020 constants.IDISK_VG: d.logical_id[0]}
11021 for d in instance.disks]
11022 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
11023 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
11025 # hvparams processing
11026 if self.op.hvparams:
11027 hv_type = instance.hypervisor
11028 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
11029 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
11030 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
11033 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
11034 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
11035 self.hv_proposed = self.hv_new = hv_new # the new actual values
11036 self.hv_inst = i_hvdict # the new dict (without defaults)
11038 self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
11040 self.hv_new = self.hv_inst = {}
11042 # beparams processing
11043 if self.op.beparams:
11044 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
11046 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11047 be_new = cluster.SimpleFillBE(i_bedict)
11048 self.be_proposed = self.be_new = be_new # the new actual values
11049 self.be_inst = i_bedict # the new dict (without defaults)
11051 self.be_new = self.be_inst = {}
11052 self.be_proposed = cluster.SimpleFillBE(instance.beparams)
11053 be_old = cluster.FillBE(instance)
11055 # CPU param validation -- checking every time a parameter is
11056 # changed to cover all cases where either CPU mask or vcpus have
11058 if (constants.BE_VCPUS in self.be_proposed and
11059 constants.HV_CPU_MASK in self.hv_proposed):
11061 utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
11062 # Verify mask is consistent with number of vCPUs. Can skip this
11063 # test if only 1 entry in the CPU mask, which means same mask
11064 # is applied to all vCPUs.
11065 if (len(cpu_list) > 1 and
11066 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11067 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11069 (self.be_proposed[constants.BE_VCPUS],
11070 self.hv_proposed[constants.HV_CPU_MASK]),
11071 errors.ECODE_INVAL)
11073 # Only perform this test if a new CPU mask is given
11074 if constants.HV_CPU_MASK in self.hv_new:
11075 # Calculate the largest CPU number requested
11076 max_requested_cpu = max(map(max, cpu_list))
11077 # Check that all of the instance's nodes have enough physical CPUs to
11078 # satisfy the requested CPU mask
11079 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11080 max_requested_cpu + 1, instance.hypervisor)
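    # Worked example (illustrative values): a proposed CPU mask of "0-1:2:3"
    # parsed by utils.ParseMultiCpuMask yields three per-vCPU entries, so
    # BE_VCPUS must be 3; the highest CPU number requested is 3, so every
    # node of the instance must have at least 4 physical CPUs for the mask
    # to be satisfiable.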
11082 # osparams processing
11083 if self.op.osparams:
11084 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11085 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11086 self.os_inst = i_osdict # the new dict (without defaults)
11092 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
11093 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
11094 mem_check_list = [pnode]
11095 if be_new[constants.BE_AUTO_BALANCE]:
11096       # either we changed auto_balance to yes or it was already enabled before
11097 mem_check_list.extend(instance.secondary_nodes)
11098 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11099 instance.hypervisor)
11100 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11101 instance.hypervisor)
11102 pninfo = nodeinfo[pnode]
11103 msg = pninfo.fail_msg
11105 # Assume the primary node is unreachable and go ahead
11106 self.warn.append("Can't get info from primary node %s: %s" %
11108 elif not isinstance(pninfo.payload.get("memory_free", None), int):
11109 self.warn.append("Node data from primary node %s doesn't contain"
11110 " free memory information" % pnode)
11111 elif instance_info.fail_msg:
11112 self.warn.append("Can't get instance runtime information: %s" %
11113 instance_info.fail_msg)
11115 if instance_info.payload:
11116 current_mem = int(instance_info.payload["memory"])
11118 # Assume instance not running
11119 # (there is a slight race condition here, but it's not very probable,
11120 # and we have no other way to check)
11122 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
11123 pninfo.payload["memory_free"])
11125 raise errors.OpPrereqError("This change will prevent the instance"
11126 " from starting, due to %d MB of memory"
11127 " missing on its primary node" % miss_mem,
11128 errors.ECODE_NORES)
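      # Worked example (illustrative numbers): raising BE_MEMORY from 1024 to
      # 4096 MiB while the instance currently uses 1024 MiB and the primary
      # node reports 2048 MiB free gives
      #   miss_mem = 4096 - 1024 - 2048 = 1024 MiB,
      # so the change is rejected with ECODE_NORES.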
11130 if be_new[constants.BE_AUTO_BALANCE]:
11131 for node, nres in nodeinfo.items():
11132 if node not in instance.secondary_nodes:
11134 nres.Raise("Can't get info from secondary node %s" % node,
11135 prereq=True, ecode=errors.ECODE_STATE)
11136 if not isinstance(nres.payload.get("memory_free", None), int):
11137 raise errors.OpPrereqError("Secondary node %s didn't return free"
11138 " memory information" % node,
11139 errors.ECODE_STATE)
11140 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11141 raise errors.OpPrereqError("This change will prevent the instance"
11142 " from failover to its secondary node"
11143 " %s, due to not enough memory" % node,
11144 errors.ECODE_STATE)
11148 self.nic_pinst = {}
11149 for nic_op, nic_dict in self.op.nics:
11150 if nic_op == constants.DDM_REMOVE:
11151 if not instance.nics:
11152 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11153 errors.ECODE_INVAL)
11155 if nic_op != constants.DDM_ADD:
11157 if not instance.nics:
11158 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11159 " no NICs" % nic_op,
11160 errors.ECODE_INVAL)
11161 if nic_op < 0 or nic_op >= len(instance.nics):
11162 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11164 (nic_op, len(instance.nics) - 1),
11165 errors.ECODE_INVAL)
11166 old_nic_params = instance.nics[nic_op].nicparams
11167 old_nic_ip = instance.nics[nic_op].ip
11169 old_nic_params = {}
11172 update_params_dict = dict([(key, nic_dict[key])
11173 for key in constants.NICS_PARAMETERS
11174 if key in nic_dict])
11176 if "bridge" in nic_dict:
11177 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
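        # Compatibility note (illustrative): a legacy request such as
        # {"bridge": "br0"} is folded into the nicparams as
        # {constants.NIC_LINK: "br0"} before the parameters are validated below.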
11179 new_nic_params = _GetUpdatedParams(old_nic_params,
11180 update_params_dict)
11181 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11182 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11183 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11184 self.nic_pinst[nic_op] = new_nic_params
11185 self.nic_pnew[nic_op] = new_filled_nic_params
11186 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11188 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11189 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11190 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11192 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11194 self.warn.append(msg)
11196 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11197 if new_nic_mode == constants.NIC_MODE_ROUTED:
11198 if constants.INIC_IP in nic_dict:
11199 nic_ip = nic_dict[constants.INIC_IP]
11201 nic_ip = old_nic_ip
11203 raise errors.OpPrereqError("Cannot set the nic ip to None"
11204 " on a routed nic", errors.ECODE_INVAL)
11205 if constants.INIC_MAC in nic_dict:
11206 nic_mac = nic_dict[constants.INIC_MAC]
11207 if nic_mac is None:
11208 raise errors.OpPrereqError("Cannot set the nic mac to None",
11209 errors.ECODE_INVAL)
11210 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11211 # otherwise generate the mac
11212 nic_dict[constants.INIC_MAC] = \
11213 self.cfg.GenerateMAC(self.proc.GetECId())
11215 # or validate/reserve the current one
11217 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11218 except errors.ReservationError:
11219 raise errors.OpPrereqError("MAC address %s already in use"
11220 " in cluster" % nic_mac,
11221 errors.ECODE_NOTUNIQUE)
11224 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11225 raise errors.OpPrereqError("Disk operations not supported for"
11226 " diskless instances",
11227 errors.ECODE_INVAL)
11228 for disk_op, _ in self.op.disks:
11229 if disk_op == constants.DDM_REMOVE:
11230 if len(instance.disks) == 1:
11231 raise errors.OpPrereqError("Cannot remove the last disk of"
11232 " an instance", errors.ECODE_INVAL)
11233 _CheckInstanceDown(self, instance, "cannot remove disks")
11235 if (disk_op == constants.DDM_ADD and
11236 len(instance.disks) >= constants.MAX_DISKS):
11237 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11238 " add more" % constants.MAX_DISKS,
11239 errors.ECODE_STATE)
11240 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11242 if disk_op < 0 or disk_op >= len(instance.disks):
11243 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11245 (disk_op, len(instance.disks)),
11246 errors.ECODE_INVAL)
11250 def _ConvertPlainToDrbd(self, feedback_fn):
11251 """Converts an instance from plain to drbd.
11254 feedback_fn("Converting template to drbd")
11255 instance = self.instance
11256 pnode = instance.primary_node
11257 snode = self.op.remote_node
11259 # create a fake disk info for _GenerateDiskTemplate
11260 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11261 constants.IDISK_VG: d.logical_id[0]}
11262 for d in instance.disks]
11263 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11264 instance.name, pnode, [snode],
11265 disk_info, None, None, 0, feedback_fn)
11266 info = _GetInstanceInfoText(instance)
11267     feedback_fn("Creating additional volumes...")
11268 # first, create the missing data and meta devices
11269 for disk in new_disks:
11270 # unfortunately this is... not too nice
11271 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11273 for child in disk.children:
11274 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11275     # at this stage, all new LVs have been created, we can rename the old ones
11277 feedback_fn("Renaming original volumes...")
11278 rename_list = [(o, n.children[0].logical_id)
11279 for (o, n) in zip(instance.disks, new_disks)]
11280 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11281 result.Raise("Failed to rename original LVs")
11283 feedback_fn("Initializing DRBD devices...")
11284 # all child devices are in place, we can now create the DRBD devices
11285 for disk in new_disks:
11286 for node in [pnode, snode]:
11287 f_create = node == pnode
11288 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11290 # at this point, the instance has been modified
11291 instance.disk_template = constants.DT_DRBD8
11292 instance.disks = new_disks
11293 self.cfg.Update(instance, feedback_fn)
11295 # disks are created, waiting for sync
11296 disk_abort = not _WaitForSync(self, instance,
11297 oneshot=not self.op.wait_for_sync)
11299 raise errors.OpExecError("There are some degraded disks for"
11300 " this instance, please cleanup manually")
11302 def _ConvertDrbdToPlain(self, feedback_fn):
11303 """Converts an instance from drbd to plain.
11306 instance = self.instance
11307 assert len(instance.secondary_nodes) == 1
11308 pnode = instance.primary_node
11309 snode = instance.secondary_nodes[0]
11310 feedback_fn("Converting template to plain")
11312 old_disks = instance.disks
11313 new_disks = [d.children[0] for d in old_disks]
11315 # copy over size and mode
11316 for parent, child in zip(old_disks, new_disks):
11317 child.size = parent.size
11318 child.mode = parent.mode
11320 # update instance structure
11321 instance.disks = new_disks
11322 instance.disk_template = constants.DT_PLAIN
11323 self.cfg.Update(instance, feedback_fn)
11325 feedback_fn("Removing volumes on the secondary node...")
11326 for disk in old_disks:
11327 self.cfg.SetDiskID(disk, snode)
11328 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11330 self.LogWarning("Could not remove block device %s on node %s,"
11331 " continuing anyway: %s", disk.iv_name, snode, msg)
11333 feedback_fn("Removing unneeded volumes on the primary node...")
11334 for idx, disk in enumerate(old_disks):
11335 meta = disk.children[1]
11336 self.cfg.SetDiskID(meta, pnode)
11337 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11339 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11340 " continuing anyway: %s", idx, pnode, msg)
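  # Note on the disk tree assumed above (descriptive only): for a DRBD8 disk,
  # disk.children[0] is the data LV and disk.children[1] is the metadata LV;
  # the conversion keeps the data LV on the primary node, removes the old
  # volumes on the secondary node, and then removes the now-unneeded metadata
  # LV on the primary.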
11342 def Exec(self, feedback_fn):
11343 """Modifies an instance.
11345 All parameters take effect only at the next restart of the instance.
11348     # Process the warnings from CheckPrereq here, as we don't have a
11349 # feedback_fn there.
11350 for warn in self.warn:
11351 feedback_fn("WARNING: %s" % warn)
11354 instance = self.instance
11356 for disk_op, disk_dict in self.op.disks:
11357 if disk_op == constants.DDM_REMOVE:
11358 # remove the last disk
11359 device = instance.disks.pop()
11360 device_idx = len(instance.disks)
11361 for node, disk in device.ComputeNodeTree(instance.primary_node):
11362 self.cfg.SetDiskID(disk, node)
11363 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11365 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11366 " continuing anyway", device_idx, node, msg)
11367 result.append(("disk/%d" % device_idx, "remove"))
11368 elif disk_op == constants.DDM_ADD:
11370 if instance.disk_template in (constants.DT_FILE,
11371 constants.DT_SHARED_FILE):
11372 file_driver, file_path = instance.disks[0].logical_id
11373 file_path = os.path.dirname(file_path)
11375 file_driver = file_path = None
11376 disk_idx_base = len(instance.disks)
11377 new_disk = _GenerateDiskTemplate(self,
11378 instance.disk_template,
11379 instance.name, instance.primary_node,
11380 instance.secondary_nodes,
11384 disk_idx_base, feedback_fn)[0]
11385 instance.disks.append(new_disk)
11386 info = _GetInstanceInfoText(instance)
11388 logging.info("Creating volume %s for instance %s",
11389 new_disk.iv_name, instance.name)
11390 # Note: this needs to be kept in sync with _CreateDisks
11392 for node in instance.all_nodes:
11393 f_create = node == instance.primary_node
11395 _CreateBlockDev(self, node, instance, new_disk,
11396 f_create, info, f_create)
11397 except errors.OpExecError, err:
11398 self.LogWarning("Failed to create volume %s (%s) on"
11400 new_disk.iv_name, new_disk, node, err)
11401 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11402 (new_disk.size, new_disk.mode)))
11404 # change a given disk
11405 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11406 result.append(("disk.mode/%d" % disk_op,
11407 disk_dict[constants.IDISK_MODE]))
11409 if self.op.disk_template:
11410 r_shut = _ShutdownInstanceDisks(self, instance)
11412 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11413 " proceed with disk template conversion")
11414 mode = (instance.disk_template, self.op.disk_template)
11416 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11418 self.cfg.ReleaseDRBDMinors(instance.name)
11420 result.append(("disk_template", self.op.disk_template))
11423 for nic_op, nic_dict in self.op.nics:
11424 if nic_op == constants.DDM_REMOVE:
11425 # remove the last nic
11426 del instance.nics[-1]
11427 result.append(("nic.%d" % len(instance.nics), "remove"))
11428 elif nic_op == constants.DDM_ADD:
11429         # mac and bridge should be set by now
11430 mac = nic_dict[constants.INIC_MAC]
11431 ip = nic_dict.get(constants.INIC_IP, None)
11432 nicparams = self.nic_pinst[constants.DDM_ADD]
11433 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11434 instance.nics.append(new_nic)
11435 result.append(("nic.%d" % (len(instance.nics) - 1),
11436 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11437 (new_nic.mac, new_nic.ip,
11438 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11439 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11442 for key in (constants.INIC_MAC, constants.INIC_IP):
11443 if key in nic_dict:
11444 setattr(instance.nics[nic_op], key, nic_dict[key])
11445 if nic_op in self.nic_pinst:
11446 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11447 for key, val in nic_dict.iteritems():
11448 result.append(("nic.%s/%d" % (key, nic_op), val))
11451 if self.op.hvparams:
11452 instance.hvparams = self.hv_inst
11453 for key, val in self.op.hvparams.iteritems():
11454 result.append(("hv/%s" % key, val))
11457 if self.op.beparams:
11458 instance.beparams = self.be_inst
11459 for key, val in self.op.beparams.iteritems():
11460 result.append(("be/%s" % key, val))
11463 if self.op.os_name:
11464 instance.os = self.op.os_name
11467 if self.op.osparams:
11468 instance.osparams = self.os_inst
11469 for key, val in self.op.osparams.iteritems():
11470 result.append(("os/%s" % key, val))
11472 self.cfg.Update(instance, feedback_fn)
11476 _DISK_CONVERSIONS = {
11477 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11478 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
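# Example (illustrative only; exact opcode fields are defined in the opcodes
# module): a disk template conversion is requested via OpInstanceSetParams
# with e.g. disk_template=constants.DT_DRBD8 and remote_node pointing at the
# new secondary; Exec() then dispatches through _DISK_CONVERSIONS using the
# (current template, requested template) pair as the key.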
11482 class LUInstanceChangeGroup(LogicalUnit):
11483 HPATH = "instance-change-group"
11484 HTYPE = constants.HTYPE_INSTANCE
11487 def ExpandNames(self):
11488 self.share_locks = _ShareAll()
11489 self.needed_locks = {
11490 locking.LEVEL_NODEGROUP: [],
11491 locking.LEVEL_NODE: [],
11494 self._ExpandAndLockInstance()
11496 if self.op.target_groups:
11497 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11498 self.op.target_groups)
11500 self.req_target_uuids = None
11502 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11504 def DeclareLocks(self, level):
11505 if level == locking.LEVEL_NODEGROUP:
11506 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11508 if self.req_target_uuids:
11509 lock_groups = set(self.req_target_uuids)
11511 # Lock all groups used by instance optimistically; this requires going
11512 # via the node before it's locked, requiring verification later on
11513 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11514 lock_groups.update(instance_groups)
11516 # No target groups, need to lock all of them
11517 lock_groups = locking.ALL_SET
11519 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11521 elif level == locking.LEVEL_NODE:
11522 if self.req_target_uuids:
11523 # Lock all nodes used by instances
11524 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11525 self._LockInstancesNodes()
11527 # Lock all nodes in all potential target groups
11528 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11529 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11530 member_nodes = [node_name
11531 for group in lock_groups
11532 for node_name in self.cfg.GetNodeGroup(group).members]
11533 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11535 # Lock all nodes as all groups are potential targets
11536 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11538 def CheckPrereq(self):
11539 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11540 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11541 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11543 assert (self.req_target_uuids is None or
11544 owned_groups.issuperset(self.req_target_uuids))
11545 assert owned_instances == set([self.op.instance_name])
11547 # Get instance information
11548 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11550 # Check if node groups for locked instance are still correct
11551 assert owned_nodes.issuperset(self.instance.all_nodes), \
11552 ("Instance %s's nodes changed while we kept the lock" %
11553 self.op.instance_name)
11555 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11558 if self.req_target_uuids:
11559 # User requested specific target groups
11560 self.target_uuids = self.req_target_uuids
11562 # All groups except those used by the instance are potential targets
11563 self.target_uuids = owned_groups - inst_groups
11565 conflicting_groups = self.target_uuids & inst_groups
11566 if conflicting_groups:
11567 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11568 " used by the instance '%s'" %
11569 (utils.CommaJoin(conflicting_groups),
11570 self.op.instance_name),
11571 errors.ECODE_INVAL)
11573 if not self.target_uuids:
11574 raise errors.OpPrereqError("There are no possible target groups",
11575 errors.ECODE_INVAL)
11577 def BuildHooksEnv(self):
11578 """Build hooks env.
11581 assert self.target_uuids
11584 "TARGET_GROUPS": " ".join(self.target_uuids),
11587 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11591 def BuildHooksNodes(self):
11592 """Build hooks nodes.
11595 mn = self.cfg.GetMasterNode()
11596 return ([mn], [mn])
11598 def Exec(self, feedback_fn):
11599 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11601 assert instances == [self.op.instance_name], "Instance not locked"
11603 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11604 instances=instances, target_groups=list(self.target_uuids))
11606 ial.Run(self.op.iallocator)
11608 if not ial.success:
11609 raise errors.OpPrereqError("Can't compute solution for changing group of"
11610 " instance '%s' using iallocator '%s': %s" %
11611 (self.op.instance_name, self.op.iallocator,
11613 errors.ECODE_NORES)
11615 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
11617 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11618 " instance '%s'", len(jobs), self.op.instance_name)
11620 return ResultWithJobs(jobs)
11623 class LUBackupQuery(NoHooksLU):
11624 """Query the exports list
11629 def ExpandNames(self):
11630 self.needed_locks = {}
11631 self.share_locks[locking.LEVEL_NODE] = 1
11632 if not self.op.nodes:
11633 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11635 self.needed_locks[locking.LEVEL_NODE] = \
11636 _GetWantedNodes(self, self.op.nodes)
11638 def Exec(self, feedback_fn):
11639 """Compute the list of all the exported system images.
11642 @return: a dictionary with the structure node->(export-list)
11643 where export-list is a list of the instances exported on
11647 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11648 rpcresult = self.rpc.call_export_list(self.nodes)
11650 for node in rpcresult:
11651 if rpcresult[node].fail_msg:
11652 result[node] = False
11654 result[node] = rpcresult[node].payload
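    # Illustrative return value (hypothetical names): a node that failed to
    # answer maps to False, otherwise to its list of exports, e.g.
    #   {"node1.example.com": ["inst1.example.com"], "node2.example.com": False}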
11659 class LUBackupPrepare(NoHooksLU):
11660 """Prepares an instance for an export and returns useful information.
11665 def ExpandNames(self):
11666 self._ExpandAndLockInstance()
11668 def CheckPrereq(self):
11669 """Check prerequisites.
11672 instance_name = self.op.instance_name
11674 self.instance = self.cfg.GetInstanceInfo(instance_name)
11675 assert self.instance is not None, \
11676 "Cannot retrieve locked instance %s" % self.op.instance_name
11677 _CheckNodeOnline(self, self.instance.primary_node)
11679 self._cds = _GetClusterDomainSecret()
11681 def Exec(self, feedback_fn):
11682 """Prepares an instance for an export.
11685 instance = self.instance
11687 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11688 salt = utils.GenerateSecret(8)
11690 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11691 result = self.rpc.call_x509_cert_create(instance.primary_node,
11692 constants.RIE_CERT_VALIDITY)
11693 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11695 (name, cert_pem) = result.payload
11697 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11701 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11702 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
11704 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11710 class LUBackupExport(LogicalUnit):
11711 """Export an instance to an image in the cluster.
11714 HPATH = "instance-export"
11715 HTYPE = constants.HTYPE_INSTANCE
11718 def CheckArguments(self):
11719 """Check the arguments.
11722 self.x509_key_name = self.op.x509_key_name
11723 self.dest_x509_ca_pem = self.op.destination_x509_ca
11725 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11726 if not self.x509_key_name:
11727 raise errors.OpPrereqError("Missing X509 key name for encryption",
11728 errors.ECODE_INVAL)
11730 if not self.dest_x509_ca_pem:
11731 raise errors.OpPrereqError("Missing destination X509 CA",
11732 errors.ECODE_INVAL)
11734 def ExpandNames(self):
11735 self._ExpandAndLockInstance()
11737 # Lock all nodes for local exports
11738 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11739 # FIXME: lock only instance primary and destination node
11741       # Sad but true: for now we have to lock all nodes, as we don't know where
11742 # the previous export might be, and in this LU we search for it and
11743 # remove it from its current node. In the future we could fix this by:
11744 # - making a tasklet to search (share-lock all), then create the
11745       #   new one, then one to remove it afterwards
11746 # - removing the removal operation altogether
11747 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11749 def DeclareLocks(self, level):
11750 """Last minute lock declaration."""
11751 # All nodes are locked anyway, so nothing to do here.
11753 def BuildHooksEnv(self):
11754 """Build hooks env.
11756 This will run on the master, primary node and target node.
11760 "EXPORT_MODE": self.op.mode,
11761 "EXPORT_NODE": self.op.target_node,
11762 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11763 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11764 # TODO: Generic function for boolean env variables
11765 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11768 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11772 def BuildHooksNodes(self):
11773 """Build hooks nodes.
11776 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11778 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11779 nl.append(self.op.target_node)
11783 def CheckPrereq(self):
11784 """Check prerequisites.
11786 This checks that the instance and node names are valid.
11789 instance_name = self.op.instance_name
11791 self.instance = self.cfg.GetInstanceInfo(instance_name)
11792 assert self.instance is not None, \
11793 "Cannot retrieve locked instance %s" % self.op.instance_name
11794 _CheckNodeOnline(self, self.instance.primary_node)
11796 if (self.op.remove_instance and self.instance.admin_up and
11797 not self.op.shutdown):
11798 raise errors.OpPrereqError("Can not remove instance without shutting it"
11801 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11802 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11803 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11804 assert self.dst_node is not None
11806 _CheckNodeOnline(self, self.dst_node.name)
11807 _CheckNodeNotDrained(self, self.dst_node.name)
11810 self.dest_disk_info = None
11811 self.dest_x509_ca = None
11813 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11814 self.dst_node = None
11816 if len(self.op.target_node) != len(self.instance.disks):
11817 raise errors.OpPrereqError(("Received destination information for %s"
11818 " disks, but instance %s has %s disks") %
11819 (len(self.op.target_node), instance_name,
11820 len(self.instance.disks)),
11821 errors.ECODE_INVAL)
11823 cds = _GetClusterDomainSecret()
11825 # Check X509 key name
11827 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11828 except (TypeError, ValueError), err:
11829 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11831 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11832 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11833 errors.ECODE_INVAL)
11835 # Load and verify CA
11837 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11838 except OpenSSL.crypto.Error, err:
11839 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11840 (err, ), errors.ECODE_INVAL)
11842 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11843 if errcode is not None:
11844 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11845 (msg, ), errors.ECODE_INVAL)
11847 self.dest_x509_ca = cert
11849 # Verify target information
11851 for idx, disk_data in enumerate(self.op.target_node):
11853 (host, port, magic) = \
11854 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11855 except errors.GenericError, err:
11856 raise errors.OpPrereqError("Target info for disk %s: %s" %
11857 (idx, err), errors.ECODE_INVAL)
11859 disk_info.append((host, port, magic))
11861 assert len(disk_info) == len(self.op.target_node)
11862 self.dest_disk_info = disk_info
11865 raise errors.ProgrammerError("Unhandled export mode %r" %
11868 # instance disk type verification
11869 # TODO: Implement export support for file-based disks
11870 for disk in self.instance.disks:
11871 if disk.dev_type == constants.LD_FILE:
11872 raise errors.OpPrereqError("Export not supported for instances with"
11873 " file-based disks", errors.ECODE_INVAL)
11875 def _CleanupExports(self, feedback_fn):
11876     """Removes exports of the current instance from all other nodes.
11878 If an instance in a cluster with nodes A..D was exported to node C, its
11879 exports will be removed from the nodes A, B and D.
11882 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11884 nodelist = self.cfg.GetNodeList()
11885 nodelist.remove(self.dst_node.name)
11887     # on one-node clusters nodelist will be empty after the removal;
11888     # if we proceeded, the backup would be removed because OpBackupQuery
11889 # substitutes an empty list with the full cluster node list.
11890 iname = self.instance.name
11892 feedback_fn("Removing old exports for instance %s" % iname)
11893 exportlist = self.rpc.call_export_list(nodelist)
11894 for node in exportlist:
11895 if exportlist[node].fail_msg:
11897 if iname in exportlist[node].payload:
11898 msg = self.rpc.call_export_remove(node, iname).fail_msg
11900 self.LogWarning("Could not remove older export for instance %s"
11901 " on node %s: %s", iname, node, msg)
11903 def Exec(self, feedback_fn):
11904 """Export an instance to an image in the cluster.
11907 assert self.op.mode in constants.EXPORT_MODES
11909 instance = self.instance
11910 src_node = instance.primary_node
11912 if self.op.shutdown:
11913 # shutdown the instance, but not the disks
11914 feedback_fn("Shutting down instance %s" % instance.name)
11915 result = self.rpc.call_instance_shutdown(src_node, instance,
11916 self.op.shutdown_timeout)
11917 # TODO: Maybe ignore failures if ignore_remove_failures is set
11918 result.Raise("Could not shutdown instance %s on"
11919 " node %s" % (instance.name, src_node))
11921 # set the disks ID correctly since call_instance_start needs the
11922 # correct drbd minor to create the symlinks
11923 for disk in instance.disks:
11924 self.cfg.SetDiskID(disk, src_node)
11926 activate_disks = (not instance.admin_up)
11929       # Activate the instance disks if we're exporting a stopped instance
11930 feedback_fn("Activating disks for %s" % instance.name)
11931 _StartInstanceDisks(self, instance, None)
11934 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11937 helper.CreateSnapshots()
11939 if (self.op.shutdown and instance.admin_up and
11940 not self.op.remove_instance):
11941 assert not activate_disks
11942 feedback_fn("Starting instance %s" % instance.name)
11943 result = self.rpc.call_instance_start(src_node,
11944 (instance, None, None), False)
11945 msg = result.fail_msg
11947 feedback_fn("Failed to start instance: %s" % msg)
11948 _ShutdownInstanceDisks(self, instance)
11949 raise errors.OpExecError("Could not start instance: %s" % msg)
11951 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11952 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11953 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11954 connect_timeout = constants.RIE_CONNECT_TIMEOUT
11955 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
11957 (key_name, _, _) = self.x509_key_name
11960 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
11963 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
11964 key_name, dest_ca_pem,
11969 # Check for backwards compatibility
11970 assert len(dresults) == len(instance.disks)
11971 assert compat.all(isinstance(i, bool) for i in dresults), \
11972 "Not all results are boolean: %r" % dresults
11976 feedback_fn("Deactivating disks for %s" % instance.name)
11977 _ShutdownInstanceDisks(self, instance)
11979 if not (compat.all(dresults) and fin_resu):
11982 failures.append("export finalization")
11983 if not compat.all(dresults):
11984 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
11986 failures.append("disk export: disk(s) %s" % fdsk)
11988 raise errors.OpExecError("Export failed, errors in %s" %
11989 utils.CommaJoin(failures))
11991 # At this point, the export was successful, we can cleanup/finish
11993 # Remove instance if requested
11994 if self.op.remove_instance:
11995 feedback_fn("Removing instance %s" % instance.name)
11996 _RemoveInstance(self, feedback_fn, instance,
11997 self.op.ignore_remove_failures)
11999 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12000 self._CleanupExports(feedback_fn)
12002 return fin_resu, dresults
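    # Illustrative return value: fin_resu is the overall finalization status
    # and dresults holds one boolean per instance disk, e.g.
    #   (True, [True, True])
    # for a two-disk instance that was exported successfully.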
12005 class LUBackupRemove(NoHooksLU):
12006 """Remove exports related to the named instance.
12011 def ExpandNames(self):
12012 self.needed_locks = {}
12013 # We need all nodes to be locked in order for RemoveExport to work, but we
12014 # don't need to lock the instance itself, as nothing will happen to it (and
12015     # we can also remove exports for an already-removed instance)
12016 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12018 def Exec(self, feedback_fn):
12019 """Remove any export.
12022 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
12023 # If the instance was not found we'll try with the name that was passed in.
12024 # This will only work if it was an FQDN, though.
12026 if not instance_name:
12028 instance_name = self.op.instance_name
12030 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
12031 exportlist = self.rpc.call_export_list(locked_nodes)
12033 for node in exportlist:
12034 msg = exportlist[node].fail_msg
12036 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
12038 if instance_name in exportlist[node].payload:
12040 result = self.rpc.call_export_remove(node, instance_name)
12041 msg = result.fail_msg
12043 logging.error("Could not remove export for instance %s"
12044 " on node %s: %s", instance_name, node, msg)
12046 if fqdn_warn and not found:
12047 feedback_fn("Export not found. If trying to remove an export belonging"
12048 " to a deleted instance please use its Fully Qualified"
12052 class LUGroupAdd(LogicalUnit):
12053 """Logical unit for creating node groups.
12056 HPATH = "group-add"
12057 HTYPE = constants.HTYPE_GROUP
12060 def ExpandNames(self):
12061 # We need the new group's UUID here so that we can create and acquire the
12062 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12063 # that it should not check whether the UUID exists in the configuration.
12064 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12065 self.needed_locks = {}
12066 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12068 def CheckPrereq(self):
12069 """Check prerequisites.
12071     This checks that the given group name is not already an existing node group.
12076 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12077 except errors.OpPrereqError:
12080 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12081 " node group (UUID: %s)" %
12082 (self.op.group_name, existing_uuid),
12083 errors.ECODE_EXISTS)
12085 if self.op.ndparams:
12086 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12088 def BuildHooksEnv(self):
12089 """Build hooks env.
12093 "GROUP_NAME": self.op.group_name,
12096 def BuildHooksNodes(self):
12097 """Build hooks nodes.
12100 mn = self.cfg.GetMasterNode()
12101 return ([mn], [mn])
12103 def Exec(self, feedback_fn):
12104 """Add the node group to the cluster.
12107 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12108 uuid=self.group_uuid,
12109 alloc_policy=self.op.alloc_policy,
12110 ndparams=self.op.ndparams)
12112 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12113 del self.remove_locks[locking.LEVEL_NODEGROUP]
12116 class LUGroupAssignNodes(NoHooksLU):
12117 """Logical unit for assigning nodes to groups.
12122 def ExpandNames(self):
12123 # These raise errors.OpPrereqError on their own:
12124 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12125 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12127 # We want to lock all the affected nodes and groups. We have readily
12128 # available the list of nodes, and the *destination* group. To gather the
12129 # list of "source" groups, we need to fetch node information later on.
12130 self.needed_locks = {
12131 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12132 locking.LEVEL_NODE: self.op.nodes,
12135 def DeclareLocks(self, level):
12136 if level == locking.LEVEL_NODEGROUP:
12137 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12139 # Try to get all affected nodes' groups without having the group or node
12140 # lock yet. Needs verification later in the code flow.
12141 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12143 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12145 def CheckPrereq(self):
12146 """Check prerequisites.
12149 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12150 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12151 frozenset(self.op.nodes))
12153 expected_locks = (set([self.group_uuid]) |
12154 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12155 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12156 if actual_locks != expected_locks:
12157 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12158 " current groups are '%s', used to be '%s'" %
12159 (utils.CommaJoin(expected_locks),
12160 utils.CommaJoin(actual_locks)))
12162 self.node_data = self.cfg.GetAllNodesInfo()
12163 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12164 instance_data = self.cfg.GetAllInstancesInfo()
12166 if self.group is None:
12167 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12168 (self.op.group_name, self.group_uuid))
12170 (new_splits, previous_splits) = \
12171 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12172 for node in self.op.nodes],
12173 self.node_data, instance_data)
12176 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12178 if not self.op.force:
12179 raise errors.OpExecError("The following instances get split by this"
12180 " change and --force was not given: %s" %
12183 self.LogWarning("This operation will split the following instances: %s",
12186 if previous_splits:
12187 self.LogWarning("In addition, these already-split instances continue"
12188 " to be split across groups: %s",
12189 utils.CommaJoin(utils.NiceSort(previous_splits)))
12191 def Exec(self, feedback_fn):
12192 """Assign nodes to a new group.
12195 for node in self.op.nodes:
12196 self.node_data[node].group = self.group_uuid
12198 # FIXME: Depends on side-effects of modifying the result of
12199 # C{cfg.GetAllNodesInfo}
12201 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
12204 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12205 """Check for split instances after a node assignment.
12207 This method considers a series of node assignments as an atomic operation,
12208   and returns information about split instances after applying the set of changes.
12211 In particular, it returns information about newly split instances, and
12212 instances that were already split, and remain so after the change.
12214   Only instances whose disk template is listed in constants.DTS_INT_MIRROR are considered.
12217 @type changes: list of (node_name, new_group_uuid) pairs.
12218 @param changes: list of node assignments to consider.
12219 @param node_data: a dict with data for all nodes
12220 @param instance_data: a dict with all instances to consider
12221 @rtype: a two-tuple
12222   @return: a list of instances that were previously okay and end up split as a
12223 consequence of this change, and a list of instances that were previously
12224 split and this change does not fix.
12227 changed_nodes = dict((node, group) for node, group in changes
12228 if node_data[node].group != group)
12230 all_split_instances = set()
12231 previously_split_instances = set()
12233 def InstanceNodes(instance):
12234 return [instance.primary_node] + list(instance.secondary_nodes)
12236 for inst in instance_data.values():
12237 if inst.disk_template not in constants.DTS_INT_MIRROR:
12240 instance_nodes = InstanceNodes(inst)
12242 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12243 previously_split_instances.add(inst.name)
12245 if len(set(changed_nodes.get(node, node_data[node].group)
12246 for node in instance_nodes)) > 1:
12247 all_split_instances.add(inst.name)
12249 return (list(all_split_instances - previously_split_instances),
12250 list(previously_split_instances & all_split_instances))
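# Worked example (hypothetical data): with node1 and node2 in group "g1" and
# node3 in group "g2", a DRBD instance on (node1, node2) is reported as newly
# split by changes=[("node2", "g2")], while an instance already spanning
# (node1, node3) before the change ends up in the previously-split list.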
12253 class _GroupQuery(_QueryBase):
12254 FIELDS = query.GROUP_FIELDS
12256 def ExpandNames(self, lu):
12257 lu.needed_locks = {}
12259 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12260 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12263 self.wanted = [name_to_uuid[name]
12264 for name in utils.NiceSort(name_to_uuid.keys())]
12266       # Accept the requested names to be either group names or UUIDs.
12269 all_uuid = frozenset(self._all_groups.keys())
12271 for name in self.names:
12272 if name in all_uuid:
12273 self.wanted.append(name)
12274 elif name in name_to_uuid:
12275 self.wanted.append(name_to_uuid[name])
12277 missing.append(name)
12280 raise errors.OpPrereqError("Some groups do not exist: %s" %
12281 utils.CommaJoin(missing),
12282 errors.ECODE_NOENT)
12284 def DeclareLocks(self, lu, level):
12287 def _GetQueryData(self, lu):
12288 """Computes the list of node groups and their attributes.
12291 do_nodes = query.GQ_NODE in self.requested_data
12292 do_instances = query.GQ_INST in self.requested_data
12294 group_to_nodes = None
12295 group_to_instances = None
12297 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12298 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12299 # latter GetAllInstancesInfo() is not enough, for we have to go through
12300 # instance->node. Hence, we will need to process nodes even if we only need
12301 # instance information.
12302 if do_nodes or do_instances:
12303 all_nodes = lu.cfg.GetAllNodesInfo()
12304 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12307 for node in all_nodes.values():
12308 if node.group in group_to_nodes:
12309 group_to_nodes[node.group].append(node.name)
12310 node_to_group[node.name] = node.group
12313 all_instances = lu.cfg.GetAllInstancesInfo()
12314 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12316 for instance in all_instances.values():
12317 node = instance.primary_node
12318 if node in node_to_group:
12319 group_to_instances[node_to_group[node]].append(instance.name)
12322 # Do not pass on node information if it was not requested.
12323 group_to_nodes = None
12325 return query.GroupQueryData([self._all_groups[uuid]
12326 for uuid in self.wanted],
12327 group_to_nodes, group_to_instances)
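    # Illustrative shape of the mappings built above (hypothetical UUIDs and
    # names):
    #   group_to_nodes     = {"uuid-g1": ["node1", "node2"], "uuid-g2": ["node3"]}
    #   group_to_instances = {"uuid-g1": ["inst1"], "uuid-g2": []}
    # Either mapping stays None when the corresponding data was not requested.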
12330 class LUGroupQuery(NoHooksLU):
12331 """Logical unit for querying node groups.
12336 def CheckArguments(self):
12337 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12338 self.op.output_fields, False)
12340 def ExpandNames(self):
12341 self.gq.ExpandNames(self)
12343 def DeclareLocks(self, level):
12344 self.gq.DeclareLocks(self, level)
12346 def Exec(self, feedback_fn):
12347 return self.gq.OldStyleQuery(self)
12350 class LUGroupSetParams(LogicalUnit):
12351 """Modifies the parameters of a node group.
12354 HPATH = "group-modify"
12355 HTYPE = constants.HTYPE_GROUP
12358 def CheckArguments(self):
12361 self.op.alloc_policy,
12364 if all_changes.count(None) == len(all_changes):
12365 raise errors.OpPrereqError("Please pass at least one modification",
12366 errors.ECODE_INVAL)
12368 def ExpandNames(self):
12369 # This raises errors.OpPrereqError on its own:
12370 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12372 self.needed_locks = {
12373 locking.LEVEL_NODEGROUP: [self.group_uuid],
12376 def CheckPrereq(self):
12377 """Check prerequisites.
12380 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12382 if self.group is None:
12383 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12384 (self.op.group_name, self.group_uuid))
12386 if self.op.ndparams:
12387 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12388 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12389 self.new_ndparams = new_ndparams
12391 def BuildHooksEnv(self):
12392 """Build hooks env.
12396 "GROUP_NAME": self.op.group_name,
12397 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12400 def BuildHooksNodes(self):
12401 """Build hooks nodes.
12404 mn = self.cfg.GetMasterNode()
12405 return ([mn], [mn])
12407 def Exec(self, feedback_fn):
12408 """Modifies the node group.
12413 if self.op.ndparams:
12414 self.group.ndparams = self.new_ndparams
12415 result.append(("ndparams", str(self.group.ndparams)))
12417 if self.op.alloc_policy:
12418 self.group.alloc_policy = self.op.alloc_policy
12420 self.cfg.Update(self.group, feedback_fn)
12424 class LUGroupRemove(LogicalUnit):
12425 HPATH = "group-remove"
12426 HTYPE = constants.HTYPE_GROUP
12429 def ExpandNames(self):
12430     # This will raise errors.OpPrereqError on its own:
12431 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12432 self.needed_locks = {
12433 locking.LEVEL_NODEGROUP: [self.group_uuid],
12436 def CheckPrereq(self):
12437 """Check prerequisites.
12439     This checks that the given group name exists as a node group, that it is
12440     empty (i.e., contains no nodes), and that it is not the last group in the cluster.
12444 # Verify that the group is empty.
12445 group_nodes = [node.name
12446 for node in self.cfg.GetAllNodesInfo().values()
12447 if node.group == self.group_uuid]
12450 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12452 (self.op.group_name,
12453 utils.CommaJoin(utils.NiceSort(group_nodes))),
12454 errors.ECODE_STATE)
12456 # Verify the cluster would not be left group-less.
12457 if len(self.cfg.GetNodeGroupList()) == 1:
12458 raise errors.OpPrereqError("Group '%s' is the only group,"
12459 " cannot be removed" %
12460 self.op.group_name,
12461 errors.ECODE_STATE)
12463 def BuildHooksEnv(self):
12464 """Build hooks env.
12468 "GROUP_NAME": self.op.group_name,
12471 def BuildHooksNodes(self):
12472 """Build hooks nodes.
12475 mn = self.cfg.GetMasterNode()
12476 return ([mn], [mn])
12478 def Exec(self, feedback_fn):
12479 """Remove the node group.
12483 self.cfg.RemoveNodeGroup(self.group_uuid)
12484 except errors.ConfigurationError:
12485 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12486 (self.op.group_name, self.group_uuid))
12488 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12491 class LUGroupRename(LogicalUnit):
12492 HPATH = "group-rename"
12493 HTYPE = constants.HTYPE_GROUP
12496 def ExpandNames(self):
12497 # This raises errors.OpPrereqError on its own:
12498 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12500 self.needed_locks = {
12501 locking.LEVEL_NODEGROUP: [self.group_uuid],
12504 def CheckPrereq(self):
12505 """Check prerequisites.
12507     Ensures the requested new name is not already in use.
12511 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12512 except errors.OpPrereqError:
12515 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12516 " node group (UUID: %s)" %
12517 (self.op.new_name, new_name_uuid),
12518 errors.ECODE_EXISTS)
12520 def BuildHooksEnv(self):
12521 """Build hooks env.
12525 "OLD_NAME": self.op.group_name,
12526 "NEW_NAME": self.op.new_name,
12529 def BuildHooksNodes(self):
12530 """Build hooks nodes.
12533 mn = self.cfg.GetMasterNode()
12535 all_nodes = self.cfg.GetAllNodesInfo()
12536 all_nodes.pop(mn, None)
12539 run_nodes.extend(node.name for node in all_nodes.values()
12540 if node.group == self.group_uuid)
12542 return (run_nodes, run_nodes)
12544 def Exec(self, feedback_fn):
12545 """Rename the node group.
12548 group = self.cfg.GetNodeGroup(self.group_uuid)
12551 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12552 (self.op.group_name, self.group_uuid))
12554 group.name = self.op.new_name
12555 self.cfg.Update(group, feedback_fn)
12557 return self.op.new_name
12560 class LUGroupEvacuate(LogicalUnit):
12561 HPATH = "group-evacuate"
12562 HTYPE = constants.HTYPE_GROUP
12565 def ExpandNames(self):
12566 # This raises errors.OpPrereqError on its own:
12567 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12569 if self.op.target_groups:
12570 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12571 self.op.target_groups)
12573 self.req_target_uuids = []
12575 if self.group_uuid in self.req_target_uuids:
12576 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12577 " as a target group (targets are %s)" %
12579 utils.CommaJoin(self.req_target_uuids)),
12580 errors.ECODE_INVAL)
12582 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12584 self.share_locks = _ShareAll()
12585 self.needed_locks = {
12586 locking.LEVEL_INSTANCE: [],
12587 locking.LEVEL_NODEGROUP: [],
12588 locking.LEVEL_NODE: [],
12591 def DeclareLocks(self, level):
12592 if level == locking.LEVEL_INSTANCE:
12593 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12595 # Lock instances optimistically, needs verification once node and group
12596 # locks have been acquired
12597 self.needed_locks[locking.LEVEL_INSTANCE] = \
12598 self.cfg.GetNodeGroupInstances(self.group_uuid)
12600 elif level == locking.LEVEL_NODEGROUP:
12601 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12603 if self.req_target_uuids:
12604 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12606 # Lock all groups used by instances optimistically; this requires going
12607 # via the node before it's locked, requiring verification later on
12608 lock_groups.update(group_uuid
12609 for instance_name in
12610 self.owned_locks(locking.LEVEL_INSTANCE)
12612 self.cfg.GetInstanceNodeGroups(instance_name))
12614 # No target groups, need to lock all of them
12615 lock_groups = locking.ALL_SET
12617 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12619 elif level == locking.LEVEL_NODE:
12620 # This will only lock the nodes in the group to be evacuated which
12621 # contain actual instances
12622 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12623 self._LockInstancesNodes()
12625 # Lock all nodes in group to be evacuated and target groups
12626 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12627 assert self.group_uuid in owned_groups
12628 member_nodes = [node_name
12629 for group in owned_groups
12630 for node_name in self.cfg.GetNodeGroup(group).members]
12631 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12633 def CheckPrereq(self):
12634 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12635 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12636 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12638 assert owned_groups.issuperset(self.req_target_uuids)
12639 assert self.group_uuid in owned_groups
12641 # Check if locked instances are still correct
12642 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12644 # Get instance information
12645 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12647 # Check if node groups for locked instances are still correct
12648 for instance_name in owned_instances:
12649 inst = self.instances[instance_name]
12650 assert owned_nodes.issuperset(inst.all_nodes), \
12651 "Instance %s's nodes changed while we kept the lock" % instance_name
12653 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
12656 assert self.group_uuid in inst_groups, \
12657 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
12659 if self.req_target_uuids:
12660 # User requested specific target groups
12661 self.target_uuids = self.req_target_uuids
12663 # All groups except the one to be evacuated are potential targets
12664 self.target_uuids = [group_uuid for group_uuid in owned_groups
12665 if group_uuid != self.group_uuid]
12667 if not self.target_uuids:
12668 raise errors.OpPrereqError("There are no possible target groups",
12669 errors.ECODE_INVAL)
12671 def BuildHooksEnv(self):
12672 """Build hooks env.
12676 "GROUP_NAME": self.op.group_name,
12677 "TARGET_GROUPS": " ".join(self.target_uuids),
12680 def BuildHooksNodes(self):
12681 """Build hooks nodes.
12684 mn = self.cfg.GetMasterNode()
12686 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12688 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12690 return (run_nodes, run_nodes)
12692 def Exec(self, feedback_fn):
12693 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12695 assert self.group_uuid not in self.target_uuids
12697 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12698 instances=instances, target_groups=self.target_uuids)
12700 ial.Run(self.op.iallocator)
12702 if not ial.success:
12703 raise errors.OpPrereqError("Can't compute group evacuation using"
12704 " iallocator '%s': %s" %
12705 (self.op.iallocator, ial.info),
12706 errors.ECODE_NORES)
12708 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12710 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12711 len(jobs), self.op.group_name)
12713 return ResultWithJobs(jobs)
12716 class TagsLU(NoHooksLU): # pylint: disable=W0223
12717 """Generic tags LU.
12719 This is an abstract class which is the parent of all the other tags LUs.
12722 def ExpandNames(self):
12723 self.group_uuid = None
12724 self.needed_locks = {}
12725 if self.op.kind == constants.TAG_NODE:
12726 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12727 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12728 elif self.op.kind == constants.TAG_INSTANCE:
12729 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12730 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12731 elif self.op.kind == constants.TAG_NODEGROUP:
12732 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12734 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12735 # not possible to acquire the BGL based on opcode parameters)
12737 def CheckPrereq(self):
12738 """Check prerequisites.
12741 if self.op.kind == constants.TAG_CLUSTER:
12742 self.target = self.cfg.GetClusterInfo()
12743 elif self.op.kind == constants.TAG_NODE:
12744 self.target = self.cfg.GetNodeInfo(self.op.name)
12745 elif self.op.kind == constants.TAG_INSTANCE:
12746 self.target = self.cfg.GetInstanceInfo(self.op.name)
12747 elif self.op.kind == constants.TAG_NODEGROUP:
12748 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12750 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12751 str(self.op.kind), errors.ECODE_INVAL)
12754 class LUTagsGet(TagsLU):
12755 """Returns the tags of a given object.
12760 def ExpandNames(self):
12761 TagsLU.ExpandNames(self)
12763 # Share locks as this is only a read operation
12764 self.share_locks = _ShareAll()
12766 def Exec(self, feedback_fn):
12767 """Returns the tag list.
12770 return list(self.target.GetTags())
12773 class LUTagsSearch(NoHooksLU):
12774 """Searches the tags for a given pattern.
12779 def ExpandNames(self):
12780 self.needed_locks = {}
12782 def CheckPrereq(self):
12783 """Check prerequisites.
12785     This checks the passed pattern for validity by compiling it.
12789 self.re = re.compile(self.op.pattern)
12790 except re.error, err:
12791 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12792 (self.op.pattern, err), errors.ECODE_INVAL)
12794 def Exec(self, feedback_fn):
12795 """Returns the tag list.
12799 tgts = [("/cluster", cfg.GetClusterInfo())]
12800 ilist = cfg.GetAllInstancesInfo().values()
12801 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12802 nlist = cfg.GetAllNodesInfo().values()
12803 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12804 tgts.extend(("/nodegroup/%s" % n.name, n)
12805 for n in cfg.GetAllNodeGroupsInfo().values())
12807 for path, target in tgts:
12808 for tag in target.GetTags():
12809 if self.re.search(tag):
12810 results.append((path, tag))
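    # Illustrative result shape (hypothetical names and tags):
    #   [("/cluster", "production"), ("/instances/inst1.example.com", "web")]
    # i.e. a list of (path, tag) pairs for every tag matching the pattern.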
12814 class LUTagsSet(TagsLU):
12815 """Sets a tag on a given object.
12820 def CheckPrereq(self):
12821 """Check prerequisites.
12823 This checks the type and length of the tag name and value.
12826 TagsLU.CheckPrereq(self)
12827 for tag in self.op.tags:
12828 objects.TaggableObject.ValidateTag(tag)
12830 def Exec(self, feedback_fn):
12835 for tag in self.op.tags:
12836 self.target.AddTag(tag)
12837 except errors.TagError, err:
12838 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12839 self.cfg.Update(self.target, feedback_fn)
12842 class LUTagsDel(TagsLU):
12843 """Delete a list of tags from a given object.
12848 def CheckPrereq(self):
12849 """Check prerequisites.
12851 This checks that we have the given tag.
12854 TagsLU.CheckPrereq(self)
12855 for tag in self.op.tags:
12856 objects.TaggableObject.ValidateTag(tag)
12857 del_tags = frozenset(self.op.tags)
12858 cur_tags = self.target.GetTags()
12860 diff_tags = del_tags - cur_tags
12862 diff_names = ("'%s'" % i for i in sorted(diff_tags))
12863 raise errors.OpPrereqError("Tag(s) %s not found" %
12864 (utils.CommaJoin(diff_names), ),
12865 errors.ECODE_NOENT)
12867 def Exec(self, feedback_fn):
12868 """Remove the tag from the object.
12871 for tag in self.op.tags:
12872 self.target.RemoveTag(tag)
12873 self.cfg.Update(self.target, feedback_fn)
12876 class LUTestDelay(NoHooksLU):
12877 """Sleep for a specified amount of time.
12879   This LU sleeps on the master and/or nodes for a specified amount of time.
12885 def ExpandNames(self):
12886 """Expand names and set required locks.
12888 This expands the node list, if any.
12891 self.needed_locks = {}
12892 if self.op.on_nodes:
12893 # _GetWantedNodes can be used here, but is not always appropriate to use
12894 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
12895 # more information.
12896 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
12897 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
12899 def _TestDelay(self):
12900 """Do the actual sleep.
12903 if self.op.on_master:
12904 if not utils.TestDelay(self.op.duration):
12905 raise errors.OpExecError("Error during master delay test")
12906 if self.op.on_nodes:
12907 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
12908 for node, node_result in result.items():
12909 node_result.Raise("Failure during rpc call to node %s" % node)
12911 def Exec(self, feedback_fn):
12912 """Execute the test delay opcode, with the wanted repetitions.
12915 if self.op.repeat == 0:
12918 top_value = self.op.repeat - 1
12919 for i in range(self.op.repeat):
12920 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
12924 class LUTestJqueue(NoHooksLU):
12925 """Utility LU to test some aspects of the job queue.
12930 # Must be lower than default timeout for WaitForJobChange to see whether it
12931 # notices changed jobs
12932 _CLIENT_CONNECT_TIMEOUT = 20.0
12933 _CLIENT_CONFIRM_TIMEOUT = 60.0
12936 def _NotifyUsingSocket(cls, cb, errcls):
12937 """Opens a Unix socket and waits for another program to connect.
12940 @param cb: Callback to send socket name to client
12941 @type errcls: class
12942 @param errcls: Exception class to use for errors
12945 # Using a temporary directory as there's no easy way to create temporary
12946 # sockets without writing a custom loop around tempfile.mktemp and
12947 # socket.bind
12948 tmpdir = tempfile.mkdtemp()
12950 tmpsock = utils.PathJoin(tmpdir, "sock")
12952 logging.debug("Creating temporary socket at %s", tmpsock)
12953 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
12958 # Send details to client
12961 # Wait for client to connect before continuing
12962 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
12964 (conn, _) = sock.accept()
12965 except socket.error, err:
12966 raise errcls("Client didn't connect in time (%s)" % err)
12970 # Remove as soon as client is connected
12971 shutil.rmtree(tmpdir)
12973 # Wait for client to close
12976 # pylint: disable=E1101
12977 # Instance of '_socketobject' has no ... member
12978 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
12980 except socket.error, err:
12981 raise errcls("Client failed to confirm notification (%s)" % err)
12985 def _SendNotification(self, test, arg, sockname):
12986 """Sends a notification to the client.
12989 @param test: Test name
12990 @param arg: Test argument (depends on test)
12991 @type sockname: string
12992 @param sockname: Socket path
12995 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
12997 def _Notify(self, prereq, test, arg):
12998 """Notifies the client of a test.
13001 @param prereq: Whether this is a prereq-phase test
13003 @param test: Test name
13004 @param arg: Test argument (depends on test)
13008 errcls = errors.OpPrereqError
13010 errcls = errors.OpExecError
13012 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
13016 def CheckArguments(self):
13017 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
13018 self.expandnames_calls = 0
13020 def ExpandNames(self):
13021 checkargs_calls = getattr(self, "checkargs_calls", 0)
13022 if checkargs_calls < 1:
13023 raise errors.ProgrammerError("CheckArguments was not called")
13025 self.expandnames_calls += 1
13027 if self.op.notify_waitlock:
13028 self._Notify(True, constants.JQT_EXPANDNAMES, None)
13030 self.LogInfo("Expanding names")
13032 # Get lock on master node (just to get a lock, not for a particular reason)
13033 self.needed_locks = {
13034 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
13037 def Exec(self, feedback_fn):
13038 if self.expandnames_calls < 1:
13039 raise errors.ProgrammerError("ExpandNames was not called")
13041 if self.op.notify_exec:
13042 self._Notify(False, constants.JQT_EXEC, None)
13044 self.LogInfo("Executing")
13046 if self.op.log_messages:
13047 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
13048 for idx, msg in enumerate(self.op.log_messages):
13049 self.LogInfo("Sending log message %s", idx + 1)
13050 feedback_fn(constants.JQT_MSGPREFIX + msg)
13051 # Report how many test messages have been sent
13052 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
13055 raise errors.OpExecError("Opcode failure was requested")
13060 class IAllocator(object):
13061 """IAllocator framework.
13063 An IAllocator instance has four sets of attributes:
13064 - cfg that is needed to query the cluster
13065 - input data (all keys declared for the mode in _MODE_DATA are required)
13066 - four buffer attributes (in_data, in_text, out_data, out_text), that
13067 represent the input (to the external script) in data structure and text
13068 format, and the output from it, again in both formats
13069 - the result variables from the script (success, info, result) for
13070 easy usage
13073 # pylint: disable=R0902
13074 # lots of instance attributes
13076 def __init__(self, cfg, rpc_runner, mode, **kwargs):
13078 self.rpc = rpc_runner
13079 # init buffer variables
13080 self.in_text = self.out_text = self.in_data = self.out_data = None
13081 # init all input fields so that pylint is happy
13083 self.memory = self.disks = self.disk_template = None
13084 self.os = self.tags = self.nics = self.vcpus = None
13085 self.hypervisor = None
13086 self.relocate_from = None
13088 self.instances = None
13089 self.evac_mode = None
13090 self.target_groups = []
13092 self.required_nodes = None
13093 # init result fields
13094 self.success = self.info = self.result = None
13097 (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
13099 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
13100 " IAllocator" % self.mode)
13102 keyset = [n for (n, _) in keydata]
13105 if key not in keyset:
13106 raise errors.ProgrammerError("Invalid input parameter '%s' to"
13107 " IAllocator" % key)
13108 setattr(self, key, kwargs[key])
13111 if key not in kwargs:
13112 raise errors.ProgrammerError("Missing input parameter '%s' to"
13113 " IAllocator" % key)
13114 self._BuildInputData(compat.partial(fn, self), keydata)
13116 def _ComputeClusterData(self):
13117 """Compute the generic allocator input data.
13119 This is the data that is independent of the actual operation.
13123 cluster_info = cfg.GetClusterInfo()
13126 "version": constants.IALLOCATOR_VERSION,
13127 "cluster_name": cfg.GetClusterName(),
13128 "cluster_tags": list(cluster_info.GetTags()),
13129 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
13130 # we don't have job IDs
13132 ninfo = cfg.GetAllNodesInfo()
13133 iinfo = cfg.GetAllInstancesInfo().values()
13134 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
13137 node_list = [n.name for n in ninfo.values() if n.vm_capable]
13139 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
13140 hypervisor_name = self.hypervisor
13141 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
13142 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
13144 hypervisor_name = cluster_info.enabled_hypervisors[0]
13146 node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
13149 self.rpc.call_all_instances_info(node_list,
13150 cluster_info.enabled_hypervisors)
13152 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
13154 config_ndata = self._ComputeBasicNodeData(ninfo)
13155 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
13156 i_list, config_ndata)
13157 assert len(data["nodes"]) == len(ninfo), \
13158 "Incomplete node data computed"
13160 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
13162 self.in_data = data
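# At this point self.in_data holds, roughly, the following structure, later
# serialized for the external allocator script:
#
#   {"version": ..., "cluster_name": ..., "cluster_tags": [...],
#    "enabled_hypervisors": [...],
#    "nodegroups": {uuid: {"name": ..., "alloc_policy": ...}, ...},
#    "nodes": {name: {... static and dynamic node data ...}, ...},
#    "instances": {name: {... instance data ...}, ...}}
#
# The mode-specific "request" key is added separately in _BuildInputData.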
13165 def _ComputeNodeGroupData(cfg):
13166 """Compute node groups data.
13169 ng = dict((guuid, {
13170 "name": gdata.name,
13171 "alloc_policy": gdata.alloc_policy,
13173 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())
13178 def _ComputeBasicNodeData(node_cfg):
13179 """Compute global node data.
13182 @returns: a dict of name: (node dict, node config)
13185 # fill in static (config-based) values
13186 node_results = dict((ninfo.name, {
13187 "tags": list(ninfo.GetTags()),
13188 "primary_ip": ninfo.primary_ip,
13189 "secondary_ip": ninfo.secondary_ip,
13190 "offline": ninfo.offline,
13191 "drained": ninfo.drained,
13192 "master_candidate": ninfo.master_candidate,
13193 "group": ninfo.group,
13194 "master_capable": ninfo.master_capable,
13195 "vm_capable": ninfo.vm_capable,
13197 for ninfo in node_cfg.values())
13199 return node_results
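# A single entry in the returned dict is keyed by node name and, per the
# fields filled in above, looks approximately like (name and addresses are
# placeholders):
#
#   "node1.example.com": {"tags": [], "primary_ip": "192.0.2.10",
#                         "secondary_ip": "192.0.2.10", "offline": False,
#                         "drained": False, "master_candidate": True,
#                         "group": "<group uuid>", "master_capable": True,
#                         "vm_capable": True}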
13202 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
13204 """Compute global node data.
13206 @param node_results: the basic node structures as filled from the config
13209 # make a copy of the current dict
13210 node_results = dict(node_results)
13211 for nname, nresult in node_data.items():
13212 assert nname in node_results, "Missing basic data for node %s" % nname
13213 ninfo = node_cfg[nname]
13215 if not (ninfo.offline or ninfo.drained):
13216 nresult.Raise("Can't get data for node %s" % nname)
13217 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
13219 remote_info = nresult.payload
13221 for attr in ["memory_total", "memory_free", "memory_dom0",
13222 "vg_size", "vg_free", "cpu_total"]:
13223 if attr not in remote_info:
13224 raise errors.OpExecError("Node '%s' didn't return attribute"
13225 " '%s'" % (nname, attr))
13226 if not isinstance(remote_info[attr], int):
13227 raise errors.OpExecError("Node '%s' returned invalid value"
13229 (nname, attr, remote_info[attr]))
13230 # compute memory used by primary instances
13231 i_p_mem = i_p_up_mem = 0
13232 for iinfo, beinfo in i_list:
13233 if iinfo.primary_node == nname:
13234 i_p_mem += beinfo[constants.BE_MEMORY]
13235 if iinfo.name not in node_iinfo[nname].payload:
13238 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
13239 i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
13240 remote_info["memory_free"] -= max(0, i_mem_diff)
13243 i_p_up_mem += beinfo[constants.BE_MEMORY]
13245 # compute memory used by instances
13247 "total_memory": remote_info["memory_total"],
13248 "reserved_memory": remote_info["memory_dom0"],
13249 "free_memory": remote_info["memory_free"],
13250 "total_disk": remote_info["vg_size"],
13251 "free_disk": remote_info["vg_free"],
13252 "total_cpus": remote_info["cpu_total"],
13253 "i_pri_memory": i_p_mem,
13254 "i_pri_up_memory": i_p_up_mem,
13256 pnr_dyn.update(node_results[nname])
13257 node_results[nname] = pnr_dyn
13259 return node_results
13262 def _ComputeInstanceData(cluster_info, i_list):
13263 """Compute global instance data.
13267 for iinfo, beinfo in i_list:
13269 for nic in iinfo.nics:
13270 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
13274 "mode": filled_params[constants.NIC_MODE],
13275 "link": filled_params[constants.NIC_LINK],
13277 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
13278 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
13279 nic_data.append(nic_dict)
13281 "tags": list(iinfo.GetTags()),
13282 "admin_up": iinfo.admin_up,
13283 "vcpus": beinfo[constants.BE_VCPUS],
13284 "memory": beinfo[constants.BE_MEMORY],
13286 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
13288 "disks": [{constants.IDISK_SIZE: dsk.size,
13289 constants.IDISK_MODE: dsk.mode}
13290 for dsk in iinfo.disks],
13291 "disk_template": iinfo.disk_template,
13292 "hypervisor": iinfo.hypervisor,
13294 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
13296 instance_data[iinfo.name] = pir
13298 return instance_data
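# Each instance entry assembled above is keyed by instance name and carries,
# among other fields, "tags", "admin_up", "vcpus", "memory", "nics" (with
# "mode"/"link" and, for bridged NICs, a legacy "bridge" key), "nodes"
# (primary node first, then secondaries), "disks", "disk_template",
# "hypervisor" and the derived "disk_space_total".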
13300 def _AddNewInstance(self):
13301 """Add new instance data to allocator structure.
13303 This in combination with _ComputeClusterData will create the
13304 correct structure needed as input for the allocator.
13306 The checks for the completeness of the opcode must have already been
13310 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
13312 if self.disk_template in constants.DTS_INT_MIRROR:
13313 self.required_nodes = 2
13315 self.required_nodes = 1
13319 "disk_template": self.disk_template,
13322 "vcpus": self.vcpus,
13323 "memory": self.memory,
13324 "disks": self.disks,
13325 "disk_space_total": disk_space,
13327 "required_nodes": self.required_nodes,
13328 "hypervisor": self.hypervisor,
13333 def _AddRelocateInstance(self):
13334 """Add relocate instance data to allocator structure.
13336 This in combination with _ComputeClusterData will create the
13337 correct structure needed as input for the allocator.
13339 The checks for the completeness of the opcode must have already been
13343 instance = self.cfg.GetInstanceInfo(self.name)
13344 if instance is None:
13345 raise errors.ProgrammerError("Unknown instance '%s' passed to"
13346 " IAllocator" % self.name)
13348 if instance.disk_template not in constants.DTS_MIRRORED:
13349 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
13350 errors.ECODE_INVAL)
13352 if instance.disk_template in constants.DTS_INT_MIRROR and \
13353 len(instance.secondary_nodes) != 1:
13354 raise errors.OpPrereqError("Instance has not exactly one secondary node",
13355 errors.ECODE_STATE)
13357 self.required_nodes = 1
13358 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
13359 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
13363 "disk_space_total": disk_space,
13364 "required_nodes": self.required_nodes,
13365 "relocate_from": self.relocate_from,
13369 def _AddNodeEvacuate(self):
13370 """Get data for node-evacuate requests.
13374 "instances": self.instances,
13375 "evac_mode": self.evac_mode,
13378 def _AddChangeGroup(self):
13379 """Get data for node-evacuate requests.
13383 "instances": self.instances,
13384 "target_groups": self.target_groups,
13387 def _BuildInputData(self, fn, keydata):
13388 """Build input data structures.
13391 self._ComputeClusterData()
13394 request["type"] = self.mode
13395 for keyname, keytype in keydata:
13396 if keyname not in request:
13397 raise errors.ProgrammerError("Request parameter %s is missing" %
13399 val = request[keyname]
13400 if not keytype(val):
13401 raise errors.ProgrammerError("Request parameter %s doesn't pass"
13402 " validation, value %s, expected"
13403 " type %s" % (keyname, val, keytype))
13404 self.in_data["request"] = request
13406 self.in_text = serializer.Dump(self.in_data)
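# The per-mode request built by fn() is merged under the "request" key and
# checked against the declared keydata before serialization. For the
# relocation mode, for instance, the serialized request part would look
# roughly like this (instance and node names purely illustrative):
#
#   "request": {"type": "relocate", "name": "inst1.example.com",
#               "relocate_from": ["node2.example.com"],
#               "disk_space_total": 1024, "required_nodes": 1}
#
# Every keytype in keydata is an ht.T* validator; a value failing it aborts
# with a ProgrammerError instead of being passed on to the script.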
13408 _STRING_LIST = ht.TListOf(ht.TString)
13409 _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
13410 # pylint: disable=E1101
13411 # Class '...' has no 'OP_ID' member
13412 "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
13413 opcodes.OpInstanceMigrate.OP_ID,
13414 opcodes.OpInstanceReplaceDisks.OP_ID])
13418 ht.TListOf(ht.TAnd(ht.TIsLength(3),
13419 ht.TItems([ht.TNonEmptyString,
13420 ht.TNonEmptyString,
13421 ht.TListOf(ht.TNonEmptyString),
13424 ht.TListOf(ht.TAnd(ht.TIsLength(2),
13425 ht.TItems([ht.TNonEmptyString,
13428 _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
13429 ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
13432 constants.IALLOCATOR_MODE_ALLOC:
13435 ("name", ht.TString),
13436 ("memory", ht.TInt),
13437 ("disks", ht.TListOf(ht.TDict)),
13438 ("disk_template", ht.TString),
13439 ("os", ht.TString),
13440 ("tags", _STRING_LIST),
13441 ("nics", ht.TListOf(ht.TDict)),
13442 ("vcpus", ht.TInt),
13443 ("hypervisor", ht.TString),
13445 constants.IALLOCATOR_MODE_RELOC:
13446 (_AddRelocateInstance,
13447 [("name", ht.TString), ("relocate_from", _STRING_LIST)],
13449 constants.IALLOCATOR_MODE_NODE_EVAC:
13450 (_AddNodeEvacuate, [
13451 ("instances", _STRING_LIST),
13452 ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
13454 constants.IALLOCATOR_MODE_CHG_GROUP:
13455 (_AddChangeGroup, [
13456 ("instances", _STRING_LIST),
13457 ("target_groups", _STRING_LIST),
13461 def Run(self, name, validate=True, call_fn=None):
13462 """Run an instance allocator and return the results.
13465 if call_fn is None:
13466 call_fn = self.rpc.call_iallocator_runner
13468 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
13469 result.Raise("Failure while running the iallocator script")
13471 self.out_text = result.payload
13473 self._ValidateResult()
13475 def _ValidateResult(self):
13476 """Process the allocator results.
13478 This parses and checks the allocator's output and, if it is valid, saves
13479 it in self.out_data and in the result attributes (success, info, result).
13483 rdict = serializer.Load(self.out_text)
13484 except Exception, err:
13485 raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
13487 if not isinstance(rdict, dict):
13488 raise errors.OpExecError("Can't parse iallocator results: not a dict")
13490 # TODO: remove backwards compatibility in later versions
13491 if "nodes" in rdict and "result" not in rdict:
13492 rdict["result"] = rdict["nodes"]
13495 for key in "success", "info", "result":
13496 if key not in rdict:
13497 raise errors.OpExecError("Can't parse iallocator results:"
13498 " missing key '%s'" % key)
13499 setattr(self, key, rdict[key])
13501 if not self._result_check(self.result):
13502 raise errors.OpExecError("Iallocator returned invalid result,"
13503 " expected %s, got %s" %
13504 (self._result_check, self.result),
13505 errors.ECODE_INVAL)
13507 if self.mode == constants.IALLOCATOR_MODE_RELOC:
13508 assert self.relocate_from is not None
13509 assert self.required_nodes == 1
13511 node2group = dict((name, ndata["group"])
13512 for (name, ndata) in self.in_data["nodes"].items())
13514 fn = compat.partial(self._NodesToGroups, node2group,
13515 self.in_data["nodegroups"])
13517 instance = self.cfg.GetInstanceInfo(self.name)
13518 request_groups = fn(self.relocate_from + [instance.primary_node])
13519 result_groups = fn(rdict["result"] + [instance.primary_node])
13521 if self.success and not set(result_groups).issubset(request_groups):
13522 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
13523 " differ from original groups (%s)" %
13524 (utils.CommaJoin(result_groups),
13525 utils.CommaJoin(request_groups)))
13527 elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13528 assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES
13530 self.out_data = rdict
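# A minimal well-formed reply, as the JSON an allocator script would emit
# and as accepted by the checks above, would be something like:
#
#   {"success": true, "info": "allocation successful",
#    "result": ["node1.example.com", "node2.example.com"]}
#
# Older scripts returning "nodes" instead of "result" are still accepted via
# the backwards-compatibility shim above, and "result" must additionally
# pass the mode-specific self._result_check validator.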
13533 def _NodesToGroups(node2group, groups, nodes):
13534 """Returns a list of unique group names for a list of nodes.
13536 @type node2group: dict
13537 @param node2group: Map from node name to group UUID
13539 @param groups: Group information
13541 @param nodes: Node names
13548 group_uuid = node2group[node]
13550 # Ignore unknown node
13554 group = groups[group_uuid]
13556 # Can't find group, let's use UUID
13557 group_name = group_uuid
13559 group_name = group["name"]
13561 result.add(group_name)
13563 return sorted(result)
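# For example (hypothetical mapping), with
#   node2group = {"node1": "uuid-a", "node2": "uuid-b", "nodeX": "uuid-gone"}
#   groups     = {"uuid-a": {"name": "default"}, "uuid-b": {"name": "rack2"}}
# the call _NodesToGroups(node2group, groups, ["node1", "node2", "unknown",
# "nodeX"]) returns ["default", "rack2", "uuid-gone"]: nodes missing from the
# map are skipped, and groups missing from the group dict fall back to their
# UUID.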
13566 class LUTestAllocator(NoHooksLU):
13567 """Run allocator tests.
13569 This LU runs the allocator tests.
13572 def CheckPrereq(self):
13573 """Check prerequisites.
13575 This checks the opcode parameters depending on the direction and mode of the test.
13578 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13579 for attr in ["memory", "disks", "disk_template",
13580 "os", "tags", "nics", "vcpus"]:
13581 if not hasattr(self.op, attr):
13582 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
13583 attr, errors.ECODE_INVAL)
13584 iname = self.cfg.ExpandInstanceName(self.op.name)
13585 if iname is not None:
13586 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
13587 iname, errors.ECODE_EXISTS)
13588 if not isinstance(self.op.nics, list):
13589 raise errors.OpPrereqError("Invalid parameter 'nics'",
13590 errors.ECODE_INVAL)
13591 if not isinstance(self.op.disks, list):
13592 raise errors.OpPrereqError("Invalid parameter 'disks'",
13593 errors.ECODE_INVAL)
13594 for row in self.op.disks:
13595 if (not isinstance(row, dict) or
13596 constants.IDISK_SIZE not in row or
13597 not isinstance(row[constants.IDISK_SIZE], int) or
13598 constants.IDISK_MODE not in row or
13599 row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
13600 raise errors.OpPrereqError("Invalid contents of the 'disks'"
13601 " parameter", errors.ECODE_INVAL)
13602 if self.op.hypervisor is None:
13603 self.op.hypervisor = self.cfg.GetHypervisorType()
13604 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13605 fname = _ExpandInstanceName(self.cfg, self.op.name)
13606 self.op.name = fname
13607 self.relocate_from = \
13608 list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
13609 elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
13610 constants.IALLOCATOR_MODE_NODE_EVAC):
13611 if not self.op.instances:
13612 raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
13613 self.op.instances = _GetWantedInstances(self, self.op.instances)
13615 raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
13616 self.op.mode, errors.ECODE_INVAL)
13618 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
13619 if self.op.allocator is None:
13620 raise errors.OpPrereqError("Missing allocator name",
13621 errors.ECODE_INVAL)
13622 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
13623 raise errors.OpPrereqError("Wrong allocator test '%s'" %
13624 self.op.direction, errors.ECODE_INVAL)
13626 def Exec(self, feedback_fn):
13627 """Run the allocator test.
13630 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13631 ial = IAllocator(self.cfg, self.rpc,
13634 memory=self.op.memory,
13635 disks=self.op.disks,
13636 disk_template=self.op.disk_template,
13640 vcpus=self.op.vcpus,
13641 hypervisor=self.op.hypervisor,
13643 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13644 ial = IAllocator(self.cfg, self.rpc,
13647 relocate_from=list(self.relocate_from),
13649 elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
13650 ial = IAllocator(self.cfg, self.rpc,
13652 instances=self.op.instances,
13653 target_groups=self.op.target_groups)
13654 elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13655 ial = IAllocator(self.cfg, self.rpc,
13657 instances=self.op.instances,
13658 evac_mode=self.op.evac_mode)
13660 raise errors.ProgrammerError("Uncatched mode %s in"
13661 " LUTestAllocator.Exec", self.op.mode)
13663 if self.op.direction == constants.IALLOCATOR_DIR_IN:
13664 result = ial.in_text
13666 ial.Run(self.op.allocator, validate=False)
13667 result = ial.out_text
13671 #: Query type implementations
13673 constants.QR_INSTANCE: _InstanceQuery,
13674 constants.QR_NODE: _NodeQuery,
13675 constants.QR_GROUP: _GroupQuery,
13676 constants.QR_OS: _OsQuery,
13679 assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
13682 def _GetQueryImplementation(name):
13683 """Returns the implemtnation for a query type.
13685 @param name: Query type, must be one of L{constants.QR_VIA_OP}
13689 return _QUERY_IMPL[name]
13691 raise errors.OpPrereqError("Unknown query resource '%s'" % name,
13692 errors.ECODE_INVAL)
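# A query resource is resolved to its implementation roughly like this
# (sketch, using only the names defined above):
#
#   impl = _GetQueryImplementation(constants.QR_NODE)   # -> _NodeQuery
#   impl = _GetQueryImplementation("no-such-resource")  # raises OpPrereqError
#
# i.e. only the resources listed in _QUERY_IMPL (and therefore, per the
# assertion above, in constants.QR_VIA_OP) are valid.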