4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
# C0302: since we have waaaay too many lines in this module

import logging
import copy
import re
import itertools

import OpenSSL
45 from ganeti import ssh
46 from ganeti import utils
47 from ganeti import errors
48 from ganeti import hypervisor
49 from ganeti import locking
50 from ganeti import constants
51 from ganeti import objects
52 from ganeti import serializer
53 from ganeti import ssconf
54 from ganeti import uidpool
55 from ganeti import compat
56 from ganeti import masterd
57 from ganeti import netutils
58 from ganeti import query
59 from ganeti import qlang
60 from ganeti import opcodes
62 from ganeti import rpc
64 import ganeti.masterd.instance # pylint: disable=W0611
67 #: Size of DRBD meta block device
72 """Data container for LU results with jobs.
74 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
75 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
contained in the C{jobs} attribute and include the job IDs in the opcode result.
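A minimal usage sketch (illustrative only; C{other_field} stands for any
extra keyword return value)::

  def Exec(self, feedback_fn):
    # ... do the LU's own work, then hand follow-up jobs to the processor
    return ResultWithJobs([[opcodes.OpClusterVerifyConfig()]],
                          other_field="some value")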
80 def __init__(self, jobs, **kwargs):
81 """Initializes this class.
83 Additional return values can be specified as keyword arguments.
@type jobs: list of lists of L{opcodes.OpCode}
86 @param jobs: A list of lists of opcode objects
93 class LogicalUnit(object):
94 """Logical Unit base class.
96 Subclasses must follow these rules:
97 - implement ExpandNames
98 - implement CheckPrereq (except when tasklets are used)
99 - implement Exec (except when tasklets are used)
100 - implement BuildHooksEnv
101 - implement BuildHooksNodes
102 - redefine HPATH and HTYPE
103 - optionally redefine their run requirements:
104 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
106 Note that all commands require root permissions.
108 @ivar dry_run_result: the value (if any) that will be returned to the caller
109 in dry-run mode (signalled by opcode dry_run parameter)
116 def __init__(self, processor, op, context, rpc_runner):
117 """Constructor for LogicalUnit.
This needs to be overridden in derived classes in order to check op validity.
123 self.proc = processor
125 self.cfg = context.cfg
126 self.glm = context.glm
128 self.owned_locks = context.glm.list_owned
129 self.context = context
130 self.rpc = rpc_runner
131 # Dicts used to declare locking needs to mcpu
132 self.needed_locks = None
133 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
135 self.remove_locks = {}
136 # Used to force good behavior when calling helper functions
137 self.recalculate_locks = {}
139 self.Log = processor.Log # pylint: disable=C0103
140 self.LogWarning = processor.LogWarning # pylint: disable=C0103
141 self.LogInfo = processor.LogInfo # pylint: disable=C0103
142 self.LogStep = processor.LogStep # pylint: disable=C0103
143 # support for dry-run
144 self.dry_run_result = None
145 # support for generic debug attribute
146 if (not hasattr(self.op, "debug_level") or
147 not isinstance(self.op.debug_level, int)):
148 self.op.debug_level = 0
153 # Validate opcode parameters and set defaults
154 self.op.Validate(True)
156 self.CheckArguments()
158 def CheckArguments(self):
159 """Check syntactic validity for the opcode arguments.
This method is for doing a simple syntactic check and ensuring
162 validity of opcode parameters, without any cluster-related
163 checks. While the same can be accomplished in ExpandNames and/or
CheckPrereq, doing these separately is better because:
- ExpandNames is left as purely a lock-related function
167 - CheckPrereq is run after we have acquired locks (and possible
170 The function is allowed to change the self.op attribute so that
later methods need no longer worry about missing parameters.
176 def ExpandNames(self):
177 """Expand names for this LU.
179 This method is called before starting to execute the opcode, and it should
180 update all the parameters of the opcode to their canonical form (e.g. a
181 short node name must be fully expanded after this method has successfully
182 completed). This way locking, hooks, logging, etc. can work correctly.
184 LUs which implement this method must also populate the self.needed_locks
185 member, as a dict with lock levels as keys, and a list of needed lock names
188 - use an empty dict if you don't need any lock
189 - if you don't need any lock at a particular level omit that level
190 - don't put anything for the BGL level
191 - if you want all locks at a level use locking.ALL_SET as a value
193 If you need to share locks (rather than acquire them exclusively) at one
194 level you can modify self.share_locks, setting a true value (usually 1) for
195 that level. By default locks are not shared.
197 This function can also define a list of tasklets, which then will be
198 executed in order instead of the usual LU-level CheckPrereq and Exec
199 functions, if those are not defined by the LU.
203 # Acquire all nodes and one instance
204 self.needed_locks = {
205 locking.LEVEL_NODE: locking.ALL_SET,
locking.LEVEL_INSTANCE: ['instance1.example.com'],
}
208 # Acquire just two nodes
209 self.needed_locks = {
locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
}
213 self.needed_locks = {} # No, you can't leave it to the default value None
216 # The implementation of this method is mandatory only if the new LU is
# concurrent, so that old LUs don't need to be changed all at the same time.
220 self.needed_locks = {} # Exclusive LUs don't need locks.
222 raise NotImplementedError
224 def DeclareLocks(self, level):
225 """Declare LU locking needs for a level
227 While most LUs can just declare their locking needs at ExpandNames time,
228 sometimes there's the need to calculate some locks after having acquired
229 the ones before. This function is called just before acquiring locks at a
230 particular level, but after acquiring the ones at lower levels, and permits
231 such calculations. It can be used to modify self.needed_locks, and by
232 default it does nothing.
234 This function is only called if you have something already set in
235 self.needed_locks for the level.
237 @param level: Locking level which is going to be locked
238 @type level: member of ganeti.locking.LEVELS
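A typical implementation defers to a helper for the node level, mirroring
the _LockInstancesNodes example below (sketch)::

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()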
242 def CheckPrereq(self):
243 """Check prerequisites for this LU.
245 This method should check that the prerequisites for the execution
246 of this LU are fulfilled. It can do internode communication, but
it should be idempotent - no cluster or system changes are allowed.
250 The method should raise errors.OpPrereqError in case something is
251 not fulfilled. Its return value is ignored.
253 This method should also update all the parameters of the opcode to
254 their canonical form if it hasn't been done by ExpandNames before.
257 if self.tasklets is not None:
258 for (idx, tl) in enumerate(self.tasklets):
259 logging.debug("Checking prerequisites for tasklet %s/%s",
idx + 1, len(self.tasklets))
tl.CheckPrereq()
265 def Exec(self, feedback_fn):
"""Execute the LU.

This method should implement the actual work. It should raise
errors.OpExecError for failures that are somewhat dealt with in code, or
expected.
273 if self.tasklets is not None:
274 for (idx, tl) in enumerate(self.tasklets):
logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
tl.Exec(feedback_fn)
278 raise NotImplementedError
280 def BuildHooksEnv(self):
281 """Build hooks environment for this LU.
284 @return: Dictionary containing the environment that will be used for
285 running the hooks for this LU. The keys of the dict must not be prefixed
286 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
287 will extend the environment with additional variables. If no environment
288 should be defined, an empty dictionary should be returned (not C{None}).
@note: If the C{HPATH} attribute of the LU class is C{None}, this function
will not be called.
293 raise NotImplementedError
295 def BuildHooksNodes(self):
296 """Build list of nodes to run LU's hooks.
298 @rtype: tuple; (list, list)
299 @return: Tuple containing a list of node names on which the hook
300 should run before the execution and a list of node names on which the
hook should run after the execution. If no nodes are needed, an empty list
should be returned (not C{None}).
@note: If the C{HPATH} attribute of the LU class is C{None}, this function
will not be called.
307 raise NotImplementedError
309 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
310 """Notify the LU about the results of its hooks.
312 This method is called every time a hooks phase is executed, and notifies
313 the Logical Unit about the hooks' result. The LU can then use it to alter
314 its result based on the hooks. By default the method does nothing and the
315 previous result is passed back unchanged but any LU can define it if it
316 wants to use the local cluster hook-scripts somehow.
318 @param phase: one of L{constants.HOOKS_PHASE_POST} or
319 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
320 @param hook_results: the results of the multi-node hooks rpc call
@param feedback_fn: function used to send feedback back to the caller
322 @param lu_result: the previous Exec result this LU had, or None
324 @return: the new Exec result, based on the previous result
# API must be kept, thus we ignore the unused-argument and
# could-be-a-function warnings
# pylint: disable=W0613,R0201
return lu_result
333 def _ExpandAndLockInstance(self):
334 """Helper function to expand and lock an instance.
336 Many LUs that work on an instance take its name in self.op.instance_name
337 and need to expand it and then declare the expanded name for locking. This
338 function does it, and then updates self.op.instance_name to the expanded
339 name. It also initializes needed_locks as a dict, if this hasn't been done
343 if self.needed_locks is None:
344 self.needed_locks = {}
346 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
347 "_ExpandAndLockInstance called with instance-level locks set"
348 self.op.instance_name = _ExpandInstanceName(self.cfg,
349 self.op.instance_name)
350 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
352 def _LockInstancesNodes(self, primary_only=False):
353 """Helper function to declare instances' nodes for locking.
355 This function should be called after locking one or more instances to lock
356 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
357 with all primary or secondary nodes for instances already locked and
358 present in self.needed_locks[locking.LEVEL_INSTANCE].
360 It should be called from DeclareLocks, and for safety only works if
361 self.recalculate_locks[locking.LEVEL_NODE] is set.
363 In the future it may grow parameters to just lock some instance's nodes, or
364 to just lock primaries or secondary nodes, if needed.
It should be called from DeclareLocks in a way similar to::
368 if level == locking.LEVEL_NODE:
369 self._LockInstancesNodes()
371 @type primary_only: boolean
372 @param primary_only: only lock primary nodes of locked instances
375 assert locking.LEVEL_NODE in self.recalculate_locks, \
376 "_LockInstancesNodes helper function called with no nodes to recalculate"
# TODO: check if we've really been called with the instance locks held
380 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
381 # future we might want to have different behaviors depending on the value
382 # of self.recalculate_locks[locking.LEVEL_NODE]
wanted_nodes = []
locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
  wanted_nodes.append(instance.primary_node)
  if not primary_only:
    wanted_nodes.extend(instance.secondary_nodes)
390 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
391 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
392 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
393 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
395 del self.recalculate_locks[locking.LEVEL_NODE]
398 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
399 """Simple LU which runs no hooks.
401 This LU is intended as a parent for other LogicalUnits which will
402 run no hooks, in order to reduce duplicate code.
408 def BuildHooksEnv(self):
409 """Empty BuildHooksEnv for NoHooksLu.
411 This just raises an error.
414 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
416 def BuildHooksNodes(self):
417 """Empty BuildHooksNodes for NoHooksLU.
420 raise AssertionError("BuildHooksNodes called for NoHooksLU")
424 """Tasklet base class.
426 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
427 they can mix legacy code with tasklets. Locking needs to be done in the LU,
428 tasklets know nothing about locks.
430 Subclasses must follow these rules:
431 - Implement CheckPrereq
435 def __init__(self, lu):
442 def CheckPrereq(self):
443 """Check prerequisites for this tasklets.
445 This method should check whether the prerequisites for the execution of
446 this tasklet are fulfilled. It can do internode communication, but it
447 should be idempotent - no cluster or system changes are allowed.
449 The method should raise errors.OpPrereqError in case something is not
450 fulfilled. Its return value is ignored.
452 This method should also update all parameters to their canonical form if it
453 hasn't been done before.
458 def Exec(self, feedback_fn):
459 """Execute the tasklet.
461 This method should implement the actual work. It should raise
errors.OpExecError for failures that are somewhat dealt with in code, or expected.
466 raise NotImplementedError
470 """Base for query utility classes.
473 #: Attribute holding field definitions
476 def __init__(self, qfilter, fields, use_locking):
477 """Initializes this class.
480 self.use_locking = use_locking
482 self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
484 self.requested_data = self.query.RequestedData()
485 self.names = self.query.RequestedNames()
487 # Sort only if no names were requested
488 self.sort_by_name = not self.names
490 self.do_locking = None
493 def _GetNames(self, lu, all_names, lock_level):
494 """Helper function to determine names asked for in the query.
498 names = lu.owned_locks(lock_level)
502 if self.wanted == locking.ALL_SET:
503 assert not self.names
504 # caller didn't specify names, so ordering is not important
505 return utils.NiceSort(names)
507 # caller specified names and we must keep the same order
509 assert not self.do_locking or lu.glm.is_owned(lock_level)
missing = set(self.wanted).difference(names)
if missing:
  raise errors.OpExecError("Some items were removed before retrieving"
                           " their data: %s" % missing)
516 # Return expanded names
519 def ExpandNames(self, lu):
520 """Expand names for this query.
522 See L{LogicalUnit.ExpandNames}.
525 raise NotImplementedError()
527 def DeclareLocks(self, lu, level):
528 """Declare locks for this query.
530 See L{LogicalUnit.DeclareLocks}.
533 raise NotImplementedError()
535 def _GetQueryData(self, lu):
536 """Collects all data for this query.
538 @return: Query data object
541 raise NotImplementedError()
543 def NewStyleQuery(self, lu):
544 """Collect data and execute query.
547 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
548 sort_by_name=self.sort_by_name)
550 def OldStyleQuery(self, lu):
551 """Collect data and execute query.
554 return self.query.OldStyleQuery(self._GetQueryData(lu),
555 sort_by_name=self.sort_by_name)
559 """Returns a dict declaring all lock levels shared.
562 return dict.fromkeys(locking.LEVELS, 1)
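# Usage sketch: read-mostly LUs typically declare every level shared in
# ExpandNames (as LUClusterVerifyGroup does below):
#   self.share_locks = _ShareAll()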
565 def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
566 """Checks if the owned node groups are still correct for an instance.
568 @type cfg: L{config.ConfigWriter}
569 @param cfg: The cluster configuration
570 @type instance_name: string
571 @param instance_name: Instance name
572 @type owned_groups: set or frozenset
573 @param owned_groups: List of currently owned node groups
576 inst_groups = cfg.GetInstanceNodeGroups(instance_name)
578 if not owned_groups.issuperset(inst_groups):
579 raise errors.OpPrereqError("Instance %s's node groups changed since"
580 " locks were acquired, current groups are"
581 " are '%s', owning groups '%s'; retry the"
584 utils.CommaJoin(inst_groups),
585 utils.CommaJoin(owned_groups)),
591 def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
592 """Checks if the instances in a node group are still correct.
594 @type cfg: L{config.ConfigWriter}
595 @param cfg: The cluster configuration
596 @type group_uuid: string
597 @param group_uuid: Node group UUID
598 @type owned_instances: set or frozenset
599 @param owned_instances: List of currently owned instances
602 wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
603 if owned_instances != wanted_instances:
604 raise errors.OpPrereqError("Instances in node group '%s' changed since"
605 " locks were acquired, wanted '%s', have '%s';"
606 " retry the operation" %
608 utils.CommaJoin(wanted_instances),
609 utils.CommaJoin(owned_instances)),
612 return wanted_instances
615 def _SupportsOob(cfg, node):
616 """Tells if node supports OOB.
618 @type cfg: L{config.ConfigWriter}
619 @param cfg: The cluster configuration
620 @type node: L{objects.Node}
621 @param node: The node
622 @return: The OOB script if supported or an empty string otherwise
625 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
628 def _GetWantedNodes(lu, nodes):
629 """Returns list of checked and expanded node names.
631 @type lu: L{LogicalUnit}
632 @param lu: the logical unit on whose behalf we execute
634 @param nodes: list of node names or None for all nodes
636 @return: the list of nodes, sorted
637 @raise errors.ProgrammerError: if the nodes parameter is wrong type
if nodes:
  return [_ExpandNodeName(lu.cfg, name) for name in nodes]
return utils.NiceSort(lu.cfg.GetNodeList())
646 def _GetWantedInstances(lu, instances):
647 """Returns list of checked and expanded instance names.
649 @type lu: L{LogicalUnit}
650 @param lu: the logical unit on whose behalf we execute
651 @type instances: list
652 @param instances: list of instance names or None for all instances
654 @return: the list of instances, sorted
655 @raise errors.OpPrereqError: if the instances parameter is wrong type
656 @raise errors.OpPrereqError: if any of the passed instances is not found
if instances:
  wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
else:
  wanted = utils.NiceSort(lu.cfg.GetInstanceList())
return wanted
666 def _GetUpdatedParams(old_params, update_dict,
667 use_default=True, use_none=False):
668 """Return the new version of a parameter dictionary.
670 @type old_params: dict
671 @param old_params: old parameters
672 @type update_dict: dict
673 @param update_dict: dict containing new parameter values, or
674 constants.VALUE_DEFAULT to reset the parameter to its default
@type use_default: boolean
@param use_default: whether to recognise L{constants.VALUE_DEFAULT}
    values as 'to be deleted' values
@type use_none: boolean
@param use_none: whether to recognise C{None} values as 'to be
    deleted' values
683 @return: the new parameter dictionary
686 params_copy = copy.deepcopy(old_params)
for key, val in update_dict.iteritems():
  if ((use_default and val == constants.VALUE_DEFAULT) or
      (use_none and val is None)):
    try:
      del params_copy[key]
    except KeyError:
      pass
  else:
    params_copy[key] = val
return params_copy
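# Example (sketch): with the defaults (use_default=True, use_none=False)
#   _GetUpdatedParams({"mem": 128, "vcpus": 2},
#                     {"mem": constants.VALUE_DEFAULT, "disk": 10})
# returns {"vcpus": 2, "disk": 10}: "mem" is removed (reset to its default),
# "disk" is added and "vcpus" is kept unchanged.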
699 def _ReleaseLocks(lu, level, names=None, keep=None):
700 """Releases locks owned by an LU.
702 @type lu: L{LogicalUnit}
703 @param level: Lock level
704 @type names: list or None
705 @param names: Names of locks to release
706 @type keep: list or None
707 @param keep: Names of locks to retain
710 assert not (keep is not None and names is not None), \
711 "Only one of the 'names' and the 'keep' parameters can be given"
if names is not None:
  should_release = names.__contains__
elif keep is not None:
  should_release = lambda name: name not in keep
else:
  should_release = None
if should_release is not None:
  retain = []
  release = []
  # Determine which locks to release
  for name in lu.owned_locks(level):
    if should_release(name):
      release.append(name)
    else:
      retain.append(name)
  assert len(lu.owned_locks(level)) == (len(retain) + len(release))
  # Release just some locks
  lu.glm.release(level, names=release)
  assert frozenset(lu.owned_locks(level)) == frozenset(retain)
else:
  # Release everything
  lu.glm.release(level)
  assert not lu.glm.is_owned(level), "No locks should be owned"
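# Usage sketch (names illustrative): an LU that has narrowed its work down to
# a single node can drop every other node lock with
#   _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.node_name])
# or release an explicit subset by passing the "names" argument instead.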
744 def _MapInstanceDisksToNodes(instances):
745 """Creates a map from (node, volume) to instance name.
747 @type instances: list of L{objects.Instance}
748 @rtype: dict; tuple of (node name, volume name) as key, instance name as value
return dict(((node, vol), inst.name)
            for inst in instances
            for (node, vols) in inst.MapLVsByNode().items()
            for vol in vols)
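# Example result (sketch, illustrative names):
#   {("node1.example.com", "xenvg/disk0"): "inst1.example.com", ...}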
757 def _RunPostHook(lu, node_name):
758 """Runs the post-hook for an opcode on a single node.
761 hm = lu.proc.BuildHooksManager(lu)
763 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
765 # pylint: disable=W0702
766 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
769 def _CheckOutputFields(static, dynamic, selected):
770 """Checks whether all selected fields are valid.
772 @type static: L{utils.FieldSet}
773 @param static: static fields set
774 @type dynamic: L{utils.FieldSet}
775 @param dynamic: dynamic fields set
782 delta = f.NonMatching(selected)
784 raise errors.OpPrereqError("Unknown output fields selected: %s"
785 % ",".join(delta), errors.ECODE_INVAL)
788 def _CheckGlobalHvParams(params):
789 """Validates that given hypervisor params are not global ones.
This will ensure that instances don't get customised versions of global parameters.
795 used_globals = constants.HVC_GLOBALS.intersection(params)
if used_globals:
  msg = ("The following hypervisor parameters are global and cannot"
798 " be customized at instance level, please modify them at"
799 " cluster level: %s" % utils.CommaJoin(used_globals))
800 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
803 def _CheckNodeOnline(lu, node, msg=None):
804 """Ensure that a given node is online.
806 @param lu: the LU on behalf of which we make the check
807 @param node: the node to check
808 @param msg: if passed, should be a message to replace the default one
809 @raise errors.OpPrereqError: if the node is offline
813 msg = "Can't use offline node"
814 if lu.cfg.GetNodeInfo(node).offline:
815 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
818 def _CheckNodeNotDrained(lu, node):
819 """Ensure that a given node is not drained.
821 @param lu: the LU on behalf of which we make the check
822 @param node: the node to check
823 @raise errors.OpPrereqError: if the node is drained
826 if lu.cfg.GetNodeInfo(node).drained:
827 raise errors.OpPrereqError("Can't use drained node %s" % node,
831 def _CheckNodeVmCapable(lu, node):
832 """Ensure that a given node is vm capable.
834 @param lu: the LU on behalf of which we make the check
835 @param node: the node to check
836 @raise errors.OpPrereqError: if the node is not vm capable
839 if not lu.cfg.GetNodeInfo(node).vm_capable:
840 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
844 def _CheckNodeHasOS(lu, node, os_name, force_variant):
845 """Ensure that a node supports a given OS.
847 @param lu: the LU on behalf of which we make the check
848 @param node: the node to check
849 @param os_name: the OS to query about
850 @param force_variant: whether to ignore variant errors
851 @raise errors.OpPrereqError: if the node is not supporting the OS
854 result = lu.rpc.call_os_get(node, os_name)
855 result.Raise("OS '%s' not in supported OS list for node %s" %
(os_name, node), prereq=True, ecode=errors.ECODE_INVAL)
858 if not force_variant:
859 _CheckOSVariant(result.payload, os_name)
862 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
863 """Ensure that a node has the given secondary ip.
865 @type lu: L{LogicalUnit}
866 @param lu: the LU on behalf of which we make the check
868 @param node: the node to check
869 @type secondary_ip: string
870 @param secondary_ip: the ip to check
871 @type prereq: boolean
872 @param prereq: whether to throw a prerequisite or an execute error
873 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
874 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
877 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
878 result.Raise("Failure checking secondary ip on node %s" % node,
879 prereq=prereq, ecode=errors.ECODE_ENVIRON)
880 if not result.payload:
881 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
882 " please fix and re-run this command" % secondary_ip)
if prereq:
  raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
else:
  raise errors.OpExecError(msg)
889 def _GetClusterDomainSecret():
890 """Reads the cluster domain secret.
893 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
897 def _CheckInstanceDown(lu, instance, reason):
898 """Ensure that an instance is not running."""
899 if instance.admin_up:
900 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
901 (instance.name, reason), errors.ECODE_STATE)
903 pnode = instance.primary_node
904 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
905 ins_l.Raise("Can't contact node %s for instance information" % pnode,
906 prereq=True, ecode=errors.ECODE_ENVIRON)
908 if instance.name in ins_l.payload:
909 raise errors.OpPrereqError("Instance %s is running, %s" %
910 (instance.name, reason), errors.ECODE_STATE)
913 def _ExpandItemName(fn, name, kind):
914 """Expand an item name.
916 @param fn: the function to use for expansion
917 @param name: requested item name
918 @param kind: text description ('Node' or 'Instance')
919 @return: the resolved (full) name
920 @raise errors.OpPrereqError: if the item is not found
full_name = fn(name)
if full_name is None:
raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                           errors.ECODE_NOENT)
return full_name
930 def _ExpandNodeName(cfg, name):
931 """Wrapper over L{_ExpandItemName} for nodes."""
932 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
935 def _ExpandInstanceName(cfg, name):
936 """Wrapper over L{_ExpandItemName} for instance."""
937 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
940 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
941 memory, vcpus, nics, disk_template, disks,
942 bep, hvp, hypervisor_name, tags):
943 """Builds instance related env variables for hooks
945 This builds the hook environment from individual variables.
948 @param name: the name of the instance
949 @type primary_node: string
950 @param primary_node: the name of the instance's primary node
951 @type secondary_nodes: list
952 @param secondary_nodes: list of secondary nodes as strings
953 @type os_type: string
954 @param os_type: the name of the instance's OS
955 @type status: boolean
956 @param status: the should_run status of the instance
958 @param memory: the memory size of the instance
960 @param vcpus: the count of VCPUs the instance has
962 @param nics: list of tuples (ip, mac, mode, link) representing
963 the NICs the instance has
964 @type disk_template: string
965 @param disk_template: the disk template of the instance
967 @param disks: the list of (size, mode) pairs
969 @param bep: the backend parameters for the instance
971 @param hvp: the hypervisor parameters for the instance
972 @type hypervisor_name: string
973 @param hypervisor_name: the hypervisor for the instance
975 @param tags: list of instance tags as strings
977 @return: the hook environment for this instance
986 "INSTANCE_NAME": name,
987 "INSTANCE_PRIMARY": primary_node,
988 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
989 "INSTANCE_OS_TYPE": os_type,
990 "INSTANCE_STATUS": str_status,
991 "INSTANCE_MEMORY": memory,
992 "INSTANCE_VCPUS": vcpus,
993 "INSTANCE_DISK_TEMPLATE": disk_template,
994 "INSTANCE_HYPERVISOR": hypervisor_name,
998 nic_count = len(nics)
999 for idx, (ip, mac, mode, link) in enumerate(nics):
1002 env["INSTANCE_NIC%d_IP" % idx] = ip
1003 env["INSTANCE_NIC%d_MAC" % idx] = mac
1004 env["INSTANCE_NIC%d_MODE" % idx] = mode
1005 env["INSTANCE_NIC%d_LINK" % idx] = link
1006 if mode == constants.NIC_MODE_BRIDGED:
1007 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
1011 env["INSTANCE_NIC_COUNT"] = nic_count
1014 disk_count = len(disks)
1015 for idx, (size, mode) in enumerate(disks):
1016 env["INSTANCE_DISK%d_SIZE" % idx] = size
1017 env["INSTANCE_DISK%d_MODE" % idx] = mode
1021 env["INSTANCE_DISK_COUNT"] = disk_count
1026 env["INSTANCE_TAGS"] = " ".join(tags)
1028 for source, kind in [(bep, "BE"), (hvp, "HV")]:
1029 for key, value in source.items():
1030 env["INSTANCE_%s_%s" % (kind, key)] = value
1035 def _NICListToTuple(lu, nics):
1036 """Build a list of nic information tuples.
1038 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
1039 value in LUInstanceQueryData.
1041 @type lu: L{LogicalUnit}
1042 @param lu: the logical unit on whose behalf we execute
1043 @type nics: list of L{objects.NIC}
1044 @param nics: list of nics to convert to hooks tuples
1048 cluster = lu.cfg.GetClusterInfo()
1052 filled_params = cluster.SimpleFillNIC(nic.nicparams)
1053 mode = filled_params[constants.NIC_MODE]
1054 link = filled_params[constants.NIC_LINK]
1055 hooks_nics.append((ip, mac, mode, link))
1059 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
1060 """Builds instance related env variables for hooks from an object.
1062 @type lu: L{LogicalUnit}
1063 @param lu: the logical unit on whose behalf we execute
1064 @type instance: L{objects.Instance}
1065 @param instance: the instance for which we should build the
1067 @type override: dict
1068 @param override: dictionary with key/values that will override
1071 @return: the hook environment dictionary
1074 cluster = lu.cfg.GetClusterInfo()
1075 bep = cluster.FillBE(instance)
1076 hvp = cluster.FillHV(instance)
1078 "name": instance.name,
1079 "primary_node": instance.primary_node,
1080 "secondary_nodes": instance.secondary_nodes,
1081 "os_type": instance.os,
1082 "status": instance.admin_up,
1083 "memory": bep[constants.BE_MEMORY],
1084 "vcpus": bep[constants.BE_VCPUS],
1085 "nics": _NICListToTuple(lu, instance.nics),
1086 "disk_template": instance.disk_template,
1087 "disks": [(disk.size, disk.mode) for disk in instance.disks],
1090 "hypervisor_name": instance.hypervisor,
1091 "tags": instance.tags,
1094 args.update(override)
1095 return _BuildInstanceHookEnv(**args) # pylint: disable=W0142
1098 def _AdjustCandidatePool(lu, exceptions):
1099 """Adjust the candidate pool after node operations.
1102 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1104 lu.LogInfo("Promoted nodes to master candidate role: %s",
1105 utils.CommaJoin(node.name for node in mod_list))
1106 for name in mod_list:
1107 lu.context.ReaddNode(name)
1108 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1110 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1114 def _DecideSelfPromotion(lu, exceptions=None):
1115 """Decide whether I should promote myself as a master candidate.
1118 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1119 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1120 # the new node will increase mc_max with one, so:
1121 mc_should = min(mc_should + 1, cp_size)
1122 return mc_now < mc_should
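# Example (sketch): with candidate_pool_size = 10, mc_now = 3 and
# GetMasterCandidateStats suggesting 3 candidates, mc_should becomes
# min(3 + 1, 10) = 4, so mc_now < mc_should and the node promotes itself.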
1125 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1126 """Check that the brigdes needed by a list of nics exist.
1129 cluster = lu.cfg.GetClusterInfo()
1130 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1131 brlist = [params[constants.NIC_LINK] for params in paramslist
1132 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1134 result = lu.rpc.call_bridges_exist(target_node, brlist)
1135 result.Raise("Error checking bridges on destination node '%s'" %
1136 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1139 def _CheckInstanceBridgesExist(lu, instance, node=None):
1140 """Check that the brigdes needed by an instance exist.
1144 node = instance.primary_node
1145 _CheckNicsBridgesExist(lu, instance.nics, node)
1148 def _CheckOSVariant(os_obj, name):
1149 """Check whether an OS name conforms to the os variants specification.
1151 @type os_obj: L{objects.OS}
1152 @param os_obj: OS object to check
1154 @param name: OS name passed by the user, to check for validity
1157 variant = objects.OS.GetVariant(name)
1158 if not os_obj.supported_variants:
1160 raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
1161 " passed)" % (os_obj.name, variant),
1165 raise errors.OpPrereqError("OS name must include a variant",
1168 if variant not in os_obj.supported_variants:
1169 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1172 def _GetNodeInstancesInner(cfg, fn):
1173 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1176 def _GetNodeInstances(cfg, node_name):
1177 """Returns a list of all primary and secondary instances on a node.
1181 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1184 def _GetNodePrimaryInstances(cfg, node_name):
1185 """Returns primary instances on a node.
1188 return _GetNodeInstancesInner(cfg,
1189 lambda inst: node_name == inst.primary_node)
1192 def _GetNodeSecondaryInstances(cfg, node_name):
1193 """Returns secondary instances on a node.
1196 return _GetNodeInstancesInner(cfg,
1197 lambda inst: node_name in inst.secondary_nodes)
1200 def _GetStorageTypeArgs(cfg, storage_type):
1201 """Returns the arguments for a storage type.
1204 # Special case for file storage
1205 if storage_type == constants.ST_FILE:
1206 # storage.FileStorage wants a list of storage directories
1207 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1212 def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq):
1215 for dev in instance.disks:
1216 cfg.SetDiskID(dev, node_name)
1218 result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks)
1219 result.Raise("Failed to get disk status from node %s" % node_name,
1220 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1222 for idx, bdev_status in enumerate(result.payload):
1223 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1229 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1230 """Check the sanity of iallocator and node arguments and use the
1231 cluster-wide iallocator if appropriate.
1233 Check that at most one of (iallocator, node) is specified. If none is
1234 specified, then the LU's opcode's iallocator slot is filled with the
1235 cluster-wide default iallocator.
1237 @type iallocator_slot: string
1238 @param iallocator_slot: the name of the opcode iallocator slot
1239 @type node_slot: string
1240 @param node_slot: the name of the opcode target node slot
1243 node = getattr(lu.op, node_slot, None)
1244 iallocator = getattr(lu.op, iallocator_slot, None)
1246 if node is not None and iallocator is not None:
1247 raise errors.OpPrereqError("Do not specify both, iallocator and node",
1249 elif node is None and iallocator is None:
1250 default_iallocator = lu.cfg.GetDefaultIAllocator()
1251 if default_iallocator:
1252 setattr(lu.op, iallocator_slot, default_iallocator)
1254 raise errors.OpPrereqError("No iallocator or node given and no"
1255 " cluster-wide default iallocator found;"
1256 " please specify either an iallocator or a"
1257 " node, or set a cluster-wide default"
1261 def _GetDefaultIAllocator(cfg, iallocator):
1262 """Decides on which iallocator to use.
1264 @type cfg: L{config.ConfigWriter}
1265 @param cfg: Cluster configuration object
1266 @type iallocator: string or None
1267 @param iallocator: Iallocator specified in opcode
1269 @return: Iallocator name
1273 # Use default iallocator
1274 iallocator = cfg.GetDefaultIAllocator()
1277 raise errors.OpPrereqError("No iallocator was specified, neither in the"
1278 " opcode nor as a cluster-wide default",
1284 class LUClusterPostInit(LogicalUnit):
1285 """Logical unit for running hooks after cluster initialization.
1288 HPATH = "cluster-init"
1289 HTYPE = constants.HTYPE_CLUSTER
1291 def BuildHooksEnv(self):
1296 "OP_TARGET": self.cfg.GetClusterName(),
1299 def BuildHooksNodes(self):
1300 """Build hooks nodes.
1303 return ([], [self.cfg.GetMasterNode()])
1305 def Exec(self, feedback_fn):
1312 class LUClusterDestroy(LogicalUnit):
1313 """Logical unit for destroying the cluster.
1316 HPATH = "cluster-destroy"
1317 HTYPE = constants.HTYPE_CLUSTER
1319 def BuildHooksEnv(self):
1324 "OP_TARGET": self.cfg.GetClusterName(),
1327 def BuildHooksNodes(self):
1328 """Build hooks nodes.
1333 def CheckPrereq(self):
1334 """Check prerequisites.
1336 This checks whether the cluster is empty.
1338 Any errors are signaled by raising errors.OpPrereqError.
1341 master = self.cfg.GetMasterNode()
1343 nodelist = self.cfg.GetNodeList()
1344 if len(nodelist) != 1 or nodelist[0] != master:
1345 raise errors.OpPrereqError("There are still %d node(s) in"
1346 " this cluster." % (len(nodelist) - 1),
1348 instancelist = self.cfg.GetInstanceList()
if instancelist:
  raise errors.OpPrereqError("There are still %d instance(s) in"
1351 " this cluster." % len(instancelist),
1354 def Exec(self, feedback_fn):
1355 """Destroys the cluster.
1358 (master, ip, dev, netmask, family) = self.cfg.GetMasterNetworkParameters()
1360 # Run post hooks on master node before it's removed
1361 _RunPostHook(self, master)
1363 result = self.rpc.call_node_deactivate_master_ip(master, ip, netmask, dev,
1365 result.Raise("Could not disable the master role")
1370 def _VerifyCertificate(filename):
1371 """Verifies a certificate for L{LUClusterVerifyConfig}.
1373 @type filename: string
1374 @param filename: Path to PEM file
try:
  cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                         utils.ReadFile(filename))
1380 except Exception, err: # pylint: disable=W0703
1381 return (LUClusterVerifyConfig.ETYPE_ERROR,
1382 "Failed to load X509 certificate %s: %s" % (filename, err))
1385 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1386 constants.SSL_CERT_EXPIRATION_ERROR)
1389 fnamemsg = "While verifying %s: %s" % (filename, msg)
1394 return (None, fnamemsg)
1395 elif errcode == utils.CERT_WARNING:
1396 return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
1397 elif errcode == utils.CERT_ERROR:
1398 return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
1400 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1403 def _GetAllHypervisorParameters(cluster, instances):
1404 """Compute the set of all hypervisor parameters.
1406 @type cluster: L{objects.Cluster}
1407 @param cluster: the cluster object
1408 @param instances: list of L{objects.Instance}
1409 @param instances: additional instances from which to obtain parameters
1410 @rtype: list of (origin, hypervisor, parameters)
1411 @return: a list with all parameters found, indicating the hypervisor they
1412 apply to, and the origin (can be "cluster", "os X", or "instance Y")
1417 for hv_name in cluster.enabled_hypervisors:
1418 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
1420 for os_name, os_hvp in cluster.os_hvp.items():
1421 for hv_name, hv_params in os_hvp.items():
1423 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
1424 hvp_data.append(("os %s" % os_name, hv_name, full_params))
1426 # TODO: collapse identical parameter values in a single one
1427 for instance in instances:
1428 if instance.hvparams:
1429 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
1430 cluster.FillHV(instance)))
1435 class _VerifyErrors(object):
1436 """Mix-in for cluster/group verify LUs.
1438 It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
1439 self.op and self._feedback_fn to be available.)
1443 ETYPE_FIELD = "code"
1444 ETYPE_ERROR = "ERROR"
1445 ETYPE_WARNING = "WARNING"
1447 def _Error(self, ecode, item, msg, *args, **kwargs):
1448 """Format an error message.
1450 Based on the opcode's error_codes parameter, either format a
1451 parseable error code, or a simpler error string.
1453 This must be called only from Exec and functions called from Exec.
1456 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1457 itype, etxt, _ = ecode
1458 # first complete the msg
1461 # then format the whole message
1462 if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
1463 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
else:
  msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1470 # and finally report it via the feedback_fn
1471 self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
1473 def _ErrorIf(self, cond, ecode, *args, **kwargs):
1474 """Log an error message if the passed condition is True.
1478 or self.op.debug_simulate_errors) # pylint: disable=E1101
1480 # If the error code is in the list of ignored errors, demote the error to a
1482 (_, etxt, _) = ecode
1483 if etxt in self.op.ignore_errors: # pylint: disable=E1101
1484 kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING
1487 self._Error(ecode, *args, **kwargs)
1489 # do not mark the operation as failed for WARN cases only
1490 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1491 self.bad = self.bad or cond
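# Usage sketch (mirroring the verify code below): LUs alias the bound method
# and pass a condition, an error code, the affected item and a message:
#   _ErrorIf = self._ErrorIf
#   _ErrorIf(test, constants.CV_ENODERPC, node,
#            "unable to verify node: no data returned")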
1494 class LUClusterVerify(NoHooksLU):
1495 """Submits all jobs necessary to verify the cluster.
1500 def ExpandNames(self):
1501 self.needed_locks = {}
1503 def Exec(self, feedback_fn):
1506 if self.op.group_name:
1507 groups = [self.op.group_name]
1508 depends_fn = lambda: None
1510 groups = self.cfg.GetNodeGroupList()
1512 # Verify global configuration
1514 opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
1517 # Always depend on global verification
1518 depends_fn = lambda: [(-len(jobs), [])]
1520 jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
1521 ignore_errors=self.op.ignore_errors,
1522 depends=depends_fn())]
1523 for group in groups)
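# Note: the (-len(jobs), []) entry above is a relative job dependency; it
# makes each per-group verification job depend on the global configuration
# verification submitted earlier in this same job list.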
1525 # Fix up all parameters
1526 for op in itertools.chain(*jobs): # pylint: disable=W0142
1527 op.debug_simulate_errors = self.op.debug_simulate_errors
1528 op.verbose = self.op.verbose
1529 op.error_codes = self.op.error_codes
try:
  op.skip_checks = self.op.skip_checks
1532 except AttributeError:
1533 assert not isinstance(op, opcodes.OpClusterVerifyGroup)
1535 return ResultWithJobs(jobs)
1538 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
1539 """Verifies the cluster config.
1544 def _VerifyHVP(self, hvp_data):
1545 """Verifies locally the syntax of the hypervisor parameters.
1548 for item, hv_name, hv_params in hvp_data:
1549 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
1552 hv_class = hypervisor.GetHypervisor(hv_name)
1553 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1554 hv_class.CheckParameterSyntax(hv_params)
1555 except errors.GenericError, err:
1556 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
1558 def ExpandNames(self):
1559 # Information can be safely retrieved as the BGL is acquired in exclusive
1561 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
1562 self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
1563 self.all_node_info = self.cfg.GetAllNodesInfo()
1564 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1565 self.needed_locks = {}
1567 def Exec(self, feedback_fn):
1568 """Verify integrity of cluster, performing various test on nodes.
1572 self._feedback_fn = feedback_fn
1574 feedback_fn("* Verifying cluster config")
1576 for msg in self.cfg.VerifyConfig():
1577 self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
1579 feedback_fn("* Verifying cluster certificate files")
1581 for cert_filename in constants.ALL_CERT_FILES:
1582 (errcode, msg) = _VerifyCertificate(cert_filename)
1583 self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)
1585 feedback_fn("* Verifying hypervisor parameters")
1587 self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
1588 self.all_inst_info.values()))
1590 feedback_fn("* Verifying all nodes belong to an existing group")
1592 # We do this verification here because, should this bogus circumstance
1593 # occur, it would never be caught by VerifyGroup, which only acts on
1594 # nodes/instances reachable from existing node groups.
1596 dangling_nodes = set(node.name for node in self.all_node_info.values()
1597 if node.group not in self.all_group_info)
1599 dangling_instances = {}
1600 no_node_instances = []
1602 for inst in self.all_inst_info.values():
1603 if inst.primary_node in dangling_nodes:
1604 dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
1605 elif inst.primary_node not in self.all_node_info:
1606 no_node_instances.append(inst.name)
1611 utils.CommaJoin(dangling_instances.get(node.name,
1613 for node in dangling_nodes]
1615 self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
1617 "the following nodes (and their instances) belong to a non"
1618 " existing group: %s", utils.CommaJoin(pretty_dangling))
1620 self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
1622 "the following instances have a non-existing primary-node:"
1623 " %s", utils.CommaJoin(no_node_instances))
1628 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
1629 """Verifies the status of a node group.
1632 HPATH = "cluster-verify"
1633 HTYPE = constants.HTYPE_CLUSTER
1636 _HOOKS_INDENT_RE = re.compile("^", re.M)
1638 class NodeImage(object):
1639 """A class representing the logical and physical status of a node.
1642 @ivar name: the node name to which this object refers
1643 @ivar volumes: a structure as returned from
1644 L{ganeti.backend.GetVolumeList} (runtime)
1645 @ivar instances: a list of running instances (runtime)
1646 @ivar pinst: list of configured primary instances (config)
1647 @ivar sinst: list of configured secondary instances (config)
1648 @ivar sbp: dictionary of {primary-node: list of instances} for all
1649 instances for which this node is secondary (config)
1650 @ivar mfree: free memory, as reported by hypervisor (runtime)
1651 @ivar dfree: free disk, as reported by the node (runtime)
1652 @ivar offline: the offline status (config)
1653 @type rpc_fail: boolean
@ivar rpc_fail: whether the RPC verify call was successful (overall,
1655 not whether the individual keys were correct) (runtime)
1656 @type lvm_fail: boolean
1657 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1658 @type hyp_fail: boolean
1659 @ivar hyp_fail: whether the RPC call didn't return the instance list
1660 @type ghost: boolean
1661 @ivar ghost: whether this is a known node or not (config)
1662 @type os_fail: boolean
1663 @ivar os_fail: whether the RPC call didn't return valid OS data
1665 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1666 @type vm_capable: boolean
1667 @ivar vm_capable: whether the node can host instances
1670 def __init__(self, offline=False, name=None, vm_capable=True):
1679 self.offline = offline
1680 self.vm_capable = vm_capable
1681 self.rpc_fail = False
1682 self.lvm_fail = False
1683 self.hyp_fail = False
1685 self.os_fail = False
1688 def ExpandNames(self):
1689 # This raises errors.OpPrereqError on its own:
1690 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
1692 # Get instances in node group; this is unsafe and needs verification later
1693 inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
1695 self.needed_locks = {
1696 locking.LEVEL_INSTANCE: inst_names,
1697 locking.LEVEL_NODEGROUP: [self.group_uuid],
1698 locking.LEVEL_NODE: [],
1701 self.share_locks = _ShareAll()
1703 def DeclareLocks(self, level):
1704 if level == locking.LEVEL_NODE:
1705 # Get members of node group; this is unsafe and needs verification later
1706 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
1708 all_inst_info = self.cfg.GetAllInstancesInfo()
1710 # In Exec(), we warn about mirrored instances that have primary and
1711 # secondary living in separate node groups. To fully verify that
1712 # volumes for these instances are healthy, we will need to do an
1713 # extra call to their secondaries. We ensure here those nodes will
1715 for inst in self.owned_locks(locking.LEVEL_INSTANCE):
1716 # Important: access only the instances whose lock is owned
1717 if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
1718 nodes.update(all_inst_info[inst].secondary_nodes)
1720 self.needed_locks[locking.LEVEL_NODE] = nodes
1722 def CheckPrereq(self):
1723 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
1724 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
1726 group_nodes = set(self.group_info.members)
1727 group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
1730 group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1732 unlocked_instances = \
1733 group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))
1736 raise errors.OpPrereqError("Missing lock for nodes: %s" %
1737 utils.CommaJoin(unlocked_nodes))
1739 if unlocked_instances:
1740 raise errors.OpPrereqError("Missing lock for instances: %s" %
1741 utils.CommaJoin(unlocked_instances))
1743 self.all_node_info = self.cfg.GetAllNodesInfo()
1744 self.all_inst_info = self.cfg.GetAllInstancesInfo()
1746 self.my_node_names = utils.NiceSort(group_nodes)
1747 self.my_inst_names = utils.NiceSort(group_instances)
1749 self.my_node_info = dict((name, self.all_node_info[name])
1750 for name in self.my_node_names)
1752 self.my_inst_info = dict((name, self.all_inst_info[name])
1753 for name in self.my_inst_names)
1755 # We detect here the nodes that will need the extra RPC calls for verifying
1756 # split LV volumes; they should be locked.
1757 extra_lv_nodes = set()
1759 for inst in self.my_inst_info.values():
1760 if inst.disk_template in constants.DTS_INT_MIRROR:
1761 group = self.my_node_info[inst.primary_node].group
1762 for nname in inst.secondary_nodes:
1763 if self.all_node_info[nname].group != group:
1764 extra_lv_nodes.add(nname)
1766 unlocked_lv_nodes = \
1767 extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
1769 if unlocked_lv_nodes:
1770 raise errors.OpPrereqError("these nodes could be locked: %s" %
1771 utils.CommaJoin(unlocked_lv_nodes))
1772 self.extra_lv_nodes = list(extra_lv_nodes)
1774 def _VerifyNode(self, ninfo, nresult):
1775 """Perform some basic validation on data returned from a node.
1777 - check the result data structure is well formed and has all the
1779 - check ganeti version
1781 @type ninfo: L{objects.Node}
1782 @param ninfo: the node to check
1783 @param nresult: the results from the node
1785 @return: whether overall this call was successful (and we can expect
reasonable values in the response)
1790 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1792 # main result, nresult should be a non-empty dict
1793 test = not nresult or not isinstance(nresult, dict)
1794 _ErrorIf(test, constants.CV_ENODERPC, node,
1795 "unable to verify node: no data returned")
1799 # compares ganeti version
1800 local_version = constants.PROTOCOL_VERSION
1801 remote_version = nresult.get("version", None)
1802 test = not (remote_version and
1803 isinstance(remote_version, (list, tuple)) and
1804 len(remote_version) == 2)
1805 _ErrorIf(test, constants.CV_ENODERPC, node,
1806 "connection to node returned invalid data")
1810 test = local_version != remote_version[0]
1811 _ErrorIf(test, constants.CV_ENODEVERSION, node,
1812 "incompatible protocol versions: master %s,"
1813 " node %s", local_version, remote_version[0])
1817 # node seems compatible, we can actually try to look into its results
1819 # full package version
1820 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1821 constants.CV_ENODEVERSION, node,
1822 "software version mismatch: master %s, node %s",
1823 constants.RELEASE_VERSION, remote_version[1],
1824 code=self.ETYPE_WARNING)
1826 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1827 if ninfo.vm_capable and isinstance(hyp_result, dict):
1828 for hv_name, hv_result in hyp_result.iteritems():
1829 test = hv_result is not None
1830 _ErrorIf(test, constants.CV_ENODEHV, node,
1831 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1833 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1834 if ninfo.vm_capable and isinstance(hvp_result, list):
1835 for item, hv_name, hv_result in hvp_result:
1836 _ErrorIf(True, constants.CV_ENODEHV, node,
1837 "hypervisor %s parameter verify failure (source %s): %s",
1838 hv_name, item, hv_result)
1840 test = nresult.get(constants.NV_NODESETUP,
1841 ["Missing NODESETUP results"])
1842 _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
1847 def _VerifyNodeTime(self, ninfo, nresult,
1848 nvinfo_starttime, nvinfo_endtime):
1849 """Check the node time.
1851 @type ninfo: L{objects.Node}
1852 @param ninfo: the node to check
1853 @param nresult: the remote results for the node
1854 @param nvinfo_starttime: the start time of the RPC call
1855 @param nvinfo_endtime: the end time of the RPC call
1859 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1861 ntime = nresult.get(constants.NV_TIME, None)
1863 ntime_merged = utils.MergeTime(ntime)
1864 except (ValueError, TypeError):
1865 _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
1868 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1869 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1870 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
else:
  ntime_diff = None
1875 _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
1876 "Node time diverges by at least %s from master node time",
1879 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1880 """Check the node LVM results.
1882 @type ninfo: L{objects.Node}
1883 @param ninfo: the node to check
1884 @param nresult: the remote results for the node
1885 @param vg_name: the configured VG name
1892 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1894 # checks vg existence and size > 20G
1895 vglist = nresult.get(constants.NV_VGLIST, None)
test = vglist is None
_ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
1899 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1900 constants.MIN_VG_SIZE)
1901 _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)
1904 pvlist = nresult.get(constants.NV_PVLIST, None)
1905 test = pvlist is None
1906 _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
1908 # check that ':' is not present in PV names, since it's a
1909 # special character for lvcreate (denotes the range of PEs to
1911 for _, pvname, owner_vg in pvlist:
1912 test = ":" in pvname
1913 _ErrorIf(test, constants.CV_ENODELVM, node,
1914 "Invalid character ':' in PV '%s' of VG '%s'",
1917 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1918 """Check the node bridges.
1920 @type ninfo: L{objects.Node}
1921 @param ninfo: the node to check
1922 @param nresult: the remote results for the node
1923 @param bridges: the expected list of bridges
1930 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1932 missing = nresult.get(constants.NV_BRIDGES, None)
1933 test = not isinstance(missing, list)
1934 _ErrorIf(test, constants.CV_ENODENET, node,
1935 "did not return valid bridge information")
1937 _ErrorIf(bool(missing), constants.CV_ENODENET, node,
1938 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
1940 def _VerifyNodeNetwork(self, ninfo, nresult):
1941 """Check the node network connectivity results.
1943 @type ninfo: L{objects.Node}
1944 @param ninfo: the node to check
1945 @param nresult: the remote results for the node
1949 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1951 test = constants.NV_NODELIST not in nresult
1952 _ErrorIf(test, constants.CV_ENODESSH, node,
1953 "node hasn't returned node ssh connectivity data")
1955 if nresult[constants.NV_NODELIST]:
1956 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1957 _ErrorIf(True, constants.CV_ENODESSH, node,
1958 "ssh communication with node '%s': %s", a_node, a_msg)
1960 test = constants.NV_NODENETTEST not in nresult
1961 _ErrorIf(test, constants.CV_ENODENET, node,
1962 "node hasn't returned node tcp connectivity data")
1964 if nresult[constants.NV_NODENETTEST]:
1965 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1967 _ErrorIf(True, constants.CV_ENODENET, node,
1968 "tcp communication with node '%s': %s",
1969 anode, nresult[constants.NV_NODENETTEST][anode])
1971 test = constants.NV_MASTERIP not in nresult
1972 _ErrorIf(test, constants.CV_ENODENET, node,
1973 "node hasn't returned node master IP reachability data")
1975 if not nresult[constants.NV_MASTERIP]:
1976 if node == self.master_node:
1977 msg = "the master node cannot reach the master IP (not configured?)"
1979 msg = "cannot reach the master IP"
1980 _ErrorIf(True, constants.CV_ENODENET, node, msg)
1982 def _VerifyInstance(self, instance, instanceconfig, node_image,
1984 """Verify an instance.
1986 This function checks to see if the required block devices are
1987 available on the instance's node.
1990 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1991 node_current = instanceconfig.primary_node
1993 node_vol_should = {}
1994 instanceconfig.MapLVsByNode(node_vol_should)
1996 for node in node_vol_should:
1997 n_img = node_image[node]
1998 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1999 # ignore missing volumes on offline or broken nodes
2001 for volume in node_vol_should[node]:
2002 test = volume not in n_img.volumes
2003 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
2004 "volume %s missing on node %s", volume, node)
2006 if instanceconfig.admin_up:
2007 pri_img = node_image[node_current]
2008 test = instance not in pri_img.instances and not pri_img.offline
2009 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
2010 "instance not running on its primary node %s",
2013 diskdata = [(nname, success, status, idx)
2014 for (nname, disks) in diskstatus.items()
2015 for idx, (success, status) in enumerate(disks)]
2017 for nname, success, bdev_status, idx in diskdata:
2018 # the 'ghost node' construction in Exec() ensures that we have a
2020 snode = node_image[nname]
2021 bad_snode = snode.ghost or snode.offline
2022 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2023 constants.CV_EINSTANCEFAULTYDISK, instance,
2024 "couldn't retrieve status for disk/%s on %s: %s",
2025 idx, nname, bdev_status)
2026 _ErrorIf((instanceconfig.admin_up and success and
2027 bdev_status.ldisk_status == constants.LDS_FAULTY),
2028 constants.CV_EINSTANCEFAULTYDISK, instance,
2029 "disk/%s on %s is faulty", idx, nname)
2031 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2032 """Verify if there are any unknown volumes in the cluster.
2034 The .os, .swap and backup volumes are ignored. All other volumes are
2035 reported as unknown.
2037 @type reserved: L{ganeti.utils.FieldSet}
2038 @param reserved: a FieldSet of reserved volume names
2041 for node, n_img in node_image.items():
2042 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2043 # skip non-healthy nodes
2045 for volume in n_img.volumes:
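# A volume is considered orphaned when no instance expects it on this node
# and its name does not match any reserved-LV pattern.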
2046 test = ((node not in node_vol_should or
2047 volume not in node_vol_should[node]) and
2048 not reserved.Matches(volume))
2049 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2050 "volume %s is unknown", volume)
2052 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2053 """Verify N+1 Memory Resilience.
2055 Check that if one single node dies we can still start all the
2056 instances it was primary for.
2059 cluster_info = self.cfg.GetClusterInfo()
2060 for node, n_img in node_image.items():
2061 # This code checks that every node which is now listed as
2062 # secondary has enough memory to host all instances it is
2063 # supposed to should a single other node in the cluster fail.
2064 # FIXME: not ready for failover to an arbitrary node
2065 # FIXME: does not support file-backed instances
2066 # WARNING: we currently take into account down instances as well
2067 # as up ones, considering that even if they're down someone
2068 # might want to start them even in the event of a node failure.
2070 # we're skipping offline nodes from the N+1 warning, since
2071 # most likely we don't have good memory information from them;
2072 # we already list instances living on such nodes, and that's
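# n_img.sbp maps a primary node to the instances that have it as primary and
# this node as secondary; this node must have enough free memory to absorb a
# failover of each such primary's auto-balanced instances separately.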
2075 for prinode, instances in n_img.sbp.items():
2077 for instance in instances:
2078 bep = cluster_info.FillBE(instance_cfg[instance])
2079 if bep[constants.BE_AUTO_BALANCE]:
2080 needed_mem += bep[constants.BE_MEMORY]
2081 test = n_img.mfree < needed_mem
2082 self._ErrorIf(test, constants.CV_ENODEN1, node,
2083 "not enough memory to accomodate instance failovers"
2084 " should node %s fail (%dMiB needed, %dMiB available)",
2085 prinode, needed_mem, n_img.mfree)
2088 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2089 (files_all, files_opt, files_mc, files_vm)):
2090 """Verifies file checksums collected from all nodes.
2092 @param errorif: Callback for reporting errors
2093 @param nodeinfo: List of L{objects.Node} objects
2094 @param master_node: Name of master node
2095 @param all_nvinfo: RPC results
2098 # Define functions determining which nodes to consider for a file
2101 (files_mc, lambda node: (node.master_candidate or
2102 node.name == master_node)),
2103 (files_vm, lambda node: node.vm_capable),
2106 # Build mapping from filename to list of nodes which should have the file
2108 for (files, fn) in files2nodefn:
2110 filenodes = nodeinfo
2112 filenodes = filter(fn, nodeinfo)
2113 nodefiles.update((filename,
2114 frozenset(map(operator.attrgetter("name"), filenodes)))
2115 for filename in files)
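# nodefiles now maps each expected filename to the frozenset of node names
# that should have it.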
2117 assert set(nodefiles) == (files_all | files_mc | files_vm)
2119 fileinfo = dict((filename, {}) for filename in nodefiles)
2120 ignore_nodes = set()
2122 for node in nodeinfo:
2124 ignore_nodes.add(node.name)
2127 nresult = all_nvinfo[node.name]
2129 if nresult.fail_msg or not nresult.payload:
2132 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2134 test = not (node_files and isinstance(node_files, dict))
2135 errorif(test, constants.CV_ENODEFILECHECK, node.name,
2136 "Node did not return file checksum data")
2138 ignore_nodes.add(node.name)
2141 # Build per-checksum mapping from filename to nodes having it
2142 for (filename, checksum) in node_files.items():
2143 assert filename in nodefiles
2144 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2146 for (filename, checksums) in fileinfo.items():
2147 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2149 # Nodes having the file
2150 with_file = frozenset(node_name
2151 for nodes in fileinfo[filename].values()
2152 for node_name in nodes) - ignore_nodes
2154 expected_nodes = nodefiles[filename] - ignore_nodes
2156 # Nodes missing file
2157 missing_file = expected_nodes - with_file
2159 if filename in files_opt:
2161 errorif(missing_file and missing_file != expected_nodes,
2162 constants.CV_ECLUSTERFILECHECK, None,
2163 "File %s is optional, but it must exist on all or no"
2164 " nodes (not found on %s)",
2165 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2167 errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2168 "File %s is missing from node(s) %s", filename,
2169 utils.CommaJoin(utils.NiceSort(missing_file)))
2171 # Warn if a node has a file it shouldn't
2172 unexpected = with_file - expected_nodes
2174 constants.CV_ECLUSTERFILECHECK, None,
2175 "File %s should not exist on node(s) %s",
2176 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2178 # See if there are multiple versions of the file
2179 test = len(checksums) > 1
2181 variants = ["variant %s on %s" %
2182 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2183 for (idx, (checksum, nodes)) in
2184 enumerate(sorted(checksums.items()))]
2188 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2189 "File %s found with %s different checksums (%s)",
2190 filename, len(checksums), "; ".join(variants))
2192 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2194 """Verifies and the node DRBD status.
2196 @type ninfo: L{objects.Node}
2197 @param ninfo: the node to check
2198 @param nresult: the remote results for the node
2199 @param instanceinfo: the dict of instances
2200 @param drbd_helper: the configured DRBD usermode helper
2201 @param drbd_map: the DRBD map as returned by
2202 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2206 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2209 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2210 test = (helper_result is None)
2211 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2212 "no drbd usermode helper returned")
2214 status, payload = helper_result
2216 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2217 "drbd usermode helper check unsuccessful: %s", payload)
2218 test = status and (payload != drbd_helper)
2219 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2220 "wrong drbd usermode helper: %s", payload)
2222 # compute the DRBD minors
2224 for minor, instance in drbd_map[node].items():
2225 test = instance not in instanceinfo
2226 _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2227 "ghost instance '%s' in temporary DRBD map", instance)
2228 # ghost instance should not be running, but otherwise we
2229 # don't give double warnings (both ghost instance and
2230 # unallocated minor in use)
2232 node_drbd[minor] = (instance, False)
2234 instance = instanceinfo[instance]
2235 node_drbd[minor] = (instance.name, instance.admin_up)
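# node_drbd now maps each expected DRBD minor to (instance name, must_exist);
# must_exist is False for ghost instances so they are not reported twice.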
2237 # and now check them
2238 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2239 test = not isinstance(used_minors, (tuple, list))
2240 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2241 "cannot parse drbd status file: %s", str(used_minors))
2243 # we cannot check drbd status
2246 for minor, (iname, must_exist) in node_drbd.items():
2247 test = minor not in used_minors and must_exist
2248 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2249 "drbd minor %d of instance %s is not active", minor, iname)
2250 for minor in used_minors:
2251 test = minor not in node_drbd
2252 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2253 "unallocated drbd minor %d is in use", minor)
2255 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2256 """Builds the node OS structures.
2258 @type ninfo: L{objects.Node}
2259 @param ninfo: the node to check
2260 @param nresult: the remote results for the node
2261 @param nimg: the node image object
2265 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2267 remote_os = nresult.get(constants.NV_OSLIST, None)
2268 test = (not isinstance(remote_os, list) or
2269 not compat.all(isinstance(v, list) and len(v) == 7
2270 for v in remote_os))
2272 _ErrorIf(test, constants.CV_ENODEOS, node,
2273 "node hasn't returned valid OS data")
2282 for (name, os_path, status, diagnose,
2283 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2285 if name not in os_dict:
2288 # parameters is a list of lists instead of list of tuples due to
2289 # JSON lacking a real tuple type, fix it:
2290 parameters = [tuple(v) for v in parameters]
2291 os_dict[name].append((os_path, status, diagnose,
2292 set(variants), set(parameters), set(api_ver)))
2294 nimg.oslist = os_dict
2296 def _VerifyNodeOS(self, ninfo, nimg, base):
2297 """Verifies the node OS list.
2299 @type ninfo: L{objects.Node}
2300 @param ninfo: the node to check
2301 @param nimg: the node image object
2302 @param base: the 'template' node we match against (e.g. from the master)
2306 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2308 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2310 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2311 for os_name, os_data in nimg.oslist.items():
2312 assert os_data, "Empty OS status for OS %s?!" % os_name
2313 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2314 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2315 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2316 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2317 "OS '%s' has multiple entries (first one shadows the rest): %s",
2318 os_name, utils.CommaJoin([v[0] for v in os_data]))
2319 # comparisons with the 'base' image
2320 test = os_name not in base.oslist
2321 _ErrorIf(test, constants.CV_ENODEOS, node,
2322 "Extra OS %s not present on reference node (%s)",
2326 assert base.oslist[os_name], "Base node has empty OS status?"
2327 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2329 # base OS is invalid, skipping
2331 for kind, a, b in [("API version", f_api, b_api),
2332 ("variants list", f_var, b_var),
2333 ("parameters", beautify_params(f_param),
2334 beautify_params(b_param))]:
2335 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2336 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2337 kind, os_name, base.name,
2338 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2340 # check any missing OSes
2341 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2342 _ErrorIf(missing, constants.CV_ENODEOS, node,
2343 "OSes present on reference node %s but missing on this node: %s",
2344 base.name, utils.CommaJoin(missing))
2346 def _VerifyOob(self, ninfo, nresult):
2347 """Verifies out of band functionality of a node.
2349 @type ninfo: L{objects.Node}
2350 @param ninfo: the node to check
2351 @param nresult: the remote results for the node
2355 # We just have to verify the paths on master and/or master candidates
2356 # as the oob helper is invoked on the master
2357 if ((ninfo.master_candidate or ninfo.master_capable) and
2358 constants.NV_OOB_PATHS in nresult):
2359 for path_result in nresult[constants.NV_OOB_PATHS]:
2360 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2362 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2363 """Verifies and updates the node volume data.
2365 This function will update a L{NodeImage}'s internal structures
2366 with data from the remote call.
2368 @type ninfo: L{objects.Node}
2369 @param ninfo: the node to check
2370 @param nresult: the remote results for the node
2371 @param nimg: the node image object
2372 @param vg_name: the configured VG name
2376 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2378 nimg.lvm_fail = True
2379 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2382 elif isinstance(lvdata, basestring):
2383 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2384 utils.SafeEncode(lvdata))
2385 elif not isinstance(lvdata, dict):
2386 _ErrorIf(True, constants.CV_ENODELVM, node,
2387 "rpc call to node failed (lvlist)")
2389 nimg.volumes = lvdata
2390 nimg.lvm_fail = False
2392 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2393 """Verifies and updates the node instance list.
2395 If the listing was successful, then updates this node's instance
2396 list. Otherwise, it marks the RPC call as failed for the instance list.
2399 @type ninfo: L{objects.Node}
2400 @param ninfo: the node to check
2401 @param nresult: the remote results for the node
2402 @param nimg: the node image object
2405 idata = nresult.get(constants.NV_INSTANCELIST, None)
2406 test = not isinstance(idata, list)
2407 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2408 "rpc call to node failed (instancelist): %s",
2409 utils.SafeEncode(str(idata)))
2411 nimg.hyp_fail = True
2413 nimg.instances = idata
2415 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2416 """Verifies and computes a node information map
2418 @type ninfo: L{objects.Node}
2419 @param ninfo: the node to check
2420 @param nresult: the remote results for the node
2421 @param nimg: the node image object
2422 @param vg_name: the configured VG name
2426 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2428 # try to read free memory (from the hypervisor)
2429 hv_info = nresult.get(constants.NV_HVINFO, None)
2430 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2431 _ErrorIf(test, constants.CV_ENODEHV, node,
2432 "rpc call to node failed (hvinfo)")
2435 nimg.mfree = int(hv_info["memory_free"])
2436 except (ValueError, TypeError):
2437 _ErrorIf(True, constants.CV_ENODERPC, node,
2438 "node returned invalid nodeinfo, check hypervisor")
2440 # FIXME: devise a free space model for file based instances as well
2441 if vg_name is not None:
2442 test = (constants.NV_VGLIST not in nresult or
2443 vg_name not in nresult[constants.NV_VGLIST])
2444 _ErrorIf(test, constants.CV_ENODELVM, node,
2445 "node didn't return data for the volume group '%s'"
2446 " - it is either missing or broken", vg_name)
2449 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2450 except (ValueError, TypeError):
2451 _ErrorIf(True, constants.CV_ENODERPC, node,
2452 "node returned invalid LVM info, check LVM status")
2454 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2455 """Gets per-disk status information for all instances.
2457 @type nodelist: list of strings
2458 @param nodelist: Node names
2459 @type node_image: dict of (name, L{objects.Node})
2460 @param node_image: Node objects
2461 @type instanceinfo: dict of (name, L{objects.Instance})
2462 @param instanceinfo: Instance objects
2463 @rtype: {instance: {node: [(success, payload)]}}
2464 @return: a dictionary of per-instance dictionaries with nodes as
2465 keys and disk information as values; the disk information is a
2466 list of tuples (success, payload)
2469 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2472 node_disks_devonly = {}
2473 diskless_instances = set()
2474 diskless = constants.DT_DISKLESS
2476 for nname in nodelist:
2477 node_instances = list(itertools.chain(node_image[nname].pinst,
2478 node_image[nname].sinst))
2479 diskless_instances.update(inst for inst in node_instances
2480 if instanceinfo[inst].disk_template == diskless)
2481 disks = [(inst, disk)
2482 for inst in node_instances
2483 for disk in instanceinfo[inst].disks]
2486 # No need to collect data
2489 node_disks[nname] = disks
2491 # Creating copies as SetDiskID below will modify the objects and that can
2492 # lead to incorrect data returned from nodes
2493 devonly = [dev.Copy() for (_, dev) in disks]
2496 self.cfg.SetDiskID(dev, nname)
2498 node_disks_devonly[nname] = devonly
2500 assert len(node_disks) == len(node_disks_devonly)
2502 # Collect data from all nodes with disks
2503 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2506 assert len(result) == len(node_disks)
2510 for (nname, nres) in result.items():
2511 disks = node_disks[nname]
2514 # No data from this node
2515 data = len(disks) * [(False, "node offline")]
2518 _ErrorIf(msg, constants.CV_ENODERPC, nname,
2519 "while getting disk information: %s", msg)
2521 # No data from this node
2522 data = len(disks) * [(False, msg)]
2525 for idx, i in enumerate(nres.payload):
2526 if isinstance(i, (tuple, list)) and len(i) == 2:
2529 logging.warning("Invalid result from node %s, entry %d: %s",
2531 data.append((False, "Invalid result from the remote node"))
2533 for ((inst, _), status) in zip(disks, data):
2534 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
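# instdisk now maps instance name -> node name -> list of (success, payload)
# tuples, one entry per disk index.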
2536 # Add empty entries for diskless instances.
2537 for inst in diskless_instances:
2538 assert inst not in instdisk
2541 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2542 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2543 compat.all(isinstance(s, (tuple, list)) and
2544 len(s) == 2 for s in statuses)
2545 for inst, nnames in instdisk.items()
2546 for nname, statuses in nnames.items())
2547 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
2552 def _SshNodeSelector(group_uuid, all_nodes):
2553 """Create endless iterators for all potential SSH check hosts.
2556 nodes = [node for node in all_nodes
2557 if (node.group != group_uuid and
2559 keyfunc = operator.attrgetter("group")
2561 return map(itertools.cycle,
2562 [sorted(map(operator.attrgetter("name"), names))
2563 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2567 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2568 """Choose which nodes should talk to which other nodes.
2570 We will make nodes contact all nodes in their group, and one node from every other group.
2573 @warning: This algorithm has a known issue if one node group is much
2574 smaller than others (e.g. just one node). In such a case all other
2575 nodes will talk to the single node.
2578 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2579 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2581 return (online_nodes,
2582 dict((name, sorted([i.next() for i in sel]))
2583 for name in online_nodes))
2585 def BuildHooksEnv(self):
2588 Cluster-Verify hooks are run only in the post phase; if they fail, their
2589 output is logged in the verify output and the verification fails.
2593 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2596 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2597 for node in self.my_node_info.values())
2601 def BuildHooksNodes(self):
2602 """Build hooks nodes.
2605 return ([], self.my_node_names)
2607 def Exec(self, feedback_fn):
2608 """Verify integrity of the node group, performing various test on nodes.
2611 # This method has too many local variables. pylint: disable=R0914
2612 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2614 if not self.my_node_names:
2616 feedback_fn("* Empty node group, skipping verification")
2620 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2621 verbose = self.op.verbose
2622 self._feedback_fn = feedback_fn
2624 vg_name = self.cfg.GetVGName()
2625 drbd_helper = self.cfg.GetDRBDHelper()
2626 cluster = self.cfg.GetClusterInfo()
2627 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2628 hypervisors = cluster.enabled_hypervisors
2629 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2631 i_non_redundant = [] # Non redundant instances
2632 i_non_a_balanced = [] # Non auto-balanced instances
2633 n_offline = 0 # Count of offline nodes
2634 n_drained = 0 # Count of nodes being drained
2635 node_vol_should = {}
2637 # FIXME: verify OS list
2640 filemap = _ComputeAncillaryFiles(cluster, False)
2642 # do local checksums
2643 master_node = self.master_node = self.cfg.GetMasterNode()
2644 master_ip = self.cfg.GetMasterIP()
2646 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2648 node_verify_param = {
2649 constants.NV_FILELIST:
2650 utils.UniqueSequence(filename
2651 for files in filemap
2652 for filename in files),
2653 constants.NV_NODELIST:
2654 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2655 self.all_node_info.values()),
2656 constants.NV_HYPERVISOR: hypervisors,
2657 constants.NV_HVPARAMS:
2658 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2659 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2660 for node in node_data_list
2661 if not node.offline],
2662 constants.NV_INSTANCELIST: hypervisors,
2663 constants.NV_VERSION: None,
2664 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2665 constants.NV_NODESETUP: None,
2666 constants.NV_TIME: None,
2667 constants.NV_MASTERIP: (master_node, master_ip),
2668 constants.NV_OSLIST: None,
2669 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
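# LVM- and DRBD-related data is only requested when the cluster is actually
# configured to use them (see the conditionals below).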
2672 if vg_name is not None:
2673 node_verify_param[constants.NV_VGLIST] = None
2674 node_verify_param[constants.NV_LVLIST] = vg_name
2675 node_verify_param[constants.NV_PVLIST] = [vg_name]
2676 node_verify_param[constants.NV_DRBDLIST] = None
2679 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
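# Collect every bridge referenced by the default NIC parameters or by any
# instance NIC, so each node can be asked to verify their presence
# (NV_BRIDGES below).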
2682 # FIXME: this needs to be changed per node-group, not cluster-wide
2684 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2685 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2686 bridges.add(default_nicpp[constants.NIC_LINK])
2687 for instance in self.my_inst_info.values():
2688 for nic in instance.nics:
2689 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2690 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2691 bridges.add(full_nic[constants.NIC_LINK])
2694 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2696 # Build our expected cluster state
2697 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2699 vm_capable=node.vm_capable))
2700 for node in node_data_list)
2704 for node in self.all_node_info.values():
2705 path = _SupportsOob(self.cfg, node)
2706 if path and path not in oob_paths:
2707 oob_paths.append(path)
2710 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2712 for instance in self.my_inst_names:
2713 inst_config = self.my_inst_info[instance]
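# Nodes used by an instance but outside the current group get a placeholder
# NodeImage; nodes not known to the cluster at all are additionally marked as
# ghosts so later checks can flag them.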
2715 for nname in inst_config.all_nodes:
2716 if nname not in node_image:
2717 gnode = self.NodeImage(name=nname)
2718 gnode.ghost = (nname not in self.all_node_info)
2719 node_image[nname] = gnode
2721 inst_config.MapLVsByNode(node_vol_should)
2723 pnode = inst_config.primary_node
2724 node_image[pnode].pinst.append(instance)
2726 for snode in inst_config.secondary_nodes:
2727 nimg = node_image[snode]
2728 nimg.sinst.append(instance)
2729 if pnode not in nimg.sbp:
2730 nimg.sbp[pnode] = []
2731 nimg.sbp[pnode].append(instance)
2733 # At this point, we have the in-memory data structures complete,
2734 # except for the runtime information, which we'll gather next
2736 # Due to the way our RPC system works, exact response times cannot be
2737 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2738 # time before and after executing the request, we can at least have a time
2740 nvinfo_starttime = time.time()
2741 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2743 self.cfg.GetClusterName())
2744 nvinfo_endtime = time.time()
2746 if self.extra_lv_nodes and vg_name is not None:
2748 self.rpc.call_node_verify(self.extra_lv_nodes,
2749 {constants.NV_LVLIST: vg_name},
2750 self.cfg.GetClusterName())
2752 extra_lv_nvinfo = {}
2754 all_drbd_map = self.cfg.ComputeDRBDMap()
2756 feedback_fn("* Gathering disk information (%s nodes)" %
2757 len(self.my_node_names))
2758 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2761 feedback_fn("* Verifying configuration file consistency")
2763 # If not all nodes are being checked, we need to make sure the master node
2764 # and a non-checked vm_capable node are in the list.
2765 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2767 vf_nvinfo = all_nvinfo.copy()
2768 vf_node_info = list(self.my_node_info.values())
2769 additional_nodes = []
2770 if master_node not in self.my_node_info:
2771 additional_nodes.append(master_node)
2772 vf_node_info.append(self.all_node_info[master_node])
2773 # Add the first vm_capable node we find which is not included
2774 for node in absent_nodes:
2775 nodeinfo = self.all_node_info[node]
2776 if nodeinfo.vm_capable and not nodeinfo.offline:
2777 additional_nodes.append(node)
2778 vf_node_info.append(self.all_node_info[node])
2780 key = constants.NV_FILELIST
2781 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2782 {key: node_verify_param[key]},
2783 self.cfg.GetClusterName()))
2785 vf_nvinfo = all_nvinfo
2786 vf_node_info = self.my_node_info.values()
2788 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2790 feedback_fn("* Verifying node status")
2794 for node_i in node_data_list:
2796 nimg = node_image[node]
2800 feedback_fn("* Skipping offline node %s" % (node,))
2804 if node == master_node:
2806 elif node_i.master_candidate:
2807 ntype = "master candidate"
2808 elif node_i.drained:
2814 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2816 msg = all_nvinfo[node].fail_msg
2817 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
2820 nimg.rpc_fail = True
2823 nresult = all_nvinfo[node].payload
2825 nimg.call_ok = self._VerifyNode(node_i, nresult)
2826 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2827 self._VerifyNodeNetwork(node_i, nresult)
2828 self._VerifyOob(node_i, nresult)
2831 self._VerifyNodeLVM(node_i, nresult, vg_name)
2832 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2835 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2836 self._UpdateNodeInstances(node_i, nresult, nimg)
2837 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2838 self._UpdateNodeOS(node_i, nresult, nimg)
2840 if not nimg.os_fail:
2841 if refos_img is None:
2843 self._VerifyNodeOS(node_i, nimg, refos_img)
2844 self._VerifyNodeBridges(node_i, nresult, bridges)
2846 # Check whether all running instances are primary for the node. (This
2847 # can no longer be done from _VerifyInstance below, since some of the
2848 # wrong instances could be from other node groups.)
2849 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2851 for inst in non_primary_inst:
2852 test = inst in self.all_inst_info
2853 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
2854 "instance should not run on node %s", node_i.name)
2855 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2856 "node is running unknown instance %s", inst)
2858 for node, result in extra_lv_nvinfo.items():
2859 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2860 node_image[node], vg_name)
2862 feedback_fn("* Verifying instance status")
2863 for instance in self.my_inst_names:
2865 feedback_fn("* Verifying instance %s" % instance)
2866 inst_config = self.my_inst_info[instance]
2867 self._VerifyInstance(instance, inst_config, node_image,
2869 inst_nodes_offline = []
2871 pnode = inst_config.primary_node
2872 pnode_img = node_image[pnode]
2873 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2874 constants.CV_ENODERPC, pnode, "instance %s, connection to"
2875 " primary node failed", instance)
2877 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2878 constants.CV_EINSTANCEBADNODE, instance,
2879 "instance is marked as running and lives on offline node %s",
2880 inst_config.primary_node)
2882 # If the instance is non-redundant we cannot survive losing its primary
2883 # node, so we are not N+1 compliant. On the other hand we have no disk
2884 # templates with more than one secondary so that situation is not well
2886 # FIXME: does not support file-backed instances
2887 if not inst_config.secondary_nodes:
2888 i_non_redundant.append(instance)
2890 _ErrorIf(len(inst_config.secondary_nodes) > 1,
2891 constants.CV_EINSTANCELAYOUT,
2892 instance, "instance has multiple secondary nodes: %s",
2893 utils.CommaJoin(inst_config.secondary_nodes),
2894 code=self.ETYPE_WARNING)
2896 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2897 pnode = inst_config.primary_node
2898 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2899 instance_groups = {}
2901 for node in instance_nodes:
2902 instance_groups.setdefault(self.all_node_info[node].group,
2906 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2907 # Sort so that we always list the primary node first.
2908 for group, nodes in sorted(instance_groups.items(),
2909 key=lambda (_, nodes): pnode in nodes,
2912 self._ErrorIf(len(instance_groups) > 1,
2913 constants.CV_EINSTANCESPLITGROUPS,
2914 instance, "instance has primary and secondary nodes in"
2915 " different groups: %s", utils.CommaJoin(pretty_list),
2916 code=self.ETYPE_WARNING)
2918 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2919 i_non_a_balanced.append(instance)
2921 for snode in inst_config.secondary_nodes:
2922 s_img = node_image[snode]
2923 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
2924 snode, "instance %s, connection to secondary node failed",
2928 inst_nodes_offline.append(snode)
2930 # warn that the instance lives on offline nodes
2931 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
2932 "instance has offline secondary node(s) %s",
2933 utils.CommaJoin(inst_nodes_offline))
2934 # ... or ghost/non-vm_capable nodes
2935 for node in inst_config.all_nodes:
2936 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
2937 instance, "instance lives on ghost node %s", node)
2938 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
2939 instance, "instance lives on non-vm_capable node %s", node)
2941 feedback_fn("* Verifying orphan volumes")
2942 reserved = utils.FieldSet(*cluster.reserved_lvs)
2944 # We will get spurious "unknown volume" warnings if any node of this group
2945 # is secondary for an instance whose primary is in another group. To avoid
2946 # them, we find these instances and add their volumes to node_vol_should.
2947 for inst in self.all_inst_info.values():
2948 for secondary in inst.secondary_nodes:
2949 if (secondary in self.my_node_info
2950 and inst.name not in self.my_inst_info):
2951 inst.MapLVsByNode(node_vol_should)
2954 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2956 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2957 feedback_fn("* Verifying N+1 Memory redundancy")
2958 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2960 feedback_fn("* Other Notes")
2962 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2963 % len(i_non_redundant))
2965 if i_non_a_balanced:
2966 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2967 % len(i_non_a_balanced))
2970 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2973 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2977 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2978 """Analyze the post-hooks' result
2980 This method analyses the hook result, handles it, and sends some
2981 nicely-formatted feedback back to the user.
2983 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2984 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2985 @param hooks_results: the results of the multi-node hooks rpc call
2986 @param feedback_fn: function used to send feedback back to the caller
2987 @param lu_result: previous Exec result
2988 @return: the new Exec result, based on the previous result
2992 # We only really run POST phase hooks, only for non-empty groups,
2993 # and are only interested in their results
2994 if not self.my_node_names:
2997 elif phase == constants.HOOKS_PHASE_POST:
2998 # Used to change hooks' output to proper indentation
2999 feedback_fn("* Hooks Results")
3000 assert hooks_results, "invalid result from hooks"
3002 for node_name in hooks_results:
3003 res = hooks_results[node_name]
3005 test = msg and not res.offline
3006 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3007 "Communication failure in hooks execution: %s", msg)
3008 if res.offline or msg:
3009 # No need to investigate payload if node is offline or gave
3012 for script, hkr, output in res.payload:
3013 test = hkr == constants.HKR_FAIL
3014 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3015 "Script %s failed, output:", script)
3017 output = self._HOOKS_INDENT_RE.sub(" ", output)
3018 feedback_fn("%s" % output)
3024 class LUClusterVerifyDisks(NoHooksLU):
3025 """Verifies the cluster disks status.
3030 def ExpandNames(self):
3031 self.share_locks = _ShareAll()
3032 self.needed_locks = {
3033 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3036 def Exec(self, feedback_fn):
3037 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3039 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3040 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3041 for group in group_names])
3044 class LUGroupVerifyDisks(NoHooksLU):
3045 """Verifies the status of all disks in a node group.
3050 def ExpandNames(self):
3051 # Raises errors.OpPrereqError on its own if group can't be found
3052 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3054 self.share_locks = _ShareAll()
3055 self.needed_locks = {
3056 locking.LEVEL_INSTANCE: [],
3057 locking.LEVEL_NODEGROUP: [],
3058 locking.LEVEL_NODE: [],
3061 def DeclareLocks(self, level):
3062 if level == locking.LEVEL_INSTANCE:
3063 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3065 # Lock instances optimistically, needs verification once node and group
3066 # locks have been acquired
3067 self.needed_locks[locking.LEVEL_INSTANCE] = \
3068 self.cfg.GetNodeGroupInstances(self.group_uuid)
3070 elif level == locking.LEVEL_NODEGROUP:
3071 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3073 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3074 set([self.group_uuid] +
3075 # Lock all groups used by instances optimistically; this requires
3076 # going via the node before it's locked, requiring verification
3079 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3080 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3082 elif level == locking.LEVEL_NODE:
3083 # This will only lock the nodes in the group to be verified which contain
3085 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3086 self._LockInstancesNodes()
3088 # Lock all nodes in group to be verified
3089 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3090 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3091 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3093 def CheckPrereq(self):
3094 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3095 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3096 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3098 assert self.group_uuid in owned_groups
3100 # Check if locked instances are still correct
3101 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3103 # Get instance information
3104 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3106 # Check if node groups for locked instances are still correct
3107 for (instance_name, inst) in self.instances.items():
3108 assert owned_nodes.issuperset(inst.all_nodes), \
3109 "Instance %s's nodes changed while we kept the lock" % instance_name
3111 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3114 assert self.group_uuid in inst_groups, \
3115 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3117 def Exec(self, feedback_fn):
3118 """Verify integrity of cluster disks.
3120 @rtype: tuple of three items
3121 @return: a tuple of (dict of node-to-node_error, list of instances
3122 which need activate-disks, dict of instance: (node, volume) for missing volumes
3127 res_instances = set()
3130 nv_dict = _MapInstanceDisksToNodes([inst
3131 for inst in self.instances.values()
3135 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3136 set(self.cfg.GetVmCapableNodeList()))
3138 node_lvs = self.rpc.call_lv_list(nodes, [])
3140 for (node, node_res) in node_lvs.items():
3141 if node_res.offline:
3144 msg = node_res.fail_msg
3146 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3147 res_nodes[node] = msg
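# node_res.payload maps each LV name to a tuple whose last element is the
# online flag; an offline LV that belongs to a known instance means that
# instance needs its disks re-activated.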
3150 for lv_name, (_, _, lv_online) in node_res.payload.items():
3151 inst = nv_dict.pop((node, lv_name), None)
3152 if not (lv_online or inst is None):
3153 res_instances.add(inst)
3155 # any leftover items in nv_dict are missing LVs, let's arrange the data
3157 for key, inst in nv_dict.iteritems():
3158 res_missing.setdefault(inst, []).append(list(key))
3160 return (res_nodes, list(res_instances), res_missing)
3163 class LUClusterRepairDiskSizes(NoHooksLU):
3164 """Verifies the cluster disks sizes.
3169 def ExpandNames(self):
3170 if self.op.instances:
3171 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3172 self.needed_locks = {
3173 locking.LEVEL_NODE: [],
3174 locking.LEVEL_INSTANCE: self.wanted_names,
3176 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3178 self.wanted_names = None
3179 self.needed_locks = {
3180 locking.LEVEL_NODE: locking.ALL_SET,
3181 locking.LEVEL_INSTANCE: locking.ALL_SET,
3183 self.share_locks = _ShareAll()
3185 def DeclareLocks(self, level):
3186 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3187 self._LockInstancesNodes(primary_only=True)
3189 def CheckPrereq(self):
3190 """Check prerequisites.
3192 This only checks the optional instance list against the existing names.
3195 if self.wanted_names is None:
3196 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3198 self.wanted_instances = \
3199 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3201 def _EnsureChildSizes(self, disk):
3202 """Ensure children of the disk have the needed disk size.
3204 This is valid mainly for DRBD8 and fixes an issue where the
3205 children have smaller disk size.
3207 @param disk: an L{ganeti.objects.Disk} object
3210 if disk.dev_type == constants.LD_DRBD8:
3211 assert disk.children, "Empty children for DRBD8?"
3212 fchild = disk.children[0]
3213 mismatch = fchild.size < disk.size
3215 self.LogInfo("Child disk has size %d, parent %d, fixing",
3216 fchild.size, disk.size)
3217 fchild.size = disk.size
3219 # and we recurse on this child only, not on the metadev
3220 return self._EnsureChildSizes(fchild) or mismatch
3224 def Exec(self, feedback_fn):
3225 """Verify the size of cluster disks.
3228 # TODO: check child disks too
3229 # TODO: check differences in size between primary/secondary nodes
3231 for instance in self.wanted_instances:
3232 pnode = instance.primary_node
3233 if pnode not in per_node_disks:
3234 per_node_disks[pnode] = []
3235 for idx, disk in enumerate(instance.disks):
3236 per_node_disks[pnode].append((instance, idx, disk))
3239 for node, dskl in per_node_disks.items():
3240 newl = [v[2].Copy() for v in dskl]
3242 self.cfg.SetDiskID(dsk, node)
3243 result = self.rpc.call_blockdev_getsize(node, newl)
3245 self.LogWarning("Failure in blockdev_getsize call to node"
3246 " %s, ignoring", node)
3248 if len(result.payload) != len(dskl):
3249 logging.warning("Invalid result from node %s: len(dksl)=%d,"
3250 " result.payload=%s", node, len(dskl), result.payload)
3251 self.LogWarning("Invalid result from node %s, ignoring node results",
3254 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3256 self.LogWarning("Disk %d of instance %s did not return size"
3257 " information, ignoring", idx, instance.name)
3259 if not isinstance(size, (int, long)):
3260 self.LogWarning("Disk %d of instance %s did not return valid"
3261 " size information, ignoring", idx, instance.name)
3264 if size != disk.size:
3265 self.LogInfo("Disk %d of instance %s has mismatched size,"
3266 " correcting: recorded %d, actual %d", idx,
3267 instance.name, disk.size, size)
3269 self.cfg.Update(instance, feedback_fn)
3270 changed.append((instance.name, idx, size))
3271 if self._EnsureChildSizes(disk):
3272 self.cfg.Update(instance, feedback_fn)
3273 changed.append((instance.name, idx, disk.size))
3277 class LUClusterRename(LogicalUnit):
3278 """Rename the cluster.
3281 HPATH = "cluster-rename"
3282 HTYPE = constants.HTYPE_CLUSTER
3284 def BuildHooksEnv(self):
3289 "OP_TARGET": self.cfg.GetClusterName(),
3290 "NEW_NAME": self.op.name,
3293 def BuildHooksNodes(self):
3294 """Build hooks nodes.
3297 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3299 def CheckPrereq(self):
3300 """Verify that the passed name is a valid one.
3303 hostname = netutils.GetHostname(name=self.op.name,
3304 family=self.cfg.GetPrimaryIPFamily())
3306 new_name = hostname.name
3307 self.ip = new_ip = hostname.ip
3308 old_name = self.cfg.GetClusterName()
3309 old_ip = self.cfg.GetMasterIP()
3310 if new_name == old_name and new_ip == old_ip:
3311 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3312 " cluster has changed",
3314 if new_ip != old_ip:
3315 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3316 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3317 " reachable on the network" %
3318 new_ip, errors.ECODE_NOTUNIQUE)
3320 self.op.name = new_name
3322 def Exec(self, feedback_fn):
3323 """Rename the cluster.
3326 clustername = self.op.name
3329 # shutdown the master IP
3330 (master, ip, dev, netmask, family) = self.cfg.GetMasterNetworkParameters()
3331 result = self.rpc.call_node_deactivate_master_ip(master, ip, netmask, dev,
3333 result.Raise("Could not disable the master role")
3336 cluster = self.cfg.GetClusterInfo()
3337 cluster.cluster_name = clustername
3338 cluster.master_ip = new_ip
3339 self.cfg.Update(cluster, feedback_fn)
3341 # update the known hosts file
3342 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3343 node_list = self.cfg.GetOnlineNodeList()
3345 node_list.remove(master)
3348 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3350 result = self.rpc.call_node_activate_master_ip(master, new_ip, netmask,
3352 msg = result.fail_msg
3354 self.LogWarning("Could not re-enable the master role on"
3355 " the master, please restart manually: %s", msg)
3360 def _ValidateNetmask(cfg, netmask):
3361 """Checks if a netmask is valid.
3363 @type cfg: L{config.ConfigWriter}
3364 @param cfg: The cluster configuration
3366 @param netmask: the netmask to be verified
3367 @raise errors.OpPrereqError: if the validation fails
3370 ip_family = cfg.GetPrimaryIPFamily()
3372 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3373 except errors.ProgrammerError:
3374 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3376 if not ipcls.ValidateNetmask(netmask):
3377 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3381 class LUClusterSetParams(LogicalUnit):
3382 """Change the parameters of the cluster.
3385 HPATH = "cluster-modify"
3386 HTYPE = constants.HTYPE_CLUSTER
3389 def CheckArguments(self):
3393 if self.op.uid_pool:
3394 uidpool.CheckUidPool(self.op.uid_pool)
3396 if self.op.add_uids:
3397 uidpool.CheckUidPool(self.op.add_uids)
3399 if self.op.remove_uids:
3400 uidpool.CheckUidPool(self.op.remove_uids)
3402 if self.op.master_netmask is not None:
3403 _ValidateNetmask(self.cfg, self.op.master_netmask)
3405 def ExpandNames(self):
3406 # FIXME: in the future maybe other cluster params won't require checking on
3407 # all nodes to be modified.
3408 self.needed_locks = {
3409 locking.LEVEL_NODE: locking.ALL_SET,
3411 self.share_locks[locking.LEVEL_NODE] = 1
3413 def BuildHooksEnv(self):
3418 "OP_TARGET": self.cfg.GetClusterName(),
3419 "NEW_VG_NAME": self.op.vg_name,
3422 def BuildHooksNodes(self):
3423 """Build hooks nodes.
3426 mn = self.cfg.GetMasterNode()
3429 def CheckPrereq(self):
3430 """Check prerequisites.
3432 This checks that the given parameters don't conflict and
3433 that the given volume group is valid.
3436 if self.op.vg_name is not None and not self.op.vg_name:
3437 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3438 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3439 " instances exist", errors.ECODE_INVAL)
3441 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3442 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3443 raise errors.OpPrereqError("Cannot disable drbd helper while"
3444 " drbd-based instances exist",
3447 node_list = self.owned_locks(locking.LEVEL_NODE)
3449 # if vg_name not None, checks given volume group on all nodes
3451 vglist = self.rpc.call_vg_list(node_list)
3452 for node in node_list:
3453 msg = vglist[node].fail_msg
3455 # ignoring down node
3456 self.LogWarning("Error while gathering data on node %s"
3457 " (ignoring node): %s", node, msg)
3459 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3461 constants.MIN_VG_SIZE)
3463 raise errors.OpPrereqError("Error on node '%s': %s" %
3464 (node, vgstatus), errors.ECODE_ENVIRON)
3466 if self.op.drbd_helper:
3467 # checks given drbd helper on all nodes
3468 helpers = self.rpc.call_drbd_helper(node_list)
3469 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3471 self.LogInfo("Not checking drbd helper on offline node %s", node)
3473 msg = helpers[node].fail_msg
3475 raise errors.OpPrereqError("Error checking drbd helper on node"
3476 " '%s': %s" % (node, msg),
3477 errors.ECODE_ENVIRON)
3478 node_helper = helpers[node].payload
3479 if node_helper != self.op.drbd_helper:
3480 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3481 (node, node_helper), errors.ECODE_ENVIRON)
3483 self.cluster = cluster = self.cfg.GetClusterInfo()
3484 # validate params changes
3485 if self.op.beparams:
3486 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3487 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3489 if self.op.ndparams:
3490 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3491 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3493 # TODO: we need a more general way to handle resetting
3494 # cluster-level parameters to default values
3495 if self.new_ndparams["oob_program"] == "":
3496 self.new_ndparams["oob_program"] = \
3497 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3499 if self.op.nicparams:
3500 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3501 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3502 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3505 # check all instances for consistency
3506 for instance in self.cfg.GetAllInstancesInfo().values():
3507 for nic_idx, nic in enumerate(instance.nics):
3508 params_copy = copy.deepcopy(nic.nicparams)
3509 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3511 # check parameter syntax
3513 objects.NIC.CheckParameterSyntax(params_filled)
3514 except errors.ConfigurationError, err:
3515 nic_errors.append("Instance %s, nic/%d: %s" %
3516 (instance.name, nic_idx, err))
3518 # if we're moving instances to routed, check that they have an ip
3519 target_mode = params_filled[constants.NIC_MODE]
3520 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3521 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3522 " address" % (instance.name, nic_idx))
3524 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3525 "\n".join(nic_errors))
3527 # hypervisor list/parameters
3528 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3529 if self.op.hvparams:
3530 for hv_name, hv_dict in self.op.hvparams.items():
3531 if hv_name not in self.new_hvparams:
3532 self.new_hvparams[hv_name] = hv_dict
3534 self.new_hvparams[hv_name].update(hv_dict)
3536 # os hypervisor parameters
3537 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3539 for os_name, hvs in self.op.os_hvp.items():
3540 if os_name not in self.new_os_hvp:
3541 self.new_os_hvp[os_name] = hvs
3543 for hv_name, hv_dict in hvs.items():
3544 if hv_name not in self.new_os_hvp[os_name]:
3545 self.new_os_hvp[os_name][hv_name] = hv_dict
3547 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3550 self.new_osp = objects.FillDict(cluster.osparams, {})
3551 if self.op.osparams:
3552 for os_name, osp in self.op.osparams.items():
3553 if os_name not in self.new_osp:
3554 self.new_osp[os_name] = {}
3556 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3559 if not self.new_osp[os_name]:
3560 # we removed all parameters
3561 del self.new_osp[os_name]
3563 # check the parameter validity (remote check)
3564 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3565 os_name, self.new_osp[os_name])
3567 # changes to the hypervisor list
3568 if self.op.enabled_hypervisors is not None:
3569 self.hv_list = self.op.enabled_hypervisors
3570 for hv in self.hv_list:
3571 # if the hypervisor doesn't already exist in the cluster
3572 # hvparams, we initialize it to empty, and then (in both
3573 # cases) we make sure to fill the defaults, as we might not
3574 # have a complete defaults list if the hypervisor wasn't
3576 if hv not in new_hvp:
3578 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3579 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3581 self.hv_list = cluster.enabled_hypervisors
3583 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3584 # either the enabled list has changed, or the parameters have, validate
3585 for hv_name, hv_params in self.new_hvparams.items():
3586 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3587 (self.op.enabled_hypervisors and
3588 hv_name in self.op.enabled_hypervisors)):
3589 # either this is a new hypervisor, or its parameters have changed
3590 hv_class = hypervisor.GetHypervisor(hv_name)
3591 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3592 hv_class.CheckParameterSyntax(hv_params)
3593 _CheckHVParams(self, node_list, hv_name, hv_params)
3596 # no need to check any newly-enabled hypervisors, since the
3597 # defaults have already been checked in the above code-block
3598 for os_name, os_hvp in self.new_os_hvp.items():
3599 for hv_name, hv_params in os_hvp.items():
3600 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3601 # we need to fill in the new os_hvp on top of the actual hv_p
3602 cluster_defaults = self.new_hvparams.get(hv_name, {})
3603 new_osp = objects.FillDict(cluster_defaults, hv_params)
3604 hv_class = hypervisor.GetHypervisor(hv_name)
3605 hv_class.CheckParameterSyntax(new_osp)
3606 _CheckHVParams(self, node_list, hv_name, new_osp)
3608 if self.op.default_iallocator:
3609 alloc_script = utils.FindFile(self.op.default_iallocator,
3610 constants.IALLOCATOR_SEARCH_PATH,
3612 if alloc_script is None:
3613 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3614 " specified" % self.op.default_iallocator,
3617 def Exec(self, feedback_fn):
3618 """Change the parameters of the cluster.
3621 if self.op.vg_name is not None:
3622 new_volume = self.op.vg_name
3625 if new_volume != self.cfg.GetVGName():
3626 self.cfg.SetVGName(new_volume)
3628 feedback_fn("Cluster LVM configuration already in desired"
3629 " state, not changing")
3630 if self.op.drbd_helper is not None:
3631 new_helper = self.op.drbd_helper
3634 if new_helper != self.cfg.GetDRBDHelper():
3635 self.cfg.SetDRBDHelper(new_helper)
3637 feedback_fn("Cluster DRBD helper already in desired state,"
3639 if self.op.hvparams:
3640 self.cluster.hvparams = self.new_hvparams
3642 self.cluster.os_hvp = self.new_os_hvp
3643 if self.op.enabled_hypervisors is not None:
3644 self.cluster.hvparams = self.new_hvparams
3645 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3646 if self.op.beparams:
3647 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3648 if self.op.nicparams:
3649 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3650 if self.op.osparams:
3651 self.cluster.osparams = self.new_osp
3652 if self.op.ndparams:
3653 self.cluster.ndparams = self.new_ndparams
3655 if self.op.candidate_pool_size is not None:
3656 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3657 # we need to update the pool size here, otherwise the save will fail
3658 _AdjustCandidatePool(self, [])
3660 if self.op.maintain_node_health is not None:
3661 self.cluster.maintain_node_health = self.op.maintain_node_health
3663 if self.op.prealloc_wipe_disks is not None:
3664 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3666 if self.op.add_uids is not None:
3667 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3669 if self.op.remove_uids is not None:
3670 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3672 if self.op.uid_pool is not None:
3673 self.cluster.uid_pool = self.op.uid_pool
3675 if self.op.default_iallocator is not None:
3676 self.cluster.default_iallocator = self.op.default_iallocator
3678 if self.op.reserved_lvs is not None:
3679 self.cluster.reserved_lvs = self.op.reserved_lvs
3681 def helper_os(aname, mods, desc):
3683 lst = getattr(self.cluster, aname)
3684 for key, val in mods:
3685 if key == constants.DDM_ADD:
3687 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3690 elif key == constants.DDM_REMOVE:
3694 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3696 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3698 if self.op.hidden_os:
3699 helper_os("hidden_os", self.op.hidden_os, "hidden")
3701 if self.op.blacklisted_os:
3702 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3704 if self.op.master_netdev:
3705 (master, ip, dev, netmask, family) = self.cfg.GetMasterNetworkParameters()
3706 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3707 self.cluster.master_netdev)
3708 result = self.rpc.call_node_deactivate_master_ip(master, ip, netmask, dev,
3710 result.Raise("Could not disable the master ip")
3711 feedback_fn("Changing master_netdev from %s to %s" %
3712 (dev, self.op.master_netdev))
3713 self.cluster.master_netdev = self.op.master_netdev
3715 if self.op.master_netmask:
3716 (master, ip, dev, old_netmask, _) = self.cfg.GetMasterNetworkParameters()
3717 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
3718 result = self.rpc.call_node_change_master_netmask(master, old_netmask,
3719 self.op.master_netmask,
3722 msg = "Could not change the master IP netmask: %s" % result.fail_msg
3723 self.LogWarning(msg)
3726 self.cluster.master_netmask = self.op.master_netmask
3728 self.cfg.Update(self.cluster, feedback_fn)
3730 if self.op.master_netdev:
3731 (master, ip, dev, netmask, family) = self.cfg.GetMasterNetworkParameters()
3732 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3733 self.op.master_netdev)
3734 result = self.rpc.call_node_activate_master_ip(master, ip, netmask, dev,
3737 self.LogWarning("Could not re-enable the master ip on"
3738 " the master, please restart manually: %s",
3742 def _UploadHelper(lu, nodes, fname):
3743 """Helper for uploading a file and showing warnings.
3746 if os.path.exists(fname):
3747 result = lu.rpc.call_upload_file(nodes, fname)
3748 for to_node, to_result in result.items():
3749 msg = to_result.fail_msg
3751 msg = ("Copy of file %s to node %s failed: %s" %
3752 (fname, to_node, msg))
3753 lu.proc.LogWarning(msg)
3756 def _ComputeAncillaryFiles(cluster, redist):
3757 """Compute files external to Ganeti which need to be consistent.
3759 @type redist: boolean
3760 @param redist: Whether to include files which need to be redistributed
3763 # Compute files for all nodes
3765 constants.SSH_KNOWN_HOSTS_FILE,
3766 constants.CONFD_HMAC_KEY,
3767 constants.CLUSTER_DOMAIN_SECRET_FILE,
3768 constants.SPICE_CERT_FILE,
3769 constants.SPICE_CACERT_FILE,
3770 constants.RAPI_USERS_FILE,
3774 files_all.update(constants.ALL_CERT_FILES)
3775 files_all.update(ssconf.SimpleStore().GetFileList())
3777 # we need to ship at least the RAPI certificate
3778 files_all.add(constants.RAPI_CERT_FILE)
3780 if cluster.modify_etc_hosts:
3781 files_all.add(constants.ETC_HOSTS)
3783 # Files which are optional; these must:
3784 # - be present in one other category as well
3785 # - either exist or not exist on all nodes of that category (mc, vm all)
3787 constants.RAPI_USERS_FILE,
3790 # Files which should only be on master candidates
3793 files_mc.add(constants.CLUSTER_CONF_FILE)
3795 # Files which should only be on VM-capable nodes
3796 files_vm = set(filename
3797 for hv_name in cluster.enabled_hypervisors
3798 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3800 files_opt |= set(filename
3801 for hv_name in cluster.enabled_hypervisors
3802 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3804 # Filenames in each category must be unique
3805 all_files_set = files_all | files_mc | files_vm
3806 assert (len(all_files_set) ==
3807 sum(map(len, [files_all, files_mc, files_vm]))), \
3808 "Found file listed in more than one file list"
3810 # Optional files must be present in one other category
3811 assert all_files_set.issuperset(files_opt), \
3812 "Optional file not in a different required list"
3814 return (files_all, files_opt, files_mc, files_vm)
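# A small illustration (not from the original code, file names invented) of
# the two invariants asserted above: with
#   files_all = {"/etc/hosts", "known_hosts"}, files_mc = {"config.data"},
#   files_vm = {"kvm-vif-bridge"} and files_opt = {"/etc/hosts"},
# the union has four entries (no file is listed in two required categories)
# and every optional file also appears in one of the required sets.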
3817 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3818 """Distribute additional files which are part of the cluster configuration.
3820 ConfigWriter takes care of distributing the config and ssconf files, but
3821 there are more files which should be distributed to all nodes. This function
3822 makes sure those are copied.
3824 @param lu: calling logical unit
3825 @param additional_nodes: list of nodes not in the config to distribute to
3826 @type additional_vm: boolean
3827 @param additional_vm: whether the additional nodes are vm-capable or not
3830 # Gather target nodes
3831 cluster = lu.cfg.GetClusterInfo()
3832 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3834 online_nodes = lu.cfg.GetOnlineNodeList()
3835 vm_nodes = lu.cfg.GetVmCapableNodeList()
3837 if additional_nodes is not None:
3838 online_nodes.extend(additional_nodes)
3840 vm_nodes.extend(additional_nodes)
3842 # Never distribute to master node
3843 for nodelist in [online_nodes, vm_nodes]:
3844 if master_info.name in nodelist:
3845 nodelist.remove(master_info.name)
3848 (files_all, _, files_mc, files_vm) = \
3849 _ComputeAncillaryFiles(cluster, True)
3851 # Never re-distribute configuration file from here
3852 assert not (constants.CLUSTER_CONF_FILE in files_all or
3853 constants.CLUSTER_CONF_FILE in files_vm)
3854 assert not files_mc, "Master candidates not handled in this function"
3857 (online_nodes, files_all),
3858 (vm_nodes, files_vm),
3862 for (node_list, files) in filemap:
3864 _UploadHelper(lu, node_list, fname)
3867 class LUClusterRedistConf(NoHooksLU):
3868 """Force the redistribution of cluster configuration.
3870 This is a very simple LU.
3875 def ExpandNames(self):
3876 self.needed_locks = {
3877 locking.LEVEL_NODE: locking.ALL_SET,
3879 self.share_locks[locking.LEVEL_NODE] = 1
3881 def Exec(self, feedback_fn):
3882 """Redistribute the configuration.
3885 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3886 _RedistributeAncillaryFiles(self)
3889 class LUClusterActivateMasterIp(NoHooksLU):
3890 """Activate the master IP on the master node.
3893 def Exec(self, feedback_fn):
3894 """Activate the master IP.
3897 (master, ip, dev, netmask, family) = self.cfg.GetMasterNetworkParameters()
3898 self.rpc.call_node_activate_master_ip(master, ip, netmask, dev, family)
3901 class LUClusterDeactivateMasterIp(NoHooksLU):
3902 """Deactivate the master IP on the master node.
3905 def Exec(self, feedback_fn):
3906 """Deactivate the master IP.
3909 (master, ip, dev, netmask, family) = self.cfg.GetMasterNetworkParameters()
3910 self.rpc.call_node_deactivate_master_ip(master, ip, netmask, dev, family)
3913 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3914 """Sleep and poll for an instance's disk to sync.
3917 if not instance.disks or disks is not None and not disks:
3918 return True
3920 disks = _ExpandCheckDisks(instance, disks)
3923 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3925 node = instance.primary_node
3928 lu.cfg.SetDiskID(dev, node)
3930 # TODO: Convert to utils.Retry
3933 degr_retries = 10 # in seconds, as we sleep 1 second each time
3937 cumul_degraded = False
3938 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3939 msg = rstats.fail_msg
3941 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3944 raise errors.RemoteError("Can't contact node %s for mirror data,"
3945 " aborting." % node)
3948 rstats = rstats.payload
3950 for i, mstat in enumerate(rstats):
3952 lu.LogWarning("Can't compute data for node %s/%s",
3953 node, disks[i].iv_name)
3956 cumul_degraded = (cumul_degraded or
3957 (mstat.is_degraded and mstat.sync_percent is None))
3958 if mstat.sync_percent is not None:
3959 done = False
3960 if mstat.estimated_time is not None:
3961 rem_time = ("%s remaining (estimated)" %
3962 utils.FormatSeconds(mstat.estimated_time))
3963 max_time = mstat.estimated_time
3965 rem_time = "no time estimate"
3966 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3967 (disks[i].iv_name, mstat.sync_percent, rem_time))
3969 # if we're done but degraded, let's do a few small retries, to
3970 # make sure we see a stable and not transient situation; therefore
3971 # we force restart of the loop
3972 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3973 logging.info("Degraded disks found, %d retries left", degr_retries)
3981 time.sleep(min(60, max_time))
3984 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3985 return not cumul_degraded
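# Illustrative sketch only (not called anywhere): the "degraded retry" idea
# used by _WaitForSync above, with the RPC plumbing replaced by a
# hypothetical check_fn callable returning a (done, degraded) pair.
def _ExampleWaitUntilStable(check_fn, degr_retries=10, delay=1):
  """Polls check_fn until it reports completion with a stable state.

  When the state looks finished but still degraded, a few extra polls are
  done before trusting the result, mirroring the loop above.  Returns True
  if the final state is not degraded.

  """
  while True:
    (done, degraded) = check_fn()
    if done and degraded and degr_retries > 0:
      degr_retries -= 1
      time.sleep(delay)
      continue
    if done:
      return not degraded
    time.sleep(delay)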
3988 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3989 """Check that mirrors are not degraded.
3991 The ldisk parameter, if True, will change the test from the
3992 is_degraded attribute (which represents overall non-ok status for
3993 the device(s)) to the ldisk (representing the local storage status).
3996 lu.cfg.SetDiskID(dev, node)
4000 if on_primary or dev.AssembleOnSecondary():
4001 rstats = lu.rpc.call_blockdev_find(node, dev)
4002 msg = rstats.fail_msg
4004 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
4006 elif not rstats.payload:
4007 lu.LogWarning("Can't find disk on node %s", node)
4011 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4013 result = result and not rstats.payload.is_degraded
4016 for child in dev.children:
4017 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
4022 class LUOobCommand(NoHooksLU):
4023 """Logical unit for OOB handling.
4027 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4029 def ExpandNames(self):
4030 """Gather locks we need.
4033 if self.op.node_names:
4034 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4035 lock_names = self.op.node_names
4036 else:
4037 lock_names = locking.ALL_SET
4039 self.needed_locks = {
4040 locking.LEVEL_NODE: lock_names,
4043 def CheckPrereq(self):
4044 """Check prerequisites.
4047 - the node exists in the configuration
4050 Any errors are signaled by raising errors.OpPrereqError.
4054 self.master_node = self.cfg.GetMasterNode()
4056 assert self.op.power_delay >= 0.0
4058 if self.op.node_names:
4059 if (self.op.command in self._SKIP_MASTER and
4060 self.master_node in self.op.node_names):
4061 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4062 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4064 if master_oob_handler:
4065 additional_text = ("run '%s %s %s' if you want to operate on the"
4066 " master regardless") % (master_oob_handler,
4070 additional_text = "it does not support out-of-band operations"
4072 raise errors.OpPrereqError(("Operating on the master node %s is not"
4073 " allowed for %s; %s") %
4074 (self.master_node, self.op.command,
4075 additional_text), errors.ECODE_INVAL)
4076 else:
4077 self.op.node_names = self.cfg.GetNodeList()
4078 if self.op.command in self._SKIP_MASTER:
4079 self.op.node_names.remove(self.master_node)
4081 if self.op.command in self._SKIP_MASTER:
4082 assert self.master_node not in self.op.node_names
4084 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4085 if node is None:
4086 raise errors.OpPrereqError("Node %s not found" % node_name,
4087 errors.ECODE_NOENT)
4088 else:
4089 self.nodes.append(node)
4091 if (not self.op.ignore_status and
4092 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4093 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4094 " not marked offline") % node_name,
4097 def Exec(self, feedback_fn):
4098 """Execute OOB and return result if we expect any.
4101 master_node = self.master_node
4104 for idx, node in enumerate(utils.NiceSort(self.nodes,
4105 key=lambda node: node.name)):
4106 node_entry = [(constants.RS_NORMAL, node.name)]
4107 ret.append(node_entry)
4109 oob_program = _SupportsOob(self.cfg, node)
4112 node_entry.append((constants.RS_UNAVAIL, None))
4115 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4116 self.op.command, oob_program, node.name)
4117 result = self.rpc.call_run_oob(master_node, oob_program,
4118 self.op.command, node.name,
4122 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4123 node.name, result.fail_msg)
4124 node_entry.append((constants.RS_NODATA, None))
4127 self._CheckPayload(result)
4128 except errors.OpExecError, err:
4129 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4131 node_entry.append((constants.RS_NODATA, None))
4133 if self.op.command == constants.OOB_HEALTH:
4134 # For health we should log important events
4135 for item, status in result.payload:
4136 if status in [constants.OOB_STATUS_WARNING,
4137 constants.OOB_STATUS_CRITICAL]:
4138 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4139 item, node.name, status)
4141 if self.op.command == constants.OOB_POWER_ON:
4142 node.powered = True
4143 elif self.op.command == constants.OOB_POWER_OFF:
4144 node.powered = False
4145 elif self.op.command == constants.OOB_POWER_STATUS:
4146 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4147 if powered != node.powered:
4148 logging.warning(("Recorded power state (%s) of node '%s' does not"
4149 " match actual power state (%s)"), node.powered,
4152 # For configuration changing commands we should update the node
4153 if self.op.command in (constants.OOB_POWER_ON,
4154 constants.OOB_POWER_OFF):
4155 self.cfg.Update(node, feedback_fn)
4157 node_entry.append((constants.RS_NORMAL, result.payload))
4159 if (self.op.command == constants.OOB_POWER_ON and
4160 idx < len(self.nodes) - 1):
4161 time.sleep(self.op.power_delay)
4165 def _CheckPayload(self, result):
4166 """Checks if the payload is valid.
4168 @param result: RPC result
4169 @raises errors.OpExecError: If payload is not valid
4173 if self.op.command == constants.OOB_HEALTH:
4174 if not isinstance(result.payload, list):
4175 errs.append("command 'health' is expected to return a list but got %s" %
4176 type(result.payload))
4178 for item, status in result.payload:
4179 if status not in constants.OOB_STATUSES:
4180 errs.append("health item '%s' has invalid status '%s'" %
4183 if self.op.command == constants.OOB_POWER_STATUS:
4184 if not isinstance(result.payload, dict):
4185 errs.append("power-status is expected to return a dict but got %s" %
4186 type(result.payload))
4188 if self.op.command in [
4189 constants.OOB_POWER_ON,
4190 constants.OOB_POWER_OFF,
4191 constants.OOB_POWER_CYCLE,
4193 if result.payload is not None:
4194 errs.append("%s is expected to not return payload but got '%s'" %
4195 (self.op.command, result.payload))
4198 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4199 utils.CommaJoin(errs))
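# In short, the checks above encode the expected payload per OOB command:
# "health" must return a list of (item, status) pairs with known statuses,
# "power-status" must return a dictionary (carrying the "powered" flag used
# by Exec), and power-on/power-off/power-cycle must return no payload at all.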
4202 class _OsQuery(_QueryBase):
4203 FIELDS = query.OS_FIELDS
4205 def ExpandNames(self, lu):
4206 # Lock all nodes in shared mode
4207 # Temporary removal of locks, should be reverted later
4208 # TODO: reintroduce locks when they are lighter-weight
4209 lu.needed_locks = {}
4210 #self.share_locks[locking.LEVEL_NODE] = 1
4211 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4213 # The following variables interact with _QueryBase._GetNames
4215 self.wanted = self.names
4217 self.wanted = locking.ALL_SET
4219 self.do_locking = self.use_locking
4221 def DeclareLocks(self, lu, level):
4225 def _DiagnoseByOS(rlist):
4226 """Remaps a per-node return list into an a per-os per-node dictionary
4228 @param rlist: a map with node names as keys and OS objects as values
4231 @return: a dictionary with osnames as keys and as value another
4232 map, with nodes as keys and tuples of (path, status, diagnose,
4233 variants, parameters, api_versions) as values, eg::
4235 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4236 (/srv/..., False, "invalid api")],
4237 "node2": [(/srv/..., True, "", [], [])]}
4242 # we build here the list of nodes that didn't fail the RPC (at RPC
4243 # level), so that nodes with a non-responding node daemon don't
4244 # make all OSes invalid
4245 good_nodes = [node_name for node_name in rlist
4246 if not rlist[node_name].fail_msg]
4247 for node_name, nr in rlist.items():
4248 if nr.fail_msg or not nr.payload:
4249 continue
4250 for (name, path, status, diagnose, variants,
4251 params, api_versions) in nr.payload:
4252 if name not in all_os:
4253 # build a list of nodes for this os containing empty lists
4254 # for each node in node_list
4255 all_os[name] = {}
4256 for nname in good_nodes:
4257 all_os[name][nname] = []
4258 # convert params from [name, help] to (name, help)
4259 params = [tuple(v) for v in params]
4260 all_os[name][node_name].append((path, status, diagnose,
4261 variants, params, api_versions))
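# A minimal example of the remapping performed above, with plain values
# instead of RPC results (OS names and paths invented):
#   rlist = {"node1.example.com": [("lenny-image", "/srv/ganeti/os", ...)],
#            "node2.example.com": [("lenny-image", "/usr/share/ganeti/os", ...)]}
# becomes
#   {"lenny-image": {"node1.example.com": [("/srv/ganeti/os", ...)],
#                    "node2.example.com": [("/usr/share/ganeti/os", ...)]}}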
4264 def _GetQueryData(self, lu):
4265 """Computes the list of nodes and their attributes.
4268 # Locking is not used
4269 assert not (compat.any(lu.glm.is_owned(level)
4270 for level in locking.LEVELS
4271 if level != locking.LEVEL_CLUSTER) or
4272 self.do_locking or self.use_locking)
4274 valid_nodes = [node.name
4275 for node in lu.cfg.GetAllNodesInfo().values()
4276 if not node.offline and node.vm_capable]
4277 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4278 cluster = lu.cfg.GetClusterInfo()
4282 for (os_name, os_data) in pol.items():
4283 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4284 hidden=(os_name in cluster.hidden_os),
4285 blacklisted=(os_name in cluster.blacklisted_os))
4289 api_versions = set()
4291 for idx, osl in enumerate(os_data.values()):
4292 info.valid = bool(info.valid and osl and osl[0][1])
4296 (node_variants, node_params, node_api) = osl[0][3:6]
4299 variants.update(node_variants)
4300 parameters.update(node_params)
4301 api_versions.update(node_api)
4303 # Filter out inconsistent values
4304 variants.intersection_update(node_variants)
4305 parameters.intersection_update(node_params)
4306 api_versions.intersection_update(node_api)
4308 info.variants = list(variants)
4309 info.parameters = list(parameters)
4310 info.api_versions = list(api_versions)
4312 data[os_name] = info
4314 # Prepare data in requested order
4315 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4319 class LUOsDiagnose(NoHooksLU):
4320 """Logical unit for OS diagnose/query.
4326 def _BuildFilter(fields, names):
4327 """Builds a filter for querying OSes.
4330 name_filter = qlang.MakeSimpleFilter("name", names)
4332 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4333 # respective field is not requested
4334 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4335 for fname in ["hidden", "blacklisted"]
4336 if fname not in fields]
4337 if "valid" not in fields:
4338 status_filter.append([qlang.OP_TRUE, "valid"])
4341 status_filter.insert(0, qlang.OP_AND)
4343 status_filter = None
4345 if name_filter and status_filter:
4346 return [qlang.OP_AND, name_filter, status_filter]
4350 return status_filter
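# For reference, when none of "hidden", "blacklisted" or "valid" is in the
# requested fields the status filter built above has the shape
#   [OP_AND, [OP_NOT, [OP_TRUE, "hidden"]],
#            [OP_NOT, [OP_TRUE, "blacklisted"]],
#            [OP_TRUE, "valid"]]
# so only visible, non-blacklisted and valid OSes are returned by default.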
4352 def CheckArguments(self):
4353 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4354 self.op.output_fields, False)
4356 def ExpandNames(self):
4357 self.oq.ExpandNames(self)
4359 def Exec(self, feedback_fn):
4360 return self.oq.OldStyleQuery(self)
4363 class LUNodeRemove(LogicalUnit):
4364 """Logical unit for removing a node.
4367 HPATH = "node-remove"
4368 HTYPE = constants.HTYPE_NODE
4370 def BuildHooksEnv(self):
4373 This doesn't run on the target node in the pre phase as a failed
4374 node would then be impossible to remove.
4378 "OP_TARGET": self.op.node_name,
4379 "NODE_NAME": self.op.node_name,
4382 def BuildHooksNodes(self):
4383 """Build hooks nodes.
4386 all_nodes = self.cfg.GetNodeList()
4388 all_nodes.remove(self.op.node_name)
4390 logging.warning("Node '%s', which is about to be removed, was not found"
4391 " in the list of all nodes", self.op.node_name)
4392 return (all_nodes, all_nodes)
4394 def CheckPrereq(self):
4395 """Check prerequisites.
4398 - the node exists in the configuration
4399 - it does not have primary or secondary instances
4400 - it's not the master
4402 Any errors are signaled by raising errors.OpPrereqError.
4405 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4406 node = self.cfg.GetNodeInfo(self.op.node_name)
4407 assert node is not None
4409 masternode = self.cfg.GetMasterNode()
4410 if node.name == masternode:
4411 raise errors.OpPrereqError("Node is the master node, failover to another"
4412 " node is required", errors.ECODE_INVAL)
4414 for instance_name, instance in self.cfg.GetAllInstancesInfo().items():
4415 if node.name in instance.all_nodes:
4416 raise errors.OpPrereqError("Instance %s is still running on the node,"
4417 " please remove first" % instance_name,
4419 self.op.node_name = node.name
4422 def Exec(self, feedback_fn):
4423 """Removes the node from the cluster.
4427 logging.info("Stopping the node daemon and removing configs from node %s",
4430 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4432 # Promote nodes to master candidate as needed
4433 _AdjustCandidatePool(self, exceptions=[node.name])
4434 self.context.RemoveNode(node.name)
4436 # Run post hooks on the node before it's removed
4437 _RunPostHook(self, node.name)
4439 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4440 msg = result.fail_msg
4442 self.LogWarning("Errors encountered on the remote node while leaving"
4443 " the cluster: %s", msg)
4445 # Remove node from our /etc/hosts
4446 if self.cfg.GetClusterInfo().modify_etc_hosts:
4447 master_node = self.cfg.GetMasterNode()
4448 result = self.rpc.call_etc_hosts_modify(master_node,
4449 constants.ETC_HOSTS_REMOVE,
4451 result.Raise("Can't update hosts file with new host data")
4452 _RedistributeAncillaryFiles(self)
4455 class _NodeQuery(_QueryBase):
4456 FIELDS = query.NODE_FIELDS
4458 def ExpandNames(self, lu):
4459 lu.needed_locks = {}
4460 lu.share_locks = _ShareAll()
4463 self.wanted = _GetWantedNodes(lu, self.names)
4465 self.wanted = locking.ALL_SET
4467 self.do_locking = (self.use_locking and
4468 query.NQ_LIVE in self.requested_data)
4471 # If any non-static field is requested we need to lock the nodes
4472 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4474 def DeclareLocks(self, lu, level):
4477 def _GetQueryData(self, lu):
4478 """Computes the list of nodes and their attributes.
4481 all_info = lu.cfg.GetAllNodesInfo()
4483 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4485 # Gather data as requested
4486 if query.NQ_LIVE in self.requested_data:
4487 # filter out non-vm_capable nodes
4488 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4490 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4491 lu.cfg.GetHypervisorType())
4492 live_data = dict((name, nresult.payload)
4493 for (name, nresult) in node_data.items()
4494 if not nresult.fail_msg and nresult.payload)
4498 if query.NQ_INST in self.requested_data:
4499 node_to_primary = dict([(name, set()) for name in nodenames])
4500 node_to_secondary = dict([(name, set()) for name in nodenames])
4502 inst_data = lu.cfg.GetAllInstancesInfo()
4504 for inst in inst_data.values():
4505 if inst.primary_node in node_to_primary:
4506 node_to_primary[inst.primary_node].add(inst.name)
4507 for secnode in inst.secondary_nodes:
4508 if secnode in node_to_secondary:
4509 node_to_secondary[secnode].add(inst.name)
4511 node_to_primary = None
4512 node_to_secondary = None
4514 if query.NQ_OOB in self.requested_data:
4515 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4516 for name, node in all_info.iteritems())
4520 if query.NQ_GROUP in self.requested_data:
4521 groups = lu.cfg.GetAllNodeGroupsInfo()
4525 return query.NodeQueryData([all_info[name] for name in nodenames],
4526 live_data, lu.cfg.GetMasterNode(),
4527 node_to_primary, node_to_secondary, groups,
4528 oob_support, lu.cfg.GetClusterInfo())
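# Illustrative sketch only (not used by _NodeQuery): the node-to-instance
# maps built in _GetQueryData above, shown with plain tuples instead of
# objects.Instance; the tuple layout is an assumption for the example.
def _ExampleNodeInstanceMaps(instances):
  """Returns (node_to_primary, node_to_secondary) dicts of instance-name sets.

  Each element of C{instances} is assumed to be a
  (name, primary_node, secondary_nodes) tuple.

  """
  node_to_primary = {}
  node_to_secondary = {}
  for (name, primary, secondaries) in instances:
    node_to_primary.setdefault(primary, set()).add(name)
    for secnode in secondaries:
      node_to_secondary.setdefault(secnode, set()).add(name)
  return (node_to_primary, node_to_secondary)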
4531 class LUNodeQuery(NoHooksLU):
4532 """Logical unit for querying nodes.
4535 # pylint: disable=W0142
4538 def CheckArguments(self):
4539 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4540 self.op.output_fields, self.op.use_locking)
4542 def ExpandNames(self):
4543 self.nq.ExpandNames(self)
4545 def Exec(self, feedback_fn):
4546 return self.nq.OldStyleQuery(self)
4549 class LUNodeQueryvols(NoHooksLU):
4550 """Logical unit for getting volumes on node(s).
4554 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4555 _FIELDS_STATIC = utils.FieldSet("node")
4557 def CheckArguments(self):
4558 _CheckOutputFields(static=self._FIELDS_STATIC,
4559 dynamic=self._FIELDS_DYNAMIC,
4560 selected=self.op.output_fields)
4562 def ExpandNames(self):
4563 self.needed_locks = {}
4564 self.share_locks[locking.LEVEL_NODE] = 1
4565 if not self.op.nodes:
4566 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4568 self.needed_locks[locking.LEVEL_NODE] = \
4569 _GetWantedNodes(self, self.op.nodes)
4571 def Exec(self, feedback_fn):
4572 """Computes the list of nodes and their attributes.
4575 nodenames = self.owned_locks(locking.LEVEL_NODE)
4576 volumes = self.rpc.call_node_volumes(nodenames)
4578 ilist = self.cfg.GetAllInstancesInfo()
4579 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4582 for node in nodenames:
4583 nresult = volumes[node]
4586 msg = nresult.fail_msg
4588 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4591 node_vols = sorted(nresult.payload,
4592 key=operator.itemgetter("dev"))
4594 for vol in node_vols:
4596 for field in self.op.output_fields:
4597 if field == "node":
4598 val = node
4599 elif field == "phys":
4600 val = vol["dev"]
4601 elif field == "vg":
4602 val = vol["vg"]
4603 elif field == "name":
4604 val = vol["name"]
4605 elif field == "size":
4606 val = int(float(vol["size"]))
4607 elif field == "instance":
4608 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4609 else:
4610 raise errors.ParameterError(field)
4611 node_output.append(str(val))
4613 output.append(node_output)
4618 class LUNodeQueryStorage(NoHooksLU):
4619 """Logical unit for getting information on storage units on node(s).
4622 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4625 def CheckArguments(self):
4626 _CheckOutputFields(static=self._FIELDS_STATIC,
4627 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4628 selected=self.op.output_fields)
4630 def ExpandNames(self):
4631 self.needed_locks = {}
4632 self.share_locks[locking.LEVEL_NODE] = 1
4635 self.needed_locks[locking.LEVEL_NODE] = \
4636 _GetWantedNodes(self, self.op.nodes)
4638 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4640 def Exec(self, feedback_fn):
4641 """Computes the list of nodes and their attributes.
4644 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4646 # Always get name to sort by
4647 if constants.SF_NAME in self.op.output_fields:
4648 fields = self.op.output_fields[:]
4650 fields = [constants.SF_NAME] + self.op.output_fields
4652 # Never ask for node or type as it's only known to the LU
4653 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4654 while extra in fields:
4655 fields.remove(extra)
4657 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4658 name_idx = field_idx[constants.SF_NAME]
4660 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4661 data = self.rpc.call_storage_list(self.nodes,
4662 self.op.storage_type, st_args,
4663 self.op.name, fields)
4667 for node in utils.NiceSort(self.nodes):
4668 nresult = data[node]
4672 msg = nresult.fail_msg
4674 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4677 rows = dict([(row[name_idx], row) for row in nresult.payload])
4679 for name in utils.NiceSort(rows.keys()):
4684 for field in self.op.output_fields:
4685 if field == constants.SF_NODE:
4686 val = node
4687 elif field == constants.SF_TYPE:
4688 val = self.op.storage_type
4689 elif field in field_idx:
4690 val = row[field_idx[field]]
4691 else:
4692 raise errors.ParameterError(field)
4701 class _InstanceQuery(_QueryBase):
4702 FIELDS = query.INSTANCE_FIELDS
4704 def ExpandNames(self, lu):
4705 lu.needed_locks = {}
4706 lu.share_locks = _ShareAll()
4709 self.wanted = _GetWantedInstances(lu, self.names)
4711 self.wanted = locking.ALL_SET
4713 self.do_locking = (self.use_locking and
4714 query.IQ_LIVE in self.requested_data)
4716 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4717 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4718 lu.needed_locks[locking.LEVEL_NODE] = []
4719 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4721 self.do_grouplocks = (self.do_locking and
4722 query.IQ_NODES in self.requested_data)
4724 def DeclareLocks(self, lu, level):
4726 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4727 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4729 # Lock all groups used by instances optimistically; this requires going
4730 # via the node before it's locked, requiring verification later on
4731 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4733 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4734 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4735 elif level == locking.LEVEL_NODE:
4736 lu._LockInstancesNodes() # pylint: disable=W0212
4739 def _CheckGroupLocks(lu):
4740 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4741 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4743 # Check if node groups for locked instances are still correct
4744 for instance_name in owned_instances:
4745 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4747 def _GetQueryData(self, lu):
4748 """Computes the list of instances and their attributes.
4751 if self.do_grouplocks:
4752 self._CheckGroupLocks(lu)
4754 cluster = lu.cfg.GetClusterInfo()
4755 all_info = lu.cfg.GetAllInstancesInfo()
4757 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4759 instance_list = [all_info[name] for name in instance_names]
4760 nodes = frozenset(itertools.chain(*(inst.all_nodes
4761 for inst in instance_list)))
4762 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4765 wrongnode_inst = set()
4767 # Gather data as requested
4768 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4770 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4772 result = node_data[name]
4774 # offline nodes will be in both lists
4775 assert result.fail_msg
4776 offline_nodes.append(name)
4778 bad_nodes.append(name)
4779 elif result.payload:
4780 for inst in result.payload:
4781 if inst in all_info:
4782 if all_info[inst].primary_node == name:
4783 live_data.update(result.payload)
4784 else:
4785 wrongnode_inst.add(inst)
4787 # orphan instance; we don't list it here as we don't
4788 # handle this case yet in the output of instance listing
4789 logging.warning("Orphan instance '%s' found on node %s",
4790 inst, name)
4791 # else no instance is alive
4795 if query.IQ_DISKUSAGE in self.requested_data:
4796 disk_usage = dict((inst.name,
4797 _ComputeDiskSize(inst.disk_template,
4798 [{constants.IDISK_SIZE: disk.size}
4799 for disk in inst.disks]))
4800 for inst in instance_list)
4804 if query.IQ_CONSOLE in self.requested_data:
4806 for inst in instance_list:
4807 if inst.name in live_data:
4808 # Instance is running
4809 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4811 consinfo[inst.name] = None
4812 assert set(consinfo.keys()) == set(instance_names)
4816 if query.IQ_NODES in self.requested_data:
4817 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4819 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4820 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4821 for uuid in set(map(operator.attrgetter("group"),
4827 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4828 disk_usage, offline_nodes, bad_nodes,
4829 live_data, wrongnode_inst, consinfo,
4833 class LUQuery(NoHooksLU):
4834 """Query for resources/items of a certain kind.
4837 # pylint: disable=W0142
4840 def CheckArguments(self):
4841 qcls = _GetQueryImplementation(self.op.what)
4843 self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
4845 def ExpandNames(self):
4846 self.impl.ExpandNames(self)
4848 def DeclareLocks(self, level):
4849 self.impl.DeclareLocks(self, level)
4851 def Exec(self, feedback_fn):
4852 return self.impl.NewStyleQuery(self)
4855 class LUQueryFields(NoHooksLU):
4856 """Query for resources/items of a certain kind.
4859 # pylint: disable=W0142
4862 def CheckArguments(self):
4863 self.qcls = _GetQueryImplementation(self.op.what)
4865 def ExpandNames(self):
4866 self.needed_locks = {}
4868 def Exec(self, feedback_fn):
4869 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4872 class LUNodeModifyStorage(NoHooksLU):
4873 """Logical unit for modifying a storage volume on a node.
4878 def CheckArguments(self):
4879 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4881 storage_type = self.op.storage_type
4884 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4886 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4887 " modified" % storage_type,
4890 diff = set(self.op.changes.keys()) - modifiable
4892 raise errors.OpPrereqError("The following fields can not be modified for"
4893 " storage units of type '%s': %r" %
4894 (storage_type, list(diff)),
4897 def ExpandNames(self):
4898 self.needed_locks = {
4899 locking.LEVEL_NODE: self.op.node_name,
4902 def Exec(self, feedback_fn):
4903 """Computes the list of nodes and their attributes.
4906 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4907 result = self.rpc.call_storage_modify(self.op.node_name,
4908 self.op.storage_type, st_args,
4909 self.op.name, self.op.changes)
4910 result.Raise("Failed to modify storage unit '%s' on %s" %
4911 (self.op.name, self.op.node_name))
4914 class LUNodeAdd(LogicalUnit):
4915 """Logical unit for adding node to the cluster.
4919 HTYPE = constants.HTYPE_NODE
4920 _NFLAGS = ["master_capable", "vm_capable"]
4922 def CheckArguments(self):
4923 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4924 # validate/normalize the node name
4925 self.hostname = netutils.GetHostname(name=self.op.node_name,
4926 family=self.primary_ip_family)
4927 self.op.node_name = self.hostname.name
4929 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4930 raise errors.OpPrereqError("Cannot readd the master node",
4933 if self.op.readd and self.op.group:
4934 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4935 " being readded", errors.ECODE_INVAL)
4937 def BuildHooksEnv(self):
4940 This will run on all nodes before, and on all nodes + the new node after.
4944 "OP_TARGET": self.op.node_name,
4945 "NODE_NAME": self.op.node_name,
4946 "NODE_PIP": self.op.primary_ip,
4947 "NODE_SIP": self.op.secondary_ip,
4948 "MASTER_CAPABLE": str(self.op.master_capable),
4949 "VM_CAPABLE": str(self.op.vm_capable),
4952 def BuildHooksNodes(self):
4953 """Build hooks nodes.
4956 # Exclude added node
4957 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4958 post_nodes = pre_nodes + [self.op.node_name, ]
4960 return (pre_nodes, post_nodes)
4962 def CheckPrereq(self):
4963 """Check prerequisites.
4966 - the new node is not already in the config
4968 - its parameters (single/dual homed) matches the cluster
4970 Any errors are signaled by raising errors.OpPrereqError.
4974 hostname = self.hostname
4975 node = hostname.name
4976 primary_ip = self.op.primary_ip = hostname.ip
4977 if self.op.secondary_ip is None:
4978 if self.primary_ip_family == netutils.IP6Address.family:
4979 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
4980 " IPv4 address must be given as secondary",
4982 self.op.secondary_ip = primary_ip
4984 secondary_ip = self.op.secondary_ip
4985 if not netutils.IP4Address.IsValid(secondary_ip):
4986 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4987 " address" % secondary_ip, errors.ECODE_INVAL)
4989 node_list = cfg.GetNodeList()
4990 if not self.op.readd and node in node_list:
4991 raise errors.OpPrereqError("Node %s is already in the configuration" %
4992 node, errors.ECODE_EXISTS)
4993 elif self.op.readd and node not in node_list:
4994 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4997 self.changed_primary_ip = False
4999 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
5000 if self.op.readd and node == existing_node_name:
5001 if existing_node.secondary_ip != secondary_ip:
5002 raise errors.OpPrereqError("Readded node doesn't have the same IP"
5003 " address configuration as before",
5005 if existing_node.primary_ip != primary_ip:
5006 self.changed_primary_ip = True
5010 if (existing_node.primary_ip == primary_ip or
5011 existing_node.secondary_ip == primary_ip or
5012 existing_node.primary_ip == secondary_ip or
5013 existing_node.secondary_ip == secondary_ip):
5014 raise errors.OpPrereqError("New node ip address(es) conflict with"
5015 " existing node %s" % existing_node.name,
5016 errors.ECODE_NOTUNIQUE)
5018 # After this 'if' block, None is no longer a valid value for the
5019 # _capable op attributes
5021 old_node = self.cfg.GetNodeInfo(node)
5022 assert old_node is not None, "Can't retrieve locked node %s" % node
5023 for attr in self._NFLAGS:
5024 if getattr(self.op, attr) is None:
5025 setattr(self.op, attr, getattr(old_node, attr))
5027 for attr in self._NFLAGS:
5028 if getattr(self.op, attr) is None:
5029 setattr(self.op, attr, True)
5031 if self.op.readd and not self.op.vm_capable:
5032 pri, sec = cfg.GetNodeInstances(node)
5034 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5035 " flag set to false, but it already holds"
5036 " instances" % node,
5039 # check that the type of the node (single versus dual homed) is the
5040 # same as for the master
5041 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5042 master_singlehomed = myself.secondary_ip == myself.primary_ip
5043 newbie_singlehomed = secondary_ip == primary_ip
5044 if master_singlehomed != newbie_singlehomed:
5045 if master_singlehomed:
5046 raise errors.OpPrereqError("The master has no secondary ip but the"
5047 " new node has one",
5050 raise errors.OpPrereqError("The master has a secondary ip but the"
5051 " new node doesn't have one",
5054 # checks reachability
5055 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5056 raise errors.OpPrereqError("Node not reachable by ping",
5057 errors.ECODE_ENVIRON)
5059 if not newbie_singlehomed:
5060 # check reachability from my secondary ip to newbie's secondary ip
5061 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5062 source=myself.secondary_ip):
5063 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5064 " based ping to node daemon port",
5065 errors.ECODE_ENVIRON)
5072 if self.op.master_capable:
5073 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5075 self.master_candidate = False
5078 self.new_node = old_node
5080 node_group = cfg.LookupNodeGroup(self.op.group)
5081 self.new_node = objects.Node(name=node,
5082 primary_ip=primary_ip,
5083 secondary_ip=secondary_ip,
5084 master_candidate=self.master_candidate,
5085 offline=False, drained=False,
5088 if self.op.ndparams:
5089 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5091 def Exec(self, feedback_fn):
5092 """Adds the new node to the cluster.
5095 new_node = self.new_node
5096 node = new_node.name
5098 # We are adding a new node, so we assume it's powered
5099 new_node.powered = True
5101 # for re-adds, reset the offline/drained/master-candidate flags;
5102 # we need to reset here, otherwise offline would prevent RPC calls
5103 # later in the procedure; this also means that if the re-add
5104 # fails, we are left with a non-offlined, broken node
5106 new_node.drained = new_node.offline = False # pylint: disable=W0201
5107 self.LogInfo("Readding a node, the offline/drained flags were reset")
5108 # if we demote the node, we do cleanup later in the procedure
5109 new_node.master_candidate = self.master_candidate
5110 if self.changed_primary_ip:
5111 new_node.primary_ip = self.op.primary_ip
5113 # copy the master/vm_capable flags
5114 for attr in self._NFLAGS:
5115 setattr(new_node, attr, getattr(self.op, attr))
5117 # notify the user about any possible mc promotion
5118 if new_node.master_candidate:
5119 self.LogInfo("Node will be a master candidate")
5121 if self.op.ndparams:
5122 new_node.ndparams = self.op.ndparams
5124 new_node.ndparams = {}
5126 # check connectivity
5127 result = self.rpc.call_version([node])[node]
5128 result.Raise("Can't get version information from node %s" % node)
5129 if constants.PROTOCOL_VERSION == result.payload:
5130 logging.info("Communication to node %s fine, sw version %s match",
5131 node, result.payload)
5133 raise errors.OpExecError("Version mismatch master version %s,"
5134 " node version %s" %
5135 (constants.PROTOCOL_VERSION, result.payload))
5137 # Add node to our /etc/hosts, and add key to known_hosts
5138 if self.cfg.GetClusterInfo().modify_etc_hosts:
5139 master_node = self.cfg.GetMasterNode()
5140 result = self.rpc.call_etc_hosts_modify(master_node,
5141 constants.ETC_HOSTS_ADD,
5144 result.Raise("Can't update hosts file with new host data")
5146 if new_node.secondary_ip != new_node.primary_ip:
5147 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5150 node_verify_list = [self.cfg.GetMasterNode()]
5151 node_verify_param = {
5152 constants.NV_NODELIST: ([node], {}),
5153 # TODO: do a node-net-test as well?
5156 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5157 self.cfg.GetClusterName())
5158 for verifier in node_verify_list:
5159 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5160 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5162 for failed in nl_payload:
5163 feedback_fn("ssh/hostname verification failed"
5164 " (checking from %s): %s" %
5165 (verifier, nl_payload[failed]))
5166 raise errors.OpExecError("ssh/hostname verification failed")
5168 if self.op.readd:
5169 _RedistributeAncillaryFiles(self)
5170 self.context.ReaddNode(new_node)
5171 # make sure we redistribute the config
5172 self.cfg.Update(new_node, feedback_fn)
5173 # and make sure the new node will not have old files around
5174 if not new_node.master_candidate:
5175 result = self.rpc.call_node_demote_from_mc(new_node.name)
5176 msg = result.fail_msg
5178 self.LogWarning("Node failed to demote itself from master"
5179 " candidate status: %s" % msg)
5181 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5182 additional_vm=self.op.vm_capable)
5183 self.context.AddNode(new_node, self.proc.GetECId())
5186 class LUNodeSetParams(LogicalUnit):
5187 """Modifies the parameters of a node.
5189 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5190 to the node role (as _ROLE_*)
5191 @cvar _R2F: a dictionary from node role to tuples of flags
5192 @cvar _FLAGS: a list of attribute names corresponding to the flags
5195 HPATH = "node-modify"
5196 HTYPE = constants.HTYPE_NODE
5198 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5200 (True, False, False): _ROLE_CANDIDATE,
5201 (False, True, False): _ROLE_DRAINED,
5202 (False, False, True): _ROLE_OFFLINE,
5203 (False, False, False): _ROLE_REGULAR,
5205 _R2F = dict((v, k) for k, v in _F2R.items())
5206 _FLAGS = ["master_candidate", "drained", "offline"]
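# Example: flags (master_candidate=True, drained=False, offline=False) map to
# _ROLE_CANDIDATE through _F2R, and _R2F[_ROLE_CANDIDATE] yields the same
# flag tuple back, which Exec() unpacks into the node object when the role
# changes.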
5208 def CheckArguments(self):
5209 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5210 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5211 self.op.master_capable, self.op.vm_capable,
5212 self.op.secondary_ip, self.op.ndparams]
5213 if all_mods.count(None) == len(all_mods):
5214 raise errors.OpPrereqError("Please pass at least one modification",
5216 if all_mods.count(True) > 1:
5217 raise errors.OpPrereqError("Can't set the node into more than one"
5218 " state at the same time",
5221 # Boolean value that tells us whether we might be demoting from MC
5222 self.might_demote = (self.op.master_candidate == False or
5223 self.op.offline == True or
5224 self.op.drained == True or
5225 self.op.master_capable == False)
5227 if self.op.secondary_ip:
5228 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5229 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5230 " address" % self.op.secondary_ip,
5233 self.lock_all = self.op.auto_promote and self.might_demote
5234 self.lock_instances = self.op.secondary_ip is not None
5236 def ExpandNames(self):
5238 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5240 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5242 if self.lock_instances:
5243 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
5245 def DeclareLocks(self, level):
5246 # If we have locked all instances, before waiting to lock nodes, release
5247 # all the ones living on nodes unrelated to the current operation.
5248 if level == locking.LEVEL_NODE and self.lock_instances:
5249 self.affected_instances = []
5250 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
5253 # Build list of instances to release
5254 locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
5255 for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
5256 if (instance.disk_template in constants.DTS_INT_MIRROR and
5257 self.op.node_name in instance.all_nodes):
5258 instances_keep.append(instance_name)
5259 self.affected_instances.append(instance)
5261 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
5263 assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
5264 set(instances_keep))
5266 def BuildHooksEnv(self):
5269 This runs on the master node.
5273 "OP_TARGET": self.op.node_name,
5274 "MASTER_CANDIDATE": str(self.op.master_candidate),
5275 "OFFLINE": str(self.op.offline),
5276 "DRAINED": str(self.op.drained),
5277 "MASTER_CAPABLE": str(self.op.master_capable),
5278 "VM_CAPABLE": str(self.op.vm_capable),
5281 def BuildHooksNodes(self):
5282 """Build hooks nodes.
5285 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5288 def CheckPrereq(self):
5289 """Check prerequisites.
5291 This only checks the instance list against the existing names.
5294 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5296 if (self.op.master_candidate is not None or
5297 self.op.drained is not None or
5298 self.op.offline is not None):
5299 # we can't change the master's node flags
5300 if self.op.node_name == self.cfg.GetMasterNode():
5301 raise errors.OpPrereqError("The master role can be changed"
5302 " only via master-failover",
5305 if self.op.master_candidate and not node.master_capable:
5306 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5307 " it a master candidate" % node.name,
5310 if self.op.vm_capable == False:
5311 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5313 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5314 " the vm_capable flag" % node.name,
5317 if node.master_candidate and self.might_demote and not self.lock_all:
5318 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5319 # check if after removing the current node, we're missing master
5321 (mc_remaining, mc_should, _) = \
5322 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5323 if mc_remaining < mc_should:
5324 raise errors.OpPrereqError("Not enough master candidates, please"
5325 " pass auto promote option to allow"
5326 " promotion", errors.ECODE_STATE)
5328 self.old_flags = old_flags = (node.master_candidate,
5329 node.drained, node.offline)
5330 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5331 self.old_role = old_role = self._F2R[old_flags]
5333 # Check for ineffective changes
5334 for attr in self._FLAGS:
5335 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5336 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5337 setattr(self.op, attr, None)
5339 # Past this point, any flag change to False means a transition
5340 # away from the respective state, as only real changes are kept
5342 # TODO: We might query the real power state if it supports OOB
5343 if _SupportsOob(self.cfg, node):
5344 if self.op.offline is False and not (node.powered or
5345 self.op.powered == True):
5346 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5347 " offline status can be reset") %
5349 elif self.op.powered is not None:
5350 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5351 " as it does not support out-of-band"
5352 " handling") % self.op.node_name)
5354 # If we're being de-offlined/drained, we'll MC ourselves if needed
5355 if (self.op.drained == False or self.op.offline == False or
5356 (self.op.master_capable and not node.master_capable)):
5357 if _DecideSelfPromotion(self):
5358 self.op.master_candidate = True
5359 self.LogInfo("Auto-promoting node to master candidate")
5361 # If we're no longer master capable, we'll demote ourselves from MC
5362 if self.op.master_capable == False and node.master_candidate:
5363 self.LogInfo("Demoting from master candidate")
5364 self.op.master_candidate = False
5367 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5368 if self.op.master_candidate:
5369 new_role = self._ROLE_CANDIDATE
5370 elif self.op.drained:
5371 new_role = self._ROLE_DRAINED
5372 elif self.op.offline:
5373 new_role = self._ROLE_OFFLINE
5374 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5375 # False is still in new flags, which means we're un-setting (the
5377 new_role = self._ROLE_REGULAR
5378 else: # no new flags, nothing, keep old role
5381 self.new_role = new_role
5383 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5384 # Trying to transition out of offline status
5385 # TODO: Use standard RPC runner, but make sure it works when the node is
5386 # still marked offline
5387 result = rpc.BootstrapRunner().call_version([node.name])[node.name]
5389 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5390 " to report its version: %s" %
5391 (node.name, result.fail_msg),
5394 self.LogWarning("Transitioning node from offline to online state"
5395 " without using re-add. Please make sure the node"
5398 if self.op.secondary_ip:
5399 # Ok even without locking, because this can't be changed by any LU
5400 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5401 master_singlehomed = master.secondary_ip == master.primary_ip
5402 if master_singlehomed and self.op.secondary_ip:
5403 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5404 " homed cluster", errors.ECODE_INVAL)
5407 if self.affected_instances:
5408 raise errors.OpPrereqError("Cannot change secondary ip: offline"
5409 " node has instances (%s) configured"
5410 " to use it" % self.affected_instances)
5412 # On online nodes, check that no instances are running, and that
5413 # the node has the new ip and we can reach it.
5414 for instance in self.affected_instances:
5415 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5417 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5418 if master.name != node.name:
5419 # check reachability from master secondary ip to new secondary ip
5420 if not netutils.TcpPing(self.op.secondary_ip,
5421 constants.DEFAULT_NODED_PORT,
5422 source=master.secondary_ip):
5423 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5424 " based ping to node daemon port",
5425 errors.ECODE_ENVIRON)
5427 if self.op.ndparams:
5428 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5429 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5430 self.new_ndparams = new_ndparams
5432 def Exec(self, feedback_fn):
5437 old_role = self.old_role
5438 new_role = self.new_role
5442 if self.op.ndparams:
5443 node.ndparams = self.new_ndparams
5445 if self.op.powered is not None:
5446 node.powered = self.op.powered
5448 for attr in ["master_capable", "vm_capable"]:
5449 val = getattr(self.op, attr)
5451 setattr(node, attr, val)
5452 result.append((attr, str(val)))
5454 if new_role != old_role:
5455 # Tell the node to demote itself, if no longer MC and not offline
5456 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5457 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5459 self.LogWarning("Node failed to demote itself: %s", msg)
5461 new_flags = self._R2F[new_role]
5462 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5464 result.append((desc, str(nf)))
5465 (node.master_candidate, node.drained, node.offline) = new_flags
5467 # we locked all nodes, we adjust the CP before updating this node
5469 _AdjustCandidatePool(self, [node.name])
5471 if self.op.secondary_ip:
5472 node.secondary_ip = self.op.secondary_ip
5473 result.append(("secondary_ip", self.op.secondary_ip))
5475 # this will trigger configuration file update, if needed
5476 self.cfg.Update(node, feedback_fn)
5478 # this will trigger job queue propagation or cleanup if the mc
5479 # flag changed
5480 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5481 self.context.ReaddNode(node)
5486 class LUNodePowercycle(NoHooksLU):
5487 """Powercycles a node.
5492 def CheckArguments(self):
5493 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5494 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5495 raise errors.OpPrereqError("The node is the master and the force"
5496 " parameter was not set",
5499 def ExpandNames(self):
5500 """Locking for PowercycleNode.
5502 This is a last-resort option and shouldn't block on other
5503 jobs. Therefore, we grab no locks.
5506 self.needed_locks = {}
5508 def Exec(self, feedback_fn):
5512 result = self.rpc.call_node_powercycle(self.op.node_name,
5513 self.cfg.GetHypervisorType())
5514 result.Raise("Failed to schedule the reboot")
5515 return result.payload
5518 class LUClusterQuery(NoHooksLU):
5519 """Query cluster configuration.
5524 def ExpandNames(self):
5525 self.needed_locks = {}
5527 def Exec(self, feedback_fn):
5528 """Return cluster config.
5531 cluster = self.cfg.GetClusterInfo()
5534 # Filter just for enabled hypervisors
5535 for os_name, hv_dict in cluster.os_hvp.items():
5536 os_hvp[os_name] = {}
5537 for hv_name, hv_params in hv_dict.items():
5538 if hv_name in cluster.enabled_hypervisors:
5539 os_hvp[os_name][hv_name] = hv_params
5541 # Convert ip_family to ip_version
5542 primary_ip_version = constants.IP4_VERSION
5543 if cluster.primary_ip_family == netutils.IP6Address.family:
5544 primary_ip_version = constants.IP6_VERSION
5547 "software_version": constants.RELEASE_VERSION,
5548 "protocol_version": constants.PROTOCOL_VERSION,
5549 "config_version": constants.CONFIG_VERSION,
5550 "os_api_version": max(constants.OS_API_VERSIONS),
5551 "export_version": constants.EXPORT_VERSION,
5552 "architecture": (platform.architecture()[0], platform.machine()),
5553 "name": cluster.cluster_name,
5554 "master": cluster.master_node,
5555 "default_hypervisor": cluster.enabled_hypervisors[0],
5556 "enabled_hypervisors": cluster.enabled_hypervisors,
5557 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5558 for hypervisor_name in cluster.enabled_hypervisors]),
5560 "beparams": cluster.beparams,
5561 "osparams": cluster.osparams,
5562 "nicparams": cluster.nicparams,
5563 "ndparams": cluster.ndparams,
5564 "candidate_pool_size": cluster.candidate_pool_size,
5565 "master_netdev": cluster.master_netdev,
5566 "master_netmask": cluster.master_netmask,
5567 "volume_group_name": cluster.volume_group_name,
5568 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5569 "file_storage_dir": cluster.file_storage_dir,
5570 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5571 "maintain_node_health": cluster.maintain_node_health,
5572 "ctime": cluster.ctime,
5573 "mtime": cluster.mtime,
5574 "uuid": cluster.uuid,
5575 "tags": list(cluster.GetTags()),
5576 "uid_pool": cluster.uid_pool,
5577 "default_iallocator": cluster.default_iallocator,
5578 "reserved_lvs": cluster.reserved_lvs,
5579 "primary_ip_version": primary_ip_version,
5580 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5581 "hidden_os": cluster.hidden_os,
5582 "blacklisted_os": cluster.blacklisted_os,
5588 class LUClusterConfigQuery(NoHooksLU):
5589 """Return configuration values.
5593 _FIELDS_DYNAMIC = utils.FieldSet()
5594 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5595 "watcher_pause", "volume_group_name")
5597 def CheckArguments(self):
5598 _CheckOutputFields(static=self._FIELDS_STATIC,
5599 dynamic=self._FIELDS_DYNAMIC,
5600 selected=self.op.output_fields)
5602 def ExpandNames(self):
5603 self.needed_locks = {}
5605 def Exec(self, feedback_fn):
5606 """Dump a representation of the cluster config to the standard output.
5610 for field in self.op.output_fields:
5611 if field == "cluster_name":
5612 entry = self.cfg.GetClusterName()
5613 elif field == "master_node":
5614 entry = self.cfg.GetMasterNode()
5615 elif field == "drain_flag":
5616 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5617 elif field == "watcher_pause":
5618 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5619 elif field == "volume_group_name":
5620 entry = self.cfg.GetVGName()
5622 raise errors.ParameterError(field)
5623 values.append(entry)
5627 class LUInstanceActivateDisks(NoHooksLU):
5628 """Bring up an instance's disks.
5633 def ExpandNames(self):
5634 self._ExpandAndLockInstance()
5635 self.needed_locks[locking.LEVEL_NODE] = []
5636 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5638 def DeclareLocks(self, level):
5639 if level == locking.LEVEL_NODE:
5640 self._LockInstancesNodes()
5642 def CheckPrereq(self):
5643 """Check prerequisites.
5645 This checks that the instance is in the cluster.
5648 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5649 assert self.instance is not None, \
5650 "Cannot retrieve locked instance %s" % self.op.instance_name
5651 _CheckNodeOnline(self, self.instance.primary_node)
5653 def Exec(self, feedback_fn):
5654 """Activate the disks.
5657 disks_ok, disks_info = \
5658 _AssembleInstanceDisks(self, self.instance,
5659 ignore_size=self.op.ignore_size)
5661 raise errors.OpExecError("Cannot activate block devices")
5666 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5668 """Prepare the block devices for an instance.
5670 This sets up the block devices on all nodes.
5672 @type lu: L{LogicalUnit}
5673 @param lu: the logical unit on whose behalf we execute
5674 @type instance: L{objects.Instance}
5675 @param instance: the instance for whose disks we assemble
5676 @type disks: list of L{objects.Disk} or None
5677 @param disks: which disks to assemble (or all, if None)
5678 @type ignore_secondaries: boolean
5679 @param ignore_secondaries: if true, errors on secondary nodes
5680 won't result in an error return from the function
5681 @type ignore_size: boolean
5682 @param ignore_size: if true, the current known size of the disk
5683 will not be used during the disk activation, useful for cases
5684 when the size is wrong
5685 @return: False if the operation failed, otherwise a list of
5686 (host, instance_visible_name, node_visible_name)
5687 with the mapping from node devices to instance devices
5692 iname = instance.name
5693 disks = _ExpandCheckDisks(instance, disks)
5695   # With the two-pass mechanism we try to reduce the window of
5696   # opportunity for the race condition of switching DRBD to primary
5697   # before the handshake has occurred, but we do not eliminate it
5699 # The proper fix would be to wait (with some limits) until the
5700 # connection has been made and drbd transitions from WFConnection
5701 # into any other network-connected state (Connected, SyncTarget,
5704 # 1st pass, assemble on all nodes in secondary mode
5705 for idx, inst_disk in enumerate(disks):
5706 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5708 node_disk = node_disk.Copy()
5709 node_disk.UnsetSize()
5710 lu.cfg.SetDiskID(node_disk, node)
5711 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5712 msg = result.fail_msg
5714 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5715 " (is_primary=False, pass=1): %s",
5716 inst_disk.iv_name, node, msg)
5717 if not ignore_secondaries:
5720 # FIXME: race condition on drbd migration to primary
5722 # 2nd pass, do only the primary node
5723 for idx, inst_disk in enumerate(disks):
5726 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5727 if node != instance.primary_node:
5730 node_disk = node_disk.Copy()
5731 node_disk.UnsetSize()
5732 lu.cfg.SetDiskID(node_disk, node)
5733 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5734 msg = result.fail_msg
5736 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5737 " (is_primary=True, pass=2): %s",
5738 inst_disk.iv_name, node, msg)
5741 dev_path = result.payload
5743 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5745 # leave the disks configured for the primary node
5746   # this is a workaround that would be better fixed by
5747 # improving the logical/physical id handling
5749 lu.cfg.SetDiskID(disk, instance.primary_node)
5751 return disks_ok, device_info
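# Editor's note: the two-pass ordering above can be hard to follow amid the
# RPC plumbing, so here is a minimal, self-contained sketch of the same idea
# (assemble every node in secondary mode first, then only the primary node
# in primary mode).  The assemble_fn callback is a stand-in for
# call_blockdev_assemble and is an assumption of this sketch, not Ganeti API.
def _TwoPassAssemblySketch(primary_node, disk_nodes, assemble_fn):
  """Illustrative only: secondaries first, then the primary.

  @param primary_node: name of the instance's primary node
  @param disk_nodes: iterable of node names holding the disk
  @param assemble_fn: callable(node, is_primary) doing the actual work
  @return: list of (node, is_primary) tuples in call order

  """
  calls = []
  # 1st pass: all nodes, including the primary, in secondary mode
  for node in disk_nodes:
    assemble_fn(node, False)
    calls.append((node, False))
  # 2nd pass: only the primary node, now promoted to primary mode
  for node in disk_nodes:
    if node != primary_node:
      continue
    assemble_fn(node, True)
    calls.append((node, True))
  return calls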
5754 def _StartInstanceDisks(lu, instance, force):
5755 """Start the disks of an instance.
5758 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5759 ignore_secondaries=force)
5761 _ShutdownInstanceDisks(lu, instance)
5762 if force is not None and not force:
5763 lu.proc.LogWarning("", hint="If the message above refers to a"
5765 " you can retry the operation using '--force'.")
5766 raise errors.OpExecError("Disk consistency error")
5769 class LUInstanceDeactivateDisks(NoHooksLU):
5770 """Shutdown an instance's disks.
5775 def ExpandNames(self):
5776 self._ExpandAndLockInstance()
5777 self.needed_locks[locking.LEVEL_NODE] = []
5778 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5780 def DeclareLocks(self, level):
5781 if level == locking.LEVEL_NODE:
5782 self._LockInstancesNodes()
5784 def CheckPrereq(self):
5785 """Check prerequisites.
5787 This checks that the instance is in the cluster.
5790 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5791 assert self.instance is not None, \
5792 "Cannot retrieve locked instance %s" % self.op.instance_name
5794 def Exec(self, feedback_fn):
5795 """Deactivate the disks
5798 instance = self.instance
5800 _ShutdownInstanceDisks(self, instance)
5802 _SafeShutdownInstanceDisks(self, instance)
5805 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5806 """Shutdown block devices of an instance.
5808 This function checks if an instance is running, before calling
5809 _ShutdownInstanceDisks.
5812 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5813 _ShutdownInstanceDisks(lu, instance, disks=disks)
5816 def _ExpandCheckDisks(instance, disks):
5817 """Return the instance disks selected by the disks list
5819 @type disks: list of L{objects.Disk} or None
5820 @param disks: selected disks
5821 @rtype: list of L{objects.Disk}
5822 @return: selected instance disks to act on
5826 return instance.disks
5828 if not set(disks).issubset(instance.disks):
5829 raise errors.ProgrammerError("Can only act on disks belonging to the"
5834 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5835 """Shutdown block devices of an instance.
5837 This does the shutdown on all nodes of the instance.
5839   If ignore_primary is false, errors on the primary node are ignored.
5844 disks = _ExpandCheckDisks(instance, disks)
5847 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5848 lu.cfg.SetDiskID(top_disk, node)
5849 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5850 msg = result.fail_msg
5852 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5853 disk.iv_name, node, msg)
5854 if ((node == instance.primary_node and not ignore_primary) or
5855 (node != instance.primary_node and not result.offline)):
5860 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5861 """Checks if a node has enough free memory.
5863   This function checks whether a given node has the needed amount of free
5864   memory. If the node has less memory, or we cannot get the
5865   information from the node, this function raises an OpPrereqError
5868 @type lu: C{LogicalUnit}
5869 @param lu: a logical unit from which we get configuration data
5871 @param node: the node to check
5872 @type reason: C{str}
5873 @param reason: string to use in the error message
5874 @type requested: C{int}
5875 @param requested: the amount of memory in MiB to check for
5876 @type hypervisor_name: C{str}
5877 @param hypervisor_name: the hypervisor to ask for memory stats
5878 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5879 we cannot check the node
5882 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5883 nodeinfo[node].Raise("Can't get data from node %s" % node,
5884 prereq=True, ecode=errors.ECODE_ENVIRON)
5885 free_mem = nodeinfo[node].payload.get("memory_free", None)
5886 if not isinstance(free_mem, int):
5887 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5888 " was '%s'" % (node, free_mem),
5889 errors.ECODE_ENVIRON)
5890 if requested > free_mem:
5891 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5892 " needed %s MiB, available %s MiB" %
5893 (node, reason, requested, free_mem),
5897 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5898   """Checks if nodes have enough free disk space in all the VGs.
5900   This function checks whether all given nodes have the needed amount of
5901   free disk. If any node has less disk, or we cannot get the
5902   information from the node, this function raises an OpPrereqError
5905 @type lu: C{LogicalUnit}
5906 @param lu: a logical unit from which we get configuration data
5907 @type nodenames: C{list}
5908 @param nodenames: the list of node names to check
5909 @type req_sizes: C{dict}
5910 @param req_sizes: the hash of vg and corresponding amount of disk in
5912 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5913 or we cannot check the node
5916 for vg, req_size in req_sizes.items():
5917 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
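# Editor's note: a small, self-contained sketch of the per-VG requirement
# check above, using plain dicts instead of RPC results.  The names and
# shapes (free_by_node_vg in particular) are assumptions of this sketch;
# only the {vg_name: required_MiB} layout of req_sizes mirrors the helper.
def _PerVGShortageSketch(free_by_node_vg, req_sizes):
  """Illustrative only: list (node, vg) pairs lacking the requested space.

  @param free_by_node_vg: dict mapping node name to {vg name: free MiB}
  @param req_sizes: dict mapping vg name to required MiB
  @return: list of (node, vg) tuples that do not satisfy the request

  """
  short = []
  for vg, required in req_sizes.items():
    for node, free_map in free_by_node_vg.items():
      if free_map.get(vg, 0) < required:
        short.append((node, vg))
  return short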
5920 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5921 """Checks if nodes have enough free disk space in the specified VG.
5923   This function checks whether all given nodes have the needed amount of
5924   free disk. If any node has less disk, or we cannot get the
5925   information from the node, this function raises an OpPrereqError
5928 @type lu: C{LogicalUnit}
5929 @param lu: a logical unit from which we get configuration data
5930 @type nodenames: C{list}
5931 @param nodenames: the list of node names to check
5933 @param vg: the volume group to check
5934 @type requested: C{int}
5935 @param requested: the amount of disk in MiB to check for
5936 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5937 or we cannot check the node
5940 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5941 for node in nodenames:
5942 info = nodeinfo[node]
5943 info.Raise("Cannot get current information from node %s" % node,
5944 prereq=True, ecode=errors.ECODE_ENVIRON)
5945 vg_free = info.payload.get("vg_free", None)
5946 if not isinstance(vg_free, int):
5947 raise errors.OpPrereqError("Can't compute free disk space on node"
5948 " %s for vg %s, result was '%s'" %
5949 (node, vg, vg_free), errors.ECODE_ENVIRON)
5950 if requested > vg_free:
5951 raise errors.OpPrereqError("Not enough disk space on target node %s"
5952 " vg %s: required %d MiB, available %d MiB" %
5953 (node, vg, requested, vg_free),
5957 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
5958 """Checks if nodes have enough physical CPUs
5960 This function checks if all given nodes have the needed number of
5961   physical CPUs. If any node has fewer CPUs, or we cannot get the
5962 information from the node, this function raises an OpPrereqError
5965 @type lu: C{LogicalUnit}
5966 @param lu: a logical unit from which we get configuration data
5967 @type nodenames: C{list}
5968 @param nodenames: the list of node names to check
5969 @type requested: C{int}
5970 @param requested: the minimum acceptable number of physical CPUs
5971 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
5972 or we cannot check the node
5975 nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name)
5976 for node in nodenames:
5977 info = nodeinfo[node]
5978 info.Raise("Cannot get current information from node %s" % node,
5979 prereq=True, ecode=errors.ECODE_ENVIRON)
5980 num_cpus = info.payload.get("cpu_total", None)
5981 if not isinstance(num_cpus, int):
5982 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
5983 " on node %s, result was '%s'" %
5984 (node, num_cpus), errors.ECODE_ENVIRON)
5985 if requested > num_cpus:
5986 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
5987 "required" % (node, num_cpus, requested),
5991 class LUInstanceStartup(LogicalUnit):
5992 """Starts an instance.
5995 HPATH = "instance-start"
5996 HTYPE = constants.HTYPE_INSTANCE
5999 def CheckArguments(self):
6001 if self.op.beparams:
6002 # fill the beparams dict
6003 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
6005 def ExpandNames(self):
6006 self._ExpandAndLockInstance()
6008 def BuildHooksEnv(self):
6011 This runs on master, primary and secondary nodes of the instance.
6015 "FORCE": self.op.force,
6018 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6022 def BuildHooksNodes(self):
6023 """Build hooks nodes.
6026 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6029 def CheckPrereq(self):
6030 """Check prerequisites.
6032 This checks that the instance is in the cluster.
6035 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6036 assert self.instance is not None, \
6037 "Cannot retrieve locked instance %s" % self.op.instance_name
6040 if self.op.hvparams:
6041 # check hypervisor parameter syntax (locally)
6042 cluster = self.cfg.GetClusterInfo()
6043 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6044 filled_hvp = cluster.FillHV(instance)
6045 filled_hvp.update(self.op.hvparams)
6046 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6047 hv_type.CheckParameterSyntax(filled_hvp)
6048 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6050 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6052 if self.primary_offline and self.op.ignore_offline_nodes:
6053 self.proc.LogWarning("Ignoring offline primary node")
6055 if self.op.hvparams or self.op.beparams:
6056 self.proc.LogWarning("Overridden parameters are ignored")
6058 _CheckNodeOnline(self, instance.primary_node)
6060 bep = self.cfg.GetClusterInfo().FillBE(instance)
6062 # check bridges existence
6063 _CheckInstanceBridgesExist(self, instance)
6065 remote_info = self.rpc.call_instance_info(instance.primary_node,
6067 instance.hypervisor)
6068 remote_info.Raise("Error checking node %s" % instance.primary_node,
6069 prereq=True, ecode=errors.ECODE_ENVIRON)
6070 if not remote_info.payload: # not running already
6071 _CheckNodeFreeMemory(self, instance.primary_node,
6072 "starting instance %s" % instance.name,
6073 bep[constants.BE_MEMORY], instance.hypervisor)
6075 def Exec(self, feedback_fn):
6076 """Start the instance.
6079 instance = self.instance
6080 force = self.op.force
6082 if not self.op.no_remember:
6083 self.cfg.MarkInstanceUp(instance.name)
6085 if self.primary_offline:
6086 assert self.op.ignore_offline_nodes
6087 self.proc.LogInfo("Primary node offline, marked instance as started")
6089 node_current = instance.primary_node
6091 _StartInstanceDisks(self, instance, force)
6094 self.rpc.call_instance_start(node_current,
6095 (instance, self.op.hvparams,
6097 self.op.startup_paused)
6098 msg = result.fail_msg
6100 _ShutdownInstanceDisks(self, instance)
6101 raise errors.OpExecError("Could not start instance: %s" % msg)
6104 class LUInstanceReboot(LogicalUnit):
6105 """Reboot an instance.
6108 HPATH = "instance-reboot"
6109 HTYPE = constants.HTYPE_INSTANCE
6112 def ExpandNames(self):
6113 self._ExpandAndLockInstance()
6115 def BuildHooksEnv(self):
6118 This runs on master, primary and secondary nodes of the instance.
6122 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6123 "REBOOT_TYPE": self.op.reboot_type,
6124 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6127 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6131 def BuildHooksNodes(self):
6132 """Build hooks nodes.
6135 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6138 def CheckPrereq(self):
6139 """Check prerequisites.
6141 This checks that the instance is in the cluster.
6144 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6145 assert self.instance is not None, \
6146 "Cannot retrieve locked instance %s" % self.op.instance_name
6148 _CheckNodeOnline(self, instance.primary_node)
6150 # check bridges existence
6151 _CheckInstanceBridgesExist(self, instance)
6153 def Exec(self, feedback_fn):
6154 """Reboot the instance.
6157 instance = self.instance
6158 ignore_secondaries = self.op.ignore_secondaries
6159 reboot_type = self.op.reboot_type
6161 remote_info = self.rpc.call_instance_info(instance.primary_node,
6163 instance.hypervisor)
6164 remote_info.Raise("Error checking node %s" % instance.primary_node)
6165 instance_running = bool(remote_info.payload)
6167 node_current = instance.primary_node
6169 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6170 constants.INSTANCE_REBOOT_HARD]:
6171 for disk in instance.disks:
6172 self.cfg.SetDiskID(disk, node_current)
6173 result = self.rpc.call_instance_reboot(node_current, instance,
6175 self.op.shutdown_timeout)
6176 result.Raise("Could not reboot instance")
6178 if instance_running:
6179 result = self.rpc.call_instance_shutdown(node_current, instance,
6180 self.op.shutdown_timeout)
6181 result.Raise("Could not shutdown instance for full reboot")
6182 _ShutdownInstanceDisks(self, instance)
6184 self.LogInfo("Instance %s was already stopped, starting now",
6186 _StartInstanceDisks(self, instance, ignore_secondaries)
6187 result = self.rpc.call_instance_start(node_current,
6188 (instance, None, None), False)
6189 msg = result.fail_msg
6191 _ShutdownInstanceDisks(self, instance)
6192 raise errors.OpExecError("Could not start instance for"
6193 " full reboot: %s" % msg)
6195 self.cfg.MarkInstanceUp(instance.name)
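# Editor's note: a compact sketch of the reboot dispatch implemented in
# LUInstanceReboot.Exec above.  The strings "soft", "hard" and "full" stand
# in for the INSTANCE_REBOOT_* constants and the step names are made up;
# the sketch only illustrates the branching and is not part of Ganeti.
def _RebootDispatchSketch(reboot_type, instance_running):
  """Illustrative only: which steps a given reboot type maps to.

  @param reboot_type: one of "soft", "hard", "full" (stand-in values)
  @param instance_running: whether the instance is currently running
  @return: ordered list of step names

  """
  if instance_running and reboot_type in ("soft", "hard"):
    # soft/hard reboots of a running instance go through the hypervisor
    return ["reboot-in-place"]
  steps = []
  if instance_running:
    # full reboot: stop the instance and its disks first
    steps.extend(["shutdown-instance", "shutdown-disks"])
  # in all remaining cases the instance is (re)started from cold
  steps.extend(["start-disks", "start-instance"])
  return steps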
6198 class LUInstanceShutdown(LogicalUnit):
6199 """Shutdown an instance.
6202 HPATH = "instance-stop"
6203 HTYPE = constants.HTYPE_INSTANCE
6206 def ExpandNames(self):
6207 self._ExpandAndLockInstance()
6209 def BuildHooksEnv(self):
6212 This runs on master, primary and secondary nodes of the instance.
6215 env = _BuildInstanceHookEnvByObject(self, self.instance)
6216 env["TIMEOUT"] = self.op.timeout
6219 def BuildHooksNodes(self):
6220 """Build hooks nodes.
6223 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6226 def CheckPrereq(self):
6227 """Check prerequisites.
6229 This checks that the instance is in the cluster.
6232 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6233 assert self.instance is not None, \
6234 "Cannot retrieve locked instance %s" % self.op.instance_name
6236 self.primary_offline = \
6237 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6239 if self.primary_offline and self.op.ignore_offline_nodes:
6240 self.proc.LogWarning("Ignoring offline primary node")
6242 _CheckNodeOnline(self, self.instance.primary_node)
6244 def Exec(self, feedback_fn):
6245 """Shutdown the instance.
6248 instance = self.instance
6249 node_current = instance.primary_node
6250 timeout = self.op.timeout
6252 if not self.op.no_remember:
6253 self.cfg.MarkInstanceDown(instance.name)
6255 if self.primary_offline:
6256 assert self.op.ignore_offline_nodes
6257 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6259 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6260 msg = result.fail_msg
6262 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6264 _ShutdownInstanceDisks(self, instance)
6267 class LUInstanceReinstall(LogicalUnit):
6268 """Reinstall an instance.
6271 HPATH = "instance-reinstall"
6272 HTYPE = constants.HTYPE_INSTANCE
6275 def ExpandNames(self):
6276 self._ExpandAndLockInstance()
6278 def BuildHooksEnv(self):
6281 This runs on master, primary and secondary nodes of the instance.
6284 return _BuildInstanceHookEnvByObject(self, self.instance)
6286 def BuildHooksNodes(self):
6287 """Build hooks nodes.
6290 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6293 def CheckPrereq(self):
6294 """Check prerequisites.
6296 This checks that the instance is in the cluster and is not running.
6299 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6300 assert instance is not None, \
6301 "Cannot retrieve locked instance %s" % self.op.instance_name
6302 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6303 " offline, cannot reinstall")
6304 for node in instance.secondary_nodes:
6305 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6306 " cannot reinstall")
6308 if instance.disk_template == constants.DT_DISKLESS:
6309 raise errors.OpPrereqError("Instance '%s' has no disks" %
6310 self.op.instance_name,
6312 _CheckInstanceDown(self, instance, "cannot reinstall")
6314 if self.op.os_type is not None:
6316 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6317 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6318 instance_os = self.op.os_type
6320 instance_os = instance.os
6322 nodelist = list(instance.all_nodes)
6324 if self.op.osparams:
6325 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6326 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6327 self.os_inst = i_osdict # the new dict (without defaults)
6331 self.instance = instance
6333 def Exec(self, feedback_fn):
6334 """Reinstall the instance.
6337 inst = self.instance
6339 if self.op.os_type is not None:
6340 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6341 inst.os = self.op.os_type
6342 # Write to configuration
6343 self.cfg.Update(inst, feedback_fn)
6345 _StartInstanceDisks(self, inst, None)
6347 feedback_fn("Running the instance OS create scripts...")
6348 # FIXME: pass debug option from opcode to backend
6349 result = self.rpc.call_instance_os_add(inst.primary_node,
6350 (inst, self.os_inst), True,
6351 self.op.debug_level)
6352 result.Raise("Could not install OS for instance %s on node %s" %
6353 (inst.name, inst.primary_node))
6355 _ShutdownInstanceDisks(self, inst)
6358 class LUInstanceRecreateDisks(LogicalUnit):
6359 """Recreate an instance's missing disks.
6362 HPATH = "instance-recreate-disks"
6363 HTYPE = constants.HTYPE_INSTANCE
6366 def CheckArguments(self):
6367 # normalise the disk list
6368 self.op.disks = sorted(frozenset(self.op.disks))
6370 def ExpandNames(self):
6371 self._ExpandAndLockInstance()
6372 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6374 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6375 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6377 self.needed_locks[locking.LEVEL_NODE] = []
6379 def DeclareLocks(self, level):
6380 if level == locking.LEVEL_NODE:
6381 # if we replace the nodes, we only need to lock the old primary,
6382 # otherwise we need to lock all nodes for disk re-creation
6383 primary_only = bool(self.op.nodes)
6384 self._LockInstancesNodes(primary_only=primary_only)
6386 def BuildHooksEnv(self):
6389 This runs on master, primary and secondary nodes of the instance.
6392 return _BuildInstanceHookEnvByObject(self, self.instance)
6394 def BuildHooksNodes(self):
6395 """Build hooks nodes.
6398 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6401 def CheckPrereq(self):
6402 """Check prerequisites.
6404 This checks that the instance is in the cluster and is not running.
6407 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6408 assert instance is not None, \
6409 "Cannot retrieve locked instance %s" % self.op.instance_name
6411 if len(self.op.nodes) != len(instance.all_nodes):
6412 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6413 " %d replacement nodes were specified" %
6414 (instance.name, len(instance.all_nodes),
6415 len(self.op.nodes)),
6417 assert instance.disk_template != constants.DT_DRBD8 or \
6418 len(self.op.nodes) == 2
6419 assert instance.disk_template != constants.DT_PLAIN or \
6420 len(self.op.nodes) == 1
6421 primary_node = self.op.nodes[0]
6423 primary_node = instance.primary_node
6424 _CheckNodeOnline(self, primary_node)
6426 if instance.disk_template == constants.DT_DISKLESS:
6427 raise errors.OpPrereqError("Instance '%s' has no disks" %
6428 self.op.instance_name, errors.ECODE_INVAL)
6429 # if we replace nodes *and* the old primary is offline, we don't
6431 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6432 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6433 if not (self.op.nodes and old_pnode.offline):
6434 _CheckInstanceDown(self, instance, "cannot recreate disks")
6436 if not self.op.disks:
6437 self.op.disks = range(len(instance.disks))
6439 for idx in self.op.disks:
6440 if idx >= len(instance.disks):
6441 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6443 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6444 raise errors.OpPrereqError("Can't recreate disks partially and"
6445 " change the nodes at the same time",
6447 self.instance = instance
6449 def Exec(self, feedback_fn):
6450 """Recreate the disks.
6453 instance = self.instance
6456 mods = [] # keeps track of needed logical_id changes
6458 for idx, disk in enumerate(instance.disks):
6459 if idx not in self.op.disks: # disk idx has not been passed in
6462 # update secondaries for disks, if needed
6464 if disk.dev_type == constants.LD_DRBD8:
6465 # need to update the nodes and minors
6466 assert len(self.op.nodes) == 2
6467 assert len(disk.logical_id) == 6 # otherwise disk internals
6469 (_, _, old_port, _, _, old_secret) = disk.logical_id
6470 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6471 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6472 new_minors[0], new_minors[1], old_secret)
6473 assert len(disk.logical_id) == len(new_id)
6474 mods.append((idx, new_id))
6476 # now that we have passed all asserts above, we can apply the mods
6477 # in a single run (to avoid partial changes)
6478 for idx, new_id in mods:
6479 instance.disks[idx].logical_id = new_id
6481 # change primary node, if needed
6483 instance.primary_node = self.op.nodes[0]
6484 self.LogWarning("Changing the instance's nodes, you will have to"
6485 " remove any disks left on the older nodes manually")
6488 self.cfg.Update(instance, feedback_fn)
6490 _CreateDisks(self, instance, to_skip=to_skip)
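# Editor's note: a minimal sketch of the DRBD8 logical_id remapping done in
# LUInstanceRecreateDisks.Exec above.  The 6-tuple layout
# (node_a, node_b, port, minor_a, minor_b, secret) is taken from the code
# above; the function itself is illustrative and not part of Ganeti.
def _RemapDrbdLogicalIdSketch(logical_id, new_nodes, new_minors):
  """Illustrative only: rebuild a DRBD8 logical_id for a new node pair.

  Port and secret are preserved, while the node names and minors are
  replaced, exactly as the mods list above is built.

  """
  (_, _, old_port, _, _, old_secret) = logical_id
  return (new_nodes[0], new_nodes[1], old_port,
          new_minors[0], new_minors[1], old_secret)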
6493 class LUInstanceRename(LogicalUnit):
6494 """Rename an instance.
6497 HPATH = "instance-rename"
6498 HTYPE = constants.HTYPE_INSTANCE
6500 def CheckArguments(self):
6504 if self.op.ip_check and not self.op.name_check:
6505 # TODO: make the ip check more flexible and not depend on the name check
6506 raise errors.OpPrereqError("IP address check requires a name check",
6509 def BuildHooksEnv(self):
6512 This runs on master, primary and secondary nodes of the instance.
6515 env = _BuildInstanceHookEnvByObject(self, self.instance)
6516 env["INSTANCE_NEW_NAME"] = self.op.new_name
6519 def BuildHooksNodes(self):
6520 """Build hooks nodes.
6523 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6526 def CheckPrereq(self):
6527 """Check prerequisites.
6529 This checks that the instance is in the cluster and is not running.
6532 self.op.instance_name = _ExpandInstanceName(self.cfg,
6533 self.op.instance_name)
6534 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6535 assert instance is not None
6536 _CheckNodeOnline(self, instance.primary_node)
6537 _CheckInstanceDown(self, instance, "cannot rename")
6538 self.instance = instance
6540 new_name = self.op.new_name
6541 if self.op.name_check:
6542 hostname = netutils.GetHostname(name=new_name)
6543 if hostname != new_name:
6544 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6546 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6547 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6548 " same as given hostname '%s'") %
6549 (hostname.name, self.op.new_name),
6551 new_name = self.op.new_name = hostname.name
6552 if (self.op.ip_check and
6553 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6554 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6555 (hostname.ip, new_name),
6556 errors.ECODE_NOTUNIQUE)
6558 instance_list = self.cfg.GetInstanceList()
6559 if new_name in instance_list and new_name != instance.name:
6560 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6561 new_name, errors.ECODE_EXISTS)
6563 def Exec(self, feedback_fn):
6564 """Rename the instance.
6567 inst = self.instance
6568 old_name = inst.name
6570 rename_file_storage = False
6571 if (inst.disk_template in constants.DTS_FILEBASED and
6572 self.op.new_name != inst.name):
6573 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6574 rename_file_storage = True
6576 self.cfg.RenameInstance(inst.name, self.op.new_name)
6577 # Change the instance lock. This is definitely safe while we hold the BGL.
6578 # Otherwise the new lock would have to be added in acquired mode.
6580 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6581 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6583 # re-read the instance from the configuration after rename
6584 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6586 if rename_file_storage:
6587 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6588 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6589 old_file_storage_dir,
6590 new_file_storage_dir)
6591 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6592 " (but the instance has been renamed in Ganeti)" %
6593 (inst.primary_node, old_file_storage_dir,
6594 new_file_storage_dir))
6596 _StartInstanceDisks(self, inst, None)
6598 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6599 old_name, self.op.debug_level)
6600 msg = result.fail_msg
6602 msg = ("Could not run OS rename script for instance %s on node %s"
6603 " (but the instance has been renamed in Ganeti): %s" %
6604 (inst.name, inst.primary_node, msg))
6605 self.proc.LogWarning(msg)
6607 _ShutdownInstanceDisks(self, inst)
6612 class LUInstanceRemove(LogicalUnit):
6613 """Remove an instance.
6616 HPATH = "instance-remove"
6617 HTYPE = constants.HTYPE_INSTANCE
6620 def ExpandNames(self):
6621 self._ExpandAndLockInstance()
6622 self.needed_locks[locking.LEVEL_NODE] = []
6623 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6625 def DeclareLocks(self, level):
6626 if level == locking.LEVEL_NODE:
6627 self._LockInstancesNodes()
6629 def BuildHooksEnv(self):
6632 This runs on master, primary and secondary nodes of the instance.
6635 env = _BuildInstanceHookEnvByObject(self, self.instance)
6636 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6639 def BuildHooksNodes(self):
6640 """Build hooks nodes.
6643 nl = [self.cfg.GetMasterNode()]
6644 nl_post = list(self.instance.all_nodes) + nl
6645 return (nl, nl_post)
6647 def CheckPrereq(self):
6648 """Check prerequisites.
6650 This checks that the instance is in the cluster.
6653 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6654 assert self.instance is not None, \
6655 "Cannot retrieve locked instance %s" % self.op.instance_name
6657 def Exec(self, feedback_fn):
6658 """Remove the instance.
6661 instance = self.instance
6662 logging.info("Shutting down instance %s on node %s",
6663 instance.name, instance.primary_node)
6665 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6666 self.op.shutdown_timeout)
6667 msg = result.fail_msg
6669 if self.op.ignore_failures:
6670 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6672 raise errors.OpExecError("Could not shutdown instance %s on"
6674 (instance.name, instance.primary_node, msg))
6676 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6679 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6680 """Utility function to remove an instance.
6683 logging.info("Removing block devices for instance %s", instance.name)
6685 if not _RemoveDisks(lu, instance):
6686 if not ignore_failures:
6687 raise errors.OpExecError("Can't remove instance's disks")
6688 feedback_fn("Warning: can't remove instance's disks")
6690 logging.info("Removing instance %s out of cluster config", instance.name)
6692 lu.cfg.RemoveInstance(instance.name)
6694 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6695 "Instance lock removal conflict"
6697 # Remove lock for the instance
6698 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6701 class LUInstanceQuery(NoHooksLU):
6702 """Logical unit for querying instances.
6705 # pylint: disable=W0142
6708 def CheckArguments(self):
6709 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6710 self.op.output_fields, self.op.use_locking)
6712 def ExpandNames(self):
6713 self.iq.ExpandNames(self)
6715 def DeclareLocks(self, level):
6716 self.iq.DeclareLocks(self, level)
6718 def Exec(self, feedback_fn):
6719 return self.iq.OldStyleQuery(self)
6722 class LUInstanceFailover(LogicalUnit):
6723 """Failover an instance.
6726 HPATH = "instance-failover"
6727 HTYPE = constants.HTYPE_INSTANCE
6730 def CheckArguments(self):
6731 """Check the arguments.
6734 self.iallocator = getattr(self.op, "iallocator", None)
6735 self.target_node = getattr(self.op, "target_node", None)
6737 def ExpandNames(self):
6738 self._ExpandAndLockInstance()
6740 if self.op.target_node is not None:
6741 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6743 self.needed_locks[locking.LEVEL_NODE] = []
6744 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6746 ignore_consistency = self.op.ignore_consistency
6747 shutdown_timeout = self.op.shutdown_timeout
6748 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6751 ignore_consistency=ignore_consistency,
6752 shutdown_timeout=shutdown_timeout)
6753 self.tasklets = [self._migrater]
6755 def DeclareLocks(self, level):
6756 if level == locking.LEVEL_NODE:
6757 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6758 if instance.disk_template in constants.DTS_EXT_MIRROR:
6759 if self.op.target_node is None:
6760 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6762 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6763 self.op.target_node]
6764 del self.recalculate_locks[locking.LEVEL_NODE]
6766 self._LockInstancesNodes()
6768 def BuildHooksEnv(self):
6771 This runs on master, primary and secondary nodes of the instance.
6774 instance = self._migrater.instance
6775 source_node = instance.primary_node
6776 target_node = self.op.target_node
6778 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6779 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6780 "OLD_PRIMARY": source_node,
6781 "NEW_PRIMARY": target_node,
6784 if instance.disk_template in constants.DTS_INT_MIRROR:
6785 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6786 env["NEW_SECONDARY"] = source_node
6788 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6790 env.update(_BuildInstanceHookEnvByObject(self, instance))
6794 def BuildHooksNodes(self):
6795 """Build hooks nodes.
6798 instance = self._migrater.instance
6799 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6800 return (nl, nl + [instance.primary_node])
6803 class LUInstanceMigrate(LogicalUnit):
6804 """Migrate an instance.
6806 This is migration without shutting down, compared to the failover,
6807 which is done with shutdown.
6810 HPATH = "instance-migrate"
6811 HTYPE = constants.HTYPE_INSTANCE
6814 def ExpandNames(self):
6815 self._ExpandAndLockInstance()
6817 if self.op.target_node is not None:
6818 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6820 self.needed_locks[locking.LEVEL_NODE] = []
6821 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6823 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6824 cleanup=self.op.cleanup,
6826 fallback=self.op.allow_failover)
6827 self.tasklets = [self._migrater]
6829 def DeclareLocks(self, level):
6830 if level == locking.LEVEL_NODE:
6831 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6832 if instance.disk_template in constants.DTS_EXT_MIRROR:
6833 if self.op.target_node is None:
6834 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6836 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6837 self.op.target_node]
6838 del self.recalculate_locks[locking.LEVEL_NODE]
6840 self._LockInstancesNodes()
6842 def BuildHooksEnv(self):
6845 This runs on master, primary and secondary nodes of the instance.
6848 instance = self._migrater.instance
6849 source_node = instance.primary_node
6850 target_node = self.op.target_node
6851 env = _BuildInstanceHookEnvByObject(self, instance)
6853 "MIGRATE_LIVE": self._migrater.live,
6854 "MIGRATE_CLEANUP": self.op.cleanup,
6855 "OLD_PRIMARY": source_node,
6856 "NEW_PRIMARY": target_node,
6859 if instance.disk_template in constants.DTS_INT_MIRROR:
6860 env["OLD_SECONDARY"] = target_node
6861 env["NEW_SECONDARY"] = source_node
6863 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6867 def BuildHooksNodes(self):
6868 """Build hooks nodes.
6871 instance = self._migrater.instance
6872 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6873 return (nl, nl + [instance.primary_node])
6876 class LUInstanceMove(LogicalUnit):
6877 """Move an instance by data-copying.
6880 HPATH = "instance-move"
6881 HTYPE = constants.HTYPE_INSTANCE
6884 def ExpandNames(self):
6885 self._ExpandAndLockInstance()
6886 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6887 self.op.target_node = target_node
6888 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6889 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6891 def DeclareLocks(self, level):
6892 if level == locking.LEVEL_NODE:
6893 self._LockInstancesNodes(primary_only=True)
6895 def BuildHooksEnv(self):
6898 This runs on master, primary and secondary nodes of the instance.
6902 "TARGET_NODE": self.op.target_node,
6903 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6905 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6908 def BuildHooksNodes(self):
6909 """Build hooks nodes.
6913 self.cfg.GetMasterNode(),
6914 self.instance.primary_node,
6915 self.op.target_node,
6919 def CheckPrereq(self):
6920 """Check prerequisites.
6922 This checks that the instance is in the cluster.
6925 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6926 assert self.instance is not None, \
6927 "Cannot retrieve locked instance %s" % self.op.instance_name
6929 node = self.cfg.GetNodeInfo(self.op.target_node)
6930 assert node is not None, \
6931 "Cannot retrieve locked node %s" % self.op.target_node
6933 self.target_node = target_node = node.name
6935 if target_node == instance.primary_node:
6936 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6937 (instance.name, target_node),
6940 bep = self.cfg.GetClusterInfo().FillBE(instance)
6942 for idx, dsk in enumerate(instance.disks):
6943 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6944 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6945 " cannot copy" % idx, errors.ECODE_STATE)
6947 _CheckNodeOnline(self, target_node)
6948 _CheckNodeNotDrained(self, target_node)
6949 _CheckNodeVmCapable(self, target_node)
6951 if instance.admin_up:
6952 # check memory requirements on the secondary node
6953 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6954 instance.name, bep[constants.BE_MEMORY],
6955 instance.hypervisor)
6957 self.LogInfo("Not checking memory on the secondary node as"
6958 " instance will not be started")
6960     # check bridge existence
6961 _CheckInstanceBridgesExist(self, instance, node=target_node)
6963 def Exec(self, feedback_fn):
6964 """Move an instance.
6966 The move is done by shutting it down on its present node, copying
6967 the data over (slow) and starting it on the new node.
6970 instance = self.instance
6972 source_node = instance.primary_node
6973 target_node = self.target_node
6975 self.LogInfo("Shutting down instance %s on source node %s",
6976 instance.name, source_node)
6978 result = self.rpc.call_instance_shutdown(source_node, instance,
6979 self.op.shutdown_timeout)
6980 msg = result.fail_msg
6982 if self.op.ignore_consistency:
6983 self.proc.LogWarning("Could not shutdown instance %s on node %s."
6984 " Proceeding anyway. Please make sure node"
6985 " %s is down. Error details: %s",
6986 instance.name, source_node, source_node, msg)
6988 raise errors.OpExecError("Could not shutdown instance %s on"
6990 (instance.name, source_node, msg))
6992 # create the target disks
6994 _CreateDisks(self, instance, target_node=target_node)
6995 except errors.OpExecError:
6996 self.LogWarning("Device creation failed, reverting...")
6998 _RemoveDisks(self, instance, target_node=target_node)
7000 self.cfg.ReleaseDRBDMinors(instance.name)
7003 cluster_name = self.cfg.GetClusterInfo().cluster_name
7006 # activate, get path, copy the data over
7007 for idx, disk in enumerate(instance.disks):
7008 self.LogInfo("Copying data for disk %d", idx)
7009 result = self.rpc.call_blockdev_assemble(target_node, disk,
7010 instance.name, True, idx)
7012 self.LogWarning("Can't assemble newly created disk %d: %s",
7013 idx, result.fail_msg)
7014 errs.append(result.fail_msg)
7016 dev_path = result.payload
7017 result = self.rpc.call_blockdev_export(source_node, disk,
7018 target_node, dev_path,
7021 self.LogWarning("Can't copy data over for disk %d: %s",
7022 idx, result.fail_msg)
7023 errs.append(result.fail_msg)
7027 self.LogWarning("Some disks failed to copy, aborting")
7029 _RemoveDisks(self, instance, target_node=target_node)
7031 self.cfg.ReleaseDRBDMinors(instance.name)
7032 raise errors.OpExecError("Errors during disk copy: %s" %
7035 instance.primary_node = target_node
7036 self.cfg.Update(instance, feedback_fn)
7038 self.LogInfo("Removing the disks on the original node")
7039 _RemoveDisks(self, instance, target_node=source_node)
7041 # Only start the instance if it's marked as up
7042 if instance.admin_up:
7043 self.LogInfo("Starting instance %s on node %s",
7044 instance.name, target_node)
7046 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7047 ignore_secondaries=True)
7049 _ShutdownInstanceDisks(self, instance)
7050 raise errors.OpExecError("Can't activate the instance's disks")
7052 result = self.rpc.call_instance_start(target_node,
7053 (instance, None, None), False)
7054 msg = result.fail_msg
7056 _ShutdownInstanceDisks(self, instance)
7057 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7058 (instance.name, target_node, msg))
7061 class LUNodeMigrate(LogicalUnit):
7062 """Migrate all instances from a node.
7065 HPATH = "node-migrate"
7066 HTYPE = constants.HTYPE_NODE
7069 def CheckArguments(self):
7072 def ExpandNames(self):
7073 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7075 self.share_locks = _ShareAll()
7076 self.needed_locks = {
7077 locking.LEVEL_NODE: [self.op.node_name],
7080 def BuildHooksEnv(self):
7083 This runs on the master, the primary and all the secondaries.
7087 "NODE_NAME": self.op.node_name,
7090 def BuildHooksNodes(self):
7091 """Build hooks nodes.
7094 nl = [self.cfg.GetMasterNode()]
7097 def CheckPrereq(self):
7100 def Exec(self, feedback_fn):
7101 # Prepare jobs for migration instances
7103 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7106 iallocator=self.op.iallocator,
7107 target_node=self.op.target_node)]
7108 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7111 # TODO: Run iallocator in this opcode and pass correct placement options to
7112 # OpInstanceMigrate. Since other jobs can modify the cluster between
7113 # running the iallocator and the actual migration, a good consistency model
7114 # will have to be found.
7116 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7117 frozenset([self.op.node_name]))
7119 return ResultWithJobs(jobs)
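# Editor's note: schematically, the result returned above is a list of
# single-opcode jobs, one per primary instance on the evacuated node, e.g.
#
#   jobs = [
#     [opcodes.OpInstanceMigrate(instance_name="inst1", ...)],
#     [opcodes.OpInstanceMigrate(instance_name="inst2", ...)],
#   ]
#
# i.e. a list of lists of opcodes ("inst1"/"inst2" are made-up names); each
# inner list becomes one job whose ID is reported back to the caller.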
7122 class TLMigrateInstance(Tasklet):
7123 """Tasklet class for instance migration.
7126 @ivar live: whether the migration will be done live or non-live;
7127       this variable is initialized only after CheckPrereq has run
7128 @type cleanup: boolean
7129   @ivar cleanup: Whether we clean up from a failed migration
7130 @type iallocator: string
7131 @ivar iallocator: The iallocator used to determine target_node
7132 @type target_node: string
7133 @ivar target_node: If given, the target_node to reallocate the instance to
7134 @type failover: boolean
7135 @ivar failover: Whether operation results in failover or migration
7136 @type fallback: boolean
7137 @ivar fallback: Whether fallback to failover is allowed if migration not
7139 @type ignore_consistency: boolean
7140   @ivar ignore_consistency: Whether we should ignore consistency between source
7142 @type shutdown_timeout: int
7143 @ivar shutdown_timeout: In case of failover timeout of the shutdown
7148 _MIGRATION_POLL_INTERVAL = 1 # seconds
7149 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7151 def __init__(self, lu, instance_name, cleanup=False,
7152 failover=False, fallback=False,
7153 ignore_consistency=False,
7154 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7155 """Initializes this class.
7158 Tasklet.__init__(self, lu)
7161 self.instance_name = instance_name
7162 self.cleanup = cleanup
7163 self.live = False # will be overridden later
7164 self.failover = failover
7165 self.fallback = fallback
7166 self.ignore_consistency = ignore_consistency
7167 self.shutdown_timeout = shutdown_timeout
7169 def CheckPrereq(self):
7170 """Check prerequisites.
7172 This checks that the instance is in the cluster.
7175 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7176 instance = self.cfg.GetInstanceInfo(instance_name)
7177 assert instance is not None
7178 self.instance = instance
7180 if (not self.cleanup and not instance.admin_up and not self.failover and
7182 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7184 self.failover = True
7186 if instance.disk_template not in constants.DTS_MIRRORED:
7191 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7192 " %s" % (instance.disk_template, text),
7195 if instance.disk_template in constants.DTS_EXT_MIRROR:
7196 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7198 if self.lu.op.iallocator:
7199 self._RunAllocator()
7201         # We set self.target_node as it is required by
7203 self.target_node = self.lu.op.target_node
7205 # self.target_node is already populated, either directly or by the
7207 target_node = self.target_node
7208 if self.target_node == instance.primary_node:
7209 raise errors.OpPrereqError("Cannot migrate instance %s"
7210 " to its primary (%s)" %
7211 (instance.name, instance.primary_node))
7213 if len(self.lu.tasklets) == 1:
7214 # It is safe to release locks only when we're the only tasklet
7216 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7217 keep=[instance.primary_node, self.target_node])
7220 secondary_nodes = instance.secondary_nodes
7221 if not secondary_nodes:
7222 raise errors.ConfigurationError("No secondary node but using"
7223 " %s disk template" %
7224 instance.disk_template)
7225 target_node = secondary_nodes[0]
7226 if self.lu.op.iallocator or (self.lu.op.target_node and
7227 self.lu.op.target_node != target_node):
7229 text = "failed over"
7232 raise errors.OpPrereqError("Instances with disk template %s cannot"
7233 " be %s to arbitrary nodes"
7234 " (neither an iallocator nor a target"
7235 " node can be passed)" %
7236 (instance.disk_template, text),
7239 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7241 # check memory requirements on the secondary node
7242 if not self.failover or instance.admin_up:
7243 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7244 instance.name, i_be[constants.BE_MEMORY],
7245 instance.hypervisor)
7247 self.lu.LogInfo("Not checking memory on the secondary node as"
7248 " instance will not be started")
7250     # check bridge existence
7251 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7253 if not self.cleanup:
7254 _CheckNodeNotDrained(self.lu, target_node)
7255 if not self.failover:
7256 result = self.rpc.call_instance_migratable(instance.primary_node,
7258 if result.fail_msg and self.fallback:
7259 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7261 self.failover = True
7263 result.Raise("Can't migrate, please use failover",
7264 prereq=True, ecode=errors.ECODE_STATE)
7266 assert not (self.failover and self.cleanup)
7268 if not self.failover:
7269 if self.lu.op.live is not None and self.lu.op.mode is not None:
7270 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7271 " parameters are accepted",
7273 if self.lu.op.live is not None:
7275 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7277 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7278 # reset the 'live' parameter to None so that repeated
7279 # invocations of CheckPrereq do not raise an exception
7280 self.lu.op.live = None
7281 elif self.lu.op.mode is None:
7282 # read the default value from the hypervisor
7283 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7285 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7287 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7289 # Failover is never live
7292 def _RunAllocator(self):
7293 """Run the allocator based on input opcode.
7296 ial = IAllocator(self.cfg, self.rpc,
7297 mode=constants.IALLOCATOR_MODE_RELOC,
7298 name=self.instance_name,
7299 # TODO See why hail breaks with a single node below
7300 relocate_from=[self.instance.primary_node,
7301 self.instance.primary_node],
7304 ial.Run(self.lu.op.iallocator)
7307 raise errors.OpPrereqError("Can't compute nodes using"
7308 " iallocator '%s': %s" %
7309 (self.lu.op.iallocator, ial.info),
7311 if len(ial.result) != ial.required_nodes:
7312 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7313 " of nodes (%s), required %s" %
7314 (self.lu.op.iallocator, len(ial.result),
7315 ial.required_nodes), errors.ECODE_FAULT)
7316 self.target_node = ial.result[0]
7317 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7318 self.instance_name, self.lu.op.iallocator,
7319 utils.CommaJoin(ial.result))
7321 def _WaitUntilSync(self):
7322 """Poll with custom rpc for disk sync.
7324 This uses our own step-based rpc call.
7327 self.feedback_fn("* wait until resync is done")
7331 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7333 self.instance.disks)
7335 for node, nres in result.items():
7336 nres.Raise("Cannot resync disks on node %s" % node)
7337 node_done, node_percent = nres.payload
7338 all_done = all_done and node_done
7339 if node_percent is not None:
7340 min_percent = min(min_percent, node_percent)
7342 if min_percent < 100:
7343 self.feedback_fn(" - progress: %.1f%%" % min_percent)
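  # Editor's note: a worked example (not part of Ganeti) of how the loop in
  # _WaitUntilSync combines the per-node payloads: all_done is the AND of
  # every node's "done" flag and min_percent tracks the slowest node, e.g.
  #
  #   payloads = [(True, 100), (False, 73.5)]
  #   all_done = all(done for (done, _) in payloads)               # False
  #   min_pct = min(p for (_, p) in payloads if p is not None)     # 73.5
  #
  # so progress feedback keeps being emitted while the slowest disk is
  # below 100%.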
7346 def _EnsureSecondary(self, node):
7347 """Demote a node to secondary.
7350 self.feedback_fn("* switching node %s to secondary mode" % node)
7352 for dev in self.instance.disks:
7353 self.cfg.SetDiskID(dev, node)
7355 result = self.rpc.call_blockdev_close(node, self.instance.name,
7356 self.instance.disks)
7357 result.Raise("Cannot change disk to secondary on node %s" % node)
7359 def _GoStandalone(self):
7360 """Disconnect from the network.
7363 self.feedback_fn("* changing into standalone mode")
7364 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7365 self.instance.disks)
7366 for node, nres in result.items():
7367 nres.Raise("Cannot disconnect disks node %s" % node)
7369 def _GoReconnect(self, multimaster):
7370 """Reconnect to the network.
7376 msg = "single-master"
7377 self.feedback_fn("* changing disks into %s mode" % msg)
7378 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7379 self.instance.disks,
7380 self.instance.name, multimaster)
7381 for node, nres in result.items():
7382 nres.Raise("Cannot change disks config on node %s" % node)
7384 def _ExecCleanup(self):
7385 """Try to cleanup after a failed migration.
7387 The cleanup is done by:
7388 - check that the instance is running only on one node
7389 (and update the config if needed)
7390 - change disks on its secondary node to secondary
7391 - wait until disks are fully synchronized
7392 - disconnect from the network
7393 - change disks into single-master mode
7394 - wait again until disks are fully synchronized
7397 instance = self.instance
7398 target_node = self.target_node
7399 source_node = self.source_node
7401 # check running on only one node
7402 self.feedback_fn("* checking where the instance actually runs"
7403 " (if this hangs, the hypervisor might be in"
7405 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7406 for node, result in ins_l.items():
7407 result.Raise("Can't contact node %s" % node)
7409 runningon_source = instance.name in ins_l[source_node].payload
7410 runningon_target = instance.name in ins_l[target_node].payload
7412 if runningon_source and runningon_target:
7413 raise errors.OpExecError("Instance seems to be running on two nodes,"
7414 " or the hypervisor is confused; you will have"
7415 " to ensure manually that it runs only on one"
7416 " and restart this operation")
7418 if not (runningon_source or runningon_target):
7419 raise errors.OpExecError("Instance does not seem to be running at all;"
7420 " in this case it's safer to repair by"
7421 " running 'gnt-instance stop' to ensure disk"
7422 " shutdown, and then restarting it")
7424 if runningon_target:
7425 # the migration has actually succeeded, we need to update the config
7426 self.feedback_fn("* instance running on secondary node (%s),"
7427 " updating config" % target_node)
7428 instance.primary_node = target_node
7429 self.cfg.Update(instance, self.feedback_fn)
7430 demoted_node = source_node
7432 self.feedback_fn("* instance confirmed to be running on its"
7433 " primary node (%s)" % source_node)
7434 demoted_node = target_node
7436 if instance.disk_template in constants.DTS_INT_MIRROR:
7437 self._EnsureSecondary(demoted_node)
7439 self._WaitUntilSync()
7440 except errors.OpExecError:
7441 # we ignore here errors, since if the device is standalone, it
7442 # won't be able to sync
7444 self._GoStandalone()
7445 self._GoReconnect(False)
7446 self._WaitUntilSync()
7448 self.feedback_fn("* done")
7450 def _RevertDiskStatus(self):
7451 """Try to revert the disk status after a failed migration.
7454 target_node = self.target_node
7455 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7459 self._EnsureSecondary(target_node)
7460 self._GoStandalone()
7461 self._GoReconnect(False)
7462 self._WaitUntilSync()
7463 except errors.OpExecError, err:
7464 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7465 " please try to recover the instance manually;"
7466 " error '%s'" % str(err))
7468 def _AbortMigration(self):
7469 """Call the hypervisor code to abort a started migration.
7472 instance = self.instance
7473 target_node = self.target_node
7474 source_node = self.source_node
7475 migration_info = self.migration_info
7477 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
7481 abort_msg = abort_result.fail_msg
7483 logging.error("Aborting migration failed on target node %s: %s",
7484 target_node, abort_msg)
7485       # Don't raise an exception here, as we still have to try to revert the
7486 # disk status, even if this step failed.
7488 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
7489 instance, False, self.live)
7490 abort_msg = abort_result.fail_msg
7492 logging.error("Aborting migration failed on source node %s: %s",
7493 source_node, abort_msg)
7495 def _ExecMigration(self):
7496 """Migrate an instance.
7498 The migrate is done by:
7499 - change the disks into dual-master mode
7500 - wait until disks are fully synchronized again
7501 - migrate the instance
7502 - change disks on the new secondary node (the old primary) to secondary
7503 - wait until disks are fully synchronized
7504 - change disks into single-master mode
7507 instance = self.instance
7508 target_node = self.target_node
7509 source_node = self.source_node
7511 # Check for hypervisor version mismatch and warn the user.
7512 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7513 None, self.instance.hypervisor)
7514 src_info = nodeinfo[source_node]
7515 dst_info = nodeinfo[target_node]
7517 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7518 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7519 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7520 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7521 if src_version != dst_version:
7522 self.feedback_fn("* warning: hypervisor version mismatch between"
7523 " source (%s) and target (%s) node" %
7524 (src_version, dst_version))
7526 self.feedback_fn("* checking disk consistency between source and target")
7527 for dev in instance.disks:
7528 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7529 raise errors.OpExecError("Disk %s is degraded or not fully"
7530 " synchronized on target node,"
7531 " aborting migration" % dev.iv_name)
7533 # First get the migration information from the remote node
7534 result = self.rpc.call_migration_info(source_node, instance)
7535 msg = result.fail_msg
7537 log_err = ("Failed fetching source migration information from %s: %s" %
7539 logging.error(log_err)
7540 raise errors.OpExecError(log_err)
7542 self.migration_info = migration_info = result.payload
7544 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7545 # Then switch the disks to master/master mode
7546 self._EnsureSecondary(target_node)
7547 self._GoStandalone()
7548 self._GoReconnect(True)
7549 self._WaitUntilSync()
7551 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7552 result = self.rpc.call_accept_instance(target_node,
7555 self.nodes_ip[target_node])
7557 msg = result.fail_msg
7559 logging.error("Instance pre-migration failed, trying to revert"
7560 " disk status: %s", msg)
7561 self.feedback_fn("Pre-migration failed, aborting")
7562 self._AbortMigration()
7563 self._RevertDiskStatus()
7564 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7565 (instance.name, msg))
7567 self.feedback_fn("* migrating instance to %s" % target_node)
7568 result = self.rpc.call_instance_migrate(source_node, instance,
7569 self.nodes_ip[target_node],
7571 msg = result.fail_msg
7573 logging.error("Instance migration failed, trying to revert"
7574 " disk status: %s", msg)
7575 self.feedback_fn("Migration failed, aborting")
7576 self._AbortMigration()
7577 self._RevertDiskStatus()
7578 raise errors.OpExecError("Could not migrate instance %s: %s" %
7579 (instance.name, msg))
7581 self.feedback_fn("* starting memory transfer")
7582 last_feedback = time.time()
7584 result = self.rpc.call_instance_get_migration_status(source_node,
7586 msg = result.fail_msg
7587 ms = result.payload # MigrationStatus instance
7588 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
7589 logging.error("Instance migration failed, trying to revert"
7590 " disk status: %s", msg)
7591 self.feedback_fn("Migration failed, aborting")
7592 self._AbortMigration()
7593 self._RevertDiskStatus()
7594 raise errors.OpExecError("Could not migrate instance %s: %s" %
7595 (instance.name, msg))
7597 if result.payload.status != constants.HV_MIGRATION_ACTIVE:
7598 self.feedback_fn("* memory transfer complete")
7601 if (utils.TimeoutExpired(last_feedback,
7602 self._MIGRATION_FEEDBACK_INTERVAL) and
7603 ms.transferred_ram is not None):
7604 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
7605 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
7606 last_feedback = time.time()
7608 time.sleep(self._MIGRATION_POLL_INTERVAL)
7610 result = self.rpc.call_instance_finalize_migration_src(source_node,
7614 msg = result.fail_msg
7616 logging.error("Instance migration succeeded, but finalization failed"
7617 " on the source node: %s", msg)
7618 raise errors.OpExecError("Could not finalize instance migration: %s" %
7621 instance.primary_node = target_node
7623 # distribute new instance config to the other nodes
7624 self.cfg.Update(instance, self.feedback_fn)
7626 result = self.rpc.call_instance_finalize_migration_dst(target_node,
7630 msg = result.fail_msg
7632 logging.error("Instance migration succeeded, but finalization failed"
7633 " on the target node: %s", msg)
7634 raise errors.OpExecError("Could not finalize instance migration: %s" %
7637 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7638 self._EnsureSecondary(source_node)
7639 self._WaitUntilSync()
7640 self._GoStandalone()
7641 self._GoReconnect(False)
7642 self._WaitUntilSync()
7644 self.feedback_fn("* done")
7646 def _ExecFailover(self):
7647 """Failover an instance.
7649 The failover is done by shutting the instance down on its present node and
7650 starting it on the secondary node.
7653 instance = self.instance
7654 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7656 source_node = instance.primary_node
7657 target_node = self.target_node
7659 if instance.admin_up:
7660 self.feedback_fn("* checking disk consistency between source and target")
7661 for dev in instance.disks:
7662 # for drbd, these are drbd over lvm
7663 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7664 if primary_node.offline:
7665 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7667 (primary_node.name, dev.iv_name, target_node))
7668 elif not self.ignore_consistency:
7669 raise errors.OpExecError("Disk %s is degraded on target node,"
7670 " aborting failover" % dev.iv_name)
7672 self.feedback_fn("* not checking disk consistency as instance is not"
7675 self.feedback_fn("* shutting down instance on source node")
7676 logging.info("Shutting down instance %s on node %s",
7677 instance.name, source_node)
7679 result = self.rpc.call_instance_shutdown(source_node, instance,
7680 self.shutdown_timeout)
7681 msg = result.fail_msg
7683 if self.ignore_consistency or primary_node.offline:
7684 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7685 " proceeding anyway; please make sure node"
7686 " %s is down; error details: %s",
7687 instance.name, source_node, source_node, msg)
7689 raise errors.OpExecError("Could not shutdown instance %s on"
7691 (instance.name, source_node, msg))
7693 self.feedback_fn("* deactivating the instance's disks on source node")
7694 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7695 raise errors.OpExecError("Can't shut down the instance's disks")
7697 instance.primary_node = target_node
7698 # distribute new instance config to the other nodes
7699 self.cfg.Update(instance, self.feedback_fn)
7701 # Only start the instance if it's marked as up
7702 if instance.admin_up:
7703 self.feedback_fn("* activating the instance's disks on target node %s" %
7705 logging.info("Starting instance %s on node %s",
7706 instance.name, target_node)
7708 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7709 ignore_secondaries=True)
7711 _ShutdownInstanceDisks(self.lu, instance)
7712 raise errors.OpExecError("Can't activate the instance's disks")
7714 self.feedback_fn("* starting the instance on the target node %s" %
7716 result = self.rpc.call_instance_start(target_node, (instance, None, None),
7718 msg = result.fail_msg
7720 _ShutdownInstanceDisks(self.lu, instance)
7721 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7722 (instance.name, target_node, msg))
7724 def Exec(self, feedback_fn):
7725 """Perform the migration.
7728 self.feedback_fn = feedback_fn
7729 self.source_node = self.instance.primary_node
7731 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7732 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7733 self.target_node = self.instance.secondary_nodes[0]
7734 # Otherwise self.target_node has been populated either
7735 # directly, or through an iallocator.
7737 self.all_nodes = [self.source_node, self.target_node]
7738 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7739 in self.cfg.GetMultiNodeInfo(self.all_nodes))
7742 feedback_fn("Failover instance %s" % self.instance.name)
7743 self._ExecFailover()
7745 feedback_fn("Migrating instance %s" % self.instance.name)
7748 return self._ExecCleanup()
7750 return self._ExecMigration()
7753 def _CreateBlockDev(lu, node, instance, device, force_create,
7755 """Create a tree of block devices on a given node.
7757 If this device type has to be created on secondaries, create it and
7760 If not, just recurse to children keeping the same 'force' value.
7762 @param lu: the lu on whose behalf we execute
7763 @param node: the node on which to create the device
7764 @type instance: L{objects.Instance}
7765 @param instance: the instance which owns the device
7766 @type device: L{objects.Disk}
7767 @param device: the device to create
7768 @type force_create: boolean
7769 @param force_create: whether to force creation of this device; this
7770 will be changed to True whenever we find a device which has the
7771 CreateOnSecondary() attribute
7772 @param info: the extra 'metadata' we should attach to the device
7773 (this will be represented as an LVM tag)
7774 @type force_open: boolean
7775 @param force_open: this parameter will be passed to the
7776 L{backend.BlockdevCreate} function where it specifies
7777 whether we run on primary or not, and it affects both
7778 the child assembly and the device's own Open() execution
7781 if device.CreateOnSecondary():
7785 for child in device.children:
7786 _CreateBlockDev(lu, node, instance, child, force_create,
7789 if not force_create:
7792 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
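# Illustrative use of _CreateBlockDev (a sketch mirroring the per-node loop in
# _CreateDisks further below; all names come from that function): force_create
# and force_open are True only on the primary node, so secondaries only get the
# devices that declare CreateOnSecondary().
#
#   for node in all_nodes:
#     f_create = node == pnode
#     _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)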
7795 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7796 """Create a single block device on a given node.
7798 This will not recurse over children of the device, so they must be
7801 @param lu: the lu on whose behalf we execute
7802 @param node: the node on which to create the device
7803 @type instance: L{objects.Instance}
7804 @param instance: the instance which owns the device
7805 @type device: L{objects.Disk}
7806 @param device: the device to create
7807 @param info: the extra 'metadata' we should attach to the device
7808 (this will be represented as an LVM tag)
7809 @type force_open: boolean
7810 @param force_open: this parameter will be passed to the
7811 L{backend.BlockdevCreate} function where it specifies
7812 whether we run on primary or not, and it affects both
7813 the child assembly and the device's own Open() execution
7816 lu.cfg.SetDiskID(device, node)
7817 result = lu.rpc.call_blockdev_create(node, device, device.size,
7818 instance.name, force_open, info)
7819 result.Raise("Can't create block device %s on"
7820 " node %s for instance %s" % (device, node, instance.name))
7821 if device.physical_id is None:
7822 device.physical_id = result.payload
7825 def _GenerateUniqueNames(lu, exts):
7826 """Generate a suitable LV name.
7828 This will generate logical volume names for the given instance.
7833 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7834 results.append("%s%s" % (new_id, val))
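# For example (illustrative values only): a call such as
#   _GenerateUniqueNames(lu, [".disk0", ".disk1"])
# would return something like
#   ["3c7d1b2e-....disk0", "9f4a8c01-....disk1"]
# i.e. one freshly generated unique ID per requested extension.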
7838 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7839 iv_name, p_minor, s_minor):
7840 """Generate a drbd8 device complete with its children.
7843 assert len(vgnames) == len(names) == 2
7844 port = lu.cfg.AllocatePort()
7845 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7846 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7847 logical_id=(vgnames[0], names[0]))
7848 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
7849 logical_id=(vgnames[1], names[1]))
7850 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7851 logical_id=(primary, secondary, port,
7854 children=[dev_data, dev_meta],
7859 def _GenerateDiskTemplate(lu, template_name,
7860 instance_name, primary_node,
7861 secondary_nodes, disk_info,
7862 file_storage_dir, file_driver,
7863 base_index, feedback_fn):
7864 """Generate the entire disk layout for a given template type.
7867 # TODO: compute space requirements
7869 vgname = lu.cfg.GetVGName()
7870 disk_count = len(disk_info)
7872 if template_name == constants.DT_DISKLESS:
7874 elif template_name == constants.DT_PLAIN:
7875 if len(secondary_nodes) != 0:
7876 raise errors.ProgrammerError("Wrong template configuration")
7878 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7879 for i in range(disk_count)])
7880 for idx, disk in enumerate(disk_info):
7881 disk_index = idx + base_index
7882 vg = disk.get(constants.IDISK_VG, vgname)
7883 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7884 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7885 size=disk[constants.IDISK_SIZE],
7886 logical_id=(vg, names[idx]),
7887 iv_name="disk/%d" % disk_index,
7888 mode=disk[constants.IDISK_MODE])
7889 disks.append(disk_dev)
7890 elif template_name == constants.DT_DRBD8:
7891 if len(secondary_nodes) != 1:
7892 raise errors.ProgrammerError("Wrong template configuration")
7893 remote_node = secondary_nodes[0]
7894 minors = lu.cfg.AllocateDRBDMinor(
7895 [primary_node, remote_node] * len(disk_info), instance_name)
7898 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7899 for i in range(disk_count)]):
7900 names.append(lv_prefix + "_data")
7901 names.append(lv_prefix + "_meta")
7902 for idx, disk in enumerate(disk_info):
7903 disk_index = idx + base_index
7904 data_vg = disk.get(constants.IDISK_VG, vgname)
7905 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7906 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7907 disk[constants.IDISK_SIZE],
7909 names[idx * 2:idx * 2 + 2],
7910 "disk/%d" % disk_index,
7911 minors[idx * 2], minors[idx * 2 + 1])
7912 disk_dev.mode = disk[constants.IDISK_MODE]
7913 disks.append(disk_dev)
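# A sketch of the object tree built above for one DRBD8-backed disk (values
# are illustrative; the tail of logical_id is assumed to carry the two DRBD
# minors and the shared secret, per the _GenerateDRBD8Branch arguments):
#
#   LD_DRBD8  iv_name="disk/0", size=<size>,
#             logical_id=(primary_node, remote_node, port, p_minor, s_minor, secret)
#     children:
#       LD_LV  (data_vg, "<uuid>.disk0_data"), size=<size>
#       LD_LV  (meta_vg, "<uuid>.disk0_meta"), size=DRBD_META_SIZE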
7914 elif template_name == constants.DT_FILE:
7915 if len(secondary_nodes) != 0:
7916 raise errors.ProgrammerError("Wrong template configuration")
7918 opcodes.RequireFileStorage()
7920 for idx, disk in enumerate(disk_info):
7921 disk_index = idx + base_index
7922 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7923 size=disk[constants.IDISK_SIZE],
7924 iv_name="disk/%d" % disk_index,
7925 logical_id=(file_driver,
7926 "%s/disk%d" % (file_storage_dir,
7928 mode=disk[constants.IDISK_MODE])
7929 disks.append(disk_dev)
7930 elif template_name == constants.DT_SHARED_FILE:
7931 if len(secondary_nodes) != 0:
7932 raise errors.ProgrammerError("Wrong template configuration")
7934 opcodes.RequireSharedFileStorage()
7936 for idx, disk in enumerate(disk_info):
7937 disk_index = idx + base_index
7938 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7939 size=disk[constants.IDISK_SIZE],
7940 iv_name="disk/%d" % disk_index,
7941 logical_id=(file_driver,
7942 "%s/disk%d" % (file_storage_dir,
7944 mode=disk[constants.IDISK_MODE])
7945 disks.append(disk_dev)
7946 elif template_name == constants.DT_BLOCK:
7947 if len(secondary_nodes) != 0:
7948 raise errors.ProgrammerError("Wrong template configuration")
7950 for idx, disk in enumerate(disk_info):
7951 disk_index = idx + base_index
7952 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7953 size=disk[constants.IDISK_SIZE],
7954 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7955 disk[constants.IDISK_ADOPT]),
7956 iv_name="disk/%d" % disk_index,
7957 mode=disk[constants.IDISK_MODE])
7958 disks.append(disk_dev)
7961 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7965 def _GetInstanceInfoText(instance):
7966 Compute the text that should be added to the disk's metadata.
7969 return "originstname+%s" % instance.name
7972 def _CalcEta(time_taken, written, total_size):
7973 """Calculates the ETA based on size written and total size.
7975 @param time_taken: The time taken so far
7976 @param written: amount written so far
7977 @param total_size: The total size of data to be written
7978 @return: The remaining time in seconds
7981 avg_time = time_taken / float(written)
7982 return (total_size - written) * avg_time
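# Worked example for _CalcEta (illustrative numbers): if 1024 MiB out of
# 4096 MiB were written in 30 seconds, the average cost is 30/1024 s per MiB,
# so the remaining 3072 MiB are estimated at (4096 - 1024) * 30 / 1024 = 90 s:
#
#   >>> _CalcEta(30.0, 1024, 4096)
#   90.0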
7985 def _WipeDisks(lu, instance):
7986 """Wipes instance disks.
7988 @type lu: L{LogicalUnit}
7989 @param lu: the logical unit on whose behalf we execute
7990 @type instance: L{objects.Instance}
7991 @param instance: the instance whose disks we should wipe
7992 @return: the success of the wipe
7995 node = instance.primary_node
7997 for device in instance.disks:
7998 lu.cfg.SetDiskID(device, node)
8000 logging.info("Pause sync of instance %s disks", instance.name)
8001 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
8003 for idx, success in enumerate(result.payload):
8005 logging.warn("pause-sync of instance %s for disk %d failed",
8009 for idx, device in enumerate(instance.disks):
8010 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk, but at
8011 # most MAX_WIPE_CHUNK
8012 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
8013 constants.MIN_WIPE_CHUNK_PERCENT)
8014 # we _must_ make this an int, otherwise rounding errors will
8016 wipe_chunk_size = int(wipe_chunk_size)
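# Worked example, assuming the usual values MIN_WIPE_CHUNK_PERCENT = 10 and
# MAX_WIPE_CHUNK = 1024 MiB (assumptions for illustration only): a 20480 MiB
# disk gives 20480 / 100.0 * 10 = 2048 MiB, capped to min(1024, 2048) = 1024,
# so that disk is wiped in 1024 MiB chunks.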
8018 lu.LogInfo("* Wiping disk %d", idx)
8019 logging.info("Wiping disk %d for instance %s, node %s using"
8020 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
8025 start_time = time.time()
8027 while offset < size:
8028 wipe_size = min(wipe_chunk_size, size - offset)
8029 logging.debug("Wiping disk %d, offset %s, chunk %s",
8030 idx, offset, wipe_size)
8031 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
8032 result.Raise("Could not wipe disk %d at offset %d for size %d" %
8033 (idx, offset, wipe_size))
8036 if now - last_output >= 60:
8037 eta = _CalcEta(now - start_time, offset, size)
8038 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8039 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8042 logging.info("Resume sync of instance %s disks", instance.name)
8044 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8046 for idx, success in enumerate(result.payload):
8048 lu.LogWarning("Resume sync of disk %d failed, please have a"
8049 " look at the status and troubleshoot the issue", idx)
8050 logging.warn("resume-sync of instance %s for disk %d failed",
8054 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8055 """Create all disks for an instance.
8057 This abstracts away some work from AddInstance.
8059 @type lu: L{LogicalUnit}
8060 @param lu: the logical unit on whose behalf we execute
8061 @type instance: L{objects.Instance}
8062 @param instance: the instance whose disks we should create
8064 @param to_skip: list of indices to skip
8065 @type target_node: string
8066 @param target_node: if passed, overrides the target node for creation
8068 @return: the success of the creation
8071 info = _GetInstanceInfoText(instance)
8072 if target_node is None:
8073 pnode = instance.primary_node
8074 all_nodes = instance.all_nodes
8079 if instance.disk_template in constants.DTS_FILEBASED:
8080 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8081 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8083 result.Raise("Failed to create directory '%s' on"
8084 " node %s" % (file_storage_dir, pnode))
8086 # Note: this needs to be kept in sync with adding of disks in
8087 # LUInstanceSetParams
8088 for idx, device in enumerate(instance.disks):
8089 if to_skip and idx in to_skip:
8091 logging.info("Creating volume %s for instance %s",
8092 device.iv_name, instance.name)
8094 for node in all_nodes:
8095 f_create = node == pnode
8096 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
8099 def _RemoveDisks(lu, instance, target_node=None):
8100 """Remove all disks for an instance.
8102 This abstracts away some work from `AddInstance()` and
8103 `RemoveInstance()`. Note that in case some of the devices couldn't
8104 be removed, the removal will continue with the other ones (compare
8105 with `_CreateDisks()`).
8107 @type lu: L{LogicalUnit}
8108 @param lu: the logical unit on whose behalf we execute
8109 @type instance: L{objects.Instance}
8110 @param instance: the instance whose disks we should remove
8111 @type target_node: string
8112 @param target_node: used to override the node on which to remove the disks
8114 @return: the success of the removal
8117 logging.info("Removing block devices for instance %s", instance.name)
8120 for device in instance.disks:
8122 edata = [(target_node, device)]
8124 edata = device.ComputeNodeTree(instance.primary_node)
8125 for node, disk in edata:
8126 lu.cfg.SetDiskID(disk, node)
8127 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8129 lu.LogWarning("Could not remove block device %s on node %s,"
8130 " continuing anyway: %s", device.iv_name, node, msg)
8133 if instance.disk_template == constants.DT_FILE:
8134 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8138 tgt = instance.primary_node
8139 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8141 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8142 file_storage_dir, instance.primary_node, result.fail_msg)
8148 def _ComputeDiskSizePerVG(disk_template, disks):
8149 """Compute disk size requirements in the volume group
8152 def _compute(disks, payload):
8153 """Universal algorithm.
8158 vgs[disk[constants.IDISK_VG]] = \
8159 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
8163 # Required free disk space as a function of disk and swap space
8165 constants.DT_DISKLESS: {},
8166 constants.DT_PLAIN: _compute(disks, 0),
8167 # 128 MB is added for DRBD metadata for each disk
8168 constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
8169 constants.DT_FILE: {},
8170 constants.DT_SHARED_FILE: {},
8173 if disk_template not in req_size_dict:
8174 raise errors.ProgrammerError("Disk template '%s' size requirement"
8175 " is unknown" % disk_template)
8177 return req_size_dict[disk_template]
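# Example result of _ComputeDiskSizePerVG (VG names are illustrative): for two
# DRBD8 disks of 10240 MiB, one in VG "xenvg" and one in VG "data", this would
# return per-VG requirements of
#   {"xenvg": 10240 + DRBD_META_SIZE, "data": 10240 + DRBD_META_SIZE}
# i.e. the free space that must be available in each volume group.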
8180 def _ComputeDiskSize(disk_template, disks):
8181 """Compute disk size requirements in the volume group
8184 # Required free disk space as a function of disk and swap space
8186 constants.DT_DISKLESS: None,
8187 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8188 # 128 MB is added for DRBD metadata for each disk
8190 sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
8191 constants.DT_FILE: None,
8192 constants.DT_SHARED_FILE: 0,
8193 constants.DT_BLOCK: 0,
8196 if disk_template not in req_size_dict:
8197 raise errors.ProgrammerError("Disk template '%s' size requirement"
8198 " is unknown" % disk_template)
8200 return req_size_dict[disk_template]
8203 def _FilterVmNodes(lu, nodenames):
8204 """Filters out non-vm_capable nodes from a list.
8206 @type lu: L{LogicalUnit}
8207 @param lu: the logical unit for which we check
8208 @type nodenames: list
8209 @param nodenames: the list of nodes on which we should check
8211 @return: the list of vm-capable nodes
8214 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8215 return [name for name in nodenames if name not in vm_nodes]
8218 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8219 """Hypervisor parameter validation.
8221 This function abstracts the hypervisor parameter validation to be
8222 used in both instance create and instance modify.
8224 @type lu: L{LogicalUnit}
8225 @param lu: the logical unit for which we check
8226 @type nodenames: list
8227 @param nodenames: the list of nodes on which we should check
8228 @type hvname: string
8229 @param hvname: the name of the hypervisor we should use
8230 @type hvparams: dict
8231 @param hvparams: the parameters which we need to check
8232 @raise errors.OpPrereqError: if the parameters are not valid
8235 nodenames = _FilterVmNodes(lu, nodenames)
8237 cluster = lu.cfg.GetClusterInfo()
8238 hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
8240 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
8241 for node in nodenames:
8245 info.Raise("Hypervisor parameter validation failed on node %s" % node)
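# Typical invocation of _CheckHVParams, mirroring the instance-create path
# further below (nodenames is the primary node plus any secondaries):
#
#   _CheckHVParams(self, [pnode.name] + self.secondaries,
#                  self.op.hypervisor, self.op.hvparams)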
8248 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8249 """OS parameters validation.
8251 @type lu: L{LogicalUnit}
8252 @param lu: the logical unit for which we check
8253 @type required: boolean
8254 @param required: whether the validation should fail if the OS is not
8256 @type nodenames: list
8257 @param nodenames: the list of nodes on which we should check
8258 @type osname: string
8259 @param osname: the name of the OS we should use
8260 @type osparams: dict
8261 @param osparams: the parameters which we need to check
8262 @raise errors.OpPrereqError: if the parameters are not valid
8265 nodenames = _FilterVmNodes(lu, nodenames)
8266 result = lu.rpc.call_os_validate(nodenames, required, osname,
8267 [constants.OS_VALIDATE_PARAMETERS],
8269 for node, nres in result.items():
8270 # we don't check for offline cases since this should be run only
8271 # against the master node and/or an instance's nodes
8272 nres.Raise("OS Parameters validation failed on node %s" % node)
8273 if not nres.payload:
8274 lu.LogInfo("OS %s not found on node %s, validation skipped",
8278 class LUInstanceCreate(LogicalUnit):
8279 """Create an instance.
8282 HPATH = "instance-add"
8283 HTYPE = constants.HTYPE_INSTANCE
8286 def CheckArguments(self):
8290 # do not require name_check to ease forward/backward compatibility
8292 if self.op.no_install and self.op.start:
8293 self.LogInfo("No-installation mode selected, disabling startup")
8294 self.op.start = False
8295 # validate/normalize the instance name
8296 self.op.instance_name = \
8297 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8299 if self.op.ip_check and not self.op.name_check:
8300 # TODO: make the ip check more flexible and not depend on the name check
8301 raise errors.OpPrereqError("Cannot do IP address check without a name"
8302 " check", errors.ECODE_INVAL)
8304 # check nics' parameter names
8305 for nic in self.op.nics:
8306 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8308 # check disks: parameter names and consistent adopt/no-adopt strategy
8309 has_adopt = has_no_adopt = False
8310 for disk in self.op.disks:
8311 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8312 if constants.IDISK_ADOPT in disk:
8316 if has_adopt and has_no_adopt:
8317 raise errors.OpPrereqError("Either all disks are adopted or none is",
8320 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8321 raise errors.OpPrereqError("Disk adoption is not supported for the"
8322 " '%s' disk template" %
8323 self.op.disk_template,
8325 if self.op.iallocator is not None:
8326 raise errors.OpPrereqError("Disk adoption not allowed with an"
8327 " iallocator script", errors.ECODE_INVAL)
8328 if self.op.mode == constants.INSTANCE_IMPORT:
8329 raise errors.OpPrereqError("Disk adoption not allowed for"
8330 " instance import", errors.ECODE_INVAL)
8332 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8333 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8334 " but no 'adopt' parameter given" %
8335 self.op.disk_template,
8338 self.adopt_disks = has_adopt
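# Illustrative disk specifications (keys are the constants.IDISK_* names used
# below in CheckPrereq; sizes and names are assumptions for the example): a
# plain creation uses e.g.
#   {constants.IDISK_SIZE: 10240, constants.IDISK_MODE: constants.DISK_RDWR}
# while LV adoption names an existing volume instead of allocating a new one:
#   {constants.IDISK_SIZE: 10240, constants.IDISK_ADOPT: "existing-lv",
#    constants.IDISK_VG: "xenvg"}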
8340 # instance name verification
8341 if self.op.name_check:
8342 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8343 self.op.instance_name = self.hostname1.name
8344 # used in CheckPrereq for ip ping check
8345 self.check_ip = self.hostname1.ip
8347 self.check_ip = None
8349 # file storage checks
8350 if (self.op.file_driver and
8351 not self.op.file_driver in constants.FILE_DRIVER):
8352 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8353 self.op.file_driver, errors.ECODE_INVAL)
8355 if self.op.disk_template == constants.DT_FILE:
8356 opcodes.RequireFileStorage()
8357 elif self.op.disk_template == constants.DT_SHARED_FILE:
8358 opcodes.RequireSharedFileStorage()
8360 ### Node/iallocator related checks
8361 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8363 if self.op.pnode is not None:
8364 if self.op.disk_template in constants.DTS_INT_MIRROR:
8365 if self.op.snode is None:
8366 raise errors.OpPrereqError("The networked disk templates need"
8367 " a mirror node", errors.ECODE_INVAL)
8369 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8371 self.op.snode = None
8373 self._cds = _GetClusterDomainSecret()
8375 if self.op.mode == constants.INSTANCE_IMPORT:
8376 # On import force_variant must be True, because if we forced it at
8377 # initial install, our only chance when importing it back is that it
8379 self.op.force_variant = True
8381 if self.op.no_install:
8382 self.LogInfo("No-installation mode has no effect during import")
8384 elif self.op.mode == constants.INSTANCE_CREATE:
8385 if self.op.os_type is None:
8386 raise errors.OpPrereqError("No guest OS specified",
8388 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8389 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8390 " installation" % self.op.os_type,
8392 if self.op.disk_template is None:
8393 raise errors.OpPrereqError("No disk template specified",
8396 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8397 # Check handshake to ensure both clusters have the same domain secret
8398 src_handshake = self.op.source_handshake
8399 if not src_handshake:
8400 raise errors.OpPrereqError("Missing source handshake",
8403 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8406 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8409 # Load and check source CA
8410 self.source_x509_ca_pem = self.op.source_x509_ca
8411 if not self.source_x509_ca_pem:
8412 raise errors.OpPrereqError("Missing source X509 CA",
8416 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8418 except OpenSSL.crypto.Error, err:
8419 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8420 (err, ), errors.ECODE_INVAL)
8422 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8423 if errcode is not None:
8424 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8427 self.source_x509_ca = cert
8429 src_instance_name = self.op.source_instance_name
8430 if not src_instance_name:
8431 raise errors.OpPrereqError("Missing source instance name",
8434 self.source_instance_name = \
8435 netutils.GetHostname(name=src_instance_name).name
8438 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8439 self.op.mode, errors.ECODE_INVAL)
8441 def ExpandNames(self):
8442 """ExpandNames for CreateInstance.
8444 Figure out the right locks for instance creation.
8447 self.needed_locks = {}
8449 instance_name = self.op.instance_name
8450 # this is just a preventive check, but someone might still add this
8451 # instance in the meantime, and creation will fail at lock-add time
8452 if instance_name in self.cfg.GetInstanceList():
8453 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8454 instance_name, errors.ECODE_EXISTS)
8456 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8458 if self.op.iallocator:
8459 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8461 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8462 nodelist = [self.op.pnode]
8463 if self.op.snode is not None:
8464 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8465 nodelist.append(self.op.snode)
8466 self.needed_locks[locking.LEVEL_NODE] = nodelist
8468 # in case of import lock the source node too
8469 if self.op.mode == constants.INSTANCE_IMPORT:
8470 src_node = self.op.src_node
8471 src_path = self.op.src_path
8473 if src_path is None:
8474 self.op.src_path = src_path = self.op.instance_name
8476 if src_node is None:
8477 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8478 self.op.src_node = None
8479 if os.path.isabs(src_path):
8480 raise errors.OpPrereqError("Importing an instance from a path"
8481 " requires a source node option",
8484 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8485 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8486 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8487 if not os.path.isabs(src_path):
8488 self.op.src_path = src_path = \
8489 utils.PathJoin(constants.EXPORT_DIR, src_path)
8491 def _RunAllocator(self):
8492 """Run the allocator based on input opcode.
8495 nics = [n.ToDict() for n in self.nics]
8496 ial = IAllocator(self.cfg, self.rpc,
8497 mode=constants.IALLOCATOR_MODE_ALLOC,
8498 name=self.op.instance_name,
8499 disk_template=self.op.disk_template,
8502 vcpus=self.be_full[constants.BE_VCPUS],
8503 memory=self.be_full[constants.BE_MEMORY],
8506 hypervisor=self.op.hypervisor,
8509 ial.Run(self.op.iallocator)
8512 raise errors.OpPrereqError("Can't compute nodes using"
8513 " iallocator '%s': %s" %
8514 (self.op.iallocator, ial.info),
8516 if len(ial.result) != ial.required_nodes:
8517 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8518 " of nodes (%s), required %s" %
8519 (self.op.iallocator, len(ial.result),
8520 ial.required_nodes), errors.ECODE_FAULT)
8521 self.op.pnode = ial.result[0]
8522 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8523 self.op.instance_name, self.op.iallocator,
8524 utils.CommaJoin(ial.result))
8525 if ial.required_nodes == 2:
8526 self.op.snode = ial.result[1]
8528 def BuildHooksEnv(self):
8531 This runs on master, primary and secondary nodes of the instance.
8535 "ADD_MODE": self.op.mode,
8537 if self.op.mode == constants.INSTANCE_IMPORT:
8538 env["SRC_NODE"] = self.op.src_node
8539 env["SRC_PATH"] = self.op.src_path
8540 env["SRC_IMAGES"] = self.src_images
8542 env.update(_BuildInstanceHookEnv(
8543 name=self.op.instance_name,
8544 primary_node=self.op.pnode,
8545 secondary_nodes=self.secondaries,
8546 status=self.op.start,
8547 os_type=self.op.os_type,
8548 memory=self.be_full[constants.BE_MEMORY],
8549 vcpus=self.be_full[constants.BE_VCPUS],
8550 nics=_NICListToTuple(self, self.nics),
8551 disk_template=self.op.disk_template,
8552 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8553 for d in self.disks],
8556 hypervisor_name=self.op.hypervisor,
8562 def BuildHooksNodes(self):
8563 """Build hooks nodes.
8566 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8569 def _ReadExportInfo(self):
8570 """Reads the export information from disk.
8572 It will override the opcode source node and path with the actual
8573 information, if these two were not specified before.
8575 @return: the export information
8578 assert self.op.mode == constants.INSTANCE_IMPORT
8580 src_node = self.op.src_node
8581 src_path = self.op.src_path
8583 if src_node is None:
8584 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8585 exp_list = self.rpc.call_export_list(locked_nodes)
8587 for node in exp_list:
8588 if exp_list[node].fail_msg:
8590 if src_path in exp_list[node].payload:
8592 self.op.src_node = src_node = node
8593 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8597 raise errors.OpPrereqError("No export found for relative path %s" %
8598 src_path, errors.ECODE_INVAL)
8600 _CheckNodeOnline(self, src_node)
8601 result = self.rpc.call_export_info(src_node, src_path)
8602 result.Raise("No export or invalid export found in dir %s" % src_path)
8604 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8605 if not export_info.has_section(constants.INISECT_EXP):
8606 raise errors.ProgrammerError("Corrupted export config",
8607 errors.ECODE_ENVIRON)
8609 ei_version = export_info.get(constants.INISECT_EXP, "version")
8610 if (int(ei_version) != constants.EXPORT_VERSION):
8611 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8612 (ei_version, constants.EXPORT_VERSION),
8613 errors.ECODE_ENVIRON)
8616 def _ReadExportParams(self, einfo):
8617 """Use export parameters as defaults.
8619 In case the opcode doesn't specify (as in override) some instance
8620 parameters, then try to use them from the export information, if
8624 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8626 if self.op.disk_template is None:
8627 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8628 self.op.disk_template = einfo.get(constants.INISECT_INS,
8630 if self.op.disk_template not in constants.DISK_TEMPLATES:
8631 raise errors.OpPrereqError("Disk template specified in configuration"
8632 " file is not one of the allowed values:"
8633 " %s" % " ".join(constants.DISK_TEMPLATES))
8635 raise errors.OpPrereqError("No disk template specified and the export"
8636 " is missing the disk_template information",
8639 if not self.op.disks:
8641 # TODO: import the disk iv_name too
8642 for idx in range(constants.MAX_DISKS):
8643 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
8644 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8645 disks.append({constants.IDISK_SIZE: disk_sz})
8646 self.op.disks = disks
8647 if not disks and self.op.disk_template != constants.DT_DISKLESS:
8648 raise errors.OpPrereqError("No disk info specified and the export"
8649 " is missing the disk information",
8652 if not self.op.nics:
8654 for idx in range(constants.MAX_NICS):
8655 if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
8657 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8658 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8665 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8666 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8668 if (self.op.hypervisor is None and
8669 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8670 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8672 if einfo.has_section(constants.INISECT_HYP):
8673 # use the export parameters but do not override the ones
8674 # specified by the user
8675 for name, value in einfo.items(constants.INISECT_HYP):
8676 if name not in self.op.hvparams:
8677 self.op.hvparams[name] = value
8679 if einfo.has_section(constants.INISECT_BEP):
8680 # use the parameters, without overriding
8681 for name, value in einfo.items(constants.INISECT_BEP):
8682 if name not in self.op.beparams:
8683 self.op.beparams[name] = value
8685 # try to read the parameters old style, from the main section
8686 for name in constants.BES_PARAMETERS:
8687 if (name not in self.op.beparams and
8688 einfo.has_option(constants.INISECT_INS, name)):
8689 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8691 if einfo.has_section(constants.INISECT_OSP):
8692 # use the parameters, without overriding
8693 for name, value in einfo.items(constants.INISECT_OSP):
8694 if name not in self.op.osparams:
8695 self.op.osparams[name] = value
8697 def _RevertToDefaults(self, cluster):
8698 """Revert the instance parameters to the default values.
8702 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8703 for name in self.op.hvparams.keys():
8704 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8705 del self.op.hvparams[name]
8707 be_defs = cluster.SimpleFillBE({})
8708 for name in self.op.beparams.keys():
8709 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8710 del self.op.beparams[name]
8712 nic_defs = cluster.SimpleFillNIC({})
8713 for nic in self.op.nics:
8714 for name in constants.NICS_PARAMETERS:
8715 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8718 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8719 for name in self.op.osparams.keys():
8720 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8721 del self.op.osparams[name]
8723 def _CalculateFileStorageDir(self):
8724 """Calculate final instance file storage dir.
8727 # file storage dir calculation/check
8728 self.instance_file_storage_dir = None
8729 if self.op.disk_template in constants.DTS_FILEBASED:
8730 # build the full file storage dir path
8733 if self.op.disk_template == constants.DT_SHARED_FILE:
8734 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8736 get_fsd_fn = self.cfg.GetFileStorageDir
8738 cfg_storagedir = get_fsd_fn()
8739 if not cfg_storagedir:
8740 raise errors.OpPrereqError("Cluster file storage dir not defined")
8741 joinargs.append(cfg_storagedir)
8743 if self.op.file_storage_dir is not None:
8744 joinargs.append(self.op.file_storage_dir)
8746 joinargs.append(self.op.instance_name)
8748 # pylint: disable=W0142
8749 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
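# For example (paths are illustrative): with a cluster file storage dir of
# "/srv/ganeti/file-storage", op.file_storage_dir == "mydir" and an instance
# named "inst1.example.com", the resulting directory would be
# "/srv/ganeti/file-storage/mydir/inst1.example.com".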
8751 def CheckPrereq(self):
8752 """Check prerequisites.
8755 self._CalculateFileStorageDir()
8757 if self.op.mode == constants.INSTANCE_IMPORT:
8758 export_info = self._ReadExportInfo()
8759 self._ReadExportParams(export_info)
8761 if (not self.cfg.GetVGName() and
8762 self.op.disk_template not in constants.DTS_NOT_LVM):
8763 raise errors.OpPrereqError("Cluster does not support lvm-based"
8764 " instances", errors.ECODE_STATE)
8766 if (self.op.hypervisor is None or
8767 self.op.hypervisor == constants.VALUE_AUTO):
8768 self.op.hypervisor = self.cfg.GetHypervisorType()
8770 cluster = self.cfg.GetClusterInfo()
8771 enabled_hvs = cluster.enabled_hypervisors
8772 if self.op.hypervisor not in enabled_hvs:
8773 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8774 " cluster (%s)" % (self.op.hypervisor,
8775 ",".join(enabled_hvs)),
8778 # Check tag validity
8779 for tag in self.op.tags:
8780 objects.TaggableObject.ValidateTag(tag)
8782 # check hypervisor parameter syntax (locally)
8783 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8784 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8786 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8787 hv_type.CheckParameterSyntax(filled_hvp)
8788 self.hv_full = filled_hvp
8789 # check that we don't specify global parameters on an instance
8790 _CheckGlobalHvParams(self.op.hvparams)
8792 # fill and remember the beparams dict
8793 default_beparams = cluster.beparams[constants.PP_DEFAULT]
8794 for param, value in self.op.beparams.iteritems():
8795 if value == constants.VALUE_AUTO:
8796 self.op.beparams[param] = default_beparams[param]
8797 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8798 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8800 # build os parameters
8801 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8803 # now that hvp/bep are in final format, let's reset to defaults,
8805 if self.op.identify_defaults:
8806 self._RevertToDefaults(cluster)
8810 for idx, nic in enumerate(self.op.nics):
8811 nic_mode_req = nic.get(constants.INIC_MODE, None)
8812 nic_mode = nic_mode_req
8813 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
8814 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8816 # in routed mode, for the first nic, the default ip is 'auto'
8817 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8818 default_ip_mode = constants.VALUE_AUTO
8820 default_ip_mode = constants.VALUE_NONE
8822 # ip validity checks
8823 ip = nic.get(constants.INIC_IP, default_ip_mode)
8824 if ip is None or ip.lower() == constants.VALUE_NONE:
8826 elif ip.lower() == constants.VALUE_AUTO:
8827 if not self.op.name_check:
8828 raise errors.OpPrereqError("IP address set to auto but name checks"
8829 " have been skipped",
8831 nic_ip = self.hostname1.ip
8833 if not netutils.IPAddress.IsValid(ip):
8834 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8838 # TODO: check the ip address for uniqueness
8839 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8840 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8843 # MAC address verification
8844 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8845 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8846 mac = utils.NormalizeAndValidateMac(mac)
8849 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8850 except errors.ReservationError:
8851 raise errors.OpPrereqError("MAC address %s already in use"
8852 " in cluster" % mac,
8853 errors.ECODE_NOTUNIQUE)
8855 # Build nic parameters
8856 link = nic.get(constants.INIC_LINK, None)
8857 if link == constants.VALUE_AUTO:
8858 link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
8861 nicparams[constants.NIC_MODE] = nic_mode
8863 nicparams[constants.NIC_LINK] = link
8865 check_params = cluster.SimpleFillNIC(nicparams)
8866 objects.NIC.CheckParameterSyntax(check_params)
8867 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8869 # disk checks/pre-build
8870 default_vg = self.cfg.GetVGName()
8872 for disk in self.op.disks:
8873 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8874 if mode not in constants.DISK_ACCESS_SET:
8875 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8876 mode, errors.ECODE_INVAL)
8877 size = disk.get(constants.IDISK_SIZE, None)
8879 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8882 except (TypeError, ValueError):
8883 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8886 data_vg = disk.get(constants.IDISK_VG, default_vg)
8888 constants.IDISK_SIZE: size,
8889 constants.IDISK_MODE: mode,
8890 constants.IDISK_VG: data_vg,
8891 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8893 if constants.IDISK_ADOPT in disk:
8894 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8895 self.disks.append(new_disk)
8897 if self.op.mode == constants.INSTANCE_IMPORT:
8899 for idx in range(len(self.disks)):
8900 option = "disk%d_dump" % idx
8901 if export_info.has_option(constants.INISECT_INS, option):
8902 # FIXME: are the old OSes, disk sizes, etc. useful?
8903 export_name = export_info.get(constants.INISECT_INS, option)
8904 image = utils.PathJoin(self.op.src_path, export_name)
8905 disk_images.append(image)
8907 disk_images.append(False)
8909 self.src_images = disk_images
8911 old_name = export_info.get(constants.INISECT_INS, "name")
8912 if self.op.instance_name == old_name:
8913 for idx, nic in enumerate(self.nics):
8914 if nic.mac == constants.VALUE_AUTO:
8915 nic_mac_ini = "nic%d_mac" % idx
8916 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8918 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8920 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8921 if self.op.ip_check:
8922 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8923 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8924 (self.check_ip, self.op.instance_name),
8925 errors.ECODE_NOTUNIQUE)
8927 #### mac address generation
8928 # By generating the mac address here, both the allocator and the hooks get
8929 # the real final mac address rather than the 'auto' or 'generate' value.
8930 # There is a race condition between the generation and the instance object
8931 # creation, which means that we know the mac is valid now, but we're not
8932 # sure it will be when we actually add the instance. If things go bad
8933 # adding the instance will abort because of a duplicate mac, and the
8934 # creation job will fail.
8935 for nic in self.nics:
8936 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8937 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8941 if self.op.iallocator is not None:
8942 self._RunAllocator()
8944 #### node related checks
8946 # check primary node
8947 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8948 assert self.pnode is not None, \
8949 "Cannot retrieve locked node %s" % self.op.pnode
8951 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8952 pnode.name, errors.ECODE_STATE)
8954 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8955 pnode.name, errors.ECODE_STATE)
8956 if not pnode.vm_capable:
8957 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8958 " '%s'" % pnode.name, errors.ECODE_STATE)
8960 self.secondaries = []
8962 # mirror node verification
8963 if self.op.disk_template in constants.DTS_INT_MIRROR:
8964 if self.op.snode == pnode.name:
8965 raise errors.OpPrereqError("The secondary node cannot be the"
8966 " primary node", errors.ECODE_INVAL)
8967 _CheckNodeOnline(self, self.op.snode)
8968 _CheckNodeNotDrained(self, self.op.snode)
8969 _CheckNodeVmCapable(self, self.op.snode)
8970 self.secondaries.append(self.op.snode)
8972 nodenames = [pnode.name] + self.secondaries
8974 if not self.adopt_disks:
8975 # Check lv size requirements, if not adopting
8976 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8977 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8979 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8980 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8981 disk[constants.IDISK_ADOPT])
8982 for disk in self.disks])
8983 if len(all_lvs) != len(self.disks):
8984 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8986 for lv_name in all_lvs:
8988 # FIXME: lv_name here is "vg/lv"; need to ensure that other calls
8989 # to ReserveLV use the same syntax
8990 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8991 except errors.ReservationError:
8992 raise errors.OpPrereqError("LV named %s used by another instance" %
8993 lv_name, errors.ECODE_NOTUNIQUE)
8995 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8996 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8998 node_lvs = self.rpc.call_lv_list([pnode.name],
8999 vg_names.payload.keys())[pnode.name]
9000 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
9001 node_lvs = node_lvs.payload
9003 delta = all_lvs.difference(node_lvs.keys())
9005 raise errors.OpPrereqError("Missing logical volume(s): %s" %
9006 utils.CommaJoin(delta),
9008 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
9010 raise errors.OpPrereqError("Online logical volumes found, cannot"
9011 " adopt: %s" % utils.CommaJoin(online_lvs),
9013 # update the size of each disk based on what is found
9014 for dsk in self.disks:
9015 dsk[constants.IDISK_SIZE] = \
9016 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
9017 dsk[constants.IDISK_ADOPT])][0]))
9019 elif self.op.disk_template == constants.DT_BLOCK:
9020 # Normalize and de-duplicate device paths
9021 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
9022 for disk in self.disks])
9023 if len(all_disks) != len(self.disks):
9024 raise errors.OpPrereqError("Duplicate disk names given for adoption",
9026 baddisks = [d for d in all_disks
9027 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
9029 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
9030 " cannot be adopted" %
9031 (", ".join(baddisks),
9032 constants.ADOPTABLE_BLOCKDEV_ROOT),
9035 node_disks = self.rpc.call_bdev_sizes([pnode.name],
9036 list(all_disks))[pnode.name]
9037 node_disks.Raise("Cannot get block device information from node %s" %
9039 node_disks = node_disks.payload
9040 delta = all_disks.difference(node_disks.keys())
9042 raise errors.OpPrereqError("Missing block device(s): %s" %
9043 utils.CommaJoin(delta),
9045 for dsk in self.disks:
9046 dsk[constants.IDISK_SIZE] = \
9047 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9049 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9051 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9052 # check OS parameters (remotely)
9053 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9055 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9057 # memory check on primary node
9059 _CheckNodeFreeMemory(self, self.pnode.name,
9060 "creating instance %s" % self.op.instance_name,
9061 self.be_full[constants.BE_MEMORY],
9064 self.dry_run_result = list(nodenames)
9066 def Exec(self, feedback_fn):
9067 """Create and add the instance to the cluster.
9070 instance = self.op.instance_name
9071 pnode_name = self.pnode.name
9073 ht_kind = self.op.hypervisor
9074 if ht_kind in constants.HTS_REQ_PORT:
9075 network_port = self.cfg.AllocatePort()
9079 disks = _GenerateDiskTemplate(self,
9080 self.op.disk_template,
9081 instance, pnode_name,
9084 self.instance_file_storage_dir,
9085 self.op.file_driver,
9089 iobj = objects.Instance(name=instance, os=self.op.os_type,
9090 primary_node=pnode_name,
9091 nics=self.nics, disks=disks,
9092 disk_template=self.op.disk_template,
9094 network_port=network_port,
9095 beparams=self.op.beparams,
9096 hvparams=self.op.hvparams,
9097 hypervisor=self.op.hypervisor,
9098 osparams=self.op.osparams,
9102 for tag in self.op.tags:
9105 if self.adopt_disks:
9106 if self.op.disk_template == constants.DT_PLAIN:
9107 # rename LVs to the newly-generated names; we need to construct
9108 # 'fake' LV disks with the old data, plus the new unique_id
9109 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9111 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9112 rename_to.append(t_dsk.logical_id)
9113 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9114 self.cfg.SetDiskID(t_dsk, pnode_name)
9115 result = self.rpc.call_blockdev_rename(pnode_name,
9116 zip(tmp_disks, rename_to))
9117 result.Raise("Failed to rename adopted LVs")
9119 feedback_fn("* creating instance disks...")
9121 _CreateDisks(self, iobj)
9122 except errors.OpExecError:
9123 self.LogWarning("Device creation failed, reverting...")
9125 _RemoveDisks(self, iobj)
9127 self.cfg.ReleaseDRBDMinors(instance)
9130 feedback_fn("adding instance %s to cluster config" % instance)
9132 self.cfg.AddInstance(iobj, self.proc.GetECId())
9134 # Declare that we don't want to remove the instance lock anymore, as we've
9135 # added the instance to the config
9136 del self.remove_locks[locking.LEVEL_INSTANCE]
9138 if self.op.mode == constants.INSTANCE_IMPORT:
9139 # Release unused nodes
9140 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9143 _ReleaseLocks(self, locking.LEVEL_NODE)
9146 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9147 feedback_fn("* wiping instance disks...")
9149 _WipeDisks(self, iobj)
9150 except errors.OpExecError, err:
9151 logging.exception("Wiping disks failed")
9152 self.LogWarning("Wiping instance disks failed (%s)", err)
9156 # Something is already wrong with the disks, don't do anything else
9158 elif self.op.wait_for_sync:
9159 disk_abort = not _WaitForSync(self, iobj)
9160 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9161 # make sure the disks are not degraded (still sync-ing is ok)
9162 feedback_fn("* checking mirrors status")
9163 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9168 _RemoveDisks(self, iobj)
9169 self.cfg.RemoveInstance(iobj.name)
9170 # Make sure the instance lock gets removed
9171 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9172 raise errors.OpExecError("There are some degraded disks for"
9175 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9176 if self.op.mode == constants.INSTANCE_CREATE:
9177 if not self.op.no_install:
9178 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9179 not self.op.wait_for_sync)
9181 feedback_fn("* pausing disk sync to install instance OS")
9182 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9184 for idx, success in enumerate(result.payload):
9186 logging.warn("pause-sync of instance %s for disk %d failed",
9189 feedback_fn("* running the instance OS create scripts...")
9190 # FIXME: pass debug option from opcode to backend
9192 self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
9193 self.op.debug_level)
9195 feedback_fn("* resuming disk sync")
9196 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9198 for idx, success in enumerate(result.payload):
9200 logging.warn("resume-sync of instance %s for disk %d failed",
9203 os_add_result.Raise("Could not add os for instance %s"
9204 " on node %s" % (instance, pnode_name))
9206 elif self.op.mode == constants.INSTANCE_IMPORT:
9207 feedback_fn("* running the instance OS import scripts...")
9211 for idx, image in enumerate(self.src_images):
9215 # FIXME: pass debug option from opcode to backend
9216 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9217 constants.IEIO_FILE, (image, ),
9218 constants.IEIO_SCRIPT,
9219 (iobj.disks[idx], idx),
9221 transfers.append(dt)
9224 masterd.instance.TransferInstanceData(self, feedback_fn,
9225 self.op.src_node, pnode_name,
9226 self.pnode.secondary_ip,
9228 if not compat.all(import_result):
9229 self.LogWarning("Some disks for instance %s on node %s were not"
9230 " imported successfully" % (instance, pnode_name))
9232 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9233 feedback_fn("* preparing remote import...")
9234 # The source cluster will stop the instance before attempting to make a
9235 # connection. In some cases stopping an instance can take a long time,
9236 # hence the shutdown timeout is added to the connection timeout.
9237 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9238 self.op.source_shutdown_timeout)
9239 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9241 assert iobj.primary_node == self.pnode.name
9243 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9244 self.source_x509_ca,
9245 self._cds, timeouts)
9246 if not compat.all(disk_results):
9247 # TODO: Should the instance still be started, even if some disks
9248 # failed to import (valid for local imports, too)?
9249 self.LogWarning("Some disks for instance %s on node %s were not"
9250 " imported successfully" % (instance, pnode_name))
9252 # Run rename script on newly imported instance
9253 assert iobj.name == instance
9254 feedback_fn("Running rename script for %s" % instance)
9255 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9256 self.source_instance_name,
9257 self.op.debug_level)
9259 self.LogWarning("Failed to run rename script for %s on node"
9260 " %s: %s" % (instance, pnode_name, result.fail_msg))
9263 # also checked in the prereq part
9264 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9268 iobj.admin_up = True
9269 self.cfg.Update(iobj, feedback_fn)
9270 logging.info("Starting instance %s on node %s", instance, pnode_name)
9271 feedback_fn("* starting instance...")
9272 result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
9274 result.Raise("Could not start instance")
9276 return list(iobj.all_nodes)
9279 class LUInstanceConsole(NoHooksLU):
9280 """Connect to an instance's console.
9282 This is somewhat special in that it returns the command line that
9283 you need to run on the master node in order to connect to the
9289 def ExpandNames(self):
9290 self._ExpandAndLockInstance()
9292 def CheckPrereq(self):
9293 """Check prerequisites.
9295 This checks that the instance is in the cluster.
9298 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9299 assert self.instance is not None, \
9300 "Cannot retrieve locked instance %s" % self.op.instance_name
9301 _CheckNodeOnline(self, self.instance.primary_node)
9303 def Exec(self, feedback_fn):
9304 """Connect to the console of an instance
9307 instance = self.instance
9308 node = instance.primary_node
9310 node_insts = self.rpc.call_instance_list([node],
9311 [instance.hypervisor])[node]
9312 node_insts.Raise("Can't get node information from %s" % node)
9314 if instance.name not in node_insts.payload:
9315 if instance.admin_up:
9316 state = constants.INSTST_ERRORDOWN
9318 state = constants.INSTST_ADMINDOWN
9319 raise errors.OpExecError("Instance %s is not running (state %s)" %
9320 (instance.name, state))
9322 logging.debug("Connecting to console of %s on %s", instance.name, node)
9324 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9327 def _GetInstanceConsole(cluster, instance):
9328 """Returns console information for an instance.
9330 @type cluster: L{objects.Cluster}
9331 @type instance: L{objects.Instance}
9335 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9336 # beparams and hvparams are passed separately, to avoid editing the
9337 # instance and then saving the defaults in the instance itself.
9338 hvparams = cluster.FillHV(instance)
9339 beparams = cluster.FillBE(instance)
9340 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9342 assert console.instance == instance.name
9343 assert console.Validate()
9345 return console.ToDict()
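# Illustrative sketch (not part of the original module): the comment in
# _GetInstanceConsole above notes that hvparams and beparams are computed with
# cluster.FillHV/FillBE and passed separately, so the effective values
# (cluster defaults overlaid with per-instance overrides) are obtained without
# writing those defaults back into the instance object.  A minimal stand-alone
# model of that pattern, using plain dicts with made-up keys:
def _demo_fill_params(cluster_defaults, instance_overrides):
  """Return the effective parameters without mutating either input."""
  filled = dict(cluster_defaults)
  filled.update(instance_overrides)
  return filled

# Example: the instance keeps only its explicit overrides, while the merged
# view is a separate, throw-away dict:
#   _demo_fill_params({"param_a": 1, "param_b": 2}, {"param_b": 3})
#   -> {"param_a": 1, "param_b": 3}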
9348 class LUInstanceReplaceDisks(LogicalUnit):
9349 """Replace the disks of an instance.
9352 HPATH = "mirrors-replace"
9353 HTYPE = constants.HTYPE_INSTANCE
9356 def CheckArguments(self):
9357 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9360 def ExpandNames(self):
9361 self._ExpandAndLockInstance()
9363 assert locking.LEVEL_NODE not in self.needed_locks
9364 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9366 assert self.op.iallocator is None or self.op.remote_node is None, \
9367 "Conflicting options"
9369 if self.op.remote_node is not None:
9370 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9372 # Warning: do not remove the locking of the new secondary here
9373 # unless DRBD8.AddChildren is changed to work in parallel;
9374 # currently it doesn't since parallel invocations of
9375 # FindUnusedMinor will conflict
9376 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9377 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9379 self.needed_locks[locking.LEVEL_NODE] = []
9380 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9382 if self.op.iallocator is not None:
9383 # iallocator will select a new node in the same group
9384 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9386 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9387 self.op.iallocator, self.op.remote_node,
9388 self.op.disks, False, self.op.early_release)
9390 self.tasklets = [self.replacer]
9392 def DeclareLocks(self, level):
9393 if level == locking.LEVEL_NODEGROUP:
9394 assert self.op.remote_node is None
9395 assert self.op.iallocator is not None
9396 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9398 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9399 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9400 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9402 elif level == locking.LEVEL_NODE:
9403 if self.op.iallocator is not None:
9404 assert self.op.remote_node is None
9405 assert not self.needed_locks[locking.LEVEL_NODE]
9407 # Lock member nodes of all locked groups
9408 self.needed_locks[locking.LEVEL_NODE] = [node_name
9409 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9410 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9412 self._LockInstancesNodes()
9414 def BuildHooksEnv(self):
9417 This runs on the master, the primary and all the secondaries.
9420 instance = self.replacer.instance
9422 "MODE": self.op.mode,
9423 "NEW_SECONDARY": self.op.remote_node,
9424 "OLD_SECONDARY": instance.secondary_nodes[0],
9426 env.update(_BuildInstanceHookEnvByObject(self, instance))
9429 def BuildHooksNodes(self):
9430 """Build hooks nodes.
9433 instance = self.replacer.instance
9435 self.cfg.GetMasterNode(),
9436 instance.primary_node,
9438 if self.op.remote_node is not None:
9439 nl.append(self.op.remote_node)
9442 def CheckPrereq(self):
9443 """Check prerequisites.
9446 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9447 self.op.iallocator is None)
9449 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9451 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9453 return LogicalUnit.CheckPrereq(self)
9456 class TLReplaceDisks(Tasklet):
9457 """Replaces disks for an instance.
9459 Note: Locking is not within the scope of this class.
9462 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9463 disks, delay_iallocator, early_release):
9464 """Initializes this class.
9467 Tasklet.__init__(self, lu)
9470 self.instance_name = instance_name
9472 self.iallocator_name = iallocator_name
9473 self.remote_node = remote_node
9475 self.delay_iallocator = delay_iallocator
9476 self.early_release = early_release
9479 self.instance = None
9480 self.new_node = None
9481 self.target_node = None
9482 self.other_node = None
9483 self.remote_node_info = None
9484 self.node_secondary_ip = None
9487 def CheckArguments(mode, remote_node, iallocator):
9488 """Helper function for users of this class.
9491 # check for valid parameter combination
9492 if mode == constants.REPLACE_DISK_CHG:
9493 if remote_node is None and iallocator is None:
9494 raise errors.OpPrereqError("When changing the secondary either an"
9495 " iallocator script must be used or the"
9496 " new node given", errors.ECODE_INVAL)
9498 if remote_node is not None and iallocator is not None:
9499 raise errors.OpPrereqError("Give either the iallocator or the new"
9500 " secondary, not both", errors.ECODE_INVAL)
9502 elif remote_node is not None or iallocator is not None:
9503 # Not replacing the secondary
9504 raise errors.OpPrereqError("The iallocator and new node options can"
9505 " only be used when changing the"
9506 " secondary node", errors.ECODE_INVAL)
9509 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9510 """Compute a new secondary node using an IAllocator.
9513 ial = IAllocator(lu.cfg, lu.rpc,
9514 mode=constants.IALLOCATOR_MODE_RELOC,
9516 relocate_from=list(relocate_from))
9518 ial.Run(iallocator_name)
9521 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9522 " %s" % (iallocator_name, ial.info),
9525 if len(ial.result) != ial.required_nodes:
9526 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9527 " of nodes (%s), required %s" %
9529 len(ial.result), ial.required_nodes),
9532 remote_node_name = ial.result[0]
9534 lu.LogInfo("Selected new secondary for instance '%s': %s",
9535 instance_name, remote_node_name)
9537 return remote_node_name
9539 def _FindFaultyDisks(self, node_name):
9540 """Wrapper for L{_FindFaultyInstanceDisks}.
9543 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9546 def _CheckDisksActivated(self, instance):
9547 """Checks if the instance disks are activated.
9549 @param instance: The instance to check disks
9550 @return: True if they are activated, False otherwise
9553 nodes = instance.all_nodes
9555 for idx, dev in enumerate(instance.disks):
9557 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9558 self.cfg.SetDiskID(dev, node)
9560 result = self.rpc.call_blockdev_find(node, dev)
9564 elif result.fail_msg or not result.payload:
9569 def CheckPrereq(self):
9570 """Check prerequisites.
9572 This checks that the instance is in the cluster.
9575 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9576 assert instance is not None, \
9577 "Cannot retrieve locked instance %s" % self.instance_name
9579 if instance.disk_template != constants.DT_DRBD8:
9580 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9581 " instances", errors.ECODE_INVAL)
9583 if len(instance.secondary_nodes) != 1:
9584 raise errors.OpPrereqError("The instance has a strange layout,"
9585 " expected one secondary but found %d" %
9586 len(instance.secondary_nodes),
9589 if not self.delay_iallocator:
9590 self._CheckPrereq2()
9592 def _CheckPrereq2(self):
9593 """Check prerequisites, second part.
9595 This function should always be part of CheckPrereq. It was separated and is
9596 now called from Exec because during node evacuation iallocator was only
9597 called with an unmodified cluster model, not taking planned changes into
9601 instance = self.instance
9602 secondary_node = instance.secondary_nodes[0]
9604 if self.iallocator_name is None:
9605 remote_node = self.remote_node
9607 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9608 instance.name, instance.secondary_nodes)
9610 if remote_node is None:
9611 self.remote_node_info = None
9613 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9614 "Remote node '%s' is not locked" % remote_node
9616 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9617 assert self.remote_node_info is not None, \
9618 "Cannot retrieve locked node %s" % remote_node
9620 if remote_node == self.instance.primary_node:
9621 raise errors.OpPrereqError("The specified node is the primary node of"
9622 " the instance", errors.ECODE_INVAL)
9624 if remote_node == secondary_node:
9625 raise errors.OpPrereqError("The specified node is already the"
9626 " secondary node of the instance",
9629 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9630 constants.REPLACE_DISK_CHG):
9631 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9634 if self.mode == constants.REPLACE_DISK_AUTO:
9635 if not self._CheckDisksActivated(instance):
9636 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9637 " first" % self.instance_name,
9639 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9640 faulty_secondary = self._FindFaultyDisks(secondary_node)
9642 if faulty_primary and faulty_secondary:
9643 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9644 " one node and can not be repaired"
9645 " automatically" % self.instance_name,
9649 self.disks = faulty_primary
9650 self.target_node = instance.primary_node
9651 self.other_node = secondary_node
9652 check_nodes = [self.target_node, self.other_node]
9653 elif faulty_secondary:
9654 self.disks = faulty_secondary
9655 self.target_node = secondary_node
9656 self.other_node = instance.primary_node
9657 check_nodes = [self.target_node, self.other_node]
9663 # Non-automatic modes
9664 if self.mode == constants.REPLACE_DISK_PRI:
9665 self.target_node = instance.primary_node
9666 self.other_node = secondary_node
9667 check_nodes = [self.target_node, self.other_node]
9669 elif self.mode == constants.REPLACE_DISK_SEC:
9670 self.target_node = secondary_node
9671 self.other_node = instance.primary_node
9672 check_nodes = [self.target_node, self.other_node]
9674 elif self.mode == constants.REPLACE_DISK_CHG:
9675 self.new_node = remote_node
9676 self.other_node = instance.primary_node
9677 self.target_node = secondary_node
9678 check_nodes = [self.new_node, self.other_node]
9680 _CheckNodeNotDrained(self.lu, remote_node)
9681 _CheckNodeVmCapable(self.lu, remote_node)
9683 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9684 assert old_node_info is not None
9685 if old_node_info.offline and not self.early_release:
9686 # doesn't make sense to delay the release
9687 self.early_release = True
9688 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9689 " early-release mode", secondary_node)
9692 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9695 # If not specified all disks should be replaced
9697 self.disks = range(len(self.instance.disks))
9699 for node in check_nodes:
9700 _CheckNodeOnline(self.lu, node)
9702 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9705 if node_name is not None)
9707 # Release unneeded node locks
9708 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9710 # Release any owned node group
9711 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9712 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9714 # Check whether disks are valid
9715 for disk_idx in self.disks:
9716 instance.FindDisk(disk_idx)
9718 # Get secondary node IP addresses
9719 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9720 in self.cfg.GetMultiNodeInfo(touched_nodes))
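# Illustrative summary (added comment, not in the original source) of the node
# roles assigned by _CheckPrereq2 above for each replacement mode:
#
#   mode                target_node            other_node   new_node
#   REPLACE_DISK_PRI    primary                secondary    -
#   REPLACE_DISK_SEC    secondary              primary      -
#   REPLACE_DISK_CHG    secondary (old)        primary      remote/iallocator
#   REPLACE_DISK_AUTO   node w/ faulty disks   the peer     -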
9722 def Exec(self, feedback_fn):
9723 """Execute disk replacement.
9725 This dispatches the disk replacement to the appropriate handler.
9728 if self.delay_iallocator:
9729 self._CheckPrereq2()
9732 # Verify owned locks before starting operation
9733 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9734 assert set(owned_nodes) == set(self.node_secondary_ip), \
9735 ("Incorrect node locks, owning %s, expected %s" %
9736 (owned_nodes, self.node_secondary_ip.keys()))
9738 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9739 assert list(owned_instances) == [self.instance_name], \
9740 "Instance '%s' not locked" % self.instance_name
9742 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9743 "Should not own any node group lock at this point"
9746 feedback_fn("No disks need replacement")
9749 feedback_fn("Replacing disk(s) %s for %s" %
9750 (utils.CommaJoin(self.disks), self.instance.name))
9752 activate_disks = (not self.instance.admin_up)
9754 # Activate the instance disks if we're replacing them on a down instance
9756 _StartInstanceDisks(self.lu, self.instance, True)
9759 # Should we replace the secondary node?
9760 if self.new_node is not None:
9761 fn = self._ExecDrbd8Secondary
9763 fn = self._ExecDrbd8DiskOnly
9765 result = fn(feedback_fn)
9767 # Deactivate the instance disks if we're replacing them on a
9770 _SafeShutdownInstanceDisks(self.lu, self.instance)
9773 # Verify owned locks
9774 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9775 nodes = frozenset(self.node_secondary_ip)
9776 assert ((self.early_release and not owned_nodes) or
9777 (not self.early_release and not (set(owned_nodes) - nodes))), \
9778 ("Not owning the correct locks, early_release=%s, owned=%r,"
9779 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9783 def _CheckVolumeGroup(self, nodes):
9784 self.lu.LogInfo("Checking volume groups")
9786 vgname = self.cfg.GetVGName()
9788 # Make sure volume group exists on all involved nodes
9789 results = self.rpc.call_vg_list(nodes)
9791 raise errors.OpExecError("Can't list volume groups on the nodes")
9795 res.Raise("Error checking node %s" % node)
9796 if vgname not in res.payload:
9797 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9800 def _CheckDisksExistence(self, nodes):
9801 # Check disk existence
9802 for idx, dev in enumerate(self.instance.disks):
9803 if idx not in self.disks:
9807 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9808 self.cfg.SetDiskID(dev, node)
9810 result = self.rpc.call_blockdev_find(node, dev)
9812 msg = result.fail_msg
9813 if msg or not result.payload:
9815 msg = "disk not found"
9816 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9819 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9820 for idx, dev in enumerate(self.instance.disks):
9821 if idx not in self.disks:
9824 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9827 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9829 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9830 " replace disks for instance %s" %
9831 (node_name, self.instance.name))
9833 def _CreateNewStorage(self, node_name):
9834 """Create new storage on the primary or secondary node.
9836 This is only used for same-node replaces, not for changing the
9837 secondary node, hence we don't want to modify the existing disk.
9842 for idx, dev in enumerate(self.instance.disks):
9843 if idx not in self.disks:
9846 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9848 self.cfg.SetDiskID(dev, node_name)
9850 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9851 names = _GenerateUniqueNames(self.lu, lv_names)
9853 vg_data = dev.children[0].logical_id[0]
9854 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9855 logical_id=(vg_data, names[0]))
9856 vg_meta = dev.children[1].logical_id[0]
9857 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
9858 logical_id=(vg_meta, names[1]))
9860 new_lvs = [lv_data, lv_meta]
9861 old_lvs = [child.Copy() for child in dev.children]
9862 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9864 # we pass force_create=True to force the LVM creation
9865 for new_lv in new_lvs:
9866 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9867 _GetInstanceInfoText(self.instance), False)
9871 def _CheckDevices(self, node_name, iv_names):
9872 for name, (dev, _, _) in iv_names.iteritems():
9873 self.cfg.SetDiskID(dev, node_name)
9875 result = self.rpc.call_blockdev_find(node_name, dev)
9877 msg = result.fail_msg
9878 if msg or not result.payload:
9880 msg = "disk not found"
9881 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9884 if result.payload.is_degraded:
9885 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9887 def _RemoveOldStorage(self, node_name, iv_names):
9888 for name, (_, old_lvs, _) in iv_names.iteritems():
9889 self.lu.LogInfo("Remove logical volumes for %s" % name)
9892 self.cfg.SetDiskID(lv, node_name)
9894 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9896 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9897 hint="remove unused LVs manually")
9899 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9900 """Replace a disk on the primary or secondary for DRBD 8.
9902 The algorithm for replace is quite complicated:
9904 1. for each disk to be replaced:
9906 1. create new LVs on the target node with unique names
9907 1. detach old LVs from the drbd device
9908 1. rename old LVs to name_replaced.<time_t>
9909 1. rename new LVs to old LVs
9910 1. attach the new LVs (with the old names now) to the drbd device
9912 1. wait for sync across all devices
9914 1. for each modified disk:
9916 1. remove old LVs (which have the name name_replaced.<time_t>)
9918 Failures are not very well handled.
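(See the illustrative sketch after this method for the LV rename steps.)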
9923 # Step: check device existence
9924 self.lu.LogStep(1, steps_total, "Check device existence")
9925 self._CheckDisksExistence([self.other_node, self.target_node])
9926 self._CheckVolumeGroup([self.target_node, self.other_node])
9928 # Step: check other node consistency
9929 self.lu.LogStep(2, steps_total, "Check peer consistency")
9930 self._CheckDisksConsistency(self.other_node,
9931 self.other_node == self.instance.primary_node,
9934 # Step: create new storage
9935 self.lu.LogStep(3, steps_total, "Allocate new storage")
9936 iv_names = self._CreateNewStorage(self.target_node)
9938 # Step: for each lv, detach+rename*2+attach
9939 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9940 for dev, old_lvs, new_lvs in iv_names.itervalues():
9941 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9943 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9945 result.Raise("Can't detach drbd from local storage on node"
9946 " %s for device %s" % (self.target_node, dev.iv_name))
9948 #cfg.Update(instance)
9950 # ok, we created the new LVs, so now we know we have the needed
9951 # storage; as such, we proceed on the target node to rename
9952 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9953 # using the assumption that logical_id == physical_id (which in
9954 # turn is the unique_id on that node)
9956 # FIXME(iustin): use a better name for the replaced LVs
9957 temp_suffix = int(time.time())
9958 ren_fn = lambda d, suff: (d.physical_id[0],
9959 d.physical_id[1] + "_replaced-%s" % suff)
9961 # Build the rename list based on what LVs exist on the node
9962 rename_old_to_new = []
9963 for to_ren in old_lvs:
9964 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9965 if not result.fail_msg and result.payload:
9967 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9969 self.lu.LogInfo("Renaming the old LVs on the target node")
9970 result = self.rpc.call_blockdev_rename(self.target_node,
9972 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9974 # Now we rename the new LVs to the old LVs
9975 self.lu.LogInfo("Renaming the new LVs on the target node")
9976 rename_new_to_old = [(new, old.physical_id)
9977 for old, new in zip(old_lvs, new_lvs)]
9978 result = self.rpc.call_blockdev_rename(self.target_node,
9980 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9982 # Intermediate steps of in memory modifications
9983 for old, new in zip(old_lvs, new_lvs):
9984 new.logical_id = old.logical_id
9985 self.cfg.SetDiskID(new, self.target_node)
9987 # We need to modify old_lvs so that removal later removes the
9988 # right LVs, not the newly added ones; note that old_lvs is a
9990 for disk in old_lvs:
9991 disk.logical_id = ren_fn(disk, temp_suffix)
9992 self.cfg.SetDiskID(disk, self.target_node)
9994 # Now that the new lvs have the old name, we can add them to the device
9995 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9996 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9998 msg = result.fail_msg
10000 for new_lv in new_lvs:
10001 msg2 = self.rpc.call_blockdev_remove(self.target_node,
10004 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
10005 hint=("cleanup manually the unused logical"
10007 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
10010 if self.early_release:
10011 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10013 self._RemoveOldStorage(self.target_node, iv_names)
10014 # WARNING: we release both node locks here, do not do other RPCs
10015 # than WaitForSync to the primary node
10016 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10017 names=[self.target_node, self.other_node])
10020 # This can fail as the old devices are degraded and _WaitForSync
10021 # does a combined result over all disks, so we don't check its return value
10022 self.lu.LogStep(cstep, steps_total, "Sync devices")
10024 _WaitForSync(self.lu, self.instance)
10026 # Check all devices manually
10027 self._CheckDevices(self.instance.primary_node, iv_names)
10029 # Step: remove old storage
10030 if not self.early_release:
10031 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10033 self._RemoveOldStorage(self.target_node, iv_names)
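# Illustrative sketch (not part of the original code): the rename dance that
# _ExecDrbd8DiskOnly performs for every replaced disk, modelled on plain
# strings.  The helper is purely demonstrative; the names and the suffix
# format are simplified assumptions, not the exact LV names Ganeti generates.
def _demo_lv_rename_dance(old_lvs, new_lvs, timestamp):
  """Return (renamed_old, renamed_new) after the two rename passes.

  First the old LVs are moved out of the way to "<name>_replaced-<timestamp>",
  then the new LVs take over the old names, so the DRBD device can be
  re-attached to components whose names did not change.

  """
  renamed_old = ["%s_replaced-%s" % (name, timestamp) for name in old_lvs]
  assert len(new_lvs) == len(old_lvs), "One new LV per old LV expected"
  renamed_new = list(old_lvs)  # the new LVs now carry the old names
  return (renamed_old, renamed_new)

# Example:
#   _demo_lv_rename_dance([".disk0_data", ".disk0_meta"],
#                         ["uuid1.data", "uuid1.meta"], 1234567890)
#   -> ([".disk0_data_replaced-1234567890", ".disk0_meta_replaced-1234567890"],
#       [".disk0_data", ".disk0_meta"])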
10035 def _ExecDrbd8Secondary(self, feedback_fn):
10036 """Replace the secondary node for DRBD 8.
10038 The algorithm for replace is quite complicated:
10039 - for all disks of the instance:
10040 - create new LVs on the new node with same names
10041 - shutdown the drbd device on the old secondary
10042 - disconnect the drbd network on the primary
10043 - create the drbd device on the new secondary
10044 - network attach the drbd on the primary, using an artifice:
10045 the drbd code for Attach() will connect to the network if it
10046 finds a device which is connected to the good local disks but
10047 not network enabled
10048 - wait for sync across all devices
10049 - remove all disks from the old secondary
10051 Failures are not very well handled.
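(See the illustrative sketch after this method for how the new DRBD logical IDs are built.)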
10056 pnode = self.instance.primary_node
10058 # Step: check device existence
10059 self.lu.LogStep(1, steps_total, "Check device existence")
10060 self._CheckDisksExistence([self.instance.primary_node])
10061 self._CheckVolumeGroup([self.instance.primary_node])
10063 # Step: check other node consistency
10064 self.lu.LogStep(2, steps_total, "Check peer consistency")
10065 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10067 # Step: create new storage
10068 self.lu.LogStep(3, steps_total, "Allocate new storage")
10069 for idx, dev in enumerate(self.instance.disks):
10070 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10071 (self.new_node, idx))
10072 # we pass force_create=True to force LVM creation
10073 for new_lv in dev.children:
10074 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10075 _GetInstanceInfoText(self.instance), False)
10077 # Step 4: drbd minors and drbd setup changes
10078 # after this, we must manually remove the drbd minors on both the
10079 # error and the success paths
10080 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10081 minors = self.cfg.AllocateDRBDMinor([self.new_node
10082 for dev in self.instance.disks],
10083 self.instance.name)
10084 logging.debug("Allocated minors %r", minors)
10087 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10088 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10089 (self.new_node, idx))
10090 # create new devices on new_node; note that we create two IDs:
10091 # one without port, so the drbd will be activated without
10092 # networking information on the new node at this stage, and one
10093 # with network, for the later activation in step 4
10094 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10095 if self.instance.primary_node == o_node1:
10098 assert self.instance.primary_node == o_node2, "Three-node instance?"
10101 new_alone_id = (self.instance.primary_node, self.new_node, None,
10102 p_minor, new_minor, o_secret)
10103 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10104 p_minor, new_minor, o_secret)
10106 iv_names[idx] = (dev, dev.children, new_net_id)
10107 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10109 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10110 logical_id=new_alone_id,
10111 children=dev.children,
10114 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10115 _GetInstanceInfoText(self.instance), False)
10116 except errors.GenericError:
10117 self.cfg.ReleaseDRBDMinors(self.instance.name)
10120 # We have new devices, shutdown the drbd on the old secondary
10121 for idx, dev in enumerate(self.instance.disks):
10122 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10123 self.cfg.SetDiskID(dev, self.target_node)
10124 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10126 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10127 "node: %s" % (idx, msg),
10128 hint=("Please cleanup this device manually as"
10129 " soon as possible"))
10131 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10132 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10133 self.instance.disks)[pnode]
10135 msg = result.fail_msg
10137 # detaches didn't succeed (unlikely)
10138 self.cfg.ReleaseDRBDMinors(self.instance.name)
10139 raise errors.OpExecError("Can't detach the disks from the network on"
10140 " old node: %s" % (msg,))
10142 # if we managed to detach at least one, we update all the disks of
10143 # the instance to point to the new secondary
10144 self.lu.LogInfo("Updating instance configuration")
10145 for dev, _, new_logical_id in iv_names.itervalues():
10146 dev.logical_id = new_logical_id
10147 self.cfg.SetDiskID(dev, self.instance.primary_node)
10149 self.cfg.Update(self.instance, feedback_fn)
10151 # and now perform the drbd attach
10152 self.lu.LogInfo("Attaching primary drbds to new secondary"
10153 " (standalone => connected)")
10154 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10156 self.node_secondary_ip,
10157 self.instance.disks,
10158 self.instance.name,
10160 for to_node, to_result in result.items():
10161 msg = to_result.fail_msg
10163 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10165 hint=("please do a gnt-instance info to see the"
10166 " status of disks"))
10168 if self.early_release:
10169 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10171 self._RemoveOldStorage(self.target_node, iv_names)
10172 # WARNING: we release all node locks here, do not do other RPCs
10173 # than WaitForSync to the primary node
10174 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10175 names=[self.instance.primary_node,
10180 # This can fail as the old devices are degraded and _WaitForSync
10181 # does a combined result over all disks, so we don't check its return value
10182 self.lu.LogStep(cstep, steps_total, "Sync devices")
10184 _WaitForSync(self.lu, self.instance)
10186 # Check all devices manually
10187 self._CheckDevices(self.instance.primary_node, iv_names)
10189 # Step: remove old storage
10190 if not self.early_release:
10191 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10192 self._RemoveOldStorage(self.target_node, iv_names)
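# Illustrative sketch (not part of the original code): how _ExecDrbd8Secondary
# above derives the two new DRBD logical IDs from the existing one.  A DRBD8
# logical_id is the 6-tuple (node_a, node_b, port, minor_a, minor_b, secret);
# the first derived ID has no port, so the device can be brought up on the new
# node without networking, and the second, networked ID is used when the
# primary re-attaches.  This stand-alone helper mirrors that logic purely for
# demonstration.
def _demo_new_drbd_ids(old_id, primary_node, new_node, new_minor):
  """Return (alone_id, net_id) for a replacement secondary."""
  (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = old_id
  if primary_node == o_node1:
    p_minor = o_minor1
  else:
    assert primary_node == o_node2, "Three-node instance?"
    p_minor = o_minor2
  alone_id = (primary_node, new_node, None, p_minor, new_minor, o_secret)
  net_id = (primary_node, new_node, o_port, p_minor, new_minor, o_secret)
  return (alone_id, net_id)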
10195 class LURepairNodeStorage(NoHooksLU):
10196 """Repairs the volume group on a node.
10201 def CheckArguments(self):
10202 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10204 storage_type = self.op.storage_type
10206 if (constants.SO_FIX_CONSISTENCY not in
10207 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10208 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10209 " repaired" % storage_type,
10210 errors.ECODE_INVAL)
10212 def ExpandNames(self):
10213 self.needed_locks = {
10214 locking.LEVEL_NODE: [self.op.node_name],
10217 def _CheckFaultyDisks(self, instance, node_name):
10218 """Ensure faulty disks abort the opcode or at least warn."""
10220 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10222 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10223 " node '%s'" % (instance.name, node_name),
10224 errors.ECODE_STATE)
10225 except errors.OpPrereqError, err:
10226 if self.op.ignore_consistency:
10227 self.proc.LogWarning(str(err.args[0]))
10231 def CheckPrereq(self):
10232 """Check prerequisites.
10235 # Check whether any instance on this node has faulty disks
10236 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10237 if not inst.admin_up:
10239 check_nodes = set(inst.all_nodes)
10240 check_nodes.discard(self.op.node_name)
10241 for inst_node_name in check_nodes:
10242 self._CheckFaultyDisks(inst, inst_node_name)
10244 def Exec(self, feedback_fn):
10245 feedback_fn("Repairing storage unit '%s' on %s ..." %
10246 (self.op.name, self.op.node_name))
10248 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10249 result = self.rpc.call_storage_execute(self.op.node_name,
10250 self.op.storage_type, st_args,
10252 constants.SO_FIX_CONSISTENCY)
10253 result.Raise("Failed to repair storage unit '%s' on %s" %
10254 (self.op.name, self.op.node_name))
10257 class LUNodeEvacuate(NoHooksLU):
10258 """Evacuates instances off a list of nodes.
10263 def CheckArguments(self):
10264 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10266 def ExpandNames(self):
10267 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10269 if self.op.remote_node is not None:
10270 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10271 assert self.op.remote_node
10273 if self.op.remote_node == self.op.node_name:
10274 raise errors.OpPrereqError("Can not use evacuated node as a new"
10275 " secondary node", errors.ECODE_INVAL)
10277 if self.op.mode != constants.IALLOCATOR_NEVAC_SEC:
10278 raise errors.OpPrereqError("Without the use of an iallocator only"
10279 " secondary instances can be evacuated",
10280 errors.ECODE_INVAL)
10283 self.share_locks = _ShareAll()
10284 self.needed_locks = {
10285 locking.LEVEL_INSTANCE: [],
10286 locking.LEVEL_NODEGROUP: [],
10287 locking.LEVEL_NODE: [],
10290 if self.op.remote_node is None:
10291 # Iallocator will choose any node(s) in the same group
10292 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10294 group_nodes = frozenset([self.op.remote_node])
10296 # Determine nodes to be locked
10297 self.lock_nodes = set([self.op.node_name]) | group_nodes
10299 def _DetermineInstances(self):
10300 """Builds list of instances to operate on.
10303 assert self.op.mode in constants.IALLOCATOR_NEVAC_MODES
10305 if self.op.mode == constants.IALLOCATOR_NEVAC_PRI:
10306 # Primary instances only
10307 inst_fn = _GetNodePrimaryInstances
10308 assert self.op.remote_node is None, \
10309 "Evacuating primary instances requires iallocator"
10310 elif self.op.mode == constants.IALLOCATOR_NEVAC_SEC:
10311 # Secondary instances only
10312 inst_fn = _GetNodeSecondaryInstances
10315 assert self.op.mode == constants.IALLOCATOR_NEVAC_ALL
10316 inst_fn = _GetNodeInstances
10318 return inst_fn(self.cfg, self.op.node_name)
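# Added note (not in the original source): the evacuation mode selects the
# instances as follows: IALLOCATOR_NEVAC_PRI picks instances whose primary
# node is the evacuated node (and requires an iallocator),
# IALLOCATOR_NEVAC_SEC picks those that use it as secondary, and
# IALLOCATOR_NEVAC_ALL picks both sets combined.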
10320 def DeclareLocks(self, level):
10321 if level == locking.LEVEL_INSTANCE:
10322 # Lock instances optimistically, needs verification once node and group
10323 # locks have been acquired
10324 self.needed_locks[locking.LEVEL_INSTANCE] = \
10325 set(i.name for i in self._DetermineInstances())
10327 elif level == locking.LEVEL_NODEGROUP:
10328 # Lock node groups optimistically, needs verification once nodes have
10330 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10331 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10333 elif level == locking.LEVEL_NODE:
10334 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10336 def CheckPrereq(self):
10338 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10339 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10340 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10342 assert owned_nodes == self.lock_nodes
10344 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10345 if owned_groups != wanted_groups:
10346 raise errors.OpExecError("Node groups changed since locks were acquired,"
10347 " current groups are '%s', used to be '%s'" %
10348 (utils.CommaJoin(wanted_groups),
10349 utils.CommaJoin(owned_groups)))
10351 # Determine affected instances
10352 self.instances = self._DetermineInstances()
10353 self.instance_names = [i.name for i in self.instances]
10355 if set(self.instance_names) != owned_instances:
10356 raise errors.OpExecError("Instances on node '%s' changed since locks"
10357 " were acquired, current instances are '%s',"
10358 " used to be '%s'" %
10359 (self.op.node_name,
10360 utils.CommaJoin(self.instance_names),
10361 utils.CommaJoin(owned_instances)))
10363 if self.instance_names:
10364 self.LogInfo("Evacuating instances from node '%s': %s",
10366 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10368 self.LogInfo("No instances to evacuate from node '%s'",
10371 if self.op.remote_node is not None:
10372 for i in self.instances:
10373 if i.primary_node == self.op.remote_node:
10374 raise errors.OpPrereqError("Node %s is the primary node of"
10375 " instance %s, cannot use it as"
10377 (self.op.remote_node, i.name),
10378 errors.ECODE_INVAL)
10380 def Exec(self, feedback_fn):
10381 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10383 if not self.instance_names:
10384 # No instances to evacuate
10387 elif self.op.iallocator is not None:
10388 # TODO: Implement relocation to other group
10389 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10390 evac_mode=self.op.mode,
10391 instances=list(self.instance_names))
10393 ial.Run(self.op.iallocator)
10395 if not ial.success:
10396 raise errors.OpPrereqError("Can't compute node evacuation using"
10397 " iallocator '%s': %s" %
10398 (self.op.iallocator, ial.info),
10399 errors.ECODE_NORES)
10401 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10403 elif self.op.remote_node is not None:
10404 assert self.op.mode == constants.IALLOCATOR_NEVAC_SEC
10406 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10407 remote_node=self.op.remote_node,
10409 mode=constants.REPLACE_DISK_CHG,
10410 early_release=self.op.early_release)]
10411 for instance_name in self.instance_names
10415 raise errors.ProgrammerError("No iallocator or remote node")
10417 return ResultWithJobs(jobs)
10420 def _SetOpEarlyRelease(early_release, op):
10421 """Sets C{early_release} flag on opcodes if available.
10425 op.early_release = early_release
10426 except AttributeError:
10427 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10432 def _NodeEvacDest(use_nodes, group, nodes):
10433 """Returns group or nodes depending on caller's choice.
10437 return utils.CommaJoin(nodes)
10442 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10443 """Unpacks the result of change-group and node-evacuate iallocator requests.
10445 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10446 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10448 @type lu: L{LogicalUnit}
10449 @param lu: Logical unit instance
10450 @type alloc_result: tuple/list
10451 @param alloc_result: Result from iallocator
10452 @type early_release: bool
10453 @param early_release: Whether to release locks early if possible
10454 @type use_nodes: bool
10455 @param use_nodes: Whether to display node names instead of groups
10458 (moved, failed, jobs) = alloc_result
10461 lu.LogWarning("Unable to evacuate instances %s",
10462 utils.CommaJoin("%s (%s)" % (name, reason)
10463 for (name, reason) in failed))
10466 lu.LogInfo("Instances to be moved: %s",
10467 utils.CommaJoin("%s (to %s)" %
10468 (name, _NodeEvacDest(use_nodes, group, nodes))
10469 for (name, group, nodes) in moved))
10471 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10472 map(opcodes.OpCode.LoadOpCode, ops))
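# Illustrative sketch (added comment, not in the original source) of the shape
# of the iallocator result unpacked above; the concrete values are assumptions
# for demonstration only:
#
#   alloc_result = (
#     # moved: (instance name, target group, target nodes)
#     [("inst1.example.com", "default", ["node2.example.com"])],
#     # failed: (instance name, reason)
#     [("inst2.example.com", "instance has no secondary node")],
#     # jobs: one list of serialized opcodes per job to be submitted
#     [[<serialized opcode dict>, ...]],
#   )
#
# Each inner list in "jobs" becomes one job: its opcodes are deserialized with
# opcodes.OpCode.LoadOpCode and, where the opcode supports it, get the
# early_release flag set via _SetOpEarlyRelease.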
10476 class LUInstanceGrowDisk(LogicalUnit):
10477 """Grow a disk of an instance.
10480 HPATH = "disk-grow"
10481 HTYPE = constants.HTYPE_INSTANCE
10484 def ExpandNames(self):
10485 self._ExpandAndLockInstance()
10486 self.needed_locks[locking.LEVEL_NODE] = []
10487 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10489 def DeclareLocks(self, level):
10490 if level == locking.LEVEL_NODE:
10491 self._LockInstancesNodes()
10493 def BuildHooksEnv(self):
10494 """Build hooks env.
10496 This runs on the master, the primary and all the secondaries.
10500 "DISK": self.op.disk,
10501 "AMOUNT": self.op.amount,
10503 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10506 def BuildHooksNodes(self):
10507 """Build hooks nodes.
10510 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10513 def CheckPrereq(self):
10514 """Check prerequisites.
10516 This checks that the instance is in the cluster.
10519 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10520 assert instance is not None, \
10521 "Cannot retrieve locked instance %s" % self.op.instance_name
10522 nodenames = list(instance.all_nodes)
10523 for node in nodenames:
10524 _CheckNodeOnline(self, node)
10526 self.instance = instance
10528 if instance.disk_template not in constants.DTS_GROWABLE:
10529 raise errors.OpPrereqError("Instance's disk layout does not support"
10530 " growing", errors.ECODE_INVAL)
10532 self.disk = instance.FindDisk(self.op.disk)
10534 if instance.disk_template not in (constants.DT_FILE,
10535 constants.DT_SHARED_FILE):
10536 # TODO: check the free disk space for file, when that feature will be
10538 _CheckNodesFreeDiskPerVG(self, nodenames,
10539 self.disk.ComputeGrowth(self.op.amount))
10541 def Exec(self, feedback_fn):
10542 """Execute disk grow.
10545 instance = self.instance
10548 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10550 raise errors.OpExecError("Cannot activate block device to grow")
10552 # First run all grow ops in dry-run mode
10553 for node in instance.all_nodes:
10554 self.cfg.SetDiskID(disk, node)
10555 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10556 result.Raise("Grow request failed to node %s" % node)
10558 # We know that (as far as we can test) operations across different
10559 # nodes will succeed; time to run it for real
10560 for node in instance.all_nodes:
10561 self.cfg.SetDiskID(disk, node)
10562 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10563 result.Raise("Grow request failed to node %s" % node)
10565 # TODO: Rewrite code to work properly
10566 # DRBD goes into sync mode for a short amount of time after executing the
10567 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10568 # calling "resize" in sync mode fails. Sleeping for a short amount of
10569 # time is a work-around.
10572 disk.RecordGrow(self.op.amount)
10573 self.cfg.Update(instance, feedback_fn)
10574 if self.op.wait_for_sync:
10575 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10577 self.proc.LogWarning("Disk sync-ing has not returned a good"
10578 " status; please check the instance")
10579 if not instance.admin_up:
10580 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10581 elif not instance.admin_up:
10582 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10583 " not supposed to be running because no wait for"
10584 " sync mode was requested")
10587 class LUInstanceQueryData(NoHooksLU):
10588 """Query runtime instance data.
10593 def ExpandNames(self):
10594 self.needed_locks = {}
10596 # Use locking if requested or when non-static information is wanted
10597 if not (self.op.static or self.op.use_locking):
10598 self.LogWarning("Non-static data requested, locks need to be acquired")
10599 self.op.use_locking = True
10601 if self.op.instances or not self.op.use_locking:
10602 # Expand instance names right here
10603 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10605 # Will use acquired locks
10606 self.wanted_names = None
10608 if self.op.use_locking:
10609 self.share_locks = _ShareAll()
10611 if self.wanted_names is None:
10612 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10614 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10616 self.needed_locks[locking.LEVEL_NODE] = []
10617 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10619 def DeclareLocks(self, level):
10620 if self.op.use_locking and level == locking.LEVEL_NODE:
10621 self._LockInstancesNodes()
10623 def CheckPrereq(self):
10624 """Check prerequisites.
10626 This only checks the optional instance list against the existing names.
10629 if self.wanted_names is None:
10630 assert self.op.use_locking, "Locking was not used"
10631 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
10633 self.wanted_instances = \
10634 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
10636 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10637 """Returns the status of a block device
10640 if self.op.static or not node:
10643 self.cfg.SetDiskID(dev, node)
10645 result = self.rpc.call_blockdev_find(node, dev)
10649 result.Raise("Can't compute disk status for %s" % instance_name)
10651 status = result.payload
10655 return (status.dev_path, status.major, status.minor,
10656 status.sync_percent, status.estimated_time,
10657 status.is_degraded, status.ldisk_status)
10659 def _ComputeDiskStatus(self, instance, snode, dev):
10660 """Compute block device status.
10663 if dev.dev_type in constants.LDS_DRBD:
10664 # we change the snode then (otherwise we use the one passed in)
10665 if dev.logical_id[0] == instance.primary_node:
10666 snode = dev.logical_id[1]
10668 snode = dev.logical_id[0]
10670 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10671 instance.name, dev)
10672 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10675 dev_children = map(compat.partial(self._ComputeDiskStatus,
10682 "iv_name": dev.iv_name,
10683 "dev_type": dev.dev_type,
10684 "logical_id": dev.logical_id,
10685 "physical_id": dev.physical_id,
10686 "pstatus": dev_pstatus,
10687 "sstatus": dev_sstatus,
10688 "children": dev_children,
10693 def Exec(self, feedback_fn):
10694 """Gather and return data"""
10697 cluster = self.cfg.GetClusterInfo()
10699 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
10700 for i in self.wanted_instances)
10701 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
10702 if self.op.static or pnode.offline:
10703 remote_state = None
10705 self.LogWarning("Primary node %s is marked offline, returning static"
10706 " information only for instance %s" %
10707 (pnode.name, instance.name))
10709 remote_info = self.rpc.call_instance_info(instance.primary_node,
10711 instance.hypervisor)
10712 remote_info.Raise("Error checking node %s" % instance.primary_node)
10713 remote_info = remote_info.payload
10714 if remote_info and "state" in remote_info:
10715 remote_state = "up"
10717 remote_state = "down"
10719 if instance.admin_up:
10720 config_state = "up"
10722 config_state = "down"
10724 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10727 result[instance.name] = {
10728 "name": instance.name,
10729 "config_state": config_state,
10730 "run_state": remote_state,
10731 "pnode": instance.primary_node,
10732 "snodes": instance.secondary_nodes,
10734 # this happens to be the same format used for hooks
10735 "nics": _NICListToTuple(self, instance.nics),
10736 "disk_template": instance.disk_template,
10738 "hypervisor": instance.hypervisor,
10739 "network_port": instance.network_port,
10740 "hv_instance": instance.hvparams,
10741 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10742 "be_instance": instance.beparams,
10743 "be_actual": cluster.FillBE(instance),
10744 "os_instance": instance.osparams,
10745 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10746 "serial_no": instance.serial_no,
10747 "mtime": instance.mtime,
10748 "ctime": instance.ctime,
10749 "uuid": instance.uuid,
10755 class LUInstanceSetParams(LogicalUnit):
10756 """Modifies an instances's parameters.
10759 HPATH = "instance-modify"
10760 HTYPE = constants.HTYPE_INSTANCE
10763 def CheckArguments(self):
10764 if not (self.op.nics or self.op.disks or self.op.disk_template or
10765 self.op.hvparams or self.op.beparams or self.op.os_name):
10766 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10768 if self.op.hvparams:
10769 _CheckGlobalHvParams(self.op.hvparams)
10773 for disk_op, disk_dict in self.op.disks:
10774 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10775 if disk_op == constants.DDM_REMOVE:
10776 disk_addremove += 1
10778 elif disk_op == constants.DDM_ADD:
10779 disk_addremove += 1
10781 if not isinstance(disk_op, int):
10782 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10783 if not isinstance(disk_dict, dict):
10784 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10785 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10787 if disk_op == constants.DDM_ADD:
10788 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10789 if mode not in constants.DISK_ACCESS_SET:
10790 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10791 errors.ECODE_INVAL)
10792 size = disk_dict.get(constants.IDISK_SIZE, None)
10794 raise errors.OpPrereqError("Required disk parameter size missing",
10795 errors.ECODE_INVAL)
10798 except (TypeError, ValueError), err:
10799 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10800 str(err), errors.ECODE_INVAL)
10801 disk_dict[constants.IDISK_SIZE] = size
10803 # modification of disk
10804 if constants.IDISK_SIZE in disk_dict:
10805 raise errors.OpPrereqError("Disk size change not possible, use"
10806 " grow-disk", errors.ECODE_INVAL)
10808 if disk_addremove > 1:
10809 raise errors.OpPrereqError("Only one disk add or remove operation"
10810 " supported at a time", errors.ECODE_INVAL)
10812 if self.op.disks and self.op.disk_template is not None:
10813 raise errors.OpPrereqError("Disk template conversion and other disk"
10814 " changes not supported at the same time",
10815 errors.ECODE_INVAL)
10817 if (self.op.disk_template and
10818 self.op.disk_template in constants.DTS_INT_MIRROR and
10819 self.op.remote_node is None):
10820 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10821 " one requires specifying a secondary node",
10822 errors.ECODE_INVAL)
10826 for nic_op, nic_dict in self.op.nics:
10827 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10828 if nic_op == constants.DDM_REMOVE:
10831 elif nic_op == constants.DDM_ADD:
10834 if not isinstance(nic_op, int):
10835 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10836 if not isinstance(nic_dict, dict):
10837 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10838 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10840 # nic_dict should be a dict
10841 nic_ip = nic_dict.get(constants.INIC_IP, None)
10842 if nic_ip is not None:
10843 if nic_ip.lower() == constants.VALUE_NONE:
10844 nic_dict[constants.INIC_IP] = None
10846 if not netutils.IPAddress.IsValid(nic_ip):
10847 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10848 errors.ECODE_INVAL)
10850 nic_bridge = nic_dict.get("bridge", None)
10851 nic_link = nic_dict.get(constants.INIC_LINK, None)
10852 if nic_bridge and nic_link:
10853 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10854 " at the same time", errors.ECODE_INVAL)
10855 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10856 nic_dict["bridge"] = None
10857 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10858 nic_dict[constants.INIC_LINK] = None
10860 if nic_op == constants.DDM_ADD:
10861 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10862 if nic_mac is None:
10863 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10865 if constants.INIC_MAC in nic_dict:
10866 nic_mac = nic_dict[constants.INIC_MAC]
10867 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10868 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10870 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10871 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10872 " modifying an existing nic",
10873 errors.ECODE_INVAL)
10875 if nic_addremove > 1:
10876 raise errors.OpPrereqError("Only one NIC add or remove operation"
10877 " supported at a time", errors.ECODE_INVAL)
10879 def ExpandNames(self):
10880 self._ExpandAndLockInstance()
10881 self.needed_locks[locking.LEVEL_NODE] = []
10882 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10884 def DeclareLocks(self, level):
10885 if level == locking.LEVEL_NODE:
10886 self._LockInstancesNodes()
10887 if self.op.disk_template and self.op.remote_node:
10888 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10889 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10891 def BuildHooksEnv(self):
10892 """Build hooks env.
10894 This runs on the master, primary and secondaries.
10898 if constants.BE_MEMORY in self.be_new:
10899 args["memory"] = self.be_new[constants.BE_MEMORY]
10900 if constants.BE_VCPUS in self.be_new:
10901 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10902 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10903 # information at all.
10906 nic_override = dict(self.op.nics)
10907 for idx, nic in enumerate(self.instance.nics):
10908 if idx in nic_override:
10909 this_nic_override = nic_override[idx]
10911 this_nic_override = {}
10912 if constants.INIC_IP in this_nic_override:
10913 ip = this_nic_override[constants.INIC_IP]
10916 if constants.INIC_MAC in this_nic_override:
10917 mac = this_nic_override[constants.INIC_MAC]
10920 if idx in self.nic_pnew:
10921 nicparams = self.nic_pnew[idx]
10923 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10924 mode = nicparams[constants.NIC_MODE]
10925 link = nicparams[constants.NIC_LINK]
10926 args["nics"].append((ip, mac, mode, link))
10927 if constants.DDM_ADD in nic_override:
10928 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10929 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
10930 nicparams = self.nic_pnew[constants.DDM_ADD]
10931 mode = nicparams[constants.NIC_MODE]
10932 link = nicparams[constants.NIC_LINK]
10933 args["nics"].append((ip, mac, mode, link))
10934 elif constants.DDM_REMOVE in nic_override:
10935 del args["nics"][-1]
10937 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10938 if self.op.disk_template:
10939 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10943 def BuildHooksNodes(self):
10944 """Build hooks nodes.
10947 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10950 def CheckPrereq(self):
10951 """Check prerequisites.
10953 This only checks the instance list against the existing names.
10956 # checking the new params on the primary/secondary nodes
10958 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10959 cluster = self.cluster = self.cfg.GetClusterInfo()
10960 assert self.instance is not None, \
10961 "Cannot retrieve locked instance %s" % self.op.instance_name
10962 pnode = instance.primary_node
10963 nodelist = list(instance.all_nodes)
10966 if self.op.os_name and not self.op.force:
10967 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10968 self.op.force_variant)
10969 instance_os = self.op.os_name
10971 instance_os = instance.os
10973 if self.op.disk_template:
10974 if instance.disk_template == self.op.disk_template:
10975 raise errors.OpPrereqError("Instance already has disk template %s" %
10976 instance.disk_template, errors.ECODE_INVAL)
10978 if (instance.disk_template,
10979 self.op.disk_template) not in self._DISK_CONVERSIONS:
10980 raise errors.OpPrereqError("Unsupported disk template conversion from"
10981 " %s to %s" % (instance.disk_template,
10982 self.op.disk_template),
10983 errors.ECODE_INVAL)
10984 _CheckInstanceDown(self, instance, "cannot change disk template")
10985 if self.op.disk_template in constants.DTS_INT_MIRROR:
10986 if self.op.remote_node == pnode:
10987 raise errors.OpPrereqError("Given new secondary node %s is the same"
10988 " as the primary node of the instance" %
10989 self.op.remote_node, errors.ECODE_STATE)
10990 _CheckNodeOnline(self, self.op.remote_node)
10991 _CheckNodeNotDrained(self, self.op.remote_node)
10992 # FIXME: here we assume that the old instance type is DT_PLAIN
10993 assert instance.disk_template == constants.DT_PLAIN
10994 disks = [{constants.IDISK_SIZE: d.size,
10995 constants.IDISK_VG: d.logical_id[0]}
10996 for d in instance.disks]
10997 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10998 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
11000 # hvparams processing
11001 if self.op.hvparams:
11002 hv_type = instance.hypervisor
11003 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
11004 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
11005 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
11008 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
11009 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
11010 self.hv_proposed = self.hv_new = hv_new # the new actual values
11011 self.hv_inst = i_hvdict # the new dict (without defaults)
11013 self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
11015 self.hv_new = self.hv_inst = {}
11017 # beparams processing
11018 if self.op.beparams:
11019 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
11021 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11022 be_new = cluster.SimpleFillBE(i_bedict)
11023 self.be_proposed = self.be_new = be_new # the new actual values
11024 self.be_inst = i_bedict # the new dict (without defaults)
11026 self.be_new = self.be_inst = {}
11027 self.be_proposed = cluster.SimpleFillBE(instance.beparams)
11028 be_old = cluster.FillBE(instance)
11030 # CPU param validation -- checking every time a parameter is
11031 # changed to cover all cases where either CPU mask or vcpus have
11033 if (constants.BE_VCPUS in self.be_proposed and
11034 constants.HV_CPU_MASK in self.hv_proposed):
11036 utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
11037 # Verify mask is consistent with number of vCPUs. Can skip this
11038 # test if only 1 entry in the CPU mask, which means the same mask
11039 # is applied to all vCPUs.
11040 if (len(cpu_list) > 1 and
11041 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11042 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11044 (self.be_proposed[constants.BE_VCPUS],
11045 self.hv_proposed[constants.HV_CPU_MASK]),
11046 errors.ECODE_INVAL)
11048 # Only perform this test if a new CPU mask is given
11049 if constants.HV_CPU_MASK in self.hv_new:
11050 # Calculate the largest CPU number requested
11051 max_requested_cpu = max(map(max, cpu_list))
11052 # Check that all of the instance's nodes have enough physical CPUs to
11053 # satisfy the requested CPU mask
11054 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11055 max_requested_cpu + 1, instance.hypervisor)
11057 # osparams processing
11058 if self.op.osparams:
11059 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11060 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11061 self.os_inst = i_osdict # the new dict (without defaults)
11067 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
11068 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
11069 mem_check_list = [pnode]
11070 if be_new[constants.BE_AUTO_BALANCE]:
11071 # either we changed auto_balance to yes or it was from before
11072 mem_check_list.extend(instance.secondary_nodes)
11073 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11074 instance.hypervisor)
11075 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11076 instance.hypervisor)
11077 pninfo = nodeinfo[pnode]
11078 msg = pninfo.fail_msg
11080 # Assume the primary node is unreachable and go ahead
11081 self.warn.append("Can't get info from primary node %s: %s" %
11083 elif not isinstance(pninfo.payload.get("memory_free", None), int):
11084 self.warn.append("Node data from primary node %s doesn't contain"
11085 " free memory information" % pnode)
11086 elif instance_info.fail_msg:
11087 self.warn.append("Can't get instance runtime information: %s" %
11088 instance_info.fail_msg)
11090 if instance_info.payload:
11091 current_mem = int(instance_info.payload["memory"])
11093 # Assume instance not running
11094 # (there is a slight race condition here, but it's not very probable,
11095 # and we have no other way to check)
11097 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
11098 pninfo.payload["memory_free"])
11099 if miss_mem > 0:
11100 raise errors.OpPrereqError("This change will prevent the instance"
11101 " from starting, due to %d MB of memory"
11102 " missing on its primary node" % miss_mem,
11103 errors.ECODE_NORES)
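# Worked example (hypothetical numbers): raising BE_MEMORY to 4096 MiB for an
# instance currently using 2048 MiB on a primary node reporting 1024 MiB free
# gives miss_mem = 4096 - 2048 - 1024 = 1024 > 0, so the prerequisite check
# above refuses the change with ECODE_NORES.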
11105 if be_new[constants.BE_AUTO_BALANCE]:
11106 for node, nres in nodeinfo.items():
11107 if node not in instance.secondary_nodes:
11109 nres.Raise("Can't get info from secondary node %s" % node,
11110 prereq=True, ecode=errors.ECODE_STATE)
11111 if not isinstance(nres.payload.get("memory_free", None), int):
11112 raise errors.OpPrereqError("Secondary node %s didn't return free"
11113 " memory information" % node,
11114 errors.ECODE_STATE)
11115 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11116 raise errors.OpPrereqError("This change will prevent the instance"
11117 " from failover to its secondary node"
11118 " %s, due to not enough memory" % node,
11119 errors.ECODE_STATE)
11123 self.nic_pinst = {}
11124 for nic_op, nic_dict in self.op.nics:
11125 if nic_op == constants.DDM_REMOVE:
11126 if not instance.nics:
11127 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11128 errors.ECODE_INVAL)
11130 if nic_op != constants.DDM_ADD:
11132 if not instance.nics:
11133 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11134 " no NICs" % nic_op,
11135 errors.ECODE_INVAL)
11136 if nic_op < 0 or nic_op >= len(instance.nics):
11137 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11138 " are 0 to %d" %
11139 (nic_op, len(instance.nics) - 1),
11140 errors.ECODE_INVAL)
11141 old_nic_params = instance.nics[nic_op].nicparams
11142 old_nic_ip = instance.nics[nic_op].ip
11144 old_nic_params = {}
11147 update_params_dict = dict([(key, nic_dict[key])
11148 for key in constants.NICS_PARAMETERS
11149 if key in nic_dict])
11151 if "bridge" in nic_dict:
11152 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
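# Backwards-compatibility sketch (example value only): an old-style request
# such as {"bridge": "br0"} is folded into the modern parameter set as
# {constants.NIC_LINK: "br0"} before validation and filling with the
# cluster-level NIC defaults.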
11154 new_nic_params = _GetUpdatedParams(old_nic_params,
11155 update_params_dict)
11156 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11157 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11158 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11159 self.nic_pinst[nic_op] = new_nic_params
11160 self.nic_pnew[nic_op] = new_filled_nic_params
11161 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11163 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11164 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11165 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11167 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11169 self.warn.append(msg)
11171 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11172 if new_nic_mode == constants.NIC_MODE_ROUTED:
11173 if constants.INIC_IP in nic_dict:
11174 nic_ip = nic_dict[constants.INIC_IP]
11176 nic_ip = old_nic_ip
11178 raise errors.OpPrereqError("Cannot set the nic ip to None"
11179 " on a routed nic", errors.ECODE_INVAL)
11180 if constants.INIC_MAC in nic_dict:
11181 nic_mac = nic_dict[constants.INIC_MAC]
11182 if nic_mac is None:
11183 raise errors.OpPrereqError("Cannot set the nic mac to None",
11184 errors.ECODE_INVAL)
11185 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11186 # otherwise generate the mac
11187 nic_dict[constants.INIC_MAC] = \
11188 self.cfg.GenerateMAC(self.proc.GetECId())
11190 # or validate/reserve the current one
11192 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11193 except errors.ReservationError:
11194 raise errors.OpPrereqError("MAC address %s already in use"
11195 " in cluster" % nic_mac,
11196 errors.ECODE_NOTUNIQUE)
11199 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11200 raise errors.OpPrereqError("Disk operations not supported for"
11201 " diskless instances",
11202 errors.ECODE_INVAL)
11203 for disk_op, _ in self.op.disks:
11204 if disk_op == constants.DDM_REMOVE:
11205 if len(instance.disks) == 1:
11206 raise errors.OpPrereqError("Cannot remove the last disk of"
11207 " an instance", errors.ECODE_INVAL)
11208 _CheckInstanceDown(self, instance, "cannot remove disks")
11210 if (disk_op == constants.DDM_ADD and
11211 len(instance.disks) >= constants.MAX_DISKS):
11212 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11213 " add more" % constants.MAX_DISKS,
11214 errors.ECODE_STATE)
11215 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11217 if disk_op < 0 or disk_op >= len(instance.disks):
11218 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11219 " are 0 to %d" %
11220 (disk_op, len(instance.disks)),
11221 errors.ECODE_INVAL)
11225 def _ConvertPlainToDrbd(self, feedback_fn):
11226 """Converts an instance from plain to drbd.
11229 feedback_fn("Converting template to drbd")
11230 instance = self.instance
11231 pnode = instance.primary_node
11232 snode = self.op.remote_node
11234 # create a fake disk info for _GenerateDiskTemplate
11235 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11236 constants.IDISK_VG: d.logical_id[0]}
11237 for d in instance.disks]
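# The "fake" disk info above mirrors what instance creation would hand to
# _GenerateDiskTemplate; for a single 1024 MiB plain disk in VG "xenvg" it
# would look roughly like (values hypothetical):
#   [{constants.IDISK_SIZE: 1024, constants.IDISK_MODE: "rw",
#     constants.IDISK_VG: "xenvg"}]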
11238 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11239 instance.name, pnode, [snode],
11240 disk_info, None, None, 0, feedback_fn)
11241 info = _GetInstanceInfoText(instance)
11242 feedback_fn("Creating additional volumes...")
11243 # first, create the missing data and meta devices
11244 for disk in new_disks:
11245 # unfortunately this is... not too nice
11246 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11248 for child in disk.children:
11249 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11250 # at this stage, all new LVs have been created, we can rename the
11251 # old ones
11252 feedback_fn("Renaming original volumes...")
11253 rename_list = [(o, n.children[0].logical_id)
11254 for (o, n) in zip(instance.disks, new_disks)]
11255 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11256 result.Raise("Failed to rename original LVs")
11258 feedback_fn("Initializing DRBD devices...")
11259 # all child devices are in place, we can now create the DRBD devices
11260 for disk in new_disks:
11261 for node in [pnode, snode]:
11262 f_create = node == pnode
11263 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11265 # at this point, the instance has been modified
11266 instance.disk_template = constants.DT_DRBD8
11267 instance.disks = new_disks
11268 self.cfg.Update(instance, feedback_fn)
11270 # disks are created, waiting for sync
11271 disk_abort = not _WaitForSync(self, instance,
11272 oneshot=not self.op.wait_for_sync)
11274 raise errors.OpExecError("There are some degraded disks for"
11275 " this instance, please cleanup manually")
11277 def _ConvertDrbdToPlain(self, feedback_fn):
11278 """Converts an instance from drbd to plain.
11281 instance = self.instance
11282 assert len(instance.secondary_nodes) == 1
11283 pnode = instance.primary_node
11284 snode = instance.secondary_nodes[0]
11285 feedback_fn("Converting template to plain")
11287 old_disks = instance.disks
11288 new_disks = [d.children[0] for d in old_disks]
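# Note: each DRBD8 top-level device has two LV children, the data volume
# first and the metadata volume second, so keeping d.children[0] retains the
# payload while the now-unneeded meta LVs are removed further below.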
11290 # copy over size and mode
11291 for parent, child in zip(old_disks, new_disks):
11292 child.size = parent.size
11293 child.mode = parent.mode
11295 # update instance structure
11296 instance.disks = new_disks
11297 instance.disk_template = constants.DT_PLAIN
11298 self.cfg.Update(instance, feedback_fn)
11300 feedback_fn("Removing volumes on the secondary node...")
11301 for disk in old_disks:
11302 self.cfg.SetDiskID(disk, snode)
11303 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11305 self.LogWarning("Could not remove block device %s on node %s,"
11306 " continuing anyway: %s", disk.iv_name, snode, msg)
11308 feedback_fn("Removing unneeded volumes on the primary node...")
11309 for idx, disk in enumerate(old_disks):
11310 meta = disk.children[1]
11311 self.cfg.SetDiskID(meta, pnode)
11312 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11314 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11315 " continuing anyway: %s", idx, pnode, msg)
11317 def Exec(self, feedback_fn):
11318 """Modifies an instance.
11320 All parameters take effect only at the next restart of the instance.
11323 # Process here the warnings from CheckPrereq, as we don't have a
11324 # feedback_fn there.
11325 for warn in self.warn:
11326 feedback_fn("WARNING: %s" % warn)
11329 instance = self.instance
11331 for disk_op, disk_dict in self.op.disks:
11332 if disk_op == constants.DDM_REMOVE:
11333 # remove the last disk
11334 device = instance.disks.pop()
11335 device_idx = len(instance.disks)
11336 for node, disk in device.ComputeNodeTree(instance.primary_node):
11337 self.cfg.SetDiskID(disk, node)
11338 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11340 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11341 " continuing anyway", device_idx, node, msg)
11342 result.append(("disk/%d" % device_idx, "remove"))
11343 elif disk_op == constants.DDM_ADD:
11345 if instance.disk_template in (constants.DT_FILE,
11346 constants.DT_SHARED_FILE):
11347 file_driver, file_path = instance.disks[0].logical_id
11348 file_path = os.path.dirname(file_path)
11350 file_driver = file_path = None
11351 disk_idx_base = len(instance.disks)
11352 new_disk = _GenerateDiskTemplate(self,
11353 instance.disk_template,
11354 instance.name, instance.primary_node,
11355 instance.secondary_nodes,
11359 disk_idx_base, feedback_fn)[0]
11360 instance.disks.append(new_disk)
11361 info = _GetInstanceInfoText(instance)
11363 logging.info("Creating volume %s for instance %s",
11364 new_disk.iv_name, instance.name)
11365 # Note: this needs to be kept in sync with _CreateDisks
11367 for node in instance.all_nodes:
11368 f_create = node == instance.primary_node
11370 _CreateBlockDev(self, node, instance, new_disk,
11371 f_create, info, f_create)
11372 except errors.OpExecError, err:
11373 self.LogWarning("Failed to create volume %s (%s) on"
11375 new_disk.iv_name, new_disk, node, err)
11376 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11377 (new_disk.size, new_disk.mode)))
11379 # change a given disk
11380 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11381 result.append(("disk.mode/%d" % disk_op,
11382 disk_dict[constants.IDISK_MODE]))
11384 if self.op.disk_template:
11385 r_shut = _ShutdownInstanceDisks(self, instance)
11387 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11388 " proceed with disk template conversion")
11389 mode = (instance.disk_template, self.op.disk_template)
11391 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11393 self.cfg.ReleaseDRBDMinors(instance.name)
11395 result.append(("disk_template", self.op.disk_template))
11398 for nic_op, nic_dict in self.op.nics:
11399 if nic_op == constants.DDM_REMOVE:
11400 # remove the last nic
11401 del instance.nics[-1]
11402 result.append(("nic.%d" % len(instance.nics), "remove"))
11403 elif nic_op == constants.DDM_ADD:
11404 # mac and bridge should be set by now
11405 mac = nic_dict[constants.INIC_MAC]
11406 ip = nic_dict.get(constants.INIC_IP, None)
11407 nicparams = self.nic_pinst[constants.DDM_ADD]
11408 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11409 instance.nics.append(new_nic)
11410 result.append(("nic.%d" % (len(instance.nics) - 1),
11411 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11412 (new_nic.mac, new_nic.ip,
11413 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11414 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11417 for key in (constants.INIC_MAC, constants.INIC_IP):
11418 if key in nic_dict:
11419 setattr(instance.nics[nic_op], key, nic_dict[key])
11420 if nic_op in self.nic_pinst:
11421 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11422 for key, val in nic_dict.iteritems():
11423 result.append(("nic.%s/%d" % (key, nic_op), val))
11426 if self.op.hvparams:
11427 instance.hvparams = self.hv_inst
11428 for key, val in self.op.hvparams.iteritems():
11429 result.append(("hv/%s" % key, val))
11432 if self.op.beparams:
11433 instance.beparams = self.be_inst
11434 for key, val in self.op.beparams.iteritems():
11435 result.append(("be/%s" % key, val))
11438 if self.op.os_name:
11439 instance.os = self.op.os_name
11442 if self.op.osparams:
11443 instance.osparams = self.os_inst
11444 for key, val in self.op.osparams.iteritems():
11445 result.append(("os/%s" % key, val))
11447 self.cfg.Update(instance, feedback_fn)
11451 _DISK_CONVERSIONS = {
11452 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11453 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
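# Exec() dispatches template conversions through this mapping, keyed by the
# (current, requested) template pair; e.g. a plain -> drbd request resolves
# to _ConvertPlainToDrbd via
#   self._DISK_CONVERSIONS[(constants.DT_PLAIN, constants.DT_DRBD8)]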
11457 class LUInstanceChangeGroup(LogicalUnit):
11458 HPATH = "instance-change-group"
11459 HTYPE = constants.HTYPE_INSTANCE
11462 def ExpandNames(self):
11463 self.share_locks = _ShareAll()
11464 self.needed_locks = {
11465 locking.LEVEL_NODEGROUP: [],
11466 locking.LEVEL_NODE: [],
11469 self._ExpandAndLockInstance()
11471 if self.op.target_groups:
11472 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11473 self.op.target_groups)
11475 self.req_target_uuids = None
11477 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11479 def DeclareLocks(self, level):
11480 if level == locking.LEVEL_NODEGROUP:
11481 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11483 if self.req_target_uuids:
11484 lock_groups = set(self.req_target_uuids)
11486 # Lock all groups used by instance optimistically; this requires going
11487 # via the node before it's locked, requiring verification later on
11488 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11489 lock_groups.update(instance_groups)
11491 # No target groups, need to lock all of them
11492 lock_groups = locking.ALL_SET
11494 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11496 elif level == locking.LEVEL_NODE:
11497 if self.req_target_uuids:
11498 # Lock all nodes used by instances
11499 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11500 self._LockInstancesNodes()
11502 # Lock all nodes in all potential target groups
11503 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11504 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11505 member_nodes = [node_name
11506 for group in lock_groups
11507 for node_name in self.cfg.GetNodeGroup(group).members]
11508 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11510 # Lock all nodes as all groups are potential targets
11511 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11513 def CheckPrereq(self):
11514 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11515 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11516 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11518 assert (self.req_target_uuids is None or
11519 owned_groups.issuperset(self.req_target_uuids))
11520 assert owned_instances == set([self.op.instance_name])
11522 # Get instance information
11523 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11525 # Check if node groups for locked instance are still correct
11526 assert owned_nodes.issuperset(self.instance.all_nodes), \
11527 ("Instance %s's nodes changed while we kept the lock" %
11528 self.op.instance_name)
11530 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11533 if self.req_target_uuids:
11534 # User requested specific target groups
11535 self.target_uuids = self.req_target_uuids
11537 # All groups except those used by the instance are potential targets
11538 self.target_uuids = owned_groups - inst_groups
11540 conflicting_groups = self.target_uuids & inst_groups
11541 if conflicting_groups:
11542 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11543 " used by the instance '%s'" %
11544 (utils.CommaJoin(conflicting_groups),
11545 self.op.instance_name),
11546 errors.ECODE_INVAL)
11548 if not self.target_uuids:
11549 raise errors.OpPrereqError("There are no possible target groups",
11550 errors.ECODE_INVAL)
11552 def BuildHooksEnv(self):
11553 """Build hooks env.
11556 assert self.target_uuids
11559 "TARGET_GROUPS": " ".join(self.target_uuids),
11562 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11566 def BuildHooksNodes(self):
11567 """Build hooks nodes.
11570 mn = self.cfg.GetMasterNode()
11571 return ([mn], [mn])
11573 def Exec(self, feedback_fn):
11574 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11576 assert instances == [self.op.instance_name], "Instance not locked"
11578 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11579 instances=instances, target_groups=list(self.target_uuids))
11581 ial.Run(self.op.iallocator)
11583 if not ial.success:
11584 raise errors.OpPrereqError("Can't compute solution for changing group of"
11585 " instance '%s' using iallocator '%s': %s" %
11586 (self.op.instance_name, self.op.iallocator,
11588 errors.ECODE_NORES)
11590 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
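# "jobs" is a list of job definitions, each itself a list of opcodes built
# from the iallocator result; a hypothetical sketch (opcode choice made up
# for illustration) could look like:
#   [[opcodes.OpInstanceMigrate(instance_name="inst1")],
#    [opcodes.OpInstanceFailover(instance_name="inst2")]]
# Returning them wrapped in ResultWithJobs lets mcpu submit the jobs and
# report the resulting job IDs to the caller.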
11592 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11593 " instance '%s'", len(jobs), self.op.instance_name)
11595 return ResultWithJobs(jobs)
11598 class LUBackupQuery(NoHooksLU):
11599 """Query the exports list
11604 def ExpandNames(self):
11605 self.needed_locks = {}
11606 self.share_locks[locking.LEVEL_NODE] = 1
11607 if not self.op.nodes:
11608 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11610 self.needed_locks[locking.LEVEL_NODE] = \
11611 _GetWantedNodes(self, self.op.nodes)
11613 def Exec(self, feedback_fn):
11614 """Compute the list of all the exported system images.
11617 @return: a dictionary with the structure node->(export-list)
11618 where export-list is a list of the instances exported on
11619 that node.
11622 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11623 rpcresult = self.rpc.call_export_list(self.nodes)
11625 for node in rpcresult:
11626 if rpcresult[node].fail_msg:
11627 result[node] = False
11629 result[node] = rpcresult[node].payload
11634 class LUBackupPrepare(NoHooksLU):
11635 """Prepares an instance for an export and returns useful information.
11640 def ExpandNames(self):
11641 self._ExpandAndLockInstance()
11643 def CheckPrereq(self):
11644 """Check prerequisites.
11647 instance_name = self.op.instance_name
11649 self.instance = self.cfg.GetInstanceInfo(instance_name)
11650 assert self.instance is not None, \
11651 "Cannot retrieve locked instance %s" % self.op.instance_name
11652 _CheckNodeOnline(self, self.instance.primary_node)
11654 self._cds = _GetClusterDomainSecret()
11656 def Exec(self, feedback_fn):
11657 """Prepares an instance for an export.
11660 instance = self.instance
11662 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11663 salt = utils.GenerateSecret(8)
11665 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11666 result = self.rpc.call_x509_cert_create(instance.primary_node,
11667 constants.RIE_CERT_VALIDITY)
11668 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11670 (name, cert_pem) = result.payload
11672 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11676 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11677 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
11679 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11685 class LUBackupExport(LogicalUnit):
11686 """Export an instance to an image in the cluster.
11689 HPATH = "instance-export"
11690 HTYPE = constants.HTYPE_INSTANCE
11693 def CheckArguments(self):
11694 """Check the arguments.
11697 self.x509_key_name = self.op.x509_key_name
11698 self.dest_x509_ca_pem = self.op.destination_x509_ca
11700 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11701 if not self.x509_key_name:
11702 raise errors.OpPrereqError("Missing X509 key name for encryption",
11703 errors.ECODE_INVAL)
11705 if not self.dest_x509_ca_pem:
11706 raise errors.OpPrereqError("Missing destination X509 CA",
11707 errors.ECODE_INVAL)
11709 def ExpandNames(self):
11710 self._ExpandAndLockInstance()
11712 # Lock all nodes for local exports
11713 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11714 # FIXME: lock only instance primary and destination node
11716 # Sad but true, for now we have to lock all nodes, as we don't know where
11717 # the previous export might be, and in this LU we search for it and
11718 # remove it from its current node. In the future we could fix this by:
11719 # - making a tasklet to search (share-lock all), then create the
11720 # new one, then one to remove, after
11721 # - removing the removal operation altogether
11722 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11724 def DeclareLocks(self, level):
11725 """Last minute lock declaration."""
11726 # All nodes are locked anyway, so nothing to do here.
11728 def BuildHooksEnv(self):
11729 """Build hooks env.
11731 This will run on the master, primary node and target node.
11735 "EXPORT_MODE": self.op.mode,
11736 "EXPORT_NODE": self.op.target_node,
11737 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11738 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11739 # TODO: Generic function for boolean env variables
11740 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11743 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11747 def BuildHooksNodes(self):
11748 """Build hooks nodes.
11751 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11753 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11754 nl.append(self.op.target_node)
11758 def CheckPrereq(self):
11759 """Check prerequisites.
11761 This checks that the instance and node names are valid.
11764 instance_name = self.op.instance_name
11766 self.instance = self.cfg.GetInstanceInfo(instance_name)
11767 assert self.instance is not None, \
11768 "Cannot retrieve locked instance %s" % self.op.instance_name
11769 _CheckNodeOnline(self, self.instance.primary_node)
11771 if (self.op.remove_instance and self.instance.admin_up and
11772 not self.op.shutdown):
11773 raise errors.OpPrereqError("Can not remove instance without shutting it"
11776 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11777 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11778 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11779 assert self.dst_node is not None
11781 _CheckNodeOnline(self, self.dst_node.name)
11782 _CheckNodeNotDrained(self, self.dst_node.name)
11785 self.dest_disk_info = None
11786 self.dest_x509_ca = None
11788 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11789 self.dst_node = None
11791 if len(self.op.target_node) != len(self.instance.disks):
11792 raise errors.OpPrereqError(("Received destination information for %s"
11793 " disks, but instance %s has %s disks") %
11794 (len(self.op.target_node), instance_name,
11795 len(self.instance.disks)),
11796 errors.ECODE_INVAL)
11798 cds = _GetClusterDomainSecret()
11800 # Check X509 key name
11802 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11803 except (TypeError, ValueError), err:
11804 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11806 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11807 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11808 errors.ECODE_INVAL)
11810 # Load and verify CA
11812 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11813 except OpenSSL.crypto.Error, err:
11814 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11815 (err, ), errors.ECODE_INVAL)
11817 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11818 if errcode is not None:
11819 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11820 (msg, ), errors.ECODE_INVAL)
11822 self.dest_x509_ca = cert
11824 # Verify target information
11826 for idx, disk_data in enumerate(self.op.target_node):
11828 (host, port, magic) = \
11829 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11830 except errors.GenericError, err:
11831 raise errors.OpPrereqError("Target info for disk %s: %s" %
11832 (idx, err), errors.ECODE_INVAL)
11834 disk_info.append((host, port, magic))
11836 assert len(disk_info) == len(self.op.target_node)
11837 self.dest_disk_info = disk_info
11840 raise errors.ProgrammerError("Unhandled export mode %r" %
11843 # instance disk type verification
11844 # TODO: Implement export support for file-based disks
11845 for disk in self.instance.disks:
11846 if disk.dev_type == constants.LD_FILE:
11847 raise errors.OpPrereqError("Export not supported for instances with"
11848 " file-based disks", errors.ECODE_INVAL)
11850 def _CleanupExports(self, feedback_fn):
11851 """Removes exports of current instance from all other nodes.
11853 If an instance in a cluster with nodes A..D was exported to node C, its
11854 exports will be removed from the nodes A, B and D.
11857 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11859 nodelist = self.cfg.GetNodeList()
11860 nodelist.remove(self.dst_node.name)
11862 # on one-node clusters nodelist will be empty after the removal
11863 # if we proceed the backup would be removed because OpBackupQuery
11864 # substitutes an empty list with the full cluster node list.
11865 iname = self.instance.name
11867 feedback_fn("Removing old exports for instance %s" % iname)
11868 exportlist = self.rpc.call_export_list(nodelist)
11869 for node in exportlist:
11870 if exportlist[node].fail_msg:
11872 if iname in exportlist[node].payload:
11873 msg = self.rpc.call_export_remove(node, iname).fail_msg
11875 self.LogWarning("Could not remove older export for instance %s"
11876 " on node %s: %s", iname, node, msg)
11878 def Exec(self, feedback_fn):
11879 """Export an instance to an image in the cluster.
11882 assert self.op.mode in constants.EXPORT_MODES
11884 instance = self.instance
11885 src_node = instance.primary_node
11887 if self.op.shutdown:
11888 # shutdown the instance, but not the disks
11889 feedback_fn("Shutting down instance %s" % instance.name)
11890 result = self.rpc.call_instance_shutdown(src_node, instance,
11891 self.op.shutdown_timeout)
11892 # TODO: Maybe ignore failures if ignore_remove_failures is set
11893 result.Raise("Could not shutdown instance %s on"
11894 " node %s" % (instance.name, src_node))
11896 # set the disks ID correctly since call_instance_start needs the
11897 # correct drbd minor to create the symlinks
11898 for disk in instance.disks:
11899 self.cfg.SetDiskID(disk, src_node)
11901 activate_disks = (not instance.admin_up)
11904 # Activate the instance disks if we're exporting a stopped instance
11905 feedback_fn("Activating disks for %s" % instance.name)
11906 _StartInstanceDisks(self, instance, None)
11909 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11912 helper.CreateSnapshots()
11914 if (self.op.shutdown and instance.admin_up and
11915 not self.op.remove_instance):
11916 assert not activate_disks
11917 feedback_fn("Starting instance %s" % instance.name)
11918 result = self.rpc.call_instance_start(src_node,
11919 (instance, None, None), False)
11920 msg = result.fail_msg
11922 feedback_fn("Failed to start instance: %s" % msg)
11923 _ShutdownInstanceDisks(self, instance)
11924 raise errors.OpExecError("Could not start instance: %s" % msg)
11926 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11927 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11928 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11929 connect_timeout = constants.RIE_CONNECT_TIMEOUT
11930 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
11932 (key_name, _, _) = self.x509_key_name
11935 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
11938 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
11939 key_name, dest_ca_pem,
11944 # Check for backwards compatibility
11945 assert len(dresults) == len(instance.disks)
11946 assert compat.all(isinstance(i, bool) for i in dresults), \
11947 "Not all results are boolean: %r" % dresults
11951 feedback_fn("Deactivating disks for %s" % instance.name)
11952 _ShutdownInstanceDisks(self, instance)
11954 if not (compat.all(dresults) and fin_resu):
11957 failures.append("export finalization")
11958 if not compat.all(dresults):
11959 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
11961 failures.append("disk export: disk(s) %s" % fdsk)
11963 raise errors.OpExecError("Export failed, errors in %s" %
11964 utils.CommaJoin(failures))
11966 # At this point, the export was successful, we can cleanup/finish
11968 # Remove instance if requested
11969 if self.op.remove_instance:
11970 feedback_fn("Removing instance %s" % instance.name)
11971 _RemoveInstance(self, feedback_fn, instance,
11972 self.op.ignore_remove_failures)
11974 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11975 self._CleanupExports(feedback_fn)
11977 return fin_resu, dresults
11980 class LUBackupRemove(NoHooksLU):
11981 """Remove exports related to the named instance.
11986 def ExpandNames(self):
11987 self.needed_locks = {}
11988 # We need all nodes to be locked in order for RemoveExport to work, but we
11989 # don't need to lock the instance itself, as nothing will happen to it (and
11990 # we can remove exports also for a removed instance)
11991 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11993 def Exec(self, feedback_fn):
11994 """Remove any export.
11997 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
11998 # If the instance was not found we'll try with the name that was passed in.
11999 # This will only work if it was an FQDN, though.
12001 if not instance_name:
12003 instance_name = self.op.instance_name
12005 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
12006 exportlist = self.rpc.call_export_list(locked_nodes)
12008 for node in exportlist:
12009 msg = exportlist[node].fail_msg
12011 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
12013 if instance_name in exportlist[node].payload:
12015 result = self.rpc.call_export_remove(node, instance_name)
12016 msg = result.fail_msg
12018 logging.error("Could not remove export for instance %s"
12019 " on node %s: %s", instance_name, node, msg)
12021 if fqdn_warn and not found:
12022 feedback_fn("Export not found. If trying to remove an export belonging"
12023 " to a deleted instance please use its Fully Qualified"
12024 " Domain Name.")
12027 class LUGroupAdd(LogicalUnit):
12028 """Logical unit for creating node groups.
12031 HPATH = "group-add"
12032 HTYPE = constants.HTYPE_GROUP
12035 def ExpandNames(self):
12036 # We need the new group's UUID here so that we can create and acquire the
12037 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12038 # that it should not check whether the UUID exists in the configuration.
12039 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12040 self.needed_locks = {}
12041 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12043 def CheckPrereq(self):
12044 """Check prerequisites.
12046 This checks that the given group name is not an existing node group
12051 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12052 except errors.OpPrereqError:
12055 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12056 " node group (UUID: %s)" %
12057 (self.op.group_name, existing_uuid),
12058 errors.ECODE_EXISTS)
12060 if self.op.ndparams:
12061 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12063 def BuildHooksEnv(self):
12064 """Build hooks env.
12068 "GROUP_NAME": self.op.group_name,
12071 def BuildHooksNodes(self):
12072 """Build hooks nodes.
12075 mn = self.cfg.GetMasterNode()
12076 return ([mn], [mn])
12078 def Exec(self, feedback_fn):
12079 """Add the node group to the cluster.
12082 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12083 uuid=self.group_uuid,
12084 alloc_policy=self.op.alloc_policy,
12085 ndparams=self.op.ndparams)
12087 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12088 del self.remove_locks[locking.LEVEL_NODEGROUP]
12091 class LUGroupAssignNodes(NoHooksLU):
12092 """Logical unit for assigning nodes to groups.
12097 def ExpandNames(self):
12098 # These raise errors.OpPrereqError on their own:
12099 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12100 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12102 # We want to lock all the affected nodes and groups. We have readily
12103 # available the list of nodes, and the *destination* group. To gather the
12104 # list of "source" groups, we need to fetch node information later on.
12105 self.needed_locks = {
12106 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12107 locking.LEVEL_NODE: self.op.nodes,
12110 def DeclareLocks(self, level):
12111 if level == locking.LEVEL_NODEGROUP:
12112 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12114 # Try to get all affected nodes' groups without having the group or node
12115 # lock yet. Needs verification later in the code flow.
12116 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12118 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12120 def CheckPrereq(self):
12121 """Check prerequisites.
12124 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12125 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12126 frozenset(self.op.nodes))
12128 expected_locks = (set([self.group_uuid]) |
12129 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12130 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12131 if actual_locks != expected_locks:
12132 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12133 " current groups are '%s', used to be '%s'" %
12134 (utils.CommaJoin(expected_locks),
12135 utils.CommaJoin(actual_locks)))
12137 self.node_data = self.cfg.GetAllNodesInfo()
12138 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12139 instance_data = self.cfg.GetAllInstancesInfo()
12141 if self.group is None:
12142 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12143 (self.op.group_name, self.group_uuid))
12145 (new_splits, previous_splits) = \
12146 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12147 for node in self.op.nodes],
12148 self.node_data, instance_data)
12151 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12153 if not self.op.force:
12154 raise errors.OpExecError("The following instances get split by this"
12155 " change and --force was not given: %s" %
12158 self.LogWarning("This operation will split the following instances: %s",
12161 if previous_splits:
12162 self.LogWarning("In addition, these already-split instances continue"
12163 " to be split across groups: %s",
12164 utils.CommaJoin(utils.NiceSort(previous_splits)))
12166 def Exec(self, feedback_fn):
12167 """Assign nodes to a new group.
12170 for node in self.op.nodes:
12171 self.node_data[node].group = self.group_uuid
12173 # FIXME: Depends on side-effects of modifying the result of
12174 # C{cfg.GetAllNodesInfo}
12176 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
12178 @staticmethod
12179 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12180 """Check for split instances after a node assignment.
12182 This method considers a series of node assignments as an atomic operation,
12183 and returns information about split instances after applying the set of
12186 In particular, it returns information about newly split instances, and
12187 instances that were already split, and remain so after the change.
12189 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12190 considered.
12192 @type changes: list of (node_name, new_group_uuid) pairs.
12193 @param changes: list of node assignments to consider.
12194 @param node_data: a dict with data for all nodes
12195 @param instance_data: a dict with all instances to consider
12196 @rtype: a two-tuple
12197 @return: a list of instances that were previously okay and become split as a
12198 consequence of this change, and a list of instances that were previously
12199 split and which this change does not fix.
12202 changed_nodes = dict((node, group) for node, group in changes
12203 if node_data[node].group != group)
12205 all_split_instances = set()
12206 previously_split_instances = set()
12208 def InstanceNodes(instance):
12209 return [instance.primary_node] + list(instance.secondary_nodes)
12211 for inst in instance_data.values():
12212 if inst.disk_template not in constants.DTS_INT_MIRROR:
12215 instance_nodes = InstanceNodes(inst)
12217 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12218 previously_split_instances.add(inst.name)
12220 if len(set(changed_nodes.get(node, node_data[node].group)
12221 for node in instance_nodes)) > 1:
12222 all_split_instances.add(inst.name)
12224 return (list(all_split_instances - previously_split_instances),
12225 list(previously_split_instances & all_split_instances))
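# Minimal usage sketch (hypothetical data): if a DRBD instance spans
# "node1"/"node2" and only "node2" is moved to group B, the instance becomes
# newly split and appears in the first element of the returned tuple:
#   changes = [("node2", "uuid-of-group-B")]
#   (new_splits, old_splits) = CheckAssignmentForSplitInstances(
#       changes, node_data, instance_data)  # node/instance data from config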
12228 class _GroupQuery(_QueryBase):
12229 FIELDS = query.GROUP_FIELDS
12231 def ExpandNames(self, lu):
12232 lu.needed_locks = {}
12234 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12235 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12238 self.wanted = [name_to_uuid[name]
12239 for name in utils.NiceSort(name_to_uuid.keys())]
12241 # Accept names to be either names or UUIDs.
12244 all_uuid = frozenset(self._all_groups.keys())
12246 for name in self.names:
12247 if name in all_uuid:
12248 self.wanted.append(name)
12249 elif name in name_to_uuid:
12250 self.wanted.append(name_to_uuid[name])
12252 missing.append(name)
12255 raise errors.OpPrereqError("Some groups do not exist: %s" %
12256 utils.CommaJoin(missing),
12257 errors.ECODE_NOENT)
12259 def DeclareLocks(self, lu, level):
12262 def _GetQueryData(self, lu):
12263 """Computes the list of node groups and their attributes.
12266 do_nodes = query.GQ_NODE in self.requested_data
12267 do_instances = query.GQ_INST in self.requested_data
12269 group_to_nodes = None
12270 group_to_instances = None
12272 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12273 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12274 # latter GetAllInstancesInfo() is not enough, for we have to go through
12275 # instance->node. Hence, we will need to process nodes even if we only need
12276 # instance information.
12277 if do_nodes or do_instances:
12278 all_nodes = lu.cfg.GetAllNodesInfo()
12279 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12282 for node in all_nodes.values():
12283 if node.group in group_to_nodes:
12284 group_to_nodes[node.group].append(node.name)
12285 node_to_group[node.name] = node.group
12288 all_instances = lu.cfg.GetAllInstancesInfo()
12289 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12291 for instance in all_instances.values():
12292 node = instance.primary_node
12293 if node in node_to_group:
12294 group_to_instances[node_to_group[node]].append(instance.name)
12297 # Do not pass on node information if it was not requested.
12298 group_to_nodes = None
12300 return query.GroupQueryData([self._all_groups[uuid]
12301 for uuid in self.wanted],
12302 group_to_nodes, group_to_instances)
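# Shape of the intermediate mappings (hypothetical cluster): with nodes
# "node1"/"node2" in group "g1" and an instance "inst1" whose primary node is
# "node1", the code above builds roughly
#   group_to_nodes     = {"g1-uuid": ["node1", "node2"]}
#   group_to_instances = {"g1-uuid": ["inst1"]}
# before handing both to query.GroupQueryData.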
12305 class LUGroupQuery(NoHooksLU):
12306 """Logical unit for querying node groups.
12311 def CheckArguments(self):
12312 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12313 self.op.output_fields, False)
12315 def ExpandNames(self):
12316 self.gq.ExpandNames(self)
12318 def DeclareLocks(self, level):
12319 self.gq.DeclareLocks(self, level)
12321 def Exec(self, feedback_fn):
12322 return self.gq.OldStyleQuery(self)
12325 class LUGroupSetParams(LogicalUnit):
12326 """Modifies the parameters of a node group.
12329 HPATH = "group-modify"
12330 HTYPE = constants.HTYPE_GROUP
12333 def CheckArguments(self):
12336 self.op.alloc_policy,
12339 if all_changes.count(None) == len(all_changes):
12340 raise errors.OpPrereqError("Please pass at least one modification",
12341 errors.ECODE_INVAL)
12343 def ExpandNames(self):
12344 # This raises errors.OpPrereqError on its own:
12345 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12347 self.needed_locks = {
12348 locking.LEVEL_NODEGROUP: [self.group_uuid],
12351 def CheckPrereq(self):
12352 """Check prerequisites.
12355 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12357 if self.group is None:
12358 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12359 (self.op.group_name, self.group_uuid))
12361 if self.op.ndparams:
12362 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12363 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12364 self.new_ndparams = new_ndparams
12366 def BuildHooksEnv(self):
12367 """Build hooks env.
12371 "GROUP_NAME": self.op.group_name,
12372 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12375 def BuildHooksNodes(self):
12376 """Build hooks nodes.
12379 mn = self.cfg.GetMasterNode()
12380 return ([mn], [mn])
12382 def Exec(self, feedback_fn):
12383 """Modifies the node group.
12388 if self.op.ndparams:
12389 self.group.ndparams = self.new_ndparams
12390 result.append(("ndparams", str(self.group.ndparams)))
12392 if self.op.alloc_policy:
12393 self.group.alloc_policy = self.op.alloc_policy
12395 self.cfg.Update(self.group, feedback_fn)
12399 class LUGroupRemove(LogicalUnit):
12400 HPATH = "group-remove"
12401 HTYPE = constants.HTYPE_GROUP
12404 def ExpandNames(self):
12405 # This raises errors.OpPrereqError on its own:
12406 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12407 self.needed_locks = {
12408 locking.LEVEL_NODEGROUP: [self.group_uuid],
12411 def CheckPrereq(self):
12412 """Check prerequisites.
12414 This checks that the given group name exists as a node group, that it is
12415 empty (i.e., contains no nodes), and that it is not the last group of the
12416 cluster.
12419 # Verify that the group is empty.
12420 group_nodes = [node.name
12421 for node in self.cfg.GetAllNodesInfo().values()
12422 if node.group == self.group_uuid]
12425 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12426 " nodes: %s" %
12427 (self.op.group_name,
12428 utils.CommaJoin(utils.NiceSort(group_nodes))),
12429 errors.ECODE_STATE)
12431 # Verify the cluster would not be left group-less.
12432 if len(self.cfg.GetNodeGroupList()) == 1:
12433 raise errors.OpPrereqError("Group '%s' is the only group,"
12434 " cannot be removed" %
12435 self.op.group_name,
12436 errors.ECODE_STATE)
12438 def BuildHooksEnv(self):
12439 """Build hooks env.
12443 "GROUP_NAME": self.op.group_name,
12446 def BuildHooksNodes(self):
12447 """Build hooks nodes.
12450 mn = self.cfg.GetMasterNode()
12451 return ([mn], [mn])
12453 def Exec(self, feedback_fn):
12454 """Remove the node group.
12458 self.cfg.RemoveNodeGroup(self.group_uuid)
12459 except errors.ConfigurationError:
12460 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12461 (self.op.group_name, self.group_uuid))
12463 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12466 class LUGroupRename(LogicalUnit):
12467 HPATH = "group-rename"
12468 HTYPE = constants.HTYPE_GROUP
12471 def ExpandNames(self):
12472 # This raises errors.OpPrereqError on its own:
12473 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12475 self.needed_locks = {
12476 locking.LEVEL_NODEGROUP: [self.group_uuid],
12479 def CheckPrereq(self):
12480 """Check prerequisites.
12482 Ensures requested new name is not yet used.
12486 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12487 except errors.OpPrereqError:
12490 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12491 " node group (UUID: %s)" %
12492 (self.op.new_name, new_name_uuid),
12493 errors.ECODE_EXISTS)
12495 def BuildHooksEnv(self):
12496 """Build hooks env.
12500 "OLD_NAME": self.op.group_name,
12501 "NEW_NAME": self.op.new_name,
12504 def BuildHooksNodes(self):
12505 """Build hooks nodes.
12508 mn = self.cfg.GetMasterNode()
12510 all_nodes = self.cfg.GetAllNodesInfo()
12511 all_nodes.pop(mn, None)
12514 run_nodes.extend(node.name for node in all_nodes.values()
12515 if node.group == self.group_uuid)
12517 return (run_nodes, run_nodes)
12519 def Exec(self, feedback_fn):
12520 """Rename the node group.
12523 group = self.cfg.GetNodeGroup(self.group_uuid)
12526 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12527 (self.op.group_name, self.group_uuid))
12529 group.name = self.op.new_name
12530 self.cfg.Update(group, feedback_fn)
12532 return self.op.new_name
12535 class LUGroupEvacuate(LogicalUnit):
12536 HPATH = "group-evacuate"
12537 HTYPE = constants.HTYPE_GROUP
12540 def ExpandNames(self):
12541 # This raises errors.OpPrereqError on its own:
12542 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12544 if self.op.target_groups:
12545 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12546 self.op.target_groups)
12548 self.req_target_uuids = []
12550 if self.group_uuid in self.req_target_uuids:
12551 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12552 " as a target group (targets are %s)" %
12554 utils.CommaJoin(self.req_target_uuids)),
12555 errors.ECODE_INVAL)
12557 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12559 self.share_locks = _ShareAll()
12560 self.needed_locks = {
12561 locking.LEVEL_INSTANCE: [],
12562 locking.LEVEL_NODEGROUP: [],
12563 locking.LEVEL_NODE: [],
12566 def DeclareLocks(self, level):
12567 if level == locking.LEVEL_INSTANCE:
12568 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12570 # Lock instances optimistically, needs verification once node and group
12571 # locks have been acquired
12572 self.needed_locks[locking.LEVEL_INSTANCE] = \
12573 self.cfg.GetNodeGroupInstances(self.group_uuid)
12575 elif level == locking.LEVEL_NODEGROUP:
12576 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12578 if self.req_target_uuids:
12579 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12581 # Lock all groups used by instances optimistically; this requires going
12582 # via the node before it's locked, requiring verification later on
12583 lock_groups.update(group_uuid
12584 for instance_name in
12585 self.owned_locks(locking.LEVEL_INSTANCE)
12587 self.cfg.GetInstanceNodeGroups(instance_name))
12589 # No target groups, need to lock all of them
12590 lock_groups = locking.ALL_SET
12592 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12594 elif level == locking.LEVEL_NODE:
12595 # This will only lock the nodes in the group to be evacuated which
12596 # contain actual instances
12597 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12598 self._LockInstancesNodes()
12600 # Lock all nodes in group to be evacuated and target groups
12601 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12602 assert self.group_uuid in owned_groups
12603 member_nodes = [node_name
12604 for group in owned_groups
12605 for node_name in self.cfg.GetNodeGroup(group).members]
12606 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12608 def CheckPrereq(self):
12609 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12610 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12611 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12613 assert owned_groups.issuperset(self.req_target_uuids)
12614 assert self.group_uuid in owned_groups
12616 # Check if locked instances are still correct
12617 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12619 # Get instance information
12620 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12622 # Check if node groups for locked instances are still correct
12623 for instance_name in owned_instances:
12624 inst = self.instances[instance_name]
12625 assert owned_nodes.issuperset(inst.all_nodes), \
12626 "Instance %s's nodes changed while we kept the lock" % instance_name
12628 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
12631 assert self.group_uuid in inst_groups, \
12632 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
12634 if self.req_target_uuids:
12635 # User requested specific target groups
12636 self.target_uuids = self.req_target_uuids
12638 # All groups except the one to be evacuated are potential targets
12639 self.target_uuids = [group_uuid for group_uuid in owned_groups
12640 if group_uuid != self.group_uuid]
12642 if not self.target_uuids:
12643 raise errors.OpPrereqError("There are no possible target groups",
12644 errors.ECODE_INVAL)
12646 def BuildHooksEnv(self):
12647 """Build hooks env.
12651 "GROUP_NAME": self.op.group_name,
12652 "TARGET_GROUPS": " ".join(self.target_uuids),
12655 def BuildHooksNodes(self):
12656 """Build hooks nodes.
12659 mn = self.cfg.GetMasterNode()
12661 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12663 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12665 return (run_nodes, run_nodes)
12667 def Exec(self, feedback_fn):
12668 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12670 assert self.group_uuid not in self.target_uuids
12672 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12673 instances=instances, target_groups=self.target_uuids)
12675 ial.Run(self.op.iallocator)
12677 if not ial.success:
12678 raise errors.OpPrereqError("Can't compute group evacuation using"
12679 " iallocator '%s': %s" %
12680 (self.op.iallocator, ial.info),
12681 errors.ECODE_NORES)
12683 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12685 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12686 len(jobs), self.op.group_name)
12688 return ResultWithJobs(jobs)
12691 class TagsLU(NoHooksLU): # pylint: disable=W0223
12692 """Generic tags LU.
12694 This is an abstract class which is the parent of all the other tags LUs.
12697 def ExpandNames(self):
12698 self.group_uuid = None
12699 self.needed_locks = {}
12700 if self.op.kind == constants.TAG_NODE:
12701 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12702 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12703 elif self.op.kind == constants.TAG_INSTANCE:
12704 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12705 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12706 elif self.op.kind == constants.TAG_NODEGROUP:
12707 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12709 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12710 # not possible to acquire the BGL based on opcode parameters)
12712 def CheckPrereq(self):
12713 """Check prerequisites.
12716 if self.op.kind == constants.TAG_CLUSTER:
12717 self.target = self.cfg.GetClusterInfo()
12718 elif self.op.kind == constants.TAG_NODE:
12719 self.target = self.cfg.GetNodeInfo(self.op.name)
12720 elif self.op.kind == constants.TAG_INSTANCE:
12721 self.target = self.cfg.GetInstanceInfo(self.op.name)
12722 elif self.op.kind == constants.TAG_NODEGROUP:
12723 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12725 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12726 str(self.op.kind), errors.ECODE_INVAL)
12729 class LUTagsGet(TagsLU):
12730 """Returns the tags of a given object.
12735 def ExpandNames(self):
12736 TagsLU.ExpandNames(self)
12738 # Share locks as this is only a read operation
12739 self.share_locks = _ShareAll()
12741 def Exec(self, feedback_fn):
12742 """Returns the tag list.
12745 return list(self.target.GetTags())
12748 class LUTagsSearch(NoHooksLU):
12749 """Searches the tags for a given pattern.
12754 def ExpandNames(self):
12755 self.needed_locks = {}
12757 def CheckPrereq(self):
12758 """Check prerequisites.
12760 This checks the pattern passed for validity by compiling it.
12764 self.re = re.compile(self.op.pattern)
12765 except re.error, err:
12766 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12767 (self.op.pattern, err), errors.ECODE_INVAL)
12769 def Exec(self, feedback_fn):
12770 """Returns the tag list.
12774 tgts = [("/cluster", cfg.GetClusterInfo())]
12775 ilist = cfg.GetAllInstancesInfo().values()
12776 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12777 nlist = cfg.GetAllNodesInfo().values()
12778 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12779 tgts.extend(("/nodegroup/%s" % n.name, n)
12780 for n in cfg.GetAllNodeGroupsInfo().values())
12782 for path, target in tgts:
12783 for tag in target.GetTags():
12784 if self.re.search(tag):
12785 results.append((path, tag))
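# Each match is collected as a (path, tag) pair, e.g. (hypothetical)
# ("/instances/inst1", "env:production") when the pattern matches that tag.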
12789 class LUTagsSet(TagsLU):
12790 """Sets a tag on a given object.
12795 def CheckPrereq(self):
12796 """Check prerequisites.
12798 This checks the type and length of the tag name and value.
12801 TagsLU.CheckPrereq(self)
12802 for tag in self.op.tags:
12803 objects.TaggableObject.ValidateTag(tag)
12805 def Exec(self, feedback_fn):
12810 for tag in self.op.tags:
12811 self.target.AddTag(tag)
12812 except errors.TagError, err:
12813 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12814 self.cfg.Update(self.target, feedback_fn)
12817 class LUTagsDel(TagsLU):
12818 """Delete a list of tags from a given object.
12823 def CheckPrereq(self):
12824 """Check prerequisites.
12826 This checks that we have the given tag.
12829 TagsLU.CheckPrereq(self)
12830 for tag in self.op.tags:
12831 objects.TaggableObject.ValidateTag(tag)
12832 del_tags = frozenset(self.op.tags)
12833 cur_tags = self.target.GetTags()
12835 diff_tags = del_tags - cur_tags
12837 diff_names = ("'%s'" % i for i in sorted(diff_tags))
12838 raise errors.OpPrereqError("Tag(s) %s not found" %
12839 (utils.CommaJoin(diff_names), ),
12840 errors.ECODE_NOENT)
12842 def Exec(self, feedback_fn):
12843 """Remove the tag from the object.
12846 for tag in self.op.tags:
12847 self.target.RemoveTag(tag)
12848 self.cfg.Update(self.target, feedback_fn)
12851 class LUTestDelay(NoHooksLU):
12852 """Sleep for a specified amount of time.
12854 This LU sleeps on the master and/or nodes for a specified amount of
12855 time.
12860 def ExpandNames(self):
12861 """Expand names and set required locks.
12863 This expands the node list, if any.
12866 self.needed_locks = {}
12867 if self.op.on_nodes:
12868 # _GetWantedNodes can be used here, but is not always appropriate to use
12869 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
12870 # more information.
12871 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
12872 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
12874 def _TestDelay(self):
12875 """Do the actual sleep.
12878 if self.op.on_master:
12879 if not utils.TestDelay(self.op.duration):
12880 raise errors.OpExecError("Error during master delay test")
12881 if self.op.on_nodes:
12882 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
12883 for node, node_result in result.items():
12884 node_result.Raise("Failure during rpc call to node %s" % node)
12886 def Exec(self, feedback_fn):
12887 """Execute the test delay opcode, with the wanted repetitions.
12890 if self.op.repeat == 0:
12893 top_value = self.op.repeat - 1
12894 for i in range(self.op.repeat):
12895 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
12899 class LUTestJqueue(NoHooksLU):
12900 """Utility LU to test some aspects of the job queue.
12905 # Must be lower than default timeout for WaitForJobChange to see whether it
12906 # notices changed jobs
12907 _CLIENT_CONNECT_TIMEOUT = 20.0
12908 _CLIENT_CONFIRM_TIMEOUT = 60.0
  @classmethod
  def _NotifyUsingSocket(cls, cb, errcls):
    """Opens a Unix socket and waits for another program to connect.

    @param cb: Callback to send socket name to client
    @type errcls: class
    @param errcls: Exception class to use for errors

    """
    # Using a temporary directory as there's no easy way to create temporary
    # sockets without writing a custom loop around tempfile.mktemp and
    # socket.bind
    tmpdir = tempfile.mkdtemp()
    try:
      tmpsock = utils.PathJoin(tmpdir, "sock")
      logging.debug("Creating temporary socket at %s", tmpsock)
      sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
      try:
        sock.bind(tmpsock)
        sock.listen(1)
        # Send details to client
        cb(tmpsock)
        # Wait for client to connect before continuing
        sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
        try:
          (conn, _) = sock.accept()
        except socket.error, err:
          raise errcls("Client didn't connect in time (%s)" % err)
      finally:
        sock.close()
    finally:
      # Remove as soon as client is connected
      shutil.rmtree(tmpdir)

    # Wait for client to close
    try:
      try:
        # pylint: disable=E1101
        # Instance of '_socketobject' has no ... member
        conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
        conn.recv(1)
      except socket.error, err:
        raise errcls("Client failed to confirm notification (%s)" % err)
    finally:
      conn.close()
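  # Editor's illustrative sketch (not part of the original module): the peer
  # of _NotifyUsingSocket is a test client that receives the socket path via
  # the job feedback mechanism, connects to it and then closes the connection
  # to confirm the notification.  A minimal client could look like this
  # (hypothetical helper, standard library only):
  @staticmethod
  def _ExampleNotificationClient(sockname):
    """Connects to the notification socket and confirms by closing it."""
    client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
      client.connect(sockname)
      # Closing the connection is what the server side waits for
    finally:
      client.close()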
12960 def _SendNotification(self, test, arg, sockname):
12961 """Sends a notification to the client.
12964 @param test: Test name
12965 @param arg: Test argument (depends on test)
12966 @type sockname: string
12967 @param sockname: Socket path
12970 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
12972 def _Notify(self, prereq, test, arg):
12973 """Notifies the client of a test.
12976 @param prereq: Whether this is a prereq-phase test
12978 @param test: Test name
12979 @param arg: Test argument (depends on test)
    if prereq:
      errcls = errors.OpPrereqError
    else:
      errcls = errors.OpExecError

    return self._NotifyUsingSocket(compat.partial(self._SendNotification,
                                                   test, arg),
                                   errcls)
12991 def CheckArguments(self):
12992 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
12993 self.expandnames_calls = 0
12995 def ExpandNames(self):
12996 checkargs_calls = getattr(self, "checkargs_calls", 0)
12997 if checkargs_calls < 1:
12998 raise errors.ProgrammerError("CheckArguments was not called")
13000 self.expandnames_calls += 1
13002 if self.op.notify_waitlock:
13003 self._Notify(True, constants.JQT_EXPANDNAMES, None)
13005 self.LogInfo("Expanding names")
13007 # Get lock on master node (just to get a lock, not for a particular reason)
    self.needed_locks = {
      locking.LEVEL_NODE: self.cfg.GetMasterNode(),
      }
13012 def Exec(self, feedback_fn):
13013 if self.expandnames_calls < 1:
13014 raise errors.ProgrammerError("ExpandNames was not called")
13016 if self.op.notify_exec:
13017 self._Notify(False, constants.JQT_EXEC, None)
13019 self.LogInfo("Executing")
13021 if self.op.log_messages:
13022 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
13023 for idx, msg in enumerate(self.op.log_messages):
13024 self.LogInfo("Sending log message %s", idx + 1)
13025 feedback_fn(constants.JQT_MSGPREFIX + msg)
13026 # Report how many test messages have been sent
13027 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
    if self.op.fail:
      raise errors.OpExecError("Opcode failure was requested")

    return True
13035 class IAllocator(object):
13036 """IAllocator framework.
  An IAllocator instance has four sets of attributes:
13039 - cfg that is needed to query the cluster
13040 - input data (all members of the _KEYS class attribute are required)
13041 - four buffer attributes (in|out_data|text), that represent the
13042 input (to the external script) in text and data structure format,
13043 and the output from it, again in two formats
    - the result variables from the script (success, info, nodes) for
      easy usage
13048 # pylint: disable=R0902
13049 # lots of instance attributes
  def __init__(self, cfg, rpc_runner, mode, **kwargs):
    self.cfg = cfg
    self.rpc = rpc_runner
    # init buffer variables
    self.in_text = self.out_text = self.in_data = self.out_data = None
    # init all input fields so that pylint is happy
    self.mode = mode
    self.memory = self.disks = self.disk_template = None
    self.os = self.tags = self.nics = self.vcpus = None
    self.hypervisor = None
    self.relocate_from = None
    self.name = None
    self.instances = None
13064 self.evac_mode = None
13065 self.target_groups = []
13067 self.required_nodes = None
13068 # init result fields
13069 self.success = self.info = self.result = None
    try:
      (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
    except KeyError:
      raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                   " IAllocator" % self.mode)
    keyset = [n for (n, _) in keydata]

    for key in kwargs:
      if key not in keyset:
        raise errors.ProgrammerError("Invalid input parameter '%s' to"
                                     " IAllocator" % key)
      setattr(self, key, kwargs[key])

    for key in keyset:
      if key not in kwargs:
        raise errors.ProgrammerError("Missing input parameter '%s' to"
                                     " IAllocator" % key)

    self._BuildInputData(compat.partial(fn, self), keydata)
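  # Editor's illustrative sketch (not part of the original module): the
  # keyword arguments accepted by __init__ depend on the mode and are
  # validated against the keydata declared in _MODE_DATA.  Nothing calls this
  # helper; the instance and node names are hypothetical.
  @classmethod
  def _ExampleRelocationRequest(cls, cfg, rpc_runner):
    """Sketch of how a relocation-mode IAllocator would be constructed.

    Note that building the instance immediately gathers cluster data (via
    RPC) and serializes the request text.

    """
    return cls(cfg, rpc_runner,
               mode=constants.IALLOCATOR_MODE_RELOC,
               name="web1.example.com",
               relocate_from=["node2.example.com"])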
13091 def _ComputeClusterData(self):
13092 """Compute the generic allocator input data.
13094 This is the data that is independent of the actual operation.
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    # cluster data
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
      "cluster_tags": list(cluster_info.GetTags()),
      "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
      # we don't have job IDs
      }
    ninfo = cfg.GetAllNodesInfo()
13108 iinfo = cfg.GetAllInstancesInfo().values()
13109 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
13112 node_list = [n.name for n in ninfo.values() if n.vm_capable]
13114 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
13115 hypervisor_name = self.hypervisor
13116 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
13117 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
    else:
      hypervisor_name = cluster_info.enabled_hypervisors[0]
    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)
13127 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
13129 config_ndata = self._ComputeBasicNodeData(ninfo)
13130 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
13131 i_list, config_ndata)
13132 assert len(data["nodes"]) == len(ninfo), \
13133 "Incomplete node data computed"
13135 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
13137 self.in_data = data
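  # Editor's illustrative sketch (not part of the original module): after
  # _ComputeClusterData and _BuildInputData have run, self.in_data is a plain
  # dict that gets serialized for the external script.  Its top-level shape
  # is roughly the following (heavily abridged, all values hypothetical):
  _EXAMPLE_IN_DATA_SKETCH = {
    "version": 2,
    "cluster_name": "cluster.example.com",
    "cluster_tags": [],
    "enabled_hypervisors": ["xen-pvm"],
    "nodegroups": {"uuid-1": {"name": "default",
                              "alloc_policy": "preferred"}},
    "nodes": {"node1.example.com": {"total_memory": 4096,
                                    "free_memory": 2048}},
    "instances": {"web1.example.com": {"memory": 512, "vcpus": 1}},
    "request": {"type": "allocate"},
    }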
  @staticmethod
  def _ComputeNodeGroupData(cfg):
13141 """Compute node groups data.
    ng = dict((guuid, {
      "name": gdata.name,
      "alloc_policy": gdata.alloc_policy,
      })
      for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())

    return ng
  @staticmethod
  def _ComputeBasicNodeData(node_cfg):
13154 """Compute global node data.
13157 @returns: a dict of name: (node dict, node config)
13160 # fill in static (config-based) values
13161 node_results = dict((ninfo.name, {
13162 "tags": list(ninfo.GetTags()),
13163 "primary_ip": ninfo.primary_ip,
13164 "secondary_ip": ninfo.secondary_ip,
13165 "offline": ninfo.offline,
13166 "drained": ninfo.drained,
13167 "master_candidate": ninfo.master_candidate,
13168 "group": ninfo.group,
13169 "master_capable": ninfo.master_capable,
13170 "vm_capable": ninfo.vm_capable,
13172 for ninfo in node_cfg.values())
13174 return node_results
  @staticmethod
  def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
                              node_results):
13179 """Compute global node data.
13181 @param node_results: the basic node structures as filled from the config
13184 # make a copy of the current dict
13185 node_results = dict(node_results)
13186 for nname, nresult in node_data.items():
13187 assert nname in node_results, "Missing basic data for node %s" % nname
13188 ninfo = node_cfg[nname]
13190 if not (ninfo.offline or ninfo.drained):
13191 nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
13194 remote_info = nresult.payload
13196 for attr in ["memory_total", "memory_free", "memory_dom0",
13197 "vg_size", "vg_free", "cpu_total"]:
13198 if attr not in remote_info:
13199 raise errors.OpExecError("Node '%s' didn't return attribute"
13200 " '%s'" % (nname, attr))
13201 if not isinstance(remote_info[attr], int):
          raise errors.OpExecError("Node '%s' returned invalid value"
                                   " for '%s': %s" %
                                   (nname, attr, remote_info[attr]))
13205 # compute memory used by primary instances
13206 i_p_mem = i_p_up_mem = 0
13207 for iinfo, beinfo in i_list:
13208 if iinfo.primary_node == nname:
13209 i_p_mem += beinfo[constants.BE_MEMORY]
            if iinfo.name not in node_iinfo[nname].payload:
              i_used_mem = 0
            else:
              i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
            i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
            remote_info["memory_free"] -= max(0, i_mem_diff)

            if iinfo.admin_up:
              i_p_up_mem += beinfo[constants.BE_MEMORY]
        # compute memory used by instances
        pnr_dyn = {
          "total_memory": remote_info["memory_total"],
          "reserved_memory": remote_info["memory_dom0"],
          "free_memory": remote_info["memory_free"],
          "total_disk": remote_info["vg_size"],
          "free_disk": remote_info["vg_free"],
          "total_cpus": remote_info["cpu_total"],
          "i_pri_memory": i_p_mem,
          "i_pri_up_memory": i_p_up_mem,
          }
        pnr_dyn.update(node_results[nname])
        node_results[nname] = pnr_dyn
13234 return node_results
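  # Editor's illustrative sketch (not part of the original module): the
  # memory_free correction above reserves the full configured backend memory
  # for each primary instance, even when the hypervisor currently reports a
  # lower usage.  A hypothetical worked example as a standalone helper:
  @staticmethod
  def _ExampleCorrectFreeMemory(reported_free, be_memory, actually_used):
    """Returns free memory adjusted for an under-committed instance.

    E.g. reported_free=2048, be_memory=512, actually_used=300 gives 1836,
    i.e. the remaining 212 MiB of the instance's configured memory are no
    longer counted as free.

    """
    return reported_free - max(0, be_memory - actually_used)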
  @staticmethod
  def _ComputeInstanceData(cluster_info, i_list):
13238 """Compute global instance data.
    instance_data = {}
    for iinfo, beinfo in i_list:
      nic_data = []
      for nic in iinfo.nics:
        filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
        nic_dict = {
          "mac": nic.mac,
          "ip": nic.ip,
          "mode": filled_params[constants.NIC_MODE],
          "link": filled_params[constants.NIC_LINK],
          }
        if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          nic_dict["bridge"] = filled_params[constants.NIC_LINK]
        nic_data.append(nic_dict)
13256 "tags": list(iinfo.GetTags()),
13257 "admin_up": iinfo.admin_up,
13258 "vcpus": beinfo[constants.BE_VCPUS],
13259 "memory": beinfo[constants.BE_MEMORY],
13261 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
13263 "disks": [{constants.IDISK_SIZE: dsk.size,
13264 constants.IDISK_MODE: dsk.mode}
13265 for dsk in iinfo.disks],
13266 "disk_template": iinfo.disk_template,
13267 "hypervisor": iinfo.hypervisor,
13269 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
13271 instance_data[iinfo.name] = pir
13273 return instance_data
13275 def _AddNewInstance(self):
13276 """Add new instance data to allocator structure.
    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.
13285 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
    if self.disk_template in constants.DTS_INT_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1
13294 "disk_template": self.disk_template,
13297 "vcpus": self.vcpus,
13298 "memory": self.memory,
13299 "disks": self.disks,
13300 "disk_space_total": disk_space,
13302 "required_nodes": self.required_nodes,
13303 "hypervisor": self.hypervisor,
13308 def _AddRelocateInstance(self):
13309 """Add relocate instance data to allocator structure.
    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.
13318 instance = self.cfg.GetInstanceInfo(self.name)
13319 if instance is None:
13320 raise errors.ProgrammerError("Unknown instance '%s' passed to"
13321 " IAllocator" % self.name)
13323 if instance.disk_template not in constants.DTS_MIRRORED:
13324 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
13325 errors.ECODE_INVAL)
13327 if instance.disk_template in constants.DTS_INT_MIRROR and \
13328 len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)
13332 self.required_nodes = 1
13333 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
    disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
    request = {
      "name": self.name,
      "disk_space_total": disk_space,
      "required_nodes": self.required_nodes,
      "relocate_from": self.relocate_from,
      }
    return request
13344 def _AddNodeEvacuate(self):
13345 """Get data for node-evacuate requests.
13349 "instances": self.instances,
13350 "evac_mode": self.evac_mode,
13353 def _AddChangeGroup(self):
13354 """Get data for node-evacuate requests.
13358 "instances": self.instances,
13359 "target_groups": self.target_groups,
13362 def _BuildInputData(self, fn, keydata):
13363 """Build input data structures.
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
13370 for keyname, keytype in keydata:
13371 if keyname not in request:
        raise errors.ProgrammerError("Request parameter %s is missing" %
                                     keyname)
13374 val = request[keyname]
13375 if not keytype(val):
13376 raise errors.ProgrammerError("Request parameter %s doesn't pass"
13377 " validation, value %s, expected"
13378 " type %s" % (keyname, val, keytype))
13379 self.in_data["request"] = request
13381 self.in_text = serializer.Dump(self.in_data)
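  # Editor's illustrative sketch (not part of the original module): each
  # keytype in the keydata pairs is a predicate from the ht module returning
  # True/False, so validating a request key is a plain call.  Hypothetical
  # helper mirroring the loop in _BuildInputData:
  @staticmethod
  def _ExampleCheckRequestKey(request, keyname, keytype):
    """Returns whether a single request key is present and passes its check."""
    return keyname in request and keytype(request[keyname])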
  _STRING_LIST = ht.TListOf(ht.TString)
  _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
     # pylint: disable=E1101
     # Class '...' has no 'OP_ID' member
     "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
                          opcodes.OpInstanceMigrate.OP_ID,
                          opcodes.OpInstanceReplaceDisks.OP_ID])
     })))

  _NEVAC_MOVED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(3),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TNonEmptyString,
                                  ht.TListOf(ht.TNonEmptyString),
                                  ])))
  _NEVAC_FAILED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(2),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TMaybeString,
                                  ])))
  _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
                          ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
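  # Editor's illustrative sketch (not part of the original module): a result
  # matching _NEVAC_RESULT is a three-element list of moved instances
  # (instance, target group, new nodes), failed instances (instance, reason)
  # and the jobs to submit.  With hypothetical names:
  _EXAMPLE_NEVAC_RESULT_SKETCH = [
    [["web1.example.com", "default", ["node3.example.com"]]],
    [["db1.example.com", "no node with enough memory"]],
    [[{"OP_ID": "OP_INSTANCE_MIGRATE"}]],
    ]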
13407 constants.IALLOCATOR_MODE_ALLOC:
13410 ("name", ht.TString),
13411 ("memory", ht.TInt),
13412 ("disks", ht.TListOf(ht.TDict)),
13413 ("disk_template", ht.TString),
13414 ("os", ht.TString),
13415 ("tags", _STRING_LIST),
13416 ("nics", ht.TListOf(ht.TDict)),
13417 ("vcpus", ht.TInt),
13418 ("hypervisor", ht.TString),
13420 constants.IALLOCATOR_MODE_RELOC:
13421 (_AddRelocateInstance,
13422 [("name", ht.TString), ("relocate_from", _STRING_LIST)],
13424 constants.IALLOCATOR_MODE_NODE_EVAC:
13425 (_AddNodeEvacuate, [
13426 ("instances", _STRING_LIST),
13427 ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
13429 constants.IALLOCATOR_MODE_CHG_GROUP:
13430 (_AddChangeGroup, [
13431 ("instances", _STRING_LIST),
13432 ("target_groups", _STRING_LIST),
13436 def Run(self, name, validate=True, call_fn=None):
13437 """Run an instance allocator and return the results.
13440 if call_fn is None:
13441 call_fn = self.rpc.call_iallocator_runner
13443 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
13444 result.Raise("Failure while running the iallocator script")
    self.out_text = result.payload
    if validate:
      self._ValidateResult()
13450 def _ValidateResult(self):
13451 """Process the allocator results.
13453 This will process and if successful save the result in
13454 self.out_data and the other parameters.
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
13462 if not isinstance(rdict, dict):
13463 raise errors.OpExecError("Can't parse iallocator results: not a dict")
    # TODO: remove backwards compatibility in later versions
13466 if "nodes" in rdict and "result" not in rdict:
13467 rdict["result"] = rdict["nodes"]
13470 for key in "success", "info", "result":
13471 if key not in rdict:
13472 raise errors.OpExecError("Can't parse iallocator results:"
13473 " missing key '%s'" % key)
13474 setattr(self, key, rdict[key])
13476 if not self._result_check(self.result):
13477 raise errors.OpExecError("Iallocator returned invalid result,"
13478 " expected %s, got %s" %
13479 (self._result_check, self.result),
13480 errors.ECODE_INVAL)
13482 if self.mode == constants.IALLOCATOR_MODE_RELOC:
13483 assert self.relocate_from is not None
13484 assert self.required_nodes == 1
13486 node2group = dict((name, ndata["group"])
13487 for (name, ndata) in self.in_data["nodes"].items())
13489 fn = compat.partial(self._NodesToGroups, node2group,
13490 self.in_data["nodegroups"])
13492 instance = self.cfg.GetInstanceInfo(self.name)
13493 request_groups = fn(self.relocate_from + [instance.primary_node])
13494 result_groups = fn(rdict["result"] + [instance.primary_node])
13496 if self.success and not set(result_groups).issubset(request_groups):
13497 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
13498 " differ from original groups (%s)" %
13499 (utils.CommaJoin(result_groups),
13500 utils.CommaJoin(request_groups)))
13502 elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
13503 assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES
13505 self.out_data = rdict
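  # Editor's illustrative sketch (not part of the original module): the
  # external script replies with a serialized object that must at least carry
  # the three keys checked in _ValidateResult.  A minimal successful reply
  # for an allocation could look like this (node names hypothetical):
  _EXAMPLE_IALLOCATOR_REPLY_SKETCH = {
    "success": True,
    "info": "allocation successful",
    "result": ["node3.example.com", "node4.example.com"],
    }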
  @staticmethod
  def _NodesToGroups(node2group, groups, nodes):
    """Returns a list of unique group names for a list of nodes.

    @type node2group: dict
    @param node2group: Map from node name to group UUID
    @param groups: Group information
    @param nodes: Node names

    """
    result = set()
    for node in nodes:
      try:
        group_uuid = node2group[node]
      except KeyError:
        # Ignore unknown node
        pass
      else:
        try:
          group = groups[group_uuid]
        except KeyError:
          # Can't find group, let's use UUID
          group_name = group_uuid
        else:
          group_name = group["name"]
        result.add(group_name)
    return sorted(result)
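# Editor's illustrative sketch (not part of the original module):
# IAllocator._NodesToGroups maps node names to unique, sorted group names,
# falling back to the group UUID when the group itself is unknown and
# silently skipping unknown nodes.  With hypothetical data:
#
#   IAllocator._NodesToGroups({"n1": "uuid-1", "n2": "uuid-2"},
#                             {"uuid-1": {"name": "default"}},
#                             ["n1", "n2", "unknown-node"])
#   == ["default", "uuid-2"]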
13541 class LUTestAllocator(NoHooksLU):
13542 """Run allocator tests.
13544 This LU runs the allocator tests
13547 def CheckPrereq(self):
13548 """Check prerequisites.
    This checks the opcode parameters depending on the requested direction
    and mode.
13553 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
13554 for attr in ["memory", "disks", "disk_template",
13555 "os", "tags", "nics", "vcpus"]:
13556 if not hasattr(self.op, attr):
13557 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
13558 attr, errors.ECODE_INVAL)
13559 iname = self.cfg.ExpandInstanceName(self.op.name)
13560 if iname is not None:
13561 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
13562 iname, errors.ECODE_EXISTS)
13563 if not isinstance(self.op.nics, list):
13564 raise errors.OpPrereqError("Invalid parameter 'nics'",
13565 errors.ECODE_INVAL)
13566 if not isinstance(self.op.disks, list):
13567 raise errors.OpPrereqError("Invalid parameter 'disks'",
13568 errors.ECODE_INVAL)
13569 for row in self.op.disks:
13570 if (not isinstance(row, dict) or
13571 constants.IDISK_SIZE not in row or
13572 not isinstance(row[constants.IDISK_SIZE], int) or
13573 constants.IDISK_MODE not in row or
13574 row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
13575 raise errors.OpPrereqError("Invalid contents of the 'disks'"
13576 " parameter", errors.ECODE_INVAL)
13577 if self.op.hypervisor is None:
13578 self.op.hypervisor = self.cfg.GetHypervisorType()
13579 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
13580 fname = _ExpandInstanceName(self.cfg, self.op.name)
13581 self.op.name = fname
13582 self.relocate_from = \
13583 list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
13584 elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
13585 constants.IALLOCATOR_MODE_NODE_EVAC):
13586 if not self.op.instances:
13587 raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
      self.op.instances = _GetWantedInstances(self, self.op.instances)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)
13593 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
13594 if self.op.allocator is None:
13595 raise errors.OpPrereqError("Missing allocator name",
13596 errors.ECODE_INVAL)
13597 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
13598 raise errors.OpPrereqError("Wrong allocator test '%s'" %
13599 self.op.direction, errors.ECODE_INVAL)
13601 def Exec(self, feedback_fn):
13602 """Run the allocator test.
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       memory=self.op.memory,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       target_groups=self.op.target_groups)
    elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       evac_mode=self.op.evac_mode)
    else:
      raise errors.ProgrammerError("Unhandled mode %s in"
                                   " LUTestAllocator.Exec", self.op.mode)
    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text

    return result
#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  constants.QR_OS: _OsQuery,
  }

assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_VIA_OP}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)