# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
22 """Module implementing the master-side code."""
24 # pylint: disable=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module

import copy
import itertools
import logging
import re

import OpenSSL

from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import hypervisor
from ganeti import locking
from ganeti import constants
from ganeti import objects
from ganeti import serializer
from ganeti import ssconf
from ganeti import uidpool
from ganeti import compat
from ganeti import masterd
from ganeti import netutils
from ganeti import query
from ganeti import qlang
from ganeti import opcodes

from ganeti import rpc

import ganeti.masterd.instance # pylint: disable=W0611


#: Size of DRBD meta block device
_DRBD_META_SIZE = 128
72 """Data container for LU results with jobs.
74 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
75 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
76 contained in the C{jobs} attribute and include the job IDs in the opcode
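
  Example (an illustrative sketch, not a real LU from this module;
  C{OpTestDelay} merely stands in for any opcode and the keyword argument
  name is made up)::

    def Exec(self, feedback_fn):
      # ... do this LU's own work, then hand off follow-up jobs ...
      return ResultWithJobs([[opcodes.OpTestDelay(duration=0)]],
                            custom_value="returned alongside the job IDs")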

  """
  def __init__(self, jobs, **kwargs):
    """Initializes this class.

    Additional return values can be specified as keyword arguments.

    @type jobs: list of lists of L{opcodes.OpCode}
    @param jobs: A list of lists of opcode objects

    """
    self.jobs = jobs
    self.other = kwargs


class LogicalUnit(object):
  """Logical Unit base class.

  Subclasses must follow these rules:
    - implement ExpandNames
    - implement CheckPrereq (except when tasklets are used)
    - implement Exec (except when tasklets are used)
    - implement BuildHooksEnv
    - implement BuildHooksNodes
    - redefine HPATH and HTYPE
    - optionally redefine their run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  Note that all commands require root permissions.

  @ivar dry_run_result: the value (if any) that will be returned to the caller
      in dry-run mode (signalled by opcode dry_run parameter)
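
  A minimal concurrent subclass might look like the following sketch
  (C{LUClusterPing} is a made-up name, not an LU defined in this module)::

    class LUClusterPing(NoHooksLU):
      REQ_BGL = False

      def ExpandNames(self):
        self.needed_locks = {} # no locks needed

      def Exec(self, feedback_fn):
        return True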

  """
  HPATH = None
  HTYPE = None
  REQ_BGL = True

  def __init__(self, processor, op, context, rpc_runner):
    """Constructor for LogicalUnit.

    This needs to be overridden in derived classes in order to check op
    validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.glm = context.glm
    # readability alias
    self.owned_locks = context.glm.list_owned
    self.context = context
    self.rpc = rpc_runner
    # Dicts used to declare locking needs to mcpu
    self.needed_locks = None
    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
    self.add_locks = {}
    self.remove_locks = {}
    # Used to force good behavior when calling helper functions
    self.recalculate_locks = {}
    # logging
    self.Log = processor.Log # pylint: disable=C0103
    self.LogWarning = processor.LogWarning # pylint: disable=C0103
    self.LogInfo = processor.LogInfo # pylint: disable=C0103
    self.LogStep = processor.LogStep # pylint: disable=C0103
    # support for dry-run
    self.dry_run_result = None
    # support for generic debug attribute
    if (not hasattr(self.op, "debug_level") or
        not isinstance(self.op.debug_level, int)):
      self.op.debug_level = 0

    self.tasklets = None

    # Validate opcode parameters and set defaults
    self.op.Validate(True)

    self.CheckArguments()

  def CheckArguments(self):
    """Check syntactic validity for the opcode arguments.

    This method is for doing a simple syntactic check and ensure
    validity of opcode parameters, without any cluster-related
    checks. While the same can be accomplished in ExpandNames and/or
    CheckPrereq, doing these separate is better because:

      - ExpandNames is left as a purely lock-related function
      - CheckPrereq is run after we have acquired locks (and possibly
        waited for them)

    The function is allowed to change the self.op attribute so that
    later methods need not worry about missing parameters.

    """

  def ExpandNames(self):
    """Expand names for this LU.

    This method is called before starting to execute the opcode, and it should
    update all the parameters of the opcode to their canonical form (e.g. a
    short node name must be fully expanded after this method has successfully
    completed). This way locking, hooks, logging, etc. can work correctly.

    LUs which implement this method must also populate the self.needed_locks
    member, as a dict with lock levels as keys, and a list of needed lock names
    (or locking.ALL_SET) as values. Rules:

      - use an empty dict if you don't need any lock
      - if you don't need any lock at a particular level omit that level
      - don't put anything for the BGL level
      - if you want all locks at a level use locking.ALL_SET as a value

    If you need to share locks (rather than acquire them exclusively) at one
    level you can modify self.share_locks, setting a true value (usually 1) for
    that level. By default locks are not shared.

    This function can also define a list of tasklets, which then will be
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.com'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # The implementation of this method is mandatory only if the new LU is
    # concurrent, so that old LUs don't need to be changed all at the same
    # time.
    if self.REQ_BGL:
      self.needed_locks = {} # Exclusive LUs don't need locks.
    else:
      raise NotImplementedError

  def DeclareLocks(self, level):
    """Declare LU locking needs for a level.

    While most LUs can just declare their locking needs at ExpandNames time,
    sometimes there's the need to calculate some locks after having acquired
    the ones before. This function is called just before acquiring locks at a
    particular level, but after acquiring the ones at lower levels, and permits
    such calculations. It can be used to modify self.needed_locks, and by
    default it does nothing.

    This function is only called if you have something already set in
    self.needed_locks for the level.

    @param level: Locking level which is going to be locked
    @type level: member of ganeti.locking.LEVELS
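
    A typical override recalculates node locks once instance locks are held,
    as in this sketch (mirroring the usage shown for L{_LockInstancesNodes}
    below)::

      def DeclareLocks(self, level):
        if level == locking.LEVEL_NODE:
          self._LockInstancesNodes()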

    """

  def CheckPrereq(self):
    """Check prerequisites for this LU.

    This method should check that the prerequisites for the execution
    of this LU are fulfilled. It can do internode communication, but
    it should be idempotent - no cluster or system changes are
    allowed.

    The method should raise errors.OpPrereqError in case something is
    not fulfilled. Its return value is ignored.

    This method should also update all the parameters of the opcode to
    their canonical form if it hasn't been done by ExpandNames before.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Checking prerequisites for tasklet %s/%s",
                      idx + 1, len(self.tasklets))
        tl.CheckPrereq()

  def Exec(self, feedback_fn):
    """Execute the LU.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in
    code, or expected.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
        tl.Exec(feedback_fn)
    else:
      raise NotImplementedError

  def BuildHooksEnv(self):
    """Build hooks environment for this LU.

    @rtype: dict
    @return: Dictionary containing the environment that will be used for
      running the hooks for this LU. The keys of the dict must not be prefixed
      with "GANETI_"; that'll be added by the hooks runner. The hooks runner
      will extend the environment with additional variables. If no environment
      should be defined, an empty dictionary should be returned (not C{None}).
    @note: If the C{HPATH} attribute of the LU class is C{None}, this function
      will not be called.

    """
    raise NotImplementedError

  def BuildHooksNodes(self):
    """Build list of nodes to run LU's hooks.

    @rtype: tuple; (list, list)
    @return: Tuple containing a list of node names on which the hook
      should run before the execution and a list of node names on which the
      hook should run after the execution. No nodes should be returned as an
      empty list (and not None).
    @note: If the C{HPATH} attribute of the LU class is C{None}, this function
      will not be called.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    This method is called every time a hooks phase is executed, and notifies
    the Logical Unit about the hooks' result. The LU can then use it to alter
    its result based on the hooks. By default the method does nothing and the
    previous result is passed back unchanged but any LU can define it if it
    wants to use the local cluster hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None

    @return: the new Exec result, based on the previous result
        and hook results

    """
    # API must be kept, thus we ignore the unused argument and "could
    # be a function" warnings
    # pylint: disable=W0613,R0201
    return lu_result

  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs that work on an instance take its name in self.op.instance_name
    and need to expand it and then declare the expanded name for locking. This
    function does it, and then updates self.op.instance_name to the expanded
    name. It also initializes needed_locks as a dict, if this hasn't been done
    before.

    """
    if self.needed_locks is None:
      self.needed_locks = {}
    else:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    self.op.instance_name = _ExpandInstanceName(self.cfg,
                                                self.op.instance_name)
    self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name

  def _LockInstancesNodes(self, primary_only=False,
                          level=locking.LEVEL_NODE):
    """Helper function to declare instances' nodes for locking.

    This function should be called after locking one or more instances to lock
    their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
    with all primary or secondary nodes for instances already locked and
    present in self.needed_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to just lock some instance's nodes, or
    to just lock primaries or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances
    @param level: Which lock level to use for locking nodes

    """
    assert level in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we really have been called with the instance locks held

    # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
    # future we might want to have different behaviors depending on the value
    # of self.recalculate_locks[locking.LEVEL_NODE]
    wanted_nodes = []
    locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
    for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
      wanted_nodes.append(instance.primary_node)
      if not primary_only:
        wanted_nodes.extend(instance.secondary_nodes)

    if self.recalculate_locks[level] == constants.LOCKS_REPLACE:
      self.needed_locks[level] = wanted_nodes
    elif self.recalculate_locks[level] == constants.LOCKS_APPEND:
      self.needed_locks[level].extend(wanted_nodes)
    else:
      raise errors.ProgrammerError("Unknown recalculation mode")

    del self.recalculate_locks[level]


class NoHooksLU(LogicalUnit): # pylint: disable=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLu.

    This just raises an error.

    """
    raise AssertionError("BuildHooksEnv called for NoHooksLUs")

  def BuildHooksNodes(self):
    """Empty BuildHooksNodes for NoHooksLU.

    """
    raise AssertionError("BuildHooksNodes called for NoHooksLU")
428 """Tasklet base class.
430 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
431 they can mix legacy code with tasklets. Locking needs to be done in the LU,
432 tasklets know nothing about locks.
434 Subclasses must follow these rules:
435 - Implement CheckPrereq
439 def __init__(self, lu):

  def CheckPrereq(self):
    """Check prerequisites for this tasklet.

    This method should check whether the prerequisites for the execution of
    this tasklet are fulfilled. It can do internode communication, but it
    should be idempotent - no cluster or system changes are allowed.

    The method should raise errors.OpPrereqError in case something is not
    fulfilled. Its return value is ignored.

    This method should also update all parameters to their canonical form if it
    hasn't been done before.

    """
    pass

  def Exec(self, feedback_fn):
    """Execute the tasklet.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in code, or
    expected.

    """
    raise NotImplementedError
474 """Base for query utility classes.
477 #: Attribute holding field definitions
480 def __init__(self, qfilter, fields, use_locking):
481 """Initializes this class.
484 self.use_locking = use_locking
486 self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
488 self.requested_data = self.query.RequestedData()
489 self.names = self.query.RequestedNames()
491 # Sort only if no names were requested
492 self.sort_by_name = not self.names
494 self.do_locking = None

  def _GetNames(self, lu, all_names, lock_level):
    """Helper function to determine names asked for in the query.

    """
    if self.do_locking:
      names = lu.owned_locks(lock_level)
    else:
      names = all_names

    if self.wanted == locking.ALL_SET:
      assert not self.names
      # caller didn't specify names, so ordering is not important
      return utils.NiceSort(names)

    # caller specified names and we must keep the same order
    assert self.names
    assert not self.do_locking or lu.glm.is_owned(lock_level)

    missing = set(self.wanted).difference(names)
    if missing:
      raise errors.OpExecError("Some items were removed before retrieving"
                               " their data: %s" % missing)

    # Return expanded names
    return self.wanted

  def ExpandNames(self, lu):
    """Expand names for this query.

    See L{LogicalUnit.ExpandNames}.

    """
    raise NotImplementedError()

  def DeclareLocks(self, lu, level):
    """Declare locks for this query.

    See L{LogicalUnit.DeclareLocks}.

    """
    raise NotImplementedError()

  def _GetQueryData(self, lu):
    """Collects all data for this query.

    @return: Query data object

    """
    raise NotImplementedError()

  def NewStyleQuery(self, lu):
    """Collect data and execute query.

    """
    return query.GetQueryResponse(self.query, self._GetQueryData(lu),
                                  sort_by_name=self.sort_by_name)

  def OldStyleQuery(self, lu):
    """Collect data and execute query.

    """
    return self.query.OldStyleQuery(self._GetQueryData(lu),
                                    sort_by_name=self.sort_by_name)
563 """Returns a dict declaring all lock levels shared.
566 return dict.fromkeys(locking.LEVELS, 1)
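
# Typical use of _ShareAll(), as seen in LUClusterVerifyGroup.ExpandNames
# further below: every lock level is acquired in shared mode.
#   self.share_locks = _ShareAll()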


def _CheckInstanceNodeGroups(cfg, instance_name, owned_groups):
  """Checks if the owned node groups are still correct for an instance.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type instance_name: string
  @param instance_name: Instance name
  @type owned_groups: set or frozenset
  @param owned_groups: List of currently owned node groups

  """
  inst_groups = cfg.GetInstanceNodeGroups(instance_name)

  if not owned_groups.issuperset(inst_groups):
    raise errors.OpPrereqError("Instance %s's node groups changed since"
                               " locks were acquired, current groups are"
                               " '%s', owning groups '%s'; retry the"
                               " operation" %
                               (instance_name,
                                utils.CommaJoin(inst_groups),
                                utils.CommaJoin(owned_groups)),
                               errors.ECODE_STATE)

  return inst_groups


def _CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
  """Checks if the instances in a node group are still correct.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type group_uuid: string
  @param group_uuid: Node group UUID
  @type owned_instances: set or frozenset
  @param owned_instances: List of currently owned instances

  """
  wanted_instances = cfg.GetNodeGroupInstances(group_uuid)
  if owned_instances != wanted_instances:
    raise errors.OpPrereqError("Instances in node group '%s' changed since"
                               " locks were acquired, wanted '%s', have '%s';"
                               " retry the operation" %
                               (group_uuid,
                                utils.CommaJoin(wanted_instances),
                                utils.CommaJoin(owned_instances)),
                               errors.ECODE_STATE)

  return wanted_instances


def _SupportsOob(cfg, node):
  """Tells if node supports OOB.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type node: L{objects.Node}
  @param node: The node
  @return: The OOB script if supported or an empty string otherwise

  """
  return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]


def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: list of node names or None for all nodes
  @rtype: list
  @return: the list of nodes, sorted
  @raise errors.ProgrammerError: if the nodes parameter is wrong type

  """
  if nodes:
    return [_ExpandNodeName(lu.cfg, name) for name in nodes]

  return utils.NiceSort(lu.cfg.GetNodeList())


def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names or None for all instances
  @rtype: list
  @return: the list of instances, sorted
  @raise errors.OpPrereqError: if the instances parameter is wrong type
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if instances:
    wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
  else:
    wanted = utils.NiceSort(lu.cfg.GetInstanceList())
  return wanted


def _GetUpdatedParams(old_params, update_dict,
                      use_default=True, use_none=False):
  """Return the new version of a parameter dictionary.

  @type old_params: dict
  @param old_params: old parameters
  @type update_dict: dict
  @param update_dict: dict containing new parameter values, or
      constants.VALUE_DEFAULT to reset the parameter to its default
      value
  @type use_default: boolean
  @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
      values as 'to be deleted' values
  @type use_none: boolean
  @param use_none: whether to recognise C{None} values as 'to be
      deleted' values
  @rtype: dict
  @return: the new parameter dictionary

  """
  params_copy = copy.deepcopy(old_params)
  for key, val in update_dict.iteritems():
    if ((use_default and val == constants.VALUE_DEFAULT) or
        (use_none and val is None)):
      try:
        del params_copy[key]
      except KeyError:
        pass
    else:
      params_copy[key] = val
  return params_copy
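
# Behavior sketch for _GetUpdatedParams (illustrative values, not from the
# original source): given old_params={"a": 1, "b": 2} and
# update_dict={"b": constants.VALUE_DEFAULT, "c": 3}, the result is
# {"a": 1, "c": 3}: "b" is removed so it falls back to its default value,
# while "c" is added.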


def _ReleaseLocks(lu, level, names=None, keep=None):
  """Releases locks owned by an LU.

  @type lu: L{LogicalUnit}
  @param level: Lock level
  @type names: list or None
  @param names: Names of locks to release
  @type keep: list or None
  @param keep: Names of locks to retain

  """
  assert not (keep is not None and names is not None), \
    "Only one of the 'names' and the 'keep' parameters can be given"

  if names is not None:
    should_release = names.__contains__
  elif keep:
    should_release = lambda name: name not in keep
  else:
    should_release = None

  if should_release:
    retain = []
    release = []

    # Determine which locks to release
    for name in lu.owned_locks(level):
      if should_release(name):
        release.append(name)
      else:
        retain.append(name)

    assert len(lu.owned_locks(level)) == (len(retain) + len(release))

    # Release just some locks
    lu.glm.release(level, names=release)

    assert frozenset(lu.owned_locks(level)) == frozenset(retain)
  else:
    # Release everything
    lu.glm.release(level)

    assert not lu.glm.is_owned(level), "No locks should be owned"


def _MapInstanceDisksToNodes(instances):
  """Creates a map from (node, volume) to instance name.

  @type instances: list of L{objects.Instance}
  @rtype: dict; tuple of (node name, volume name) as key, instance name as
      value

  """
  return dict(((node, vol), inst.name)
              for inst in instances
              for (node, vols) in inst.MapLVsByNode().items()
              for vol in vols)
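
# Illustrative result (hypothetical names): an instance "inst1" with logical
# volume "xenvg/disk0" on node "node1" contributes the entry
# {("node1", "xenvg/disk0"): "inst1"} to the returned dictionary.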


def _RunPostHook(lu, node_name):
  """Runs the post-hook for an opcode on a single node.

  """
  hm = lu.proc.BuildHooksManager(lu)
  try:
    hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
  except:
    # pylint: disable=W0702
    lu.LogWarning("Errors occurred running hooks on %s" % node_name)


def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set

  """
  f = utils.FieldSet()
  f.Extend(static)
  f.Extend(dynamic)

  delta = f.NonMatching(selected)
  if delta:
    raise errors.OpPrereqError("Unknown output fields selected: %s"
                               % ",".join(delta), errors.ECODE_INVAL)


def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.

  This will ensure that instances don't get customised versions of
  global params.

  """
  used_globals = constants.HVC_GLOBALS.intersection(params)
  if used_globals:
    msg = ("The following hypervisor parameters are global and cannot"
           " be customized at instance level, please modify them at"
           " cluster level: %s" % utils.CommaJoin(used_globals))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)


def _CheckNodeOnline(lu, node, msg=None):
  """Ensure that a given node is online.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param msg: if passed, should be a message to replace the default one
  @raise errors.OpPrereqError: if the node is offline

  """
  if msg is None:
    msg = "Can't use offline node"
  if lu.cfg.GetNodeInfo(node).offline:
    raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)


def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  if lu.cfg.GetNodeInfo(node).drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_STATE)


def _CheckNodeVmCapable(lu, node):
  """Ensure that a given node is vm capable.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is not vm capable

  """
  if not lu.cfg.GetNodeInfo(node).vm_capable:
    raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
                               errors.ECODE_STATE)


def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node is not supporting the OS

  """
  result = lu.rpc.call_os_get(node, os_name)
  result.Raise("OS '%s' not in supported OS list for node %s" %
               (os_name, node),
               prereq=True, ecode=errors.ECODE_INVAL)
  if not force_variant:
    _CheckOSVariant(result.payload, os_name)


def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
  """Ensure that a node has the given secondary ip.

  @type lu: L{LogicalUnit}
  @param lu: the LU on behalf of which we make the check
  @type node: string
  @param node: the node to check
  @type secondary_ip: string
  @param secondary_ip: the ip to check
  @type prereq: boolean
  @param prereq: whether to throw a prerequisite or an execute error
  @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
  @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False

  """
  result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
  result.Raise("Failure checking secondary ip on node %s" % node,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)
  if not result.payload:
    msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
           " please fix and re-run this command" % secondary_ip)
    if prereq:
      raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
    else:
      raise errors.OpExecError(msg)


def _GetClusterDomainSecret():
  """Reads the cluster domain secret.

  """
  return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
                               strict=True)


def _CheckInstanceDown(lu, instance, reason):
  """Ensure that an instance is not running."""
  if instance.admin_up:
    raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
                               (instance.name, reason), errors.ECODE_STATE)

  pnode = instance.primary_node
  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
  ins_l.Raise("Can't contact node %s for instance information" % pnode,
              prereq=True, ecode=errors.ECODE_ENVIRON)

  if instance.name in ins_l.payload:
    raise errors.OpPrereqError("Instance %s is running, %s" %
                               (instance.name, reason), errors.ECODE_STATE)


def _ExpandItemName(fn, name, kind):
  """Expand an item name.

  @param fn: the function to use for expansion
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  full_name = fn(name)
  if full_name is None:
    raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                               errors.ECODE_NOENT)
  return full_name


def _ExpandNodeName(cfg, name):
  """Wrapper over L{_ExpandItemName} for nodes."""
  return _ExpandItemName(cfg.ExpandNodeName, name, "Node")


def _ExpandInstanceName(cfg, name):
  """Wrapper over L{_ExpandItemName} for instances."""
  return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")


def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          memory, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name, tags):
  """Builds instance related env variables for hooks.

  This builds the hook environment from individual variables.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: boolean
  @param status: the should_run status of the instance
  @param memory: the memory size of the instance
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @type tags: list
  @param tags: list of instance tags as strings
  @rtype: dict
  @return: the hook environment for this instance

  """
  if status:
    str_status = "up"
  else:
    str_status = "down"
  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": str_status,
    "INSTANCE_MEMORY": memory,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
    }

  if nics:
    nic_count = len(nics)
    for idx, (ip, mac, mode, link) in enumerate(nics):
      if ip is None:
        ip = ""
      env["INSTANCE_NIC%d_IP" % idx] = ip
      env["INSTANCE_NIC%d_MAC" % idx] = mac
      env["INSTANCE_NIC%d_MODE" % idx] = mode
      env["INSTANCE_NIC%d_LINK" % idx] = link
      if mode == constants.NIC_MODE_BRIDGED:
        env["INSTANCE_NIC%d_BRIDGE" % idx] = link
  else:
    nic_count = 0

  env["INSTANCE_NIC_COUNT"] = nic_count

  if disks:
    disk_count = len(disks)
    for idx, (size, mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % idx] = size
      env["INSTANCE_DISK%d_MODE" % idx] = mode
  else:
    disk_count = 0

  env["INSTANCE_DISK_COUNT"] = disk_count

  if not tags:
    tags = []

  env["INSTANCE_TAGS"] = " ".join(tags)

  for source, kind in [(bep, "BE"), (hvp, "HV")]:
    for key, value in source.items():
      env["INSTANCE_%s_%s" % (kind, key)] = value

  return env


def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.

  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
  value in LUInstanceQueryData.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples

  """
  hooks_nics = []
  cluster = lu.cfg.GetClusterInfo()
  for nic in nics:
    ip = nic.ip
    mac = nic.mac
    filled_params = cluster.SimpleFillNIC(nic.nicparams)
    mode = filled_params[constants.NIC_MODE]
    link = filled_params[constants.NIC_LINK]
    hooks_nics.append((ip, mac, mode, link))
  return hooks_nics


def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)
  args = {
    "name": instance.name,
    "primary_node": instance.primary_node,
    "secondary_nodes": instance.secondary_nodes,
    "os_type": instance.os,
    "status": instance.admin_up,
    "memory": bep[constants.BE_MEMORY],
    "vcpus": bep[constants.BE_VCPUS],
    "nics": _NICListToTuple(lu, instance.nics),
    "disk_template": instance.disk_template,
    "disks": [(disk.size, disk.mode) for disk in instance.disks],
    "bep": bep,
    "hvp": hvp,
    "hypervisor_name": instance.hypervisor,
    "tags": instance.tags,
    }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args) # pylint: disable=W0142


def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  """
  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
  if mod_list:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in mod_list))
    for name in mod_list:
      lu.context.ReaddNode(name)
  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))


def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  """
  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  # the new node will increase mc_max with one, so:
  mc_should = min(mc_should + 1, cp_size)
  return mc_now < mc_should


def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Check that the bridges needed by a list of nics exist.

  """
  cluster = lu.cfg.GetClusterInfo()
  paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
  brlist = [params[constants.NIC_LINK] for params in paramslist
            if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
  if brlist:
    result = lu.rpc.call_bridges_exist(target_node, brlist)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)


def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  """
  if node is None:
    node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, node)


def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity

  """
  variant = objects.OS.GetVariant(name)
  if not os_obj.supported_variants:
    if variant:
      raise errors.OpPrereqError("OS '%s' doesn't support variants ('%s'"
                                 " passed)" % (os_obj.name, variant),
                                 errors.ECODE_INVAL)
    return
  if not variant:
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if variant not in os_obj.supported_variants:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)


def _GetNodeInstancesInner(cfg, fn):
  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]


def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)


def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name == inst.primary_node)


def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name in inst.secondary_nodes)


def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  """
  # Special case for file storage
  if storage_type == constants.ST_FILE:
    # storage.FileStorage wants a list of storage directories
    return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]

  return []


def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq):
  faulty = []

  for dev in instance.disks:
    cfg.SetDiskID(dev, node_name)

  result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks)
  result.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  for idx, bdev_status in enumerate(result.payload):
    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
      faulty.append(idx)

  return faulty


def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
  """Check the sanity of iallocator and node arguments and use the
  cluster-wide iallocator if appropriate.

  Check that at most one of (iallocator, node) is specified. If none is
  specified, then the LU's opcode's iallocator slot is filled with the
  cluster-wide default iallocator.

  @type iallocator_slot: string
  @param iallocator_slot: the name of the opcode iallocator slot
  @type node_slot: string
  @param node_slot: the name of the opcode target node slot

  """
  node = getattr(lu.op, node_slot, None)
  iallocator = getattr(lu.op, iallocator_slot, None)

  if node is not None and iallocator is not None:
    raise errors.OpPrereqError("Do not specify both an iallocator and a node",
                               errors.ECODE_INVAL)
  elif node is None and iallocator is None:
    default_iallocator = lu.cfg.GetDefaultIAllocator()
    if default_iallocator:
      setattr(lu.op, iallocator_slot, default_iallocator)
    else:
      raise errors.OpPrereqError("No iallocator or node given and no"
                                 " cluster-wide default iallocator found;"
                                 " please specify either an iallocator or a"
                                 " node, or set a cluster-wide default"
                                 " iallocator", errors.ECODE_INVAL)


def _GetDefaultIAllocator(cfg, iallocator):
  """Decides on which iallocator to use.

  @type cfg: L{config.ConfigWriter}
  @param cfg: Cluster configuration object
  @type iallocator: string or None
  @param iallocator: Iallocator specified in opcode
  @rtype: string
  @return: Iallocator name

  """
  if not iallocator:
    # Use default iallocator
    iallocator = cfg.GetDefaultIAllocator()

  if not iallocator:
    raise errors.OpPrereqError("No iallocator was specified, neither in the"
                               " opcode nor as a cluster-wide default",
                               errors.ECODE_INVAL)

  return iallocator


class LUClusterPostInit(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    return ([], [self.cfg.GetMasterNode()])

  def Exec(self, feedback_fn):
    """Nothing to do.

    """
    return True


class LUClusterDestroy(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    return ([], [])

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodelist = self.cfg.GetNodeList()
    if len(nodelist) != 1 or nodelist[0] != master:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodelist) - 1),
                                 errors.ECODE_INVAL)
    instancelist = self.cfg.GetInstanceList()
    if instancelist:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instancelist),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    """
    master_params = self.cfg.GetMasterNetworkParameters()

    # Run post hooks on master node before it's removed
    _RunPostHook(self, master_params.name)

    ems = self.cfg.GetUseExternalMipScript()
    result = self.rpc.call_node_deactivate_master_ip(master_params.name,
                                                     master_params, ems)
    result.Raise("Could not disable the master role")

    return master_params.name


def _VerifyCertificate(filename):
  """Verifies a certificate for L{LUClusterVerifyConfig}.

  @type filename: string
  @param filename: Path to PEM file

  """
  try:
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                           utils.ReadFile(filename))
  except Exception, err: # pylint: disable=W0703
    return (LUClusterVerifyConfig.ETYPE_ERROR,
            "Failed to load X509 certificate %s: %s" % (filename, err))

  (errcode, msg) = \
    utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
                                constants.SSL_CERT_EXPIRATION_ERROR)

  if msg:
    fnamemsg = "While verifying %s: %s" % (filename, msg)
  else:
    fnamemsg = None

  if errcode is None:
    return (None, fnamemsg)
  elif errcode == utils.CERT_WARNING:
    return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
  elif errcode == utils.CERT_ERROR:
    return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)

  raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)


def _GetAllHypervisorParameters(cluster, instances):
  """Compute the set of all hypervisor parameters.

  @type cluster: L{objects.Cluster}
  @param cluster: the cluster object
  @type instances: list of L{objects.Instance}
  @param instances: additional instances from which to obtain parameters
  @rtype: list of (origin, hypervisor, parameters)
  @return: a list with all parameters found, indicating the hypervisor they
       apply to, and the origin (can be "cluster", "os X", or "instance Y")

  """
  hvp_data = []

  for hv_name in cluster.enabled_hypervisors:
    hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))

  for os_name, os_hvp in cluster.os_hvp.items():
    for hv_name, hv_params in os_hvp.items():
      if hv_params:
        full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
        hvp_data.append(("os %s" % os_name, hv_name, full_params))

  # TODO: collapse identical parameter values in a single one
  for instance in instances:
    if instance.hvparams:
      hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
                       cluster.FillHV(instance)))

  return hvp_data


class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt, _ = ecode
    # first complete the msg
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      if item is not None:
        item = " " + str(item)
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # and finally report it via the feedback_fn
    self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101

  def _ErrorIf(self, cond, ecode, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    cond = (bool(cond)
            or self.op.debug_simulate_errors) # pylint: disable=E1101

    # If the error code is in the list of ignored errors, demote the error to a
    # warning
    (_, etxt, _) = ecode
    if etxt in self.op.ignore_errors: # pylint: disable=E1101
      kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING

    if cond:
      self._Error(ecode, *args, **kwargs)

    # do not mark the operation as failed for WARN cases only
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
      self.bad = self.bad or cond
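
# Typical use of the mix-in inside a verification LU (cf. _VerifyNode below):
# build a boolean test and report it through _ErrorIf, e.g.
#   self._ErrorIf(test, constants.CV_ENODERPC, node,
#                 "unable to verify node: no data returned")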


class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    jobs = []

    if self.op.group_name:
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
        ])

      # Always depend on global verification
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
                                              ignore_errors=self.op.ignore_errors,
                                              depends=depends_fn())]
                for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs): # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes

      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)


class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    """
    for item, hv_name, hv_params in hvp_data:
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisor(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError, err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    # Information can be safely retrieved as the BGL is acquired in exclusive
    # mode
    assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER)
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various tests on nodes.

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in constants.ALL_CERT_FILES:
      (errcode, msg) = _VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node.name for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_nodes:
        dangling_instances.setdefault(inst.primary_node, []).append(inst.name)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst.name)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(dangling_instances.get(node.name,
                                                ["no instances"])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(no_node_instances))

    return not self.bad


class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
  """Verifies the status of a node group.

  """
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  REQ_BGL = False

  _HOOKS_INDENT_RE = re.compile("^", re.M)

  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type name: string
    @ivar name: the node name to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dictionary of {primary-node: list of instances} for all
        instances for which this node is secondary (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call was successful (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS
    @type vm_capable: boolean
    @ivar vm_capable: whether the node can host instances

    """
    def __init__(self, offline=False, name=None, vm_capable=True):
      self.name = name
      self.volumes = {}
      self.instances = []
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.vm_capable = vm_capable
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}

  def ExpandNames(self):
    # This raises errors.OpPrereqError on its own:
    self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)

    # Get instances in node group; this is unsafe and needs verification later
    inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)

    self.needed_locks = {
      locking.LEVEL_INSTANCE: inst_names,
      locking.LEVEL_NODEGROUP: [self.group_uuid],
      locking.LEVEL_NODE: [],
      }

    self.share_locks = _ShareAll()

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      # Get members of node group; this is unsafe and needs verification later
      nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)

      all_inst_info = self.cfg.GetAllInstancesInfo()

      # In Exec(), we warn about mirrored instances that have primary and
      # secondary living in separate node groups. To fully verify that
      # volumes for these instances are healthy, we will need to do an
      # extra call to their secondaries. We ensure here those nodes will
      # be locked.
      for inst in self.owned_locks(locking.LEVEL_INSTANCE):
        # Important: access only the instances whose lock is owned
        if all_inst_info[inst].disk_template in constants.DTS_INT_MIRROR:
          nodes.update(all_inst_info[inst].secondary_nodes)

      self.needed_locks[locking.LEVEL_NODE] = nodes

  def CheckPrereq(self):
    assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
    self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

    group_nodes = set(self.group_info.members)
    group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)

    unlocked_nodes = \
      group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    unlocked_instances = \
      group_instances.difference(self.owned_locks(locking.LEVEL_INSTANCE))

    if unlocked_nodes:
      raise errors.OpPrereqError("Missing lock for nodes: %s" %
                                 utils.CommaJoin(unlocked_nodes))

    if unlocked_instances:
      raise errors.OpPrereqError("Missing lock for instances: %s" %
                                 utils.CommaJoin(unlocked_instances))

    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

    self.my_node_names = utils.NiceSort(group_nodes)
    self.my_inst_names = utils.NiceSort(group_instances)

    self.my_node_info = dict((name, self.all_node_info[name])
                             for name in self.my_node_names)

    self.my_inst_info = dict((name, self.all_inst_info[name])
                             for name in self.my_inst_names)

    # We detect here the nodes that will need the extra RPC calls for verifying
    # split LV volumes; they should be locked.
    extra_lv_nodes = set()

    for inst in self.my_inst_info.values():
      if inst.disk_template in constants.DTS_INT_MIRROR:
        group = self.my_node_info[inst.primary_node].group
        for nname in inst.secondary_nodes:
          if self.all_node_info[nname].group != group:
            extra_lv_nodes.add(nname)

    unlocked_lv_nodes = \
      extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    if unlocked_lv_nodes:
      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
                                 utils.CommaJoin(unlocked_lv_nodes))
    self.extra_lv_nodes = list(extra_lv_nodes)

  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the response)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    _ErrorIf(test, constants.CV_ENODERPC, node,
             "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    _ErrorIf(test, constants.CV_ENODERPC, node,
             "connection to node returned invalid data")
    if test:
      return False

    test = local_version != remote_version[0]
    _ErrorIf(test, constants.CV_ENODEVERSION, node,
             "incompatible protocol versions: master %s,"
             " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  constants.CV_ENODEVERSION, node,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if ninfo.vm_capable and isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        test = hv_result is not None
        _ErrorIf(test, constants.CV_ENODEHV, node,
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    hvp_result = nresult.get(constants.NV_HVPARAMS, None)
    if ninfo.vm_capable and isinstance(hvp_result, list):
      for item, hv_name, hv_result in hvp_result:
        _ErrorIf(True, constants.CV_ENODEHV, node,
                 "hypervisor %s parameter verify failure (source %s): %s",
                 hv_name, item, hv_result)

    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
             "; ".join(test))

    return True

  def _VerifyNodeTime(self, ninfo, nresult,
                      nvinfo_starttime, nvinfo_endtime):
    """Check the node time.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nvinfo_starttime: the start time of the RPC call
    @param nvinfo_endtime: the end time of the RPC call

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    ntime = nresult.get(constants.NV_TIME, None)
    try:
      ntime_merged = utils.MergeTime(ntime)
    except (ValueError, TypeError):
      _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
      return

    if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
    elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
    else:
      ntime_diff = None

    _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
             "Node time diverges by at least %s from master node time",
             ntime_diff)

  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
    """Check the node LVM results.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param vg_name: the configured VG name

    """
    if vg_name is None:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    # checks vg existence and size > 20G
    vglist = nresult.get(constants.NV_VGLIST, None)
    test = not vglist
    _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
    if not test:
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                            constants.MIN_VG_SIZE)
      _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)

    # check pv names
    pvlist = nresult.get(constants.NV_PVLIST, None)
    test = pvlist is None
    _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
    if not test:
      # check that ':' is not present in PV names, since it's a
      # special character for lvcreate (denotes the range of PEs to
      # use on the PV)
      for _, pvname, owner_vg in pvlist:
        test = ":" in pvname
        _ErrorIf(test, constants.CV_ENODELVM, node,
                 "Invalid character ':' in PV '%s' of VG '%s'",
                 pvname, owner_vg)

  def _VerifyNodeBridges(self, ninfo, nresult, bridges):
    """Check the node bridges.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param bridges: the expected list of bridges

    """
    if not bridges:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable=C0103

    missing = nresult.get(constants.NV_BRIDGES, None)
    test = not isinstance(missing, list)
    _ErrorIf(test, constants.CV_ENODENET, node,
             "did not return valid bridge information")
    if not test:
      _ErrorIf(bool(missing), constants.CV_ENODENET, node,
               "missing bridges: %s" % utils.CommaJoin(sorted(missing)))

  def _VerifyNodeUserScripts(self, ninfo, nresult):
    """Check the results of user scripts presence and executability on the node.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    node = ninfo.name

    test = constants.NV_USERSCRIPTS not in nresult
    self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, node,
                  "did not return user scripts information")

    broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
    if not test:
      self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, node,
                    "user scripts not present or not executable: %s" %
                    utils.CommaJoin(sorted(broken_scripts)))
1965 def _VerifyNodeNetwork(self, ninfo, nresult):
1966 """Check the node network connectivity results.
1968 @type ninfo: L{objects.Node}
1969 @param ninfo: the node to check
1970 @param nresult: the remote results for the node
1972 """
1973 node = ninfo.name
1974 _ErrorIf = self._ErrorIf # pylint: disable=C0103
1976 test = constants.NV_NODELIST not in nresult
1977 _ErrorIf(test, constants.CV_ENODESSH, node,
1978 "node hasn't returned node ssh connectivity data")
1979 if not test:
1980 if nresult[constants.NV_NODELIST]:
1981 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1982 _ErrorIf(True, constants.CV_ENODESSH, node,
1983 "ssh communication with node '%s': %s", a_node, a_msg)
1985 test = constants.NV_NODENETTEST not in nresult
1986 _ErrorIf(test, constants.CV_ENODENET, node,
1987 "node hasn't returned node tcp connectivity data")
1988 if not test:
1989 if nresult[constants.NV_NODENETTEST]:
1990 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1991 for anode in nlist:
1992 _ErrorIf(True, constants.CV_ENODENET, node,
1993 "tcp communication with node '%s': %s",
1994 anode, nresult[constants.NV_NODENETTEST][anode])
1996 test = constants.NV_MASTERIP not in nresult
1997 _ErrorIf(test, constants.CV_ENODENET, node,
1998 "node hasn't returned node master IP reachability data")
1999 if not test:
2000 if not nresult[constants.NV_MASTERIP]:
2001 if node == self.master_node:
2002 msg = "the master node cannot reach the master IP (not configured?)"
2003 else:
2004 msg = "cannot reach the master IP"
2005 _ErrorIf(True, constants.CV_ENODENET, node, msg)
2007 def _VerifyInstance(self, instance, instanceconfig, node_image,
2008 diskstatus):
2009 """Verify an instance.
2011 This function checks to see if the required block devices are
2012 available on the instance's node.
2015 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2016 node_current = instanceconfig.primary_node
2018 node_vol_should = {}
2019 instanceconfig.MapLVsByNode(node_vol_should)
2021 for node in node_vol_should:
2022 n_img = node_image[node]
2023 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2024 # ignore missing volumes on offline or broken nodes
2025 continue
2026 for volume in node_vol_should[node]:
2027 test = volume not in n_img.volumes
2028 _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
2029 "volume %s missing on node %s", volume, node)
2031 if instanceconfig.admin_up:
2032 pri_img = node_image[node_current]
2033 test = instance not in pri_img.instances and not pri_img.offline
2034 _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
2035 "instance not running on its primary node %s",
2038 diskdata = [(nname, success, status, idx)
2039 for (nname, disks) in diskstatus.items()
2040 for idx, (success, status) in enumerate(disks)]
2042 for nname, success, bdev_status, idx in diskdata:
2043 # the 'ghost node' construction in Exec() ensures that we have a
2044 # non-None value for all the nodes
2045 snode = node_image[nname]
2046 bad_snode = snode.ghost or snode.offline
2047 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
2048 constants.CV_EINSTANCEFAULTYDISK, instance,
2049 "couldn't retrieve status for disk/%s on %s: %s",
2050 idx, nname, bdev_status)
2051 _ErrorIf((instanceconfig.admin_up and success and
2052 bdev_status.ldisk_status == constants.LDS_FAULTY),
2053 constants.CV_EINSTANCEFAULTYDISK, instance,
2054 "disk/%s on %s is faulty", idx, nname)
2056 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
2057 """Verify if there are any unknown volumes in the cluster.
2059 The .os, .swap and backup volumes are ignored. All other volumes are
2060 reported as unknown.
2062 @type reserved: L{ganeti.utils.FieldSet}
2063 @param reserved: a FieldSet of reserved volume names
2066 for node, n_img in node_image.items():
2067 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
2068 # skip non-healthy nodes
2069 continue
2070 for volume in n_img.volumes:
2071 test = ((node not in node_vol_should or
2072 volume not in node_vol_should[node]) and
2073 not reserved.Matches(volume))
2074 self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
2075 "volume %s is unknown", volume)
2077 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
2078 """Verify N+1 Memory Resilience.
2080 Check that if one single node dies we can still start all the
2081 instances it was primary for.
2084 cluster_info = self.cfg.GetClusterInfo()
2085 for node, n_img in node_image.items():
2086 # This code checks that every node which is now listed as
2087 # secondary has enough memory to host all instances it is
2088 # supposed to should a single other node in the cluster fail.
2089 # FIXME: not ready for failover to an arbitrary node
2090 # FIXME: does not support file-backed instances
2091 # WARNING: we currently take into account down instances as well
2092 # as up ones, considering that even if they're down someone
2093 # might want to start them even in the event of a node failure.
2094 if n_img.offline:
2095 # we're skipping offline nodes from the N+1 warning, since
2096 # most likely we don't have good memory information from them;
2097 # we already list instances living on such nodes, and that's
2098 # enough warning
2099 continue
2100 for prinode, instances in n_img.sbp.items():
2101 needed_mem = 0
2102 for instance in instances:
2103 bep = cluster_info.FillBE(instance_cfg[instance])
2104 if bep[constants.BE_AUTO_BALANCE]:
2105 needed_mem += bep[constants.BE_MEMORY]
2106 test = n_img.mfree < needed_mem
2107 self._ErrorIf(test, constants.CV_ENODEN1, node,
2108 "not enough memory to accomodate instance failovers"
2109 " should node %s fail (%dMiB needed, %dMiB available)",
2110 prinode, needed_mem, n_img.mfree)
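# Editor's note: a worked example (hypothetical numbers, not from the source)
# for the N+1 check above. If this node is secondary for three auto-balanced
# instances whose primary is prinode, with BE_MEMORY of 512, 1024 and 2048:
#
#   needed_mem = 512 + 1024 + 2048 = 3584 MiB
#   n_img.mfree = 2048 MiB  ->  2048 < 3584, so CV_ENODEN1 is reported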
2112 @classmethod
2113 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
2114 (files_all, files_opt, files_mc, files_vm)):
2115 """Verifies file checksums collected from all nodes.
2117 @param errorif: Callback for reporting errors
2118 @param nodeinfo: List of L{objects.Node} objects
2119 @param master_node: Name of master node
2120 @param all_nvinfo: RPC results
2123 # Define functions determining which nodes to consider for a file
2124 files2nodefn = [
2125 (files_all, None),
2126 (files_mc, lambda node: (node.master_candidate or
2127 node.name == master_node)),
2128 (files_vm, lambda node: node.vm_capable),
2129 ]
2131 # Build mapping from filename to list of nodes which should have the file
2132 nodefiles = {}
2133 for (files, fn) in files2nodefn:
2134 if fn is None:
2135 filenodes = nodeinfo
2136 else:
2137 filenodes = filter(fn, nodeinfo)
2138 nodefiles.update((filename,
2139 frozenset(map(operator.attrgetter("name"), filenodes)))
2140 for filename in files)
2142 assert set(nodefiles) == (files_all | files_mc | files_vm)
2144 fileinfo = dict((filename, {}) for filename in nodefiles)
2145 ignore_nodes = set()
2147 for node in nodeinfo:
2148 if node.offline:
2149 ignore_nodes.add(node.name)
2150 continue
2152 nresult = all_nvinfo[node.name]
2154 if nresult.fail_msg or not nresult.payload:
2155 node_files = None
2156 else:
2157 node_files = nresult.payload.get(constants.NV_FILELIST, None)
2159 test = not (node_files and isinstance(node_files, dict))
2160 errorif(test, constants.CV_ENODEFILECHECK, node.name,
2161 "Node did not return file checksum data")
2162 if test:
2163 ignore_nodes.add(node.name)
2164 continue
2166 # Build per-checksum mapping from filename to nodes having it
2167 for (filename, checksum) in node_files.items():
2168 assert filename in nodefiles
2169 fileinfo[filename].setdefault(checksum, set()).add(node.name)
2171 for (filename, checksums) in fileinfo.items():
2172 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
2174 # Nodes having the file
2175 with_file = frozenset(node_name
2176 for nodes in fileinfo[filename].values()
2177 for node_name in nodes) - ignore_nodes
2179 expected_nodes = nodefiles[filename] - ignore_nodes
2181 # Nodes missing file
2182 missing_file = expected_nodes - with_file
2184 if filename in files_opt:
2185 # All or no nodes
2186 errorif(missing_file and missing_file != expected_nodes,
2187 constants.CV_ECLUSTERFILECHECK, None,
2188 "File %s is optional, but it must exist on all or no"
2189 " nodes (not found on %s)",
2190 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
2191 else:
2192 errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
2193 "File %s is missing from node(s) %s", filename,
2194 utils.CommaJoin(utils.NiceSort(missing_file)))
2196 # Warn if a node has a file it shouldn't
2197 unexpected = with_file - expected_nodes
2198 errorif(unexpected,
2199 constants.CV_ECLUSTERFILECHECK, None,
2200 "File %s should not exist on node(s) %s",
2201 filename, utils.CommaJoin(utils.NiceSort(unexpected)))
2203 # See if there are multiple versions of the file
2204 test = len(checksums) > 1
2206 variants = ["variant %s on %s" %
2207 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
2208 for (idx, (checksum, nodes)) in
2209 enumerate(sorted(checksums.items()))]
2213 errorif(test, constants.CV_ECLUSTERFILECHECK, None,
2214 "File %s found with %s different checksums (%s)",
2215 filename, len(checksums), "; ".join(variants))
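# Editor's note (illustrative, hypothetical data): the fileinfo structure
# verified above maps each filename to the checksums seen and the set of
# nodes reporting each checksum, e.g.:
#
#   fileinfo = {
#     "/etc/ganeti/some-file": {"abc123...": set(["node1", "node2"]),
#                               "def456...": set(["node3"])},
#     }
#
# More than one checksum for the same file yields a CV_ECLUSTERFILECHECK
# error listing one "variant" per checksum.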
2217 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
2218 drbd_map):
2219 """Verifies the node DRBD status.
2221 @type ninfo: L{objects.Node}
2222 @param ninfo: the node to check
2223 @param nresult: the remote results for the node
2224 @param instanceinfo: the dict of instances
2225 @param drbd_helper: the configured DRBD usermode helper
2226 @param drbd_map: the DRBD map as returned by
2227 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
2229 """
2230 node = ninfo.name
2231 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2233 if drbd_helper:
2234 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
2235 test = (helper_result is None)
2236 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2237 "no drbd usermode helper returned")
2238 if helper_result:
2239 status, payload = helper_result
2240 test = not status
2241 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2242 "drbd usermode helper check unsuccessful: %s", payload)
2243 test = status and (payload != drbd_helper)
2244 _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
2245 "wrong drbd usermode helper: %s", payload)
2247 # compute the DRBD minors
2248 node_drbd = {}
2249 for minor, instance in drbd_map[node].items():
2250 test = instance not in instanceinfo
2251 _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
2252 "ghost instance '%s' in temporary DRBD map", instance)
2253 # ghost instance should not be running, but otherwise we
2254 # don't give double warnings (both ghost instance and
2255 # unallocated minor in use)
2256 if test:
2257 node_drbd[minor] = (instance, False)
2258 else:
2259 instance = instanceinfo[instance]
2260 node_drbd[minor] = (instance.name, instance.admin_up)
2262 # and now check them
2263 used_minors = nresult.get(constants.NV_DRBDLIST, [])
2264 test = not isinstance(used_minors, (tuple, list))
2265 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2266 "cannot parse drbd status file: %s", str(used_minors))
2267 if test:
2268 # we cannot check drbd status
2269 return
2271 for minor, (iname, must_exist) in node_drbd.items():
2272 test = minor not in used_minors and must_exist
2273 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2274 "drbd minor %d of instance %s is not active", minor, iname)
2275 for minor in used_minors:
2276 test = minor not in node_drbd
2277 _ErrorIf(test, constants.CV_ENODEDRBD, node,
2278 "unallocated drbd minor %d is in use", minor)
2280 def _UpdateNodeOS(self, ninfo, nresult, nimg):
2281 """Builds the node OS structures.
2283 @type ninfo: L{objects.Node}
2284 @param ninfo: the node to check
2285 @param nresult: the remote results for the node
2286 @param nimg: the node image object
2288 """
2289 node = ninfo.name
2290 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2292 remote_os = nresult.get(constants.NV_OSLIST, None)
2293 test = (not isinstance(remote_os, list) or
2294 not compat.all(isinstance(v, list) and len(v) == 7
2295 for v in remote_os))
2297 _ErrorIf(test, constants.CV_ENODEOS, node,
2298 "node hasn't returned valid OS data")
2307 for (name, os_path, status, diagnose,
2308 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
2310 if name not in os_dict:
2311 os_dict[name] = []
2313 # parameters is a list of lists instead of list of tuples due to
2314 # JSON lacking a real tuple type, fix it:
2315 parameters = [tuple(v) for v in parameters]
2316 os_dict[name].append((os_path, status, diagnose,
2317 set(variants), set(parameters), set(api_ver)))
2319 nimg.oslist = os_dict
2321 def _VerifyNodeOS(self, ninfo, nimg, base):
2322 """Verifies the node OS list.
2324 @type ninfo: L{objects.Node}
2325 @param ninfo: the node to check
2326 @param nimg: the node image object
2327 @param base: the 'template' node we match against (e.g. from the master)
2329 """
2330 node = ninfo.name
2331 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2333 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
2335 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
2336 for os_name, os_data in nimg.oslist.items():
2337 assert os_data, "Empty OS status for OS %s?!" % os_name
2338 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
2339 _ErrorIf(not f_status, constants.CV_ENODEOS, node,
2340 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
2341 _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
2342 "OS '%s' has multiple entries (first one shadows the rest): %s",
2343 os_name, utils.CommaJoin([v[0] for v in os_data]))
2344 # comparisons with the 'base' image
2345 test = os_name not in base.oslist
2346 _ErrorIf(test, constants.CV_ENODEOS, node,
2347 "Extra OS %s not present on reference node (%s)",
2348 os_name, base.name)
2349 if test:
2350 continue
2351 assert base.oslist[os_name], "Base node has empty OS status?"
2352 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
2353 if not b_status:
2354 # base OS is invalid, skipping
2355 continue
2356 for kind, a, b in [("API version", f_api, b_api),
2357 ("variants list", f_var, b_var),
2358 ("parameters", beautify_params(f_param),
2359 beautify_params(b_param))]:
2360 _ErrorIf(a != b, constants.CV_ENODEOS, node,
2361 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
2362 kind, os_name, base.name,
2363 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
2365 # check any missing OSes
2366 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2367 _ErrorIf(missing, constants.CV_ENODEOS, node,
2368 "OSes present on reference node %s but missing on this node: %s",
2369 base.name, utils.CommaJoin(missing))
2371 def _VerifyOob(self, ninfo, nresult):
2372 """Verifies out of band functionality of a node.
2374 @type ninfo: L{objects.Node}
2375 @param ninfo: the node to check
2376 @param nresult: the remote results for the node
2378 """
2379 node = ninfo.name
2380 # We just have to verify the paths on master and/or master candidates
2381 # as the oob helper is invoked on the master
2382 if ((ninfo.master_candidate or ninfo.master_capable) and
2383 constants.NV_OOB_PATHS in nresult):
2384 for path_result in nresult[constants.NV_OOB_PATHS]:
2385 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
2387 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2388 """Verifies and updates the node volume data.
2390 This function will update a L{NodeImage}'s internal structures
2391 with data from the remote call.
2393 @type ninfo: L{objects.Node}
2394 @param ninfo: the node to check
2395 @param nresult: the remote results for the node
2396 @param nimg: the node image object
2397 @param vg_name: the configured VG name
2399 """
2400 node = ninfo.name
2401 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2403 nimg.lvm_fail = True
2404 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2405 if vg_name is None:
2406 pass
2407 elif isinstance(lvdata, basestring):
2408 _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
2409 utils.SafeEncode(lvdata))
2410 elif not isinstance(lvdata, dict):
2411 _ErrorIf(True, constants.CV_ENODELVM, node,
2412 "rpc call to node failed (lvlist)")
2413 else:
2414 nimg.volumes = lvdata
2415 nimg.lvm_fail = False
2417 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2418 """Verifies and updates the node instance list.
2420 If the listing was successful, then updates this node's instance
2421 list. Otherwise, it marks the RPC call as failed for the instance
2424 @type ninfo: L{objects.Node}
2425 @param ninfo: the node to check
2426 @param nresult: the remote results for the node
2427 @param nimg: the node image object
2430 idata = nresult.get(constants.NV_INSTANCELIST, None)
2431 test = not isinstance(idata, list)
2432 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
2433 "rpc call to node failed (instancelist): %s",
2434 utils.SafeEncode(str(idata)))
2435 if test:
2436 nimg.hyp_fail = True
2437 else:
2438 nimg.instances = idata
2440 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2441 """Verifies and computes a node information map
2443 @type ninfo: L{objects.Node}
2444 @param ninfo: the node to check
2445 @param nresult: the remote results for the node
2446 @param nimg: the node image object
2447 @param vg_name: the configured VG name
2449 """
2450 node = ninfo.name
2451 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2453 # try to read free memory (from the hypervisor)
2454 hv_info = nresult.get(constants.NV_HVINFO, None)
2455 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2456 _ErrorIf(test, constants.CV_ENODEHV, node,
2457 "rpc call to node failed (hvinfo)")
2458 if not test:
2459 try:
2460 nimg.mfree = int(hv_info["memory_free"])
2461 except (ValueError, TypeError):
2462 _ErrorIf(True, constants.CV_ENODERPC, node,
2463 "node returned invalid nodeinfo, check hypervisor")
2465 # FIXME: devise a free space model for file based instances as well
2466 if vg_name is not None:
2467 test = (constants.NV_VGLIST not in nresult or
2468 vg_name not in nresult[constants.NV_VGLIST])
2469 _ErrorIf(test, constants.CV_ENODELVM, node,
2470 "node didn't return data for the volume group '%s'"
2471 " - it is either missing or broken", vg_name)
2472 if not test:
2473 try:
2474 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2475 except (ValueError, TypeError):
2476 _ErrorIf(True, constants.CV_ENODERPC, node,
2477 "node returned invalid LVM info, check LVM status")
2479 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2480 """Gets per-disk status information for all instances.
2482 @type nodelist: list of strings
2483 @param nodelist: Node names
2484 @type node_image: dict of (name, L{objects.Node})
2485 @param node_image: Node objects
2486 @type instanceinfo: dict of (name, L{objects.Instance})
2487 @param instanceinfo: Instance objects
2488 @rtype: {instance: {node: [(success, payload)]}}
2489 @return: a dictionary of per-instance dictionaries with nodes as
2490 keys and disk information as values; the disk information is a
2491 list of tuples (success, payload)
2494 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2496 node_disks = {}
2497 node_disks_devonly = {}
2498 diskless_instances = set()
2499 diskless = constants.DT_DISKLESS
2501 for nname in nodelist:
2502 node_instances = list(itertools.chain(node_image[nname].pinst,
2503 node_image[nname].sinst))
2504 diskless_instances.update(inst for inst in node_instances
2505 if instanceinfo[inst].disk_template == diskless)
2506 disks = [(inst, disk)
2507 for inst in node_instances
2508 for disk in instanceinfo[inst].disks]
2510 if not disks:
2511 # No need to collect data
2512 continue
2514 node_disks[nname] = disks
2516 # Creating copies as SetDiskID below will modify the objects and that can
2517 # lead to incorrect data returned from nodes
2518 devonly = [dev.Copy() for (_, dev) in disks]
2520 for dev in devonly:
2521 self.cfg.SetDiskID(dev, nname)
2523 node_disks_devonly[nname] = devonly
2525 assert len(node_disks) == len(node_disks_devonly)
2527 # Collect data from all nodes with disks
2528 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2529 node_disks_devonly)
2531 assert len(result) == len(node_disks)
2533 instdisk = {}
2535 for (nname, nres) in result.items():
2536 disks = node_disks[nname]
2538 if nres.offline:
2539 # No data from this node
2540 data = len(disks) * [(False, "node offline")]
2541 else:
2542 msg = nres.fail_msg
2543 _ErrorIf(msg, constants.CV_ENODERPC, nname,
2544 "while getting disk information: %s", msg)
2545 if msg:
2546 # No data from this node
2547 data = len(disks) * [(False, msg)]
2548 else:
2549 data = []
2550 for idx, i in enumerate(nres.payload):
2551 if isinstance(i, (tuple, list)) and len(i) == 2:
2552 data.append(i)
2553 else:
2554 logging.warning("Invalid result from node %s, entry %d: %s",
2555 nname, idx, i)
2556 data.append((False, "Invalid result from the remote node"))
2558 for ((inst, _), status) in zip(disks, data):
2559 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2561 # Add empty entries for diskless instances.
2562 for inst in diskless_instances:
2563 assert inst not in instdisk
2564 instdisk[inst] = {}
2566 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2567 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2568 compat.all(isinstance(s, (tuple, list)) and
2569 len(s) == 2 for s in statuses)
2570 for inst, nnames in instdisk.items()
2571 for nname, statuses in nnames.items())
2572 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
2574 return instdisk
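# Editor's note (illustrative, hypothetical names): for a DRBD instance
# "inst1" with a single disk mirrored over "node1" and "node2", the returned
# structure would look like
#
#   instdisk = {"inst1": {"node1": [(True, status)],
#                         "node2": [(True, status)]}}
#
# which is exactly the shape enforced by the consistency assertion above.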
2576 @staticmethod
2577 def _SshNodeSelector(group_uuid, all_nodes):
2578 """Create endless iterators for all potential SSH check hosts.
2581 nodes = [node for node in all_nodes
2582 if (node.group != group_uuid and
2583 not node.offline)]
2584 keyfunc = operator.attrgetter("group")
2586 return map(itertools.cycle,
2587 [sorted(map(operator.attrgetter("name"), names))
2588 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
2589 keyfunc)])
2591 @classmethod
2592 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
2593 """Choose which nodes should talk to which other nodes.
2595 We will make nodes contact all nodes in their group, and one node from
2596 every other group.
2598 @warning: This algorithm has a known issue if one node group is much
2599 smaller than others (e.g. just one node). In such a case all other
2600 nodes will talk to the single node.
2603 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
2604 sel = cls._SshNodeSelector(group_uuid, all_nodes)
2606 return (online_nodes,
2607 dict((name, sorted([i.next() for i in sel]))
2608 for name in online_nodes))
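# Editor's note: an illustrative sketch (hypothetical groups, not from the
# source) of the selection above. Verifying group g1 = [A, B] in a cluster
# that also has group g2 = [C, D], _SshNodeSelector yields one endless
# iterator per foreign group, so the result could be
#
#   (["A", "B"], {"A": ["C"], "B": ["D"]})
#
# Cycling each foreign group's sorted member list spreads the SSH checks over
# that group's nodes instead of directing them all at a single node.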
2610 def BuildHooksEnv(self):
2611 """Build hooks env.
2613 Cluster-Verify hooks run only in the post phase; if they fail, their
2614 output is logged in the verify output and the verification fails.
2618 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2621 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2622 for node in self.my_node_info.values())
2624 return env
2626 def BuildHooksNodes(self):
2627 """Build hooks nodes.
2630 return ([], self.my_node_names)
2632 def Exec(self, feedback_fn):
2633 """Verify integrity of the node group, performing various test on nodes.
2636 # This method has too many local variables. pylint: disable=R0914
2637 feedback_fn("* Verifying group '%s'" % self.group_info.name)
2639 if not self.my_node_names:
2641 feedback_fn("* Empty node group, skipping verification")
2645 _ErrorIf = self._ErrorIf # pylint: disable=C0103
2646 verbose = self.op.verbose
2647 self._feedback_fn = feedback_fn
2649 vg_name = self.cfg.GetVGName()
2650 drbd_helper = self.cfg.GetDRBDHelper()
2651 cluster = self.cfg.GetClusterInfo()
2652 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2653 hypervisors = cluster.enabled_hypervisors
2654 node_data_list = [self.my_node_info[name] for name in self.my_node_names]
2656 i_non_redundant = [] # Non redundant instances
2657 i_non_a_balanced = [] # Non auto-balanced instances
2658 n_offline = 0 # Count of offline nodes
2659 n_drained = 0 # Count of nodes being drained
2660 node_vol_should = {}
2662 # FIXME: verify OS list
2665 filemap = _ComputeAncillaryFiles(cluster, False)
2667 # do local checksums
2668 master_node = self.master_node = self.cfg.GetMasterNode()
2669 master_ip = self.cfg.GetMasterIP()
2671 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
2673 user_scripts = []
2674 if self.cfg.GetUseExternalMipScript():
2675 user_scripts.append(constants.EXTERNAL_MASTER_SETUP_SCRIPT)
2677 node_verify_param = {
2678 constants.NV_FILELIST:
2679 utils.UniqueSequence(filename
2680 for files in filemap
2681 for filename in files),
2682 constants.NV_NODELIST:
2683 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
2684 self.all_node_info.values()),
2685 constants.NV_HYPERVISOR: hypervisors,
2686 constants.NV_HVPARAMS:
2687 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
2688 constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
2689 for node in node_data_list
2690 if not node.offline],
2691 constants.NV_INSTANCELIST: hypervisors,
2692 constants.NV_VERSION: None,
2693 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2694 constants.NV_NODESETUP: None,
2695 constants.NV_TIME: None,
2696 constants.NV_MASTERIP: (master_node, master_ip),
2697 constants.NV_OSLIST: None,
2698 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2699 constants.NV_USERSCRIPTS: user_scripts,
2702 if vg_name is not None:
2703 node_verify_param[constants.NV_VGLIST] = None
2704 node_verify_param[constants.NV_LVLIST] = vg_name
2705 node_verify_param[constants.NV_PVLIST] = [vg_name]
2706 node_verify_param[constants.NV_DRBDLIST] = None
2708 if drbd_helper:
2709 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2712 # FIXME: this needs to be changed per node-group, not cluster-wide
2713 bridges = set()
2714 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2715 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2716 bridges.add(default_nicpp[constants.NIC_LINK])
2717 for instance in self.my_inst_info.values():
2718 for nic in instance.nics:
2719 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2720 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2721 bridges.add(full_nic[constants.NIC_LINK])
2723 if bridges:
2724 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2726 # Build our expected cluster state
2727 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2728 name=node.name,
2729 vm_capable=node.vm_capable))
2730 for node in node_data_list)
2733 oob_paths = []
2734 for node in self.all_node_info.values():
2735 path = _SupportsOob(self.cfg, node)
2736 if path and path not in oob_paths:
2737 oob_paths.append(path)
2739 if oob_paths:
2740 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2742 for instance in self.my_inst_names:
2743 inst_config = self.my_inst_info[instance]
2745 for nname in inst_config.all_nodes:
2746 if nname not in node_image:
2747 gnode = self.NodeImage(name=nname)
2748 gnode.ghost = (nname not in self.all_node_info)
2749 node_image[nname] = gnode
2751 inst_config.MapLVsByNode(node_vol_should)
2753 pnode = inst_config.primary_node
2754 node_image[pnode].pinst.append(instance)
2756 for snode in inst_config.secondary_nodes:
2757 nimg = node_image[snode]
2758 nimg.sinst.append(instance)
2759 if pnode not in nimg.sbp:
2760 nimg.sbp[pnode] = []
2761 nimg.sbp[pnode].append(instance)
2763 # At this point, we have the in-memory data structures complete,
2764 # except for the runtime information, which we'll gather next
2766 # Due to the way our RPC system works, exact response times cannot be
2767 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2768 # time before and after executing the request, we can at least have a time
2770 nvinfo_starttime = time.time()
2771 all_nvinfo = self.rpc.call_node_verify(self.my_node_names,
2772 node_verify_param,
2773 self.cfg.GetClusterName())
2774 nvinfo_endtime = time.time()
2776 if self.extra_lv_nodes and vg_name is not None:
2777 extra_lv_nvinfo = \
2778 self.rpc.call_node_verify(self.extra_lv_nodes,
2779 {constants.NV_LVLIST: vg_name},
2780 self.cfg.GetClusterName())
2781 else:
2782 extra_lv_nvinfo = {}
2784 all_drbd_map = self.cfg.ComputeDRBDMap()
2786 feedback_fn("* Gathering disk information (%s nodes)" %
2787 len(self.my_node_names))
2788 instdisk = self._CollectDiskInfo(self.my_node_names, node_image,
2789 self.my_inst_info)
2791 feedback_fn("* Verifying configuration file consistency")
2793 # If not all nodes are being checked, we need to make sure the master node
2794 # and a non-checked vm_capable node are in the list.
2795 absent_nodes = set(self.all_node_info).difference(self.my_node_info)
2796 if absent_nodes:
2797 vf_nvinfo = all_nvinfo.copy()
2798 vf_node_info = list(self.my_node_info.values())
2799 additional_nodes = []
2800 if master_node not in self.my_node_info:
2801 additional_nodes.append(master_node)
2802 vf_node_info.append(self.all_node_info[master_node])
2803 # Add the first vm_capable node we find which is not included
2804 for node in absent_nodes:
2805 nodeinfo = self.all_node_info[node]
2806 if nodeinfo.vm_capable and not nodeinfo.offline:
2807 additional_nodes.append(node)
2808 vf_node_info.append(self.all_node_info[node])
2809 break
2810 key = constants.NV_FILELIST
2811 vf_nvinfo.update(self.rpc.call_node_verify(additional_nodes,
2812 {key: node_verify_param[key]},
2813 self.cfg.GetClusterName()))
2814 else:
2815 vf_nvinfo = all_nvinfo
2816 vf_node_info = self.my_node_info.values()
2818 self._VerifyFiles(_ErrorIf, vf_node_info, master_node, vf_nvinfo, filemap)
2820 feedback_fn("* Verifying node status")
2822 refos_img = None
2824 for node_i in node_data_list:
2825 node = node_i.name
2826 nimg = node_image[node]
2830 feedback_fn("* Skipping offline node %s" % (node,))
2834 if node == master_node:
2835 ntype = "master"
2836 elif node_i.master_candidate:
2837 ntype = "master candidate"
2838 elif node_i.drained:
2839 ntype = "drained"
2840 n_drained += 1
2841 else:
2842 ntype = "regular"
2843 if verbose:
2844 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2846 msg = all_nvinfo[node].fail_msg
2847 _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
2848 msg)
2849 if msg:
2850 nimg.rpc_fail = True
2851 continue
2853 nresult = all_nvinfo[node].payload
2855 nimg.call_ok = self._VerifyNode(node_i, nresult)
2856 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2857 self._VerifyNodeNetwork(node_i, nresult)
2858 self._VerifyNodeUserScripts(node_i, nresult)
2859 self._VerifyOob(node_i, nresult)
2861 if nimg.vm_capable:
2862 self._VerifyNodeLVM(node_i, nresult, vg_name)
2863 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info, drbd_helper,
2864 all_drbd_map)
2866 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2867 self._UpdateNodeInstances(node_i, nresult, nimg)
2868 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2869 self._UpdateNodeOS(node_i, nresult, nimg)
2871 if not nimg.os_fail:
2872 if refos_img is None:
2873 refos_img = nimg
2874 self._VerifyNodeOS(node_i, nimg, refos_img)
2875 self._VerifyNodeBridges(node_i, nresult, bridges)
2877 # Check whether all running instances are primary for the node. (This
2878 # can no longer be done from _VerifyInstance below, since some of the
2879 # wrong instances could be from other node groups.)
2880 non_primary_inst = set(nimg.instances).difference(nimg.pinst)
2882 for inst in non_primary_inst:
2883 test = inst in self.all_inst_info
2884 _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
2885 "instance should not run on node %s", node_i.name)
2886 _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2887 "node is running unknown instance %s", inst)
2889 for node, result in extra_lv_nvinfo.items():
2890 self._UpdateNodeVolumes(self.all_node_info[node], result.payload,
2891 node_image[node], vg_name)
2893 feedback_fn("* Verifying instance status")
2894 for instance in self.my_inst_names:
2896 feedback_fn("* Verifying instance %s" % instance)
2897 inst_config = self.my_inst_info[instance]
2898 self._VerifyInstance(instance, inst_config, node_image,
2899 instdisk[instance])
2900 inst_nodes_offline = []
2902 pnode = inst_config.primary_node
2903 pnode_img = node_image[pnode]
2904 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2905 constants.CV_ENODERPC, pnode, "instance %s, connection to"
2906 " primary node failed", instance)
2908 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2909 constants.CV_EINSTANCEBADNODE, instance,
2910 "instance is marked as running and lives on offline node %s",
2911 inst_config.primary_node)
2913 # If the instance is non-redundant we cannot survive losing its primary
2914 # node, so we are not N+1 compliant. On the other hand we have no disk
2915 # templates with more than one secondary so that situation is not well
2916 # supported either.
2917 # FIXME: does not support file-backed instances
2918 if not inst_config.secondary_nodes:
2919 i_non_redundant.append(instance)
2921 _ErrorIf(len(inst_config.secondary_nodes) > 1,
2922 constants.CV_EINSTANCELAYOUT,
2923 instance, "instance has multiple secondary nodes: %s",
2924 utils.CommaJoin(inst_config.secondary_nodes),
2925 code=self.ETYPE_WARNING)
2927 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2928 pnode = inst_config.primary_node
2929 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2930 instance_groups = {}
2932 for node in instance_nodes:
2933 instance_groups.setdefault(self.all_node_info[node].group,
2934 []).append(node)
2936 pretty_list = [
2937 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2938 # Sort so that we always list the primary node first.
2939 for group, nodes in sorted(instance_groups.items(),
2940 key=lambda (_, nodes): pnode in nodes,
2941 reverse=True)]
2943 self._ErrorIf(len(instance_groups) > 1,
2944 constants.CV_EINSTANCESPLITGROUPS,
2945 instance, "instance has primary and secondary nodes in"
2946 " different groups: %s", utils.CommaJoin(pretty_list),
2947 code=self.ETYPE_WARNING)
2949 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2950 i_non_a_balanced.append(instance)
2952 for snode in inst_config.secondary_nodes:
2953 s_img = node_image[snode]
2954 _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
2955 snode, "instance %s, connection to secondary node failed",
2956 instance)
2958 if s_img.offline:
2959 inst_nodes_offline.append(snode)
2961 # warn that the instance lives on offline nodes
2962 _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
2963 "instance has offline secondary node(s) %s",
2964 utils.CommaJoin(inst_nodes_offline))
2965 # ... or ghost/non-vm_capable nodes
2966 for node in inst_config.all_nodes:
2967 _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
2968 instance, "instance lives on ghost node %s", node)
2969 _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
2970 instance, "instance lives on non-vm_capable node %s", node)
2972 feedback_fn("* Verifying orphan volumes")
2973 reserved = utils.FieldSet(*cluster.reserved_lvs)
2975 # We will get spurious "unknown volume" warnings if any node of this group
2976 # is secondary for an instance whose primary is in another group. To avoid
2977 # them, we find these instances and add their volumes to node_vol_should.
2978 for inst in self.all_inst_info.values():
2979 for secondary in inst.secondary_nodes:
2980 if (secondary in self.my_node_info
2981 and inst.name not in self.my_inst_info):
2982 inst.MapLVsByNode(node_vol_should)
2985 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2987 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2988 feedback_fn("* Verifying N+1 Memory redundancy")
2989 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2991 feedback_fn("* Other Notes")
2993 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2994 % len(i_non_redundant))
2996 if i_non_a_balanced:
2997 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2998 % len(i_non_a_balanced))
3001 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
3004 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
3008 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
3009 """Analyze the post-hooks' result
3011 This method analyses the hook result, handles it, and sends some
3012 nicely-formatted feedback back to the user.
3014 @param phase: one of L{constants.HOOKS_PHASE_POST} or
3015 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
3016 @param hooks_results: the results of the multi-node hooks rpc call
3017 @param feedback_fn: function used to send feedback back to the caller
3018 @param lu_result: previous Exec result
3019 @return: the new Exec result, based on the previous result
3023 # We only really run POST phase hooks, only for non-empty groups,
3024 # and are only interested in their results
3025 if not self.my_node_names:
3026 # empty node group
3027 pass
3028 elif phase == constants.HOOKS_PHASE_POST:
3029 # Used to change hooks' output to proper indentation
3030 feedback_fn("* Hooks Results")
3031 assert hooks_results, "invalid result from hooks"
3033 for node_name in hooks_results:
3034 res = hooks_results[node_name]
3035 msg = res.fail_msg
3036 test = msg and not res.offline
3037 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3038 "Communication failure in hooks execution: %s", msg)
3039 if res.offline or msg:
3040 # No need to investigate payload if node is offline or gave
3041 # an error message
3042 continue
3043 for script, hkr, output in res.payload:
3044 test = hkr == constants.HKR_FAIL
3045 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
3046 "Script %s failed, output:", script)
3047 if test:
3048 output = self._HOOKS_INDENT_RE.sub(" ", output)
3049 feedback_fn("%s" % output)
3050 lu_result = False
3052 return lu_result
3055 class LUClusterVerifyDisks(NoHooksLU):
3056 """Verifies the cluster disks status.
3061 def ExpandNames(self):
3062 self.share_locks = _ShareAll()
3063 self.needed_locks = {
3064 locking.LEVEL_NODEGROUP: locking.ALL_SET,
3065 }
3067 def Exec(self, feedback_fn):
3068 group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
3070 # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
3071 return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
3072 for group in group_names])
3075 class LUGroupVerifyDisks(NoHooksLU):
3076 """Verifies the status of all disks in a node group.
3081 def ExpandNames(self):
3082 # Raises errors.OpPrereqError on its own if group can't be found
3083 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
3085 self.share_locks = _ShareAll()
3086 self.needed_locks = {
3087 locking.LEVEL_INSTANCE: [],
3088 locking.LEVEL_NODEGROUP: [],
3089 locking.LEVEL_NODE: [],
3090 }
3092 def DeclareLocks(self, level):
3093 if level == locking.LEVEL_INSTANCE:
3094 assert not self.needed_locks[locking.LEVEL_INSTANCE]
3096 # Lock instances optimistically, needs verification once node and group
3097 # locks have been acquired
3098 self.needed_locks[locking.LEVEL_INSTANCE] = \
3099 self.cfg.GetNodeGroupInstances(self.group_uuid)
3101 elif level == locking.LEVEL_NODEGROUP:
3102 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
3104 self.needed_locks[locking.LEVEL_NODEGROUP] = \
3105 set([self.group_uuid] +
3106 # Lock all groups used by instances optimistically; this requires
3107 # going via the node before it's locked, requiring verification
3108 # later on
3109 [group_uuid
3110 for instance_name in self.owned_locks(locking.LEVEL_INSTANCE)
3111 for group_uuid in self.cfg.GetInstanceNodeGroups(instance_name)])
3113 elif level == locking.LEVEL_NODE:
3114 # This will only lock the nodes in the group to be verified which contain
3115 # actual instances
3116 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
3117 self._LockInstancesNodes()
3119 # Lock all nodes in group to be verified
3120 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
3121 member_nodes = self.cfg.GetNodeGroup(self.group_uuid).members
3122 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
3124 def CheckPrereq(self):
3125 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
3126 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
3127 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
3129 assert self.group_uuid in owned_groups
3131 # Check if locked instances are still correct
3132 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
3134 # Get instance information
3135 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
3137 # Check if node groups for locked instances are still correct
3138 for (instance_name, inst) in self.instances.items():
3139 assert owned_nodes.issuperset(inst.all_nodes), \
3140 "Instance %s's nodes changed while we kept the lock" % instance_name
3142 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
3143 owned_groups)
3145 assert self.group_uuid in inst_groups, \
3146 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
3148 def Exec(self, feedback_fn):
3149 """Verify integrity of cluster disks.
3151 @rtype: tuple of three items
3152 @return: a tuple of (dict of node-to-node_error, list of instances
3153 which need activate-disks, dict of instance: (node, volume) for
3154 missing volumes
3156 """
3157 res_nodes = {}
3158 res_instances = set()
3159 res_missing = {}
3161 nv_dict = _MapInstanceDisksToNodes([inst
3162 for inst in self.instances.values()
3163 if inst.admin_up])
3166 nodes = utils.NiceSort(set(self.owned_locks(locking.LEVEL_NODE)) &
3167 set(self.cfg.GetVmCapableNodeList()))
3169 node_lvs = self.rpc.call_lv_list(nodes, [])
3171 for (node, node_res) in node_lvs.items():
3172 if node_res.offline:
3173 continue
3175 msg = node_res.fail_msg
3177 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
3178 res_nodes[node] = msg
3181 for lv_name, (_, _, lv_online) in node_res.payload.items():
3182 inst = nv_dict.pop((node, lv_name), None)
3183 if not (lv_online or inst is None):
3184 res_instances.add(inst)
3186 # any leftover items in nv_dict are missing LVs, let's arrange the data
3187 # better
3188 for key, inst in nv_dict.iteritems():
3189 res_missing.setdefault(inst, []).append(list(key))
3191 return (res_nodes, list(res_instances), res_missing)
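# Editor's note (illustrative, hypothetical values): a possible return value
# of the Exec method above would be
#
#   ({"node3": "Error 111: Connection refused"},      # node errors
#    ["inst1"],                                       # needs activate-disks
#    {"inst2": [["node2", "xenvg/disk0"]]})           # missing LVs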
3194 class LUClusterRepairDiskSizes(NoHooksLU):
3195 """Verifies the cluster disks sizes.
3200 def ExpandNames(self):
3201 if self.op.instances:
3202 self.wanted_names = _GetWantedInstances(self, self.op.instances)
3203 self.needed_locks = {
3204 locking.LEVEL_NODE: [],
3205 locking.LEVEL_INSTANCE: self.wanted_names,
3206 }
3207 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3209 self.wanted_names = None
3210 self.needed_locks = {
3211 locking.LEVEL_NODE: locking.ALL_SET,
3212 locking.LEVEL_INSTANCE: locking.ALL_SET,
3213 }
3214 self.share_locks = _ShareAll()
3216 def DeclareLocks(self, level):
3217 if level == locking.LEVEL_NODE and self.wanted_names is not None:
3218 self._LockInstancesNodes(primary_only=True)
3220 def CheckPrereq(self):
3221 """Check prerequisites.
3223 This only checks the optional instance list against the existing names.
3226 if self.wanted_names is None:
3227 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
3229 self.wanted_instances = \
3230 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
3232 def _EnsureChildSizes(self, disk):
3233 """Ensure children of the disk have the needed disk size.
3235 This is valid mainly for DRBD8 and fixes an issue where the
3236 children have smaller disk size.
3238 @param disk: an L{ganeti.objects.Disk} object
3241 if disk.dev_type == constants.LD_DRBD8:
3242 assert disk.children, "Empty children for DRBD8?"
3243 fchild = disk.children[0]
3244 mismatch = fchild.size < disk.size
3246 self.LogInfo("Child disk has size %d, parent %d, fixing",
3247 fchild.size, disk.size)
3248 fchild.size = disk.size
3250 # and we recurse on this child only, not on the metadev
3251 return self._EnsureChildSizes(fchild) or mismatch
3252 else:
3253 return False
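# Editor's note (illustrative, hypothetical sizes): for a DRBD8 disk of
# 10240 MiB whose first (data) child was created at 10230 MiB,
# _EnsureChildSizes sets the child's size to 10240 and returns True, telling
# the caller that the configuration needs to be written back.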
3255 def Exec(self, feedback_fn):
3256 """Verify the size of cluster disks.
3259 # TODO: check child disks too
3260 # TODO: check differences in size between primary/secondary nodes
3261 per_node_disks = {}
3262 for instance in self.wanted_instances:
3263 pnode = instance.primary_node
3264 if pnode not in per_node_disks:
3265 per_node_disks[pnode] = []
3266 for idx, disk in enumerate(instance.disks):
3267 per_node_disks[pnode].append((instance, idx, disk))
3269 changed = []
3270 for node, dskl in per_node_disks.items():
3271 newl = [v[2].Copy() for v in dskl]
3272 for dsk in newl:
3273 self.cfg.SetDiskID(dsk, node)
3274 result = self.rpc.call_blockdev_getsize(node, newl)
3276 self.LogWarning("Failure in blockdev_getsize call to node"
3277 " %s, ignoring", node)
3279 if len(result.payload) != len(dskl):
3280 logging.warning("Invalid result from node %s: len(dksl)=%d,"
3281 " result.payload=%s", node, len(dskl), result.payload)
3282 self.LogWarning("Invalid result from node %s, ignoring node results",
3285 for ((instance, idx, disk), size) in zip(dskl, result.payload):
3287 self.LogWarning("Disk %d of instance %s did not return size"
3288 " information, ignoring", idx, instance.name)
3290 if not isinstance(size, (int, long)):
3291 self.LogWarning("Disk %d of instance %s did not return valid"
3292 " size information, ignoring", idx, instance.name)
3295 if size != disk.size:
3296 self.LogInfo("Disk %d of instance %s has mismatched size,"
3297 " correcting: recorded %d, actual %d", idx,
3298 instance.name, disk.size, size)
3299 disk.size = size
3300 self.cfg.Update(instance, feedback_fn)
3301 changed.append((instance.name, idx, size))
3302 if self._EnsureChildSizes(disk):
3303 self.cfg.Update(instance, feedback_fn)
3304 changed.append((instance.name, idx, disk.size))
3306 return changed
3308 class LUClusterRename(LogicalUnit):
3309 """Rename the cluster.
3312 HPATH = "cluster-rename"
3313 HTYPE = constants.HTYPE_CLUSTER
3315 def BuildHooksEnv(self):
3320 "OP_TARGET": self.cfg.GetClusterName(),
3321 "NEW_NAME": self.op.name,
3324 def BuildHooksNodes(self):
3325 """Build hooks nodes.
3328 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
3330 def CheckPrereq(self):
3331 """Verify that the passed name is a valid one.
3334 hostname = netutils.GetHostname(name=self.op.name,
3335 family=self.cfg.GetPrimaryIPFamily())
3337 new_name = hostname.name
3338 self.ip = new_ip = hostname.ip
3339 old_name = self.cfg.GetClusterName()
3340 old_ip = self.cfg.GetMasterIP()
3341 if new_name == old_name and new_ip == old_ip:
3342 raise errors.OpPrereqError("Neither the name nor the IP address of the"
3343 " cluster has changed",
3345 if new_ip != old_ip:
3346 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
3347 raise errors.OpPrereqError("The given cluster IP address (%s) is"
3348 " reachable on the network" %
3349 new_ip, errors.ECODE_NOTUNIQUE)
3351 self.op.name = new_name
3353 def Exec(self, feedback_fn):
3354 """Rename the cluster.
3357 clustername = self.op.name
3358 new_ip = self.ip
3360 # shutdown the master IP
3361 master_params = self.cfg.GetMasterNetworkParameters()
3362 ems = self.cfg.GetUseExternalMipScript()
3363 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3364 master_params, ems)
3365 result.Raise("Could not disable the master role")
3367 try:
3368 cluster = self.cfg.GetClusterInfo()
3369 cluster.cluster_name = clustername
3370 cluster.master_ip = new_ip
3371 self.cfg.Update(cluster, feedback_fn)
3373 # update the known hosts file
3374 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
3375 node_list = self.cfg.GetOnlineNodeList()
3376 try:
3377 node_list.remove(master_params.name)
3378 except ValueError:
3379 pass
3380 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
3381 finally:
3382 master_params.ip = new_ip
3383 result = self.rpc.call_node_activate_master_ip(master_params.name,
3384 master_params, ems)
3385 msg = result.fail_msg
3387 self.LogWarning("Could not re-enable the master role on"
3388 " the master, please restart manually: %s", msg)
3393 def _ValidateNetmask(cfg, netmask):
3394 """Checks if a netmask is valid.
3396 @type cfg: L{config.ConfigWriter}
3397 @param cfg: The cluster configuration
3399 @param netmask: the netmask to be verified
3400 @raise errors.OpPrereqError: if the validation fails
3403 ip_family = cfg.GetPrimaryIPFamily()
3404 try:
3405 ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
3406 except errors.ProgrammerError:
3407 raise errors.OpPrereqError("Invalid primary ip family: %s." %
3409 if not ipcls.ValidateNetmask(netmask):
3410 raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
3414 class LUClusterSetParams(LogicalUnit):
3415 """Change the parameters of the cluster.
3418 HPATH = "cluster-modify"
3419 HTYPE = constants.HTYPE_CLUSTER
3422 def CheckArguments(self):
3426 if self.op.uid_pool:
3427 uidpool.CheckUidPool(self.op.uid_pool)
3429 if self.op.add_uids:
3430 uidpool.CheckUidPool(self.op.add_uids)
3432 if self.op.remove_uids:
3433 uidpool.CheckUidPool(self.op.remove_uids)
3435 if self.op.master_netmask is not None:
3436 _ValidateNetmask(self.cfg, self.op.master_netmask)
3438 def ExpandNames(self):
3439 # FIXME: in the future maybe other cluster params won't require checking on
3440 # all nodes to be modified.
3441 self.needed_locks = {
3442 locking.LEVEL_NODE: locking.ALL_SET,
3443 }
3444 self.share_locks[locking.LEVEL_NODE] = 1
3446 def BuildHooksEnv(self):
3451 "OP_TARGET": self.cfg.GetClusterName(),
3452 "NEW_VG_NAME": self.op.vg_name,
3455 def BuildHooksNodes(self):
3456 """Build hooks nodes.
3459 mn = self.cfg.GetMasterNode()
3460 return ([mn], [mn])
3462 def CheckPrereq(self):
3463 """Check prerequisites.
3465 This checks whether the given params don't conflict and
3466 if the given volume group is valid.
3469 if self.op.vg_name is not None and not self.op.vg_name:
3470 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
3471 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
3472 " instances exist", errors.ECODE_INVAL)
3474 if self.op.drbd_helper is not None and not self.op.drbd_helper:
3475 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
3476 raise errors.OpPrereqError("Cannot disable drbd helper while"
3477 " drbd-based instances exist",
3480 node_list = self.owned_locks(locking.LEVEL_NODE)
3482 # if vg_name not None, checks given volume group on all nodes
3483 if self.op.vg_name:
3484 vglist = self.rpc.call_vg_list(node_list)
3485 for node in node_list:
3486 msg = vglist[node].fail_msg
3487 if msg:
3488 # ignoring down node
3489 self.LogWarning("Error while gathering data on node %s"
3490 " (ignoring node): %s", node, msg)
3491 continue
3492 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
3493 self.op.vg_name,
3494 constants.MIN_VG_SIZE)
3496 raise errors.OpPrereqError("Error on node '%s': %s" %
3497 (node, vgstatus), errors.ECODE_ENVIRON)
3499 if self.op.drbd_helper:
3500 # checks given drbd helper on all nodes
3501 helpers = self.rpc.call_drbd_helper(node_list)
3502 for (node, ninfo) in self.cfg.GetMultiNodeInfo(node_list):
3504 self.LogInfo("Not checking drbd helper on offline node %s", node)
3506 msg = helpers[node].fail_msg
3508 raise errors.OpPrereqError("Error checking drbd helper on node"
3509 " '%s': %s" % (node, msg),
3510 errors.ECODE_ENVIRON)
3511 node_helper = helpers[node].payload
3512 if node_helper != self.op.drbd_helper:
3513 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
3514 (node, node_helper), errors.ECODE_ENVIRON)
3516 self.cluster = cluster = self.cfg.GetClusterInfo()
3517 # validate params changes
3518 if self.op.beparams:
3519 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
3520 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
3522 if self.op.ndparams:
3523 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
3524 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
3526 # TODO: we need a more general way to handle resetting
3527 # cluster-level parameters to default values
3528 if self.new_ndparams["oob_program"] == "":
3529 self.new_ndparams["oob_program"] = \
3530 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
3532 if self.op.nicparams:
3533 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
3534 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
3535 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3536 nic_errors = []
3538 # check all instances for consistency
3539 for instance in self.cfg.GetAllInstancesInfo().values():
3540 for nic_idx, nic in enumerate(instance.nics):
3541 params_copy = copy.deepcopy(nic.nicparams)
3542 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3544 # check parameter syntax
3545 try:
3546 objects.NIC.CheckParameterSyntax(params_filled)
3547 except errors.ConfigurationError, err:
3548 nic_errors.append("Instance %s, nic/%d: %s" %
3549 (instance.name, nic_idx, err))
3551 # if we're moving instances to routed, check that they have an ip
3552 target_mode = params_filled[constants.NIC_MODE]
3553 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3554 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3555 " address" % (instance.name, nic_idx))
3557 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3558 "\n".join(nic_errors))
3560 # hypervisor list/parameters
3561 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3562 if self.op.hvparams:
3563 for hv_name, hv_dict in self.op.hvparams.items():
3564 if hv_name not in self.new_hvparams:
3565 self.new_hvparams[hv_name] = hv_dict
3567 self.new_hvparams[hv_name].update(hv_dict)
3569 # os hypervisor parameters
3570 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3571 if self.op.os_hvp:
3572 for os_name, hvs in self.op.os_hvp.items():
3573 if os_name not in self.new_os_hvp:
3574 self.new_os_hvp[os_name] = hvs
3575 else:
3576 for hv_name, hv_dict in hvs.items():
3577 if hv_name not in self.new_os_hvp[os_name]:
3578 self.new_os_hvp[os_name][hv_name] = hv_dict
3579 else:
3580 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3583 self.new_osp = objects.FillDict(cluster.osparams, {})
3584 if self.op.osparams:
3585 for os_name, osp in self.op.osparams.items():
3586 if os_name not in self.new_osp:
3587 self.new_osp[os_name] = {}
3589 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3590 use_none=True)
3592 if not self.new_osp[os_name]:
3593 # we removed all parameters
3594 del self.new_osp[os_name]
3595 else:
3596 # check the parameter validity (remote check)
3597 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3598 os_name, self.new_osp[os_name])
3600 # changes to the hypervisor list
3601 if self.op.enabled_hypervisors is not None:
3602 self.hv_list = self.op.enabled_hypervisors
3603 for hv in self.hv_list:
3604 # if the hypervisor doesn't already exist in the cluster
3605 # hvparams, we initialize it to empty, and then (in both
3606 # cases) we make sure to fill the defaults, as we might not
3607 # have a complete defaults list if the hypervisor wasn't
3609 if hv not in new_hvp:
3610 new_hvp[hv] = {}
3611 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3612 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3613 else:
3614 self.hv_list = cluster.enabled_hypervisors
3616 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3617 # either the enabled list has changed, or the parameters have, validate
3618 for hv_name, hv_params in self.new_hvparams.items():
3619 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3620 (self.op.enabled_hypervisors and
3621 hv_name in self.op.enabled_hypervisors)):
3622 # either this is a new hypervisor, or its parameters have changed
3623 hv_class = hypervisor.GetHypervisor(hv_name)
3624 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3625 hv_class.CheckParameterSyntax(hv_params)
3626 _CheckHVParams(self, node_list, hv_name, hv_params)
3628 if self.op.os_hvp:
3629 # no need to check any newly-enabled hypervisors, since the
3630 # defaults have already been checked in the above code-block
3631 for os_name, os_hvp in self.new_os_hvp.items():
3632 for hv_name, hv_params in os_hvp.items():
3633 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3634 # we need to fill in the new os_hvp on top of the actual hv_p
3635 cluster_defaults = self.new_hvparams.get(hv_name, {})
3636 new_osp = objects.FillDict(cluster_defaults, hv_params)
3637 hv_class = hypervisor.GetHypervisor(hv_name)
3638 hv_class.CheckParameterSyntax(new_osp)
3639 _CheckHVParams(self, node_list, hv_name, new_osp)
3641 if self.op.default_iallocator:
3642 alloc_script = utils.FindFile(self.op.default_iallocator,
3643 constants.IALLOCATOR_SEARCH_PATH,
3644 os.path.isfile)
3645 if alloc_script is None:
3646 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3647 " specified" % self.op.default_iallocator,
3650 def Exec(self, feedback_fn):
3651 """Change the parameters of the cluster.
3654 if self.op.vg_name is not None:
3655 new_volume = self.op.vg_name
3656 if not new_volume:
3657 new_volume = None
3658 if new_volume != self.cfg.GetVGName():
3659 self.cfg.SetVGName(new_volume)
3661 feedback_fn("Cluster LVM configuration already in desired"
3662 " state, not changing")
3663 if self.op.drbd_helper is not None:
3664 new_helper = self.op.drbd_helper
3665 if not new_helper:
3666 new_helper = None
3667 if new_helper != self.cfg.GetDRBDHelper():
3668 self.cfg.SetDRBDHelper(new_helper)
3670 feedback_fn("Cluster DRBD helper already in desired state,"
3672 if self.op.hvparams:
3673 self.cluster.hvparams = self.new_hvparams
3674 if self.op.os_hvp:
3675 self.cluster.os_hvp = self.new_os_hvp
3676 if self.op.enabled_hypervisors is not None:
3677 self.cluster.hvparams = self.new_hvparams
3678 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3679 if self.op.beparams:
3680 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3681 if self.op.nicparams:
3682 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3683 if self.op.osparams:
3684 self.cluster.osparams = self.new_osp
3685 if self.op.ndparams:
3686 self.cluster.ndparams = self.new_ndparams
3688 if self.op.candidate_pool_size is not None:
3689 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3690 # we need to update the pool size here, otherwise the save will fail
3691 _AdjustCandidatePool(self, [])
3693 if self.op.maintain_node_health is not None:
3694 self.cluster.maintain_node_health = self.op.maintain_node_health
3696 if self.op.prealloc_wipe_disks is not None:
3697 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3699 if self.op.add_uids is not None:
3700 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3702 if self.op.remove_uids is not None:
3703 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3705 if self.op.uid_pool is not None:
3706 self.cluster.uid_pool = self.op.uid_pool
3708 if self.op.default_iallocator is not None:
3709 self.cluster.default_iallocator = self.op.default_iallocator
3711 if self.op.reserved_lvs is not None:
3712 self.cluster.reserved_lvs = self.op.reserved_lvs
3714 if self.op.use_external_mip_script is not None:
3715 self.cluster.use_external_mip_script = self.op.use_external_mip_script
3717 def helper_os(aname, mods, desc):
3718 desc += " OS list"
3719 lst = getattr(self.cluster, aname)
3720 for key, val in mods:
3721 if key == constants.DDM_ADD:
3722 if val in lst:
3723 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3724 else:
3725 lst.append(val)
3726 elif key == constants.DDM_REMOVE:
3727 if val in lst:
3728 lst.remove(val)
3729 else:
3730 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3731 else:
3732 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3734 if self.op.hidden_os:
3735 helper_os("hidden_os", self.op.hidden_os, "hidden")
3737 if self.op.blacklisted_os:
3738 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3740 if self.op.master_netdev:
3741 master_params = self.cfg.GetMasterNetworkParameters()
3742 ems = self.cfg.GetUseExternalMipScript()
3743 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3744 self.cluster.master_netdev)
3745 result = self.rpc.call_node_deactivate_master_ip(master_params.name,
3746 master_params, ems)
3747 result.Raise("Could not disable the master ip")
3748 feedback_fn("Changing master_netdev from %s to %s" %
3749 (master_params.netdev, self.op.master_netdev))
3750 self.cluster.master_netdev = self.op.master_netdev
3752 if self.op.master_netmask:
3753 master_params = self.cfg.GetMasterNetworkParameters()
3754 feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
3755 result = self.rpc.call_node_change_master_netmask(master_params.name,
3756 master_params.netmask,
3757 self.op.master_netmask,
3758 master_params.ip,
3759 master_params.netdev)
3760 if result.fail_msg:
3761 msg = "Could not change the master IP netmask: %s" % result.fail_msg
3762 feedback_fn(msg)
3764 self.cluster.master_netmask = self.op.master_netmask
3766 self.cfg.Update(self.cluster, feedback_fn)
3768 if self.op.master_netdev:
3769 master_params = self.cfg.GetMasterNetworkParameters()
3770 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3771 self.op.master_netdev)
3772 ems = self.cfg.GetUseExternalMipScript()
3773 result = self.rpc.call_node_activate_master_ip(master_params.name,
3774 master_params, ems)
3775 if result.fail_msg:
3776 self.LogWarning("Could not re-enable the master ip on"
3777 " the master, please restart manually: %s",
3778 result.fail_msg)
3781 def _UploadHelper(lu, nodes, fname):
3782 """Helper for uploading a file and showing warnings.
3785 if os.path.exists(fname):
3786 result = lu.rpc.call_upload_file(nodes, fname)
3787 for to_node, to_result in result.items():
3788 msg = to_result.fail_msg
3789 if msg:
3790 msg = ("Copy of file %s to node %s failed: %s" %
3791 (fname, to_node, msg))
3792 lu.proc.LogWarning(msg)
3795 def _ComputeAncillaryFiles(cluster, redist):
3796 """Compute files external to Ganeti which need to be consistent.
3798 @type redist: boolean
3799 @param redist: Whether to include files which need to be redistributed
3801 """
3802 # Compute files for all nodes
3803 files_all = set([
3804 constants.SSH_KNOWN_HOSTS_FILE,
3805 constants.CONFD_HMAC_KEY,
3806 constants.CLUSTER_DOMAIN_SECRET_FILE,
3807 constants.SPICE_CERT_FILE,
3808 constants.SPICE_CACERT_FILE,
3809 constants.RAPI_USERS_FILE,
3810 ])
3812 if not redist:
3813 files_all.update(constants.ALL_CERT_FILES)
3814 files_all.update(ssconf.SimpleStore().GetFileList())
3815 else:
3816 # we need to ship at least the RAPI certificate
3817 files_all.add(constants.RAPI_CERT_FILE)
3819 if cluster.modify_etc_hosts:
3820 files_all.add(constants.ETC_HOSTS)
3822 # Files which are optional, these must:
3823 # - be present in one other category as well
3824 # - either exist or not exist on all nodes of that category (mc, vm all)
3825 files_opt = set([
3826 constants.RAPI_USERS_FILE,
3827 ])
3829 # Files which should only be on master candidates
3830 files_mc = set()
3832 if not redist:
3833 files_mc.add(constants.CLUSTER_CONF_FILE)
3835 # FIXME: this should also be replicated but Ganeti doesn't support files_mc
3836 # replication
3837 files_mc.add(constants.DEFAULT_MASTER_SETUP_SCRIPT)
3839 # Files which should only be on VM-capable nodes
3840 files_vm = set(filename
3841 for hv_name in cluster.enabled_hypervisors
3842 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
3844 files_opt |= set(filename
3845 for hv_name in cluster.enabled_hypervisors
3846 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
3848 # Filenames in each category must be unique
3849 all_files_set = files_all | files_mc | files_vm
3850 assert (len(all_files_set) ==
3851 sum(map(len, [files_all, files_mc, files_vm]))), \
3852 "Found file listed in more than one file list"
3854 # Optional files must be present in one other category
3855 assert all_files_set.issuperset(files_opt), \
3856 "Optional file not in a different required list"
3858 return (files_all, files_opt, files_mc, files_vm)
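# Hedged caller-side sketch mirroring the assertions above: the returned
# categories are pairwise disjoint, except that every optional file must
# also appear in one of the required sets.
#
#   (files_all, files_opt, files_mc, files_vm) = \
#     _ComputeAncillaryFiles(cluster, redist=True)
#   assert files_opt <= (files_all | files_mc | files_vm)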
3861 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3862 """Distribute additional files which are part of the cluster configuration.
3864 ConfigWriter takes care of distributing the config and ssconf files, but
3865 there are more files which should be distributed to all nodes. This function
3866 makes sure those are copied.
3868 @param lu: calling logical unit
3869 @param additional_nodes: list of nodes not in the config to distribute to
3870 @type additional_vm: boolean
3871 @param additional_vm: whether the additional nodes are vm-capable or not
3873 """
3874 # Gather target nodes
3875 cluster = lu.cfg.GetClusterInfo()
3876 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3878 online_nodes = lu.cfg.GetOnlineNodeList()
3879 vm_nodes = lu.cfg.GetVmCapableNodeList()
3881 if additional_nodes is not None:
3882 online_nodes.extend(additional_nodes)
3883 if additional_vm:
3884 vm_nodes.extend(additional_nodes)
3886 # Never distribute to master node
3887 for nodelist in [online_nodes, vm_nodes]:
3888 if master_info.name in nodelist:
3889 nodelist.remove(master_info.name)
3891 # Gather file lists
3892 (files_all, _, files_mc, files_vm) = \
3893 _ComputeAncillaryFiles(cluster, True)
3895 # Never re-distribute configuration file from here
3896 assert not (constants.CLUSTER_CONF_FILE in files_all or
3897 constants.CLUSTER_CONF_FILE in files_vm)
3898 assert not files_mc, "Master candidates not handled in this function"
3900 filemap = [
3901 (online_nodes, files_all),
3902 (vm_nodes, files_vm),
3903 ]
3905 # Upload the files
3906 for (node_list, files) in filemap:
3907 for fname in files:
3908 _UploadHelper(lu, node_list, fname)
3911 class LUClusterRedistConf(NoHooksLU):
3912 """Force the redistribution of cluster configuration.
3914 This is a very simple LU.
3916 """
3917 REQ_BGL = False
3919 def ExpandNames(self):
3920 self.needed_locks = {
3921 locking.LEVEL_NODE: locking.ALL_SET,
3922 }
3923 self.share_locks[locking.LEVEL_NODE] = 1
3925 def Exec(self, feedback_fn):
3926 """Redistribute the configuration.
3929 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3930 _RedistributeAncillaryFiles(self)
3933 class LUClusterActivateMasterIp(NoHooksLU):
3934 """Activate the master IP on the master node.
3937 def Exec(self, feedback_fn):
3938 """Activate the master IP.
3941 master_params = self.cfg.GetMasterNetworkParameters()
3942 ems = self.cfg.GetUseExternalMipScript()
3943 self.rpc.call_node_activate_master_ip(master_params.name,
3947 class LUClusterDeactivateMasterIp(NoHooksLU):
3948 """Deactivate the master IP on the master node.
3951 def Exec(self, feedback_fn):
3952 """Deactivate the master IP.
3955 master_params = self.cfg.GetMasterNetworkParameters()
3956 ems = self.cfg.GetUseExternalMipScript()
3957 self.rpc.call_node_deactivate_master_ip(master_params.name, master_params,
3961 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3962 """Sleep and poll for an instance's disk to sync.
3964 """
3965 if not instance.disks or disks is not None and not disks:
3966 return True
3968 disks = _ExpandCheckDisks(instance, disks)
3970 if not oneshot:
3971 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3973 node = instance.primary_node
3975 for dev in disks:
3976 lu.cfg.SetDiskID(dev, node)
3978 # TODO: Convert to utils.Retry
3980 retries = 0
3981 degr_retries = 10 # in seconds, as we sleep 1 second each time
3982 while True:
3983 max_time = 0
3984 done = True
3985 cumul_degraded = False
3986 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3987 msg = rstats.fail_msg
3988 if msg:
3989 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3990 retries += 1
3991 if retries >= 10:
3992 raise errors.RemoteError("Can't contact node %s for mirror data,"
3993 " aborting." % node)
3994 time.sleep(6)
3995 continue
3996 rstats = rstats.payload
3998 for i, mstat in enumerate(rstats):
3999 if mstat is None:
4000 lu.LogWarning("Can't compute data for node %s/%s",
4001 node, disks[i].iv_name)
4002 continue
4004 cumul_degraded = (cumul_degraded or
4005 (mstat.is_degraded and mstat.sync_percent is None))
4006 if mstat.sync_percent is not None:
4007 done = False
4008 if mstat.estimated_time is not None:
4009 rem_time = ("%s remaining (estimated)" %
4010 utils.FormatSeconds(mstat.estimated_time))
4011 max_time = mstat.estimated_time
4012 else:
4013 rem_time = "no time estimate"
4014 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
4015 (disks[i].iv_name, mstat.sync_percent, rem_time))
4017 # if we're done but degraded, let's do a few small retries, to
4018 # make sure we see a stable and not transient situation; therefore
4019 # we force restart of the loop
4020 if (done or oneshot) and cumul_degraded and degr_retries > 0:
4021 logging.info("Degraded disks found, %d retries left", degr_retries)
4022 degr_retries -= 1
4023 time.sleep(1)
4024 continue
4026 if done or oneshot:
4027 break
4029 time.sleep(min(60, max_time))
4031 if done:
4032 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
4033 return not cumul_degraded
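# Hedged sketch of the utils.Retry conversion suggested by the TODO above;
# the predicate name is hypothetical and the utils.Retry/utils.RetryAgain
# signature is assumed, so this shows the shape of the change, not a drop-in:
#
#   def _CheckAllSynced():
#     if _StillSyncing():  # hypothetical wrapper around the loop body
#       raise utils.RetryAgain()
#   utils.Retry(_CheckAllSynced, 1.0, max_time)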
4036 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
4037 """Check that mirrors are not degraded.
4039 The ldisk parameter, if True, will change the test from the
4040 is_degraded attribute (which represents overall non-ok status for
4041 the device(s)) to the ldisk (representing the local storage status).
4043 """
4044 lu.cfg.SetDiskID(dev, node)
4046 result = True
4048 if on_primary or dev.AssembleOnSecondary():
4049 rstats = lu.rpc.call_blockdev_find(node, dev)
4050 msg = rstats.fail_msg
4051 if msg:
4052 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
4053 result = False
4054 elif not rstats.payload:
4055 lu.LogWarning("Can't find disk on node %s", node)
4056 result = False
4057 else:
4058 if ldisk:
4059 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
4060 else:
4061 result = result and not rstats.payload.is_degraded
4063 if dev.children:
4064 for child in dev.children:
4065 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
4067 return result
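# Hedged illustration of the two notions of consistency selected by the
# ldisk flag (see the docstring above):
#   _CheckDiskConsistency(lu, dev, node, True)              # overall status
#   _CheckDiskConsistency(lu, dev, node, True, ldisk=True)  # local storage only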
4070 class LUOobCommand(NoHooksLU):
4071 """Logical unit for OOB handling.
4075 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
4077 def ExpandNames(self):
4078 """Gather locks we need.
4081 if self.op.node_names:
4082 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
4083 lock_names = self.op.node_names
4085 lock_names = locking.ALL_SET
4087 self.needed_locks = {
4088 locking.LEVEL_NODE: lock_names,
4091 def CheckPrereq(self):
4092 """Check prerequisites.
4095 - the node exists in the configuration
4098 Any errors are signaled by raising errors.OpPrereqError.
4102 self.master_node = self.cfg.GetMasterNode()
4104 assert self.op.power_delay >= 0.0
4106 if self.op.node_names:
4107 if (self.op.command in self._SKIP_MASTER and
4108 self.master_node in self.op.node_names):
4109 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
4110 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
4112 if master_oob_handler:
4113 additional_text = ("run '%s %s %s' if you want to operate on the"
4114 " master regardless") % (master_oob_handler,
4118 additional_text = "it does not support out-of-band operations"
4120 raise errors.OpPrereqError(("Operating on the master node %s is not"
4121 " allowed for %s; %s") %
4122 (self.master_node, self.op.command,
4123 additional_text), errors.ECODE_INVAL)
4125 self.op.node_names = self.cfg.GetNodeList()
4126 if self.op.command in self._SKIP_MASTER:
4127 self.op.node_names.remove(self.master_node)
4129 if self.op.command in self._SKIP_MASTER:
4130 assert self.master_node not in self.op.node_names
4132 for (node_name, node) in self.cfg.GetMultiNodeInfo(self.op.node_names):
4133 if node is None:
4134 raise errors.OpPrereqError("Node %s not found" % node_name,
4135 errors.ECODE_NOENT)
4136 else:
4137 self.nodes.append(node)
4139 if (not self.op.ignore_status and
4140 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
4141 raise errors.OpPrereqError(("Cannot power off node %s because it is"
4142 " not marked offline") % node_name,
4143 errors.ECODE_STATE)
4145 def Exec(self, feedback_fn):
4146 """Execute OOB and return result if we expect any.
4149 master_node = self.master_node
4152 for idx, node in enumerate(utils.NiceSort(self.nodes,
4153 key=lambda node: node.name)):
4154 node_entry = [(constants.RS_NORMAL, node.name)]
4155 ret.append(node_entry)
4157 oob_program = _SupportsOob(self.cfg, node)
4159 if not oob_program:
4160 node_entry.append((constants.RS_UNAVAIL, None))
4161 continue
4163 logging.info("Executing out-of-band command '%s' using '%s' on %s",
4164 self.op.command, oob_program, node.name)
4165 result = self.rpc.call_run_oob(master_node, oob_program,
4166 self.op.command, node.name,
4167 self.op.timeout)
4169 if result.fail_msg:
4170 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
4171 node.name, result.fail_msg)
4172 node_entry.append((constants.RS_NODATA, None))
4173 else:
4174 try:
4175 self._CheckPayload(result)
4176 except errors.OpExecError, err:
4177 self.LogWarning("Payload returned by node '%s' is not valid: %s",
4178 node.name, err)
4179 node_entry.append((constants.RS_NODATA, None))
4181 if self.op.command == constants.OOB_HEALTH:
4182 # For health we should log important events
4183 for item, status in result.payload:
4184 if status in [constants.OOB_STATUS_WARNING,
4185 constants.OOB_STATUS_CRITICAL]:
4186 self.LogWarning("Item '%s' on node '%s' has status '%s'",
4187 item, node.name, status)
4189 if self.op.command == constants.OOB_POWER_ON:
4190 node.powered = True
4191 elif self.op.command == constants.OOB_POWER_OFF:
4192 node.powered = False
4193 elif self.op.command == constants.OOB_POWER_STATUS:
4194 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
4195 if powered != node.powered:
4196 logging.warning(("Recorded power state (%s) of node '%s' does not"
4197 " match actual power state (%s)"), node.powered,
4200 # For configuration changing commands we should update the node
4201 if self.op.command in (constants.OOB_POWER_ON,
4202 constants.OOB_POWER_OFF):
4203 self.cfg.Update(node, feedback_fn)
4205 node_entry.append((constants.RS_NORMAL, result.payload))
4207 if (self.op.command == constants.OOB_POWER_ON and
4208 idx < len(self.nodes) - 1):
4209 time.sleep(self.op.power_delay)
4211 return ret
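# Illustrative timing (values hypothetical): with power_delay=2.0 and three
# nodes, the power-on RPCs above are spaced roughly two seconds apart; no
# delay is added after the last node.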
4213 def _CheckPayload(self, result):
4214 """Checks if the payload is valid.
4216 @param result: RPC result
4217 @raises errors.OpExecError: If payload is not valid
4219 """
4220 errs = []
4221 if self.op.command == constants.OOB_HEALTH:
4222 if not isinstance(result.payload, list):
4223 errs.append("command 'health' is expected to return a list but got %s" %
4224 type(result.payload))
4225 else:
4226 for item, status in result.payload:
4227 if status not in constants.OOB_STATUSES:
4228 errs.append("health item '%s' has invalid status '%s'" %
4229 (item, status))
4231 if self.op.command == constants.OOB_POWER_STATUS:
4232 if not isinstance(result.payload, dict):
4233 errs.append("power-status is expected to return a dict but got %s" %
4234 type(result.payload))
4236 if self.op.command in [
4237 constants.OOB_POWER_ON,
4238 constants.OOB_POWER_OFF,
4239 constants.OOB_POWER_CYCLE,
4240 ]:
4241 if result.payload is not None:
4242 errs.append("%s is expected to not return payload but got '%s'" %
4243 (self.op.command, result.payload))
4245 if errs:
4246 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
4247 utils.CommaJoin(errs))
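# Hedged examples of payloads accepted by the checks above (shapes follow
# from the isinstance/iteration checks; exact key names live in constants):
#   health       -> [["disk0", "OK"], ["psu1", "WARNING"]]
#   power-status -> {constants.OOB_POWER_STATUS_POWERED: True}
#   power-on/off/cycle -> None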
4250 class _OsQuery(_QueryBase):
4251 FIELDS = query.OS_FIELDS
4253 def ExpandNames(self, lu):
4254 # Lock all nodes in shared mode
4255 # Temporary removal of locks, should be reverted later
4256 # TODO: reintroduce locks when they are lighter-weight
4257 lu.needed_locks = {}
4258 #self.share_locks[locking.LEVEL_NODE] = 1
4259 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4261 # The following variables interact with _QueryBase._GetNames
4262 if self.names:
4263 self.wanted = self.names
4264 else:
4265 self.wanted = locking.ALL_SET
4267 self.do_locking = self.use_locking
4269 def DeclareLocks(self, lu, level):
4270 pass
4272 @staticmethod
4273 def _DiagnoseByOS(rlist):
4274 """Remaps a per-node return list into an a per-os per-node dictionary
4276 @param rlist: a map with node names as keys and OS objects as values
4279 @return: a dictionary with osnames as keys and as value another
4280 map, with nodes as keys and tuples of (path, status, diagnose,
4281 variants, parameters, api_versions) as values, eg::
4283 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
4284 (/srv/..., False, "invalid api")],
4285 "node2": [(/srv/..., True, "", [], [])]}
4290 # we build here the list of nodes that didn't fail the RPC (at RPC
4291 # level), so that nodes with a non-responding node daemon don't
4292 # make all OSes invalid
4293 good_nodes = [node_name for node_name in rlist
4294 if not rlist[node_name].fail_msg]
4295 for node_name, nr in rlist.items():
4296 if nr.fail_msg or not nr.payload:
4297 continue
4298 for (name, path, status, diagnose, variants,
4299 params, api_versions) in nr.payload:
4300 if name not in all_os:
4301 # build a list of nodes for this os containing empty lists
4302 # for each node in node_list
4303 all_os[name] = {}
4304 for nname in good_nodes:
4305 all_os[name][nname] = []
4306 # convert params from [name, help] to (name, help)
4307 params = [tuple(v) for v in params]
4308 all_os[name][node_name].append((path, status, diagnose,
4309 variants, params, api_versions))
4311 return all_os
4312 def _GetQueryData(self, lu):
4313 """Computes the list of nodes and their attributes.
4316 # Locking is not used
4317 assert not (compat.any(lu.glm.is_owned(level)
4318 for level in locking.LEVELS
4319 if level != locking.LEVEL_CLUSTER) or
4320 self.do_locking or self.use_locking)
4322 valid_nodes = [node.name
4323 for node in lu.cfg.GetAllNodesInfo().values()
4324 if not node.offline and node.vm_capable]
4325 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
4326 cluster = lu.cfg.GetClusterInfo()
4328 data = {}
4330 for (os_name, os_data) in pol.items():
4331 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
4332 hidden=(os_name in cluster.hidden_os),
4333 blacklisted=(os_name in cluster.blacklisted_os))
4335 variants = set()
4336 parameters = set()
4337 api_versions = set()
4339 for idx, osl in enumerate(os_data.values()):
4340 info.valid = bool(info.valid and osl and osl[0][1])
4341 if not info.valid:
4342 break
4344 (node_variants, node_params, node_api) = osl[0][3:6]
4345 if idx == 0:
4346 # First entry
4347 variants.update(node_variants)
4348 parameters.update(node_params)
4349 api_versions.update(node_api)
4350 else:
4351 # Filter out inconsistent values
4352 variants.intersection_update(node_variants)
4353 parameters.intersection_update(node_params)
4354 api_versions.intersection_update(node_api)
4356 info.variants = list(variants)
4357 info.parameters = list(parameters)
4358 info.api_versions = list(api_versions)
4360 data[os_name] = info
4362 # Prepare data in requested order
4363 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
4364 if name in data]
4367 class LUOsDiagnose(NoHooksLU):
4368 """Logical unit for OS diagnose/query.
4374 def _BuildFilter(fields, names):
4375 """Builds a filter for querying OSes.
4378 name_filter = qlang.MakeSimpleFilter("name", names)
4380 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
4381 # respective field is not requested
4382 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
4383 for fname in ["hidden", "blacklisted"]
4384 if fname not in fields]
4385 if "valid" not in fields:
4386 status_filter.append([qlang.OP_TRUE, "valid"])
4388 if status_filter:
4389 status_filter.insert(0, qlang.OP_AND)
4390 else:
4391 status_filter = None
4393 if name_filter and status_filter:
4394 return [qlang.OP_AND, name_filter, status_filter]
4395 elif name_filter:
4396 return name_filter
4397 else:
4398 return status_filter
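# Hedged example of the filter shape built above, assuming the usual qlang
# operator literals (OP_AND == "&", OP_OR == "|", OP_NOT == "!",
# OP_TRUE == "?"): for fields=["name"] and names=["lenny-image"] this
# returns roughly
#   ["&", ["|", ["=", "name", "lenny-image"]],
#         ["&", ["!", ["?", "hidden"]], ["!", ["?", "blacklisted"]],
#               ["?", "valid"]]]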
4400 def CheckArguments(self):
4401 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
4402 self.op.output_fields, False)
4404 def ExpandNames(self):
4405 self.oq.ExpandNames(self)
4407 def Exec(self, feedback_fn):
4408 return self.oq.OldStyleQuery(self)
4411 class LUNodeRemove(LogicalUnit):
4412 """Logical unit for removing a node.
4415 HPATH = "node-remove"
4416 HTYPE = constants.HTYPE_NODE
4418 def BuildHooksEnv(self):
4419 """Build hooks env.
4421 This doesn't run on the target node in the pre phase as a failed
4422 node would then be impossible to remove.
4424 """
4425 return {
4426 "OP_TARGET": self.op.node_name,
4427 "NODE_NAME": self.op.node_name,
4428 }
4430 def BuildHooksNodes(self):
4431 """Build hooks nodes.
4434 all_nodes = self.cfg.GetNodeList()
4436 all_nodes.remove(self.op.node_name)
4438 logging.warning("Node '%s', which is about to be removed, was not found"
4439 " in the list of all nodes", self.op.node_name)
4440 return (all_nodes, all_nodes)
4442 def CheckPrereq(self):
4443 """Check prerequisites.
4446 - the node exists in the configuration
4447 - it does not have primary or secondary instances
4448 - it's not the master
4450 Any errors are signaled by raising errors.OpPrereqError.
4453 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4454 node = self.cfg.GetNodeInfo(self.op.node_name)
4455 assert node is not None
4457 masternode = self.cfg.GetMasterNode()
4458 if node.name == masternode:
4459 raise errors.OpPrereqError("Node is the master node, failover to another"
4460 " node is required", errors.ECODE_INVAL)
4462 for instance_name, instance in self.cfg.GetAllInstancesInfo():
4463 if node.name in instance.all_nodes:
4464 raise errors.OpPrereqError("Instance %s is still running on the node,"
4465 " please remove first" % instance_name,
4467 self.op.node_name = node.name
4470 def Exec(self, feedback_fn):
4471 """Removes the node from the cluster.
4475 logging.info("Stopping the node daemon and removing configs from node %s",
4478 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
4480 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
4483 # Promote nodes to master candidate as needed
4484 _AdjustCandidatePool(self, exceptions=[node.name])
4485 self.context.RemoveNode(node.name)
4487 # Run post hooks on the node before it's removed
4488 _RunPostHook(self, node.name)
4490 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
4491 msg = result.fail_msg
4492 if msg:
4493 self.LogWarning("Errors encountered on the remote node while leaving"
4494 " the cluster: %s", msg)
4496 # Remove node from our /etc/hosts
4497 if self.cfg.GetClusterInfo().modify_etc_hosts:
4498 master_node = self.cfg.GetMasterNode()
4499 result = self.rpc.call_etc_hosts_modify(master_node,
4500 constants.ETC_HOSTS_REMOVE,
4501 node.name, None)
4502 result.Raise("Can't update hosts file with new host data")
4503 _RedistributeAncillaryFiles(self)
4506 class _NodeQuery(_QueryBase):
4507 FIELDS = query.NODE_FIELDS
4509 def ExpandNames(self, lu):
4510 lu.needed_locks = {}
4511 lu.share_locks = _ShareAll()
4513 if self.names:
4514 self.wanted = _GetWantedNodes(lu, self.names)
4515 else:
4516 self.wanted = locking.ALL_SET
4518 self.do_locking = (self.use_locking and
4519 query.NQ_LIVE in self.requested_data)
4521 if self.do_locking:
4522 # If any non-static field is requested we need to lock the nodes
4523 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
4525 def DeclareLocks(self, lu, level):
4526 pass
4528 def _GetQueryData(self, lu):
4529 """Computes the list of nodes and their attributes.
4532 all_info = lu.cfg.GetAllNodesInfo()
4534 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
4536 # Gather data as requested
4537 if query.NQ_LIVE in self.requested_data:
4538 # filter out non-vm_capable nodes
4539 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
4541 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
4542 lu.cfg.GetHypervisorType())
4543 live_data = dict((name, nresult.payload)
4544 for (name, nresult) in node_data.items()
4545 if not nresult.fail_msg and nresult.payload)
4546 else:
4547 live_data = None
4549 if query.NQ_INST in self.requested_data:
4550 node_to_primary = dict([(name, set()) for name in nodenames])
4551 node_to_secondary = dict([(name, set()) for name in nodenames])
4553 inst_data = lu.cfg.GetAllInstancesInfo()
4555 for inst in inst_data.values():
4556 if inst.primary_node in node_to_primary:
4557 node_to_primary[inst.primary_node].add(inst.name)
4558 for secnode in inst.secondary_nodes:
4559 if secnode in node_to_secondary:
4560 node_to_secondary[secnode].add(inst.name)
4561 else:
4562 node_to_primary = None
4563 node_to_secondary = None
4565 if query.NQ_OOB in self.requested_data:
4566 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
4567 for name, node in all_info.iteritems())
4568 else:
4569 oob_support = None
4571 if query.NQ_GROUP in self.requested_data:
4572 groups = lu.cfg.GetAllNodeGroupsInfo()
4573 else:
4574 groups = {}
4576 return query.NodeQueryData([all_info[name] for name in nodenames],
4577 live_data, lu.cfg.GetMasterNode(),
4578 node_to_primary, node_to_secondary, groups,
4579 oob_support, lu.cfg.GetClusterInfo())
4582 class LUNodeQuery(NoHooksLU):
4583 """Logical unit for querying nodes.
4586 # pylint: disable=W0142
4589 def CheckArguments(self):
4590 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
4591 self.op.output_fields, self.op.use_locking)
4593 def ExpandNames(self):
4594 self.nq.ExpandNames(self)
4596 def DeclareLocks(self, level):
4597 self.nq.DeclareLocks(self, level)
4599 def Exec(self, feedback_fn):
4600 return self.nq.OldStyleQuery(self)
4603 class LUNodeQueryvols(NoHooksLU):
4604 """Logical unit for getting volumes on node(s).
4608 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
4609 _FIELDS_STATIC = utils.FieldSet("node")
4611 def CheckArguments(self):
4612 _CheckOutputFields(static=self._FIELDS_STATIC,
4613 dynamic=self._FIELDS_DYNAMIC,
4614 selected=self.op.output_fields)
4616 def ExpandNames(self):
4617 self.share_locks = _ShareAll()
4618 self.needed_locks = {}
4620 if not self.op.nodes:
4621 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4622 else:
4623 self.needed_locks[locking.LEVEL_NODE] = \
4624 _GetWantedNodes(self, self.op.nodes)
4626 def Exec(self, feedback_fn):
4627 """Computes the list of nodes and their attributes.
4630 nodenames = self.owned_locks(locking.LEVEL_NODE)
4631 volumes = self.rpc.call_node_volumes(nodenames)
4633 ilist = self.cfg.GetAllInstancesInfo()
4634 vol2inst = _MapInstanceDisksToNodes(ilist.values())
4636 output = []
4637 for node in nodenames:
4638 nresult = volumes[node]
4639 if nresult.offline:
4640 continue
4641 msg = nresult.fail_msg
4642 if msg:
4643 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4644 continue
4646 node_vols = sorted(nresult.payload,
4647 key=operator.itemgetter("dev"))
4649 for vol in node_vols:
4650 node_output = []
4651 for field in self.op.output_fields:
4652 if field == "node":
4653 val = node
4654 elif field == "phys":
4655 val = vol["dev"]
4656 elif field == "vg":
4657 val = vol["vg"]
4658 elif field == "name":
4659 val = vol["name"]
4660 elif field == "size":
4661 val = int(float(vol["size"]))
4662 elif field == "instance":
4663 val = vol2inst.get((node, vol["vg"] + "/" + vol["name"]), "-")
4664 else:
4665 raise errors.ParameterError(field)
4666 node_output.append(str(val))
4668 output.append(node_output)
4670 return output
4673 class LUNodeQueryStorage(NoHooksLU):
4674 """Logical unit for getting information on storage units on node(s).
4677 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4680 def CheckArguments(self):
4681 _CheckOutputFields(static=self._FIELDS_STATIC,
4682 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4683 selected=self.op.output_fields)
4685 def ExpandNames(self):
4686 self.share_locks = _ShareAll()
4687 self.needed_locks = {}
4689 if self.op.nodes:
4690 self.needed_locks[locking.LEVEL_NODE] = \
4691 _GetWantedNodes(self, self.op.nodes)
4692 else:
4693 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4695 def Exec(self, feedback_fn):
4696 """Computes the list of nodes and their attributes.
4699 self.nodes = self.owned_locks(locking.LEVEL_NODE)
4701 # Always get name to sort by
4702 if constants.SF_NAME in self.op.output_fields:
4703 fields = self.op.output_fields[:]
4704 else:
4705 fields = [constants.SF_NAME] + self.op.output_fields
4707 # Never ask for node or type as it's only known to the LU
4708 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4709 while extra in fields:
4710 fields.remove(extra)
4712 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4713 name_idx = field_idx[constants.SF_NAME]
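# Illustrative mapping (hypothetical fields): fields == ["name", "size",
# "used"] yields field_idx == {"name": 0, "size": 1, "used": 2} and, since
# constants.SF_NAME is the name field, name_idx == 0.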
4715 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4716 data = self.rpc.call_storage_list(self.nodes,
4717 self.op.storage_type, st_args,
4718 self.op.name, fields)
4720 result = []
4722 for node in utils.NiceSort(self.nodes):
4723 nresult = data[node]
4724 if nresult.offline:
4725 continue
4727 msg = nresult.fail_msg
4728 if msg:
4729 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4730 continue
4732 rows = dict([(row[name_idx], row) for row in nresult.payload])
4734 for name in utils.NiceSort(rows.keys()):
4735 row = rows[name]
4737 out = []
4739 for field in self.op.output_fields:
4740 if field == constants.SF_NODE:
4741 val = node
4742 elif field == constants.SF_TYPE:
4743 val = self.op.storage_type
4744 elif field in field_idx:
4745 val = row[field_idx[field]]
4746 else:
4747 raise errors.ParameterError(field)
4749 out.append(val)
4751 result.append(out)
4753 return result
4756 class _InstanceQuery(_QueryBase):
4757 FIELDS = query.INSTANCE_FIELDS
4759 def ExpandNames(self, lu):
4760 lu.needed_locks = {}
4761 lu.share_locks = _ShareAll()
4763 if self.names:
4764 self.wanted = _GetWantedInstances(lu, self.names)
4765 else:
4766 self.wanted = locking.ALL_SET
4768 self.do_locking = (self.use_locking and
4769 query.IQ_LIVE in self.requested_data)
4770 if self.do_locking:
4771 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4772 lu.needed_locks[locking.LEVEL_NODEGROUP] = []
4773 lu.needed_locks[locking.LEVEL_NODE] = []
4774 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4776 self.do_grouplocks = (self.do_locking and
4777 query.IQ_NODES in self.requested_data)
4779 def DeclareLocks(self, lu, level):
4780 if self.do_locking:
4781 if level == locking.LEVEL_NODEGROUP and self.do_grouplocks:
4782 assert not lu.needed_locks[locking.LEVEL_NODEGROUP]
4784 # Lock all groups used by instances optimistically; this requires going
4785 # via the node before it's locked, requiring verification later on
4786 lu.needed_locks[locking.LEVEL_NODEGROUP] = \
4787 set(group_uuid
4788 for instance_name in lu.owned_locks(locking.LEVEL_INSTANCE)
4789 for group_uuid in lu.cfg.GetInstanceNodeGroups(instance_name))
4790 elif level == locking.LEVEL_NODE:
4791 lu._LockInstancesNodes() # pylint: disable=W0212
4793 @staticmethod
4794 def _CheckGroupLocks(lu):
4795 owned_instances = frozenset(lu.owned_locks(locking.LEVEL_INSTANCE))
4796 owned_groups = frozenset(lu.owned_locks(locking.LEVEL_NODEGROUP))
4798 # Check if node groups for locked instances are still correct
4799 for instance_name in owned_instances:
4800 _CheckInstanceNodeGroups(lu.cfg, instance_name, owned_groups)
4802 def _GetQueryData(self, lu):
4803 """Computes the list of instances and their attributes.
4806 if self.do_grouplocks:
4807 self._CheckGroupLocks(lu)
4809 cluster = lu.cfg.GetClusterInfo()
4810 all_info = lu.cfg.GetAllInstancesInfo()
4812 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4814 instance_list = [all_info[name] for name in instance_names]
4815 nodes = frozenset(itertools.chain(*(inst.all_nodes
4816 for inst in instance_list)))
4817 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4818 bad_nodes = []
4819 offline_nodes = []
4820 wrongnode_inst = set()
4822 # Gather data as requested
4823 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4824 live_data = {}
4825 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4826 for name in nodes:
4827 result = node_data[name]
4828 if result.offline:
4829 # offline nodes will be in both lists
4830 assert result.fail_msg
4831 offline_nodes.append(name)
4832 if result.fail_msg:
4833 bad_nodes.append(name)
4834 elif result.payload:
4835 for inst in result.payload:
4836 if inst in all_info:
4837 if all_info[inst].primary_node == name:
4838 live_data.update(result.payload)
4839 else:
4840 wrongnode_inst.add(inst)
4841 else:
4842 # orphan instance; we don't list it here as we don't
4843 # handle this case yet in the output of instance listing
4844 logging.warning("Orphan instance '%s' found on node %s",
4845 inst, name)
4846 # else no instance is alive
4847 else:
4848 live_data = {}
4850 if query.IQ_DISKUSAGE in self.requested_data:
4851 disk_usage = dict((inst.name,
4852 _ComputeDiskSize(inst.disk_template,
4853 [{constants.IDISK_SIZE: disk.size}
4854 for disk in inst.disks]))
4855 for inst in instance_list)
4856 else:
4857 disk_usage = None
4859 if query.IQ_CONSOLE in self.requested_data:
4860 consinfo = {}
4861 for inst in instance_list:
4862 if inst.name in live_data:
4863 # Instance is running
4864 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4865 else:
4866 consinfo[inst.name] = None
4867 assert set(consinfo.keys()) == set(instance_names)
4868 else:
4869 consinfo = None
4871 if query.IQ_NODES in self.requested_data:
4872 node_names = set(itertools.chain(*map(operator.attrgetter("all_nodes"),
4873 instance_list)))
4874 nodes = dict(lu.cfg.GetMultiNodeInfo(node_names))
4875 groups = dict((uuid, lu.cfg.GetNodeGroup(uuid))
4876 for uuid in set(map(operator.attrgetter("group"),
4877 nodes.values())))
4878 else:
4879 nodes = None
4880 groups = None
4882 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4883 disk_usage, offline_nodes, bad_nodes,
4884 live_data, wrongnode_inst, consinfo,
4885 nodes, groups)
4888 class LUQuery(NoHooksLU):
4889 """Query for resources/items of a certain kind.
4892 # pylint: disable=W0142
4895 def CheckArguments(self):
4896 qcls = _GetQueryImplementation(self.op.what)
4898 self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
4900 def ExpandNames(self):
4901 self.impl.ExpandNames(self)
4903 def DeclareLocks(self, level):
4904 self.impl.DeclareLocks(self, level)
4906 def Exec(self, feedback_fn):
4907 return self.impl.NewStyleQuery(self)
4910 class LUQueryFields(NoHooksLU):
4911 """Query for resources/items of a certain kind.
4914 # pylint: disable=W0142
4917 def CheckArguments(self):
4918 self.qcls = _GetQueryImplementation(self.op.what)
4920 def ExpandNames(self):
4921 self.needed_locks = {}
4923 def Exec(self, feedback_fn):
4924 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4927 class LUNodeModifyStorage(NoHooksLU):
4928 """Logical unit for modifying a storage volume on a node.
4933 def CheckArguments(self):
4934 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4936 storage_type = self.op.storage_type
4939 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4941 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4942 " modified" % storage_type,
4945 diff = set(self.op.changes.keys()) - modifiable
4947 raise errors.OpPrereqError("The following fields can not be modified for"
4948 " storage units of type '%s': %r" %
4949 (storage_type, list(diff)),
4952 def ExpandNames(self):
4953 self.needed_locks = {
4954 locking.LEVEL_NODE: self.op.node_name,
4955 }
4957 def Exec(self, feedback_fn):
4958 """Computes the list of nodes and their attributes.
4961 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4962 result = self.rpc.call_storage_modify(self.op.node_name,
4963 self.op.storage_type, st_args,
4964 self.op.name, self.op.changes)
4965 result.Raise("Failed to modify storage unit '%s' on %s" %
4966 (self.op.name, self.op.node_name))
4969 class LUNodeAdd(LogicalUnit):
4970 """Logical unit for adding node to the cluster.
4974 HTYPE = constants.HTYPE_NODE
4975 _NFLAGS = ["master_capable", "vm_capable"]
4977 def CheckArguments(self):
4978 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4979 # validate/normalize the node name
4980 self.hostname = netutils.GetHostname(name=self.op.node_name,
4981 family=self.primary_ip_family)
4982 self.op.node_name = self.hostname.name
4984 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4985 raise errors.OpPrereqError("Cannot readd the master node",
4986 errors.ECODE_STATE)
4988 if self.op.readd and self.op.group:
4989 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4990 " being readded", errors.ECODE_INVAL)
4992 def BuildHooksEnv(self):
4993 """Build hooks env.
4995 This will run on all nodes before, and on all nodes + the new node after.
4997 """
4998 return {
4999 "OP_TARGET": self.op.node_name,
5000 "NODE_NAME": self.op.node_name,
5001 "NODE_PIP": self.op.primary_ip,
5002 "NODE_SIP": self.op.secondary_ip,
5003 "MASTER_CAPABLE": str(self.op.master_capable),
5004 "VM_CAPABLE": str(self.op.vm_capable),
5005 }
5007 def BuildHooksNodes(self):
5008 """Build hooks nodes.
5011 # Exclude added node
5012 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
5013 post_nodes = pre_nodes + [self.op.node_name, ]
5015 return (pre_nodes, post_nodes)
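# Illustrative result with hypothetical names: adding "node4" to a cluster
# of node1..node3 gives pre_nodes == ["node1", "node2", "node3"] and
# post_nodes == ["node1", "node2", "node3", "node4"].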
5017 def CheckPrereq(self):
5018 """Check prerequisites.
5021 - the new node is not already in the config
5023 - its parameters (single/dual homed) matches the cluster
5025 Any errors are signaled by raising errors.OpPrereqError.
5029 hostname = self.hostname
5030 node = hostname.name
5031 primary_ip = self.op.primary_ip = hostname.ip
5032 if self.op.secondary_ip is None:
5033 if self.primary_ip_family == netutils.IP6Address.family:
5034 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
5035 " IPv4 address must be given as secondary",
5037 self.op.secondary_ip = primary_ip
5039 secondary_ip = self.op.secondary_ip
5040 if not netutils.IP4Address.IsValid(secondary_ip):
5041 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5042 " address" % secondary_ip, errors.ECODE_INVAL)
5044 node_list = cfg.GetNodeList()
5045 if not self.op.readd and node in node_list:
5046 raise errors.OpPrereqError("Node %s is already in the configuration" %
5047 node, errors.ECODE_EXISTS)
5048 elif self.op.readd and node not in node_list:
5049 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
5050 errors.ECODE_NOENT)
5052 self.changed_primary_ip = False
5054 for existing_node_name, existing_node in cfg.GetMultiNodeInfo(node_list):
5055 if self.op.readd and node == existing_node_name:
5056 if existing_node.secondary_ip != secondary_ip:
5057 raise errors.OpPrereqError("Readded node doesn't have the same IP"
5058 " address configuration as before",
5060 if existing_node.primary_ip != primary_ip:
5061 self.changed_primary_ip = True
5065 if (existing_node.primary_ip == primary_ip or
5066 existing_node.secondary_ip == primary_ip or
5067 existing_node.primary_ip == secondary_ip or
5068 existing_node.secondary_ip == secondary_ip):
5069 raise errors.OpPrereqError("New node ip address(es) conflict with"
5070 " existing node %s" % existing_node.name,
5071 errors.ECODE_NOTUNIQUE)
5073 # After this 'if' block, None is no longer a valid value for the
5074 # _capable op attributes
5075 if self.op.readd:
5076 old_node = self.cfg.GetNodeInfo(node)
5077 assert old_node is not None, "Can't retrieve locked node %s" % node
5078 for attr in self._NFLAGS:
5079 if getattr(self.op, attr) is None:
5080 setattr(self.op, attr, getattr(old_node, attr))
5081 else:
5082 for attr in self._NFLAGS:
5083 if getattr(self.op, attr) is None:
5084 setattr(self.op, attr, True)
5086 if self.op.readd and not self.op.vm_capable:
5087 pri, sec = cfg.GetNodeInstances(node)
5088 if pri or sec:
5089 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
5090 " flag set to false, but it already holds"
5091 " instances" % node,
5092 errors.ECODE_STATE)
5094 # check that the type of the node (single versus dual homed) is the
5095 # same as for the master
5096 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
5097 master_singlehomed = myself.secondary_ip == myself.primary_ip
5098 newbie_singlehomed = secondary_ip == primary_ip
5099 if master_singlehomed != newbie_singlehomed:
5100 if master_singlehomed:
5101 raise errors.OpPrereqError("The master has no secondary ip but the"
5102 " new node has one",
5105 raise errors.OpPrereqError("The master has a secondary ip but the"
5106 " new node doesn't have one",
5109 # checks reachability
5110 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
5111 raise errors.OpPrereqError("Node not reachable by ping",
5112 errors.ECODE_ENVIRON)
5114 if not newbie_singlehomed:
5115 # check reachability from my secondary ip to newbie's secondary ip
5116 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
5117 source=myself.secondary_ip):
5118 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5119 " based ping to node daemon port",
5120 errors.ECODE_ENVIRON)
5122 if self.op.readd:
5123 exceptions = [node]
5124 else:
5125 exceptions = []
5127 if self.op.master_capable:
5128 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
5129 else:
5130 self.master_candidate = False
5132 if self.op.readd:
5133 self.new_node = old_node
5134 else:
5135 node_group = cfg.LookupNodeGroup(self.op.group)
5136 self.new_node = objects.Node(name=node,
5137 primary_ip=primary_ip,
5138 secondary_ip=secondary_ip,
5139 master_candidate=self.master_candidate,
5140 offline=False, drained=False,
5141 group=node_group)
5143 if self.op.ndparams:
5144 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
5146 def Exec(self, feedback_fn):
5147 """Adds the new node to the cluster.
5149 """
5150 new_node = self.new_node
5151 node = new_node.name
5153 assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
5154 "Not owning BGL"
5156 # We are adding a new node, so we assume it's powered
5157 new_node.powered = True
5159 # for re-adds, reset the offline/drained/master-candidate flags;
5160 # we need to reset here, otherwise offline would prevent RPC calls
5161 # later in the procedure; this also means that if the re-add
5162 # fails, we are left with a non-offlined, broken node
5163 if self.op.readd:
5164 new_node.drained = new_node.offline = False # pylint: disable=W0201
5165 self.LogInfo("Readding a node, the offline/drained flags were reset")
5166 # if we demote the node, we do cleanup later in the procedure
5167 new_node.master_candidate = self.master_candidate
5168 if self.changed_primary_ip:
5169 new_node.primary_ip = self.op.primary_ip
5171 # copy the master/vm_capable flags
5172 for attr in self._NFLAGS:
5173 setattr(new_node, attr, getattr(self.op, attr))
5175 # notify the user about any possible mc promotion
5176 if new_node.master_candidate:
5177 self.LogInfo("Node will be a master candidate")
5179 if self.op.ndparams:
5180 new_node.ndparams = self.op.ndparams
5181 else:
5182 new_node.ndparams = {}
5184 # check connectivity
5185 result = self.rpc.call_version([node])[node]
5186 result.Raise("Can't get version information from node %s" % node)
5187 if constants.PROTOCOL_VERSION == result.payload:
5188 logging.info("Communication to node %s fine, sw version %s match",
5189 node, result.payload)
5190 else:
5191 raise errors.OpExecError("Version mismatch master version %s,"
5192 " node version %s" %
5193 (constants.PROTOCOL_VERSION, result.payload))
5195 # Add node to our /etc/hosts, and add key to known_hosts
5196 if self.cfg.GetClusterInfo().modify_etc_hosts:
5197 master_node = self.cfg.GetMasterNode()
5198 result = self.rpc.call_etc_hosts_modify(master_node,
5199 constants.ETC_HOSTS_ADD,
5200 self.hostname.name,
5201 self.hostname.ip)
5202 result.Raise("Can't update hosts file with new host data")
5204 if new_node.secondary_ip != new_node.primary_ip:
5205 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
5206 False)
5208 node_verify_list = [self.cfg.GetMasterNode()]
5209 node_verify_param = {
5210 constants.NV_NODELIST: ([node], {}),
5211 # TODO: do a node-net-test as well?
5212 }
5214 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
5215 self.cfg.GetClusterName())
5216 for verifier in node_verify_list:
5217 result[verifier].Raise("Cannot communicate with node %s" % verifier)
5218 nl_payload = result[verifier].payload[constants.NV_NODELIST]
5219 if nl_payload:
5220 for failed in nl_payload:
5221 feedback_fn("ssh/hostname verification failed"
5222 " (checking from %s): %s" %
5223 (verifier, nl_payload[failed]))
5224 raise errors.OpExecError("ssh/hostname verification failed")
5226 if self.op.readd:
5227 _RedistributeAncillaryFiles(self)
5228 self.context.ReaddNode(new_node)
5229 # make sure we redistribute the config
5230 self.cfg.Update(new_node, feedback_fn)
5231 # and make sure the new node will not have old files around
5232 if not new_node.master_candidate:
5233 result = self.rpc.call_node_demote_from_mc(new_node.name)
5234 msg = result.fail_msg
5235 if msg:
5236 self.LogWarning("Node failed to demote itself from master"
5237 " candidate status: %s" % msg)
5238 else:
5239 _RedistributeAncillaryFiles(self, additional_nodes=[node],
5240 additional_vm=self.op.vm_capable)
5241 self.context.AddNode(new_node, self.proc.GetECId())
5244 class LUNodeSetParams(LogicalUnit):
5245 """Modifies the parameters of a node.
5247 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
5248 to the node role (as _ROLE_*)
5249 @cvar _R2F: a dictionary from node role to tuples of flags
5250 @cvar _FLAGS: a list of attribute names corresponding to the flags
5252 """
5253 HPATH = "node-modify"
5254 HTYPE = constants.HTYPE_NODE
5255 REQ_BGL = False
5256 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
5257 _F2R = {
5258 (True, False, False): _ROLE_CANDIDATE,
5259 (False, True, False): _ROLE_DRAINED,
5260 (False, False, True): _ROLE_OFFLINE,
5261 (False, False, False): _ROLE_REGULAR,
5262 }
5263 _R2F = dict((v, k) for k, v in _F2R.items())
5264 _FLAGS = ["master_candidate", "drained", "offline"]
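# Illustrative lookups that follow directly from the tables above:
#   _F2R[(False, True, False)] == _ROLE_DRAINED
#   _R2F[_ROLE_OFFLINE] == (False, False, True)
# i.e. exactly one of (master_candidate, drained, offline) may be True.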
5266 def CheckArguments(self):
5267 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5268 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
5269 self.op.master_capable, self.op.vm_capable,
5270 self.op.secondary_ip, self.op.ndparams]
5271 if all_mods.count(None) == len(all_mods):
5272 raise errors.OpPrereqError("Please pass at least one modification",
5273 errors.ECODE_INVAL)
5274 if all_mods.count(True) > 1:
5275 raise errors.OpPrereqError("Can't set the node into more than one"
5276 " state at the same time",
5277 errors.ECODE_INVAL)
5279 # Boolean value that tells us whether we might be demoting from MC
5280 self.might_demote = (self.op.master_candidate == False or
5281 self.op.offline == True or
5282 self.op.drained == True or
5283 self.op.master_capable == False)
5285 if self.op.secondary_ip:
5286 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
5287 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
5288 " address" % self.op.secondary_ip,
5291 self.lock_all = self.op.auto_promote and self.might_demote
5292 self.lock_instances = self.op.secondary_ip is not None
5294 def _InstanceFilter(self, instance):
5295 """Filter for getting affected instances.
5298 return (instance.disk_template in constants.DTS_INT_MIRROR and
5299 self.op.node_name in instance.all_nodes)
5301 def ExpandNames(self):
5302 if self.lock_all:
5303 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
5304 else:
5305 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
5307 if self.lock_instances:
5308 self.needed_locks[locking.LEVEL_INSTANCE] = \
5309 frozenset(self.cfg.GetInstancesInfoByFilter(self._InstanceFilter))
5311 def BuildHooksEnv(self):
5312 """Build hooks env.
5314 This runs on the master node.
5316 """
5317 return {
5318 "OP_TARGET": self.op.node_name,
5319 "MASTER_CANDIDATE": str(self.op.master_candidate),
5320 "OFFLINE": str(self.op.offline),
5321 "DRAINED": str(self.op.drained),
5322 "MASTER_CAPABLE": str(self.op.master_capable),
5323 "VM_CAPABLE": str(self.op.vm_capable),
5324 }
5326 def BuildHooksNodes(self):
5327 """Build hooks nodes.
5330 nl = [self.cfg.GetMasterNode(), self.op.node_name]
5333 def CheckPrereq(self):
5334 """Check prerequisites.
5336 This only checks the instance list against the existing names.
5338 """
5339 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
5341 if self.lock_instances:
5342 affected_instances = \
5343 self.cfg.GetInstancesInfoByFilter(self._InstanceFilter)
5345 # Verify instance locks
5346 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
5347 wanted_instances = frozenset(affected_instances.keys())
5348 if wanted_instances - owned_instances:
5349 raise errors.OpPrereqError("Instances affected by changing node %s's"
5350 " secondary IP address have changed since"
5351 " locks were acquired, wanted '%s', have"
5352 " '%s'; retry the operation" %
5354 utils.CommaJoin(wanted_instances),
5355 utils.CommaJoin(owned_instances)),
5358 affected_instances = None
5360 if (self.op.master_candidate is not None or
5361 self.op.drained is not None or
5362 self.op.offline is not None):
5363 # we can't change the master's node flags
5364 if self.op.node_name == self.cfg.GetMasterNode():
5365 raise errors.OpPrereqError("The master role can be changed"
5366 " only via master-failover",
5369 if self.op.master_candidate and not node.master_capable:
5370 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
5371 " it a master candidate" % node.name,
5374 if self.op.vm_capable == False:
5375 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
5377 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
5378 " the vm_capable flag" % node.name,
5381 if node.master_candidate and self.might_demote and not self.lock_all:
5382 assert not self.op.auto_promote, "auto_promote set but lock_all not"
5383 # check if after removing the current node, we're missing master
5384 # candidates
5385 (mc_remaining, mc_should, _) = \
5386 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
5387 if mc_remaining < mc_should:
5388 raise errors.OpPrereqError("Not enough master candidates, please"
5389 " pass auto promote option to allow"
5390 " promotion", errors.ECODE_STATE)
5392 self.old_flags = old_flags = (node.master_candidate,
5393 node.drained, node.offline)
5394 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
5395 self.old_role = old_role = self._F2R[old_flags]
5397 # Check for ineffective changes
5398 for attr in self._FLAGS:
5399 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
5400 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
5401 setattr(self.op, attr, None)
5403 # Past this point, any flag change to False means a transition
5404 # away from the respective state, as only real changes are kept
5406 # TODO: We might query the real power state if it supports OOB
5407 if _SupportsOob(self.cfg, node):
5408 if self.op.offline is False and not (node.powered or
5409 self.op.powered == True):
5410 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
5411 " offline status can be reset") %
5413 elif self.op.powered is not None:
5414 raise errors.OpPrereqError(("Unable to change powered state for node %s"
5415 " as it does not support out-of-band"
5416 " handling") % self.op.node_name)
5418 # If we're being deofflined/drained, we'll MC ourself if needed
5419 if (self.op.drained == False or self.op.offline == False or
5420 (self.op.master_capable and not node.master_capable)):
5421 if _DecideSelfPromotion(self):
5422 self.op.master_candidate = True
5423 self.LogInfo("Auto-promoting node to master candidate")
5425 # If we're no longer master capable, we'll demote ourselves from MC
5426 if self.op.master_capable == False and node.master_candidate:
5427 self.LogInfo("Demoting from master candidate")
5428 self.op.master_candidate = False
5431 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
5432 if self.op.master_candidate:
5433 new_role = self._ROLE_CANDIDATE
5434 elif self.op.drained:
5435 new_role = self._ROLE_DRAINED
5436 elif self.op.offline:
5437 new_role = self._ROLE_OFFLINE
5438 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
5439 # False is still in new flags, which means we're un-setting (the
5440 # logic behind _F2R)
5441 new_role = self._ROLE_REGULAR
5442 else: # no new flags, nothing, keep old role
5443 new_role = old_role
5445 self.new_role = new_role
5447 if old_role == self._ROLE_OFFLINE and new_role != old_role:
5448 # Trying to transition out of offline status
5449 # TODO: Use standard RPC runner, but make sure it works when the node is
5450 # still marked offline
5451 result = rpc.BootstrapRunner().call_version([node.name])[node.name]
5452 if result.fail_msg:
5453 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
5454 " to report its version: %s" %
5455 (node.name, result.fail_msg),
5456 errors.ECODE_ENVIRON)
5457 else:
5458 self.LogWarning("Transitioning node from offline to online state"
5459 " without using re-add. Please make sure the node"
5460 " is healthy!")
5462 if self.op.secondary_ip:
5463 # Ok even without locking, because this can't be changed by any LU
5464 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
5465 master_singlehomed = master.secondary_ip == master.primary_ip
5466 if master_singlehomed and self.op.secondary_ip:
5467 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
5468 " homed cluster", errors.ECODE_INVAL)
5470 assert not (frozenset(affected_instances) -
5471 self.owned_locks(locking.LEVEL_INSTANCE))
5473 if node.offline:
5474 if affected_instances:
5475 raise errors.OpPrereqError("Cannot change secondary IP address:"
5476 " offline node has instances (%s)"
5477 " configured to use it" %
5478 utils.CommaJoin(affected_instances.keys()))
5479 else:
5480 # On online nodes, check that no instances are running, and that
5481 # the node has the new ip and we can reach it.
5482 for instance in affected_instances.values():
5483 _CheckInstanceDown(self, instance, "cannot change secondary ip")
5485 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
5486 if master.name != node.name:
5487 # check reachability from master secondary ip to new secondary ip
5488 if not netutils.TcpPing(self.op.secondary_ip,
5489 constants.DEFAULT_NODED_PORT,
5490 source=master.secondary_ip):
5491 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
5492 " based ping to node daemon port",
5493 errors.ECODE_ENVIRON)
5495 if self.op.ndparams:
5496 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
5497 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
5498 self.new_ndparams = new_ndparams
5500 def Exec(self, feedback_fn):
5501 """Modifies a node.
5503 """
5504 node = self.node
5505 old_role = self.old_role
5506 new_role = self.new_role
5508 result = []
5510 if self.op.ndparams:
5511 node.ndparams = self.new_ndparams
5513 if self.op.powered is not None:
5514 node.powered = self.op.powered
5516 for attr in ["master_capable", "vm_capable"]:
5517 val = getattr(self.op, attr)
5518 if val is not None:
5519 setattr(node, attr, val)
5520 result.append((attr, str(val)))
5522 if new_role != old_role:
5523 # Tell the node to demote itself, if no longer MC and not offline
5524 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
5525 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
5526 if msg:
5527 self.LogWarning("Node failed to demote itself: %s", msg)
5529 new_flags = self._R2F[new_role]
5530 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
5531 if of != nf:
5532 result.append((desc, str(nf)))
5533 (node.master_candidate, node.drained, node.offline) = new_flags
5535 # we locked all nodes, we adjust the CP before updating this node
5536 if self.lock_all:
5537 _AdjustCandidatePool(self, [node.name])
5539 if self.op.secondary_ip:
5540 node.secondary_ip = self.op.secondary_ip
5541 result.append(("secondary_ip", self.op.secondary_ip))
5543 # this will trigger configuration file update, if needed
5544 self.cfg.Update(node, feedback_fn)
5546 # this will trigger job queue propagation or cleanup if the mc
5547 # flag changed
5548 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
5549 self.context.ReaddNode(node)
5551 return result
5554 class LUNodePowercycle(NoHooksLU):
5555 """Powercycles a node.
5560 def CheckArguments(self):
5561 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5562 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
5563 raise errors.OpPrereqError("The node is the master and the force"
5564 " parameter was not set",
5567 def ExpandNames(self):
5568 """Locking for PowercycleNode.
5570 This is a last-resort option and shouldn't block on other
5571 jobs. Therefore, we grab no locks.
5573 """
5574 self.needed_locks = {}
5576 def Exec(self, feedback_fn):
5577 """Reboots a node.
5579 """
5580 result = self.rpc.call_node_powercycle(self.op.node_name,
5581 self.cfg.GetHypervisorType())
5582 result.Raise("Failed to schedule the reboot")
5583 return result.payload
5586 class LUClusterQuery(NoHooksLU):
5587 """Query cluster configuration.
5592 def ExpandNames(self):
5593 self.needed_locks = {}
5595 def Exec(self, feedback_fn):
5596 """Return cluster config.
5599 cluster = self.cfg.GetClusterInfo()
5602 # Filter just for enabled hypervisors
5603 for os_name, hv_dict in cluster.os_hvp.items():
5604 os_hvp[os_name] = {}
5605 for hv_name, hv_params in hv_dict.items():
5606 if hv_name in cluster.enabled_hypervisors:
5607 os_hvp[os_name][hv_name] = hv_params
5609 # Convert ip_family to ip_version
5610 primary_ip_version = constants.IP4_VERSION
5611 if cluster.primary_ip_family == netutils.IP6Address.family:
5612 primary_ip_version = constants.IP6_VERSION
5614 result = {
5615 "software_version": constants.RELEASE_VERSION,
5616 "protocol_version": constants.PROTOCOL_VERSION,
5617 "config_version": constants.CONFIG_VERSION,
5618 "os_api_version": max(constants.OS_API_VERSIONS),
5619 "export_version": constants.EXPORT_VERSION,
5620 "architecture": (platform.architecture()[0], platform.machine()),
5621 "name": cluster.cluster_name,
5622 "master": cluster.master_node,
5623 "default_hypervisor": cluster.enabled_hypervisors[0],
5624 "enabled_hypervisors": cluster.enabled_hypervisors,
5625 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
5626 for hypervisor_name in cluster.enabled_hypervisors]),
5627 "os_hvp": os_hvp,
5628 "beparams": cluster.beparams,
5629 "osparams": cluster.osparams,
5630 "nicparams": cluster.nicparams,
5631 "ndparams": cluster.ndparams,
5632 "candidate_pool_size": cluster.candidate_pool_size,
5633 "master_netdev": cluster.master_netdev,
5634 "master_netmask": cluster.master_netmask,
5635 "use_external_mip_script": cluster.use_external_mip_script,
5636 "volume_group_name": cluster.volume_group_name,
5637 "drbd_usermode_helper": cluster.drbd_usermode_helper,
5638 "file_storage_dir": cluster.file_storage_dir,
5639 "shared_file_storage_dir": cluster.shared_file_storage_dir,
5640 "maintain_node_health": cluster.maintain_node_health,
5641 "ctime": cluster.ctime,
5642 "mtime": cluster.mtime,
5643 "uuid": cluster.uuid,
5644 "tags": list(cluster.GetTags()),
5645 "uid_pool": cluster.uid_pool,
5646 "default_iallocator": cluster.default_iallocator,
5647 "reserved_lvs": cluster.reserved_lvs,
5648 "primary_ip_version": primary_ip_version,
5649 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
5650 "hidden_os": cluster.hidden_os,
5651 "blacklisted_os": cluster.blacklisted_os,
5657 class LUClusterConfigQuery(NoHooksLU):
5658 """Return configuration values.
5662 _FIELDS_DYNAMIC = utils.FieldSet()
5663 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5664 "watcher_pause", "volume_group_name")
5666 def CheckArguments(self):
5667 _CheckOutputFields(static=self._FIELDS_STATIC,
5668 dynamic=self._FIELDS_DYNAMIC,
5669 selected=self.op.output_fields)
5671 def ExpandNames(self):
5672 self.needed_locks = {}
5674 def Exec(self, feedback_fn):
5675 """Dump a representation of the cluster config to the standard output.
5679 for field in self.op.output_fields:
5680 if field == "cluster_name":
5681 entry = self.cfg.GetClusterName()
5682 elif field == "master_node":
5683 entry = self.cfg.GetMasterNode()
5684 elif field == "drain_flag":
5685 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5686 elif field == "watcher_pause":
5687 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5688 elif field == "volume_group_name":
5689 entry = self.cfg.GetVGName()
5690 else:
5691 raise errors.ParameterError(field)
5692 values.append(entry)
5694 return values
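# Usage sketch (hypothetical values): OpClusterConfigQuery is the opcode
# handled by this LU; the result list preserves the order of output_fields:
#
#   op = opcodes.OpClusterConfigQuery(output_fields=["cluster_name",
#                                                    "drain_flag"])
#   # Exec() would return e.g. ["cluster.example.com", False]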
5696 class LUInstanceActivateDisks(NoHooksLU):
5697 """Bring up an instance's disks.
5702 def ExpandNames(self):
5703 self._ExpandAndLockInstance()
5704 self.needed_locks[locking.LEVEL_NODE] = []
5705 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5707 def DeclareLocks(self, level):
5708 if level == locking.LEVEL_NODE:
5709 self._LockInstancesNodes()
5711 def CheckPrereq(self):
5712 """Check prerequisites.
5714 This checks that the instance is in the cluster.
5716 """
5717 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5718 assert self.instance is not None, \
5719 "Cannot retrieve locked instance %s" % self.op.instance_name
5720 _CheckNodeOnline(self, self.instance.primary_node)
5722 def Exec(self, feedback_fn):
5723 """Activate the disks.
5726 disks_ok, disks_info = \
5727 _AssembleInstanceDisks(self, self.instance,
5728 ignore_size=self.op.ignore_size)
5730 raise errors.OpExecError("Cannot activate block devices")
5735 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5736 ignore_size=False):
5737 """Prepare the block devices for an instance.
5739 This sets up the block devices on all nodes.
5741 @type lu: L{LogicalUnit}
5742 @param lu: the logical unit on whose behalf we execute
5743 @type instance: L{objects.Instance}
5744 @param instance: the instance for whose disks we assemble
5745 @type disks: list of L{objects.Disk} or None
5746 @param disks: which disks to assemble (or all, if None)
5747 @type ignore_secondaries: boolean
5748 @param ignore_secondaries: if true, errors on secondary nodes
5749 won't result in an error return from the function
5750 @type ignore_size: boolean
5751 @param ignore_size: if true, the current known size of the disk
5752 will not be used during the disk activation, useful for cases
5753 when the size is wrong
5754 @return: False if the operation failed, otherwise a list of
5755 (host, instance_visible_name, node_visible_name)
5756 with the mapping from node devices to instance devices
5758 """
5759 device_info = []
5760 disks_ok = True
5761 iname = instance.name
5762 disks = _ExpandCheckDisks(instance, disks)
5764 # With the two passes mechanism we try to reduce the window of
5765 # opportunity for the race condition of switching DRBD to primary
5766 # before handshaking occurred, but we do not eliminate it
5768 # The proper fix would be to wait (with some limits) until the
5769 # connection has been made and drbd transitions from WFConnection
5770 # into any other network-connected state (Connected, SyncTarget,
5771 # SyncSource, etc.)
5773 # 1st pass, assemble on all nodes in secondary mode
5774 for idx, inst_disk in enumerate(disks):
5775 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5776 if ignore_size:
5777 node_disk = node_disk.Copy()
5778 node_disk.UnsetSize()
5779 lu.cfg.SetDiskID(node_disk, node)
5780 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5781 msg = result.fail_msg
5782 if msg:
5783 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5784 " (is_primary=False, pass=1): %s",
5785 inst_disk.iv_name, node, msg)
5786 if not ignore_secondaries:
5787 disks_ok = False
5789 # FIXME: race condition on drbd migration to primary
5791 # 2nd pass, do only the primary node
5792 for idx, inst_disk in enumerate(disks):
5793 dev_path = None
5795 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5796 if node != instance.primary_node:
5797 continue
5798 if ignore_size:
5799 node_disk = node_disk.Copy()
5800 node_disk.UnsetSize()
5801 lu.cfg.SetDiskID(node_disk, node)
5802 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5803 msg = result.fail_msg
5804 if msg:
5805 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5806 " (is_primary=True, pass=2): %s",
5807 inst_disk.iv_name, node, msg)
5808 disks_ok = False
5809 else:
5810 dev_path = result.payload
5812 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5814 # leave the disks configured for the primary node
5815 # this is a workaround that would be fixed better by
5816 # improving the logical/physical id handling
5817 for disk in disks:
5818 lu.cfg.SetDiskID(disk, instance.primary_node)
5820 return disks_ok, device_info
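# Illustrative result (hypothetical device paths): for a two-disk instance a
# fully successful assembly returns
#
#   (True, [("node1.example.com", "disk/0", "/dev/drbd0"),
#           ("node1.example.com", "disk/1", "/dev/drbd1")])
#
# i.e. disks_ok plus one (primary node, iv_name, device path) tuple per
# disk, as appended to device_info above.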
5823 def _StartInstanceDisks(lu, instance, force):
5824 """Start the disks of an instance.
5827 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5828 ignore_secondaries=force)
5829 if not disks_ok:
5830 _ShutdownInstanceDisks(lu, instance)
5831 if force is not None and not force:
5832 lu.proc.LogWarning("", hint="If the message above refers to a"
5833 " secondary node,"
5834 " you can retry the operation using '--force'.")
5835 raise errors.OpExecError("Disk consistency error")
5838 class LUInstanceDeactivateDisks(NoHooksLU):
5839 """Shutdown an instance's disks.
5844 def ExpandNames(self):
5845 self._ExpandAndLockInstance()
5846 self.needed_locks[locking.LEVEL_NODE] = []
5847 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5849 def DeclareLocks(self, level):
5850 if level == locking.LEVEL_NODE:
5851 self._LockInstancesNodes()
5853 def CheckPrereq(self):
5854 """Check prerequisites.
5856 This checks that the instance is in the cluster.
5858 """
5859 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5860 assert self.instance is not None, \
5861 "Cannot retrieve locked instance %s" % self.op.instance_name
5863 def Exec(self, feedback_fn):
5864 """Deactivate the disks
5867 instance = self.instance
5869 _ShutdownInstanceDisks(self, instance)
5871 _SafeShutdownInstanceDisks(self, instance)
5874 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5875 """Shutdown block devices of an instance.
5877 This function checks if an instance is running, before calling
5878 _ShutdownInstanceDisks.
5880 """
5881 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5882 _ShutdownInstanceDisks(lu, instance, disks=disks)
5885 def _ExpandCheckDisks(instance, disks):
5886 """Return the instance disks selected by the disks list
5888 @type disks: list of L{objects.Disk} or None
5889 @param disks: selected disks
5890 @rtype: list of L{objects.Disk}
5891 @return: selected instance disks to act on
5893 """
5894 if disks is None:
5895 return instance.disks
5896 else:
5897 if not set(disks).issubset(instance.disks):
5898 raise errors.ProgrammerError("Can only act on disks belonging to the"
5899 " target instance")
5900 return disks
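# Usage sketch (assumes an instance object as used by the callers above):
# pass None to act on every disk, or an explicit subset of instance.disks:
#
#   all_disks = _ExpandCheckDisks(instance, None)
#   first_disk_only = _ExpandCheckDisks(instance, instance.disks[:1])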
5903 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5904 """Shutdown block devices of an instance.
5906 This does the shutdown on all nodes of the instance.
5908 If ignore_primary is true, errors on the primary node are
5909 ignored.
5911 """
5912 all_result = True
5913 disks = _ExpandCheckDisks(instance, disks)
5915 for disk in disks:
5916 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5917 lu.cfg.SetDiskID(top_disk, node)
5918 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5919 msg = result.fail_msg
5920 if msg:
5921 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5922 disk.iv_name, node, msg)
5923 if ((node == instance.primary_node and not ignore_primary) or
5924 (node != instance.primary_node and not result.offline)):
5925 all_result = False
5927 return all_result
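# Usage sketch (hedged; lu/instance as in the callers elsewhere in this
# module): force the shutdown even when the primary node reports errors,
# then warn if any relevant node still failed:
#
#   if not _ShutdownInstanceDisks(lu, instance, ignore_primary=True):
#     lu.LogWarning("Some block devices could not be shut down")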
5929 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5930 """Checks if a node has enough free memory.
5932 This function checks if a given node has the needed amount of free
5933 memory. In case the node has less memory or we cannot get the
5934 information from the node, this function raises an OpPrereqError
5935 exception.
5937 @type lu: C{LogicalUnit}
5938 @param lu: a logical unit from which we get configuration data
5939 @type node: C{str}
5940 @param node: the node to check
5941 @type reason: C{str}
5942 @param reason: string to use in the error message
5943 @type requested: C{int}
5944 @param requested: the amount of memory in MiB to check for
5945 @type hypervisor_name: C{str}
5946 @param hypervisor_name: the hypervisor to ask for memory stats
5947 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5948 we cannot check the node
5950 """
5951 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5952 nodeinfo[node].Raise("Can't get data from node %s" % node,
5953 prereq=True, ecode=errors.ECODE_ENVIRON)
5954 free_mem = nodeinfo[node].payload.get("memory_free", None)
5955 if not isinstance(free_mem, int):
5956 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5957 " was '%s'" % (node, free_mem),
5958 errors.ECODE_ENVIRON)
5959 if requested > free_mem:
5960 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5961 " needed %s MiB, available %s MiB" %
5962 (node, reason, requested, free_mem),
5963 errors.ECODE_NORES)
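# Example (sketch; the 1024 MiB figure is illustrative): LUs invoke this
# from CheckPrereq before starting an instance on a node:
#
#   _CheckNodeFreeMemory(self, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        1024, instance.hypervisor)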
5966 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5967 """Checks if nodes have enough free disk space in the all VGs.
5969 This function check if all given nodes have the needed amount of
5970 free disk. In case any node has less disk or we cannot get the
5971 information from the node, this function raise an OpPrereqError
5974 @type lu: C{LogicalUnit}
5975 @param lu: a logical unit from which we get configuration data
5976 @type nodenames: C{list}
5977 @param nodenames: the list of node names to check
5978 @type req_sizes: C{dict}
5979 @param req_sizes: the hash of vg and corresponding amount of disk in
5980 MiB to check for
5981 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5982 or we cannot check the node
5984 """
5985 for vg, req_size in req_sizes.items():
5986 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
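# Example (sketch; VG names and sizes are illustrative): req_sizes maps
# each volume group to the MiB required in it, so one call can cover both
# data and metadata volume groups:
#
#   _CheckNodesFreeDiskPerVG(self, [pnode, snode],
#                            {"xenvg": 10240, "metavg": 128})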
5989 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5990 """Checks if nodes have enough free disk space in the specified VG.
5992 This function checks if all given nodes have the needed amount of
5993 free disk. In case any node has less disk or we cannot get the
5994 information from the node, this function raises an OpPrereqError
5995 exception.
5997 @type lu: C{LogicalUnit}
5998 @param lu: a logical unit from which we get configuration data
5999 @type nodenames: C{list}
6000 @param nodenames: the list of node names to check
6001 @type vg: C{str}
6002 @param vg: the volume group to check
6003 @type requested: C{int}
6004 @param requested: the amount of disk in MiB to check for
6005 @raise errors.OpPrereqError: if the node doesn't have enough disk,
6006 or we cannot check the node
6008 """
6009 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
6010 for node in nodenames:
6011 info = nodeinfo[node]
6012 info.Raise("Cannot get current information from node %s" % node,
6013 prereq=True, ecode=errors.ECODE_ENVIRON)
6014 vg_free = info.payload.get("vg_free", None)
6015 if not isinstance(vg_free, int):
6016 raise errors.OpPrereqError("Can't compute free disk space on node"
6017 " %s for vg %s, result was '%s'" %
6018 (node, vg, vg_free), errors.ECODE_ENVIRON)
6019 if requested > vg_free:
6020 raise errors.OpPrereqError("Not enough disk space on target node %s"
6021 " vg %s: required %d MiB, available %d MiB" %
6022 (node, vg, requested, vg_free),
6023 errors.ECODE_NORES)
6026 def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
6027 """Checks if nodes have enough physical CPUs
6029 This function checks if all given nodes have the needed number of
6030 physical CPUs. In case any node has fewer CPUs or we cannot get the
6031 information from the node, this function raises an OpPrereqError
6032 exception.
6034 @type lu: C{LogicalUnit}
6035 @param lu: a logical unit from which we get configuration data
6036 @type nodenames: C{list}
6037 @param nodenames: the list of node names to check
6038 @type requested: C{int}
6039 @param requested: the minimum acceptable number of physical CPUs
6040 @raise errors.OpPrereqError: if the node doesn't have enough CPUs,
6041 or we cannot check the node
6043 """
6044 nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name)
6045 for node in nodenames:
6046 info = nodeinfo[node]
6047 info.Raise("Cannot get current information from node %s" % node,
6048 prereq=True, ecode=errors.ECODE_ENVIRON)
6049 num_cpus = info.payload.get("cpu_total", None)
6050 if not isinstance(num_cpus, int):
6051 raise errors.OpPrereqError("Can't compute the number of physical CPUs"
6052 " on node %s, result was '%s'" %
6053 (node, num_cpus), errors.ECODE_ENVIRON)
6054 if requested > num_cpus:
6055 raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
6056 "required" % (node, num_cpus, requested),
6060 class LUInstanceStartup(LogicalUnit):
6061 """Starts an instance.
6064 HPATH = "instance-start"
6065 HTYPE = constants.HTYPE_INSTANCE
6066 REQ_BGL = False
6068 def CheckArguments(self):
6070 if self.op.beparams:
6071 # fill the beparams dict
6072 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
6074 def ExpandNames(self):
6075 self._ExpandAndLockInstance()
6077 def BuildHooksEnv(self):
6078 """Build hooks env.
6080 This runs on master, primary and secondary nodes of the instance.
6082 """
6083 env = {
6084 "FORCE": self.op.force,
6085 }
6087 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6089 return env
6091 def BuildHooksNodes(self):
6092 """Build hooks nodes.
6095 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6098 def CheckPrereq(self):
6099 """Check prerequisites.
6101 This checks that the instance is in the cluster.
6103 """
6104 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6105 assert self.instance is not None, \
6106 "Cannot retrieve locked instance %s" % self.op.instance_name
6109 if self.op.hvparams:
6110 # check hypervisor parameter syntax (locally)
6111 cluster = self.cfg.GetClusterInfo()
6112 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6113 filled_hvp = cluster.FillHV(instance)
6114 filled_hvp.update(self.op.hvparams)
6115 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
6116 hv_type.CheckParameterSyntax(filled_hvp)
6117 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
6119 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
6121 if self.primary_offline and self.op.ignore_offline_nodes:
6122 self.proc.LogWarning("Ignoring offline primary node")
6124 if self.op.hvparams or self.op.beparams:
6125 self.proc.LogWarning("Overridden parameters are ignored")
6126 else:
6127 _CheckNodeOnline(self, instance.primary_node)
6129 bep = self.cfg.GetClusterInfo().FillBE(instance)
6131 # check bridges existence
6132 _CheckInstanceBridgesExist(self, instance)
6134 remote_info = self.rpc.call_instance_info(instance.primary_node,
6135 instance.name,
6136 instance.hypervisor)
6137 remote_info.Raise("Error checking node %s" % instance.primary_node,
6138 prereq=True, ecode=errors.ECODE_ENVIRON)
6139 if not remote_info.payload: # not running already
6140 _CheckNodeFreeMemory(self, instance.primary_node,
6141 "starting instance %s" % instance.name,
6142 bep[constants.BE_MEMORY], instance.hypervisor)
6144 def Exec(self, feedback_fn):
6145 """Start the instance.
6148 instance = self.instance
6149 force = self.op.force
6151 if not self.op.no_remember:
6152 self.cfg.MarkInstanceUp(instance.name)
6154 if self.primary_offline:
6155 assert self.op.ignore_offline_nodes
6156 self.proc.LogInfo("Primary node offline, marked instance as started")
6157 else:
6158 node_current = instance.primary_node
6160 _StartInstanceDisks(self, instance, force)
6162 result = \
6163 self.rpc.call_instance_start(node_current,
6164 (instance, self.op.hvparams,
6165 self.op.beparams),
6166 self.op.startup_paused)
6167 msg = result.fail_msg
6168 if msg:
6169 _ShutdownInstanceDisks(self, instance)
6170 raise errors.OpExecError("Could not start instance: %s" % msg)
6173 class LUInstanceReboot(LogicalUnit):
6174 """Reboot an instance.
6177 HPATH = "instance-reboot"
6178 HTYPE = constants.HTYPE_INSTANCE
6179 REQ_BGL = False
6181 def ExpandNames(self):
6182 self._ExpandAndLockInstance()
6184 def BuildHooksEnv(self):
6185 """Build hooks env.
6187 This runs on master, primary and secondary nodes of the instance.
6189 """
6190 env = {
6191 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
6192 "REBOOT_TYPE": self.op.reboot_type,
6193 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6194 }
6196 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6198 return env
6200 def BuildHooksNodes(self):
6201 """Build hooks nodes.
6204 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6207 def CheckPrereq(self):
6208 """Check prerequisites.
6210 This checks that the instance is in the cluster.
6212 """
6213 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6214 assert self.instance is not None, \
6215 "Cannot retrieve locked instance %s" % self.op.instance_name
6217 _CheckNodeOnline(self, instance.primary_node)
6219 # check bridges existence
6220 _CheckInstanceBridgesExist(self, instance)
6222 def Exec(self, feedback_fn):
6223 """Reboot the instance.
6226 instance = self.instance
6227 ignore_secondaries = self.op.ignore_secondaries
6228 reboot_type = self.op.reboot_type
6230 remote_info = self.rpc.call_instance_info(instance.primary_node,
6231 instance.name,
6232 instance.hypervisor)
6233 remote_info.Raise("Error checking node %s" % instance.primary_node)
6234 instance_running = bool(remote_info.payload)
6236 node_current = instance.primary_node
6238 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
6239 constants.INSTANCE_REBOOT_HARD]:
6240 for disk in instance.disks:
6241 self.cfg.SetDiskID(disk, node_current)
6242 result = self.rpc.call_instance_reboot(node_current, instance,
6243 reboot_type,
6244 self.op.shutdown_timeout)
6245 result.Raise("Could not reboot instance")
6246 else:
6247 if instance_running:
6248 result = self.rpc.call_instance_shutdown(node_current, instance,
6249 self.op.shutdown_timeout)
6250 result.Raise("Could not shutdown instance for full reboot")
6251 _ShutdownInstanceDisks(self, instance)
6252 else:
6253 self.LogInfo("Instance %s was already stopped, starting now",
6254 instance.name)
6255 _StartInstanceDisks(self, instance, ignore_secondaries)
6256 result = self.rpc.call_instance_start(node_current,
6257 (instance, None, None), False)
6258 msg = result.fail_msg
6259 if msg:
6260 _ShutdownInstanceDisks(self, instance)
6261 raise errors.OpExecError("Could not start instance for"
6262 " full reboot: %s" % msg)
6264 self.cfg.MarkInstanceUp(instance.name)
6267 class LUInstanceShutdown(LogicalUnit):
6268 """Shutdown an instance.
6271 HPATH = "instance-stop"
6272 HTYPE = constants.HTYPE_INSTANCE
6273 REQ_BGL = False
6275 def ExpandNames(self):
6276 self._ExpandAndLockInstance()
6278 def BuildHooksEnv(self):
6279 """Build hooks env.
6281 This runs on master, primary and secondary nodes of the instance.
6283 """
6284 env = _BuildInstanceHookEnvByObject(self, self.instance)
6285 env["TIMEOUT"] = self.op.timeout
6286 return env
6288 def BuildHooksNodes(self):
6289 """Build hooks nodes.
6292 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6295 def CheckPrereq(self):
6296 """Check prerequisites.
6298 This checks that the instance is in the cluster.
6300 """
6301 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6302 assert self.instance is not None, \
6303 "Cannot retrieve locked instance %s" % self.op.instance_name
6305 self.primary_offline = \
6306 self.cfg.GetNodeInfo(self.instance.primary_node).offline
6308 if self.primary_offline and self.op.ignore_offline_nodes:
6309 self.proc.LogWarning("Ignoring offline primary node")
6310 else:
6311 _CheckNodeOnline(self, self.instance.primary_node)
6313 def Exec(self, feedback_fn):
6314 """Shutdown the instance.
6317 instance = self.instance
6318 node_current = instance.primary_node
6319 timeout = self.op.timeout
6321 if not self.op.no_remember:
6322 self.cfg.MarkInstanceDown(instance.name)
6324 if self.primary_offline:
6325 assert self.op.ignore_offline_nodes
6326 self.proc.LogInfo("Primary node offline, marked instance as stopped")
6327 else:
6328 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
6329 msg = result.fail_msg
6330 if msg:
6331 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
6333 _ShutdownInstanceDisks(self, instance)
6336 class LUInstanceReinstall(LogicalUnit):
6337 """Reinstall an instance.
6340 HPATH = "instance-reinstall"
6341 HTYPE = constants.HTYPE_INSTANCE
6342 REQ_BGL = False
6344 def ExpandNames(self):
6345 self._ExpandAndLockInstance()
6347 def BuildHooksEnv(self):
6348 """Build hooks env.
6350 This runs on master, primary and secondary nodes of the instance.
6352 """
6353 return _BuildInstanceHookEnvByObject(self, self.instance)
6355 def BuildHooksNodes(self):
6356 """Build hooks nodes.
6359 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6362 def CheckPrereq(self):
6363 """Check prerequisites.
6365 This checks that the instance is in the cluster and is not running.
6367 """
6368 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6369 assert instance is not None, \
6370 "Cannot retrieve locked instance %s" % self.op.instance_name
6371 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
6372 " offline, cannot reinstall")
6373 for node in instance.secondary_nodes:
6374 _CheckNodeOnline(self, node, "Instance secondary node offline,"
6375 " cannot reinstall")
6377 if instance.disk_template == constants.DT_DISKLESS:
6378 raise errors.OpPrereqError("Instance '%s' has no disks" %
6379 self.op.instance_name,
6380 errors.ECODE_INVAL)
6381 _CheckInstanceDown(self, instance, "cannot reinstall")
6383 if self.op.os_type is not None:
6385 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
6386 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
6387 instance_os = self.op.os_type
6388 else:
6389 instance_os = instance.os
6391 nodelist = list(instance.all_nodes)
6393 if self.op.osparams:
6394 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
6395 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
6396 self.os_inst = i_osdict # the new dict (without defaults)
6397 else:
6398 self.os_inst = {}
6400 self.instance = instance
6402 def Exec(self, feedback_fn):
6403 """Reinstall the instance.
6406 inst = self.instance
6408 if self.op.os_type is not None:
6409 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
6410 inst.os = self.op.os_type
6411 # Write to configuration
6412 self.cfg.Update(inst, feedback_fn)
6414 _StartInstanceDisks(self, inst, None)
6415 try:
6416 feedback_fn("Running the instance OS create scripts...")
6417 # FIXME: pass debug option from opcode to backend
6418 result = self.rpc.call_instance_os_add(inst.primary_node,
6419 (inst, self.os_inst), True,
6420 self.op.debug_level)
6421 result.Raise("Could not install OS for instance %s on node %s" %
6422 (inst.name, inst.primary_node))
6423 finally:
6424 _ShutdownInstanceDisks(self, inst)
6427 class LUInstanceRecreateDisks(LogicalUnit):
6428 """Recreate an instance's missing disks.
6431 HPATH = "instance-recreate-disks"
6432 HTYPE = constants.HTYPE_INSTANCE
6433 REQ_BGL = False
6435 def CheckArguments(self):
6436 # normalise the disk list
6437 self.op.disks = sorted(frozenset(self.op.disks))
6439 def ExpandNames(self):
6440 self._ExpandAndLockInstance()
6441 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6442 if self.op.nodes:
6443 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
6444 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
6445 else:
6446 self.needed_locks[locking.LEVEL_NODE] = []
6448 def DeclareLocks(self, level):
6449 if level == locking.LEVEL_NODE:
6450 # if we replace the nodes, we only need to lock the old primary,
6451 # otherwise we need to lock all nodes for disk re-creation
6452 primary_only = bool(self.op.nodes)
6453 self._LockInstancesNodes(primary_only=primary_only)
6455 def BuildHooksEnv(self):
6456 """Build hooks env.
6458 This runs on master, primary and secondary nodes of the instance.
6460 """
6461 return _BuildInstanceHookEnvByObject(self, self.instance)
6463 def BuildHooksNodes(self):
6464 """Build hooks nodes.
6467 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6470 def CheckPrereq(self):
6471 """Check prerequisites.
6473 This checks that the instance is in the cluster and is not running.
6475 """
6476 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6477 assert instance is not None, \
6478 "Cannot retrieve locked instance %s" % self.op.instance_name
6479 if self.op.nodes:
6480 if len(self.op.nodes) != len(instance.all_nodes):
6481 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
6482 " %d replacement nodes were specified" %
6483 (instance.name, len(instance.all_nodes),
6484 len(self.op.nodes)),
6485 errors.ECODE_INVAL)
6486 assert instance.disk_template != constants.DT_DRBD8 or \
6487 len(self.op.nodes) == 2
6488 assert instance.disk_template != constants.DT_PLAIN or \
6489 len(self.op.nodes) == 1
6490 primary_node = self.op.nodes[0]
6491 else:
6492 primary_node = instance.primary_node
6493 _CheckNodeOnline(self, primary_node)
6495 if instance.disk_template == constants.DT_DISKLESS:
6496 raise errors.OpPrereqError("Instance '%s' has no disks" %
6497 self.op.instance_name, errors.ECODE_INVAL)
6498 # if we replace nodes *and* the old primary is offline, we don't
6499 # check
6500 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
6501 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
6502 if not (self.op.nodes and old_pnode.offline):
6503 _CheckInstanceDown(self, instance, "cannot recreate disks")
6505 if not self.op.disks:
6506 self.op.disks = range(len(instance.disks))
6508 for idx in self.op.disks:
6509 if idx >= len(instance.disks):
6510 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
6512 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
6513 raise errors.OpPrereqError("Can't recreate disks partially and"
6514 " change the nodes at the same time",
6516 self.instance = instance
6518 def Exec(self, feedback_fn):
6519 """Recreate the disks.
6522 instance = self.instance
6525 mods = [] # keeps track of needed logical_id changes
6527 for idx, disk in enumerate(instance.disks):
6528 if idx not in self.op.disks: # disk idx has not been passed in
6529 to_skip.append(idx)
6530 continue
6531 # update secondaries for disks, if needed
6532 if self.op.nodes:
6533 if disk.dev_type == constants.LD_DRBD8:
6534 # need to update the nodes and minors
6535 assert len(self.op.nodes) == 2
6536 assert len(disk.logical_id) == 6 # otherwise disk internals
6537 # have changed
6538 (_, _, old_port, _, _, old_secret) = disk.logical_id
6539 new_minors = self.cfg.AllocateDRBDMinor(self.op.nodes, instance.name)
6540 new_id = (self.op.nodes[0], self.op.nodes[1], old_port,
6541 new_minors[0], new_minors[1], old_secret)
6542 assert len(disk.logical_id) == len(new_id)
6543 mods.append((idx, new_id))
6545 # now that we have passed all asserts above, we can apply the mods
6546 # in a single run (to avoid partial changes)
6547 for idx, new_id in mods:
6548 instance.disks[idx].logical_id = new_id
6550 # change primary node, if needed
6551 if self.op.nodes:
6552 instance.primary_node = self.op.nodes[0]
6553 self.LogWarning("Changing the instance's nodes, you will have to"
6554 " remove any disks left on the older nodes manually")
6556 if self.op.nodes:
6557 self.cfg.Update(instance, feedback_fn)
6559 _CreateDisks(self, instance, to_skip=to_skip)
6562 class LUInstanceRename(LogicalUnit):
6563 """Rename an instance.
6566 HPATH = "instance-rename"
6567 HTYPE = constants.HTYPE_INSTANCE
6569 def CheckArguments(self):
6573 if self.op.ip_check and not self.op.name_check:
6574 # TODO: make the ip check more flexible and not depend on the name check
6575 raise errors.OpPrereqError("IP address check requires a name check",
6578 def BuildHooksEnv(self):
6579 """Build hooks env.
6581 This runs on master, primary and secondary nodes of the instance.
6583 """
6584 env = _BuildInstanceHookEnvByObject(self, self.instance)
6585 env["INSTANCE_NEW_NAME"] = self.op.new_name
6586 return env
6588 def BuildHooksNodes(self):
6589 """Build hooks nodes.
6592 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
6595 def CheckPrereq(self):
6596 """Check prerequisites.
6598 This checks that the instance is in the cluster and is not running.
6600 """
6601 self.op.instance_name = _ExpandInstanceName(self.cfg,
6602 self.op.instance_name)
6603 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6604 assert instance is not None
6605 _CheckNodeOnline(self, instance.primary_node)
6606 _CheckInstanceDown(self, instance, "cannot rename")
6607 self.instance = instance
6609 new_name = self.op.new_name
6610 if self.op.name_check:
6611 hostname = netutils.GetHostname(name=new_name)
6612 if hostname != new_name:
6613 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
6615 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
6616 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
6617 " same as given hostname '%s'") %
6618 (hostname.name, self.op.new_name),
6619 errors.ECODE_INVAL)
6620 new_name = self.op.new_name = hostname.name
6621 if (self.op.ip_check and
6622 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
6623 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6624 (hostname.ip, new_name),
6625 errors.ECODE_NOTUNIQUE)
6627 instance_list = self.cfg.GetInstanceList()
6628 if new_name in instance_list and new_name != instance.name:
6629 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6630 new_name, errors.ECODE_EXISTS)
6632 def Exec(self, feedback_fn):
6633 """Rename the instance.
6636 inst = self.instance
6637 old_name = inst.name
6639 rename_file_storage = False
6640 if (inst.disk_template in constants.DTS_FILEBASED and
6641 self.op.new_name != inst.name):
6642 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6643 rename_file_storage = True
6645 self.cfg.RenameInstance(inst.name, self.op.new_name)
6646 # Change the instance lock. This is definitely safe while we hold the BGL.
6647 # Otherwise the new lock would have to be added in acquired mode.
6648 assert self.REQ_BGL
6649 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
6650 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
6652 # re-read the instance from the configuration after rename
6653 inst = self.cfg.GetInstanceInfo(self.op.new_name)
6655 if rename_file_storage:
6656 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
6657 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
6658 old_file_storage_dir,
6659 new_file_storage_dir)
6660 result.Raise("Could not rename on node %s directory '%s' to '%s'"
6661 " (but the instance has been renamed in Ganeti)" %
6662 (inst.primary_node, old_file_storage_dir,
6663 new_file_storage_dir))
6665 _StartInstanceDisks(self, inst, None)
6666 try:
6667 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
6668 old_name, self.op.debug_level)
6669 msg = result.fail_msg
6670 if msg:
6671 msg = ("Could not run OS rename script for instance %s on node %s"
6672 " (but the instance has been renamed in Ganeti): %s" %
6673 (inst.name, inst.primary_node, msg))
6674 self.proc.LogWarning(msg)
6675 finally:
6676 _ShutdownInstanceDisks(self, inst)
6678 return inst.name
6681 class LUInstanceRemove(LogicalUnit):
6682 """Remove an instance.
6685 HPATH = "instance-remove"
6686 HTYPE = constants.HTYPE_INSTANCE
6687 REQ_BGL = False
6689 def ExpandNames(self):
6690 self._ExpandAndLockInstance()
6691 self.needed_locks[locking.LEVEL_NODE] = []
6692 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6694 def DeclareLocks(self, level):
6695 if level == locking.LEVEL_NODE:
6696 self._LockInstancesNodes()
6698 def BuildHooksEnv(self):
6699 """Build hooks env.
6701 This runs on master, primary and secondary nodes of the instance.
6703 """
6704 env = _BuildInstanceHookEnvByObject(self, self.instance)
6705 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6706 return env
6708 def BuildHooksNodes(self):
6709 """Build hooks nodes.
6712 nl = [self.cfg.GetMasterNode()]
6713 nl_post = list(self.instance.all_nodes) + nl
6714 return (nl, nl_post)
6716 def CheckPrereq(self):
6717 """Check prerequisites.
6719 This checks that the instance is in the cluster.
6721 """
6722 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6723 assert self.instance is not None, \
6724 "Cannot retrieve locked instance %s" % self.op.instance_name
6726 def Exec(self, feedback_fn):
6727 """Remove the instance.
6730 instance = self.instance
6731 logging.info("Shutting down instance %s on node %s",
6732 instance.name, instance.primary_node)
6734 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6735 self.op.shutdown_timeout)
6736 msg = result.fail_msg
6737 if msg:
6738 if self.op.ignore_failures:
6739 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6740 else:
6741 raise errors.OpExecError("Could not shutdown instance %s on"
6742 " node %s: %s" %
6743 (instance.name, instance.primary_node, msg))
6745 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6748 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6749 """Utility function to remove an instance.
6752 logging.info("Removing block devices for instance %s", instance.name)
6754 if not _RemoveDisks(lu, instance):
6755 if not ignore_failures:
6756 raise errors.OpExecError("Can't remove instance's disks")
6757 feedback_fn("Warning: can't remove instance's disks")
6759 logging.info("Removing instance %s out of cluster config", instance.name)
6761 lu.cfg.RemoveInstance(instance.name)
6763 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6764 "Instance lock removal conflict"
6766 # Remove lock for the instance
6767 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6770 class LUInstanceQuery(NoHooksLU):
6771 """Logical unit for querying instances.
6774 # pylint: disable=W0142
6777 def CheckArguments(self):
6778 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6779 self.op.output_fields, self.op.use_locking)
6781 def ExpandNames(self):
6782 self.iq.ExpandNames(self)
6784 def DeclareLocks(self, level):
6785 self.iq.DeclareLocks(self, level)
6787 def Exec(self, feedback_fn):
6788 return self.iq.OldStyleQuery(self)
6791 class LUInstanceFailover(LogicalUnit):
6792 """Failover an instance.
6795 HPATH = "instance-failover"
6796 HTYPE = constants.HTYPE_INSTANCE
6797 REQ_BGL = False
6799 def CheckArguments(self):
6800 """Check the arguments.
6803 self.iallocator = getattr(self.op, "iallocator", None)
6804 self.target_node = getattr(self.op, "target_node", None)
6806 def ExpandNames(self):
6807 self._ExpandAndLockInstance()
6809 if self.op.target_node is not None:
6810 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6812 self.needed_locks[locking.LEVEL_NODE] = []
6813 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6815 ignore_consistency = self.op.ignore_consistency
6816 shutdown_timeout = self.op.shutdown_timeout
6817 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6818 cleanup=False,
6819 failover=True,
6820 ignore_consistency=ignore_consistency,
6821 shutdown_timeout=shutdown_timeout)
6822 self.tasklets = [self._migrater]
6824 def DeclareLocks(self, level):
6825 if level == locking.LEVEL_NODE:
6826 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6827 if instance.disk_template in constants.DTS_EXT_MIRROR:
6828 if self.op.target_node is None:
6829 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6830 else:
6831 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6832 self.op.target_node]
6833 del self.recalculate_locks[locking.LEVEL_NODE]
6834 else:
6835 self._LockInstancesNodes()
6837 def BuildHooksEnv(self):
6838 """Build hooks env.
6840 This runs on master, primary and secondary nodes of the instance.
6842 """
6843 instance = self._migrater.instance
6844 source_node = instance.primary_node
6845 target_node = self.op.target_node
6846 env = {
6847 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6848 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6849 "OLD_PRIMARY": source_node,
6850 "NEW_PRIMARY": target_node,
6853 if instance.disk_template in constants.DTS_INT_MIRROR:
6854 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6855 env["NEW_SECONDARY"] = source_node
6857 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6859 env.update(_BuildInstanceHookEnvByObject(self, instance))
6863 def BuildHooksNodes(self):
6864 """Build hooks nodes.
6867 instance = self._migrater.instance
6868 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6869 return (nl, nl + [instance.primary_node])
6872 class LUInstanceMigrate(LogicalUnit):
6873 """Migrate an instance.
6875 This is migration without shutting down, compared to the failover,
6876 which is done with shutdown.
6878 """
6879 HPATH = "instance-migrate"
6880 HTYPE = constants.HTYPE_INSTANCE
6881 REQ_BGL = False
6883 def ExpandNames(self):
6884 self._ExpandAndLockInstance()
6886 if self.op.target_node is not None:
6887 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6889 self.needed_locks[locking.LEVEL_NODE] = []
6890 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6892 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6893 cleanup=self.op.cleanup,
6894 failover=False,
6895 fallback=self.op.allow_failover)
6896 self.tasklets = [self._migrater]
6898 def DeclareLocks(self, level):
6899 if level == locking.LEVEL_NODE:
6900 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6901 if instance.disk_template in constants.DTS_EXT_MIRROR:
6902 if self.op.target_node is None:
6903 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6904 else:
6905 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6906 self.op.target_node]
6907 del self.recalculate_locks[locking.LEVEL_NODE]
6908 else:
6909 self._LockInstancesNodes()
6911 def BuildHooksEnv(self):
6912 """Build hooks env.
6914 This runs on master, primary and secondary nodes of the instance.
6916 """
6917 instance = self._migrater.instance
6918 source_node = instance.primary_node
6919 target_node = self.op.target_node
6920 env = _BuildInstanceHookEnvByObject(self, instance)
6921 env.update({
6922 "MIGRATE_LIVE": self._migrater.live,
6923 "MIGRATE_CLEANUP": self.op.cleanup,
6924 "OLD_PRIMARY": source_node,
6925 "NEW_PRIMARY": target_node,
6928 if instance.disk_template in constants.DTS_INT_MIRROR:
6929 env["OLD_SECONDARY"] = target_node
6930 env["NEW_SECONDARY"] = source_node
6932 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6936 def BuildHooksNodes(self):
6937 """Build hooks nodes.
6940 instance = self._migrater.instance
6941 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6942 return (nl, nl + [instance.primary_node])
6945 class LUInstanceMove(LogicalUnit):
6946 """Move an instance by data-copying.
6949 HPATH = "instance-move"
6950 HTYPE = constants.HTYPE_INSTANCE
6951 REQ_BGL = False
6953 def ExpandNames(self):
6954 self._ExpandAndLockInstance()
6955 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6956 self.op.target_node = target_node
6957 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6958 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6960 def DeclareLocks(self, level):
6961 if level == locking.LEVEL_NODE:
6962 self._LockInstancesNodes(primary_only=True)
6964 def BuildHooksEnv(self):
6965 """Build hooks env.
6967 This runs on master, primary and secondary nodes of the instance.
6969 """
6970 env = {
6971 "TARGET_NODE": self.op.target_node,
6972 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6973 }
6974 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6975 return env
6977 def BuildHooksNodes(self):
6978 """Build hooks nodes.
6982 self.cfg.GetMasterNode(),
6983 self.instance.primary_node,
6984 self.op.target_node,
6988 def CheckPrereq(self):
6989 """Check prerequisites.
6991 This checks that the instance is in the cluster.
6993 """
6994 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6995 assert self.instance is not None, \
6996 "Cannot retrieve locked instance %s" % self.op.instance_name
6998 node = self.cfg.GetNodeInfo(self.op.target_node)
6999 assert node is not None, \
7000 "Cannot retrieve locked node %s" % self.op.target_node
7002 self.target_node = target_node = node.name
7004 if target_node == instance.primary_node:
7005 raise errors.OpPrereqError("Instance %s is already on the node %s" %
7006 (instance.name, target_node),
7007 errors.ECODE_STATE)
7009 bep = self.cfg.GetClusterInfo().FillBE(instance)
7011 for idx, dsk in enumerate(instance.disks):
7012 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
7013 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
7014 " cannot copy" % idx, errors.ECODE_STATE)
7016 _CheckNodeOnline(self, target_node)
7017 _CheckNodeNotDrained(self, target_node)
7018 _CheckNodeVmCapable(self, target_node)
7020 if instance.admin_up:
7021 # check memory requirements on the secondary node
7022 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
7023 instance.name, bep[constants.BE_MEMORY],
7024 instance.hypervisor)
7025 else:
7026 self.LogInfo("Not checking memory on the secondary node as"
7027 " instance will not be started")
7029 # check bridge existence
7030 _CheckInstanceBridgesExist(self, instance, node=target_node)
7032 def Exec(self, feedback_fn):
7033 """Move an instance.
7035 The move is done by shutting it down on its present node, copying
7036 the data over (slow) and starting it on the new node.
7038 """
7039 instance = self.instance
7041 source_node = instance.primary_node
7042 target_node = self.target_node
7044 self.LogInfo("Shutting down instance %s on source node %s",
7045 instance.name, source_node)
7047 result = self.rpc.call_instance_shutdown(source_node, instance,
7048 self.op.shutdown_timeout)
7049 msg = result.fail_msg
7050 if msg:
7051 if self.op.ignore_consistency:
7052 self.proc.LogWarning("Could not shutdown instance %s on node %s."
7053 " Proceeding anyway. Please make sure node"
7054 " %s is down. Error details: %s",
7055 instance.name, source_node, source_node, msg)
7056 else:
7057 raise errors.OpExecError("Could not shutdown instance %s on"
7058 " node %s: %s" %
7059 (instance.name, source_node, msg))
7061 # create the target disks
7062 try:
7063 _CreateDisks(self, instance, target_node=target_node)
7064 except errors.OpExecError:
7065 self.LogWarning("Device creation failed, reverting...")
7066 try:
7067 _RemoveDisks(self, instance, target_node=target_node)
7068 finally:
7069 self.cfg.ReleaseDRBDMinors(instance.name)
7070 raise
7072 cluster_name = self.cfg.GetClusterInfo().cluster_name
7074 errs = []
7075 # activate, get path, copy the data over
7076 for idx, disk in enumerate(instance.disks):
7077 self.LogInfo("Copying data for disk %d", idx)
7078 result = self.rpc.call_blockdev_assemble(target_node, disk,
7079 instance.name, True, idx)
7080 if result.fail_msg:
7081 self.LogWarning("Can't assemble newly created disk %d: %s",
7082 idx, result.fail_msg)
7083 errs.append(result.fail_msg)
7084 break
7085 dev_path = result.payload
7086 result = self.rpc.call_blockdev_export(source_node, disk,
7087 target_node, dev_path,
7088 cluster_name)
7089 if result.fail_msg:
7090 self.LogWarning("Can't copy data over for disk %d: %s",
7091 idx, result.fail_msg)
7092 errs.append(result.fail_msg)
7093 break
7096 self.LogWarning("Some disks failed to copy, aborting")
7098 _RemoveDisks(self, instance, target_node=target_node)
7100 self.cfg.ReleaseDRBDMinors(instance.name)
7101 raise errors.OpExecError("Errors during disk copy: %s" %
7104 instance.primary_node = target_node
7105 self.cfg.Update(instance, feedback_fn)
7107 self.LogInfo("Removing the disks on the original node")
7108 _RemoveDisks(self, instance, target_node=source_node)
7110 # Only start the instance if it's marked as up
7111 if instance.admin_up:
7112 self.LogInfo("Starting instance %s on node %s",
7113 instance.name, target_node)
7115 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7116 ignore_secondaries=True)
7117 if not disks_ok:
7118 _ShutdownInstanceDisks(self, instance)
7119 raise errors.OpExecError("Can't activate the instance's disks")
7121 result = self.rpc.call_instance_start(target_node,
7122 (instance, None, None), False)
7123 msg = result.fail_msg
7124 if msg:
7125 _ShutdownInstanceDisks(self, instance)
7126 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7127 (instance.name, target_node, msg))
7130 class LUNodeMigrate(LogicalUnit):
7131 """Migrate all instances from a node.
7134 HPATH = "node-migrate"
7135 HTYPE = constants.HTYPE_NODE
7136 REQ_BGL = False
7138 def CheckArguments(self):
7139 pass
7141 def ExpandNames(self):
7142 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
7144 self.share_locks = _ShareAll()
7145 self.needed_locks = {
7146 locking.LEVEL_NODE: [self.op.node_name],
7147 }
7149 def BuildHooksEnv(self):
7150 """Build hooks env.
7152 This runs on the master, the primary and all the secondaries.
7154 """
7155 return {
7156 "NODE_NAME": self.op.node_name,
7157 }
7159 def BuildHooksNodes(self):
7160 """Build hooks nodes.
7163 nl = [self.cfg.GetMasterNode()]
7166 def CheckPrereq(self):
7167 pass
7169 def Exec(self, feedback_fn):
7170 # Prepare jobs for migration instances
7171 jobs = [
7172 [opcodes.OpInstanceMigrate(instance_name=inst.name,
7173 mode=self.op.mode,
7174 live=self.op.live,
7175 iallocator=self.op.iallocator,
7176 target_node=self.op.target_node)]
7177 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
7178 ]
7180 # TODO: Run iallocator in this opcode and pass correct placement options to
7181 # OpInstanceMigrate. Since other jobs can modify the cluster between
7182 # running the iallocator and the actual migration, a good consistency model
7183 # will have to be found.
7185 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
7186 frozenset([self.op.node_name]))
7188 return ResultWithJobs(jobs)
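# Illustrative shape (hypothetical instance names): one single-opcode job
# is submitted per primary instance on the node, e.g.
#
#   jobs == [[opcodes.OpInstanceMigrate(instance_name="inst1", ...)],
#            [opcodes.OpInstanceMigrate(instance_name="inst2", ...)]]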
7191 class TLMigrateInstance(Tasklet):
7192 """Tasklet class for instance migration.
7194 @type live: boolean
7195 @ivar live: whether the migration will be done live or non-live;
7196 this variable is initialized only after CheckPrereq has run
7197 @type cleanup: boolean
7198 @ivar cleanup: Whether we clean up from a failed migration
7199 @type iallocator: string
7200 @ivar iallocator: The iallocator used to determine target_node
7201 @type target_node: string
7202 @ivar target_node: If given, the target_node to reallocate the instance to
7203 @type failover: boolean
7204 @ivar failover: Whether operation results in failover or migration
7205 @type fallback: boolean
7206 @ivar fallback: Whether fallback to failover is allowed if migration not
7207 possible
7208 @type ignore_consistency: boolean
7209 @ivar ignore_consistency: Whether we should ignore consistency between source
7210 and target node
7211 @type shutdown_timeout: int
7212 @ivar shutdown_timeout: In case of failover timeout of the shutdown
7214 """
7217 _MIGRATION_POLL_INTERVAL = 1 # seconds
7218 _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
7220 def __init__(self, lu, instance_name, cleanup=False,
7221 failover=False, fallback=False,
7222 ignore_consistency=False,
7223 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
7224 """Initializes this class.
7227 Tasklet.__init__(self, lu)
7230 self.instance_name = instance_name
7231 self.cleanup = cleanup
7232 self.live = False # will be overridden later
7233 self.failover = failover
7234 self.fallback = fallback
7235 self.ignore_consistency = ignore_consistency
7236 self.shutdown_timeout = shutdown_timeout
7238 def CheckPrereq(self):
7239 """Check prerequisites.
7241 This checks that the instance is in the cluster.
7243 """
7244 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
7245 instance = self.cfg.GetInstanceInfo(instance_name)
7246 assert instance is not None
7247 self.instance = instance
7249 if (not self.cleanup and not instance.admin_up and not self.failover and
7250 self.fallback):
7251 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
7252 " to failover")
7253 self.failover = True
7255 if instance.disk_template not in constants.DTS_MIRRORED:
7256 if self.failover:
7257 text = "failovers"
7258 else:
7259 text = "migrations"
7260 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
7261 " %s" % (instance.disk_template, text),
7262 errors.ECODE_STATE)
7264 if instance.disk_template in constants.DTS_EXT_MIRROR:
7265 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
7267 if self.lu.op.iallocator:
7268 self._RunAllocator()
7269 else:
7270 # We set self.target_node as it is required by
7271 # BuildHooksEnv
7272 self.target_node = self.lu.op.target_node
7274 # self.target_node is already populated, either directly or by the
7275 # iallocator run
7276 target_node = self.target_node
7277 if self.target_node == instance.primary_node:
7278 raise errors.OpPrereqError("Cannot migrate instance %s"
7279 " to its primary (%s)" %
7280 (instance.name, instance.primary_node))
7282 if len(self.lu.tasklets) == 1:
7283 # It is safe to release locks only when we're the only tasklet
7284 # in the LU
7285 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
7286 keep=[instance.primary_node, self.target_node])
7288 else:
7289 secondary_nodes = instance.secondary_nodes
7290 if not secondary_nodes:
7291 raise errors.ConfigurationError("No secondary node but using"
7292 " %s disk template" %
7293 instance.disk_template)
7294 target_node = secondary_nodes[0]
7295 if self.lu.op.iallocator or (self.lu.op.target_node and
7296 self.lu.op.target_node != target_node):
7298 text = "failed over"
7301 raise errors.OpPrereqError("Instances with disk template %s cannot"
7302 " be %s to arbitrary nodes"
7303 " (neither an iallocator nor a target"
7304 " node can be passed)" %
7305 (instance.disk_template, text),
7306 errors.ECODE_INVAL)
7308 i_be = self.cfg.GetClusterInfo().FillBE(instance)
7310 # check memory requirements on the secondary node
7311 if not self.failover or instance.admin_up:
7312 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
7313 instance.name, i_be[constants.BE_MEMORY],
7314 instance.hypervisor)
7315 else:
7316 self.lu.LogInfo("Not checking memory on the secondary node as"
7317 " instance will not be started")
7319 # check bridge existence
7320 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
7322 if not self.cleanup:
7323 _CheckNodeNotDrained(self.lu, target_node)
7324 if not self.failover:
7325 result = self.rpc.call_instance_migratable(instance.primary_node,
7326 instance)
7327 if result.fail_msg and self.fallback:
7328 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
7330 self.failover = True
7332 result.Raise("Can't migrate, please use failover",
7333 prereq=True, ecode=errors.ECODE_STATE)
7335 assert not (self.failover and self.cleanup)
7337 if not self.failover:
7338 if self.lu.op.live is not None and self.lu.op.mode is not None:
7339 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
7340 " parameters are accepted",
7342 if self.lu.op.live is not None:
7343 if self.lu.op.live:
7344 self.lu.op.mode = constants.HT_MIGRATION_LIVE
7345 else:
7346 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
7347 # reset the 'live' parameter to None so that repeated
7348 # invocations of CheckPrereq do not raise an exception
7349 self.lu.op.live = None
7350 elif self.lu.op.mode is None:
7351 # read the default value from the hypervisor
7352 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
7353 skip_globals=False)
7354 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
7356 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
7357 else:
7358 # Failover is never live
7359 self.live = False
7361 def _RunAllocator(self):
7362 """Run the allocator based on input opcode.
7365 ial = IAllocator(self.cfg, self.rpc,
7366 mode=constants.IALLOCATOR_MODE_RELOC,
7367 name=self.instance_name,
7368 # TODO See why hail breaks with a single node below
7369 relocate_from=[self.instance.primary_node,
7370 self.instance.primary_node],
7371 )
7373 ial.Run(self.lu.op.iallocator)
7375 if not ial.success:
7376 raise errors.OpPrereqError("Can't compute nodes using"
7377 " iallocator '%s': %s" %
7378 (self.lu.op.iallocator, ial.info),
7379 errors.ECODE_NORES)
7380 if len(ial.result) != ial.required_nodes:
7381 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7382 " of nodes (%s), required %s" %
7383 (self.lu.op.iallocator, len(ial.result),
7384 ial.required_nodes), errors.ECODE_FAULT)
7385 self.target_node = ial.result[0]
7386 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7387 self.instance_name, self.lu.op.iallocator,
7388 utils.CommaJoin(ial.result))
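# Illustrative result (hypothetical node name): in IALLOCATOR_MODE_RELOC
# the allocator returns a list of target nodes and required_nodes is 1,
# so e.g.
#
#   ial.result == ["node3.example.com"]
#   self.target_node == "node3.example.com"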
7390 def _WaitUntilSync(self):
7391 """Poll with custom rpc for disk sync.
7393 This uses our own step-based rpc call.
7395 """
7396 self.feedback_fn("* wait until resync is done")
7397 all_done = False
7398 while not all_done:
7399 all_done = True
7400 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
7401 self.nodes_ip,
7402 self.instance.disks)
7403 min_percent = 100
7404 for node, nres in result.items():
7405 nres.Raise("Cannot resync disks on node %s" % node)
7406 node_done, node_percent = nres.payload
7407 all_done = all_done and node_done
7408 if node_percent is not None:
7409 min_percent = min(min_percent, node_percent)
7410 if not all_done:
7411 if min_percent < 100:
7412 self.feedback_fn(" - progress: %.1f%%" % min_percent)
7413 time.sleep(2)
7415 def _EnsureSecondary(self, node):
7416 """Demote a node to secondary.
7419 self.feedback_fn("* switching node %s to secondary mode" % node)
7421 for dev in self.instance.disks:
7422 self.cfg.SetDiskID(dev, node)
7424 result = self.rpc.call_blockdev_close(node, self.instance.name,
7425 self.instance.disks)
7426 result.Raise("Cannot change disk to secondary on node %s" % node)
7428 def _GoStandalone(self):
7429 """Disconnect from the network.
7432 self.feedback_fn("* changing into standalone mode")
7433 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
7434 self.instance.disks)
7435 for node, nres in result.items():
7436 nres.Raise("Cannot disconnect disks node %s" % node)
7438 def _GoReconnect(self, multimaster):
7439 """Reconnect to the network.
7445 msg = "single-master"
7446 self.feedback_fn("* changing disks into %s mode" % msg)
7447 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
7448 self.instance.disks,
7449 self.instance.name, multimaster)
7450 for node, nres in result.items():
7451 nres.Raise("Cannot change disks config on node %s" % node)
7453 def _ExecCleanup(self):
7454 """Try to cleanup after a failed migration.
7456 The cleanup is done by:
7457 - check that the instance is running only on one node
7458 (and update the config if needed)
7459 - change disks on its secondary node to secondary
7460 - wait until disks are fully synchronized
7461 - disconnect from the network
7462 - change disks into single-master mode
7463 - wait again until disks are fully synchronized
7465 """
7466 instance = self.instance
7467 target_node = self.target_node
7468 source_node = self.source_node
7470 # check running on only one node
7471 self.feedback_fn("* checking where the instance actually runs"
7472 " (if this hangs, the hypervisor might be in"
7474 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
7475 for node, result in ins_l.items():
7476 result.Raise("Can't contact node %s" % node)
7478 runningon_source = instance.name in ins_l[source_node].payload
7479 runningon_target = instance.name in ins_l[target_node].payload
7481 if runningon_source and runningon_target:
7482 raise errors.OpExecError("Instance seems to be running on two nodes,"
7483 " or the hypervisor is confused; you will have"
7484 " to ensure manually that it runs only on one"
7485 " and restart this operation")
7487 if not (runningon_source or runningon_target):
7488 raise errors.OpExecError("Instance does not seem to be running at all;"
7489 " in this case it's safer to repair by"
7490 " running 'gnt-instance stop' to ensure disk"
7491 " shutdown, and then restarting it")
7493 if runningon_target:
7494 # the migration has actually succeeded, we need to update the config
7495 self.feedback_fn("* instance running on secondary node (%s),"
7496 " updating config" % target_node)
7497 instance.primary_node = target_node
7498 self.cfg.Update(instance, self.feedback_fn)
7499 demoted_node = source_node
7500 else:
7501 self.feedback_fn("* instance confirmed to be running on its"
7502 " primary node (%s)" % source_node)
7503 demoted_node = target_node
7505 if instance.disk_template in constants.DTS_INT_MIRROR:
7506 self._EnsureSecondary(demoted_node)
7507 try:
7508 self._WaitUntilSync()
7509 except errors.OpExecError:
7510 # we ignore here errors, since if the device is standalone, it
7511 # won't be able to sync
7512 pass
7513 self._GoStandalone()
7514 self._GoReconnect(False)
7515 self._WaitUntilSync()
7517 self.feedback_fn("* done")
7519 def _RevertDiskStatus(self):
7520 """Try to revert the disk status after a failed migration.
7523 target_node = self.target_node
7524 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
7528 self._EnsureSecondary(target_node)
7529 self._GoStandalone()
7530 self._GoReconnect(False)
7531 self._WaitUntilSync()
7532 except errors.OpExecError, err:
7533 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
7534 " please try to recover the instance manually;"
7535 " error '%s'" % str(err))
7537 def _AbortMigration(self):
7538 """Call the hypervisor code to abort a started migration.
7541 instance = self.instance
7542 target_node = self.target_node
7543 source_node = self.source_node
7544 migration_info = self.migration_info
7546 abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
7547 instance,
7548 migration_info,
7549 False)
7550 abort_msg = abort_result.fail_msg
7551 if abort_msg:
7552 logging.error("Aborting migration failed on target node %s: %s",
7553 target_node, abort_msg)
7554 # Don't raise an exception here, as we still have to try to revert the
7555 # disk status, even if this step failed.
7557 abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
7558 instance, False, self.live)
7559 abort_msg = abort_result.fail_msg
7560 if abort_msg:
7561 logging.error("Aborting migration failed on source node %s: %s",
7562 source_node, abort_msg)
7564 def _ExecMigration(self):
7565 """Migrate an instance.
7567 The migrate is done by:
7568 - change the disks into dual-master mode
7569 - wait until disks are fully synchronized again
7570 - migrate the instance
7571 - change disks on the new secondary node (the old primary) to secondary
7572 - wait until disks are fully synchronized
7573 - change disks into single-master mode
7575 """
7576 instance = self.instance
7577 target_node = self.target_node
7578 source_node = self.source_node
7580 # Check for hypervisor version mismatch and warn the user.
7581 nodeinfo = self.rpc.call_node_info([source_node, target_node],
7582 None, self.instance.hypervisor)
7583 src_info = nodeinfo[source_node]
7584 dst_info = nodeinfo[target_node]
7586 if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
7587 (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
7588 src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7589 dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
7590 if src_version != dst_version:
7591 self.feedback_fn("* warning: hypervisor version mismatch between"
7592 " source (%s) and target (%s) node" %
7593 (src_version, dst_version))
7595 self.feedback_fn("* checking disk consistency between source and target")
7596 for dev in instance.disks:
7597 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7598 raise errors.OpExecError("Disk %s is degraded or not fully"
7599 " synchronized on target node,"
7600 " aborting migration" % dev.iv_name)
7602 # First get the migration information from the remote node
7603 result = self.rpc.call_migration_info(source_node, instance)
7604 msg = result.fail_msg
7605 if msg:
7606 log_err = ("Failed fetching source migration information from %s: %s" %
7607 (source_node, msg))
7608 logging.error(log_err)
7609 raise errors.OpExecError(log_err)
7611 self.migration_info = migration_info = result.payload
7613 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7614 # Then switch the disks to master/master mode
7615 self._EnsureSecondary(target_node)
7616 self._GoStandalone()
7617 self._GoReconnect(True)
7618 self._WaitUntilSync()
7620 self.feedback_fn("* preparing %s to accept the instance" % target_node)
7621 result = self.rpc.call_accept_instance(target_node,
7622 instance,
7623 migration_info,
7624 self.nodes_ip[target_node])
7626 msg = result.fail_msg
7627 if msg:
7628 logging.error("Instance pre-migration failed, trying to revert"
7629 " disk status: %s", msg)
7630 self.feedback_fn("Pre-migration failed, aborting")
7631 self._AbortMigration()
7632 self._RevertDiskStatus()
7633 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
7634 (instance.name, msg))
7636 self.feedback_fn("* migrating instance to %s" % target_node)
7637 result = self.rpc.call_instance_migrate(source_node, instance,
7638 self.nodes_ip[target_node],
7639 self.live)
7640 msg = result.fail_msg
7641 if msg:
7642 logging.error("Instance migration failed, trying to revert"
7643 " disk status: %s", msg)
7644 self.feedback_fn("Migration failed, aborting")
7645 self._AbortMigration()
7646 self._RevertDiskStatus()
7647 raise errors.OpExecError("Could not migrate instance %s: %s" %
7648 (instance.name, msg))
7650 self.feedback_fn("* starting memory transfer")
7651 last_feedback = time.time()
7652 while True:
7653 result = self.rpc.call_instance_get_migration_status(source_node,
7654 instance)
7655 msg = result.fail_msg
7656 ms = result.payload # MigrationStatus instance
7657 if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
7658 logging.error("Instance migration failed, trying to revert"
7659 " disk status: %s", msg)
7660 self.feedback_fn("Migration failed, aborting")
7661 self._AbortMigration()
7662 self._RevertDiskStatus()
7663 raise errors.OpExecError("Could not migrate instance %s: %s" %
7664 (instance.name, msg))
7666 if result.payload.status != constants.HV_MIGRATION_ACTIVE:
7667 self.feedback_fn("* memory transfer complete")
7670 if (utils.TimeoutExpired(last_feedback,
7671 self._MIGRATION_FEEDBACK_INTERVAL) and
7672 ms.transferred_ram is not None):
7673 mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
7674 self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
7675 last_feedback = time.time()
7677 time.sleep(self._MIGRATION_POLL_INTERVAL)
7679 result = self.rpc.call_instance_finalize_migration_src(source_node,
7680 instance,
7681 True,
7682 self.live)
7683 msg = result.fail_msg
7684 if msg:
7685 logging.error("Instance migration succeeded, but finalization failed"
7686 " on the source node: %s", msg)
7687 raise errors.OpExecError("Could not finalize instance migration: %s" %
7688 msg)
7690 instance.primary_node = target_node
7692 # distribute new instance config to the other nodes
7693 self.cfg.Update(instance, self.feedback_fn)
7695 result = self.rpc.call_instance_finalize_migration_dst(target_node,
7696 instance,
7697 migration_info,
7698 True)
7699 msg = result.fail_msg
7700 if msg:
7701 logging.error("Instance migration succeeded, but finalization failed"
7702 " on the target node: %s", msg)
7703 raise errors.OpExecError("Could not finalize instance migration: %s" %
7704 msg)
7706 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
7707 self._EnsureSecondary(source_node)
7708 self._WaitUntilSync()
7709 self._GoStandalone()
7710 self._GoReconnect(False)
7711 self._WaitUntilSync()
7713 self.feedback_fn("* done")
7715 def _ExecFailover(self):
7716 """Failover an instance.
7718 The failover is done by shutting it down on its present node and
7719 starting it on the secondary.
7722 instance = self.instance
7723 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
7725 source_node = instance.primary_node
7726 target_node = self.target_node
7728 if instance.admin_up:
7729 self.feedback_fn("* checking disk consistency between source and target")
7730 for dev in instance.disks:
7731 # for drbd, these are drbd over lvm
7732 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
7733 if primary_node.offline:
7734 self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
7736 (primary_node.name, dev.iv_name, target_node))
7737 elif not self.ignore_consistency:
7738 raise errors.OpExecError("Disk %s is degraded on target node,"
7739 " aborting failover" % dev.iv_name)
7741 self.feedback_fn("* not checking disk consistency as instance is not"
7744 self.feedback_fn("* shutting down instance on source node")
7745 logging.info("Shutting down instance %s on node %s",
7746 instance.name, source_node)
7748 result = self.rpc.call_instance_shutdown(source_node, instance,
7749 self.shutdown_timeout)
7750 msg = result.fail_msg
7751 if msg:
7752 if self.ignore_consistency or primary_node.offline:
7753 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
7754 " proceeding anyway; please make sure node"
7755 " %s is down; error details: %s",
7756 instance.name, source_node, source_node, msg)
7757 else:
7758 raise errors.OpExecError("Could not shutdown instance %s on"
7759 " node %s: %s" %
7760 (instance.name, source_node, msg))
7762 self.feedback_fn("* deactivating the instance's disks on source node")
7763 if not _ShutdownInstanceDisks(self.lu, instance, ignore_primary=True):
7764 raise errors.OpExecError("Can't shut down the instance's disks")
7766 instance.primary_node = target_node
7767 # distribute new instance config to the other nodes
7768 self.cfg.Update(instance, self.feedback_fn)
7770 # Only start the instance if it's marked as up
7771 if instance.admin_up:
7772 self.feedback_fn("* activating the instance's disks on target node %s" %
7774 logging.info("Starting instance %s on node %s",
7775 instance.name, target_node)
7777 disks_ok, _ = _AssembleInstanceDisks(self.lu, instance,
7778 ignore_secondaries=True)
7780 _ShutdownInstanceDisks(self.lu, instance)
7781 raise errors.OpExecError("Can't activate the instance's disks")
7783 self.feedback_fn("* starting the instance on the target node %s" %
7785 result = self.rpc.call_instance_start(target_node, (instance, None, None),
7787 msg = result.fail_msg
7789 _ShutdownInstanceDisks(self.lu, instance)
7790 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7791 (instance.name, target_node, msg))
7793 def Exec(self, feedback_fn):
7794 """Perform the migration.
7797 self.feedback_fn = feedback_fn
7798 self.source_node = self.instance.primary_node
7800 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7801 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7802 self.target_node = self.instance.secondary_nodes[0]
7803 # Otherwise self.target_node has been populated either
7804 # directly, or through an iallocator.
7806 self.all_nodes = [self.source_node, self.target_node]
7807 self.nodes_ip = dict((name, node.secondary_ip) for (name, node)
7808 in self.cfg.GetMultiNodeInfo(self.all_nodes))
7811 feedback_fn("Failover instance %s" % self.instance.name)
7812 self._ExecFailover()
7814 feedback_fn("Migrating instance %s" % self.instance.name)
7817 return self._ExecCleanup()
7819 return self._ExecMigration()
7822 def _CreateBlockDev(lu, node, instance, device, force_create,
7823 info, force_open):
7824 """Create a tree of block devices on a given node.
7826 If this device type has to be created on secondaries, create it and
7827 all its children.
7829 If not, just recurse to children keeping the same 'force' value.
7831 @param lu: the lu on whose behalf we execute
7832 @param node: the node on which to create the device
7833 @type instance: L{objects.Instance}
7834 @param instance: the instance which owns the device
7835 @type device: L{objects.Disk}
7836 @param device: the device to create
7837 @type force_create: boolean
7838 @param force_create: whether to force creation of this device; this
7839 will be changed to True whenever we find a device which has
7840 the CreateOnSecondary() attribute
7841 @param info: the extra 'metadata' we should attach to the device
7842 (this will be represented as a LVM tag)
7843 @type force_open: boolean
7844 @param force_open: this parameter will be passed to the
7845 L{backend.BlockdevCreate} function where it specifies
7846 whether we run on primary or not, and it affects both
7847 the child assembly and the device's own Open() execution
7849 """
7850 if device.CreateOnSecondary():
7851 force_create = True
7853 if device.children:
7854 for child in device.children:
7855 _CreateBlockDev(lu, node, instance, child, force_create,
7856 info, force_open)
7858 if not force_create:
7859 return
7861 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
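# Illustrative sketch (simplified stand-ins, not the real objects.Disk API):
# _CreateBlockDev above recurses into children before creating the device
# itself, so a DRBD8 device is only assembled after both of its LVs exist.
#
#   class _FakeDev(object):
#     def __init__(self, name, children=()):
#       self.name = name
#       self.children = list(children)
#
#   def _create_tree(dev):
#     for child in dev.children:  # children first, as in _CreateBlockDev
#       _create_tree(child)
#     print "creating %s" % dev.name
#
#   _create_tree(_FakeDev("drbd8", [_FakeDev("data_lv"), _FakeDev("meta_lv")]))
#   # -> creating data_lv, creating meta_lv, creating drbd8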
7864 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7865 """Create a single block device on a given node.
7867 This will not recurse over children of the device, so they must be
7868 created in advance.
7870 @param lu: the lu on whose behalf we execute
7871 @param node: the node on which to create the device
7872 @type instance: L{objects.Instance}
7873 @param instance: the instance which owns the device
7874 @type device: L{objects.Disk}
7875 @param device: the device to create
7876 @param info: the extra 'metadata' we should attach to the device
7877 (this will be represented as a LVM tag)
7878 @type force_open: boolean
7879 @param force_open: this parameter will be passed to the
7880 L{backend.BlockdevCreate} function where it specifies
7881 whether we run on primary or not, and it affects both
7882 the child assembly and the device's own Open() execution
7884 """
7885 lu.cfg.SetDiskID(device, node)
7886 result = lu.rpc.call_blockdev_create(node, device, device.size,
7887 instance.name, force_open, info)
7888 result.Raise("Can't create block device %s on"
7889 " node %s for instance %s" % (device, node, instance.name))
7890 if device.physical_id is None:
7891 device.physical_id = result.payload
7894 def _GenerateUniqueNames(lu, exts):
7895 """Generate a suitable LV name.
7897 This will generate a logical volume name for the given instance.
7899 """
7900 results = []
7901 for val in exts:
7902 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7903 results.append("%s%s" % (new_id, val))
7905 return results
7907 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7908 iv_name, p_minor, s_minor):
7909 """Generate a drbd8 device complete with its children.
7912 assert len(vgnames) == len(names) == 2
7913 port = lu.cfg.AllocatePort()
7914 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7915 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7916 logical_id=(vgnames[0], names[0]))
7917 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
7918 logical_id=(vgnames[1], names[1]))
7919 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7920 logical_id=(primary, secondary, port,
7921 p_minor, s_minor,
7922 shared_secret),
7923 children=[dev_data, dev_meta],
7924 iv_name=iv_name)
7925 return drbd_dev
7928 def _GenerateDiskTemplate(lu, template_name,
7929 instance_name, primary_node,
7930 secondary_nodes, disk_info,
7931 file_storage_dir, file_driver,
7932 base_index, feedback_fn):
7933 """Generate the entire disk layout for a given template type.
7936 #TODO: compute space requirements
7938 vgname = lu.cfg.GetVGName()
7939 disk_count = len(disk_info)
7941 if template_name == constants.DT_DISKLESS:
7943 elif template_name == constants.DT_PLAIN:
7944 if len(secondary_nodes) != 0:
7945 raise errors.ProgrammerError("Wrong template configuration")
7947 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7948 for i in range(disk_count)])
7949 for idx, disk in enumerate(disk_info):
7950 disk_index = idx + base_index
7951 vg = disk.get(constants.IDISK_VG, vgname)
7952 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7953 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7954 size=disk[constants.IDISK_SIZE],
7955 logical_id=(vg, names[idx]),
7956 iv_name="disk/%d" % disk_index,
7957 mode=disk[constants.IDISK_MODE])
7958 disks.append(disk_dev)
7959 elif template_name == constants.DT_DRBD8:
7960 if len(secondary_nodes) != 1:
7961 raise errors.ProgrammerError("Wrong template configuration")
7962 remote_node = secondary_nodes[0]
7963 minors = lu.cfg.AllocateDRBDMinor(
7964 [primary_node, remote_node] * len(disk_info), instance_name)
7967 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7968 for i in range(disk_count)]):
7969 names.append(lv_prefix + "_data")
7970 names.append(lv_prefix + "_meta")
7971 for idx, disk in enumerate(disk_info):
7972 disk_index = idx + base_index
7973 data_vg = disk.get(constants.IDISK_VG, vgname)
7974 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7975 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7976 disk[constants.IDISK_SIZE],
7977 [data_vg, meta_vg],
7978 names[idx * 2:idx * 2 + 2],
7979 "disk/%d" % disk_index,
7980 minors[idx * 2], minors[idx * 2 + 1])
7981 disk_dev.mode = disk[constants.IDISK_MODE]
7982 disks.append(disk_dev)
7983 elif template_name == constants.DT_FILE:
7984 if len(secondary_nodes) != 0:
7985 raise errors.ProgrammerError("Wrong template configuration")
7987 opcodes.RequireFileStorage()
7989 for idx, disk in enumerate(disk_info):
7990 disk_index = idx + base_index
7991 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7992 size=disk[constants.IDISK_SIZE],
7993 iv_name="disk/%d" % disk_index,
7994 logical_id=(file_driver,
7995 "%s/disk%d" % (file_storage_dir,
7997 mode=disk[constants.IDISK_MODE])
7998 disks.append(disk_dev)
7999 elif template_name == constants.DT_SHARED_FILE:
8000 if len(secondary_nodes) != 0:
8001 raise errors.ProgrammerError("Wrong template configuration")
8003 opcodes.RequireSharedFileStorage()
8005 for idx, disk in enumerate(disk_info):
8006 disk_index = idx + base_index
8007 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
8008 size=disk[constants.IDISK_SIZE],
8009 iv_name="disk/%d" % disk_index,
8010 logical_id=(file_driver,
8011 "%s/disk%d" % (file_storage_dir,
8013 mode=disk[constants.IDISK_MODE])
8014 disks.append(disk_dev)
8015 elif template_name == constants.DT_BLOCK:
8016 if len(secondary_nodes) != 0:
8017 raise errors.ProgrammerError("Wrong template configuration")
8019 for idx, disk in enumerate(disk_info):
8020 disk_index = idx + base_index
8021 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
8022 size=disk[constants.IDISK_SIZE],
8023 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
8024 disk[constants.IDISK_ADOPT]),
8025 iv_name="disk/%d" % disk_index,
8026 mode=disk[constants.IDISK_MODE])
8027 disks.append(disk_dev)
8030 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
8034 def _GetInstanceInfoText(instance):
8035 """Compute that text that should be added to the disk's metadata.
8038 return "originstname+%s" % instance.name
8041 def _CalcEta(time_taken, written, total_size):
8042 """Calculates the ETA based on size written and total size.
8044 @param time_taken: The time taken so far
8045 @param written: amount written so far
8046 @param total_size: The total size of data to be written
8047 @return: The remaining time in seconds
8049 """
8050 avg_time = time_taken / float(written)
8051 return (total_size - written) * avg_time
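# A worked example of the extrapolation above (illustrative, values made up):
# if writing the first 256 of 1024 units took 30 seconds, the average is
# 30.0 / 256 seconds per unit, so the remaining 768 units need:
#
#   >>> _CalcEta(30.0, 256, 1024)
#   90.0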
8054 def _WipeDisks(lu, instance):
8055 """Wipes instance disks.
8057 @type lu: L{LogicalUnit}
8058 @param lu: the logical unit on whose behalf we execute
8059 @type instance: L{objects.Instance}
8060 @param instance: the instance whose disks we should wipe
8061 @return: the success of the wipe
8063 """
8064 node = instance.primary_node
8066 for device in instance.disks:
8067 lu.cfg.SetDiskID(device, node)
8069 logging.info("Pause sync of instance %s disks", instance.name)
8070 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
8072 for idx, success in enumerate(result.payload):
8073 if not success:
8074 logging.warn("pause-sync of instance %s for disks %d failed",
8075 instance.name, idx)
8077 try:
8078 for idx, device in enumerate(instance.disks):
8079 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
8080 # MAX_WIPE_CHUNK at max
8081 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
8082 constants.MIN_WIPE_CHUNK_PERCENT)
8083 # we _must_ make this an int, otherwise rounding errors will
8084 # occur
8085 wipe_chunk_size = int(wipe_chunk_size)
8087 lu.LogInfo("* Wiping disk %d", idx)
8088 logging.info("Wiping disk %d for instance %s, node %s using"
8089 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
8091 offset = 0
8092 size = device.size
8093 last_output = 0
8094 start_time = time.time()
8096 while offset < size:
8097 wipe_size = min(wipe_chunk_size, size - offset)
8098 logging.debug("Wiping disk %d, offset %s, chunk %s",
8099 idx, offset, wipe_size)
8100 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
8101 result.Raise("Could not wipe disk %d at offset %d for size %d" %
8102 (idx, offset, wipe_size))
8103 now = time.time()
8104 offset += wipe_size
8105 if now - last_output >= 60:
8106 eta = _CalcEta(now - start_time, offset, size)
8107 lu.LogInfo(" - done: %.1f%% ETA: %s" %
8108 (offset / float(size) * 100, utils.FormatSeconds(eta)))
8109 last_output = now
8110 finally:
8111 logging.info("Resume sync of instance %s disks", instance.name)
8113 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
8115 for idx, success in enumerate(result.payload):
8116 if not success:
8117 lu.LogWarning("Resume sync of disk %d failed, please have a"
8118 " look at the status and troubleshoot the issue", idx)
8119 logging.warn("resume-sync of instance %s for disks %d failed",
8120 instance.name, idx)
8123 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
8124 """Create all disks for an instance.
8126 This abstracts away some work from AddInstance.
8128 @type lu: L{LogicalUnit}
8129 @param lu: the logical unit on whose behalf we execute
8130 @type instance: L{objects.Instance}
8131 @param instance: the instance whose disks we should create
8132 @type to_skip: list
8133 @param to_skip: list of indices to skip
8134 @type target_node: string
8135 @param target_node: if passed, overrides the target node for creation
8137 @return: the success of the creation
8139 """
8140 info = _GetInstanceInfoText(instance)
8141 if target_node is None:
8142 pnode = instance.primary_node
8143 all_nodes = instance.all_nodes
8144 else:
8145 pnode = target_node
8146 all_nodes = [pnode]
8148 if instance.disk_template in constants.DTS_FILEBASED:
8149 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8150 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
8152 result.Raise("Failed to create directory '%s' on"
8153 " node %s" % (file_storage_dir, pnode))
8155 # Note: this needs to be kept in sync with adding of disks in
8156 # LUInstanceSetParams
8157 for idx, device in enumerate(instance.disks):
8158 if to_skip and idx in to_skip:
8159 continue
8160 logging.info("Creating volume %s for instance %s",
8161 device.iv_name, instance.name)
8163 for node in all_nodes:
8164 f_create = node == pnode
8165 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
8168 def _RemoveDisks(lu, instance, target_node=None):
8169 """Remove all disks for an instance.
8171 This abstracts away some work from `AddInstance()` and
8172 `RemoveInstance()`. Note that in case some of the devices couldn't
8173 be removed, the removal will continue with the other ones (compare
8174 with `_CreateDisks()`).
8176 @type lu: L{LogicalUnit}
8177 @param lu: the logical unit on whose behalf we execute
8178 @type instance: L{objects.Instance}
8179 @param instance: the instance whose disks we should remove
8180 @type target_node: string
8181 @param target_node: used to override the node on which to remove the disks
8183 @return: the success of the removal
8185 """
8186 logging.info("Removing block devices for instance %s", instance.name)
8188 all_result = True
8189 for device in instance.disks:
8190 if target_node:
8191 edata = [(target_node, device)]
8192 else:
8193 edata = device.ComputeNodeTree(instance.primary_node)
8194 for node, disk in edata:
8195 lu.cfg.SetDiskID(disk, node)
8196 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
8197 if msg:
8198 lu.LogWarning("Could not remove block device %s on node %s,"
8199 " continuing anyway: %s", device.iv_name, node, msg)
8200 all_result = False
8202 if instance.disk_template == constants.DT_FILE:
8203 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
8204 if target_node:
8205 tgt = target_node
8206 else:
8207 tgt = instance.primary_node
8208 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
8209 if result.fail_msg:
8210 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
8211 file_storage_dir, instance.primary_node, result.fail_msg)
8212 all_result = False
8214 return all_result
8217 def _ComputeDiskSizePerVG(disk_template, disks):
8218 """Compute disk size requirements in the volume group
8221 def _compute(disks, payload):
8222 """Universal algorithm.
8227 vgs[disk[constants.IDISK_VG]] = \
8228 vgs.get(constants.IDISK_VG, 0) + disk[constants.IDISK_SIZE] + payload
8232 # Required free disk space as a function of disk and swap space
8234 constants.DT_DISKLESS: {},
8235 constants.DT_PLAIN: _compute(disks, 0),
8236 # 128 MB are added for drbd metadata for each disk
8237 constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
8238 constants.DT_FILE: {},
8239 constants.DT_SHARED_FILE: {},
8242 if disk_template not in req_size_dict:
8243 raise errors.ProgrammerError("Disk template '%s' size requirement"
8244 " is unknown" % disk_template)
8246 return req_size_dict[disk_template]
8249 def _ComputeDiskSize(disk_template, disks):
8250 """Compute disk size requirements in the volume group
8253 # Required free disk space as a function of disk and swap space
8255 constants.DT_DISKLESS: None,
8256 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
8257 # 128 MB are added for drbd metadata for each disk
8259 sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
8260 constants.DT_FILE: None,
8261 constants.DT_SHARED_FILE: 0,
8262 constants.DT_BLOCK: 0,
8265 if disk_template not in req_size_dict:
8266 raise errors.ProgrammerError("Disk template '%s' size requirement"
8267 " is unknown" % disk_template)
8269 return req_size_dict[disk_template]
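# A worked example for the sizing helpers above (illustrative): two disks of
# 1024 and 2048 MiB under the drbd8 template each pay the DRBD_META_SIZE
# overhead (128 MB per the comments above), so the total requirement is
# 1024 + 2048 + 2 * 128 = 3328 MiB:
#
#   >>> _ComputeDiskSize(constants.DT_DRBD8, [{constants.IDISK_SIZE: 1024},
#   ...                                       {constants.IDISK_SIZE: 2048}])
#   3328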
8272 def _FilterVmNodes(lu, nodenames):
8273 """Filters out non-vm_capable nodes from a list.
8275 @type lu: L{LogicalUnit}
8276 @param lu: the logical unit for which we check
8277 @type nodenames: list
8278 @param nodenames: the list of nodes on which we should check
8280 @return: the list of vm-capable nodes
8282 """
8283 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
8284 return [name for name in nodenames if name not in vm_nodes]
8287 def _CheckHVParams(lu, nodenames, hvname, hvparams):
8288 """Hypervisor parameter validation.
8290 This function abstracts the hypervisor parameter validation to be
8291 used in both instance create and instance modify.
8293 @type lu: L{LogicalUnit}
8294 @param lu: the logical unit for which we check
8295 @type nodenames: list
8296 @param nodenames: the list of nodes on which we should check
8297 @type hvname: string
8298 @param hvname: the name of the hypervisor we should use
8299 @type hvparams: dict
8300 @param hvparams: the parameters which we need to check
8301 @raise errors.OpPrereqError: if the parameters are not valid
8303 """
8304 nodenames = _FilterVmNodes(lu, nodenames)
8306 cluster = lu.cfg.GetClusterInfo()
8307 hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
8309 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
8310 for node in nodenames:
8311 info = hvinfo[node]
8312 if info.offline:
8313 continue
8314 info.Raise("Hypervisor parameter validation failed on node %s" % node)
8317 def _CheckOSParams(lu, required, nodenames, osname, osparams):
8318 """OS parameters validation.
8320 @type lu: L{LogicalUnit}
8321 @param lu: the logical unit for which we check
8322 @type required: boolean
8323 @param required: whether the validation should fail if the OS is not
8324 found
8325 @type nodenames: list
8326 @param nodenames: the list of nodes on which we should check
8327 @type osname: string
8328 @param osname: the name of the OS we should use
8329 @type osparams: dict
8330 @param osparams: the parameters which we need to check
8331 @raise errors.OpPrereqError: if the parameters are not valid
8333 """
8334 nodenames = _FilterVmNodes(lu, nodenames)
8335 result = lu.rpc.call_os_validate(nodenames, required, osname,
8336 [constants.OS_VALIDATE_PARAMETERS],
8337 osparams)
8338 for node, nres in result.items():
8339 # we don't check for offline cases since this should be run only
8340 # against the master node and/or an instance's nodes
8341 nres.Raise("OS Parameters validation failed on node %s" % node)
8342 if not nres.payload:
8343 lu.LogInfo("OS %s not found on node %s, validation skipped",
8344 osname, node)
8347 class LUInstanceCreate(LogicalUnit):
8348 """Create an instance.
8351 HPATH = "instance-add"
8352 HTYPE = constants.HTYPE_INSTANCE
8355 def CheckArguments(self):
8359 # do not require name_check to ease forward/backward compatibility
8361 if self.op.no_install and self.op.start:
8362 self.LogInfo("No-installation mode selected, disabling startup")
8363 self.op.start = False
8364 # validate/normalize the instance name
8365 self.op.instance_name = \
8366 netutils.Hostname.GetNormalizedName(self.op.instance_name)
8368 if self.op.ip_check and not self.op.name_check:
8369 # TODO: make the ip check more flexible and not depend on the name check
8370 raise errors.OpPrereqError("Cannot do IP address check without a name"
8371 " check", errors.ECODE_INVAL)
8373 # check nics' parameter names
8374 for nic in self.op.nics:
8375 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
8377 # check disks. parameter names and consistent adopt/no-adopt strategy
8378 has_adopt = has_no_adopt = False
8379 for disk in self.op.disks:
8380 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
8381 if constants.IDISK_ADOPT in disk:
8382 has_adopt = True
8383 else:
8384 has_no_adopt = True
8385 if has_adopt and has_no_adopt:
8386 raise errors.OpPrereqError("Either all disks are adopted or none is",
8387 errors.ECODE_INVAL)
8388 if has_adopt:
8389 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
8390 raise errors.OpPrereqError("Disk adoption is not supported for the"
8391 " '%s' disk template" %
8392 self.op.disk_template,
8393 errors.ECODE_INVAL)
8394 if self.op.iallocator is not None:
8395 raise errors.OpPrereqError("Disk adoption not allowed with an"
8396 " iallocator script", errors.ECODE_INVAL)
8397 if self.op.mode == constants.INSTANCE_IMPORT:
8398 raise errors.OpPrereqError("Disk adoption not allowed for"
8399 " instance import", errors.ECODE_INVAL)
8400 else:
8401 if self.op.disk_template in constants.DTS_MUST_ADOPT:
8402 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
8403 " but no 'adopt' parameter given" %
8404 self.op.disk_template,
8405 errors.ECODE_INVAL)
8407 self.adopt_disks = has_adopt
8409 # instance name verification
8410 if self.op.name_check:
8411 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
8412 self.op.instance_name = self.hostname1.name
8413 # used in CheckPrereq for ip ping check
8414 self.check_ip = self.hostname1.ip
8415 else:
8416 self.check_ip = None
8418 # file storage checks
8419 if (self.op.file_driver and
8420 not self.op.file_driver in constants.FILE_DRIVER):
8421 raise errors.OpPrereqError("Invalid file driver name '%s'" %
8422 self.op.file_driver, errors.ECODE_INVAL)
8424 if self.op.disk_template == constants.DT_FILE:
8425 opcodes.RequireFileStorage()
8426 elif self.op.disk_template == constants.DT_SHARED_FILE:
8427 opcodes.RequireSharedFileStorage()
8429 ### Node/iallocator related checks
8430 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
8432 if self.op.pnode is not None:
8433 if self.op.disk_template in constants.DTS_INT_MIRROR:
8434 if self.op.snode is None:
8435 raise errors.OpPrereqError("The networked disk templates need"
8436 " a mirror node", errors.ECODE_INVAL)
8438 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
8440 self.op.snode = None
8442 self._cds = _GetClusterDomainSecret()
8444 if self.op.mode == constants.INSTANCE_IMPORT:
8445 # On import force_variant must be True, because if we forced it at
8446 # initial install, our only chance when importing it back is that it
8447 # works again!
8448 self.op.force_variant = True
8450 if self.op.no_install:
8451 self.LogInfo("No-installation mode has no effect during import")
8453 elif self.op.mode == constants.INSTANCE_CREATE:
8454 if self.op.os_type is None:
8455 raise errors.OpPrereqError("No guest OS specified",
8457 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
8458 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
8459 " installation" % self.op.os_type,
8461 if self.op.disk_template is None:
8462 raise errors.OpPrereqError("No disk template specified",
8465 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8466 # Check handshake to ensure both clusters have the same domain secret
8467 src_handshake = self.op.source_handshake
8468 if not src_handshake:
8469 raise errors.OpPrereqError("Missing source handshake",
8472 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
8475 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
8478 # Load and check source CA
8479 self.source_x509_ca_pem = self.op.source_x509_ca
8480 if not self.source_x509_ca_pem:
8481 raise errors.OpPrereqError("Missing source X509 CA",
8485 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
8487 except OpenSSL.crypto.Error, err:
8488 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
8489 (err, ), errors.ECODE_INVAL)
8491 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
8492 if errcode is not None:
8493 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
8496 self.source_x509_ca = cert
8498 src_instance_name = self.op.source_instance_name
8499 if not src_instance_name:
8500 raise errors.OpPrereqError("Missing source instance name",
8503 self.source_instance_name = \
8504 netutils.GetHostname(name=src_instance_name).name
8507 raise errors.OpPrereqError("Invalid instance creation mode %r" %
8508 self.op.mode, errors.ECODE_INVAL)
8510 def ExpandNames(self):
8511 """ExpandNames for CreateInstance.
8513 Figure out the right locks for instance creation.
8515 """
8516 self.needed_locks = {}
8518 instance_name = self.op.instance_name
8519 # this is just a preventive check, but someone might still add this
8520 # instance in the meantime, and creation will fail at lock-add time
8521 if instance_name in self.cfg.GetInstanceList():
8522 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
8523 instance_name, errors.ECODE_EXISTS)
8525 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
8527 if self.op.iallocator:
8528 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8529 else:
8530 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
8531 nodelist = [self.op.pnode]
8532 if self.op.snode is not None:
8533 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
8534 nodelist.append(self.op.snode)
8535 self.needed_locks[locking.LEVEL_NODE] = nodelist
8537 # in case of import lock the source node too
8538 if self.op.mode == constants.INSTANCE_IMPORT:
8539 src_node = self.op.src_node
8540 src_path = self.op.src_path
8542 if src_path is None:
8543 self.op.src_path = src_path = self.op.instance_name
8545 if src_node is None:
8546 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
8547 self.op.src_node = None
8548 if os.path.isabs(src_path):
8549 raise errors.OpPrereqError("Importing an instance from a path"
8550 " requires a source node option",
8553 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
8554 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
8555 self.needed_locks[locking.LEVEL_NODE].append(src_node)
8556 if not os.path.isabs(src_path):
8557 self.op.src_path = src_path = \
8558 utils.PathJoin(constants.EXPORT_DIR, src_path)
8560 def _RunAllocator(self):
8561 """Run the allocator based on input opcode.
8564 nics = [n.ToDict() for n in self.nics]
8565 ial = IAllocator(self.cfg, self.rpc,
8566 mode=constants.IALLOCATOR_MODE_ALLOC,
8567 name=self.op.instance_name,
8568 disk_template=self.op.disk_template,
8569 tags=self.op.tags,
8570 os=self.op.os_type,
8571 vcpus=self.be_full[constants.BE_VCPUS],
8572 memory=self.be_full[constants.BE_MEMORY],
8573 disks=self.disks,
8574 nics=nics,
8575 hypervisor=self.op.hypervisor,
8576 )
8578 ial.Run(self.op.iallocator)
8580 if not ial.success:
8581 raise errors.OpPrereqError("Can't compute nodes using"
8582 " iallocator '%s': %s" %
8583 (self.op.iallocator, ial.info),
8584 errors.ECODE_NORES)
8585 if len(ial.result) != ial.required_nodes:
8586 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8587 " of nodes (%s), required %s" %
8588 (self.op.iallocator, len(ial.result),
8589 ial.required_nodes), errors.ECODE_FAULT)
8590 self.op.pnode = ial.result[0]
8591 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
8592 self.op.instance_name, self.op.iallocator,
8593 utils.CommaJoin(ial.result))
8594 if ial.required_nodes == 2:
8595 self.op.snode = ial.result[1]
8597 def BuildHooksEnv(self):
8598 """Build hooks env.
8600 This runs on master, primary and secondary nodes of the instance.
8602 """
8603 env = {
8604 "ADD_MODE": self.op.mode,
8605 }
8606 if self.op.mode == constants.INSTANCE_IMPORT:
8607 env["SRC_NODE"] = self.op.src_node
8608 env["SRC_PATH"] = self.op.src_path
8609 env["SRC_IMAGES"] = self.src_images
8611 env.update(_BuildInstanceHookEnv(
8612 name=self.op.instance_name,
8613 primary_node=self.op.pnode,
8614 secondary_nodes=self.secondaries,
8615 status=self.op.start,
8616 os_type=self.op.os_type,
8617 memory=self.be_full[constants.BE_MEMORY],
8618 vcpus=self.be_full[constants.BE_VCPUS],
8619 nics=_NICListToTuple(self, self.nics),
8620 disk_template=self.op.disk_template,
8621 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
8622 for d in self.disks],
8623 bep=self.be_full,
8624 hvp=self.hv_full,
8625 hypervisor_name=self.op.hypervisor,
8626 ))
8628 return env
8631 def BuildHooksNodes(self):
8632 """Build hooks nodes.
8634 """
8635 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
8636 return (nl, nl)
8638 def _ReadExportInfo(self):
8639 """Reads the export information from disk.
8641 It will override the opcode source node and path with the actual
8642 information, if these two were not specified before.
8644 @return: the export information
8646 """
8647 assert self.op.mode == constants.INSTANCE_IMPORT
8649 src_node = self.op.src_node
8650 src_path = self.op.src_path
8652 if src_node is None:
8653 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
8654 exp_list = self.rpc.call_export_list(locked_nodes)
8655 found = False
8656 for node in exp_list:
8657 if exp_list[node].fail_msg:
8658 continue
8659 if src_path in exp_list[node].payload:
8660 found = True
8661 self.op.src_node = src_node = node
8662 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
8663 src_path)
8664 break
8665 if not found:
8666 raise errors.OpPrereqError("No export found for relative path %s" %
8667 src_path, errors.ECODE_INVAL)
8669 _CheckNodeOnline(self, src_node)
8670 result = self.rpc.call_export_info(src_node, src_path)
8671 result.Raise("No export or invalid export found in dir %s" % src_path)
8673 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
8674 if not export_info.has_section(constants.INISECT_EXP):
8675 raise errors.ProgrammerError("Corrupted export config",
8676 errors.ECODE_ENVIRON)
8678 ei_version = export_info.get(constants.INISECT_EXP, "version")
8679 if (int(ei_version) != constants.EXPORT_VERSION):
8680 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
8681 (ei_version, constants.EXPORT_VERSION),
8682 errors.ECODE_ENVIRON)
8684 return export_info
8685 def _ReadExportParams(self, einfo):
8686 """Use export parameters as defaults.
8688 In case the opcode doesn't specify (as in override) some instance
8689 parameters, then try to use them from the export information, if
8690 that declares them.
8692 """
8693 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
8695 if self.op.disk_template is None:
8696 if einfo.has_option(constants.INISECT_INS, "disk_template"):
8697 self.op.disk_template = einfo.get(constants.INISECT_INS,
8698 "disk_template")
8699 if self.op.disk_template not in constants.DISK_TEMPLATES:
8700 raise errors.OpPrereqError("Disk template specified in configuration"
8701 " file is not one of the allowed values:"
8702 " %s" % " ".join(constants.DISK_TEMPLATES))
8704 raise errors.OpPrereqError("No disk template specified and the export"
8705 " is missing the disk_template information",
8708 if not self.op.disks:
8710 # TODO: import the disk iv_name too
8711 for idx in range(constants.MAX_DISKS):
8712 if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
8713 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
8714 disks.append({constants.IDISK_SIZE: disk_sz})
8715 self.op.disks = disks
8716 if not disks and self.op.disk_template != constants.DT_DISKLESS:
8717 raise errors.OpPrereqError("No disk info specified and the export"
8718 " is missing the disk information",
8721 if not self.op.nics:
8722 nics = []
8723 for idx in range(constants.MAX_NICS):
8724 if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
8725 ndict = {}
8726 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
8727 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
8728 ndict[name] = v
8729 nics.append(ndict)
8730 else:
8731 break
8733 self.op.nics = nics
8734 if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
8735 self.op.tags = einfo.get(constants.INISECT_INS, "tags").split()
8737 if (self.op.hypervisor is None and
8738 einfo.has_option(constants.INISECT_INS, "hypervisor")):
8739 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
8741 if einfo.has_section(constants.INISECT_HYP):
8742 # use the export parameters but do not override the ones
8743 # specified by the user
8744 for name, value in einfo.items(constants.INISECT_HYP):
8745 if name not in self.op.hvparams:
8746 self.op.hvparams[name] = value
8748 if einfo.has_section(constants.INISECT_BEP):
8749 # use the parameters, without overriding
8750 for name, value in einfo.items(constants.INISECT_BEP):
8751 if name not in self.op.beparams:
8752 self.op.beparams[name] = value
8754 # try to read the parameters old style, from the main section
8755 for name in constants.BES_PARAMETERS:
8756 if (name not in self.op.beparams and
8757 einfo.has_option(constants.INISECT_INS, name)):
8758 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
8760 if einfo.has_section(constants.INISECT_OSP):
8761 # use the parameters, without overriding
8762 for name, value in einfo.items(constants.INISECT_OSP):
8763 if name not in self.op.osparams:
8764 self.op.osparams[name] = value
8766 def _RevertToDefaults(self, cluster):
8767 """Revert the instance parameters to the default values.
8771 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
8772 for name in self.op.hvparams.keys():
8773 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
8774 del self.op.hvparams[name]
8776 be_defs = cluster.SimpleFillBE({})
8777 for name in self.op.beparams.keys():
8778 if name in be_defs and be_defs[name] == self.op.beparams[name]:
8779 del self.op.beparams[name]
8781 nic_defs = cluster.SimpleFillNIC({})
8782 for nic in self.op.nics:
8783 for name in constants.NICS_PARAMETERS:
8784 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
8785 del nic[name]
8787 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
8788 for name in self.op.osparams.keys():
8789 if name in os_defs and os_defs[name] == self.op.osparams[name]:
8790 del self.op.osparams[name]
8792 def _CalculateFileStorageDir(self):
8793 """Calculate final instance file storage dir.
8796 # file storage dir calculation/check
8797 self.instance_file_storage_dir = None
8798 if self.op.disk_template in constants.DTS_FILEBASED:
8799 # build the full file storage dir path
8800 joinargs = []
8802 if self.op.disk_template == constants.DT_SHARED_FILE:
8803 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8804 else:
8805 get_fsd_fn = self.cfg.GetFileStorageDir
8807 cfg_storagedir = get_fsd_fn()
8808 if not cfg_storagedir:
8809 raise errors.OpPrereqError("Cluster file storage dir not defined")
8810 joinargs.append(cfg_storagedir)
8812 if self.op.file_storage_dir is not None:
8813 joinargs.append(self.op.file_storage_dir)
8815 joinargs.append(self.op.instance_name)
8817 # pylint: disable=W0142
8818 self.instance_file_storage_dir = utils.PathJoin(*joinargs)
8820 def CheckPrereq(self):
8821 """Check prerequisites.
8824 self._CalculateFileStorageDir()
8826 if self.op.mode == constants.INSTANCE_IMPORT:
8827 export_info = self._ReadExportInfo()
8828 self._ReadExportParams(export_info)
8830 if (not self.cfg.GetVGName() and
8831 self.op.disk_template not in constants.DTS_NOT_LVM):
8832 raise errors.OpPrereqError("Cluster does not support lvm-based"
8833 " instances", errors.ECODE_STATE)
8835 if (self.op.hypervisor is None or
8836 self.op.hypervisor == constants.VALUE_AUTO):
8837 self.op.hypervisor = self.cfg.GetHypervisorType()
8839 cluster = self.cfg.GetClusterInfo()
8840 enabled_hvs = cluster.enabled_hypervisors
8841 if self.op.hypervisor not in enabled_hvs:
8842 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8843 " cluster (%s)" % (self.op.hypervisor,
8844 ",".join(enabled_hvs)),
8847 # Check tag validity
8848 for tag in self.op.tags:
8849 objects.TaggableObject.ValidateTag(tag)
8851 # check hypervisor parameter syntax (locally)
8852 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8853 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8854 self.op.hvparams)
8855 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8856 hv_type.CheckParameterSyntax(filled_hvp)
8857 self.hv_full = filled_hvp
8858 # check that we don't specify global parameters on an instance
8859 _CheckGlobalHvParams(self.op.hvparams)
8861 # fill and remember the beparams dict
8862 default_beparams = cluster.beparams[constants.PP_DEFAULT]
8863 for param, value in self.op.beparams.iteritems():
8864 if value == constants.VALUE_AUTO:
8865 self.op.beparams[param] = default_beparams[param]
8866 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8867 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8869 # build os parameters
8870 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8872 # now that hvp/bep are in final format, let's reset to defaults,
8873 # if told to do so
8874 if self.op.identify_defaults:
8875 self._RevertToDefaults(cluster)
8877 # NIC buildup
8878 self.nics = []
8879 for idx, nic in enumerate(self.op.nics):
8880 nic_mode_req = nic.get(constants.INIC_MODE, None)
8881 nic_mode = nic_mode_req
8882 if nic_mode is None or nic_mode == constants.VALUE_AUTO:
8883 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8885 # in routed mode, for the first nic, the default ip is 'auto'
8886 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8887 default_ip_mode = constants.VALUE_AUTO
8888 else:
8889 default_ip_mode = constants.VALUE_NONE
8891 # ip validity checks
8892 ip = nic.get(constants.INIC_IP, default_ip_mode)
8893 if ip is None or ip.lower() == constants.VALUE_NONE:
8894 nic_ip = None
8895 elif ip.lower() == constants.VALUE_AUTO:
8896 if not self.op.name_check:
8897 raise errors.OpPrereqError("IP address set to auto but name checks"
8898 " have been skipped",
8900 nic_ip = self.hostname1.ip
8902 if not netutils.IPAddress.IsValid(ip):
8903 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8907 # TODO: check the ip address for uniqueness
8908 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8909 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8912 # MAC address verification
8913 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8914 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8915 mac = utils.NormalizeAndValidateMac(mac)
8917 try:
8918 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8919 except errors.ReservationError:
8920 raise errors.OpPrereqError("MAC address %s already in use"
8921 " in cluster" % mac,
8922 errors.ECODE_NOTUNIQUE)
8924 # Build nic parameters
8925 link = nic.get(constants.INIC_LINK, None)
8926 if link == constants.VALUE_AUTO:
8927 link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
8928 nicparams = {}
8929 if nic_mode_req:
8930 nicparams[constants.NIC_MODE] = nic_mode
8931 if link:
8932 nicparams[constants.NIC_LINK] = link
8934 check_params = cluster.SimpleFillNIC(nicparams)
8935 objects.NIC.CheckParameterSyntax(check_params)
8936 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8938 # disk checks/pre-build
8939 default_vg = self.cfg.GetVGName()
8940 self.disks = []
8941 for disk in self.op.disks:
8942 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8943 if mode not in constants.DISK_ACCESS_SET:
8944 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8945 mode, errors.ECODE_INVAL)
8946 size = disk.get(constants.IDISK_SIZE, None)
8947 if size is None:
8948 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8949 try:
8950 size = int(size)
8951 except (TypeError, ValueError):
8952 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8953 errors.ECODE_INVAL)
8955 data_vg = disk.get(constants.IDISK_VG, default_vg)
8956 new_disk = {
8957 constants.IDISK_SIZE: size,
8958 constants.IDISK_MODE: mode,
8959 constants.IDISK_VG: data_vg,
8960 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8961 }
8962 if constants.IDISK_ADOPT in disk:
8963 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8964 self.disks.append(new_disk)
8966 if self.op.mode == constants.INSTANCE_IMPORT:
8967 disk_images = []
8968 for idx in range(len(self.disks)):
8969 option = "disk%d_dump" % idx
8970 if export_info.has_option(constants.INISECT_INS, option):
8971 # FIXME: are the old os-es, disk sizes, etc. useful?
8972 export_name = export_info.get(constants.INISECT_INS, option)
8973 image = utils.PathJoin(self.op.src_path, export_name)
8974 disk_images.append(image)
8975 else:
8976 disk_images.append(False)
8978 self.src_images = disk_images
8980 old_name = export_info.get(constants.INISECT_INS, "name")
8981 if self.op.instance_name == old_name:
8982 for idx, nic in enumerate(self.nics):
8983 if nic.mac == constants.VALUE_AUTO:
8984 nic_mac_ini = "nic%d_mac" % idx
8985 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8987 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8989 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8990 if self.op.ip_check:
8991 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8992 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8993 (self.check_ip, self.op.instance_name),
8994 errors.ECODE_NOTUNIQUE)
8996 #### mac address generation
8997 # By generating here the mac address both the allocator and the hooks get
8998 # the real final mac address rather than the 'auto' or 'generate' value.
8999 # There is a race condition between the generation and the instance object
9000 # creation, which means that we know the mac is valid now, but we're not
9001 # sure it will be when we actually add the instance. If things go bad
9002 # adding the instance will abort because of a duplicate mac, and the
9003 # creation job will fail.
9004 for nic in self.nics:
9005 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9006 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
9010 if self.op.iallocator is not None:
9011 self._RunAllocator()
9013 #### node related checks
9015 # check primary node
9016 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
9017 assert self.pnode is not None, \
9018 "Cannot retrieve locked node %s" % self.op.pnode
9020 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
9021 pnode.name, errors.ECODE_STATE)
9023 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
9024 pnode.name, errors.ECODE_STATE)
9025 if not pnode.vm_capable:
9026 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
9027 " '%s'" % pnode.name, errors.ECODE_STATE)
9029 self.secondaries = []
9031 # mirror node verification
9032 if self.op.disk_template in constants.DTS_INT_MIRROR:
9033 if self.op.snode == pnode.name:
9034 raise errors.OpPrereqError("The secondary node cannot be the"
9035 " primary node", errors.ECODE_INVAL)
9036 _CheckNodeOnline(self, self.op.snode)
9037 _CheckNodeNotDrained(self, self.op.snode)
9038 _CheckNodeVmCapable(self, self.op.snode)
9039 self.secondaries.append(self.op.snode)
9041 nodenames = [pnode.name] + self.secondaries
9043 if not self.adopt_disks:
9044 # Check lv size requirements, if not adopting
9045 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
9046 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
9048 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
9049 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
9050 disk[constants.IDISK_ADOPT])
9051 for disk in self.disks])
9052 if len(all_lvs) != len(self.disks):
9053 raise errors.OpPrereqError("Duplicate volume names given for adoption",
9055 for lv_name in all_lvs:
9057 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
9058 # to ReserveLV uses the same syntax
9059 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
9060 except errors.ReservationError:
9061 raise errors.OpPrereqError("LV named %s used by another instance" %
9062 lv_name, errors.ECODE_NOTUNIQUE)
9064 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
9065 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
9067 node_lvs = self.rpc.call_lv_list([pnode.name],
9068 vg_names.payload.keys())[pnode.name]
9069 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
9070 node_lvs = node_lvs.payload
9072 delta = all_lvs.difference(node_lvs.keys())
9073 if delta:
9074 raise errors.OpPrereqError("Missing logical volume(s): %s" %
9075 utils.CommaJoin(delta),
9076 errors.ECODE_INVAL)
9077 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
9078 if online_lvs:
9079 raise errors.OpPrereqError("Online logical volumes found, cannot"
9080 " adopt: %s" % utils.CommaJoin(online_lvs),
9081 errors.ECODE_STATE)
9082 # update the size of disk based on what is found
9083 for dsk in self.disks:
9084 dsk[constants.IDISK_SIZE] = \
9085 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
9086 dsk[constants.IDISK_ADOPT])][0]))
9088 elif self.op.disk_template == constants.DT_BLOCK:
9089 # Normalize and de-duplicate device paths
9090 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
9091 for disk in self.disks])
9092 if len(all_disks) != len(self.disks):
9093 raise errors.OpPrereqError("Duplicate disk names given for adoption",
9095 baddisks = [d for d in all_disks
9096 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
9098 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
9099 " cannot be adopted" %
9100 (", ".join(baddisks),
9101 constants.ADOPTABLE_BLOCKDEV_ROOT),
9102 errors.ECODE_INVAL)
9104 node_disks = self.rpc.call_bdev_sizes([pnode.name],
9105 list(all_disks))[pnode.name]
9106 node_disks.Raise("Cannot get block device information from node %s" %
9108 node_disks = node_disks.payload
9109 delta = all_disks.difference(node_disks.keys())
9110 if delta:
9111 raise errors.OpPrereqError("Missing block device(s): %s" %
9112 utils.CommaJoin(delta),
9113 errors.ECODE_INVAL)
9114 for dsk in self.disks:
9115 dsk[constants.IDISK_SIZE] = \
9116 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
9118 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
9120 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
9121 # check OS parameters (remotely)
9122 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
9124 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
9126 # memory check on primary node
9127 if self.op.start:
9128 _CheckNodeFreeMemory(self, self.pnode.name,
9129 "creating instance %s" % self.op.instance_name,
9130 self.be_full[constants.BE_MEMORY],
9131 self.op.hypervisor)
9133 self.dry_run_result = list(nodenames)
9135 def Exec(self, feedback_fn):
9136 """Create and add the instance to the cluster.
9139 instance = self.op.instance_name
9140 pnode_name = self.pnode.name
9142 ht_kind = self.op.hypervisor
9143 if ht_kind in constants.HTS_REQ_PORT:
9144 network_port = self.cfg.AllocatePort()
9145 else:
9146 network_port = None
9148 disks = _GenerateDiskTemplate(self,
9149 self.op.disk_template,
9150 instance, pnode_name,
9151 self.secondaries,
9152 self.disks,
9153 self.instance_file_storage_dir,
9154 self.op.file_driver,
9155 0,
9156 feedback_fn)
9158 iobj = objects.Instance(name=instance, os=self.op.os_type,
9159 primary_node=pnode_name,
9160 nics=self.nics, disks=disks,
9161 disk_template=self.op.disk_template,
9162 admin_up=False,
9163 network_port=network_port,
9164 beparams=self.op.beparams,
9165 hvparams=self.op.hvparams,
9166 hypervisor=self.op.hypervisor,
9167 osparams=self.op.osparams,
9168 )
9170 if self.op.tags:
9171 for tag in self.op.tags:
9172 iobj.AddTag(tag)
9174 if self.adopt_disks:
9175 if self.op.disk_template == constants.DT_PLAIN:
9176 # rename LVs to the newly-generated names; we need to construct
9177 # 'fake' LV disks with the old data, plus the new unique_id
9178 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
9179 rename_to = []
9180 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
9181 rename_to.append(t_dsk.logical_id)
9182 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
9183 self.cfg.SetDiskID(t_dsk, pnode_name)
9184 result = self.rpc.call_blockdev_rename(pnode_name,
9185 zip(tmp_disks, rename_to))
9186 result.Raise("Failed to rename adoped LVs")
9188 feedback_fn("* creating instance disks...")
9190 _CreateDisks(self, iobj)
9191 except errors.OpExecError:
9192 self.LogWarning("Device creation failed, reverting...")
9194 _RemoveDisks(self, iobj)
9196 self.cfg.ReleaseDRBDMinors(instance)
9199 feedback_fn("adding instance %s to cluster config" % instance)
9201 self.cfg.AddInstance(iobj, self.proc.GetECId())
9203 # Declare that we don't want to remove the instance lock anymore, as we've
9204 # added the instance to the config
9205 del self.remove_locks[locking.LEVEL_INSTANCE]
9207 if self.op.mode == constants.INSTANCE_IMPORT:
9208 # Release unused nodes
9209 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
9210 else:
9211 # Release all nodes
9212 _ReleaseLocks(self, locking.LEVEL_NODE)
9214 disk_abort = False
9215 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
9216 feedback_fn("* wiping instance disks...")
9217 try:
9218 _WipeDisks(self, iobj)
9219 except errors.OpExecError, err:
9220 logging.exception("Wiping disks failed")
9221 self.LogWarning("Wiping instance disks failed (%s)", err)
9225 # Something is already wrong with the disks, don't do anything else
9227 elif self.op.wait_for_sync:
9228 disk_abort = not _WaitForSync(self, iobj)
9229 elif iobj.disk_template in constants.DTS_INT_MIRROR:
9230 # make sure the disks are not degraded (still sync-ing is ok)
9231 feedback_fn("* checking mirrors status")
9232 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
9233 else:
9234 disk_abort = False
9236 if disk_abort:
9237 _RemoveDisks(self, iobj)
9238 self.cfg.RemoveInstance(iobj.name)
9239 # Make sure the instance lock gets removed
9240 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
9241 raise errors.OpExecError("There are some degraded disks for"
9244 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
9245 if self.op.mode == constants.INSTANCE_CREATE:
9246 if not self.op.no_install:
9247 pause_sync = (iobj.disk_template in constants.DTS_INT_MIRROR and
9248 not self.op.wait_for_sync)
9249 if pause_sync:
9250 feedback_fn("* pausing disk sync to install instance OS")
9251 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9252 iobj.disks, True)
9253 for idx, success in enumerate(result.payload):
9254 if not success:
9255 logging.warn("pause-sync of instance %s for disk %d failed",
9256 instance, idx)
9258 feedback_fn("* running the instance OS create scripts...")
9259 # FIXME: pass debug option from opcode to backend
9260 os_add_result = \
9261 self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
9262 self.op.debug_level)
9263 if pause_sync:
9264 feedback_fn("* resuming disk sync")
9265 result = self.rpc.call_blockdev_pause_resume_sync(pnode_name,
9266 iobj.disks, False)
9267 for idx, success in enumerate(result.payload):
9268 if not success:
9269 logging.warn("resume-sync of instance %s for disk %d failed",
9270 instance, idx)
9272 os_add_result.Raise("Could not add os for instance %s"
9273 " on node %s" % (instance, pnode_name))
9275 elif self.op.mode == constants.INSTANCE_IMPORT:
9276 feedback_fn("* running the instance OS import scripts...")
9280 for idx, image in enumerate(self.src_images):
9284 # FIXME: pass debug option from opcode to backend
9285 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
9286 constants.IEIO_FILE, (image, ),
9287 constants.IEIO_SCRIPT,
9288 (iobj.disks[idx], idx),
9289 None)
9290 transfers.append(dt)
9292 import_result = \
9293 masterd.instance.TransferInstanceData(self, feedback_fn,
9294 self.op.src_node, pnode_name,
9295 self.pnode.secondary_ip,
9296 iobj, transfers)
9297 if not compat.all(import_result):
9298 self.LogWarning("Some disks for instance %s on node %s were not"
9299 " imported successfully" % (instance, pnode_name))
9301 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
9302 feedback_fn("* preparing remote import...")
9303 # The source cluster will stop the instance before attempting to make a
9304 # connection. In some cases stopping an instance can take a long time,
9305 # hence the shutdown timeout is added to the connection timeout.
9306 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
9307 self.op.source_shutdown_timeout)
9308 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9310 assert iobj.primary_node == self.pnode.name
9311 disk_results = \
9312 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
9313 self.source_x509_ca,
9314 self._cds, timeouts)
9315 if not compat.all(disk_results):
9316 # TODO: Should the instance still be started, even if some disks
9317 # failed to import (valid for local imports, too)?
9318 self.LogWarning("Some disks for instance %s on node %s were not"
9319 " imported successfully" % (instance, pnode_name))
9321 # Run rename script on newly imported instance
9322 assert iobj.name == instance
9323 feedback_fn("Running rename script for %s" % instance)
9324 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
9325 self.source_instance_name,
9326 self.op.debug_level)
9327 if result.fail_msg:
9328 self.LogWarning("Failed to run rename script for %s on node"
9329 " %s: %s" % (instance, pnode_name, result.fail_msg))
9331 else:
9332 # also checked in the prereq part
9333 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
9334 % self.op.mode)
9336 if self.op.start:
9337 iobj.admin_up = True
9338 self.cfg.Update(iobj, feedback_fn)
9339 logging.info("Starting instance %s on node %s", instance, pnode_name)
9340 feedback_fn("* starting instance...")
9341 result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
9342 False)
9343 result.Raise("Could not start instance")
9345 return list(iobj.all_nodes)
9348 class LUInstanceConsole(NoHooksLU):
9349 """Connect to an instance's console.
9351 This is somewhat special in that it returns the command line that
9352 you need to run on the master node in order to connect to the
9353 console.
9355 """
9356 REQ_BGL = False
9358 def ExpandNames(self):
9359 self.share_locks = _ShareAll()
9360 self._ExpandAndLockInstance()
9362 def CheckPrereq(self):
9363 """Check prerequisites.
9365 This checks that the instance is in the cluster.
9368 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9369 assert self.instance is not None, \
9370 "Cannot retrieve locked instance %s" % self.op.instance_name
9371 _CheckNodeOnline(self, self.instance.primary_node)
9373 def Exec(self, feedback_fn):
9374 """Connect to the console of an instance
9377 instance = self.instance
9378 node = instance.primary_node
9380 node_insts = self.rpc.call_instance_list([node],
9381 [instance.hypervisor])[node]
9382 node_insts.Raise("Can't get node information from %s" % node)
9384 if instance.name not in node_insts.payload:
9385 if instance.admin_up:
9386 state = constants.INSTST_ERRORDOWN
9387 else:
9388 state = constants.INSTST_ADMINDOWN
9389 raise errors.OpExecError("Instance %s is not running (state %s)" %
9390 (instance.name, state))
9392 logging.debug("Connecting to console of %s on %s", instance.name, node)
9394 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
9397 def _GetInstanceConsole(cluster, instance):
9398 """Returns console information for an instance.
9400 @type cluster: L{objects.Cluster}
9401 @type instance: L{objects.Instance}
9402 @rtype: dict
9404 """
9405 hyper = hypervisor.GetHypervisor(instance.hypervisor)
9406 # beparams and hvparams are passed separately, to avoid editing the
9407 # instance and then saving the defaults in the instance itself.
9408 hvparams = cluster.FillHV(instance)
9409 beparams = cluster.FillBE(instance)
9410 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
9412 assert console.instance == instance.name
9413 assert console.Validate()
9415 return console.ToDict()
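# Illustrative sketch (added, not from the original source): a caller can
# rebuild the console object from the dictionary returned above, assuming
# the usual objects.InstanceConsole serialization helpers:
#
#   data = _GetInstanceConsole(cluster, instance)
#   console = objects.InstanceConsole.FromDict(data)
#   assert console.Validate()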
9418 class LUInstanceReplaceDisks(LogicalUnit):
9419 """Replace the disks of an instance.
9422 HPATH = "mirrors-replace"
9423 HTYPE = constants.HTYPE_INSTANCE
9426 def CheckArguments(self):
9427 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
9430 def ExpandNames(self):
9431 self._ExpandAndLockInstance()
9433 assert locking.LEVEL_NODE not in self.needed_locks
9434 assert locking.LEVEL_NODEGROUP not in self.needed_locks
9436 assert self.op.iallocator is None or self.op.remote_node is None, \
9437 "Conflicting options"
9439 if self.op.remote_node is not None:
9440 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9442 # Warning: do not remove the locking of the new secondary here
9443 # unless DRBD8.AddChildren is changed to work in parallel;
9444 # currently it doesn't since parallel invocations of
9445 # FindUnusedMinor will conflict
9446 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
9447 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
9448 else:
9449 self.needed_locks[locking.LEVEL_NODE] = []
9450 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9452 if self.op.iallocator is not None:
9453 # iallocator will select a new node in the same group
9454 self.needed_locks[locking.LEVEL_NODEGROUP] = []
9456 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
9457 self.op.iallocator, self.op.remote_node,
9458 self.op.disks, False, self.op.early_release)
9460 self.tasklets = [self.replacer]
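# Note (added): with self.tasklets set, execution is delegated to the
# TLReplaceDisks instance above; the tasklet does no locking itself (see its
# docstring below), so all lock bookkeeping stays in this LU.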
9462 def DeclareLocks(self, level):
9463 if level == locking.LEVEL_NODEGROUP:
9464 assert self.op.remote_node is None
9465 assert self.op.iallocator is not None
9466 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
9468 self.share_locks[locking.LEVEL_NODEGROUP] = 1
9469 self.needed_locks[locking.LEVEL_NODEGROUP] = \
9470 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
9472 elif level == locking.LEVEL_NODE:
9473 if self.op.iallocator is not None:
9474 assert self.op.remote_node is None
9475 assert not self.needed_locks[locking.LEVEL_NODE]
9477 # Lock member nodes of all locked groups
9478 self.needed_locks[locking.LEVEL_NODE] = [node_name
9479 for group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
9480 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
9481 else:
9482 self._LockInstancesNodes()
9484 def BuildHooksEnv(self):
9485 """Build hooks env.
9487 This runs on the master, the primary and all the secondaries.
9489 """
9490 instance = self.replacer.instance
9491 env = {
9492 "MODE": self.op.mode,
9493 "NEW_SECONDARY": self.op.remote_node,
9494 "OLD_SECONDARY": instance.secondary_nodes[0],
9495 }
9496 env.update(_BuildInstanceHookEnvByObject(self, instance))
9497 return env
9499 def BuildHooksNodes(self):
9500 """Build hooks nodes.
9502 """
9503 instance = self.replacer.instance
9504 nl = [
9505 self.cfg.GetMasterNode(),
9506 instance.primary_node,
9507 ]
9508 if self.op.remote_node is not None:
9509 nl.append(self.op.remote_node)
9510 return nl, nl
9512 def CheckPrereq(self):
9513 """Check prerequisites.
9516 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
9517 self.op.iallocator is None)
9519 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
9521 _CheckInstanceNodeGroups(self.cfg, self.op.instance_name, owned_groups)
9523 return LogicalUnit.CheckPrereq(self)
9526 class TLReplaceDisks(Tasklet):
9527 """Replaces disks for an instance.
9529 Note: Locking is not within the scope of this class.
9532 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
9533 disks, delay_iallocator, early_release):
9534 """Initializes this class.
9537 Tasklet.__init__(self, lu)
9540 self.instance_name = instance_name
9542 self.iallocator_name = iallocator_name
9543 self.remote_node = remote_node
9545 self.delay_iallocator = delay_iallocator
9546 self.early_release = early_release
9549 self.instance = None
9550 self.new_node = None
9551 self.target_node = None
9552 self.other_node = None
9553 self.remote_node_info = None
9554 self.node_secondary_ip = None
9556 @staticmethod
9557 def CheckArguments(mode, remote_node, iallocator):
9558 """Helper function for users of this class.
9560 """
9561 # check for valid parameter combination
9562 if mode == constants.REPLACE_DISK_CHG:
9563 if remote_node is None and iallocator is None:
9564 raise errors.OpPrereqError("When changing the secondary either an"
9565 " iallocator script must be used or the"
9566 " new node given", errors.ECODE_INVAL)
9568 if remote_node is not None and iallocator is not None:
9569 raise errors.OpPrereqError("Give either the iallocator or the new"
9570 " secondary, not both", errors.ECODE_INVAL)
9572 elif remote_node is not None or iallocator is not None:
9573 # Not replacing the secondary
9574 raise errors.OpPrereqError("The iallocator and new node options can"
9575 " only be used when changing the"
9576 " secondary node", errors.ECODE_INVAL)
9578 @staticmethod
9579 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
9580 """Compute a new secondary node using an IAllocator.
9582 """
9583 ial = IAllocator(lu.cfg, lu.rpc,
9584 mode=constants.IALLOCATOR_MODE_RELOC,
9585 name=instance_name,
9586 relocate_from=list(relocate_from))
9588 ial.Run(iallocator_name)
9590 if not ial.success:
9591 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
9592 " %s" % (iallocator_name, ial.info),
9593 errors.ECODE_NORES)
9595 if len(ial.result) != ial.required_nodes:
9596 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
9597 " of nodes (%s), required %s" %
9598 (iallocator_name,
9599 len(ial.result), ial.required_nodes),
9600 errors.ECODE_FAULT)
9602 remote_node_name = ial.result[0]
9604 lu.LogInfo("Selected new secondary for instance '%s': %s",
9605 instance_name, remote_node_name)
9607 return remote_node_name
9609 def _FindFaultyDisks(self, node_name):
9610 """Wrapper for L{_FindFaultyInstanceDisks}.
9613 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
9616 def _CheckDisksActivated(self, instance):
9617 """Checks if the instance disks are activated.
9619 @param instance: The instance to check disks
9620 @return: True if they are activated, False otherwise
9623 nodes = instance.all_nodes
9625 for idx, dev in enumerate(instance.disks):
9627 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
9628 self.cfg.SetDiskID(dev, node)
9630 result = self.rpc.call_blockdev_find(node, dev)
9634 elif result.fail_msg or not result.payload:
9639 def CheckPrereq(self):
9640 """Check prerequisites.
9642 This checks that the instance is in the cluster.
9644 """
9645 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
9646 assert instance is not None, \
9647 "Cannot retrieve locked instance %s" % self.instance_name
9649 if instance.disk_template != constants.DT_DRBD8:
9650 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
9651 " instances", errors.ECODE_INVAL)
9653 if len(instance.secondary_nodes) != 1:
9654 raise errors.OpPrereqError("The instance has a strange layout,"
9655 " expected one secondary but found %d" %
9656 len(instance.secondary_nodes),
9657 errors.ECODE_FAULT)
9659 if not self.delay_iallocator:
9660 self._CheckPrereq2()
9662 def _CheckPrereq2(self):
9663 """Check prerequisites, second part.
9665 This function should always be part of CheckPrereq. It was separated and is
9666 now called from Exec because during node evacuation iallocator was only
9667 called with an unmodified cluster model, not taking planned changes into
9668 account.
9670 """
9671 instance = self.instance
9672 secondary_node = instance.secondary_nodes[0]
9674 if self.iallocator_name is None:
9675 remote_node = self.remote_node
9676 else:
9677 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
9678 instance.name, instance.secondary_nodes)
9680 if remote_node is None:
9681 self.remote_node_info = None
9682 else:
9683 assert remote_node in self.lu.owned_locks(locking.LEVEL_NODE), \
9684 "Remote node '%s' is not locked" % remote_node
9686 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
9687 assert self.remote_node_info is not None, \
9688 "Cannot retrieve locked node %s" % remote_node
9690 if remote_node == self.instance.primary_node:
9691 raise errors.OpPrereqError("The specified node is the primary node of"
9692 " the instance", errors.ECODE_INVAL)
9694 if remote_node == secondary_node:
9695 raise errors.OpPrereqError("The specified node is already the"
9696 " secondary node of the instance",
9699 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
9700 constants.REPLACE_DISK_CHG):
9701 raise errors.OpPrereqError("Cannot specify disks to be replaced",
9702 errors.ECODE_INVAL)
9704 if self.mode == constants.REPLACE_DISK_AUTO:
9705 if not self._CheckDisksActivated(instance):
9706 raise errors.OpPrereqError("Please run activate-disks on instance %s"
9707 " first" % self.instance_name,
9709 faulty_primary = self._FindFaultyDisks(instance.primary_node)
9710 faulty_secondary = self._FindFaultyDisks(secondary_node)
9712 if faulty_primary and faulty_secondary:
9713 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
9714 " one node and can not be repaired"
9715 " automatically" % self.instance_name,
9719 self.disks = faulty_primary
9720 self.target_node = instance.primary_node
9721 self.other_node = secondary_node
9722 check_nodes = [self.target_node, self.other_node]
9723 elif faulty_secondary:
9724 self.disks = faulty_secondary
9725 self.target_node = secondary_node
9726 self.other_node = instance.primary_node
9727 check_nodes = [self.target_node, self.other_node]
9728 else:
9729 self.disks = []
9730 check_nodes = []
9732 else:
9733 # Non-automatic modes
9734 if self.mode == constants.REPLACE_DISK_PRI:
9735 self.target_node = instance.primary_node
9736 self.other_node = secondary_node
9737 check_nodes = [self.target_node, self.other_node]
9739 elif self.mode == constants.REPLACE_DISK_SEC:
9740 self.target_node = secondary_node
9741 self.other_node = instance.primary_node
9742 check_nodes = [self.target_node, self.other_node]
9744 elif self.mode == constants.REPLACE_DISK_CHG:
9745 self.new_node = remote_node
9746 self.other_node = instance.primary_node
9747 self.target_node = secondary_node
9748 check_nodes = [self.new_node, self.other_node]
9750 _CheckNodeNotDrained(self.lu, remote_node)
9751 _CheckNodeVmCapable(self.lu, remote_node)
9753 old_node_info = self.cfg.GetNodeInfo(secondary_node)
9754 assert old_node_info is not None
9755 if old_node_info.offline and not self.early_release:
9756 # doesn't make sense to delay the release
9757 self.early_release = True
9758 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
9759 " early-release mode", secondary_node)
9762 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
9765 # If not specified all disks should be replaced
9767 self.disks = range(len(self.instance.disks))
9769 for node in check_nodes:
9770 _CheckNodeOnline(self.lu, node)
9772 touched_nodes = frozenset(node_name for node_name in [self.new_node,
9773 self.other_node,
9774 self.target_node]
9775 if node_name is not None)
9777 # Release unneeded node locks
9778 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
9780 # Release any owned node group
9781 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
9782 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
9784 # Check whether disks are valid
9785 for disk_idx in self.disks:
9786 instance.FindDisk(disk_idx)
9788 # Get secondary node IP addresses
9789 self.node_secondary_ip = dict((name, node.secondary_ip) for (name, node)
9790 in self.cfg.GetMultiNodeInfo(touched_nodes))
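# Note (added): node_secondary_ip maps each touched node to its secondary
# IP; the DRBD handlers below hand it to the disconnect/attach RPCs so that
# the replication network is addressed via secondary addresses.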
9792 def Exec(self, feedback_fn):
9793 """Execute disk replacement.
9795 This dispatches the disk replacement to the appropriate handler.
9797 """
9798 if self.delay_iallocator:
9799 self._CheckPrereq2()
9801 if __debug__:
9802 # Verify owned locks before starting operation
9803 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9804 assert set(owned_nodes) == set(self.node_secondary_ip), \
9805 ("Incorrect node locks, owning %s, expected %s" %
9806 (owned_nodes, self.node_secondary_ip.keys()))
9808 owned_instances = self.lu.owned_locks(locking.LEVEL_INSTANCE)
9809 assert list(owned_instances) == [self.instance_name], \
9810 "Instance '%s' not locked" % self.instance_name
9812 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
9813 "Should not own any node group lock at this point"
9815 if not self.disks:
9816 feedback_fn("No disks need replacement")
9817 return
9819 feedback_fn("Replacing disk(s) %s for %s" %
9820 (utils.CommaJoin(self.disks), self.instance.name))
9822 activate_disks = (not self.instance.admin_up)
9824 # Activate the instance disks if we're replacing them on a down instance
9825 if activate_disks:
9826 _StartInstanceDisks(self.lu, self.instance, True)
9828 try:
9829 # Should we replace the secondary node?
9830 if self.new_node is not None:
9831 fn = self._ExecDrbd8Secondary
9832 else:
9833 fn = self._ExecDrbd8DiskOnly
9835 result = fn(feedback_fn)
9836 finally:
9837 # Deactivate the instance disks if we're replacing them on a
9838 # down instance
9839 if activate_disks:
9840 _SafeShutdownInstanceDisks(self.lu, self.instance)
9842 if __debug__:
9843 # Verify owned locks
9844 owned_nodes = self.lu.owned_locks(locking.LEVEL_NODE)
9845 nodes = frozenset(self.node_secondary_ip)
9846 assert ((self.early_release and not owned_nodes) or
9847 (not self.early_release and not (set(owned_nodes) - nodes))), \
9848 ("Not owning the correct locks, early_release=%s, owned=%r,"
9849 " nodes=%r" % (self.early_release, owned_nodes, nodes))
9851 return result
9853 def _CheckVolumeGroup(self, nodes):
9854 self.lu.LogInfo("Checking volume groups")
9856 vgname = self.cfg.GetVGName()
9858 # Make sure volume group exists on all involved nodes
9859 results = self.rpc.call_vg_list(nodes)
9860 if not results:
9861 raise errors.OpExecError("Can't list volume groups on the nodes")
9863 for node in nodes:
9864 res = results[node]
9865 res.Raise("Error checking node %s" % node)
9866 if vgname not in res.payload:
9867 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9868 (vgname, node))
9870 def _CheckDisksExistence(self, nodes):
9871 # Check disk existence
9872 for idx, dev in enumerate(self.instance.disks):
9873 if idx not in self.disks:
9874 continue
9876 for node in nodes:
9877 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9878 self.cfg.SetDiskID(dev, node)
9880 result = self.rpc.call_blockdev_find(node, dev)
9882 msg = result.fail_msg
9883 if msg or not result.payload:
9884 if not msg:
9885 msg = "disk not found"
9886 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9887 (idx, node, msg))
9889 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9890 for idx, dev in enumerate(self.instance.disks):
9891 if idx not in self.disks:
9892 continue
9894 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9895 (idx, node_name))
9897 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9898 ldisk=ldisk):
9899 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9900 " replace disks for instance %s" %
9901 (node_name, self.instance.name))
9903 def _CreateNewStorage(self, node_name):
9904 """Create new storage on the primary or secondary node.
9906 This is only used for same-node replaces, not for changing the
9907 secondary node, hence we don't want to modify the existing disk.
9909 """
9910 iv_names = {}
9912 for idx, dev in enumerate(self.instance.disks):
9913 if idx not in self.disks:
9914 continue
9916 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9918 self.cfg.SetDiskID(dev, node_name)
9920 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9921 names = _GenerateUniqueNames(self.lu, lv_names)
9923 vg_data = dev.children[0].logical_id[0]
9924 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9925 logical_id=(vg_data, names[0]))
9926 vg_meta = dev.children[1].logical_id[0]
9927 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
9928 logical_id=(vg_meta, names[1]))
9930 new_lvs = [lv_data, lv_meta]
9931 old_lvs = [child.Copy() for child in dev.children]
9932 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9934 # we pass force_create=True to force the LVM creation
9935 for new_lv in new_lvs:
9936 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9937 _GetInstanceInfoText(self.instance), False)
9939 return iv_names
9941 def _CheckDevices(self, node_name, iv_names):
9942 for name, (dev, _, _) in iv_names.iteritems():
9943 self.cfg.SetDiskID(dev, node_name)
9945 result = self.rpc.call_blockdev_find(node_name, dev)
9947 msg = result.fail_msg
9948 if msg or not result.payload:
9949 if not msg:
9950 msg = "disk not found"
9951 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9952 (name, msg))
9954 if result.payload.is_degraded:
9955 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9957 def _RemoveOldStorage(self, node_name, iv_names):
9958 for name, (_, old_lvs, _) in iv_names.iteritems():
9959 self.lu.LogInfo("Remove logical volumes for %s" % name)
9961 for lv in old_lvs:
9962 self.cfg.SetDiskID(lv, node_name)
9964 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9965 if msg:
9966 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9967 hint="remove unused LVs manually")
9969 def _ExecDrbd8DiskOnly(self, feedback_fn): # pylint: disable=W0613
9970 """Replace a disk on the primary or secondary for DRBD 8.
9972 The algorithm for replace is quite complicated:
9974 1. for each disk to be replaced:
9976 1. create new LVs on the target node with unique names
9977 1. detach old LVs from the drbd device
9978 1. rename old LVs to name_replaced.<time_t>
9979 1. rename new LVs to old LVs
9980 1. attach the new LVs (with the old names now) to the drbd device
9982 1. wait for sync across all devices
9984 1. for each modified disk:
9986 1. remove old LVs (which have the name name_replaces.<time_t>)
9988 Failures are not very well handled.
9990 """
9991 steps_total = 6
9993 # Step: check device activation
9994 self.lu.LogStep(1, steps_total, "Check device existence")
9995 self._CheckDisksExistence([self.other_node, self.target_node])
9996 self._CheckVolumeGroup([self.target_node, self.other_node])
9998 # Step: check other node consistency
9999 self.lu.LogStep(2, steps_total, "Check peer consistency")
10000 self._CheckDisksConsistency(self.other_node,
10001 self.other_node == self.instance.primary_node,
10002 False)
10004 # Step: create new storage
10005 self.lu.LogStep(3, steps_total, "Allocate new storage")
10006 iv_names = self._CreateNewStorage(self.target_node)
10008 # Step: for each lv, detach+rename*2+attach
10009 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10010 for dev, old_lvs, new_lvs in iv_names.itervalues():
10011 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
10013 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
10015 result.Raise("Can't detach drbd from local storage on node"
10016 " %s for device %s" % (self.target_node, dev.iv_name))
10018 #cfg.Update(instance)
10020 # ok, we created the new LVs, so now we know we have the needed
10021 # storage; as such, we proceed on the target node to rename
10022 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
10023 # using the assumption that logical_id == physical_id (which in
10024 # turn is the unique_id on that node)
10026 # FIXME(iustin): use a better name for the replaced LVs
10027 temp_suffix = int(time.time())
10028 ren_fn = lambda d, suff: (d.physical_id[0],
10029 d.physical_id[1] + "_replaced-%s" % suff)
10031 # Build the rename list based on what LVs exist on the node
10032 rename_old_to_new = []
10033 for to_ren in old_lvs:
10034 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
10035 if not result.fail_msg and result.payload:
10036 # device exists
10037 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
10039 self.lu.LogInfo("Renaming the old LVs on the target node")
10040 result = self.rpc.call_blockdev_rename(self.target_node,
10041 rename_old_to_new)
10042 result.Raise("Can't rename old LVs on node %s" % self.target_node)
10044 # Now we rename the new LVs to the old LVs
10045 self.lu.LogInfo("Renaming the new LVs on the target node")
10046 rename_new_to_old = [(new, old.physical_id)
10047 for old, new in zip(old_lvs, new_lvs)]
10048 result = self.rpc.call_blockdev_rename(self.target_node,
10049 rename_new_to_old)
10050 result.Raise("Can't rename new LVs on node %s" % self.target_node)
10052 # Intermediate steps of in memory modifications
10053 for old, new in zip(old_lvs, new_lvs):
10054 new.logical_id = old.logical_id
10055 self.cfg.SetDiskID(new, self.target_node)
10057 # We need to modify old_lvs so that removal later removes the
10058 # right LVs, not the newly added ones; note that old_lvs is a
10059 # copy here
10060 for disk in old_lvs:
10061 disk.logical_id = ren_fn(disk, temp_suffix)
10062 self.cfg.SetDiskID(disk, self.target_node)
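# Net effect of the rename dance above (summary, added): the new LVs now
# carry the original names and get attached below, while the old_lvs
# entries point at the "_replaced-<time_t>" names, so the later removal
# step deletes exactly the detached volumes.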
10064 # Now that the new lvs have the old name, we can add them to the device
10065 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
10066 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
10067 new_lvs)
10068 msg = result.fail_msg
10069 if msg:
10070 for new_lv in new_lvs:
10071 msg2 = self.rpc.call_blockdev_remove(self.target_node,
10072 new_lv).fail_msg
10073 if msg2:
10074 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
10075 hint=("cleanup manually the unused logical"
10076 " volumes"))
10077 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
10079 cstep = 5
10080 if self.early_release:
10081 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10082 cstep += 1
10083 self._RemoveOldStorage(self.target_node, iv_names)
10084 # WARNING: we release both node locks here, do not do other RPCs
10085 # than WaitForSync to the primary node
10086 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10087 names=[self.target_node, self.other_node])
10089 # Wait for sync
10090 # This can fail as the old devices are degraded and _WaitForSync
10091 # does a combined result over all disks, so we don't check its return value
10092 self.lu.LogStep(cstep, steps_total, "Sync devices")
10093 cstep += 1
10094 _WaitForSync(self.lu, self.instance)
10096 # Check all devices manually
10097 self._CheckDevices(self.instance.primary_node, iv_names)
10099 # Step: remove old storage
10100 if not self.early_release:
10101 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10102 cstep += 1
10103 self._RemoveOldStorage(self.target_node, iv_names)
10105 def _ExecDrbd8Secondary(self, feedback_fn):
10106 """Replace the secondary node for DRBD 8.
10108 The algorithm for replace is quite complicated:
10109 - for all disks of the instance:
10110 - create new LVs on the new node with same names
10111 - shutdown the drbd device on the old secondary
10112 - disconnect the drbd network on the primary
10113 - create the drbd device on the new secondary
10114 - network attach the drbd on the primary, using an artifice:
10115 the drbd code for Attach() will connect to the network if it
10116 finds a device which is connected to the good local disks but
10117 not network enabled
10118 - wait for sync across all devices
10119 - remove all disks from the old secondary
10121 Failures are not very well handled.
10123 """
10124 steps_total = 6
10126 pnode = self.instance.primary_node
10128 # Step: check device activation
10129 self.lu.LogStep(1, steps_total, "Check device existence")
10130 self._CheckDisksExistence([self.instance.primary_node])
10131 self._CheckVolumeGroup([self.instance.primary_node])
10133 # Step: check other node consistency
10134 self.lu.LogStep(2, steps_total, "Check peer consistency")
10135 self._CheckDisksConsistency(self.instance.primary_node, True, True)
10137 # Step: create new storage
10138 self.lu.LogStep(3, steps_total, "Allocate new storage")
10139 for idx, dev in enumerate(self.instance.disks):
10140 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
10141 (self.new_node, idx))
10142 # we pass force_create=True to force LVM creation
10143 for new_lv in dev.children:
10144 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
10145 _GetInstanceInfoText(self.instance), False)
10147 # Step 4: drbd minors and drbd setups changes
10148 # after this, we must manually remove the drbd minors on both the
10149 # error and the success paths
10150 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
10151 minors = self.cfg.AllocateDRBDMinor([self.new_node
10152 for dev in self.instance.disks],
10153 self.instance.name)
10154 logging.debug("Allocated minors %r", minors)
10156 iv_names = {}
10157 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
10158 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
10159 (self.new_node, idx))
10160 # create new devices on new_node; note that we create two IDs:
10161 # one without port, so the drbd will be activated without
10162 # networking information on the new node at this stage, and one
10163 # with network, for the latter activation in step 4
10164 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
10165 if self.instance.primary_node == o_node1:
10166 p_minor = o_minor1
10167 else:
10168 assert self.instance.primary_node == o_node2, "Three-node instance?"
10169 p_minor = o_minor2
10171 new_alone_id = (self.instance.primary_node, self.new_node, None,
10172 p_minor, new_minor, o_secret)
10173 new_net_id = (self.instance.primary_node, self.new_node, o_port,
10174 p_minor, new_minor, o_secret)
10176 iv_names[idx] = (dev, dev.children, new_net_id)
10177 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
10178 new_net_id)
10179 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
10180 logical_id=new_alone_id,
10181 children=dev.children,
10182 size=dev.size)
10183 try:
10184 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
10185 _GetInstanceInfoText(self.instance), False)
10186 except errors.GenericError:
10187 self.cfg.ReleaseDRBDMinors(self.instance.name)
10188 raise
10190 # We have new devices, shutdown the drbd on the old secondary
10191 for idx, dev in enumerate(self.instance.disks):
10192 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
10193 self.cfg.SetDiskID(dev, self.target_node)
10194 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
10195 if msg:
10196 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
10197 " node: %s" % (idx, msg),
10198 hint=("Please cleanup this device manually as"
10199 " soon as possible"))
10201 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
10202 result = self.rpc.call_drbd_disconnect_net([pnode], self.node_secondary_ip,
10203 self.instance.disks)[pnode]
10205 msg = result.fail_msg
10206 if msg:
10207 # detaches didn't succeed (unlikely)
10208 self.cfg.ReleaseDRBDMinors(self.instance.name)
10209 raise errors.OpExecError("Can't detach the disks from the network on"
10210 " old node: %s" % (msg,))
10212 # if we managed to detach at least one, we update all the disks of
10213 # the instance to point to the new secondary
10214 self.lu.LogInfo("Updating instance configuration")
10215 for dev, _, new_logical_id in iv_names.itervalues():
10216 dev.logical_id = new_logical_id
10217 self.cfg.SetDiskID(dev, self.instance.primary_node)
10219 self.cfg.Update(self.instance, feedback_fn)
10221 # and now perform the drbd attach
10222 self.lu.LogInfo("Attaching primary drbds to new secondary"
10223 " (standalone => connected)")
10224 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
10225 self.new_node],
10226 self.node_secondary_ip,
10227 self.instance.disks,
10228 self.instance.name,
10229 False)
10230 for to_node, to_result in result.items():
10231 msg = to_result.fail_msg
10232 if msg:
10233 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
10234 to_node, msg,
10235 hint=("please do a gnt-instance info to see the"
10236 " status of disks"))
10237 cstep = 5
10238 if self.early_release:
10239 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10240 cstep += 1
10241 self._RemoveOldStorage(self.target_node, iv_names)
10242 # WARNING: we release all node locks here, do not do other RPCs
10243 # than WaitForSync to the primary node
10244 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
10245 names=[self.instance.primary_node,
10246 self.target_node,
10247 self.new_node])
10249 # Wait for sync
10250 # This can fail as the old devices are degraded and _WaitForSync
10251 # does a combined result over all disks, so we don't check its return value
10252 self.lu.LogStep(cstep, steps_total, "Sync devices")
10253 cstep += 1
10254 _WaitForSync(self.lu, self.instance)
10256 # Check all devices manually
10257 self._CheckDevices(self.instance.primary_node, iv_names)
10259 # Step: remove old storage
10260 if not self.early_release:
10261 self.lu.LogStep(cstep, steps_total, "Removing old storage")
10262 self._RemoveOldStorage(self.target_node, iv_names)
10265 class LURepairNodeStorage(NoHooksLU):
10266 """Repairs the volume group on a node.
10271 def CheckArguments(self):
10272 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10274 storage_type = self.op.storage_type
10276 if (constants.SO_FIX_CONSISTENCY not in
10277 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
10278 raise errors.OpPrereqError("Storage units of type '%s' can not be"
10279 " repaired" % storage_type,
10280 errors.ECODE_INVAL)
10282 def ExpandNames(self):
10283 self.needed_locks = {
10284 locking.LEVEL_NODE: [self.op.node_name],
10285 }
10287 def _CheckFaultyDisks(self, instance, node_name):
10288 """Ensure faulty disks abort the opcode or at least warn."""
10289 try:
10290 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
10291 node_name, True):
10292 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
10293 " node '%s'" % (instance.name, node_name),
10294 errors.ECODE_STATE)
10295 except errors.OpPrereqError, err:
10296 if self.op.ignore_consistency:
10297 self.proc.LogWarning(str(err.args[0]))
10298 else:
10299 raise
10301 def CheckPrereq(self):
10302 """Check prerequisites.
10304 """
10305 # Check whether any instance on this node has faulty disks
10306 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
10307 if not inst.admin_up:
10308 continue
10309 check_nodes = set(inst.all_nodes)
10310 check_nodes.discard(self.op.node_name)
10311 for inst_node_name in check_nodes:
10312 self._CheckFaultyDisks(inst, inst_node_name)
10314 def Exec(self, feedback_fn):
10315 feedback_fn("Repairing storage unit '%s' on %s ..." %
10316 (self.op.name, self.op.node_name))
10318 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
10319 result = self.rpc.call_storage_execute(self.op.node_name,
10320 self.op.storage_type, st_args,
10321 self.op.name,
10322 constants.SO_FIX_CONSISTENCY)
10323 result.Raise("Failed to repair storage unit '%s' on %s" %
10324 (self.op.name, self.op.node_name))
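# Illustrative invocation (added; assumes the standard CLI wiring for this
# LU): gnt-node repair-storage node1.example.com lvm-vg xenvg
# repairs consistency of the "xenvg" volume group on the given node.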
10327 class LUNodeEvacuate(NoHooksLU):
10328 """Evacuates instances off a list of nodes.
10333 def CheckArguments(self):
10334 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
10336 def ExpandNames(self):
10337 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
10339 if self.op.remote_node is not None:
10340 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10341 assert self.op.remote_node
10343 if self.op.remote_node == self.op.node_name:
10344 raise errors.OpPrereqError("Can not use evacuated node as a new"
10345 " secondary node", errors.ECODE_INVAL)
10347 if self.op.mode != constants.IALLOCATOR_NEVAC_SEC:
10348 raise errors.OpPrereqError("Without the use of an iallocator only"
10349 " secondary instances can be evacuated",
10350 errors.ECODE_INVAL)
10352 # Declare locks
10353 self.share_locks = _ShareAll()
10354 self.needed_locks = {
10355 locking.LEVEL_INSTANCE: [],
10356 locking.LEVEL_NODEGROUP: [],
10357 locking.LEVEL_NODE: [],
10358 }
10360 if self.op.remote_node is None:
10361 # Iallocator will choose any node(s) in the same group
10362 group_nodes = self.cfg.GetNodeGroupMembersByNodes([self.op.node_name])
10363 else:
10364 group_nodes = frozenset([self.op.remote_node])
10366 # Determine nodes to be locked
10367 self.lock_nodes = set([self.op.node_name]) | group_nodes
10369 def _DetermineInstances(self):
10370 """Builds list of instances to operate on.
10373 assert self.op.mode in constants.IALLOCATOR_NEVAC_MODES
10375 if self.op.mode == constants.IALLOCATOR_NEVAC_PRI:
10376 # Primary instances only
10377 inst_fn = _GetNodePrimaryInstances
10378 assert self.op.remote_node is None, \
10379 "Evacuating primary instances requires iallocator"
10380 elif self.op.mode == constants.IALLOCATOR_NEVAC_SEC:
10381 # Secondary instances only
10382 inst_fn = _GetNodeSecondaryInstances
10383 else:
10384 # All instances
10385 assert self.op.mode == constants.IALLOCATOR_NEVAC_ALL
10386 inst_fn = _GetNodeInstances
10388 return inst_fn(self.cfg, self.op.node_name)
10390 def DeclareLocks(self, level):
10391 if level == locking.LEVEL_INSTANCE:
10392 # Lock instances optimistically, needs verification once node and group
10393 # locks have been acquired
10394 self.needed_locks[locking.LEVEL_INSTANCE] = \
10395 set(i.name for i in self._DetermineInstances())
10397 elif level == locking.LEVEL_NODEGROUP:
10398 # Lock node groups optimistically, needs verification once nodes have
10399 # been acquired
10400 self.needed_locks[locking.LEVEL_NODEGROUP] = \
10401 self.cfg.GetNodeGroupsFromNodes(self.lock_nodes)
10403 elif level == locking.LEVEL_NODE:
10404 self.needed_locks[locking.LEVEL_NODE] = self.lock_nodes
10406 def CheckPrereq(self):
10407 # Verify locks
10408 owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
10409 owned_nodes = self.owned_locks(locking.LEVEL_NODE)
10410 owned_groups = self.owned_locks(locking.LEVEL_NODEGROUP)
10412 assert owned_nodes == self.lock_nodes
10414 wanted_groups = self.cfg.GetNodeGroupsFromNodes(owned_nodes)
10415 if owned_groups != wanted_groups:
10416 raise errors.OpExecError("Node groups changed since locks were acquired,"
10417 " current groups are '%s', used to be '%s'" %
10418 (utils.CommaJoin(wanted_groups),
10419 utils.CommaJoin(owned_groups)))
10421 # Determine affected instances
10422 self.instances = self._DetermineInstances()
10423 self.instance_names = [i.name for i in self.instances]
10425 if set(self.instance_names) != owned_instances:
10426 raise errors.OpExecError("Instances on node '%s' changed since locks"
10427 " were acquired, current instances are '%s',"
10428 " used to be '%s'" %
10429 (self.op.node_name,
10430 utils.CommaJoin(self.instance_names),
10431 utils.CommaJoin(owned_instances)))
10433 if self.instance_names:
10434 self.LogInfo("Evacuating instances from node '%s': %s",
10435 self.op.node_name,
10436 utils.CommaJoin(utils.NiceSort(self.instance_names)))
10437 else:
10438 self.LogInfo("No instances to evacuate from node '%s'",
10439 self.op.node_name)
10441 if self.op.remote_node is not None:
10442 for i in self.instances:
10443 if i.primary_node == self.op.remote_node:
10444 raise errors.OpPrereqError("Node %s is the primary node of"
10445 " instance %s, cannot use it as"
10447 (self.op.remote_node, i.name),
10448 errors.ECODE_INVAL)
10450 def Exec(self, feedback_fn):
10451 assert (self.op.iallocator is not None) ^ (self.op.remote_node is not None)
10453 if not self.instance_names:
10454 # No instances to evacuate
10455 jobs = []
10457 elif self.op.iallocator is not None:
10458 # TODO: Implement relocation to other group
10459 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_NODE_EVAC,
10460 evac_mode=self.op.mode,
10461 instances=list(self.instance_names))
10463 ial.Run(self.op.iallocator)
10465 if not ial.success:
10466 raise errors.OpPrereqError("Can't compute node evacuation using"
10467 " iallocator '%s': %s" %
10468 (self.op.iallocator, ial.info),
10469 errors.ECODE_NORES)
10471 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, True)
10473 elif self.op.remote_node is not None:
10474 assert self.op.mode == constants.IALLOCATOR_NEVAC_SEC
10475 jobs = [
10476 [opcodes.OpInstanceReplaceDisks(instance_name=instance_name,
10477 remote_node=self.op.remote_node,
10478 disks=[],
10479 mode=constants.REPLACE_DISK_CHG,
10480 early_release=self.op.early_release)]
10481 for instance_name in self.instance_names
10482 ]
10484 else:
10485 raise errors.ProgrammerError("No iallocator or remote node")
10487 return ResultWithJobs(jobs)
10490 def _SetOpEarlyRelease(early_release, op):
10491 """Sets C{early_release} flag on opcodes if available.
10495 op.early_release = early_release
10496 except AttributeError:
10497 assert not isinstance(op, opcodes.OpInstanceReplaceDisks)
10499 return op
10502 def _NodeEvacDest(use_nodes, group, nodes):
10503 """Returns group or nodes depending on caller's choice.
10507 return utils.CommaJoin(nodes)
10512 def _LoadNodeEvacResult(lu, alloc_result, early_release, use_nodes):
10513 """Unpacks the result of change-group and node-evacuate iallocator requests.
10515 Iallocator modes L{constants.IALLOCATOR_MODE_NODE_EVAC} and
10516 L{constants.IALLOCATOR_MODE_CHG_GROUP}.
10518 @type lu: L{LogicalUnit}
10519 @param lu: Logical unit instance
10520 @type alloc_result: tuple/list
10521 @param alloc_result: Result from iallocator
10522 @type early_release: bool
10523 @param early_release: Whether to release locks early if possible
10524 @type use_nodes: bool
10525 @param use_nodes: Whether to display node names instead of groups
10527 """
10528 (moved, failed, jobs) = alloc_result
10530 if failed:
10531 lu.LogWarning("Unable to evacuate instances %s",
10532 utils.CommaJoin("%s (%s)" % (name, reason)
10533 for (name, reason) in failed))
10535 if moved:
10536 lu.LogInfo("Instances to be moved: %s",
10537 utils.CommaJoin("%s (to %s)" %
10538 (name, _NodeEvacDest(use_nodes, group, nodes))
10539 for (name, group, nodes) in moved))
10541 return [map(compat.partial(_SetOpEarlyRelease, early_release),
10542 map(opcodes.OpCode.LoadOpCode, ops))
10543 for ops in jobs]
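# Shape of alloc_result as unpacked above (derived from the code, added):
#   moved:  [(instance_name, target_group, [node_name, ...]), ...]
#   failed: [(instance_name, reason), ...]
#   jobs:   [[op1, op2, ...], ...]  -- serialized opcodes, one list per job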
10546 class LUInstanceGrowDisk(LogicalUnit):
10547 """Grow a disk of an instance.
10550 HPATH = "disk-grow"
10551 HTYPE = constants.HTYPE_INSTANCE
10554 def ExpandNames(self):
10555 self._ExpandAndLockInstance()
10556 self.needed_locks[locking.LEVEL_NODE] = []
10557 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10559 def DeclareLocks(self, level):
10560 if level == locking.LEVEL_NODE:
10561 self._LockInstancesNodes()
10563 def BuildHooksEnv(self):
10564 """Build hooks env.
10566 This runs on the master, the primary and all the secondaries.
10570 "DISK": self.op.disk,
10571 "AMOUNT": self.op.amount,
10573 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10576 def BuildHooksNodes(self):
10577 """Build hooks nodes.
10580 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10583 def CheckPrereq(self):
10584 """Check prerequisites.
10586 This checks that the instance is in the cluster.
10589 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10590 assert instance is not None, \
10591 "Cannot retrieve locked instance %s" % self.op.instance_name
10592 nodenames = list(instance.all_nodes)
10593 for node in nodenames:
10594 _CheckNodeOnline(self, node)
10596 self.instance = instance
10598 if instance.disk_template not in constants.DTS_GROWABLE:
10599 raise errors.OpPrereqError("Instance's disk layout does not support"
10600 " growing", errors.ECODE_INVAL)
10602 self.disk = instance.FindDisk(self.op.disk)
10604 if instance.disk_template not in (constants.DT_FILE,
10605 constants.DT_SHARED_FILE):
10606 # TODO: check the free disk space for file, when that feature will be
10607 # supported
10608 _CheckNodesFreeDiskPerVG(self, nodenames,
10609 self.disk.ComputeGrowth(self.op.amount))
10611 def Exec(self, feedback_fn):
10612 """Execute disk grow.
10615 instance = self.instance
10618 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
10620 raise errors.OpExecError("Cannot activate block device to grow")
10622 # First run all grow ops in dry-run mode
10623 for node in instance.all_nodes:
10624 self.cfg.SetDiskID(disk, node)
10625 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
10626 result.Raise("Grow request failed to node %s" % node)
10628 # We know that (as far as we can test) operations across different
10629 # nodes will succeed, time to run it for real
10630 for node in instance.all_nodes:
10631 self.cfg.SetDiskID(disk, node)
10632 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
10633 result.Raise("Grow request failed to node %s" % node)
10635 # TODO: Rewrite code to work properly
10636 # DRBD goes into sync mode for a short amount of time after executing the
10637 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
10638 # calling "resize" in sync mode fails. Sleeping for a short amount of
10639 # time is a work-around.
10640 time.sleep(5)
10642 disk.RecordGrow(self.op.amount)
10643 self.cfg.Update(instance, feedback_fn)
10644 if self.op.wait_for_sync:
10645 disk_abort = not _WaitForSync(self, instance, disks=[disk])
10646 if disk_abort:
10647 self.proc.LogWarning("Disk sync-ing has not returned a good"
10648 " status; please check the instance")
10649 if not instance.admin_up:
10650 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
10651 elif not instance.admin_up:
10652 self.proc.LogWarning("Not shutting down the disk even if the instance is"
10653 " not supposed to be running because no wait for"
10654 " sync mode was requested")
10657 class LUInstanceQueryData(NoHooksLU):
10658 """Query runtime instance data.
10663 def ExpandNames(self):
10664 self.needed_locks = {}
10666 # Use locking if requested or when non-static information is wanted
10667 if not (self.op.static or self.op.use_locking):
10668 self.LogWarning("Non-static data requested, locks need to be acquired")
10669 self.op.use_locking = True
10671 if self.op.instances or not self.op.use_locking:
10672 # Expand instance names right here
10673 self.wanted_names = _GetWantedInstances(self, self.op.instances)
10674 else:
10675 # Will use acquired locks
10676 self.wanted_names = None
10678 if self.op.use_locking:
10679 self.share_locks = _ShareAll()
10681 if self.wanted_names is None:
10682 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
10683 else:
10684 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
10686 self.needed_locks[locking.LEVEL_NODE] = []
10687 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10689 def DeclareLocks(self, level):
10690 if self.op.use_locking and level == locking.LEVEL_NODE:
10691 self._LockInstancesNodes()
10693 def CheckPrereq(self):
10694 """Check prerequisites.
10696 This only checks the optional instance list against the existing names.
10698 """
10699 if self.wanted_names is None:
10700 assert self.op.use_locking, "Locking was not used"
10701 self.wanted_names = self.owned_locks(locking.LEVEL_INSTANCE)
10703 self.wanted_instances = \
10704 map(compat.snd, self.cfg.GetMultiInstanceInfo(self.wanted_names))
10706 def _ComputeBlockdevStatus(self, node, instance_name, dev):
10707 """Returns the status of a block device
10710 if self.op.static or not node:
10713 self.cfg.SetDiskID(dev, node)
10715 result = self.rpc.call_blockdev_find(node, dev)
10719 result.Raise("Can't compute disk status for %s" % instance_name)
10721 status = result.payload
10725 return (status.dev_path, status.major, status.minor,
10726 status.sync_percent, status.estimated_time,
10727 status.is_degraded, status.ldisk_status)
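# Note (added): the 7-tuple returned above is (dev_path, major, minor,
# sync_percent, estimated_time, is_degraded, ldisk_status); None is
# returned instead for static queries and for offline nodes.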
10729 def _ComputeDiskStatus(self, instance, snode, dev):
10730 """Compute block device status.
10733 if dev.dev_type in constants.LDS_DRBD:
10734 # we change the snode then (otherwise we use the one passed in)
10735 if dev.logical_id[0] == instance.primary_node:
10736 snode = dev.logical_id[1]
10738 snode = dev.logical_id[0]
10740 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
10741 instance.name, dev)
10742 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
10745 dev_children = map(compat.partial(self._ComputeDiskStatus,
10752 "iv_name": dev.iv_name,
10753 "dev_type": dev.dev_type,
10754 "logical_id": dev.logical_id,
10755 "physical_id": dev.physical_id,
10756 "pstatus": dev_pstatus,
10757 "sstatus": dev_sstatus,
10758 "children": dev_children,
10763 def Exec(self, feedback_fn):
10764 """Gather and return data"""
10767 cluster = self.cfg.GetClusterInfo()
10769 pri_nodes = self.cfg.GetMultiNodeInfo(i.primary_node
10770 for i in self.wanted_instances)
10771 for instance, (_, pnode) in zip(self.wanted_instances, pri_nodes):
10772 if self.op.static or pnode.offline:
10773 remote_state = None
10774 if pnode.offline:
10775 self.LogWarning("Primary node %s is marked offline, returning static"
10776 " information only for instance %s" %
10777 (pnode.name, instance.name))
10778 else:
10779 remote_info = self.rpc.call_instance_info(instance.primary_node,
10780 instance.name,
10781 instance.hypervisor)
10782 remote_info.Raise("Error checking node %s" % instance.primary_node)
10783 remote_info = remote_info.payload
10784 if remote_info and "state" in remote_info:
10785 remote_state = "up"
10786 else:
10787 remote_state = "down"
10789 if instance.admin_up:
10790 config_state = "up"
10791 else:
10792 config_state = "down"
10794 disks = map(compat.partial(self._ComputeDiskStatus, instance, None),
10795 instance.disks)
10797 result[instance.name] = {
10798 "name": instance.name,
10799 "config_state": config_state,
10800 "run_state": remote_state,
10801 "pnode": instance.primary_node,
10802 "snodes": instance.secondary_nodes,
10804 # this happens to be the same format used for hooks
10805 "nics": _NICListToTuple(self, instance.nics),
10806 "disk_template": instance.disk_template,
10808 "hypervisor": instance.hypervisor,
10809 "network_port": instance.network_port,
10810 "hv_instance": instance.hvparams,
10811 "hv_actual": cluster.FillHV(instance, skip_globals=True),
10812 "be_instance": instance.beparams,
10813 "be_actual": cluster.FillBE(instance),
10814 "os_instance": instance.osparams,
10815 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
10816 "serial_no": instance.serial_no,
10817 "mtime": instance.mtime,
10818 "ctime": instance.ctime,
10819 "uuid": instance.uuid,
10825 class LUInstanceSetParams(LogicalUnit):
10826 """Modifies an instances's parameters.
10829 HPATH = "instance-modify"
10830 HTYPE = constants.HTYPE_INSTANCE
10833 def CheckArguments(self):
10834 if not (self.op.nics or self.op.disks or self.op.disk_template or
10835 self.op.hvparams or self.op.beparams or self.op.os_name):
10836 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
10838 if self.op.hvparams:
10839 _CheckGlobalHvParams(self.op.hvparams)
10841 # Disk validation
10842 disk_addremove = 0
10843 for disk_op, disk_dict in self.op.disks:
10844 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
10845 if disk_op == constants.DDM_REMOVE:
10846 disk_addremove += 1
10847 continue
10848 elif disk_op == constants.DDM_ADD:
10849 disk_addremove += 1
10850 else:
10851 if not isinstance(disk_op, int):
10852 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
10853 if not isinstance(disk_dict, dict):
10854 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
10855 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10857 if disk_op == constants.DDM_ADD:
10858 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
10859 if mode not in constants.DISK_ACCESS_SET:
10860 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
10861 errors.ECODE_INVAL)
10862 size = disk_dict.get(constants.IDISK_SIZE, None)
10863 if size is None:
10864 raise errors.OpPrereqError("Required disk parameter size missing",
10865 errors.ECODE_INVAL)
10866 try:
10867 size = int(size)
10868 except (TypeError, ValueError), err:
10869 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
10870 str(err), errors.ECODE_INVAL)
10871 disk_dict[constants.IDISK_SIZE] = size
10872 else:
10873 # modification of disk
10874 if constants.IDISK_SIZE in disk_dict:
10875 raise errors.OpPrereqError("Disk size change not possible, use"
10876 " grow-disk", errors.ECODE_INVAL)
10878 if disk_addremove > 1:
10879 raise errors.OpPrereqError("Only one disk add or remove operation"
10880 " supported at a time", errors.ECODE_INVAL)
10882 if self.op.disks and self.op.disk_template is not None:
10883 raise errors.OpPrereqError("Disk template conversion and other disk"
10884 " changes not supported at the same time",
10885 errors.ECODE_INVAL)
10887 if (self.op.disk_template and
10888 self.op.disk_template in constants.DTS_INT_MIRROR and
10889 self.op.remote_node is None):
10890 raise errors.OpPrereqError("Changing the disk template to a mirrored"
10891 " one requires specifying a secondary node",
10892 errors.ECODE_INVAL)
10894 # NIC validation
10895 nic_addremove = 0
10896 for nic_op, nic_dict in self.op.nics:
10897 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
10898 if nic_op == constants.DDM_REMOVE:
10899 nic_addremove += 1
10900 continue
10901 elif nic_op == constants.DDM_ADD:
10902 nic_addremove += 1
10903 else:
10904 if not isinstance(nic_op, int):
10905 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
10906 if not isinstance(nic_dict, dict):
10907 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
10908 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
10910 # nic_dict should be a dict
10911 nic_ip = nic_dict.get(constants.INIC_IP, None)
10912 if nic_ip is not None:
10913 if nic_ip.lower() == constants.VALUE_NONE:
10914 nic_dict[constants.INIC_IP] = None
10915 else:
10916 if not netutils.IPAddress.IsValid(nic_ip):
10917 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
10918 errors.ECODE_INVAL)
10920 nic_bridge = nic_dict.get("bridge", None)
10921 nic_link = nic_dict.get(constants.INIC_LINK, None)
10922 if nic_bridge and nic_link:
10923 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
10924 " at the same time", errors.ECODE_INVAL)
10925 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
10926 nic_dict["bridge"] = None
10927 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
10928 nic_dict[constants.INIC_LINK] = None
10930 if nic_op == constants.DDM_ADD:
10931 nic_mac = nic_dict.get(constants.INIC_MAC, None)
10932 if nic_mac is None:
10933 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
10934 else:
10935 if constants.INIC_MAC in nic_dict:
10936 nic_mac = nic_dict[constants.INIC_MAC]
10937 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10938 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
10940 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
10941 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
10942 " modifying an existing nic",
10943 errors.ECODE_INVAL)
10945 if nic_addremove > 1:
10946 raise errors.OpPrereqError("Only one NIC add or remove operation"
10947 " supported at a time", errors.ECODE_INVAL)
10949 def ExpandNames(self):
10950 self._ExpandAndLockInstance()
10951 self.needed_locks[locking.LEVEL_NODE] = []
10952 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
10954 def DeclareLocks(self, level):
10955 if level == locking.LEVEL_NODE:
10956 self._LockInstancesNodes()
10957 if self.op.disk_template and self.op.remote_node:
10958 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
10959 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
10961 def BuildHooksEnv(self):
10962 """Build hooks env.
10964 This runs on the master, primary and secondaries.
10968 if constants.BE_MEMORY in self.be_new:
10969 args["memory"] = self.be_new[constants.BE_MEMORY]
10970 if constants.BE_VCPUS in self.be_new:
10971 args["vcpus"] = self.be_new[constants.BE_VCPUS]
10972 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
10973 # information at all.
10974 if self.op.nics:
10975 args["nics"] = []
10976 nic_override = dict(self.op.nics)
10977 for idx, nic in enumerate(self.instance.nics):
10978 if idx in nic_override:
10979 this_nic_override = nic_override[idx]
10980 else:
10981 this_nic_override = {}
10982 if constants.INIC_IP in this_nic_override:
10983 ip = this_nic_override[constants.INIC_IP]
10984 else:
10985 ip = nic.ip
10986 if constants.INIC_MAC in this_nic_override:
10987 mac = this_nic_override[constants.INIC_MAC]
10988 else:
10989 mac = nic.mac
10990 if idx in self.nic_pnew:
10991 nicparams = self.nic_pnew[idx]
10992 else:
10993 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
10994 mode = nicparams[constants.NIC_MODE]
10995 link = nicparams[constants.NIC_LINK]
10996 args["nics"].append((ip, mac, mode, link))
10997 if constants.DDM_ADD in nic_override:
10998 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
10999 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
11000 nicparams = self.nic_pnew[constants.DDM_ADD]
11001 mode = nicparams[constants.NIC_MODE]
11002 link = nicparams[constants.NIC_LINK]
11003 args["nics"].append((ip, mac, mode, link))
11004 elif constants.DDM_REMOVE in nic_override:
11005 del args["nics"][-1]
11007 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
11008 if self.op.disk_template:
11009 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
11013 def BuildHooksNodes(self):
11014 """Build hooks nodes.
11017 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
11020 def CheckPrereq(self):
11021 """Check prerequisites.
11023 This only checks the instance list against the existing names.
11025 """
11026 # checking the new params on the primary/secondary nodes
11028 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11029 cluster = self.cluster = self.cfg.GetClusterInfo()
11030 assert self.instance is not None, \
11031 "Cannot retrieve locked instance %s" % self.op.instance_name
11032 pnode = instance.primary_node
11033 nodelist = list(instance.all_nodes)
11035 # OS change
11036 if self.op.os_name and not self.op.force:
11037 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
11038 self.op.force_variant)
11039 instance_os = self.op.os_name
11040 else:
11041 instance_os = instance.os
11043 if self.op.disk_template:
11044 if instance.disk_template == self.op.disk_template:
11045 raise errors.OpPrereqError("Instance already has disk template %s" %
11046 instance.disk_template, errors.ECODE_INVAL)
11048 if (instance.disk_template,
11049 self.op.disk_template) not in self._DISK_CONVERSIONS:
11050 raise errors.OpPrereqError("Unsupported disk template conversion from"
11051 " %s to %s" % (instance.disk_template,
11052 self.op.disk_template),
11053 errors.ECODE_INVAL)
11054 _CheckInstanceDown(self, instance, "cannot change disk template")
11055 if self.op.disk_template in constants.DTS_INT_MIRROR:
11056 if self.op.remote_node == pnode:
11057 raise errors.OpPrereqError("Given new secondary node %s is the same"
11058 " as the primary node of the instance" %
11059 self.op.remote_node, errors.ECODE_STATE)
11060 _CheckNodeOnline(self, self.op.remote_node)
11061 _CheckNodeNotDrained(self, self.op.remote_node)
11062 # FIXME: here we assume that the old instance type is DT_PLAIN
11063 assert instance.disk_template == constants.DT_PLAIN
11064 disks = [{constants.IDISK_SIZE: d.size,
11065 constants.IDISK_VG: d.logical_id[0]}
11066 for d in instance.disks]
11067 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
11068 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
11070 # hvparams processing
11071 if self.op.hvparams:
11072 hv_type = instance.hypervisor
11073 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
11074 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
11075 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
11077 # local check
11078 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
11079 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
11080 self.hv_proposed = self.hv_new = hv_new # the new actual values
11081 self.hv_inst = i_hvdict # the new dict (without defaults)
11082 else:
11083 self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
11084 instance.hvparams)
11085 self.hv_new = self.hv_inst = {}
11087 # beparams processing
11088 if self.op.beparams:
11089 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
11090 use_none=True)
11091 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
11092 be_new = cluster.SimpleFillBE(i_bedict)
11093 self.be_proposed = self.be_new = be_new # the new actual values
11094 self.be_inst = i_bedict # the new dict (without defaults)
11095 else:
11096 self.be_new = self.be_inst = {}
11097 self.be_proposed = cluster.SimpleFillBE(instance.beparams)
11098 be_old = cluster.FillBE(instance)
11100 # CPU param validation -- checking every time a parameter is
11101 # changed to cover all cases where either CPU mask or vcpus have
11102 # been changed
11103 if (constants.BE_VCPUS in self.be_proposed and
11104 constants.HV_CPU_MASK in self.hv_proposed):
11105 cpu_list = \
11106 utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
11107 # Verify mask is consistent with number of vCPUs. Can skip this
11108 # test if only 1 entry in the CPU mask, which means same mask
11109 # is applied to all vCPUs.
11110 if (len(cpu_list) > 1 and
11111 len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
11112 raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
11114 (self.be_proposed[constants.BE_VCPUS],
11115 self.hv_proposed[constants.HV_CPU_MASK]),
11116 errors.ECODE_INVAL)
11118 # Only perform this test if a new CPU mask is given
11119 if constants.HV_CPU_MASK in self.hv_new:
11120 # Calculate the largest CPU number requested
11121 max_requested_cpu = max(map(max, cpu_list))
11122 # Check that all of the instance's nodes have enough physical CPUs to
11123 # satisfy the requested CPU mask
11124 _CheckNodesPhysicalCPUs(self, instance.all_nodes,
11125 max_requested_cpu + 1, instance.hypervisor)
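# Worked example of the two checks above (mask syntax as accepted by
# utils.ParseMultiCpuMask; concrete values are hypothetical): with
# BE_VCPUS=4, "1-3" is a single entry applied to every vCPU and passes,
# "0:1:2:3" pins each of the four vCPUs and passes, while "0:1" has two
# entries for four vCPUs and is rejected. For "0:1:2:3" the highest CPU
# requested is 3, so every node must have at least 4 physical CPUs.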
11127 # osparams processing
11128 if self.op.osparams:
11129 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
11130 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
11131 self.os_inst = i_osdict # the new dict (without defaults)
11133 self.warn = []
11137 if (constants.BE_MEMORY in self.op.beparams and not self.op.force and
11138 be_new[constants.BE_MEMORY] > be_old[constants.BE_MEMORY]):
11139 mem_check_list = [pnode]
11140 if be_new[constants.BE_AUTO_BALANCE]:
11141 # either we changed auto_balance to yes or it was from before
11142 mem_check_list.extend(instance.secondary_nodes)
11143 instance_info = self.rpc.call_instance_info(pnode, instance.name,
11144 instance.hypervisor)
11145 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
11146 instance.hypervisor)
11147 pninfo = nodeinfo[pnode]
11148 msg = pninfo.fail_msg
11149 if msg:
11150 # Assume the primary node is unreachable and go ahead
11151 self.warn.append("Can't get info from primary node %s: %s" %
11152 (pnode, msg))
11153 elif not isinstance(pninfo.payload.get("memory_free", None), int):
11154 self.warn.append("Node data from primary node %s doesn't contain"
11155 " free memory information" % pnode)
11156 elif instance_info.fail_msg:
11157 self.warn.append("Can't get instance runtime information: %s" %
11158 instance_info.fail_msg)
11160 if instance_info.payload:
11161 current_mem = int(instance_info.payload["memory"])
11162 else:
11163 # Assume instance not running
11164 # (there is a slight race condition here, but it's not very probable,
11165 # and we have no other way to check)
11166 current_mem = 0
11167 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
11168 pninfo.payload["memory_free"])
11169 if miss_mem > 0:
11170 raise errors.OpPrereqError("This change will prevent the instance"
11171 " from starting, due to %d MB of memory"
11172 " missing on its primary node" % miss_mem,
11173 errors.ECODE_NORES)
11175 if be_new[constants.BE_AUTO_BALANCE]:
11176 for node, nres in nodeinfo.items():
11177 if node not in instance.secondary_nodes:
11178 continue
11179 nres.Raise("Can't get info from secondary node %s" % node,
11180 prereq=True, ecode=errors.ECODE_STATE)
11181 if not isinstance(nres.payload.get("memory_free", None), int):
11182 raise errors.OpPrereqError("Secondary node %s didn't return free"
11183 " memory information" % node,
11184 errors.ECODE_STATE)
11185 elif be_new[constants.BE_MEMORY] > nres.payload["memory_free"]:
11186 raise errors.OpPrereqError("This change will prevent the instance"
11187 " from failover to its secondary node"
11188 " %s, due to not enough memory" % node,
11189 errors.ECODE_STATE)
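# Worked example of the memory accounting above (hypothetical values, MB):
# raising BE_MEMORY to 4096 while the instance currently uses 1024 and the
# primary node reports 2048 free gives miss_mem = 4096 - 1024 - 2048 = 1024,
# which is > 0, so the change is refused with ECODE_NORES.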
11192 self.nic_pnew = {}
11193 self.nic_pinst = {}
11194 for nic_op, nic_dict in self.op.nics:
11195 if nic_op == constants.DDM_REMOVE:
11196 if not instance.nics:
11197 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
11198 errors.ECODE_INVAL)
11199 continue
11200 if nic_op != constants.DDM_ADD:
11202 if not instance.nics:
11203 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
11204 " no NICs" % nic_op,
11205 errors.ECODE_INVAL)
11206 if nic_op < 0 or nic_op >= len(instance.nics):
11207 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
11208 " are 0 to %d" %
11209 (nic_op, len(instance.nics) - 1),
11210 errors.ECODE_INVAL)
11211 old_nic_params = instance.nics[nic_op].nicparams
11212 old_nic_ip = instance.nics[nic_op].ip
11213 else:
11214 old_nic_params = {}
11215 old_nic_ip = None
11217 update_params_dict = dict([(key, nic_dict[key])
11218 for key in constants.NICS_PARAMETERS
11219 if key in nic_dict])
11221 if "bridge" in nic_dict:
11222 update_params_dict[constants.NIC_LINK] = nic_dict["bridge"]
11224 new_nic_params = _GetUpdatedParams(old_nic_params,
11225 update_params_dict)
11226 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
11227 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
11228 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
11229 self.nic_pinst[nic_op] = new_nic_params
11230 self.nic_pnew[nic_op] = new_filled_nic_params
11231 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
11233 if new_nic_mode == constants.NIC_MODE_BRIDGED:
11234 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
11235 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
11236 if msg:
11237 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
11238 if self.op.force:
11239 self.warn.append(msg)
11240 else:
11241 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
11242 if new_nic_mode == constants.NIC_MODE_ROUTED:
11243 if constants.INIC_IP in nic_dict:
11244 nic_ip = nic_dict[constants.INIC_IP]
11245 else:
11246 nic_ip = old_nic_ip
11247 if nic_ip is None:
11248 raise errors.OpPrereqError("Cannot set the nic ip to None"
11249 " on a routed nic", errors.ECODE_INVAL)
11250 if constants.INIC_MAC in nic_dict:
11251 nic_mac = nic_dict[constants.INIC_MAC]
11252 if nic_mac is None:
11253 raise errors.OpPrereqError("Cannot set the nic mac to None",
11254 errors.ECODE_INVAL)
11255 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
11256 # otherwise generate the mac
11257 nic_dict[constants.INIC_MAC] = \
11258 self.cfg.GenerateMAC(self.proc.GetECId())
11259 else:
11260 # or validate/reserve the current one
11261 try:
11262 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
11263 except errors.ReservationError:
11264 raise errors.OpPrereqError("MAC address %s already in use"
11265 " in cluster" % nic_mac,
11266 errors.ECODE_NOTUNIQUE)
11269 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
11270 raise errors.OpPrereqError("Disk operations not supported for"
11271 " diskless instances",
11272 errors.ECODE_INVAL)
11273 for disk_op, _ in self.op.disks:
11274 if disk_op == constants.DDM_REMOVE:
11275 if len(instance.disks) == 1:
11276 raise errors.OpPrereqError("Cannot remove the last disk of"
11277 " an instance", errors.ECODE_INVAL)
11278 _CheckInstanceDown(self, instance, "cannot remove disks")
11280 if (disk_op == constants.DDM_ADD and
11281 len(instance.disks) >= constants.MAX_DISKS):
11282 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
11283 " add more" % constants.MAX_DISKS,
11284 errors.ECODE_STATE)
11285 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
11287 if disk_op < 0 or disk_op >= len(instance.disks):
11288 raise errors.OpPrereqError("Invalid disk index %s, valid values"
11289 " are 0 to %d" %
11290 (disk_op, len(instance.disks)),
11291 errors.ECODE_INVAL)
11295 def _ConvertPlainToDrbd(self, feedback_fn):
11296 """Converts an instance from plain to drbd.
11299 feedback_fn("Converting template to drbd")
11300 instance = self.instance
11301 pnode = instance.primary_node
11302 snode = self.op.remote_node
11304 # create a fake disk info for _GenerateDiskTemplate
11305 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
11306 constants.IDISK_VG: d.logical_id[0]}
11307 for d in instance.disks]
11308 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
11309 instance.name, pnode, [snode],
11310 disk_info, None, None, 0, feedback_fn)
11311 info = _GetInstanceInfoText(instance)
11312 feedback_fn("Creating aditional volumes...")
11313 # first, create the missing data and meta devices
11314 for disk in new_disks:
11315 # unfortunately this is... not too nice
11316 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
11317 info, True)
11318 for child in disk.children:
11319 _CreateSingleBlockDev(self, snode, instance, child, info, True)
11320 # at this stage, all new LVs have been created, we can rename the
11321 # old ones
11322 feedback_fn("Renaming original volumes...")
11323 rename_list = [(o, n.children[0].logical_id)
11324 for (o, n) in zip(instance.disks, new_disks)]
11325 result = self.rpc.call_blockdev_rename(pnode, rename_list)
11326 result.Raise("Failed to rename original LVs")
11328 feedback_fn("Initializing DRBD devices...")
11329 # all child devices are in place, we can now create the DRBD devices
11330 for disk in new_disks:
11331 for node in [pnode, snode]:
11332 f_create = node == pnode
11333 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
11335 # at this point, the instance has been modified
11336 instance.disk_template = constants.DT_DRBD8
11337 instance.disks = new_disks
11338 self.cfg.Update(instance, feedback_fn)
11340 # disks are created, waiting for sync
11341 disk_abort = not _WaitForSync(self, instance,
11342 oneshot=not self.op.wait_for_sync)
11343 if disk_abort:
11344 raise errors.OpExecError("There are some degraded disks for"
11345 " this instance, please cleanup manually")
11347 def _ConvertDrbdToPlain(self, feedback_fn):
11348 """Converts an instance from drbd to plain.
11351 instance = self.instance
11352 assert len(instance.secondary_nodes) == 1
11353 pnode = instance.primary_node
11354 snode = instance.secondary_nodes[0]
11355 feedback_fn("Converting template to plain")
11357 old_disks = instance.disks
11358 new_disks = [d.children[0] for d in old_disks]
11360 # copy over size and mode
11361 for parent, child in zip(old_disks, new_disks):
11362 child.size = parent.size
11363 child.mode = parent.mode
11365 # update instance structure
11366 instance.disks = new_disks
11367 instance.disk_template = constants.DT_PLAIN
11368 self.cfg.Update(instance, feedback_fn)
11370 feedback_fn("Removing volumes on the secondary node...")
11371 for disk in old_disks:
11372 self.cfg.SetDiskID(disk, snode)
11373 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
11375 self.LogWarning("Could not remove block device %s on node %s,"
11376 " continuing anyway: %s", disk.iv_name, snode, msg)
11378 feedback_fn("Removing unneeded volumes on the primary node...")
11379 for idx, disk in enumerate(old_disks):
11380 meta = disk.children[1]
11381 self.cfg.SetDiskID(meta, pnode)
11382 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
11384 self.LogWarning("Could not remove metadata for disk %d on node %s,"
11385 " continuing anyway: %s", idx, pnode, msg)
11387 def Exec(self, feedback_fn):
11388 """Modifies an instance.
11390 All parameters take effect only at the next restart of the instance.
11393 # Process the warnings from CheckPrereq here, as we don't have a
11394 # feedback_fn there.
11395 for warn in self.warn:
11396 feedback_fn("WARNING: %s" % warn)
11398 result = []
11399 instance = self.instance
11400 # disk changes
11401 for disk_op, disk_dict in self.op.disks:
11402 if disk_op == constants.DDM_REMOVE:
11403 # remove the last disk
11404 device = instance.disks.pop()
11405 device_idx = len(instance.disks)
11406 for node, disk in device.ComputeNodeTree(instance.primary_node):
11407 self.cfg.SetDiskID(disk, node)
11408 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
11410 self.LogWarning("Could not remove disk/%d on node %s: %s,"
11411 " continuing anyway", device_idx, node, msg)
11412 result.append(("disk/%d" % device_idx, "remove"))
11413 elif disk_op == constants.DDM_ADD:
11415 if instance.disk_template in (constants.DT_FILE,
11416 constants.DT_SHARED_FILE):
11417 file_driver, file_path = instance.disks[0].logical_id
11418 file_path = os.path.dirname(file_path)
11419 else:
11420 file_driver = file_path = None
11421 disk_idx_base = len(instance.disks)
11422 new_disk = _GenerateDiskTemplate(self,
11423 instance.disk_template,
11424 instance.name, instance.primary_node,
11425 instance.secondary_nodes,
11426 [disk_dict],
11427 file_path,
11428 file_driver,
11429 disk_idx_base, feedback_fn)[0]
11430 instance.disks.append(new_disk)
11431 info = _GetInstanceInfoText(instance)
11433 logging.info("Creating volume %s for instance %s",
11434 new_disk.iv_name, instance.name)
11435 # Note: this needs to be kept in sync with _CreateDisks
11437 for node in instance.all_nodes:
11438 f_create = node == instance.primary_node
11439 try:
11440 _CreateBlockDev(self, node, instance, new_disk,
11441 f_create, info, f_create)
11442 except errors.OpExecError, err:
11443 self.LogWarning("Failed to create volume %s (%s) on"
11444 " node %s: %s",
11445 new_disk.iv_name, new_disk, node, err)
11446 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
11447 (new_disk.size, new_disk.mode)))
11448 else:
11449 # change a given disk
11450 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
11451 result.append(("disk.mode/%d" % disk_op,
11452 disk_dict[constants.IDISK_MODE]))
11454 if self.op.disk_template:
11455 r_shut = _ShutdownInstanceDisks(self, instance)
11457 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
11458 " proceed with disk template conversion")
11459 mode = (instance.disk_template, self.op.disk_template)
11460 try:
11461 self._DISK_CONVERSIONS[mode](self, feedback_fn)
11462 except:
11463 self.cfg.ReleaseDRBDMinors(instance.name)
11464 raise
11465 result.append(("disk_template", self.op.disk_template))
11468 for nic_op, nic_dict in self.op.nics:
11469 if nic_op == constants.DDM_REMOVE:
11470 # remove the last nic
11471 del instance.nics[-1]
11472 result.append(("nic.%d" % len(instance.nics), "remove"))
11473 elif nic_op == constants.DDM_ADD:
11474 # mac and bridge should be set by now
11475 mac = nic_dict[constants.INIC_MAC]
11476 ip = nic_dict.get(constants.INIC_IP, None)
11477 nicparams = self.nic_pinst[constants.DDM_ADD]
11478 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
11479 instance.nics.append(new_nic)
11480 result.append(("nic.%d" % (len(instance.nics) - 1),
11481 "add:mac=%s,ip=%s,mode=%s,link=%s" %
11482 (new_nic.mac, new_nic.ip,
11483 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
11484 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
11485 )))
11486 else:
11487 for key in (constants.INIC_MAC, constants.INIC_IP):
11488 if key in nic_dict:
11489 setattr(instance.nics[nic_op], key, nic_dict[key])
11490 if nic_op in self.nic_pinst:
11491 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
11492 for key, val in nic_dict.iteritems():
11493 result.append(("nic.%s/%d" % (key, nic_op), val))
11496 if self.op.hvparams:
11497 instance.hvparams = self.hv_inst
11498 for key, val in self.op.hvparams.iteritems():
11499 result.append(("hv/%s" % key, val))
11502 if self.op.beparams:
11503 instance.beparams = self.be_inst
11504 for key, val in self.op.beparams.iteritems():
11505 result.append(("be/%s" % key, val))
11508 if self.op.os_name:
11509 instance.os = self.op.os_name
11512 if self.op.osparams:
11513 instance.osparams = self.os_inst
11514 for key, val in self.op.osparams.iteritems():
11515 result.append(("os/%s" % key, val))
11517 self.cfg.Update(instance, feedback_fn)
11519 return result
11521 _DISK_CONVERSIONS = {
11522 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
11523 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
11524 }
11527 class LUInstanceChangeGroup(LogicalUnit):
11528 HPATH = "instance-change-group"
11529 HTYPE = constants.HTYPE_INSTANCE
11532 def ExpandNames(self):
11533 self.share_locks = _ShareAll()
11534 self.needed_locks = {
11535 locking.LEVEL_NODEGROUP: [],
11536 locking.LEVEL_NODE: [],
11539 self._ExpandAndLockInstance()
11541 if self.op.target_groups:
11542 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
11543 self.op.target_groups)
11544 else:
11545 self.req_target_uuids = None
11547 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
11549 def DeclareLocks(self, level):
11550 if level == locking.LEVEL_NODEGROUP:
11551 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
11553 if self.req_target_uuids:
11554 lock_groups = set(self.req_target_uuids)
11555 else:
11556 # Lock all groups used by instance optimistically; this requires going
11557 # via the node before it's locked, requiring verification later on
11558 instance_groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
11559 lock_groups.update(instance_groups)
11560 else:
11561 # No target groups, need to lock all of them
11562 lock_groups = locking.ALL_SET
11564 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
11566 elif level == locking.LEVEL_NODE:
11567 if self.req_target_uuids:
11568 # Lock all nodes used by instances
11569 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
11570 self._LockInstancesNodes()
11571 else:
11572 # Lock all nodes in all potential target groups
11573 lock_groups = (frozenset(self.owned_locks(locking.LEVEL_NODEGROUP)) -
11574 self.cfg.GetInstanceNodeGroups(self.op.instance_name))
11575 member_nodes = [node_name
11576 for group in lock_groups
11577 for node_name in self.cfg.GetNodeGroup(group).members]
11578 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
11579 else:
11580 # Lock all nodes as all groups are potential targets
11581 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11583 def CheckPrereq(self):
11584 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
11585 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
11586 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
11588 assert (self.req_target_uuids is None or
11589 owned_groups.issuperset(self.req_target_uuids))
11590 assert owned_instances == set([self.op.instance_name])
11592 # Get instance information
11593 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
11595 # Check if node groups for locked instance are still correct
11596 assert owned_nodes.issuperset(self.instance.all_nodes), \
11597 ("Instance %s's nodes changed while we kept the lock" %
11598 self.op.instance_name)
11600 inst_groups = _CheckInstanceNodeGroups(self.cfg, self.op.instance_name,
11601 owned_groups)
11603 if self.req_target_uuids:
11604 # User requested specific target groups
11605 self.target_uuids = self.req_target_uuids
11606 else:
11607 # All groups except those used by the instance are potential targets
11608 self.target_uuids = owned_groups - inst_groups
11610 conflicting_groups = self.target_uuids & inst_groups
11611 if conflicting_groups:
11612 raise errors.OpPrereqError("Can't use group(s) '%s' as targets, they are"
11613 " used by the instance '%s'" %
11614 (utils.CommaJoin(conflicting_groups),
11615 self.op.instance_name),
11616 errors.ECODE_INVAL)
11618 if not self.target_uuids:
11619 raise errors.OpPrereqError("There are no possible target groups",
11620 errors.ECODE_INVAL)
11622 def BuildHooksEnv(self):
11623 """Build hooks env.
11626 assert self.target_uuids
11629 "TARGET_GROUPS": " ".join(self.target_uuids),
11632 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11634 return env
11636 def BuildHooksNodes(self):
11637 """Build hooks nodes.
11640 mn = self.cfg.GetMasterNode()
11641 return ([mn], [mn])
11643 def Exec(self, feedback_fn):
11644 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
11646 assert instances == [self.op.instance_name], "Instance not locked"
11648 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
11649 instances=instances, target_groups=list(self.target_uuids))
11651 ial.Run(self.op.iallocator)
11653 if not ial.success:
11654 raise errors.OpPrereqError("Can't compute solution for changing group of"
11655 " instance '%s' using iallocator '%s': %s" %
11656 (self.op.instance_name, self.op.iallocator,
11657 ial.info),
11658 errors.ECODE_NORES)
11660 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
11662 self.LogInfo("Iallocator returned %s job(s) for changing group of"
11663 " instance '%s'", len(jobs), self.op.instance_name)
11665 return ResultWithJobs(jobs)
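# Shape sketch of the value returned above (the opcodes shown are
# illustrative, not necessarily what the iallocator emits): each inner list
# is submitted as one job by the processor, e.g.
#   ResultWithJobs([[opcodes.OpInstanceMigrate(instance_name="inst1")],
#                   [opcodes.OpInstanceReplaceDisks(instance_name="inst1")]])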
11668 class LUBackupQuery(NoHooksLU):
11669 """Query the exports list
11674 def ExpandNames(self):
11675 self.needed_locks = {}
11676 self.share_locks[locking.LEVEL_NODE] = 1
11677 if not self.op.nodes:
11678 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11679 else:
11680 self.needed_locks[locking.LEVEL_NODE] = \
11681 _GetWantedNodes(self, self.op.nodes)
11683 def Exec(self, feedback_fn):
11684 """Compute the list of all the exported system images.
11687 @return: a dictionary with the structure node->(export-list)
11688 where export-list is a list of the instances exported on
11689 that node.
11692 self.nodes = self.owned_locks(locking.LEVEL_NODE)
11693 rpcresult = self.rpc.call_export_list(self.nodes)
11695 for node in rpcresult:
11696 if rpcresult[node].fail_msg:
11697 result[node] = False
11698 else:
11699 result[node] = rpcresult[node].payload
11701 return result
11704 class LUBackupPrepare(NoHooksLU):
11705 """Prepares an instance for an export and returns useful information.
11710 def ExpandNames(self):
11711 self._ExpandAndLockInstance()
11713 def CheckPrereq(self):
11714 """Check prerequisites.
11717 instance_name = self.op.instance_name
11719 self.instance = self.cfg.GetInstanceInfo(instance_name)
11720 assert self.instance is not None, \
11721 "Cannot retrieve locked instance %s" % self.op.instance_name
11722 _CheckNodeOnline(self, self.instance.primary_node)
11724 self._cds = _GetClusterDomainSecret()
11726 def Exec(self, feedback_fn):
11727 """Prepares an instance for an export.
11730 instance = self.instance
11732 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11733 salt = utils.GenerateSecret(8)
11735 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
11736 result = self.rpc.call_x509_cert_create(instance.primary_node,
11737 constants.RIE_CERT_VALIDITY)
11738 result.Raise("Can't create X509 key and certificate on %s" % result.node)
11740 (name, cert_pem) = result.payload
11742 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
11743 cert_pem)
11745 return {
11746 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
11748 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt), salt),
11749 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
11750 }
11755 class LUBackupExport(LogicalUnit):
11756 """Export an instance to an image in the cluster.
11759 HPATH = "instance-export"
11760 HTYPE = constants.HTYPE_INSTANCE
11763 def CheckArguments(self):
11764 """Check the arguments.
11767 self.x509_key_name = self.op.x509_key_name
11768 self.dest_x509_ca_pem = self.op.destination_x509_ca
11770 if self.op.mode == constants.EXPORT_MODE_REMOTE:
11771 if not self.x509_key_name:
11772 raise errors.OpPrereqError("Missing X509 key name for encryption",
11773 errors.ECODE_INVAL)
11775 if not self.dest_x509_ca_pem:
11776 raise errors.OpPrereqError("Missing destination X509 CA",
11777 errors.ECODE_INVAL)
11779 def ExpandNames(self):
11780 self._ExpandAndLockInstance()
11782 # Lock all nodes for local exports
11783 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11784 # FIXME: lock only instance primary and destination node
11786 # Sad but true, for now we have to lock all nodes, as we don't know where
11787 # the previous export might be, and in this LU we search for it and
11788 # remove it from its current node. In the future we could fix this by:
11789 # - making a tasklet to search (share-lock all), then create the
11790 # new one, then one to remove, after
11791 # - removing the removal operation altogether
11792 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
11794 def DeclareLocks(self, level):
11795 """Last minute lock declaration."""
11796 # All nodes are locked anyway, so nothing to do here.
11798 def BuildHooksEnv(self):
11799 """Build hooks env.
11801 This will run on the master, primary node and target node.
11805 "EXPORT_MODE": self.op.mode,
11806 "EXPORT_NODE": self.op.target_node,
11807 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
11808 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
11809 # TODO: Generic function for boolean env variables
11810 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
11811 }
11813 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
11815 return env
11817 def BuildHooksNodes(self):
11818 """Build hooks nodes.
11821 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
11823 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11824 nl.append(self.op.target_node)
11826 return (nl, nl)
11828 def CheckPrereq(self):
11829 """Check prerequisites.
11831 This checks that the instance and node names are valid.
11834 instance_name = self.op.instance_name
11836 self.instance = self.cfg.GetInstanceInfo(instance_name)
11837 assert self.instance is not None, \
11838 "Cannot retrieve locked instance %s" % self.op.instance_name
11839 _CheckNodeOnline(self, self.instance.primary_node)
11841 if (self.op.remove_instance and self.instance.admin_up and
11842 not self.op.shutdown):
11843 raise errors.OpPrereqError("Can not remove instance without shutting it"
11846 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11847 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
11848 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
11849 assert self.dst_node is not None
11851 _CheckNodeOnline(self, self.dst_node.name)
11852 _CheckNodeNotDrained(self, self.dst_node.name)
11855 self.dest_disk_info = None
11856 self.dest_x509_ca = None
11858 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11859 self.dst_node = None
11861 if len(self.op.target_node) != len(self.instance.disks):
11862 raise errors.OpPrereqError(("Received destination information for %s"
11863 " disks, but instance %s has %s disks") %
11864 (len(self.op.target_node), instance_name,
11865 len(self.instance.disks)),
11866 errors.ECODE_INVAL)
11868 cds = _GetClusterDomainSecret()
11870 # Check X509 key name
11871 try:
11872 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
11873 except (TypeError, ValueError), err:
11874 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
11876 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
11877 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
11878 errors.ECODE_INVAL)
11880 # Load and verify CA
11881 try:
11882 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
11883 except OpenSSL.crypto.Error, err:
11884 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
11885 (err, ), errors.ECODE_INVAL)
11887 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
11888 if errcode is not None:
11889 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
11890 (msg, ), errors.ECODE_INVAL)
11892 self.dest_x509_ca = cert
11894 # Verify target information
11895 disk_info = []
11896 for idx, disk_data in enumerate(self.op.target_node):
11897 try:
11898 (host, port, magic) = \
11899 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
11900 except errors.GenericError, err:
11901 raise errors.OpPrereqError("Target info for disk %s: %s" %
11902 (idx, err), errors.ECODE_INVAL)
11904 disk_info.append((host, port, magic))
11906 assert len(disk_info) == len(self.op.target_node)
11907 self.dest_disk_info = disk_info
11910 raise errors.ProgrammerError("Unhandled export mode %r" %
11913 # instance disk type verification
11914 # TODO: Implement export support for file-based disks
11915 for disk in self.instance.disks:
11916 if disk.dev_type == constants.LD_FILE:
11917 raise errors.OpPrereqError("Export not supported for instances with"
11918 " file-based disks", errors.ECODE_INVAL)
11920 def _CleanupExports(self, feedback_fn):
11921 """Removes exports of current instance from all other nodes.
11923 If an instance in a cluster with nodes A..D was exported to node C, its
11924 exports will be removed from the nodes A, B and D.
11927 assert self.op.mode != constants.EXPORT_MODE_REMOTE
11929 nodelist = self.cfg.GetNodeList()
11930 nodelist.remove(self.dst_node.name)
11932 # on one-node clusters nodelist will be empty after the removal
11933 # if we proceed, the backup would be removed because OpBackupQuery
11934 # substitutes an empty list with the full cluster node list.
11935 iname = self.instance.name
11937 feedback_fn("Removing old exports for instance %s" % iname)
11938 exportlist = self.rpc.call_export_list(nodelist)
11939 for node in exportlist:
11940 if exportlist[node].fail_msg:
11941 continue
11942 if iname in exportlist[node].payload:
11943 msg = self.rpc.call_export_remove(node, iname).fail_msg
11945 self.LogWarning("Could not remove older export for instance %s"
11946 " on node %s: %s", iname, node, msg)
11948 def Exec(self, feedback_fn):
11949 """Export an instance to an image in the cluster.
11952 assert self.op.mode in constants.EXPORT_MODES
11954 instance = self.instance
11955 src_node = instance.primary_node
11957 if self.op.shutdown:
11958 # shutdown the instance, but not the disks
11959 feedback_fn("Shutting down instance %s" % instance.name)
11960 result = self.rpc.call_instance_shutdown(src_node, instance,
11961 self.op.shutdown_timeout)
11962 # TODO: Maybe ignore failures if ignore_remove_failures is set
11963 result.Raise("Could not shutdown instance %s on"
11964 " node %s" % (instance.name, src_node))
11966 # set the disks ID correctly since call_instance_start needs the
11967 # correct drbd minor to create the symlinks
11968 for disk in instance.disks:
11969 self.cfg.SetDiskID(disk, src_node)
11971 activate_disks = (not instance.admin_up)
11973 if activate_disks:
11974 # Activate the instance disks if we're exporting a stopped instance
11975 feedback_fn("Activating disks for %s" % instance.name)
11976 _StartInstanceDisks(self, instance, None)
11979 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
11980 instance)
11982 helper.CreateSnapshots()
11983 try:
11984 if (self.op.shutdown and instance.admin_up and
11985 not self.op.remove_instance):
11986 assert not activate_disks
11987 feedback_fn("Starting instance %s" % instance.name)
11988 result = self.rpc.call_instance_start(src_node,
11989 (instance, None, None), False)
11990 msg = result.fail_msg
11992 feedback_fn("Failed to start instance: %s" % msg)
11993 _ShutdownInstanceDisks(self, instance)
11994 raise errors.OpExecError("Could not start instance: %s" % msg)
11996 if self.op.mode == constants.EXPORT_MODE_LOCAL:
11997 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
11998 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
11999 connect_timeout = constants.RIE_CONNECT_TIMEOUT
12000 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
12002 (key_name, _, _) = self.x509_key_name
12004 dest_ca_pem = \
12005 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
12006 self.dest_x509_ca)
12008 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
12009 key_name, dest_ca_pem,
12010 timeouts)
12014 # Check for backwards compatibility
12015 assert len(dresults) == len(instance.disks)
12016 assert compat.all(isinstance(i, bool) for i in dresults), \
12017 "Not all results are boolean: %r" % dresults
12021 feedback_fn("Deactivating disks for %s" % instance.name)
12022 _ShutdownInstanceDisks(self, instance)
12024 if not (compat.all(dresults) and fin_resu):
12025 failures = []
12026 if not fin_resu:
12027 failures.append("export finalization")
12028 if not compat.all(dresults):
12029 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
12030 if not dsk)
12031 failures.append("disk export: disk(s) %s" % fdsk)
12033 raise errors.OpExecError("Export failed, errors in %s" %
12034 utils.CommaJoin(failures))
12036 # At this point, the export was successful, we can cleanup/finish
12038 # Remove instance if requested
12039 if self.op.remove_instance:
12040 feedback_fn("Removing instance %s" % instance.name)
12041 _RemoveInstance(self, feedback_fn, instance,
12042 self.op.ignore_remove_failures)
12044 if self.op.mode == constants.EXPORT_MODE_LOCAL:
12045 self._CleanupExports(feedback_fn)
12047 return fin_resu, dresults
12050 class LUBackupRemove(NoHooksLU):
12051 """Remove exports related to the named instance.
12056 def ExpandNames(self):
12057 self.needed_locks = {}
12058 # We need all nodes to be locked in order for RemoveExport to work, but we
12059 # don't need to lock the instance itself, as nothing will happen to it (and
12060 # we can remove exports also for a removed instance)
12061 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
12063 def Exec(self, feedback_fn):
12064 """Remove any export.
12067 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
12068 # If the instance was not found we'll try with the name that was passed in.
12069 # This will only work if it was an FQDN, though.
12070 fqdn_warn = False
12071 if not instance_name:
12072 fqdn_warn = True
12073 instance_name = self.op.instance_name
12075 locked_nodes = self.owned_locks(locking.LEVEL_NODE)
12076 exportlist = self.rpc.call_export_list(locked_nodes)
12077 found = False
12078 for node in exportlist:
12079 msg = exportlist[node].fail_msg
12080 if msg:
12081 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
12082 continue
12083 if instance_name in exportlist[node].payload:
12084 found = True
12085 result = self.rpc.call_export_remove(node, instance_name)
12086 msg = result.fail_msg
12087 if msg:
12088 logging.error("Could not remove export for instance %s"
12089 " on node %s: %s", instance_name, node, msg)
12091 if fqdn_warn and not found:
12092 feedback_fn("Export not found. If trying to remove an export belonging"
12093 " to a deleted instance please use its Fully Qualified"
12097 class LUGroupAdd(LogicalUnit):
12098 """Logical unit for creating node groups.
12101 HPATH = "group-add"
12102 HTYPE = constants.HTYPE_GROUP
12105 def ExpandNames(self):
12106 # We need the new group's UUID here so that we can create and acquire the
12107 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
12108 # that it should not check whether the UUID exists in the configuration.
12109 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
12110 self.needed_locks = {}
12111 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12113 def CheckPrereq(self):
12114 """Check prerequisites.
12116 This checks that the given group name is not an existing node group
12117 already.
12120 try:
12121 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12122 except errors.OpPrereqError:
12123 pass
12124 else:
12125 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
12126 " node group (UUID: %s)" %
12127 (self.op.group_name, existing_uuid),
12128 errors.ECODE_EXISTS)
12130 if self.op.ndparams:
12131 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12133 def BuildHooksEnv(self):
12134 """Build hooks env.
12138 "GROUP_NAME": self.op.group_name,
12141 def BuildHooksNodes(self):
12142 """Build hooks nodes.
12145 mn = self.cfg.GetMasterNode()
12146 return ([mn], [mn])
12148 def Exec(self, feedback_fn):
12149 """Add the node group to the cluster.
12152 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
12153 uuid=self.group_uuid,
12154 alloc_policy=self.op.alloc_policy,
12155 ndparams=self.op.ndparams)
12157 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
12158 del self.remove_locks[locking.LEVEL_NODEGROUP]
12161 class LUGroupAssignNodes(NoHooksLU):
12162 """Logical unit for assigning nodes to groups.
12167 def ExpandNames(self):
12168 # These raise errors.OpPrereqError on their own:
12169 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12170 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
12172 # We want to lock all the affected nodes and groups. We have readily
12173 # available the list of nodes, and the *destination* group. To gather the
12174 # list of "source" groups, we need to fetch node information later on.
12175 self.needed_locks = {
12176 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
12177 locking.LEVEL_NODE: self.op.nodes,
12180 def DeclareLocks(self, level):
12181 if level == locking.LEVEL_NODEGROUP:
12182 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
12184 # Try to get all affected nodes' groups without having the group or node
12185 # lock yet. Needs verification later in the code flow.
12186 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
12188 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
12190 def CheckPrereq(self):
12191 """Check prerequisites.
12194 assert self.needed_locks[locking.LEVEL_NODEGROUP]
12195 assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
12196 frozenset(self.op.nodes))
12198 expected_locks = (set([self.group_uuid]) |
12199 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
12200 actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
12201 if actual_locks != expected_locks:
12202 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
12203 " current groups are '%s', used to be '%s'" %
12204 (utils.CommaJoin(expected_locks),
12205 utils.CommaJoin(actual_locks)))
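# Example of the re-check above (hypothetical UUIDs): if DeclareLocks
# computed source groups {g-src} for the destination {g-dest}, but a node
# was meanwhile moved into g-other, then the expected set {g-dest, g-src}
# no longer equals the recomputed set {g-dest, g-src, g-other} and the LU
# aborts rather than operate on stale group membership.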
12207 self.node_data = self.cfg.GetAllNodesInfo()
12208 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12209 instance_data = self.cfg.GetAllInstancesInfo()
12211 if self.group is None:
12212 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12213 (self.op.group_name, self.group_uuid))
12215 (new_splits, previous_splits) = \
12216 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
12217 for node in self.op.nodes],
12218 self.node_data, instance_data)
12220 if new_splits:
12221 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
12223 if not self.op.force:
12224 raise errors.OpExecError("The following instances get split by this"
12225 " change and --force was not given: %s" %
12226 fmt_new_splits)
12227 else:
12228 self.LogWarning("This operation will split the following instances: %s",
12229 fmt_new_splits)
12231 if previous_splits:
12232 self.LogWarning("In addition, these already-split instances continue"
12233 " to be split across groups: %s",
12234 utils.CommaJoin(utils.NiceSort(previous_splits)))
12236 def Exec(self, feedback_fn):
12237 """Assign nodes to a new group.
12240 for node in self.op.nodes:
12241 self.node_data[node].group = self.group_uuid
12243 # FIXME: Depends on side-effects of modifying the result of
12244 # C{cfg.GetAllNodesInfo}
12246 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
12249 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
12250 """Check for split instances after a node assignment.
12252 This method considers a series of node assignments as an atomic operation,
12253 and returns information about split instances after applying the set of
12256 In particular, it returns information about newly split instances, and
12257 instances that were already split, and remain so after the change.
12259 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
12262 @type changes: list of (node_name, new_group_uuid) pairs.
12263 @param changes: list of node assignments to consider.
12264 @param node_data: a dict with data for all nodes
12265 @param instance_data: a dict with all instances to consider
12266 @rtype: a two-tuple
12267 @return: a list of instances that were previously okay and result split as a
12268 consequence of this change, and a list of instances that were previously
12269 split and this change does not fix.
12272 changed_nodes = dict((node, group) for node, group in changes
12273 if node_data[node].group != group)
12275 all_split_instances = set()
12276 previously_split_instances = set()
12278 def InstanceNodes(instance):
12279 return [instance.primary_node] + list(instance.secondary_nodes)
12281 for inst in instance_data.values():
12282 if inst.disk_template not in constants.DTS_INT_MIRROR:
12285 instance_nodes = InstanceNodes(inst)
12287 if len(set(node_data[node].group for node in instance_nodes)) > 1:
12288 previously_split_instances.add(inst.name)
12290 if len(set(changed_nodes.get(node, node_data[node].group)
12291 for node in instance_nodes)) > 1:
12292 all_split_instances.add(inst.name)
12294 return (list(all_split_instances - previously_split_instances),
12295 list(previously_split_instances & all_split_instances))
12298 class _GroupQuery(_QueryBase):
12299 FIELDS = query.GROUP_FIELDS
12301 def ExpandNames(self, lu):
12302 lu.needed_locks = {}
12304 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
12305 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
12307 if not self.names:
12308 self.wanted = [name_to_uuid[name]
12309 for name in utils.NiceSort(name_to_uuid.keys())]
12310 else:
12311 # Accept names to be either names or UUIDs.
12312 missing = []
12313 self.wanted = []
12314 all_uuid = frozenset(self._all_groups.keys())
12316 for name in self.names:
12317 if name in all_uuid:
12318 self.wanted.append(name)
12319 elif name in name_to_uuid:
12320 self.wanted.append(name_to_uuid[name])
12322 missing.append(name)
12324 if missing:
12325 raise errors.OpPrereqError("Some groups do not exist: %s" %
12326 utils.CommaJoin(missing),
12327 errors.ECODE_NOENT)
12329 def DeclareLocks(self, lu, level):
12330 pass
12332 def _GetQueryData(self, lu):
12333 """Computes the list of node groups and their attributes.
12336 do_nodes = query.GQ_NODE in self.requested_data
12337 do_instances = query.GQ_INST in self.requested_data
12339 group_to_nodes = None
12340 group_to_instances = None
12342 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
12343 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
12344 # latter GetAllInstancesInfo() is not enough, for we have to go through
12345 # instance->node. Hence, we will need to process nodes even if we only need
12346 # instance information.
12347 if do_nodes or do_instances:
12348 all_nodes = lu.cfg.GetAllNodesInfo()
12349 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
12350 node_to_group = {}
12352 for node in all_nodes.values():
12353 if node.group in group_to_nodes:
12354 group_to_nodes[node.group].append(node.name)
12355 node_to_group[node.name] = node.group
12357 if do_instances:
12358 all_instances = lu.cfg.GetAllInstancesInfo()
12359 group_to_instances = dict((uuid, []) for uuid in self.wanted)
12361 for instance in all_instances.values():
12362 node = instance.primary_node
12363 if node in node_to_group:
12364 group_to_instances[node_to_group[node]].append(instance.name)
12366 if not do_nodes:
12367 # Do not pass on node information if it was not requested.
12368 group_to_nodes = None
12370 return query.GroupQueryData([self._all_groups[uuid]
12371 for uuid in self.wanted],
12372 group_to_nodes, group_to_instances)
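# Sketch of the two reverse mappings built above, with plain dicts standing
# in for the configuration objects (all names hypothetical): nodes are
# bucketed by group, and instances are bucketed via their primary node.
def _SketchGroupMaps(node_to_group, inst_to_pnode, wanted_groups):
  group_to_nodes = dict((uuid, []) for uuid in wanted_groups)
  for (node, group) in sorted(node_to_group.items()):
    if group in group_to_nodes:
      group_to_nodes[group].append(node)
  group_to_instances = dict((uuid, []) for uuid in wanted_groups)
  for (inst, pnode) in sorted(inst_to_pnode.items()):
    group = node_to_group.get(pnode)
    if group in group_to_instances:
      group_to_instances[group].append(inst)
  return (group_to_nodes, group_to_instances)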
12375 class LUGroupQuery(NoHooksLU):
12376 """Logical unit for querying node groups.
12381 def CheckArguments(self):
12382 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
12383 self.op.output_fields, False)
12385 def ExpandNames(self):
12386 self.gq.ExpandNames(self)
12388 def DeclareLocks(self, level):
12389 self.gq.DeclareLocks(self, level)
12391 def Exec(self, feedback_fn):
12392 return self.gq.OldStyleQuery(self)
12395 class LUGroupSetParams(LogicalUnit):
12396 """Modifies the parameters of a node group.
12399 HPATH = "group-modify"
12400 HTYPE = constants.HTYPE_GROUP
12403 def CheckArguments(self):
12404 all_changes = [
12405 self.op.ndparams,
12406 self.op.alloc_policy,
12407 ]
12409 if all_changes.count(None) == len(all_changes):
12410 raise errors.OpPrereqError("Please pass at least one modification",
12411 errors.ECODE_INVAL)
12413 def ExpandNames(self):
12414 # This raises errors.OpPrereqError on its own:
12415 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12417 self.needed_locks = {
12418 locking.LEVEL_NODEGROUP: [self.group_uuid],
12421 def CheckPrereq(self):
12422 """Check prerequisites.
12425 self.group = self.cfg.GetNodeGroup(self.group_uuid)
12427 if self.group is None:
12428 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12429 (self.op.group_name, self.group_uuid))
12431 if self.op.ndparams:
12432 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
12433 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
12434 self.new_ndparams = new_ndparams
12436 def BuildHooksEnv(self):
12437 """Build hooks env.
12441 "GROUP_NAME": self.op.group_name,
12442 "NEW_ALLOC_POLICY": self.op.alloc_policy,
12445 def BuildHooksNodes(self):
12446 """Build hooks nodes.
12449 mn = self.cfg.GetMasterNode()
12450 return ([mn], [mn])
12452 def Exec(self, feedback_fn):
12453 """Modifies the node group.
12458 if self.op.ndparams:
12459 self.group.ndparams = self.new_ndparams
12460 result.append(("ndparams", str(self.group.ndparams)))
12462 if self.op.alloc_policy:
12463 self.group.alloc_policy = self.op.alloc_policy
12465 self.cfg.Update(self.group, feedback_fn)
12469 class LUGroupRemove(LogicalUnit):
12470 HPATH = "group-remove"
12471 HTYPE = constants.HTYPE_GROUP
12474 def ExpandNames(self):
12475 # This raises errors.OpPrereqError on its own:
12476 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12477 self.needed_locks = {
12478 locking.LEVEL_NODEGROUP: [self.group_uuid],
12481 def CheckPrereq(self):
12482 """Check prerequisites.
12484 This checks that the given group name exists as a node group, that it is
12485 empty (i.e., contains no nodes), and that it is not the last group of the
12486 cluster.
12489 # Verify that the group is empty.
12490 group_nodes = [node.name
12491 for node in self.cfg.GetAllNodesInfo().values()
12492 if node.group == self.group_uuid]
12495 raise errors.OpPrereqError("Group '%s' not empty, has the following"
12497 (self.op.group_name,
12498 utils.CommaJoin(utils.NiceSort(group_nodes))),
12499 errors.ECODE_STATE)
12501 # Verify the cluster would not be left group-less.
12502 if len(self.cfg.GetNodeGroupList()) == 1:
12503 raise errors.OpPrereqError("Group '%s' is the only group,"
12504 " cannot be removed" %
12505 self.op.group_name,
12506 errors.ECODE_STATE)
12508 def BuildHooksEnv(self):
12509 """Build hooks env.
12513 "GROUP_NAME": self.op.group_name,
12516 def BuildHooksNodes(self):
12517 """Build hooks nodes.
12520 mn = self.cfg.GetMasterNode()
12521 return ([mn], [mn])
12523 def Exec(self, feedback_fn):
12524 """Remove the node group.
12528 self.cfg.RemoveNodeGroup(self.group_uuid)
12529 except errors.ConfigurationError:
12530 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
12531 (self.op.group_name, self.group_uuid))
12533 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
12536 class LUGroupRename(LogicalUnit):
12537 HPATH = "group-rename"
12538 HTYPE = constants.HTYPE_GROUP
12541 def ExpandNames(self):
12542 # This raises errors.OpPrereqError on its own:
12543 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12545 self.needed_locks = {
12546 locking.LEVEL_NODEGROUP: [self.group_uuid],
12549 def CheckPrereq(self):
12550 """Check prerequisites.
12552 Ensures requested new name is not yet used.
12555 try:
12556 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
12557 except errors.OpPrereqError:
12558 pass
12559 else:
12560 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
12561 " node group (UUID: %s)" %
12562 (self.op.new_name, new_name_uuid),
12563 errors.ECODE_EXISTS)
12565 def BuildHooksEnv(self):
12566 """Build hooks env.
12570 "OLD_NAME": self.op.group_name,
12571 "NEW_NAME": self.op.new_name,
12574 def BuildHooksNodes(self):
12575 """Build hooks nodes.
12578 mn = self.cfg.GetMasterNode()
12580 all_nodes = self.cfg.GetAllNodesInfo()
12581 all_nodes.pop(mn, None)
12583 run_nodes = [mn]
12584 run_nodes.extend(node.name for node in all_nodes.values()
12585 if node.group == self.group_uuid)
12587 return (run_nodes, run_nodes)
12589 def Exec(self, feedback_fn):
12590 """Rename the node group.
12593 group = self.cfg.GetNodeGroup(self.group_uuid)
12596 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
12597 (self.op.group_name, self.group_uuid))
12599 group.name = self.op.new_name
12600 self.cfg.Update(group, feedback_fn)
12602 return self.op.new_name
12605 class LUGroupEvacuate(LogicalUnit):
12606 HPATH = "group-evacuate"
12607 HTYPE = constants.HTYPE_GROUP
12610 def ExpandNames(self):
12611 # This raises errors.OpPrereqError on its own:
12612 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
12614 if self.op.target_groups:
12615 self.req_target_uuids = map(self.cfg.LookupNodeGroup,
12616 self.op.target_groups)
12617 else:
12618 self.req_target_uuids = []
12620 if self.group_uuid in self.req_target_uuids:
12621 raise errors.OpPrereqError("Group to be evacuated (%s) can not be used"
12622 " as a target group (targets are %s)" %
12623 (self.group_uuid,
12624 utils.CommaJoin(self.req_target_uuids)),
12625 errors.ECODE_INVAL)
12627 self.op.iallocator = _GetDefaultIAllocator(self.cfg, self.op.iallocator)
12629 self.share_locks = _ShareAll()
12630 self.needed_locks = {
12631 locking.LEVEL_INSTANCE: [],
12632 locking.LEVEL_NODEGROUP: [],
12633 locking.LEVEL_NODE: [],
12636 def DeclareLocks(self, level):
12637 if level == locking.LEVEL_INSTANCE:
12638 assert not self.needed_locks[locking.LEVEL_INSTANCE]
12640 # Lock instances optimistically, needs verification once node and group
12641 # locks have been acquired
12642 self.needed_locks[locking.LEVEL_INSTANCE] = \
12643 self.cfg.GetNodeGroupInstances(self.group_uuid)
12645 elif level == locking.LEVEL_NODEGROUP:
12646 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
12648 if self.req_target_uuids:
12649 lock_groups = set([self.group_uuid] + self.req_target_uuids)
12650 else:
12651 # Lock all groups used by instances optimistically; this requires going
12652 # via the node before it's locked, requiring verification later on
12653 lock_groups.update(group_uuid
12654 for instance_name in
12655 self.owned_locks(locking.LEVEL_INSTANCE)
12656 for group_uuid in
12657 self.cfg.GetInstanceNodeGroups(instance_name))
12658 else:
12659 # No target groups, need to lock all of them
12660 lock_groups = locking.ALL_SET
12662 self.needed_locks[locking.LEVEL_NODEGROUP] = lock_groups
12664 elif level == locking.LEVEL_NODE:
12665 # This will only lock the nodes in the group to be evacuated which
12666 # contain actual instances
12667 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
12668 self._LockInstancesNodes()
12670 # Lock all nodes in group to be evacuated and target groups
12671 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12672 assert self.group_uuid in owned_groups
12673 member_nodes = [node_name
12674 for group in owned_groups
12675 for node_name in self.cfg.GetNodeGroup(group).members]
12676 self.needed_locks[locking.LEVEL_NODE].extend(member_nodes)
12678 def CheckPrereq(self):
12679 owned_instances = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))
12680 owned_groups = frozenset(self.owned_locks(locking.LEVEL_NODEGROUP))
12681 owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
12683 assert owned_groups.issuperset(self.req_target_uuids)
12684 assert self.group_uuid in owned_groups
12686 # Check if locked instances are still correct
12687 _CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instances)
12689 # Get instance information
12690 self.instances = dict(self.cfg.GetMultiInstanceInfo(owned_instances))
12692 # Check if node groups for locked instances are still correct
12693 for instance_name in owned_instances:
12694 inst = self.instances[instance_name]
12695 assert owned_nodes.issuperset(inst.all_nodes), \
12696 "Instance %s's nodes changed while we kept the lock" % instance_name
12698 inst_groups = _CheckInstanceNodeGroups(self.cfg, instance_name,
12699 owned_groups)
12701 assert self.group_uuid in inst_groups, \
12702 "Instance %s has no node in group %s" % (instance_name, self.group_uuid)
12704 if self.req_target_uuids:
12705 # User requested specific target groups
12706 self.target_uuids = self.req_target_uuids
12707 else:
12708 # All groups except the one to be evacuated are potential targets
12709 self.target_uuids = [group_uuid for group_uuid in owned_groups
12710 if group_uuid != self.group_uuid]
12712 if not self.target_uuids:
12713 raise errors.OpPrereqError("There are no possible target groups",
12714 errors.ECODE_INVAL)
12716 def BuildHooksEnv(self):
12717 """Build hooks env.
12721 "GROUP_NAME": self.op.group_name,
12722 "TARGET_GROUPS": " ".join(self.target_uuids),
12725 def BuildHooksNodes(self):
12726 """Build hooks nodes.
12729 mn = self.cfg.GetMasterNode()
12731 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
12733 run_nodes = [mn] + self.cfg.GetNodeGroup(self.group_uuid).members
12735 return (run_nodes, run_nodes)
12737 def Exec(self, feedback_fn):
12738 instances = list(self.owned_locks(locking.LEVEL_INSTANCE))
12740 assert self.group_uuid not in self.target_uuids
12742 ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_CHG_GROUP,
12743 instances=instances, target_groups=self.target_uuids)
12745 ial.Run(self.op.iallocator)
12747 if not ial.success:
12748 raise errors.OpPrereqError("Can't compute group evacuation using"
12749 " iallocator '%s': %s" %
12750 (self.op.iallocator, ial.info),
12751 errors.ECODE_NORES)
12753 jobs = _LoadNodeEvacResult(self, ial.result, self.op.early_release, False)
12755 self.LogInfo("Iallocator returned %s job(s) for evacuating node group %s",
12756 len(jobs), self.op.group_name)
12758 return ResultWithJobs(jobs)
12761 class TagsLU(NoHooksLU): # pylint: disable=W0223
12762 """Generic tags LU.
12764 This is an abstract class which is the parent of all the other tags LUs.
12767 def ExpandNames(self):
12768 self.group_uuid = None
12769 self.needed_locks = {}
12770 if self.op.kind == constants.TAG_NODE:
12771 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
12772 self.needed_locks[locking.LEVEL_NODE] = self.op.name
12773 elif self.op.kind == constants.TAG_INSTANCE:
12774 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
12775 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
12776 elif self.op.kind == constants.TAG_NODEGROUP:
12777 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
12779 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
12780 # not possible to acquire the BGL based on opcode parameters)
12782 def CheckPrereq(self):
12783 """Check prerequisites.
12786 if self.op.kind == constants.TAG_CLUSTER:
12787 self.target = self.cfg.GetClusterInfo()
12788 elif self.op.kind == constants.TAG_NODE:
12789 self.target = self.cfg.GetNodeInfo(self.op.name)
12790 elif self.op.kind == constants.TAG_INSTANCE:
12791 self.target = self.cfg.GetInstanceInfo(self.op.name)
12792 elif self.op.kind == constants.TAG_NODEGROUP:
12793 self.target = self.cfg.GetNodeGroup(self.group_uuid)
12795 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
12796 str(self.op.kind), errors.ECODE_INVAL)
12799 class LUTagsGet(TagsLU):
12800 """Returns the tags of a given object.
12805 def ExpandNames(self):
12806 TagsLU.ExpandNames(self)
12808 # Share locks as this is only a read operation
12809 self.share_locks = _ShareAll()
12811 def Exec(self, feedback_fn):
12812 """Returns the tag list.
12815 return list(self.target.GetTags())
12818 class LUTagsSearch(NoHooksLU):
12819 """Searches the tags for a given pattern.
12824 def ExpandNames(self):
12825 self.needed_locks = {}
12827 def CheckPrereq(self):
12828 """Check prerequisites.
12830 This checks the pattern passed for validity by compiling it.
12833 try:
12834 self.re = re.compile(self.op.pattern)
12835 except re.error, err:
12836 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
12837 (self.op.pattern, err), errors.ECODE_INVAL)
12839 def Exec(self, feedback_fn):
12840 """Returns the tag list.
12844 tgts = [("/cluster", cfg.GetClusterInfo())]
12845 ilist = cfg.GetAllInstancesInfo().values()
12846 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
12847 nlist = cfg.GetAllNodesInfo().values()
12848 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
12849 tgts.extend(("/nodegroup/%s" % n.name, n)
12850 for n in cfg.GetAllNodeGroupsInfo().values())
12851 results = []
12852 for path, target in tgts:
12853 for tag in target.GetTags():
12854 if self.re.search(tag):
12855 results.append((path, tag))
12856 return results
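# Usage sketch of the search above (the tags are hypothetical): patterns are
# regular expressions matched with re.search against every tag of every
# object, and matches are returned as (path, tag) pairs.
def _SketchSearchTags(tag_map, pattern):
  """tag_map: {path: set of tags}; returns matching (path, tag) pairs."""
  rx = re.compile(pattern)
  return [(path, tag)
          for path in sorted(tag_map)
          for tag in sorted(tag_map[path])
          if rx.search(tag)]

assert _SketchSearchTags({"/instances/inst1": set(["web", "db"])}, "^w") == \
  [("/instances/inst1", "web")]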
12859 class LUTagsSet(TagsLU):
12860 """Sets a tag on a given object.
12865 def CheckPrereq(self):
12866 """Check prerequisites.
12868 This checks the type and length of the tag name and value.
12871 TagsLU.CheckPrereq(self)
12872 for tag in self.op.tags:
12873 objects.TaggableObject.ValidateTag(tag)
12875 def Exec(self, feedback_fn):
12879 try:
12880 for tag in self.op.tags:
12881 self.target.AddTag(tag)
12882 except errors.TagError, err:
12883 raise errors.OpExecError("Error while setting tag: %s" % str(err))
12884 self.cfg.Update(self.target, feedback_fn)
12887 class LUTagsDel(TagsLU):
12888 """Delete a list of tags from a given object.
12893 def CheckPrereq(self):
12894 """Check prerequisites.
12896 This checks that we have the given tag.
12899 TagsLU.CheckPrereq(self)
12900 for tag in self.op.tags:
12901 objects.TaggableObject.ValidateTag(tag)
12902 del_tags = frozenset(self.op.tags)
12903 cur_tags = self.target.GetTags()
12905 diff_tags = del_tags - cur_tags
12907 diff_names = ("'%s'" % i for i in sorted(diff_tags))
12908 raise errors.OpPrereqError("Tag(s) %s not found" %
12909 (utils.CommaJoin(diff_names), ),
12910 errors.ECODE_NOENT)
12912 def Exec(self, feedback_fn):
12913 """Remove the tag from the object.
12916 for tag in self.op.tags:
12917 self.target.RemoveTag(tag)
12918 self.cfg.Update(self.target, feedback_fn)


class LUTestDelay(NoHooksLU):
  """Sleep for a specified amount of time.

  This LU sleeps on the master and/or nodes for a specified amount of
  time.

  """
  REQ_BGL = False

  def ExpandNames(self):
    """Expand names and set required locks.

    This expands the node list, if any.

    """
    self.needed_locks = {}
    if self.op.on_nodes:
      # _GetWantedNodes can be used here, but is not always appropriate to use
      # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
      # more information.
      self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
      self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes

  def _TestDelay(self):
    """Do the actual sleep.

    """
    if self.op.on_master:
      if not utils.TestDelay(self.op.duration):
        raise errors.OpExecError("Error during master delay test")
    if self.op.on_nodes:
      result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
      for node, node_result in result.items():
        node_result.Raise("Failure during rpc call to node %s" % node)

  def Exec(self, feedback_fn):
    """Execute the test delay opcode, with the wanted repetitions.

    """
    if self.op.repeat == 0:
      self._TestDelay()
    else:
      top_value = self.op.repeat - 1
      for i in range(self.op.repeat):
        self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
        self._TestDelay()
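
  # For example, repeat=0 sleeps exactly once, while repeat=3 sleeps three
  # times and logs iterations "0/2", "1/2" and "2/2".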


class LUTestJqueue(NoHooksLU):
  """Utility LU to test some aspects of the job queue.

  """
  REQ_BGL = False

  # Must be lower than default timeout for WaitForJobChange to see whether it
  # notices changed jobs
  _CLIENT_CONNECT_TIMEOUT = 20.0
  _CLIENT_CONFIRM_TIMEOUT = 60.0

  @classmethod
  def _NotifyUsingSocket(cls, cb, errcls):
    """Opens a Unix socket and waits for another program to connect.

    @type cb: callable
    @param cb: Callback to send socket name to client
    @type errcls: class
    @param errcls: Exception class to use for errors

    """
    # Using a temporary directory as there's no easy way to create temporary
    # sockets without writing a custom loop around tempfile.mktemp and
    # socket.bind
    tmpdir = tempfile.mkdtemp()
    try:
      tmpsock = utils.PathJoin(tmpdir, "sock")

      logging.debug("Creating temporary socket at %s", tmpsock)
      sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
      try:
        sock.bind(tmpsock)
        sock.listen(1)

        # Send details to client
        cb(tmpsock)

        # Wait for client to connect before continuing
        sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
        try:
          (conn, _) = sock.accept()
        except socket.error, err:
          raise errcls("Client didn't connect in time (%s)" % err)
      finally:
        sock.close()
    finally:
      # Remove as soon as client is connected
      shutil.rmtree(tmpdir)

    # Wait for client to close
    try:
      try:
        # pylint: disable=E1101
        # Instance of '_socketobject' has no ... member
        conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
        conn.recv(1)
      except socket.error, err:
        raise errcls("Client failed to confirm notification (%s)" % err)
    finally:
      conn.close()
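
  # Minimal client sketch (assumed behaviour, not part of this module): the
  # client receives the socket path via the callback and does roughly
  #
  #   sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  #   sock.connect(sockname)  # unblocks sock.accept() above
  #   sock.close()            # unblocks conn.recv(1) above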

  def _SendNotification(self, test, arg, sockname):
    """Sends a notification to the client.

    @type test: string
    @param test: Test name
    @param arg: Test argument (depends on test)
    @type sockname: string
    @param sockname: Socket path

    """
    self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))

  def _Notify(self, prereq, test, arg):
    """Notifies the client of a test.

    @type prereq: bool
    @param prereq: Whether this is a prereq-phase test
    @type test: string
    @param test: Test name
    @param arg: Test argument (depends on test)

    """
    if prereq:
      errcls = errors.OpPrereqError
    else:
      errcls = errors.OpExecError

    return self._NotifyUsingSocket(compat.partial(self._SendNotification,
                                                  test, arg),
                                   errcls)

  def CheckArguments(self):
    self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
    self.expandnames_calls = 0

  def ExpandNames(self):
    checkargs_calls = getattr(self, "checkargs_calls", 0)
    if checkargs_calls < 1:
      raise errors.ProgrammerError("CheckArguments was not called")

    self.expandnames_calls += 1

    if self.op.notify_waitlock:
      self._Notify(True, constants.JQT_EXPANDNAMES, None)

    self.LogInfo("Expanding names")

    # Get lock on master node (just to get a lock, not for a particular reason)
    self.needed_locks = {
      locking.LEVEL_NODE: self.cfg.GetMasterNode(),
      }

  def Exec(self, feedback_fn):
    if self.expandnames_calls < 1:
      raise errors.ProgrammerError("ExpandNames was not called")

    if self.op.notify_exec:
      self._Notify(False, constants.JQT_EXEC, None)

    self.LogInfo("Executing")

    if self.op.log_messages:
      self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
      for idx, msg in enumerate(self.op.log_messages):
        self.LogInfo("Sending log message %s", idx + 1)
        feedback_fn(constants.JQT_MSGPREFIX + msg)
        # Report how many test messages have been sent
        self._Notify(False, constants.JQT_LOGMSG, idx + 1)

    if self.op.fail:
      raise errors.OpExecError("Opcode failure was requested")

    return True


class IAllocator(object):
  """IAllocator framework.

  An IAllocator instance has three sets of attributes:
    - cfg that is needed to query the cluster
    - input data (all members of the _KEYS class attribute are required)
    - four buffer attributes (in|out_data|text), that represent the
      input (to the external script) in text and data structure format,
      and the output from it, again in two formats
    - the result variables from the script (success, info, nodes) for
      easy usage

  """
  # pylint: disable=R0902
  # lots of instance attributes

  def __init__(self, cfg, rpc_runner, mode, **kwargs):
    self.cfg = cfg
    self.rpc = rpc_runner
    # init buffer variables
    self.in_text = self.out_text = self.in_data = self.out_data = None
    # init all input fields so that pylint is happy
    self.mode = mode
    self.memory = self.disks = self.disk_template = None
    self.os = self.tags = self.nics = self.vcpus = None
    self.hypervisor = None
    self.relocate_from = None
    self.name = None
    self.instances = None
    self.evac_mode = None
    self.target_groups = []
    # computed fields
    self.required_nodes = None
    # init result fields
    self.success = self.info = self.result = None

    try:
      (fn, keydata, self._result_check) = self._MODE_DATA[self.mode]
    except KeyError:
      raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                   " IAllocator" % self.mode)

    keyset = [n for (n, _) in keydata]

    for key in kwargs:
      if key not in keyset:
        raise errors.ProgrammerError("Invalid input parameter '%s' to"
                                     " IAllocator" % key)
      setattr(self, key, kwargs[key])

    for key in keyset:
      if key not in kwargs:
        raise errors.ProgrammerError("Missing input parameter '%s' to"
                                     " IAllocator" % key)

    self._BuildInputData(compat.partial(fn, self), keydata)
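
  # Illustrative usage (hypothetical values): the keyword arguments must
  # match the keydata of the chosen mode exactly, e.g. for a relocation:
  #
  #   ial = IAllocator(self.cfg, self.rpc,
  #                    mode=constants.IALLOCATOR_MODE_RELOC,
  #                    name="inst1.example.com",
  #                    relocate_from=["node2.example.com"])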

  def _ComputeClusterData(self):
    """Compute the generic allocator input data.

    This is the data that is independent of the actual operation.

    """
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    # cluster data
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
      "cluster_tags": list(cluster_info.GetTags()),
      "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
      # we don't have job IDs
      }
    ninfo = cfg.GetAllNodesInfo()
    iinfo = cfg.GetAllInstancesInfo().values()
    i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]

    # node data
    node_list = [n.name for n in ninfo.values() if n.vm_capable]

    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      hypervisor_name = self.hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
    else:
      hypervisor_name = cluster_info.enabled_hypervisors[0]

    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)

    data["nodegroups"] = self._ComputeNodeGroupData(cfg)

    config_ndata = self._ComputeBasicNodeData(ninfo)
    data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
                                                 i_list, config_ndata)
    assert len(data["nodes"]) == len(ninfo), \
        "Incomplete node data computed"

    data["instances"] = self._ComputeInstanceData(cluster_info, i_list)

    self.in_data = data

  @staticmethod
  def _ComputeNodeGroupData(cfg):
    """Compute node groups data.

    """
    ng = dict((guuid, {
      "name": gdata.name,
      "alloc_policy": gdata.alloc_policy,
      })
      for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())

    return ng
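
  # The returned mapping is keyed by group UUID, e.g. (invented values):
  #   {"d7c84d6a": {"name": "default", "alloc_policy": "preferred"}}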

  @staticmethod
  def _ComputeBasicNodeData(node_cfg):
    """Compute global node data.

    @rtype: dict
    @returns: a dict of name: (node dict, node config)

    """
    # fill in static (config-based) values
    node_results = dict((ninfo.name, {
      "tags": list(ninfo.GetTags()),
      "primary_ip": ninfo.primary_ip,
      "secondary_ip": ninfo.secondary_ip,
      "offline": ninfo.offline,
      "drained": ninfo.drained,
      "master_candidate": ninfo.master_candidate,
      "group": ninfo.group,
      "master_capable": ninfo.master_capable,
      "vm_capable": ninfo.vm_capable,
      })
      for ninfo in node_cfg.values())

    return node_results

  @staticmethod
  def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
                              node_results):
    """Compute global node data.

    @param node_results: the basic node structures as filled from the config

    """
    # make a copy of the current dict
    node_results = dict(node_results)
    for nname, nresult in node_data.items():
      assert nname in node_results, "Missing basic data for node %s" % nname
      ninfo = node_cfg[nname]

      if not (ninfo.offline or ninfo.drained):
        nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
        remote_info = nresult.payload

        for attr in ["memory_total", "memory_free", "memory_dom0",
                     "vg_size", "vg_free", "cpu_total"]:
          if attr not in remote_info:
            raise errors.OpExecError("Node '%s' didn't return attribute"
                                     " '%s'" % (nname, attr))
          if not isinstance(remote_info[attr], int):
            raise errors.OpExecError("Node '%s' returned invalid value"
                                     " for '%s': %s" %
                                     (nname, attr, remote_info[attr]))
        # compute memory used by primary instances
        i_p_mem = i_p_up_mem = 0
        for iinfo, beinfo in i_list:
          if iinfo.primary_node == nname:
            i_p_mem += beinfo[constants.BE_MEMORY]
            if iinfo.name not in node_iinfo[nname].payload:
              i_used_mem = 0
            else:
              i_used_mem = int(node_iinfo[nname].payload[iinfo.name]["memory"])
            i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
            remote_info["memory_free"] -= max(0, i_mem_diff)

            if iinfo.admin_up:
              i_p_up_mem += beinfo[constants.BE_MEMORY]

        # compute memory used by instances
        pnr_dyn = {
          "total_memory": remote_info["memory_total"],
          "reserved_memory": remote_info["memory_dom0"],
          "free_memory": remote_info["memory_free"],
          "total_disk": remote_info["vg_size"],
          "free_disk": remote_info["vg_free"],
          "total_cpus": remote_info["cpu_total"],
          "i_pri_memory": i_p_mem,
          "i_pri_up_memory": i_p_up_mem,
          }
        pnr_dyn.update(node_results[nname])
        node_results[nname] = pnr_dyn

    return node_results

  @staticmethod
  def _ComputeInstanceData(cluster_info, i_list):
    """Compute global instance data.

    """
    instance_data = {}
    for iinfo, beinfo in i_list:
      nic_data = []
      for nic in iinfo.nics:
        filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
        nic_dict = {
          "mac": nic.mac,
          "ip": nic.ip,
          "mode": filled_params[constants.NIC_MODE],
          "link": filled_params[constants.NIC_LINK],
          }
        if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          nic_dict["bridge"] = filled_params[constants.NIC_LINK]
        nic_data.append(nic_dict)
      pir = {
        "tags": list(iinfo.GetTags()),
        "admin_up": iinfo.admin_up,
        "vcpus": beinfo[constants.BE_VCPUS],
        "memory": beinfo[constants.BE_MEMORY],
        "os": iinfo.os,
        "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
        "nics": nic_data,
        "disks": [{constants.IDISK_SIZE: dsk.size,
                   constants.IDISK_MODE: dsk.mode}
                  for dsk in iinfo.disks],
        "disk_template": iinfo.disk_template,
        "hypervisor": iinfo.hypervisor,
        }
      pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
                                                 pir["disks"])
      instance_data[iinfo.name] = pir

    return instance_data
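
  # A single entry of the result, with invented values, looks roughly like:
  #   "inst1.example.com": {"admin_up": True, "vcpus": 1, "memory": 512,
  #                         "os": "debian-image", "nodes": ["node1"],
  #                         "disk_template": "drbd", "hypervisor": "xen-pvm",
  #                         "disk_space_total": 1280, ...}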

  def _AddNewInstance(self):
    """Add new instance data to allocator structure.

    This in combination with _AllocatorGetClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    disk_space = _ComputeDiskSize(self.disk_template, self.disks)

    if self.disk_template in constants.DTS_INT_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1

    request = {
      "name": self.name,
      "disk_template": self.disk_template,
      "tags": self.tags,
      "os": self.os,
      "vcpus": self.vcpus,
      "memory": self.memory,
      "disks": self.disks,
      "disk_space_total": disk_space,
      "nics": self.nics,
      "required_nodes": self.required_nodes,
      "hypervisor": self.hypervisor,
      }
    return request

  def _AddRelocateInstance(self):
    """Add relocate instance data to allocator structure.

    This in combination with _IAllocatorGetClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    instance = self.cfg.GetInstanceInfo(self.name)
    if instance is None:
      raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                   " IAllocator" % self.name)

    if instance.disk_template not in constants.DTS_MIRRORED:
      raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                                 errors.ECODE_INVAL)

    if instance.disk_template in constants.DTS_INT_MIRROR and \
        len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)

    self.required_nodes = 1
    disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
    disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)

    request = {
      "name": self.name,
      "disk_space_total": disk_space,
      "required_nodes": self.required_nodes,
      "relocate_from": self.relocate_from,
      }
    return request

  def _AddNodeEvacuate(self):
    """Get data for node-evacuate requests.

    """
    return {
      "instances": self.instances,
      "evac_mode": self.evac_mode,
      }

  def _AddChangeGroup(self):
    """Get data for change-group requests.

    """
    return {
      "instances": self.instances,
      "target_groups": self.target_groups,
      }

  def _BuildInputData(self, fn, keydata):
    """Build input data structures.

    """
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
    for keyname, keytype in keydata:
      if keyname not in request:
        raise errors.ProgrammerError("Request parameter %s is missing" %
                                     keyname)
      val = request[keyname]
      if not keytype(val):
        raise errors.ProgrammerError("Request parameter %s doesn't pass"
                                     " validation, value %s, expected"
                                     " type %s" % (keyname, val, keytype))
    self.in_data["request"] = request

    self.in_text = serializer.Dump(self.in_data)
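
  # After _BuildInputData, self.in_data["request"] carries the mode-specific
  # keys plus "type", e.g. for an allocation (values invented):
  #   {"type": "allocate", "name": "inst1.example.com", "memory": 1024,
  #    "disks": [{"size": 10240, "mode": "w"}], "required_nodes": 2, ...}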

  _STRING_LIST = ht.TListOf(ht.TString)
  _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
     # pylint: disable=E1101
     # Class '...' has no 'OP_ID' member
     "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
                          opcodes.OpInstanceMigrate.OP_ID,
                          opcodes.OpInstanceReplaceDisks.OP_ID])
     })))

  _NEVAC_MOVED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(3),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TNonEmptyString,
                                  ht.TListOf(ht.TNonEmptyString),
                                  ])))
  _NEVAC_FAILED = \
    ht.TListOf(ht.TAnd(ht.TIsLength(2),
                       ht.TItems([ht.TNonEmptyString,
                                  ht.TMaybeString,
                                  ])))
  _NEVAC_RESULT = ht.TAnd(ht.TIsLength(3),
                          ht.TItems([_NEVAC_MOVED, _NEVAC_FAILED, _JOB_LIST]))
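
  # _NEVAC_RESULT thus describes a (moved, failed, jobs) triple: evacuated
  # instances as (name, group, nodes) tuples, failed instances as
  # (name, reason) pairs, and the opcode lists to be submitted as jobs.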

  _MODE_DATA = {
    constants.IALLOCATOR_MODE_ALLOC:
      (_AddNewInstance, [
        ("name", ht.TString),
        ("memory", ht.TInt),
        ("disks", ht.TListOf(ht.TDict)),
        ("disk_template", ht.TString),
        ("os", ht.TString),
        ("tags", _STRING_LIST),
        ("nics", ht.TListOf(ht.TDict)),
        ("vcpus", ht.TInt),
        ("hypervisor", ht.TString),
        ], ht.TList),
    constants.IALLOCATOR_MODE_RELOC:
      (_AddRelocateInstance,
       [("name", ht.TString), ("relocate_from", _STRING_LIST)],
       ht.TList),
    constants.IALLOCATOR_MODE_NODE_EVAC:
      (_AddNodeEvacuate, [
        ("instances", _STRING_LIST),
        ("evac_mode", ht.TElemOf(constants.IALLOCATOR_NEVAC_MODES)),
        ], _NEVAC_RESULT),
    constants.IALLOCATOR_MODE_CHG_GROUP:
      (_AddChangeGroup, [
        ("instances", _STRING_LIST),
        ("target_groups", _STRING_LIST),
        ], _NEVAC_RESULT),
    }

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()

  def _ValidateResult(self):
    """Process the allocator results.

    This will process and if successful save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not self._result_check(self.result):
      raise errors.OpExecError("Iallocator returned invalid result,"
                               " expected %s, got %s" %
                               (self._result_check, self.result),
                               errors.ECODE_INVAL)

    if self.mode == constants.IALLOCATOR_MODE_RELOC:
      assert self.relocate_from is not None
      assert self.required_nodes == 1

      node2group = dict((name, ndata["group"])
                        for (name, ndata) in self.in_data["nodes"].items())

      fn = compat.partial(self._NodesToGroups, node2group,
                          self.in_data["nodegroups"])

      instance = self.cfg.GetInstanceInfo(self.name)
      request_groups = fn(self.relocate_from + [instance.primary_node])
      result_groups = fn(rdict["result"] + [instance.primary_node])

      if self.success and not set(result_groups).issubset(request_groups):
        raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
                                 " differ from original groups (%s)" %
                                 (utils.CommaJoin(result_groups),
                                  utils.CommaJoin(request_groups)))

    elif self.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      assert self.evac_mode in constants.IALLOCATOR_NEVAC_MODES

    self.out_data = rdict

  @staticmethod
  def _NodesToGroups(node2group, groups, nodes):
    """Returns a list of unique group names for a list of nodes.

    @type node2group: dict
    @param node2group: Map from node name to group UUID
    @type groups: dict
    @param groups: Group information
    @type nodes: list
    @param nodes: Node names

    """
    result = set()

    for node in nodes:
      try:
        group_uuid = node2group[node]
      except KeyError:
        # Ignore unknown node
        pass
      else:
        try:
          group = groups[group_uuid]
        except KeyError:
          # Can't find group, let's use UUID
          group_name = group_uuid
        else:
          group_name = group["name"]

        result.add(group_name)

    return sorted(result)
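
  # Example (invented data): with node2group={"node1": "uuid-a"} and
  # groups={"uuid-a": {"name": "default"}},
  # _NodesToGroups(node2group, groups, ["node1", "ghost"]) yields
  # ["default"]; unknown nodes are skipped and unknown group UUIDs are
  # returned verbatim.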


class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode
    test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["memory", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            constants.IDISK_SIZE not in row or
            not isinstance(row[constants.IDISK_SIZE], int) or
            constants.IDISK_MODE not in row or
            row[constants.IDISK_MODE] not in constants.DISK_ACCESS_SET):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = \
        list(self.cfg.GetInstanceInfo(fname).secondary_nodes)
    elif self.op.mode in (constants.IALLOCATOR_MODE_CHG_GROUP,
                          constants.IALLOCATOR_MODE_NODE_EVAC):
      if not self.op.instances:
        raise errors.OpPrereqError("Missing instances", errors.ECODE_INVAL)
      self.op.instances = _GetWantedInstances(self, self.op.instances)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)

    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       memory=self.op.memory,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_CHG_GROUP:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       target_groups=self.op.target_groups)
    elif self.op.mode == constants.IALLOCATOR_MODE_NODE_EVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       instances=self.op.instances,
                       evac_mode=self.op.evac_mode)
    else:
      raise errors.ProgrammerError("Uncaught mode %s in"
                                   " LUTestAllocator.Exec" % self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text

    return result


#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  constants.QR_OS: _OsQuery,
  }

assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP


def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_VIA_OP}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)
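

# For illustration: _GetQueryImplementation(constants.QR_NODE) returns the
# _NodeQuery class; callers instantiate it to run the actual query, while
# unknown resource names surface as OpPrereqError.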