4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable-msg=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
44 from ganeti import ssh
45 from ganeti import utils
46 from ganeti import errors
47 from ganeti import hypervisor
48 from ganeti import locking
49 from ganeti import constants
50 from ganeti import objects
51 from ganeti import serializer
52 from ganeti import ssconf
53 from ganeti import uidpool
54 from ganeti import compat
55 from ganeti import masterd
56 from ganeti import netutils
57 from ganeti import query
58 from ganeti import qlang
59 from ganeti import opcodes
62 import ganeti.masterd.instance # pylint: disable-msg=W0611
65 def _SupportsOob(cfg, node):
66 """Tells if node supports OOB.
68 @type cfg: L{config.ConfigWriter}
69 @param cfg: The cluster configuration
70 @type node: L{objects.Node}
72 @return: The OOB script if supported or an empty string otherwise
75 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
79 """Data container for LU results with jobs.
81 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
82 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
83 contained in the C{jobs} attribute and include the job IDs in the opcode
87 def __init__(self, jobs, **kwargs):
88 """Initializes this class.
90 Additional return values can be specified as keyword arguments.
92 @type jobs: list of lists of L{opcodes.OpCode}
93 @param jobs: A list of lists of opcode objects
100 class LogicalUnit(object):
101 """Logical Unit base class.
103 Subclasses must follow these rules:
104 - implement ExpandNames
105 - implement CheckPrereq (except when tasklets are used)
106 - implement Exec (except when tasklets are used)
107 - implement BuildHooksEnv
108 - implement BuildHooksNodes
109 - redefine HPATH and HTYPE
110 - optionally redefine their run requirements:
111 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
113 Note that all commands require root permissions.
115 @ivar dry_run_result: the value (if any) that will be returned to the caller
116 in dry-run mode (signalled by opcode dry_run parameter)
123 def __init__(self, processor, op, context, rpc):
124 """Constructor for LogicalUnit.
126 This needs to be overridden in derived classes in order to check op
130 self.proc = processor
132 self.cfg = context.cfg
133 self.glm = context.glm
134 self.context = context
136 # Dicts used to declare locking needs to mcpu
137 self.needed_locks = None
138 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
140 self.remove_locks = {}
141 # Used to force good behavior when calling helper functions
142 self.recalculate_locks = {}
144 self.Log = processor.Log # pylint: disable-msg=C0103
145 self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
146 self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
147 self.LogStep = processor.LogStep # pylint: disable-msg=C0103
148 # support for dry-run
149 self.dry_run_result = None
150 # support for generic debug attribute
151 if (not hasattr(self.op, "debug_level") or
152 not isinstance(self.op.debug_level, int)):
153 self.op.debug_level = 0
158 # Validate opcode parameters and set defaults
159 self.op.Validate(True)
161 self.CheckArguments()
163 def CheckArguments(self):
164 """Check syntactic validity for the opcode arguments.
166 This method is for doing a simple syntactic check and ensuring the
167 validity of opcode parameters, without any cluster-related
168 checks. While the same can be accomplished in ExpandNames and/or
169 CheckPrereq, doing these separately is better because:
171 - ExpandNames is left as a purely lock-related function
172 - CheckPrereq is run after we have acquired locks (and possible
175 The function is allowed to change the self.op attribute so that
176 later methods need no longer worry about missing parameters.
181 def ExpandNames(self):
182 """Expand names for this LU.
184 This method is called before starting to execute the opcode, and it should
185 update all the parameters of the opcode to their canonical form (e.g. a
186 short node name must be fully expanded after this method has successfully
187 completed). This way locking, hooks, logging, etc. can work correctly.
189 LUs which implement this method must also populate the self.needed_locks
190 member, as a dict with lock levels as keys, and a list of needed lock names
193 - use an empty dict if you don't need any lock
194 - if you don't need any lock at a particular level omit that level
195 - don't put anything for the BGL level
196 - if you want all locks at a level use locking.ALL_SET as a value
198 If you need to share locks (rather than acquire them exclusively) at one
199 level you can modify self.share_locks, setting a true value (usually 1) for
200 that level. By default locks are not shared.
202 This function can also define a list of tasklets, which then will be
203 executed in order instead of the usual LU-level CheckPrereq and Exec
204 functions, if those are not defined by the LU.
208 # Acquire all nodes and one instance
209 self.needed_locks = {
210 locking.LEVEL_NODE: locking.ALL_SET,
211 locking.LEVEL_INSTANCE: ['instance1.example.com'],
213 # Acquire just two nodes
214 self.needed_locks = {
215 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
218 self.needed_locks = {} # No, you can't leave it to the default value None
221 # The implementation of this method is mandatory only if the new LU is
222 # concurrent, so that old LUs don't need to be changed all at the same
225 self.needed_locks = {} # Exclusive LUs don't need locks.
227 raise NotImplementedError
229 def DeclareLocks(self, level):
230 """Declare LU locking needs for a level.
232 While most LUs can just declare their locking needs at ExpandNames time,
233 sometimes there's the need to calculate some locks after having acquired
234 the ones before. This function is called just before acquiring locks at a
235 particular level, but after acquiring the ones at lower levels, and permits
236 such calculations. It can be used to modify self.needed_locks, and by
237 default it does nothing.
239 This function is only called if you have something already set in
240 self.needed_locks for the level.
242 @param level: Locking level which is going to be locked
243 @type level: member of ganeti.locking.LEVELS
247 def CheckPrereq(self):
248 """Check prerequisites for this LU.
250 This method should check that the prerequisites for the execution
251 of this LU are fulfilled. It can do internode communication, but
252 it should be idempotent - no cluster or system changes are
255 The method should raise errors.OpPrereqError in case something is
256 not fulfilled. Its return value is ignored.
258 This method should also update all the parameters of the opcode to
259 their canonical form if it hasn't been done by ExpandNames before.
262 if self.tasklets is not None:
263 for (idx, tl) in enumerate(self.tasklets):
264 logging.debug("Checking prerequisites for tasklet %s/%s",
265 idx + 1, len(self.tasklets))
270 def Exec(self, feedback_fn):
273 This method should implement the actual work. It should raise
274 errors.OpExecError for failures that are somewhat dealt with in
278 if self.tasklets is not None:
279 for (idx, tl) in enumerate(self.tasklets):
280 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
283 raise NotImplementedError
285 def BuildHooksEnv(self):
286 """Build hooks environment for this LU.
289 @return: Dictionary containing the environment that will be used for
290 running the hooks for this LU. The keys of the dict must not be prefixed
291 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
292 will extend the environment with additional variables. If no environment
293 should be defined, an empty dictionary should be returned (not C{None}).
294 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
298 raise NotImplementedError
300 def BuildHooksNodes(self):
301 """Build list of nodes to run LU's hooks.
303 @rtype: tuple; (list, list)
304 @return: Tuple containing a list of node names on which the hook
305 should run before the execution and a list of node names on which the
306 hook should run after the execution. If there are no nodes, an empty
307 list must be returned (not C{None}).
308 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
312 raise NotImplementedError
314 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
315 """Notify the LU about the results of its hooks.
317 This method is called every time a hooks phase is executed, and notifies
318 the Logical Unit about the hooks' result. The LU can then use it to alter
319 its result based on the hooks. By default the method does nothing and the
320 previous result is passed back unchanged but any LU can define it if it
321 wants to use the local cluster hook-scripts somehow.
323 @param phase: one of L{constants.HOOKS_PHASE_POST} or
324 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
325 @param hook_results: the results of the multi-node hooks rpc call
326 @param feedback_fn: function used to send feedback back to the caller
327 @param lu_result: the previous Exec result this LU had, or None
329 @return: the new Exec result, based on the previous result
333 # API must be kept, thus we ignore the unused-argument and
334 # method-could-be-a-function warnings
335 # pylint: disable-msg=W0613,R0201
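# A minimal sketch (hypothetical subclass, not part of this module) of how an
# LU could use HooksCallBack to fold post-hook information into its result:
#
#   def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
#     if phase == constants.HOOKS_PHASE_POST:
#       feedback_fn("Post-phase hooks ran on %d node(s)" % len(hook_results))
#     return lu_result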
338 def _ExpandAndLockInstance(self):
339 """Helper function to expand and lock an instance.
341 Many LUs that work on an instance take its name in self.op.instance_name
342 and need to expand it and then declare the expanded name for locking. This
343 function does it, and then updates self.op.instance_name to the expanded
344 name. It also initializes needed_locks as a dict, if this hasn't been done
348 if self.needed_locks is None:
349 self.needed_locks = {}
351 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
352 "_ExpandAndLockInstance called with instance-level locks set"
353 self.op.instance_name = _ExpandInstanceName(self.cfg,
354 self.op.instance_name)
355 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
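# Usage sketch (assuming an LU whose opcode carries an instance_name field):
#
#   def ExpandNames(self):
#     self._ExpandAndLockInstance()
#     # self.op.instance_name is now the fully expanded name and
#     # self.needed_locks[locking.LEVEL_INSTANCE] requests exactly that lock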
357 def _LockInstancesNodes(self, primary_only=False):
358 """Helper function to declare instances' nodes for locking.
360 This function should be called after locking one or more instances to lock
361 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
362 with all primary or secondary nodes for instances already locked and
363 present in self.needed_locks[locking.LEVEL_INSTANCE].
365 It should be called from DeclareLocks, and for safety only works if
366 self.recalculate_locks[locking.LEVEL_NODE] is set.
368 In the future it may grow parameters to just lock some instance's nodes, or
369 to just lock primaries or secondary nodes, if needed.
371 It should be called in DeclareLocks in a way similar to::
373 if level == locking.LEVEL_NODE:
374 self._LockInstancesNodes()
376 @type primary_only: boolean
377 @param primary_only: only lock primary nodes of locked instances
380 assert locking.LEVEL_NODE in self.recalculate_locks, \
381 "_LockInstancesNodes helper function called with no nodes to recalculate"
383 # TODO: check if we've really been called with the instance locks held
385 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
386 # future we might want to have different behaviors depending on the value
387 # of self.recalculate_locks[locking.LEVEL_NODE]
389 for instance_name in self.glm.list_owned(locking.LEVEL_INSTANCE):
390 instance = self.context.cfg.GetInstanceInfo(instance_name)
391 wanted_nodes.append(instance.primary_node)
393 wanted_nodes.extend(instance.secondary_nodes)
395 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
396 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
397 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
398 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
400 del self.recalculate_locks[locking.LEVEL_NODE]
403 class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
404 """Simple LU which runs no hooks.
406 This LU is intended as a parent for other LogicalUnits which will
407 run no hooks, in order to reduce duplicate code.
413 def BuildHooksEnv(self):
414 """Empty BuildHooksEnv for NoHooksLu.
416 This just raises an error.
419 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
421 def BuildHooksNodes(self):
422 """Empty BuildHooksNodes for NoHooksLU.
425 raise AssertionError("BuildHooksNodes called for NoHooksLU")
429 """Tasklet base class.
431 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
432 they can mix legacy code with tasklets. Locking needs to be done in the LU,
433 tasklets know nothing about locks.
435 Subclasses must follow these rules:
436 - Implement CheckPrereq
440 def __init__(self, lu):
447 def CheckPrereq(self):
448 """Check prerequisites for this tasklet.
450 This method should check whether the prerequisites for the execution of
451 this tasklet are fulfilled. It can do internode communication, but it
452 should be idempotent - no cluster or system changes are allowed.
454 The method should raise errors.OpPrereqError in case something is not
455 fulfilled. Its return value is ignored.
457 This method should also update all parameters to their canonical form if it
458 hasn't been done before.
463 def Exec(self, feedback_fn):
464 """Execute the tasklet.
466 This method should implement the actual work. It should raise
467 errors.OpExecError for failures that are somewhat dealt with in code, or
471 raise NotImplementedError
475 """Base for query utility classes.
478 #: Attribute holding field definitions
481 def __init__(self, filter_, fields, use_locking):
482 """Initializes this class.
485 self.use_locking = use_locking
487 self.query = query.Query(self.FIELDS, fields, filter_=filter_,
489 self.requested_data = self.query.RequestedData()
490 self.names = self.query.RequestedNames()
492 # Sort only if no names were requested
493 self.sort_by_name = not self.names
495 self.do_locking = None
498 def _GetNames(self, lu, all_names, lock_level):
499 """Helper function to determine names asked for in the query.
503 names = lu.glm.list_owned(lock_level)
507 if self.wanted == locking.ALL_SET:
508 assert not self.names
509 # caller didn't specify names, so ordering is not important
510 return utils.NiceSort(names)
512 # caller specified names and we must keep the same order
514 assert not self.do_locking or lu.glm.is_owned(lock_level)
516 missing = set(self.wanted).difference(names)
518 raise errors.OpExecError("Some items were removed before retrieving"
519 " their data: %s" % missing)
521 # Return expanded names
524 def ExpandNames(self, lu):
525 """Expand names for this query.
527 See L{LogicalUnit.ExpandNames}.
530 raise NotImplementedError()
532 def DeclareLocks(self, lu, level):
533 """Declare locks for this query.
535 See L{LogicalUnit.DeclareLocks}.
538 raise NotImplementedError()
540 def _GetQueryData(self, lu):
541 """Collects all data for this query.
543 @return: Query data object
546 raise NotImplementedError()
548 def NewStyleQuery(self, lu):
549 """Collect data and execute query.
552 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
553 sort_by_name=self.sort_by_name)
555 def OldStyleQuery(self, lu):
556 """Collect data and execute query.
559 return self.query.OldStyleQuery(self._GetQueryData(lu),
560 sort_by_name=self.sort_by_name)
563 def _GetWantedNodes(lu, nodes):
564 """Returns list of checked and expanded node names.
566 @type lu: L{LogicalUnit}
567 @param lu: the logical unit on whose behalf we execute
569 @param nodes: list of node names or None for all nodes
571 @return: the list of nodes, sorted
572 @raise errors.ProgrammerError: if the nodes parameter is wrong type
576 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
578 return utils.NiceSort(lu.cfg.GetNodeList())
581 def _GetWantedInstances(lu, instances):
582 """Returns list of checked and expanded instance names.
584 @type lu: L{LogicalUnit}
585 @param lu: the logical unit on whose behalf we execute
586 @type instances: list
587 @param instances: list of instance names or None for all instances
589 @return: the list of instances, sorted
590 @raise errors.OpPrereqError: if the instances parameter is wrong type
591 @raise errors.OpPrereqError: if any of the passed instances is not found
595 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
597 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
601 def _GetUpdatedParams(old_params, update_dict,
602 use_default=True, use_none=False):
603 """Return the new version of a parameter dictionary.
605 @type old_params: dict
606 @param old_params: old parameters
607 @type update_dict: dict
608 @param update_dict: dict containing new parameter values, or
609 constants.VALUE_DEFAULT to reset the parameter to its default
611 @type use_default: boolean
612 @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
613 values as 'to be deleted' values
614 @type use_none: boolean
615 @param use_none: whether to recognise C{None} values as 'to be
618 @return: the new parameter dictionary
621 params_copy = copy.deepcopy(old_params)
622 for key, val in update_dict.iteritems():
623 if ((use_default and val == constants.VALUE_DEFAULT) or
624 (use_none and val is None)):
630 params_copy[key] = val
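# Worked example (parameter names and values are hypothetical): merging
# old_params = {"memory": 512, "vcpus": 1} with
# update_dict = {"memory": constants.VALUE_DEFAULT, "vcpus": 2, "auto": True}
# and use_default=True drops "memory" (reset to its default), overwrites
# "vcpus" and adds "auto", so the returned dict would be
# {"vcpus": 2, "auto": True}.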
634 def _ReleaseLocks(lu, level, names=None, keep=None):
635 """Releases locks owned by an LU.
637 @type lu: L{LogicalUnit}
638 @param level: Lock level
639 @type names: list or None
640 @param names: Names of locks to release
641 @type keep: list or None
642 @param keep: Names of locks to retain
645 assert not (keep is not None and names is not None), \
646 "Only one of the 'names' and the 'keep' parameters can be given"
648 if names is not None:
649 should_release = names.__contains__
651 should_release = lambda name: name not in keep
653 should_release = None
659 # Determine which locks to release
660 for name in lu.glm.list_owned(level):
661 if should_release(name):
666 assert len(lu.glm.list_owned(level)) == (len(retain) + len(release))
668 # Release just some locks
669 lu.glm.release(level, names=release)
671 assert frozenset(lu.glm.list_owned(level)) == frozenset(retain)
674 lu.glm.release(level)
676 assert not lu.glm.is_owned(level), "No locks should be owned"
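# Usage sketch (node names are hypothetical): an LU that has narrowed down
# its working set can give back the node locks it no longer needs, e.g.
#
#   _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.node_name])
#   # or, naming the locks to drop instead of the ones to retain:
#   _ReleaseLocks(self, locking.LEVEL_NODE, names=unneeded_nodes)
#
# Calling it with neither names nor keep releases all locks at that level.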
679 def _RunPostHook(lu, node_name):
680 """Runs the post-hook for an opcode on a single node.
683 hm = lu.proc.hmclass(lu.rpc.call_hooks_runner, lu)
685 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
687 # pylint: disable-msg=W0702
688 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
691 def _CheckOutputFields(static, dynamic, selected):
692 """Checks whether all selected fields are valid.
694 @type static: L{utils.FieldSet}
695 @param static: static fields set
696 @type dynamic: L{utils.FieldSet}
697 @param dynamic: dynamic fields set
704 delta = f.NonMatching(selected)
706 raise errors.OpPrereqError("Unknown output fields selected: %s"
707 % ",".join(delta), errors.ECODE_INVAL)
710 def _CheckGlobalHvParams(params):
711 """Validates that given hypervisor params are not global ones.
713 This will ensure that instances don't get customised versions of
717 used_globals = constants.HVC_GLOBALS.intersection(params)
719 msg = ("The following hypervisor parameters are global and cannot"
720 " be customized at instance level, please modify them at"
721 " cluster level: %s" % utils.CommaJoin(used_globals))
722 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
725 def _CheckNodeOnline(lu, node, msg=None):
726 """Ensure that a given node is online.
728 @param lu: the LU on behalf of which we make the check
729 @param node: the node to check
730 @param msg: if passed, should be a message to replace the default one
731 @raise errors.OpPrereqError: if the node is offline
735 msg = "Can't use offline node"
736 if lu.cfg.GetNodeInfo(node).offline:
737 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
740 def _CheckNodeNotDrained(lu, node):
741 """Ensure that a given node is not drained.
743 @param lu: the LU on behalf of which we make the check
744 @param node: the node to check
745 @raise errors.OpPrereqError: if the node is drained
748 if lu.cfg.GetNodeInfo(node).drained:
749 raise errors.OpPrereqError("Can't use drained node %s" % node,
753 def _CheckNodeVmCapable(lu, node):
754 """Ensure that a given node is vm capable.
756 @param lu: the LU on behalf of which we make the check
757 @param node: the node to check
758 @raise errors.OpPrereqError: if the node is not vm capable
761 if not lu.cfg.GetNodeInfo(node).vm_capable:
762 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
766 def _CheckNodeHasOS(lu, node, os_name, force_variant):
767 """Ensure that a node supports a given OS.
769 @param lu: the LU on behalf of which we make the check
770 @param node: the node to check
771 @param os_name: the OS to query about
772 @param force_variant: whether to ignore variant errors
773 @raise errors.OpPrereqError: if the node is not supporting the OS
776 result = lu.rpc.call_os_get(node, os_name)
777 result.Raise("OS '%s' not in supported OS list for node %s" %
779 prereq=True, ecode=errors.ECODE_INVAL)
780 if not force_variant:
781 _CheckOSVariant(result.payload, os_name)
784 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
785 """Ensure that a node has the given secondary ip.
787 @type lu: L{LogicalUnit}
788 @param lu: the LU on behalf of which we make the check
790 @param node: the node to check
791 @type secondary_ip: string
792 @param secondary_ip: the ip to check
793 @type prereq: boolean
794 @param prereq: whether to throw a prerequisite or an execute error
795 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
796 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
799 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
800 result.Raise("Failure checking secondary ip on node %s" % node,
801 prereq=prereq, ecode=errors.ECODE_ENVIRON)
802 if not result.payload:
803 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
804 " please fix and re-run this command" % secondary_ip)
806 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
808 raise errors.OpExecError(msg)
811 def _GetClusterDomainSecret():
812 """Reads the cluster domain secret.
815 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
819 def _CheckInstanceDown(lu, instance, reason):
820 """Ensure that an instance is not running."""
821 if instance.admin_up:
822 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
823 (instance.name, reason), errors.ECODE_STATE)
825 pnode = instance.primary_node
826 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
827 ins_l.Raise("Can't contact node %s for instance information" % pnode,
828 prereq=True, ecode=errors.ECODE_ENVIRON)
830 if instance.name in ins_l.payload:
831 raise errors.OpPrereqError("Instance %s is running, %s" %
832 (instance.name, reason), errors.ECODE_STATE)
835 def _ExpandItemName(fn, name, kind):
836 """Expand an item name.
838 @param fn: the function to use for expansion
839 @param name: requested item name
840 @param kind: text description ('Node' or 'Instance')
841 @return: the resolved (full) name
842 @raise errors.OpPrereqError: if the item is not found
846 if full_name is None:
847 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
852 def _ExpandNodeName(cfg, name):
853 """Wrapper over L{_ExpandItemName} for nodes."""
854 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
857 def _ExpandInstanceName(cfg, name):
858 """Wrapper over L{_ExpandItemName} for instance."""
859 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
862 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
863 memory, vcpus, nics, disk_template, disks,
864 bep, hvp, hypervisor_name):
865 """Builds instance related env variables for hooks
867 This builds the hook environment from individual variables.
870 @param name: the name of the instance
871 @type primary_node: string
872 @param primary_node: the name of the instance's primary node
873 @type secondary_nodes: list
874 @param secondary_nodes: list of secondary nodes as strings
875 @type os_type: string
876 @param os_type: the name of the instance's OS
877 @type status: boolean
878 @param status: the should_run status of the instance
880 @param memory: the memory size of the instance
882 @param vcpus: the count of VCPUs the instance has
884 @param nics: list of tuples (ip, mac, mode, link) representing
885 the NICs the instance has
886 @type disk_template: string
887 @param disk_template: the disk template of the instance
889 @param disks: the list of (size, mode) pairs
891 @param bep: the backend parameters for the instance
893 @param hvp: the hypervisor parameters for the instance
894 @type hypervisor_name: string
895 @param hypervisor_name: the hypervisor for the instance
897 @return: the hook environment for this instance
906 "INSTANCE_NAME": name,
907 "INSTANCE_PRIMARY": primary_node,
908 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
909 "INSTANCE_OS_TYPE": os_type,
910 "INSTANCE_STATUS": str_status,
911 "INSTANCE_MEMORY": memory,
912 "INSTANCE_VCPUS": vcpus,
913 "INSTANCE_DISK_TEMPLATE": disk_template,
914 "INSTANCE_HYPERVISOR": hypervisor_name,
918 nic_count = len(nics)
919 for idx, (ip, mac, mode, link) in enumerate(nics):
922 env["INSTANCE_NIC%d_IP" % idx] = ip
923 env["INSTANCE_NIC%d_MAC" % idx] = mac
924 env["INSTANCE_NIC%d_MODE" % idx] = mode
925 env["INSTANCE_NIC%d_LINK" % idx] = link
926 if mode == constants.NIC_MODE_BRIDGED:
927 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
931 env["INSTANCE_NIC_COUNT"] = nic_count
934 disk_count = len(disks)
935 for idx, (size, mode) in enumerate(disks):
936 env["INSTANCE_DISK%d_SIZE" % idx] = size
937 env["INSTANCE_DISK%d_MODE" % idx] = mode
941 env["INSTANCE_DISK_COUNT"] = disk_count
943 for source, kind in [(bep, "BE"), (hvp, "HV")]:
944 for key, value in source.items():
945 env["INSTANCE_%s_%s" % (kind, key)] = value
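# Illustration (instance data is hypothetical): for an instance with one
# bridged NIC and one disk, the resulting environment would contain, among
# others, INSTANCE_NAME, INSTANCE_PRIMARY, INSTANCE_NIC_COUNT=1,
# INSTANCE_NIC0_MAC, INSTANCE_NIC0_BRIDGE, INSTANCE_DISK_COUNT=1,
# INSTANCE_DISK0_SIZE and one INSTANCE_BE_*/INSTANCE_HV_* entry per backend
# and hypervisor parameter; the hooks runner later prefixes each key with
# "GANETI_".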
950 def _NICListToTuple(lu, nics):
951 """Build a list of nic information tuples.
953 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
954 value in LUInstanceQueryData.
956 @type lu: L{LogicalUnit}
957 @param lu: the logical unit on whose behalf we execute
958 @type nics: list of L{objects.NIC}
959 @param nics: list of nics to convert to hooks tuples
963 cluster = lu.cfg.GetClusterInfo()
967 filled_params = cluster.SimpleFillNIC(nic.nicparams)
968 mode = filled_params[constants.NIC_MODE]
969 link = filled_params[constants.NIC_LINK]
970 hooks_nics.append((ip, mac, mode, link))
974 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
975 """Builds instance related env variables for hooks from an object.
977 @type lu: L{LogicalUnit}
978 @param lu: the logical unit on whose behalf we execute
979 @type instance: L{objects.Instance}
980 @param instance: the instance for which we should build the
983 @param override: dictionary with key/values that will override
986 @return: the hook environment dictionary
989 cluster = lu.cfg.GetClusterInfo()
990 bep = cluster.FillBE(instance)
991 hvp = cluster.FillHV(instance)
993 'name': instance.name,
994 'primary_node': instance.primary_node,
995 'secondary_nodes': instance.secondary_nodes,
996 'os_type': instance.os,
997 'status': instance.admin_up,
998 'memory': bep[constants.BE_MEMORY],
999 'vcpus': bep[constants.BE_VCPUS],
1000 'nics': _NICListToTuple(lu, instance.nics),
1001 'disk_template': instance.disk_template,
1002 'disks': [(disk.size, disk.mode) for disk in instance.disks],
1005 'hypervisor_name': instance.hypervisor,
1008 args.update(override)
1009 return _BuildInstanceHookEnv(**args) # pylint: disable-msg=W0142
1012 def _AdjustCandidatePool(lu, exceptions):
1013 """Adjust the candidate pool after node operations.
1016 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1018 lu.LogInfo("Promoted nodes to master candidate role: %s",
1019 utils.CommaJoin(node.name for node in mod_list))
1020 for name in mod_list:
1021 lu.context.ReaddNode(name)
1022 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1024 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1028 def _DecideSelfPromotion(lu, exceptions=None):
1029 """Decide whether I should promote myself as a master candidate.
1032 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1033 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1034 # the new node will increase mc_max by one, so:
1035 mc_should = min(mc_should + 1, cp_size)
1036 return mc_now < mc_should
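# Worked example (figures are hypothetical): with candidate_pool_size = 10,
# mc_now = 4 and mc_should = 5, the new node raises the target to
# min(5 + 1, 10) = 6; since 4 < 6 the function returns True and the node
# promotes itself to master candidate.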
1039 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1040 """Check that the bridges needed by a list of nics exist.
1043 cluster = lu.cfg.GetClusterInfo()
1044 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1045 brlist = [params[constants.NIC_LINK] for params in paramslist
1046 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1048 result = lu.rpc.call_bridges_exist(target_node, brlist)
1049 result.Raise("Error checking bridges on destination node '%s'" %
1050 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1053 def _CheckInstanceBridgesExist(lu, instance, node=None):
1054 """Check that the bridges needed by an instance exist.
1058 node = instance.primary_node
1059 _CheckNicsBridgesExist(lu, instance.nics, node)
1062 def _CheckOSVariant(os_obj, name):
1063 """Check whether an OS name conforms to the os variants specification.
1065 @type os_obj: L{objects.OS}
1066 @param os_obj: OS object to check
1068 @param name: OS name passed by the user, to check for validity
1071 if not os_obj.supported_variants:
1073 variant = objects.OS.GetVariant(name)
1075 raise errors.OpPrereqError("OS name must include a variant",
1078 if variant not in os_obj.supported_variants:
1079 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1082 def _GetNodeInstancesInner(cfg, fn):
1083 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1086 def _GetNodeInstances(cfg, node_name):
1087 """Returns a list of all primary and secondary instances on a node.
1091 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1094 def _GetNodePrimaryInstances(cfg, node_name):
1095 """Returns primary instances on a node.
1098 return _GetNodeInstancesInner(cfg,
1099 lambda inst: node_name == inst.primary_node)
1102 def _GetNodeSecondaryInstances(cfg, node_name):
1103 """Returns secondary instances on a node.
1106 return _GetNodeInstancesInner(cfg,
1107 lambda inst: node_name in inst.secondary_nodes)
1110 def _GetStorageTypeArgs(cfg, storage_type):
1111 """Returns the arguments for a storage type.
1114 # Special case for file storage
1115 if storage_type == constants.ST_FILE:
1116 # storage.FileStorage wants a list of storage directories
1117 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1122 def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
1125 for dev in instance.disks:
1126 cfg.SetDiskID(dev, node_name)
1128 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1129 result.Raise("Failed to get disk status from node %s" % node_name,
1130 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1132 for idx, bdev_status in enumerate(result.payload):
1133 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1139 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1140 """Check the sanity of iallocator and node arguments and use the
1141 cluster-wide iallocator if appropriate.
1143 Check that at most one of (iallocator, node) is specified. If none is
1144 specified, then the LU's opcode's iallocator slot is filled with the
1145 cluster-wide default iallocator.
1147 @type iallocator_slot: string
1148 @param iallocator_slot: the name of the opcode iallocator slot
1149 @type node_slot: string
1150 @param node_slot: the name of the opcode target node slot
1153 node = getattr(lu.op, node_slot, None)
1154 iallocator = getattr(lu.op, iallocator_slot, None)
1156 if node is not None and iallocator is not None:
1157 raise errors.OpPrereqError("Do not specify both, iallocator and node",
1159 elif node is None and iallocator is None:
1160 default_iallocator = lu.cfg.GetDefaultIAllocator()
1161 if default_iallocator:
1162 setattr(lu.op, iallocator_slot, default_iallocator)
1164 raise errors.OpPrereqError("No iallocator or node given and no"
1165 " cluster-wide default iallocator found;"
1166 " please specify either an iallocator or a"
1167 " node, or set a cluster-wide default"
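# Behaviour sketch (slot names are hypothetical): a call such as
#
#   _CheckIAllocatorOrNode(self, "iallocator", "pnode")
#
# fails if both op.iallocator and op.pnode are given, fills op.iallocator
# with the cluster-wide default when neither is given and a default exists,
# and otherwise raises OpPrereqError asking for an explicit choice.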
1171 class LUClusterPostInit(LogicalUnit):
1172 """Logical unit for running hooks after cluster initialization.
1175 HPATH = "cluster-init"
1176 HTYPE = constants.HTYPE_CLUSTER
1178 def BuildHooksEnv(self):
1183 "OP_TARGET": self.cfg.GetClusterName(),
1186 def BuildHooksNodes(self):
1187 """Build hooks nodes.
1190 return ([], [self.cfg.GetMasterNode()])
1192 def Exec(self, feedback_fn):
1199 class LUClusterDestroy(LogicalUnit):
1200 """Logical unit for destroying the cluster.
1203 HPATH = "cluster-destroy"
1204 HTYPE = constants.HTYPE_CLUSTER
1206 def BuildHooksEnv(self):
1211 "OP_TARGET": self.cfg.GetClusterName(),
1214 def BuildHooksNodes(self):
1215 """Build hooks nodes.
1220 def CheckPrereq(self):
1221 """Check prerequisites.
1223 This checks whether the cluster is empty.
1225 Any errors are signaled by raising errors.OpPrereqError.
1228 master = self.cfg.GetMasterNode()
1230 nodelist = self.cfg.GetNodeList()
1231 if len(nodelist) != 1 or nodelist[0] != master:
1232 raise errors.OpPrereqError("There are still %d node(s) in"
1233 " this cluster." % (len(nodelist) - 1),
1235 instancelist = self.cfg.GetInstanceList()
1237 raise errors.OpPrereqError("There are still %d instance(s) in"
1238 " this cluster." % len(instancelist),
1241 def Exec(self, feedback_fn):
1242 """Destroys the cluster.
1245 master = self.cfg.GetMasterNode()
1247 # Run post hooks on master node before it's removed
1248 _RunPostHook(self, master)
1250 result = self.rpc.call_node_stop_master(master, False)
1251 result.Raise("Could not disable the master role")
1256 def _VerifyCertificate(filename):
1257 """Verifies a certificate for LUClusterVerify.
1259 @type filename: string
1260 @param filename: Path to PEM file
1264 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1265 utils.ReadFile(filename))
1266 except Exception, err: # pylint: disable-msg=W0703
1267 return (LUClusterVerify.ETYPE_ERROR,
1268 "Failed to load X509 certificate %s: %s" % (filename, err))
1271 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1272 constants.SSL_CERT_EXPIRATION_ERROR)
1275 fnamemsg = "While verifying %s: %s" % (filename, msg)
1280 return (None, fnamemsg)
1281 elif errcode == utils.CERT_WARNING:
1282 return (LUClusterVerify.ETYPE_WARNING, fnamemsg)
1283 elif errcode == utils.CERT_ERROR:
1284 return (LUClusterVerify.ETYPE_ERROR, fnamemsg)
1286 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1289 class LUClusterVerify(LogicalUnit):
1290 """Verifies the cluster status.
1293 HPATH = "cluster-verify"
1294 HTYPE = constants.HTYPE_CLUSTER
1297 TCLUSTER = "cluster"
1299 TINSTANCE = "instance"
1301 ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
1302 ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
1303 ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK")
1304 EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
1305 EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
1306 EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
1307 EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
1308 EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
1309 EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
1310 EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS")
1311 ENODEDRBD = (TNODE, "ENODEDRBD")
1312 ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
1313 ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
1314 ENODEHOOKS = (TNODE, "ENODEHOOKS")
1315 ENODEHV = (TNODE, "ENODEHV")
1316 ENODELVM = (TNODE, "ENODELVM")
1317 ENODEN1 = (TNODE, "ENODEN1")
1318 ENODENET = (TNODE, "ENODENET")
1319 ENODEOS = (TNODE, "ENODEOS")
1320 ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
1321 ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
1322 ENODERPC = (TNODE, "ENODERPC")
1323 ENODESSH = (TNODE, "ENODESSH")
1324 ENODEVERSION = (TNODE, "ENODEVERSION")
1325 ENODESETUP = (TNODE, "ENODESETUP")
1326 ENODETIME = (TNODE, "ENODETIME")
1327 ENODEOOBPATH = (TNODE, "ENODEOOBPATH")
1329 ETYPE_FIELD = "code"
1330 ETYPE_ERROR = "ERROR"
1331 ETYPE_WARNING = "WARNING"
1333 _HOOKS_INDENT_RE = re.compile("^", re.M)
1335 class NodeImage(object):
1336 """A class representing the logical and physical status of a node.
1339 @ivar name: the node name to which this object refers
1340 @ivar volumes: a structure as returned from
1341 L{ganeti.backend.GetVolumeList} (runtime)
1342 @ivar instances: a list of running instances (runtime)
1343 @ivar pinst: list of configured primary instances (config)
1344 @ivar sinst: list of configured secondary instances (config)
1345 @ivar sbp: dictionary of {primary-node: list of instances} for all
1346 instances for which this node is secondary (config)
1347 @ivar mfree: free memory, as reported by hypervisor (runtime)
1348 @ivar dfree: free disk, as reported by the node (runtime)
1349 @ivar offline: the offline status (config)
1350 @type rpc_fail: boolean
1351 @ivar rpc_fail: whether the RPC verify call was successful (overall,
1352 not whether the individual keys were correct) (runtime)
1353 @type lvm_fail: boolean
1354 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1355 @type hyp_fail: boolean
1356 @ivar hyp_fail: whether the RPC call didn't return the instance list
1357 @type ghost: boolean
1358 @ivar ghost: whether this is a known node or not (config)
1359 @type os_fail: boolean
1360 @ivar os_fail: whether the RPC call didn't return valid OS data
1362 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1363 @type vm_capable: boolean
1364 @ivar vm_capable: whether the node can host instances
1367 def __init__(self, offline=False, name=None, vm_capable=True):
1376 self.offline = offline
1377 self.vm_capable = vm_capable
1378 self.rpc_fail = False
1379 self.lvm_fail = False
1380 self.hyp_fail = False
1382 self.os_fail = False
1385 def ExpandNames(self):
1386 self.needed_locks = {
1387 locking.LEVEL_NODE: locking.ALL_SET,
1388 locking.LEVEL_INSTANCE: locking.ALL_SET,
1390 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
1392 def _Error(self, ecode, item, msg, *args, **kwargs):
1393 """Format an error message.
1395 Based on the opcode's error_codes parameter, either format a
1396 parseable error code, or a simpler error string.
1398 This must be called only from Exec and functions called from Exec.
1401 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1403 # first complete the msg
1406 # then format the whole message
1407 if self.op.error_codes:
1408 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1414 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1415 # and finally report it via the feedback_fn
1416 self._feedback_fn(" - %s" % msg)
1418 def _ErrorIf(self, cond, *args, **kwargs):
1419 """Log an error message if the passed condition is True.
1422 cond = bool(cond) or self.op.debug_simulate_errors
1424 self._Error(*args, **kwargs)
1425 # do not mark the operation as failed for WARN cases only
1426 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1427 self.bad = self.bad or cond
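# Output sketch (node name is hypothetical): with the opcode's error_codes
# flag set, a problem is reported in the parseable form
#   ERROR:ENODELVM:node:node1.example.com:Can't get PV list from node
# whereas without it the same condition is rendered roughly as
#   ERROR: node node1.example.com: Can't get PV list from node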
1429 def _VerifyNode(self, ninfo, nresult):
1430 """Perform some basic validation on data returned from a node.
1432 - check the result data structure is well formed and has all the
1434 - check ganeti version
1436 @type ninfo: L{objects.Node}
1437 @param ninfo: the node to check
1438 @param nresult: the results from the node
1440 @return: whether overall this call was successful (and we can expect
1441 reasonable values in the response)
1445 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1447 # main result, nresult should be a non-empty dict
1448 test = not nresult or not isinstance(nresult, dict)
1449 _ErrorIf(test, self.ENODERPC, node,
1450 "unable to verify node: no data returned")
1454 # compares ganeti version
1455 local_version = constants.PROTOCOL_VERSION
1456 remote_version = nresult.get("version", None)
1457 test = not (remote_version and
1458 isinstance(remote_version, (list, tuple)) and
1459 len(remote_version) == 2)
1460 _ErrorIf(test, self.ENODERPC, node,
1461 "connection to node returned invalid data")
1465 test = local_version != remote_version[0]
1466 _ErrorIf(test, self.ENODEVERSION, node,
1467 "incompatible protocol versions: master %s,"
1468 " node %s", local_version, remote_version[0])
1472 # node seems compatible, we can actually try to look into its results
1474 # full package version
1475 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1476 self.ENODEVERSION, node,
1477 "software version mismatch: master %s, node %s",
1478 constants.RELEASE_VERSION, remote_version[1],
1479 code=self.ETYPE_WARNING)
1481 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1482 if ninfo.vm_capable and isinstance(hyp_result, dict):
1483 for hv_name, hv_result in hyp_result.iteritems():
1484 test = hv_result is not None
1485 _ErrorIf(test, self.ENODEHV, node,
1486 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1488 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1489 if ninfo.vm_capable and isinstance(hvp_result, list):
1490 for item, hv_name, hv_result in hvp_result:
1491 _ErrorIf(True, self.ENODEHV, node,
1492 "hypervisor %s parameter verify failure (source %s): %s",
1493 hv_name, item, hv_result)
1495 test = nresult.get(constants.NV_NODESETUP,
1496 ["Missing NODESETUP results"])
1497 _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1502 def _VerifyNodeTime(self, ninfo, nresult,
1503 nvinfo_starttime, nvinfo_endtime):
1504 """Check the node time.
1506 @type ninfo: L{objects.Node}
1507 @param ninfo: the node to check
1508 @param nresult: the remote results for the node
1509 @param nvinfo_starttime: the start time of the RPC call
1510 @param nvinfo_endtime: the end time of the RPC call
1514 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1516 ntime = nresult.get(constants.NV_TIME, None)
1518 ntime_merged = utils.MergeTime(ntime)
1519 except (ValueError, TypeError):
1520 _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1523 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1524 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1525 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1526 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1530 _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1531 "Node time diverges by at least %s from master node time",
1534 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1535 """Check the node LVM results.
1537 @type ninfo: L{objects.Node}
1538 @param ninfo: the node to check
1539 @param nresult: the remote results for the node
1540 @param vg_name: the configured VG name
1547 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1549 # checks vg existence and size > 20G
1550 vglist = nresult.get(constants.NV_VGLIST, None)
1552 _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1554 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1555 constants.MIN_VG_SIZE)
1556 _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1559 pvlist = nresult.get(constants.NV_PVLIST, None)
1560 test = pvlist is None
1561 _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1563 # check that ':' is not present in PV names, since it's a
1564 # special character for lvcreate (denotes the range of PEs to
1566 for _, pvname, owner_vg in pvlist:
1567 test = ":" in pvname
1568 _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1569 " '%s' of VG '%s'", pvname, owner_vg)
1571 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1572 """Check the node bridges.
1574 @type ninfo: L{objects.Node}
1575 @param ninfo: the node to check
1576 @param nresult: the remote results for the node
1577 @param bridges: the expected list of bridges
1584 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1586 missing = nresult.get(constants.NV_BRIDGES, None)
1587 test = not isinstance(missing, list)
1588 _ErrorIf(test, self.ENODENET, node,
1589 "did not return valid bridge information")
1591 _ErrorIf(bool(missing), self.ENODENET, node, "missing bridges: %s" %
1592 utils.CommaJoin(sorted(missing)))
1594 def _VerifyNodeNetwork(self, ninfo, nresult):
1595 """Check the node network connectivity results.
1597 @type ninfo: L{objects.Node}
1598 @param ninfo: the node to check
1599 @param nresult: the remote results for the node
1603 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1605 test = constants.NV_NODELIST not in nresult
1606 _ErrorIf(test, self.ENODESSH, node,
1607 "node hasn't returned node ssh connectivity data")
1609 if nresult[constants.NV_NODELIST]:
1610 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1611 _ErrorIf(True, self.ENODESSH, node,
1612 "ssh communication with node '%s': %s", a_node, a_msg)
1614 test = constants.NV_NODENETTEST not in nresult
1615 _ErrorIf(test, self.ENODENET, node,
1616 "node hasn't returned node tcp connectivity data")
1618 if nresult[constants.NV_NODENETTEST]:
1619 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1621 _ErrorIf(True, self.ENODENET, node,
1622 "tcp communication with node '%s': %s",
1623 anode, nresult[constants.NV_NODENETTEST][anode])
1625 test = constants.NV_MASTERIP not in nresult
1626 _ErrorIf(test, self.ENODENET, node,
1627 "node hasn't returned node master IP reachability data")
1629 if not nresult[constants.NV_MASTERIP]:
1630 if node == self.master_node:
1631 msg = "the master node cannot reach the master IP (not configured?)"
1633 msg = "cannot reach the master IP"
1634 _ErrorIf(True, self.ENODENET, node, msg)
1636 def _VerifyInstance(self, instance, instanceconfig, node_image,
1638 """Verify an instance.
1640 This function checks to see if the required block devices are
1641 available on the instance's node.
1644 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1645 node_current = instanceconfig.primary_node
1647 node_vol_should = {}
1648 instanceconfig.MapLVsByNode(node_vol_should)
1650 for node in node_vol_should:
1651 n_img = node_image[node]
1652 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1653 # ignore missing volumes on offline or broken nodes
1655 for volume in node_vol_should[node]:
1656 test = volume not in n_img.volumes
1657 _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
1658 "volume %s missing on node %s", volume, node)
1660 if instanceconfig.admin_up:
1661 pri_img = node_image[node_current]
1662 test = instance not in pri_img.instances and not pri_img.offline
1663 _ErrorIf(test, self.EINSTANCEDOWN, instance,
1664 "instance not running on its primary node %s",
1667 for node, n_img in node_image.items():
1668 if node != node_current:
1669 test = instance in n_img.instances
1670 _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
1671 "instance should not run on node %s", node)
1673 diskdata = [(nname, success, status, idx)
1674 for (nname, disks) in diskstatus.items()
1675 for idx, (success, status) in enumerate(disks)]
1677 for nname, success, bdev_status, idx in diskdata:
1678 # the 'ghost node' construction in Exec() ensures that we have a
1680 snode = node_image[nname]
1681 bad_snode = snode.ghost or snode.offline
1682 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
1683 self.EINSTANCEFAULTYDISK, instance,
1684 "couldn't retrieve status for disk/%s on %s: %s",
1685 idx, nname, bdev_status)
1686 _ErrorIf((instanceconfig.admin_up and success and
1687 bdev_status.ldisk_status == constants.LDS_FAULTY),
1688 self.EINSTANCEFAULTYDISK, instance,
1689 "disk/%s on %s is faulty", idx, nname)
1691 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
1692 """Verify if there are any unknown volumes in the cluster.
1694 The .os, .swap and backup volumes are ignored. All other volumes are
1695 reported as unknown.
1697 @type reserved: L{ganeti.utils.FieldSet}
1698 @param reserved: a FieldSet of reserved volume names
1701 for node, n_img in node_image.items():
1702 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1703 # skip non-healthy nodes
1705 for volume in n_img.volumes:
1706 test = ((node not in node_vol_should or
1707 volume not in node_vol_should[node]) and
1708 not reserved.Matches(volume))
1709 self._ErrorIf(test, self.ENODEORPHANLV, node,
1710 "volume %s is unknown", volume)
1712 def _VerifyOrphanInstances(self, instancelist, node_image):
1713 """Verify the list of running instances.
1715 This checks what instances are running but unknown to the cluster.
1718 for node, n_img in node_image.items():
1719 for o_inst in n_img.instances:
1720 test = o_inst not in instancelist
1721 self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
1722 "instance %s on node %s should not exist", o_inst, node)
1724 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
1725 """Verify N+1 Memory Resilience.
1727 Check that if one single node dies we can still start all the
1728 instances it was primary for.
1731 cluster_info = self.cfg.GetClusterInfo()
1732 for node, n_img in node_image.items():
1733 # This code checks that every node which is now listed as
1734 # secondary has enough memory to host all instances it is
1735 # supposed to should a single other node in the cluster fail.
1736 # FIXME: not ready for failover to an arbitrary node
1737 # FIXME: does not support file-backed instances
1738 # WARNING: we currently take into account down instances as well
1739 # as up ones, considering that even if they're down someone
1740 # might want to start them even in the event of a node failure.
1742 # we're skipping offline nodes from the N+1 warning, since
1743 # most likely we don't have good memory information from them;
1744 # we already list instances living on such nodes, and that's
1747 for prinode, instances in n_img.sbp.items():
1749 for instance in instances:
1750 bep = cluster_info.FillBE(instance_cfg[instance])
1751 if bep[constants.BE_AUTO_BALANCE]:
1752 needed_mem += bep[constants.BE_MEMORY]
1753 test = n_img.mfree < needed_mem
1754 self._ErrorIf(test, self.ENODEN1, node,
1755 "not enough memory to accommodate instance failovers"
1756 " should node %s fail (%dMiB needed, %dMiB available)",
1757 prinode, needed_mem, n_img.mfree)
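# Worked example (figures are hypothetical): if a node is secondary for two
# auto-balanced instances with the same primary, needing 512 MiB and 1024 MiB
# of memory, then failing that primary requires 1536 MiB on this node; with
# mfree = 1200 MiB the check above reports an N+1 memory failure.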
1760 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
1761 (files_all, files_all_opt, files_mc, files_vm)):
1762 """Verifies file checksums collected from all nodes.
1764 @param errorif: Callback for reporting errors
1765 @param nodeinfo: List of L{objects.Node} objects
1766 @param master_node: Name of master node
1767 @param all_nvinfo: RPC results
1770 node_names = frozenset(node.name for node in nodeinfo)
1772 assert master_node in node_names
1773 assert (len(files_all | files_all_opt | files_mc | files_vm) ==
1774 sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
1775 "Found file listed in more than one file list"
1777 # Define functions determining which nodes to consider for a file
1778 file2nodefn = dict([(filename, fn)
1779 for (files, fn) in [(files_all, None),
1780 (files_all_opt, None),
1781 (files_mc, lambda node: (node.master_candidate or
1782 node.name == master_node)),
1783 (files_vm, lambda node: node.vm_capable)]
1784 for filename in files])
1786 fileinfo = dict((filename, {}) for filename in file2nodefn.keys())
1788 for node in nodeinfo:
1789 nresult = all_nvinfo[node.name]
1791 if nresult.fail_msg or not nresult.payload:
1794 node_files = nresult.payload.get(constants.NV_FILELIST, None)
1796 test = not (node_files and isinstance(node_files, dict))
1797 errorif(test, cls.ENODEFILECHECK, node.name,
1798 "Node did not return file checksum data")
1802 for (filename, checksum) in node_files.items():
1803 # Check if the file should be considered for a node
1804 fn = file2nodefn[filename]
1805 if fn is None or fn(node):
1806 fileinfo[filename].setdefault(checksum, set()).add(node.name)
1808 for (filename, checksums) in fileinfo.items():
1809 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
1811 # Nodes having the file
1812 with_file = frozenset(node_name
1813 for nodes in fileinfo[filename].values()
1814 for node_name in nodes)
1816 # Nodes missing file
1817 missing_file = node_names - with_file
1819 if filename in files_all_opt:
1821 errorif(missing_file and missing_file != node_names,
1822 cls.ECLUSTERFILECHECK, None,
1823 "File %s is optional, but it must exist on all or no nodes (not"
1825 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
1827 errorif(missing_file, cls.ECLUSTERFILECHECK, None,
1828 "File %s is missing from node(s) %s", filename,
1829 utils.CommaJoin(utils.NiceSort(missing_file)))
1831 # See if there are multiple versions of the file
1832 test = len(checksums) > 1
1834 variants = ["variant %s on %s" %
1835 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
1836 for (idx, (checksum, nodes)) in
1837 enumerate(sorted(checksums.items()))]
1841 errorif(test, cls.ECLUSTERFILECHECK, None,
1842 "File %s found with %s different checksums (%s)",
1843 filename, len(checksums), "; ".join(variants))
1845 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
1847 """Verifies the node DRBD status.
1849 @type ninfo: L{objects.Node}
1850 @param ninfo: the node to check
1851 @param nresult: the remote results for the node
1852 @param instanceinfo: the dict of instances
1853 @param drbd_helper: the configured DRBD usermode helper
1854 @param drbd_map: the DRBD map as returned by
1855 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1859 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1862 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1863 test = (helper_result is None)
1864 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1865 "no drbd usermode helper returned")
1867 status, payload = helper_result
1869 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1870 "drbd usermode helper check unsuccessful: %s", payload)
1871 test = status and (payload != drbd_helper)
1872 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1873 "wrong drbd usermode helper: %s", payload)
1875 # compute the DRBD minors
1877 for minor, instance in drbd_map[node].items():
1878 test = instance not in instanceinfo
1879 _ErrorIf(test, self.ECLUSTERCFG, None,
1880 "ghost instance '%s' in temporary DRBD map", instance)
1881 # ghost instance should not be running, but otherwise we
1882 # don't give double warnings (both ghost instance and
1883 # unallocated minor in use)
1885 node_drbd[minor] = (instance, False)
1887 instance = instanceinfo[instance]
1888 node_drbd[minor] = (instance.name, instance.admin_up)
1890 # and now check them
1891 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1892 test = not isinstance(used_minors, (tuple, list))
1893 _ErrorIf(test, self.ENODEDRBD, node,
1894 "cannot parse drbd status file: %s", str(used_minors))
1896 # we cannot check drbd status
1899 for minor, (iname, must_exist) in node_drbd.items():
1900 test = minor not in used_minors and must_exist
1901 _ErrorIf(test, self.ENODEDRBD, node,
1902 "drbd minor %d of instance %s is not active", minor, iname)
1903 for minor in used_minors:
1904 test = minor not in node_drbd
1905 _ErrorIf(test, self.ENODEDRBD, node,
1906 "unallocated drbd minor %d is in use", minor)
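# Data-shape sketch (inferred from the loop above, node/instance names are
# hypothetical): drbd_map maps node names to {minor: instance_name}, e.g.
# {"node1.example.com": {0: "inst1.example.com"}}, so node_drbd becomes
# {minor: (instance_name, should_be_running)} and is checked against the
# minors the node reports as actually in use.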
1908 def _UpdateNodeOS(self, ninfo, nresult, nimg):
1909 """Builds the node OS structures.
1911 @type ninfo: L{objects.Node}
1912 @param ninfo: the node to check
1913 @param nresult: the remote results for the node
1914 @param nimg: the node image object
1918 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1920 remote_os = nresult.get(constants.NV_OSLIST, None)
1921 test = (not isinstance(remote_os, list) or
1922 not compat.all(isinstance(v, list) and len(v) == 7
1923 for v in remote_os))
1925 _ErrorIf(test, self.ENODEOS, node,
1926 "node hasn't returned valid OS data")
1935 for (name, os_path, status, diagnose,
1936 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
1938 if name not in os_dict:
1941 # parameters is a list of lists instead of a list of tuples due to
1942 # JSON lacking a real tuple type, fix it:
1943 parameters = [tuple(v) for v in parameters]
1944 os_dict[name].append((os_path, status, diagnose,
1945 set(variants), set(parameters), set(api_ver)))
1947 nimg.oslist = os_dict
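# Illustrative sketch (hypothetical values): nimg.oslist maps each OS
# name to the entries reported for it, each one being
# (path, status, diagnose, variants, parameters, api_versions), e.g.
#   nimg.oslist["debian-image"] = [
#     ("/srv/ganeti/os/debian-image", True, "",
#      set(["default"]), set(), set([20])),
#   ]
# More than one entry for the same name means duplicate OS definitions,
# which _VerifyNodeOS flags below.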
1949 def _VerifyNodeOS(self, ninfo, nimg, base):
1950 """Verifies the node OS list.
1952 @type ninfo: L{objects.Node}
1953 @param ninfo: the node to check
1954 @param nimg: the node image object
1955 @param base: the 'template' node we match against (e.g. from the master)
1959 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1961 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
1963 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
1964 for os_name, os_data in nimg.oslist.items():
1965 assert os_data, "Empty OS status for OS %s?!" % os_name
1966 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
1967 _ErrorIf(not f_status, self.ENODEOS, node,
1968 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
1969 _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
1970 "OS '%s' has multiple entries (first one shadows the rest): %s",
1971 os_name, utils.CommaJoin([v[0] for v in os_data]))
1972 # this will be caught in the backend too
1973 _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
1974 and not f_var, self.ENODEOS, node,
1975 "OS %s with API at least %d does not declare any variant",
1976 os_name, constants.OS_API_V15)
1977 # comparisons with the 'base' image
1978 test = os_name not in base.oslist
1979 _ErrorIf(test, self.ENODEOS, node,
1980 "Extra OS %s not present on reference node (%s)",
1984 assert base.oslist[os_name], "Base node has empty OS status?"
1985 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
1987 # base OS is invalid, skipping
1989 for kind, a, b in [("API version", f_api, b_api),
1990 ("variants list", f_var, b_var),
1991 ("parameters", beautify_params(f_param),
1992 beautify_params(b_param))]:
1993 _ErrorIf(a != b, self.ENODEOS, node,
1994 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
1995 kind, os_name, base.name,
1996 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
1998 # check any missing OSes
1999 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
2000 _ErrorIf(missing, self.ENODEOS, node,
2001 "OSes present on reference node %s but missing on this node: %s",
2002 base.name, utils.CommaJoin(missing))
2004 def _VerifyOob(self, ninfo, nresult):
2005 """Verifies out of band functionality of a node.
2007 @type ninfo: L{objects.Node}
2008 @param ninfo: the node to check
2009 @param nresult: the remote results for the node
2013 # We just have to verify the paths on master and/or master candidates
2014 # as the oob helper is invoked on the master
2015 if ((ninfo.master_candidate or ninfo.master_capable) and
2016 constants.NV_OOB_PATHS in nresult):
2017 for path_result in nresult[constants.NV_OOB_PATHS]:
2018 self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result)
2020 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2021 """Verifies and updates the node volume data.
2023 This function will update a L{NodeImage}'s internal structures
2024 with data from the remote call.
2026 @type ninfo: L{objects.Node}
2027 @param ninfo: the node to check
2028 @param nresult: the remote results for the node
2029 @param nimg: the node image object
2030 @param vg_name: the configured VG name
2034 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2036 nimg.lvm_fail = True
2037 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2040 elif isinstance(lvdata, basestring):
2041 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
2042 utils.SafeEncode(lvdata))
2043 elif not isinstance(lvdata, dict):
2044 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
2046 nimg.volumes = lvdata
2047 nimg.lvm_fail = False
2049 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2050 """Verifies and updates the node instance list.
2052 If the listing was successful, then updates this node's instance
2053 list. Otherwise, it marks the RPC call as failed for the instance
2056 @type ninfo: L{objects.Node}
2057 @param ninfo: the node to check
2058 @param nresult: the remote results for the node
2059 @param nimg: the node image object
2062 idata = nresult.get(constants.NV_INSTANCELIST, None)
2063 test = not isinstance(idata, list)
2064 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
2065 " (instancelist): %s", utils.SafeEncode(str(idata)))
2067 nimg.hyp_fail = True
2069 nimg.instances = idata
2071 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2072 """Verifies and computes a node information map
2074 @type ninfo: L{objects.Node}
2075 @param ninfo: the node to check
2076 @param nresult: the remote results for the node
2077 @param nimg: the node image object
2078 @param vg_name: the configured VG name
2082 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2084 # try to read free memory (from the hypervisor)
2085 hv_info = nresult.get(constants.NV_HVINFO, None)
2086 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2087 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
2090 nimg.mfree = int(hv_info["memory_free"])
2091 except (ValueError, TypeError):
2092 _ErrorIf(True, self.ENODERPC, node,
2093 "node returned invalid nodeinfo, check hypervisor")
2095 # FIXME: devise a free space model for file based instances as well
2096 if vg_name is not None:
2097 test = (constants.NV_VGLIST not in nresult or
2098 vg_name not in nresult[constants.NV_VGLIST])
2099 _ErrorIf(test, self.ENODELVM, node,
2100 "node didn't return data for the volume group '%s'"
2101 " - it is either missing or broken", vg_name)
2104 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2105 except (ValueError, TypeError):
2106 _ErrorIf(True, self.ENODERPC, node,
2107 "node returned invalid LVM info, check LVM status")
2109 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2110 """Gets per-disk status information for all instances.
2112 @type nodelist: list of strings
2113 @param nodelist: Node names
2114 @type node_image: dict of (name, L{objects.Node})
2115 @param node_image: Node objects
2116 @type instanceinfo: dict of (name, L{objects.Instance})
2117 @param instanceinfo: Instance objects
2118 @rtype: {instance: {node: [(success, payload)]}}
2119 @return: a dictionary of per-instance dictionaries with nodes as
2120 keys and disk information as values; the disk information is a
2121 list of tuples (success, payload)
2124 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2127 node_disks_devonly = {}
2128 diskless_instances = set()
2129 diskless = constants.DT_DISKLESS
2131 for nname in nodelist:
2132 node_instances = list(itertools.chain(node_image[nname].pinst,
2133 node_image[nname].sinst))
2134 diskless_instances.update(inst for inst in node_instances
2135 if instanceinfo[inst].disk_template == diskless)
2136 disks = [(inst, disk)
2137 for inst in node_instances
2138 for disk in instanceinfo[inst].disks]
2141 # No need to collect data
2144 node_disks[nname] = disks
2146 # Creating copies as SetDiskID below will modify the objects and that can
2147 # lead to incorrect data returned from nodes
2148 devonly = [dev.Copy() for (_, dev) in disks]
2151 self.cfg.SetDiskID(dev, nname)
2153 node_disks_devonly[nname] = devonly
2155 assert len(node_disks) == len(node_disks_devonly)
2157 # Collect data from all nodes with disks
2158 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2161 assert len(result) == len(node_disks)
2165 for (nname, nres) in result.items():
2166 disks = node_disks[nname]
2169 # No data from this node
2170 data = len(disks) * [(False, "node offline")]
2173 _ErrorIf(msg, self.ENODERPC, nname,
2174 "while getting disk information: %s", msg)
2176 # No data from this node
2177 data = len(disks) * [(False, msg)]
2180 for idx, i in enumerate(nres.payload):
2181 if isinstance(i, (tuple, list)) and len(i) == 2:
2184 logging.warning("Invalid result from node %s, entry %d: %s",
2186 data.append((False, "Invalid result from the remote node"))
2188 for ((inst, _), status) in zip(disks, data):
2189 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2191 # Add empty entries for diskless instances.
2192 for inst in diskless_instances:
2193 assert inst not in instdisk
2196 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2197 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2198 compat.all(isinstance(s, (tuple, list)) and
2199 len(s) == 2 for s in statuses)
2200 for inst, nnames in instdisk.items()
2201 for nname, statuses in nnames.items())
2202 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
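# Illustrative sketch (hypothetical names): the resulting instdisk
# structure is keyed by instance, then by node, with one
# (success, payload) tuple per disk, e.g.
#   instdisk = {
#     "instance1": {"node1": [(True, mirror_status),
#                             (False, "no such device")]},
#   }
# Diskless instances end up with an empty inner dict.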
2206 def _VerifyHVP(self, hvp_data):
2207 """Verifies locally the syntax of the hypervisor parameters.
2210 for item, hv_name, hv_params in hvp_data:
2211 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
2214 hv_class = hypervisor.GetHypervisor(hv_name)
2215 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2216 hv_class.CheckParameterSyntax(hv_params)
2217 except errors.GenericError, err:
2218 self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
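# Illustrative sketch (hypothetical values): each hvp_data item built in
# Exec below is a (source, hypervisor_name, parameters) tuple, where
# source records where the parameters come from, e.g.
#   ("cluster", "xen-pvm", {...})
#   ("os debian-image", "xen-pvm", {...})
#   ("instance instance1.example.com", "kvm", {...})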
2220 def BuildHooksEnv(self):
2223 Cluster-Verify hooks are run only in the post phase; their failure is logged
2224 in the verify output and causes the verification to fail.
2230 "CLUSTER_TAGS": " ".join(cfg.GetClusterInfo().GetTags())
2233 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2234 for node in cfg.GetAllNodesInfo().values())
2238 def BuildHooksNodes(self):
2239 """Build hooks nodes.
2242 return ([], self.cfg.GetNodeList())
2244 def Exec(self, feedback_fn):
2245 """Verify integrity of cluster, performing various test on nodes.
2248 # This method has too many local variables. pylint: disable-msg=R0914
2250 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2251 verbose = self.op.verbose
2252 self._feedback_fn = feedback_fn
2253 feedback_fn("* Verifying global settings")
2254 for msg in self.cfg.VerifyConfig():
2255 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
2257 # Check the cluster certificates
2258 for cert_filename in constants.ALL_CERT_FILES:
2259 (errcode, msg) = _VerifyCertificate(cert_filename)
2260 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
2262 vg_name = self.cfg.GetVGName()
2263 drbd_helper = self.cfg.GetDRBDHelper()
2264 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
2265 cluster = self.cfg.GetClusterInfo()
2266 nodeinfo_byname = self.cfg.GetAllNodesInfo()
2267 nodelist = utils.NiceSort(nodeinfo_byname.keys())
2268 nodeinfo = [nodeinfo_byname[nname] for nname in nodelist]
2269 instanceinfo = self.cfg.GetAllInstancesInfo()
2270 instancelist = utils.NiceSort(instanceinfo.keys())
2271 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2272 i_non_redundant = [] # Non redundant instances
2273 i_non_a_balanced = [] # Non auto-balanced instances
2274 n_offline = 0 # Count of offline nodes
2275 n_drained = 0 # Count of nodes being drained
2276 node_vol_should = {}
2278 # FIXME: verify OS list
2281 filemap = _ComputeAncillaryFiles(cluster, False)
2283 # do local checksums
2284 master_node = self.master_node = self.cfg.GetMasterNode()
2285 master_ip = self.cfg.GetMasterIP()
2287 # Compute the set of hypervisor parameters
2289 for hv_name in hypervisors:
2290 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
2291 for os_name, os_hvp in cluster.os_hvp.items():
2292 for hv_name, hv_params in os_hvp.items():
2295 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
2296 hvp_data.append(("os %s" % os_name, hv_name, full_params))
2297 # TODO: collapse identical parameter values in a single one
2298 for instance in instanceinfo.values():
2299 if not instance.hvparams:
2301 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
2302 cluster.FillHV(instance)))
2303 # and verify them locally
2304 self._VerifyHVP(hvp_data)
2306 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
2307 node_verify_param = {
2308 constants.NV_FILELIST:
2309 utils.UniqueSequence(filename
2310 for files in filemap
2311 for filename in files),
2312 constants.NV_NODELIST: [node.name for node in nodeinfo
2313 if not node.offline],
2314 constants.NV_HYPERVISOR: hypervisors,
2315 constants.NV_HVPARAMS: hvp_data,
2316 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
2317 node.secondary_ip) for node in nodeinfo
2318 if not node.offline],
2319 constants.NV_INSTANCELIST: hypervisors,
2320 constants.NV_VERSION: None,
2321 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2322 constants.NV_NODESETUP: None,
2323 constants.NV_TIME: None,
2324 constants.NV_MASTERIP: (master_node, master_ip),
2325 constants.NV_OSLIST: None,
2326 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2329 if vg_name is not None:
2330 node_verify_param[constants.NV_VGLIST] = None
2331 node_verify_param[constants.NV_LVLIST] = vg_name
2332 node_verify_param[constants.NV_PVLIST] = [vg_name]
2333 node_verify_param[constants.NV_DRBDLIST] = None
2336 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2339 # FIXME: this needs to be changed per node-group, not cluster-wide
2341 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2342 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2343 bridges.add(default_nicpp[constants.NIC_LINK])
2344 for instance in instanceinfo.values():
2345 for nic in instance.nics:
2346 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2347 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2348 bridges.add(full_nic[constants.NIC_LINK])
2351 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2353 # Build our expected cluster state
2354 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2356 vm_capable=node.vm_capable))
2357 for node in nodeinfo)
2361 for node in nodeinfo:
2362 path = _SupportsOob(self.cfg, node)
2363 if path and path not in oob_paths:
2364 oob_paths.append(path)
2367 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2369 for instance in instancelist:
2370 inst_config = instanceinfo[instance]
2372 for nname in inst_config.all_nodes:
2373 if nname not in node_image:
2375 gnode = self.NodeImage(name=nname)
2377 node_image[nname] = gnode
2379 inst_config.MapLVsByNode(node_vol_should)
2381 pnode = inst_config.primary_node
2382 node_image[pnode].pinst.append(instance)
2384 for snode in inst_config.secondary_nodes:
2385 nimg = node_image[snode]
2386 nimg.sinst.append(instance)
2387 if pnode not in nimg.sbp:
2388 nimg.sbp[pnode] = []
2389 nimg.sbp[pnode].append(instance)
2391 # At this point, we have the in-memory data structures complete,
2392 # except for the runtime information, which we'll gather next
2394 # Due to the way our RPC system works, exact response times cannot be
2395 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2396 # time before and after executing the request, we can at least have a time
2398 nvinfo_starttime = time.time()
2399 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
2400 self.cfg.GetClusterName())
2401 nvinfo_endtime = time.time()
2403 all_drbd_map = self.cfg.ComputeDRBDMap()
2405 feedback_fn("* Gathering disk information (%s nodes)" % len(nodelist))
2406 instdisk = self._CollectDiskInfo(nodelist, node_image, instanceinfo)
2408 feedback_fn("* Verifying configuration file consistency")
2409 self._VerifyFiles(_ErrorIf, nodeinfo, master_node, all_nvinfo, filemap)
2411 feedback_fn("* Verifying node status")
2415 for node_i in nodeinfo:
2417 nimg = node_image[node]
2421 feedback_fn("* Skipping offline node %s" % (node,))
2425 if node == master_node:
2427 elif node_i.master_candidate:
2428 ntype = "master candidate"
2429 elif node_i.drained:
2435 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2437 msg = all_nvinfo[node].fail_msg
2438 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2440 nimg.rpc_fail = True
2443 nresult = all_nvinfo[node].payload
2445 nimg.call_ok = self._VerifyNode(node_i, nresult)
2446 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2447 self._VerifyNodeNetwork(node_i, nresult)
2448 self._VerifyOob(node_i, nresult)
2451 self._VerifyNodeLVM(node_i, nresult, vg_name)
2452 self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper,
2455 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2456 self._UpdateNodeInstances(node_i, nresult, nimg)
2457 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2458 self._UpdateNodeOS(node_i, nresult, nimg)
2459 if not nimg.os_fail:
2460 if refos_img is None:
2462 self._VerifyNodeOS(node_i, nimg, refos_img)
2463 self._VerifyNodeBridges(node_i, nresult, bridges)
2465 feedback_fn("* Verifying instance status")
2466 for instance in instancelist:
2468 feedback_fn("* Verifying instance %s" % instance)
2469 inst_config = instanceinfo[instance]
2470 self._VerifyInstance(instance, inst_config, node_image,
2472 inst_nodes_offline = []
2474 pnode = inst_config.primary_node
2475 pnode_img = node_image[pnode]
2476 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2477 self.ENODERPC, pnode, "instance %s, connection to"
2478 " primary node failed", instance)
2480 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2481 self.EINSTANCEBADNODE, instance,
2482 "instance is marked as running and lives on offline node %s",
2483 inst_config.primary_node)
2485 # If the instance is non-redundant we cannot survive losing its primary
2486 # node, so we are not N+1 compliant. On the other hand we have no disk
2487 # templates with more than one secondary so that situation is not well
2489 # FIXME: does not support file-backed instances
2490 if not inst_config.secondary_nodes:
2491 i_non_redundant.append(instance)
2493 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2494 instance, "instance has multiple secondary nodes: %s",
2495 utils.CommaJoin(inst_config.secondary_nodes),
2496 code=self.ETYPE_WARNING)
2498 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2499 pnode = inst_config.primary_node
2500 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2501 instance_groups = {}
2503 for node in instance_nodes:
2504 instance_groups.setdefault(nodeinfo_byname[node].group,
2508 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2509 # Sort so that we always list the primary node first.
2510 for group, nodes in sorted(instance_groups.items(),
2511 key=lambda (_, nodes): pnode in nodes,
2514 self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS,
2515 instance, "instance has primary and secondary nodes in"
2516 " different groups: %s", utils.CommaJoin(pretty_list),
2517 code=self.ETYPE_WARNING)
2519 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2520 i_non_a_balanced.append(instance)
2522 for snode in inst_config.secondary_nodes:
2523 s_img = node_image[snode]
2524 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2525 "instance %s, connection to secondary node failed", instance)
2528 inst_nodes_offline.append(snode)
2530 # warn that the instance lives on offline nodes
2531 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2532 "instance has offline secondary node(s) %s",
2533 utils.CommaJoin(inst_nodes_offline))
2534 # ... or ghost/non-vm_capable nodes
2535 for node in inst_config.all_nodes:
2536 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2537 "instance lives on ghost node %s", node)
2538 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2539 instance, "instance lives on non-vm_capable node %s", node)
2541 feedback_fn("* Verifying orphan volumes")
2542 reserved = utils.FieldSet(*cluster.reserved_lvs)
2543 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2545 feedback_fn("* Verifying orphan instances")
2546 self._VerifyOrphanInstances(instancelist, node_image)
2548 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2549 feedback_fn("* Verifying N+1 Memory redundancy")
2550 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2552 feedback_fn("* Other Notes")
2554 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2555 % len(i_non_redundant))
2557 if i_non_a_balanced:
2558 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2559 % len(i_non_a_balanced))
2562 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2565 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2569 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2570 """Analyze the post-hooks' result
2572 This method analyses the hook result, handles it, and sends some
2573 nicely-formatted feedback back to the user.
2575 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2576 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2577 @param hooks_results: the results of the multi-node hooks rpc call
2578 @param feedback_fn: function used to send feedback back to the caller
2579 @param lu_result: previous Exec result
2580 @return: the new Exec result, based on the previous result
2584 # We only really run POST phase hooks, and are only interested in
2586 if phase == constants.HOOKS_PHASE_POST:
2587 # Used to change hooks' output to proper indentation
2588 feedback_fn("* Hooks Results")
2589 assert hooks_results, "invalid result from hooks"
2591 for node_name in hooks_results:
2592 res = hooks_results[node_name]
2594 test = msg and not res.offline
2595 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2596 "Communication failure in hooks execution: %s", msg)
2597 if res.offline or msg:
2598 # No need to investigate payload if node is offline or gave an error.
2599 # override manually lu_result here as _ErrorIf only
2600 # overrides self.bad
2603 for script, hkr, output in res.payload:
2604 test = hkr == constants.HKR_FAIL
2605 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2606 "Script %s failed, output:", script)
2608 output = self._HOOKS_INDENT_RE.sub(' ', output)
2609 feedback_fn("%s" % output)
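# Illustrative sketch (hypothetical values): each entry in res.payload
# describes one hook script as a (script, status, output) triple, e.g.
#   ("50_check_storage", constants.HKR_FAIL, "volume group not found")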
2615 class LUClusterVerifyDisks(NoHooksLU):
2616 """Verifies the cluster disks status.
2621 def ExpandNames(self):
2622 self.needed_locks = {
2623 locking.LEVEL_NODE: locking.ALL_SET,
2624 locking.LEVEL_INSTANCE: locking.ALL_SET,
2626 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2628 def Exec(self, feedback_fn):
2629 """Verify integrity of cluster disks.
2631 @rtype: tuple of three items
2632 @return: a tuple of (dict of node-to-node_error, list of instances
2633 which need activate-disks, dict of instance: (node, volume) for
2637 result = res_nodes, res_instances, res_missing = {}, [], {}
2639 nodes = utils.NiceSort(self.cfg.GetVmCapableNodeList())
2640 instances = self.cfg.GetAllInstancesInfo().values()
2643 for inst in instances:
2645 if not inst.admin_up:
2647 inst.MapLVsByNode(inst_lvs)
2648 # transform { iname: {node: [vol,],},} to {(node, vol): iname}
2649 for node, vol_list in inst_lvs.iteritems():
2650 for vol in vol_list:
2651 nv_dict[(node, vol)] = inst
2656 node_lvs = self.rpc.call_lv_list(nodes, [])
2657 for node, node_res in node_lvs.items():
2658 if node_res.offline:
2660 msg = node_res.fail_msg
2662 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
2663 res_nodes[node] = msg
2666 lvs = node_res.payload
2667 for lv_name, (_, _, lv_online) in lvs.items():
2668 inst = nv_dict.pop((node, lv_name), None)
2669 if (not lv_online and inst is not None
2670 and inst.name not in res_instances):
2671 res_instances.append(inst.name)
2673 # any leftover items in nv_dict are missing LVs, let's arrange the
2675 for key, inst in nv_dict.iteritems():
2676 if inst.name not in res_missing:
2677 res_missing[inst.name] = []
2678 res_missing[inst.name].append(key)
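# Illustrative sketch (hypothetical names) of the value handed back to
# the caller: a (node_errors, offline_lv_instances, missing_lvs) tuple,
# e.g.
#   ({"node3.example.com": "rpc failure"},
#    ["instance1.example.com"],
#    {"instance2.example.com": [("node1.example.com", "xenvg/disk0")]})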
2683 class LUClusterRepairDiskSizes(NoHooksLU):
2684 """Verifies the cluster disks sizes.
2689 def ExpandNames(self):
2690 if self.op.instances:
2691 self.wanted_names = _GetWantedInstances(self, self.op.instances)
2692 self.needed_locks = {
2693 locking.LEVEL_NODE: [],
2694 locking.LEVEL_INSTANCE: self.wanted_names,
2696 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
2698 self.wanted_names = None
2699 self.needed_locks = {
2700 locking.LEVEL_NODE: locking.ALL_SET,
2701 locking.LEVEL_INSTANCE: locking.ALL_SET,
2703 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2705 def DeclareLocks(self, level):
2706 if level == locking.LEVEL_NODE and self.wanted_names is not None:
2707 self._LockInstancesNodes(primary_only=True)
2709 def CheckPrereq(self):
2710 """Check prerequisites.
2712 This only checks the optional instance list against the existing names.
2715 if self.wanted_names is None:
2716 self.wanted_names = self.glm.list_owned(locking.LEVEL_INSTANCE)
2718 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2719 in self.wanted_names]
2721 def _EnsureChildSizes(self, disk):
2722 """Ensure children of the disk have the needed disk size.
2724 This is valid mainly for DRBD8 and fixes an issue where the
2725 children have a smaller disk size.
2727 @param disk: an L{ganeti.objects.Disk} object
2730 if disk.dev_type == constants.LD_DRBD8:
2731 assert disk.children, "Empty children for DRBD8?"
2732 fchild = disk.children[0]
2733 mismatch = fchild.size < disk.size
2735 self.LogInfo("Child disk has size %d, parent %d, fixing",
2736 fchild.size, disk.size)
2737 fchild.size = disk.size
2739 # and we recurse on this child only, not on the metadev
2740 return self._EnsureChildSizes(fchild) or mismatch
2744 def Exec(self, feedback_fn):
2745 """Verify the size of cluster disks.
2748 # TODO: check child disks too
2749 # TODO: check differences in size between primary/secondary nodes
2751 for instance in self.wanted_instances:
2752 pnode = instance.primary_node
2753 if pnode not in per_node_disks:
2754 per_node_disks[pnode] = []
2755 for idx, disk in enumerate(instance.disks):
2756 per_node_disks[pnode].append((instance, idx, disk))
2759 for node, dskl in per_node_disks.items():
2760 newl = [v[2].Copy() for v in dskl]
2762 self.cfg.SetDiskID(dsk, node)
2763 result = self.rpc.call_blockdev_getsize(node, newl)
2765 self.LogWarning("Failure in blockdev_getsize call to node"
2766 " %s, ignoring", node)
2768 if len(result.payload) != len(dskl):
2769 logging.warning("Invalid result from node %s: len(dskl)=%d,"
2770 " result.payload=%s", node, len(dskl), result.payload)
2771 self.LogWarning("Invalid result from node %s, ignoring node results",
2774 for ((instance, idx, disk), size) in zip(dskl, result.payload):
2776 self.LogWarning("Disk %d of instance %s did not return size"
2777 " information, ignoring", idx, instance.name)
2779 if not isinstance(size, (int, long)):
2780 self.LogWarning("Disk %d of instance %s did not return valid"
2781 " size information, ignoring", idx, instance.name)
2784 if size != disk.size:
2785 self.LogInfo("Disk %d of instance %s has mismatched size,"
2786 " correcting: recorded %d, actual %d", idx,
2787 instance.name, disk.size, size)
2789 self.cfg.Update(instance, feedback_fn)
2790 changed.append((instance.name, idx, size))
2791 if self._EnsureChildSizes(disk):
2792 self.cfg.Update(instance, feedback_fn)
2793 changed.append((instance.name, idx, disk.size))
2797 class LUClusterRename(LogicalUnit):
2798 """Rename the cluster.
2801 HPATH = "cluster-rename"
2802 HTYPE = constants.HTYPE_CLUSTER
2804 def BuildHooksEnv(self):
2809 "OP_TARGET": self.cfg.GetClusterName(),
2810 "NEW_NAME": self.op.name,
2813 def BuildHooksNodes(self):
2814 """Build hooks nodes.
2817 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
2819 def CheckPrereq(self):
2820 """Verify that the passed name is a valid one.
2823 hostname = netutils.GetHostname(name=self.op.name,
2824 family=self.cfg.GetPrimaryIPFamily())
2826 new_name = hostname.name
2827 self.ip = new_ip = hostname.ip
2828 old_name = self.cfg.GetClusterName()
2829 old_ip = self.cfg.GetMasterIP()
2830 if new_name == old_name and new_ip == old_ip:
2831 raise errors.OpPrereqError("Neither the name nor the IP address of the"
2832 " cluster has changed",
2834 if new_ip != old_ip:
2835 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
2836 raise errors.OpPrereqError("The given cluster IP address (%s) is"
2837 " reachable on the network" %
2838 new_ip, errors.ECODE_NOTUNIQUE)
2840 self.op.name = new_name
2842 def Exec(self, feedback_fn):
2843 """Rename the cluster.
2846 clustername = self.op.name
2849 # shutdown the master IP
2850 master = self.cfg.GetMasterNode()
2851 result = self.rpc.call_node_stop_master(master, False)
2852 result.Raise("Could not disable the master role")
2855 cluster = self.cfg.GetClusterInfo()
2856 cluster.cluster_name = clustername
2857 cluster.master_ip = ip
2858 self.cfg.Update(cluster, feedback_fn)
2860 # update the known hosts file
2861 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
2862 node_list = self.cfg.GetOnlineNodeList()
2864 node_list.remove(master)
2867 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
2869 result = self.rpc.call_node_start_master(master, False, False)
2870 msg = result.fail_msg
2872 self.LogWarning("Could not re-enable the master role on"
2873 " the master, please restart manually: %s", msg)
2878 class LUClusterSetParams(LogicalUnit):
2879 """Change the parameters of the cluster.
2882 HPATH = "cluster-modify"
2883 HTYPE = constants.HTYPE_CLUSTER
2886 def CheckArguments(self):
2890 if self.op.uid_pool:
2891 uidpool.CheckUidPool(self.op.uid_pool)
2893 if self.op.add_uids:
2894 uidpool.CheckUidPool(self.op.add_uids)
2896 if self.op.remove_uids:
2897 uidpool.CheckUidPool(self.op.remove_uids)
2899 def ExpandNames(self):
2900 # FIXME: in the future maybe other cluster params won't require checking on
2901 # all nodes to be modified.
2902 self.needed_locks = {
2903 locking.LEVEL_NODE: locking.ALL_SET,
2905 self.share_locks[locking.LEVEL_NODE] = 1
2907 def BuildHooksEnv(self):
2912 "OP_TARGET": self.cfg.GetClusterName(),
2913 "NEW_VG_NAME": self.op.vg_name,
2916 def BuildHooksNodes(self):
2917 """Build hooks nodes.
2920 mn = self.cfg.GetMasterNode()
2923 def CheckPrereq(self):
2924 """Check prerequisites.
2926 This checks that the given parameters do not conflict and
2927 that the given volume group is valid.
2930 if self.op.vg_name is not None and not self.op.vg_name:
2931 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
2932 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
2933 " instances exist", errors.ECODE_INVAL)
2935 if self.op.drbd_helper is not None and not self.op.drbd_helper:
2936 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
2937 raise errors.OpPrereqError("Cannot disable drbd helper while"
2938 " drbd-based instances exist",
2941 node_list = self.glm.list_owned(locking.LEVEL_NODE)
2943 # if vg_name not None, checks given volume group on all nodes
2945 vglist = self.rpc.call_vg_list(node_list)
2946 for node in node_list:
2947 msg = vglist[node].fail_msg
2949 # ignoring down node
2950 self.LogWarning("Error while gathering data on node %s"
2951 " (ignoring node): %s", node, msg)
2953 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2955 constants.MIN_VG_SIZE)
2957 raise errors.OpPrereqError("Error on node '%s': %s" %
2958 (node, vgstatus), errors.ECODE_ENVIRON)
2960 if self.op.drbd_helper:
2961 # checks given drbd helper on all nodes
2962 helpers = self.rpc.call_drbd_helper(node_list)
2963 for node in node_list:
2964 ninfo = self.cfg.GetNodeInfo(node)
2966 self.LogInfo("Not checking drbd helper on offline node %s", node)
2968 msg = helpers[node].fail_msg
2970 raise errors.OpPrereqError("Error checking drbd helper on node"
2971 " '%s': %s" % (node, msg),
2972 errors.ECODE_ENVIRON)
2973 node_helper = helpers[node].payload
2974 if node_helper != self.op.drbd_helper:
2975 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
2976 (node, node_helper), errors.ECODE_ENVIRON)
2978 self.cluster = cluster = self.cfg.GetClusterInfo()
2979 # validate params changes
2980 if self.op.beparams:
2981 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2982 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2984 if self.op.ndparams:
2985 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
2986 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
2988 # TODO: we need a more general way to handle resetting
2989 # cluster-level parameters to default values
2990 if self.new_ndparams["oob_program"] == "":
2991 self.new_ndparams["oob_program"] = \
2992 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
2994 if self.op.nicparams:
2995 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2996 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2997 objects.NIC.CheckParameterSyntax(self.new_nicparams)
3000 # check all instances for consistency
3001 for instance in self.cfg.GetAllInstancesInfo().values():
3002 for nic_idx, nic in enumerate(instance.nics):
3003 params_copy = copy.deepcopy(nic.nicparams)
3004 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3006 # check parameter syntax
3008 objects.NIC.CheckParameterSyntax(params_filled)
3009 except errors.ConfigurationError, err:
3010 nic_errors.append("Instance %s, nic/%d: %s" %
3011 (instance.name, nic_idx, err))
3013 # if we're moving instances to routed, check that they have an ip
3014 target_mode = params_filled[constants.NIC_MODE]
3015 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3016 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3017 " address" % (instance.name, nic_idx))
3019 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3020 "\n".join(nic_errors))
3022 # hypervisor list/parameters
3023 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3024 if self.op.hvparams:
3025 for hv_name, hv_dict in self.op.hvparams.items():
3026 if hv_name not in self.new_hvparams:
3027 self.new_hvparams[hv_name] = hv_dict
3029 self.new_hvparams[hv_name].update(hv_dict)
3031 # os hypervisor parameters
3032 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3034 for os_name, hvs in self.op.os_hvp.items():
3035 if os_name not in self.new_os_hvp:
3036 self.new_os_hvp[os_name] = hvs
3038 for hv_name, hv_dict in hvs.items():
3039 if hv_name not in self.new_os_hvp[os_name]:
3040 self.new_os_hvp[os_name][hv_name] = hv_dict
3042 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3045 self.new_osp = objects.FillDict(cluster.osparams, {})
3046 if self.op.osparams:
3047 for os_name, osp in self.op.osparams.items():
3048 if os_name not in self.new_osp:
3049 self.new_osp[os_name] = {}
3051 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3054 if not self.new_osp[os_name]:
3055 # we removed all parameters
3056 del self.new_osp[os_name]
3058 # check the parameter validity (remote check)
3059 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3060 os_name, self.new_osp[os_name])
3062 # changes to the hypervisor list
3063 if self.op.enabled_hypervisors is not None:
3064 self.hv_list = self.op.enabled_hypervisors
3065 for hv in self.hv_list:
3066 # if the hypervisor doesn't already exist in the cluster
3067 # hvparams, we initialize it to empty, and then (in both
3068 # cases) we make sure to fill the defaults, as we might not
3069 # have a complete defaults list if the hypervisor wasn't
3071 if hv not in new_hvp:
3073 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3074 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3076 self.hv_list = cluster.enabled_hypervisors
3078 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3079 # either the enabled list has changed, or the parameters have, validate
3080 for hv_name, hv_params in self.new_hvparams.items():
3081 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3082 (self.op.enabled_hypervisors and
3083 hv_name in self.op.enabled_hypervisors)):
3084 # either this is a new hypervisor, or its parameters have changed
3085 hv_class = hypervisor.GetHypervisor(hv_name)
3086 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3087 hv_class.CheckParameterSyntax(hv_params)
3088 _CheckHVParams(self, node_list, hv_name, hv_params)
3091 # no need to check any newly-enabled hypervisors, since the
3092 # defaults have already been checked in the above code-block
3093 for os_name, os_hvp in self.new_os_hvp.items():
3094 for hv_name, hv_params in os_hvp.items():
3095 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3096 # we need to fill in the new os_hvp on top of the actual hv_p
3097 cluster_defaults = self.new_hvparams.get(hv_name, {})
3098 new_osp = objects.FillDict(cluster_defaults, hv_params)
3099 hv_class = hypervisor.GetHypervisor(hv_name)
3100 hv_class.CheckParameterSyntax(new_osp)
3101 _CheckHVParams(self, node_list, hv_name, new_osp)
3103 if self.op.default_iallocator:
3104 alloc_script = utils.FindFile(self.op.default_iallocator,
3105 constants.IALLOCATOR_SEARCH_PATH,
3107 if alloc_script is None:
3108 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3109 " specified" % self.op.default_iallocator,
3112 def Exec(self, feedback_fn):
3113 """Change the parameters of the cluster.
3116 if self.op.vg_name is not None:
3117 new_volume = self.op.vg_name
3120 if new_volume != self.cfg.GetVGName():
3121 self.cfg.SetVGName(new_volume)
3123 feedback_fn("Cluster LVM configuration already in desired"
3124 " state, not changing")
3125 if self.op.drbd_helper is not None:
3126 new_helper = self.op.drbd_helper
3129 if new_helper != self.cfg.GetDRBDHelper():
3130 self.cfg.SetDRBDHelper(new_helper)
3132 feedback_fn("Cluster DRBD helper already in desired state,"
3134 if self.op.hvparams:
3135 self.cluster.hvparams = self.new_hvparams
3137 self.cluster.os_hvp = self.new_os_hvp
3138 if self.op.enabled_hypervisors is not None:
3139 self.cluster.hvparams = self.new_hvparams
3140 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3141 if self.op.beparams:
3142 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3143 if self.op.nicparams:
3144 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3145 if self.op.osparams:
3146 self.cluster.osparams = self.new_osp
3147 if self.op.ndparams:
3148 self.cluster.ndparams = self.new_ndparams
3150 if self.op.candidate_pool_size is not None:
3151 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3152 # we need to update the pool size here, otherwise the save will fail
3153 _AdjustCandidatePool(self, [])
3155 if self.op.maintain_node_health is not None:
3156 self.cluster.maintain_node_health = self.op.maintain_node_health
3158 if self.op.prealloc_wipe_disks is not None:
3159 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3161 if self.op.add_uids is not None:
3162 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3164 if self.op.remove_uids is not None:
3165 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3167 if self.op.uid_pool is not None:
3168 self.cluster.uid_pool = self.op.uid_pool
3170 if self.op.default_iallocator is not None:
3171 self.cluster.default_iallocator = self.op.default_iallocator
3173 if self.op.reserved_lvs is not None:
3174 self.cluster.reserved_lvs = self.op.reserved_lvs
3176 def helper_os(aname, mods, desc):
3178 lst = getattr(self.cluster, aname)
3179 for key, val in mods:
3180 if key == constants.DDM_ADD:
3182 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3185 elif key == constants.DDM_REMOVE:
3189 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3191 raise errors.ProgrammerError("Invalid modification '%s'" % key)
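# Illustrative sketch (hypothetical values): the hidden_os and
# blacklisted_os modifications handled by helper_os are
# (action, os_name) pairs using the DDM_* constants, e.g.
#   self.op.hidden_os = [(constants.DDM_ADD, "debian-image"),
#                        (constants.DDM_REMOVE, "lenny-image")]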
3193 if self.op.hidden_os:
3194 helper_os("hidden_os", self.op.hidden_os, "hidden")
3196 if self.op.blacklisted_os:
3197 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3199 if self.op.master_netdev:
3200 master = self.cfg.GetMasterNode()
3201 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3202 self.cluster.master_netdev)
3203 result = self.rpc.call_node_stop_master(master, False)
3204 result.Raise("Could not disable the master ip")
3205 feedback_fn("Changing master_netdev from %s to %s" %
3206 (self.cluster.master_netdev, self.op.master_netdev))
3207 self.cluster.master_netdev = self.op.master_netdev
3209 self.cfg.Update(self.cluster, feedback_fn)
3211 if self.op.master_netdev:
3212 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3213 self.op.master_netdev)
3214 result = self.rpc.call_node_start_master(master, False, False)
3216 self.LogWarning("Could not re-enable the master ip on"
3217 " the master, please restart manually: %s",
3221 def _UploadHelper(lu, nodes, fname):
3222 """Helper for uploading a file and showing warnings.
3225 if os.path.exists(fname):
3226 result = lu.rpc.call_upload_file(nodes, fname)
3227 for to_node, to_result in result.items():
3228 msg = to_result.fail_msg
3230 msg = ("Copy of file %s to node %s failed: %s" %
3231 (fname, to_node, msg))
3232 lu.proc.LogWarning(msg)
3235 def _ComputeAncillaryFiles(cluster, redist):
3236 """Compute files external to Ganeti which need to be consistent.
3238 @type redist: boolean
3239 @param redist: Whether to include files which need to be redistributed
3242 # Compute files for all nodes
3244 constants.SSH_KNOWN_HOSTS_FILE,
3245 constants.CONFD_HMAC_KEY,
3246 constants.CLUSTER_DOMAIN_SECRET_FILE,
3250 files_all.update(constants.ALL_CERT_FILES)
3251 files_all.update(ssconf.SimpleStore().GetFileList())
3253 if cluster.modify_etc_hosts:
3254 files_all.add(constants.ETC_HOSTS)
3256 # Files which must either exist on all nodes or on none
3257 files_all_opt = set([
3258 constants.RAPI_USERS_FILE,
3261 # Files which should only be on master candidates
3264 files_mc.add(constants.CLUSTER_CONF_FILE)
3266 # Files which should only be on VM-capable nodes
3267 files_vm = set(filename
3268 for hv_name in cluster.enabled_hypervisors
3269 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles())
3271 # Filenames must be unique
3272 assert (len(files_all | files_all_opt | files_mc | files_vm) ==
3273 sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
3274 "Found file listed in more than one file list"
3276 return (files_all, files_all_opt, files_mc, files_vm)
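# Illustrative sketch of a typical result (contents depend on the
# cluster configuration):
#   files_all     -> known_hosts, HMAC key, cluster domain secret, ...
#   files_all_opt -> RAPI users file
#   files_mc      -> config.data (only when not redistributing)
#   files_vm      -> per-hypervisor ancillary files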
3279 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3280 """Distribute additional files which are part of the cluster configuration.
3282 ConfigWriter takes care of distributing the config and ssconf files, but
3283 there are more files which should be distributed to all nodes. This function
3284 makes sure those are copied.
3286 @param lu: calling logical unit
3287 @param additional_nodes: list of nodes not in the config to distribute to
3288 @type additional_vm: boolean
3289 @param additional_vm: whether the additional nodes are vm-capable or not
3292 # Gather target nodes
3293 cluster = lu.cfg.GetClusterInfo()
3294 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3296 online_nodes = lu.cfg.GetOnlineNodeList()
3297 vm_nodes = lu.cfg.GetVmCapableNodeList()
3299 if additional_nodes is not None:
3300 online_nodes.extend(additional_nodes)
3302 vm_nodes.extend(additional_nodes)
3304 # Never distribute to master node
3305 for nodelist in [online_nodes, vm_nodes]:
3306 if master_info.name in nodelist:
3307 nodelist.remove(master_info.name)
3310 (files_all, files_all_opt, files_mc, files_vm) = \
3311 _ComputeAncillaryFiles(cluster, True)
3313 # Never re-distribute configuration file from here
3314 assert not (constants.CLUSTER_CONF_FILE in files_all or
3315 constants.CLUSTER_CONF_FILE in files_vm)
3316 assert not files_mc, "Master candidates not handled in this function"
3319 (online_nodes, files_all),
3320 (online_nodes, files_all_opt),
3321 (vm_nodes, files_vm),
3325 for (node_list, files) in filemap:
3327 _UploadHelper(lu, node_list, fname)
3330 class LUClusterRedistConf(NoHooksLU):
3331 """Force the redistribution of cluster configuration.
3333 This is a very simple LU.
3338 def ExpandNames(self):
3339 self.needed_locks = {
3340 locking.LEVEL_NODE: locking.ALL_SET,
3342 self.share_locks[locking.LEVEL_NODE] = 1
3344 def Exec(self, feedback_fn):
3345 """Redistribute the configuration.
3348 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3349 _RedistributeAncillaryFiles(self)
3352 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3353 """Sleep and poll for an instance's disk to sync.
3356 if not instance.disks or disks is not None and not disks:
3359 disks = _ExpandCheckDisks(instance, disks)
3362 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3364 node = instance.primary_node
3367 lu.cfg.SetDiskID(dev, node)
3369 # TODO: Convert to utils.Retry
3372 degr_retries = 10 # in seconds, as we sleep 1 second each time
3376 cumul_degraded = False
3377 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3378 msg = rstats.fail_msg
3380 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3383 raise errors.RemoteError("Can't contact node %s for mirror data,"
3384 " aborting." % node)
3387 rstats = rstats.payload
3389 for i, mstat in enumerate(rstats):
3391 lu.LogWarning("Can't compute data for node %s/%s",
3392 node, disks[i].iv_name)
3395 cumul_degraded = (cumul_degraded or
3396 (mstat.is_degraded and mstat.sync_percent is None))
3397 if mstat.sync_percent is not None:
3399 if mstat.estimated_time is not None:
3400 rem_time = ("%s remaining (estimated)" %
3401 utils.FormatSeconds(mstat.estimated_time))
3402 max_time = mstat.estimated_time
3404 rem_time = "no time estimate"
3405 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3406 (disks[i].iv_name, mstat.sync_percent, rem_time))
3408 # if we're done but degraded, let's do a few small retries, to
3409 # make sure we see a stable and not transient situation; therefore
3410 # we force restart of the loop
3411 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3412 logging.info("Degraded disks found, %d retries left", degr_retries)
3420 time.sleep(min(60, max_time))
3423 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3424 return not cumul_degraded
3427 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3428 """Check that mirrors are not degraded.
3430 The ldisk parameter, if True, will change the test from the
3431 is_degraded attribute (which represents overall non-ok status for
3432 the device(s)) to the ldisk (representing the local storage status).
3435 lu.cfg.SetDiskID(dev, node)
3439 if on_primary or dev.AssembleOnSecondary():
3440 rstats = lu.rpc.call_blockdev_find(node, dev)
3441 msg = rstats.fail_msg
3443 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3445 elif not rstats.payload:
3446 lu.LogWarning("Can't find disk on node %s", node)
3450 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
3452 result = result and not rstats.payload.is_degraded
3455 for child in dev.children:
3456 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
3461 class LUOobCommand(NoHooksLU):
3462 """Logical unit for OOB handling.
3466 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
3468 def ExpandNames(self):
3469 """Gather locks we need.
3472 if self.op.node_names:
3473 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
3474 lock_names = self.op.node_names
3476 lock_names = locking.ALL_SET
3478 self.needed_locks = {
3479 locking.LEVEL_NODE: lock_names,
3482 def CheckPrereq(self):
3483 """Check prerequisites.
3486 - the node exists in the configuration
3489 Any errors are signaled by raising errors.OpPrereqError.
3493 self.master_node = self.cfg.GetMasterNode()
3495 assert self.op.power_delay >= 0.0
3497 if self.op.node_names:
3498 if (self.op.command in self._SKIP_MASTER and
3499 self.master_node in self.op.node_names):
3500 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
3501 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
3503 if master_oob_handler:
3504 additional_text = ("run '%s %s %s' if you want to operate on the"
3505 " master regardless") % (master_oob_handler,
3509 additional_text = "it does not support out-of-band operations"
3511 raise errors.OpPrereqError(("Operating on the master node %s is not"
3512 " allowed for %s; %s") %
3513 (self.master_node, self.op.command,
3514 additional_text), errors.ECODE_INVAL)
3516 self.op.node_names = self.cfg.GetNodeList()
3517 if self.op.command in self._SKIP_MASTER:
3518 self.op.node_names.remove(self.master_node)
3520 if self.op.command in self._SKIP_MASTER:
3521 assert self.master_node not in self.op.node_names
3523 for node_name in self.op.node_names:
3524 node = self.cfg.GetNodeInfo(node_name)
3527 raise errors.OpPrereqError("Node %s not found" % node_name,
3530 self.nodes.append(node)
3532 if (not self.op.ignore_status and
3533 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
3534 raise errors.OpPrereqError(("Cannot power off node %s because it is"
3535 " not marked offline") % node_name,
3538 def Exec(self, feedback_fn):
3539 """Execute OOB and return result if we expect any.
3542 master_node = self.master_node
3545 for idx, node in enumerate(utils.NiceSort(self.nodes,
3546 key=lambda node: node.name)):
3547 node_entry = [(constants.RS_NORMAL, node.name)]
3548 ret.append(node_entry)
3550 oob_program = _SupportsOob(self.cfg, node)
3553 node_entry.append((constants.RS_UNAVAIL, None))
3556 logging.info("Executing out-of-band command '%s' using '%s' on %s",
3557 self.op.command, oob_program, node.name)
3558 result = self.rpc.call_run_oob(master_node, oob_program,
3559 self.op.command, node.name,
3563 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
3564 node.name, result.fail_msg)
3565 node_entry.append((constants.RS_NODATA, None))
3568 self._CheckPayload(result)
3569 except errors.OpExecError, err:
3570 self.LogWarning("Payload returned by node '%s' is not valid: %s",
3572 node_entry.append((constants.RS_NODATA, None))
3574 if self.op.command == constants.OOB_HEALTH:
3575 # For health we should log important events
3576 for item, status in result.payload:
3577 if status in [constants.OOB_STATUS_WARNING,
3578 constants.OOB_STATUS_CRITICAL]:
3579 self.LogWarning("Item '%s' on node '%s' has status '%s'",
3580 item, node.name, status)
3582 if self.op.command == constants.OOB_POWER_ON:
3584 elif self.op.command == constants.OOB_POWER_OFF:
3585 node.powered = False
3586 elif self.op.command == constants.OOB_POWER_STATUS:
3587 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
3588 if powered != node.powered:
3589 logging.warning(("Recorded power state (%s) of node '%s' does not"
3590 " match actual power state (%s)"), node.powered,
3593 # For configuration changing commands we should update the node
3594 if self.op.command in (constants.OOB_POWER_ON,
3595 constants.OOB_POWER_OFF):
3596 self.cfg.Update(node, feedback_fn)
3598 node_entry.append((constants.RS_NORMAL, result.payload))
3600 if (self.op.command == constants.OOB_POWER_ON and
3601 idx < len(self.nodes) - 1):
3602 time.sleep(self.op.power_delay)
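# Illustrative sketch (hypothetical values): each per-node entry of the
# returned list starts with the node name and is followed by the command
# result, e.g.
#   [(constants.RS_NORMAL, "node1.example.com"),
#    (constants.RS_NORMAL, None)]
# for a successful power-off, while failed RPCs or nodes without OOB
# support yield (constants.RS_NODATA, None) or (constants.RS_UNAVAIL,
# None) as the second element.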
3606 def _CheckPayload(self, result):
3607 """Checks if the payload is valid.
3609 @param result: RPC result
3610 @raises errors.OpExecError: If payload is not valid
3614 if self.op.command == constants.OOB_HEALTH:
3615 if not isinstance(result.payload, list):
3616 errs.append("command 'health' is expected to return a list but got %s" %
3617 type(result.payload))
3619 for item, status in result.payload:
3620 if status not in constants.OOB_STATUSES:
3621 errs.append("health item '%s' has invalid status '%s'" %
3624 if self.op.command == constants.OOB_POWER_STATUS:
3625 if not isinstance(result.payload, dict):
3626 errs.append("power-status is expected to return a dict but got %s" %
3627 type(result.payload))
3629 if self.op.command in [
3630 constants.OOB_POWER_ON,
3631 constants.OOB_POWER_OFF,
3632 constants.OOB_POWER_CYCLE,
3634 if result.payload is not None:
3635 errs.append("%s is expected to not return payload but got '%s'" %
3636 (self.op.command, result.payload))
3639 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
3640 utils.CommaJoin(errs))
3642 class _OsQuery(_QueryBase):
3643 FIELDS = query.OS_FIELDS
3645 def ExpandNames(self, lu):
3646 # Lock all nodes in shared mode
3647 # Temporary removal of locks, should be reverted later
3648 # TODO: reintroduce locks when they are lighter-weight
3649 lu.needed_locks = {}
3650 #self.share_locks[locking.LEVEL_NODE] = 1
3651 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3653 # The following variables interact with _QueryBase._GetNames
3655 self.wanted = self.names
3657 self.wanted = locking.ALL_SET
3659 self.do_locking = self.use_locking
3661 def DeclareLocks(self, lu, level):
3665 def _DiagnoseByOS(rlist):
3666 """Remaps a per-node return list into an a per-os per-node dictionary
3668 @param rlist: a map with node names as keys and OS objects as values
3671 @return: a dictionary with osnames as keys and as value another
3672 map, with nodes as keys and tuples of (path, status, diagnose,
3673 variants, parameters, api_versions) as values, eg::
3675 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
3676 (/srv/..., False, "invalid api")],
3677 "node2": [(/srv/..., True, "", [], [])]}
3682 # we build here the list of nodes that didn't fail the RPC (at RPC
3683 # level), so that nodes with a non-responding node daemon don't
3684 # make all OSes invalid
3685 good_nodes = [node_name for node_name in rlist
3686 if not rlist[node_name].fail_msg]
3687 for node_name, nr in rlist.items():
3688 if nr.fail_msg or not nr.payload:
3690 for (name, path, status, diagnose, variants,
3691 params, api_versions) in nr.payload:
3692 if name not in all_os:
3693 # build a list of nodes for this os containing empty lists
3694 # for each node in node_list
3696 for nname in good_nodes:
3697 all_os[name][nname] = []
3698 # convert params from [name, help] to (name, help)
3699 params = [tuple(v) for v in params]
3700 all_os[name][node_name].append((path, status, diagnose,
3701 variants, params, api_versions))
3704 def _GetQueryData(self, lu):
3705 """Computes the list of nodes and their attributes.
3708 # Locking is not used
3709 assert not (compat.any(lu.glm.is_owned(level)
3710 for level in locking.LEVELS
3711 if level != locking.LEVEL_CLUSTER) or
3712 self.do_locking or self.use_locking)
3714 valid_nodes = [node.name
3715 for node in lu.cfg.GetAllNodesInfo().values()
3716 if not node.offline and node.vm_capable]
3717 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
3718 cluster = lu.cfg.GetClusterInfo()
3722 for (os_name, os_data) in pol.items():
3723 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
3724 hidden=(os_name in cluster.hidden_os),
3725 blacklisted=(os_name in cluster.blacklisted_os))
3729 api_versions = set()
3731 for idx, osl in enumerate(os_data.values()):
3732 info.valid = bool(info.valid and osl and osl[0][1])
3736 (node_variants, node_params, node_api) = osl[0][3:6]
3739 variants.update(node_variants)
3740 parameters.update(node_params)
3741 api_versions.update(node_api)
3743 # Filter out inconsistent values
3744 variants.intersection_update(node_variants)
3745 parameters.intersection_update(node_params)
3746 api_versions.intersection_update(node_api)
3748 info.variants = list(variants)
3749 info.parameters = list(parameters)
3750 info.api_versions = list(api_versions)
3752 data[os_name] = info
3754 # Prepare data in requested order
3755 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
3759 class LUOsDiagnose(NoHooksLU):
3760 """Logical unit for OS diagnose/query.
3766 def _BuildFilter(fields, names):
3767 """Builds a filter for querying OSes.
3770 name_filter = qlang.MakeSimpleFilter("name", names)
3772 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
3773 # respective field is not requested
3774 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
3775 for fname in ["hidden", "blacklisted"]
3776 if fname not in fields]
3777 if "valid" not in fields:
3778 status_filter.append([qlang.OP_TRUE, "valid"])
3781 status_filter.insert(0, qlang.OP_AND)
3783 status_filter = None
3785 if name_filter and status_filter:
3786 return [qlang.OP_AND, name_filter, status_filter]
3790 return status_filter
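# Illustrative sketch (hypothetical arguments): with no name filter and
# fields=["name", "variants"], none of "hidden", "blacklisted" or "valid"
# is requested, so the method returns roughly
#   [qlang.OP_AND,
#    [qlang.OP_NOT, [qlang.OP_TRUE, "hidden"]],
#    [qlang.OP_NOT, [qlang.OP_TRUE, "blacklisted"]],
#    [qlang.OP_TRUE, "valid"]]
# i.e. hidden, blacklisted and invalid OSes stay hidden unless their
# columns are asked for explicitly.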
3792 def CheckArguments(self):
3793 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
3794 self.op.output_fields, False)
3796 def ExpandNames(self):
3797 self.oq.ExpandNames(self)
3799 def Exec(self, feedback_fn):
3800 return self.oq.OldStyleQuery(self)
3803 class LUNodeRemove(LogicalUnit):
3804 """Logical unit for removing a node.
3807 HPATH = "node-remove"
3808 HTYPE = constants.HTYPE_NODE
3810 def BuildHooksEnv(self):
3813 This doesn't run on the target node in the pre phase as a failed
3814 node would then be impossible to remove.
3818 "OP_TARGET": self.op.node_name,
3819 "NODE_NAME": self.op.node_name,
3822 def BuildHooksNodes(self):
3823 """Build hooks nodes.
3826 all_nodes = self.cfg.GetNodeList()
3828 all_nodes.remove(self.op.node_name)
3830 logging.warning("Node '%s', which is about to be removed, was not found"
3831 " in the list of all nodes", self.op.node_name)
3832 return (all_nodes, all_nodes)
3834 def CheckPrereq(self):
3835 """Check prerequisites.
3838 - the node exists in the configuration
3839 - it does not have primary or secondary instances
3840 - it's not the master
3842 Any errors are signaled by raising errors.OpPrereqError.
3845 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3846 node = self.cfg.GetNodeInfo(self.op.node_name)
3847 assert node is not None
3849 instance_list = self.cfg.GetInstanceList()
3851 masternode = self.cfg.GetMasterNode()
3852 if node.name == masternode:
3853 raise errors.OpPrereqError("Node is the master node, failover to another"
3854 " node is required", errors.ECODE_INVAL)
3856 for instance_name in instance_list:
3857 instance = self.cfg.GetInstanceInfo(instance_name)
3858 if node.name in instance.all_nodes:
3859 raise errors.OpPrereqError("Instance %s is still running on the node,"
3860 " please remove first" % instance_name,
3862 self.op.node_name = node.name
3865 def Exec(self, feedback_fn):
3866 """Removes the node from the cluster.
3870 logging.info("Stopping the node daemon and removing configs from node %s",
3873 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
3875 # Promote nodes to master candidate as needed
3876 _AdjustCandidatePool(self, exceptions=[node.name])
3877 self.context.RemoveNode(node.name)
3879 # Run post hooks on the node before it's removed
3880 _RunPostHook(self, node.name)
3882 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
3883 msg = result.fail_msg
3885 self.LogWarning("Errors encountered on the remote node while leaving"
3886 " the cluster: %s", msg)
3888 # Remove node from our /etc/hosts
3889 if self.cfg.GetClusterInfo().modify_etc_hosts:
3890 master_node = self.cfg.GetMasterNode()
3891 result = self.rpc.call_etc_hosts_modify(master_node,
3892 constants.ETC_HOSTS_REMOVE,
3894 result.Raise("Can't update hosts file with new host data")
3895 _RedistributeAncillaryFiles(self)
3898 class _NodeQuery(_QueryBase):
3899 FIELDS = query.NODE_FIELDS
3901 def ExpandNames(self, lu):
3902 lu.needed_locks = {}
3903 lu.share_locks[locking.LEVEL_NODE] = 1
3906 self.wanted = _GetWantedNodes(lu, self.names)
3908 self.wanted = locking.ALL_SET
3910 self.do_locking = (self.use_locking and
3911 query.NQ_LIVE in self.requested_data)
3914 # if we don't request only static fields, we need to lock the nodes
3915 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
3917 def DeclareLocks(self, lu, level):
3920 def _GetQueryData(self, lu):
3921 """Computes the list of nodes and their attributes.
3924 all_info = lu.cfg.GetAllNodesInfo()
3926 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
3928 # Gather data as requested
3929 if query.NQ_LIVE in self.requested_data:
3930 # filter out non-vm_capable nodes
3931 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
3933 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
3934 lu.cfg.GetHypervisorType())
3935 live_data = dict((name, nresult.payload)
3936 for (name, nresult) in node_data.items()
3937 if not nresult.fail_msg and nresult.payload)
3941 if query.NQ_INST in self.requested_data:
3942 node_to_primary = dict([(name, set()) for name in nodenames])
3943 node_to_secondary = dict([(name, set()) for name in nodenames])
3945 inst_data = lu.cfg.GetAllInstancesInfo()
3947 for inst in inst_data.values():
3948 if inst.primary_node in node_to_primary:
3949 node_to_primary[inst.primary_node].add(inst.name)
3950 for secnode in inst.secondary_nodes:
3951 if secnode in node_to_secondary:
3952 node_to_secondary[secnode].add(inst.name)
3954 node_to_primary = None
3955 node_to_secondary = None
3957 if query.NQ_OOB in self.requested_data:
3958 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
3959 for name, node in all_info.iteritems())
3963 if query.NQ_GROUP in self.requested_data:
3964 groups = lu.cfg.GetAllNodeGroupsInfo()
3968 return query.NodeQueryData([all_info[name] for name in nodenames],
3969 live_data, lu.cfg.GetMasterNode(),
3970 node_to_primary, node_to_secondary, groups,
3971 oob_support, lu.cfg.GetClusterInfo())
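# Illustrative sketch (hypothetical names): for an instance "inst1" with
# primary node "node1" and secondary node "node2", the NQ_INST branch above
# builds roughly
#   node_to_primary   == {"node1": set(["inst1"]), "node2": set()}
#   node_to_secondary == {"node1": set(), "node2": set(["inst1"])}
# which query.NodeQueryData then uses to report each node's primary and
# secondary instances.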
3974 class LUNodeQuery(NoHooksLU):
3975 """Logical unit for querying nodes.
3978 # pylint: disable-msg=W0142
3981 def CheckArguments(self):
3982 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
3983 self.op.output_fields, self.op.use_locking)
3985 def ExpandNames(self):
3986 self.nq.ExpandNames(self)
3988 def Exec(self, feedback_fn):
3989 return self.nq.OldStyleQuery(self)
3992 class LUNodeQueryvols(NoHooksLU):
3993 """Logical unit for getting volumes on node(s).
3997 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
3998 _FIELDS_STATIC = utils.FieldSet("node")
4000 def CheckArguments(self):
4001 _CheckOutputFields(static=self._FIELDS_STATIC,
4002 dynamic=self._FIELDS_DYNAMIC,
4003 selected=self.op.output_fields)
4005 def ExpandNames(self):
4006 self.needed_locks = {}
4007 self.share_locks[locking.LEVEL_NODE] = 1
4008 if not self.op.nodes:
4009 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4011 self.needed_locks[locking.LEVEL_NODE] = \
4012 _GetWantedNodes(self, self.op.nodes)
4014 def Exec(self, feedback_fn):
4015 """Computes the list of nodes and their attributes.
4018 nodenames = self.glm.list_owned(locking.LEVEL_NODE)
4019 volumes = self.rpc.call_node_volumes(nodenames)
4021 ilist = [self.cfg.GetInstanceInfo(iname) for iname
4022 in self.cfg.GetInstanceList()]
4024 lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])
4027 for node in nodenames:
4028 nresult = volumes[node]
4031 msg = nresult.fail_msg
4033 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4036 node_vols = nresult.payload[:]
4037 node_vols.sort(key=lambda vol: vol['dev'])
4039 for vol in node_vols:
4041 for field in self.op.output_fields:
4044 elif field == "phys":
4048 elif field == "name":
4050 elif field == "size":
4051 val = int(float(vol['size']))
4052 elif field == "instance":
4054 if node not in lv_by_node[inst]:
4056 if vol['name'] in lv_by_node[inst][node]:
4062 raise errors.ParameterError(field)
4063 node_output.append(str(val))
4065 output.append(node_output)
4070 class LUNodeQueryStorage(NoHooksLU):
4071 """Logical unit for getting information on storage units on node(s).
4074 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4077 def CheckArguments(self):
4078 _CheckOutputFields(static=self._FIELDS_STATIC,
4079 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4080 selected=self.op.output_fields)
4082 def ExpandNames(self):
4083 self.needed_locks = {}
4084 self.share_locks[locking.LEVEL_NODE] = 1
4087 self.needed_locks[locking.LEVEL_NODE] = \
4088 _GetWantedNodes(self, self.op.nodes)
4090 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4092 def Exec(self, feedback_fn):
4093 """Computes the list of nodes and their attributes.
4096 self.nodes = self.glm.list_owned(locking.LEVEL_NODE)
4098 # Always get name to sort by
4099 if constants.SF_NAME in self.op.output_fields:
4100 fields = self.op.output_fields[:]
4102 fields = [constants.SF_NAME] + self.op.output_fields
4104 # Never ask for node or type as it's only known to the LU
4105 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4106 while extra in fields:
4107 fields.remove(extra)
4109 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4110 name_idx = field_idx[constants.SF_NAME]
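# Illustrative sketch (hypothetical request): for
# self.op.output_fields == ["size", "used"], the code above queries the
# backend for fields == [constants.SF_NAME, "size", "used"], giving
# field_idx == {constants.SF_NAME: 0, "size": 1, "used": 2} and
# name_idx == 0; the extra name column is only used for sorting the result
# rows below.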
4112 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4113 data = self.rpc.call_storage_list(self.nodes,
4114 self.op.storage_type, st_args,
4115 self.op.name, fields)
4119 for node in utils.NiceSort(self.nodes):
4120 nresult = data[node]
4124 msg = nresult.fail_msg
4126 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4129 rows = dict([(row[name_idx], row) for row in nresult.payload])
4131 for name in utils.NiceSort(rows.keys()):
4136 for field in self.op.output_fields:
4137 if field == constants.SF_NODE:
4139 elif field == constants.SF_TYPE:
4140 val = self.op.storage_type
4141 elif field in field_idx:
4142 val = row[field_idx[field]]
4144 raise errors.ParameterError(field)
4153 class _InstanceQuery(_QueryBase):
4154 FIELDS = query.INSTANCE_FIELDS
4156 def ExpandNames(self, lu):
4157 lu.needed_locks = {}
4158 lu.share_locks[locking.LEVEL_INSTANCE] = 1
4159 lu.share_locks[locking.LEVEL_NODE] = 1
4162 self.wanted = _GetWantedInstances(lu, self.names)
4164 self.wanted = locking.ALL_SET
4166 self.do_locking = (self.use_locking and
4167 query.IQ_LIVE in self.requested_data)
4169 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4170 lu.needed_locks[locking.LEVEL_NODE] = []
4171 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4173 def DeclareLocks(self, lu, level):
4174 if level == locking.LEVEL_NODE and self.do_locking:
4175 lu._LockInstancesNodes() # pylint: disable-msg=W0212
4177 def _GetQueryData(self, lu):
4178 """Computes the list of instances and their attributes.
4181 cluster = lu.cfg.GetClusterInfo()
4182 all_info = lu.cfg.GetAllInstancesInfo()
4184 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4186 instance_list = [all_info[name] for name in instance_names]
4187 nodes = frozenset(itertools.chain(*(inst.all_nodes
4188 for inst in instance_list)))
4189 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4192 wrongnode_inst = set()
4194 # Gather data as requested
4195 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4197 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4199 result = node_data[name]
4201 # offline nodes will be in both lists
4202 assert result.fail_msg
4203 offline_nodes.append(name)
4205 bad_nodes.append(name)
4206 elif result.payload:
4207 for inst in result.payload:
4208 if inst in all_info:
4209 if all_info[inst].primary_node == name:
4210 live_data.update(result.payload)
4212 wrongnode_inst.add(inst)
4214 # orphan instance; we don't list it here as we don't
4215 # handle this case yet in the output of instance listing
4216 logging.warning("Orphan instance '%s' found on node %s",
4218 # else no instance is alive
4222 if query.IQ_DISKUSAGE in self.requested_data:
4223 disk_usage = dict((inst.name,
4224 _ComputeDiskSize(inst.disk_template,
4225 [{constants.IDISK_SIZE: disk.size}
4226 for disk in inst.disks]))
4227 for inst in instance_list)
4231 if query.IQ_CONSOLE in self.requested_data:
4233 for inst in instance_list:
4234 if inst.name in live_data:
4235 # Instance is running
4236 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4238 consinfo[inst.name] = None
4239 assert set(consinfo.keys()) == set(instance_names)
4243 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4244 disk_usage, offline_nodes, bad_nodes,
4245 live_data, wrongnode_inst, consinfo)
4248 class LUQuery(NoHooksLU):
4249 """Query for resources/items of a certain kind.
4252 # pylint: disable-msg=W0142
4255 def CheckArguments(self):
4256 qcls = _GetQueryImplementation(self.op.what)
4258 self.impl = qcls(self.op.filter, self.op.fields, False)
4260 def ExpandNames(self):
4261 self.impl.ExpandNames(self)
4263 def DeclareLocks(self, level):
4264 self.impl.DeclareLocks(self, level)
4266 def Exec(self, feedback_fn):
4267 return self.impl.NewStyleQuery(self)
4270 class LUQueryFields(NoHooksLU):
4271 """Query for resources/items of a certain kind.
4274 # pylint: disable-msg=W0142
4277 def CheckArguments(self):
4278 self.qcls = _GetQueryImplementation(self.op.what)
4280 def ExpandNames(self):
4281 self.needed_locks = {}
4283 def Exec(self, feedback_fn):
4284 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4287 class LUNodeModifyStorage(NoHooksLU):
4288 """Logical unit for modifying a storage volume on a node.
4293 def CheckArguments(self):
4294 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4296 storage_type = self.op.storage_type
4299 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4301 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4302 " modified" % storage_type,
4305 diff = set(self.op.changes.keys()) - modifiable
4307 raise errors.OpPrereqError("The following fields can not be modified for"
4308 " storage units of type '%s': %r" %
4309 (storage_type, list(diff)),
4312 def ExpandNames(self):
4313 self.needed_locks = {
4314 locking.LEVEL_NODE: self.op.node_name,
4317 def Exec(self, feedback_fn):
4318 """Computes the list of nodes and their attributes.
4321 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4322 result = self.rpc.call_storage_modify(self.op.node_name,
4323 self.op.storage_type, st_args,
4324 self.op.name, self.op.changes)
4325 result.Raise("Failed to modify storage unit '%s' on %s" %
4326 (self.op.name, self.op.node_name))
4329 class LUNodeAdd(LogicalUnit):
4330 """Logical unit for adding node to the cluster.
4334 HTYPE = constants.HTYPE_NODE
4335 _NFLAGS = ["master_capable", "vm_capable"]
4337 def CheckArguments(self):
4338 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4339 # validate/normalize the node name
4340 self.hostname = netutils.GetHostname(name=self.op.node_name,
4341 family=self.primary_ip_family)
4342 self.op.node_name = self.hostname.name
4344 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4345 raise errors.OpPrereqError("Cannot readd the master node",
4348 if self.op.readd and self.op.group:
4349 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4350 " being readded", errors.ECODE_INVAL)
4352 def BuildHooksEnv(self):
4355 This will run on all nodes before, and on all nodes + the new node after.
4359 "OP_TARGET": self.op.node_name,
4360 "NODE_NAME": self.op.node_name,
4361 "NODE_PIP": self.op.primary_ip,
4362 "NODE_SIP": self.op.secondary_ip,
4363 "MASTER_CAPABLE": str(self.op.master_capable),
4364 "VM_CAPABLE": str(self.op.vm_capable),
4367 def BuildHooksNodes(self):
4368 """Build hooks nodes.
4371 # Exclude added node
4372 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4373 post_nodes = pre_nodes + [self.op.node_name, ]
4375 return (pre_nodes, post_nodes)
4377 def CheckPrereq(self):
4378 """Check prerequisites.
4381 - the new node is not already in the config
4383 - its parameters (single/dual homed) match the cluster
4385 Any errors are signaled by raising errors.OpPrereqError.
4389 hostname = self.hostname
4390 node = hostname.name
4391 primary_ip = self.op.primary_ip = hostname.ip
4392 if self.op.secondary_ip is None:
4393 if self.primary_ip_family == netutils.IP6Address.family:
4394 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
4395 " IPv4 address must be given as secondary",
4397 self.op.secondary_ip = primary_ip
4399 secondary_ip = self.op.secondary_ip
4400 if not netutils.IP4Address.IsValid(secondary_ip):
4401 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4402 " address" % secondary_ip, errors.ECODE_INVAL)
4404 node_list = cfg.GetNodeList()
4405 if not self.op.readd and node in node_list:
4406 raise errors.OpPrereqError("Node %s is already in the configuration" %
4407 node, errors.ECODE_EXISTS)
4408 elif self.op.readd and node not in node_list:
4409 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4412 self.changed_primary_ip = False
4414 for existing_node_name in node_list:
4415 existing_node = cfg.GetNodeInfo(existing_node_name)
4417 if self.op.readd and node == existing_node_name:
4418 if existing_node.secondary_ip != secondary_ip:
4419 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4420 " address configuration as before",
4422 if existing_node.primary_ip != primary_ip:
4423 self.changed_primary_ip = True
4427 if (existing_node.primary_ip == primary_ip or
4428 existing_node.secondary_ip == primary_ip or
4429 existing_node.primary_ip == secondary_ip or
4430 existing_node.secondary_ip == secondary_ip):
4431 raise errors.OpPrereqError("New node ip address(es) conflict with"
4432 " existing node %s" % existing_node.name,
4433 errors.ECODE_NOTUNIQUE)
4435 # After this 'if' block, None is no longer a valid value for the
4436 # _capable op attributes
4438 old_node = self.cfg.GetNodeInfo(node)
4439 assert old_node is not None, "Can't retrieve locked node %s" % node
4440 for attr in self._NFLAGS:
4441 if getattr(self.op, attr) is None:
4442 setattr(self.op, attr, getattr(old_node, attr))
4444 for attr in self._NFLAGS:
4445 if getattr(self.op, attr) is None:
4446 setattr(self.op, attr, True)
4448 if self.op.readd and not self.op.vm_capable:
4449 pri, sec = cfg.GetNodeInstances(node)
4451 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
4452 " flag set to false, but it already holds"
4453 " instances" % node,
4456 # check that the type of the node (single versus dual homed) is the
4457 # same as for the master
4458 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
4459 master_singlehomed = myself.secondary_ip == myself.primary_ip
4460 newbie_singlehomed = secondary_ip == primary_ip
4461 if master_singlehomed != newbie_singlehomed:
4462 if master_singlehomed:
4463 raise errors.OpPrereqError("The master has no secondary ip but the"
4464 " new node has one",
4467 raise errors.OpPrereqError("The master has a secondary ip but the"
4468 " new node doesn't have one",
4471 # checks reachability
4472 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
4473 raise errors.OpPrereqError("Node not reachable by ping",
4474 errors.ECODE_ENVIRON)
4476 if not newbie_singlehomed:
4477 # check reachability from my secondary ip to newbie's secondary ip
4478 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
4479 source=myself.secondary_ip):
4480 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
4481 " based ping to node daemon port",
4482 errors.ECODE_ENVIRON)
4489 if self.op.master_capable:
4490 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
4492 self.master_candidate = False
4495 self.new_node = old_node
4497 node_group = cfg.LookupNodeGroup(self.op.group)
4498 self.new_node = objects.Node(name=node,
4499 primary_ip=primary_ip,
4500 secondary_ip=secondary_ip,
4501 master_candidate=self.master_candidate,
4502 offline=False, drained=False,
4505 if self.op.ndparams:
4506 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
4508 def Exec(self, feedback_fn):
4509 """Adds the new node to the cluster.
4512 new_node = self.new_node
4513 node = new_node.name
4515 # We are adding a new node, so we assume it's powered
4516 new_node.powered = True
4518 # for re-adds, reset the offline/drained/master-candidate flags;
4519 # we need to reset here, otherwise offline would prevent RPC calls
4520 # later in the procedure; this also means that if the re-add
4521 # fails, we are left with a non-offlined, broken node
4523 new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
4524 self.LogInfo("Readding a node, the offline/drained flags were reset")
4525 # if we demote the node, we do cleanup later in the procedure
4526 new_node.master_candidate = self.master_candidate
4527 if self.changed_primary_ip:
4528 new_node.primary_ip = self.op.primary_ip
4530 # copy the master/vm_capable flags
4531 for attr in self._NFLAGS:
4532 setattr(new_node, attr, getattr(self.op, attr))
4534 # notify the user about any possible mc promotion
4535 if new_node.master_candidate:
4536 self.LogInfo("Node will be a master candidate")
4538 if self.op.ndparams:
4539 new_node.ndparams = self.op.ndparams
4541 new_node.ndparams = {}
4543 # check connectivity
4544 result = self.rpc.call_version([node])[node]
4545 result.Raise("Can't get version information from node %s" % node)
4546 if constants.PROTOCOL_VERSION == result.payload:
4547 logging.info("Communication to node %s fine, sw version %s match",
4548 node, result.payload)
4550 raise errors.OpExecError("Version mismatch master version %s,"
4551 " node version %s" %
4552 (constants.PROTOCOL_VERSION, result.payload))
4554 # Add node to our /etc/hosts, and add key to known_hosts
4555 if self.cfg.GetClusterInfo().modify_etc_hosts:
4556 master_node = self.cfg.GetMasterNode()
4557 result = self.rpc.call_etc_hosts_modify(master_node,
4558 constants.ETC_HOSTS_ADD,
4561 result.Raise("Can't update hosts file with new host data")
4563 if new_node.secondary_ip != new_node.primary_ip:
4564 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
4567 node_verify_list = [self.cfg.GetMasterNode()]
4568 node_verify_param = {
4569 constants.NV_NODELIST: [node],
4570 # TODO: do a node-net-test as well?
4573 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
4574 self.cfg.GetClusterName())
4575 for verifier in node_verify_list:
4576 result[verifier].Raise("Cannot communicate with node %s" % verifier)
4577 nl_payload = result[verifier].payload[constants.NV_NODELIST]
4579 for failed in nl_payload:
4580 feedback_fn("ssh/hostname verification failed"
4581 " (checking from %s): %s" %
4582 (verifier, nl_payload[failed]))
4583 raise errors.OpExecError("ssh/hostname verification failed")
4586 _RedistributeAncillaryFiles(self)
4587 self.context.ReaddNode(new_node)
4588 # make sure we redistribute the config
4589 self.cfg.Update(new_node, feedback_fn)
4590 # and make sure the new node will not have old files around
4591 if not new_node.master_candidate:
4592 result = self.rpc.call_node_demote_from_mc(new_node.name)
4593 msg = result.fail_msg
4595 self.LogWarning("Node failed to demote itself from master"
4596 " candidate status: %s" % msg)
4598 _RedistributeAncillaryFiles(self, additional_nodes=[node],
4599 additional_vm=self.op.vm_capable)
4600 self.context.AddNode(new_node, self.proc.GetECId())
4603 class LUNodeSetParams(LogicalUnit):
4604 """Modifies the parameters of a node.
4606 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
4607 to the node role (as _ROLE_*)
4608 @cvar _R2F: a dictionary from node role to tuples of flags
4609 @cvar _FLAGS: a list of attribute names corresponding to the flags
4612 HPATH = "node-modify"
4613 HTYPE = constants.HTYPE_NODE
4615 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
4617 (True, False, False): _ROLE_CANDIDATE,
4618 (False, True, False): _ROLE_DRAINED,
4619 (False, False, True): _ROLE_OFFLINE,
4620 (False, False, False): _ROLE_REGULAR,
4622 _R2F = dict((v, k) for k, v in _F2R.items())
4623 _FLAGS = ["master_candidate", "drained", "offline"]
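# Illustrative sketch: the two tables above make the role/flag translation
# symmetric, e.g.
#   _F2R[(True, False, False)] == _ROLE_CANDIDATE
#   _R2F[_ROLE_OFFLINE]        == (False, False, True)
# so Exec() can turn the computed role back into the per-node
# master_candidate/drained/offline attributes named in _FLAGS.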
4625 def CheckArguments(self):
4626 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4627 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
4628 self.op.master_capable, self.op.vm_capable,
4629 self.op.secondary_ip, self.op.ndparams]
4630 if all_mods.count(None) == len(all_mods):
4631 raise errors.OpPrereqError("Please pass at least one modification",
4633 if all_mods.count(True) > 1:
4634 raise errors.OpPrereqError("Can't set the node into more than one"
4635 " state at the same time",
4638 # Boolean value that tells us whether we might be demoting from MC
4639 self.might_demote = (self.op.master_candidate == False or
4640 self.op.offline == True or
4641 self.op.drained == True or
4642 self.op.master_capable == False)
4644 if self.op.secondary_ip:
4645 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
4646 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4647 " address" % self.op.secondary_ip,
4650 self.lock_all = self.op.auto_promote and self.might_demote
4651 self.lock_instances = self.op.secondary_ip is not None
4653 def ExpandNames(self):
4655 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
4657 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
4659 if self.lock_instances:
4660 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
4662 def DeclareLocks(self, level):
4663 # If we have locked all instances, before waiting to lock nodes, release
4664 # all the ones living on nodes unrelated to the current operation.
4665 if level == locking.LEVEL_NODE and self.lock_instances:
4666 self.affected_instances = []
4667 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
4670 # Build list of instances to release
4671 for instance_name in self.glm.list_owned(locking.LEVEL_INSTANCE):
4672 instance = self.context.cfg.GetInstanceInfo(instance_name)
4673 if (instance.disk_template in constants.DTS_INT_MIRROR and
4674 self.op.node_name in instance.all_nodes):
4675 instances_keep.append(instance_name)
4676 self.affected_instances.append(instance)
4678 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
4680 assert (set(self.glm.list_owned(locking.LEVEL_INSTANCE)) ==
4681 set(instances_keep))
4683 def BuildHooksEnv(self):
4686 This runs on the master node.
4690 "OP_TARGET": self.op.node_name,
4691 "MASTER_CANDIDATE": str(self.op.master_candidate),
4692 "OFFLINE": str(self.op.offline),
4693 "DRAINED": str(self.op.drained),
4694 "MASTER_CAPABLE": str(self.op.master_capable),
4695 "VM_CAPABLE": str(self.op.vm_capable),
4698 def BuildHooksNodes(self):
4699 """Build hooks nodes.
4702 nl = [self.cfg.GetMasterNode(), self.op.node_name]
4705 def CheckPrereq(self):
4706 """Check prerequisites.
4708 This only checks the instance list against the existing names.
4711 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
4713 if (self.op.master_candidate is not None or
4714 self.op.drained is not None or
4715 self.op.offline is not None):
4716 # we can't change the master's node flags
4717 if self.op.node_name == self.cfg.GetMasterNode():
4718 raise errors.OpPrereqError("The master role can be changed"
4719 " only via master-failover",
4722 if self.op.master_candidate and not node.master_capable:
4723 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
4724 " it a master candidate" % node.name,
4727 if self.op.vm_capable == False:
4728 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
4730 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
4731 " the vm_capable flag" % node.name,
4734 if node.master_candidate and self.might_demote and not self.lock_all:
4735 assert not self.op.auto_promote, "auto_promote set but lock_all not"
4736 # check if after removing the current node, we're missing master
4738 (mc_remaining, mc_should, _) = \
4739 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
4740 if mc_remaining < mc_should:
4741 raise errors.OpPrereqError("Not enough master candidates, please"
4742 " pass auto promote option to allow"
4743 " promotion", errors.ECODE_STATE)
4745 self.old_flags = old_flags = (node.master_candidate,
4746 node.drained, node.offline)
4747 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
4748 self.old_role = old_role = self._F2R[old_flags]
4750 # Check for ineffective changes
4751 for attr in self._FLAGS:
4752 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
4753 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
4754 setattr(self.op, attr, None)
4756 # Past this point, any flag change to False means a transition
4757 # away from the respective state, as only real changes are kept
4759 # TODO: We might query the real power state if it supports OOB
4760 if _SupportsOob(self.cfg, node):
4761 if self.op.offline is False and not (node.powered or
4762 self.op.powered == True):
4763 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
4764 " offline status can be reset") %
4766 elif self.op.powered is not None:
4767 raise errors.OpPrereqError(("Unable to change powered state for node %s"
4768 " as it does not support out-of-band"
4769 " handling") % self.op.node_name)
4771 # If we're being deofflined/drained, we'll MC ourself if needed
4772 if (self.op.drained == False or self.op.offline == False or
4773 (self.op.master_capable and not node.master_capable)):
4774 if _DecideSelfPromotion(self):
4775 self.op.master_candidate = True
4776 self.LogInfo("Auto-promoting node to master candidate")
4778 # If we're no longer master capable, we'll demote ourselves from MC
4779 if self.op.master_capable == False and node.master_candidate:
4780 self.LogInfo("Demoting from master candidate")
4781 self.op.master_candidate = False
4784 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
4785 if self.op.master_candidate:
4786 new_role = self._ROLE_CANDIDATE
4787 elif self.op.drained:
4788 new_role = self._ROLE_DRAINED
4789 elif self.op.offline:
4790 new_role = self._ROLE_OFFLINE
4791 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
4792 # False is still in new flags, which means we're un-setting (the
4794 new_role = self._ROLE_REGULAR
4795 else: # no new flags, nothing, keep old role
4798 self.new_role = new_role
4800 if old_role == self._ROLE_OFFLINE and new_role != old_role:
4801 # Trying to transition out of offline status
4802 result = self.rpc.call_version([node.name])[node.name]
4804 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
4805 " to report its version: %s" %
4806 (node.name, result.fail_msg),
4809 self.LogWarning("Transitioning node from offline to online state"
4810 " without using re-add. Please make sure the node"
4813 if self.op.secondary_ip:
4814 # Ok even without locking, because this can't be changed by any LU
4815 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
4816 master_singlehomed = master.secondary_ip == master.primary_ip
4817 if master_singlehomed and self.op.secondary_ip:
4818 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
4819 " homed cluster", errors.ECODE_INVAL)
4822 if self.affected_instances:
4823 raise errors.OpPrereqError("Cannot change secondary ip: offline"
4824 " node has instances (%s) configured"
4825 " to use it" % self.affected_instances)
4827 # On online nodes, check that no instances are running, and that
4828 # the node has the new ip and we can reach it.
4829 for instance in self.affected_instances:
4830 _CheckInstanceDown(self, instance, "cannot change secondary ip")
4832 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
4833 if master.name != node.name:
4834 # check reachability from master secondary ip to new secondary ip
4835 if not netutils.TcpPing(self.op.secondary_ip,
4836 constants.DEFAULT_NODED_PORT,
4837 source=master.secondary_ip):
4838 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
4839 " based ping to node daemon port",
4840 errors.ECODE_ENVIRON)
4842 if self.op.ndparams:
4843 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
4844 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
4845 self.new_ndparams = new_ndparams
4847 def Exec(self, feedback_fn):
4852 old_role = self.old_role
4853 new_role = self.new_role
4857 if self.op.ndparams:
4858 node.ndparams = self.new_ndparams
4860 if self.op.powered is not None:
4861 node.powered = self.op.powered
4863 for attr in ["master_capable", "vm_capable"]:
4864 val = getattr(self.op, attr)
4866 setattr(node, attr, val)
4867 result.append((attr, str(val)))
4869 if new_role != old_role:
4870 # Tell the node to demote itself, if no longer MC and not offline
4871 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
4872 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
4874 self.LogWarning("Node failed to demote itself: %s", msg)
4876 new_flags = self._R2F[new_role]
4877 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
4879 result.append((desc, str(nf)))
4880 (node.master_candidate, node.drained, node.offline) = new_flags
4882 # we locked all nodes, we adjust the CP before updating this node
4884 _AdjustCandidatePool(self, [node.name])
4886 if self.op.secondary_ip:
4887 node.secondary_ip = self.op.secondary_ip
4888 result.append(("secondary_ip", self.op.secondary_ip))
4890 # this will trigger configuration file update, if needed
4891 self.cfg.Update(node, feedback_fn)
4893 # this will trigger job queue propagation or cleanup if the mc
4895 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
4896 self.context.ReaddNode(node)
4901 class LUNodePowercycle(NoHooksLU):
4902 """Powercycles a node.
4907 def CheckArguments(self):
4908 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4909 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
4910 raise errors.OpPrereqError("The node is the master and the force"
4911 " parameter was not set",
4914 def ExpandNames(self):
4915 """Locking for PowercycleNode.
4917 This is a last-resort option and shouldn't block on other
4918 jobs. Therefore, we grab no locks.
4921 self.needed_locks = {}
4923 def Exec(self, feedback_fn):
4927 result = self.rpc.call_node_powercycle(self.op.node_name,
4928 self.cfg.GetHypervisorType())
4929 result.Raise("Failed to schedule the reboot")
4930 return result.payload
4933 class LUClusterQuery(NoHooksLU):
4934 """Query cluster configuration.
4939 def ExpandNames(self):
4940 self.needed_locks = {}
4942 def Exec(self, feedback_fn):
4943 """Return cluster config.
4946 cluster = self.cfg.GetClusterInfo()
4949 # Filter just for enabled hypervisors
4950 for os_name, hv_dict in cluster.os_hvp.items():
4951 os_hvp[os_name] = {}
4952 for hv_name, hv_params in hv_dict.items():
4953 if hv_name in cluster.enabled_hypervisors:
4954 os_hvp[os_name][hv_name] = hv_params
4956 # Convert ip_family to ip_version
4957 primary_ip_version = constants.IP4_VERSION
4958 if cluster.primary_ip_family == netutils.IP6Address.family:
4959 primary_ip_version = constants.IP6_VERSION
4962 "software_version": constants.RELEASE_VERSION,
4963 "protocol_version": constants.PROTOCOL_VERSION,
4964 "config_version": constants.CONFIG_VERSION,
4965 "os_api_version": max(constants.OS_API_VERSIONS),
4966 "export_version": constants.EXPORT_VERSION,
4967 "architecture": (platform.architecture()[0], platform.machine()),
4968 "name": cluster.cluster_name,
4969 "master": cluster.master_node,
4970 "default_hypervisor": cluster.enabled_hypervisors[0],
4971 "enabled_hypervisors": cluster.enabled_hypervisors,
4972 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
4973 for hypervisor_name in cluster.enabled_hypervisors]),
4975 "beparams": cluster.beparams,
4976 "osparams": cluster.osparams,
4977 "nicparams": cluster.nicparams,
4978 "ndparams": cluster.ndparams,
4979 "candidate_pool_size": cluster.candidate_pool_size,
4980 "master_netdev": cluster.master_netdev,
4981 "volume_group_name": cluster.volume_group_name,
4982 "drbd_usermode_helper": cluster.drbd_usermode_helper,
4983 "file_storage_dir": cluster.file_storage_dir,
4984 "shared_file_storage_dir": cluster.shared_file_storage_dir,
4985 "maintain_node_health": cluster.maintain_node_health,
4986 "ctime": cluster.ctime,
4987 "mtime": cluster.mtime,
4988 "uuid": cluster.uuid,
4989 "tags": list(cluster.GetTags()),
4990 "uid_pool": cluster.uid_pool,
4991 "default_iallocator": cluster.default_iallocator,
4992 "reserved_lvs": cluster.reserved_lvs,
4993 "primary_ip_version": primary_ip_version,
4994 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
4995 "hidden_os": cluster.hidden_os,
4996 "blacklisted_os": cluster.blacklisted_os,
5002 class LUClusterConfigQuery(NoHooksLU):
5003 """Return configuration values.
5007 _FIELDS_DYNAMIC = utils.FieldSet()
5008 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5009 "watcher_pause", "volume_group_name")
5011 def CheckArguments(self):
5012 _CheckOutputFields(static=self._FIELDS_STATIC,
5013 dynamic=self._FIELDS_DYNAMIC,
5014 selected=self.op.output_fields)
5016 def ExpandNames(self):
5017 self.needed_locks = {}
5019 def Exec(self, feedback_fn):
5020 """Dump a representation of the cluster config to the standard output.
5024 for field in self.op.output_fields:
5025 if field == "cluster_name":
5026 entry = self.cfg.GetClusterName()
5027 elif field == "master_node":
5028 entry = self.cfg.GetMasterNode()
5029 elif field == "drain_flag":
5030 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5031 elif field == "watcher_pause":
5032 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5033 elif field == "volume_group_name":
5034 entry = self.cfg.GetVGName()
5036 raise errors.ParameterError(field)
5037 values.append(entry)
5041 class LUInstanceActivateDisks(NoHooksLU):
5042 """Bring up an instance's disks.
5047 def ExpandNames(self):
5048 self._ExpandAndLockInstance()
5049 self.needed_locks[locking.LEVEL_NODE] = []
5050 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5052 def DeclareLocks(self, level):
5053 if level == locking.LEVEL_NODE:
5054 self._LockInstancesNodes()
5056 def CheckPrereq(self):
5057 """Check prerequisites.
5059 This checks that the instance is in the cluster.
5062 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5063 assert self.instance is not None, \
5064 "Cannot retrieve locked instance %s" % self.op.instance_name
5065 _CheckNodeOnline(self, self.instance.primary_node)
5067 def Exec(self, feedback_fn):
5068 """Activate the disks.
5071 disks_ok, disks_info = \
5072 _AssembleInstanceDisks(self, self.instance,
5073 ignore_size=self.op.ignore_size)
5075 raise errors.OpExecError("Cannot activate block devices")
5080 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5082 """Prepare the block devices for an instance.
5084 This sets up the block devices on all nodes.
5086 @type lu: L{LogicalUnit}
5087 @param lu: the logical unit on whose behalf we execute
5088 @type instance: L{objects.Instance}
5089 @param instance: the instance for whose disks we assemble
5090 @type disks: list of L{objects.Disk} or None
5091 @param disks: which disks to assemble (or all, if None)
5092 @type ignore_secondaries: boolean
5093 @param ignore_secondaries: if true, errors on secondary nodes
5094 won't result in an error return from the function
5095 @type ignore_size: boolean
5096 @param ignore_size: if true, the current known size of the disk
5097 will not be used during the disk activation, useful for cases
5098 when the size is wrong
5099 @return: a tuple (disks_ok, device_info); disks_ok is False if the
5100 operation failed, otherwise device_info is a list of
5101 (host, instance_visible_name, node_visible_name) tuples mapping node devices to instance devices
5106 iname = instance.name
5107 disks = _ExpandCheckDisks(instance, disks)
5109 # With the two-pass mechanism we try to reduce the window of
5110 # opportunity for the race condition of switching DRBD to primary
5111 # before handshaking has occurred, but we do not eliminate it
5113 # The proper fix would be to wait (with some limits) until the
5114 # connection has been made and drbd transitions from WFConnection
5115 # into any other network-connected state (Connected, SyncTarget,
5118 # 1st pass, assemble on all nodes in secondary mode
5119 for idx, inst_disk in enumerate(disks):
5120 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5122 node_disk = node_disk.Copy()
5123 node_disk.UnsetSize()
5124 lu.cfg.SetDiskID(node_disk, node)
5125 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5126 msg = result.fail_msg
5128 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5129 " (is_primary=False, pass=1): %s",
5130 inst_disk.iv_name, node, msg)
5131 if not ignore_secondaries:
5134 # FIXME: race condition on drbd migration to primary
5136 # 2nd pass, do only the primary node
5137 for idx, inst_disk in enumerate(disks):
5140 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5141 if node != instance.primary_node:
5144 node_disk = node_disk.Copy()
5145 node_disk.UnsetSize()
5146 lu.cfg.SetDiskID(node_disk, node)
5147 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5148 msg = result.fail_msg
5150 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5151 " (is_primary=True, pass=2): %s",
5152 inst_disk.iv_name, node, msg)
5155 dev_path = result.payload
5157 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5159 # leave the disks configured for the primary node
5160 # this is a workaround that would be fixed better by
5161 # improving the logical/physical id handling
5163 lu.cfg.SetDiskID(disk, instance.primary_node)
5165 return disks_ok, device_info
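# Illustrative sketch (hypothetical values): for an instance whose primary
# node is "node1" and whose first disk assembles to "/dev/drbd0", a
# successful call would return something like
#   (True, [("node1", "disk/0", "/dev/drbd0")])
# i.e. a success flag plus one (node, iv_name, device path) entry per disk
# on the primary node.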
5168 def _StartInstanceDisks(lu, instance, force):
5169 """Start the disks of an instance.
5172 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5173 ignore_secondaries=force)
5175 _ShutdownInstanceDisks(lu, instance)
5176 if force is not None and not force:
5177 lu.proc.LogWarning("", hint="If the message above refers to a"
5179 " you can retry the operation using '--force'.")
5180 raise errors.OpExecError("Disk consistency error")
5183 class LUInstanceDeactivateDisks(NoHooksLU):
5184 """Shutdown an instance's disks.
5189 def ExpandNames(self):
5190 self._ExpandAndLockInstance()
5191 self.needed_locks[locking.LEVEL_NODE] = []
5192 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5194 def DeclareLocks(self, level):
5195 if level == locking.LEVEL_NODE:
5196 self._LockInstancesNodes()
5198 def CheckPrereq(self):
5199 """Check prerequisites.
5201 This checks that the instance is in the cluster.
5204 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5205 assert self.instance is not None, \
5206 "Cannot retrieve locked instance %s" % self.op.instance_name
5208 def Exec(self, feedback_fn):
5209 """Deactivate the disks
5212 instance = self.instance
5214 _ShutdownInstanceDisks(self, instance)
5216 _SafeShutdownInstanceDisks(self, instance)
5219 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5220 """Shutdown block devices of an instance.
5222 This function checks if an instance is running, before calling
5223 _ShutdownInstanceDisks.
5226 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5227 _ShutdownInstanceDisks(lu, instance, disks=disks)
5230 def _ExpandCheckDisks(instance, disks):
5231 """Return the instance disks selected by the disks list
5233 @type disks: list of L{objects.Disk} or None
5234 @param disks: selected disks
5235 @rtype: list of L{objects.Disk}
5236 @return: selected instance disks to act on
5240 return instance.disks
5242 if not set(disks).issubset(instance.disks):
5243 raise errors.ProgrammerError("Can only act on disks belonging to the"
5248 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5249 """Shutdown block devices of an instance.
5251 This does the shutdown on all nodes of the instance.
5253 If ignore_primary is false, errors on the primary node are
5258 disks = _ExpandCheckDisks(instance, disks)
5261 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5262 lu.cfg.SetDiskID(top_disk, node)
5263 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5264 msg = result.fail_msg
5266 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5267 disk.iv_name, node, msg)
5268 if ((node == instance.primary_node and not ignore_primary) or
5269 (node != instance.primary_node and not result.offline)):
5274 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5275 """Checks if a node has enough free memory.
5277 This function checks whether a given node has the needed amount of free
5278 memory. If the node has less memory, or we cannot get the
5279 information from the node, this function raises an OpPrereqError
5282 @type lu: C{LogicalUnit}
5283 @param lu: a logical unit from which we get configuration data
5285 @param node: the node to check
5286 @type reason: C{str}
5287 @param reason: string to use in the error message
5288 @type requested: C{int}
5289 @param requested: the amount of memory in MiB to check for
5290 @type hypervisor_name: C{str}
5291 @param hypervisor_name: the hypervisor to ask for memory stats
5292 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5293 we cannot check the node
5296 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5297 nodeinfo[node].Raise("Can't get data from node %s" % node,
5298 prereq=True, ecode=errors.ECODE_ENVIRON)
5299 free_mem = nodeinfo[node].payload.get('memory_free', None)
5300 if not isinstance(free_mem, int):
5301 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5302 " was '%s'" % (node, free_mem),
5303 errors.ECODE_ENVIRON)
5304 if requested > free_mem:
5305 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5306 " needed %s MiB, available %s MiB" %
5307 (node, reason, requested, free_mem),
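# Illustrative sketch: a typical caller (see LUInstanceStartup.CheckPrereq
# further below) verifies that the primary node can hold the instance's
# memory before starting it, along the lines of
#   _CheckNodeFreeMemory(self, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        bep[constants.BE_MEMORY], instance.hypervisor)
# where bep is the instance's filled-in beparams dict.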
5311 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5312 """Checks if nodes have enough free disk space in the all VGs.
5314 This function check if all given nodes have the needed amount of
5315 free disk. In case any node has less disk or we cannot get the
5316 information from the node, this function raise an OpPrereqError
5319 @type lu: C{LogicalUnit}
5320 @param lu: a logical unit from which we get configuration data
5321 @type nodenames: C{list}
5322 @param nodenames: the list of node names to check
5323 @type req_sizes: C{dict}
5324 @param req_sizes: the hash of vg and corresponding amount of disk in
5326 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5327 or we cannot check the node
5330 for vg, req_size in req_sizes.items():
5331 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
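# Illustrative sketch (hypothetical VG name and size): req_sizes maps each
# volume group to the total space needed in MiB, e.g.
#   _CheckNodesFreeDiskPerVG(self, nodenames, {"xenvg": 10240})
# checks that every node in nodenames has at least 10 GiB free in "xenvg".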
5334 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5335 """Checks if nodes have enough free disk space in the specified VG.
5337 This function checks whether all given nodes have the needed amount of
5338 free disk. If any node has less disk, or we cannot get the
5339 information from the node, this function raises an OpPrereqError
5342 @type lu: C{LogicalUnit}
5343 @param lu: a logical unit from which we get configuration data
5344 @type nodenames: C{list}
5345 @param nodenames: the list of node names to check
5347 @param vg: the volume group to check
5348 @type requested: C{int}
5349 @param requested: the amount of disk in MiB to check for
5350 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5351 or we cannot check the node
5354 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5355 for node in nodenames:
5356 info = nodeinfo[node]
5357 info.Raise("Cannot get current information from node %s" % node,
5358 prereq=True, ecode=errors.ECODE_ENVIRON)
5359 vg_free = info.payload.get("vg_free", None)
5360 if not isinstance(vg_free, int):
5361 raise errors.OpPrereqError("Can't compute free disk space on node"
5362 " %s for vg %s, result was '%s'" %
5363 (node, vg, vg_free), errors.ECODE_ENVIRON)
5364 if requested > vg_free:
5365 raise errors.OpPrereqError("Not enough disk space on target node %s"
5366 " vg %s: required %d MiB, available %d MiB" %
5367 (node, vg, requested, vg_free),
5371 class LUInstanceStartup(LogicalUnit):
5372 """Starts an instance.
5375 HPATH = "instance-start"
5376 HTYPE = constants.HTYPE_INSTANCE
5379 def CheckArguments(self):
5381 if self.op.beparams:
5382 # fill the beparams dict
5383 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5385 def ExpandNames(self):
5386 self._ExpandAndLockInstance()
5388 def BuildHooksEnv(self):
5391 This runs on master, primary and secondary nodes of the instance.
5395 "FORCE": self.op.force,
5398 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5402 def BuildHooksNodes(self):
5403 """Build hooks nodes.
5406 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5409 def CheckPrereq(self):
5410 """Check prerequisites.
5412 This checks that the instance is in the cluster.
5415 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5416 assert self.instance is not None, \
5417 "Cannot retrieve locked instance %s" % self.op.instance_name
5420 if self.op.hvparams:
5421 # check hypervisor parameter syntax (locally)
5422 cluster = self.cfg.GetClusterInfo()
5423 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
5424 filled_hvp = cluster.FillHV(instance)
5425 filled_hvp.update(self.op.hvparams)
5426 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
5427 hv_type.CheckParameterSyntax(filled_hvp)
5428 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
5430 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
5432 if self.primary_offline and self.op.ignore_offline_nodes:
5433 self.proc.LogWarning("Ignoring offline primary node")
5435 if self.op.hvparams or self.op.beparams:
5436 self.proc.LogWarning("Overridden parameters are ignored")
5438 _CheckNodeOnline(self, instance.primary_node)
5440 bep = self.cfg.GetClusterInfo().FillBE(instance)
5442 # check bridges existence
5443 _CheckInstanceBridgesExist(self, instance)
5445 remote_info = self.rpc.call_instance_info(instance.primary_node,
5447 instance.hypervisor)
5448 remote_info.Raise("Error checking node %s" % instance.primary_node,
5449 prereq=True, ecode=errors.ECODE_ENVIRON)
5450 if not remote_info.payload: # not running already
5451 _CheckNodeFreeMemory(self, instance.primary_node,
5452 "starting instance %s" % instance.name,
5453 bep[constants.BE_MEMORY], instance.hypervisor)
5455 def Exec(self, feedback_fn):
5456 """Start the instance.
5459 instance = self.instance
5460 force = self.op.force
5462 self.cfg.MarkInstanceUp(instance.name)
5464 if self.primary_offline:
5465 assert self.op.ignore_offline_nodes
5466 self.proc.LogInfo("Primary node offline, marked instance as started")
5468 node_current = instance.primary_node
5470 _StartInstanceDisks(self, instance, force)
5472 result = self.rpc.call_instance_start(node_current, instance,
5473 self.op.hvparams, self.op.beparams)
5474 msg = result.fail_msg
5476 _ShutdownInstanceDisks(self, instance)
5477 raise errors.OpExecError("Could not start instance: %s" % msg)
5480 class LUInstanceReboot(LogicalUnit):
5481 """Reboot an instance.
5484 HPATH = "instance-reboot"
5485 HTYPE = constants.HTYPE_INSTANCE
5488 def ExpandNames(self):
5489 self._ExpandAndLockInstance()
5491 def BuildHooksEnv(self):
5494 This runs on master, primary and secondary nodes of the instance.
5498 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
5499 "REBOOT_TYPE": self.op.reboot_type,
5500 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5503 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5507 def BuildHooksNodes(self):
5508 """Build hooks nodes.
5511 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5514 def CheckPrereq(self):
5515 """Check prerequisites.
5517 This checks that the instance is in the cluster.
5520 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5521 assert self.instance is not None, \
5522 "Cannot retrieve locked instance %s" % self.op.instance_name
5524 _CheckNodeOnline(self, instance.primary_node)
5526 # check bridges existence
5527 _CheckInstanceBridgesExist(self, instance)
5529 def Exec(self, feedback_fn):
5530 """Reboot the instance.
5533 instance = self.instance
5534 ignore_secondaries = self.op.ignore_secondaries
5535 reboot_type = self.op.reboot_type
5537 remote_info = self.rpc.call_instance_info(instance.primary_node,
5539 instance.hypervisor)
5540 remote_info.Raise("Error checking node %s" % instance.primary_node)
5541 instance_running = bool(remote_info.payload)
5543 node_current = instance.primary_node
5545 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
5546 constants.INSTANCE_REBOOT_HARD]:
5547 for disk in instance.disks:
5548 self.cfg.SetDiskID(disk, node_current)
5549 result = self.rpc.call_instance_reboot(node_current, instance,
5551 self.op.shutdown_timeout)
5552 result.Raise("Could not reboot instance")
5554 if instance_running:
5555 result = self.rpc.call_instance_shutdown(node_current, instance,
5556 self.op.shutdown_timeout)
5557 result.Raise("Could not shutdown instance for full reboot")
5558 _ShutdownInstanceDisks(self, instance)
5560 self.LogInfo("Instance %s was already stopped, starting now",
5562 _StartInstanceDisks(self, instance, ignore_secondaries)
5563 result = self.rpc.call_instance_start(node_current, instance, None, None)
5564 msg = result.fail_msg
5566 _ShutdownInstanceDisks(self, instance)
5567 raise errors.OpExecError("Could not start instance for"
5568 " full reboot: %s" % msg)
5570 self.cfg.MarkInstanceUp(instance.name)
5573 class LUInstanceShutdown(LogicalUnit):
5574 """Shutdown an instance.
5577 HPATH = "instance-stop"
5578 HTYPE = constants.HTYPE_INSTANCE
5581 def ExpandNames(self):
5582 self._ExpandAndLockInstance()
5584 def BuildHooksEnv(self):
5587 This runs on master, primary and secondary nodes of the instance.
5590 env = _BuildInstanceHookEnvByObject(self, self.instance)
5591 env["TIMEOUT"] = self.op.timeout
5594 def BuildHooksNodes(self):
5595 """Build hooks nodes.
5598 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5601 def CheckPrereq(self):
5602 """Check prerequisites.
5604 This checks that the instance is in the cluster.
5607 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5608 assert self.instance is not None, \
5609 "Cannot retrieve locked instance %s" % self.op.instance_name
5611 self.primary_offline = \
5612 self.cfg.GetNodeInfo(self.instance.primary_node).offline
5614 if self.primary_offline and self.op.ignore_offline_nodes:
5615 self.proc.LogWarning("Ignoring offline primary node")
5617 _CheckNodeOnline(self, self.instance.primary_node)
5619 def Exec(self, feedback_fn):
5620 """Shutdown the instance.
5623 instance = self.instance
5624 node_current = instance.primary_node
5625 timeout = self.op.timeout
5627 self.cfg.MarkInstanceDown(instance.name)
5629 if self.primary_offline:
5630 assert self.op.ignore_offline_nodes
5631 self.proc.LogInfo("Primary node offline, marked instance as stopped")
5633 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
5634 msg = result.fail_msg
5636 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
5638 _ShutdownInstanceDisks(self, instance)
5641 class LUInstanceReinstall(LogicalUnit):
5642 """Reinstall an instance.
5645 HPATH = "instance-reinstall"
5646 HTYPE = constants.HTYPE_INSTANCE
5649 def ExpandNames(self):
5650 self._ExpandAndLockInstance()
5652 def BuildHooksEnv(self):
5655 This runs on master, primary and secondary nodes of the instance.
5658 return _BuildInstanceHookEnvByObject(self, self.instance)
5660 def BuildHooksNodes(self):
5661 """Build hooks nodes.
5664 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5667 def CheckPrereq(self):
5668 """Check prerequisites.
5670 This checks that the instance is in the cluster and is not running.
5673 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5674 assert instance is not None, \
5675 "Cannot retrieve locked instance %s" % self.op.instance_name
5676 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
5677 " offline, cannot reinstall")
5678 for node in instance.secondary_nodes:
5679 _CheckNodeOnline(self, node, "Instance secondary node offline,"
5680 " cannot reinstall")
5682 if instance.disk_template == constants.DT_DISKLESS:
5683 raise errors.OpPrereqError("Instance '%s' has no disks" %
5684 self.op.instance_name,
5686 _CheckInstanceDown(self, instance, "cannot reinstall")
5688 if self.op.os_type is not None:
5690 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
5691 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
5692 instance_os = self.op.os_type
5694 instance_os = instance.os
5696 nodelist = list(instance.all_nodes)
5698 if self.op.osparams:
5699 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
5700 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
5701 self.os_inst = i_osdict # the new dict (without defaults)
5705 self.instance = instance
5707 def Exec(self, feedback_fn):
5708 """Reinstall the instance.
5711 inst = self.instance
5713 if self.op.os_type is not None:
5714 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
5715 inst.os = self.op.os_type
5716 # Write to configuration
5717 self.cfg.Update(inst, feedback_fn)
5719 _StartInstanceDisks(self, inst, None)
5721 feedback_fn("Running the instance OS create scripts...")
5722 # FIXME: pass debug option from opcode to backend
5723 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
5724 self.op.debug_level,
5725 osparams=self.os_inst)
5726 result.Raise("Could not install OS for instance %s on node %s" %
5727 (inst.name, inst.primary_node))
5729 _ShutdownInstanceDisks(self, inst)
5732 class LUInstanceRecreateDisks(LogicalUnit):
5733 """Recreate an instance's missing disks.
5736 HPATH = "instance-recreate-disks"
5737 HTYPE = constants.HTYPE_INSTANCE
5740 def CheckArguments(self):
5741 # normalise the disk list
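# e.g. a request for disks [2, 0, 0, 1] becomes [0, 1, 2]: duplicates are
# dropped and the indices are processed in increasing order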
5742 self.op.disks = sorted(frozenset(self.op.disks))
5744 def ExpandNames(self):
5745 self._ExpandAndLockInstance()
5746 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5748 self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
5749 self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
5751 self.needed_locks[locking.LEVEL_NODE] = []
5753 def DeclareLocks(self, level):
5754 if level == locking.LEVEL_NODE:
5755 # if we replace the nodes, we only need to lock the old primary,
5756 # otherwise we need to lock all nodes for disk re-creation
5757 primary_only = bool(self.op.nodes)
5758 self._LockInstancesNodes(primary_only=primary_only)
5760 def BuildHooksEnv(self):
5763 This runs on master, primary and secondary nodes of the instance.
5766 return _BuildInstanceHookEnvByObject(self, self.instance)
5768 def BuildHooksNodes(self):
5769 """Build hooks nodes.
5772 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5775 def CheckPrereq(self):
5776 """Check prerequisites.
5778 This checks that the instance is in the cluster and is not running.
5781 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5782 assert instance is not None, \
5783 "Cannot retrieve locked instance %s" % self.op.instance_name
5785 if len(self.op.nodes) != len(instance.all_nodes):
5786 raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
5787 " %d replacement nodes were specified" %
5788 (instance.name, len(instance.all_nodes),
5789 len(self.op.nodes)),
5791 assert instance.disk_template != constants.DT_DRBD8 or \
5792 len(self.op.nodes) == 2
5793 assert instance.disk_template != constants.DT_PLAIN or \
5794 len(self.op.nodes) == 1
5795 primary_node = self.op.nodes[0]
5797 primary_node = instance.primary_node
5798 _CheckNodeOnline(self, primary_node)
5800 if instance.disk_template == constants.DT_DISKLESS:
5801 raise errors.OpPrereqError("Instance '%s' has no disks" %
5802 self.op.instance_name, errors.ECODE_INVAL)
5803 # if we replace nodes *and* the old primary is offline, we don't
5805 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
5806 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
5807 if not (self.op.nodes and old_pnode.offline):
5808 _CheckInstanceDown(self, instance, "cannot recreate disks")
5810 if not self.op.disks:
5811 self.op.disks = range(len(instance.disks))
5813 for idx in self.op.disks:
5814 if idx >= len(instance.disks):
5815 raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
5817 if self.op.disks != range(len(instance.disks)) and self.op.nodes:
5818 raise errors.OpPrereqError("Can't recreate disks partially and"
5819 " change the nodes at the same time",
5821 self.instance = instance
5823 def Exec(self, feedback_fn):
5824 """Recreate the disks.
5827 # change primary node, if needed
5829 self.instance.primary_node = self.op.nodes[0]
5830 self.LogWarning("Changing the instance's nodes, you will have to"
5831 " remove any disks left on the older nodes manually")
5834 for idx, disk in enumerate(self.instance.disks):
5835 if idx not in self.op.disks: # disk idx has not been passed in
5838 # update secondaries for disks, if needed
5840 if disk.dev_type == constants.LD_DRBD8:
5841 # need to update the nodes
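# (for DRBD8 the first two logical_id entries are the two node names,
# followed by port, minors and the shared secret, so only indices 0 and 1
# have to be rewritten)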
5842 assert len(self.op.nodes) == 2
5843 logical_id = list(disk.logical_id)
5844 logical_id[0] = self.op.nodes[0]
5845 logical_id[1] = self.op.nodes[1]
5846 disk.logical_id = tuple(logical_id)
5849 self.cfg.Update(self.instance, feedback_fn)
5851 _CreateDisks(self, self.instance, to_skip=to_skip)
5854 class LUInstanceRename(LogicalUnit):
5855 """Rename an instance.
5858 HPATH = "instance-rename"
5859 HTYPE = constants.HTYPE_INSTANCE
5861 def CheckArguments(self):
5865 if self.op.ip_check and not self.op.name_check:
5866 # TODO: make the ip check more flexible and not depend on the name check
5867 raise errors.OpPrereqError("IP address check requires a name check",
5870 def BuildHooksEnv(self):
5873 This runs on master, primary and secondary nodes of the instance.
5876 env = _BuildInstanceHookEnvByObject(self, self.instance)
5877 env["INSTANCE_NEW_NAME"] = self.op.new_name
5880 def BuildHooksNodes(self):
5881 """Build hooks nodes.
5884 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5887 def CheckPrereq(self):
5888 """Check prerequisites.
5890 This checks that the instance is in the cluster and is not running.
5893 self.op.instance_name = _ExpandInstanceName(self.cfg,
5894 self.op.instance_name)
5895 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5896 assert instance is not None
5897 _CheckNodeOnline(self, instance.primary_node)
5898 _CheckInstanceDown(self, instance, "cannot rename")
5899 self.instance = instance
5901 new_name = self.op.new_name
5902 if self.op.name_check:
5903 hostname = netutils.GetHostname(name=new_name)
5904 if hostname != new_name:
5905 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
5907 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
5908 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
5909 " same as given hostname '%s'") %
5910 (hostname.name, self.op.new_name),
5912 new_name = self.op.new_name = hostname.name
5913 if (self.op.ip_check and
5914 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
5915 raise errors.OpPrereqError("IP %s of instance %s already in use" %
5916 (hostname.ip, new_name),
5917 errors.ECODE_NOTUNIQUE)
5919 instance_list = self.cfg.GetInstanceList()
5920 if new_name in instance_list and new_name != instance.name:
5921 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
5922 new_name, errors.ECODE_EXISTS)
5924 def Exec(self, feedback_fn):
5925 """Rename the instance.
5928 inst = self.instance
5929 old_name = inst.name
5931 rename_file_storage = False
5932 if (inst.disk_template in (constants.DT_FILE, constants.DT_SHARED_FILE) and
5933 self.op.new_name != inst.name):
5934 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5935 rename_file_storage = True
5937 self.cfg.RenameInstance(inst.name, self.op.new_name)
5938 # Change the instance lock. This is definitely safe while we hold the BGL.
5939 # Otherwise the new lock would have to be added in acquired mode.
5941 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
5942 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
5944 # re-read the instance from the configuration after rename
5945 inst = self.cfg.GetInstanceInfo(self.op.new_name)
5947 if rename_file_storage:
5948 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5949 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
5950 old_file_storage_dir,
5951 new_file_storage_dir)
5952 result.Raise("Could not rename on node %s directory '%s' to '%s'"
5953 " (but the instance has been renamed in Ganeti)" %
5954 (inst.primary_node, old_file_storage_dir,
5955 new_file_storage_dir))
5957 _StartInstanceDisks(self, inst, None)
5959 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
5960 old_name, self.op.debug_level)
5961 msg = result.fail_msg
5963 msg = ("Could not run OS rename script for instance %s on node %s"
5964 " (but the instance has been renamed in Ganeti): %s" %
5965 (inst.name, inst.primary_node, msg))
5966 self.proc.LogWarning(msg)
5968 _ShutdownInstanceDisks(self, inst)
5973 class LUInstanceRemove(LogicalUnit):
5974 """Remove an instance.
5977 HPATH = "instance-remove"
5978 HTYPE = constants.HTYPE_INSTANCE
5981 def ExpandNames(self):
5982 self._ExpandAndLockInstance()
5983 self.needed_locks[locking.LEVEL_NODE] = []
5984 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5986 def DeclareLocks(self, level):
5987 if level == locking.LEVEL_NODE:
5988 self._LockInstancesNodes()
5990 def BuildHooksEnv(self):
5993 This runs on master, primary and secondary nodes of the instance.
5996 env = _BuildInstanceHookEnvByObject(self, self.instance)
5997 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
6000 def BuildHooksNodes(self):
6001 """Build hooks nodes.
6004 nl = [self.cfg.GetMasterNode()]
6005 nl_post = list(self.instance.all_nodes) + nl
6006 return (nl, nl_post)
6008 def CheckPrereq(self):
6009 """Check prerequisites.
6011 This checks that the instance is in the cluster.
6014 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6015 assert self.instance is not None, \
6016 "Cannot retrieve locked instance %s" % self.op.instance_name
6018 def Exec(self, feedback_fn):
6019 """Remove the instance.
6022 instance = self.instance
6023 logging.info("Shutting down instance %s on node %s",
6024 instance.name, instance.primary_node)
6026 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6027 self.op.shutdown_timeout)
6028 msg = result.fail_msg
6030 if self.op.ignore_failures:
6031 feedback_fn("Warning: can't shutdown instance: %s" % msg)
6033 raise errors.OpExecError("Could not shutdown instance %s on"
6035 (instance.name, instance.primary_node, msg))
6037 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6040 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6041 """Utility function to remove an instance.
6044 logging.info("Removing block devices for instance %s", instance.name)
6046 if not _RemoveDisks(lu, instance):
6047 if not ignore_failures:
6048 raise errors.OpExecError("Can't remove instance's disks")
6049 feedback_fn("Warning: can't remove instance's disks")
6051 logging.info("Removing instance %s out of cluster config", instance.name)
6053 lu.cfg.RemoveInstance(instance.name)
6055 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6056 "Instance lock removal conflict"
6058 # Remove lock for the instance
6059 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6062 class LUInstanceQuery(NoHooksLU):
6063 """Logical unit for querying instances.
6066 # pylint: disable-msg=W0142
6069 def CheckArguments(self):
6070 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6071 self.op.output_fields, self.op.use_locking)
6073 def ExpandNames(self):
6074 self.iq.ExpandNames(self)
6076 def DeclareLocks(self, level):
6077 self.iq.DeclareLocks(self, level)
6079 def Exec(self, feedback_fn):
6080 return self.iq.OldStyleQuery(self)
6083 class LUInstanceFailover(LogicalUnit):
6084 """Failover an instance.
6087 HPATH = "instance-failover"
6088 HTYPE = constants.HTYPE_INSTANCE
6091 def CheckArguments(self):
6092 """Check the arguments.
6095 self.iallocator = getattr(self.op, "iallocator", None)
6096 self.target_node = getattr(self.op, "target_node", None)
6098 def ExpandNames(self):
6099 self._ExpandAndLockInstance()
6101 if self.op.target_node is not None:
6102 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6104 self.needed_locks[locking.LEVEL_NODE] = []
6105 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6107 ignore_consistency = self.op.ignore_consistency
6108 shutdown_timeout = self.op.shutdown_timeout
6109 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6112 ignore_consistency=ignore_consistency,
6113 shutdown_timeout=shutdown_timeout)
6114 self.tasklets = [self._migrater]
6116 def DeclareLocks(self, level):
6117 if level == locking.LEVEL_NODE:
6118 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6119 if instance.disk_template in constants.DTS_EXT_MIRROR:
6120 if self.op.target_node is None:
6121 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6123 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6124 self.op.target_node]
6125 del self.recalculate_locks[locking.LEVEL_NODE]
6127 self._LockInstancesNodes()
6129 def BuildHooksEnv(self):
6132 This runs on master, primary and secondary nodes of the instance.
6135 instance = self._migrater.instance
6136 source_node = instance.primary_node
6137 target_node = self.op.target_node
6139 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
6140 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6141 "OLD_PRIMARY": source_node,
6142 "NEW_PRIMARY": target_node,
6145 if instance.disk_template in constants.DTS_INT_MIRROR:
6146 env["OLD_SECONDARY"] = instance.secondary_nodes[0]
6147 env["NEW_SECONDARY"] = source_node
6149 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""
6151 env.update(_BuildInstanceHookEnvByObject(self, instance))
6155 def BuildHooksNodes(self):
6156 """Build hooks nodes.
6159 instance = self._migrater.instance
6160 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6161 return (nl, nl + [instance.primary_node])
6164 class LUInstanceMigrate(LogicalUnit):
6165 """Migrate an instance.
6167 This is migration without shutting down, compared to the failover,
6168 which is done with shutdown.
6171 HPATH = "instance-migrate"
6172 HTYPE = constants.HTYPE_INSTANCE
6175 def ExpandNames(self):
6176 self._ExpandAndLockInstance()
6178 if self.op.target_node is not None:
6179 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6181 self.needed_locks[locking.LEVEL_NODE] = []
6182 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6184 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6185 cleanup=self.op.cleanup,
6187 fallback=self.op.allow_failover)
6188 self.tasklets = [self._migrater]
6190 def DeclareLocks(self, level):
6191 if level == locking.LEVEL_NODE:
6192 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6193 if instance.disk_template in constants.DTS_EXT_MIRROR:
6194 if self.op.target_node is None:
6195 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6197 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6198 self.op.target_node]
6199 del self.recalculate_locks[locking.LEVEL_NODE]
6201 self._LockInstancesNodes()
6203 def BuildHooksEnv(self):
6206 This runs on master, primary and secondary nodes of the instance.
6209 instance = self._migrater.instance
6210 source_node = instance.primary_node
6211 target_node = self.op.target_node
6212 env = _BuildInstanceHookEnvByObject(self, instance)
6214 "MIGRATE_LIVE": self._migrater.live,
6215 "MIGRATE_CLEANUP": self.op.cleanup,
6216 "OLD_PRIMARY": source_node,
6217 "NEW_PRIMARY": target_node,
6220 if instance.disk_template in constants.DTS_INT_MIRROR:
6221 env["OLD_SECONDARY"] = target_node
6222 env["NEW_SECONDARY"] = source_node
6224 env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None
6228 def BuildHooksNodes(self):
6229 """Build hooks nodes.
6232 instance = self._migrater.instance
6233 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6234 return (nl, nl + [instance.primary_node])
6237 class LUInstanceMove(LogicalUnit):
6238 """Move an instance by data-copying.
6241 HPATH = "instance-move"
6242 HTYPE = constants.HTYPE_INSTANCE
6245 def ExpandNames(self):
6246 self._ExpandAndLockInstance()
6247 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6248 self.op.target_node = target_node
6249 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6250 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6252 def DeclareLocks(self, level):
6253 if level == locking.LEVEL_NODE:
6254 self._LockInstancesNodes(primary_only=True)
6256 def BuildHooksEnv(self):
6259 This runs on master, primary and secondary nodes of the instance.
6263 "TARGET_NODE": self.op.target_node,
6264 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
6266 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
6269 def BuildHooksNodes(self):
6270 """Build hooks nodes.
6274 self.cfg.GetMasterNode(),
6275 self.instance.primary_node,
6276 self.op.target_node,
6280 def CheckPrereq(self):
6281 """Check prerequisites.
6283 This checks that the instance is in the cluster.
6286 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6287 assert self.instance is not None, \
6288 "Cannot retrieve locked instance %s" % self.op.instance_name
6290 node = self.cfg.GetNodeInfo(self.op.target_node)
6291 assert node is not None, \
6292 "Cannot retrieve locked node %s" % self.op.target_node
6294 self.target_node = target_node = node.name
6296 if target_node == instance.primary_node:
6297 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6298 (instance.name, target_node),
6301 bep = self.cfg.GetClusterInfo().FillBE(instance)
6303 for idx, dsk in enumerate(instance.disks):
6304 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6305 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6306 " cannot copy" % idx, errors.ECODE_STATE)
6308 _CheckNodeOnline(self, target_node)
6309 _CheckNodeNotDrained(self, target_node)
6310 _CheckNodeVmCapable(self, target_node)
6312 if instance.admin_up:
6313 # check memory requirements on the secondary node
6314 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6315 instance.name, bep[constants.BE_MEMORY],
6316 instance.hypervisor)
6318 self.LogInfo("Not checking memory on the secondary node as"
6319 " instance will not be started")
6321 # check bridge existence
6322 _CheckInstanceBridgesExist(self, instance, node=target_node)
6324 def Exec(self, feedback_fn):
6325 """Move an instance.
6327 The move is done by shutting it down on its present node, copying
6328 the data over (slow) and starting it on the new node.
6331 instance = self.instance
6333 source_node = instance.primary_node
6334 target_node = self.target_node
6336 self.LogInfo("Shutting down instance %s on source node %s",
6337 instance.name, source_node)
6339 result = self.rpc.call_instance_shutdown(source_node, instance,
6340 self.op.shutdown_timeout)
6341 msg = result.fail_msg
6343 if self.op.ignore_consistency:
6344 self.proc.LogWarning("Could not shutdown instance %s on node %s."
6345 " Proceeding anyway. Please make sure node"
6346 " %s is down. Error details: %s",
6347 instance.name, source_node, source_node, msg)
6349 raise errors.OpExecError("Could not shutdown instance %s on"
6351 (instance.name, source_node, msg))
6353 # create the target disks
6355 _CreateDisks(self, instance, target_node=target_node)
6356 except errors.OpExecError:
6357 self.LogWarning("Device creation failed, reverting...")
6359 _RemoveDisks(self, instance, target_node=target_node)
6361 self.cfg.ReleaseDRBDMinors(instance.name)
6364 cluster_name = self.cfg.GetClusterInfo().cluster_name
6367 # activate, get path, copy the data over
6368 for idx, disk in enumerate(instance.disks):
6369 self.LogInfo("Copying data for disk %d", idx)
6370 result = self.rpc.call_blockdev_assemble(target_node, disk,
6371 instance.name, True, idx)
6373 self.LogWarning("Can't assemble newly created disk %d: %s",
6374 idx, result.fail_msg)
6375 errs.append(result.fail_msg)
6377 dev_path = result.payload
6378 result = self.rpc.call_blockdev_export(source_node, disk,
6379 target_node, dev_path,
6382 self.LogWarning("Can't copy data over for disk %d: %s",
6383 idx, result.fail_msg)
6384 errs.append(result.fail_msg)
6388 self.LogWarning("Some disks failed to copy, aborting")
6390 _RemoveDisks(self, instance, target_node=target_node)
6392 self.cfg.ReleaseDRBDMinors(instance.name)
6393 raise errors.OpExecError("Errors during disk copy: %s" %
6396 instance.primary_node = target_node
6397 self.cfg.Update(instance, feedback_fn)
6399 self.LogInfo("Removing the disks on the original node")
6400 _RemoveDisks(self, instance, target_node=source_node)
6402 # Only start the instance if it's marked as up
6403 if instance.admin_up:
6404 self.LogInfo("Starting instance %s on node %s",
6405 instance.name, target_node)
6407 disks_ok, _ = _AssembleInstanceDisks(self, instance,
6408 ignore_secondaries=True)
6410 _ShutdownInstanceDisks(self, instance)
6411 raise errors.OpExecError("Can't activate the instance's disks")
6413 result = self.rpc.call_instance_start(target_node, instance, None, None)
6414 msg = result.fail_msg
6416 _ShutdownInstanceDisks(self, instance)
6417 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
6418 (instance.name, target_node, msg))
6421 class LUNodeMigrate(LogicalUnit):
6422 """Migrate all instances from a node.
6425 HPATH = "node-migrate"
6426 HTYPE = constants.HTYPE_NODE
6429 def CheckArguments(self):
6430 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
6432 def ExpandNames(self):
6433 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
6435 self.needed_locks = {}
6437 # Create tasklets for migrating instances for all instances on this node
6441 self.lock_all_nodes = False
6443 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
6444 logging.debug("Migrating instance %s", inst.name)
6445 names.append(inst.name)
6447 tasklets.append(TLMigrateInstance(self, inst.name, cleanup=False))
6449 if inst.disk_template in constants.DTS_EXT_MIRROR:
6450 # We need to lock all nodes, as the iallocator will choose the
6451 # destination nodes afterwards
6452 self.lock_all_nodes = True
6454 self.tasklets = tasklets
6456 # Declare node locks
6457 if self.lock_all_nodes:
6458 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6460 self.needed_locks[locking.LEVEL_NODE] = [self.op.node_name]
6461 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6463 # Declare instance locks
6464 self.needed_locks[locking.LEVEL_INSTANCE] = names
6466 def DeclareLocks(self, level):
6467 if level == locking.LEVEL_NODE and not self.lock_all_nodes:
6468 self._LockInstancesNodes()
6470 def BuildHooksEnv(self):
6473 This runs on the master, the primary and all the secondaries.
6477 "NODE_NAME": self.op.node_name,
6480 def BuildHooksNodes(self):
6481 """Build hooks nodes.
6484 nl = [self.cfg.GetMasterNode()]
6488 class TLMigrateInstance(Tasklet):
6489 """Tasklet class for instance migration.
6492 @ivar live: whether the migration will be done live or non-live;
6493 this variable is initialized only after CheckPrereq has run
6494 @type cleanup: boolean
6495 @ivar cleanup: Whether we are cleaning up after a failed migration
6496 @type iallocator: string
6497 @ivar iallocator: The iallocator used to determine target_node
6498 @type target_node: string
6499 @ivar target_node: If given, the target_node to reallocate the instance to
6500 @type failover: boolean
6501 @ivar failover: Whether operation results in failover or migration
6502 @type fallback: boolean
6503 @ivar fallback: Whether fallback to failover is allowed if migration not
6505 @type ignore_consistency: boolean
6506 @ivar ignore_consistency: Whether we should ignore consistency between source
6508 @type shutdown_timeout: int
6509 @ivar shutdown_timeout: In case of failover, the timeout used for the shutdown
6512 def __init__(self, lu, instance_name, cleanup=False,
6513 failover=False, fallback=False,
6514 ignore_consistency=False,
6515 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
6516 """Initializes this class.
6519 Tasklet.__init__(self, lu)
6522 self.instance_name = instance_name
6523 self.cleanup = cleanup
6524 self.live = False # will be overridden later
6525 self.failover = failover
6526 self.fallback = fallback
6527 self.ignore_consistency = ignore_consistency
6528 self.shutdown_timeout = shutdown_timeout
6530 def CheckPrereq(self):
6531 """Check prerequisites.
6533 This checks that the instance is in the cluster.
6536 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
6537 instance = self.cfg.GetInstanceInfo(instance_name)
6538 assert instance is not None
6539 self.instance = instance
6541 if (not self.cleanup and not instance.admin_up and not self.failover and
6543 self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
6545 self.failover = True
6547 if instance.disk_template not in constants.DTS_MIRRORED:
6552 raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
6553 " %s" % (instance.disk_template, text),
6556 if instance.disk_template in constants.DTS_EXT_MIRROR:
6557 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
6559 if self.lu.op.iallocator:
6560 self._RunAllocator()
6562 # We set self.target_node as it is required by
6564 self.target_node = self.lu.op.target_node
6566 # self.target_node is already populated, either directly or by the
6568 target_node = self.target_node
6569 if self.target_node == instance.primary_node:
6570 raise errors.OpPrereqError("Cannot migrate instance %s"
6571 " to its primary (%s)" %
6572 (instance.name, instance.primary_node))
6574 if len(self.lu.tasklets) == 1:
6575 # It is safe to release locks only when we're the only tasklet
6577 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
6578 keep=[instance.primary_node, self.target_node])
6581 secondary_nodes = instance.secondary_nodes
6582 if not secondary_nodes:
6583 raise errors.ConfigurationError("No secondary node but using"
6584 " %s disk template" %
6585 instance.disk_template)
6586 target_node = secondary_nodes[0]
6587 if self.lu.op.iallocator or (self.lu.op.target_node and
6588 self.lu.op.target_node != target_node):
6590 text = "failed over"
6593 raise errors.OpPrereqError("Instances with disk template %s cannot"
6594 " be %s to arbitrary nodes"
6595 " (neither an iallocator nor a target"
6596 " node can be passed)" %
6597 (instance.disk_template, text),
6600 i_be = self.cfg.GetClusterInfo().FillBE(instance)
6602 # check memory requirements on the secondary node
6603 if not self.failover or instance.admin_up:
6604 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
6605 instance.name, i_be[constants.BE_MEMORY],
6606 instance.hypervisor)
6608 self.lu.LogInfo("Not checking memory on the secondary node as"
6609 " instance will not be started")
6611 # check bridge existence
6612 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
6614 if not self.cleanup:
6615 _CheckNodeNotDrained(self.lu, target_node)
6616 if not self.failover:
6617 result = self.rpc.call_instance_migratable(instance.primary_node,
6619 if result.fail_msg and self.fallback:
6620 self.lu.LogInfo("Can't migrate, instance offline, fallback to"
6622 self.failover = True
6624 result.Raise("Can't migrate, please use failover",
6625 prereq=True, ecode=errors.ECODE_STATE)
6627 assert not (self.failover and self.cleanup)
6629 if not self.failover:
6630 if self.lu.op.live is not None and self.lu.op.mode is not None:
6631 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
6632 " parameters are accepted",
6634 if self.lu.op.live is not None:
6636 self.lu.op.mode = constants.HT_MIGRATION_LIVE
6638 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
6639 # reset the 'live' parameter to None so that repeated
6640 # invocations of CheckPrereq do not raise an exception
6641 self.lu.op.live = None
6642 elif self.lu.op.mode is None:
6643 # read the default value from the hypervisor
6644 i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
6646 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
6648 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
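# In short: an explicit 'live' boolean is translated into a migration mode
# (and then cleared), an explicit 'mode' is used as given, and if neither is
# set the hypervisor's HV_MIGRATION_MODE default decides.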
6650 # Failover is never live
6653 def _RunAllocator(self):
6654 """Run the allocator based on input opcode.
6657 ial = IAllocator(self.cfg, self.rpc,
6658 mode=constants.IALLOCATOR_MODE_RELOC,
6659 name=self.instance_name,
6660 # TODO See why hail breaks with a single node below
6661 relocate_from=[self.instance.primary_node,
6662 self.instance.primary_node],
6665 ial.Run(self.lu.op.iallocator)
6668 raise errors.OpPrereqError("Can't compute nodes using"
6669 " iallocator '%s': %s" %
6670 (self.lu.op.iallocator, ial.info),
6672 if len(ial.result) != ial.required_nodes:
6673 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
6674 " of nodes (%s), required %s" %
6675 (self.lu.op.iallocator, len(ial.result),
6676 ial.required_nodes), errors.ECODE_FAULT)
6677 self.target_node = ial.result[0]
6678 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
6679 self.instance_name, self.lu.op.iallocator,
6680 utils.CommaJoin(ial.result))
6682 def _WaitUntilSync(self):
6683 """Poll with custom rpc for disk sync.
6685 This uses our own step-based rpc call.
6688 self.feedback_fn("* wait until resync is done")
6692 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
6694 self.instance.disks)
6696 for node, nres in result.items():
6697 nres.Raise("Cannot resync disks on node %s" % node)
6698 node_done, node_percent = nres.payload
6699 all_done = all_done and node_done
6700 if node_percent is not None:
6701 min_percent = min(min_percent, node_percent)
6703 if min_percent < 100:
6704 self.feedback_fn(" - progress: %.1f%%" % min_percent)
6707 def _EnsureSecondary(self, node):
6708 """Demote a node to secondary.
6711 self.feedback_fn("* switching node %s to secondary mode" % node)
6713 for dev in self.instance.disks:
6714 self.cfg.SetDiskID(dev, node)
6716 result = self.rpc.call_blockdev_close(node, self.instance.name,
6717 self.instance.disks)
6718 result.Raise("Cannot change disk to secondary on node %s" % node)
6720 def _GoStandalone(self):
6721 """Disconnect from the network.
6724 self.feedback_fn("* changing into standalone mode")
6725 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
6726 self.instance.disks)
6727 for node, nres in result.items():
6728 nres.Raise("Cannot disconnect disks on node %s" % node)
6730 def _GoReconnect(self, multimaster):
6731 """Reconnect to the network.
6737 msg = "single-master"
6738 self.feedback_fn("* changing disks into %s mode" % msg)
6739 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
6740 self.instance.disks,
6741 self.instance.name, multimaster)
6742 for node, nres in result.items():
6743 nres.Raise("Cannot change disks config on node %s" % node)
6745 def _ExecCleanup(self):
6746 """Try to cleanup after a failed migration.
6748 The cleanup is done by:
6749 - check that the instance is running only on one node
6750 (and update the config if needed)
6751 - change disks on its secondary node to secondary
6752 - wait until disks are fully synchronized
6753 - disconnect from the network
6754 - change disks into single-master mode
6755 - wait again until disks are fully synchronized
6758 instance = self.instance
6759 target_node = self.target_node
6760 source_node = self.source_node
6762 # check running on only one node
6763 self.feedback_fn("* checking where the instance actually runs"
6764 " (if this hangs, the hypervisor might be in"
6766 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
6767 for node, result in ins_l.items():
6768 result.Raise("Can't contact node %s" % node)
6770 runningon_source = instance.name in ins_l[source_node].payload
6771 runningon_target = instance.name in ins_l[target_node].payload
6773 if runningon_source and runningon_target:
6774 raise errors.OpExecError("Instance seems to be running on two nodes,"
6775 " or the hypervisor is confused; you will have"
6776 " to ensure manually that it runs only on one"
6777 " and restart this operation")
6779 if not (runningon_source or runningon_target):
6780 raise errors.OpExecError("Instance does not seem to be running at all;"
6781 " in this case it's safer to repair by"
6782 " running 'gnt-instance stop' to ensure disk"
6783 " shutdown, and then restarting it")
6785 if runningon_target:
6786 # the migration has actually succeeded, we need to update the config
6787 self.feedback_fn("* instance running on secondary node (%s),"
6788 " updating config" % target_node)
6789 instance.primary_node = target_node
6790 self.cfg.Update(instance, self.feedback_fn)
6791 demoted_node = source_node
6793 self.feedback_fn("* instance confirmed to be running on its"
6794 " primary node (%s)" % source_node)
6795 demoted_node = target_node
6797 if instance.disk_template in constants.DTS_INT_MIRROR:
6798 self._EnsureSecondary(demoted_node)
6800 self._WaitUntilSync()
6801 except errors.OpExecError:
6802 # we ignore errors here, since if the device is standalone, it
6803 # won't be able to sync
6805 self._GoStandalone()
6806 self._GoReconnect(False)
6807 self._WaitUntilSync()
6809 self.feedback_fn("* done")
6811 def _RevertDiskStatus(self):
6812 """Try to revert the disk status after a failed migration.
6815 target_node = self.target_node
6816 if self.instance.disk_template in constants.DTS_EXT_MIRROR:
6820 self._EnsureSecondary(target_node)
6821 self._GoStandalone()
6822 self._GoReconnect(False)
6823 self._WaitUntilSync()
6824 except errors.OpExecError, err:
6825 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
6826 " please try to recover the instance manually;"
6827 " error '%s'" % str(err))
6829 def _AbortMigration(self):
6830 """Call the hypervisor code to abort a started migration.
6833 instance = self.instance
6834 target_node = self.target_node
6835 migration_info = self.migration_info
6837 abort_result = self.rpc.call_finalize_migration(target_node,
6841 abort_msg = abort_result.fail_msg
6843 logging.error("Aborting migration failed on target node %s: %s",
6844 target_node, abort_msg)
6845 # Don't raise an exception here, as we still have to try to revert the
6846 # disk status, even if this step failed.
6848 def _ExecMigration(self):
6849 """Migrate an instance.
6851 The migrate is done by:
6852 - change the disks into dual-master mode
6853 - wait until disks are fully synchronized again
6854 - migrate the instance
6855 - change disks on the new secondary node (the old primary) to secondary
6856 - wait until disks are fully synchronized
6857 - change disks into single-master mode
6860 instance = self.instance
6861 target_node = self.target_node
6862 source_node = self.source_node
6864 self.feedback_fn("* checking disk consistency between source and target")
6865 for dev in instance.disks:
6866 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
6867 raise errors.OpExecError("Disk %s is degraded or not fully"
6868 " synchronized on target node,"
6869 " aborting migration" % dev.iv_name)
6871 # First get the migration information from the remote node
6872 result = self.rpc.call_migration_info(source_node, instance)
6873 msg = result.fail_msg
6875 log_err = ("Failed fetching source migration information from %s: %s" %
6877 logging.error(log_err)
6878 raise errors.OpExecError(log_err)
6880 self.migration_info = migration_info = result.payload
6882 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
6883 # Then switch the disks to master/master mode
6884 self._EnsureSecondary(target_node)
6885 self._GoStandalone()
6886 self._GoReconnect(True)
6887 self._WaitUntilSync()
6889 self.feedback_fn("* preparing %s to accept the instance" % target_node)
6890 result = self.rpc.call_accept_instance(target_node,
6893 self.nodes_ip[target_node])
6895 msg = result.fail_msg
6897 logging.error("Instance pre-migration failed, trying to revert"
6898 " disk status: %s", msg)
6899 self.feedback_fn("Pre-migration failed, aborting")
6900 self._AbortMigration()
6901 self._RevertDiskStatus()
6902 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
6903 (instance.name, msg))
6905 self.feedback_fn("* migrating instance to %s" % target_node)
6906 result = self.rpc.call_instance_migrate(source_node, instance,
6907 self.nodes_ip[target_node],
6909 msg = result.fail_msg
6911 logging.error("Instance migration failed, trying to revert"
6912 " disk status: %s", msg)
6913 self.feedback_fn("Migration failed, aborting")
6914 self._AbortMigration()
6915 self._RevertDiskStatus()
6916 raise errors.OpExecError("Could not migrate instance %s: %s" %
6917 (instance.name, msg))
6919 instance.primary_node = target_node
6920 # distribute new instance config to the other nodes
6921 self.cfg.Update(instance, self.feedback_fn)
6923 result = self.rpc.call_finalize_migration(target_node,
6927 msg = result.fail_msg
6929 logging.error("Instance migration succeeded, but finalization failed:"
6931 raise errors.OpExecError("Could not finalize instance migration: %s" %
6934 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
6935 self._EnsureSecondary(source_node)
6936 self._WaitUntilSync()
6937 self._GoStandalone()
6938 self._GoReconnect(False)
6939 self._WaitUntilSync()
6941 self.feedback_fn("* done")
6943 def _ExecFailover(self):
6944 """Failover an instance.
6946 The failover is done by shutting it down on its present node and
6947 starting it on the secondary.
6950 instance = self.instance
6951 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
6953 source_node = instance.primary_node
6954 target_node = self.target_node
6956 if instance.admin_up:
6957 self.feedback_fn("* checking disk consistency between source and target")
6958 for dev in instance.disks:
6959 # for drbd, these are drbd over lvm
6960 if not _CheckDiskConsistency(self, dev, target_node, False):
6961 if not self.ignore_consistency:
6962 raise errors.OpExecError("Disk %s is degraded on target node,"
6963 " aborting failover" % dev.iv_name)
6965 self.feedback_fn("* not checking disk consistency as instance is not"
6968 self.feedback_fn("* shutting down instance on source node")
6969 logging.info("Shutting down instance %s on node %s",
6970 instance.name, source_node)
6972 result = self.rpc.call_instance_shutdown(source_node, instance,
6973 self.shutdown_timeout)
6974 msg = result.fail_msg
6976 if self.ignore_consistency or primary_node.offline:
6977 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
6978 " proceeding anyway; please make sure node"
6979 " %s is down; error details: %s",
6980 instance.name, source_node, source_node, msg)
6982 raise errors.OpExecError("Could not shutdown instance %s on"
6984 (instance.name, source_node, msg))
6986 self.feedback_fn("* deactivating the instance's disks on source node")
6987 if not _ShutdownInstanceDisks(self, instance, ignore_primary=True):
6988 raise errors.OpExecError("Can't shut down the instance's disks.")
6990 instance.primary_node = target_node
6991 # distribute new instance config to the other nodes
6992 self.cfg.Update(instance, self.feedback_fn)
6994 # Only start the instance if it's marked as up
6995 if instance.admin_up:
6996 self.feedback_fn("* activating the instance's disks on target node")
6997 logging.info("Starting instance %s on node %s",
6998 instance.name, target_node)
7000 disks_ok, _ = _AssembleInstanceDisks(self, instance,
7001 ignore_secondaries=True)
7003 _ShutdownInstanceDisks(self, instance)
7004 raise errors.OpExecError("Can't activate the instance's disks")
7006 self.feedback_fn("* starting the instance on the target node")
7007 result = self.rpc.call_instance_start(target_node, instance, None, None)
7008 msg = result.fail_msg
7010 _ShutdownInstanceDisks(self, instance)
7011 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
7012 (instance.name, target_node, msg))
7014 def Exec(self, feedback_fn):
7015 """Perform the migration.
7018 self.feedback_fn = feedback_fn
7019 self.source_node = self.instance.primary_node
7021 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7022 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7023 self.target_node = self.instance.secondary_nodes[0]
7024 # Otherwise self.target_node has been populated either
7025 # directly, or through an iallocator.
7027 self.all_nodes = [self.source_node, self.target_node]
7029 self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
7030 self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
7034 feedback_fn("Failover instance %s" % self.instance.name)
7035 self._ExecFailover()
7037 feedback_fn("Migrating instance %s" % self.instance.name)
7040 return self._ExecCleanup()
7042 return self._ExecMigration()
7045 def _CreateBlockDev(lu, node, instance, device, force_create,
7047 """Create a tree of block devices on a given node.
7049 If this device type has to be created on secondaries, create it and
7052 If not, just recurse to children keeping the same 'force' value.
7054 @param lu: the lu on whose behalf we execute
7055 @param node: the node on which to create the device
7056 @type instance: L{objects.Instance}
7057 @param instance: the instance which owns the device
7058 @type device: L{objects.Disk}
7059 @param device: the device to create
7060 @type force_create: boolean
7061 @param force_create: whether to force creation of this device; this
7062 will be changed to True whenever we find a device which has the
7063 CreateOnSecondary() attribute set
7064 @param info: the extra 'metadata' we should attach to the device
7065 (this will be represented as a LVM tag)
7066 @type force_open: boolean
7067 @param force_open: this parameter will be passed to the
7068 L{backend.BlockdevCreate} function where it specifies
7069 whether we run on primary or not, and it affects both
7070 the child assembly and the device's own Open() execution
7073 if device.CreateOnSecondary():
7077 for child in device.children:
7078 _CreateBlockDev(lu, node, instance, child, force_create,
7081 if not force_create:
7084 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
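# Illustrative sketch (not part of the module) of how the recursion above is
# typically driven, mirroring the per-node loop in _CreateDisks below; the
# node names, VG name and LV names are assumptions for the example only:
#
#   drbd_dev = _GenerateDRBD8Branch(lu, "node1.example.com",
#                                   "node2.example.com", 1024,
#                                   ["xenvg", "xenvg"],
#                                   ["uuid.disk0_data", "uuid.disk0_meta"],
#                                   "disk/0", 0, 1)
#   info = _GetInstanceInfoText(instance)
#   for node in ("node1.example.com", "node2.example.com"):
#     f_create = node == "node1.example.com"  # True only on the primary
#     _CreateBlockDev(lu, node, instance, drbd_dev, f_create, info, f_create)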
7087 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7088 """Create a single block device on a given node.
7090 This will not recurse over children of the device, so they must be
7093 @param lu: the lu on whose behalf we execute
7094 @param node: the node on which to create the device
7095 @type instance: L{objects.Instance}
7096 @param instance: the instance which owns the device
7097 @type device: L{objects.Disk}
7098 @param device: the device to create
7099 @param info: the extra 'metadata' we should attach to the device
7100 (this will be represented as a LVM tag)
7101 @type force_open: boolean
7102 @param force_open: this parameter will be passed to the
7103 L{backend.BlockdevCreate} function where it specifies
7104 whether we run on primary or not, and it affects both
7105 the child assembly and the device's own Open() execution
7108 lu.cfg.SetDiskID(device, node)
7109 result = lu.rpc.call_blockdev_create(node, device, device.size,
7110 instance.name, force_open, info)
7111 result.Raise("Can't create block device %s on"
7112 " node %s for instance %s" % (device, node, instance.name))
7113 if device.physical_id is None:
7114 device.physical_id = result.payload
7117 def _GenerateUniqueNames(lu, exts):
7118 """Generate a suitable LV name.
7120 This will generate one logical volume name for each of the given extensions.
7125 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
7126 results.append("%s%s" % (new_id, val))
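# Illustrative example (the UUIDs are generated at execution time, the
# values below are made up):
#   _GenerateUniqueNames(lu, [".disk0", ".disk1"])
#   -> ["d2a49a61-....disk0", "9f2ab6c1-....disk1"]
# The DRBD8 template later derives "<name>_data" and "<name>_meta" LV names
# from these prefixes.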
7130 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7131 iv_name, p_minor, s_minor):
7132 """Generate a drbd8 device complete with its children.
7135 assert len(vgnames) == len(names) == 2
7136 port = lu.cfg.AllocatePort()
7137 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7138 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7139 logical_id=(vgnames[0], names[0]))
7140 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
7141 logical_id=(vgnames[1], names[1]))
7142 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
7143 logical_id=(primary, secondary, port,
7146 children=[dev_data, dev_meta],
7151 def _GenerateDiskTemplate(lu, template_name,
7152 instance_name, primary_node,
7153 secondary_nodes, disk_info,
7154 file_storage_dir, file_driver,
7155 base_index, feedback_fn):
7156 """Generate the entire disk layout for a given template type.
7159 #TODO: compute space requirements
7161 vgname = lu.cfg.GetVGName()
7162 disk_count = len(disk_info)
7164 if template_name == constants.DT_DISKLESS:
7166 elif template_name == constants.DT_PLAIN:
7167 if len(secondary_nodes) != 0:
7168 raise errors.ProgrammerError("Wrong template configuration")
7170 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7171 for i in range(disk_count)])
7172 for idx, disk in enumerate(disk_info):
7173 disk_index = idx + base_index
7174 vg = disk.get(constants.IDISK_VG, vgname)
7175 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7176 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7177 size=disk[constants.IDISK_SIZE],
7178 logical_id=(vg, names[idx]),
7179 iv_name="disk/%d" % disk_index,
7180 mode=disk[constants.IDISK_MODE])
7181 disks.append(disk_dev)
7182 elif template_name == constants.DT_DRBD8:
7183 if len(secondary_nodes) != 1:
7184 raise errors.ProgrammerError("Wrong template configuration")
7185 remote_node = secondary_nodes[0]
7186 minors = lu.cfg.AllocateDRBDMinor(
7187 [primary_node, remote_node] * len(disk_info), instance_name)
7190 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7191 for i in range(disk_count)]):
7192 names.append(lv_prefix + "_data")
7193 names.append(lv_prefix + "_meta")
7194 for idx, disk in enumerate(disk_info):
7195 disk_index = idx + base_index
7196 data_vg = disk.get(constants.IDISK_VG, vgname)
7197 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7198 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7199 disk[constants.IDISK_SIZE],
7201 names[idx * 2:idx * 2 + 2],
7202 "disk/%d" % disk_index,
7203 minors[idx * 2], minors[idx * 2 + 1])
7204 disk_dev.mode = disk[constants.IDISK_MODE]
7205 disks.append(disk_dev)
7206 elif template_name == constants.DT_FILE:
7207 if len(secondary_nodes) != 0:
7208 raise errors.ProgrammerError("Wrong template configuration")
7210 opcodes.RequireFileStorage()
7212 for idx, disk in enumerate(disk_info):
7213 disk_index = idx + base_index
7214 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7215 size=disk[constants.IDISK_SIZE],
7216 iv_name="disk/%d" % disk_index,
7217 logical_id=(file_driver,
7218 "%s/disk%d" % (file_storage_dir,
7220 mode=disk[constants.IDISK_MODE])
7221 disks.append(disk_dev)
7222 elif template_name == constants.DT_SHARED_FILE:
7223 if len(secondary_nodes) != 0:
7224 raise errors.ProgrammerError("Wrong template configuration")
7226 opcodes.RequireSharedFileStorage()
7228 for idx, disk in enumerate(disk_info):
7229 disk_index = idx + base_index
7230 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7231 size=disk[constants.IDISK_SIZE],
7232 iv_name="disk/%d" % disk_index,
7233 logical_id=(file_driver,
7234 "%s/disk%d" % (file_storage_dir,
7236 mode=disk[constants.IDISK_MODE])
7237 disks.append(disk_dev)
7238 elif template_name == constants.DT_BLOCK:
7239 if len(secondary_nodes) != 0:
7240 raise errors.ProgrammerError("Wrong template configuration")
7242 for idx, disk in enumerate(disk_info):
7243 disk_index = idx + base_index
7244 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7245 size=disk[constants.IDISK_SIZE],
7246 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7247 disk[constants.IDISK_ADOPT]),
7248 iv_name="disk/%d" % disk_index,
7249 mode=disk[constants.IDISK_MODE])
7250 disks.append(disk_dev)
7253 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7257 def _GetInstanceInfoText(instance):
7258 """Compute the text that should be added to the disk's metadata.
7261 return "originstname+%s" % instance.name
7264 def _CalcEta(time_taken, written, total_size):
7265 """Calculates the ETA based on size written and total size.
7267 @param time_taken: The time taken so far
7268 @param written: amount written so far
7269 @param total_size: The total size of data to be written
7270 @return: The remaining time in seconds
7273 avg_time = time_taken / float(written)
7274 return (total_size - written) * avg_time
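# Worked example: if 200 MiB out of 800 MiB were written in 50 seconds, the
# average is 0.25 s/MiB, so the ETA is (800 - 200) * 0.25 = 150 seconds.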
7277 def _WipeDisks(lu, instance):
7278 """Wipes instance disks.
7280 @type lu: L{LogicalUnit}
7281 @param lu: the logical unit on whose behalf we execute
7282 @type instance: L{objects.Instance}
7283 @param instance: the instance whose disks we should create
7284 @return: the success of the wipe
7287 node = instance.primary_node
7289 for device in instance.disks:
7290 lu.cfg.SetDiskID(device, node)
7292 logging.info("Pause sync of instance %s disks", instance.name)
7293 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
7295 for idx, success in enumerate(result.payload):
7297 logging.warn("pause-sync of instance %s for disk %d failed",
7301 for idx, device in enumerate(instance.disks):
7302 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk, but
7303 # at most MAX_WIPE_CHUNK
7304 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
7305 constants.MIN_WIPE_CHUNK_PERCENT)
7306 # we _must_ make this an int, otherwise rounding errors will
7308 wipe_chunk_size = int(wipe_chunk_size)
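# Illustration (the constants are cluster-wide; the values used here are
# assumptions for the example only): with MIN_WIPE_CHUNK_PERCENT = 10 and
# MAX_WIPE_CHUNK = 1024 MiB, a 20480 MiB disk is wiped in
# min(1024, 20480 / 100.0 * 10) = 1024 MiB chunks.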
7310 lu.LogInfo("* Wiping disk %d", idx)
7311 logging.info("Wiping disk %d for instance %s, node %s using"
7312 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
7317 start_time = time.time()
7319 while offset < size:
7320 wipe_size = min(wipe_chunk_size, size - offset)
7321 logging.debug("Wiping disk %d, offset %s, chunk %s",
7322 idx, offset, wipe_size)
7323 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
7324 result.Raise("Could not wipe disk %d at offset %d for size %d" %
7325 (idx, offset, wipe_size))
7328 if now - last_output >= 60:
7329 eta = _CalcEta(now - start_time, offset, size)
7330 lu.LogInfo(" - done: %.1f%% ETA: %s" %
7331 (offset / float(size) * 100, utils.FormatSeconds(eta)))
7334 logging.info("Resume sync of instance %s disks", instance.name)
7336 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
7338 for idx, success in enumerate(result.payload):
7340 lu.LogWarning("Resume sync of disk %d failed, please have a"
7341 " look at the status and troubleshoot the issue", idx)
7342 logging.warn("resume-sync of instance %s for disk %d failed",
7346 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
7347 """Create all disks for an instance.
7349 This abstracts away some work from AddInstance.
7351 @type lu: L{LogicalUnit}
7352 @param lu: the logical unit on whose behalf we execute
7353 @type instance: L{objects.Instance}
7354 @param instance: the instance whose disks we should create
7356 @param to_skip: list of indices to skip
7357 @type target_node: string
7358 @param target_node: if passed, overrides the target node for creation
7360 @return: the success of the creation
7363 info = _GetInstanceInfoText(instance)
7364 if target_node is None:
7365 pnode = instance.primary_node
7366 all_nodes = instance.all_nodes
7371 if instance.disk_template in (constants.DT_FILE, constants.DT_SHARED_FILE):
7372 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
7373 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
7375 result.Raise("Failed to create directory '%s' on"
7376 " node %s" % (file_storage_dir, pnode))
7378 # Note: this needs to be kept in sync with adding of disks in
7379 # LUInstanceSetParams
7380 for idx, device in enumerate(instance.disks):
7381 if to_skip and idx in to_skip:
7383 logging.info("Creating volume %s for instance %s",
7384 device.iv_name, instance.name)
7386 for node in all_nodes:
7387 f_create = node == pnode
7388 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
7391 def _RemoveDisks(lu, instance, target_node=None):
7392 """Remove all disks for an instance.
7394 This abstracts away some work from `AddInstance()` and
7395 `RemoveInstance()`. Note that in case some of the devices couldn't
7396 be removed, the removal will continue with the other ones (compare
7397 with `_CreateDisks()`).
7399 @type lu: L{LogicalUnit}
7400 @param lu: the logical unit on whose behalf we execute
7401 @type instance: L{objects.Instance}
7402 @param instance: the instance whose disks we should remove
7403 @type target_node: string
7404 @param target_node: used to override the node on which to remove the disks
7406 @return: the success of the removal
7409 logging.info("Removing block devices for instance %s", instance.name)
7412 for device in instance.disks:
7414 edata = [(target_node, device)]
7416 edata = device.ComputeNodeTree(instance.primary_node)
7417 for node, disk in edata:
7418 lu.cfg.SetDiskID(disk, node)
7419 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
7421 lu.LogWarning("Could not remove block device %s on node %s,"
7422 " continuing anyway: %s", device.iv_name, node, msg)
7425 if instance.disk_template == constants.DT_FILE:
7426 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
7430 tgt = instance.primary_node
7431 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
7433 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
7434 file_storage_dir, instance.primary_node, result.fail_msg)
7440 def _ComputeDiskSizePerVG(disk_template, disks):
7441 """Compute disk size requirements in the volume group
7444 def _compute(disks, payload):
7445 """Universal algorithm.
7450 vgs[disk[constants.IDISK_VG]] = \
7451 vgs.get(disk[constants.IDISK_VG], 0) + disk[constants.IDISK_SIZE] + payload
7455 # Required free disk space as a function of disk and swap space
7457 constants.DT_DISKLESS: {},
7458 constants.DT_PLAIN: _compute(disks, 0),
7459 # 128 MB are added for drbd metadata for each disk
7460 constants.DT_DRBD8: _compute(disks, 128),
7461 constants.DT_FILE: {},
7462 constants.DT_SHARED_FILE: {},
7465 if disk_template not in req_size_dict:
7466 raise errors.ProgrammerError("Disk template '%s' size requirement"
7467 " is unknown" % disk_template)
7469 return req_size_dict[disk_template]
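# Rough usage sketch (the VG name and sizes are made up for the example):
#   disks = [{constants.IDISK_VG: "xenvg", constants.IDISK_SIZE: 1024},
#            {constants.IDISK_VG: "xenvg", constants.IDISK_SIZE: 2048}]
#   _ComputeDiskSizePerVG(constants.DT_DRBD8, disks)
#   -> {"xenvg": 3328}   # (1024 + 128) + (2048 + 128), DRBD metadata included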
7472 def _ComputeDiskSize(disk_template, disks):
7473 """Compute disk size requirements in the volume group
7476 # Required free disk space as a function of disk and swap space
7478 constants.DT_DISKLESS: None,
7479 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
7480 # 128 MB are added for drbd metadata for each disk
7481 constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks),
7482 constants.DT_FILE: None,
7483 constants.DT_SHARED_FILE: 0,
7484 constants.DT_BLOCK: 0,
7487 if disk_template not in req_size_dict:
7488 raise errors.ProgrammerError("Disk template '%s' size requirement"
7489 " is unknown" % disk_template)
7491 return req_size_dict[disk_template]
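# With the same two disks, _ComputeDiskSize gives the flat totals instead:
# DT_PLAIN -> 1024 + 2048 = 3072, DT_DRBD8 -> 3328 (128 MiB of metadata per
# disk), while the file-based and diskless templates need no volume group
# space.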
7494 def _FilterVmNodes(lu, nodenames):
7495 """Filters out non-vm_capable nodes from a list.
7497 @type lu: L{LogicalUnit}
7498 @param lu: the logical unit for which we check
7499 @type nodenames: list
7500 @param nodenames: the list of nodes on which we should check
7502 @return: the list of vm-capable nodes
7505 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
7506 return [name for name in nodenames if name not in vm_nodes]
7509 def _CheckHVParams(lu, nodenames, hvname, hvparams):
7510 """Hypervisor parameter validation.
7512 This function abstracts the hypervisor parameter validation to be
7513 used in both instance create and instance modify.
7515 @type lu: L{LogicalUnit}
7516 @param lu: the logical unit for which we check
7517 @type nodenames: list
7518 @param nodenames: the list of nodes on which we should check
7519 @type hvname: string
7520 @param hvname: the name of the hypervisor we should use
7521 @type hvparams: dict
7522 @param hvparams: the parameters which we need to check
7523 @raise errors.OpPrereqError: if the parameters are not valid
7526 nodenames = _FilterVmNodes(lu, nodenames)
7527 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
7530 for node in nodenames:
7534 info.Raise("Hypervisor parameter validation failed on node %s" % node)
7537 def _CheckOSParams(lu, required, nodenames, osname, osparams):
7538 """OS parameters validation.
7540 @type lu: L{LogicalUnit}
7541 @param lu: the logical unit for which we check
7542 @type required: boolean
7543 @param required: whether the validation should fail if the OS is not
7545 @type nodenames: list
7546 @param nodenames: the list of nodes on which we should check
7547 @type osname: string
7548 @param osname: the name of the OS we should use
7549 @type osparams: dict
7550 @param osparams: the parameters which we need to check
7551 @raise errors.OpPrereqError: if the parameters are not valid
7554 nodenames = _FilterVmNodes(lu, nodenames)
7555 result = lu.rpc.call_os_validate(required, nodenames, osname,
7556 [constants.OS_VALIDATE_PARAMETERS],
7558 for node, nres in result.items():
7559 # we don't check for offline cases since this should be run only
7560 # against the master node and/or an instance's nodes
7561 nres.Raise("OS Parameters validation failed on node %s" % node)
7562 if not nres.payload:
7563 lu.LogInfo("OS %s not found on node %s, validation skipped",
7567 class LUInstanceCreate(LogicalUnit):
7568 """Create an instance.
7571 HPATH = "instance-add"
7572 HTYPE = constants.HTYPE_INSTANCE
7575 def CheckArguments(self):
7579 # do not require name_check to ease forward/backward compatibility
7581 if self.op.no_install and self.op.start:
7582 self.LogInfo("No-installation mode selected, disabling startup")
7583 self.op.start = False
7584 # validate/normalize the instance name
7585 self.op.instance_name = \
7586 netutils.Hostname.GetNormalizedName(self.op.instance_name)
7588 if self.op.ip_check and not self.op.name_check:
7589 # TODO: make the ip check more flexible and not depend on the name check
7590 raise errors.OpPrereqError("Cannot do IP address check without a name"
7591 " check", errors.ECODE_INVAL)
7593 # check nics' parameter names
7594 for nic in self.op.nics:
7595 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
7597 # check disks: parameter names and consistent adopt/no-adopt strategy
7598 has_adopt = has_no_adopt = False
7599 for disk in self.op.disks:
7600 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
7601 if constants.IDISK_ADOPT in disk:
7605 if has_adopt and has_no_adopt:
7606 raise errors.OpPrereqError("Either all disks are adopted or none is",
7609 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
7610 raise errors.OpPrereqError("Disk adoption is not supported for the"
7611 " '%s' disk template" %
7612 self.op.disk_template,
7614 if self.op.iallocator is not None:
7615 raise errors.OpPrereqError("Disk adoption not allowed with an"
7616 " iallocator script", errors.ECODE_INVAL)
7617 if self.op.mode == constants.INSTANCE_IMPORT:
7618 raise errors.OpPrereqError("Disk adoption not allowed for"
7619 " instance import", errors.ECODE_INVAL)
7621 if self.op.disk_template in constants.DTS_MUST_ADOPT:
7622 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
7623 " but no 'adopt' parameter given" %
7624 self.op.disk_template,
7627 self.adopt_disks = has_adopt
7629 # instance name verification
7630 if self.op.name_check:
7631 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
7632 self.op.instance_name = self.hostname1.name
7633 # used in CheckPrereq for ip ping check
7634 self.check_ip = self.hostname1.ip
7636 self.check_ip = None
7638 # file storage checks
7639 if (self.op.file_driver and
7640 not self.op.file_driver in constants.FILE_DRIVER):
7641 raise errors.OpPrereqError("Invalid file driver name '%s'" %
7642 self.op.file_driver, errors.ECODE_INVAL)
7644 if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
7645 raise errors.OpPrereqError("File storage directory path not absolute",
7648 ### Node/iallocator related checks
7649 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
7651 if self.op.pnode is not None:
7652 if self.op.disk_template in constants.DTS_INT_MIRROR:
7653 if self.op.snode is None:
7654 raise errors.OpPrereqError("The networked disk templates need"
7655 " a mirror node", errors.ECODE_INVAL)
7657 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
7659 self.op.snode = None
7661 self._cds = _GetClusterDomainSecret()
7663 if self.op.mode == constants.INSTANCE_IMPORT:
7664 # On import, force_variant must be True, because if we forced a variant at
7665 # the initial install, our only chance when importing it back is that it still works
7667 self.op.force_variant = True
7669 if self.op.no_install:
7670 self.LogInfo("No-installation mode has no effect during import")
7672 elif self.op.mode == constants.INSTANCE_CREATE:
7673 if self.op.os_type is None:
7674 raise errors.OpPrereqError("No guest OS specified",
7676 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
7677 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
7678 " installation" % self.op.os_type,
7680 if self.op.disk_template is None:
7681 raise errors.OpPrereqError("No disk template specified",
7684 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7685 # Check handshake to ensure both clusters have the same domain secret
7686 src_handshake = self.op.source_handshake
7687 if not src_handshake:
7688 raise errors.OpPrereqError("Missing source handshake",
7691 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
7694 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
7697 # Load and check source CA
7698 self.source_x509_ca_pem = self.op.source_x509_ca
7699 if not self.source_x509_ca_pem:
7700 raise errors.OpPrereqError("Missing source X509 CA",
7704 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
7706 except OpenSSL.crypto.Error, err:
7707 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
7708 (err, ), errors.ECODE_INVAL)
7710 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
7711 if errcode is not None:
7712 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
7715 self.source_x509_ca = cert
7717 src_instance_name = self.op.source_instance_name
7718 if not src_instance_name:
7719 raise errors.OpPrereqError("Missing source instance name",
7722 self.source_instance_name = \
7723 netutils.GetHostname(name=src_instance_name).name
7726 raise errors.OpPrereqError("Invalid instance creation mode %r" %
7727 self.op.mode, errors.ECODE_INVAL)
7729 def ExpandNames(self):
7730 """ExpandNames for CreateInstance.
7732 Figure out the right locks for instance creation.
7735 self.needed_locks = {}
7737 instance_name = self.op.instance_name
7738 # this is just a preventive check, but someone might still add this
7739 # instance in the meantime, and creation will fail at lock-add time
7740 if instance_name in self.cfg.GetInstanceList():
7741 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
7742 instance_name, errors.ECODE_EXISTS)
7744 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
7746 if self.op.iallocator:
7747 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7749 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
7750 nodelist = [self.op.pnode]
7751 if self.op.snode is not None:
7752 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
7753 nodelist.append(self.op.snode)
7754 self.needed_locks[locking.LEVEL_NODE] = nodelist
7756 # in case of import lock the source node too
7757 if self.op.mode == constants.INSTANCE_IMPORT:
7758 src_node = self.op.src_node
7759 src_path = self.op.src_path
7761 if src_path is None:
7762 self.op.src_path = src_path = self.op.instance_name
7764 if src_node is None:
7765 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7766 self.op.src_node = None
7767 if os.path.isabs(src_path):
7768 raise errors.OpPrereqError("Importing an instance from an absolute"
7769 " path requires a source node option",
7772 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
7773 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
7774 self.needed_locks[locking.LEVEL_NODE].append(src_node)
7775 if not os.path.isabs(src_path):
7776 self.op.src_path = src_path = \
7777 utils.PathJoin(constants.EXPORT_DIR, src_path)
7779 def _RunAllocator(self):
7780 """Run the allocator based on input opcode.
7783 nics = [n.ToDict() for n in self.nics]
7784 ial = IAllocator(self.cfg, self.rpc,
7785 mode=constants.IALLOCATOR_MODE_ALLOC,
7786 name=self.op.instance_name,
7787 disk_template=self.op.disk_template,
7790 vcpus=self.be_full[constants.BE_VCPUS],
7791 mem_size=self.be_full[constants.BE_MEMORY],
7794 hypervisor=self.op.hypervisor,
7797 ial.Run(self.op.iallocator)
7800 raise errors.OpPrereqError("Can't compute nodes using"
7801 " iallocator '%s': %s" %
7802 (self.op.iallocator, ial.info),
7804 if len(ial.result) != ial.required_nodes:
7805 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7806 " of nodes (%s), required %s" %
7807 (self.op.iallocator, len(ial.result),
7808 ial.required_nodes), errors.ECODE_FAULT)
7809 self.op.pnode = ial.result[0]
7810 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7811 self.op.instance_name, self.op.iallocator,
7812 utils.CommaJoin(ial.result))
7813 if ial.required_nodes == 2:
7814 self.op.snode = ial.result[1]
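# Note: for the mirrored (DRBD) templates the allocator returns two node
# names, e.g. ["node2.example.com", "node3.example.com"] (names made up);
# the first becomes the primary and the second the secondary, as assigned
# just above.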
7816 def BuildHooksEnv(self):
7819 This runs on master, primary and secondary nodes of the instance.
7823 "ADD_MODE": self.op.mode,
7825 if self.op.mode == constants.INSTANCE_IMPORT:
7826 env["SRC_NODE"] = self.op.src_node
7827 env["SRC_PATH"] = self.op.src_path
7828 env["SRC_IMAGES"] = self.src_images
7830 env.update(_BuildInstanceHookEnv(
7831 name=self.op.instance_name,
7832 primary_node=self.op.pnode,
7833 secondary_nodes=self.secondaries,
7834 status=self.op.start,
7835 os_type=self.op.os_type,
7836 memory=self.be_full[constants.BE_MEMORY],
7837 vcpus=self.be_full[constants.BE_VCPUS],
7838 nics=_NICListToTuple(self, self.nics),
7839 disk_template=self.op.disk_template,
7840 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
7841 for d in self.disks],
7844 hypervisor_name=self.op.hypervisor,
7849 def BuildHooksNodes(self):
7850 """Build hooks nodes.
7853 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
7856 def _ReadExportInfo(self):
7857 """Reads the export information from disk.
7859 It will override the opcode source node and path with the actual
7860 information, if these two were not specified before.
7862 @return: the export information
7865 assert self.op.mode == constants.INSTANCE_IMPORT
7867 src_node = self.op.src_node
7868 src_path = self.op.src_path
7870 if src_node is None:
7871 locked_nodes = self.glm.list_owned(locking.LEVEL_NODE)
7872 exp_list = self.rpc.call_export_list(locked_nodes)
7874 for node in exp_list:
7875 if exp_list[node].fail_msg:
7877 if src_path in exp_list[node].payload:
7879 self.op.src_node = src_node = node
7880 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
7884 raise errors.OpPrereqError("No export found for relative path %s" %
7885 src_path, errors.ECODE_INVAL)
7887 _CheckNodeOnline(self, src_node)
7888 result = self.rpc.call_export_info(src_node, src_path)
7889 result.Raise("No export or invalid export found in dir %s" % src_path)
7891 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
7892 if not export_info.has_section(constants.INISECT_EXP):
7893 raise errors.ProgrammerError("Corrupted export config",
7894 errors.ECODE_ENVIRON)
7896 ei_version = export_info.get(constants.INISECT_EXP, "version")
7897 if (int(ei_version) != constants.EXPORT_VERSION):
7898 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
7899 (ei_version, constants.EXPORT_VERSION),
7900 errors.ECODE_ENVIRON)
7903 def _ReadExportParams(self, einfo):
7904 """Use export parameters as defaults.
7906 In case the opcode doesn't specify (i.e. override) some instance
7907 parameters, try to use them from the export information, if it declares them.
7911 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
7913 if self.op.disk_template is None:
7914 if einfo.has_option(constants.INISECT_INS, "disk_template"):
7915 self.op.disk_template = einfo.get(constants.INISECT_INS,
7918 raise errors.OpPrereqError("No disk template specified and the export"
7919 " is missing the disk_template information",
7922 if not self.op.disks:
7923 if einfo.has_option(constants.INISECT_INS, "disk_count"):
7925 # TODO: import the disk iv_name too
7926 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
7927 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
7928 disks.append({constants.IDISK_SIZE: disk_sz})
7929 self.op.disks = disks
7931 raise errors.OpPrereqError("No disk info specified and the export"
7932 " is missing the disk information",
7935 if (not self.op.nics and
7936 einfo.has_option(constants.INISECT_INS, "nic_count")):
7938 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
7940 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
7941 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
7946 if (self.op.hypervisor is None and
7947 einfo.has_option(constants.INISECT_INS, "hypervisor")):
7948 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
7949 if einfo.has_section(constants.INISECT_HYP):
7950 # use the export parameters but do not override the ones
7951 # specified by the user
7952 for name, value in einfo.items(constants.INISECT_HYP):
7953 if name not in self.op.hvparams:
7954 self.op.hvparams[name] = value
7956 if einfo.has_section(constants.INISECT_BEP):
7957 # use the parameters, without overriding
7958 for name, value in einfo.items(constants.INISECT_BEP):
7959 if name not in self.op.beparams:
7960 self.op.beparams[name] = value
7962 # try to read the parameters old style, from the main section
7963 for name in constants.BES_PARAMETERS:
7964 if (name not in self.op.beparams and
7965 einfo.has_option(constants.INISECT_INS, name)):
7966 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
7968 if einfo.has_section(constants.INISECT_OSP):
7969 # use the parameters, without overriding
7970 for name, value in einfo.items(constants.INISECT_OSP):
7971 if name not in self.op.osparams:
7972 self.op.osparams[name] = value
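# Illustrative sketch of the export information consumed above (the real
# file written by the export code may contain more options; all values here
# are made up):
#
#   [export]
#   version = 0
#   os = debootstrap+default
#
#   [instance]
#   name = inst1.example.com
#   disk_template = plain
#   disk_count = 1
#   disk0_size = 10240
#   nic_count = 1
#   nic0_mac = aa:00:00:11:22:33
#   hypervisor = kvm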
7974 def _RevertToDefaults(self, cluster):
7975 """Revert the instance parameters to the default values.
7979 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
7980 for name in self.op.hvparams.keys():
7981 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
7982 del self.op.hvparams[name]
7984 be_defs = cluster.SimpleFillBE({})
7985 for name in self.op.beparams.keys():
7986 if name in be_defs and be_defs[name] == self.op.beparams[name]:
7987 del self.op.beparams[name]
7989 nic_defs = cluster.SimpleFillNIC({})
7990 for nic in self.op.nics:
7991 for name in constants.NICS_PARAMETERS:
7992 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
7995 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
7996 for name in self.op.osparams.keys():
7997 if name in os_defs and os_defs[name] == self.op.osparams[name]:
7998 del self.op.osparams[name]
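# For example, if the cluster-wide default for the "memory" beparam is 128
# and the opcode also passed memory=128, the parameter is dropped from
# self.op.beparams above, so the instance keeps tracking the cluster default
# instead of pinning a private copy of the same value.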
8000 def CheckPrereq(self):
8001 """Check prerequisites.
8004 if self.op.mode == constants.INSTANCE_IMPORT:
8005 export_info = self._ReadExportInfo()
8006 self._ReadExportParams(export_info)
8008 if (not self.cfg.GetVGName() and
8009 self.op.disk_template not in constants.DTS_NOT_LVM):
8010 raise errors.OpPrereqError("Cluster does not support lvm-based"
8011 " instances", errors.ECODE_STATE)
8013 if self.op.hypervisor is None:
8014 self.op.hypervisor = self.cfg.GetHypervisorType()
8016 cluster = self.cfg.GetClusterInfo()
8017 enabled_hvs = cluster.enabled_hypervisors
8018 if self.op.hypervisor not in enabled_hvs:
8019 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8020 " cluster (%s)" % (self.op.hypervisor,
8021 ",".join(enabled_hvs)),
8024 # check hypervisor parameter syntax (locally)
8025 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8026 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8028 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8029 hv_type.CheckParameterSyntax(filled_hvp)
8030 self.hv_full = filled_hvp
8031 # check that we don't specify global parameters on an instance
8032 _CheckGlobalHvParams(self.op.hvparams)
8034 # fill and remember the beparams dict
8035 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8036 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8038 # build os parameters
8039 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8041 # now that hvp/bep are in final format, let's reset to defaults,
8043 if self.op.identify_defaults:
8044 self._RevertToDefaults(cluster)
8048 for idx, nic in enumerate(self.op.nics):
8049 nic_mode_req = nic.get(constants.INIC_MODE, None)
8050 nic_mode = nic_mode_req
8051 if nic_mode is None:
8052 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8054 # in routed mode, for the first nic, the default ip is 'auto'
8055 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8056 default_ip_mode = constants.VALUE_AUTO
8058 default_ip_mode = constants.VALUE_NONE
8060 # ip validity checks
8061 ip = nic.get(constants.INIC_IP, default_ip_mode)
8062 if ip is None or ip.lower() == constants.VALUE_NONE:
8064 elif ip.lower() == constants.VALUE_AUTO:
8065 if not self.op.name_check:
8066 raise errors.OpPrereqError("IP address set to auto but name checks"
8067 " have been skipped",
8069 nic_ip = self.hostname1.ip
8071 if not netutils.IPAddress.IsValid(ip):
8072 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8076 # TODO: check the ip address for uniqueness
8077 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8078 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8081 # MAC address verification
8082 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8083 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8084 mac = utils.NormalizeAndValidateMac(mac)
8087 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8088 except errors.ReservationError:
8089 raise errors.OpPrereqError("MAC address %s already in use"
8090 " in cluster" % mac,
8091 errors.ECODE_NOTUNIQUE)
8093 # Build nic parameters
8094 link = nic.get(constants.INIC_LINK, None)
8097 nicparams[constants.NIC_MODE] = nic_mode_req
8099 nicparams[constants.NIC_LINK] = link
8101 check_params = cluster.SimpleFillNIC(nicparams)
8102 objects.NIC.CheckParameterSyntax(check_params)
8103 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8105 # disk checks/pre-build
8106 default_vg = self.cfg.GetVGName()
8108 for disk in self.op.disks:
8109 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8110 if mode not in constants.DISK_ACCESS_SET:
8111 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8112 mode, errors.ECODE_INVAL)
8113 size = disk.get(constants.IDISK_SIZE, None)
8115 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8118 except (TypeError, ValueError):
8119 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8122 data_vg = disk.get(constants.IDISK_VG, default_vg)
8124 constants.IDISK_SIZE: size,
8125 constants.IDISK_MODE: mode,
8126 constants.IDISK_VG: data_vg,
8127 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8129 if constants.IDISK_ADOPT in disk:
8130 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8131 self.disks.append(new_disk)
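# After this loop every entry of self.disks is a fully normalized dict; for
# a 10 GB read-write disk on a default VG named "xenvg" (illustrative values)
# it would look like:
#   {constants.IDISK_SIZE: 10240, constants.IDISK_MODE: constants.DISK_RDWR,
#    constants.IDISK_VG: "xenvg", constants.IDISK_METAVG: "xenvg"}
# plus constants.IDISK_ADOPT when disk adoption was requested.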
8133 if self.op.mode == constants.INSTANCE_IMPORT:
8135 # Check that the new instance doesn't have fewer disks than the export
8136 instance_disks = len(self.disks)
8137 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
8138 if instance_disks < export_disks:
8139 raise errors.OpPrereqError("Not enough disks to import."
8140 " (instance: %d, export: %d)" %
8141 (instance_disks, export_disks),
8145 for idx in range(export_disks):
8146 option = 'disk%d_dump' % idx
8147 if export_info.has_option(constants.INISECT_INS, option):
8148 # FIXME: are the old os-es, disk sizes, etc. useful?
8149 export_name = export_info.get(constants.INISECT_INS, option)
8150 image = utils.PathJoin(self.op.src_path, export_name)
8151 disk_images.append(image)
8153 disk_images.append(False)
8155 self.src_images = disk_images
8157 old_name = export_info.get(constants.INISECT_INS, 'name')
8159 exp_nic_count = export_info.getint(constants.INISECT_INS, 'nic_count')
8160 except (TypeError, ValueError), err:
8161 raise errors.OpPrereqError("Invalid export file, nic_count is not"
8162 " an integer: %s" % str(err),
8164 if self.op.instance_name == old_name:
8165 for idx, nic in enumerate(self.nics):
8166 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
8167 nic_mac_ini = 'nic%d_mac' % idx
8168 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8170 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8172 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8173 if self.op.ip_check:
8174 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8175 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8176 (self.check_ip, self.op.instance_name),
8177 errors.ECODE_NOTUNIQUE)
8179 #### mac address generation
8180 # By generating the MAC address here, both the allocator and the hooks get
8181 # the real final MAC address rather than the 'auto' or 'generate' value.
8182 # There is a race condition between the generation and the instance object
8183 # creation, which means that we know the MAC is valid now, but we're not
8184 # sure it still will be when we actually add the instance. If things go bad,
8185 # adding the instance will abort because of a duplicate MAC, and the
8186 # creation job will fail.
8187 for nic in self.nics:
8188 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8189 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8193 if self.op.iallocator is not None:
8194 self._RunAllocator()
8196 #### node related checks
8198 # check primary node
8199 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8200 assert self.pnode is not None, \
8201 "Cannot retrieve locked node %s" % self.op.pnode
8203 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8204 pnode.name, errors.ECODE_STATE)
8206 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8207 pnode.name, errors.ECODE_STATE)
8208 if not pnode.vm_capable:
8209 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8210 " '%s'" % pnode.name, errors.ECODE_STATE)
8212 self.secondaries = []
8214 # mirror node verification
8215 if self.op.disk_template in constants.DTS_INT_MIRROR:
8216 if self.op.snode == pnode.name:
8217 raise errors.OpPrereqError("The secondary node cannot be the"
8218 " primary node", errors.ECODE_INVAL)
8219 _CheckNodeOnline(self, self.op.snode)
8220 _CheckNodeNotDrained(self, self.op.snode)
8221 _CheckNodeVmCapable(self, self.op.snode)
8222 self.secondaries.append(self.op.snode)
8224 nodenames = [pnode.name] + self.secondaries
8226 if not self.adopt_disks:
8227 # Check lv size requirements, if not adopting
8228 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8229 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8231 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8232 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8233 disk[constants.IDISK_ADOPT])
8234 for disk in self.disks])
8235 if len(all_lvs) != len(self.disks):
8236 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8238 for lv_name in all_lvs:
8240 # FIXME: lv_name here is "vg/lv"; need to ensure that other calls
8241 # to ReserveLV use the same syntax
8242 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8243 except errors.ReservationError:
8244 raise errors.OpPrereqError("LV named %s used by another instance" %
8245 lv_name, errors.ECODE_NOTUNIQUE)
8247 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8248 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8250 node_lvs = self.rpc.call_lv_list([pnode.name],
8251 vg_names.payload.keys())[pnode.name]
8252 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
8253 node_lvs = node_lvs.payload
8255 delta = all_lvs.difference(node_lvs.keys())
8257 raise errors.OpPrereqError("Missing logical volume(s): %s" %
8258 utils.CommaJoin(delta),
8260 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
8262 raise errors.OpPrereqError("Online logical volumes found, cannot"
8263 " adopt: %s" % utils.CommaJoin(online_lvs),
8265 # update the size of disk based on what is found
8266 for dsk in self.disks:
8267 dsk[constants.IDISK_SIZE] = \
8268 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
8269 dsk[constants.IDISK_ADOPT])][0]))
8271 elif self.op.disk_template == constants.DT_BLOCK:
8272 # Normalize and de-duplicate device paths
8273 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
8274 for disk in self.disks])
8275 if len(all_disks) != len(self.disks):
8276 raise errors.OpPrereqError("Duplicate disk names given for adoption",
8278 baddisks = [d for d in all_disks
8279 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
8281 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
8282 " cannot be adopted" %
8283 (", ".join(baddisks),
8284 constants.ADOPTABLE_BLOCKDEV_ROOT),
8287 node_disks = self.rpc.call_bdev_sizes([pnode.name],
8288 list(all_disks))[pnode.name]
8289 node_disks.Raise("Cannot get block device information from node %s" %
8291 node_disks = node_disks.payload
8292 delta = all_disks.difference(node_disks.keys())
8294 raise errors.OpPrereqError("Missing block device(s): %s" %
8295 utils.CommaJoin(delta),
8297 for dsk in self.disks:
8298 dsk[constants.IDISK_SIZE] = \
8299 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
8301 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
8303 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
8304 # check OS parameters (remotely)
8305 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
8307 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
8309 # memory check on primary node
8311 _CheckNodeFreeMemory(self, self.pnode.name,
8312 "creating instance %s" % self.op.instance_name,
8313 self.be_full[constants.BE_MEMORY],
8316 self.dry_run_result = list(nodenames)
8318 def Exec(self, feedback_fn):
8319 """Create and add the instance to the cluster.
8322 instance = self.op.instance_name
8323 pnode_name = self.pnode.name
8325 ht_kind = self.op.hypervisor
8326 if ht_kind in constants.HTS_REQ_PORT:
8327 network_port = self.cfg.AllocatePort()
8331 if constants.ENABLE_FILE_STORAGE or constants.ENABLE_SHARED_FILE_STORAGE:
8332 # this is needed because os.path.join does not accept None arguments
8333 if self.op.file_storage_dir is None:
8334 string_file_storage_dir = ""
8336 string_file_storage_dir = self.op.file_storage_dir
8338 # build the full file storage dir path
8339 if self.op.disk_template == constants.DT_SHARED_FILE:
8340 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8342 get_fsd_fn = self.cfg.GetFileStorageDir
8344 file_storage_dir = utils.PathJoin(get_fsd_fn(),
8345 string_file_storage_dir, instance)
8347 file_storage_dir = ""
8349 disks = _GenerateDiskTemplate(self,
8350 self.op.disk_template,
8351 instance, pnode_name,
8355 self.op.file_driver,
8359 iobj = objects.Instance(name=instance, os=self.op.os_type,
8360 primary_node=pnode_name,
8361 nics=self.nics, disks=disks,
8362 disk_template=self.op.disk_template,
8364 network_port=network_port,
8365 beparams=self.op.beparams,
8366 hvparams=self.op.hvparams,
8367 hypervisor=self.op.hypervisor,
8368 osparams=self.op.osparams,
8371 if self.adopt_disks:
8372 if self.op.disk_template == constants.DT_PLAIN:
8373 # rename LVs to the newly-generated names; we need to construct
8374 # 'fake' LV disks with the old data, plus the new unique_id
8375 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
8377 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
8378 rename_to.append(t_dsk.logical_id)
8379 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
8380 self.cfg.SetDiskID(t_dsk, pnode_name)
8381 result = self.rpc.call_blockdev_rename(pnode_name,
8382 zip(tmp_disks, rename_to))
8383 result.Raise("Failed to rename adopted LVs")
8385 feedback_fn("* creating instance disks...")
8387 _CreateDisks(self, iobj)
8388 except errors.OpExecError:
8389 self.LogWarning("Device creation failed, reverting...")
8391 _RemoveDisks(self, iobj)
8393 self.cfg.ReleaseDRBDMinors(instance)
8396 feedback_fn("adding instance %s to cluster config" % instance)
8398 self.cfg.AddInstance(iobj, self.proc.GetECId())
8400 # Declare that we don't want to remove the instance lock anymore, as we've
8401 # added the instance to the config
8402 del self.remove_locks[locking.LEVEL_INSTANCE]
8404 if self.op.mode == constants.INSTANCE_IMPORT:
8405 # Release unused nodes
8406 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
8409 _ReleaseLocks(self, locking.LEVEL_NODE)
8412 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
8413 feedback_fn("* wiping instance disks...")
8415 _WipeDisks(self, iobj)
8416 except errors.OpExecError, err:
8417 logging.exception("Wiping disks failed")
8418 self.LogWarning("Wiping instance disks failed (%s)", err)
8422 # Something is already wrong with the disks, don't do anything else
8424 elif self.op.wait_for_sync:
8425 disk_abort = not _WaitForSync(self, iobj)
8426 elif iobj.disk_template in constants.DTS_INT_MIRROR:
8427 # make sure the disks are not degraded (still sync-ing is ok)
8429 feedback_fn("* checking mirrors status")
8430 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
8435 _RemoveDisks(self, iobj)
8436 self.cfg.RemoveInstance(iobj.name)
8437 # Make sure the instance lock gets removed
8438 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
8439 raise errors.OpExecError("There are some degraded disks for"
8442 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
8443 if self.op.mode == constants.INSTANCE_CREATE:
8444 if not self.op.no_install:
8445 feedback_fn("* running the instance OS create scripts...")
8446 # FIXME: pass debug option from opcode to backend
8447 result = self.rpc.call_instance_os_add(pnode_name, iobj, False,
8448 self.op.debug_level)
8449 result.Raise("Could not add os for instance %s"
8450 " on node %s" % (instance, pnode_name))
8452 elif self.op.mode == constants.INSTANCE_IMPORT:
8453 feedback_fn("* running the instance OS import scripts...")
8457 for idx, image in enumerate(self.src_images):
8461 # FIXME: pass debug option from opcode to backend
8462 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
8463 constants.IEIO_FILE, (image, ),
8464 constants.IEIO_SCRIPT,
8465 (iobj.disks[idx], idx),
8467 transfers.append(dt)
8470 masterd.instance.TransferInstanceData(self, feedback_fn,
8471 self.op.src_node, pnode_name,
8472 self.pnode.secondary_ip,
8474 if not compat.all(import_result):
8475 self.LogWarning("Some disks for instance %s on node %s were not"
8476 " imported successfully" % (instance, pnode_name))
8478 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8479 feedback_fn("* preparing remote import...")
8480 # The source cluster will stop the instance before attempting to make a
8481 # connection. In some cases stopping an instance can take a long time,
8482 # hence the shutdown timeout is added to the connection timeout.
8483 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
8484 self.op.source_shutdown_timeout)
8485 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
8487 assert iobj.primary_node == self.pnode.name
8489 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
8490 self.source_x509_ca,
8491 self._cds, timeouts)
8492 if not compat.all(disk_results):
8493 # TODO: Should the instance still be started, even if some disks
8494 # failed to import (valid for local imports, too)?
8495 self.LogWarning("Some disks for instance %s on node %s were not"
8496 " imported successfully" % (instance, pnode_name))
8498 # Run rename script on newly imported instance
8499 assert iobj.name == instance
8500 feedback_fn("Running rename script for %s" % instance)
8501 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
8502 self.source_instance_name,
8503 self.op.debug_level)
8505 self.LogWarning("Failed to run rename script for %s on node"
8506 " %s: %s" % (instance, pnode_name, result.fail_msg))
8509 # also checked in the prereq part
8510 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
8514 iobj.admin_up = True
8515 self.cfg.Update(iobj, feedback_fn)
8516 logging.info("Starting instance %s on node %s", instance, pnode_name)
8517 feedback_fn("* starting instance...")
8518 result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
8519 result.Raise("Could not start instance")
8521 return list(iobj.all_nodes)
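# Illustrative sketch of a minimal opcode driving this LU (plain LVM disk,
# explicit primary node, no iallocator; all names and sizes are made up and
# assume the usual opcodes.OpInstanceCreate fields):
#
#   op = opcodes.OpInstanceCreate(instance_name="inst1.example.com",
#                                 mode=constants.INSTANCE_CREATE,
#                                 os_type="debootstrap+default",
#                                 disk_template=constants.DT_PLAIN,
#                                 disks=[{constants.IDISK_SIZE: 10240}],
#                                 nics=[{}],
#                                 pnode="node1.example.com")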
8524 class LUInstanceConsole(NoHooksLU):
8525 """Connect to an instance's console.
8527 This is somewhat special in that it returns the command line that
8528 you need to run on the master node in order to connect to the console.
8534 def ExpandNames(self):
8535 self._ExpandAndLockInstance()
8537 def CheckPrereq(self):
8538 """Check prerequisites.
8540 This checks that the instance is in the cluster.
8543 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
8544 assert self.instance is not None, \
8545 "Cannot retrieve locked instance %s" % self.op.instance_name
8546 _CheckNodeOnline(self, self.instance.primary_node)
8548 def Exec(self, feedback_fn):
8549 """Connect to the console of an instance
8552 instance = self.instance
8553 node = instance.primary_node
8555 node_insts = self.rpc.call_instance_list([node],
8556 [instance.hypervisor])[node]
8557 node_insts.Raise("Can't get node information from %s" % node)
8559 if instance.name not in node_insts.payload:
8560 if instance.admin_up:
8561 state = constants.INSTST_ERRORDOWN
8563 state = constants.INSTST_ADMINDOWN
8564 raise errors.OpExecError("Instance %s is not running (state %s)" %
8565 (instance.name, state))
8567 logging.debug("Connecting to console of %s on %s", instance.name, node)
8569 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
8572 def _GetInstanceConsole(cluster, instance):
8573 """Returns console information for an instance.
8575 @type cluster: L{objects.Cluster}
8576 @type instance: L{objects.Instance}
8580 hyper = hypervisor.GetHypervisor(instance.hypervisor)
8581 # beparams and hvparams are passed separately, to avoid editing the
8582 # instance and then saving the defaults in the instance itself.
8583 hvparams = cluster.FillHV(instance)
8584 beparams = cluster.FillBE(instance)
8585 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
8587 assert console.instance == instance.name
8588 assert console.Validate()
8590 return console.ToDict()
8593 class LUInstanceReplaceDisks(LogicalUnit):
8594 """Replace the disks of an instance.
8597 HPATH = "mirrors-replace"
8598 HTYPE = constants.HTYPE_INSTANCE
8601 def CheckArguments(self):
8602 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
8605 def ExpandNames(self):
8606 self._ExpandAndLockInstance()
8608 assert locking.LEVEL_NODE not in self.needed_locks
8609 assert locking.LEVEL_NODEGROUP not in self.needed_locks
8611 assert self.op.iallocator is None or self.op.remote_node is None, \
8612 "Conflicting options"
8614 if self.op.remote_node is not None:
8615 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8617 # Warning: do not remove the locking of the new secondary here
8618 # unless DRBD8.AddChildren is changed to work in parallel;
8619 # currently it doesn't since parallel invocations of
8620 # FindUnusedMinor will conflict
8621 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
8622 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
8624 self.needed_locks[locking.LEVEL_NODE] = []
8625 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8627 if self.op.iallocator is not None:
8628 # iallocator will select a new node in the same group
8629 self.needed_locks[locking.LEVEL_NODEGROUP] = []
8631 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
8632 self.op.iallocator, self.op.remote_node,
8633 self.op.disks, False, self.op.early_release)
8635 self.tasklets = [self.replacer]
8637 def DeclareLocks(self, level):
8638 if level == locking.LEVEL_NODEGROUP:
8639 assert self.op.remote_node is None
8640 assert self.op.iallocator is not None
8641 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
8643 self.share_locks[locking.LEVEL_NODEGROUP] = 1
8644 self.needed_locks[locking.LEVEL_NODEGROUP] = \
8645 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
8647 elif level == locking.LEVEL_NODE:
8648 if self.op.iallocator is not None:
8649 assert self.op.remote_node is None
8650 assert not self.needed_locks[locking.LEVEL_NODE]
8652 # Lock member nodes of all locked groups
8653 self.needed_locks[locking.LEVEL_NODE] = [node_name
8654 for group_uuid in self.glm.list_owned(locking.LEVEL_NODEGROUP)
8655 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
8657 self._LockInstancesNodes()
8659 def BuildHooksEnv(self):
8662 This runs on the master, the primary and all the secondaries.
8665 instance = self.replacer.instance
8667 "MODE": self.op.mode,
8668 "NEW_SECONDARY": self.op.remote_node,
8669 "OLD_SECONDARY": instance.secondary_nodes[0],
8671 env.update(_BuildInstanceHookEnvByObject(self, instance))
8674 def BuildHooksNodes(self):
8675 """Build hooks nodes.
8678 instance = self.replacer.instance
8680 self.cfg.GetMasterNode(),
8681 instance.primary_node,
8683 if self.op.remote_node is not None:
8684 nl.append(self.op.remote_node)
8687 def CheckPrereq(self):
8688 """Check prerequisites.
8691 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
8692 self.op.iallocator is None)
8694 owned_groups = self.glm.list_owned(locking.LEVEL_NODEGROUP)
8696 groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
8697 if owned_groups != groups:
8698 raise errors.OpExecError("Node groups used by instance '%s' changed"
8699 " since lock was acquired, current list is %r,"
8700 " used to be '%s'" %
8701 (self.op.instance_name,
8702 utils.CommaJoin(groups),
8703 utils.CommaJoin(owned_groups)))
8705 return LogicalUnit.CheckPrereq(self)
8708 class TLReplaceDisks(Tasklet):
8709 """Replaces disks for an instance.
8711 Note: Locking is not within the scope of this class.
8714 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
8715 disks, delay_iallocator, early_release):
8716 """Initializes this class.
8719 Tasklet.__init__(self, lu)
8722 self.instance_name = instance_name
8724 self.iallocator_name = iallocator_name
8725 self.remote_node = remote_node
8727 self.delay_iallocator = delay_iallocator
8728 self.early_release = early_release
8731 self.instance = None
8732 self.new_node = None
8733 self.target_node = None
8734 self.other_node = None
8735 self.remote_node_info = None
8736 self.node_secondary_ip = None
8739 def CheckArguments(mode, remote_node, iallocator):
8740 """Helper function for users of this class.
8743 # check for valid parameter combination
8744 if mode == constants.REPLACE_DISK_CHG:
8745 if remote_node is None and iallocator is None:
8746 raise errors.OpPrereqError("When changing the secondary either an"
8747 " iallocator script must be used or the"
8748 " new node given", errors.ECODE_INVAL)
8750 if remote_node is not None and iallocator is not None:
8751 raise errors.OpPrereqError("Give either the iallocator or the new"
8752 " secondary, not both", errors.ECODE_INVAL)
8754 elif remote_node is not None or iallocator is not None:
8755 # Not replacing the secondary
8756 raise errors.OpPrereqError("The iallocator and new node options can"
8757 " only be used when changing the"
8758 " secondary node", errors.ECODE_INVAL)
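# Summary of the combinations accepted above:
#
#   mode                               remote_node / iallocator
#   REPLACE_DISK_PRI, _SEC, _AUTO      neither may be given
#   REPLACE_DISK_CHG                   exactly one of the two must be given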
8761 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
8762 """Compute a new secondary node using an IAllocator.
8765 ial = IAllocator(lu.cfg, lu.rpc,
8766 mode=constants.IALLOCATOR_MODE_RELOC,
8768 relocate_from=relocate_from)
8770 ial.Run(iallocator_name)
8773 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
8774 " %s" % (iallocator_name, ial.info),
8777 if len(ial.result) != ial.required_nodes:
8778 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8779 " of nodes (%s), required %s" %
8781 len(ial.result), ial.required_nodes),
8784 remote_node_name = ial.result[0]
8786 lu.LogInfo("Selected new secondary for instance '%s': %s",
8787 instance_name, remote_node_name)
8789 return remote_node_name
8791 def _FindFaultyDisks(self, node_name):
8792 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
8795 def _CheckDisksActivated(self, instance):
8796 """Checks if the instance disks are activated.
8798 @param instance: the instance whose disks we should check
8799 @return: True if they are activated, False otherwise
8802 nodes = instance.all_nodes
8804 for idx, dev in enumerate(instance.disks):
8806 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
8807 self.cfg.SetDiskID(dev, node)
8809 result = self.rpc.call_blockdev_find(node, dev)
8813 elif result.fail_msg or not result.payload:
8818 def CheckPrereq(self):
8819 """Check prerequisites.
8821 This checks that the instance is in the cluster.
8824 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
8825 assert instance is not None, \
8826 "Cannot retrieve locked instance %s" % self.instance_name
8828 if instance.disk_template != constants.DT_DRBD8:
8829 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
8830 " instances", errors.ECODE_INVAL)
8832 if len(instance.secondary_nodes) != 1:
8833 raise errors.OpPrereqError("The instance has a strange layout,"
8834 " expected one secondary but found %d" %
8835 len(instance.secondary_nodes),
8838 if not self.delay_iallocator:
8839 self._CheckPrereq2()
8841 def _CheckPrereq2(self):
8842 """Check prerequisites, second part.
8844 This function should always be part of CheckPrereq. It was separated and is
8845 now called from Exec because during node evacuation iallocator was only
8846 called with an unmodified cluster model, not taking planned changes into account.
8850 instance = self.instance
8851 secondary_node = instance.secondary_nodes[0]
8853 if self.iallocator_name is None:
8854 remote_node = self.remote_node
8856 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
8857 instance.name, instance.secondary_nodes)
8859 if remote_node is None:
8860 self.remote_node_info = None
8862 assert remote_node in self.lu.glm.list_owned(locking.LEVEL_NODE), \
8863 "Remote node '%s' is not locked" % remote_node
8865 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
8866 assert self.remote_node_info is not None, \
8867 "Cannot retrieve locked node %s" % remote_node
8869 if remote_node == self.instance.primary_node:
8870 raise errors.OpPrereqError("The specified node is the primary node of"
8871 " the instance", errors.ECODE_INVAL)
8873 if remote_node == secondary_node:
8874 raise errors.OpPrereqError("The specified node is already the"
8875 " secondary node of the instance",
8878 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
8879 constants.REPLACE_DISK_CHG):
8880 raise errors.OpPrereqError("Cannot specify disks to be replaced",
8883 if self.mode == constants.REPLACE_DISK_AUTO:
8884 if not self._CheckDisksActivated(instance):
8885 raise errors.OpPrereqError("Please run activate-disks on instance %s"
8886 " first" % self.instance_name,
8888 faulty_primary = self._FindFaultyDisks(instance.primary_node)
8889 faulty_secondary = self._FindFaultyDisks(secondary_node)
8891 if faulty_primary and faulty_secondary:
8892 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
8893 " one node and can not be repaired"
8894 " automatically" % self.instance_name,
8898 self.disks = faulty_primary
8899 self.target_node = instance.primary_node
8900 self.other_node = secondary_node
8901 check_nodes = [self.target_node, self.other_node]
8902 elif faulty_secondary:
8903 self.disks = faulty_secondary
8904 self.target_node = secondary_node
8905 self.other_node = instance.primary_node
8906 check_nodes = [self.target_node, self.other_node]
8912 # Non-automatic modes
8913 if self.mode == constants.REPLACE_DISK_PRI:
8914 self.target_node = instance.primary_node
8915 self.other_node = secondary_node
8916 check_nodes = [self.target_node, self.other_node]
8918 elif self.mode == constants.REPLACE_DISK_SEC:
8919 self.target_node = secondary_node
8920 self.other_node = instance.primary_node
8921 check_nodes = [self.target_node, self.other_node]
8923 elif self.mode == constants.REPLACE_DISK_CHG:
8924 self.new_node = remote_node
8925 self.other_node = instance.primary_node
8926 self.target_node = secondary_node
8927 check_nodes = [self.new_node, self.other_node]
8929 _CheckNodeNotDrained(self.lu, remote_node)
8930 _CheckNodeVmCapable(self.lu, remote_node)
8932 old_node_info = self.cfg.GetNodeInfo(secondary_node)
8933 assert old_node_info is not None
8934 if old_node_info.offline and not self.early_release:
8935 # doesn't make sense to delay the release
8936 self.early_release = True
8937 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
8938 " early-release mode", secondary_node)
8941 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
8944 # If not specified all disks should be replaced
8946 self.disks = range(len(self.instance.disks))
8948 for node in check_nodes:
8949 _CheckNodeOnline(self.lu, node)
8951 touched_nodes = frozenset(node_name for node_name in [self.new_node,
8954 if node_name is not None)
8956 # Release unneeded node locks
8957 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
8959 # Release any owned node group
8960 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
8961 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
8963 # Check whether disks are valid
8964 for disk_idx in self.disks:
8965 instance.FindDisk(disk_idx)
8967 # Get secondary node IP addresses
8968 self.node_secondary_ip = \
8969 dict((node_name, self.cfg.GetNodeInfo(node_name).secondary_ip)
8970 for node_name in touched_nodes)
8972 def Exec(self, feedback_fn):
8973 """Execute disk replacement.
8975 This dispatches the disk replacement to the appropriate handler.
8978 if self.delay_iallocator:
8979 self._CheckPrereq2()
8982 # Verify owned locks before starting operation
8983 owned_locks = self.lu.glm.list_owned(locking.LEVEL_NODE)
8984 assert set(owned_locks) == set(self.node_secondary_ip), \
8985 ("Incorrect node locks, owning %s, expected %s" %
8986 (owned_locks, self.node_secondary_ip.keys()))
8988 owned_locks = self.lu.glm.list_owned(locking.LEVEL_INSTANCE)
8989 assert list(owned_locks) == [self.instance_name], \
8990 "Instance '%s' not locked" % self.instance_name
8992 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
8993 "Should not own any node group lock at this point"
8996 feedback_fn("No disks need replacement")
8999 feedback_fn("Replacing disk(s) %s for %s" %
9000 (utils.CommaJoin(self.disks), self.instance.name))
9002 activate_disks = (not self.instance.admin_up)
9004 # Activate the instance disks if we're replacing them on a down instance
9006 _StartInstanceDisks(self.lu, self.instance, True)
9009 # Should we replace the secondary node?
9010 if self.new_node is not None:
9011 fn = self._ExecDrbd8Secondary
9013 fn = self._ExecDrbd8DiskOnly
9015 result = fn(feedback_fn)
9017 # Deactivate the instance disks if we're replacing them on a
9020 _SafeShutdownInstanceDisks(self.lu, self.instance)
9023 # Verify owned locks
9024 owned_locks = self.lu.glm.list_owned(locking.LEVEL_NODE)
9025 nodes = frozenset(self.node_secondary_ip)
9026 assert ((self.early_release and not owned_locks) or
9027 (not self.early_release and not (set(owned_locks) - nodes))), \
9028 ("Not owning the correct locks, early_release=%s, owned=%r,"
9029 " nodes=%r" % (self.early_release, owned_locks, nodes))
9033 def _CheckVolumeGroup(self, nodes):
9034 self.lu.LogInfo("Checking volume groups")
9036 vgname = self.cfg.GetVGName()
9038 # Make sure volume group exists on all involved nodes
9039 results = self.rpc.call_vg_list(nodes)
9041 raise errors.OpExecError("Can't list volume groups on the nodes")
9045 res.Raise("Error checking node %s" % node)
9046 if vgname not in res.payload:
9047 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9050 def _CheckDisksExistence(self, nodes):
9051 # Check disk existence
9052 for idx, dev in enumerate(self.instance.disks):
9053 if idx not in self.disks:
9057 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9058 self.cfg.SetDiskID(dev, node)
9060 result = self.rpc.call_blockdev_find(node, dev)
9062 msg = result.fail_msg
9063 if msg or not result.payload:
9065 msg = "disk not found"
9066 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9069 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9070 for idx, dev in enumerate(self.instance.disks):
9071 if idx not in self.disks:
9074 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9077 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9079 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9080 " replace disks for instance %s" %
9081 (node_name, self.instance.name))
9083 def _CreateNewStorage(self, node_name):
9086 for idx, dev in enumerate(self.instance.disks):
9087 if idx not in self.disks:
9090 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9092 self.cfg.SetDiskID(dev, node_name)
9094 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9095 names = _GenerateUniqueNames(self.lu, lv_names)
9097 vg_data = dev.children[0].logical_id[0]
9098 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9099 logical_id=(vg_data, names[0]))
9100 vg_meta = dev.children[1].logical_id[0]
9101 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
9102 logical_id=(vg_meta, names[1]))
9104 new_lvs = [lv_data, lv_meta]
9105 old_lvs = dev.children
9106 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9108 # we pass force_create=True to force the LVM creation
9109 for new_lv in new_lvs:
9110 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9111 _GetInstanceInfoText(self.instance), False)
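# The iv_names mapping built above (and consumed by _CheckDevices and
# _RemoveOldStorage below) has one entry per replaced disk, e.g.:
#   {"disk/0": (drbd_dev, [old_data_lv, old_meta_lv],
#               [new_data_lv, new_meta_lv])}
# i.e. the DRBD device plus its old and newly allocated LV children.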
9115 def _CheckDevices(self, node_name, iv_names):
9116 for name, (dev, _, _) in iv_names.iteritems():
9117 self.cfg.SetDiskID(dev, node_name)
9119 result = self.rpc.call_blockdev_find(node_name, dev)
9121 msg = result.fail_msg
9122 if msg or not result.payload:
9124 msg = "disk not found"
9125 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9128 if result.payload.is_degraded:
9129 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9131 def _RemoveOldStorage(self, node_name, iv_names):
9132 for name, (_, old_lvs, _) in iv_names.iteritems():
9133 self.lu.LogInfo("Remove logical volumes for %s" % name)
9136 self.cfg.SetDiskID(lv, node_name)
9138 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9140 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9141 hint="remove unused LVs manually")
9143 def _ExecDrbd8DiskOnly(self, feedback_fn):
9144 """Replace a disk on the primary or secondary for DRBD 8.
9146 The algorithm for replace is quite complicated:
9148 1. for each disk to be replaced:
9150 1. create new LVs on the target node with unique names
9151 1. detach old LVs from the drbd device
9152 1. rename old LVs to name_replaced.<time_t>
9153 1. rename new LVs to old LVs
9154 1. attach the new LVs (with the old names now) to the drbd device
9156 1. wait for sync across all devices
9158 1. for each modified disk:
9160 1. remove old LVs (which have the name name_replaced.<time_t>)
9162 Failures are not very well handled.
9167 # Step: check device activation
9168 self.lu.LogStep(1, steps_total, "Check device existence")
9169 self._CheckDisksExistence([self.other_node, self.target_node])
9170 self._CheckVolumeGroup([self.target_node, self.other_node])
9172 # Step: check other node consistency
9173 self.lu.LogStep(2, steps_total, "Check peer consistency")
9174 self._CheckDisksConsistency(self.other_node,
9175 self.other_node == self.instance.primary_node,
9178 # Step: create new storage
9179 self.lu.LogStep(3, steps_total, "Allocate new storage")
9180 iv_names = self._CreateNewStorage(self.target_node)
9182 # Step: for each lv, detach+rename*2+attach
9183 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9184 for dev, old_lvs, new_lvs in iv_names.itervalues():
9185 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9187 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9189 result.Raise("Can't detach drbd from local storage on node"
9190 " %s for device %s" % (self.target_node, dev.iv_name))
9192 #cfg.Update(instance)
9194 # ok, we created the new LVs, so now we know we have the needed
9195 # storage; as such, we proceed on the target node to rename
9196 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9197 # using the assumption that logical_id == physical_id (which in
9198 # turn is the unique_id on that node)
9200 # FIXME(iustin): use a better name for the replaced LVs
9201 temp_suffix = int(time.time())
9202 ren_fn = lambda d, suff: (d.physical_id[0],
9203 d.physical_id[1] + "_replaced-%s" % suff)
9205 # Build the rename list based on what LVs exist on the node
9206 rename_old_to_new = []
9207 for to_ren in old_lvs:
9208 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9209 if not result.fail_msg and result.payload:
9211 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9213 self.lu.LogInfo("Renaming the old LVs on the target node")
9214 result = self.rpc.call_blockdev_rename(self.target_node,
9216 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9218 # Now we rename the new LVs to the old LVs
9219 self.lu.LogInfo("Renaming the new LVs on the target node")
9220 rename_new_to_old = [(new, old.physical_id)
9221 for old, new in zip(old_lvs, new_lvs)]
9222 result = self.rpc.call_blockdev_rename(self.target_node,
9224 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9226 for old, new in zip(old_lvs, new_lvs):
9227 new.logical_id = old.logical_id
9228 self.cfg.SetDiskID(new, self.target_node)
9230 for disk in old_lvs:
9231 disk.logical_id = ren_fn(disk, temp_suffix)
9232 self.cfg.SetDiskID(disk, self.target_node)
9234 # Now that the new lvs have the old name, we can add them to the device
9235 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9236 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9238 msg = result.fail_msg
9240 for new_lv in new_lvs:
9241 msg2 = self.rpc.call_blockdev_remove(self.target_node,
9244 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
9245 hint=("cleanup manually the unused logical"
9247 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
9249 dev.children = new_lvs
9251 self.cfg.Update(self.instance, feedback_fn)
9254 if self.early_release:
9255 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9257 self._RemoveOldStorage(self.target_node, iv_names)
9258 # WARNING: we release both node locks here, do not do other RPCs
9259 # than WaitForSync to the primary node
9260 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
9261 names=[self.target_node, self.other_node])
9264 # This can fail as the old devices are degraded and _WaitForSync
9265 # does a combined result over all disks, so we don't check its return value
9266 self.lu.LogStep(cstep, steps_total, "Sync devices")
9268 _WaitForSync(self.lu, self.instance)
9270 # Check all devices manually
9271 self._CheckDevices(self.instance.primary_node, iv_names)
9273 # Step: remove old storage
9274 if not self.early_release:
9275 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9277 self._RemoveOldStorage(self.target_node, iv_names)
9279 def _ExecDrbd8Secondary(self, feedback_fn):
9280 """Replace the secondary node for DRBD 8.
9282 The algorithm for replace is quite complicated:
9283 - for all disks of the instance:
9284 - create new LVs on the new node with same names
9285 - shutdown the drbd device on the old secondary
9286 - disconnect the drbd network on the primary
9287 - create the drbd device on the new secondary
9288 - network attach the drbd on the primary, using an artifice:
9289 the drbd code for Attach() will connect to the network if it
9290 finds a device which is connected to the good local disks but not network enabled
9292 - wait for sync across all devices
9293 - remove all disks from the old secondary
9295 Failures are not very well handled.
9300 # Step: check device activation
9301 self.lu.LogStep(1, steps_total, "Check device existence")
9302 self._CheckDisksExistence([self.instance.primary_node])
9303 self._CheckVolumeGroup([self.instance.primary_node])
9305 # Step: check other node consistency
9306 self.lu.LogStep(2, steps_total, "Check peer consistency")
9307 self._CheckDisksConsistency(self.instance.primary_node, True, True)
9309 # Step: create new storage
9310 self.lu.LogStep(3, steps_total, "Allocate new storage")
9311 for idx, dev in enumerate(self.instance.disks):
9312 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
9313 (self.new_node, idx))
9314 # we pass force_create=True to force LVM creation
9315 for new_lv in dev.children:
9316 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
9317 _GetInstanceInfoText(self.instance), False)
9319 # Step 4: drbd minors and drbd setup changes
9320 # after this, we must manually remove the drbd minors on both the
9321 # error and the success paths
9322 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9323 minors = self.cfg.AllocateDRBDMinor([self.new_node
9324 for dev in self.instance.disks],
9326 logging.debug("Allocated minors %r", minors)
9329 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
9330 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
9331 (self.new_node, idx))
9332 # create new devices on new_node; note that we create two IDs:
9333 # one without port, so the drbd will be activated without
9334 # networking information on the new node at this stage, and one
9335 # with network, for the latter activation in step 4
9336 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
9337 if self.instance.primary_node == o_node1:
9340 assert self.instance.primary_node == o_node2, "Three-node instance?"
9343 new_alone_id = (self.instance.primary_node, self.new_node, None,
9344 p_minor, new_minor, o_secret)
9345 new_net_id = (self.instance.primary_node, self.new_node, o_port,
9346 p_minor, new_minor, o_secret)
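# Reminder of the DRBD8 logical_id layout used above: it is the 6-tuple
# (nodeA, nodeB, port, minorA, minorB, secret). new_alone_id leaves the port
# slot as None so the device comes up without networking; new_net_id keeps
# the original port for the later attach. A hypothetical value would be
# ("node1.example.com", "node4.example.com", 11005, 0, 4, "secret"), with
# all names and numbers invented for illustration.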
9348 iv_names[idx] = (dev, dev.children, new_net_id)
9349 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
9351 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
9352 logical_id=new_alone_id,
9353 children=dev.children,
9356 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
9357 _GetInstanceInfoText(self.instance), False)
9358 except errors.GenericError:
9359 self.cfg.ReleaseDRBDMinors(self.instance.name)
9362 # We have new devices, shutdown the drbd on the old secondary
9363 for idx, dev in enumerate(self.instance.disks):
9364 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
9365 self.cfg.SetDiskID(dev, self.target_node)
9366 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
9368 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
9369 "node: %s" % (idx, msg),
9370 hint=("Please cleanup this device manually as"
9371 " soon as possible"))
9373 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
9374 result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
9375 self.node_secondary_ip,
9376 self.instance.disks)\
9377 [self.instance.primary_node]
9379 msg = result.fail_msg
9381 # detaches didn't succeed (unlikely)
9382 self.cfg.ReleaseDRBDMinors(self.instance.name)
9383 raise errors.OpExecError("Can't detach the disks from the network on"
9384 " old node: %s" % (msg,))
9386 # if we managed to detach at least one, we update all the disks of
9387 # the instance to point to the new secondary
9388 self.lu.LogInfo("Updating instance configuration")
9389 for dev, _, new_logical_id in iv_names.itervalues():
9390 dev.logical_id = new_logical_id
9391 self.cfg.SetDiskID(dev, self.instance.primary_node)
9393 self.cfg.Update(self.instance, feedback_fn)
9395 # and now perform the drbd attach
9396 self.lu.LogInfo("Attaching primary drbds to new secondary"
9397 " (standalone => connected)")
9398 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
9400 self.node_secondary_ip,
9401 self.instance.disks,
9404 for to_node, to_result in result.items():
9405 msg = to_result.fail_msg
9407 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
9409 hint=("please do a gnt-instance info to see the"
9410 " status of disks"))
9412 if self.early_release:
9413 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9415 self._RemoveOldStorage(self.target_node, iv_names)
# WARNING: we release all node locks here; do not do other RPCs
# than WaitForSync to the primary node
9418 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
9419 names=[self.instance.primary_node,
9424 # This can fail as the old devices are degraded and _WaitForSync
9425 # does a combined result over all disks, so we don't check its return value
9426 self.lu.LogStep(cstep, steps_total, "Sync devices")
9428 _WaitForSync(self.lu, self.instance)
9430 # Check all devices manually
9431 self._CheckDevices(self.instance.primary_node, iv_names)
9433 # Step: remove old storage
9434 if not self.early_release:
9435 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9436 self._RemoveOldStorage(self.target_node, iv_names)
9439 class LURepairNodeStorage(NoHooksLU):
9440 """Repairs the volume group on a node.
9445 def CheckArguments(self):
9446 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
9448 storage_type = self.op.storage_type
9450 if (constants.SO_FIX_CONSISTENCY not in
9451 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
9452 raise errors.OpPrereqError("Storage units of type '%s' can not be"
9453 " repaired" % storage_type,
9456 def ExpandNames(self):
9457 self.needed_locks = {
9458 locking.LEVEL_NODE: [self.op.node_name],
9461 def _CheckFaultyDisks(self, instance, node_name):
9462 """Ensure faulty disks abort the opcode or at least warn."""
9464 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
9466 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
9467 " node '%s'" % (instance.name, node_name),
9469 except errors.OpPrereqError, err:
9470 if self.op.ignore_consistency:
9471 self.proc.LogWarning(str(err.args[0]))
9475 def CheckPrereq(self):
9476 """Check prerequisites.
9479 # Check whether any instance on this node has faulty disks
9480 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
9481 if not inst.admin_up:
9483 check_nodes = set(inst.all_nodes)
9484 check_nodes.discard(self.op.node_name)
9485 for inst_node_name in check_nodes:
9486 self._CheckFaultyDisks(inst, inst_node_name)
9488 def Exec(self, feedback_fn):
9489 feedback_fn("Repairing storage unit '%s' on %s ..." %
9490 (self.op.name, self.op.node_name))
9492 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
9493 result = self.rpc.call_storage_execute(self.op.node_name,
9494 self.op.storage_type, st_args,
9496 constants.SO_FIX_CONSISTENCY)
9497 result.Raise("Failed to repair storage unit '%s' on %s" %
9498 (self.op.name, self.op.node_name))
9501 class LUNodeEvacStrategy(NoHooksLU):
9502 """Computes the node evacuation strategy.
9507 def CheckArguments(self):
9508 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
9510 def ExpandNames(self):
9511 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
9512 self.needed_locks = locks = {}
9513 if self.op.remote_node is None:
9514 locks[locking.LEVEL_NODE] = locking.ALL_SET
9516 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9517 locks[locking.LEVEL_NODE] = self.op.nodes + [self.op.remote_node]
9519 def Exec(self, feedback_fn):
9520 if self.op.remote_node is not None:
9522 for node in self.op.nodes:
9523 instances.extend(_GetNodeSecondaryInstances(self.cfg, node))
9526 if i.primary_node == self.op.remote_node:
9527 raise errors.OpPrereqError("Node %s is the primary node of"
9528 " instance %s, cannot use it as"
9530 (self.op.remote_node, i.name),
9532 result.append([i.name, self.op.remote_node])
9534 ial = IAllocator(self.cfg, self.rpc,
9535 mode=constants.IALLOCATOR_MODE_MEVAC,
9536 evac_nodes=self.op.nodes)
9537 ial.Run(self.op.iallocator, validate=True)
9539 raise errors.OpExecError("No valid evacuation solution: %s" % ial.info,
9545 class LUInstanceGrowDisk(LogicalUnit):
9546 """Grow a disk of an instance.
9550 HTYPE = constants.HTYPE_INSTANCE
9553 def ExpandNames(self):
9554 self._ExpandAndLockInstance()
9555 self.needed_locks[locking.LEVEL_NODE] = []
9556 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9558 def DeclareLocks(self, level):
9559 if level == locking.LEVEL_NODE:
9560 self._LockInstancesNodes()
9562 def BuildHooksEnv(self):
9565 This runs on the master, the primary and all the secondaries.
9569 "DISK": self.op.disk,
9570 "AMOUNT": self.op.amount,
9572 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
9575 def BuildHooksNodes(self):
9576 """Build hooks nodes.
9579 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
9582 def CheckPrereq(self):
9583 """Check prerequisites.
9585 This checks that the instance is in the cluster.
9588 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9589 assert instance is not None, \
9590 "Cannot retrieve locked instance %s" % self.op.instance_name
9591 nodenames = list(instance.all_nodes)
9592 for node in nodenames:
9593 _CheckNodeOnline(self, node)
9595 self.instance = instance
9597 if instance.disk_template not in constants.DTS_GROWABLE:
9598 raise errors.OpPrereqError("Instance's disk layout does not support"
9599 " growing", errors.ECODE_INVAL)
9601 self.disk = instance.FindDisk(self.op.disk)
9603 if instance.disk_template not in (constants.DT_FILE,
9604 constants.DT_SHARED_FILE):
# TODO: check the free disk space for file-based disks, when that feature is
# implemented
9607 _CheckNodesFreeDiskPerVG(self, nodenames,
9608 self.disk.ComputeGrowth(self.op.amount))
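# Sketch of what this check amounts to (illustrative; the exact structure is
# whatever Disk.ComputeGrowth returns): the growth requirement is a per-VG
# mapping of extra space needed, e.g. {"xenvg": 2048} for a 2 GiB grow of an
# LVM-backed disk, and every node listed must have that much free in the VG.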
9610 def Exec(self, feedback_fn):
9611 """Execute disk grow.
9614 instance = self.instance
9617 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
9619 raise errors.OpExecError("Cannot activate block device to grow")
9621 # First run all grow ops in dry-run mode
9622 for node in instance.all_nodes:
9623 self.cfg.SetDiskID(disk, node)
9624 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
9625 result.Raise("Grow request failed to node %s" % node)
# We know that (as far as we can test) operations across different
# nodes will succeed; time to run it for real
9629 for node in instance.all_nodes:
9630 self.cfg.SetDiskID(disk, node)
9631 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
9632 result.Raise("Grow request failed to node %s" % node)
9634 # TODO: Rewrite code to work properly
9635 # DRBD goes into sync mode for a short amount of time after executing the
9636 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
9637 # calling "resize" in sync mode fails. Sleeping for a short amount of
9638 # time is a work-around.
9641 disk.RecordGrow(self.op.amount)
9642 self.cfg.Update(instance, feedback_fn)
9643 if self.op.wait_for_sync:
9644 disk_abort = not _WaitForSync(self, instance, disks=[disk])
9646 self.proc.LogWarning("Disk sync-ing has not returned a good"
9647 " status; please check the instance")
9648 if not instance.admin_up:
9649 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
9650 elif not instance.admin_up:
9651 self.proc.LogWarning("Not shutting down the disk even if the instance is"
9652 " not supposed to be running because no wait for"
9653 " sync mode was requested")
9656 class LUInstanceQueryData(NoHooksLU):
9657 """Query runtime instance data.
9662 def ExpandNames(self):
9663 self.needed_locks = {}
9665 # Use locking if requested or when non-static information is wanted
9666 if not (self.op.static or self.op.use_locking):
9667 self.LogWarning("Non-static data requested, locks need to be acquired")
9668 self.op.use_locking = True
9670 if self.op.instances or not self.op.use_locking:
9671 # Expand instance names right here
9672 self.wanted_names = _GetWantedInstances(self, self.op.instances)
9674 # Will use acquired locks
9675 self.wanted_names = None
9677 if self.op.use_locking:
9678 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
9680 if self.wanted_names is None:
9681 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
9683 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
9685 self.needed_locks[locking.LEVEL_NODE] = []
9686 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
9687 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9689 def DeclareLocks(self, level):
9690 if self.op.use_locking and level == locking.LEVEL_NODE:
9691 self._LockInstancesNodes()
9693 def CheckPrereq(self):
9694 """Check prerequisites.
9696 This only checks the optional instance list against the existing names.
9699 if self.wanted_names is None:
9700 assert self.op.use_locking, "Locking was not used"
9701 self.wanted_names = self.glm.list_owned(locking.LEVEL_INSTANCE)
9703 self.wanted_instances = [self.cfg.GetInstanceInfo(name)
9704 for name in self.wanted_names]
9706 def _ComputeBlockdevStatus(self, node, instance_name, dev):
9707 """Returns the status of a block device
9710 if self.op.static or not node:
9713 self.cfg.SetDiskID(dev, node)
9715 result = self.rpc.call_blockdev_find(node, dev)
9719 result.Raise("Can't compute disk status for %s" % instance_name)
9721 status = result.payload
9725 return (status.dev_path, status.major, status.minor,
9726 status.sync_percent, status.estimated_time,
9727 status.is_degraded, status.ldisk_status)
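# For reference, the status returned above is the 7-tuple (dev_path, major,
# minor, sync_percent, estimated_time, is_degraded, ldisk_status); a healthy,
# fully synced DRBD device might report something like
# ("/dev/drbd3", 147, 3, None, None, False, <ldisk-ok constant>), with these
# concrete values invented purely for illustration.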
9729 def _ComputeDiskStatus(self, instance, snode, dev):
9730 """Compute block device status.
9733 if dev.dev_type in constants.LDS_DRBD:
9734 # we change the snode then (otherwise we use the one passed in)
9735 if dev.logical_id[0] == instance.primary_node:
9736 snode = dev.logical_id[1]
9738 snode = dev.logical_id[0]
9740 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
9742 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
9745 dev_children = [self._ComputeDiskStatus(instance, snode, child)
9746 for child in dev.children]
9751 "iv_name": dev.iv_name,
9752 "dev_type": dev.dev_type,
9753 "logical_id": dev.logical_id,
9754 "physical_id": dev.physical_id,
9755 "pstatus": dev_pstatus,
9756 "sstatus": dev_sstatus,
9757 "children": dev_children,
9762 def Exec(self, feedback_fn):
9763 """Gather and return data"""
9766 cluster = self.cfg.GetClusterInfo()
9768 for instance in self.wanted_instances:
9769 if not self.op.static:
9770 remote_info = self.rpc.call_instance_info(instance.primary_node,
9772 instance.hypervisor)
9773 remote_info.Raise("Error checking node %s" % instance.primary_node)
9774 remote_info = remote_info.payload
9775 if remote_info and "state" in remote_info:
9778 remote_state = "down"
9781 if instance.admin_up:
9784 config_state = "down"
9786 disks = [self._ComputeDiskStatus(instance, None, device)
9787 for device in instance.disks]
9789 result[instance.name] = {
9790 "name": instance.name,
9791 "config_state": config_state,
9792 "run_state": remote_state,
9793 "pnode": instance.primary_node,
9794 "snodes": instance.secondary_nodes,
9796 # this happens to be the same format used for hooks
9797 "nics": _NICListToTuple(self, instance.nics),
9798 "disk_template": instance.disk_template,
9800 "hypervisor": instance.hypervisor,
9801 "network_port": instance.network_port,
9802 "hv_instance": instance.hvparams,
9803 "hv_actual": cluster.FillHV(instance, skip_globals=True),
9804 "be_instance": instance.beparams,
9805 "be_actual": cluster.FillBE(instance),
9806 "os_instance": instance.osparams,
9807 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
9808 "serial_no": instance.serial_no,
9809 "mtime": instance.mtime,
9810 "ctime": instance.ctime,
9811 "uuid": instance.uuid,
9817 class LUInstanceSetParams(LogicalUnit):
9818 """Modifies an instances's parameters.
9821 HPATH = "instance-modify"
9822 HTYPE = constants.HTYPE_INSTANCE
9825 def CheckArguments(self):
9826 if not (self.op.nics or self.op.disks or self.op.disk_template or
9827 self.op.hvparams or self.op.beparams or self.op.os_name):
9828 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
9830 if self.op.hvparams:
9831 _CheckGlobalHvParams(self.op.hvparams)
9835 for disk_op, disk_dict in self.op.disks:
9836 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
9837 if disk_op == constants.DDM_REMOVE:
9840 elif disk_op == constants.DDM_ADD:
9843 if not isinstance(disk_op, int):
9844 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
9845 if not isinstance(disk_dict, dict):
9846 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
9847 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
9849 if disk_op == constants.DDM_ADD:
9850 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
9851 if mode not in constants.DISK_ACCESS_SET:
9852 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
9854 size = disk_dict.get(constants.IDISK_SIZE, None)
9856 raise errors.OpPrereqError("Required disk parameter size missing",
9860 except (TypeError, ValueError), err:
9861 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
9862 str(err), errors.ECODE_INVAL)
9863 disk_dict[constants.IDISK_SIZE] = size
9865 # modification of disk
9866 if constants.IDISK_SIZE in disk_dict:
9867 raise errors.OpPrereqError("Disk size change not possible, use"
9868 " grow-disk", errors.ECODE_INVAL)
9870 if disk_addremove > 1:
9871 raise errors.OpPrereqError("Only one disk add or remove operation"
9872 " supported at a time", errors.ECODE_INVAL)
9874 if self.op.disks and self.op.disk_template is not None:
9875 raise errors.OpPrereqError("Disk template conversion and other disk"
9876 " changes not supported at the same time",
9879 if (self.op.disk_template and
9880 self.op.disk_template in constants.DTS_INT_MIRROR and
9881 self.op.remote_node is None):
9882 raise errors.OpPrereqError("Changing the disk template to a mirrored"
9883 " one requires specifying a secondary node",
9888 for nic_op, nic_dict in self.op.nics:
9889 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
9890 if nic_op == constants.DDM_REMOVE:
9893 elif nic_op == constants.DDM_ADD:
9896 if not isinstance(nic_op, int):
9897 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
9898 if not isinstance(nic_dict, dict):
9899 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
9900 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
9902 # nic_dict should be a dict
9903 nic_ip = nic_dict.get(constants.INIC_IP, None)
9904 if nic_ip is not None:
9905 if nic_ip.lower() == constants.VALUE_NONE:
9906 nic_dict[constants.INIC_IP] = None
9908 if not netutils.IPAddress.IsValid(nic_ip):
9909 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
9912 nic_bridge = nic_dict.get('bridge', None)
9913 nic_link = nic_dict.get(constants.INIC_LINK, None)
9914 if nic_bridge and nic_link:
9915 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
9916 " at the same time", errors.ECODE_INVAL)
9917 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
9918 nic_dict['bridge'] = None
9919 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
9920 nic_dict[constants.INIC_LINK] = None
9922 if nic_op == constants.DDM_ADD:
9923 nic_mac = nic_dict.get(constants.INIC_MAC, None)
9925 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
9927 if constants.INIC_MAC in nic_dict:
9928 nic_mac = nic_dict[constants.INIC_MAC]
9929 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9930 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
9932 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
9933 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
9934 " modifying an existing nic",
9937 if nic_addremove > 1:
9938 raise errors.OpPrereqError("Only one NIC add or remove operation"
9939 " supported at a time", errors.ECODE_INVAL)
9941 def ExpandNames(self):
9942 self._ExpandAndLockInstance()
9943 self.needed_locks[locking.LEVEL_NODE] = []
9944 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9946 def DeclareLocks(self, level):
9947 if level == locking.LEVEL_NODE:
9948 self._LockInstancesNodes()
9949 if self.op.disk_template and self.op.remote_node:
9950 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9951 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
9953 def BuildHooksEnv(self):
9956 This runs on the master, primary and secondaries.
9960 if constants.BE_MEMORY in self.be_new:
9961 args['memory'] = self.be_new[constants.BE_MEMORY]
9962 if constants.BE_VCPUS in self.be_new:
9963 args['vcpus'] = self.be_new[constants.BE_VCPUS]
9964 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
9965 # information at all.
9968 nic_override = dict(self.op.nics)
9969 for idx, nic in enumerate(self.instance.nics):
9970 if idx in nic_override:
9971 this_nic_override = nic_override[idx]
9973 this_nic_override = {}
9974 if constants.INIC_IP in this_nic_override:
9975 ip = this_nic_override[constants.INIC_IP]
9978 if constants.INIC_MAC in this_nic_override:
9979 mac = this_nic_override[constants.INIC_MAC]
9982 if idx in self.nic_pnew:
9983 nicparams = self.nic_pnew[idx]
9985 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
9986 mode = nicparams[constants.NIC_MODE]
9987 link = nicparams[constants.NIC_LINK]
9988 args['nics'].append((ip, mac, mode, link))
9989 if constants.DDM_ADD in nic_override:
9990 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
9991 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
9992 nicparams = self.nic_pnew[constants.DDM_ADD]
9993 mode = nicparams[constants.NIC_MODE]
9994 link = nicparams[constants.NIC_LINK]
9995 args['nics'].append((ip, mac, mode, link))
9996 elif constants.DDM_REMOVE in nic_override:
9997 del args['nics'][-1]
9999 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
10000 if self.op.disk_template:
10001 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10005 def BuildHooksNodes(self):
10006 """Build hooks nodes.
10009 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10012 def CheckPrereq(self):
10013 """Check prerequisites.
10015 This only checks the instance list against the existing names.
10018 # checking the new params on the primary/secondary nodes
10020 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10021 cluster = self.cluster = self.cfg.GetClusterInfo()
10022 assert self.instance is not None, \
10023 "Cannot retrieve locked instance %s" % self.op.instance_name
10024 pnode = instance.primary_node
10025 nodelist = list(instance.all_nodes)
10028 if self.op.os_name and not self.op.force:
10029 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10030 self.op.force_variant)
10031 instance_os = self.op.os_name
10033 instance_os = instance.os
10035 if self.op.disk_template:
10036 if instance.disk_template == self.op.disk_template:
10037 raise errors.OpPrereqError("Instance already has disk template %s" %
10038 instance.disk_template, errors.ECODE_INVAL)
10040 if (instance.disk_template,
10041 self.op.disk_template) not in self._DISK_CONVERSIONS:
10042 raise errors.OpPrereqError("Unsupported disk template conversion from"
10043 " %s to %s" % (instance.disk_template,
10044 self.op.disk_template),
10045 errors.ECODE_INVAL)
10046 _CheckInstanceDown(self, instance, "cannot change disk template")
10047 if self.op.disk_template in constants.DTS_INT_MIRROR:
10048 if self.op.remote_node == pnode:
10049 raise errors.OpPrereqError("Given new secondary node %s is the same"
10050 " as the primary node of the instance" %
10051 self.op.remote_node, errors.ECODE_STATE)
10052 _CheckNodeOnline(self, self.op.remote_node)
10053 _CheckNodeNotDrained(self, self.op.remote_node)
10054 # FIXME: here we assume that the old instance type is DT_PLAIN
10055 assert instance.disk_template == constants.DT_PLAIN
10056 disks = [{constants.IDISK_SIZE: d.size,
10057 constants.IDISK_VG: d.logical_id[0]}
10058 for d in instance.disks]
10059 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10060 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
10062 # hvparams processing
10063 if self.op.hvparams:
10064 hv_type = instance.hypervisor
10065 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
10066 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
10067 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
10070 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
10071 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
10072 self.hv_new = hv_new # the new actual values
10073 self.hv_inst = i_hvdict # the new dict (without defaults)
10075 self.hv_new = self.hv_inst = {}
10077 # beparams processing
10078 if self.op.beparams:
10079 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
10081 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
10082 be_new = cluster.SimpleFillBE(i_bedict)
10083 self.be_new = be_new # the new actual values
10084 self.be_inst = i_bedict # the new dict (without defaults)
10086 self.be_new = self.be_inst = {}
10088 # osparams processing
10089 if self.op.osparams:
10090 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
10091 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
10092 self.os_inst = i_osdict # the new dict (without defaults)
10098 if constants.BE_MEMORY in self.op.beparams and not self.op.force:
10099 mem_check_list = [pnode]
10100 if be_new[constants.BE_AUTO_BALANCE]:
10101 # either we changed auto_balance to yes or it was from before
10102 mem_check_list.extend(instance.secondary_nodes)
10103 instance_info = self.rpc.call_instance_info(pnode, instance.name,
10104 instance.hypervisor)
10105 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
10106 instance.hypervisor)
10107 pninfo = nodeinfo[pnode]
10108 msg = pninfo.fail_msg
10110 # Assume the primary node is unreachable and go ahead
10111 self.warn.append("Can't get info from primary node %s: %s" %
10113 elif not isinstance(pninfo.payload.get('memory_free', None), int):
10114 self.warn.append("Node data from primary node %s doesn't contain"
10115 " free memory information" % pnode)
10116 elif instance_info.fail_msg:
10117 self.warn.append("Can't get instance runtime information: %s" %
10118 instance_info.fail_msg)
10120 if instance_info.payload:
10121 current_mem = int(instance_info.payload['memory'])
10123 # Assume instance not running
10124 # (there is a slight race condition here, but it's not very probable,
10125 # and we have no other way to check)
10127 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
10128 pninfo.payload['memory_free'])
10130 raise errors.OpPrereqError("This change will prevent the instance"
10131 " from starting, due to %d MB of memory"
10132 " missing on its primary node" % miss_mem,
10133 errors.ECODE_NORES)
10135 if be_new[constants.BE_AUTO_BALANCE]:
10136 for node, nres in nodeinfo.items():
10137 if node not in instance.secondary_nodes:
10139 msg = nres.fail_msg
10141 self.warn.append("Can't get info from secondary node %s: %s" %
10143 elif not isinstance(nres.payload.get('memory_free', None), int):
10144 self.warn.append("Secondary node %s didn't return free"
10145 " memory information" % node)
10146 elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
10147 self.warn.append("Not enough memory to failover instance to"
10148 " secondary node %s" % node)
10152 self.nic_pinst = {}
10153 for nic_op, nic_dict in self.op.nics:
10154 if nic_op == constants.DDM_REMOVE:
10155 if not instance.nics:
10156 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
10157 errors.ECODE_INVAL)
10159 if nic_op != constants.DDM_ADD:
10161 if not instance.nics:
10162 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
10163 " no NICs" % nic_op,
10164 errors.ECODE_INVAL)
10165 if nic_op < 0 or nic_op >= len(instance.nics):
10166 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
10168 (nic_op, len(instance.nics) - 1),
10169 errors.ECODE_INVAL)
10170 old_nic_params = instance.nics[nic_op].nicparams
10171 old_nic_ip = instance.nics[nic_op].ip
10173 old_nic_params = {}
10176 update_params_dict = dict([(key, nic_dict[key])
10177 for key in constants.NICS_PARAMETERS
10178 if key in nic_dict])
10180 if 'bridge' in nic_dict:
10181 update_params_dict[constants.NIC_LINK] = nic_dict['bridge']
10183 new_nic_params = _GetUpdatedParams(old_nic_params,
10184 update_params_dict)
10185 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
10186 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
10187 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
10188 self.nic_pinst[nic_op] = new_nic_params
10189 self.nic_pnew[nic_op] = new_filled_nic_params
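# In short: nic_pinst keeps only the parameters actually submitted (this is
# what Exec later stores on the instance), while nic_pnew is the same dict
# filled with cluster defaults via SimpleFillNIC and is what the mode/link
# checks below and the hooks environment use.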
10190 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
10192 if new_nic_mode == constants.NIC_MODE_BRIDGED:
10193 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
10194 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
10196 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
10198 self.warn.append(msg)
10200 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
10201 if new_nic_mode == constants.NIC_MODE_ROUTED:
10202 if constants.INIC_IP in nic_dict:
10203 nic_ip = nic_dict[constants.INIC_IP]
10205 nic_ip = old_nic_ip
10207 raise errors.OpPrereqError('Cannot set the nic ip to None'
10208 ' on a routed nic', errors.ECODE_INVAL)
10209 if constants.INIC_MAC in nic_dict:
10210 nic_mac = nic_dict[constants.INIC_MAC]
10211 if nic_mac is None:
10212 raise errors.OpPrereqError('Cannot set the nic mac to None',
10213 errors.ECODE_INVAL)
10214 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10215 # otherwise generate the mac
10216 nic_dict[constants.INIC_MAC] = \
10217 self.cfg.GenerateMAC(self.proc.GetECId())
10219 # or validate/reserve the current one
10221 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
10222 except errors.ReservationError:
10223 raise errors.OpPrereqError("MAC address %s already in use"
10224 " in cluster" % nic_mac,
10225 errors.ECODE_NOTUNIQUE)
10228 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
10229 raise errors.OpPrereqError("Disk operations not supported for"
10230 " diskless instances",
10231 errors.ECODE_INVAL)
10232 for disk_op, _ in self.op.disks:
10233 if disk_op == constants.DDM_REMOVE:
10234 if len(instance.disks) == 1:
10235 raise errors.OpPrereqError("Cannot remove the last disk of"
10236 " an instance", errors.ECODE_INVAL)
10237 _CheckInstanceDown(self, instance, "cannot remove disks")
10239 if (disk_op == constants.DDM_ADD and
10240 len(instance.disks) >= constants.MAX_DISKS):
10241 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
10242 " add more" % constants.MAX_DISKS,
10243 errors.ECODE_STATE)
10244 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
10246 if disk_op < 0 or disk_op >= len(instance.disks):
10247 raise errors.OpPrereqError("Invalid disk index %s, valid values"
10249 (disk_op, len(instance.disks)),
10250 errors.ECODE_INVAL)
10254 def _ConvertPlainToDrbd(self, feedback_fn):
10255 """Converts an instance from plain to drbd.
10258 feedback_fn("Converting template to drbd")
10259 instance = self.instance
10260 pnode = instance.primary_node
10261 snode = self.op.remote_node
10263 # create a fake disk info for _GenerateDiskTemplate
10264 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
10265 constants.IDISK_VG: d.logical_id[0]}
10266 for d in instance.disks]
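# Each fake entry mirrors what instance creation would pass in, for example
# {constants.IDISK_SIZE: 10240, constants.IDISK_MODE: constants.DISK_RDWR,
#  constants.IDISK_VG: "xenvg"} (size and VG name chosen only as an
# illustration); _GenerateDiskTemplate uses these to build the DRBD disks.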
10267 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
10268 instance.name, pnode, [snode],
10269 disk_info, None, None, 0, feedback_fn)
10270 info = _GetInstanceInfoText(instance)
10271 feedback_fn("Creating aditional volumes...")
10272 # first, create the missing data and meta devices
10273 for disk in new_disks:
10274 # unfortunately this is... not too nice
10275 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
10277 for child in disk.children:
10278 _CreateSingleBlockDev(self, snode, instance, child, info, True)
# at this stage, all new LVs have been created, we can rename the old ones
10281 feedback_fn("Renaming original volumes...")
10282 rename_list = [(o, n.children[0].logical_id)
10283 for (o, n) in zip(instance.disks, new_disks)]
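# The rename maps every existing data LV onto the name expected by the new
# DRBD disk's first child, so the freshly created DRBD devices can adopt the
# existing data in place instead of copying it.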
10284 result = self.rpc.call_blockdev_rename(pnode, rename_list)
10285 result.Raise("Failed to rename original LVs")
10287 feedback_fn("Initializing DRBD devices...")
10288 # all child devices are in place, we can now create the DRBD devices
10289 for disk in new_disks:
10290 for node in [pnode, snode]:
10291 f_create = node == pnode
10292 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
10294 # at this point, the instance has been modified
10295 instance.disk_template = constants.DT_DRBD8
10296 instance.disks = new_disks
10297 self.cfg.Update(instance, feedback_fn)
10299 # disks are created, waiting for sync
10300 disk_abort = not _WaitForSync(self, instance,
10301 oneshot=not self.op.wait_for_sync)
10303 raise errors.OpExecError("There are some degraded disks for"
10304 " this instance, please cleanup manually")
10306 def _ConvertDrbdToPlain(self, feedback_fn):
10307 """Converts an instance from drbd to plain.
10310 instance = self.instance
10311 assert len(instance.secondary_nodes) == 1
10312 pnode = instance.primary_node
10313 snode = instance.secondary_nodes[0]
10314 feedback_fn("Converting template to plain")
10316 old_disks = instance.disks
10317 new_disks = [d.children[0] for d in old_disks]
10319 # copy over size and mode
10320 for parent, child in zip(old_disks, new_disks):
10321 child.size = parent.size
10322 child.mode = parent.mode
10324 # update instance structure
10325 instance.disks = new_disks
10326 instance.disk_template = constants.DT_PLAIN
10327 self.cfg.Update(instance, feedback_fn)
10329 feedback_fn("Removing volumes on the secondary node...")
10330 for disk in old_disks:
10331 self.cfg.SetDiskID(disk, snode)
10332 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
10334 self.LogWarning("Could not remove block device %s on node %s,"
10335 " continuing anyway: %s", disk.iv_name, snode, msg)
10337 feedback_fn("Removing unneeded volumes on the primary node...")
10338 for idx, disk in enumerate(old_disks):
10339 meta = disk.children[1]
10340 self.cfg.SetDiskID(meta, pnode)
10341 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
10343 self.LogWarning("Could not remove metadata for disk %d on node %s,"
10344 " continuing anyway: %s", idx, pnode, msg)
10346 def Exec(self, feedback_fn):
10347 """Modifies an instance.
10349 All parameters take effect only at the next restart of the instance.
10352 # Process here the warnings from CheckPrereq, as we don't have a
10353 # feedback_fn there.
10354 for warn in self.warn:
10355 feedback_fn("WARNING: %s" % warn)
10358 instance = self.instance
10360 for disk_op, disk_dict in self.op.disks:
10361 if disk_op == constants.DDM_REMOVE:
10362 # remove the last disk
10363 device = instance.disks.pop()
10364 device_idx = len(instance.disks)
10365 for node, disk in device.ComputeNodeTree(instance.primary_node):
10366 self.cfg.SetDiskID(disk, node)
10367 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
10369 self.LogWarning("Could not remove disk/%d on node %s: %s,"
10370 " continuing anyway", device_idx, node, msg)
10371 result.append(("disk/%d" % device_idx, "remove"))
10372 elif disk_op == constants.DDM_ADD:
10374 if instance.disk_template in (constants.DT_FILE,
10375 constants.DT_SHARED_FILE):
10376 file_driver, file_path = instance.disks[0].logical_id
10377 file_path = os.path.dirname(file_path)
10379 file_driver = file_path = None
10380 disk_idx_base = len(instance.disks)
10381 new_disk = _GenerateDiskTemplate(self,
10382 instance.disk_template,
10383 instance.name, instance.primary_node,
10384 instance.secondary_nodes,
10388 disk_idx_base, feedback_fn)[0]
10389 instance.disks.append(new_disk)
10390 info = _GetInstanceInfoText(instance)
10392 logging.info("Creating volume %s for instance %s",
10393 new_disk.iv_name, instance.name)
10394 # Note: this needs to be kept in sync with _CreateDisks
10396 for node in instance.all_nodes:
10397 f_create = node == instance.primary_node
10399 _CreateBlockDev(self, node, instance, new_disk,
10400 f_create, info, f_create)
10401 except errors.OpExecError, err:
10402 self.LogWarning("Failed to create volume %s (%s) on"
10404 new_disk.iv_name, new_disk, node, err)
10405 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
10406 (new_disk.size, new_disk.mode)))
10408 # change a given disk
10409 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
10410 result.append(("disk.mode/%d" % disk_op,
10411 disk_dict[constants.IDISK_MODE]))
10413 if self.op.disk_template:
10414 r_shut = _ShutdownInstanceDisks(self, instance)
10416 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
10417 " proceed with disk template conversion")
10418 mode = (instance.disk_template, self.op.disk_template)
10420 self._DISK_CONVERSIONS[mode](self, feedback_fn)
10422 self.cfg.ReleaseDRBDMinors(instance.name)
10424 result.append(("disk_template", self.op.disk_template))
10427 for nic_op, nic_dict in self.op.nics:
10428 if nic_op == constants.DDM_REMOVE:
10429 # remove the last nic
10430 del instance.nics[-1]
10431 result.append(("nic.%d" % len(instance.nics), "remove"))
10432 elif nic_op == constants.DDM_ADD:
# mac and bridge should be set by now
10434 mac = nic_dict[constants.INIC_MAC]
10435 ip = nic_dict.get(constants.INIC_IP, None)
10436 nicparams = self.nic_pinst[constants.DDM_ADD]
10437 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
10438 instance.nics.append(new_nic)
10439 result.append(("nic.%d" % (len(instance.nics) - 1),
10440 "add:mac=%s,ip=%s,mode=%s,link=%s" %
10441 (new_nic.mac, new_nic.ip,
10442 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
10443 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
10446 for key in (constants.INIC_MAC, constants.INIC_IP):
10447 if key in nic_dict:
10448 setattr(instance.nics[nic_op], key, nic_dict[key])
10449 if nic_op in self.nic_pinst:
10450 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
10451 for key, val in nic_dict.iteritems():
10452 result.append(("nic.%s/%d" % (key, nic_op), val))
10455 if self.op.hvparams:
10456 instance.hvparams = self.hv_inst
10457 for key, val in self.op.hvparams.iteritems():
10458 result.append(("hv/%s" % key, val))
10461 if self.op.beparams:
10462 instance.beparams = self.be_inst
10463 for key, val in self.op.beparams.iteritems():
10464 result.append(("be/%s" % key, val))
10467 if self.op.os_name:
10468 instance.os = self.op.os_name
10471 if self.op.osparams:
10472 instance.osparams = self.os_inst
10473 for key, val in self.op.osparams.iteritems():
10474 result.append(("os/%s" % key, val))
10476 self.cfg.Update(instance, feedback_fn)
10480 _DISK_CONVERSIONS = {
10481 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
10482 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
10486 class LUBackupQuery(NoHooksLU):
10487 """Query the exports list
10492 def ExpandNames(self):
10493 self.needed_locks = {}
10494 self.share_locks[locking.LEVEL_NODE] = 1
10495 if not self.op.nodes:
10496 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10498 self.needed_locks[locking.LEVEL_NODE] = \
10499 _GetWantedNodes(self, self.op.nodes)
10501 def Exec(self, feedback_fn):
10502 """Compute the list of all the exported system images.
10505 @return: a dictionary with the structure node->(export-list)
where export-list is a list of the instances exported on that node
10510 self.nodes = self.glm.list_owned(locking.LEVEL_NODE)
10511 rpcresult = self.rpc.call_export_list(self.nodes)
10513 for node in rpcresult:
10514 if rpcresult[node].fail_msg:
10515 result[node] = False
10517 result[node] = rpcresult[node].payload
10522 class LUBackupPrepare(NoHooksLU):
10523 """Prepares an instance for an export and returns useful information.
10528 def ExpandNames(self):
10529 self._ExpandAndLockInstance()
10531 def CheckPrereq(self):
10532 """Check prerequisites.
10535 instance_name = self.op.instance_name
10537 self.instance = self.cfg.GetInstanceInfo(instance_name)
10538 assert self.instance is not None, \
10539 "Cannot retrieve locked instance %s" % self.op.instance_name
10540 _CheckNodeOnline(self, self.instance.primary_node)
10542 self._cds = _GetClusterDomainSecret()
10544 def Exec(self, feedback_fn):
10545 """Prepares an instance for an export.
10548 instance = self.instance
10550 if self.op.mode == constants.EXPORT_MODE_REMOTE:
10551 salt = utils.GenerateSecret(8)
10553 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
10554 result = self.rpc.call_x509_cert_create(instance.primary_node,
10555 constants.RIE_CERT_VALIDITY)
10556 result.Raise("Can't create X509 key and certificate on %s" % result.node)
10558 (name, cert_pem) = result.payload
10560 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
10564 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
10565 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
10567 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
10573 class LUBackupExport(LogicalUnit):
10574 """Export an instance to an image in the cluster.
10577 HPATH = "instance-export"
10578 HTYPE = constants.HTYPE_INSTANCE
10581 def CheckArguments(self):
10582 """Check the arguments.
10585 self.x509_key_name = self.op.x509_key_name
10586 self.dest_x509_ca_pem = self.op.destination_x509_ca
10588 if self.op.mode == constants.EXPORT_MODE_REMOTE:
10589 if not self.x509_key_name:
10590 raise errors.OpPrereqError("Missing X509 key name for encryption",
10591 errors.ECODE_INVAL)
10593 if not self.dest_x509_ca_pem:
10594 raise errors.OpPrereqError("Missing destination X509 CA",
10595 errors.ECODE_INVAL)
10597 def ExpandNames(self):
10598 self._ExpandAndLockInstance()
10600 # Lock all nodes for local exports
10601 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10602 # FIXME: lock only instance primary and destination node
# Sad but true, for now we have to lock all nodes, as we don't know where
10605 # the previous export might be, and in this LU we search for it and
10606 # remove it from its current node. In the future we could fix this by:
10607 # - making a tasklet to search (share-lock all), then create the
10608 # new one, then one to remove, after
10609 # - removing the removal operation altogether
10610 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10612 def DeclareLocks(self, level):
10613 """Last minute lock declaration."""
10614 # All nodes are locked anyway, so nothing to do here.
10616 def BuildHooksEnv(self):
10617 """Build hooks env.
10619 This will run on the master, primary node and target node.
10623 "EXPORT_MODE": self.op.mode,
10624 "EXPORT_NODE": self.op.target_node,
10625 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
10626 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
10627 # TODO: Generic function for boolean env variables
10628 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
10631 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10635 def BuildHooksNodes(self):
10636 """Build hooks nodes.
10639 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
10641 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10642 nl.append(self.op.target_node)
10646 def CheckPrereq(self):
10647 """Check prerequisites.
10649 This checks that the instance and node names are valid.
10652 instance_name = self.op.instance_name
10654 self.instance = self.cfg.GetInstanceInfo(instance_name)
10655 assert self.instance is not None, \
10656 "Cannot retrieve locked instance %s" % self.op.instance_name
10657 _CheckNodeOnline(self, self.instance.primary_node)
10659 if (self.op.remove_instance and self.instance.admin_up and
10660 not self.op.shutdown):
10661 raise errors.OpPrereqError("Can not remove instance without shutting it"
10664 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10665 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
10666 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
10667 assert self.dst_node is not None
10669 _CheckNodeOnline(self, self.dst_node.name)
10670 _CheckNodeNotDrained(self, self.dst_node.name)
10673 self.dest_disk_info = None
10674 self.dest_x509_ca = None
10676 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
10677 self.dst_node = None
10679 if len(self.op.target_node) != len(self.instance.disks):
10680 raise errors.OpPrereqError(("Received destination information for %s"
10681 " disks, but instance %s has %s disks") %
10682 (len(self.op.target_node), instance_name,
10683 len(self.instance.disks)),
10684 errors.ECODE_INVAL)
10686 cds = _GetClusterDomainSecret()
10688 # Check X509 key name
10690 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
10691 except (TypeError, ValueError), err:
10692 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
10694 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
10695 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
10696 errors.ECODE_INVAL)
10698 # Load and verify CA
10700 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
10701 except OpenSSL.crypto.Error, err:
10702 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
10703 (err, ), errors.ECODE_INVAL)
10705 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
10706 if errcode is not None:
10707 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
10708 (msg, ), errors.ECODE_INVAL)
10710 self.dest_x509_ca = cert
10712 # Verify target information
10714 for idx, disk_data in enumerate(self.op.target_node):
10716 (host, port, magic) = \
10717 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
10718 except errors.GenericError, err:
10719 raise errors.OpPrereqError("Target info for disk %s: %s" %
10720 (idx, err), errors.ECODE_INVAL)
10722 disk_info.append((host, port, magic))
10724 assert len(disk_info) == len(self.op.target_node)
10725 self.dest_disk_info = disk_info
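# At this point dest_disk_info holds one (host, port, magic) tuple per
# instance disk, as checked by CheckRemoteExportDiskInfo above; a single-disk
# export might end up with [("dest.example.com", 11000, "magic123")], where
# host, port and magic are purely illustrative values.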
10728 raise errors.ProgrammerError("Unhandled export mode %r" %
10731 # instance disk type verification
10732 # TODO: Implement export support for file-based disks
10733 for disk in self.instance.disks:
10734 if disk.dev_type == constants.LD_FILE:
10735 raise errors.OpPrereqError("Export not supported for instances with"
10736 " file-based disks", errors.ECODE_INVAL)
10738 def _CleanupExports(self, feedback_fn):
10739 """Removes exports of current instance from all other nodes.
10741 If an instance in a cluster with nodes A..D was exported to node C, its
10742 exports will be removed from the nodes A, B and D.
10745 assert self.op.mode != constants.EXPORT_MODE_REMOTE
10747 nodelist = self.cfg.GetNodeList()
10748 nodelist.remove(self.dst_node.name)
10750 # on one-node clusters nodelist will be empty after the removal
# if we proceed, the backup would be removed because OpBackupQuery
10752 # substitutes an empty list with the full cluster node list.
10753 iname = self.instance.name
10755 feedback_fn("Removing old exports for instance %s" % iname)
10756 exportlist = self.rpc.call_export_list(nodelist)
10757 for node in exportlist:
10758 if exportlist[node].fail_msg:
10760 if iname in exportlist[node].payload:
10761 msg = self.rpc.call_export_remove(node, iname).fail_msg
10763 self.LogWarning("Could not remove older export for instance %s"
10764 " on node %s: %s", iname, node, msg)
10766 def Exec(self, feedback_fn):
10767 """Export an instance to an image in the cluster.
10770 assert self.op.mode in constants.EXPORT_MODES
10772 instance = self.instance
10773 src_node = instance.primary_node
10775 if self.op.shutdown:
10776 # shutdown the instance, but not the disks
10777 feedback_fn("Shutting down instance %s" % instance.name)
10778 result = self.rpc.call_instance_shutdown(src_node, instance,
10779 self.op.shutdown_timeout)
10780 # TODO: Maybe ignore failures if ignore_remove_failures is set
10781 result.Raise("Could not shutdown instance %s on"
10782 " node %s" % (instance.name, src_node))
10784 # set the disks ID correctly since call_instance_start needs the
10785 # correct drbd minor to create the symlinks
10786 for disk in instance.disks:
10787 self.cfg.SetDiskID(disk, src_node)
10789 activate_disks = (not instance.admin_up)
# Activate the instance disks if we're exporting a stopped instance
10793 feedback_fn("Activating disks for %s" % instance.name)
10794 _StartInstanceDisks(self, instance, None)
10797 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
10800 helper.CreateSnapshots()
10802 if (self.op.shutdown and instance.admin_up and
10803 not self.op.remove_instance):
10804 assert not activate_disks
10805 feedback_fn("Starting instance %s" % instance.name)
10806 result = self.rpc.call_instance_start(src_node, instance, None, None)
10807 msg = result.fail_msg
10809 feedback_fn("Failed to start instance: %s" % msg)
10810 _ShutdownInstanceDisks(self, instance)
10811 raise errors.OpExecError("Could not start instance: %s" % msg)
10813 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10814 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
10815 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
10816 connect_timeout = constants.RIE_CONNECT_TIMEOUT
10817 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
10819 (key_name, _, _) = self.x509_key_name
10822 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
10825 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
10826 key_name, dest_ca_pem,
10831 # Check for backwards compatibility
10832 assert len(dresults) == len(instance.disks)
10833 assert compat.all(isinstance(i, bool) for i in dresults), \
10834 "Not all results are boolean: %r" % dresults
10838 feedback_fn("Deactivating disks for %s" % instance.name)
10839 _ShutdownInstanceDisks(self, instance)
10841 if not (compat.all(dresults) and fin_resu):
10844 failures.append("export finalization")
10845 if not compat.all(dresults):
10846 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
10848 failures.append("disk export: disk(s) %s" % fdsk)
10850 raise errors.OpExecError("Export failed, errors in %s" %
10851 utils.CommaJoin(failures))
10853 # At this point, the export was successful, we can cleanup/finish
10855 # Remove instance if requested
10856 if self.op.remove_instance:
10857 feedback_fn("Removing instance %s" % instance.name)
10858 _RemoveInstance(self, feedback_fn, instance,
10859 self.op.ignore_remove_failures)
10861 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10862 self._CleanupExports(feedback_fn)
10864 return fin_resu, dresults
10867 class LUBackupRemove(NoHooksLU):
10868 """Remove exports related to the named instance.
10873 def ExpandNames(self):
10874 self.needed_locks = {}
# We need all nodes to be locked in order for RemoveExport to work, but we
# don't need to lock the instance itself, as nothing will happen to it (and
# we can also remove exports for an already-removed instance)
10878 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10880 def Exec(self, feedback_fn):
10881 """Remove any export.
10884 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
10885 # If the instance was not found we'll try with the name that was passed in.
10886 # This will only work if it was an FQDN, though.
10888 if not instance_name:
10890 instance_name = self.op.instance_name
10892 locked_nodes = self.glm.list_owned(locking.LEVEL_NODE)
10893 exportlist = self.rpc.call_export_list(locked_nodes)
10895 for node in exportlist:
10896 msg = exportlist[node].fail_msg
10898 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
10900 if instance_name in exportlist[node].payload:
10902 result = self.rpc.call_export_remove(node, instance_name)
10903 msg = result.fail_msg
10905 logging.error("Could not remove export for instance %s"
10906 " on node %s: %s", instance_name, node, msg)
10908 if fqdn_warn and not found:
10909 feedback_fn("Export not found. If trying to remove an export belonging"
10910 " to a deleted instance please use its Fully Qualified"
10914 class LUGroupAdd(LogicalUnit):
10915 """Logical unit for creating node groups.
10918 HPATH = "group-add"
10919 HTYPE = constants.HTYPE_GROUP
10922 def ExpandNames(self):
10923 # We need the new group's UUID here so that we can create and acquire the
10924 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
10925 # that it should not check whether the UUID exists in the configuration.
10926 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
10927 self.needed_locks = {}
10928 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
10930 def CheckPrereq(self):
10931 """Check prerequisites.
10933 This checks that the given group name is not an existing node group
10938 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10939 except errors.OpPrereqError:
10942 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
10943 " node group (UUID: %s)" %
10944 (self.op.group_name, existing_uuid),
10945 errors.ECODE_EXISTS)
10947 if self.op.ndparams:
10948 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
10950 def BuildHooksEnv(self):
10951 """Build hooks env.
10955 "GROUP_NAME": self.op.group_name,
10958 def BuildHooksNodes(self):
10959 """Build hooks nodes.
10962 mn = self.cfg.GetMasterNode()
10963 return ([mn], [mn])
10965 def Exec(self, feedback_fn):
10966 """Add the node group to the cluster.
10969 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
10970 uuid=self.group_uuid,
10971 alloc_policy=self.op.alloc_policy,
10972 ndparams=self.op.ndparams)
10974 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
10975 del self.remove_locks[locking.LEVEL_NODEGROUP]
10978 class LUGroupAssignNodes(NoHooksLU):
10979 """Logical unit for assigning nodes to groups.
10984 def ExpandNames(self):
10985 # These raise errors.OpPrereqError on their own:
10986 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10987 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
10989 # We want to lock all the affected nodes and groups. We have readily
10990 # available the list of nodes, and the *destination* group. To gather the
10991 # list of "source" groups, we need to fetch node information later on.
10992 self.needed_locks = {
10993 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
10994 locking.LEVEL_NODE: self.op.nodes,
10997 def DeclareLocks(self, level):
10998 if level == locking.LEVEL_NODEGROUP:
10999 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
11001 # Try to get all affected nodes' groups without having the group or node
11002 # lock yet. Needs verification later in the code flow.
11003 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
11005 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
11007 def CheckPrereq(self):
11008 """Check prerequisites.
11011 assert self.needed_locks[locking.LEVEL_NODEGROUP]
11012 assert (frozenset(self.glm.list_owned(locking.LEVEL_NODE)) ==
11013 frozenset(self.op.nodes))
11015 expected_locks = (set([self.group_uuid]) |
11016 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
11017 actual_locks = self.glm.list_owned(locking.LEVEL_NODEGROUP)
11018 if actual_locks != expected_locks:
11019 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
11020 " current groups are '%s', used to be '%s'" %
11021 (utils.CommaJoin(expected_locks),
11022 utils.CommaJoin(actual_locks)))
11024 self.node_data = self.cfg.GetAllNodesInfo()
11025 self.group = self.cfg.GetNodeGroup(self.group_uuid)
11026 instance_data = self.cfg.GetAllInstancesInfo()
11028 if self.group is None:
11029 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
11030 (self.op.group_name, self.group_uuid))
11032 (new_splits, previous_splits) = \
11033 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
11034 for node in self.op.nodes],
11035 self.node_data, instance_data)
11038 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
11040 if not self.op.force:
11041 raise errors.OpExecError("The following instances get split by this"
11042 " change and --force was not given: %s" %
11045 self.LogWarning("This operation will split the following instances: %s",
11048 if previous_splits:
11049 self.LogWarning("In addition, these already-split instances continue"
11050 " to be split across groups: %s",
11051 utils.CommaJoin(utils.NiceSort(previous_splits)))
11053 def Exec(self, feedback_fn):
11054 """Assign nodes to a new group.
11057 for node in self.op.nodes:
11058 self.node_data[node].group = self.group_uuid
11060 # FIXME: Depends on side-effects of modifying the result of
11061 # C{cfg.GetAllNodesInfo}
11063 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
11066 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
11067 """Check for split instances after a node assignment.
11069 This method considers a series of node assignments as an atomic operation,
and returns information about split instances after applying the set of
changes.
11073 In particular, it returns information about newly split instances, and
11074 instances that were already split, and remain so after the change.
Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
considered.
11079 @type changes: list of (node_name, new_group_uuid) pairs.
11080 @param changes: list of node assignments to consider.
11081 @param node_data: a dict with data for all nodes
11082 @param instance_data: a dict with all instances to consider
11083 @rtype: a two-tuple
@return: a list of instances that were previously okay and become split as a
consequence of this change, and a list of instances that were previously
split and that this change does not fix.
11089 changed_nodes = dict((node, group) for node, group in changes
11090 if node_data[node].group != group)
11092 all_split_instances = set()
11093 previously_split_instances = set()
11095 def InstanceNodes(instance):
11096 return [instance.primary_node] + list(instance.secondary_nodes)
11098 for inst in instance_data.values():
11099 if inst.disk_template not in constants.DTS_INT_MIRROR:
11102 instance_nodes = InstanceNodes(inst)
11104 if len(set(node_data[node].group for node in instance_nodes)) > 1:
11105 previously_split_instances.add(inst.name)
11107 if len(set(changed_nodes.get(node, node_data[node].group)
11108 for node in instance_nodes)) > 1:
11109 all_split_instances.add(inst.name)
11111 return (list(all_split_instances - previously_split_instances),
11112 list(previously_split_instances & all_split_instances))
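# Illustrative sketch (not part of the LU): the two lists returned by
# CheckAssignmentForSplitInstances. Names below are invented; assume "inst1"
# is DRBD-mirrored across nodeA and nodeB, both currently in the same group.
#
#   changes = [("nodeA", "group2-uuid")]   # move only one of the two nodes
#   (new_splits, previous_splits) = \
#       LUGroupAssignNodes.CheckAssignmentForSplitInstances(
#           changes, node_data, instance_data)
#   # new_splits would contain "inst1" (it becomes split by this change);
#   # previous_splits lists instances that were already split and stay split.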
11115 class _GroupQuery(_QueryBase):
11116 FIELDS = query.GROUP_FIELDS
11118 def ExpandNames(self, lu):
11119 lu.needed_locks = {}
11121 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
11122 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
11125 self.wanted = [name_to_uuid[name]
11126 for name in utils.NiceSort(name_to_uuid.keys())]
11128 # Accept the given names as either group names or UUIDs.
11131 all_uuid = frozenset(self._all_groups.keys())
11133 for name in self.names:
11134 if name in all_uuid:
11135 self.wanted.append(name)
11136 elif name in name_to_uuid:
11137 self.wanted.append(name_to_uuid[name])
11139 missing.append(name)
11142 raise errors.OpPrereqError("Some groups do not exist: %s" %
11143 utils.CommaJoin(missing),
11144 errors.ECODE_NOENT)
11146 def DeclareLocks(self, lu, level):
11149 def _GetQueryData(self, lu):
11150 """Computes the list of node groups and their attributes.
11153 do_nodes = query.GQ_NODE in self.requested_data
11154 do_instances = query.GQ_INST in self.requested_data
11156 group_to_nodes = None
11157 group_to_instances = None
11159 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
11160 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
11161 # latter GetAllInstancesInfo() is not enough, for we have to go through
11162 # instance->node. Hence, we will need to process nodes even if we only need
11163 # instance information.
11164 if do_nodes or do_instances:
11165 all_nodes = lu.cfg.GetAllNodesInfo()
11166 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
11169 for node in all_nodes.values():
11170 if node.group in group_to_nodes:
11171 group_to_nodes[node.group].append(node.name)
11172 node_to_group[node.name] = node.group
11175 all_instances = lu.cfg.GetAllInstancesInfo()
11176 group_to_instances = dict((uuid, []) for uuid in self.wanted)
11178 for instance in all_instances.values():
11179 node = instance.primary_node
11180 if node in node_to_group:
11181 group_to_instances[node_to_group[node]].append(instance.name)
11184 # Do not pass on node information if it was not requested.
11185 group_to_nodes = None
11187 return query.GroupQueryData([self._all_groups[uuid]
11188 for uuid in self.wanted],
11189 group_to_nodes, group_to_instances)
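# Rough sketch of the mappings built above (group UUID and names invented for
# illustration): with GQ_NODE and GQ_INST requested, the dicts handed to
# query.GroupQueryData would look roughly like
#
#   group_to_nodes     = {"g1-uuid": ["node1", "node2"]}
#   group_to_instances = {"g1-uuid": ["inst1"]}   # keyed by the primary node's group
#
# and either mapping stays None when the corresponding data was not requested.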
11192 class LUGroupQuery(NoHooksLU):
11193 """Logical unit for querying node groups.
11198 def CheckArguments(self):
11199 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
11200 self.op.output_fields, False)
11202 def ExpandNames(self):
11203 self.gq.ExpandNames(self)
11205 def Exec(self, feedback_fn):
11206 return self.gq.OldStyleQuery(self)
11209 class LUGroupSetParams(LogicalUnit):
11210 """Modifies the parameters of a node group.
11213 HPATH = "group-modify"
11214 HTYPE = constants.HTYPE_GROUP
11217 def CheckArguments(self):
11220 self.op.alloc_policy,
11223 if all_changes.count(None) == len(all_changes):
11224 raise errors.OpPrereqError("Please pass at least one modification",
11225 errors.ECODE_INVAL)
11227 def ExpandNames(self):
11228 # This raises errors.OpPrereqError on its own:
11229 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11231 self.needed_locks = {
11232 locking.LEVEL_NODEGROUP: [self.group_uuid],
11235 def CheckPrereq(self):
11236 """Check prerequisites.
11239 self.group = self.cfg.GetNodeGroup(self.group_uuid)
11241 if self.group is None:
11242 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
11243 (self.op.group_name, self.group_uuid))
11245 if self.op.ndparams:
11246 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
11247 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
11248 self.new_ndparams = new_ndparams
11250 def BuildHooksEnv(self):
11251 """Build hooks env.
11255 "GROUP_NAME": self.op.group_name,
11256 "NEW_ALLOC_POLICY": self.op.alloc_policy,
11259 def BuildHooksNodes(self):
11260 """Build hooks nodes.
11263 mn = self.cfg.GetMasterNode()
11264 return ([mn], [mn])
11266 def Exec(self, feedback_fn):
11267 """Modifies the node group.
11272 if self.op.ndparams:
11273 self.group.ndparams = self.new_ndparams
11274 result.append(("ndparams", str(self.group.ndparams)))
11276 if self.op.alloc_policy:
11277 self.group.alloc_policy = self.op.alloc_policy
11279 self.cfg.Update(self.group, feedback_fn)
11284 class LUGroupRemove(LogicalUnit):
11285 HPATH = "group-remove"
11286 HTYPE = constants.HTYPE_GROUP
11289 def ExpandNames(self):
11290 # This raises errors.OpPrereqError on its own:
11291 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11292 self.needed_locks = {
11293 locking.LEVEL_NODEGROUP: [self.group_uuid],
11296 def CheckPrereq(self):
11297 """Check prerequisites.
11299 This checks that the given group name exists as a node group, that it is
11300 empty (i.e., contains no nodes), and that it is not the last group of the
11304 # Verify that the group is empty.
11305 group_nodes = [node.name
11306 for node in self.cfg.GetAllNodesInfo().values()
11307 if node.group == self.group_uuid]
11310 raise errors.OpPrereqError("Group '%s' not empty, has the following"
11312 (self.op.group_name,
11313 utils.CommaJoin(utils.NiceSort(group_nodes))),
11314 errors.ECODE_STATE)
11316 # Verify the cluster would not be left group-less.
11317 if len(self.cfg.GetNodeGroupList()) == 1:
11318 raise errors.OpPrereqError("Group '%s' is the only group,"
11319 " cannot be removed" %
11320 self.op.group_name,
11321 errors.ECODE_STATE)
11323 def BuildHooksEnv(self):
11324 """Build hooks env.
11328 "GROUP_NAME": self.op.group_name,
11331 def BuildHooksNodes(self):
11332 """Build hooks nodes.
11335 mn = self.cfg.GetMasterNode()
11336 return ([mn], [mn])
11338 def Exec(self, feedback_fn):
11339 """Remove the node group.
11343 self.cfg.RemoveNodeGroup(self.group_uuid)
11344 except errors.ConfigurationError:
11345 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
11346 (self.op.group_name, self.group_uuid))
11348 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
11351 class LUGroupRename(LogicalUnit):
11352 HPATH = "group-rename"
11353 HTYPE = constants.HTYPE_GROUP
11356 def ExpandNames(self):
11357 # This raises errors.OpPrereqError on its own:
11358 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11360 self.needed_locks = {
11361 locking.LEVEL_NODEGROUP: [self.group_uuid],
11364 def CheckPrereq(self):
11365 """Check prerequisites.
11367 Ensures the requested new name is not yet used.
11371 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
11372 except errors.OpPrereqError:
11375 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
11376 " node group (UUID: %s)" %
11377 (self.op.new_name, new_name_uuid),
11378 errors.ECODE_EXISTS)
11380 def BuildHooksEnv(self):
11381 """Build hooks env.
11385 "OLD_NAME": self.op.group_name,
11386 "NEW_NAME": self.op.new_name,
11389 def BuildHooksNodes(self):
11390 """Build hooks nodes.
11393 mn = self.cfg.GetMasterNode()
11395 all_nodes = self.cfg.GetAllNodesInfo()
11396 all_nodes.pop(mn, None)
11399 run_nodes.extend(node.name for node in all_nodes.values()
11400 if node.group == self.group_uuid)
11402 return (run_nodes, run_nodes)
11404 def Exec(self, feedback_fn):
11405 """Rename the node group.
11408 group = self.cfg.GetNodeGroup(self.group_uuid)
11411 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
11412 (self.op.group_name, self.group_uuid))
11414 group.name = self.op.new_name
11415 self.cfg.Update(group, feedback_fn)
11417 return self.op.new_name
11420 class TagsLU(NoHooksLU): # pylint: disable-msg=W0223
11421 """Generic tags LU.
11423 This is an abstract class which is the parent of all the other tags LUs.
11426 def ExpandNames(self):
11427 self.group_uuid = None
11428 self.needed_locks = {}
11429 if self.op.kind == constants.TAG_NODE:
11430 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
11431 self.needed_locks[locking.LEVEL_NODE] = self.op.name
11432 elif self.op.kind == constants.TAG_INSTANCE:
11433 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
11434 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
11435 elif self.op.kind == constants.TAG_NODEGROUP:
11436 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
11438 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
11439 # not possible to acquire the BGL based on opcode parameters)
11441 def CheckPrereq(self):
11442 """Check prerequisites.
11445 if self.op.kind == constants.TAG_CLUSTER:
11446 self.target = self.cfg.GetClusterInfo()
11447 elif self.op.kind == constants.TAG_NODE:
11448 self.target = self.cfg.GetNodeInfo(self.op.name)
11449 elif self.op.kind == constants.TAG_INSTANCE:
11450 self.target = self.cfg.GetInstanceInfo(self.op.name)
11451 elif self.op.kind == constants.TAG_NODEGROUP:
11452 self.target = self.cfg.GetNodeGroup(self.group_uuid)
11454 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
11455 str(self.op.kind), errors.ECODE_INVAL)
11458 class LUTagsGet(TagsLU):
11459 """Returns the tags of a given object.
11464 def ExpandNames(self):
11465 TagsLU.ExpandNames(self)
11467 # Share locks as this is only a read operation
11468 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
11470 def Exec(self, feedback_fn):
11471 """Returns the tag list.
11474 return list(self.target.GetTags())
11477 class LUTagsSearch(NoHooksLU):
11478 """Searches the tags for a given pattern.
11483 def ExpandNames(self):
11484 self.needed_locks = {}
11486 def CheckPrereq(self):
11487 """Check prerequisites.
11489 This checks the passed pattern for validity by compiling it.
11493 self.re = re.compile(self.op.pattern)
11494 except re.error, err:
11495 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
11496 (self.op.pattern, err), errors.ECODE_INVAL)
11498 def Exec(self, feedback_fn):
11499 """Returns the tag list.
11503 tgts = [("/cluster", cfg.GetClusterInfo())]
11504 ilist = cfg.GetAllInstancesInfo().values()
11505 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
11506 nlist = cfg.GetAllNodesInfo().values()
11507 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
11508 tgts.extend(("/nodegroup/%s" % n.name, n)
11509 for n in cfg.GetAllNodeGroupsInfo().values())
11511 for path, target in tgts:
11512 for tag in target.GetTags():
11513 if self.re.search(tag):
11514 results.append((path, tag))
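# Hedged example of the search output (paths and tags invented): each match is
# reported as a (path, tag) pair, e.g.
#
#   [("/cluster", "environment:prod"), ("/instances/inst1.example.com", "web")]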
11518 class LUTagsSet(TagsLU):
11519 """Sets a tag on a given object.
11524 def CheckPrereq(self):
11525 """Check prerequisites.
11527 This checks the type and length of the tag name and value.
11530 TagsLU.CheckPrereq(self)
11531 for tag in self.op.tags:
11532 objects.TaggableObject.ValidateTag(tag)
11534 def Exec(self, feedback_fn):
11539 for tag in self.op.tags:
11540 self.target.AddTag(tag)
11541 except errors.TagError, err:
11542 raise errors.OpExecError("Error while setting tag: %s" % str(err))
11543 self.cfg.Update(self.target, feedback_fn)
11546 class LUTagsDel(TagsLU):
11547 """Delete a list of tags from a given object.
11552 def CheckPrereq(self):
11553 """Check prerequisites.
11555 This checks that we have the given tag.
11558 TagsLU.CheckPrereq(self)
11559 for tag in self.op.tags:
11560 objects.TaggableObject.ValidateTag(tag)
11561 del_tags = frozenset(self.op.tags)
11562 cur_tags = self.target.GetTags()
11564 diff_tags = del_tags - cur_tags
11566 diff_names = ("'%s'" % i for i in sorted(diff_tags))
11567 raise errors.OpPrereqError("Tag(s) %s not found" %
11568 (utils.CommaJoin(diff_names), ),
11569 errors.ECODE_NOENT)
11571 def Exec(self, feedback_fn):
11572 """Remove the tag from the object.
11575 for tag in self.op.tags:
11576 self.target.RemoveTag(tag)
11577 self.cfg.Update(self.target, feedback_fn)
11580 class LUTestDelay(NoHooksLU):
11581 """Sleep for a specified amount of time.
11583 This LU sleeps on the master and/or nodes for a specified amount of
11589 def ExpandNames(self):
11590 """Expand names and set required locks.
11592 This expands the node list, if any.
11595 self.needed_locks = {}
11596 if self.op.on_nodes:
11597 # _GetWantedNodes can be used here, but is not always appropriate to use
11598 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
11599 # more information.
11600 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
11601 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
11603 def _TestDelay(self):
11604 """Do the actual sleep.
11607 if self.op.on_master:
11608 if not utils.TestDelay(self.op.duration):
11609 raise errors.OpExecError("Error during master delay test")
11610 if self.op.on_nodes:
11611 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
11612 for node, node_result in result.items():
11613 node_result.Raise("Failure during rpc call to node %s" % node)
11615 def Exec(self, feedback_fn):
11616 """Execute the test delay opcode, with the wanted repetitions.
11619 if self.op.repeat == 0:
11622 top_value = self.op.repeat - 1
11623 for i in range(self.op.repeat):
11624 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
11628 class LUTestJqueue(NoHooksLU):
11629 """Utility LU to test some aspects of the job queue.
11634 # Must be lower than default timeout for WaitForJobChange to see whether it
11635 # notices changed jobs
11636 _CLIENT_CONNECT_TIMEOUT = 20.0
11637 _CLIENT_CONFIRM_TIMEOUT = 60.0
11640 def _NotifyUsingSocket(cls, cb, errcls):
11641 """Opens a Unix socket and waits for another program to connect.
11644 @param cb: Callback to send socket name to client
11645 @type errcls: class
11646 @param errcls: Exception class to use for errors
11649 # Using a temporary directory as there's no easy way to create temporary
11650 # sockets without writing a custom loop around tempfile.mktemp and
11652 tmpdir = tempfile.mkdtemp()
11654 tmpsock = utils.PathJoin(tmpdir, "sock")
11656 logging.debug("Creating temporary socket at %s", tmpsock)
11657 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
11662 # Send details to client
11665 # Wait for client to connect before continuing
11666 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
11668 (conn, _) = sock.accept()
11669 except socket.error, err:
11670 raise errcls("Client didn't connect in time (%s)" % err)
11674 # Remove as soon as client is connected
11675 shutil.rmtree(tmpdir)
11677 # Wait for client to close
11680 # pylint: disable-msg=E1101
11681 # Instance of '_socketobject' has no ... member
11682 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
11684 except socket.error, err:
11685 raise errcls("Client failed to confirm notification (%s)" % err)
11689 def _SendNotification(self, test, arg, sockname):
11690 """Sends a notification to the client.
11693 @param test: Test name
11694 @param arg: Test argument (depends on test)
11695 @type sockname: string
11696 @param sockname: Socket path
11699 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
11701 def _Notify(self, prereq, test, arg):
11702 """Notifies the client of a test.
11705 @param prereq: Whether this is a prereq-phase test
11707 @param test: Test name
11708 @param arg: Test argument (depends on test)
11712 errcls = errors.OpPrereqError
11714 errcls = errors.OpExecError
11716 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
11720 def CheckArguments(self):
11721 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
11722 self.expandnames_calls = 0
11724 def ExpandNames(self):
11725 checkargs_calls = getattr(self, "checkargs_calls", 0)
11726 if checkargs_calls < 1:
11727 raise errors.ProgrammerError("CheckArguments was not called")
11729 self.expandnames_calls += 1
11731 if self.op.notify_waitlock:
11732 self._Notify(True, constants.JQT_EXPANDNAMES, None)
11734 self.LogInfo("Expanding names")
11736 # Get lock on master node (just to get a lock, not for a particular reason)
11737 self.needed_locks = {
11738 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
11741 def Exec(self, feedback_fn):
11742 if self.expandnames_calls < 1:
11743 raise errors.ProgrammerError("ExpandNames was not called")
11745 if self.op.notify_exec:
11746 self._Notify(False, constants.JQT_EXEC, None)
11748 self.LogInfo("Executing")
11750 if self.op.log_messages:
11751 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
11752 for idx, msg in enumerate(self.op.log_messages):
11753 self.LogInfo("Sending log message %s", idx + 1)
11754 feedback_fn(constants.JQT_MSGPREFIX + msg)
11755 # Report how many test messages have been sent
11756 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
11759 raise errors.OpExecError("Opcode failure was requested")
11764 class IAllocator(object):
11765 """IAllocator framework.
11767 An IAllocator instance has four sets of attributes:
11768 - cfg that is needed to query the cluster
11769 - input data (all members of the _KEYS class attribute are required)
11770 - four buffer attributes (in_data, in_text, out_data, out_text) that
11771 represent the input to the external script, in both data structure and
11772 text format, and its output, again in both formats
11773 - the result variables from the script (success, info, result) for
11777 # pylint: disable-msg=R0902
11778 # lots of instance attributes
11780 def __init__(self, cfg, rpc, mode, **kwargs):
11783 # init buffer variables
11784 self.in_text = self.out_text = self.in_data = self.out_data = None
11785 # init all input fields so that pylint is happy
11787 self.mem_size = self.disks = self.disk_template = None
11788 self.os = self.tags = self.nics = self.vcpus = None
11789 self.hypervisor = None
11790 self.relocate_from = None
11792 self.evac_nodes = None
11793 self.instances = None
11794 self.reloc_mode = None
11795 self.target_groups = None
11797 self.required_nodes = None
11798 # init result fields
11799 self.success = self.info = self.result = None
11802 (fn, keyset, self._result_check) = self._MODE_DATA[self.mode]
11804 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
11805 " IAllocator" % self.mode)
11808 if key not in keyset:
11809 raise errors.ProgrammerError("Invalid input parameter '%s' to"
11810 " IAllocator" % key)
11811 setattr(self, key, kwargs[key])
11814 if key not in kwargs:
11815 raise errors.ProgrammerError("Missing input parameter '%s' to"
11816 " IAllocator" % key)
11817 self._BuildInputData(compat.partial(fn, self))
11819 def _ComputeClusterData(self):
11820 """Compute the generic allocator input data.
11822 This is the data that is independent of the actual operation.
11826 cluster_info = cfg.GetClusterInfo()
11829 "version": constants.IALLOCATOR_VERSION,
11830 "cluster_name": cfg.GetClusterName(),
11831 "cluster_tags": list(cluster_info.GetTags()),
11832 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
11833 # we don't have job IDs
11835 ninfo = cfg.GetAllNodesInfo()
11836 iinfo = cfg.GetAllInstancesInfo().values()
11837 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
11840 node_list = [n.name for n in ninfo.values() if n.vm_capable]
11842 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
11843 hypervisor_name = self.hypervisor
11844 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
11845 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
11846 elif self.mode in (constants.IALLOCATOR_MODE_MEVAC,
11847 constants.IALLOCATOR_MODE_MRELOC):
11848 hypervisor_name = cluster_info.enabled_hypervisors[0]
11850 node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
11853 self.rpc.call_all_instances_info(node_list,
11854 cluster_info.enabled_hypervisors)
11856 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
11858 config_ndata = self._ComputeBasicNodeData(ninfo)
11859 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
11860 i_list, config_ndata)
11861 assert len(data["nodes"]) == len(ninfo), \
11862 "Incomplete node data computed"
11864 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
11866 self.in_data = data
11869 def _ComputeNodeGroupData(cfg):
11870 """Compute node groups data.
11873 ng = dict((guuid, {
11874 "name": gdata.name,
11875 "alloc_policy": gdata.alloc_policy,
11877 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items())
11882 def _ComputeBasicNodeData(node_cfg):
11883 """Compute global node data.
11886 @returns: a dict mapping node name to a dict of config-derived node attributes
11889 # fill in static (config-based) values
11890 node_results = dict((ninfo.name, {
11891 "tags": list(ninfo.GetTags()),
11892 "primary_ip": ninfo.primary_ip,
11893 "secondary_ip": ninfo.secondary_ip,
11894 "offline": ninfo.offline,
11895 "drained": ninfo.drained,
11896 "master_candidate": ninfo.master_candidate,
11897 "group": ninfo.group,
11898 "master_capable": ninfo.master_capable,
11899 "vm_capable": ninfo.vm_capable,
11901 for ninfo in node_cfg.values())
11903 return node_results
11906 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
11908 """Compute global node data.
11910 @param node_results: the basic node structures as filled from the config
11913 # make a copy of the current dict
11914 node_results = dict(node_results)
11915 for nname, nresult in node_data.items():
11916 assert nname in node_results, "Missing basic data for node %s" % nname
11917 ninfo = node_cfg[nname]
11919 if not (ninfo.offline or ninfo.drained):
11920 nresult.Raise("Can't get data for node %s" % nname)
11921 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
11923 remote_info = nresult.payload
11925 for attr in ['memory_total', 'memory_free', 'memory_dom0',
11926 'vg_size', 'vg_free', 'cpu_total']:
11927 if attr not in remote_info:
11928 raise errors.OpExecError("Node '%s' didn't return attribute"
11929 " '%s'" % (nname, attr))
11930 if not isinstance(remote_info[attr], int):
11931 raise errors.OpExecError("Node '%s' returned invalid value"
11933 (nname, attr, remote_info[attr]))
11934 # compute memory used by primary instances
11935 i_p_mem = i_p_up_mem = 0
11936 for iinfo, beinfo in i_list:
11937 if iinfo.primary_node == nname:
11938 i_p_mem += beinfo[constants.BE_MEMORY]
11939 if iinfo.name not in node_iinfo[nname].payload:
11942 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]['memory'])
11943 i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
11944 remote_info['memory_free'] -= max(0, i_mem_diff)
11947 i_p_up_mem += beinfo[constants.BE_MEMORY]
11949 # compute memory used by instances
11951 "total_memory": remote_info['memory_total'],
11952 "reserved_memory": remote_info['memory_dom0'],
11953 "free_memory": remote_info['memory_free'],
11954 "total_disk": remote_info['vg_size'],
11955 "free_disk": remote_info['vg_free'],
11956 "total_cpus": remote_info['cpu_total'],
11957 "i_pri_memory": i_p_mem,
11958 "i_pri_up_memory": i_p_up_mem,
11960 pnr_dyn.update(node_results[nname])
11961 node_results[nname] = pnr_dyn
11963 return node_results
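# Worked example (hypothetical numbers) for the memory correction above: if an
# instance is configured with BE_MEMORY = 1024 MB but the hypervisor reports
# only 896 MB actually in use, then i_mem_diff = 1024 - 896 = 128 and
# memory_free is lowered by max(0, 128), so the allocator sees the node as if
# the instance were using its full configured memory.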
11966 def _ComputeInstanceData(cluster_info, i_list):
11967 """Compute global instance data.
11971 for iinfo, beinfo in i_list:
11973 for nic in iinfo.nics:
11974 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
11978 "mode": filled_params[constants.NIC_MODE],
11979 "link": filled_params[constants.NIC_LINK],
11981 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
11982 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
11983 nic_data.append(nic_dict)
11985 "tags": list(iinfo.GetTags()),
11986 "admin_up": iinfo.admin_up,
11987 "vcpus": beinfo[constants.BE_VCPUS],
11988 "memory": beinfo[constants.BE_MEMORY],
11990 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
11992 "disks": [{constants.IDISK_SIZE: dsk.size,
11993 constants.IDISK_MODE: dsk.mode}
11994 for dsk in iinfo.disks],
11995 "disk_template": iinfo.disk_template,
11996 "hypervisor": iinfo.hypervisor,
11998 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
12000 instance_data[iinfo.name] = pir
12002 return instance_data
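# Sketch of one entry of the returned dict (values invented for illustration;
# the keys shown are the ones built in the code above):
#
#   instance_data["inst1.example.com"] = {
#     "tags": [...], "admin_up": True, "vcpus": 2, "memory": 1024,
#     "nodes": ["node1", "node2"],       # primary node first, then secondaries
#     "disks": [...], "disk_template": "drbd", "hypervisor": "xen-pvm",
#     "disk_space_total": ...,           # as computed by _ComputeDiskSize
#   }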
12004 def _AddNewInstance(self):
12005 """Add new instance data to allocator structure.
12007 This in combination with _ComputeClusterData will create the
12008 correct structure needed as input for the allocator.
12010 The checks for the completeness of the opcode must have already been
12014 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
12016 if self.disk_template in constants.DTS_INT_MIRROR:
12017 self.required_nodes = 2
12019 self.required_nodes = 1
12023 "disk_template": self.disk_template,
12026 "vcpus": self.vcpus,
12027 "memory": self.mem_size,
12028 "disks": self.disks,
12029 "disk_space_total": disk_space,
12031 "required_nodes": self.required_nodes,
12036 def _AddRelocateInstance(self):
12037 """Add relocate instance data to allocator structure.
12039 This in combination with _ComputeClusterData will create the
12040 correct structure needed as input for the allocator.
12042 The checks for the completeness of the opcode must have already been
12046 instance = self.cfg.GetInstanceInfo(self.name)
12047 if instance is None:
12048 raise errors.ProgrammerError("Unknown instance '%s' passed to"
12049 " IAllocator" % self.name)
12051 if instance.disk_template not in constants.DTS_MIRRORED:
12052 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
12053 errors.ECODE_INVAL)
12055 if instance.disk_template in constants.DTS_INT_MIRROR and \
12056 len(instance.secondary_nodes) != 1:
12057 raise errors.OpPrereqError("Instance does not have exactly one secondary node",
12058 errors.ECODE_STATE)
12060 self.required_nodes = 1
12061 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
12062 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
12066 "disk_space_total": disk_space,
12067 "required_nodes": self.required_nodes,
12068 "relocate_from": self.relocate_from,
12072 def _AddEvacuateNodes(self):
12073 """Add evacuate nodes data to allocator structure.
12077 "evac_nodes": self.evac_nodes
12081 def _AddMultiRelocate(self):
12082 """Get data for multi-relocate requests.
12086 "instances": self.instances,
12087 "reloc_mode": self.reloc_mode,
12088 "target_groups": self.target_groups,
12091 def _BuildInputData(self, fn):
12092 """Build input data structures.
12095 self._ComputeClusterData()
12098 request["type"] = self.mode
12099 self.in_data["request"] = request
12101 self.in_text = serializer.Dump(self.in_data)
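# Hedged sketch of the serialized input for an allocation request (field
# values invented; the authoritative format is the iallocator protocol): the
# mode-specific request built by e.g. _AddNewInstance ends up under the
# "request" key next to the cluster-wide data computed above, roughly
#
#   {"request": {"type": constants.IALLOCATOR_MODE_ALLOC,
#                "name": "inst1.example.com", "disk_template": "drbd",
#                "vcpus": 2, "memory": 1024, "disks": [...],
#                "disk_space_total": ..., "required_nodes": 2, ...},
#    "nodes": {...}, "instances": {...}, "nodegroups": {...}, ...}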
12104 constants.IALLOCATOR_MODE_ALLOC:
12106 ["name", "mem_size", "disks", "disk_template", "os", "tags", "nics",
12107 "vcpus", "hypervisor"], ht.TList),
12108 constants.IALLOCATOR_MODE_RELOC:
12109 (_AddRelocateInstance, ["name", "relocate_from"], ht.TList),
12110 constants.IALLOCATOR_MODE_MEVAC:
12111 (_AddEvacuateNodes, ["evac_nodes"],
12112 ht.TListOf(ht.TAnd(ht.TIsLength(2),
12113 ht.TListOf(ht.TString)))),
12114 constants.IALLOCATOR_MODE_MRELOC:
12115 (_AddMultiRelocate, ["instances", "reloc_mode", "target_groups"],
12116 ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
12117 # pylint: disable-msg=E1101
12118 # Class '...' has no 'OP_ID' member
12119 "OP_ID": ht.TElemOf([opcodes.OpInstanceFailover.OP_ID,
12120 opcodes.OpInstanceMigrate.OP_ID,
12121 opcodes.OpInstanceReplaceDisks.OP_ID])
12125 def Run(self, name, validate=True, call_fn=None):
12126 """Run an instance allocator and return the results.
12129 if call_fn is None:
12130 call_fn = self.rpc.call_iallocator_runner
12132 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
12133 result.Raise("Failure while running the iallocator script")
12135 self.out_text = result.payload
12137 self._ValidateResult()
12139 def _ValidateResult(self):
12140 """Process the allocator results.
12142 This will process the allocator results and, if successful, save them in
12143 self.out_data and the other result attributes.
12147 rdict = serializer.Load(self.out_text)
12148 except Exception, err:
12149 raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
12151 if not isinstance(rdict, dict):
12152 raise errors.OpExecError("Can't parse iallocator results: not a dict")
12154 # TODO: remove backwards compatibility in later versions
12155 if "nodes" in rdict and "result" not in rdict:
12156 rdict["result"] = rdict["nodes"]
12159 for key in "success", "info", "result":
12160 if key not in rdict:
12161 raise errors.OpExecError("Can't parse iallocator results:"
12162 " missing key '%s'" % key)
12163 setattr(self, key, rdict[key])
12165 if not self._result_check(self.result):
12166 raise errors.OpExecError("Iallocator returned invalid result,"
12167 " expected %s, got %s" %
12168 (self._result_check, self.result),
12169 errors.ECODE_INVAL)
12171 if self.mode in (constants.IALLOCATOR_MODE_RELOC,
12172 constants.IALLOCATOR_MODE_MEVAC):
12173 node2group = dict((name, ndata["group"])
12174 for (name, ndata) in self.in_data["nodes"].items())
12176 fn = compat.partial(self._NodesToGroups, node2group,
12177 self.in_data["nodegroups"])
12179 if self.mode == constants.IALLOCATOR_MODE_RELOC:
12180 assert self.relocate_from is not None
12181 assert self.required_nodes == 1
12183 request_groups = fn(self.relocate_from)
12184 result_groups = fn(rdict["result"])
12186 if result_groups != request_groups:
12187 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
12188 " differ from original groups (%s)" %
12189 (utils.CommaJoin(result_groups),
12190 utils.CommaJoin(request_groups)))
12191 elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
12192 request_groups = fn(self.evac_nodes)
12193 for (instance_name, secnode) in self.result:
12194 result_groups = fn([secnode])
12195 if result_groups != request_groups:
12196 raise errors.OpExecError("Iallocator returned new secondary node"
12197 " '%s' (group '%s') for instance '%s'"
12198 " which is not in original group '%s'" %
12199 (secnode, utils.CommaJoin(result_groups),
12201 utils.CommaJoin(request_groups)))
12203 raise errors.ProgrammerError("Unhandled mode '%s'" % self.mode)
12205 self.out_data = rdict
12208 def _NodesToGroups(node2group, groups, nodes):
12209 """Returns a list of unique group names for a list of nodes.
12211 @type node2group: dict
12212 @param node2group: Map from node name to group UUID
12214 @param groups: Group information
12216 @param nodes: Node names
12223 group_uuid = node2group[node]
12225 # Ignore unknown node
12229 group = groups[group_uuid]
12231 # Can't find group, let's use UUID
12232 group_name = group_uuid
12234 group_name = group["name"]
12236 result.add(group_name)
12238 return sorted(result)
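# Minimal sketch of _NodesToGroups (inputs invented): unknown nodes are
# skipped, and a group falls back to its UUID when it is not present in the
# groups mapping.
#
#   node2group = {"node1": "uuid-a", "node2": "uuid-b"}
#   groups = {"uuid-a": {"name": "default"}}
#   IAllocator._NodesToGroups(node2group, groups, ["node1", "node2", "ghost"])
#   # -> ["default", "uuid-b"]   (sorted and unique; "ghost" is ignored)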
12241 class LUTestAllocator(NoHooksLU):
12242 """Run allocator tests.
12244 This LU runs the allocator tests
12247 def CheckPrereq(self):
12248 """Check prerequisites.
12250 This checks the opcode parameters depending on the direction and mode of the test.
12253 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
12254 for attr in ["mem_size", "disks", "disk_template",
12255 "os", "tags", "nics", "vcpus"]:
12256 if not hasattr(self.op, attr):
12257 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
12258 attr, errors.ECODE_INVAL)
12259 iname = self.cfg.ExpandInstanceName(self.op.name)
12260 if iname is not None:
12261 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
12262 iname, errors.ECODE_EXISTS)
12263 if not isinstance(self.op.nics, list):
12264 raise errors.OpPrereqError("Invalid parameter 'nics'",
12265 errors.ECODE_INVAL)
12266 if not isinstance(self.op.disks, list):
12267 raise errors.OpPrereqError("Invalid parameter 'disks'",
12268 errors.ECODE_INVAL)
12269 for row in self.op.disks:
12270 if (not isinstance(row, dict) or
12271 "size" not in row or
12272 not isinstance(row["size"], int) or
12273 "mode" not in row or
12274 row["mode"] not in ['r', 'w']):
12275 raise errors.OpPrereqError("Invalid contents of the 'disks'"
12276 " parameter", errors.ECODE_INVAL)
12277 if self.op.hypervisor is None:
12278 self.op.hypervisor = self.cfg.GetHypervisorType()
12279 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
12280 fname = _ExpandInstanceName(self.cfg, self.op.name)
12281 self.op.name = fname
12282 self.relocate_from = self.cfg.GetInstanceInfo(fname).secondary_nodes
12283 elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
12284 if not hasattr(self.op, "evac_nodes"):
12285 raise errors.OpPrereqError("Missing attribute 'evac_nodes' on"
12286 " opcode input", errors.ECODE_INVAL)
12287 elif self.op.mode == constants.IALLOCATOR_MODE_MRELOC:
12288 if self.op.instances:
12289 self.op.instances = _GetWantedInstances(self, self.op.instances)
12291 raise errors.OpPrereqError("Missing instances to relocate",
12292 errors.ECODE_INVAL)
12294 raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
12295 self.op.mode, errors.ECODE_INVAL)
12297 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
12298 if self.op.allocator is None:
12299 raise errors.OpPrereqError("Missing allocator name",
12300 errors.ECODE_INVAL)
12301 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
12302 raise errors.OpPrereqError("Wrong allocator test '%s'" %
12303 self.op.direction, errors.ECODE_INVAL)
12305 def Exec(self, feedback_fn):
12306 """Run the allocator test.
12309 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
12310 ial = IAllocator(self.cfg, self.rpc,
12313 mem_size=self.op.mem_size,
12314 disks=self.op.disks,
12315 disk_template=self.op.disk_template,
12319 vcpus=self.op.vcpus,
12320 hypervisor=self.op.hypervisor,
12322 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
12323 ial = IAllocator(self.cfg, self.rpc,
12326 relocate_from=list(self.relocate_from),
12328 elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
12329 ial = IAllocator(self.cfg, self.rpc,
12331 evac_nodes=self.op.evac_nodes)
12332 elif self.op.mode == constants.IALLOCATOR_MODE_MRELOC:
12333 ial = IAllocator(self.cfg, self.rpc,
12335 instances=self.op.instances,
12336 reloc_mode=self.op.reloc_mode,
12337 target_groups=self.op.target_groups)
12339 raise errors.ProgrammerError("Unhandled mode %s in"
12340 " LUTestAllocator.Exec" % self.op.mode)
12342 if self.op.direction == constants.IALLOCATOR_DIR_IN:
12343 result = ial.in_text
12345 ial.Run(self.op.allocator, validate=False)
12346 result = ial.out_text
12350 #: Query type implementations
12352 constants.QR_INSTANCE: _InstanceQuery,
12353 constants.QR_NODE: _NodeQuery,
12354 constants.QR_GROUP: _GroupQuery,
12355 constants.QR_OS: _OsQuery,
12358 assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
12361 def _GetQueryImplementation(name):
12362 """Returns the implemtnation for a query type.
12364 @param name: Query type, must be one of L{constants.QR_VIA_OP}
12368 return _QUERY_IMPL[name]
12370 raise errors.OpPrereqError("Unknown query resource '%s'" % name,
12371 errors.ECODE_INVAL)
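# Usage sketch for _GetQueryImplementation (caller invented for illustration):
# query opcodes resolve the requested resource to one of the classes above.
#
#   impl = _GetQueryImplementation(constants.QR_GROUP)   # -> _GroupQuery
#   # An unknown resource name raises OpPrereqError with ECODE_INVAL.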