4 # Copyright (C) 2006, 2007, 2008, 2009, 2010 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable-msg=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
43 from ganeti import ssh
44 from ganeti import utils
45 from ganeti import errors
46 from ganeti import hypervisor
47 from ganeti import locking
48 from ganeti import constants
49 from ganeti import objects
50 from ganeti import serializer
51 from ganeti import ssconf
52 from ganeti import uidpool
53 from ganeti import compat
54 from ganeti import masterd
55 from ganeti import netutils
58 import ganeti.masterd.instance # pylint: disable-msg=W0611
60 # Common opcode attributes
62 #: output fields for a query operation
63 _POutputFields = ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString))
66 #: the shutdown timeout
67 _PShutdownTimeout = ("shutdown_timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT,
70 #: the force parameter
71 _PForce = ("force", False, ht.TBool)
73 #: a required instance name (for single-instance LUs)
74 _PInstanceName = ("instance_name", ht.NoDefault, ht.TNonEmptyString)
76 #: Whether to ignore offline nodes
77 _PIgnoreOfflineNodes = ("ignore_offline_nodes", False, ht.TBool)
79 #: a required node name (for single-node LUs)
80 _PNodeName = ("node_name", ht.NoDefault, ht.TNonEmptyString)
82 #: the migration type (live/non-live)
83 _PMigrationMode = ("mode", None,
84 ht.TOr(ht.TNone, ht.TElemOf(constants.HT_MIGRATION_MODES)))
86 #: the obsolete 'live' mode (boolean)
87 _PMigrationLive = ("live", None, ht.TMaybeBool)
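# Illustrative sketch (not part of the original module): an LU composes its
# _OP_PARAMS list from the shared tuples above plus its own
# (name, default, check) entries, e.g. for a hypothetical single-instance LU:
#
#   class LUExampleInstanceOp(LogicalUnit):
#     _OP_PARAMS = [
#       _PInstanceName,     # required: ht.NoDefault raises if missing
#       _PShutdownTimeout,  # optional, falls back to the constant default
#       _PForce,            # optional boolean, defaults to False
#       ("reason", None, ht.TOr(ht.TNone, ht.TNonEmptyString)),  # LU-specific
#     ]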
91 class LogicalUnit(object):
92 """Logical Unit base class.
94 Subclasses must follow these rules:
95 - implement ExpandNames
96 - implement CheckPrereq (except when tasklets are used)
97 - implement Exec (except when tasklets are used)
98 - implement BuildHooksEnv
99 - redefine HPATH and HTYPE
100 - optionally redefine their run requirements:
101 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
103 Note that all commands require root permissions.
105 @ivar dry_run_result: the value (if any) that will be returned to the caller
106 in dry-run mode (signalled by opcode dry_run parameter)
107 @cvar _OP_PARAMS: a list of opcode attributes, their default values
108 they should get if not already defined, and types they must match
116 def __init__(self, processor, op, context, rpc):
117 """Constructor for LogicalUnit.
119 This needs to be overridden in derived classes in order to check op
123 self.proc = processor
125 self.cfg = context.cfg
126 self.context = context
128 # Dicts used to declare locking needs to mcpu
129 self.needed_locks = None
130 self.acquired_locks = {}
131 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
133 self.remove_locks = {}
134 # Used to force good behavior when calling helper functions
135 self.recalculate_locks = {}
138 self.Log = processor.Log # pylint: disable-msg=C0103
139 self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
140 self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
141 self.LogStep = processor.LogStep # pylint: disable-msg=C0103
142 # support for dry-run
143 self.dry_run_result = None
144 # support for generic debug attribute
145 if (not hasattr(self.op, "debug_level") or
146 not isinstance(self.op.debug_level, int)):
147 self.op.debug_level = 0
152 # The new kind-of-type-system
153 op_id = self.op.OP_ID
154 for attr_name, aval, test in self._OP_PARAMS:
155 if not hasattr(op, attr_name):
156 if aval == ht.NoDefault:
157 raise errors.OpPrereqError("Required parameter '%s.%s' missing" %
158 (op_id, attr_name), errors.ECODE_INVAL)
164 setattr(self.op, attr_name, dval)
165 attr_val = getattr(op, attr_name)
166 if test == ht.NoType:
169 if not callable(test):
170 raise errors.ProgrammerError("Validation for parameter '%s.%s' failed,"
171 " given type is not a proper type (%s)" %
172 (op_id, attr_name, test))
173 if not test(attr_val):
174 logging.error("OpCode %s, parameter %s, has invalid type %s/value %s",
175 self.op.OP_ID, attr_name, type(attr_val), attr_val)
176 raise errors.OpPrereqError("Parameter '%s.%s' fails validation" %
177 (op_id, attr_name), errors.ECODE_INVAL)
179 self.CheckArguments()
182 """Returns the SshRunner object
186 self.__ssh = ssh.SshRunner(self.cfg.GetClusterName())
189 ssh = property(fget=__GetSSH)
191 def CheckArguments(self):
192 """Check syntactic validity for the opcode arguments.
194 This method is for doing a simple syntactic check, ensuring the
195 validity of opcode parameters without any cluster-related
196 checks. While the same can be accomplished in ExpandNames and/or
197 CheckPrereq, doing these separately is better because:
199 - ExpandNames is left as a purely lock-related function
200 - CheckPrereq is run after we have acquired locks (and possible
203 The function is allowed to change the self.op attribute so that
204 later methods need not worry about missing parameters.
209 def ExpandNames(self):
210 """Expand names for this LU.
212 This method is called before starting to execute the opcode, and it should
213 update all the parameters of the opcode to their canonical form (e.g. a
214 short node name must be fully expanded after this method has successfully
215 completed). This way locking, hooks, logging, etc. can work correctly.
217 LUs which implement this method must also populate the self.needed_locks
218 member, as a dict with lock levels as keys, and a list of needed lock names
221 - use an empty dict if you don't need any lock
222 - if you don't need any lock at a particular level omit that level
223 - don't put anything for the BGL level
224 - if you want all locks at a level use locking.ALL_SET as a value
226 If you need to share locks (rather than acquire them exclusively) at one
227 level you can modify self.share_locks, setting a true value (usually 1) for
228 that level. By default locks are not shared.
230 This function can also define a list of tasklets, which then will be
231 executed in order instead of the usual LU-level CheckPrereq and Exec
232 functions, if those are not defined by the LU.
236 # Acquire all nodes and one instance
237 self.needed_locks = {
238 locking.LEVEL_NODE: locking.ALL_SET,
239 locking.LEVEL_INSTANCE: ['instance1.example.com'],
241 # Acquire just two nodes
242 self.needed_locks = {
243 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
246 self.needed_locks = {} # No, you can't leave it to the default value None
249 # The implementation of this method is mandatory only if the new LU is
250 # concurrent, so that old LUs don't need to be changed all at the same
253 self.needed_locks = {} # Exclusive LUs don't need locks.
255 raise NotImplementedError
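# Illustrative sketch (hypothetical LU, not part of the original module): a
# minimal ExpandNames that locks one instance, defers the node list to
# DeclareLocks via recalculate_locks and shares the node locks:
#
#   def ExpandNames(self):
#     self._ExpandAndLockInstance()
#     self.needed_locks[locking.LEVEL_NODE] = []
#     self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
#     self.share_locks[locking.LEVEL_NODE] = 1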
257 def DeclareLocks(self, level):
258 """Declare LU locking needs for a level
260 While most LUs can just declare their locking needs at ExpandNames time,
261 sometimes there's the need to calculate some locks after having acquired
262 the ones before. This function is called just before acquiring locks at a
263 particular level, but after acquiring the ones at lower levels, and permits
264 such calculations. It can be used to modify self.needed_locks, and by
265 default it does nothing.
267 This function is only called if you have something already set in
268 self.needed_locks for the level.
270 @param level: Locking level which is going to be locked
271 @type level: member of ganeti.locking.LEVELS
275 def CheckPrereq(self):
276 """Check prerequisites for this LU.
278 This method should check that the prerequisites for the execution
279 of this LU are fulfilled. It can do internode communication, but
280 it should be idempotent - no cluster or system changes are
283 The method should raise errors.OpPrereqError in case something is
284 not fulfilled. Its return value is ignored.
286 This method should also update all the parameters of the opcode to
287 their canonical form if it hasn't been done by ExpandNames before.
290 if self.tasklets is not None:
291 for (idx, tl) in enumerate(self.tasklets):
292 logging.debug("Checking prerequisites for tasklet %s/%s",
293 idx + 1, len(self.tasklets))
298 def Exec(self, feedback_fn):
301 This method should implement the actual work. It should raise
302 errors.OpExecError for failures that are somewhat dealt with in
306 if self.tasklets is not None:
307 for (idx, tl) in enumerate(self.tasklets):
308 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
311 raise NotImplementedError
313 def BuildHooksEnv(self):
314 """Build hooks environment for this LU.
316 This method should return a three-element tuple consisting of: a dict
317 containing the environment that will be used for running the
318 specific hook for this LU, a list of node names on which the hook
319 should run before the execution, and a list of node names on which
320 the hook should run after the execution.
322 The keys of the dict must not be prefixed with 'GANETI_', as this will
323 be handled in the hooks runner. Also note additional keys will be
324 added by the hooks runner. If the LU doesn't define any
325 environment, an empty dict (and not None) should be returned.
327 If there are no nodes to return, use an empty list (and not None).
329 Note that if the HPATH for a LU class is None, this function will
333 raise NotImplementedError
335 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
336 """Notify the LU about the results of its hooks.
338 This method is called every time a hooks phase is executed, and notifies
339 the Logical Unit about the hooks' result. The LU can then use it to alter
340 its result based on the hooks. By default the method does nothing and the
341 previous result is passed back unchanged but any LU can define it if it
342 wants to use the local cluster hook-scripts somehow.
344 @param phase: one of L{constants.HOOKS_PHASE_POST} or
345 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
346 @param hook_results: the results of the multi-node hooks rpc call
347 @param feedback_fn: function used to send feedback back to the caller
348 @param lu_result: the previous Exec result this LU had, or None
350 @return: the new Exec result, based on the previous result
354 # API must be kept, thus we ignore the unused-argument and
355 # could-be-a-function warnings
356 # pylint: disable-msg=W0613,R0201
359 def _ExpandAndLockInstance(self):
360 """Helper function to expand and lock an instance.
362 Many LUs that work on an instance take its name in self.op.instance_name
363 and need to expand it and then declare the expanded name for locking. This
364 function does it, and then updates self.op.instance_name to the expanded
365 name. It also initializes needed_locks as a dict, if this hasn't been done
369 if self.needed_locks is None:
370 self.needed_locks = {}
372 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
373 "_ExpandAndLockInstance called with instance-level locks set"
374 self.op.instance_name = _ExpandInstanceName(self.cfg,
375 self.op.instance_name)
376 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
378 def _LockInstancesNodes(self, primary_only=False):
379 """Helper function to declare instances' nodes for locking.
381 This function should be called after locking one or more instances to lock
382 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
383 with all primary or secondary nodes for instances already locked and
384 present in self.needed_locks[locking.LEVEL_INSTANCE].
386 It should be called from DeclareLocks, and for safety only works if
387 self.recalculate_locks[locking.LEVEL_NODE] is set.
389 In the future it may grow parameters to just lock some instance's nodes, or
390 to just lock primaries or secondary nodes, if needed.
392 It should be called in DeclareLocks in a way similar to::
394 if level == locking.LEVEL_NODE:
395 self._LockInstancesNodes()
397 @type primary_only: boolean
398 @param primary_only: only lock primary nodes of locked instances
401 assert locking.LEVEL_NODE in self.recalculate_locks, \
402 "_LockInstancesNodes helper function called with no nodes to recalculate"
404 # TODO: check if we've really been called with the instance locks held
406 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
407 # future we might want to have different behaviors depending on the value
408 # of self.recalculate_locks[locking.LEVEL_NODE]
410 for instance_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
411 instance = self.context.cfg.GetInstanceInfo(instance_name)
412 wanted_nodes.append(instance.primary_node)
414 wanted_nodes.extend(instance.secondary_nodes)
416 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
417 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
418 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
419 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
421 del self.recalculate_locks[locking.LEVEL_NODE]
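# Illustrative sketch (not part of the original module): the DeclareLocks
# counterpart to the ExpandNames pattern shown earlier; once the instance
# locks are held, the node locks are recalculated from them:
#
#   def DeclareLocks(self, level):
#     if level == locking.LEVEL_NODE:
#       self._LockInstancesNodes(primary_only=False)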
424 class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
425 """Simple LU which runs no hooks.
427 This LU is intended as a parent for other LogicalUnits which will
428 run no hooks, in order to reduce duplicate code.
434 def BuildHooksEnv(self):
435 """Empty BuildHooksEnv for NoHooksLu.
437 This just raises an error.
440 assert False, "BuildHooksEnv called for NoHooksLUs"
444 """Tasklet base class.
446 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
447 they can mix legacy code with tasklets. Locking needs to be done in the LU,
448 tasklets know nothing about locks.
450 Subclasses must follow these rules:
451 - Implement CheckPrereq
455 def __init__(self, lu):
462 def CheckPrereq(self):
463 """Check prerequisites for this tasklets.
465 This method should check whether the prerequisites for the execution of
466 this tasklet are fulfilled. It can do internode communication, but it
467 should be idempotent - no cluster or system changes are allowed.
469 The method should raise errors.OpPrereqError in case something is not
470 fulfilled. Its return value is ignored.
472 This method should also update all parameters to their canonical form if it
473 hasn't been done before.
478 def Exec(self, feedback_fn):
479 """Execute the tasklet.
481 This method should implement the actual work. It should raise
482 errors.OpExecError for failures that are somewhat dealt with in code, or
486 raise NotImplementedError
489 def _GetWantedNodes(lu, nodes):
490 """Returns list of checked and expanded node names.
492 @type lu: L{LogicalUnit}
493 @param lu: the logical unit on whose behalf we execute
495 @param nodes: list of node names or None for all nodes
497 @return: the list of nodes, sorted
498 @raise errors.ProgrammerError: if the nodes parameter is wrong type
502 raise errors.ProgrammerError("_GetWantedNodes should only be called with a"
503 " non-empty list of nodes whose name is to be expanded.")
505 wanted = [_ExpandNodeName(lu.cfg, name) for name in nodes]
506 return utils.NiceSort(wanted)
509 def _GetWantedInstances(lu, instances):
510 """Returns list of checked and expanded instance names.
512 @type lu: L{LogicalUnit}
513 @param lu: the logical unit on whose behalf we execute
514 @type instances: list
515 @param instances: list of instance names or None for all instances
517 @return: the list of instances, sorted
518 @raise errors.OpPrereqError: if the instances parameter is wrong type
519 @raise errors.OpPrereqError: if any of the passed instances is not found
523 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
525 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
529 def _GetUpdatedParams(old_params, update_dict,
530 use_default=True, use_none=False):
531 """Return the new version of a parameter dictionary.
533 @type old_params: dict
534 @param old_params: old parameters
535 @type update_dict: dict
536 @param update_dict: dict containing new parameter values, or
537 constants.VALUE_DEFAULT to reset the parameter to its default
539 @type use_default: boolean
540 @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
541 values as 'to be deleted' values
542 @type use_none: boolean
543 @param use_none: whether to recognise C{None} values as 'to be
546 @return: the new parameter dictionary
549 params_copy = copy.deepcopy(old_params)
550 for key, val in update_dict.iteritems():
551 if ((use_default and val == constants.VALUE_DEFAULT) or
552 (use_none and val is None)):
558 params_copy[key] = val
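# Illustrative example (not part of the original module; keys are
# hypothetical) of the merge semantics described above, with the defaults
# use_default=True and use_none=False:
#
#   old = {"kernel_path": "/boot/vmlinuz", "root_path": "/dev/sda1"}
#   upd = {"root_path": constants.VALUE_DEFAULT, "serial_console": True}
#   _GetUpdatedParams(old, upd)
#   # -> {"kernel_path": "/boot/vmlinuz", "serial_console": True}
#   # "root_path" is dropped (reset to its default), the new key is added
#   # and untouched keys are kept; old_params itself is never modified.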
562 def _CheckOutputFields(static, dynamic, selected):
563 """Checks whether all selected fields are valid.
565 @type static: L{utils.FieldSet}
566 @param static: static fields set
567 @type dynamic: L{utils.FieldSet}
568 @param dynamic: dynamic fields set
575 delta = f.NonMatching(selected)
577 raise errors.OpPrereqError("Unknown output fields selected: %s"
578 % ",".join(delta), errors.ECODE_INVAL)
581 def _CheckGlobalHvParams(params):
582 """Validates that given hypervisor params are not global ones.
584 This will ensure that instances don't get customised versions of
588 used_globals = constants.HVC_GLOBALS.intersection(params)
590 msg = ("The following hypervisor parameters are global and cannot"
591 " be customized at instance level, please modify them at"
592 " cluster level: %s" % utils.CommaJoin(used_globals))
593 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
596 def _CheckNodeOnline(lu, node):
597 """Ensure that a given node is online.
599 @param lu: the LU on behalf of which we make the check
600 @param node: the node to check
601 @raise errors.OpPrereqError: if the node is offline
604 if lu.cfg.GetNodeInfo(node).offline:
605 raise errors.OpPrereqError("Can't use offline node %s" % node,
609 def _CheckNodeNotDrained(lu, node):
610 """Ensure that a given node is not drained.
612 @param lu: the LU on behalf of which we make the check
613 @param node: the node to check
614 @raise errors.OpPrereqError: if the node is drained
617 if lu.cfg.GetNodeInfo(node).drained:
618 raise errors.OpPrereqError("Can't use drained node %s" % node,
622 def _CheckNodeVmCapable(lu, node):
623 """Ensure that a given node is vm capable.
625 @param lu: the LU on behalf of which we make the check
626 @param node: the node to check
627 @raise errors.OpPrereqError: if the node is not vm capable
630 if not lu.cfg.GetNodeInfo(node).vm_capable:
631 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
635 def _CheckNodeHasOS(lu, node, os_name, force_variant):
636 """Ensure that a node supports a given OS.
638 @param lu: the LU on behalf of which we make the check
639 @param node: the node to check
640 @param os_name: the OS to query about
641 @param force_variant: whether to ignore variant errors
642 @raise errors.OpPrereqError: if the node does not support the OS
645 result = lu.rpc.call_os_get(node, os_name)
646 result.Raise("OS '%s' not in supported OS list for node %s" %
648 prereq=True, ecode=errors.ECODE_INVAL)
649 if not force_variant:
650 _CheckOSVariant(result.payload, os_name)
653 def _RequireFileStorage():
654 """Checks that file storage is enabled.
656 @raise errors.OpPrereqError: when file storage is disabled
659 if not constants.ENABLE_FILE_STORAGE:
660 raise errors.OpPrereqError("File storage disabled at configure time",
664 def _CheckDiskTemplate(template):
665 """Ensure a given disk template is valid.
668 if template not in constants.DISK_TEMPLATES:
669 msg = ("Invalid disk template name '%s', valid templates are: %s" %
670 (template, utils.CommaJoin(constants.DISK_TEMPLATES)))
671 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
672 if template == constants.DT_FILE:
673 _RequireFileStorage()
677 def _CheckStorageType(storage_type):
678 """Ensure a given storage type is valid.
681 if storage_type not in constants.VALID_STORAGE_TYPES:
682 raise errors.OpPrereqError("Unknown storage type: %s" % storage_type,
684 if storage_type == constants.ST_FILE:
685 _RequireFileStorage()
689 def _GetClusterDomainSecret():
690 """Reads the cluster domain secret.
693 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
697 def _CheckInstanceDown(lu, instance, reason):
698 """Ensure that an instance is not running."""
699 if instance.admin_up:
700 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
701 (instance.name, reason), errors.ECODE_STATE)
703 pnode = instance.primary_node
704 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
705 ins_l.Raise("Can't contact node %s for instance information" % pnode,
706 prereq=True, ecode=errors.ECODE_ENVIRON)
708 if instance.name in ins_l.payload:
709 raise errors.OpPrereqError("Instance %s is running, %s" %
710 (instance.name, reason), errors.ECODE_STATE)
713 def _ExpandItemName(fn, name, kind):
714 """Expand an item name.
716 @param fn: the function to use for expansion
717 @param name: requested item name
718 @param kind: text description ('Node' or 'Instance')
719 @return: the resolved (full) name
720 @raise errors.OpPrereqError: if the item is not found
724 if full_name is None:
725 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
730 def _ExpandNodeName(cfg, name):
731 """Wrapper over L{_ExpandItemName} for nodes."""
732 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
735 def _ExpandInstanceName(cfg, name):
736 """Wrapper over L{_ExpandItemName} for instance."""
737 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
740 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
741 memory, vcpus, nics, disk_template, disks,
742 bep, hvp, hypervisor_name):
743 """Builds instance related env variables for hooks
745 This builds the hook environment from individual variables.
748 @param name: the name of the instance
749 @type primary_node: string
750 @param primary_node: the name of the instance's primary node
751 @type secondary_nodes: list
752 @param secondary_nodes: list of secondary nodes as strings
753 @type os_type: string
754 @param os_type: the name of the instance's OS
755 @type status: boolean
756 @param status: the should_run status of the instance
758 @param memory: the memory size of the instance
760 @param vcpus: the count of VCPUs the instance has
762 @param nics: list of tuples (ip, mac, mode, link) representing
763 the NICs the instance has
764 @type disk_template: string
765 @param disk_template: the disk template of the instance
767 @param disks: the list of (size, mode) pairs
769 @param bep: the backend parameters for the instance
771 @param hvp: the hypervisor parameters for the instance
772 @type hypervisor_name: string
773 @param hypervisor_name: the hypervisor for the instance
775 @return: the hook environment for this instance
784 "INSTANCE_NAME": name,
785 "INSTANCE_PRIMARY": primary_node,
786 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
787 "INSTANCE_OS_TYPE": os_type,
788 "INSTANCE_STATUS": str_status,
789 "INSTANCE_MEMORY": memory,
790 "INSTANCE_VCPUS": vcpus,
791 "INSTANCE_DISK_TEMPLATE": disk_template,
792 "INSTANCE_HYPERVISOR": hypervisor_name,
796 nic_count = len(nics)
797 for idx, (ip, mac, mode, link) in enumerate(nics):
800 env["INSTANCE_NIC%d_IP" % idx] = ip
801 env["INSTANCE_NIC%d_MAC" % idx] = mac
802 env["INSTANCE_NIC%d_MODE" % idx] = mode
803 env["INSTANCE_NIC%d_LINK" % idx] = link
804 if mode == constants.NIC_MODE_BRIDGED:
805 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
809 env["INSTANCE_NIC_COUNT"] = nic_count
812 disk_count = len(disks)
813 for idx, (size, mode) in enumerate(disks):
814 env["INSTANCE_DISK%d_SIZE" % idx] = size
815 env["INSTANCE_DISK%d_MODE" % idx] = mode
819 env["INSTANCE_DISK_COUNT"] = disk_count
821 for source, kind in [(bep, "BE"), (hvp, "HV")]:
822 for key, value in source.items():
823 env["INSTANCE_%s_%s" % (kind, key)] = value
828 def _NICListToTuple(lu, nics):
829 """Build a list of nic information tuples.
831 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
832 value in LUQueryInstanceData.
834 @type lu: L{LogicalUnit}
835 @param lu: the logical unit on whose behalf we execute
836 @type nics: list of L{objects.NIC}
837 @param nics: list of nics to convert to hooks tuples
841 cluster = lu.cfg.GetClusterInfo()
845 filled_params = cluster.SimpleFillNIC(nic.nicparams)
846 mode = filled_params[constants.NIC_MODE]
847 link = filled_params[constants.NIC_LINK]
848 hooks_nics.append((ip, mac, mode, link))
852 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
853 """Builds instance related env variables for hooks from an object.
855 @type lu: L{LogicalUnit}
856 @param lu: the logical unit on whose behalf we execute
857 @type instance: L{objects.Instance}
858 @param instance: the instance for which we should build the
861 @param override: dictionary with key/values that will override
864 @return: the hook environment dictionary
867 cluster = lu.cfg.GetClusterInfo()
868 bep = cluster.FillBE(instance)
869 hvp = cluster.FillHV(instance)
871 'name': instance.name,
872 'primary_node': instance.primary_node,
873 'secondary_nodes': instance.secondary_nodes,
874 'os_type': instance.os,
875 'status': instance.admin_up,
876 'memory': bep[constants.BE_MEMORY],
877 'vcpus': bep[constants.BE_VCPUS],
878 'nics': _NICListToTuple(lu, instance.nics),
879 'disk_template': instance.disk_template,
880 'disks': [(disk.size, disk.mode) for disk in instance.disks],
883 'hypervisor_name': instance.hypervisor,
886 args.update(override)
887 return _BuildInstanceHookEnv(**args) # pylint: disable-msg=W0142
890 def _AdjustCandidatePool(lu, exceptions):
891 """Adjust the candidate pool after node operations.
894 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
896 lu.LogInfo("Promoted nodes to master candidate role: %s",
897 utils.CommaJoin(node.name for node in mod_list))
898 for name in mod_list:
899 lu.context.ReaddNode(name)
900 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
902 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
906 def _DecideSelfPromotion(lu, exceptions=None):
907 """Decide whether I should promote myself as a master candidate.
910 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
911 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
912 # the new node will increase mc_max with one, so:
913 mc_should = min(mc_should + 1, cp_size)
914 return mc_now < mc_should
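# Worked example (illustrative, not part of the original module): with
# candidate_pool_size = 10, and GetMasterCandidateStats reporting 3 current
# and 3 desired candidates, the node being added gives
# mc_should = min(3 + 1, 10) = 4, and since mc_now (3) < mc_should (4) the
# new node promotes itself to master candidate.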
917 def _CheckNicsBridgesExist(lu, target_nics, target_node):
918 """Check that the brigdes needed by a list of nics exist.
921 cluster = lu.cfg.GetClusterInfo()
922 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
923 brlist = [params[constants.NIC_LINK] for params in paramslist
924 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
926 result = lu.rpc.call_bridges_exist(target_node, brlist)
927 result.Raise("Error checking bridges on destination node '%s'" %
928 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
931 def _CheckInstanceBridgesExist(lu, instance, node=None):
932 """Check that the brigdes needed by an instance exist.
936 node = instance.primary_node
937 _CheckNicsBridgesExist(lu, instance.nics, node)
940 def _CheckOSVariant(os_obj, name):
941 """Check whether an OS name conforms to the os variants specification.
943 @type os_obj: L{objects.OS}
944 @param os_obj: OS object to check
946 @param name: OS name passed by the user, to check for validity
949 if not os_obj.supported_variants:
951 variant = objects.OS.GetVariant(name)
953 raise errors.OpPrereqError("OS name must include a variant",
956 if variant not in os_obj.supported_variants:
957 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
960 def _GetNodeInstancesInner(cfg, fn):
961 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
964 def _GetNodeInstances(cfg, node_name):
965 """Returns a list of all primary and secondary instances on a node.
969 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
972 def _GetNodePrimaryInstances(cfg, node_name):
973 """Returns primary instances on a node.
976 return _GetNodeInstancesInner(cfg,
977 lambda inst: node_name == inst.primary_node)
980 def _GetNodeSecondaryInstances(cfg, node_name):
981 """Returns secondary instances on a node.
984 return _GetNodeInstancesInner(cfg,
985 lambda inst: node_name in inst.secondary_nodes)
988 def _GetStorageTypeArgs(cfg, storage_type):
989 """Returns the arguments for a storage type.
992 # Special case for file storage
993 if storage_type == constants.ST_FILE:
994 # storage.FileStorage wants a list of storage directories
995 return [[cfg.GetFileStorageDir()]]
1000 def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
1003 for dev in instance.disks:
1004 cfg.SetDiskID(dev, node_name)
1006 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1007 result.Raise("Failed to get disk status from node %s" % node_name,
1008 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1010 for idx, bdev_status in enumerate(result.payload):
1011 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1017 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1018 """Check the sanity of iallocator and node arguments and use the
1019 cluster-wide iallocator if appropriate.
1021 Check that at most one of (iallocator, node) is specified. If none is
1022 specified, then the LU's opcode's iallocator slot is filled with the
1023 cluster-wide default iallocator.
1025 @type iallocator_slot: string
1026 @param iallocator_slot: the name of the opcode iallocator slot
1027 @type node_slot: string
1028 @param node_slot: the name of the opcode target node slot
1031 node = getattr(lu.op, node_slot, None)
1032 iallocator = getattr(lu.op, iallocator_slot, None)
1034 if node is not None and iallocator is not None:
1035 raise errors.OpPrereqError("Do not specify both an iallocator and a node.",
1037 elif node is None and iallocator is None:
1038 default_iallocator = lu.cfg.GetDefaultIAllocator()
1039 if default_iallocator:
1040 setattr(lu.op, iallocator_slot, default_iallocator)
1042 raise errors.OpPrereqError("No iallocator or node given and no"
1043 " cluster-wide default iallocator found."
1044 " Please specify either an iallocator or a"
1045 " node, or set a cluster-wide default"
1049 class LUPostInitCluster(LogicalUnit):
1050 """Logical unit for running hooks after cluster initialization.
1053 HPATH = "cluster-init"
1054 HTYPE = constants.HTYPE_CLUSTER
1056 def BuildHooksEnv(self):
1060 env = {"OP_TARGET": self.cfg.GetClusterName()}
1061 mn = self.cfg.GetMasterNode()
1062 return env, [], [mn]
1064 def Exec(self, feedback_fn):
1071 class LUDestroyCluster(LogicalUnit):
1072 """Logical unit for destroying the cluster.
1075 HPATH = "cluster-destroy"
1076 HTYPE = constants.HTYPE_CLUSTER
1078 def BuildHooksEnv(self):
1082 env = {"OP_TARGET": self.cfg.GetClusterName()}
1085 def CheckPrereq(self):
1086 """Check prerequisites.
1088 This checks whether the cluster is empty.
1090 Any errors are signaled by raising errors.OpPrereqError.
1093 master = self.cfg.GetMasterNode()
1095 nodelist = self.cfg.GetNodeList()
1096 if len(nodelist) != 1 or nodelist[0] != master:
1097 raise errors.OpPrereqError("There are still %d node(s) in"
1098 " this cluster." % (len(nodelist) - 1),
1100 instancelist = self.cfg.GetInstanceList()
1102 raise errors.OpPrereqError("There are still %d instance(s) in"
1103 " this cluster." % len(instancelist),
1106 def Exec(self, feedback_fn):
1107 """Destroys the cluster.
1110 master = self.cfg.GetMasterNode()
1112 # Run post hooks on master node before it's removed
1113 hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
1115 hm.RunPhase(constants.HOOKS_PHASE_POST, [master])
1117 # pylint: disable-msg=W0702
1118 self.LogWarning("Errors occurred running hooks on %s" % master)
1120 result = self.rpc.call_node_stop_master(master, False)
1121 result.Raise("Could not disable the master role")
1126 def _VerifyCertificate(filename):
1127 """Verifies a certificate for LUVerifyCluster.
1129 @type filename: string
1130 @param filename: Path to PEM file
1134 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1135 utils.ReadFile(filename))
1136 except Exception, err: # pylint: disable-msg=W0703
1137 return (LUVerifyCluster.ETYPE_ERROR,
1138 "Failed to load X509 certificate %s: %s" % (filename, err))
1141 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1142 constants.SSL_CERT_EXPIRATION_ERROR)
1145 fnamemsg = "While verifying %s: %s" % (filename, msg)
1150 return (None, fnamemsg)
1151 elif errcode == utils.CERT_WARNING:
1152 return (LUVerifyCluster.ETYPE_WARNING, fnamemsg)
1153 elif errcode == utils.CERT_ERROR:
1154 return (LUVerifyCluster.ETYPE_ERROR, fnamemsg)
1156 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1159 class LUVerifyCluster(LogicalUnit):
1160 """Verifies the cluster status.
1163 HPATH = "cluster-verify"
1164 HTYPE = constants.HTYPE_CLUSTER
1166 ("skip_checks", ht.EmptyList,
1167 ht.TListOf(ht.TElemOf(constants.VERIFY_OPTIONAL_CHECKS))),
1168 ("verbose", False, ht.TBool),
1169 ("error_codes", False, ht.TBool),
1170 ("debug_simulate_errors", False, ht.TBool),
1174 TCLUSTER = "cluster"
1176 TINSTANCE = "instance"
1178 ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
1179 ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
1180 EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
1181 EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
1182 EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
1183 EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
1184 EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
1185 EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
1186 ENODEDRBD = (TNODE, "ENODEDRBD")
1187 ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
1188 ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
1189 ENODEHOOKS = (TNODE, "ENODEHOOKS")
1190 ENODEHV = (TNODE, "ENODEHV")
1191 ENODELVM = (TNODE, "ENODELVM")
1192 ENODEN1 = (TNODE, "ENODEN1")
1193 ENODENET = (TNODE, "ENODENET")
1194 ENODEOS = (TNODE, "ENODEOS")
1195 ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
1196 ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
1197 ENODERPC = (TNODE, "ENODERPC")
1198 ENODESSH = (TNODE, "ENODESSH")
1199 ENODEVERSION = (TNODE, "ENODEVERSION")
1200 ENODESETUP = (TNODE, "ENODESETUP")
1201 ENODETIME = (TNODE, "ENODETIME")
1203 ETYPE_FIELD = "code"
1204 ETYPE_ERROR = "ERROR"
1205 ETYPE_WARNING = "WARNING"
1207 class NodeImage(object):
1208 """A class representing the logical and physical status of a node.
1211 @ivar name: the node name to which this object refers
1212 @ivar volumes: a structure as returned from
1213 L{ganeti.backend.GetVolumeList} (runtime)
1214 @ivar instances: a list of running instances (runtime)
1215 @ivar pinst: list of configured primary instances (config)
1216 @ivar sinst: list of configured secondary instances (config)
1217 @ivar sbp: dictionary of {secondary-node: list of instances} of all peers
1218 of this node (config)
1219 @ivar mfree: free memory, as reported by hypervisor (runtime)
1220 @ivar dfree: free disk, as reported by the node (runtime)
1221 @ivar offline: the offline status (config)
1222 @type rpc_fail: boolean
1223 @ivar rpc_fail: whether the RPC verify call failed (overall,
1224 not whether the individual keys were correct) (runtime)
1225 @type lvm_fail: boolean
1226 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1227 @type hyp_fail: boolean
1228 @ivar hyp_fail: whether the RPC call didn't return the instance list
1229 @type ghost: boolean
1230 @ivar ghost: whether this is a known node or not (config)
1231 @type os_fail: boolean
1232 @ivar os_fail: whether the RPC call didn't return valid OS data
1234 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1235 @type vm_capable: boolean
1236 @ivar vm_capable: whether the node can host instances
1239 def __init__(self, offline=False, name=None, vm_capable=True):
1248 self.offline = offline
1249 self.vm_capable = vm_capable
1250 self.rpc_fail = False
1251 self.lvm_fail = False
1252 self.hyp_fail = False
1254 self.os_fail = False
1257 def ExpandNames(self):
1258 self.needed_locks = {
1259 locking.LEVEL_NODE: locking.ALL_SET,
1260 locking.LEVEL_INSTANCE: locking.ALL_SET,
1262 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
1264 def _Error(self, ecode, item, msg, *args, **kwargs):
1265 """Format an error message.
1267 Based on the opcode's error_codes parameter, either format a
1268 parseable error code, or a simpler error string.
1270 This must be called only from Exec and functions called from Exec.
1273 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1275 # first complete the msg
1278 # then format the whole message
1279 if self.op.error_codes:
1280 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1286 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1287 # and finally report it via the feedback_fn
1288 self._feedback_fn(" - %s" % msg)
1290 def _ErrorIf(self, cond, *args, **kwargs):
1291 """Log an error message if the passed condition is True.
1294 cond = bool(cond) or self.op.debug_simulate_errors
1296 self._Error(*args, **kwargs)
1297 # do not mark the operation as failed for WARN cases only
1298 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1299 self.bad = self.bad or cond
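# Illustrative example (not part of the original module): with
# self.op.error_codes set, a problem is reported in the machine-parseable
# "type:code:item-type:item:message" form, e.g.
#
#   ERROR:ENODELVM:node:node1.example.com:unable to check volume groups
#
# while without error_codes the simpler human-readable form
# "ERROR: node node1.example.com: unable to check volume groups" is used.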
1301 def _VerifyNode(self, ninfo, nresult):
1302 """Perform some basic validation on data returned from a node.
1304 - check the result data structure is well formed and has all the
1306 - check ganeti version
1308 @type ninfo: L{objects.Node}
1309 @param ninfo: the node to check
1310 @param nresult: the results from the node
1312 @return: whether overall this call was successful (and we can expect
1313 reasonable values in the response)
1317 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1319 # main result, nresult should be a non-empty dict
1320 test = not nresult or not isinstance(nresult, dict)
1321 _ErrorIf(test, self.ENODERPC, node,
1322 "unable to verify node: no data returned")
1326 # compares ganeti version
1327 local_version = constants.PROTOCOL_VERSION
1328 remote_version = nresult.get("version", None)
1329 test = not (remote_version and
1330 isinstance(remote_version, (list, tuple)) and
1331 len(remote_version) == 2)
1332 _ErrorIf(test, self.ENODERPC, node,
1333 "connection to node returned invalid data")
1337 test = local_version != remote_version[0]
1338 _ErrorIf(test, self.ENODEVERSION, node,
1339 "incompatible protocol versions: master %s,"
1340 " node %s", local_version, remote_version[0])
1344 # node seems compatible, we can actually try to look into its results
1346 # full package version
1347 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1348 self.ENODEVERSION, node,
1349 "software version mismatch: master %s, node %s",
1350 constants.RELEASE_VERSION, remote_version[1],
1351 code=self.ETYPE_WARNING)
1353 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1354 if ninfo.vm_capable and isinstance(hyp_result, dict):
1355 for hv_name, hv_result in hyp_result.iteritems():
1356 test = hv_result is not None
1357 _ErrorIf(test, self.ENODEHV, node,
1358 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1360 test = nresult.get(constants.NV_NODESETUP,
1361 ["Missing NODESETUP results"])
1362 _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1367 def _VerifyNodeTime(self, ninfo, nresult,
1368 nvinfo_starttime, nvinfo_endtime):
1369 """Check the node time.
1371 @type ninfo: L{objects.Node}
1372 @param ninfo: the node to check
1373 @param nresult: the remote results for the node
1374 @param nvinfo_starttime: the start time of the RPC call
1375 @param nvinfo_endtime: the end time of the RPC call
1379 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1381 ntime = nresult.get(constants.NV_TIME, None)
1383 ntime_merged = utils.MergeTime(ntime)
1384 except (ValueError, TypeError):
1385 _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1388 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1389 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1390 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1391 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1395 _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1396 "Node time diverges by at least %s from master node time",
1399 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1400 """Check the node time.
1402 @type ninfo: L{objects.Node}
1403 @param ninfo: the node to check
1404 @param nresult: the remote results for the node
1405 @param vg_name: the configured VG name
1412 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1414 # checks vg existence and size > 20G
1415 vglist = nresult.get(constants.NV_VGLIST, None)
1417 _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1419 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1420 constants.MIN_VG_SIZE)
1421 _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1424 pvlist = nresult.get(constants.NV_PVLIST, None)
1425 test = pvlist is None
1426 _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1428 # check that ':' is not present in PV names, since it's a
1429 # special character for lvcreate (denotes the range of PEs to
1431 for _, pvname, owner_vg in pvlist:
1432 test = ":" in pvname
1433 _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1434 " '%s' of VG '%s'", pvname, owner_vg)
1436 def _VerifyNodeNetwork(self, ninfo, nresult):
1437 """Check the node time.
1439 @type ninfo: L{objects.Node}
1440 @param ninfo: the node to check
1441 @param nresult: the remote results for the node
1445 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1447 test = constants.NV_NODELIST not in nresult
1448 _ErrorIf(test, self.ENODESSH, node,
1449 "node hasn't returned node ssh connectivity data")
1451 if nresult[constants.NV_NODELIST]:
1452 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1453 _ErrorIf(True, self.ENODESSH, node,
1454 "ssh communication with node '%s': %s", a_node, a_msg)
1456 test = constants.NV_NODENETTEST not in nresult
1457 _ErrorIf(test, self.ENODENET, node,
1458 "node hasn't returned node tcp connectivity data")
1460 if nresult[constants.NV_NODENETTEST]:
1461 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1463 _ErrorIf(True, self.ENODENET, node,
1464 "tcp communication with node '%s': %s",
1465 anode, nresult[constants.NV_NODENETTEST][anode])
1467 test = constants.NV_MASTERIP not in nresult
1468 _ErrorIf(test, self.ENODENET, node,
1469 "node hasn't returned node master IP reachability data")
1471 if not nresult[constants.NV_MASTERIP]:
1472 if node == self.master_node:
1473 msg = "the master node cannot reach the master IP (not configured?)"
1475 msg = "cannot reach the master IP"
1476 _ErrorIf(True, self.ENODENET, node, msg)
1478 def _VerifyInstance(self, instance, instanceconfig, node_image,
1480 """Verify an instance.
1482 This function checks to see if the required block devices are
1483 available on the instance's node.
1486 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1487 node_current = instanceconfig.primary_node
1489 node_vol_should = {}
1490 instanceconfig.MapLVsByNode(node_vol_should)
1492 for node in node_vol_should:
1493 n_img = node_image[node]
1494 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1495 # ignore missing volumes on offline or broken nodes
1497 for volume in node_vol_should[node]:
1498 test = volume not in n_img.volumes
1499 _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
1500 "volume %s missing on node %s", volume, node)
1502 if instanceconfig.admin_up:
1503 pri_img = node_image[node_current]
1504 test = instance not in pri_img.instances and not pri_img.offline
1505 _ErrorIf(test, self.EINSTANCEDOWN, instance,
1506 "instance not running on its primary node %s",
1509 for node, n_img in node_image.items():
1510 if node != node_current:
1511 test = instance in n_img.instances
1512 _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
1513 "instance should not run on node %s", node)
1515 diskdata = [(nname, disk, idx)
1516 for (nname, disks) in diskstatus.items()
1517 for idx, disk in enumerate(disks)]
1519 for nname, bdev_status, idx in diskdata:
1520 _ErrorIf(not bdev_status,
1521 self.EINSTANCEFAULTYDISK, instance,
1522 "couldn't retrieve status for disk/%s on %s", idx, nname)
1523 _ErrorIf(bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY,
1524 self.EINSTANCEFAULTYDISK, instance,
1525 "disk/%s on %s is faulty", idx, nname)
1527 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
1528 """Verify if there are any unknown volumes in the cluster.
1530 The .os, .swap and backup volumes are ignored. All other volumes are
1531 reported as unknown.
1533 @type reserved: L{ganeti.utils.FieldSet}
1534 @param reserved: a FieldSet of reserved volume names
1537 for node, n_img in node_image.items():
1538 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1539 # skip non-healthy nodes
1541 for volume in n_img.volumes:
1542 test = ((node not in node_vol_should or
1543 volume not in node_vol_should[node]) and
1544 not reserved.Matches(volume))
1545 self._ErrorIf(test, self.ENODEORPHANLV, node,
1546 "volume %s is unknown", volume)
1548 def _VerifyOrphanInstances(self, instancelist, node_image):
1549 """Verify the list of running instances.
1551 This checks what instances are running but unknown to the cluster.
1554 for node, n_img in node_image.items():
1555 for o_inst in n_img.instances:
1556 test = o_inst not in instancelist
1557 self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
1558 "instance %s on node %s should not exist", o_inst, node)
1560 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
1561 """Verify N+1 Memory Resilience.
1563 Check that if one single node dies we can still start all the
1564 instances it was primary for.
1567 for node, n_img in node_image.items():
1568 # This code checks that every node which is now listed as
1569 # secondary has enough memory to host all instances it is
1570 # supposed to host, should a single other node in the cluster fail.
1571 # FIXME: not ready for failover to an arbitrary node
1572 # FIXME: does not support file-backed instances
1573 # WARNING: we currently take into account down instances as well
1574 # as up ones, considering that even if they're down someone
1575 # might want to start them even in the event of a node failure.
1576 for prinode, instances in n_img.sbp.items():
1578 for instance in instances:
1579 bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
1580 if bep[constants.BE_AUTO_BALANCE]:
1581 needed_mem += bep[constants.BE_MEMORY]
1582 test = n_img.mfree < needed_mem
1583 self._ErrorIf(test, self.ENODEN1, node,
1584 "not enough memory on to accommodate"
1585 " failovers should peer node %s fail", prinode)
1587 def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
1589 """Verifies and computes the node required file checksums.
1591 @type ninfo: L{objects.Node}
1592 @param ninfo: the node to check
1593 @param nresult: the remote results for the node
1594 @param file_list: required list of files
1595 @param local_cksum: dictionary of local files and their checksums
1596 @param master_files: list of files that only masters should have
1600 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1602 remote_cksum = nresult.get(constants.NV_FILELIST, None)
1603 test = not isinstance(remote_cksum, dict)
1604 _ErrorIf(test, self.ENODEFILECHECK, node,
1605 "node hasn't returned file checksum data")
1609 for file_name in file_list:
1610 node_is_mc = ninfo.master_candidate
1611 must_have = (file_name not in master_files) or node_is_mc
1613 test1 = file_name not in remote_cksum
1615 test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
1617 test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
1618 _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
1619 "file '%s' missing", file_name)
1620 _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
1621 "file '%s' has wrong checksum", file_name)
1622 # not candidate and this is not a must-have file
1623 _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
1624 "file '%s' should not exist on non master"
1625 " candidates (and the file is outdated)", file_name)
1626 # all good, except non-master/non-must have combination
1627 _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
1628 "file '%s' should not exist"
1629 " on non master candidates", file_name)
1631 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
1633 """Verifies and the node DRBD status.
1635 @type ninfo: L{objects.Node}
1636 @param ninfo: the node to check
1637 @param nresult: the remote results for the node
1638 @param instanceinfo: the dict of instances
1639 @param drbd_helper: the configured DRBD usermode helper
1640 @param drbd_map: the DRBD map as returned by
1641 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1645 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1648 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1649 test = (helper_result is None)
1650 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1651 "no drbd usermode helper returned")
1653 status, payload = helper_result
1655 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1656 "drbd usermode helper check unsuccessful: %s", payload)
1657 test = status and (payload != drbd_helper)
1658 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1659 "wrong drbd usermode helper: %s", payload)
1661 # compute the DRBD minors
1663 for minor, instance in drbd_map[node].items():
1664 test = instance not in instanceinfo
1665 _ErrorIf(test, self.ECLUSTERCFG, None,
1666 "ghost instance '%s' in temporary DRBD map", instance)
1667 # ghost instance should not be running, but otherwise we
1668 # don't give double warnings (both ghost instance and
1669 # unallocated minor in use)
1671 node_drbd[minor] = (instance, False)
1673 instance = instanceinfo[instance]
1674 node_drbd[minor] = (instance.name, instance.admin_up)
1676 # and now check them
1677 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1678 test = not isinstance(used_minors, (tuple, list))
1679 _ErrorIf(test, self.ENODEDRBD, node,
1680 "cannot parse drbd status file: %s", str(used_minors))
1682 # we cannot check drbd status
1685 for minor, (iname, must_exist) in node_drbd.items():
1686 test = minor not in used_minors and must_exist
1687 _ErrorIf(test, self.ENODEDRBD, node,
1688 "drbd minor %d of instance %s is not active", minor, iname)
1689 for minor in used_minors:
1690 test = minor not in node_drbd
1691 _ErrorIf(test, self.ENODEDRBD, node,
1692 "unallocated drbd minor %d is in use", minor)
1694 def _UpdateNodeOS(self, ninfo, nresult, nimg):
1695 """Builds the node OS structures.
1697 @type ninfo: L{objects.Node}
1698 @param ninfo: the node to check
1699 @param nresult: the remote results for the node
1700 @param nimg: the node image object
1704 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1706 remote_os = nresult.get(constants.NV_OSLIST, None)
1707 test = (not isinstance(remote_os, list) or
1708 not compat.all(isinstance(v, list) and len(v) == 7
1709 for v in remote_os))
1711 _ErrorIf(test, self.ENODEOS, node,
1712 "node hasn't returned valid OS data")
1721 for (name, os_path, status, diagnose,
1722 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
1724 if name not in os_dict:
1727 # parameters is a list of lists instead of list of tuples due to
1728 # JSON lacking a real tuple type, fix it:
1729 parameters = [tuple(v) for v in parameters]
1730 os_dict[name].append((os_path, status, diagnose,
1731 set(variants), set(parameters), set(api_ver)))
1733 nimg.oslist = os_dict
1735 def _VerifyNodeOS(self, ninfo, nimg, base):
1736 """Verifies the node OS list.
1738 @type ninfo: L{objects.Node}
1739 @param ninfo: the node to check
1740 @param nimg: the node image object
1741 @param base: the 'template' node we match against (e.g. from the master)
1745 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1747 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
1749 for os_name, os_data in nimg.oslist.items():
1750 assert os_data, "Empty OS status for OS %s?!" % os_name
1751 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
1752 _ErrorIf(not f_status, self.ENODEOS, node,
1753 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
1754 _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
1755 "OS '%s' has multiple entries (first one shadows the rest): %s",
1756 os_name, utils.CommaJoin([v[0] for v in os_data]))
1757 # this will be caught in the backend too
1758 _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
1759 and not f_var, self.ENODEOS, node,
1760 "OS %s with API at least %d does not declare any variant",
1761 os_name, constants.OS_API_V15)
1762 # comparisons with the 'base' image
1763 test = os_name not in base.oslist
1764 _ErrorIf(test, self.ENODEOS, node,
1765 "Extra OS %s not present on reference node (%s)",
1769 assert base.oslist[os_name], "Base node has empty OS status?"
1770 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
1772 # base OS is invalid, skipping
1774 for kind, a, b in [("API version", f_api, b_api),
1775 ("variants list", f_var, b_var),
1776 ("parameters", f_param, b_param)]:
1777 _ErrorIf(a != b, self.ENODEOS, node,
1778 "OS %s %s differs from reference node %s: %s vs. %s",
1779 kind, os_name, base.name,
1780 utils.CommaJoin(a), utils.CommaJoin(b))
1782 # check any missing OSes
1783 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
1784 _ErrorIf(missing, self.ENODEOS, node,
1785 "OSes present on reference node %s but missing on this node: %s",
1786 base.name, utils.CommaJoin(missing))
1788 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1789 """Verifies and updates the node volume data.
1791 This function will update a L{NodeImage}'s internal structures
1792 with data from the remote call.
1794 @type ninfo: L{objects.Node}
1795 @param ninfo: the node to check
1796 @param nresult: the remote results for the node
1797 @param nimg: the node image object
1798 @param vg_name: the configured VG name
1802 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1804 nimg.lvm_fail = True
1805 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1808 elif isinstance(lvdata, basestring):
1809 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
1810 utils.SafeEncode(lvdata))
1811 elif not isinstance(lvdata, dict):
1812 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
1814 nimg.volumes = lvdata
1815 nimg.lvm_fail = False
1817 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1818 """Verifies and updates the node instance list.
1820 If the listing was successful, then updates this node's instance
1821 list. Otherwise, it marks the RPC call as failed for the instance
1824 @type ninfo: L{objects.Node}
1825 @param ninfo: the node to check
1826 @param nresult: the remote results for the node
1827 @param nimg: the node image object
1830 idata = nresult.get(constants.NV_INSTANCELIST, None)
1831 test = not isinstance(idata, list)
1832 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
1833 " (instancelist): %s", utils.SafeEncode(str(idata)))
1835 nimg.hyp_fail = True
1837 nimg.instances = idata
1839 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1840 """Verifies and computes a node information map
1842 @type ninfo: L{objects.Node}
1843 @param ninfo: the node to check
1844 @param nresult: the remote results for the node
1845 @param nimg: the node image object
1846 @param vg_name: the configured VG name
1850 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1852 # try to read free memory (from the hypervisor)
1853 hv_info = nresult.get(constants.NV_HVINFO, None)
1854 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
1855 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
1858 nimg.mfree = int(hv_info["memory_free"])
1859 except (ValueError, TypeError):
1860 _ErrorIf(True, self.ENODERPC, node,
1861 "node returned invalid nodeinfo, check hypervisor")
1863 # FIXME: devise a free space model for file based instances as well
1864 if vg_name is not None:
1865 test = (constants.NV_VGLIST not in nresult or
1866 vg_name not in nresult[constants.NV_VGLIST])
1867 _ErrorIf(test, self.ENODELVM, node,
1868 "node didn't return data for the volume group '%s'"
1869 " - it is either missing or broken", vg_name)
1872 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1873 except (ValueError, TypeError):
1874 _ErrorIf(True, self.ENODERPC, node,
1875 "node returned invalid LVM info, check LVM status")
1877 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
1878 """Gets per-disk status information for all instances.
1880 @type nodelist: list of strings
1881 @param nodelist: Node names
1882 @type node_image: dict of (name, L{objects.Node})
1883 @param node_image: Node objects
1884 @type instanceinfo: dict of (name, L{objects.Instance})
1885 @param instanceinfo: Instance objects
1888 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1891 node_disks_devonly = {}
1893 for nname in nodelist:
1894 disks = [(inst, disk)
1895 for instlist in [node_image[nname].pinst,
1896 node_image[nname].sinst]
1897 for inst in instlist
1898 for disk in instanceinfo[inst].disks]
1901 # No need to collect data
1904 node_disks[nname] = disks
1906 # Creating copies as SetDiskID below will modify the objects and that can
1907 # lead to incorrect data returned from nodes
1908 devonly = [dev.Copy() for (_, dev) in disks]
1911 self.cfg.SetDiskID(dev, nname)
1913 node_disks_devonly[nname] = devonly
1915 assert len(node_disks) == len(node_disks_devonly)
1917 # Collect data from all nodes with disks
1918 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
1921 assert len(result) == len(node_disks)
1925 for (nname, nres) in result.items():
1927 # Ignore offline node
1930 disks = node_disks[nname]
1933 _ErrorIf(msg, self.ENODERPC, nname,
1934 "while getting disk information: %s", nres.fail_msg)
1936 # No data from this node
1937 data = len(disks) * [None]
1941 for ((inst, _), status) in zip(disks, data):
1942 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
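# Illustrative sketch (hypothetical names): after this loop instdisk maps
# instance name -> node name -> list of per-disk status payloads, e.g.
#   {"inst-web": {"node1": [st_disk0, st_disk1],
#                 "node2": [st_disk0, st_disk1]}}
# The assertion below checks this structure for consistency with the
# configuration.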
1944 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
1945 len(nnames) <= len(instanceinfo[inst].all_nodes)
1946 for inst, nnames in instdisk.items()
1947 for nname, statuses in nnames.items())
1951 def BuildHooksEnv(self):
1954 Cluster-Verify hooks are run only in the post phase; if they fail, their
1955 output is logged in the verify output and the verification fails.
1958 all_nodes = self.cfg.GetNodeList()
1960 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
1962 for node in self.cfg.GetAllNodesInfo().values():
1963 env["NODE_TAGS_%s" % node.name] = " ".join(node.GetTags())
1965 return env, [], all_nodes
1967 def Exec(self, feedback_fn):
1968 """Verify integrity of cluster, performing various test on nodes.
1972 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1973 verbose = self.op.verbose
1974 self._feedback_fn = feedback_fn
1975 feedback_fn("* Verifying global settings")
1976 for msg in self.cfg.VerifyConfig():
1977 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
1979 # Check the cluster certificates
1980 for cert_filename in constants.ALL_CERT_FILES:
1981 (errcode, msg) = _VerifyCertificate(cert_filename)
1982 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
1984 vg_name = self.cfg.GetVGName()
1985 drbd_helper = self.cfg.GetDRBDHelper()
1986 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
1987 cluster = self.cfg.GetClusterInfo()
1988 nodelist = utils.NiceSort(self.cfg.GetNodeList())
1989 nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
1990 instancelist = utils.NiceSort(self.cfg.GetInstanceList())
1991 instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
1992 for iname in instancelist)
1993 i_non_redundant = [] # Non redundant instances
1994 i_non_a_balanced = [] # Non auto-balanced instances
1995 n_offline = 0 # Count of offline nodes
1996 n_drained = 0 # Count of nodes being drained
1997 node_vol_should = {}
1999 # FIXME: verify OS list
2000 # do local checksums
2001 master_files = [constants.CLUSTER_CONF_FILE]
2002 master_node = self.master_node = self.cfg.GetMasterNode()
2003 master_ip = self.cfg.GetMasterIP()
2005 file_names = ssconf.SimpleStore().GetFileList()
2006 file_names.extend(constants.ALL_CERT_FILES)
2007 file_names.extend(master_files)
2008 if cluster.modify_etc_hosts:
2009 file_names.append(constants.ETC_HOSTS)
2011 local_checksums = utils.FingerprintFiles(file_names)
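# local_checksums maps each file name to its fingerprint, e.g. (hypothetical
# values) {"/var/lib/ganeti/config.data": "0f1e...", "/etc/hosts": "9a8b..."};
# the per-node results gathered below are compared against these reference
# checksums.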
2013 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
2014 node_verify_param = {
2015 constants.NV_FILELIST: file_names,
2016 constants.NV_NODELIST: [node.name for node in nodeinfo
2017 if not node.offline],
2018 constants.NV_HYPERVISOR: hypervisors,
2019 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
2020 node.secondary_ip) for node in nodeinfo
2021 if not node.offline],
2022 constants.NV_INSTANCELIST: hypervisors,
2023 constants.NV_VERSION: None,
2024 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2025 constants.NV_NODESETUP: None,
2026 constants.NV_TIME: None,
2027 constants.NV_MASTERIP: (master_node, master_ip),
2028 constants.NV_OSLIST: None,
2029 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2032 if vg_name is not None:
2033 node_verify_param[constants.NV_VGLIST] = None
2034 node_verify_param[constants.NV_LVLIST] = vg_name
2035 node_verify_param[constants.NV_PVLIST] = [vg_name]
2036 node_verify_param[constants.NV_DRBDLIST] = None
2039 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2041 # Build our expected cluster state
2042 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2044 vm_capable=node.vm_capable))
2045 for node in nodeinfo)
2047 for instance in instancelist:
2048 inst_config = instanceinfo[instance]
2050 for nname in inst_config.all_nodes:
2051 if nname not in node_image:
2053 gnode = self.NodeImage(name=nname)
2055 node_image[nname] = gnode
2057 inst_config.MapLVsByNode(node_vol_should)
2059 pnode = inst_config.primary_node
2060 node_image[pnode].pinst.append(instance)
2062 for snode in inst_config.secondary_nodes:
2063 nimg = node_image[snode]
2064 nimg.sinst.append(instance)
2065 if pnode not in nimg.sbp:
2066 nimg.sbp[pnode] = []
2067 nimg.sbp[pnode].append(instance)
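# Illustrative example (hypothetical names): for an instance "inst-db" with
# primary node "node1" and secondary node "node2":
#   node_image["node1"].pinst == ["inst-db"]
#   node_image["node2"].sinst == ["inst-db"]
#   node_image["node2"].sbp   == {"node1": ["inst-db"]}   # secondaries by primary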
2069 # At this point, we have the in-memory data structures complete,
2070 # except for the runtime information, which we'll gather next
2072 # Due to the way our RPC system works, exact response times cannot be
2073 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2074 # time before and after executing the request, we can at least have a time window.
2076 nvinfo_starttime = time.time()
2077 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
2078 self.cfg.GetClusterName())
2079 nvinfo_endtime = time.time()
2081 all_drbd_map = self.cfg.ComputeDRBDMap()
2083 feedback_fn("* Gathering disk information (%s nodes)" % len(nodelist))
2084 instdisk = self._CollectDiskInfo(nodelist, node_image, instanceinfo)
2086 feedback_fn("* Verifying node status")
2090 for node_i in nodeinfo:
2092 nimg = node_image[node]
2096 feedback_fn("* Skipping offline node %s" % (node,))
2100 if node == master_node:
2102 elif node_i.master_candidate:
2103 ntype = "master candidate"
2104 elif node_i.drained:
2110 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2112 msg = all_nvinfo[node].fail_msg
2113 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2115 nimg.rpc_fail = True
2118 nresult = all_nvinfo[node].payload
2120 nimg.call_ok = self._VerifyNode(node_i, nresult)
2121 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2122 self._VerifyNodeNetwork(node_i, nresult)
2123 self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
2127 self._VerifyNodeLVM(node_i, nresult, vg_name)
2128 self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper,
2131 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2132 self._UpdateNodeInstances(node_i, nresult, nimg)
2133 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2134 self._UpdateNodeOS(node_i, nresult, nimg)
2135 if not nimg.os_fail:
2136 if refos_img is None:
2138 self._VerifyNodeOS(node_i, nimg, refos_img)
2140 feedback_fn("* Verifying instance status")
2141 for instance in instancelist:
2143 feedback_fn("* Verifying instance %s" % instance)
2144 inst_config = instanceinfo[instance]
2145 self._VerifyInstance(instance, inst_config, node_image,
2147 inst_nodes_offline = []
2149 pnode = inst_config.primary_node
2150 pnode_img = node_image[pnode]
2151 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2152 self.ENODERPC, pnode, "instance %s, connection to"
2153 " primary node failed", instance)
2155 if pnode_img.offline:
2156 inst_nodes_offline.append(pnode)
2158 # If the instance is non-redundant we cannot survive losing its primary
2159 # node, so we are not N+1 compliant. On the other hand we have no disk
2160 # templates with more than one secondary so that situation is not well supported either.
2162 # FIXME: does not support file-backed instances
2163 if not inst_config.secondary_nodes:
2164 i_non_redundant.append(instance)
2165 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2166 instance, "instance has multiple secondary nodes: %s",
2167 utils.CommaJoin(inst_config.secondary_nodes),
2168 code=self.ETYPE_WARNING)
2170 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2171 i_non_a_balanced.append(instance)
2173 for snode in inst_config.secondary_nodes:
2174 s_img = node_image[snode]
2175 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2176 "instance %s, connection to secondary node failed", instance)
2179 inst_nodes_offline.append(snode)
2181 # warn that the instance lives on offline nodes
2182 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2183 "instance lives on offline node(s) %s",
2184 utils.CommaJoin(inst_nodes_offline))
2185 # ... or ghost/non-vm_capable nodes
2186 for node in inst_config.all_nodes:
2187 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2188 "instance lives on ghost node %s", node)
2189 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2190 instance, "instance lives on non-vm_capable node %s", node)
2192 feedback_fn("* Verifying orphan volumes")
2193 reserved = utils.FieldSet(*cluster.reserved_lvs)
2194 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2196 feedback_fn("* Verifying orphan instances")
2197 self._VerifyOrphanInstances(instancelist, node_image)
2199 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2200 feedback_fn("* Verifying N+1 Memory redundancy")
2201 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2203 feedback_fn("* Other Notes")
2205 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2206 % len(i_non_redundant))
2208 if i_non_a_balanced:
2209 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2210 % len(i_non_a_balanced))
2213 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2216 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2220 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2221 """Analyze the post-hooks' result
2223 This method analyses the hook result, handles it, and sends some
2224 nicely-formatted feedback back to the user.
2226 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2227 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2228 @param hooks_results: the results of the multi-node hooks rpc call
2229 @param feedback_fn: function used to send feedback back to the caller
2230 @param lu_result: previous Exec result
2231 @return: the new Exec result, based on the previous result
2235 # We only really run POST phase hooks, and are only interested in their results
2237 if phase == constants.HOOKS_PHASE_POST:
2238 # Used to change hooks' output to proper indentation
2239 indent_re = re.compile('^', re.M)
2240 feedback_fn("* Hooks Results")
2241 assert hooks_results, "invalid result from hooks"
2243 for node_name in hooks_results:
2244 res = hooks_results[node_name]
2246 test = msg and not res.offline
2247 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2248 "Communication failure in hooks execution: %s", msg)
2249 if res.offline or msg:
2250 # No need to investigate payload if node is offline or gave an error.
2251 # override manually lu_result here as _ErrorIf only
2252 # overrides self.bad
2255 for script, hkr, output in res.payload:
2256 test = hkr == constants.HKR_FAIL
2257 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2258 "Script %s failed, output:", script)
2260 output = indent_re.sub(' ', output)
2261 feedback_fn("%s" % output)
2267 class LUVerifyDisks(NoHooksLU):
2268 """Verifies the cluster disks status.
2273 def ExpandNames(self):
2274 self.needed_locks = {
2275 locking.LEVEL_NODE: locking.ALL_SET,
2276 locking.LEVEL_INSTANCE: locking.ALL_SET,
2278 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2280 def Exec(self, feedback_fn):
2281 """Verify integrity of cluster disks.
2283 @rtype: tuple of three items
2284 @return: a tuple of (dict of node-to-node_error, list of instances
2285 which need activate-disks, dict of instance: (node, volume) for missing volumes)
2289 result = res_nodes, res_instances, res_missing = {}, [], {}
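# Sketch of the returned tuple with hypothetical content:
#   res_nodes     == {"node3": "rpc error ..."}            # per-node errors
#   res_instances == ["inst-web"]                          # need activate-disks
#   res_missing   == {"inst-db": [("node2", "xenvg/disk0_data")]}  # missing LVs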
2291 vg_name = self.cfg.GetVGName()
2292 nodes = utils.NiceSort(self.cfg.GetNodeList())
2293 instances = [self.cfg.GetInstanceInfo(name)
2294 for name in self.cfg.GetInstanceList()]
2297 for inst in instances:
2299 if (not inst.admin_up or
2300 inst.disk_template not in constants.DTS_NET_MIRROR):
2302 inst.MapLVsByNode(inst_lvs)
2303 # transform { iname: {node: [vol,],},} to {(node, vol): iname}
2304 for node, vol_list in inst_lvs.iteritems():
2305 for vol in vol_list:
2306 nv_dict[(node, vol)] = inst
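# Hypothetical example of the resulting map:
#   nv_dict == {("node1", "xenvg/disk0_data"): <Instance inst-db>,
#               ("node1", "xenvg/disk0_meta"): <Instance inst-db>}
# Entries are popped below as the LVs are reported, so any leftovers are
# missing volumes.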
2311 node_lvs = self.rpc.call_lv_list(nodes, vg_name)
2315 node_res = node_lvs[node]
2316 if node_res.offline:
2318 msg = node_res.fail_msg
2320 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
2321 res_nodes[node] = msg
2324 lvs = node_res.payload
2325 for lv_name, (_, _, lv_online) in lvs.items():
2326 inst = nv_dict.pop((node, lv_name), None)
2327 if (not lv_online and inst is not None
2328 and inst.name not in res_instances):
2329 res_instances.append(inst.name)
2331 # any leftover items in nv_dict are missing LVs, let's arrange the data better
2333 for key, inst in nv_dict.iteritems():
2334 if inst.name not in res_missing:
2335 res_missing[inst.name] = []
2336 res_missing[inst.name].append(key)
2341 class LURepairDiskSizes(NoHooksLU):
2342 """Verifies the cluster disks sizes.
2345 _OP_PARAMS = [("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString))]
2348 def ExpandNames(self):
2349 if self.op.instances:
2350 self.wanted_names = []
2351 for name in self.op.instances:
2352 full_name = _ExpandInstanceName(self.cfg, name)
2353 self.wanted_names.append(full_name)
2354 self.needed_locks = {
2355 locking.LEVEL_NODE: [],
2356 locking.LEVEL_INSTANCE: self.wanted_names,
2358 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
2360 self.wanted_names = None
2361 self.needed_locks = {
2362 locking.LEVEL_NODE: locking.ALL_SET,
2363 locking.LEVEL_INSTANCE: locking.ALL_SET,
2365 self.share_locks = dict(((i, 1) for i in locking.LEVELS))
2367 def DeclareLocks(self, level):
2368 if level == locking.LEVEL_NODE and self.wanted_names is not None:
2369 self._LockInstancesNodes(primary_only=True)
2371 def CheckPrereq(self):
2372 """Check prerequisites.
2374 This only checks the optional instance list against the existing names.
2377 if self.wanted_names is None:
2378 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
2380 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2381 in self.wanted_names]
2383 def _EnsureChildSizes(self, disk):
2384 """Ensure children of the disk have the needed disk size.
2386 This is valid mainly for DRBD8 and fixes an issue where the
2387 children have smaller disk size.
2389 @param disk: an L{ganeti.objects.Disk} object
2392 if disk.dev_type == constants.LD_DRBD8:
2393 assert disk.children, "Empty children for DRBD8?"
2394 fchild = disk.children[0]
2395 mismatch = fchild.size < disk.size
2397 self.LogInfo("Child disk has size %d, parent %d, fixing",
2398 fchild.size, disk.size)
2399 fchild.size = disk.size
2401 # and we recurse on this child only, not on the metadev
2402 return self._EnsureChildSizes(fchild) or mismatch
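# Example (hypothetical sizes): a DRBD8 disk recorded at 10240 MiB whose data
# child (the backing LV) is recorded at 10236 MiB gets the child bumped to
# 10240 MiB; the True return value tells the caller to write the updated
# configuration.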
2406 def Exec(self, feedback_fn):
2407 """Verify the size of cluster disks.
2410 # TODO: check child disks too
2411 # TODO: check differences in size between primary/secondary nodes
2413 for instance in self.wanted_instances:
2414 pnode = instance.primary_node
2415 if pnode not in per_node_disks:
2416 per_node_disks[pnode] = []
2417 for idx, disk in enumerate(instance.disks):
2418 per_node_disks[pnode].append((instance, idx, disk))
2421 for node, dskl in per_node_disks.items():
2422 newl = [v[2].Copy() for v in dskl]
2424 self.cfg.SetDiskID(dsk, node)
2425 result = self.rpc.call_blockdev_getsizes(node, newl)
2427 self.LogWarning("Failure in blockdev_getsizes call to node"
2428 " %s, ignoring", node)
2430 if len(result.data) != len(dskl):
2431 self.LogWarning("Invalid result from node %s, ignoring node results",
2434 for ((instance, idx, disk), size) in zip(dskl, result.data):
2436 self.LogWarning("Disk %d of instance %s did not return size"
2437 " information, ignoring", idx, instance.name)
2439 if not isinstance(size, (int, long)):
2440 self.LogWarning("Disk %d of instance %s did not return valid"
2441 " size information, ignoring", idx, instance.name)
2444 if size != disk.size:
2445 self.LogInfo("Disk %d of instance %s has mismatched size,"
2446 " correcting: recorded %d, actual %d", idx,
2447 instance.name, disk.size, size)
2449 self.cfg.Update(instance, feedback_fn)
2450 changed.append((instance.name, idx, size))
2451 if self._EnsureChildSizes(disk):
2452 self.cfg.Update(instance, feedback_fn)
2453 changed.append((instance.name, idx, disk.size))
2457 class LURenameCluster(LogicalUnit):
2458 """Rename the cluster.
2461 HPATH = "cluster-rename"
2462 HTYPE = constants.HTYPE_CLUSTER
2463 _OP_PARAMS = [("name", ht.NoDefault, ht.TNonEmptyString)]
2465 def BuildHooksEnv(self):
2470 "OP_TARGET": self.cfg.GetClusterName(),
2471 "NEW_NAME": self.op.name,
2473 mn = self.cfg.GetMasterNode()
2474 all_nodes = self.cfg.GetNodeList()
2475 return env, [mn], all_nodes
2477 def CheckPrereq(self):
2478 """Verify that the passed name is a valid one.
2481 hostname = netutils.GetHostname(name=self.op.name,
2482 family=self.cfg.GetPrimaryIPFamily())
2484 new_name = hostname.name
2485 self.ip = new_ip = hostname.ip
2486 old_name = self.cfg.GetClusterName()
2487 old_ip = self.cfg.GetMasterIP()
2488 if new_name == old_name and new_ip == old_ip:
2489 raise errors.OpPrereqError("Neither the name nor the IP address of the"
2490 " cluster has changed",
2492 if new_ip != old_ip:
2493 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
2494 raise errors.OpPrereqError("The given cluster IP address (%s) is"
2495 " reachable on the network" %
2496 new_ip, errors.ECODE_NOTUNIQUE)
2498 self.op.name = new_name
2500 def Exec(self, feedback_fn):
2501 """Rename the cluster.
2504 clustername = self.op.name
2507 # shutdown the master IP
2508 master = self.cfg.GetMasterNode()
2509 result = self.rpc.call_node_stop_master(master, False)
2510 result.Raise("Could not disable the master role")
2513 cluster = self.cfg.GetClusterInfo()
2514 cluster.cluster_name = clustername
2515 cluster.master_ip = ip
2516 self.cfg.Update(cluster, feedback_fn)
2518 # update the known hosts file
2519 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
2520 node_list = self.cfg.GetNodeList()
2522 node_list.remove(master)
2525 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
2527 result = self.rpc.call_node_start_master(master, False, False)
2528 msg = result.fail_msg
2530 self.LogWarning("Could not re-enable the master role on"
2531 " the master, please restart manually: %s", msg)
2536 class LUSetClusterParams(LogicalUnit):
2537 """Change the parameters of the cluster.
2540 HPATH = "cluster-modify"
2541 HTYPE = constants.HTYPE_CLUSTER
2543 ("vg_name", None, ht.TMaybeString),
2544 ("enabled_hypervisors", None,
2545 ht.TOr(ht.TAnd(ht.TListOf(ht.TElemOf(constants.HYPER_TYPES)), ht.TTrue),
2547 ("hvparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2549 ("beparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2551 ("os_hvp", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2553 ("osparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
2555 ("candidate_pool_size", None, ht.TOr(ht.TStrictPositiveInt, ht.TNone)),
2556 ("uid_pool", None, ht.NoType),
2557 ("add_uids", None, ht.NoType),
2558 ("remove_uids", None, ht.NoType),
2559 ("maintain_node_health", None, ht.TMaybeBool),
2560 ("prealloc_wipe_disks", None, ht.TMaybeBool),
2561 ("nicparams", None, ht.TOr(ht.TDict, ht.TNone)),
2562 ("drbd_helper", None, ht.TOr(ht.TString, ht.TNone)),
2563 ("default_iallocator", None, ht.TOr(ht.TString, ht.TNone)),
2564 ("reserved_lvs", None, ht.TOr(ht.TListOf(ht.TNonEmptyString), ht.TNone)),
2565 ("hidden_os", None, ht.TOr(ht.TListOf(\
2568 ht.TMap(lambda v: v[0], ht.TElemOf(constants.DDMS_VALUES)))),
2570 ("blacklisted_os", None, ht.TOr(ht.TListOf(\
2573 ht.TMap(lambda v: v[0], ht.TElemOf(constants.DDMS_VALUES)))),
2578 def CheckArguments(self):
2582 if self.op.uid_pool:
2583 uidpool.CheckUidPool(self.op.uid_pool)
2585 if self.op.add_uids:
2586 uidpool.CheckUidPool(self.op.add_uids)
2588 if self.op.remove_uids:
2589 uidpool.CheckUidPool(self.op.remove_uids)
2591 def ExpandNames(self):
2592 # FIXME: in the future, modifying other cluster parameters might not require
2593 # checking on all nodes.
2594 self.needed_locks = {
2595 locking.LEVEL_NODE: locking.ALL_SET,
2597 self.share_locks[locking.LEVEL_NODE] = 1
2599 def BuildHooksEnv(self):
2604 "OP_TARGET": self.cfg.GetClusterName(),
2605 "NEW_VG_NAME": self.op.vg_name,
2607 mn = self.cfg.GetMasterNode()
2608 return env, [mn], [mn]
2610 def CheckPrereq(self):
2611 """Check prerequisites.
2613 This checks that the given parameters do not conflict and
2614 that the given volume group is valid.
2617 if self.op.vg_name is not None and not self.op.vg_name:
2618 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
2619 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
2620 " instances exist", errors.ECODE_INVAL)
2622 if self.op.drbd_helper is not None and not self.op.drbd_helper:
2623 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
2624 raise errors.OpPrereqError("Cannot disable drbd helper while"
2625 " drbd-based instances exist",
2628 node_list = self.acquired_locks[locking.LEVEL_NODE]
2630 # if vg_name is not None, check the given volume group on all nodes
2632 vglist = self.rpc.call_vg_list(node_list)
2633 for node in node_list:
2634 msg = vglist[node].fail_msg
2636 # ignoring down node
2637 self.LogWarning("Error while gathering data on node %s"
2638 " (ignoring node): %s", node, msg)
2640 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2642 constants.MIN_VG_SIZE)
2644 raise errors.OpPrereqError("Error on node '%s': %s" %
2645 (node, vgstatus), errors.ECODE_ENVIRON)
2647 if self.op.drbd_helper:
2648 # checks given drbd helper on all nodes
2649 helpers = self.rpc.call_drbd_helper(node_list)
2650 for node in node_list:
2651 ninfo = self.cfg.GetNodeInfo(node)
2653 self.LogInfo("Not checking drbd helper on offline node %s", node)
2655 msg = helpers[node].fail_msg
2657 raise errors.OpPrereqError("Error checking drbd helper on node"
2658 " '%s': %s" % (node, msg),
2659 errors.ECODE_ENVIRON)
2660 node_helper = helpers[node].payload
2661 if node_helper != self.op.drbd_helper:
2662 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
2663 (node, node_helper), errors.ECODE_ENVIRON)
2665 self.cluster = cluster = self.cfg.GetClusterInfo()
2666 # validate params changes
2667 if self.op.beparams:
2668 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2669 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2671 if self.op.nicparams:
2672 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2673 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2674 objects.NIC.CheckParameterSyntax(self.new_nicparams)
2677 # check all instances for consistency
2678 for instance in self.cfg.GetAllInstancesInfo().values():
2679 for nic_idx, nic in enumerate(instance.nics):
2680 params_copy = copy.deepcopy(nic.nicparams)
2681 params_filled = objects.FillDict(self.new_nicparams, params_copy)
2683 # check parameter syntax
2685 objects.NIC.CheckParameterSyntax(params_filled)
2686 except errors.ConfigurationError, err:
2687 nic_errors.append("Instance %s, nic/%d: %s" %
2688 (instance.name, nic_idx, err))
2690 # if we're moving instances to routed, check that they have an ip
2691 target_mode = params_filled[constants.NIC_MODE]
2692 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
2693 nic_errors.append("Instance %s, nic/%d: routed NIC with no IP" %
2694 (instance.name, nic_idx))
2696 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
2697 "\n".join(nic_errors))
2699 # hypervisor list/parameters
2700 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
2701 if self.op.hvparams:
2702 for hv_name, hv_dict in self.op.hvparams.items():
2703 if hv_name not in self.new_hvparams:
2704 self.new_hvparams[hv_name] = hv_dict
2706 self.new_hvparams[hv_name].update(hv_dict)
2708 # os hypervisor parameters
2709 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
2711 for os_name, hvs in self.op.os_hvp.items():
2712 if os_name not in self.new_os_hvp:
2713 self.new_os_hvp[os_name] = hvs
2715 for hv_name, hv_dict in hvs.items():
2716 if hv_name not in self.new_os_hvp[os_name]:
2717 self.new_os_hvp[os_name][hv_name] = hv_dict
2719 self.new_os_hvp[os_name][hv_name].update(hv_dict)
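# Merge semantics, with hypothetical values: if cluster.os_hvp contains
#   {"debian-squeeze": {"xen-pvm": {"kernel_path": "/boot/vmlinuz"}}}
# and the opcode passes {"debian-squeeze": {"xen-pvm": {"root_path": "/dev/xvda1"}}},
# only the listed keys for that OS/hypervisor pair are added or overridden;
# other settings are left untouched.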
2722 self.new_osp = objects.FillDict(cluster.osparams, {})
2723 if self.op.osparams:
2724 for os_name, osp in self.op.osparams.items():
2725 if os_name not in self.new_osp:
2726 self.new_osp[os_name] = {}
2728 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
2731 if not self.new_osp[os_name]:
2732 # we removed all parameters
2733 del self.new_osp[os_name]
2735 # check the parameter validity (remote check)
2736 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
2737 os_name, self.new_osp[os_name])
2739 # changes to the hypervisor list
2740 if self.op.enabled_hypervisors is not None:
2741 self.hv_list = self.op.enabled_hypervisors
2742 for hv in self.hv_list:
2743 # if the hypervisor doesn't already exist in the cluster
2744 # hvparams, we initialize it to empty, and then (in both
2745 # cases) we make sure to fill the defaults, as we might not
2746 # have a complete defaults list if the hypervisor wasn't enabled before
2748 if hv not in new_hvp:
2750 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
2751 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
2753 self.hv_list = cluster.enabled_hypervisors
2755 if self.op.hvparams or self.op.enabled_hypervisors is not None:
2756 # either the enabled list has changed, or the parameters have, validate
2757 for hv_name, hv_params in self.new_hvparams.items():
2758 if ((self.op.hvparams and hv_name in self.op.hvparams) or
2759 (self.op.enabled_hypervisors and
2760 hv_name in self.op.enabled_hypervisors)):
2761 # either this is a new hypervisor, or its parameters have changed
2762 hv_class = hypervisor.GetHypervisor(hv_name)
2763 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2764 hv_class.CheckParameterSyntax(hv_params)
2765 _CheckHVParams(self, node_list, hv_name, hv_params)
2768 # no need to check any newly-enabled hypervisors, since the
2769 # defaults have already been checked in the above code-block
2770 for os_name, os_hvp in self.new_os_hvp.items():
2771 for hv_name, hv_params in os_hvp.items():
2772 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2773 # we need to fill in the new os_hvp on top of the actual hv_p
2774 cluster_defaults = self.new_hvparams.get(hv_name, {})
2775 new_osp = objects.FillDict(cluster_defaults, hv_params)
2776 hv_class = hypervisor.GetHypervisor(hv_name)
2777 hv_class.CheckParameterSyntax(new_osp)
2778 _CheckHVParams(self, node_list, hv_name, new_osp)
2780 if self.op.default_iallocator:
2781 alloc_script = utils.FindFile(self.op.default_iallocator,
2782 constants.IALLOCATOR_SEARCH_PATH,
2784 if alloc_script is None:
2785 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
2786 " specified" % self.op.default_iallocator,
2789 def Exec(self, feedback_fn):
2790 """Change the parameters of the cluster.
2793 if self.op.vg_name is not None:
2794 new_volume = self.op.vg_name
2797 if new_volume != self.cfg.GetVGName():
2798 self.cfg.SetVGName(new_volume)
2800 feedback_fn("Cluster LVM configuration already in desired"
2801 " state, not changing")
2802 if self.op.drbd_helper is not None:
2803 new_helper = self.op.drbd_helper
2806 if new_helper != self.cfg.GetDRBDHelper():
2807 self.cfg.SetDRBDHelper(new_helper)
2809 feedback_fn("Cluster DRBD helper already in desired state,"
2811 if self.op.hvparams:
2812 self.cluster.hvparams = self.new_hvparams
2814 self.cluster.os_hvp = self.new_os_hvp
2815 if self.op.enabled_hypervisors is not None:
2816 self.cluster.hvparams = self.new_hvparams
2817 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
2818 if self.op.beparams:
2819 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
2820 if self.op.nicparams:
2821 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
2822 if self.op.osparams:
2823 self.cluster.osparams = self.new_osp
2825 if self.op.candidate_pool_size is not None:
2826 self.cluster.candidate_pool_size = self.op.candidate_pool_size
2827 # we need to update the pool size here, otherwise the save will fail
2828 _AdjustCandidatePool(self, [])
2830 if self.op.maintain_node_health is not None:
2831 self.cluster.maintain_node_health = self.op.maintain_node_health
2833 if self.op.prealloc_wipe_disks is not None:
2834 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
2836 if self.op.add_uids is not None:
2837 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
2839 if self.op.remove_uids is not None:
2840 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
2842 if self.op.uid_pool is not None:
2843 self.cluster.uid_pool = self.op.uid_pool
2845 if self.op.default_iallocator is not None:
2846 self.cluster.default_iallocator = self.op.default_iallocator
2848 if self.op.reserved_lvs is not None:
2849 self.cluster.reserved_lvs = self.op.reserved_lvs
2851 def helper_os(aname, mods, desc):
2853 lst = getattr(self.cluster, aname)
2854 for key, val in mods:
2855 if key == constants.DDM_ADD:
2857 feedback_fn("OS %s already in %s, ignoring", val, desc)
2860 elif key == constants.DDM_REMOVE:
2864 feedback_fn("OS %s not found in %s, ignoring", val, desc)
2866 raise errors.ProgrammerError("Invalid modification '%s'" % key)
2868 if self.op.hidden_os:
2869 helper_os("hidden_os", self.op.hidden_os, "hidden")
2871 if self.op.blacklisted_os:
2872 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
2874 self.cfg.Update(self.cluster, feedback_fn)
2877 def _UploadHelper(lu, nodes, fname):
2878 """Helper for uploading a file and showing warnings.
2881 if os.path.exists(fname):
2882 result = lu.rpc.call_upload_file(nodes, fname)
2883 for to_node, to_result in result.items():
2884 msg = to_result.fail_msg
2886 msg = ("Copy of file %s to node %s failed: %s" %
2887 (fname, to_node, msg))
2888 lu.proc.LogWarning(msg)
2891 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
2892 """Distribute additional files which are part of the cluster configuration.
2894 ConfigWriter takes care of distributing the config and ssconf files, but
2895 there are more files which should be distributed to all nodes. This function
2896 makes sure those are copied.
2898 @param lu: calling logical unit
2899 @param additional_nodes: list of nodes not in the config to distribute to
2900 @type additional_vm: boolean
2901 @param additional_vm: whether the additional nodes are vm-capable or not
2904 # 1. Gather target nodes
2905 myself = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
2906 dist_nodes = lu.cfg.GetOnlineNodeList()
2907 nvm_nodes = lu.cfg.GetNonVmCapableNodeList()
2908 vm_nodes = [name for name in dist_nodes if name not in nvm_nodes]
2909 if additional_nodes is not None:
2910 dist_nodes.extend(additional_nodes)
2912 vm_nodes.extend(additional_nodes)
2913 if myself.name in dist_nodes:
2914 dist_nodes.remove(myself.name)
2915 if myself.name in vm_nodes:
2916 vm_nodes.remove(myself.name)
2918 # 2. Gather files to distribute
2919 dist_files = set([constants.ETC_HOSTS,
2920 constants.SSH_KNOWN_HOSTS_FILE,
2921 constants.RAPI_CERT_FILE,
2922 constants.RAPI_USERS_FILE,
2923 constants.CONFD_HMAC_KEY,
2924 constants.CLUSTER_DOMAIN_SECRET_FILE,
2928 enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
2929 for hv_name in enabled_hypervisors:
2930 hv_class = hypervisor.GetHypervisor(hv_name)
2931 vm_files.update(hv_class.GetAncillaryFiles())
2933 # 3. Perform the files upload
2934 for fname in dist_files:
2935 _UploadHelper(lu, dist_nodes, fname)
2936 for fname in vm_files:
2937 _UploadHelper(lu, vm_nodes, fname)
2940 class LURedistributeConfig(NoHooksLU):
2941 """Force the redistribution of cluster configuration.
2943 This is a very simple LU.
2948 def ExpandNames(self):
2949 self.needed_locks = {
2950 locking.LEVEL_NODE: locking.ALL_SET,
2952 self.share_locks[locking.LEVEL_NODE] = 1
2954 def Exec(self, feedback_fn):
2955 """Redistribute the configuration.
2958 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
2959 _RedistributeAncillaryFiles(self)
2962 def _WaitForSync(lu, instance, disks=None, oneshot=False):
2963 """Sleep and poll for an instance's disk to sync.
2966 if not instance.disks or disks is not None and not disks:
2969 disks = _ExpandCheckDisks(instance, disks)
2972 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
2974 node = instance.primary_node
2977 lu.cfg.SetDiskID(dev, node)
2979 # TODO: Convert to utils.Retry
2982 degr_retries = 10 # in seconds, as we sleep 1 second each time
2986 cumul_degraded = False
2987 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
2988 msg = rstats.fail_msg
2990 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
2993 raise errors.RemoteError("Can't contact node %s for mirror data,"
2994 " aborting." % node)
2997 rstats = rstats.payload
2999 for i, mstat in enumerate(rstats):
3001 lu.LogWarning("Can't compute data for node %s/%s",
3002 node, disks[i].iv_name)
3005 cumul_degraded = (cumul_degraded or
3006 (mstat.is_degraded and mstat.sync_percent is None))
3007 if mstat.sync_percent is not None:
3009 if mstat.estimated_time is not None:
3010 rem_time = ("%s remaining (estimated)" %
3011 utils.FormatSeconds(mstat.estimated_time))
3012 max_time = mstat.estimated_time
3014 rem_time = "no time estimate"
3015 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3016 (disks[i].iv_name, mstat.sync_percent, rem_time))
3018 # if we're done but degraded, let's do a few small retries, to
3019 # make sure we see a stable and not transient situation; therefore
3020 # we force restart of the loop
3021 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3022 logging.info("Degraded disks found, %d retries left", degr_retries)
3030 time.sleep(min(60, max_time))
3033 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3034 return not cumul_degraded
3037 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3038 """Check that mirrors are not degraded.
3040 The ldisk parameter, if True, will change the test from the
3041 is_degraded attribute (which represents overall non-ok status for
3042 the device(s)) to the ldisk (representing the local storage status).
3045 lu.cfg.SetDiskID(dev, node)
3049 if on_primary or dev.AssembleOnSecondary():
3050 rstats = lu.rpc.call_blockdev_find(node, dev)
3051 msg = rstats.fail_msg
3053 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3055 elif not rstats.payload:
3056 lu.LogWarning("Can't find disk on node %s", node)
3060 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
3062 result = result and not rstats.payload.is_degraded
3065 for child in dev.children:
3066 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
3071 class LUDiagnoseOS(NoHooksLU):
3072 """Logical unit for OS diagnose/query.
3077 ("names", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
3081 _BLK = "blacklisted"
3083 _FIELDS_STATIC = utils.FieldSet()
3084 _FIELDS_DYNAMIC = utils.FieldSet("name", _VLD, "node_status", "variants",
3085 "parameters", "api_versions", _HID, _BLK)
3087 def CheckArguments(self):
3089 raise errors.OpPrereqError("Selective OS query not supported",
3092 _CheckOutputFields(static=self._FIELDS_STATIC,
3093 dynamic=self._FIELDS_DYNAMIC,
3094 selected=self.op.output_fields)
3096 def ExpandNames(self):
3097 # Lock all nodes, in shared mode
3098 # Temporary removal of locks, should be reverted later
3099 # TODO: reintroduce locks when they are lighter-weight
3100 self.needed_locks = {}
3101 #self.share_locks[locking.LEVEL_NODE] = 1
3102 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3105 def _DiagnoseByOS(rlist):
3106 """Remaps a per-node return list into an a per-os per-node dictionary
3108 @param rlist: a map with node names as keys and OS objects as values
3111 @return: a dictionary with osnames as keys and as value another
3112 map, with nodes as keys and tuples of (path, status, diagnose,
3113 variants, parameters, api_versions) as values, eg::
3115 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
3116 (/srv/..., False, "invalid api")],
3117 "node2": [(/srv/..., True, "", [], [])]}
3122 # we build here the list of nodes that didn't fail the RPC (at RPC
3123 # level), so that nodes with a non-responding node daemon don't
3124 # make all OSes invalid
3125 good_nodes = [node_name for node_name in rlist
3126 if not rlist[node_name].fail_msg]
3127 for node_name, nr in rlist.items():
3128 if nr.fail_msg or not nr.payload:
3130 for (name, path, status, diagnose, variants,
3131 params, api_versions) in nr.payload:
3132 if name not in all_os:
3133 # build a list of nodes for this os containing empty lists
3134 # for each node in node_list
3136 for nname in good_nodes:
3137 all_os[name][nname] = []
3138 # convert params from [name, help] to (name, help)
3139 params = [tuple(v) for v in params]
3140 all_os[name][node_name].append((path, status, diagnose,
3141 variants, params, api_versions))
3144 def Exec(self, feedback_fn):
3145 """Compute the list of OSes.
3148 valid_nodes = [node for node in self.cfg.GetOnlineNodeList()]
3149 node_data = self.rpc.call_os_diagnose(valid_nodes)
3150 pol = self._DiagnoseByOS(node_data)
3152 cluster = self.cfg.GetClusterInfo()
3154 for os_name in utils.NiceSort(pol.keys()):
3155 os_data = pol[os_name]
3158 (variants, params, api_versions) = null_state = (set(), set(), set())
3159 for idx, osl in enumerate(os_data.values()):
3160 valid = bool(valid and osl and osl[0][1])
3162 (variants, params, api_versions) = null_state
3164 node_variants, node_params, node_api = osl[0][3:6]
3165 if idx == 0: # first entry
3166 variants = set(node_variants)
3167 params = set(node_params)
3168 api_versions = set(node_api)
3169 else: # keep consistency
3170 variants.intersection_update(node_variants)
3171 params.intersection_update(node_params)
3172 api_versions.intersection_update(node_api)
3174 is_hid = os_name in cluster.hidden_os
3175 is_blk = os_name in cluster.blacklisted_os
3176 if ((self._HID not in self.op.output_fields and is_hid) or
3177 (self._BLK not in self.op.output_fields and is_blk) or
3178 (self._VLD not in self.op.output_fields and not valid)):
3181 for field in self.op.output_fields:
3184 elif field == self._VLD:
3186 elif field == "node_status":
3187 # this is just a copy of the dict
3189 for node_name, nos_list in os_data.items():
3190 val[node_name] = nos_list
3191 elif field == "variants":
3192 val = utils.NiceSort(list(variants))
3193 elif field == "parameters":
3195 elif field == "api_versions":
3196 val = list(api_versions)
3197 elif field == self._HID:
3199 elif field == self._BLK:
3202 raise errors.ParameterError(field)
3209 class LURemoveNode(LogicalUnit):
3210 """Logical unit for removing a node.
3213 HPATH = "node-remove"
3214 HTYPE = constants.HTYPE_NODE
3219 def BuildHooksEnv(self):
3222 This doesn't run on the target node in the pre phase as a failed
3223 node would then be impossible to remove.
3227 "OP_TARGET": self.op.node_name,
3228 "NODE_NAME": self.op.node_name,
3230 all_nodes = self.cfg.GetNodeList()
3232 all_nodes.remove(self.op.node_name)
3234 logging.warning("Node %s which is about to be removed not found"
3235 " in the all nodes list", self.op.node_name)
3236 return env, all_nodes, all_nodes
3238 def CheckPrereq(self):
3239 """Check prerequisites.
3242 - the node exists in the configuration
3243 - it does not have primary or secondary instances
3244 - it's not the master
3246 Any errors are signaled by raising errors.OpPrereqError.
3249 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3250 node = self.cfg.GetNodeInfo(self.op.node_name)
3251 assert node is not None
3253 instance_list = self.cfg.GetInstanceList()
3255 masternode = self.cfg.GetMasterNode()
3256 if node.name == masternode:
3257 raise errors.OpPrereqError("Node is the master node,"
3258 " you need to failover first.",
3261 for instance_name in instance_list:
3262 instance = self.cfg.GetInstanceInfo(instance_name)
3263 if node.name in instance.all_nodes:
3264 raise errors.OpPrereqError("Instance %s is still running on the node,"
3265 " please remove first." % instance_name,
3267 self.op.node_name = node.name
3270 def Exec(self, feedback_fn):
3271 """Removes the node from the cluster.
3275 logging.info("Stopping the node daemon and removing configs from node %s",
3278 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
3280 # Promote nodes to master candidate as needed
3281 _AdjustCandidatePool(self, exceptions=[node.name])
3282 self.context.RemoveNode(node.name)
3284 # Run post hooks on the node before it's removed
3285 hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
3287 hm.RunPhase(constants.HOOKS_PHASE_POST, [node.name])
3289 # pylint: disable-msg=W0702
3290 self.LogWarning("Errors occurred running hooks on %s" % node.name)
3292 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
3293 msg = result.fail_msg
3295 self.LogWarning("Errors encountered on the remote node while leaving"
3296 " the cluster: %s", msg)
3298 # Remove node from our /etc/hosts
3299 if self.cfg.GetClusterInfo().modify_etc_hosts:
3300 master_node = self.cfg.GetMasterNode()
3301 result = self.rpc.call_etc_hosts_modify(master_node,
3302 constants.ETC_HOSTS_REMOVE,
3304 result.Raise("Can't update hosts file with new host data")
3305 _RedistributeAncillaryFiles(self)
3308 class LUQueryNodes(NoHooksLU):
3309 """Logical unit for querying nodes.
3312 # pylint: disable-msg=W0142
3315 ("names", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
3316 ("use_locking", False, ht.TBool),
3320 _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
3321 "master_candidate", "offline", "drained",
3322 "master_capable", "vm_capable"]
3324 _FIELDS_DYNAMIC = utils.FieldSet(
3326 "mtotal", "mnode", "mfree",
3328 "ctotal", "cnodes", "csockets",
3331 _FIELDS_STATIC = utils.FieldSet(*[
3332 "pinst_cnt", "sinst_cnt",
3333 "pinst_list", "sinst_list",
3334 "pip", "sip", "tags",
3336 "role"] + _SIMPLE_FIELDS
3339 def CheckArguments(self):
3340 _CheckOutputFields(static=self._FIELDS_STATIC,
3341 dynamic=self._FIELDS_DYNAMIC,
3342 selected=self.op.output_fields)
3344 def ExpandNames(self):
3345 self.needed_locks = {}
3346 self.share_locks[locking.LEVEL_NODE] = 1
3349 self.wanted = _GetWantedNodes(self, self.op.names)
3351 self.wanted = locking.ALL_SET
3353 self.do_node_query = self._FIELDS_STATIC.NonMatching(self.op.output_fields)
3354 self.do_locking = self.do_node_query and self.op.use_locking
3356 # if we don't request only static fields, we need to lock the nodes
3357 self.needed_locks[locking.LEVEL_NODE] = self.wanted
3359 def Exec(self, feedback_fn):
3360 """Computes the list of nodes and their attributes.
3363 all_info = self.cfg.GetAllNodesInfo()
3365 nodenames = self.acquired_locks[locking.LEVEL_NODE]
3366 elif self.wanted != locking.ALL_SET:
3367 nodenames = self.wanted
3368 missing = set(nodenames).difference(all_info.keys())
3370 raise errors.OpExecError(
3371 "Some nodes were removed before retrieving their data: %s" % missing)
3373 nodenames = all_info.keys()
3375 nodenames = utils.NiceSort(nodenames)
3376 nodelist = [all_info[name] for name in nodenames]
3378 # begin data gathering
3380 if self.do_node_query:
3382 node_data = self.rpc.call_node_info(nodenames, self.cfg.GetVGName(),
3383 self.cfg.GetHypervisorType())
3384 for name in nodenames:
3385 nodeinfo = node_data[name]
3386 if not nodeinfo.fail_msg and nodeinfo.payload:
3387 nodeinfo = nodeinfo.payload
3388 fn = utils.TryConvert
3390 "mtotal": fn(int, nodeinfo.get('memory_total', None)),
3391 "mnode": fn(int, nodeinfo.get('memory_dom0', None)),
3392 "mfree": fn(int, nodeinfo.get('memory_free', None)),
3393 "dtotal": fn(int, nodeinfo.get('vg_size', None)),
3394 "dfree": fn(int, nodeinfo.get('vg_free', None)),
3395 "ctotal": fn(int, nodeinfo.get('cpu_total', None)),
3396 "bootid": nodeinfo.get('bootid', None),
3397 "cnodes": fn(int, nodeinfo.get('cpu_nodes', None)),
3398 "csockets": fn(int, nodeinfo.get('cpu_sockets', None)),
3401 live_data[name] = {}
3403 live_data = dict.fromkeys(nodenames, {})
3405 node_to_primary = dict([(name, set()) for name in nodenames])
3406 node_to_secondary = dict([(name, set()) for name in nodenames])
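# Hypothetical example once populated below (only when instance fields were
# requested): with instance "inst-web" primary on "node1" and secondary on
# "node2":
#   node_to_primary   == {"node1": set(["inst-web"]), "node2": set()}
#   node_to_secondary == {"node1": set(), "node2": set(["inst-web"])}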
3408 inst_fields = frozenset(("pinst_cnt", "pinst_list",
3409 "sinst_cnt", "sinst_list"))
3410 if inst_fields & frozenset(self.op.output_fields):
3411 inst_data = self.cfg.GetAllInstancesInfo()
3413 for inst in inst_data.values():
3414 if inst.primary_node in node_to_primary:
3415 node_to_primary[inst.primary_node].add(inst.name)
3416 for secnode in inst.secondary_nodes:
3417 if secnode in node_to_secondary:
3418 node_to_secondary[secnode].add(inst.name)
3420 master_node = self.cfg.GetMasterNode()
3422 # end data gathering
3425 for node in nodelist:
3427 for field in self.op.output_fields:
3428 if field in self._SIMPLE_FIELDS:
3429 val = getattr(node, field)
3430 elif field == "pinst_list":
3431 val = list(node_to_primary[node.name])
3432 elif field == "sinst_list":
3433 val = list(node_to_secondary[node.name])
3434 elif field == "pinst_cnt":
3435 val = len(node_to_primary[node.name])
3436 elif field == "sinst_cnt":
3437 val = len(node_to_secondary[node.name])
3438 elif field == "pip":
3439 val = node.primary_ip
3440 elif field == "sip":
3441 val = node.secondary_ip
3442 elif field == "tags":
3443 val = list(node.GetTags())
3444 elif field == "master":
3445 val = node.name == master_node
3446 elif self._FIELDS_DYNAMIC.Matches(field):
3447 val = live_data[node.name].get(field, None)
3448 elif field == "role":
3449 if node.name == master_node:
3451 elif node.master_candidate:
3460 raise errors.ParameterError(field)
3461 node_output.append(val)
3462 output.append(node_output)
3467 class LUQueryNodeVolumes(NoHooksLU):
3468 """Logical unit for getting volumes on node(s).
3472 ("nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
3473 ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
3476 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
3477 _FIELDS_STATIC = utils.FieldSet("node")
3479 def CheckArguments(self):
3480 _CheckOutputFields(static=self._FIELDS_STATIC,
3481 dynamic=self._FIELDS_DYNAMIC,
3482 selected=self.op.output_fields)
3484 def ExpandNames(self):
3485 self.needed_locks = {}
3486 self.share_locks[locking.LEVEL_NODE] = 1
3487 if not self.op.nodes:
3488 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3490 self.needed_locks[locking.LEVEL_NODE] = \
3491 _GetWantedNodes(self, self.op.nodes)
3493 def Exec(self, feedback_fn):
3494 """Computes the list of nodes and their attributes.
3497 nodenames = self.acquired_locks[locking.LEVEL_NODE]
3498 volumes = self.rpc.call_node_volumes(nodenames)
3500 ilist = [self.cfg.GetInstanceInfo(iname) for iname
3501 in self.cfg.GetInstanceList()]
3503 lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])
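# lv_by_node maps each instance object to its node -> logical volume layout,
# e.g. (hypothetical) {<Instance inst-db>: {"node1": ["xenvg/disk0_data",
# "xenvg/disk0_meta"]}}; it is used below to attribute each reported LV to an
# instance.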
3506 for node in nodenames:
3507 nresult = volumes[node]
3510 msg = nresult.fail_msg
3512 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
3515 node_vols = nresult.payload[:]
3516 node_vols.sort(key=lambda vol: vol['dev'])
3518 for vol in node_vols:
3520 for field in self.op.output_fields:
3523 elif field == "phys":
3527 elif field == "name":
3529 elif field == "size":
3530 val = int(float(vol['size']))
3531 elif field == "instance":
3533 if node not in lv_by_node[inst]:
3535 if vol['name'] in lv_by_node[inst][node]:
3541 raise errors.ParameterError(field)
3542 node_output.append(str(val))
3544 output.append(node_output)
3549 class LUQueryNodeStorage(NoHooksLU):
3550 """Logical unit for getting information on storage units on node(s).
3553 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
3555 ("nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
3556 ("storage_type", ht.NoDefault, _CheckStorageType),
3557 ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
3558 ("name", None, ht.TMaybeString),
3562 def CheckArguments(self):
3563 _CheckOutputFields(static=self._FIELDS_STATIC,
3564 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
3565 selected=self.op.output_fields)
3567 def ExpandNames(self):
3568 self.needed_locks = {}
3569 self.share_locks[locking.LEVEL_NODE] = 1
3572 self.needed_locks[locking.LEVEL_NODE] = \
3573 _GetWantedNodes(self, self.op.nodes)
3575 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3577 def Exec(self, feedback_fn):
3578 """Computes the list of nodes and their attributes.
3581 self.nodes = self.acquired_locks[locking.LEVEL_NODE]
3583 # Always get name to sort by
3584 if constants.SF_NAME in self.op.output_fields:
3585 fields = self.op.output_fields[:]
3587 fields = [constants.SF_NAME] + self.op.output_fields
3589 # Never ask for node or type as it's only known to the LU
3590 for extra in [constants.SF_NODE, constants.SF_TYPE]:
3591 while extra in fields:
3592 fields.remove(extra)
3594 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
3595 name_idx = field_idx[constants.SF_NAME]
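# Hypothetical example: for fields ["name", "size", "used"], field_idx is
# {"name": 0, "size": 1, "used": 2} and name_idx is 0, so the rows returned by
# each node can be keyed and sorted by storage unit name.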
3597 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
3598 data = self.rpc.call_storage_list(self.nodes,
3599 self.op.storage_type, st_args,
3600 self.op.name, fields)
3604 for node in utils.NiceSort(self.nodes):
3605 nresult = data[node]
3609 msg = nresult.fail_msg
3611 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
3614 rows = dict([(row[name_idx], row) for row in nresult.payload])
3616 for name in utils.NiceSort(rows.keys()):
3621 for field in self.op.output_fields:
3622 if field == constants.SF_NODE:
3624 elif field == constants.SF_TYPE:
3625 val = self.op.storage_type
3626 elif field in field_idx:
3627 val = row[field_idx[field]]
3629 raise errors.ParameterError(field)
3638 class LUModifyNodeStorage(NoHooksLU):
3639 """Logical unit for modifying a storage volume on a node.
3644 ("storage_type", ht.NoDefault, _CheckStorageType),
3645 ("name", ht.NoDefault, ht.TNonEmptyString),
3646 ("changes", ht.NoDefault, ht.TDict),
3650 def CheckArguments(self):
3651 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3653 storage_type = self.op.storage_type
3656 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
3658 raise errors.OpPrereqError("Storage units of type '%s' can not be"
3659 " modified" % storage_type,
3662 diff = set(self.op.changes.keys()) - modifiable
3664 raise errors.OpPrereqError("The following fields can not be modified for"
3665 " storage units of type '%s': %r" %
3666 (storage_type, list(diff)),
3669 def ExpandNames(self):
3670 self.needed_locks = {
3671 locking.LEVEL_NODE: self.op.node_name,
3674 def Exec(self, feedback_fn):
3675 """Computes the list of nodes and their attributes.
3678 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
3679 result = self.rpc.call_storage_modify(self.op.node_name,
3680 self.op.storage_type, st_args,
3681 self.op.name, self.op.changes)
3682 result.Raise("Failed to modify storage unit '%s' on %s" %
3683 (self.op.name, self.op.node_name))
3686 class LUAddNode(LogicalUnit):
3687 """Logical unit for adding node to the cluster.
3691 HTYPE = constants.HTYPE_NODE
3694 ("primary_ip", None, ht.NoType),
3695 ("secondary_ip", None, ht.TMaybeString),
3696 ("readd", False, ht.TBool),
3697 ("group", None, ht.TMaybeString),
3698 ("master_capable", None, ht.TMaybeBool),
3699 ("vm_capable", None, ht.TMaybeBool),
3701 _NFLAGS = ["master_capable", "vm_capable"]
3703 def CheckArguments(self):
3704 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
3705 # validate/normalize the node name
3706 self.hostname = netutils.GetHostname(name=self.op.node_name,
3707 family=self.primary_ip_family)
3708 self.op.node_name = self.hostname.name
3709 if self.op.readd and self.op.group:
3710 raise errors.OpPrereqError("Cannot pass a node group when a node is"
3711 " being readded", errors.ECODE_INVAL)
3713 def BuildHooksEnv(self):
3716 This will run on all nodes before, and on all nodes + the new node after.
3720 "OP_TARGET": self.op.node_name,
3721 "NODE_NAME": self.op.node_name,
3722 "NODE_PIP": self.op.primary_ip,
3723 "NODE_SIP": self.op.secondary_ip,
3724 "MASTER_CAPABLE": str(self.op.master_capable),
3725 "VM_CAPABLE": str(self.op.vm_capable),
3727 nodes_0 = self.cfg.GetNodeList()
3728 nodes_1 = nodes_0 + [self.op.node_name, ]
3729 return env, nodes_0, nodes_1
3731 def CheckPrereq(self):
3732 """Check prerequisites.
3735 - the new node is not already in the config
3737 - its parameters (single/dual homed) match the cluster
3739 Any errors are signaled by raising errors.OpPrereqError.
3743 hostname = self.hostname
3744 node = hostname.name
3745 primary_ip = self.op.primary_ip = hostname.ip
3746 if self.op.secondary_ip is None:
3747 if self.primary_ip_family == netutils.IP6Address.family:
3748 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
3749 " IPv4 address must be given as secondary",
3751 self.op.secondary_ip = primary_ip
3753 secondary_ip = self.op.secondary_ip
3754 if not netutils.IP4Address.IsValid(secondary_ip):
3755 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
3756 " address" % secondary_ip, errors.ECODE_INVAL)
3758 node_list = cfg.GetNodeList()
3759 if not self.op.readd and node in node_list:
3760 raise errors.OpPrereqError("Node %s is already in the configuration" %
3761 node, errors.ECODE_EXISTS)
3762 elif self.op.readd and node not in node_list:
3763 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
3766 self.changed_primary_ip = False
3768 for existing_node_name in node_list:
3769 existing_node = cfg.GetNodeInfo(existing_node_name)
3771 if self.op.readd and node == existing_node_name:
3772 if existing_node.secondary_ip != secondary_ip:
3773 raise errors.OpPrereqError("Readded node doesn't have the same IP"
3774 " address configuration as before",
3776 if existing_node.primary_ip != primary_ip:
3777 self.changed_primary_ip = True
3781 if (existing_node.primary_ip == primary_ip or
3782 existing_node.secondary_ip == primary_ip or
3783 existing_node.primary_ip == secondary_ip or
3784 existing_node.secondary_ip == secondary_ip):
3785 raise errors.OpPrereqError("New node ip address(es) conflict with"
3786 " existing node %s" % existing_node.name,
3787 errors.ECODE_NOTUNIQUE)
3789 # After this 'if' block, None is no longer a valid value for the
3790 # _capable op attributes
3792 old_node = self.cfg.GetNodeInfo(node)
3793 assert old_node is not None, "Can't retrieve locked node %s" % node
3794 for attr in self._NFLAGS:
3795 if getattr(self.op, attr) is None:
3796 setattr(self.op, attr, getattr(old_node, attr))
3798 for attr in self._NFLAGS:
3799 if getattr(self.op, attr) is None:
3800 setattr(self.op, attr, True)
3802 if self.op.readd and not self.op.vm_capable:
3803 pri, sec = cfg.GetNodeInstances(node)
3805 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
3806 " flag set to false, but it already holds"
3807 " instances" % node,
3810 # check that the type of the node (single versus dual homed) is the
3811 # same as for the master
3812 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
3813 master_singlehomed = myself.secondary_ip == myself.primary_ip
3814 newbie_singlehomed = secondary_ip == primary_ip
3815 if master_singlehomed != newbie_singlehomed:
3816 if master_singlehomed:
3817 raise errors.OpPrereqError("The master has no private ip but the"
3818 " new node has one",
3821 raise errors.OpPrereqError("The master has a private ip but the"
3822 " new node doesn't have one",
3825 # checks reachability
3826 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
3827 raise errors.OpPrereqError("Node not reachable by ping",
3828 errors.ECODE_ENVIRON)
3830 if not newbie_singlehomed:
3831 # check reachability from my secondary ip to newbie's secondary ip
3832 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
3833 source=myself.secondary_ip):
3834 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
3835 " based ping to noded port",
3836 errors.ECODE_ENVIRON)
3843 if self.op.master_capable:
3844 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
3846 self.master_candidate = False
3849 self.new_node = old_node
3851 node_group = cfg.LookupNodeGroup(self.op.group)
3852 self.new_node = objects.Node(name=node,
3853 primary_ip=primary_ip,
3854 secondary_ip=secondary_ip,
3855 master_candidate=self.master_candidate,
3856 offline=False, drained=False,
3859 def Exec(self, feedback_fn):
3860 """Adds the new node to the cluster.
3863 new_node = self.new_node
3864 node = new_node.name
3866 # for re-adds, reset the offline/drained/master-candidate flags;
3867 # we need to reset here, otherwise offline would prevent RPC calls
3868 # later in the procedure; this also means that if the re-add
3869 # fails, we are left with a non-offlined, broken node
3871 new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
3872 self.LogInfo("Readding a node, the offline/drained flags were reset")
3873 # if we demote the node, we do cleanup later in the procedure
3874 new_node.master_candidate = self.master_candidate
3875 if self.changed_primary_ip:
3876 new_node.primary_ip = self.op.primary_ip
3878 # copy the master/vm_capable flags
3879 for attr in self._NFLAGS:
3880 setattr(new_node, attr, getattr(self.op, attr))
3882 # notify the user about any possible mc promotion
3883 if new_node.master_candidate:
3884 self.LogInfo("Node will be a master candidate")
3886 # check connectivity
3887 result = self.rpc.call_version([node])[node]
3888 result.Raise("Can't get version information from node %s" % node)
3889 if constants.PROTOCOL_VERSION == result.payload:
3890 logging.info("Communication to node %s fine, sw version %s match",
3891 node, result.payload)
3893 raise errors.OpExecError("Version mismatch master version %s,"
3894 " node version %s" %
3895 (constants.PROTOCOL_VERSION, result.payload))
3897 # Add node to our /etc/hosts, and add key to known_hosts
3898 if self.cfg.GetClusterInfo().modify_etc_hosts:
3899 master_node = self.cfg.GetMasterNode()
3900 result = self.rpc.call_etc_hosts_modify(master_node,
3901 constants.ETC_HOSTS_ADD,
3904 result.Raise("Can't update hosts file with new host data")
3906 if new_node.secondary_ip != new_node.primary_ip:
3907 result = self.rpc.call_node_has_ip_address(new_node.name,
3908 new_node.secondary_ip)
3909 result.Raise("Failure checking secondary ip on node %s" % new_node.name,
3910 prereq=True, ecode=errors.ECODE_ENVIRON)
3911 if not result.payload:
3912 raise errors.OpExecError("Node claims it doesn't have the secondary ip"
3913 " you gave (%s). Please fix and re-run this"
3914 " command." % new_node.secondary_ip)
3916 node_verify_list = [self.cfg.GetMasterNode()]
3917 node_verify_param = {
3918 constants.NV_NODELIST: [node],
3919 # TODO: do a node-net-test as well?
3922 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
3923 self.cfg.GetClusterName())
3924 for verifier in node_verify_list:
3925 result[verifier].Raise("Cannot communicate with node %s" % verifier)
3926 nl_payload = result[verifier].payload[constants.NV_NODELIST]
3928 for failed in nl_payload:
3929 feedback_fn("ssh/hostname verification failed"
3930 " (checking from %s): %s" %
3931 (verifier, nl_payload[failed]))
3932 raise errors.OpExecError("ssh/hostname verification failed.")
3935 _RedistributeAncillaryFiles(self)
3936 self.context.ReaddNode(new_node)
3937 # make sure we redistribute the config
3938 self.cfg.Update(new_node, feedback_fn)
3939 # and make sure the new node will not have old files around
3940 if not new_node.master_candidate:
3941 result = self.rpc.call_node_demote_from_mc(new_node.name)
3942 msg = result.fail_msg
3944 self.LogWarning("Node failed to demote itself from master"
3945 " candidate status: %s" % msg)
3947 _RedistributeAncillaryFiles(self, additional_nodes=[node],
3948 additional_vm=self.op.vm_capable)
3949 self.context.AddNode(new_node, self.proc.GetECId())
3952 class LUSetNodeParams(LogicalUnit):
3953 """Modifies the parameters of a node.
3955 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
3956 to the node role (as _ROLE_*)
3957 @cvar _R2F: a dictionary from node role to tuples of flags
3958 @cvar _FLAGS: a list of attribute names corresponding to the flags
3961 HPATH = "node-modify"
3962 HTYPE = constants.HTYPE_NODE
3965 ("master_candidate", None, ht.TMaybeBool),
3966 ("offline", None, ht.TMaybeBool),
3967 ("drained", None, ht.TMaybeBool),
3968 ("auto_promote", False, ht.TBool),
3969 ("master_capable", None, ht.TMaybeBool),
3970 ("vm_capable", None, ht.TMaybeBool),
3974 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
3976 (True, False, False): _ROLE_CANDIDATE,
3977 (False, True, False): _ROLE_DRAINED,
3978 (False, False, True): _ROLE_OFFLINE,
3979 (False, False, False): _ROLE_REGULAR,
3981 _R2F = dict((v, k) for k, v in _F2R.items())
3982 _FLAGS = ["master_candidate", "drained", "offline"]
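# Illustrative sketch, not part of the original module: how the flag/role
# mapping above is meant to be read. The tuples follow the _FLAGS order
# (master_candidate, drained, offline); the concrete node states below are
# assumptions for the example only.
#
#   flags = (True, False, False)              # a master candidate node
#   role = LUSetNodeParams._F2R[flags]        # -> _ROLE_CANDIDATE
#   assert LUSetNodeParams._R2F[role] == flags
#   # an offline node maps the same way:
#   assert LUSetNodeParams._F2R[(False, False, True)] == \
#          LUSetNodeParams._ROLE_OFFLINE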
3984 def CheckArguments(self):
3985 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3986 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
3987 self.op.master_capable, self.op.vm_capable]
3988 if all_mods.count(None) == len(all_mods):
3989 raise errors.OpPrereqError("Please pass at least one modification",
3991 if all_mods.count(True) > 1:
3992 raise errors.OpPrereqError("Can't set the node into more than one"
3993 " state at the same time",
3996 # Boolean value that tells us whether we might be demoting from MC
3997 self.might_demote = (self.op.master_candidate == False or
3998 self.op.offline == True or
3999 self.op.drained == True or
4000 self.op.master_capable == False)
4002 self.lock_all = self.op.auto_promote and self.might_demote
4004 def ExpandNames(self):
4006 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
4008 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
4010 def BuildHooksEnv(self):
4013 This runs on the master node.
4017 "OP_TARGET": self.op.node_name,
4018 "MASTER_CANDIDATE": str(self.op.master_candidate),
4019 "OFFLINE": str(self.op.offline),
4020 "DRAINED": str(self.op.drained),
4021 "MASTER_CAPABLE": str(self.op.master_capable),
4022 "VM_CAPABLE": str(self.op.vm_capable),
4024 nl = [self.cfg.GetMasterNode(),
4028 def CheckPrereq(self):
4029 """Check prerequisites.
4031 This checks the requested flag changes against the node's current state.
4034 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
4036 if (self.op.master_candidate is not None or
4037 self.op.drained is not None or
4038 self.op.offline is not None):
4039 # we can't change the master's node flags
4040 if self.op.node_name == self.cfg.GetMasterNode():
4041 raise errors.OpPrereqError("The master role can be changed"
4042 " only via master-failover",
4045 if self.op.master_candidate and not node.master_capable:
4046 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
4047 " it a master candidate" % node.name,
4050 if self.op.vm_capable == False:
4051 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
4053 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
4054 " the vm_capable flag" % node.name,
4057 if node.master_candidate and self.might_demote and not self.lock_all:
4058 assert not self.op.auto_promote, "auto-promote set but lock_all not"
4059 # check if after removing the current node, we're missing master candidates
4061 (mc_remaining, mc_should, _) = \
4062 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
4063 if mc_remaining < mc_should:
4064 raise errors.OpPrereqError("Not enough master candidates, please"
4065 " pass auto_promote to allow promotion",
4068 self.old_flags = old_flags = (node.master_candidate,
4069 node.drained, node.offline)
4070 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
4071 self.old_role = old_role = self._F2R[old_flags]
4073 # Check for ineffective changes
4074 for attr in self._FLAGS:
4075 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
4076 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
4077 setattr(self.op, attr, None)
4079 # Past this point, any flag change to False means a transition
4080 # away from the respective state, as only real changes are kept
4082 # If we're being deofflined/drained, we'll MC ourself if needed
4083 if (self.op.drained == False or self.op.offline == False or
4084 (self.op.master_capable and not node.master_capable)):
4085 if _DecideSelfPromotion(self):
4086 self.op.master_candidate = True
4087 self.LogInfo("Auto-promoting node to master candidate")
4089 # If we're no longer master capable, we'll demote ourselves from MC
4090 if self.op.master_capable == False and node.master_candidate:
4091 self.LogInfo("Demoting from master candidate")
4092 self.op.master_candidate = False
4095 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
4096 if self.op.master_candidate:
4097 new_role = self._ROLE_CANDIDATE
4098 elif self.op.drained:
4099 new_role = self._ROLE_DRAINED
4100 elif self.op.offline:
4101 new_role = self._ROLE_OFFLINE
4102 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
4103 # False is still in new flags, which means we're un-setting (the current state)
4105 new_role = self._ROLE_REGULAR
4106 else: # no new flags, nothing, keep old role
4109 self.new_role = new_role
4111 def Exec(self, feedback_fn):
4116 old_role = self.old_role
4117 new_role = self.new_role
4121 for attr in ["master_capable", "vm_capable"]:
4122 val = getattr(self.op, attr)
4124 setattr(node, attr, val)
4125 result.append((attr, str(val)))
4127 if new_role != old_role:
4128 # Tell the node to demote itself, if no longer MC and not offline
4129 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
4130 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
4132 self.LogWarning("Node failed to demote itself: %s", msg)
4134 new_flags = self._R2F[new_role]
4135 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
4137 result.append((desc, str(nf)))
4138 (node.master_candidate, node.drained, node.offline) = new_flags
4140 # we locked all nodes, we adjust the CP before updating this node
4142 _AdjustCandidatePool(self, [node.name])
4144 # this will trigger configuration file update, if needed
4145 self.cfg.Update(node, feedback_fn)
4147 # this will trigger job queue propagation or cleanup if the mc flag changed
4149 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
4150 self.context.ReaddNode(node)
4155 class LUPowercycleNode(NoHooksLU):
4156 """Powercycles a node.
4165 def CheckArguments(self):
4166 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4167 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
4168 raise errors.OpPrereqError("The node is the master and the force"
4169 " parameter was not set",
4172 def ExpandNames(self):
4173 """Locking for PowercycleNode.
4175 This is a last-resort option and shouldn't block on other
4176 jobs. Therefore, we grab no locks.
4179 self.needed_locks = {}
4181 def Exec(self, feedback_fn):
4185 result = self.rpc.call_node_powercycle(self.op.node_name,
4186 self.cfg.GetHypervisorType())
4187 result.Raise("Failed to schedule the reboot")
4188 return result.payload
4191 class LUQueryClusterInfo(NoHooksLU):
4192 """Query cluster configuration.
4197 def ExpandNames(self):
4198 self.needed_locks = {}
4200 def Exec(self, feedback_fn):
4201 """Return cluster config.
4204 cluster = self.cfg.GetClusterInfo()
4207 # Filter just for enabled hypervisors
4208 for os_name, hv_dict in cluster.os_hvp.items():
4209 os_hvp[os_name] = {}
4210 for hv_name, hv_params in hv_dict.items():
4211 if hv_name in cluster.enabled_hypervisors:
4212 os_hvp[os_name][hv_name] = hv_params
4214 # Convert ip_family to ip_version
4215 primary_ip_version = constants.IP4_VERSION
4216 if cluster.primary_ip_family == netutils.IP6Address.family:
4217 primary_ip_version = constants.IP6_VERSION
4220 "software_version": constants.RELEASE_VERSION,
4221 "protocol_version": constants.PROTOCOL_VERSION,
4222 "config_version": constants.CONFIG_VERSION,
4223 "os_api_version": max(constants.OS_API_VERSIONS),
4224 "export_version": constants.EXPORT_VERSION,
4225 "architecture": (platform.architecture()[0], platform.machine()),
4226 "name": cluster.cluster_name,
4227 "master": cluster.master_node,
4228 "default_hypervisor": cluster.enabled_hypervisors[0],
4229 "enabled_hypervisors": cluster.enabled_hypervisors,
4230 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
4231 for hypervisor_name in cluster.enabled_hypervisors]),
4233 "beparams": cluster.beparams,
4234 "osparams": cluster.osparams,
4235 "nicparams": cluster.nicparams,
4236 "candidate_pool_size": cluster.candidate_pool_size,
4237 "master_netdev": cluster.master_netdev,
4238 "volume_group_name": cluster.volume_group_name,
4239 "drbd_usermode_helper": cluster.drbd_usermode_helper,
4240 "file_storage_dir": cluster.file_storage_dir,
4241 "maintain_node_health": cluster.maintain_node_health,
4242 "ctime": cluster.ctime,
4243 "mtime": cluster.mtime,
4244 "uuid": cluster.uuid,
4245 "tags": list(cluster.GetTags()),
4246 "uid_pool": cluster.uid_pool,
4247 "default_iallocator": cluster.default_iallocator,
4248 "reserved_lvs": cluster.reserved_lvs,
4249 "primary_ip_version": primary_ip_version,
4250 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
4256 class LUQueryConfigValues(NoHooksLU):
4257 """Return configuration values.
4260 _OP_PARAMS = [_POutputFields]
4262 _FIELDS_DYNAMIC = utils.FieldSet()
4263 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
4264 "watcher_pause", "volume_group_name")
4266 def CheckArguments(self):
4267 _CheckOutputFields(static=self._FIELDS_STATIC,
4268 dynamic=self._FIELDS_DYNAMIC,
4269 selected=self.op.output_fields)
4271 def ExpandNames(self):
4272 self.needed_locks = {}
4274 def Exec(self, feedback_fn):
4275 """Dump a representation of the cluster config to the standard output.
4279 for field in self.op.output_fields:
4280 if field == "cluster_name":
4281 entry = self.cfg.GetClusterName()
4282 elif field == "master_node":
4283 entry = self.cfg.GetMasterNode()
4284 elif field == "drain_flag":
4285 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
4286 elif field == "watcher_pause":
4287 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
4288 elif field == "volume_group_name":
4289 entry = self.cfg.GetVGName()
4291 raise errors.ParameterError(field)
4292 values.append(entry)
4296 class LUActivateInstanceDisks(NoHooksLU):
4297 """Bring up an instance's disks.
4302 ("ignore_size", False, ht.TBool),
4306 def ExpandNames(self):
4307 self._ExpandAndLockInstance()
4308 self.needed_locks[locking.LEVEL_NODE] = []
4309 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4311 def DeclareLocks(self, level):
4312 if level == locking.LEVEL_NODE:
4313 self._LockInstancesNodes()
4315 def CheckPrereq(self):
4316 """Check prerequisites.
4318 This checks that the instance is in the cluster.
4321 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4322 assert self.instance is not None, \
4323 "Cannot retrieve locked instance %s" % self.op.instance_name
4324 _CheckNodeOnline(self, self.instance.primary_node)
4326 def Exec(self, feedback_fn):
4327 """Activate the disks.
4330 disks_ok, disks_info = \
4331 _AssembleInstanceDisks(self, self.instance,
4332 ignore_size=self.op.ignore_size)
4334 raise errors.OpExecError("Cannot activate block devices")
4339 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
4341 """Prepare the block devices for an instance.
4343 This sets up the block devices on all nodes.
4345 @type lu: L{LogicalUnit}
4346 @param lu: the logical unit on whose behalf we execute
4347 @type instance: L{objects.Instance}
4348 @param instance: the instance for whose disks we assemble
4349 @type disks: list of L{objects.Disk} or None
4350 @param disks: which disks to assemble (or all, if None)
4351 @type ignore_secondaries: boolean
4352 @param ignore_secondaries: if true, errors on secondary nodes
4353 won't result in an error return from the function
4354 @type ignore_size: boolean
4355 @param ignore_size: if true, the current known size of the disk
4356 will not be used during the disk activation, useful for cases
4357 when the size is wrong
4358 @return: a tuple (disks_ok, device_info), where disks_ok is False if the
4359 operation failed, and device_info is a list of tuples
4360 (host, instance_visible_name, node_visible_name) mapping node devices to instance devices
4365 iname = instance.name
4366 disks = _ExpandCheckDisks(instance, disks)
4368 # With the two-pass mechanism we try to reduce the window of
4369 # opportunity for the race condition of switching DRBD to primary
4370 # before handshaking occurred, but we do not eliminate it
4372 # The proper fix would be to wait (with some limits) until the
4373 # connection has been made and drbd transitions from WFConnection
4374 # into any other network-connected state (Connected, SyncTarget, SyncSource, etc.)
4377 # 1st pass, assemble on all nodes in secondary mode
4378 for inst_disk in disks:
4379 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4381 node_disk = node_disk.Copy()
4382 node_disk.UnsetSize()
4383 lu.cfg.SetDiskID(node_disk, node)
4384 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
4385 msg = result.fail_msg
4387 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4388 " (is_primary=False, pass=1): %s",
4389 inst_disk.iv_name, node, msg)
4390 if not ignore_secondaries:
4393 # FIXME: race condition on drbd migration to primary
4395 # 2nd pass, do only the primary node
4396 for inst_disk in disks:
4399 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4400 if node != instance.primary_node:
4403 node_disk = node_disk.Copy()
4404 node_disk.UnsetSize()
4405 lu.cfg.SetDiskID(node_disk, node)
4406 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
4407 msg = result.fail_msg
4409 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4410 " (is_primary=True, pass=2): %s",
4411 inst_disk.iv_name, node, msg)
4414 dev_path = result.payload
4416 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
4418 # leave the disks configured for the primary node
4419 # this is a workaround that would be fixed better by
4420 # improving the logical/physical id handling
4422 lu.cfg.SetDiskID(disk, instance.primary_node)
4424 return disks_ok, device_info
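# Illustrative usage sketch, not part of the original code: how a logical unit
# typically consumes _AssembleInstanceDisks; `lu` and `instance` are assumed
# to be a LogicalUnit and an objects.Instance already looked up elsewhere.
#
#   disks_ok, device_info = _AssembleInstanceDisks(lu, instance,
#                                                  ignore_size=True)
#   if not disks_ok:
#     _ShutdownInstanceDisks(lu, instance)
#     raise errors.OpExecError("Cannot activate block devices")
#   for node, iv_name, dev_path in device_info:
#     lu.LogInfo("Disk %s visible on primary node %s as %s",
#                iv_name, node, dev_path)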
4427 def _StartInstanceDisks(lu, instance, force):
4428 """Start the disks of an instance.
4431 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
4432 ignore_secondaries=force)
4434 _ShutdownInstanceDisks(lu, instance)
4435 if force is not None and not force:
4436 lu.proc.LogWarning("", hint="If the message above refers to a"
4437 " secondary node,"
4438 " you can retry the operation using '--force'.")
4439 raise errors.OpExecError("Disk consistency error")
4442 class LUDeactivateInstanceDisks(NoHooksLU):
4443 """Shutdown an instance's disks.
4451 def ExpandNames(self):
4452 self._ExpandAndLockInstance()
4453 self.needed_locks[locking.LEVEL_NODE] = []
4454 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4456 def DeclareLocks(self, level):
4457 if level == locking.LEVEL_NODE:
4458 self._LockInstancesNodes()
4460 def CheckPrereq(self):
4461 """Check prerequisites.
4463 This checks that the instance is in the cluster.
4466 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4467 assert self.instance is not None, \
4468 "Cannot retrieve locked instance %s" % self.op.instance_name
4470 def Exec(self, feedback_fn):
4471 """Deactivate the disks
4474 instance = self.instance
4475 _SafeShutdownInstanceDisks(self, instance)
4478 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
4479 """Shutdown block devices of an instance.
4481 This function checks that the instance is not running before calling
4482 _ShutdownInstanceDisks.
4485 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
4486 _ShutdownInstanceDisks(lu, instance, disks=disks)
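# Illustrative usage sketch, not part of the original code: the safe variant
# refuses to touch the disks of a running instance, while the raw variant can
# be told to ignore errors on the primary node (as the failover path does);
# `lu` and `instance` are assumptions for the example.
#
#   _SafeShutdownInstanceDisks(lu, instance)
#   # or, when the instance is already known to be down on the primary:
#   _ShutdownInstanceDisks(lu, instance, ignore_primary=True)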
4489 def _ExpandCheckDisks(instance, disks):
4490 """Return the instance disks selected by the disks list
4492 @type disks: list of L{objects.Disk} or None
4493 @param disks: selected disks
4494 @rtype: list of L{objects.Disk}
4495 @return: selected instance disks to act on
4499 return instance.disks
4501 if not set(disks).issubset(instance.disks):
4502 raise errors.ProgrammerError("Can only act on disks belonging to the"
4507 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
4508 """Shutdown block devices of an instance.
4510 This does the shutdown on all nodes of the instance.
4512 If ignore_primary is true, errors on the primary node are ignored; otherwise they cause the shutdown to be reported as failed.
4517 disks = _ExpandCheckDisks(instance, disks)
4520 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
4521 lu.cfg.SetDiskID(top_disk, node)
4522 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
4523 msg = result.fail_msg
4525 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
4526 disk.iv_name, node, msg)
4527 if not ignore_primary or node != instance.primary_node:
4532 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
4533 """Checks if a node has enough free memory.
4535 This function checks whether a given node has the needed amount of free
4536 memory. If the node has less memory, or we cannot get the
4537 information from the node, this function raises an OpPrereqError exception.
4540 @type lu: C{LogicalUnit}
4541 @param lu: a logical unit from which we get configuration data
4543 @param node: the node to check
4544 @type reason: C{str}
4545 @param reason: string to use in the error message
4546 @type requested: C{int}
4547 @param requested: the amount of memory in MiB to check for
4548 @type hypervisor_name: C{str}
4549 @param hypervisor_name: the hypervisor to ask for memory stats
4550 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
4551 we cannot check the node
4554 nodeinfo = lu.rpc.call_node_info([node], lu.cfg.GetVGName(), hypervisor_name)
4555 nodeinfo[node].Raise("Can't get data from node %s" % node,
4556 prereq=True, ecode=errors.ECODE_ENVIRON)
4557 free_mem = nodeinfo[node].payload.get('memory_free', None)
4558 if not isinstance(free_mem, int):
4559 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
4560 " was '%s'" % (node, free_mem),
4561 errors.ECODE_ENVIRON)
4562 if requested > free_mem:
4563 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
4564 " needed %s MiB, available %s MiB" %
4565 (node, reason, requested, free_mem),
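# Illustrative usage sketch, not part of the original code: checking that an
# instance's configured memory fits on its primary node before starting it;
# `lu` and `instance` are assumptions for the example.
#
#   bep = lu.cfg.GetClusterInfo().FillBE(instance)
#   _CheckNodeFreeMemory(lu, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        bep[constants.BE_MEMORY], instance.hypervisor)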
4569 def _CheckNodesFreeDisk(lu, nodenames, requested):
4570 """Checks if nodes have enough free disk space in the default VG.
4572 This function checks whether all given nodes have the needed amount of
4573 free disk. If any node has less disk, or we cannot get the
4574 information from the node, this function raises an OpPrereqError exception.
4577 @type lu: C{LogicalUnit}
4578 @param lu: a logical unit from which we get configuration data
4579 @type nodenames: C{list}
4580 @param nodenames: the list of node names to check
4581 @type requested: C{int}
4582 @param requested: the amount of disk in MiB to check for
4583 @raise errors.OpPrereqError: if the node doesn't have enough disk, or
4584 we cannot check the node
4587 nodeinfo = lu.rpc.call_node_info(nodenames, lu.cfg.GetVGName(),
4588 lu.cfg.GetHypervisorType())
4589 for node in nodenames:
4590 info = nodeinfo[node]
4591 info.Raise("Cannot get current information from node %s" % node,
4592 prereq=True, ecode=errors.ECODE_ENVIRON)
4593 vg_free = info.payload.get("vg_free", None)
4594 if not isinstance(vg_free, int):
4595 raise errors.OpPrereqError("Can't compute free disk space on node %s,"
4596 " result was '%s'" % (node, vg_free),
4597 errors.ECODE_ENVIRON)
4598 if requested > vg_free:
4599 raise errors.OpPrereqError("Not enough disk space on target node %s:"
4600 " required %d MiB, available %d MiB" %
4601 (node, requested, vg_free),
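# Illustrative usage sketch, not part of the original code: verifying that all
# nodes which will receive new disks have enough free space in the default
# volume group; `lu`, `nodenames` and `disk_sizes` are assumptions.
#
#   required_mb = sum(d["size"] for d in disk_sizes)
#   _CheckNodesFreeDisk(lu, nodenames, required_mb)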
4605 class LUStartupInstance(LogicalUnit):
4606 """Starts an instance.
4609 HPATH = "instance-start"
4610 HTYPE = constants.HTYPE_INSTANCE
4614 _PIgnoreOfflineNodes,
4615 ("hvparams", ht.EmptyDict, ht.TDict),
4616 ("beparams", ht.EmptyDict, ht.TDict),
4620 def CheckArguments(self):
4622 if self.op.beparams:
4623 # fill the beparams dict
4624 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
4626 def ExpandNames(self):
4627 self._ExpandAndLockInstance()
4629 def BuildHooksEnv(self):
4632 This runs on master, primary and secondary nodes of the instance.
4636 "FORCE": self.op.force,
4638 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
4639 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4642 def CheckPrereq(self):
4643 """Check prerequisites.
4645 This checks that the instance is in the cluster.
4648 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4649 assert self.instance is not None, \
4650 "Cannot retrieve locked instance %s" % self.op.instance_name
4653 if self.op.hvparams:
4654 # check hypervisor parameter syntax (locally)
4655 cluster = self.cfg.GetClusterInfo()
4656 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
4657 filled_hvp = cluster.FillHV(instance)
4658 filled_hvp.update(self.op.hvparams)
4659 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
4660 hv_type.CheckParameterSyntax(filled_hvp)
4661 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
4663 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
4665 if self.primary_offline and self.op.ignore_offline_nodes:
4666 self.proc.LogWarning("Ignoring offline primary node")
4668 if self.op.hvparams or self.op.beparams:
4669 self.proc.LogWarning("Overridden parameters are ignored")
4671 _CheckNodeOnline(self, instance.primary_node)
4673 bep = self.cfg.GetClusterInfo().FillBE(instance)
4675 # check bridges existence
4676 _CheckInstanceBridgesExist(self, instance)
4678 remote_info = self.rpc.call_instance_info(instance.primary_node,
4680 instance.hypervisor)
4681 remote_info.Raise("Error checking node %s" % instance.primary_node,
4682 prereq=True, ecode=errors.ECODE_ENVIRON)
4683 if not remote_info.payload: # not running already
4684 _CheckNodeFreeMemory(self, instance.primary_node,
4685 "starting instance %s" % instance.name,
4686 bep[constants.BE_MEMORY], instance.hypervisor)
4688 def Exec(self, feedback_fn):
4689 """Start the instance.
4692 instance = self.instance
4693 force = self.op.force
4695 self.cfg.MarkInstanceUp(instance.name)
4697 if self.primary_offline:
4698 assert self.op.ignore_offline_nodes
4699 self.proc.LogInfo("Primary node offline, marked instance as started")
4701 node_current = instance.primary_node
4703 _StartInstanceDisks(self, instance, force)
4705 result = self.rpc.call_instance_start(node_current, instance,
4706 self.op.hvparams, self.op.beparams)
4707 msg = result.fail_msg
4709 _ShutdownInstanceDisks(self, instance)
4710 raise errors.OpExecError("Could not start instance: %s" % msg)
4713 class LURebootInstance(LogicalUnit):
4714 """Reboot an instance.
4717 HPATH = "instance-reboot"
4718 HTYPE = constants.HTYPE_INSTANCE
4721 ("ignore_secondaries", False, ht.TBool),
4722 ("reboot_type", ht.NoDefault, ht.TElemOf(constants.REBOOT_TYPES)),
4727 def ExpandNames(self):
4728 self._ExpandAndLockInstance()
4730 def BuildHooksEnv(self):
4733 This runs on master, primary and secondary nodes of the instance.
4737 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
4738 "REBOOT_TYPE": self.op.reboot_type,
4739 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
4741 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
4742 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4745 def CheckPrereq(self):
4746 """Check prerequisites.
4748 This checks that the instance is in the cluster.
4751 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4752 assert self.instance is not None, \
4753 "Cannot retrieve locked instance %s" % self.op.instance_name
4755 _CheckNodeOnline(self, instance.primary_node)
4757 # check bridges existence
4758 _CheckInstanceBridgesExist(self, instance)
4760 def Exec(self, feedback_fn):
4761 """Reboot the instance.
4764 instance = self.instance
4765 ignore_secondaries = self.op.ignore_secondaries
4766 reboot_type = self.op.reboot_type
4768 node_current = instance.primary_node
4770 if reboot_type in [constants.INSTANCE_REBOOT_SOFT,
4771 constants.INSTANCE_REBOOT_HARD]:
4772 for disk in instance.disks:
4773 self.cfg.SetDiskID(disk, node_current)
4774 result = self.rpc.call_instance_reboot(node_current, instance,
4776 self.op.shutdown_timeout)
4777 result.Raise("Could not reboot instance")
4779 result = self.rpc.call_instance_shutdown(node_current, instance,
4780 self.op.shutdown_timeout)
4781 result.Raise("Could not shutdown instance for full reboot")
4782 _ShutdownInstanceDisks(self, instance)
4783 _StartInstanceDisks(self, instance, ignore_secondaries)
4784 result = self.rpc.call_instance_start(node_current, instance, None, None)
4785 msg = result.fail_msg
4787 _ShutdownInstanceDisks(self, instance)
4788 raise errors.OpExecError("Could not start instance for"
4789 " full reboot: %s" % msg)
4791 self.cfg.MarkInstanceUp(instance.name)
4794 class LUShutdownInstance(LogicalUnit):
4795 """Shutdown an instance.
4798 HPATH = "instance-stop"
4799 HTYPE = constants.HTYPE_INSTANCE
4802 _PIgnoreOfflineNodes,
4803 ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, ht.TPositiveInt),
4807 def ExpandNames(self):
4808 self._ExpandAndLockInstance()
4810 def BuildHooksEnv(self):
4813 This runs on master, primary and secondary nodes of the instance.
4816 env = _BuildInstanceHookEnvByObject(self, self.instance)
4817 env["TIMEOUT"] = self.op.timeout
4818 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4821 def CheckPrereq(self):
4822 """Check prerequisites.
4824 This checks that the instance is in the cluster.
4827 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4828 assert self.instance is not None, \
4829 "Cannot retrieve locked instance %s" % self.op.instance_name
4831 self.primary_offline = \
4832 self.cfg.GetNodeInfo(self.instance.primary_node).offline
4834 if self.primary_offline and self.op.ignore_offline_nodes:
4835 self.proc.LogWarning("Ignoring offline primary node")
4837 _CheckNodeOnline(self, self.instance.primary_node)
4839 def Exec(self, feedback_fn):
4840 """Shutdown the instance.
4843 instance = self.instance
4844 node_current = instance.primary_node
4845 timeout = self.op.timeout
4847 self.cfg.MarkInstanceDown(instance.name)
4849 if self.primary_offline:
4850 assert self.op.ignore_offline_nodes
4851 self.proc.LogInfo("Primary node offline, marked instance as stopped")
4853 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
4854 msg = result.fail_msg
4856 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
4858 _ShutdownInstanceDisks(self, instance)
4861 class LUReinstallInstance(LogicalUnit):
4862 """Reinstall an instance.
4865 HPATH = "instance-reinstall"
4866 HTYPE = constants.HTYPE_INSTANCE
4869 ("os_type", None, ht.TMaybeString),
4870 ("force_variant", False, ht.TBool),
4871 ("osparams", None, ht.TOr(ht.TDict, ht.TNone)),
4875 def ExpandNames(self):
4876 self._ExpandAndLockInstance()
4878 def BuildHooksEnv(self):
4881 This runs on master, primary and secondary nodes of the instance.
4884 env = _BuildInstanceHookEnvByObject(self, self.instance)
4885 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4888 def CheckPrereq(self):
4889 """Check prerequisites.
4891 This checks that the instance is in the cluster and is not running.
4894 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4895 assert instance is not None, \
4896 "Cannot retrieve locked instance %s" % self.op.instance_name
4897 _CheckNodeOnline(self, instance.primary_node)
4899 if instance.disk_template == constants.DT_DISKLESS:
4900 raise errors.OpPrereqError("Instance '%s' has no disks" %
4901 self.op.instance_name,
4903 _CheckInstanceDown(self, instance, "cannot reinstall")
4905 if self.op.os_type is not None:
4907 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
4908 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
4909 instance_os = self.op.os_type
4911 instance_os = instance.os
4913 nodelist = list(instance.all_nodes)
4915 if self.op.osparams:
4916 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
4917 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
4918 self.os_inst = i_osdict # the new dict (without defaults)
4922 self.instance = instance
4924 def Exec(self, feedback_fn):
4925 """Reinstall the instance.
4928 inst = self.instance
4930 if self.op.os_type is not None:
4931 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
4932 inst.os = self.op.os_type
4933 # Write to configuration
4934 self.cfg.Update(inst, feedback_fn)
4936 _StartInstanceDisks(self, inst, None)
4938 feedback_fn("Running the instance OS create scripts...")
4939 # FIXME: pass debug option from opcode to backend
4940 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
4941 self.op.debug_level,
4942 osparams=self.os_inst)
4943 result.Raise("Could not install OS for instance %s on node %s" %
4944 (inst.name, inst.primary_node))
4946 _ShutdownInstanceDisks(self, inst)
4949 class LURecreateInstanceDisks(LogicalUnit):
4950 """Recreate an instance's missing disks.
4953 HPATH = "instance-recreate-disks"
4954 HTYPE = constants.HTYPE_INSTANCE
4957 ("disks", ht.EmptyList, ht.TListOf(ht.TPositiveInt)),
4961 def ExpandNames(self):
4962 self._ExpandAndLockInstance()
4964 def BuildHooksEnv(self):
4967 This runs on master, primary and secondary nodes of the instance.
4970 env = _BuildInstanceHookEnvByObject(self, self.instance)
4971 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4974 def CheckPrereq(self):
4975 """Check prerequisites.
4977 This checks that the instance is in the cluster and is not running.
4980 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4981 assert instance is not None, \
4982 "Cannot retrieve locked instance %s" % self.op.instance_name
4983 _CheckNodeOnline(self, instance.primary_node)
4985 if instance.disk_template == constants.DT_DISKLESS:
4986 raise errors.OpPrereqError("Instance '%s' has no disks" %
4987 self.op.instance_name, errors.ECODE_INVAL)
4988 _CheckInstanceDown(self, instance, "cannot recreate disks")
4990 if not self.op.disks:
4991 self.op.disks = range(len(instance.disks))
4993 for idx in self.op.disks:
4994 if idx >= len(instance.disks):
4995 raise errors.OpPrereqError("Invalid disk index passed '%s'" % idx,
4998 self.instance = instance
5000 def Exec(self, feedback_fn):
5001 """Recreate the disks.
5005 for idx, _ in enumerate(self.instance.disks):
5006 if idx not in self.op.disks: # disk idx has not been passed in
5010 _CreateDisks(self, self.instance, to_skip=to_skip)
5013 class LURenameInstance(LogicalUnit):
5014 """Rename an instance.
5017 HPATH = "instance-rename"
5018 HTYPE = constants.HTYPE_INSTANCE
5021 ("new_name", ht.NoDefault, ht.TNonEmptyString),
5022 ("ip_check", False, ht.TBool),
5023 ("name_check", True, ht.TBool),
5026 def CheckArguments(self):
5030 if self.op.ip_check and not self.op.name_check:
5031 # TODO: make the ip check more flexible and not depend on the name check
5032 raise errors.OpPrereqError("Cannot do ip check without a name check",
5035 def BuildHooksEnv(self):
5038 This runs on master, primary and secondary nodes of the instance.
5041 env = _BuildInstanceHookEnvByObject(self, self.instance)
5042 env["INSTANCE_NEW_NAME"] = self.op.new_name
5043 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5046 def CheckPrereq(self):
5047 """Check prerequisites.
5049 This checks that the instance is in the cluster and is not running.
5052 self.op.instance_name = _ExpandInstanceName(self.cfg,
5053 self.op.instance_name)
5054 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5055 assert instance is not None
5056 _CheckNodeOnline(self, instance.primary_node)
5057 _CheckInstanceDown(self, instance, "cannot rename")
5058 self.instance = instance
5060 new_name = self.op.new_name
5061 if self.op.name_check:
5062 hostname = netutils.GetHostname(name=new_name)
5063 new_name = self.op.new_name = hostname.name
5064 if (self.op.ip_check and
5065 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
5066 raise errors.OpPrereqError("IP %s of instance %s already in use" %
5067 (hostname.ip, new_name),
5068 errors.ECODE_NOTUNIQUE)
5070 instance_list = self.cfg.GetInstanceList()
5071 if new_name in instance_list:
5072 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
5073 new_name, errors.ECODE_EXISTS)
5075 def Exec(self, feedback_fn):
5076 """Reinstall the instance.
5079 inst = self.instance
5080 old_name = inst.name
5082 if inst.disk_template == constants.DT_FILE:
5083 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5085 self.cfg.RenameInstance(inst.name, self.op.new_name)
5086 # Change the instance lock. This is definitely safe while we hold the BGL
5087 self.context.glm.remove(locking.LEVEL_INSTANCE, old_name)
5088 self.context.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
5090 # re-read the instance from the configuration after rename
5091 inst = self.cfg.GetInstanceInfo(self.op.new_name)
5093 if inst.disk_template == constants.DT_FILE:
5094 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5095 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
5096 old_file_storage_dir,
5097 new_file_storage_dir)
5098 result.Raise("Could not rename on node %s directory '%s' to '%s'"
5099 " (but the instance has been renamed in Ganeti)" %
5100 (inst.primary_node, old_file_storage_dir,
5101 new_file_storage_dir))
5103 _StartInstanceDisks(self, inst, None)
5105 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
5106 old_name, self.op.debug_level)
5107 msg = result.fail_msg
5109 msg = ("Could not run OS rename script for instance %s on node %s"
5110 " (but the instance has been renamed in Ganeti): %s" %
5111 (inst.name, inst.primary_node, msg))
5112 self.proc.LogWarning(msg)
5114 _ShutdownInstanceDisks(self, inst)
5119 class LURemoveInstance(LogicalUnit):
5120 """Remove an instance.
5123 HPATH = "instance-remove"
5124 HTYPE = constants.HTYPE_INSTANCE
5127 ("ignore_failures", False, ht.TBool),
5132 def ExpandNames(self):
5133 self._ExpandAndLockInstance()
5134 self.needed_locks[locking.LEVEL_NODE] = []
5135 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5137 def DeclareLocks(self, level):
5138 if level == locking.LEVEL_NODE:
5139 self._LockInstancesNodes()
5141 def BuildHooksEnv(self):
5144 This runs on master, primary and secondary nodes of the instance.
5147 env = _BuildInstanceHookEnvByObject(self, self.instance)
5148 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
5149 nl = [self.cfg.GetMasterNode()]
5150 nl_post = list(self.instance.all_nodes) + nl
5151 return env, nl, nl_post
5153 def CheckPrereq(self):
5154 """Check prerequisites.
5156 This checks that the instance is in the cluster.
5159 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5160 assert self.instance is not None, \
5161 "Cannot retrieve locked instance %s" % self.op.instance_name
5163 def Exec(self, feedback_fn):
5164 """Remove the instance.
5167 instance = self.instance
5168 logging.info("Shutting down instance %s on node %s",
5169 instance.name, instance.primary_node)
5171 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
5172 self.op.shutdown_timeout)
5173 msg = result.fail_msg
5175 if self.op.ignore_failures:
5176 feedback_fn("Warning: can't shutdown instance: %s" % msg)
5178 raise errors.OpExecError("Could not shutdown instance %s on"
5180 (instance.name, instance.primary_node, msg))
5182 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
5185 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
5186 """Utility function to remove an instance.
5189 logging.info("Removing block devices for instance %s", instance.name)
5191 if not _RemoveDisks(lu, instance):
5192 if not ignore_failures:
5193 raise errors.OpExecError("Can't remove instance's disks")
5194 feedback_fn("Warning: can't remove instance's disks")
5196 logging.info("Removing instance %s out of cluster config", instance.name)
5198 lu.cfg.RemoveInstance(instance.name)
5200 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
5201 "Instance lock removal conflict"
5203 # Remove lock for the instance
5204 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
5207 class LUQueryInstances(NoHooksLU):
5208 """Logical unit for querying instances.
5211 # pylint: disable-msg=W0142
5213 ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
5214 ("names", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
5215 ("use_locking", False, ht.TBool),
5218 _SIMPLE_FIELDS = ["name", "os", "network_port", "hypervisor",
5219 "serial_no", "ctime", "mtime", "uuid"]
5220 _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
5222 "disk_template", "ip", "mac", "bridge",
5223 "nic_mode", "nic_link",
5224 "sda_size", "sdb_size", "vcpus", "tags",
5225 "network_port", "beparams",
5226 r"(disk)\.(size)/([0-9]+)",
5227 r"(disk)\.(sizes)", "disk_usage",
5228 r"(nic)\.(mac|ip|mode|link)/([0-9]+)",
5229 r"(nic)\.(bridge)/([0-9]+)",
5230 r"(nic)\.(macs|ips|modes|links|bridges)",
5231 r"(disk|nic)\.(count)",
5232 "hvparams", "custom_hvparams",
5233 "custom_beparams", "custom_nicparams",
5234 ] + _SIMPLE_FIELDS +
5236 for name in constants.HVS_PARAMETERS
5237 if name not in constants.HVC_GLOBALS] +
5239 for name in constants.BES_PARAMETERS])
5240 _FIELDS_DYNAMIC = utils.FieldSet("oper_state",
5246 def CheckArguments(self):
5247 _CheckOutputFields(static=self._FIELDS_STATIC,
5248 dynamic=self._FIELDS_DYNAMIC,
5249 selected=self.op.output_fields)
5251 def ExpandNames(self):
5252 self.needed_locks = {}
5253 self.share_locks[locking.LEVEL_INSTANCE] = 1
5254 self.share_locks[locking.LEVEL_NODE] = 1
5257 self.wanted = _GetWantedInstances(self, self.op.names)
5259 self.wanted = locking.ALL_SET
5261 self.do_node_query = self._FIELDS_STATIC.NonMatching(self.op.output_fields)
5262 self.do_locking = self.do_node_query and self.op.use_locking
5264 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
5265 self.needed_locks[locking.LEVEL_NODE] = []
5266 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5268 def DeclareLocks(self, level):
5269 if level == locking.LEVEL_NODE and self.do_locking:
5270 self._LockInstancesNodes()
5272 def Exec(self, feedback_fn):
5273 """Computes the list of nodes and their attributes.
5276 # pylint: disable-msg=R0912
5277 # way too many branches here
5278 all_info = self.cfg.GetAllInstancesInfo()
5279 if self.wanted == locking.ALL_SET:
5280 # caller didn't specify instance names, so ordering is not important
5282 instance_names = self.acquired_locks[locking.LEVEL_INSTANCE]
5284 instance_names = all_info.keys()
5285 instance_names = utils.NiceSort(instance_names)
5287 # caller did specify names, so we must keep the ordering
5289 tgt_set = self.acquired_locks[locking.LEVEL_INSTANCE]
5291 tgt_set = all_info.keys()
5292 missing = set(self.wanted).difference(tgt_set)
5294 raise errors.OpExecError("Some instances were removed before"
5295 " retrieving their data: %s" % missing)
5296 instance_names = self.wanted
5298 instance_list = [all_info[iname] for iname in instance_names]
5300 # begin data gathering
5302 nodes = frozenset([inst.primary_node for inst in instance_list])
5303 hv_list = list(set([inst.hypervisor for inst in instance_list]))
5307 if self.do_node_query:
5309 node_data = self.rpc.call_all_instances_info(nodes, hv_list)
5311 result = node_data[name]
5313 # offline nodes will be in both lists
5314 off_nodes.append(name)
5316 bad_nodes.append(name)
5319 live_data.update(result.payload)
5320 # else no instance is alive
5322 live_data = dict([(name, {}) for name in instance_names])
5324 # end data gathering
5329 cluster = self.cfg.GetClusterInfo()
5330 for instance in instance_list:
5332 i_hv = cluster.FillHV(instance, skip_globals=True)
5333 i_be = cluster.FillBE(instance)
5334 i_nicp = [cluster.SimpleFillNIC(nic.nicparams) for nic in instance.nics]
5335 for field in self.op.output_fields:
5336 st_match = self._FIELDS_STATIC.Matches(field)
5337 if field in self._SIMPLE_FIELDS:
5338 val = getattr(instance, field)
5339 elif field == "pnode":
5340 val = instance.primary_node
5341 elif field == "snodes":
5342 val = list(instance.secondary_nodes)
5343 elif field == "admin_state":
5344 val = instance.admin_up
5345 elif field == "oper_state":
5346 if instance.primary_node in bad_nodes:
5349 val = bool(live_data.get(instance.name))
5350 elif field == "status":
5351 if instance.primary_node in off_nodes:
5352 val = "ERROR_nodeoffline"
5353 elif instance.primary_node in bad_nodes:
5354 val = "ERROR_nodedown"
5356 running = bool(live_data.get(instance.name))
5358 if instance.admin_up:
5363 if instance.admin_up:
5367 elif field == "oper_ram":
5368 if instance.primary_node in bad_nodes:
5370 elif instance.name in live_data:
5371 val = live_data[instance.name].get("memory", "?")
5374 elif field == "oper_vcpus":
5375 if instance.primary_node in bad_nodes:
5377 elif instance.name in live_data:
5378 val = live_data[instance.name].get("vcpus", "?")
5381 elif field == "vcpus":
5382 val = i_be[constants.BE_VCPUS]
5383 elif field == "disk_template":
5384 val = instance.disk_template
5387 val = instance.nics[0].ip
5390 elif field == "nic_mode":
5392 val = i_nicp[0][constants.NIC_MODE]
5395 elif field == "nic_link":
5397 val = i_nicp[0][constants.NIC_LINK]
5400 elif field == "bridge":
5401 if (instance.nics and
5402 i_nicp[0][constants.NIC_MODE] == constants.NIC_MODE_BRIDGED):
5403 val = i_nicp[0][constants.NIC_LINK]
5406 elif field == "mac":
5408 val = instance.nics[0].mac
5411 elif field == "custom_nicparams":
5412 val = [nic.nicparams for nic in instance.nics]
5413 elif field == "sda_size" or field == "sdb_size":
5414 idx = ord(field[2]) - ord('a')
5416 val = instance.FindDisk(idx).size
5417 except errors.OpPrereqError:
5419 elif field == "disk_usage": # total disk usage per node
5420 disk_sizes = [{'size': disk.size} for disk in instance.disks]
5421 val = _ComputeDiskSize(instance.disk_template, disk_sizes)
5422 elif field == "tags":
5423 val = list(instance.GetTags())
5424 elif field == "custom_hvparams":
5425 val = instance.hvparams # not filled!
5426 elif field == "hvparams":
5428 elif (field.startswith(HVPREFIX) and
5429 field[len(HVPREFIX):] in constants.HVS_PARAMETERS and
5430 field[len(HVPREFIX):] not in constants.HVC_GLOBALS):
5431 val = i_hv.get(field[len(HVPREFIX):], None)
5432 elif field == "custom_beparams":
5433 val = instance.beparams
5434 elif field == "beparams":
5436 elif (field.startswith(BEPREFIX) and
5437 field[len(BEPREFIX):] in constants.BES_PARAMETERS):
5438 val = i_be.get(field[len(BEPREFIX):], None)
5439 elif st_match and st_match.groups():
5440 # matches a variable list
5441 st_groups = st_match.groups()
5442 if st_groups and st_groups[0] == "disk":
5443 if st_groups[1] == "count":
5444 val = len(instance.disks)
5445 elif st_groups[1] == "sizes":
5446 val = [disk.size for disk in instance.disks]
5447 elif st_groups[1] == "size":
5449 val = instance.FindDisk(st_groups[2]).size
5450 except errors.OpPrereqError:
5453 assert False, "Unhandled disk parameter"
5454 elif st_groups[0] == "nic":
5455 if st_groups[1] == "count":
5456 val = len(instance.nics)
5457 elif st_groups[1] == "macs":
5458 val = [nic.mac for nic in instance.nics]
5459 elif st_groups[1] == "ips":
5460 val = [nic.ip for nic in instance.nics]
5461 elif st_groups[1] == "modes":
5462 val = [nicp[constants.NIC_MODE] for nicp in i_nicp]
5463 elif st_groups[1] == "links":
5464 val = [nicp[constants.NIC_LINK] for nicp in i_nicp]
5465 elif st_groups[1] == "bridges":
5468 if nicp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
5469 val.append(nicp[constants.NIC_LINK])
5474 nic_idx = int(st_groups[2])
5475 if nic_idx >= len(instance.nics):
5478 if st_groups[1] == "mac":
5479 val = instance.nics[nic_idx].mac
5480 elif st_groups[1] == "ip":
5481 val = instance.nics[nic_idx].ip
5482 elif st_groups[1] == "mode":
5483 val = i_nicp[nic_idx][constants.NIC_MODE]
5484 elif st_groups[1] == "link":
5485 val = i_nicp[nic_idx][constants.NIC_LINK]
5486 elif st_groups[1] == "bridge":
5487 nic_mode = i_nicp[nic_idx][constants.NIC_MODE]
5488 if nic_mode == constants.NIC_MODE_BRIDGED:
5489 val = i_nicp[nic_idx][constants.NIC_LINK]
5493 assert False, "Unhandled NIC parameter"
5495 assert False, ("Declared but unhandled variable parameter '%s'" %
5498 assert False, "Declared but unhandled parameter '%s'" % field
5505 class LUFailoverInstance(LogicalUnit):
5506 """Failover an instance.
5509 HPATH = "instance-failover"
5510 HTYPE = constants.HTYPE_INSTANCE
5513 ("ignore_consistency", False, ht.TBool),
5518 def ExpandNames(self):
5519 self._ExpandAndLockInstance()
5520 self.needed_locks[locking.LEVEL_NODE] = []
5521 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5523 def DeclareLocks(self, level):
5524 if level == locking.LEVEL_NODE:
5525 self._LockInstancesNodes()
5527 def BuildHooksEnv(self):
5530 This runs on master, primary and secondary nodes of the instance.
5533 instance = self.instance
5534 source_node = instance.primary_node
5535 target_node = instance.secondary_nodes[0]
5537 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
5538 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5539 "OLD_PRIMARY": source_node,
5540 "OLD_SECONDARY": target_node,
5541 "NEW_PRIMARY": target_node,
5542 "NEW_SECONDARY": source_node,
5544 env.update(_BuildInstanceHookEnvByObject(self, instance))
5545 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5547 nl_post.append(source_node)
5548 return env, nl, nl_post
5550 def CheckPrereq(self):
5551 """Check prerequisites.
5553 This checks that the instance is in the cluster.
5556 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5557 assert self.instance is not None, \
5558 "Cannot retrieve locked instance %s" % self.op.instance_name
5560 bep = self.cfg.GetClusterInfo().FillBE(instance)
5561 if instance.disk_template not in constants.DTS_NET_MIRROR:
5562 raise errors.OpPrereqError("Instance's disk layout is not"
5563 " network mirrored, cannot failover.",
5566 secondary_nodes = instance.secondary_nodes
5567 if not secondary_nodes:
5568 raise errors.ProgrammerError("no secondary node but using "
5569 "a mirrored disk template")
5571 target_node = secondary_nodes[0]
5572 _CheckNodeOnline(self, target_node)
5573 _CheckNodeNotDrained(self, target_node)
5574 if instance.admin_up:
5575 # check memory requirements on the secondary node
5576 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5577 instance.name, bep[constants.BE_MEMORY],
5578 instance.hypervisor)
5580 self.LogInfo("Not checking memory on the secondary node as"
5581 " instance will not be started")
5583 # check bridge existence
5584 _CheckInstanceBridgesExist(self, instance, node=target_node)
5586 def Exec(self, feedback_fn):
5587 """Failover an instance.
5589 The failover is done by shutting it down on its present node and
5590 starting it on the secondary.
5593 instance = self.instance
5594 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
5596 source_node = instance.primary_node
5597 target_node = instance.secondary_nodes[0]
5599 if instance.admin_up:
5600 feedback_fn("* checking disk consistency between source and target")
5601 for dev in instance.disks:
5602 # for drbd, these are drbd over lvm
5603 if not _CheckDiskConsistency(self, dev, target_node, False):
5604 if not self.op.ignore_consistency:
5605 raise errors.OpExecError("Disk %s is degraded on target node,"
5606 " aborting failover." % dev.iv_name)
5608 feedback_fn("* not checking disk consistency as instance is not running")
5610 feedback_fn("* shutting down instance on source node")
5611 logging.info("Shutting down instance %s on node %s",
5612 instance.name, source_node)
5614 result = self.rpc.call_instance_shutdown(source_node, instance,
5615 self.op.shutdown_timeout)
5616 msg = result.fail_msg
5618 if self.op.ignore_consistency or primary_node.offline:
5619 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5620 " Proceeding anyway. Please make sure node"
5621 " %s is down. Error details: %s",
5622 instance.name, source_node, source_node, msg)
5624 raise errors.OpExecError("Could not shutdown instance %s on"
5626 (instance.name, source_node, msg))
5628 feedback_fn("* deactivating the instance's disks on source node")
5629 if not _ShutdownInstanceDisks(self, instance, ignore_primary=True):
5630 raise errors.OpExecError("Can't shut down the instance's disks.")
5632 instance.primary_node = target_node
5633 # distribute new instance config to the other nodes
5634 self.cfg.Update(instance, feedback_fn)
5636 # Only start the instance if it's marked as up
5637 if instance.admin_up:
5638 feedback_fn("* activating the instance's disks on target node")
5639 logging.info("Starting instance %s on node %s",
5640 instance.name, target_node)
5642 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5643 ignore_secondaries=True)
5645 _ShutdownInstanceDisks(self, instance)
5646 raise errors.OpExecError("Can't activate the instance's disks")
5648 feedback_fn("* starting the instance on the target node")
5649 result = self.rpc.call_instance_start(target_node, instance, None, None)
5650 msg = result.fail_msg
5652 _ShutdownInstanceDisks(self, instance)
5653 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5654 (instance.name, target_node, msg))
5657 class LUMigrateInstance(LogicalUnit):
5658 """Migrate an instance.
5660 This is migration without shutting down the instance, as opposed to
5661 failover, which is done with a shutdown.
5664 HPATH = "instance-migrate"
5665 HTYPE = constants.HTYPE_INSTANCE
5670 ("cleanup", False, ht.TBool),
5675 def ExpandNames(self):
5676 self._ExpandAndLockInstance()
5678 self.needed_locks[locking.LEVEL_NODE] = []
5679 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5681 self._migrater = TLMigrateInstance(self, self.op.instance_name,
5683 self.tasklets = [self._migrater]
5685 def DeclareLocks(self, level):
5686 if level == locking.LEVEL_NODE:
5687 self._LockInstancesNodes()
5689 def BuildHooksEnv(self):
5692 This runs on master, primary and secondary nodes of the instance.
5695 instance = self._migrater.instance
5696 source_node = instance.primary_node
5697 target_node = instance.secondary_nodes[0]
5698 env = _BuildInstanceHookEnvByObject(self, instance)
5699 env["MIGRATE_LIVE"] = self._migrater.live
5700 env["MIGRATE_CLEANUP"] = self.op.cleanup
5702 "OLD_PRIMARY": source_node,
5703 "OLD_SECONDARY": target_node,
5704 "NEW_PRIMARY": target_node,
5705 "NEW_SECONDARY": source_node,
5707 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5709 nl_post.append(source_node)
5710 return env, nl, nl_post
5713 class LUMoveInstance(LogicalUnit):
5714 """Move an instance by data-copying.
5717 HPATH = "instance-move"
5718 HTYPE = constants.HTYPE_INSTANCE
5721 ("target_node", ht.NoDefault, ht.TNonEmptyString),
5726 def ExpandNames(self):
5727 self._ExpandAndLockInstance()
5728 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
5729 self.op.target_node = target_node
5730 self.needed_locks[locking.LEVEL_NODE] = [target_node]
5731 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5733 def DeclareLocks(self, level):
5734 if level == locking.LEVEL_NODE:
5735 self._LockInstancesNodes(primary_only=True)
5737 def BuildHooksEnv(self):
5740 This runs on master, primary and secondary nodes of the instance.
5744 "TARGET_NODE": self.op.target_node,
5745 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5747 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5748 nl = [self.cfg.GetMasterNode()] + [self.instance.primary_node,
5749 self.op.target_node]
5752 def CheckPrereq(self):
5753 """Check prerequisites.
5755 This checks that the instance is in the cluster.
5758 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5759 assert self.instance is not None, \
5760 "Cannot retrieve locked instance %s" % self.op.instance_name
5762 node = self.cfg.GetNodeInfo(self.op.target_node)
5763 assert node is not None, \
5764 "Cannot retrieve locked node %s" % self.op.target_node
5766 self.target_node = target_node = node.name
5768 if target_node == instance.primary_node:
5769 raise errors.OpPrereqError("Instance %s is already on the node %s" %
5770 (instance.name, target_node),
5773 bep = self.cfg.GetClusterInfo().FillBE(instance)
5775 for idx, dsk in enumerate(instance.disks):
5776 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
5777 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
5778 " cannot copy" % idx, errors.ECODE_STATE)
5780 _CheckNodeOnline(self, target_node)
5781 _CheckNodeNotDrained(self, target_node)
5782 _CheckNodeVmCapable(self, target_node)
5784 if instance.admin_up:
5785 # check memory requirements on the secondary node
5786 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5787 instance.name, bep[constants.BE_MEMORY],
5788 instance.hypervisor)
5790 self.LogInfo("Not checking memory on the secondary node as"
5791 " instance will not be started")
5793 # check bridge existence
5794 _CheckInstanceBridgesExist(self, instance, node=target_node)
5796 def Exec(self, feedback_fn):
5797 """Move an instance.
5799 The move is done by shutting it down on its present node, copying
5800 the data over (slow) and starting it on the new node.
5803 instance = self.instance
5805 source_node = instance.primary_node
5806 target_node = self.target_node
5808 self.LogInfo("Shutting down instance %s on source node %s",
5809 instance.name, source_node)
5811 result = self.rpc.call_instance_shutdown(source_node, instance,
5812 self.op.shutdown_timeout)
5813 msg = result.fail_msg
5814 if msg:
5815 if self.op.ignore_consistency:
5816 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5817 " Proceeding anyway. Please make sure node"
5818 " %s is down. Error details: %s",
5819 instance.name, source_node, source_node, msg)
5820 else:
5821 raise errors.OpExecError("Could not shutdown instance %s on"
5822 " node %s: %s" %
5823 (instance.name, source_node, msg))
5825 # create the target disks
5826 try:
5827 _CreateDisks(self, instance, target_node=target_node)
5828 except errors.OpExecError:
5829 self.LogWarning("Device creation failed, reverting...")
5830 try:
5831 _RemoveDisks(self, instance, target_node=target_node)
5832 finally:
5833 self.cfg.ReleaseDRBDMinors(instance.name)
5834 raise
5836 cluster_name = self.cfg.GetClusterInfo().cluster_name
5838 errs = []
5839 # activate, get path, copy the data over
5840 for idx, disk in enumerate(instance.disks):
5841 self.LogInfo("Copying data for disk %d", idx)
5842 result = self.rpc.call_blockdev_assemble(target_node, disk,
5843 instance.name, True)
5844 if result.fail_msg:
5845 self.LogWarning("Can't assemble newly created disk %d: %s",
5846 idx, result.fail_msg)
5847 errs.append(result.fail_msg)
5848 break
5849 dev_path = result.payload
5850 result = self.rpc.call_blockdev_export(source_node, disk,
5851 target_node, dev_path,
5852 cluster_name)
5853 if result.fail_msg:
5854 self.LogWarning("Can't copy data over for disk %d: %s",
5855 idx, result.fail_msg)
5856 errs.append(result.fail_msg)
5857 break
5859 if errs:
5860 self.LogWarning("Some disks failed to copy, aborting")
5861 try:
5862 _RemoveDisks(self, instance, target_node=target_node)
5863 finally:
5864 self.cfg.ReleaseDRBDMinors(instance.name)
5865 raise errors.OpExecError("Errors during disk copy: %s" %
5866 ",".join(errs))
5868 instance.primary_node = target_node
5869 self.cfg.Update(instance, feedback_fn)
5871 self.LogInfo("Removing the disks on the original node")
5872 _RemoveDisks(self, instance, target_node=source_node)
5874 # Only start the instance if it's marked as up
5875 if instance.admin_up:
5876 self.LogInfo("Starting instance %s on node %s",
5877 instance.name, target_node)
5879 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5880 ignore_secondaries=True)
5881 if not disks_ok:
5882 _ShutdownInstanceDisks(self, instance)
5883 raise errors.OpExecError("Can't activate the instance's disks")
5885 result = self.rpc.call_instance_start(target_node, instance, None, None)
5886 msg = result.fail_msg
5887 if msg:
5888 _ShutdownInstanceDisks(self, instance)
5889 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5890 (instance.name, target_node, msg))
5893 class LUMigrateNode(LogicalUnit):
5894 """Migrate all instances from a node.
5897 HPATH = "node-migrate"
5898 HTYPE = constants.HTYPE_NODE
5906 def ExpandNames(self):
5907 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5909 self.needed_locks = {
5910 locking.LEVEL_NODE: [self.op.node_name],
5913 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5915 # Create tasklets for migrating instances for all instances on this node
5919 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
5920 logging.debug("Migrating instance %s", inst.name)
5921 names.append(inst.name)
5923 tasklets.append(TLMigrateInstance(self, inst.name, False))
5925 self.tasklets = tasklets
5927 # Declare instance locks
5928 self.needed_locks[locking.LEVEL_INSTANCE] = names
5930 def DeclareLocks(self, level):
5931 if level == locking.LEVEL_NODE:
5932 self._LockInstancesNodes()
5934 def BuildHooksEnv(self):
5937 This runs on the master, the primary and all the secondaries.
5941 "NODE_NAME": self.op.node_name,
5944 nl = [self.cfg.GetMasterNode()]
5946 return (env, nl, nl)
5949 class TLMigrateInstance(Tasklet):
5950 """Tasklet class for instance migration.
5953 @ivar live: whether the migration will be done live or non-live;
5954 this variable is initialized only after CheckPrereq has run
5957 def __init__(self, lu, instance_name, cleanup):
5958 """Initializes this class.
5961 Tasklet.__init__(self, lu)
5964 self.instance_name = instance_name
5965 self.cleanup = cleanup
5966 self.live = False # will be overridden later
5968 def CheckPrereq(self):
5969 """Check prerequisites.
5971 This checks that the instance is in the cluster.
5974 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
5975 instance = self.cfg.GetInstanceInfo(instance_name)
5976 assert instance is not None
5978 if instance.disk_template != constants.DT_DRBD8:
5979 raise errors.OpPrereqError("Instance's disk layout is not"
5980 " drbd8, cannot migrate.", errors.ECODE_STATE)
5982 secondary_nodes = instance.secondary_nodes
5983 if not secondary_nodes:
5984 raise errors.ConfigurationError("No secondary node but using"
5985 " drbd8 disk template")
5987 i_be = self.cfg.GetClusterInfo().FillBE(instance)
5989 target_node = secondary_nodes[0]
5990 # check memory requirements on the secondary node
5991 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
5992 instance.name, i_be[constants.BE_MEMORY],
5993 instance.hypervisor)
5995 # check bridge existence
5996 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
5998 if not self.cleanup:
5999 _CheckNodeNotDrained(self.lu, target_node)
6000 result = self.rpc.call_instance_migratable(instance.primary_node,
6001 instance)
6002 result.Raise("Can't migrate, please use failover",
6003 prereq=True, ecode=errors.ECODE_STATE)
6005 self.instance = instance
6007 if self.lu.op.live is not None and self.lu.op.mode is not None:
6008 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
6009 " parameters are accepted",
6010 errors.ECODE_INVAL)
6011 if self.lu.op.live is not None:
6012 if self.lu.op.live:
6013 self.lu.op.mode = constants.HT_MIGRATION_LIVE
6014 else:
6015 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
6016 # reset the 'live' parameter to None so that repeated
6017 # invocations of CheckPrereq do not raise an exception
6018 self.lu.op.live = None
6019 elif self.lu.op.mode is None:
6020 # read the default value from the hypervisor
6021 i_hv = self.cfg.GetClusterInfo().FillHV(instance, skip_globals=False)
6022 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
6024 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
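# Illustrative sketch (not part of the original module): how the obsolete
# 'live' flag and the newer 'mode' parameter are reconciled above:
#   live=True,  mode unset  -> mode becomes constants.HT_MIGRATION_LIVE
#   live=False, mode unset  -> mode becomes constants.HT_MIGRATION_NONLIVE
#   both unset              -> mode is read from the hypervisor's
#                              HV_MIGRATION_MODE parameter
#   both given              -> OpPrereqError
# self.live then simply records whether the resulting mode is live migration.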
6026 def _WaitUntilSync(self):
6027 """Poll with custom rpc for disk sync.
6029 This uses our own step-based rpc call.
6032 self.feedback_fn("* wait until resync is done")
6036 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
6037 self.nodes_ip,
6038 self.instance.disks)
6040 for node, nres in result.items():
6041 nres.Raise("Cannot resync disks on node %s" % node)
6042 node_done, node_percent = nres.payload
6043 all_done = all_done and node_done
6044 if node_percent is not None:
6045 min_percent = min(min_percent, node_percent)
6047 if min_percent < 100:
6048 self.feedback_fn(" - progress: %.1f%%" % min_percent)
6051 def _EnsureSecondary(self, node):
6052 """Demote a node to secondary.
6055 self.feedback_fn("* switching node %s to secondary mode" % node)
6057 for dev in self.instance.disks:
6058 self.cfg.SetDiskID(dev, node)
6060 result = self.rpc.call_blockdev_close(node, self.instance.name,
6061 self.instance.disks)
6062 result.Raise("Cannot change disk to secondary on node %s" % node)
6064 def _GoStandalone(self):
6065 """Disconnect from the network.
6068 self.feedback_fn("* changing into standalone mode")
6069 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
6070 self.instance.disks)
6071 for node, nres in result.items():
6072 nres.Raise("Cannot disconnect disks node %s" % node)
6074 def _GoReconnect(self, multimaster):
6075 """Reconnect to the network.
6078 if multimaster:
6079 msg = "dual-master"
6080 else:
6081 msg = "single-master"
6082 self.feedback_fn("* changing disks into %s mode" % msg)
6083 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
6084 self.instance.disks,
6085 self.instance.name, multimaster)
6086 for node, nres in result.items():
6087 nres.Raise("Cannot change disks config on node %s" % node)
6089 def _ExecCleanup(self):
6090 """Try to cleanup after a failed migration.
6092 The cleanup is done by:
6093 - check that the instance is running only on one node
6094 (and update the config if needed)
6095 - change disks on its secondary node to secondary
6096 - wait until disks are fully synchronized
6097 - disconnect from the network
6098 - change disks into single-master mode
6099 - wait again until disks are fully synchronized
6102 instance = self.instance
6103 target_node = self.target_node
6104 source_node = self.source_node
6106 # check running on only one node
6107 self.feedback_fn("* checking where the instance actually runs"
6108 " (if this hangs, the hypervisor might be in"
6110 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
6111 for node, result in ins_l.items():
6112 result.Raise("Can't contact node %s" % node)
6114 runningon_source = instance.name in ins_l[source_node].payload
6115 runningon_target = instance.name in ins_l[target_node].payload
6117 if runningon_source and runningon_target:
6118 raise errors.OpExecError("Instance seems to be running on two nodes,"
6119 " or the hypervisor is confused. You will have"
6120 " to ensure manually that it runs only on one"
6121 " and restart this operation.")
6123 if not (runningon_source or runningon_target):
6124 raise errors.OpExecError("Instance does not seem to be running at all."
6125 " In this case, it's safer to repair by"
6126 " running 'gnt-instance stop' to ensure disk"
6127 " shutdown, and then restarting it.")
6129 if runningon_target:
6130 # the migration has actually succeeded, we need to update the config
6131 self.feedback_fn("* instance running on secondary node (%s),"
6132 " updating config" % target_node)
6133 instance.primary_node = target_node
6134 self.cfg.Update(instance, self.feedback_fn)
6135 demoted_node = source_node
6137 self.feedback_fn("* instance confirmed to be running on its"
6138 " primary node (%s)" % source_node)
6139 demoted_node = target_node
6141 self._EnsureSecondary(demoted_node)
6143 self._WaitUntilSync()
6144 except errors.OpExecError:
6145 # we ignore errors here, since if the device is standalone, it
6146 # won't be able to sync
6148 self._GoStandalone()
6149 self._GoReconnect(False)
6150 self._WaitUntilSync()
6152 self.feedback_fn("* done")
6154 def _RevertDiskStatus(self):
6155 """Try to revert the disk status after a failed migration.
6158 target_node = self.target_node
6160 self._EnsureSecondary(target_node)
6161 self._GoStandalone()
6162 self._GoReconnect(False)
6163 self._WaitUntilSync()
6164 except errors.OpExecError, err:
6165 self.lu.LogWarning("Migration failed and I can't reconnect the"
6166 " drives: error '%s'\n"
6167 "Please look and recover the instance status" %
6170 def _AbortMigration(self):
6171 """Call the hypervisor code to abort a started migration.
6174 instance = self.instance
6175 target_node = self.target_node
6176 migration_info = self.migration_info
6178 abort_result = self.rpc.call_finalize_migration(target_node,
6182 abort_msg = abort_result.fail_msg
6184 logging.error("Aborting migration failed on target node %s: %s",
6185 target_node, abort_msg)
6186 # Don't raise an exception here, as we still have to try to revert the
6187 # disk status, even if this step failed.
6189 def _ExecMigration(self):
6190 """Migrate an instance.
6192 The migration is done by:
6193 - change the disks into dual-master mode
6194 - wait until disks are fully synchronized again
6195 - migrate the instance
6196 - change disks on the new secondary node (the old primary) to secondary
6197 - wait until disks are fully synchronized
6198 - change disks into single-master mode
6201 instance = self.instance
6202 target_node = self.target_node
6203 source_node = self.source_node
6205 self.feedback_fn("* checking disk consistency between source and target")
6206 for dev in instance.disks:
6207 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
6208 raise errors.OpExecError("Disk %s is degraded or not fully"
6209 " synchronized on target node,"
6210 " aborting migrate." % dev.iv_name)
6212 # First get the migration information from the remote node
6213 result = self.rpc.call_migration_info(source_node, instance)
6214 msg = result.fail_msg
6216 log_err = ("Failed fetching source migration information from %s: %s" %
6217 (source_node, msg))
6218 logging.error(log_err)
6219 raise errors.OpExecError(log_err)
6221 self.migration_info = migration_info = result.payload
6223 # Then switch the disks to master/master mode
6224 self._EnsureSecondary(target_node)
6225 self._GoStandalone()
6226 self._GoReconnect(True)
6227 self._WaitUntilSync()
6229 self.feedback_fn("* preparing %s to accept the instance" % target_node)
6230 result = self.rpc.call_accept_instance(target_node,
6233 self.nodes_ip[target_node])
6235 msg = result.fail_msg
6237 logging.error("Instance pre-migration failed, trying to revert"
6238 " disk status: %s", msg)
6239 self.feedback_fn("Pre-migration failed, aborting")
6240 self._AbortMigration()
6241 self._RevertDiskStatus()
6242 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
6243 (instance.name, msg))
6245 self.feedback_fn("* migrating instance to %s" % target_node)
6247 result = self.rpc.call_instance_migrate(source_node, instance,
6248 self.nodes_ip[target_node],
6250 msg = result.fail_msg
6252 logging.error("Instance migration failed, trying to revert"
6253 " disk status: %s", msg)
6254 self.feedback_fn("Migration failed, aborting")
6255 self._AbortMigration()
6256 self._RevertDiskStatus()
6257 raise errors.OpExecError("Could not migrate instance %s: %s" %
6258 (instance.name, msg))
6261 instance.primary_node = target_node
6262 # distribute new instance config to the other nodes
6263 self.cfg.Update(instance, self.feedback_fn)
6265 result = self.rpc.call_finalize_migration(target_node,
6269 msg = result.fail_msg
6271 logging.error("Instance migration succeeded, but finalization failed:"
6273 raise errors.OpExecError("Could not finalize instance migration: %s" %
6276 self._EnsureSecondary(source_node)
6277 self._WaitUntilSync()
6278 self._GoStandalone()
6279 self._GoReconnect(False)
6280 self._WaitUntilSync()
6282 self.feedback_fn("* done")
6284 def Exec(self, feedback_fn):
6285 """Perform the migration.
6288 feedback_fn("Migrating instance %s" % self.instance.name)
6290 self.feedback_fn = feedback_fn
6292 self.source_node = self.instance.primary_node
6293 self.target_node = self.instance.secondary_nodes[0]
6294 self.all_nodes = [self.source_node, self.target_node]
6296 self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
6297 self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
6301 return self._ExecCleanup()
6303 return self._ExecMigration()
6306 def _CreateBlockDev(lu, node, instance, device, force_create,
6308 """Create a tree of block devices on a given node.
6310 If this device type has to be created on secondaries, create it and
6311 all its children.
6313 If not, just recurse to children keeping the same 'force' value.
6315 @param lu: the lu on whose behalf we execute
6316 @param node: the node on which to create the device
6317 @type instance: L{objects.Instance}
6318 @param instance: the instance which owns the device
6319 @type device: L{objects.Disk}
6320 @param device: the device to create
6321 @type force_create: boolean
6322 @param force_create: whether to force creation of this device; this
6323 will be changed to True whenever we find a device for which
6324 CreateOnSecondary() returns True
6325 @param info: the extra 'metadata' we should attach to the device
6326 (this will be represented as a LVM tag)
6327 @type force_open: boolean
6328 @param force_open: this parameter will be passed to the
6329 L{backend.BlockdevCreate} function where it specifies
6330 whether we run on primary or not, and it affects both
6331 the child assembly and the device's own Open() execution
6334 if device.CreateOnSecondary():
6335 force_create = True
6337 if device.children:
6338 for child in device.children:
6339 _CreateBlockDev(lu, node, instance, child, force_create,
6340 info, force_open)
6342 if not force_create:
6343 return
6345 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
6348 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
6349 """Create a single block device on a given node.
6351 This will not recurse over children of the device, so they must be
6352 created in advance.
6354 @param lu: the lu on whose behalf we execute
6355 @param node: the node on which to create the device
6356 @type instance: L{objects.Instance}
6357 @param instance: the instance which owns the device
6358 @type device: L{objects.Disk}
6359 @param device: the device to create
6360 @param info: the extra 'metadata' we should attach to the device
6361 (this will be represented as a LVM tag)
6362 @type force_open: boolean
6363 @param force_open: this parameter will be passed to the
6364 L{backend.BlockdevCreate} function where it specifies
6365 whether we run on primary or not, and it affects both
6366 the child assembly and the device's own Open() execution
6369 lu.cfg.SetDiskID(device, node)
6370 result = lu.rpc.call_blockdev_create(node, device, device.size,
6371 instance.name, force_open, info)
6372 result.Raise("Can't create block device %s on"
6373 " node %s for instance %s" % (device, node, instance.name))
6374 if device.physical_id is None:
6375 device.physical_id = result.payload
6378 def _GenerateUniqueNames(lu, exts):
6379 """Generate a suitable LV name.
6381 This will generate a logical volume name for the given instance.
6386 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
6387 results.append("%s%s" % (new_id, val))
6388 return results
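# Illustrative sketch (not part of the original module): assuming the
# configuration hands out fresh unique IDs, a call such as
#   _GenerateUniqueNames(lu, [".disk0_data", ".disk0_meta"])
# yields one name per extension, each with its own freshly generated ID:
#   ["<uuid1>.disk0_data", "<uuid2>.disk0_meta"]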
6391 def _GenerateDRBD8Branch(lu, primary, secondary, size, names, iv_name,
6392 p_minor, s_minor):
6393 """Generate a drbd8 device complete with its children.
6396 port = lu.cfg.AllocatePort()
6397 vgname = lu.cfg.GetVGName()
6398 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
6399 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
6400 logical_id=(vgname, names[0]))
6401 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
6402 logical_id=(vgname, names[1]))
6403 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
6404 logical_id=(primary, secondary, port,
6405 p_minor, s_minor,
6406 shared_secret),
6407 children=[dev_data, dev_meta],
6408 iv_name=iv_name)
6409 return drbd_dev
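# Illustrative sketch (not part of the original module): the shape of the
# object tree generated above for a hypothetical 10 GiB disk in a volume group
# named "xenvg" -- a DRBD8 device whose children are the data LV (full size)
# and the 128 MiB metadata LV; the real helper additionally fills in the
# network logical_id (nodes, port, minors and shared secret).
def _ExampleDRBD8Branch():
  dev_data = objects.Disk(dev_type=constants.LD_LV, size=10240,
                          logical_id=("xenvg", "<uuid>.disk0_data"))
  dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
                          logical_id=("xenvg", "<uuid>.disk0_meta"))
  return objects.Disk(dev_type=constants.LD_DRBD8, size=10240,
                      children=[dev_data, dev_meta], iv_name="disk/0")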
6412 def _GenerateDiskTemplate(lu, template_name,
6413 instance_name, primary_node,
6414 secondary_nodes, disk_info,
6415 file_storage_dir, file_driver,
6417 """Generate the entire disk layout for a given template type.
6420 #TODO: compute space requirements
6422 vgname = lu.cfg.GetVGName()
6423 disk_count = len(disk_info)
6425 if template_name == constants.DT_DISKLESS:
6427 elif template_name == constants.DT_PLAIN:
6428 if len(secondary_nodes) != 0:
6429 raise errors.ProgrammerError("Wrong template configuration")
6431 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6432 for i in range(disk_count)])
6433 for idx, disk in enumerate(disk_info):
6434 disk_index = idx + base_index
6435 disk_dev = objects.Disk(dev_type=constants.LD_LV, size=disk["size"],
6436 logical_id=(vgname, names[idx]),
6437 iv_name="disk/%d" % disk_index,
6439 disks.append(disk_dev)
6440 elif template_name == constants.DT_DRBD8:
6441 if len(secondary_nodes) != 1:
6442 raise errors.ProgrammerError("Wrong template configuration")
6443 remote_node = secondary_nodes[0]
6444 minors = lu.cfg.AllocateDRBDMinor(
6445 [primary_node, remote_node] * len(disk_info), instance_name)
6448 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6449 for i in range(disk_count)]):
6450 names.append(lv_prefix + "_data")
6451 names.append(lv_prefix + "_meta")
6452 for idx, disk in enumerate(disk_info):
6453 disk_index = idx + base_index
6454 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
6455 disk["size"], names[idx*2:idx*2+2],
6456 "disk/%d" % disk_index,
6457 minors[idx*2], minors[idx*2+1])
6458 disk_dev.mode = disk["mode"]
6459 disks.append(disk_dev)
6460 elif template_name == constants.DT_FILE:
6461 if len(secondary_nodes) != 0:
6462 raise errors.ProgrammerError("Wrong template configuration")
6464 _RequireFileStorage()
6466 for idx, disk in enumerate(disk_info):
6467 disk_index = idx + base_index
6468 disk_dev = objects.Disk(dev_type=constants.LD_FILE, size=disk["size"],
6469 iv_name="disk/%d" % disk_index,
6470 logical_id=(file_driver,
6471 "%s/disk%d" % (file_storage_dir,
6474 disks.append(disk_dev)
6475 else:
6476 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
6477 return disks
6480 def _GetInstanceInfoText(instance):
6481 """Compute that text that should be added to the disk's metadata.
6484 return "originstname+%s" % instance.name
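# Illustrative sketch (not part of the original module): for an instance named
# "inst1.example.com" the helper above yields the tag
# "originstname+inst1.example.com", which is attached to the instance's
# logical volumes as LVM metadata.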
6487 def _CalcEta(time_taken, written, total_size):
6488 """Calculates the ETA based on size written and total size.
6490 @param time_taken: The time taken so far
6491 @param written: amount written so far
6492 @param total_size: The total size of data to be written
6493 @return: The remaining time in seconds
6496 avg_time = time_taken / float(written)
6497 return (total_size - written) * avg_time
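# Illustrative sketch (not part of the original module): a quick sanity check
# of the ETA formula above -- if 256 units were written in 30 seconds, the
# remaining 768 units of a 1024-unit total should take about 90 more seconds.
def _ExampleCalcEta():
  return _CalcEta(30.0, 256, 1024)  # (1024 - 256) * (30.0 / 256) == 90.0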
6500 def _WipeDisks(lu, instance):
6501 """Wipes instance disks.
6503 @type lu: L{LogicalUnit}
6504 @param lu: the logical unit on whose behalf we execute
6505 @type instance: L{objects.Instance}
6506 @param instance: the instance whose disks we should wipe
6507 @return: the success of the wipe
6510 node = instance.primary_node
6511 for idx, device in enumerate(instance.disks):
6512 lu.LogInfo("* Wiping disk %d", idx)
6513 logging.info("Wiping disk %d for instance %s", idx, instance.name)
6515 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
6516 # MAX_WIPE_CHUNK at max
6517 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
6518 constants.MIN_WIPE_CHUNK_PERCENT)
6523 start_time = time.time()
6525 while offset < size:
6526 wipe_size = min(wipe_chunk_size, size - offset)
6527 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
6528 result.Raise("Could not wipe disk %d at offset %d for size %d" %
6529 (idx, offset, wipe_size))
6532 if now - last_output >= 60:
6533 eta = _CalcEta(now - start_time, offset, size)
6534 lu.LogInfo(" - done: %.1f%% ETA: %s" %
6535 (offset / float(size) * 100, utils.FormatSeconds(eta)))
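# Illustrative sketch (not part of the original module): the chunking policy
# used above, extracted for clarity -- each wipe request covers
# MIN_WIPE_CHUNK_PERCENT of the disk size, but never more than MAX_WIPE_CHUNK.
def _ExampleWipeChunkSize(disk_size):
  return min(constants.MAX_WIPE_CHUNK,
             disk_size / 100.0 * constants.MIN_WIPE_CHUNK_PERCENT)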
6539 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
6540 """Create all disks for an instance.
6542 This abstracts away some work from AddInstance.
6544 @type lu: L{LogicalUnit}
6545 @param lu: the logical unit on whose behalf we execute
6546 @type instance: L{objects.Instance}
6547 @param instance: the instance whose disks we should create
6549 @param to_skip: list of indices to skip
6550 @type target_node: string
6551 @param target_node: if passed, overrides the target node for creation
6553 @return: the success of the creation
6556 info = _GetInstanceInfoText(instance)
6557 if target_node is None:
6558 pnode = instance.primary_node
6559 all_nodes = instance.all_nodes
6564 if instance.disk_template == constants.DT_FILE:
6565 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6566 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
6568 result.Raise("Failed to create directory '%s' on"
6569 " node %s" % (file_storage_dir, pnode))
6571 # Note: this needs to be kept in sync with adding of disks in
6572 # LUSetInstanceParams
6573 for idx, device in enumerate(instance.disks):
6574 if to_skip and idx in to_skip:
6576 logging.info("Creating volume %s for instance %s",
6577 device.iv_name, instance.name)
6579 for node in all_nodes:
6580 f_create = node == pnode
6581 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
6584 def _RemoveDisks(lu, instance, target_node=None):
6585 """Remove all disks for an instance.
6587 This abstracts away some work from `AddInstance()` and
6588 `RemoveInstance()`. Note that in case some of the devices couldn't
6589 be removed, the removal will continue with the other ones (compare
6590 with `_CreateDisks()`).
6592 @type lu: L{LogicalUnit}
6593 @param lu: the logical unit on whose behalf we execute
6594 @type instance: L{objects.Instance}
6595 @param instance: the instance whose disks we should remove
6596 @type target_node: string
6597 @param target_node: used to override the node on which to remove the disks
6599 @return: the success of the removal
6602 logging.info("Removing block devices for instance %s", instance.name)
6605 for device in instance.disks:
6607 edata = [(target_node, device)]
6609 edata = device.ComputeNodeTree(instance.primary_node)
6610 for node, disk in edata:
6611 lu.cfg.SetDiskID(disk, node)
6612 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
6614 lu.LogWarning("Could not remove block device %s on node %s,"
6615 " continuing anyway: %s", device.iv_name, node, msg)
6618 if instance.disk_template == constants.DT_FILE:
6619 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6623 tgt = instance.primary_node
6624 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
6626 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
6627 file_storage_dir, instance.primary_node, result.fail_msg)
6633 def _ComputeDiskSize(disk_template, disks):
6634 """Compute disk size requirements in the volume group
6637 # Required free disk space as a function of disk and swap space
6639 constants.DT_DISKLESS: None,
6640 constants.DT_PLAIN: sum(d["size"] for d in disks),
6641 # 128 MB are added for drbd metadata for each disk
6642 constants.DT_DRBD8: sum(d["size"] + 128 for d in disks),
6643 constants.DT_FILE: None,
6646 if disk_template not in req_size_dict:
6647 raise errors.ProgrammerError("Disk template '%s' size requirement"
6648 " is unknown" % disk_template)
6650 return req_size_dict[disk_template]
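# Illustrative sketch (not part of the original module): for two DRBD8 disks
# of 1024 and 2048 (sizes in MiB) the helper above asks for
# 1024 + 128 + 2048 + 128 = 3328 of free space in the volume group, while
# diskless and file-based templates need none.
def _ExampleComputeDiskSize():
  return _ComputeDiskSize(constants.DT_DRBD8, [{"size": 1024}, {"size": 2048}])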
6653 def _CheckHVParams(lu, nodenames, hvname, hvparams):
6654 """Hypervisor parameter validation.
6656 This function abstracts the hypervisor parameter validation to be
6657 used in both instance create and instance modify.
6659 @type lu: L{LogicalUnit}
6660 @param lu: the logical unit for which we check
6661 @type nodenames: list
6662 @param nodenames: the list of nodes on which we should check
6663 @type hvname: string
6664 @param hvname: the name of the hypervisor we should use
6665 @type hvparams: dict
6666 @param hvparams: the parameters which we need to check
6667 @raise errors.OpPrereqError: if the parameters are not valid
6670 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
6673 for node in nodenames:
6677 info.Raise("Hypervisor parameter validation failed on node %s" % node)
6680 def _CheckOSParams(lu, required, nodenames, osname, osparams):
6681 """OS parameters validation.
6683 @type lu: L{LogicalUnit}
6684 @param lu: the logical unit for which we check
6685 @type required: boolean
6686 @param required: whether the validation should fail if the OS is not
6687 found
6688 @type nodenames: list
6689 @param nodenames: the list of nodes on which we should check
6690 @type osname: string
6691 @param osname: the name of the OS we should use
6692 @type osparams: dict
6693 @param osparams: the parameters which we need to check
6694 @raise errors.OpPrereqError: if the parameters are not valid
6697 result = lu.rpc.call_os_validate(required, nodenames, osname,
6698 [constants.OS_VALIDATE_PARAMETERS],
6700 for node, nres in result.items():
6701 # we don't check for offline cases since this should be run only
6702 # against the master node and/or an instance's nodes
6703 nres.Raise("OS Parameters validation failed on node %s" % node)
6704 if not nres.payload:
6705 lu.LogInfo("OS %s not found on node %s, validation skipped",
6709 class LUCreateInstance(LogicalUnit):
6710 """Create an instance.
6713 HPATH = "instance-add"
6714 HTYPE = constants.HTYPE_INSTANCE
6717 ("mode", ht.NoDefault, ht.TElemOf(constants.INSTANCE_CREATE_MODES)),
6718 ("start", True, ht.TBool),
6719 ("wait_for_sync", True, ht.TBool),
6720 ("ip_check", True, ht.TBool),
6721 ("name_check", True, ht.TBool),
6722 ("disks", ht.NoDefault, ht.TListOf(ht.TDict)),
6723 ("nics", ht.NoDefault, ht.TListOf(ht.TDict)),
6724 ("hvparams", ht.EmptyDict, ht.TDict),
6725 ("beparams", ht.EmptyDict, ht.TDict),
6726 ("osparams", ht.EmptyDict, ht.TDict),
6727 ("no_install", None, ht.TMaybeBool),
6728 ("os_type", None, ht.TMaybeString),
6729 ("force_variant", False, ht.TBool),
6730 ("source_handshake", None, ht.TOr(ht.TList, ht.TNone)),
6731 ("source_x509_ca", None, ht.TMaybeString),
6732 ("source_instance_name", None, ht.TMaybeString),
6733 ("src_node", None, ht.TMaybeString),
6734 ("src_path", None, ht.TMaybeString),
6735 ("pnode", None, ht.TMaybeString),
6736 ("snode", None, ht.TMaybeString),
6737 ("iallocator", None, ht.TMaybeString),
6738 ("hypervisor", None, ht.TMaybeString),
6739 ("disk_template", ht.NoDefault, _CheckDiskTemplate),
6740 ("identify_defaults", False, ht.TBool),
6741 ("file_driver", None, ht.TOr(ht.TNone, ht.TElemOf(constants.FILE_DRIVER))),
6742 ("file_storage_dir", None, ht.TMaybeString),
6746 def CheckArguments(self):
6750 # do not require name_check to ease forward/backward compatibility
6752 if self.op.no_install and self.op.start:
6753 self.LogInfo("No-installation mode selected, disabling startup")
6754 self.op.start = False
6755 # validate/normalize the instance name
6756 self.op.instance_name = \
6757 netutils.Hostname.GetNormalizedName(self.op.instance_name)
6759 if self.op.ip_check and not self.op.name_check:
6760 # TODO: make the ip check more flexible and not depend on the name check
6761 raise errors.OpPrereqError("Cannot do ip check without a name check",
6764 # check nics' parameter names
6765 for nic in self.op.nics:
6766 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
6768 # check disks. parameter names and consistent adopt/no-adopt strategy
6769 has_adopt = has_no_adopt = False
6770 for disk in self.op.disks:
6771 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
6776 if has_adopt and has_no_adopt:
6777 raise errors.OpPrereqError("Either all disks are adopted or none is",
6780 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
6781 raise errors.OpPrereqError("Disk adoption is not supported for the"
6782 " '%s' disk template" %
6783 self.op.disk_template,
6785 if self.op.iallocator is not None:
6786 raise errors.OpPrereqError("Disk adoption not allowed with an"
6787 " iallocator script", errors.ECODE_INVAL)
6788 if self.op.mode == constants.INSTANCE_IMPORT:
6789 raise errors.OpPrereqError("Disk adoption not allowed for"
6790 " instance import", errors.ECODE_INVAL)
6792 self.adopt_disks = has_adopt
6794 # instance name verification
6795 if self.op.name_check:
6796 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
6797 self.op.instance_name = self.hostname1.name
6798 # used in CheckPrereq for ip ping check
6799 self.check_ip = self.hostname1.ip
6801 self.check_ip = None
6803 # file storage checks
6804 if (self.op.file_driver and
6805 not self.op.file_driver in constants.FILE_DRIVER):
6806 raise errors.OpPrereqError("Invalid file driver name '%s'" %
6807 self.op.file_driver, errors.ECODE_INVAL)
6809 if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
6810 raise errors.OpPrereqError("File storage directory path not absolute",
6813 ### Node/iallocator related checks
6814 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
6816 if self.op.pnode is not None:
6817 if self.op.disk_template in constants.DTS_NET_MIRROR:
6818 if self.op.snode is None:
6819 raise errors.OpPrereqError("The networked disk templates need"
6820 " a mirror node", errors.ECODE_INVAL)
6822 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
6824 self.op.snode = None
6826 self._cds = _GetClusterDomainSecret()
6828 if self.op.mode == constants.INSTANCE_IMPORT:
6829 # On import force_variant must be True, because if we forced it at
6830 # initial install, our only chance when importing it back is that it
6831 # works again!
6832 self.op.force_variant = True
6834 if self.op.no_install:
6835 self.LogInfo("No-installation mode has no effect during import")
6837 elif self.op.mode == constants.INSTANCE_CREATE:
6838 if self.op.os_type is None:
6839 raise errors.OpPrereqError("No guest OS specified",
6841 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
6842 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
6843 " installation" % self.op.os_type,
6845 if self.op.disk_template is None:
6846 raise errors.OpPrereqError("No disk template specified",
6849 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
6850 # Check handshake to ensure both clusters have the same domain secret
6851 src_handshake = self.op.source_handshake
6852 if not src_handshake:
6853 raise errors.OpPrereqError("Missing source handshake",
6856 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
6859 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
6862 # Load and check source CA
6863 self.source_x509_ca_pem = self.op.source_x509_ca
6864 if not self.source_x509_ca_pem:
6865 raise errors.OpPrereqError("Missing source X509 CA",
6869 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
6871 except OpenSSL.crypto.Error, err:
6872 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
6873 (err, ), errors.ECODE_INVAL)
6875 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
6876 if errcode is not None:
6877 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
6880 self.source_x509_ca = cert
6882 src_instance_name = self.op.source_instance_name
6883 if not src_instance_name:
6884 raise errors.OpPrereqError("Missing source instance name",
6887 self.source_instance_name = \
6888 netutils.GetHostname(name=src_instance_name).name
6891 raise errors.OpPrereqError("Invalid instance creation mode %r" %
6892 self.op.mode, errors.ECODE_INVAL)
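# Illustrative summary (not part of the original module) of the mode-specific
# checks above:
#   INSTANCE_CREATE        -> os_type and disk_template must be given (and the
#                             OS must not be blacklisted)
#   INSTANCE_IMPORT        -> force_variant is forced to True and no_install
#                             has no effect
#   INSTANCE_REMOTE_IMPORT -> source_handshake, source_x509_ca and
#                             source_instance_name are all required
# Any other mode is rejected with OpPrereqError.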
6894 def ExpandNames(self):
6895 """ExpandNames for CreateInstance.
6897 Figure out the right locks for instance creation.
6900 self.needed_locks = {}
6902 instance_name = self.op.instance_name
6903 # this is just a preventive check, but someone might still add this
6904 # instance in the meantime, and creation will fail at lock-add time
6905 if instance_name in self.cfg.GetInstanceList():
6906 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6907 instance_name, errors.ECODE_EXISTS)
6909 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
6911 if self.op.iallocator:
6912 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6914 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
6915 nodelist = [self.op.pnode]
6916 if self.op.snode is not None:
6917 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
6918 nodelist.append(self.op.snode)
6919 self.needed_locks[locking.LEVEL_NODE] = nodelist
6921 # in case of import lock the source node too
6922 if self.op.mode == constants.INSTANCE_IMPORT:
6923 src_node = self.op.src_node
6924 src_path = self.op.src_path
6926 if src_path is None:
6927 self.op.src_path = src_path = self.op.instance_name
6929 if src_node is None:
6930 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6931 self.op.src_node = None
6932 if os.path.isabs(src_path):
6933 raise errors.OpPrereqError("Importing an instance from an absolute"
6934 " path requires a source node option.",
6937 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
6938 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
6939 self.needed_locks[locking.LEVEL_NODE].append(src_node)
6940 if not os.path.isabs(src_path):
6941 self.op.src_path = src_path = \
6942 utils.PathJoin(constants.EXPORT_DIR, src_path)
6944 def _RunAllocator(self):
6945 """Run the allocator based on input opcode.
6948 nics = [n.ToDict() for n in self.nics]
6949 ial = IAllocator(self.cfg, self.rpc,
6950 mode=constants.IALLOCATOR_MODE_ALLOC,
6951 name=self.op.instance_name,
6952 disk_template=self.op.disk_template,
6955 vcpus=self.be_full[constants.BE_VCPUS],
6956 mem_size=self.be_full[constants.BE_MEMORY],
6959 hypervisor=self.op.hypervisor,
6962 ial.Run(self.op.iallocator)
6965 raise errors.OpPrereqError("Can't compute nodes using"
6966 " iallocator '%s': %s" %
6967 (self.op.iallocator, ial.info),
6969 if len(ial.result) != ial.required_nodes:
6970 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
6971 " of nodes (%s), required %s" %
6972 (self.op.iallocator, len(ial.result),
6973 ial.required_nodes), errors.ECODE_FAULT)
6974 self.op.pnode = ial.result[0]
6975 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
6976 self.op.instance_name, self.op.iallocator,
6977 utils.CommaJoin(ial.result))
6978 if ial.required_nodes == 2:
6979 self.op.snode = ial.result[1]
6981 def BuildHooksEnv(self):
6984 This runs on master, primary and secondary nodes of the instance.
6988 "ADD_MODE": self.op.mode,
6990 if self.op.mode == constants.INSTANCE_IMPORT:
6991 env["SRC_NODE"] = self.op.src_node
6992 env["SRC_PATH"] = self.op.src_path
6993 env["SRC_IMAGES"] = self.src_images
6995 env.update(_BuildInstanceHookEnv(
6996 name=self.op.instance_name,
6997 primary_node=self.op.pnode,
6998 secondary_nodes=self.secondaries,
6999 status=self.op.start,
7000 os_type=self.op.os_type,
7001 memory=self.be_full[constants.BE_MEMORY],
7002 vcpus=self.be_full[constants.BE_VCPUS],
7003 nics=_NICListToTuple(self, self.nics),
7004 disk_template=self.op.disk_template,
7005 disks=[(d["size"], d["mode"]) for d in self.disks],
7008 hypervisor_name=self.op.hypervisor,
7011 nl = ([self.cfg.GetMasterNode(), self.op.pnode] +
7015 def _ReadExportInfo(self):
7016 """Reads the export information from disk.
7018 It will override the opcode source node and path with the actual
7019 information, if these two were not specified before.
7021 @return: the export information
7024 assert self.op.mode == constants.INSTANCE_IMPORT
7026 src_node = self.op.src_node
7027 src_path = self.op.src_path
7029 if src_node is None:
7030 locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
7031 exp_list = self.rpc.call_export_list(locked_nodes)
7033 for node in exp_list:
7034 if exp_list[node].fail_msg:
7036 if src_path in exp_list[node].payload:
7038 self.op.src_node = src_node = node
7039 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
7043 raise errors.OpPrereqError("No export found for relative path %s" %
7044 src_path, errors.ECODE_INVAL)
7046 _CheckNodeOnline(self, src_node)
7047 result = self.rpc.call_export_info(src_node, src_path)
7048 result.Raise("No export or invalid export found in dir %s" % src_path)
7050 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
7051 if not export_info.has_section(constants.INISECT_EXP):
7052 raise errors.ProgrammerError("Corrupted export config",
7053 errors.ECODE_ENVIRON)
7055 ei_version = export_info.get(constants.INISECT_EXP, "version")
7056 if (int(ei_version) != constants.EXPORT_VERSION):
7057 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
7058 (ei_version, constants.EXPORT_VERSION),
7059 errors.ECODE_ENVIRON)
7062 def _ReadExportParams(self, einfo):
7063 """Use export parameters as defaults.
7065 In case the opcode doesn't specify (as in override) some instance
7066 parameters, then try to use them from the export information, if
7070 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
7072 if self.op.disk_template is None:
7073 if einfo.has_option(constants.INISECT_INS, "disk_template"):
7074 self.op.disk_template = einfo.get(constants.INISECT_INS,
7077 raise errors.OpPrereqError("No disk template specified and the export"
7078 " is missing the disk_template information",
7081 if not self.op.disks:
7082 if einfo.has_option(constants.INISECT_INS, "disk_count"):
7084 # TODO: import the disk iv_name too
7085 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
7086 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
7087 disks.append({"size": disk_sz})
7088 self.op.disks = disks
7090 raise errors.OpPrereqError("No disk info specified and the export"
7091 " is missing the disk information",
7094 if (not self.op.nics and
7095 einfo.has_option(constants.INISECT_INS, "nic_count")):
7097 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
7099 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
7100 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
7105 if (self.op.hypervisor is None and
7106 einfo.has_option(constants.INISECT_INS, "hypervisor")):
7107 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
7108 if einfo.has_section(constants.INISECT_HYP):
7109 # use the export parameters but do not override the ones
7110 # specified by the user
7111 for name, value in einfo.items(constants.INISECT_HYP):
7112 if name not in self.op.hvparams:
7113 self.op.hvparams[name] = value
7115 if einfo.has_section(constants.INISECT_BEP):
7116 # use the parameters, without overriding
7117 for name, value in einfo.items(constants.INISECT_BEP):
7118 if name not in self.op.beparams:
7119 self.op.beparams[name] = value
7121 # try to read the parameters old style, from the main section
7122 for name in constants.BES_PARAMETERS:
7123 if (name not in self.op.beparams and
7124 einfo.has_option(constants.INISECT_INS, name)):
7125 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
7127 if einfo.has_section(constants.INISECT_OSP):
7128 # use the parameters, without overriding
7129 for name, value in einfo.items(constants.INISECT_OSP):
7130 if name not in self.op.osparams:
7131 self.op.osparams[name] = value
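# Illustrative sketch (not part of the original module): if the opcode left
# the disks unspecified, an export whose instance section contains
#   disk_count = 2, disk0_size = 1024, disk1_size = 2048
# makes the code above fall back to
#   self.op.disks = [{"size": 1024}, {"size": 2048}]
# The same "only fill in what the user did not set" rule is applied to the
# hypervisor, backend and OS parameter sections.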
7133 def _RevertToDefaults(self, cluster):
7134 """Revert the instance parameters to the default values.
7138 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
7139 for name in self.op.hvparams.keys():
7140 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
7141 del self.op.hvparams[name]
7143 be_defs = cluster.SimpleFillBE({})
7144 for name in self.op.beparams.keys():
7145 if name in be_defs and be_defs[name] == self.op.beparams[name]:
7146 del self.op.beparams[name]
7148 nic_defs = cluster.SimpleFillNIC({})
7149 for nic in self.op.nics:
7150 for name in constants.NICS_PARAMETERS:
7151 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
7154 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
7155 for name in self.op.osparams.keys():
7156 if name in os_defs and os_defs[name] == self.op.osparams[name]:
7157 del self.op.osparams[name]
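# Illustrative sketch (not part of the original module): with
# identify_defaults enabled, any hvparam, beparam, nicparam or osparam whose
# value is identical to the current cluster default is dropped above, so an
# imported instance keeps tracking future changes of the cluster defaults and
# only genuinely overridden values remain in its configuration.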
7159 def CheckPrereq(self):
7160 """Check prerequisites.
7163 if self.op.mode == constants.INSTANCE_IMPORT:
7164 export_info = self._ReadExportInfo()
7165 self._ReadExportParams(export_info)
7167 _CheckDiskTemplate(self.op.disk_template)
7169 if (not self.cfg.GetVGName() and
7170 self.op.disk_template not in constants.DTS_NOT_LVM):
7171 raise errors.OpPrereqError("Cluster does not support lvm-based"
7172 " instances", errors.ECODE_STATE)
7174 if self.op.hypervisor is None:
7175 self.op.hypervisor = self.cfg.GetHypervisorType()
7177 cluster = self.cfg.GetClusterInfo()
7178 enabled_hvs = cluster.enabled_hypervisors
7179 if self.op.hypervisor not in enabled_hvs:
7180 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
7181 " cluster (%s)" % (self.op.hypervisor,
7182 ",".join(enabled_hvs)),
7185 # check hypervisor parameter syntax (locally)
7186 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
7187 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
7189 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
7190 hv_type.CheckParameterSyntax(filled_hvp)
7191 self.hv_full = filled_hvp
7192 # check that we don't specify global parameters on an instance
7193 _CheckGlobalHvParams(self.op.hvparams)
7195 # fill and remember the beparams dict
7196 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
7197 self.be_full = cluster.SimpleFillBE(self.op.beparams)
7199 # build os parameters
7200 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
7202 # now that hvp/bep are in final format, let's reset to defaults,
7204 if self.op.identify_defaults:
7205 self._RevertToDefaults(cluster)
7209 for idx, nic in enumerate(self.op.nics):
7210 nic_mode_req = nic.get("mode", None)
7211 nic_mode = nic_mode_req
7212 if nic_mode is None:
7213 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
7215 # in routed mode, for the first nic, the default ip is 'auto'
7216 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
7217 default_ip_mode = constants.VALUE_AUTO
7219 default_ip_mode = constants.VALUE_NONE
7221 # ip validity checks
7222 ip = nic.get("ip", default_ip_mode)
7223 if ip is None or ip.lower() == constants.VALUE_NONE:
7225 elif ip.lower() == constants.VALUE_AUTO:
7226 if not self.op.name_check:
7227 raise errors.OpPrereqError("IP address set to auto but name checks"
7228 " have been skipped",
7230 nic_ip = self.hostname1.ip
7232 if not netutils.IPAddress.IsValid(ip):
7233 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
7237 # TODO: check the ip address for uniqueness
7238 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
7239 raise errors.OpPrereqError("Routed nic mode requires an ip address",
7242 # MAC address verification
7243 mac = nic.get("mac", constants.VALUE_AUTO)
7244 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7245 mac = utils.NormalizeAndValidateMac(mac)
7248 self.cfg.ReserveMAC(mac, self.proc.GetECId())
7249 except errors.ReservationError:
7250 raise errors.OpPrereqError("MAC address %s already in use"
7251 " in cluster" % mac,
7252 errors.ECODE_NOTUNIQUE)
7254 # bridge verification
7255 bridge = nic.get("bridge", None)
7256 link = nic.get("link", None)
7258 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
7259 " at the same time", errors.ECODE_INVAL)
7260 elif bridge and nic_mode == constants.NIC_MODE_ROUTED:
7261 raise errors.OpPrereqError("Cannot pass 'bridge' on a routed nic",
7268 nicparams[constants.NIC_MODE] = nic_mode_req
7270 nicparams[constants.NIC_LINK] = link
7272 check_params = cluster.SimpleFillNIC(nicparams)
7273 objects.NIC.CheckParameterSyntax(check_params)
7274 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
7276 # disk checks/pre-build
7278 for disk in self.op.disks:
7279 mode = disk.get("mode", constants.DISK_RDWR)
7280 if mode not in constants.DISK_ACCESS_SET:
7281 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
7282 mode, errors.ECODE_INVAL)
7283 size = disk.get("size", None)
7285 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
7288 except (TypeError, ValueError):
7289 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
7291 new_disk = {"size": size, "mode": mode}
7293 new_disk["adopt"] = disk["adopt"]
7294 self.disks.append(new_disk)
7296 if self.op.mode == constants.INSTANCE_IMPORT:
7298 # Check that the new instance doesn't have less disks than the export
7299 instance_disks = len(self.disks)
7300 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
7301 if instance_disks < export_disks:
7302 raise errors.OpPrereqError("Not enough disks to import."
7303 " (instance: %d, export: %d)" %
7304 (instance_disks, export_disks),
7308 for idx in range(export_disks):
7309 option = 'disk%d_dump' % idx
7310 if export_info.has_option(constants.INISECT_INS, option):
7311 # FIXME: are the old os-es, disk sizes, etc. useful?
7312 export_name = export_info.get(constants.INISECT_INS, option)
7313 image = utils.PathJoin(self.op.src_path, export_name)
7314 disk_images.append(image)
7316 disk_images.append(False)
7318 self.src_images = disk_images
7320 old_name = export_info.get(constants.INISECT_INS, 'name')
7322 exp_nic_count = export_info.getint(constants.INISECT_INS, 'nic_count')
7323 except (TypeError, ValueError), err:
7324 raise errors.OpPrereqError("Invalid export file, nic_count is not"
7325 " an integer: %s" % str(err),
7327 if self.op.instance_name == old_name:
7328 for idx, nic in enumerate(self.nics):
7329 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
7330 nic_mac_ini = 'nic%d_mac' % idx
7331 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
7333 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
7335 # ip ping checks (we use the same ip that was resolved in ExpandNames)
7336 if self.op.ip_check:
7337 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
7338 raise errors.OpPrereqError("IP %s of instance %s already in use" %
7339 (self.check_ip, self.op.instance_name),
7340 errors.ECODE_NOTUNIQUE)
7342 #### mac address generation
7343 # By generating here the mac address both the allocator and the hooks get
7344 # the real final mac address rather than the 'auto' or 'generate' value.
7345 # There is a race condition between the generation and the instance object
7346 # creation, which means that we know the mac is valid now, but we're not
7347 # sure it will be when we actually add the instance. If things go bad
7348 # adding the instance will abort because of a duplicate mac, and the
7349 # creation job will fail.
7350 for nic in self.nics:
7351 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7352 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
7356 if self.op.iallocator is not None:
7357 self._RunAllocator()
7359 #### node related checks
7361 # check primary node
7362 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
7363 assert self.pnode is not None, \
7364 "Cannot retrieve locked node %s" % self.op.pnode
7366 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
7367 pnode.name, errors.ECODE_STATE)
7369 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
7370 pnode.name, errors.ECODE_STATE)
7371 if not pnode.vm_capable:
7372 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
7373 " '%s'" % pnode.name, errors.ECODE_STATE)
7375 self.secondaries = []
7377 # mirror node verification
7378 if self.op.disk_template in constants.DTS_NET_MIRROR:
7379 if self.op.snode == pnode.name:
7380 raise errors.OpPrereqError("The secondary node cannot be the"
7381 " primary node.", errors.ECODE_INVAL)
7382 _CheckNodeOnline(self, self.op.snode)
7383 _CheckNodeNotDrained(self, self.op.snode)
7384 _CheckNodeVmCapable(self, self.op.snode)
7385 self.secondaries.append(self.op.snode)
7387 nodenames = [pnode.name] + self.secondaries
7389 req_size = _ComputeDiskSize(self.op.disk_template,
7392 # Check lv size requirements, if not adopting
7393 if req_size is not None and not self.adopt_disks:
7394 _CheckNodesFreeDisk(self, nodenames, req_size)
7396 if self.adopt_disks: # instead, we must check the adoption data
7397 all_lvs = set([i["adopt"] for i in self.disks])
7398 if len(all_lvs) != len(self.disks):
7399 raise errors.OpPrereqError("Duplicate volume names given for adoption",
7401 for lv_name in all_lvs:
7403 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
7404 except errors.ReservationError:
7405 raise errors.OpPrereqError("LV named %s used by another instance" %
7406 lv_name, errors.ECODE_NOTUNIQUE)
7408 node_lvs = self.rpc.call_lv_list([pnode.name],
7409 self.cfg.GetVGName())[pnode.name]
7410 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
7411 node_lvs = node_lvs.payload
7412 delta = all_lvs.difference(node_lvs.keys())
7414 raise errors.OpPrereqError("Missing logical volume(s): %s" %
7415 utils.CommaJoin(delta),
7417 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
7419 raise errors.OpPrereqError("Online logical volumes found, cannot"
7420 " adopt: %s" % utils.CommaJoin(online_lvs),
7422 # update the size of disk based on what is found
7423 for dsk in self.disks:
7424 dsk["size"] = int(float(node_lvs[dsk["adopt"]][0]))
7426 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
7428 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
7429 # check OS parameters (remotely)
7430 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
7432 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
7434 # memory check on primary node
7436 _CheckNodeFreeMemory(self, self.pnode.name,
7437 "creating instance %s" % self.op.instance_name,
7438 self.be_full[constants.BE_MEMORY],
7441 self.dry_run_result = list(nodenames)
7443 def Exec(self, feedback_fn):
7444 """Create and add the instance to the cluster.
7447 instance = self.op.instance_name
7448 pnode_name = self.pnode.name
7450 ht_kind = self.op.hypervisor
7451 if ht_kind in constants.HTS_REQ_PORT:
7452 network_port = self.cfg.AllocatePort()
7456 if constants.ENABLE_FILE_STORAGE:
7457 # this is needed because os.path.join does not accept None arguments
7458 if self.op.file_storage_dir is None:
7459 string_file_storage_dir = ""
7461 string_file_storage_dir = self.op.file_storage_dir
7463 # build the full file storage dir path
7464 file_storage_dir = utils.PathJoin(self.cfg.GetFileStorageDir(),
7465 string_file_storage_dir, instance)
7467 file_storage_dir = ""
7469 disks = _GenerateDiskTemplate(self,
7470 self.op.disk_template,
7471 instance, pnode_name,
7475 self.op.file_driver,
7478 iobj = objects.Instance(name=instance, os=self.op.os_type,
7479 primary_node=pnode_name,
7480 nics=self.nics, disks=disks,
7481 disk_template=self.op.disk_template,
7483 network_port=network_port,
7484 beparams=self.op.beparams,
7485 hvparams=self.op.hvparams,
7486 hypervisor=self.op.hypervisor,
7487 osparams=self.op.osparams,
7490 if self.adopt_disks:
7491 # rename LVs to the newly-generated names; we need to construct
7492 # 'fake' LV disks with the old data, plus the new unique_id
7493 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
7495 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
7496 rename_to.append(t_dsk.logical_id)
7497 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk["adopt"])
7498 self.cfg.SetDiskID(t_dsk, pnode_name)
7499 result = self.rpc.call_blockdev_rename(pnode_name,
7500 zip(tmp_disks, rename_to))
7501 result.Raise("Failed to rename adopted LVs")
7503 feedback_fn("* creating instance disks...")
7505 _CreateDisks(self, iobj)
7506 except errors.OpExecError:
7507 self.LogWarning("Device creation failed, reverting...")
7509 _RemoveDisks(self, iobj)
7511 self.cfg.ReleaseDRBDMinors(instance)
7514 if self.cfg.GetClusterInfo().prealloc_wipe_disks:
7515 feedback_fn("* wiping instance disks...")
7517 _WipeDisks(self, iobj)
7518 except errors.OpExecError:
7519 self.LogWarning("Device wiping failed, reverting...")
7521 _RemoveDisks(self, iobj)
7523 self.cfg.ReleaseDRBDMinors(instance)
7526 feedback_fn("adding instance %s to cluster config" % instance)
7528 self.cfg.AddInstance(iobj, self.proc.GetECId())
7530 # Declare that we don't want to remove the instance lock anymore, as we've
7531 # added the instance to the config
7532 del self.remove_locks[locking.LEVEL_INSTANCE]
7533 # Unlock all the nodes
7534 if self.op.mode == constants.INSTANCE_IMPORT:
7535 nodes_keep = [self.op.src_node]
7536 nodes_release = [node for node in self.acquired_locks[locking.LEVEL_NODE]
7537 if node != self.op.src_node]
7538 self.context.glm.release(locking.LEVEL_NODE, nodes_release)
7539 self.acquired_locks[locking.LEVEL_NODE] = nodes_keep
7540 else:
7541 self.context.glm.release(locking.LEVEL_NODE)
7542 del self.acquired_locks[locking.LEVEL_NODE]
7544 if self.op.wait_for_sync:
7545 disk_abort = not _WaitForSync(self, iobj)
7546 elif iobj.disk_template in constants.DTS_NET_MIRROR:
7547 # make sure the disks are not degraded (still sync-ing is ok)
7549 feedback_fn("* checking mirrors status")
7550 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
7551 else:
7552 disk_abort = False
7554 if disk_abort:
7555 _RemoveDisks(self, iobj)
7556 self.cfg.RemoveInstance(iobj.name)
7557 # Make sure the instance lock gets removed
7558 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
7559 raise errors.OpExecError("There are some degraded disks for"
7560 " this instance")
7562 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
7563 if self.op.mode == constants.INSTANCE_CREATE:
7564 if not self.op.no_install:
7565 feedback_fn("* running the instance OS create scripts...")
7566 # FIXME: pass debug option from opcode to backend
7567 result = self.rpc.call_instance_os_add(pnode_name, iobj, False,
7568 self.op.debug_level)
7569 result.Raise("Could not add os for instance %s"
7570 " on node %s" % (instance, pnode_name))
7572 elif self.op.mode == constants.INSTANCE_IMPORT:
7573 feedback_fn("* running the instance OS import scripts...")
7575 transfers = []
7577 for idx, image in enumerate(self.src_images):
7578 if not image:
7579 continue
7581 # FIXME: pass debug option from opcode to backend
7582 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
7583 constants.IEIO_FILE, (image, ),
7584 constants.IEIO_SCRIPT,
7585 (iobj.disks[idx], idx),
7586 None)
7587 transfers.append(dt)
7589 import_result = \
7590 masterd.instance.TransferInstanceData(self, feedback_fn,
7591 self.op.src_node, pnode_name,
7592 self.pnode.secondary_ip,
7593 iobj, transfers)
7594 if not compat.all(import_result):
7595 self.LogWarning("Some disks for instance %s on node %s were not"
7596 " imported successfully" % (instance, pnode_name))
7598 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7599 feedback_fn("* preparing remote import...")
7600 connect_timeout = constants.RIE_CONNECT_TIMEOUT
7601 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
7603 disk_results = masterd.instance.RemoteImport(self, feedback_fn, iobj,
7604 self.source_x509_ca,
7605 self._cds, timeouts)
7606 if not compat.all(disk_results):
7607 # TODO: Should the instance still be started, even if some disks
7608 # failed to import (valid for local imports, too)?
7609 self.LogWarning("Some disks for instance %s on node %s were not"
7610 " imported successfully" % (instance, pnode_name))
7612 # Run rename script on newly imported instance
7613 assert iobj.name == instance
7614 feedback_fn("Running rename script for %s" % instance)
7615 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
7616 self.source_instance_name,
7617 self.op.debug_level)
7618 if result.fail_msg:
7619 self.LogWarning("Failed to run rename script for %s on node"
7620 " %s: %s" % (instance, pnode_name, result.fail_msg))
7622 else:
7623 # also checked in the prereq part
7624 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
7625 % self.op.mode)
7627 if self.op.start:
7628 iobj.admin_up = True
7629 self.cfg.Update(iobj, feedback_fn)
7630 logging.info("Starting instance %s on node %s", instance, pnode_name)
7631 feedback_fn("* starting instance...")
7632 result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
7633 result.Raise("Could not start instance")
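7634 # the opcode's result is the list of nodes the new instance lives on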
7635 return list(iobj.all_nodes)
7638 class LUConnectConsole(NoHooksLU):
7639 """Connect to an instance's console.
7641 This is somewhat special in that it returns the command line that
7642 you need to run on the master node in order to connect to the
7643 console.
7651 def ExpandNames(self):
7652 self._ExpandAndLockInstance()
7654 def CheckPrereq(self):
7655 """Check prerequisites.
7657 This checks that the instance is in the cluster.
7660 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7661 assert self.instance is not None, \
7662 "Cannot retrieve locked instance %s" % self.op.instance_name
7663 _CheckNodeOnline(self, self.instance.primary_node)
7665 def Exec(self, feedback_fn):
7666 """Connect to the console of an instance
7669 instance = self.instance
7670 node = instance.primary_node
7672 node_insts = self.rpc.call_instance_list([node],
7673 [instance.hypervisor])[node]
7674 node_insts.Raise("Can't get node information from %s" % node)
7676 if instance.name not in node_insts.payload:
7677 if instance.admin_up:
7678 state = "ERROR_down"
7679 else:
7680 state = "ADMIN_down"
7681 raise errors.OpExecError("Instance %s is not running (state %s)" %
7682 (instance.name, state))
7684 logging.debug("Connecting to console of %s on %s", instance.name, node)
7686 hyper = hypervisor.GetHypervisor(instance.hypervisor)
7687 cluster = self.cfg.GetClusterInfo()
7688 # beparams and hvparams are passed separately, to avoid editing the
7689 # instance and then saving the defaults in the instance itself.
7690 hvparams = cluster.FillHV(instance)
7691 beparams = cluster.FillBE(instance)
7692 console_cmd = hyper.GetShellCommandForConsole(instance, hvparams, beparams)
7695 return self.ssh.BuildCmd(node, "root", console_cmd, batch=True, tty=True)
7698 class LUReplaceDisks(LogicalUnit):
7699 """Replace the disks of an instance.
7702 HPATH = "mirrors-replace"
7703 HTYPE = constants.HTYPE_INSTANCE
7706 ("mode", ht.NoDefault, ht.TElemOf(constants.REPLACE_MODES)),
7707 ("disks", ht.EmptyList, ht.TListOf(ht.TPositiveInt)),
7708 ("remote_node", None, ht.TMaybeString),
7709 ("iallocator", None, ht.TMaybeString),
7710 ("early_release", False, ht.TBool),
7714 def CheckArguments(self):
7715 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
7716 self.op.iallocator)
7718 def ExpandNames(self):
7719 self._ExpandAndLockInstance()
7721 if self.op.iallocator is not None:
7722 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7724 elif self.op.remote_node is not None:
7725 remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
7726 self.op.remote_node = remote_node
7728 # Warning: do not remove the locking of the new secondary here
7729 # unless DRBD8.AddChildren is changed to work in parallel;
7730 # currently it doesn't since parallel invocations of
7731 # FindUnusedMinor will conflict
7732 self.needed_locks[locking.LEVEL_NODE] = [remote_node]
7733 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
7735 else:
7736 self.needed_locks[locking.LEVEL_NODE] = []
7737 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7739 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
7740 self.op.iallocator, self.op.remote_node,
7741 self.op.disks, False, self.op.early_release)
7743 self.tasklets = [self.replacer]
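7744 # all the real work is delegated to the TLReplaceDisks tasklet created above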
7745 def DeclareLocks(self, level):
7746 # If we're not already locking all nodes in the set we have to declare the
7747 # instance's primary/secondary nodes.
7748 if (level == locking.LEVEL_NODE and
7749 self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET):
7750 self._LockInstancesNodes()
7752 def BuildHooksEnv(self):
7753 """Build hooks env.
7755 This runs on the master, the primary and all the secondaries.
7757 """
7758 instance = self.replacer.instance
7759 env = {
7760 "MODE": self.op.mode,
7761 "NEW_SECONDARY": self.op.remote_node,
7762 "OLD_SECONDARY": instance.secondary_nodes[0],
7763 }
7764 env.update(_BuildInstanceHookEnvByObject(self, instance))
7765 nl = [
7766 self.cfg.GetMasterNode(),
7767 instance.primary_node,
7768 ]
7769 if self.op.remote_node is not None:
7770 nl.append(self.op.remote_node)
7772 return env, nl, nl
7774 class TLReplaceDisks(Tasklet):
7775 """Replaces disks for an instance.
7777 Note: Locking is not within the scope of this class.
7780 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
7781 disks, delay_iallocator, early_release):
7782 """Initializes this class.
7785 Tasklet.__init__(self, lu)
7787 # Parameters
7788 self.instance_name = instance_name
7789 self.mode = mode
7790 self.iallocator_name = iallocator_name
7791 self.remote_node = remote_node
7792 self.disks = disks
7793 self.delay_iallocator = delay_iallocator
7794 self.early_release = early_release
7797 self.instance = None
7798 self.new_node = None
7799 self.target_node = None
7800 self.other_node = None
7801 self.remote_node_info = None
7802 self.node_secondary_ip = None
7804 @staticmethod
7805 def CheckArguments(mode, remote_node, iallocator):
7806 """Helper function for users of this class.
7809 # check for valid parameter combination
7810 if mode == constants.REPLACE_DISK_CHG:
7811 if remote_node is None and iallocator is None:
7812 raise errors.OpPrereqError("When changing the secondary either an"
7813 " iallocator script must be used or the"
7814 " new node given", errors.ECODE_INVAL)
7816 if remote_node is not None and iallocator is not None:
7817 raise errors.OpPrereqError("Give either the iallocator or the new"
7818 " secondary, not both", errors.ECODE_INVAL)
7820 elif remote_node is not None or iallocator is not None:
7821 # Not replacing the secondary
7822 raise errors.OpPrereqError("The iallocator and new node options can"
7823 " only be used when changing the"
7824 " secondary node", errors.ECODE_INVAL)
7826 @staticmethod
7827 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
7828 """Compute a new secondary node using an IAllocator.
7831 ial = IAllocator(lu.cfg, lu.rpc,
7832 mode=constants.IALLOCATOR_MODE_RELOC,
7833 name=instance_name,
7834 relocate_from=relocate_from)
7836 ial.Run(iallocator_name)
7838 if not ial.success:
7839 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
7840 " %s" % (iallocator_name, ial.info),
7841 errors.ECODE_NORES)
7843 if len(ial.result) != ial.required_nodes:
7844 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7845 " of nodes (%s), required %s" %
7847 len(ial.result), ial.required_nodes),
7850 remote_node_name = ial.result[0]
7852 lu.LogInfo("Selected new secondary for instance '%s': %s",
7853 instance_name, remote_node_name)
7855 return remote_node_name
7857 def _FindFaultyDisks(self, node_name):
7858 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
7859 node_name, True)
7861 def CheckPrereq(self):
7862 """Check prerequisites.
7864 This checks that the instance is in the cluster.
7867 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
7868 assert instance is not None, \
7869 "Cannot retrieve locked instance %s" % self.instance_name
7871 if instance.disk_template != constants.DT_DRBD8:
7872 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
7873 " instances", errors.ECODE_INVAL)
7875 if len(instance.secondary_nodes) != 1:
7876 raise errors.OpPrereqError("The instance has a strange layout,"
7877 " expected one secondary but found %d" %
7878 len(instance.secondary_nodes),
7881 if not self.delay_iallocator:
7882 self._CheckPrereq2()
7884 def _CheckPrereq2(self):
7885 """Check prerequisites, second part.
7887 This function should always be part of CheckPrereq. It was separated and is
7888 now called from Exec because during node evacuation iallocator was only
7889 called with an unmodified cluster model, not taking planned changes into
7890 account.
7893 instance = self.instance
7894 secondary_node = instance.secondary_nodes[0]
7896 if self.iallocator_name is None:
7897 remote_node = self.remote_node
7898 else:
7899 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
7900 instance.name, instance.secondary_nodes)
7902 if remote_node is not None:
7903 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
7904 assert self.remote_node_info is not None, \
7905 "Cannot retrieve locked node %s" % remote_node
7906 else:
7907 self.remote_node_info = None
7909 if remote_node == self.instance.primary_node:
7910 raise errors.OpPrereqError("The specified node is the primary node of"
7911 " the instance.", errors.ECODE_INVAL)
7913 if remote_node == secondary_node:
7914 raise errors.OpPrereqError("The specified node is already the"
7915 " secondary node of the instance.",
7918 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
7919 constants.REPLACE_DISK_CHG):
7920 raise errors.OpPrereqError("Cannot specify disks to be replaced",
7923 if self.mode == constants.REPLACE_DISK_AUTO:
7924 faulty_primary = self._FindFaultyDisks(instance.primary_node)
7925 faulty_secondary = self._FindFaultyDisks(secondary_node)
7927 if faulty_primary and faulty_secondary:
7928 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
7929 " one node and can not be repaired"
7930 " automatically" % self.instance_name,
7934 self.disks = faulty_primary
7935 self.target_node = instance.primary_node
7936 self.other_node = secondary_node
7937 check_nodes = [self.target_node, self.other_node]
7938 elif faulty_secondary:
7939 self.disks = faulty_secondary
7940 self.target_node = secondary_node
7941 self.other_node = instance.primary_node
7942 check_nodes = [self.target_node, self.other_node]
7943 else:
7944 self.disks = []
7945 check_nodes = []
7947 else:
7948 # Non-automatic modes
7949 if self.mode == constants.REPLACE_DISK_PRI:
7950 self.target_node = instance.primary_node
7951 self.other_node = secondary_node
7952 check_nodes = [self.target_node, self.other_node]
7954 elif self.mode == constants.REPLACE_DISK_SEC:
7955 self.target_node = secondary_node
7956 self.other_node = instance.primary_node
7957 check_nodes = [self.target_node, self.other_node]
7959 elif self.mode == constants.REPLACE_DISK_CHG:
7960 self.new_node = remote_node
7961 self.other_node = instance.primary_node
7962 self.target_node = secondary_node
7963 check_nodes = [self.new_node, self.other_node]
7965 _CheckNodeNotDrained(self.lu, remote_node)
7966 _CheckNodeVmCapable(self.lu, remote_node)
7968 old_node_info = self.cfg.GetNodeInfo(secondary_node)
7969 assert old_node_info is not None
7970 if old_node_info.offline and not self.early_release:
7971 # doesn't make sense to delay the release
7972 self.early_release = True
7973 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
7974 " early-release mode", secondary_node)
7976 else:
7977 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
7978 self.mode)
7980 # If not specified all disks should be replaced
7981 if not self.disks:
7982 self.disks = range(len(self.instance.disks))
7984 for node in check_nodes:
7985 _CheckNodeOnline(self.lu, node)
7987 # Check whether disks are valid
7988 for disk_idx in self.disks:
7989 instance.FindDisk(disk_idx)
7991 # Get secondary node IP addresses
7992 node_2nd_ip = {}
7994 for node_name in [self.target_node, self.other_node, self.new_node]:
7995 if node_name is not None:
7996 node_2nd_ip[node_name] = self.cfg.GetNodeInfo(node_name).secondary_ip
7998 self.node_secondary_ip = node_2nd_ip
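7999 # these secondary IPs are what the DRBD disconnect/attach RPCs use later on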
8000 def Exec(self, feedback_fn):
8001 """Execute disk replacement.
8003 This dispatches the disk replacement to the appropriate handler.
8006 if self.delay_iallocator:
8007 self._CheckPrereq2()
8010 feedback_fn("No disks need replacement")
8013 feedback_fn("Replacing disk(s) %s for %s" %
8014 (utils.CommaJoin(self.disks), self.instance.name))
8016 activate_disks = (not self.instance.admin_up)
8018 # Activate the instance disks if we're replacing them on a down instance
8019 if activate_disks:
8020 _StartInstanceDisks(self.lu, self.instance, True)
8022 try:
8023 # Should we replace the secondary node?
8024 if self.new_node is not None:
8025 fn = self._ExecDrbd8Secondary
8026 else:
8027 fn = self._ExecDrbd8DiskOnly
8029 return fn(feedback_fn)
8031 finally:
8032 # Deactivate the instance disks if we're replacing them on a
8033 # down instance
8034 if activate_disks:
8035 _SafeShutdownInstanceDisks(self.lu, self.instance)
8037 def _CheckVolumeGroup(self, nodes):
8038 self.lu.LogInfo("Checking volume groups")
8040 vgname = self.cfg.GetVGName()
8042 # Make sure volume group exists on all involved nodes
8043 results = self.rpc.call_vg_list(nodes)
8045 raise errors.OpExecError("Can't list volume groups on the nodes")
8049 res.Raise("Error checking node %s" % node)
8050 if vgname not in res.payload:
8051 raise errors.OpExecError("Volume group '%s' not found on node %s" %
8054 def _CheckDisksExistence(self, nodes):
8055 # Check disk existence
8056 for idx, dev in enumerate(self.instance.disks):
8057 if idx not in self.disks:
8058 continue
8060 for node in nodes:
8061 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
8062 self.cfg.SetDiskID(dev, node)
8064 result = self.rpc.call_blockdev_find(node, dev)
8066 msg = result.fail_msg
8067 if msg or not result.payload:
8068 if not msg:
8069 msg = "disk not found"
8070 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
8071 (idx, node, msg))
8073 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
8074 for idx, dev in enumerate(self.instance.disks):
8075 if idx not in self.disks:
8078 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
8081 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
8083 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
8084 " replace disks for instance %s" %
8085 (node_name, self.instance.name))
8087 def _CreateNewStorage(self, node_name):
8088 vgname = self.cfg.GetVGName()
8089 iv_names = {}
8091 for idx, dev in enumerate(self.instance.disks):
8092 if idx not in self.disks:
8093 continue
8095 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
8097 self.cfg.SetDiskID(dev, node_name)
8099 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
8100 names = _GenerateUniqueNames(self.lu, lv_names)
8102 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
8103 logical_id=(vgname, names[0]))
8104 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
8105 logical_id=(vgname, names[1]))
8107 new_lvs = [lv_data, lv_meta]
8108 old_lvs = dev.children
8109 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
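8110 # remember the old and new LVs per device, so the later detach/rename/attach steps can find them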
8111 # we pass force_create=True to force the LVM creation
8112 for new_lv in new_lvs:
8113 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
8114 _GetInstanceInfoText(self.instance), False)
8116 return iv_names
8118 def _CheckDevices(self, node_name, iv_names):
8119 for name, (dev, _, _) in iv_names.iteritems():
8120 self.cfg.SetDiskID(dev, node_name)
8122 result = self.rpc.call_blockdev_find(node_name, dev)
8124 msg = result.fail_msg
8125 if msg or not result.payload:
8127 msg = "disk not found"
8128 raise errors.OpExecError("Can't find DRBD device %s: %s" %
8131 if result.payload.is_degraded:
8132 raise errors.OpExecError("DRBD device %s is degraded!" % name)
8134 def _RemoveOldStorage(self, node_name, iv_names):
8135 for name, (_, old_lvs, _) in iv_names.iteritems():
8136 self.lu.LogInfo("Remove logical volumes for %s" % name)
8138 for lv in old_lvs:
8139 self.cfg.SetDiskID(lv, node_name)
8141 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
8142 if msg:
8143 self.lu.LogWarning("Can't remove old LV: %s" % msg,
8144 hint="remove unused LVs manually")
8146 def _ReleaseNodeLock(self, node_name):
8147 """Releases the lock for a given node."""
8148 self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
8150 def _ExecDrbd8DiskOnly(self, feedback_fn):
8151 """Replace a disk on the primary or secondary for DRBD 8.
8153 The algorithm for replace is quite complicated:
8155 1. for each disk to be replaced:
8157 1. create new LVs on the target node with unique names
8158 1. detach old LVs from the drbd device
8159 1. rename old LVs to name_replaced.<time_t>
8160 1. rename new LVs to old LVs
8161 1. attach the new LVs (with the old names now) to the drbd device
8163 1. wait for sync across all devices
8165 1. for each modified disk:
8167 1. remove old LVs (which have the name name_replaced.<time_t>)
8169 Failures are not very well handled.
8172 steps_total = 6
8174 # Step: check device activation
8175 self.lu.LogStep(1, steps_total, "Check device existence")
8176 self._CheckDisksExistence([self.other_node, self.target_node])
8177 self._CheckVolumeGroup([self.target_node, self.other_node])
8179 # Step: check other node consistency
8180 self.lu.LogStep(2, steps_total, "Check peer consistency")
8181 self._CheckDisksConsistency(self.other_node,
8182 self.other_node == self.instance.primary_node,
8185 # Step: create new storage
8186 self.lu.LogStep(3, steps_total, "Allocate new storage")
8187 iv_names = self._CreateNewStorage(self.target_node)
8189 # Step: for each lv, detach+rename*2+attach
8190 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
8191 for dev, old_lvs, new_lvs in iv_names.itervalues():
8192 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
8194 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
8196 result.Raise("Can't detach drbd from local storage on node"
8197 " %s for device %s" % (self.target_node, dev.iv_name))
8199 #cfg.Update(instance)
8201 # ok, we created the new LVs, so now we know we have the needed
8202 # storage; as such, we proceed on the target node to rename
8203 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
8204 # using the assumption that logical_id == physical_id (which in
8205 # turn is the unique_id on that node)
8207 # FIXME(iustin): use a better name for the replaced LVs
8208 temp_suffix = int(time.time())
8209 ren_fn = lambda d, suff: (d.physical_id[0],
8210 d.physical_id[1] + "_replaced-%s" % suff)
8212 # Build the rename list based on what LVs exist on the node
8213 rename_old_to_new = []
8214 for to_ren in old_lvs:
8215 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
8216 if not result.fail_msg and result.payload:
8218 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
8220 self.lu.LogInfo("Renaming the old LVs on the target node")
8221 result = self.rpc.call_blockdev_rename(self.target_node,
8223 result.Raise("Can't rename old LVs on node %s" % self.target_node)
8225 # Now we rename the new LVs to the old LVs
8226 self.lu.LogInfo("Renaming the new LVs on the target node")
8227 rename_new_to_old = [(new, old.physical_id)
8228 for old, new in zip(old_lvs, new_lvs)]
8229 result = self.rpc.call_blockdev_rename(self.target_node,
8231 result.Raise("Can't rename new LVs on node %s" % self.target_node)
8233 for old, new in zip(old_lvs, new_lvs):
8234 new.logical_id = old.logical_id
8235 self.cfg.SetDiskID(new, self.target_node)
8237 for disk in old_lvs:
8238 disk.logical_id = ren_fn(disk, temp_suffix)
8239 self.cfg.SetDiskID(disk, self.target_node)
8241 # Now that the new lvs have the old name, we can add them to the device
8242 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
8243 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
8244 new_lvs)
8245 msg = result.fail_msg
8246 if msg:
8247 for new_lv in new_lvs:
8248 msg2 = self.rpc.call_blockdev_remove(self.target_node,
8249 new_lv).fail_msg
8250 if msg2:
8251 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
8252 hint=("cleanup manually the unused logical"
8253 " volumes"))
8254 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
8256 dev.children = new_lvs
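8257 # persist the new child LVs in the configuration before moving on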
8258 self.cfg.Update(self.instance, feedback_fn)
8260 cstep = 5
8261 if self.early_release:
8262 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8263 cstep += 1
8264 self._RemoveOldStorage(self.target_node, iv_names)
8265 # WARNING: we release both node locks here, do not do other RPCs
8266 # than WaitForSync to the primary node
8267 self._ReleaseNodeLock([self.target_node, self.other_node])
8270 # This can fail as the old devices are degraded and _WaitForSync
8271 # does a combined result over all disks, so we don't check its return value
8272 self.lu.LogStep(cstep, steps_total, "Sync devices")
8273 cstep += 1
8274 _WaitForSync(self.lu, self.instance)
8276 # Check all devices manually
8277 self._CheckDevices(self.instance.primary_node, iv_names)
8279 # Step: remove old storage
8280 if not self.early_release:
8281 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8283 self._RemoveOldStorage(self.target_node, iv_names)
8285 def _ExecDrbd8Secondary(self, feedback_fn):
8286 """Replace the secondary node for DRBD 8.
8288 The algorithm for replace is quite complicated:
8289 - for all disks of the instance:
8290 - create new LVs on the new node with same names
8291 - shutdown the drbd device on the old secondary
8292 - disconnect the drbd network on the primary
8293 - create the drbd device on the new secondary
8294 - network attach the drbd on the primary, using an artifice:
8295 the drbd code for Attach() will connect to the network if it
8296 finds a device which is connected to the good local disks but
8298 - wait for sync across all devices
8299 - remove all disks from the old secondary
8301 Failures are not very well handled.
8304 steps_total = 6
8306 # Step: check device activation
8307 self.lu.LogStep(1, steps_total, "Check device existence")
8308 self._CheckDisksExistence([self.instance.primary_node])
8309 self._CheckVolumeGroup([self.instance.primary_node])
8311 # Step: check other node consistency
8312 self.lu.LogStep(2, steps_total, "Check peer consistency")
8313 self._CheckDisksConsistency(self.instance.primary_node, True, True)
8315 # Step: create new storage
8316 self.lu.LogStep(3, steps_total, "Allocate new storage")
8317 for idx, dev in enumerate(self.instance.disks):
8318 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
8319 (self.new_node, idx))
8320 # we pass force_create=True to force LVM creation
8321 for new_lv in dev.children:
8322 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
8323 _GetInstanceInfoText(self.instance), False)
8325 # Step 4: drbd minors and drbd setups changes
8326 # after this, we must manually remove the drbd minors on both the
8327 # error and the success paths
8328 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
8329 minors = self.cfg.AllocateDRBDMinor([self.new_node
8330 for dev in self.instance.disks],
8331 self.instance.name)
8332 logging.debug("Allocated minors %r", minors)
8334 iv_names = {}
8335 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
8336 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
8337 (self.new_node, idx))
8338 # create new devices on new_node; note that we create two IDs:
8339 # one without port, so the drbd will be activated without
8340 # networking information on the new node at this stage, and one
8341 # with network, for the latter activation in step 4
8342 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
8343 if self.instance.primary_node == o_node1:
8344 p_minor = o_minor1
8345 else:
8346 assert self.instance.primary_node == o_node2, "Three-node instance?"
8347 p_minor = o_minor2
8349 new_alone_id = (self.instance.primary_node, self.new_node, None,
8350 p_minor, new_minor, o_secret)
8351 new_net_id = (self.instance.primary_node, self.new_node, o_port,
8352 p_minor, new_minor, o_secret)
8354 iv_names[idx] = (dev, dev.children, new_net_id)
8355 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
8356 new_net_id)
8357 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
8358 logical_id=new_alone_id,
8359 children=dev.children,
8360 size=dev.size)
8361 try:
8362 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
8363 _GetInstanceInfoText(self.instance), False)
8364 except errors.GenericError:
8365 self.cfg.ReleaseDRBDMinors(self.instance.name)
8366 raise
8368 # We have new devices, shutdown the drbd on the old secondary
8369 for idx, dev in enumerate(self.instance.disks):
8370 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
8371 self.cfg.SetDiskID(dev, self.target_node)
8372 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
8374 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
8375 "node: %s" % (idx, msg),
8376 hint=("Please cleanup this device manually as"
8377 " soon as possible"))
8379 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
8380 result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
8381 self.node_secondary_ip,
8382 self.instance.disks)\
8383 [self.instance.primary_node]
8385 msg = result.fail_msg
8386 if msg:
8387 # detaches didn't succeed (unlikely)
8388 self.cfg.ReleaseDRBDMinors(self.instance.name)
8389 raise errors.OpExecError("Can't detach the disks from the network on"
8390 " old node: %s" % (msg,))
8392 # if we managed to detach at least one, we update all the disks of
8393 # the instance to point to the new secondary
8394 self.lu.LogInfo("Updating instance configuration")
8395 for dev, _, new_logical_id in iv_names.itervalues():
8396 dev.logical_id = new_logical_id
8397 self.cfg.SetDiskID(dev, self.instance.primary_node)
8399 self.cfg.Update(self.instance, feedback_fn)
8401 # and now perform the drbd attach
8402 self.lu.LogInfo("Attaching primary drbds to new secondary"
8403 " (standalone => connected)")
8404 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
8405 self.new_node],
8406 self.node_secondary_ip,
8407 self.instance.disks,
8408 self.instance.name,
8409 False)
8410 for to_node, to_result in result.items():
8411 msg = to_result.fail_msg
8413 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
8415 hint=("please do a gnt-instance info to see the"
8416 " status of disks"))
8417 cstep = 5
8418 if self.early_release:
8419 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8420 cstep += 1
8421 self._RemoveOldStorage(self.target_node, iv_names)
8422 # WARNING: we release all node locks here, do not do other RPCs
8423 # than WaitForSync to the primary node
8424 self._ReleaseNodeLock([self.instance.primary_node,
8429 # This can fail as the old devices are degraded and _WaitForSync
8430 # does a combined result over all disks, so we don't check its return value
8431 self.lu.LogStep(cstep, steps_total, "Sync devices")
8432 cstep += 1
8433 _WaitForSync(self.lu, self.instance)
8435 # Check all devices manually
8436 self._CheckDevices(self.instance.primary_node, iv_names)
8438 # Step: remove old storage
8439 if not self.early_release:
8440 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8441 self._RemoveOldStorage(self.target_node, iv_names)
8444 class LURepairNodeStorage(NoHooksLU):
8445 """Repairs the volume group on a node.
8450 ("storage_type", ht.NoDefault, _CheckStorageType),
8451 ("name", ht.NoDefault, ht.TNonEmptyString),
8452 ("ignore_consistency", False, ht.TBool),
8456 def CheckArguments(self):
8457 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
8459 storage_type = self.op.storage_type
8461 if (constants.SO_FIX_CONSISTENCY not in
8462 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
8463 raise errors.OpPrereqError("Storage units of type '%s' can not be"
8464 " repaired" % storage_type,
8467 def ExpandNames(self):
8468 self.needed_locks = {
8469 locking.LEVEL_NODE: [self.op.node_name],
8472 def _CheckFaultyDisks(self, instance, node_name):
8473 """Ensure faulty disks abort the opcode or at least warn."""
8474 try:
8475 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
8476 node_name, True):
8477 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
8478 " node '%s'" % (instance.name, node_name),
8479 errors.ECODE_STATE)
8480 except errors.OpPrereqError, err:
8481 if self.op.ignore_consistency:
8482 self.proc.LogWarning(str(err.args[0]))
8483 else:
8484 raise
8486 def CheckPrereq(self):
8487 """Check prerequisites.
8490 # Check whether any instance on this node has faulty disks
8491 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
8492 if not inst.admin_up:
8493 continue
8494 check_nodes = set(inst.all_nodes)
8495 check_nodes.discard(self.op.node_name)
8496 for inst_node_name in check_nodes:
8497 self._CheckFaultyDisks(inst, inst_node_name)
8499 def Exec(self, feedback_fn):
8500 feedback_fn("Repairing storage unit '%s' on %s ..." %
8501 (self.op.name, self.op.node_name))
8503 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
8504 result = self.rpc.call_storage_execute(self.op.node_name,
8505 self.op.storage_type, st_args,
8506 self.op.name,
8507 constants.SO_FIX_CONSISTENCY)
8508 result.Raise("Failed to repair storage unit '%s' on %s" %
8509 (self.op.name, self.op.node_name))
8512 class LUNodeEvacuationStrategy(NoHooksLU):
8513 """Computes the node evacuation strategy.
8517 ("nodes", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
8518 ("remote_node", None, ht.TMaybeString),
8519 ("iallocator", None, ht.TMaybeString),
8523 def CheckArguments(self):
8524 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
8526 def ExpandNames(self):
8527 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
8528 self.needed_locks = locks = {}
8529 if self.op.remote_node is None:
8530 locks[locking.LEVEL_NODE] = locking.ALL_SET
8532 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8533 locks[locking.LEVEL_NODE] = self.op.nodes + [self.op.remote_node]
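8534 # with an explicit target node only the source nodes and that node need to be locked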
8535 def Exec(self, feedback_fn):
8536 if self.op.remote_node is not None:
8537 instances = []
8538 for node in self.op.nodes:
8539 instances.extend(_GetNodeSecondaryInstances(self.cfg, node))
8540 result = []
8541 for i in instances:
8542 if i.primary_node == self.op.remote_node:
8543 raise errors.OpPrereqError("Node %s is the primary node of"
8544 " instance %s, cannot use it as"
8545 " secondary" %
8546 (self.op.remote_node, i.name),
8547 errors.ECODE_INVAL)
8548 result.append([i.name, self.op.remote_node])
8549 else:
8550 ial = IAllocator(self.cfg, self.rpc,
8551 mode=constants.IALLOCATOR_MODE_MEVAC,
8552 evac_nodes=self.op.nodes)
8553 ial.Run(self.op.iallocator, validate=True)
8554 if not ial.success:
8555 raise errors.OpExecError("No valid evacuation solution: %s" % ial.info,
8556 errors.ECODE_NORES)
8557 result = ial.result
8558 return result
8561 class LUGrowDisk(LogicalUnit):
8562 """Grow a disk of an instance.
8566 HTYPE = constants.HTYPE_INSTANCE
8569 ("disk", ht.NoDefault, ht.TInt),
8570 ("amount", ht.NoDefault, ht.TInt),
8571 ("wait_for_sync", True, ht.TBool),
8575 def ExpandNames(self):
8576 self._ExpandAndLockInstance()
8577 self.needed_locks[locking.LEVEL_NODE] = []
8578 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
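8579 # the actual node locks are filled in by DeclareLocks once the instance's nodes are known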
8580 def DeclareLocks(self, level):
8581 if level == locking.LEVEL_NODE:
8582 self._LockInstancesNodes()
8584 def BuildHooksEnv(self):
8585 """Build hooks env.
8587 This runs on the master, the primary and all the secondaries.
8589 """
8590 env = {
8591 "DISK": self.op.disk,
8592 "AMOUNT": self.op.amount,
8593 }
8594 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
8595 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
8596 return env, nl, nl
8598 def CheckPrereq(self):
8599 """Check prerequisites.
8601 This checks that the instance is in the cluster.
8604 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
8605 assert instance is not None, \
8606 "Cannot retrieve locked instance %s" % self.op.instance_name
8607 nodenames = list(instance.all_nodes)
8608 for node in nodenames:
8609 _CheckNodeOnline(self, node)
8611 self.instance = instance
8613 if instance.disk_template not in constants.DTS_GROWABLE:
8614 raise errors.OpPrereqError("Instance's disk layout does not support"
8615 " growing.", errors.ECODE_INVAL)
8617 self.disk = instance.FindDisk(self.op.disk)
8619 if instance.disk_template != constants.DT_FILE:
8620 # TODO: check the free disk space for file, when that feature will be
8622 _CheckNodesFreeDisk(self, nodenames, self.op.amount)
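8623 # every node holding the disk needs the extra space, since the LV grows on all of them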
8624 def Exec(self, feedback_fn):
8625 """Execute disk grow.
8628 instance = self.instance
8629 disk = self.disk
8631 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
8632 if not disks_ok:
8633 raise errors.OpExecError("Cannot activate block device to grow")
8635 for node in instance.all_nodes:
8636 self.cfg.SetDiskID(disk, node)
8637 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
8638 result.Raise("Grow request failed to node %s" % node)
8640 # TODO: Rewrite code to work properly
8641 # DRBD goes into sync mode for a short amount of time after executing the
8642 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
8643 # calling "resize" in sync mode fails. Sleeping for a short amount of
8644 # time is a work-around.
8645 time.sleep(5)
8647 disk.RecordGrow(self.op.amount)
8648 self.cfg.Update(instance, feedback_fn)
8649 if self.op.wait_for_sync:
8650 disk_abort = not _WaitForSync(self, instance, disks=[disk])
8652 self.proc.LogWarning("Warning: disk sync-ing has not returned a good"
8653 " status.\nPlease check the instance.")
8654 if not instance.admin_up:
8655 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
8656 elif not instance.admin_up:
8657 self.proc.LogWarning("Not shutting down the disk even if the instance is"
8658 " not supposed to be running because no wait for"
8659 " sync mode was requested.")
8662 class LUQueryInstanceData(NoHooksLU):
8663 """Query runtime instance data.
8667 ("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
8668 ("static", False, ht.TBool),
8672 def ExpandNames(self):
8673 self.needed_locks = {}
8674 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
8676 if self.op.instances:
8677 self.wanted_names = []
8678 for name in self.op.instances:
8679 full_name = _ExpandInstanceName(self.cfg, name)
8680 self.wanted_names.append(full_name)
8681 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
8683 self.wanted_names = None
8684 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
8686 self.needed_locks[locking.LEVEL_NODE] = []
8687 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8689 def DeclareLocks(self, level):
8690 if level == locking.LEVEL_NODE:
8691 self._LockInstancesNodes()
8693 def CheckPrereq(self):
8694 """Check prerequisites.
8696 This only checks the optional instance list against the existing names.
8699 if self.wanted_names is None:
8700 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
8702 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
8703 in self.wanted_names]
8705 def _ComputeBlockdevStatus(self, node, instance_name, dev):
8706 """Returns the status of a block device
8709 if self.op.static or not node:
8710 return None
8712 self.cfg.SetDiskID(dev, node)
8714 result = self.rpc.call_blockdev_find(node, dev)
8718 result.Raise("Can't compute disk status for %s" % instance_name)
8720 status = result.payload
8724 return (status.dev_path, status.major, status.minor,
8725 status.sync_percent, status.estimated_time,
8726 status.is_degraded, status.ldisk_status)
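8727 # a flat tuple of the interesting status fields; None (above) means no data was available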
8728 def _ComputeDiskStatus(self, instance, snode, dev):
8729 """Compute block device status.
8732 if dev.dev_type in constants.LDS_DRBD:
8733 # we change the snode then (otherwise we use the one passed in)
8734 if dev.logical_id[0] == instance.primary_node:
8735 snode = dev.logical_id[1]
8736 else:
8737 snode = dev.logical_id[0]
8739 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
8740 instance.name, dev)
8741 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
8743 if dev.children:
8744 dev_children = [self._ComputeDiskStatus(instance, snode, child)
8745 for child in dev.children]
8746 else:
8747 dev_children = []
8749 data = {
8750 "iv_name": dev.iv_name,
8751 "dev_type": dev.dev_type,
8752 "logical_id": dev.logical_id,
8753 "physical_id": dev.physical_id,
8754 "pstatus": dev_pstatus,
8755 "sstatus": dev_sstatus,
8756 "children": dev_children,
8763 def Exec(self, feedback_fn):
8764 """Gather and return data"""
8767 cluster = self.cfg.GetClusterInfo()
8769 for instance in self.wanted_instances:
8770 if not self.op.static:
8771 remote_info = self.rpc.call_instance_info(instance.primary_node,
8772 instance.name,
8773 instance.hypervisor)
8774 remote_info.Raise("Error checking node %s" % instance.primary_node)
8775 remote_info = remote_info.payload
8776 if remote_info and "state" in remote_info:
8779 remote_state = "down"
8782 if instance.admin_up:
8785 config_state = "down"
8787 disks = [self._ComputeDiskStatus(instance, None, device)
8788 for device in instance.disks]
8791 "name": instance.name,
8792 "config_state": config_state,
8793 "run_state": remote_state,
8794 "pnode": instance.primary_node,
8795 "snodes": instance.secondary_nodes,
8797 # this happens to be the same format used for hooks
8798 "nics": _NICListToTuple(self, instance.nics),
8799 "disk_template": instance.disk_template,
8801 "hypervisor": instance.hypervisor,
8802 "network_port": instance.network_port,
8803 "hv_instance": instance.hvparams,
8804 "hv_actual": cluster.FillHV(instance, skip_globals=True),
8805 "be_instance": instance.beparams,
8806 "be_actual": cluster.FillBE(instance),
8807 "os_instance": instance.osparams,
8808 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
8809 "serial_no": instance.serial_no,
8810 "mtime": instance.mtime,
8811 "ctime": instance.ctime,
8812 "uuid": instance.uuid,
8813 }
8815 result[instance.name] = idict
8817 return result
8820 class LUSetInstanceParams(LogicalUnit):
8821 """Modifies an instances's parameters.
8824 HPATH = "instance-modify"
8825 HTYPE = constants.HTYPE_INSTANCE
8828 ("nics", ht.EmptyList, ht.TList),
8829 ("disks", ht.EmptyList, ht.TList),
8830 ("beparams", ht.EmptyDict, ht.TDict),
8831 ("hvparams", ht.EmptyDict, ht.TDict),
8832 ("disk_template", None, ht.TMaybeString),
8833 ("remote_node", None, ht.TMaybeString),
8834 ("os_name", None, ht.TMaybeString),
8835 ("force_variant", False, ht.TBool),
8836 ("osparams", None, ht.TOr(ht.TDict, ht.TNone)),
8841 def CheckArguments(self):
8842 if not (self.op.nics or self.op.disks or self.op.disk_template or
8843 self.op.hvparams or self.op.beparams or self.op.os_name):
8844 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
8846 if self.op.hvparams:
8847 _CheckGlobalHvParams(self.op.hvparams)
8850 disk_addremove = 0
8851 for disk_op, disk_dict in self.op.disks:
8852 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
8853 if disk_op == constants.DDM_REMOVE:
8854 disk_addremove += 1
8855 continue
8856 elif disk_op == constants.DDM_ADD:
8857 disk_addremove += 1
8858 else:
8859 if not isinstance(disk_op, int):
8860 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
8861 if not isinstance(disk_dict, dict):
8862 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
8863 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
8865 if disk_op == constants.DDM_ADD:
8866 mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
8867 if mode not in constants.DISK_ACCESS_SET:
8868 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
8870 size = disk_dict.get('size', None)
8871 if size is None:
8872 raise errors.OpPrereqError("Required disk parameter size missing",
8873 errors.ECODE_INVAL)
8874 try:
8875 size = int(size)
8876 except (TypeError, ValueError), err:
8877 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
8878 str(err), errors.ECODE_INVAL)
8879 disk_dict['size'] = size
8880 else:
8881 # modification of disk
8882 if 'size' in disk_dict:
8883 raise errors.OpPrereqError("Disk size change not possible, use"
8884 " grow-disk", errors.ECODE_INVAL)
8886 if disk_addremove > 1:
8887 raise errors.OpPrereqError("Only one disk add or remove operation"
8888 " supported at a time", errors.ECODE_INVAL)
8890 if self.op.disks and self.op.disk_template is not None:
8891 raise errors.OpPrereqError("Disk template conversion and other disk"
8892 " changes not supported at the same time",
8895 if self.op.disk_template:
8896 _CheckDiskTemplate(self.op.disk_template)
8897 if (self.op.disk_template in constants.DTS_NET_MIRROR and
8898 self.op.remote_node is None):
8899 raise errors.OpPrereqError("Changing the disk template to a mirrored"
8900 " one requires specifying a secondary node",
8904 nic_addremove = 0
8905 for nic_op, nic_dict in self.op.nics:
8906 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
8907 if nic_op == constants.DDM_REMOVE:
8908 nic_addremove += 1
8909 continue
8910 elif nic_op == constants.DDM_ADD:
8911 nic_addremove += 1
8912 else:
8913 if not isinstance(nic_op, int):
8914 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
8915 if not isinstance(nic_dict, dict):
8916 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
8917 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
8919 # nic_dict should be a dict
8920 nic_ip = nic_dict.get('ip', None)
8921 if nic_ip is not None:
8922 if nic_ip.lower() == constants.VALUE_NONE:
8923 nic_dict['ip'] = None
8924 else:
8925 if not netutils.IPAddress.IsValid(nic_ip):
8926 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
8929 nic_bridge = nic_dict.get('bridge', None)
8930 nic_link = nic_dict.get('link', None)
8931 if nic_bridge and nic_link:
8932 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
8933 " at the same time", errors.ECODE_INVAL)
8934 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
8935 nic_dict['bridge'] = None
8936 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
8937 nic_dict['link'] = None
8939 if nic_op == constants.DDM_ADD:
8940 nic_mac = nic_dict.get('mac', None)
8941 if nic_mac is None:
8942 nic_dict['mac'] = constants.VALUE_AUTO
8944 if 'mac' in nic_dict:
8945 nic_mac = nic_dict['mac']
8946 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8947 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
8949 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
8950 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
8951 " modifying an existing nic",
8954 if nic_addremove > 1:
8955 raise errors.OpPrereqError("Only one NIC add or remove operation"
8956 " supported at a time", errors.ECODE_INVAL)
8958 def ExpandNames(self):
8959 self._ExpandAndLockInstance()
8960 self.needed_locks[locking.LEVEL_NODE] = []
8961 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8963 def DeclareLocks(self, level):
8964 if level == locking.LEVEL_NODE:
8965 self._LockInstancesNodes()
8966 if self.op.disk_template and self.op.remote_node:
8967 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8968 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
8970 def BuildHooksEnv(self):
8973 This runs on the master, primary and secondaries.
8976 args = dict()
8977 if constants.BE_MEMORY in self.be_new:
8978 args['memory'] = self.be_new[constants.BE_MEMORY]
8979 if constants.BE_VCPUS in self.be_new:
8980 args['vcpus'] = self.be_new[constants.BE_VCPUS]
8981 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
8982 # information at all.
8983 if self.op.nics:
8984 args['nics'] = []
8985 nic_override = dict(self.op.nics)
8986 for idx, nic in enumerate(self.instance.nics):
8987 if idx in nic_override:
8988 this_nic_override = nic_override[idx]
8990 this_nic_override = {}
8991 if 'ip' in this_nic_override:
8992 ip = this_nic_override['ip']
8995 if 'mac' in this_nic_override:
8996 mac = this_nic_override['mac']
8999 if idx in self.nic_pnew:
9000 nicparams = self.nic_pnew[idx]
9001 else:
9002 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
9003 mode = nicparams[constants.NIC_MODE]
9004 link = nicparams[constants.NIC_LINK]
9005 args['nics'].append((ip, mac, mode, link))
9006 if constants.DDM_ADD in nic_override:
9007 ip = nic_override[constants.DDM_ADD].get('ip', None)
9008 mac = nic_override[constants.DDM_ADD]['mac']
9009 nicparams = self.nic_pnew[constants.DDM_ADD]
9010 mode = nicparams[constants.NIC_MODE]
9011 link = nicparams[constants.NIC_LINK]
9012 args['nics'].append((ip, mac, mode, link))
9013 elif constants.DDM_REMOVE in nic_override:
9014 del args['nics'][-1]
9016 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
9017 if self.op.disk_template:
9018 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
9019 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
9020 return env, nl, nl
9022 def CheckPrereq(self):
9023 """Check prerequisites.
9025 This only checks the instance list against the existing names.
9028 # checking the new params on the primary/secondary nodes
9030 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9031 cluster = self.cluster = self.cfg.GetClusterInfo()
9032 assert self.instance is not None, \
9033 "Cannot retrieve locked instance %s" % self.op.instance_name
9034 pnode = instance.primary_node
9035 nodelist = list(instance.all_nodes)
9038 if self.op.os_name and not self.op.force:
9039 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
9040 self.op.force_variant)
9041 instance_os = self.op.os_name
9042 else:
9043 instance_os = instance.os
9045 if self.op.disk_template:
9046 if instance.disk_template == self.op.disk_template:
9047 raise errors.OpPrereqError("Instance already has disk template %s" %
9048 instance.disk_template, errors.ECODE_INVAL)
9050 if (instance.disk_template,
9051 self.op.disk_template) not in self._DISK_CONVERSIONS:
9052 raise errors.OpPrereqError("Unsupported disk template conversion from"
9053 " %s to %s" % (instance.disk_template,
9054 self.op.disk_template),
9056 _CheckInstanceDown(self, instance, "cannot change disk template")
9057 if self.op.disk_template in constants.DTS_NET_MIRROR:
9058 if self.op.remote_node == pnode:
9059 raise errors.OpPrereqError("Given new secondary node %s is the same"
9060 " as the primary node of the instance" %
9061 self.op.remote_node, errors.ECODE_STATE)
9062 _CheckNodeOnline(self, self.op.remote_node)
9063 _CheckNodeNotDrained(self, self.op.remote_node)
9064 disks = [{"size": d.size} for d in instance.disks]
9065 required = _ComputeDiskSize(self.op.disk_template, disks)
9066 _CheckNodesFreeDisk(self, [self.op.remote_node], required)
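9067 # the new secondary must have enough free space for a full copy of the instance's disks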
9068 # hvparams processing
9069 if self.op.hvparams:
9070 hv_type = instance.hypervisor
9071 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
9072 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
9073 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
9076 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
9077 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
9078 self.hv_new = hv_new # the new actual values
9079 self.hv_inst = i_hvdict # the new dict (without defaults)
9080 else:
9081 self.hv_new = self.hv_inst = {}
9083 # beparams processing
9084 if self.op.beparams:
9085 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
9087 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
9088 be_new = cluster.SimpleFillBE(i_bedict)
9089 self.be_new = be_new # the new actual values
9090 self.be_inst = i_bedict # the new dict (without defaults)
9091 else:
9092 self.be_new = self.be_inst = {}
9094 # osparams processing
9095 if self.op.osparams:
9096 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
9097 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
9098 self.os_inst = i_osdict # the new dict (without defaults)
9099 else:
9100 self.os_inst = {}
9102 self.warn = []
9104 if constants.BE_MEMORY in self.op.beparams and not self.op.force:
9105 mem_check_list = [pnode]
9106 if be_new[constants.BE_AUTO_BALANCE]:
9107 # either we changed auto_balance to yes or it was from before
9108 mem_check_list.extend(instance.secondary_nodes)
9109 instance_info = self.rpc.call_instance_info(pnode, instance.name,
9110 instance.hypervisor)
9111 nodeinfo = self.rpc.call_node_info(mem_check_list, self.cfg.GetVGName(),
9112 instance.hypervisor)
9113 pninfo = nodeinfo[pnode]
9114 msg = pninfo.fail_msg
9116 # Assume the primary node is unreachable and go ahead
9117 self.warn.append("Can't get info from primary node %s: %s" %
9119 elif not isinstance(pninfo.payload.get('memory_free', None), int):
9120 self.warn.append("Node data from primary node %s doesn't contain"
9121 " free memory information" % pnode)
9122 elif instance_info.fail_msg:
9123 self.warn.append("Can't get instance runtime information: %s" %
9124 instance_info.fail_msg)
9126 if instance_info.payload:
9127 current_mem = int(instance_info.payload['memory'])
9129 # Assume instance not running
9130 # (there is a slight race condition here, but it's not very probable,
9131 # and we have no other way to check)
9133 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
9134 pninfo.payload['memory_free'])
9136 raise errors.OpPrereqError("This change will prevent the instance"
9137 " from starting, due to %d MB of memory"
9138 " missing on its primary node" % miss_mem,
9141 if be_new[constants.BE_AUTO_BALANCE]:
9142 for node, nres in nodeinfo.items():
9143 if node not in instance.secondary_nodes:
9147 self.warn.append("Can't get info from secondary node %s: %s" %
9149 elif not isinstance(nres.payload.get('memory_free', None), int):
9150 self.warn.append("Secondary node %s didn't return free"
9151 " memory information" % node)
9152 elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
9153 self.warn.append("Not enough memory to failover instance to"
9154 " secondary node %s" % node)
9159 for nic_op, nic_dict in self.op.nics:
9160 if nic_op == constants.DDM_REMOVE:
9161 if not instance.nics:
9162 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
9165 if nic_op != constants.DDM_ADD:
9167 if not instance.nics:
9168 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
9169 " no NICs" % nic_op,
9171 if nic_op < 0 or nic_op >= len(instance.nics):
9172 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
9174 (nic_op, len(instance.nics) - 1),
9176 old_nic_params = instance.nics[nic_op].nicparams
9177 old_nic_ip = instance.nics[nic_op].ip
9182 update_params_dict = dict([(key, nic_dict[key])
9183 for key in constants.NICS_PARAMETERS
9184 if key in nic_dict])
9186 if 'bridge' in nic_dict:
9187 update_params_dict[constants.NIC_LINK] = nic_dict['bridge']
9189 new_nic_params = _GetUpdatedParams(old_nic_params,
9190 update_params_dict)
9191 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
9192 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
9193 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
9194 self.nic_pinst[nic_op] = new_nic_params
9195 self.nic_pnew[nic_op] = new_filled_nic_params
9196 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
9198 if new_nic_mode == constants.NIC_MODE_BRIDGED:
9199 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
9200 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
9202 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
9204 self.warn.append(msg)
9206 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
9207 if new_nic_mode == constants.NIC_MODE_ROUTED:
9208 if 'ip' in nic_dict:
9209 nic_ip = nic_dict['ip']
9213 raise errors.OpPrereqError('Cannot set the nic ip to None'
9214 ' on a routed nic', errors.ECODE_INVAL)
9215 if 'mac' in nic_dict:
9216 nic_mac = nic_dict['mac']
9218 raise errors.OpPrereqError('Cannot set the nic mac to None',
9220 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9221 # otherwise generate the mac
9222 nic_dict['mac'] = self.cfg.GenerateMAC(self.proc.GetECId())
9223 else:
9224 # or validate/reserve the current one
9225 try:
9226 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
9227 except errors.ReservationError:
9228 raise errors.OpPrereqError("MAC address %s already in use"
9229 " in cluster" % nic_mac,
9230 errors.ECODE_NOTUNIQUE)
9233 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
9234 raise errors.OpPrereqError("Disk operations not supported for"
9235 " diskless instances",
9237 for disk_op, _ in self.op.disks:
9238 if disk_op == constants.DDM_REMOVE:
9239 if len(instance.disks) == 1:
9240 raise errors.OpPrereqError("Cannot remove the last disk of"
9241 " an instance", errors.ECODE_INVAL)
9242 _CheckInstanceDown(self, instance, "cannot remove disks")
9244 if (disk_op == constants.DDM_ADD and
9245 len(instance.disks) >= constants.MAX_DISKS):
9246 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
9247 " add more" % constants.MAX_DISKS,
9249 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
9251 if disk_op < 0 or disk_op >= len(instance.disks):
9252 raise errors.OpPrereqError("Invalid disk index %s, valid values"
9254 (disk_op, len(instance.disks)),
9259 def _ConvertPlainToDrbd(self, feedback_fn):
9260 """Converts an instance from plain to drbd.
9263 feedback_fn("Converting template to drbd")
9264 instance = self.instance
9265 pnode = instance.primary_node
9266 snode = self.op.remote_node
9268 # create a fake disk info for _GenerateDiskTemplate
9269 disk_info = [{"size": d.size, "mode": d.mode} for d in instance.disks]
9270 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
9271 instance.name, pnode, [snode],
9272 disk_info, None, None, 0)
9273 info = _GetInstanceInfoText(instance)
9274 feedback_fn("Creating aditional volumes...")
9275 # first, create the missing data and meta devices
9276 for disk in new_disks:
9277 # unfortunately this is... not too nice
9278 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
9279 info, True)
9280 for child in disk.children:
9281 _CreateSingleBlockDev(self, snode, instance, child, info, True)
9282 # at this stage, all new LVs have been created, we can rename the
9284 feedback_fn("Renaming original volumes...")
9285 rename_list = [(o, n.children[0].logical_id)
9286 for (o, n) in zip(instance.disks, new_disks)]
9287 result = self.rpc.call_blockdev_rename(pnode, rename_list)
9288 result.Raise("Failed to rename original LVs")
9290 feedback_fn("Initializing DRBD devices...")
9291 # all child devices are in place, we can now create the DRBD devices
9292 for disk in new_disks:
9293 for node in [pnode, snode]:
9294 f_create = node == pnode
9295 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
9297 # at this point, the instance has been modified
9298 instance.disk_template = constants.DT_DRBD8
9299 instance.disks = new_disks
9300 self.cfg.Update(instance, feedback_fn)
9302 # disks are created, waiting for sync
9303 disk_abort = not _WaitForSync(self, instance)
9305 raise errors.OpExecError("There are some degraded disks for"
9306 " this instance, please cleanup manually")
9308 def _ConvertDrbdToPlain(self, feedback_fn):
9309 """Converts an instance from drbd to plain.
9312 instance = self.instance
9313 assert len(instance.secondary_nodes) == 1
9314 pnode = instance.primary_node
9315 snode = instance.secondary_nodes[0]
9316 feedback_fn("Converting template to plain")
9318 old_disks = instance.disks
9319 new_disks = [d.children[0] for d in old_disks]
9321 # copy over size and mode
9322 for parent, child in zip(old_disks, new_disks):
9323 child.size = parent.size
9324 child.mode = parent.mode
9326 # update instance structure
9327 instance.disks = new_disks
9328 instance.disk_template = constants.DT_PLAIN
9329 self.cfg.Update(instance, feedback_fn)
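9330 # the old DRBD and meta volumes are no longer referenced and can be removed below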
9331 feedback_fn("Removing volumes on the secondary node...")
9332 for disk in old_disks:
9333 self.cfg.SetDiskID(disk, snode)
9334 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
9336 self.LogWarning("Could not remove block device %s on node %s,"
9337 " continuing anyway: %s", disk.iv_name, snode, msg)
9339 feedback_fn("Removing unneeded volumes on the primary node...")
9340 for idx, disk in enumerate(old_disks):
9341 meta = disk.children[1]
9342 self.cfg.SetDiskID(meta, pnode)
9343 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
9345 self.LogWarning("Could not remove metadata for disk %d on node %s,"
9346 " continuing anyway: %s", idx, pnode, msg)
9349 def Exec(self, feedback_fn):
9350 """Modifies an instance.
9352 All parameters take effect only at the next restart of the instance.
9355 # Process here the warnings from CheckPrereq, as we don't have a
9356 # feedback_fn there.
9357 for warn in self.warn:
9358 feedback_fn("WARNING: %s" % warn)
9361 instance = self.instance
9363 for disk_op, disk_dict in self.op.disks:
9364 if disk_op == constants.DDM_REMOVE:
9365 # remove the last disk
9366 device = instance.disks.pop()
9367 device_idx = len(instance.disks)
9368 for node, disk in device.ComputeNodeTree(instance.primary_node):
9369 self.cfg.SetDiskID(disk, node)
9370 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
9372 self.LogWarning("Could not remove disk/%d on node %s: %s,"
9373 " continuing anyway", device_idx, node, msg)
9374 result.append(("disk/%d" % device_idx, "remove"))
9375 elif disk_op == constants.DDM_ADD:
9377 if instance.disk_template == constants.DT_FILE:
9378 file_driver, file_path = instance.disks[0].logical_id
9379 file_path = os.path.dirname(file_path)
9381 file_driver = file_path = None
9382 disk_idx_base = len(instance.disks)
9383 new_disk = _GenerateDiskTemplate(self,
9384 instance.disk_template,
9385 instance.name, instance.primary_node,
9386 instance.secondary_nodes,
9391 instance.disks.append(new_disk)
9392 info = _GetInstanceInfoText(instance)
9394 logging.info("Creating volume %s for instance %s",
9395 new_disk.iv_name, instance.name)
9396 # Note: this needs to be kept in sync with _CreateDisks
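# The new disk has to exist on every node the instance uses; f_create below
# is true only for the primary node (the same flag is passed twice to
# _CreateBlockDev, presumably as the force-create and force-open arguments,
# following the convention used by _CreateDisks).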
9398 for node in instance.all_nodes:
9399 f_create = node == instance.primary_node
9401 _CreateBlockDev(self, node, instance, new_disk,
9402 f_create, info, f_create)
9403 except errors.OpExecError, err:
9404 self.LogWarning("Failed to create volume %s (%s) on"
9406 new_disk.iv_name, new_disk, node, err)
9407 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
9408 (new_disk.size, new_disk.mode)))
9410 # change a given disk
9411 instance.disks[disk_op].mode = disk_dict['mode']
9412 result.append(("disk.mode/%d" % disk_op, disk_dict['mode']))
9414 if self.op.disk_template:
9415 r_shut = _ShutdownInstanceDisks(self, instance)
9417 raise errors.OpExecError("Cannot shutdow instance disks, unable to"
9418 " proceed with disk template conversion")
9419 mode = (instance.disk_template, self.op.disk_template)
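# _DISK_CONVERSIONS (defined at the end of this class) maps an
# (old_template, new_template) tuple to the helper doing the actual work;
# afterwards self.cfg.ReleaseDRBDMinors drops any minors that were only
# temporarily reserved for the conversion.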
9421 self._DISK_CONVERSIONS[mode](self, feedback_fn)
9423 self.cfg.ReleaseDRBDMinors(instance.name)
9425 result.append(("disk_template", self.op.disk_template))
9428 for nic_op, nic_dict in self.op.nics:
9429 if nic_op == constants.DDM_REMOVE:
9430 # remove the last nic
9431 del instance.nics[-1]
9432 result.append(("nic.%d" % len(instance.nics), "remove"))
9433 elif nic_op == constants.DDM_ADD:
9434 # mac and bridge should be set, by now
9435 mac = nic_dict['mac']
9436 ip = nic_dict.get('ip', None)
9437 nicparams = self.nic_pinst[constants.DDM_ADD]
9438 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
9439 instance.nics.append(new_nic)
9440 result.append(("nic.%d" % (len(instance.nics) - 1),
9441 "add:mac=%s,ip=%s,mode=%s,link=%s" %
9442 (new_nic.mac, new_nic.ip,
9443 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
9444 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
9447 for key in 'mac', 'ip':
9449 setattr(instance.nics[nic_op], key, nic_dict[key])
9450 if nic_op in self.nic_pinst:
9451 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
9452 for key, val in nic_dict.iteritems():
9453 result.append(("nic.%s/%d" % (key, nic_op), val))
9456 if self.op.hvparams:
9457 instance.hvparams = self.hv_inst
9458 for key, val in self.op.hvparams.iteritems():
9459 result.append(("hv/%s" % key, val))
9462 if self.op.beparams:
9463 instance.beparams = self.be_inst
9464 for key, val in self.op.beparams.iteritems():
9465 result.append(("be/%s" % key, val))
9469 instance.os = self.op.os_name
9472 if self.op.osparams:
9473 instance.osparams = self.os_inst
9474 for key, val in self.op.osparams.iteritems():
9475 result.append(("os/%s" % key, val))
9477 self.cfg.Update(instance, feedback_fn)
9481 _DISK_CONVERSIONS = {
9482 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
9483 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
9487 class LUQueryExports(NoHooksLU):
9488 """Query the exports list
9492 ("nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
9493 ("use_locking", False, ht.TBool),
9497 def ExpandNames(self):
9498 self.needed_locks = {}
9499 self.share_locks[locking.LEVEL_NODE] = 1
9500 if not self.op.nodes:
9501 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9503 self.needed_locks[locking.LEVEL_NODE] = \
9504 _GetWantedNodes(self, self.op.nodes)
9506 def Exec(self, feedback_fn):
9507 """Compute the list of all the exported system images.
9510 @return: a dictionary with the structure node->(export-list)
9511 where export-list is a list of the instances exported on
9515 self.nodes = self.acquired_locks[locking.LEVEL_NODE]
9516 rpcresult = self.rpc.call_export_list(self.nodes)
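# Nodes that could not be queried are reported as False instead of an
# export list, so callers can tell a failed node apart from a node with
# no exports.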
9518 for node in rpcresult:
9519 if rpcresult[node].fail_msg:
9520 result[node] = False
9522 result[node] = rpcresult[node].payload
9527 class LUPrepareExport(NoHooksLU):
9528 """Prepares an instance for an export and returns useful information.
9533 ("mode", ht.NoDefault, ht.TElemOf(constants.EXPORT_MODES)),
9537 def ExpandNames(self):
9538 self._ExpandAndLockInstance()
9540 def CheckPrereq(self):
9541 """Check prerequisites.
9544 instance_name = self.op.instance_name
9546 self.instance = self.cfg.GetInstanceInfo(instance_name)
9547 assert self.instance is not None, \
9548 "Cannot retrieve locked instance %s" % self.op.instance_name
9549 _CheckNodeOnline(self, self.instance.primary_node)
9551 self._cds = _GetClusterDomainSecret()
9553 def Exec(self, feedback_fn):
9554 """Prepares an instance for an export.
9557 instance = self.instance
9559 if self.op.mode == constants.EXPORT_MODE_REMOTE:
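# For remote exports an X509 certificate is created on the primary node;
# the returned key name is authenticated with an HMAC keyed by the cluster
# domain secret (salted below) and the certificate is signed with the same
# secret, matching the verification done in LUExportInstance.CheckPrereq.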
9560 salt = utils.GenerateSecret(8)
9562 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
9563 result = self.rpc.call_x509_cert_create(instance.primary_node,
9564 constants.RIE_CERT_VALIDITY)
9565 result.Raise("Can't create X509 key and certificate on %s" % result.node)
9567 (name, cert_pem) = result.payload
9569 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
9573 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
9574 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
9576 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
9582 class LUExportInstance(LogicalUnit):
9583 """Export an instance to an image in the cluster.
9586 HPATH = "instance-export"
9587 HTYPE = constants.HTYPE_INSTANCE
9590 ("target_node", ht.NoDefault, ht.TOr(ht.TNonEmptyString, ht.TList)),
9591 ("shutdown", True, ht.TBool),
9593 ("remove_instance", False, ht.TBool),
9594 ("ignore_remove_failures", False, ht.TBool),
9595 ("mode", constants.EXPORT_MODE_LOCAL, ht.TElemOf(constants.EXPORT_MODES)),
9596 ("x509_key_name", None, ht.TOr(ht.TList, ht.TNone)),
9597 ("destination_x509_ca", None, ht.TMaybeString),
9601 def CheckArguments(self):
9602 """Check the arguments.
9605 self.x509_key_name = self.op.x509_key_name
9606 self.dest_x509_ca_pem = self.op.destination_x509_ca
9608 if self.op.remove_instance and not self.op.shutdown:
9609 raise errors.OpPrereqError("Can not remove instance without shutting it"
9612 if self.op.mode == constants.EXPORT_MODE_REMOTE:
9613 if not self.x509_key_name:
9614 raise errors.OpPrereqError("Missing X509 key name for encryption",
9617 if not self.dest_x509_ca_pem:
9618 raise errors.OpPrereqError("Missing destination X509 CA",
9621 def ExpandNames(self):
9622 self._ExpandAndLockInstance()
9624 # Lock all nodes for local exports
9625 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9626 # FIXME: lock only instance primary and destination node
9628 # Sad but true, for now we have to lock all nodes, as we don't know where
9629 # the previous export might be, and in this LU we search for it and
9630 # remove it from its current node. In the future we could fix this by:
9631 # - making a tasklet to search (share-lock all), then create the
9632 # new one, then one to remove, after
9633 # - removing the removal operation altogether
9634 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9636 def DeclareLocks(self, level):
9637 """Last minute lock declaration."""
9638 # All nodes are locked anyway, so nothing to do here.
9640 def BuildHooksEnv(self):
9643 This will run on the master, primary node and target node.
9647 "EXPORT_MODE": self.op.mode,
9648 "EXPORT_NODE": self.op.target_node,
9649 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
9650 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
9651 # TODO: Generic function for boolean env variables
9652 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
9655 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
9657 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
9659 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9660 nl.append(self.op.target_node)
9664 def CheckPrereq(self):
9665 """Check prerequisites.
9667 This checks that the instance and node names are valid.
9670 instance_name = self.op.instance_name
9672 self.instance = self.cfg.GetInstanceInfo(instance_name)
9673 assert self.instance is not None, \
9674 "Cannot retrieve locked instance %s" % self.op.instance_name
9675 _CheckNodeOnline(self, self.instance.primary_node)
9677 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9678 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
9679 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
9680 assert self.dst_node is not None
9682 _CheckNodeOnline(self, self.dst_node.name)
9683 _CheckNodeNotDrained(self, self.dst_node.name)
9686 self.dest_disk_info = None
9687 self.dest_x509_ca = None
9689 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
9690 self.dst_node = None
9692 if len(self.op.target_node) != len(self.instance.disks):
9693 raise errors.OpPrereqError(("Received destination information for %s"
9694 " disks, but instance %s has %s disks") %
9695 (len(self.op.target_node), instance_name,
9696 len(self.instance.disks)),
9699 cds = _GetClusterDomainSecret()
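# Both pieces of destination information are authenticated with the
# cluster domain secret: the X509 key name carries an HMAC and the
# destination CA must be signed with the same secret, otherwise the
# export is refused below.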
9701 # Check X509 key name
9703 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
9704 except (TypeError, ValueError), err:
9705 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
9707 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
9708 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
9711 # Load and verify CA
9713 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
9714 except OpenSSL.crypto.Error, err:
9715 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
9716 (err, ), errors.ECODE_INVAL)
9718 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
9719 if errcode is not None:
9720 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
9721 (msg, ), errors.ECODE_INVAL)
9723 self.dest_x509_ca = cert
9725 # Verify target information
9727 for idx, disk_data in enumerate(self.op.target_node):
9729 (host, port, magic) = \
9730 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
9731 except errors.GenericError, err:
9732 raise errors.OpPrereqError("Target info for disk %s: %s" %
9733 (idx, err), errors.ECODE_INVAL)
9735 disk_info.append((host, port, magic))
9737 assert len(disk_info) == len(self.op.target_node)
9738 self.dest_disk_info = disk_info
9741 raise errors.ProgrammerError("Unhandled export mode %r" %
9744 # instance disk type verification
9745 # TODO: Implement export support for file-based disks
9746 for disk in self.instance.disks:
9747 if disk.dev_type == constants.LD_FILE:
9748 raise errors.OpPrereqError("Export not supported for instances with"
9749 " file-based disks", errors.ECODE_INVAL)
9751 def _CleanupExports(self, feedback_fn):
9752 """Removes exports of current instance from all other nodes.
9754 If an instance in a cluster with nodes A..D was exported to node C, its
9755 exports will be removed from the nodes A, B and D.
9758 assert self.op.mode != constants.EXPORT_MODE_REMOTE
9760 nodelist = self.cfg.GetNodeList()
9761 nodelist.remove(self.dst_node.name)
9763 # on one-node clusters nodelist will be empty after the removal
9764 # if we proceed the backup would be removed because OpQueryExports
9765 # substitutes an empty list with the full cluster node list.
9766 iname = self.instance.name
9768 feedback_fn("Removing old exports for instance %s" % iname)
9769 exportlist = self.rpc.call_export_list(nodelist)
9770 for node in exportlist:
9771 if exportlist[node].fail_msg:
9773 if iname in exportlist[node].payload:
9774 msg = self.rpc.call_export_remove(node, iname).fail_msg
9776 self.LogWarning("Could not remove older export for instance %s"
9777 " on node %s: %s", iname, node, msg)
9779 def Exec(self, feedback_fn):
9780 """Export an instance to an image in the cluster.
9783 assert self.op.mode in constants.EXPORT_MODES
9785 instance = self.instance
9786 src_node = instance.primary_node
9788 if self.op.shutdown:
9789 # shutdown the instance, but not the disks
9790 feedback_fn("Shutting down instance %s" % instance.name)
9791 result = self.rpc.call_instance_shutdown(src_node, instance,
9792 self.op.shutdown_timeout)
9793 # TODO: Maybe ignore failures if ignore_remove_failures is set
9794 result.Raise("Could not shutdown instance %s on"
9795 " node %s" % (instance.name, src_node))
9797 # set the disks ID correctly since call_instance_start needs the
9798 # correct drbd minor to create the symlinks
9799 for disk in instance.disks:
9800 self.cfg.SetDiskID(disk, src_node)
9802 activate_disks = (not instance.admin_up)
9805 # Activate the instance disks if we're exporting a stopped instance
9806 feedback_fn("Activating disks for %s" % instance.name)
9807 _StartInstanceDisks(self, instance, None)
9810 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
9813 helper.CreateSnapshots()
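# Once the snapshots exist the export can proceed from them, so the
# instance may be restarted right away below (unless it is going to be
# removed anyway).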
9815 if (self.op.shutdown and instance.admin_up and
9816 not self.op.remove_instance):
9817 assert not activate_disks
9818 feedback_fn("Starting instance %s" % instance.name)
9819 result = self.rpc.call_instance_start(src_node, instance, None, None)
9820 msg = result.fail_msg
9822 feedback_fn("Failed to start instance: %s" % msg)
9823 _ShutdownInstanceDisks(self, instance)
9824 raise errors.OpExecError("Could not start instance: %s" % msg)
9826 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9827 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
9828 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
9829 connect_timeout = constants.RIE_CONNECT_TIMEOUT
9830 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9832 (key_name, _, _) = self.x509_key_name
9835 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
9838 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
9839 key_name, dest_ca_pem,
9844 # Check for backwards compatibility
9845 assert len(dresults) == len(instance.disks)
9846 assert compat.all(isinstance(i, bool) for i in dresults), \
9847 "Not all results are boolean: %r" % dresults
9851 feedback_fn("Deactivating disks for %s" % instance.name)
9852 _ShutdownInstanceDisks(self, instance)
9854 if not (compat.all(dresults) and fin_resu):
9857 failures.append("export finalization")
9858 if not compat.all(dresults):
9859 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
9861 failures.append("disk export: disk(s) %s" % fdsk)
9863 raise errors.OpExecError("Export failed, errors in %s" %
9864 utils.CommaJoin(failures))
9866 # At this point, the export was successful, we can cleanup/finish
9868 # Remove instance if requested
9869 if self.op.remove_instance:
9870 feedback_fn("Removing instance %s" % instance.name)
9871 _RemoveInstance(self, feedback_fn, instance,
9872 self.op.ignore_remove_failures)
9874 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9875 self._CleanupExports(feedback_fn)
9877 return fin_resu, dresults
9880 class LURemoveExport(NoHooksLU):
9881 """Remove exports related to the named instance.
9889 def ExpandNames(self):
9890 self.needed_locks = {}
9891 # We need all nodes to be locked in order for RemoveExport to work, but we
9892 # don't need to lock the instance itself, as nothing will happen to it (and
9893 # we can remove exports also for a removed instance)
9894 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9896 def Exec(self, feedback_fn):
9897 """Remove any export.
9900 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
9901 # If the instance was not found we'll try with the name that was passed in.
9902 # This will only work if it was an FQDN, though.
9904 if not instance_name:
9906 instance_name = self.op.instance_name
9908 locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
9909 exportlist = self.rpc.call_export_list(locked_nodes)
9911 for node in exportlist:
9912 msg = exportlist[node].fail_msg
9914 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
9916 if instance_name in exportlist[node].payload:
9918 result = self.rpc.call_export_remove(node, instance_name)
9919 msg = result.fail_msg
9921 logging.error("Could not remove export for instance %s"
9922 " on node %s: %s", instance_name, node, msg)
9924 if fqdn_warn and not found:
9925 feedback_fn("Export not found. If trying to remove an export belonging"
9926 " to a deleted instance please use its Fully Qualified"
9930 class TagsLU(NoHooksLU): # pylint: disable-msg=W0223
9933 This is an abstract class which is the parent of all the other tags LUs.
9937 def ExpandNames(self):
9938 self.needed_locks = {}
9939 if self.op.kind == constants.TAG_NODE:
9940 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
9941 self.needed_locks[locking.LEVEL_NODE] = self.op.name
9942 elif self.op.kind == constants.TAG_INSTANCE:
9943 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
9944 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
9946 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
9947 # not possible to acquire the BGL based on opcode parameters)
9949 def CheckPrereq(self):
9950 """Check prerequisites.
9953 if self.op.kind == constants.TAG_CLUSTER:
9954 self.target = self.cfg.GetClusterInfo()
9955 elif self.op.kind == constants.TAG_NODE:
9956 self.target = self.cfg.GetNodeInfo(self.op.name)
9957 elif self.op.kind == constants.TAG_INSTANCE:
9958 self.target = self.cfg.GetInstanceInfo(self.op.name)
9960 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
9961 str(self.op.kind), errors.ECODE_INVAL)
9964 class LUGetTags(TagsLU):
9965 """Returns the tags of a given object.
9969 ("kind", ht.NoDefault, ht.TElemOf(constants.VALID_TAG_TYPES)),
9970 # Name is only meaningful for nodes and instances
9971 ("name", ht.NoDefault, ht.TMaybeString),
9975 def ExpandNames(self):
9976 TagsLU.ExpandNames(self)
9978 # Share locks as this is only a read operation
9979 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
9981 def Exec(self, feedback_fn):
9982 """Returns the tag list.
9985 return list(self.target.GetTags())
9988 class LUSearchTags(NoHooksLU):
9989 """Searches the tags for a given pattern.
9993 ("pattern", ht.NoDefault, ht.TNonEmptyString),
9997 def ExpandNames(self):
9998 self.needed_locks = {}
10000 def CheckPrereq(self):
10001 """Check prerequisites.
10003 This checks the pattern passed for validity by compiling it.
10007 self.re = re.compile(self.op.pattern)
10008 except re.error, err:
10009 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
10010 (self.op.pattern, err), errors.ECODE_INVAL)
10012 def Exec(self, feedback_fn):
10013 """Returns the tag list.
10017 tgts = [("/cluster", cfg.GetClusterInfo())]
10018 ilist = cfg.GetAllInstancesInfo().values()
10019 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
10020 nlist = cfg.GetAllNodesInfo().values()
10021 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
10023 for path, target in tgts:
10024 for tag in target.GetTags():
10025 if self.re.search(tag):
10026 results.append((path, tag))
10030 class LUAddTags(TagsLU):
10031 """Sets a tag on a given object.
10035 ("kind", ht.NoDefault, ht.TElemOf(constants.VALID_TAG_TYPES)),
10036 # Name is only meaningful for nodes and instances
10037 ("name", ht.NoDefault, ht.TMaybeString),
10038 ("tags", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
10042 def CheckPrereq(self):
10043 """Check prerequisites.
10045 This checks the type and length of the tag name and value.
10048 TagsLU.CheckPrereq(self)
10049 for tag in self.op.tags:
10050 objects.TaggableObject.ValidateTag(tag)
10052 def Exec(self, feedback_fn):
10057 for tag in self.op.tags:
10058 self.target.AddTag(tag)
10059 except errors.TagError, err:
10060 raise errors.OpExecError("Error while setting tag: %s" % str(err))
10061 self.cfg.Update(self.target, feedback_fn)
10064 class LUDelTags(TagsLU):
10065 """Delete a list of tags from a given object.
10069 ("kind", ht.NoDefault, ht.TElemOf(constants.VALID_TAG_TYPES)),
10070 # Name is only meaningful for nodes and instances
10071 ("name", ht.NoDefault, ht.TMaybeString),
10072 ("tags", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
10076 def CheckPrereq(self):
10077 """Check prerequisites.
10079 This checks that we have the given tag.
10082 TagsLU.CheckPrereq(self)
10083 for tag in self.op.tags:
10084 objects.TaggableObject.ValidateTag(tag)
10085 del_tags = frozenset(self.op.tags)
10086 cur_tags = self.target.GetTags()
10088 diff_tags = del_tags - cur_tags
10090 diff_names = ("'%s'" % i for i in sorted(diff_tags))
10091 raise errors.OpPrereqError("Tag(s) %s not found" %
10092 (utils.CommaJoin(diff_names), ),
10093 errors.ECODE_NOENT)
10095 def Exec(self, feedback_fn):
10096 """Remove the tag from the object.
10099 for tag in self.op.tags:
10100 self.target.RemoveTag(tag)
10101 self.cfg.Update(self.target, feedback_fn)
10104 class LUTestDelay(NoHooksLU):
10105 """Sleep for a specified amount of time.
10107 This LU sleeps on the master and/or nodes for a specified amount of time.
10112 ("duration", ht.NoDefault, ht.TFloat),
10113 ("on_master", True, ht.TBool),
10114 ("on_nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
10115 ("repeat", 0, ht.TPositiveInt)
10119 def ExpandNames(self):
10120 """Expand names and set required locks.
10122 This expands the node list, if any.
10125 self.needed_locks = {}
10126 if self.op.on_nodes:
10127 # _GetWantedNodes can be used here, but is not always appropriate to use
10128 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
10129 # more information.
10130 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
10131 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
10133 def _TestDelay(self):
10134 """Do the actual sleep.
10137 if self.op.on_master:
10138 if not utils.TestDelay(self.op.duration):
10139 raise errors.OpExecError("Error during master delay test")
10140 if self.op.on_nodes:
10141 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
10142 for node, node_result in result.items():
10143 node_result.Raise("Failure during rpc call to node %s" % node)
10145 def Exec(self, feedback_fn):
10146 """Execute the test delay opcode, with the wanted repetitions.
10149 if self.op.repeat == 0:
10152 top_value = self.op.repeat - 1
10153 for i in range(self.op.repeat):
10154 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
10158 class LUTestJobqueue(NoHooksLU):
10159 """Utility LU to test some aspects of the job queue.
10163 ("notify_waitlock", False, ht.TBool),
10164 ("notify_exec", False, ht.TBool),
10165 ("log_messages", ht.EmptyList, ht.TListOf(ht.TString)),
10166 ("fail", False, ht.TBool),
10170 # Must be lower than default timeout for WaitForJobChange to see whether it
10171 # notices changed jobs
10172 _CLIENT_CONNECT_TIMEOUT = 20.0
10173 _CLIENT_CONFIRM_TIMEOUT = 60.0
10176 def _NotifyUsingSocket(cls, cb, errcls):
10177 """Opens a Unix socket and waits for another program to connect.
10180 @param cb: Callback to send socket name to client
10181 @type errcls: class
10182 @param errcls: Exception class to use for errors
10185 # Using a temporary directory as there's no easy way to create temporary
10186 # sockets without writing a custom loop around tempfile.mktemp and
10188 tmpdir = tempfile.mkdtemp()
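# Rough protocol: create a socket inside the temporary directory, hand its
# path to the client via the callback, wait (with a timeout) for the client
# to connect, then wait for it to confirm the notification before cleaning
# up.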
10190 tmpsock = utils.PathJoin(tmpdir, "sock")
10192 logging.debug("Creating temporary socket at %s", tmpsock)
10193 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
10198 # Send details to client
10201 # Wait for client to connect before continuing
10202 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
10204 (conn, _) = sock.accept()
10205 except socket.error, err:
10206 raise errcls("Client didn't connect in time (%s)" % err)
10210 # Remove as soon as client is connected
10211 shutil.rmtree(tmpdir)
10213 # Wait for client to close
10216 # pylint: disable-msg=E1101
10217 # Instance of '_socketobject' has no ... member
10218 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
10220 except socket.error, err:
10221 raise errcls("Client failed to confirm notification (%s)" % err)
10225 def _SendNotification(self, test, arg, sockname):
10226 """Sends a notification to the client.
10229 @param test: Test name
10230 @param arg: Test argument (depends on test)
10231 @type sockname: string
10232 @param sockname: Socket path
10235 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
10237 def _Notify(self, prereq, test, arg):
10238 """Notifies the client of a test.
10241 @param prereq: Whether this is a prereq-phase test
10243 @param test: Test name
10244 @param arg: Test argument (depends on test)
10248 errcls = errors.OpPrereqError
10250 errcls = errors.OpExecError
10252 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
10256 def CheckArguments(self):
10257 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
10258 self.expandnames_calls = 0
10260 def ExpandNames(self):
10261 checkargs_calls = getattr(self, "checkargs_calls", 0)
10262 if checkargs_calls < 1:
10263 raise errors.ProgrammerError("CheckArguments was not called")
10265 self.expandnames_calls += 1
10267 if self.op.notify_waitlock:
10268 self._Notify(True, constants.JQT_EXPANDNAMES, None)
10270 self.LogInfo("Expanding names")
10272 # Get lock on master node (just to get a lock, not for a particular reason)
10273 self.needed_locks = {
10274 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
10277 def Exec(self, feedback_fn):
10278 if self.expandnames_calls < 1:
10279 raise errors.ProgrammerError("ExpandNames was not called")
10281 if self.op.notify_exec:
10282 self._Notify(False, constants.JQT_EXEC, None)
10284 self.LogInfo("Executing")
10286 if self.op.log_messages:
10287 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
10288 for idx, msg in enumerate(self.op.log_messages):
10289 self.LogInfo("Sending log message %s", idx + 1)
10290 feedback_fn(constants.JQT_MSGPREFIX + msg)
10291 # Report how many test messages have been sent
10292 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
10295 raise errors.OpExecError("Opcode failure was requested")
10300 class IAllocator(object):
10301 """IAllocator framework.
10303 An IAllocator instance has four sets of attributes:
10304 - cfg that is needed to query the cluster
10305 - input data (all members of the _KEYS class attribute are required)
10306 - four buffer attributes (in|out_data|text), that represent the
10307 input (to the external script) in text and data structure format,
10308 and the output from it, again in two formats
10309 - the result variables from the script (success, info, nodes) for
10313 # pylint: disable-msg=R0902
10314 # lots of instance attributes
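# Typical usage (a sketch only, names below are illustrative and not taken
# from this module): construct an IAllocator with a mode and the keyword
# arguments required for that mode, which builds the textual input for the
# script, then call Run() with the allocator name to execute it on the
# master node and check its result:
#
#   ial = IAllocator(self.cfg, self.rpc, constants.IALLOCATOR_MODE_RELOC,
#                    name=instance_name, relocate_from=[old_secondary])
#   ial.Run(allocator_name)
#   if not ial.success:
#     raise errors.OpPrereqError("iallocator failure: %s" % ial.info)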
10316 "name", "mem_size", "disks", "disk_template",
10317 "os", "tags", "nics", "vcpus", "hypervisor",
10320 "name", "relocate_from",
10326 def __init__(self, cfg, rpc, mode, **kwargs):
10329 # init buffer variables
10330 self.in_text = self.out_text = self.in_data = self.out_data = None
10331 # init all input fields so that pylint is happy
10333 self.mem_size = self.disks = self.disk_template = None
10334 self.os = self.tags = self.nics = self.vcpus = None
10335 self.hypervisor = None
10336 self.relocate_from = None
10338 self.evac_nodes = None
10340 self.required_nodes = None
10341 # init result fields
10342 self.success = self.info = self.result = None
10343 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
10344 keyset = self._ALLO_KEYS
10345 fn = self._AddNewInstance
10346 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
10347 keyset = self._RELO_KEYS
10348 fn = self._AddRelocateInstance
10349 elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
10350 keyset = self._EVAC_KEYS
10351 fn = self._AddEvacuateNodes
10353 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
10354 " IAllocator" % self.mode)
10356 if key not in keyset:
10357 raise errors.ProgrammerError("Invalid input parameter '%s' to"
10358 " IAllocator" % key)
10359 setattr(self, key, kwargs[key])
10362 if key not in kwargs:
10363 raise errors.ProgrammerError("Missing input parameter '%s' to"
10364 " IAllocator" % key)
10365 self._BuildInputData(fn)
10367 def _ComputeClusterData(self):
10368 """Compute the generic allocator input data.
10370 This is the data that is independent of the actual operation.
10374 cluster_info = cfg.GetClusterInfo()
10377 "version": constants.IALLOCATOR_VERSION,
10378 "cluster_name": cfg.GetClusterName(),
10379 "cluster_tags": list(cluster_info.GetTags()),
10380 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
10381 # we don't have job IDs
10383 iinfo = cfg.GetAllInstancesInfo().values()
10384 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
10387 node_list = cfg.GetNodeList()
10389 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
10390 hypervisor_name = self.hypervisor
10391 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
10392 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
10393 elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
10394 hypervisor_name = cluster_info.enabled_hypervisors[0]
10396 node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
10399 self.rpc.call_all_instances_info(node_list,
10400 cluster_info.enabled_hypervisors)
10402 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
10404 data["nodes"] = self._ComputeNodeData(cfg, node_data, node_iinfo, i_list)
10406 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
10408 self.in_data = data
10411 def _ComputeNodeGroupData(cfg):
10412 """Compute node groups data.
10416 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items():
10417 ng[guuid] = { "name": gdata.name }
10421 def _ComputeNodeData(cfg, node_data, node_iinfo, i_list):
10422 """Compute global node data.
10426 for nname, nresult in node_data.items():
10427 # first fill in static (config-based) values
10428 ninfo = cfg.GetNodeInfo(nname)
10430 "tags": list(ninfo.GetTags()),
10431 "primary_ip": ninfo.primary_ip,
10432 "secondary_ip": ninfo.secondary_ip,
10433 "offline": ninfo.offline,
10434 "drained": ninfo.drained,
10435 "master_candidate": ninfo.master_candidate,
10436 "group": ninfo.group,
10437 "master_capable": ninfo.master_capable,
10438 "vm_capable": ninfo.vm_capable,
10441 if not (ninfo.offline or ninfo.drained):
10442 nresult.Raise("Can't get data for node %s" % nname)
10443 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
10445 remote_info = nresult.payload
10447 for attr in ['memory_total', 'memory_free', 'memory_dom0',
10448 'vg_size', 'vg_free', 'cpu_total']:
10449 if attr not in remote_info:
10450 raise errors.OpExecError("Node '%s' didn't return attribute"
10451 " '%s'" % (nname, attr))
10452 if not isinstance(remote_info[attr], int):
10453 raise errors.OpExecError("Node '%s' returned invalid value"
10455 (nname, attr, remote_info[attr]))
10456 # compute memory used by primary instances
10457 i_p_mem = i_p_up_mem = 0
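# The loop below adds up the memory configured for instances whose primary
# node is this one, and additionally lowers the reported free memory when a
# running instance currently uses less than its configured maximum, so that
# the allocator does not treat that difference as free.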
10458 for iinfo, beinfo in i_list:
10459 if iinfo.primary_node == nname:
10460 i_p_mem += beinfo[constants.BE_MEMORY]
10461 if iinfo.name not in node_iinfo[nname].payload:
10464 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]['memory'])
10465 i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
10466 remote_info['memory_free'] -= max(0, i_mem_diff)
10469 i_p_up_mem += beinfo[constants.BE_MEMORY]
10471 # compute memory used by instances
10473 "total_memory": remote_info['memory_total'],
10474 "reserved_memory": remote_info['memory_dom0'],
10475 "free_memory": remote_info['memory_free'],
10476 "total_disk": remote_info['vg_size'],
10477 "free_disk": remote_info['vg_free'],
10478 "total_cpus": remote_info['cpu_total'],
10479 "i_pri_memory": i_p_mem,
10480 "i_pri_up_memory": i_p_up_mem,
10482 pnr.update(pnr_dyn)
10484 node_results[nname] = pnr
10486 return node_results
10489 def _ComputeInstanceData(cluster_info, i_list):
10490 """Compute global instance data.
10494 for iinfo, beinfo in i_list:
10496 for nic in iinfo.nics:
10497 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
10498 nic_dict = {"mac": nic.mac,
10500 "mode": filled_params[constants.NIC_MODE],
10501 "link": filled_params[constants.NIC_LINK],
10503 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
10504 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
10505 nic_data.append(nic_dict)
10507 "tags": list(iinfo.GetTags()),
10508 "admin_up": iinfo.admin_up,
10509 "vcpus": beinfo[constants.BE_VCPUS],
10510 "memory": beinfo[constants.BE_MEMORY],
10512 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
10514 "disks": [{"size": dsk.size, "mode": dsk.mode} for dsk in iinfo.disks],
10515 "disk_template": iinfo.disk_template,
10516 "hypervisor": iinfo.hypervisor,
10518 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
10520 instance_data[iinfo.name] = pir
10522 return instance_data
10524 def _AddNewInstance(self):
10525 """Add new instance data to allocator structure.
10527 This in combination with _ComputeClusterData will create the
10528 correct structure needed as input for the allocator.
10530 The checks for the completeness of the opcode must have already been
10534 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
10536 if self.disk_template in constants.DTS_NET_MIRROR:
10537 self.required_nodes = 2
10539 self.required_nodes = 1
10542 "disk_template": self.disk_template,
10545 "vcpus": self.vcpus,
10546 "memory": self.mem_size,
10547 "disks": self.disks,
10548 "disk_space_total": disk_space,
10550 "required_nodes": self.required_nodes,
10554 def _AddRelocateInstance(self):
10555 """Add relocate instance data to allocator structure.
10557 This in combination with _ComputeClusterData will create the
10558 correct structure needed as input for the allocator.
10560 The checks for the completeness of the opcode must have already been
10564 instance = self.cfg.GetInstanceInfo(self.name)
10565 if instance is None:
10566 raise errors.ProgrammerError("Unknown instance '%s' passed to"
10567 " IAllocator" % self.name)
10569 if instance.disk_template not in constants.DTS_NET_MIRROR:
10570 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
10571 errors.ECODE_INVAL)
10573 if len(instance.secondary_nodes) != 1:
10574 raise errors.OpPrereqError("Instance has not exactly one secondary node",
10575 errors.ECODE_STATE)
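# Relocation replaces the instance's single secondary node, so exactly one
# new node is requested from the allocator and the required disk space is
# computed from the existing disks.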
10577 self.required_nodes = 1
10578 disk_sizes = [{'size': disk.size} for disk in instance.disks]
10579 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
10583 "disk_space_total": disk_space,
10584 "required_nodes": self.required_nodes,
10585 "relocate_from": self.relocate_from,
10589 def _AddEvacuateNodes(self):
10590 """Add evacuate nodes data to allocator structure.
10594 "evac_nodes": self.evac_nodes
10598 def _BuildInputData(self, fn):
10599 """Build input data structures.
10602 self._ComputeClusterData()
10605 request["type"] = self.mode
10606 self.in_data["request"] = request
10608 self.in_text = serializer.Dump(self.in_data)
10610 def Run(self, name, validate=True, call_fn=None):
10611 """Run an instance allocator and return the results.
10614 if call_fn is None:
10615 call_fn = self.rpc.call_iallocator_runner
10617 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
10618 result.Raise("Failure while running the iallocator script")
10620 self.out_text = result.payload
10622 self._ValidateResult()
10624 def _ValidateResult(self):
10625 """Process the allocator results.
10627 This will process and, if successful, save the result in
10628 self.out_data and the other parameters.
10632 rdict = serializer.Load(self.out_text)
10633 except Exception, err:
10634 raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
10636 if not isinstance(rdict, dict):
10637 raise errors.OpExecError("Can't parse iallocator results: not a dict")
10639 # TODO: remove backwards compatibility in later versions
10640 if "nodes" in rdict and "result" not in rdict:
10641 rdict["result"] = rdict["nodes"]
10644 for key in "success", "info", "result":
10645 if key not in rdict:
10646 raise errors.OpExecError("Can't parse iallocator results:"
10647 " missing key '%s'" % key)
10648 setattr(self, key, rdict[key])
10650 if not isinstance(rdict["result"], list):
10651 raise errors.OpExecError("Can't parse iallocator results: 'result' key"
10653 self.out_data = rdict
10656 class LUTestAllocator(NoHooksLU):
10657 """Run allocator tests.
10659 This LU runs the allocator tests
10663 ("direction", ht.NoDefault,
10664 ht.TElemOf(constants.VALID_IALLOCATOR_DIRECTIONS)),
10665 ("mode", ht.NoDefault, ht.TElemOf(constants.VALID_IALLOCATOR_MODES)),
10666 ("name", ht.NoDefault, ht.TNonEmptyString),
10667 ("nics", ht.NoDefault, ht.TOr(ht.TNone, ht.TListOf(
10668 ht.TDictOf(ht.TElemOf(["mac", "ip", "bridge"]),
10669 ht.TOr(ht.TNone, ht.TNonEmptyString))))),
10670 ("disks", ht.NoDefault, ht.TOr(ht.TNone, ht.TList)),
10671 ("hypervisor", None, ht.TMaybeString),
10672 ("allocator", None, ht.TMaybeString),
10673 ("tags", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
10674 ("mem_size", None, ht.TOr(ht.TNone, ht.TPositiveInt)),
10675 ("vcpus", None, ht.TOr(ht.TNone, ht.TPositiveInt)),
10676 ("os", None, ht.TMaybeString),
10677 ("disk_template", None, ht.TMaybeString),
10678 ("evac_nodes", None, ht.TOr(ht.TNone, ht.TListOf(ht.TNonEmptyString))),
10681 def CheckPrereq(self):
10682 """Check prerequisites.
10684 This checks the opcode parameters depending on the direction and mode of the test.
10687 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
10688 for attr in ["mem_size", "disks", "disk_template",
10689 "os", "tags", "nics", "vcpus"]:
10690 if not hasattr(self.op, attr):
10691 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
10692 attr, errors.ECODE_INVAL)
10693 iname = self.cfg.ExpandInstanceName(self.op.name)
10694 if iname is not None:
10695 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
10696 iname, errors.ECODE_EXISTS)
10697 if not isinstance(self.op.nics, list):
10698 raise errors.OpPrereqError("Invalid parameter 'nics'",
10699 errors.ECODE_INVAL)
10700 if not isinstance(self.op.disks, list):
10701 raise errors.OpPrereqError("Invalid parameter 'disks'",
10702 errors.ECODE_INVAL)
10703 for row in self.op.disks:
10704 if (not isinstance(row, dict) or
10705 "size" not in row or
10706 not isinstance(row["size"], int) or
10707 "mode" not in row or
10708 row["mode"] not in ['r', 'w']):
10709 raise errors.OpPrereqError("Invalid contents of the 'disks'"
10710 " parameter", errors.ECODE_INVAL)
10711 if self.op.hypervisor is None:
10712 self.op.hypervisor = self.cfg.GetHypervisorType()
10713 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
10714 fname = _ExpandInstanceName(self.cfg, self.op.name)
10715 self.op.name = fname
10716 self.relocate_from = self.cfg.GetInstanceInfo(fname).secondary_nodes
10717 elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
10718 if not hasattr(self.op, "evac_nodes"):
10719 raise errors.OpPrereqError("Missing attribute 'evac_nodes' on"
10720 " opcode input", errors.ECODE_INVAL)
10722 raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
10723 self.op.mode, errors.ECODE_INVAL)
10725 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
10726 if self.op.allocator is None:
10727 raise errors.OpPrereqError("Missing allocator name",
10728 errors.ECODE_INVAL)
10729 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
10730 raise errors.OpPrereqError("Wrong allocator test '%s'" %
10731 self.op.direction, errors.ECODE_INVAL)
10733 def Exec(self, feedback_fn):
10734 """Run the allocator test.
10737 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
10738 ial = IAllocator(self.cfg, self.rpc,
10741 mem_size=self.op.mem_size,
10742 disks=self.op.disks,
10743 disk_template=self.op.disk_template,
10747 vcpus=self.op.vcpus,
10748 hypervisor=self.op.hypervisor,
10750 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
10751 ial = IAllocator(self.cfg, self.rpc,
10754 relocate_from=list(self.relocate_from),
10756 elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
10757 ial = IAllocator(self.cfg, self.rpc,
10759 evac_nodes=self.op.evac_nodes)
10761 raise errors.ProgrammerError("Uncatched mode %s in"
10762 " LUTestAllocator.Exec", self.op.mode)
10764 if self.op.direction == constants.IALLOCATOR_DIR_IN:
10765 result = ial.in_text
10767 ial.Run(self.op.allocator, validate=False)
10768 result = ial.out_text