# Copyright (C) 2006, 2007, 2008, 2009, 2010 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Module implementing the master-side code."""

# pylint: disable-msg=W0201,C0302

# W0201 since most LU attributes are defined in CheckPrereq or similar
# functions

# C0302: since we have waaaay too many lines in this module

import logging
import copy

import OpenSSL

from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import hypervisor
from ganeti import locking
from ganeti import constants
from ganeti import objects
from ganeti import serializer
from ganeti import ssconf
from ganeti import uidpool
from ganeti import compat
from ganeti import masterd
from ganeti import netutils
from ganeti import ht

import ganeti.masterd.instance # pylint: disable-msg=W0611


# Common opcode attributes

#: output fields for a query operation
_POutputFields = ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString))

#: the shutdown timeout
_PShutdownTimeout = ("shutdown_timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT,
                     ht.TPositiveInt)

#: the force parameter
_PForce = ("force", False, ht.TBool)

#: a required instance name (for single-instance LUs)
_PInstanceName = ("instance_name", ht.NoDefault, ht.TNonEmptyString)

#: whether to ignore offline nodes
_PIgnoreOfflineNodes = ("ignore_offline_nodes", False, ht.TBool)

#: a required node name (for single-node LUs)
_PNodeName = ("node_name", ht.NoDefault, ht.TNonEmptyString)

#: the migration type (live/non-live)
_PMigrationMode = ("mode", None,
                   ht.TOr(ht.TNone, ht.TElemOf(constants.HT_MIGRATION_MODES)))

#: the obsolete 'live' migration mode (boolean)
_PMigrationLive = ("live", None, ht.TMaybeBool)
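

# Illustrative sketch (not part of the original module): an LU combines these
# shared attribute tuples with its own entries in its _OP_PARAMS class
# variable; "LUExampleShutdown" is a hypothetical name used only for the
# example.
#
#   class LUExampleShutdown(LogicalUnit):
#     _OP_PARAMS = [
#       _PInstanceName,
#       _PForce,
#       _PIgnoreOfflineNodes,
#       ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, ht.TPositiveInt),
#       ]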


class LogicalUnit(object):
  """Logical Unit base class.

  Subclasses must follow these rules:
    - implement ExpandNames
    - implement CheckPrereq (except when tasklets are used)
    - implement Exec (except when tasklets are used)
    - implement BuildHooksEnv
    - redefine HPATH and HTYPE
    - optionally redefine their run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  Note that all commands require root permissions.

  @ivar dry_run_result: the value (if any) that will be returned to the caller
      in dry-run mode (signalled by the opcode dry_run parameter)
  @cvar _OP_PARAMS: a list of opcode attributes, the default values
      they should get if not already defined, and the types they must match

  """
  HPATH = None
  HTYPE = None
  _OP_PARAMS = []
  REQ_BGL = True

  def __init__(self, processor, op, context, rpc):
    """Constructor for LogicalUnit.

    This needs to be overridden in derived classes in order to check op
    validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.context = context
    self.rpc = rpc
    # Dicts used to declare locking needs to mcpu
    self.needed_locks = None
    self.acquired_locks = {}
    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
    self.add_locks = {}
    self.remove_locks = {}
    # Used to force good behavior when calling helper functions
    self.recalculate_locks = {}
    self.__ssh = None
    # logging
    self.Log = processor.Log # pylint: disable-msg=C0103
    self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
    self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
    self.LogStep = processor.LogStep # pylint: disable-msg=C0103
    # support for dry-run
    self.dry_run_result = None
    # support for generic debug attribute
    if (not hasattr(self.op, "debug_level") or
        not isinstance(self.op.debug_level, int)):
      self.op.debug_level = 0

    # Tasklets
    self.tasklets = None

    # The new kind-of-type-system
    op_id = self.op.OP_ID
    for attr_name, aval, test in self._OP_PARAMS:
      if not hasattr(op, attr_name):
        if aval == ht.NoDefault:
          raise errors.OpPrereqError("Required parameter '%s.%s' missing" %
                                     (op_id, attr_name), errors.ECODE_INVAL)
        else:
          if callable(aval):
            dval = aval()
          else:
            dval = aval
          setattr(self.op, attr_name, dval)
      attr_val = getattr(op, attr_name)
      if test == ht.NoType:
        # no tests here
        continue
      if not callable(test):
        raise errors.ProgrammerError("Validation for parameter '%s.%s' failed,"
                                     " given type is not a proper type (%s)" %
                                     (op_id, attr_name, test))
      if not test(attr_val):
        logging.error("OpCode %s, parameter %s, has invalid type %s/value %s",
                      self.op.OP_ID, attr_name, type(attr_val), attr_val)
        raise errors.OpPrereqError("Parameter '%s.%s' fails validation" %
                                   (op_id, attr_name), errors.ECODE_INVAL)

    self.CheckArguments()
182 """Returns the SshRunner object
186 self.__ssh = ssh.SshRunner(self.cfg.GetClusterName())
189 ssh = property(fget=__GetSSH)

  def CheckArguments(self):
    """Check syntactic validity for the opcode arguments.

    This method is for doing a simple syntactic check and ensuring
    validity of opcode parameters, without any cluster-related
    checks. While the same can be accomplished in ExpandNames and/or
    CheckPrereq, doing these separately is better because:

      - ExpandNames is left purely as a lock-related function
      - CheckPrereq is run after we have acquired locks (and possibly
        waited for them)

    The function is allowed to change the self.op attribute so that
    later methods need no longer worry about missing parameters.

    """

  def ExpandNames(self):
    """Expand names for this LU.

    This method is called before starting to execute the opcode, and it should
    update all the parameters of the opcode to their canonical form (e.g. a
    short node name must be fully expanded after this method has successfully
    completed). This way locking, hooks, logging, etc. can work correctly.

    LUs which implement this method must also populate the self.needed_locks
    member, as a dict with lock levels as keys, and a list of needed lock names
    as values. Rules:

      - use an empty dict if you don't need any lock
      - if you don't need any lock at a particular level omit that level
      - don't put anything for the BGL level
      - if you want all locks at a level use locking.ALL_SET as a value

    If you need to share locks (rather than acquire them exclusively) at one
    level you can modify self.share_locks, setting a true value (usually 1) for
    that level. By default locks are not shared.

    This function can also define a list of tasklets, which then will be
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.com'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # The implementation of this method is mandatory only if the new LU is
    # concurrent, so that old LUs don't need to be changed all at the same
    # time.
    if self.REQ_BGL:
      self.needed_locks = {} # Exclusive LUs don't need locks.
    else:
      raise NotImplementedError

  def DeclareLocks(self, level):
    """Declare LU locking needs for a level.

    While most LUs can just declare their locking needs at ExpandNames time,
    sometimes there's the need to calculate some locks after having acquired
    the ones before. This function is called just before acquiring locks at a
    particular level, but after acquiring the ones at lower levels, and permits
    such calculations. It can be used to modify self.needed_locks, and by
    default it does nothing.

    This function is only called if you have something already set in
    self.needed_locks for the level.

    @param level: Locking level which is going to be locked
    @type level: member of ganeti.locking.LEVELS

    """

  def CheckPrereq(self):
    """Check prerequisites for this LU.

    This method should check that the prerequisites for the execution
    of this LU are fulfilled. It can do internode communication, but
    it should be idempotent - no cluster or system changes are
    allowed.

    The method should raise errors.OpPrereqError in case something is
    not fulfilled. Its return value is ignored.

    This method should also update all the parameters of the opcode to
    their canonical form if it hasn't been done by ExpandNames before.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Checking prerequisites for tasklet %s/%s",
                      idx + 1, len(self.tasklets))
        tl.CheckPrereq()

  def Exec(self, feedback_fn):
    """Execute the LU.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in
    code, or expected.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
        tl.Exec(feedback_fn)
    else:
      raise NotImplementedError

  def BuildHooksEnv(self):
    """Build hooks environment for this LU.

    This method should return a three-element tuple consisting of: a dict
    containing the environment that will be used for running the
    specific hook for this LU, a list of node names on which the hook
    should run before the execution, and a list of node names on which
    the hook should run after the execution.

    The keys of the dict must not be prefixed with 'GANETI_', as this
    will be handled in the hooks runner. Also note that additional keys
    will be added by the hooks runner. If the LU doesn't define any
    environment, an empty dict (and not None) should be returned.

    No nodes should be returned as an empty list (and not None).

    Note that if the HPATH for a LU class is None, this function will
    not be called.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    This method is called every time a hooks phase is executed, and notifies
    the Logical Unit about the hooks' result. The LU can then use it to alter
    its result based on the hooks. By default the method does nothing and the
    previous result is passed back unchanged but any LU can define it if it
    wants to use the local cluster hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None
        in the PRE phase
    @return: the new Exec result, based on the previous result
        and hook results

    """
    # API must be kept, thus we ignore the unused argument and the
    # 'could be a function' warnings
    # pylint: disable-msg=W0613,R0201
    return lu_result

  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs that work on an instance take its name in self.op.instance_name
    and need to expand it and then declare the expanded name for locking. This
    function does it, and then updates self.op.instance_name to the expanded
    name. It also initializes needed_locks as a dict, if this hasn't been
    done before.

    """
    if self.needed_locks is None:
      self.needed_locks = {}
    else:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    self.op.instance_name = _ExpandInstanceName(self.cfg,
                                                self.op.instance_name)
    self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name

  def _LockInstancesNodes(self, primary_only=False):
    """Helper function to declare instances' nodes for locking.

    This function should be called after locking one or more instances to lock
    their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
    with all primary or secondary nodes for instances already locked and
    present in self.needed_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to lock only some instances' nodes,
    or to lock only primary or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances

    """
    assert locking.LEVEL_NODE in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we've really been called with the instance locks held

    # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
    # future we might want to have different behaviors depending on the value
    # of self.recalculate_locks[locking.LEVEL_NODE]
    wanted_nodes = []
    for instance_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
      instance = self.context.cfg.GetInstanceInfo(instance_name)
      wanted_nodes.append(instance.primary_node)
      if not primary_only:
        wanted_nodes.extend(instance.secondary_nodes)

    if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
      self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
    elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
      self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)

    del self.recalculate_locks[locking.LEVEL_NODE]
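

# Illustrative sketch (not from the original module): a typical LU first
# declares the instance lock in ExpandNames and then recalculates the node
# locks in DeclareLocks; "LUExampleInstanceOp" is a hypothetical name.
#
#   class LUExampleInstanceOp(LogicalUnit):
#     def ExpandNames(self):
#       self._ExpandAndLockInstance()
#       self.needed_locks[locking.LEVEL_NODE] = []
#       self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
#
#     def DeclareLocks(self, level):
#       if level == locking.LEVEL_NODE:
#         self._LockInstancesNodes()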


class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLu.

    This just raises an error.

    """
    assert False, "BuildHooksEnv called for NoHooksLUs"
444 """Tasklet base class.
446 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
447 they can mix legacy code with tasklets. Locking needs to be done in the LU,
448 tasklets know nothing about locks.
450 Subclasses must follow these rules:
451 - Implement CheckPrereq
455 def __init__(self, lu):
462 def CheckPrereq(self):
463 """Check prerequisites for this tasklets.
465 This method should check whether the prerequisites for the execution of
466 this tasklet are fulfilled. It can do internode communication, but it
467 should be idempotent - no cluster or system changes are allowed.
469 The method should raise errors.OpPrereqError in case something is not
470 fulfilled. Its return value is ignored.
472 This method should also update all parameters to their canonical form if it
473 hasn't been done before.
478 def Exec(self, feedback_fn):
479 """Execute the tasklet.
481 This method should implement the actual work. It should raise
482 errors.OpExecError for failures that are somewhat dealt with in code, or
486 raise NotImplementedError
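

# Illustrative sketch (not from the original module): an LU can delegate its
# work to tasklets by assigning self.tasklets in ExpandNames, after which the
# base LogicalUnit.CheckPrereq and Exec iterate over them automatically;
# "ExampleTasklet" is a hypothetical name.
#
#   def ExpandNames(self):
#     ...
#     self.tasklets = [ExampleTasklet(self), ExampleTasklet(self)]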


def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: list of node names or None for all nodes
  @rtype: list
  @return: the list of nodes, sorted
  @raise errors.ProgrammerError: if the nodes parameter is wrong type

  """
  if not nodes:
    raise errors.ProgrammerError("_GetWantedNodes should only be called with a"
                                 " non-empty list of nodes whose names are to"
                                 " be expanded.")

  wanted = [_ExpandNodeName(lu.cfg, name) for name in nodes]
  return utils.NiceSort(wanted)


def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names or None for all instances
  @rtype: list
  @return: the list of instances, sorted
  @raise errors.OpPrereqError: if the instances parameter is wrong type
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if instances:
    wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
  else:
    wanted = utils.NiceSort(lu.cfg.GetInstanceList())
  return wanted


def _GetUpdatedParams(old_params, update_dict,
                      use_default=True, use_none=False):
  """Return the new version of a parameter dictionary.

  @type old_params: dict
  @param old_params: old parameters
  @type update_dict: dict
  @param update_dict: dict containing new parameter values, or
      constants.VALUE_DEFAULT to reset the parameter to its default
      value
  @type use_default: boolean
  @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
      values as 'to be deleted' values
  @type use_none: boolean
  @param use_none: whether to recognise C{None} values as 'to be
      deleted' values
  @rtype: dict
  @return: the new parameter dictionary

  """
  params_copy = copy.deepcopy(old_params)
  for key, val in update_dict.iteritems():
    if ((use_default and val == constants.VALUE_DEFAULT) or
        (use_none and val is None)):
      try:
        del params_copy[key]
      except KeyError:
        pass
    else:
      params_copy[key] = val
  return params_copy
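

# Usage sketch (hypothetical values, not from a real cluster configuration):
# entries set to constants.VALUE_DEFAULT are removed so the cluster-level
# default shows through again, while other keys are overwritten or added.
#
#   old = {"kernel_path": "/boot/vmlinuz", "root_path": "/dev/sda1"}
#   upd = {"kernel_path": constants.VALUE_DEFAULT, "serial_console": True}
#   _GetUpdatedParams(old, upd)
#   => {"root_path": "/dev/sda1", "serial_console": True}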


def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set
  @type selected: list
  @param selected: the list of fields to check
  @raise errors.OpPrereqError: if any selected field is not valid

  """
  f = static
  f.Extend(dynamic)

  delta = f.NonMatching(selected)
  if delta:
    raise errors.OpPrereqError("Unknown output fields selected: %s"
                               % ",".join(delta), errors.ECODE_INVAL)


def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.

  This will ensure that instances don't get customised versions of
  global params.

  """
  used_globals = constants.HVC_GLOBALS.intersection(params)
  if used_globals:
    msg = ("The following hypervisor parameters are global and cannot"
           " be customized at instance level, please modify them at"
           " cluster level: %s" % utils.CommaJoin(used_globals))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)


def _CheckNodeOnline(lu, node):
  """Ensure that a given node is online.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is offline

  """
  if lu.cfg.GetNodeInfo(node).offline:
    raise errors.OpPrereqError("Can't use offline node %s" % node,
                               errors.ECODE_INVAL)


def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  if lu.cfg.GetNodeInfo(node).drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_INVAL)


def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node is not supporting the OS

  """
  result = lu.rpc.call_os_get(node, os_name)
  result.Raise("OS '%s' not in supported OS list for node %s" %
               (os_name, node),
               prereq=True, ecode=errors.ECODE_INVAL)
  if not force_variant:
    _CheckOSVariant(result.payload, os_name)


def _RequireFileStorage():
  """Checks that file storage is enabled.

  @raise errors.OpPrereqError: when file storage is disabled

  """
  if not constants.ENABLE_FILE_STORAGE:
    raise errors.OpPrereqError("File storage disabled at configure time",
                               errors.ECODE_INVAL)


def _CheckDiskTemplate(template):
  """Ensure a given disk template is valid.

  """
  if template not in constants.DISK_TEMPLATES:
    msg = ("Invalid disk template name '%s', valid templates are: %s" %
           (template, utils.CommaJoin(constants.DISK_TEMPLATES)))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
  if template == constants.DT_FILE:
    _RequireFileStorage()
  return True


def _CheckStorageType(storage_type):
  """Ensure a given storage type is valid.

  """
  if storage_type not in constants.VALID_STORAGE_TYPES:
    raise errors.OpPrereqError("Unknown storage type: %s" % storage_type,
                               errors.ECODE_INVAL)
  if storage_type == constants.ST_FILE:
    _RequireFileStorage()
  return True


def _GetClusterDomainSecret():
  """Reads the cluster domain secret.

  """
  return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
                               strict=True)


def _CheckInstanceDown(lu, instance, reason):
  """Ensure that an instance is not running."""
  if instance.admin_up:
    raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
                               (instance.name, reason), errors.ECODE_STATE)

  pnode = instance.primary_node
  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
  ins_l.Raise("Can't contact node %s for instance information" % pnode,
              prereq=True, ecode=errors.ECODE_ENVIRON)

  if instance.name in ins_l.payload:
    raise errors.OpPrereqError("Instance %s is running, %s" %
                               (instance.name, reason), errors.ECODE_STATE)


def _ExpandItemName(fn, name, kind):
  """Expand an item name.

  @param fn: the function to use for expansion
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  full_name = fn(name)
  if full_name is None:
    raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                               errors.ECODE_NOENT)
  return full_name


def _ExpandNodeName(cfg, name):
  """Wrapper over L{_ExpandItemName} for nodes."""
  return _ExpandItemName(cfg.ExpandNodeName, name, "Node")


def _ExpandInstanceName(cfg, name):
  """Wrapper over L{_ExpandItemName} for instances."""
  return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
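

# Usage sketch (hypothetical names): the wrappers turn a short name into the
# canonical, fully-qualified one known to the configuration, or raise
# errors.OpPrereqError if the item does not exist.
#
#   _ExpandNodeName(self.cfg, "node1")
#   => "node1.example.com"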


def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          memory, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name):
  """Builds instance related env variables for hooks.

  This builds the hook environment from individual variables.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: boolean
  @param status: the should_run status of the instance
  @param memory: the memory size of the instance
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @rtype: dict
  @return: the hook environment for this instance

  """
  if status:
    str_status = "up"
  else:
    str_status = "down"
  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": str_status,
    "INSTANCE_MEMORY": memory,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
  }

  if nics:
    nic_count = len(nics)
    for idx, (ip, mac, mode, link) in enumerate(nics):
      if ip is None:
        ip = ""
      env["INSTANCE_NIC%d_IP" % idx] = ip
      env["INSTANCE_NIC%d_MAC" % idx] = mac
      env["INSTANCE_NIC%d_MODE" % idx] = mode
      env["INSTANCE_NIC%d_LINK" % idx] = link
      if mode == constants.NIC_MODE_BRIDGED:
        env["INSTANCE_NIC%d_BRIDGE" % idx] = link
  else:
    nic_count = 0

  env["INSTANCE_NIC_COUNT"] = nic_count

  if disks:
    disk_count = len(disks)
    for idx, (size, mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % idx] = size
      env["INSTANCE_DISK%d_MODE" % idx] = mode
  else:
    disk_count = 0

  env["INSTANCE_DISK_COUNT"] = disk_count

  for source, kind in [(bep, "BE"), (hvp, "HV")]:
    for key, value in source.items():
      env["INSTANCE_%s_%s" % (kind, key)] = value

  return env


def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.

  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
  value in LUQueryInstanceData.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples

  """
  hooks_nics = []
  cluster = lu.cfg.GetClusterInfo()
  for nic in nics:
    ip = nic.ip
    mac = nic.mac
    filled_params = cluster.SimpleFillNIC(nic.nicparams)
    mode = filled_params[constants.NIC_MODE]
    link = filled_params[constants.NIC_LINK]
    hooks_nics.append((ip, mac, mode, link))
  return hooks_nics


def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)
  args = {
    'name': instance.name,
    'primary_node': instance.primary_node,
    'secondary_nodes': instance.secondary_nodes,
    'os_type': instance.os,
    'status': instance.admin_up,
    'memory': bep[constants.BE_MEMORY],
    'vcpus': bep[constants.BE_VCPUS],
    'nics': _NICListToTuple(lu, instance.nics),
    'disk_template': instance.disk_template,
    'disks': [(disk.size, disk.mode) for disk in instance.disks],
    'bep': bep,
    'hvp': hvp,
    'hypervisor_name': instance.hypervisor,
  }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args) # pylint: disable-msg=W0142


def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  """
  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
  if mod_list:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in mod_list))
    for name in mod_list:
      lu.context.ReaddNode(name)
  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))


def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  """
  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  # the new node will increase mc_max with one, so:
  mc_should = min(mc_should + 1, cp_size)
  return mc_now < mc_should
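

# Worked example (hypothetical numbers): with candidate_pool_size = 10 and
# three current master candidates, adding this node gives
# mc_should = min(3 + 1, 10) = 4; since 3 < 4, the node promotes itself.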


def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Check that the bridges needed by a list of nics exist.

  """
  cluster = lu.cfg.GetClusterInfo()
  paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
  brlist = [params[constants.NIC_LINK] for params in paramslist
            if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
  if brlist:
    result = lu.rpc.call_bridges_exist(target_node, brlist)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)


def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  """
  if node is None:
    node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, node)


def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity

  """
  if not os_obj.supported_variants:
    return
  variant = objects.OS.GetVariant(name)
  if not variant:
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if variant not in os_obj.supported_variants:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)


def _GetNodeInstancesInner(cfg, fn):
  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]


def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)


def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name == inst.primary_node)


def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name in inst.secondary_nodes)


def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  """
  # Special case for file storage
  if storage_type == constants.ST_FILE:
    # storage.FileStorage wants a list of storage directories
    return [[cfg.GetFileStorageDir()]]

  return []


def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
  faulty = []

  for dev in instance.disks:
    cfg.SetDiskID(dev, node_name)

  result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
  result.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  for idx, bdev_status in enumerate(result.payload):
    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
      faulty.append(idx)

  return faulty


def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
  """Check the sanity of iallocator and node arguments and use the
  cluster-wide iallocator if appropriate.

  Check that at most one of (iallocator, node) is specified. If none is
  specified, then the LU's opcode's iallocator slot is filled with the
  cluster-wide default iallocator.

  @type iallocator_slot: string
  @param iallocator_slot: the name of the opcode iallocator slot
  @type node_slot: string
  @param node_slot: the name of the opcode target node slot

  """
  node = getattr(lu.op, node_slot, None)
  iallocator = getattr(lu.op, iallocator_slot, None)

  if node is not None and iallocator is not None:
    raise errors.OpPrereqError("Do not specify both, iallocator and node.",
                               errors.ECODE_INVAL)
  elif node is None and iallocator is None:
    default_iallocator = lu.cfg.GetDefaultIAllocator()
    if default_iallocator:
      setattr(lu.op, iallocator_slot, default_iallocator)
    else:
      raise errors.OpPrereqError("No iallocator or node given and no"
                                 " cluster-wide default iallocator found."
                                 " Please specify either an iallocator or a"
                                 " node, or set a cluster-wide default"
                                 " iallocator.")


class LUPostInitCluster(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {"OP_TARGET": self.cfg.GetClusterName()}
    mn = self.cfg.GetMasterNode()
    return env, [], [mn]

  def Exec(self, feedback_fn):
    """Nothing to do.

    """
    return True


class LUDestroyCluster(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {"OP_TARGET": self.cfg.GetClusterName()}
    return env, [], []

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodelist = self.cfg.GetNodeList()
    if len(nodelist) != 1 or nodelist[0] != master:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodelist) - 1),
                                 errors.ECODE_INVAL)
    instancelist = self.cfg.GetInstanceList()
    if instancelist:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instancelist),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    """
    master = self.cfg.GetMasterNode()

    # Run post hooks on master node before it's removed
    hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
    try:
      hm.RunPhase(constants.HOOKS_PHASE_POST, [master])
    except:
      # pylint: disable-msg=W0702
      self.LogWarning("Errors occurred running hooks on %s" % master)

    result = self.rpc.call_node_stop_master(master, False)
    result.Raise("Could not disable the master role")

    return master


def _VerifyCertificate(filename):
  """Verifies a certificate for LUVerifyCluster.

  @type filename: string
  @param filename: Path to PEM file

  """
  try:
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                           utils.ReadFile(filename))
  except Exception, err: # pylint: disable-msg=W0703
    return (LUVerifyCluster.ETYPE_ERROR,
            "Failed to load X509 certificate %s: %s" % (filename, err))

  (errcode, msg) = \
    utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
                                constants.SSL_CERT_EXPIRATION_ERROR)

  if msg:
    fnamemsg = "While verifying %s: %s" % (filename, msg)
  else:
    fnamemsg = msg

  if errcode is None:
    return (None, fnamemsg)
  elif errcode == utils.CERT_WARNING:
    return (LUVerifyCluster.ETYPE_WARNING, fnamemsg)
  elif errcode == utils.CERT_ERROR:
    return (LUVerifyCluster.ETYPE_ERROR, fnamemsg)

  raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)


class LUVerifyCluster(LogicalUnit):
  """Verifies the cluster status.

  """
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_PARAMS = [
    ("skip_checks", ht.EmptyList,
     ht.TListOf(ht.TElemOf(constants.VERIFY_OPTIONAL_CHECKS))),
    ("verbose", False, ht.TBool),
    ("error_codes", False, ht.TBool),
    ("debug_simulate_errors", False, ht.TBool),
    ]
  REQ_BGL = False

  TCLUSTER = "cluster"
  TNODE = "node"
  TINSTANCE = "instance"

  ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
  ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
  EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
  EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
  EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
  EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
  EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
  ENODEDRBD = (TNODE, "ENODEDRBD")
  ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
  ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
  ENODEHOOKS = (TNODE, "ENODEHOOKS")
  ENODEHV = (TNODE, "ENODEHV")
  ENODELVM = (TNODE, "ENODELVM")
  ENODEN1 = (TNODE, "ENODEN1")
  ENODENET = (TNODE, "ENODENET")
  ENODEOS = (TNODE, "ENODEOS")
  ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
  ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
  ENODERPC = (TNODE, "ENODERPC")
  ENODESSH = (TNODE, "ENODESSH")
  ENODEVERSION = (TNODE, "ENODEVERSION")
  ENODESETUP = (TNODE, "ENODESETUP")
  ENODETIME = (TNODE, "ENODETIME")

  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"

  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type name: string
    @ivar name: the node name to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dict of {secondary-node: list of instances} for all peers
        of this node (config)
    @ivar mfree: free memory, as reported by the hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call failed (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @type oslist: list
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS

    """
    def __init__(self, offline=False, name=None):
      self.name = name
      self.volumes = {}
      self.instances = []
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}

  def ExpandNames(self):
    self.needed_locks = {
      locking.LEVEL_NODE: locking.ALL_SET,
      locking.LEVEL_INSTANCE: locking.ALL_SET,
    }
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt = ecode
    # first complete the msg
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes:
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      if item:
        item = " " + item
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # and finally report it via the feedback_fn
    self._feedback_fn("  - %s" % msg)

  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    cond = bool(cond) or self.op.debug_simulate_errors
    if cond:
      self._Error(*args, **kwargs)
    # do not mark the operation as failed for WARN cases only
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
      self.bad = self.bad or cond
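
  # Output sketch (hypothetical values): with error_codes enabled, a message
  # is rendered machine-parseable, e.g.
  #   ERROR:ENODELVM:node:node1.example.com:unable to check volume groups
  # while the default human-readable form would be
  #   ERROR: node node1.example.com: unable to check volume groups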

  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the response)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    _ErrorIf(test, self.ENODERPC, node,
             "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    _ErrorIf(test, self.ENODERPC, node,
             "connection to node returned invalid data")
    if test:
      return False

    test = local_version != remote_version[0]
    _ErrorIf(test, self.ENODEVERSION, node,
             "incompatible protocol versions: master %s,"
             " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  self.ENODEVERSION, node,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        test = hv_result is not None
        _ErrorIf(test, self.ENODEHV, node,
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
             "; ".join(test))

    return True

  def _VerifyNodeTime(self, ninfo, nresult,
                      nvinfo_starttime, nvinfo_endtime):
    """Check the node time.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nvinfo_starttime: the start time of the RPC call
    @param nvinfo_endtime: the end time of the RPC call

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    ntime = nresult.get(constants.NV_TIME, None)
    try:
      ntime_merged = utils.MergeTime(ntime)
    except (ValueError, TypeError):
      _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
      return

    if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
    elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
    else:
      ntime_diff = None

    _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
             "Node time diverges by at least %s from master node time",
             ntime_diff)

  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
    """Check the node LVM data.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param vg_name: the configured VG name

    """
    if vg_name is None:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # checks vg existence and size > 20G
    vglist = nresult.get(constants.NV_VGLIST, None)
    test = not vglist
    _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
    if not test:
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                            constants.MIN_VG_SIZE)
      _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)

    # check pv names
    pvlist = nresult.get(constants.NV_PVLIST, None)
    test = pvlist is None
    _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
    if not test:
      # check that ':' is not present in PV names, since it's a
      # special character for lvcreate (denotes the range of PEs to
      # use on the PV)
      for _, pvname, owner_vg in pvlist:
        test = ":" in pvname
        _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
                 " '%s' of VG '%s'", pvname, owner_vg)

  def _VerifyNodeNetwork(self, ninfo, nresult):
    """Check the node network connectivity.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    test = constants.NV_NODELIST not in nresult
    _ErrorIf(test, self.ENODESSH, node,
             "node hasn't returned node ssh connectivity data")
    if not test:
      if nresult[constants.NV_NODELIST]:
        for a_node, a_msg in nresult[constants.NV_NODELIST].items():
          _ErrorIf(True, self.ENODESSH, node,
                   "ssh communication with node '%s': %s", a_node, a_msg)

    test = constants.NV_NODENETTEST not in nresult
    _ErrorIf(test, self.ENODENET, node,
             "node hasn't returned node tcp connectivity data")
    if not test:
      if nresult[constants.NV_NODENETTEST]:
        nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
        for anode in nlist:
          _ErrorIf(True, self.ENODENET, node,
                   "tcp communication with node '%s': %s",
                   anode, nresult[constants.NV_NODENETTEST][anode])

    test = constants.NV_MASTERIP not in nresult
    _ErrorIf(test, self.ENODENET, node,
             "node hasn't returned node master IP reachability data")
    if not test:
      if not nresult[constants.NV_MASTERIP]:
        if node == self.master_node:
          msg = "the master node cannot reach the master IP (not configured?)"
        else:
          msg = "cannot reach the master IP"
        _ErrorIf(True, self.ENODENET, node, msg)

  def _VerifyInstance(self, instance, instanceconfig, node_image):
    """Verify an instance.

    This function checks to see if the required block devices are
    available on the instance's node.

    """
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
    node_current = instanceconfig.primary_node

    node_vol_should = {}
    instanceconfig.MapLVsByNode(node_vol_should)

    for node in node_vol_should:
      n_img = node_image[node]
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # ignore missing volumes on offline or broken nodes
        continue
      for volume in node_vol_should[node]:
        test = volume not in n_img.volumes
        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
                 "volume %s missing on node %s", volume, node)

    if instanceconfig.admin_up:
      pri_img = node_image[node_current]
      test = instance not in pri_img.instances and not pri_img.offline
      _ErrorIf(test, self.EINSTANCEDOWN, instance,
               "instance not running on its primary node %s",
               node_current)

    for node, n_img in node_image.items():
      if node != node_current:
        test = instance in n_img.instances
        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
                 "instance should not run on node %s", node)

  def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
    """Verify if there are any unknown volumes in the cluster.

    The .os, .swap and backup volumes are ignored. All other volumes are
    reported as unknown.

    @type reserved: L{ganeti.utils.FieldSet}
    @param reserved: a FieldSet of reserved volume names

    """
    for node, n_img in node_image.items():
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # skip non-healthy nodes
        continue
      for volume in n_img.volumes:
        test = ((node not in node_vol_should or
                 volume not in node_vol_should[node]) and
                not reserved.Matches(volume))
        self._ErrorIf(test, self.ENODEORPHANLV, node,
                      "volume %s is unknown", volume)

  def _VerifyOrphanInstances(self, instancelist, node_image):
    """Verify the list of running instances.

    This checks what instances are running but unknown to the cluster.

    """
    for node, n_img in node_image.items():
      for o_inst in n_img.instances:
        test = o_inst not in instancelist
        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
                      "instance %s on node %s should not exist", o_inst, node)

  def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the
    instances it was primary for.

    """
    for node, n_img in node_image.items():
      # This code checks that every node which is now listed as
      # secondary has enough memory to host all instances it is
      # supposed to should a single other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
      for prinode, instances in n_img.sbp.items():
        needed_mem = 0
        for instance in instances:
          bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
          if bep[constants.BE_AUTO_BALANCE]:
            needed_mem += bep[constants.BE_MEMORY]
        test = n_img.mfree < needed_mem
        self._ErrorIf(test, self.ENODEN1, node,
                      "not enough memory on this node to accommodate"
                      " failovers should peer node %s fail", prinode)

  def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
                       master_files):
    """Verifies and computes the node required file checksums.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param file_list: required list of files
    @param local_cksum: dictionary of local files and their checksums
    @param master_files: list of files that only masters should have

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    remote_cksum = nresult.get(constants.NV_FILELIST, None)
    test = not isinstance(remote_cksum, dict)
    _ErrorIf(test, self.ENODEFILECHECK, node,
             "node hasn't returned file checksum data")
    if test:
      return

    for file_name in file_list:
      node_is_mc = ninfo.master_candidate
      must_have = (file_name not in master_files) or node_is_mc
      # missing
      test1 = file_name not in remote_cksum
      # payload mismatch
      test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
      # existing and correct
      test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
      _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
               "file '%s' missing", file_name)
      _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
               "file '%s' has wrong checksum", file_name)
      # not candidate and this is not a must-have file
      _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
               "file '%s' should not exist on non master"
               " candidates (and the file is outdated)", file_name)
      # all good, except non-master/non-must have combination
      _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
               "file '%s' should not exist"
               " on non master candidates", file_name)

  def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
                      drbd_map):
    """Verifies the node DRBD status.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param instanceinfo: the dict of instances
    @param drbd_helper: the configured DRBD usermode helper
    @param drbd_map: the DRBD map as returned by
        L{ganeti.config.ConfigWriter.ComputeDRBDMap}

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    if drbd_helper:
      helper_result = nresult.get(constants.NV_DRBDHELPER, None)
      test = (helper_result is None)
      _ErrorIf(test, self.ENODEDRBDHELPER, node,
               "no drbd usermode helper returned")
      if helper_result:
        status, payload = helper_result
        test = not status
        _ErrorIf(test, self.ENODEDRBDHELPER, node,
                 "drbd usermode helper check unsuccessful: %s", payload)
        test = status and (payload != drbd_helper)
        _ErrorIf(test, self.ENODEDRBDHELPER, node,
                 "wrong drbd usermode helper: %s", payload)

    # compute the DRBD minors
    node_drbd = {}
    for minor, instance in drbd_map[node].items():
      test = instance not in instanceinfo
      _ErrorIf(test, self.ECLUSTERCFG, None,
               "ghost instance '%s' in temporary DRBD map", instance)
      # ghost instance should not be running, but otherwise we
      # don't give double warnings (both ghost instance and
      # unallocated minor in use)
      if test:
        node_drbd[minor] = (instance, False)
      else:
        instance = instanceinfo[instance]
        node_drbd[minor] = (instance.name, instance.admin_up)

    # and now check them
    used_minors = nresult.get(constants.NV_DRBDLIST, [])
    test = not isinstance(used_minors, (tuple, list))
    _ErrorIf(test, self.ENODEDRBD, node,
             "cannot parse drbd status file: %s", str(used_minors))
    if test:
      # we cannot check drbd status
      return

    for minor, (iname, must_exist) in node_drbd.items():
      test = minor not in used_minors and must_exist
      _ErrorIf(test, self.ENODEDRBD, node,
               "drbd minor %d of instance %s is not active", minor, iname)
    for minor in used_minors:
      test = minor not in node_drbd
      _ErrorIf(test, self.ENODEDRBD, node,
               "unallocated drbd minor %d is in use", minor)

  def _UpdateNodeOS(self, ninfo, nresult, nimg):
    """Builds the node OS structures.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    remote_os = nresult.get(constants.NV_OSLIST, None)
    test = (not isinstance(remote_os, list) or
            not compat.all(isinstance(v, list) and len(v) == 7
                           for v in remote_os))

    _ErrorIf(test, self.ENODEOS, node,
             "node hasn't returned valid OS data")

    nimg.os_fail = test
    if test:
      return

    os_dict = {}

    for (name, os_path, status, diagnose,
         variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:

      if name not in os_dict:
        os_dict[name] = []

      # parameters is a list of lists instead of list of tuples due to
      # JSON lacking a real tuple type, fix it:
      parameters = [tuple(v) for v in parameters]
      os_dict[name].append((os_path, status, diagnose,
                            set(variants), set(parameters), set(api_ver)))

    nimg.oslist = os_dict

  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
      _ErrorIf(not f_status, self.ENODEOS, node,
               "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
      _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
               "OS '%s' has multiple entries (first one shadows the rest): %s",
               os_name, utils.CommaJoin([v[0] for v in os_data]))
      # this will be caught in the backend too
      _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
               and not f_var, self.ENODEOS, node,
               "OS %s with API at least %d does not declare any variant",
               os_name, constants.OS_API_V15)
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      _ErrorIf(test, self.ENODEOS, node,
               "Extra OS %s not present on reference node (%s)",
               os_name, base.name)
      if test:
        continue
      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue
      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", f_param, b_param)]:
        _ErrorIf(a != b, self.ENODEOS, node,
                 "OS %s %s differs from reference node %s: %s vs. %s",
                 kind, os_name, base.name,
                 utils.CommaJoin(a), utils.CommaJoin(b))

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    _ErrorIf(missing, self.ENODEOS, node,
             "OSes present on reference node %s but missing on this node: %s",
             base.name, utils.CommaJoin(missing))

  def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
    """Verifies and updates the node volume data.

    This function will update a L{NodeImage}'s internal structures
    with data from the remote call.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object
    @param vg_name: the configured VG name

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    nimg.lvm_fail = True
    lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
    if vg_name is None:
      pass
    elif isinstance(lvdata, basestring):
      _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
               utils.SafeEncode(lvdata))
    elif not isinstance(lvdata, dict):
      _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
    else:
      nimg.volumes = lvdata
      nimg.lvm_fail = False

  def _UpdateNodeInstances(self, ninfo, nresult, nimg):
    """Verifies and updates the node instance list.

    If the listing was successful, then updates this node's instance
    list. Otherwise, it marks the RPC call as failed for the instance
    list key.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object

    """
    idata = nresult.get(constants.NV_INSTANCELIST, None)
    test = not isinstance(idata, list)
    self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
                  " (instancelist): %s", utils.SafeEncode(str(idata)))
    if test:
      nimg.hyp_fail = True
    else:
      nimg.instances = idata

  def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
    """Verifies and computes a node information map.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object
    @param vg_name: the configured VG name

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # try to read free memory (from the hypervisor)
    hv_info = nresult.get(constants.NV_HVINFO, None)
    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
    _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
    if not test:
      try:
        nimg.mfree = int(hv_info["memory_free"])
      except (ValueError, TypeError):
        _ErrorIf(True, self.ENODERPC, node,
                 "node returned invalid nodeinfo, check hypervisor")

    # FIXME: devise a free space model for file based instances as well
    if vg_name is not None:
      test = (constants.NV_VGLIST not in nresult or
              vg_name not in nresult[constants.NV_VGLIST])
      _ErrorIf(test, self.ENODELVM, node,
               "node didn't return data for the volume group '%s'"
               " - it is either missing or broken", vg_name)
      if not test:
        try:
          nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
        except (ValueError, TypeError):
          _ErrorIf(True, self.ENODERPC, node,
                   "node returned invalid LVM info, check LVM status")

  def BuildHooksEnv(self):
    """Build hooks env.

    Cluster-Verify hooks run only in the post phase; if they fail, their
    output is logged in the verify output and the verification fails.

    """
    all_nodes = self.cfg.GetNodeList()
    env = {
      "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
      }
    for node in self.cfg.GetAllNodesInfo().values():
      env["NODE_TAGS_%s" % node.name] = " ".join(node.GetTags())

    return env, [], all_nodes
1866 def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various tests on nodes.

    """
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1872 verbose = self.op.verbose
1873 self._feedback_fn = feedback_fn
1874 feedback_fn("* Verifying global settings")
1875 for msg in self.cfg.VerifyConfig():
1876 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
1878 # Check the cluster certificates
1879 for cert_filename in constants.ALL_CERT_FILES:
1880 (errcode, msg) = _VerifyCertificate(cert_filename)
1881 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
1883 vg_name = self.cfg.GetVGName()
1884 drbd_helper = self.cfg.GetDRBDHelper()
1885 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
1886 cluster = self.cfg.GetClusterInfo()
1887 nodelist = utils.NiceSort(self.cfg.GetNodeList())
1888 nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
1889 instancelist = utils.NiceSort(self.cfg.GetInstanceList())
1890 instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
1891 for iname in instancelist)
1892 i_non_redundant = [] # Non redundant instances
1893 i_non_a_balanced = [] # Non auto-balanced instances
1894 n_offline = 0 # Count of offline nodes
1895 n_drained = 0 # Count of nodes being drained
1896 node_vol_should = {}
1898 # FIXME: verify OS list
1899 # do local checksums
1900 master_files = [constants.CLUSTER_CONF_FILE]
1901 master_node = self.master_node = self.cfg.GetMasterNode()
1902 master_ip = self.cfg.GetMasterIP()
1904 file_names = ssconf.SimpleStore().GetFileList()
1905 file_names.extend(constants.ALL_CERT_FILES)
1906 file_names.extend(master_files)
1907 if cluster.modify_etc_hosts:
1908 file_names.append(constants.ETC_HOSTS)
1910 local_checksums = utils.FingerprintFiles(file_names)
1912 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
1913 node_verify_param = {
1914 constants.NV_FILELIST: file_names,
1915 constants.NV_NODELIST: [node.name for node in nodeinfo
1916 if not node.offline],
1917 constants.NV_HYPERVISOR: hypervisors,
1918 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
1919 node.secondary_ip) for node in nodeinfo
1920 if not node.offline],
1921 constants.NV_INSTANCELIST: hypervisors,
1922 constants.NV_VERSION: None,
1923 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
1924 constants.NV_NODESETUP: None,
1925 constants.NV_TIME: None,
1926 constants.NV_MASTERIP: (master_node, master_ip),
1927 constants.NV_OSLIST: None,
      }

    if vg_name is not None:
1931 node_verify_param[constants.NV_VGLIST] = None
1932 node_verify_param[constants.NV_LVLIST] = vg_name
1933 node_verify_param[constants.NV_PVLIST] = [vg_name]
1934 node_verify_param[constants.NV_DRBDLIST] = None
    if drbd_helper:
      node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
1939 # Build our expected cluster state
    node_image = dict((node.name, self.NodeImage(offline=node.offline,
                                                 name=node.name))
                      for node in nodeinfo)
1944 for instance in instancelist:
1945 inst_config = instanceinfo[instance]
1947 for nname in inst_config.all_nodes:
1948 if nname not in node_image:
          # ghost node
          gnode = self.NodeImage(name=nname)
          gnode.ghost = True
          node_image[nname] = gnode
1954 inst_config.MapLVsByNode(node_vol_should)
1956 pnode = inst_config.primary_node
1957 node_image[pnode].pinst.append(instance)
1959 for snode in inst_config.secondary_nodes:
1960 nimg = node_image[snode]
1961 nimg.sinst.append(instance)
1962 if pnode not in nimg.sbp:
1963 nimg.sbp[pnode] = []
1964 nimg.sbp[pnode].append(instance)
1966 # At this point, we have the in-memory data structures complete,
1967 # except for the runtime information, which we'll gather next
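    # For example, a DRBD instance "inst1" with primary "nodeA" and
    # secondary "nodeB" (hypothetical names) is now reflected as:
    #   node_image["nodeA"].pinst == ["inst1"]
    #   node_image["nodeB"].sinst == ["inst1"]
    #   node_image["nodeB"].sbp == {"nodeA": ["inst1"]}
    # i.e. sbp maps each primary node to the instances for which this
    # node acts as secondary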
1969 # Due to the way our RPC system works, exact response times cannot be
1970 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
    # time before and after executing the request, we can at least have a time
    # window.
1973 nvinfo_starttime = time.time()
1974 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
1975 self.cfg.GetClusterName())
1976 nvinfo_endtime = time.time()
1978 all_drbd_map = self.cfg.ComputeDRBDMap()
    feedback_fn("* Verifying node status")

    refos_img = None

    for node_i in nodeinfo:
      node = node_i.name
      nimg = node_image[node]

      if node_i.offline:
        if verbose:
          feedback_fn("* Skipping offline node %s" % (node,))
        n_offline += 1
        continue

      if node == master_node:
        ntype = "master"
      elif node_i.master_candidate:
        ntype = "master candidate"
      elif node_i.drained:
        ntype = "drained"
        n_drained += 1
      else:
        ntype = "regular"
      if verbose:
        feedback_fn("* Verifying node %s (%s)" % (node, ntype))

      msg = all_nvinfo[node].fail_msg
      _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
      if msg:
        nimg.rpc_fail = True
        continue

      nresult = all_nvinfo[node].payload
2014 nimg.call_ok = self._VerifyNode(node_i, nresult)
2015 self._VerifyNodeNetwork(node_i, nresult)
2016 self._VerifyNodeLVM(node_i, nresult, vg_name)
      self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
                            master_files)
      self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper,
                           all_drbd_map)
2021 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2023 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2024 self._UpdateNodeInstances(node_i, nresult, nimg)
2025 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2026 self._UpdateNodeOS(node_i, nresult, nimg)
2027 if not nimg.os_fail:
        if refos_img is None:
          refos_img = nimg
        self._VerifyNodeOS(node_i, nimg, refos_img)
2032 feedback_fn("* Verifying instance status")
2033 for instance in instancelist:
      if verbose:
        feedback_fn("* Verifying instance %s" % instance)
2036 inst_config = instanceinfo[instance]
2037 self._VerifyInstance(instance, inst_config, node_image)
2038 inst_nodes_offline = []
2040 pnode = inst_config.primary_node
2041 pnode_img = node_image[pnode]
2042 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2043 self.ENODERPC, pnode, "instance %s, connection to"
2044 " primary node failed", instance)
2046 if pnode_img.offline:
2047 inst_nodes_offline.append(pnode)
2049 # If the instance is non-redundant we cannot survive losing its primary
2050 # node, so we are not N+1 compliant. On the other hand we have no disk
      # templates with more than one secondary, so that situation is not well
      # supported either.
      # FIXME: does not support file-backed instances
2054 if not inst_config.secondary_nodes:
2055 i_non_redundant.append(instance)
2056 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2057 instance, "instance has multiple secondary nodes: %s",
2058 utils.CommaJoin(inst_config.secondary_nodes),
2059 code=self.ETYPE_WARNING)
2061 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2062 i_non_a_balanced.append(instance)
2064 for snode in inst_config.secondary_nodes:
2065 s_img = node_image[snode]
2066 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2067 "instance %s, connection to secondary node failed", instance)
        if s_img.offline:
          inst_nodes_offline.append(snode)
2072 # warn that the instance lives on offline nodes
2073 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2074 "instance lives on offline node(s) %s",
2075 utils.CommaJoin(inst_nodes_offline))
2076 # ... or ghost nodes
2077 for node in inst_config.all_nodes:
2078 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2079 "instance lives on ghost node %s", node)
2081 feedback_fn("* Verifying orphan volumes")
2082 reserved = utils.FieldSet(*cluster.reserved_lvs)
2083 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2085 feedback_fn("* Verifying orphan instances")
2086 self._VerifyOrphanInstances(instancelist, node_image)
2088 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2089 feedback_fn("* Verifying N+1 Memory redundancy")
2090 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2092 feedback_fn("* Other Notes")
    if i_non_redundant:
      feedback_fn("  - NOTICE: %d non-redundant instance(s) found."
                  % len(i_non_redundant))
2097 if i_non_a_balanced:
2098 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2099 % len(i_non_a_balanced))
    if n_offline:
      feedback_fn("  - NOTICE: %d offline node(s) found." % n_offline)

    if n_drained:
      feedback_fn("  - NOTICE: %d drained node(s) found." % n_drained)

    return not self.bad
2109 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2110 """Analyze the post-hooks' result
2112 This method analyses the hook result, handles it, and sends some
2113 nicely-formatted feedback back to the user.
2115 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2116 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2117 @param hooks_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
2119 @param lu_result: previous Exec result
2120 @return: the new Exec result, based on the previous result
    """
    # We only really run POST phase hooks, and are only interested in
    # their results
2126 if phase == constants.HOOKS_PHASE_POST:
2127 # Used to change hooks' output to proper indentation
2128 indent_re = re.compile('^', re.M)
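      # e.g. indent_re.sub(' ', "line1\nline2") == " line1\n line2",
      # so every line of a hook's output is indented under its header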
2129 feedback_fn("* Hooks Results")
2130 assert hooks_results, "invalid result from hooks"
2132 for node_name in hooks_results:
        res = hooks_results[node_name]
        msg = res.fail_msg
        test = msg and not res.offline
2136 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2137 "Communication failure in hooks execution: %s", msg)
2138 if res.offline or msg:
2139 # No need to investigate payload if node is offline or gave an error.
2140 # override manually lu_result here as _ErrorIf only
          # overrides self.bad
          lu_result = 1
          continue
        for script, hkr, output in res.payload:
2145 test = hkr == constants.HKR_FAIL
2146 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2147 "Script %s failed, output:", script)
          if test:
            output = indent_re.sub(' ', output)
            feedback_fn("%s" % output)
            lu_result = 0

      return lu_result
2156 class LUVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False
2162 def ExpandNames(self):
2163 self.needed_locks = {
2164 locking.LEVEL_NODE: locking.ALL_SET,
2165 locking.LEVEL_INSTANCE: locking.ALL_SET,
      }
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2169 def Exec(self, feedback_fn):
2170 """Verify integrity of cluster disks.
2172 @rtype: tuple of three items
2173 @return: a tuple of (dict of node-to-node_error, list of instances
        which need activate-disks, dict of instance: (node, volume) for
        missing volumes

    """
    result = res_nodes, res_instances, res_missing = {}, [], {}
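    # Illustrative shape of the return value (hypothetical names):
    #   ({"node1": "rpc error"},                 # per-node error message
    #    ["inst1"],                              # needs activate-disks
    #    {"inst2": [("node2", "xenvg/disk0")]})  # missing LVs per instance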
2180 vg_name = self.cfg.GetVGName()
2181 nodes = utils.NiceSort(self.cfg.GetNodeList())
2182 instances = [self.cfg.GetInstanceInfo(name)
2183 for name in self.cfg.GetInstanceList()]
    nv_dict = {}
    for inst in instances:
      inst_lvs = {}
      if (not inst.admin_up or
          inst.disk_template not in constants.DTS_NET_MIRROR):
        continue
      inst.MapLVsByNode(inst_lvs)
2192 # transform { iname: {node: [vol,],},} to {(node, vol): iname}
2193 for node, vol_list in inst_lvs.iteritems():
2194 for vol in vol_list:
2195 nv_dict[(node, vol)] = inst
    if not nv_dict:
      return result

    node_lvs = self.rpc.call_lv_list(nodes, vg_name)

    for node in nodes:
      node_res = node_lvs[node]
      if node_res.offline:
        continue
      msg = node_res.fail_msg
      if msg:
        logging.warning("Error enumerating LVs on node %s: %s", node, msg)
        res_nodes[node] = msg
        continue

      lvs = node_res.payload
2214 for lv_name, (_, _, lv_online) in lvs.items():
2215 inst = nv_dict.pop((node, lv_name), None)
2216 if (not lv_online and inst is not None
2217 and inst.name not in res_instances):
2218 res_instances.append(inst.name)
    # any leftover items in nv_dict are missing LVs, let's arrange the
    # data better
2222 for key, inst in nv_dict.iteritems():
2223 if inst.name not in res_missing:
2224 res_missing[inst.name] = []
      res_missing[inst.name].append(key)

    return result
2230 class LURepairDiskSizes(NoHooksLU):
2231 """Verifies the cluster disks sizes.
  """
  _OP_PARAMS = [("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString))]
  REQ_BGL = False
2237 def ExpandNames(self):
2238 if self.op.instances:
2239 self.wanted_names = []
2240 for name in self.op.instances:
2241 full_name = _ExpandInstanceName(self.cfg, name)
2242 self.wanted_names.append(full_name)
2243 self.needed_locks = {
2244 locking.LEVEL_NODE: [],
        locking.LEVEL_INSTANCE: self.wanted_names,
        }
      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
    else:
      self.wanted_names = None
2250 self.needed_locks = {
2251 locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: locking.ALL_SET,
        }
    self.share_locks = dict(((i, 1) for i in locking.LEVELS))
2256 def DeclareLocks(self, level):
2257 if level == locking.LEVEL_NODE and self.wanted_names is not None:
2258 self._LockInstancesNodes(primary_only=True)
2260 def CheckPrereq(self):
2261 """Check prerequisites.
2263 This only checks the optional instance list against the existing names.
    """
    if self.wanted_names is None:
2267 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
2269 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2270 in self.wanted_names]
2272 def _EnsureChildSizes(self, disk):
2273 """Ensure children of the disk have the needed disk size.
2275 This is valid mainly for DRBD8 and fixes an issue where the
2276 children have smaller disk size.
2278 @param disk: an L{ganeti.objects.Disk} object
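    Example: for a DRBD8 disk of size 10240 MiB whose data child reports
    10112 MiB, the child is grown to 10240 and True is returned (the
    numbers are illustrative).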
    """
    if disk.dev_type == constants.LD_DRBD8:
2282 assert disk.children, "Empty children for DRBD8?"
2283 fchild = disk.children[0]
2284 mismatch = fchild.size < disk.size
      if mismatch:
        self.LogInfo("Child disk has size %d, parent %d, fixing",
2287 fchild.size, disk.size)
2288 fchild.size = disk.size
2290 # and we recurse on this child only, not on the metadev
      return self._EnsureChildSizes(fchild) or mismatch
    else:
      return False
2295 def Exec(self, feedback_fn):
2296 """Verify the size of cluster disks.
    """
    # TODO: check child disks too
    # TODO: check differences in size between primary/secondary nodes
    per_node_disks = {}
2302 for instance in self.wanted_instances:
2303 pnode = instance.primary_node
2304 if pnode not in per_node_disks:
2305 per_node_disks[pnode] = []
2306 for idx, disk in enumerate(instance.disks):
2307 per_node_disks[pnode].append((instance, idx, disk))
    changed = []
    for node, dskl in per_node_disks.items():
2311 newl = [v[2].Copy() for v in dskl]
      for dsk in newl:
        self.cfg.SetDiskID(dsk, node)
2314 result = self.rpc.call_blockdev_getsizes(node, newl)
      if result.fail_msg:
        self.LogWarning("Failure in blockdev_getsizes call to node"
                        " %s, ignoring", node)
        continue
      if len(result.data) != len(dskl):
        self.LogWarning("Invalid result from node %s, ignoring node results",
                        node)
        continue
2323 for ((instance, idx, disk), size) in zip(dskl, result.data):
        if size is None:
          self.LogWarning("Disk %d of instance %s did not return size"
                          " information, ignoring", idx, instance.name)
          continue
        if not isinstance(size, (int, long)):
          self.LogWarning("Disk %d of instance %s did not return valid"
                          " size information, ignoring", idx, instance.name)
          continue
        size = size >> 20
2333 if size != disk.size:
2334 self.LogInfo("Disk %d of instance %s has mismatched size,"
2335 " correcting: recorded %d, actual %d", idx,
2336 instance.name, disk.size, size)
          disk.size = size
          self.cfg.Update(instance, feedback_fn)
2339 changed.append((instance.name, idx, size))
2340 if self._EnsureChildSizes(disk):
2341 self.cfg.Update(instance, feedback_fn)
          changed.append((instance.name, idx, disk.size))

    return changed
2346 class LURenameCluster(LogicalUnit):
  """Rename the cluster.

  """
2350 HPATH = "cluster-rename"
2351 HTYPE = constants.HTYPE_CLUSTER
2352 _OP_PARAMS = [("name", ht.NoDefault, ht.TNonEmptyString)]
  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {
      "OP_TARGET": self.cfg.GetClusterName(),
      "NEW_NAME": self.op.name,
      }
    mn = self.cfg.GetMasterNode()
2363 all_nodes = self.cfg.GetNodeList()
2364 return env, [mn], all_nodes
2366 def CheckPrereq(self):
2367 """Verify that the passed name is a valid one.
    """
    hostname = netutils.GetHostname(name=self.op.name,
2371 family=self.cfg.GetPrimaryIPFamily())
2373 new_name = hostname.name
2374 self.ip = new_ip = hostname.ip
2375 old_name = self.cfg.GetClusterName()
2376 old_ip = self.cfg.GetMasterIP()
2377 if new_name == old_name and new_ip == old_ip:
2378 raise errors.OpPrereqError("Neither the name nor the IP address of the"
                                 " cluster has changed",
                                 errors.ECODE_INVAL)
2381 if new_ip != old_ip:
2382 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
2383 raise errors.OpPrereqError("The given cluster IP address (%s) is"
2384 " reachable on the network" %
2385 new_ip, errors.ECODE_NOTUNIQUE)
2387 self.op.name = new_name
2389 def Exec(self, feedback_fn):
2390 """Rename the cluster.
    """
    clustername = self.op.name
    ip = self.ip
2396 # shutdown the master IP
2397 master = self.cfg.GetMasterNode()
2398 result = self.rpc.call_node_stop_master(master, False)
2399 result.Raise("Could not disable the master role")
    try:
      cluster = self.cfg.GetClusterInfo()
2403 cluster.cluster_name = clustername
2404 cluster.master_ip = ip
2405 self.cfg.Update(cluster, feedback_fn)
2407 # update the known hosts file
2408 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
2409 node_list = self.cfg.GetNodeList()
      try:
        node_list.remove(master)
      except ValueError:
        pass
2414 result = self.rpc.call_upload_file(node_list,
2415 constants.SSH_KNOWN_HOSTS_FILE)
2416 for to_node, to_result in result.iteritems():
2417 msg = to_result.fail_msg
        if msg:
          msg = ("Copy of file %s to node %s failed: %s" %
2420 (constants.SSH_KNOWN_HOSTS_FILE, to_node, msg))
2421 self.proc.LogWarning(msg)
    finally:
      result = self.rpc.call_node_start_master(master, False, False)
2425 msg = result.fail_msg
      if msg:
        self.LogWarning("Could not re-enable the master role on"
                        " the master, please restart manually: %s", msg)

    return clustername
2433 class LUSetClusterParams(LogicalUnit):
  """Change the parameters of the cluster.

  """
2437 HPATH = "cluster-modify"
2438 HTYPE = constants.HTYPE_CLUSTER
  _OP_PARAMS = [
    ("vg_name", None, ht.TMaybeString),
2441 ("enabled_hypervisors", None,
     ht.TOr(ht.TAnd(ht.TListOf(ht.TElemOf(constants.HYPER_TYPES)), ht.TTrue),
            ht.TNone)),
    ("hvparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
                              ht.TNone)),
    ("beparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
                              ht.TNone)),
    ("os_hvp", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
                            ht.TNone)),
    ("osparams", None, ht.TOr(ht.TDictOf(ht.TNonEmptyString, ht.TDict),
                              ht.TNone)),
2452 ("candidate_pool_size", None, ht.TOr(ht.TStrictPositiveInt, ht.TNone)),
2453 ("uid_pool", None, ht.NoType),
2454 ("add_uids", None, ht.NoType),
2455 ("remove_uids", None, ht.NoType),
2456 ("maintain_node_health", None, ht.TMaybeBool),
2457 ("prealloc_wipe_disks", None, ht.TMaybeBool),
2458 ("nicparams", None, ht.TOr(ht.TDict, ht.TNone)),
2459 ("drbd_helper", None, ht.TOr(ht.TString, ht.TNone)),
2460 ("default_iallocator", None, ht.TOr(ht.TString, ht.TNone)),
2461 ("reserved_lvs", None, ht.TOr(ht.TListOf(ht.TNonEmptyString), ht.TNone)),
    ("hidden_os", None, ht.TOr(ht.TListOf(\
          ht.TAnd(ht.TList,
                  ht.TIsLength(2),
                  ht.TMap(lambda v: v[0], ht.TElemOf(constants.DDMS_VALUES)))),
          ht.TNone)),
    ("blacklisted_os", None, ht.TOr(ht.TListOf(\
          ht.TAnd(ht.TList,
                  ht.TIsLength(2),
                  ht.TMap(lambda v: v[0], ht.TElemOf(constants.DDMS_VALUES)))),
          ht.TNone)),
    ]
  REQ_BGL = False
2475 def CheckArguments(self):
    """Check parameters

    """
    if self.op.uid_pool:
2480 uidpool.CheckUidPool(self.op.uid_pool)
2482 if self.op.add_uids:
2483 uidpool.CheckUidPool(self.op.add_uids)
2485 if self.op.remove_uids:
2486 uidpool.CheckUidPool(self.op.remove_uids)
2488 def ExpandNames(self):
2489 # FIXME: in the future maybe other cluster params won't require checking on
2490 # all nodes to be modified.
2491 self.needed_locks = {
2492 locking.LEVEL_NODE: locking.ALL_SET,
      }
    self.share_locks[locking.LEVEL_NODE] = 1
  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {
      "OP_TARGET": self.cfg.GetClusterName(),
      "NEW_VG_NAME": self.op.vg_name,
      }
2504 mn = self.cfg.GetMasterNode()
2505 return env, [mn], [mn]
2507 def CheckPrereq(self):
2508 """Check prerequisites.
2510 This checks whether the given params don't conflict and
2511 if the given volume group is valid.
    """
    if self.op.vg_name is not None and not self.op.vg_name:
2515 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
2516 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
2517 " instances exist", errors.ECODE_INVAL)
2519 if self.op.drbd_helper is not None and not self.op.drbd_helper:
2520 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
2521 raise errors.OpPrereqError("Cannot disable drbd helper while"
                                   " drbd-based instances exist",
                                   errors.ECODE_INVAL)
2525 node_list = self.acquired_locks[locking.LEVEL_NODE]
2527 # if vg_name not None, checks given volume group on all nodes
    if self.op.vg_name:
      vglist = self.rpc.call_vg_list(node_list)
2530 for node in node_list:
2531 msg = vglist[node].fail_msg
        if msg:
          # ignoring down node
2534 self.LogWarning("Error while gathering data on node %s"
2535 " (ignoring node): %s", node, msg)
          continue
        vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
                                              self.op.vg_name,
                                              constants.MIN_VG_SIZE)
        if vgstatus:
2541 raise errors.OpPrereqError("Error on node '%s': %s" %
2542 (node, vgstatus), errors.ECODE_ENVIRON)
2544 if self.op.drbd_helper:
2545 # checks given drbd helper on all nodes
2546 helpers = self.rpc.call_drbd_helper(node_list)
2547 for node in node_list:
2548 ninfo = self.cfg.GetNodeInfo(node)
        if ninfo.offline:
          self.LogInfo("Not checking drbd helper on offline node %s", node)
          continue
        msg = helpers[node].fail_msg
        if msg:
2554 raise errors.OpPrereqError("Error checking drbd helper on node"
2555 " '%s': %s" % (node, msg),
2556 errors.ECODE_ENVIRON)
2557 node_helper = helpers[node].payload
2558 if node_helper != self.op.drbd_helper:
2559 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
2560 (node, node_helper), errors.ECODE_ENVIRON)
2562 self.cluster = cluster = self.cfg.GetClusterInfo()
2563 # validate params changes
2564 if self.op.beparams:
2565 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2566 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2568 if self.op.nicparams:
2569 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2570 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2571 objects.NIC.CheckParameterSyntax(self.new_nicparams)
      nic_errors = []

      # check all instances for consistency
2575 for instance in self.cfg.GetAllInstancesInfo().values():
2576 for nic_idx, nic in enumerate(instance.nics):
2577 params_copy = copy.deepcopy(nic.nicparams)
2578 params_filled = objects.FillDict(self.new_nicparams, params_copy)
2580 # check parameter syntax
        try:
          objects.NIC.CheckParameterSyntax(params_filled)
2583 except errors.ConfigurationError, err:
2584 nic_errors.append("Instance %s, nic/%d: %s" %
2585 (instance.name, nic_idx, err))
2587 # if we're moving instances to routed, check that they have an ip
2588 target_mode = params_filled[constants.NIC_MODE]
2589 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
          nic_errors.append("Instance %s, nic/%d: routed NIC with no IP" %
2591 (instance.name, nic_idx))
      if nic_errors:
        raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
2594 "\n".join(nic_errors))
2596 # hypervisor list/parameters
2597 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
2598 if self.op.hvparams:
2599 for hv_name, hv_dict in self.op.hvparams.items():
2600 if hv_name not in self.new_hvparams:
2601 self.new_hvparams[hv_name] = hv_dict
        else:
          self.new_hvparams[hv_name].update(hv_dict)
2605 # os hypervisor parameters
2606 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
    if self.op.os_hvp:
      for os_name, hvs in self.op.os_hvp.items():
2609 if os_name not in self.new_os_hvp:
2610 self.new_os_hvp[os_name] = hvs
        else:
          for hv_name, hv_dict in hvs.items():
2613 if hv_name not in self.new_os_hvp[os_name]:
2614 self.new_os_hvp[os_name][hv_name] = hv_dict
            else:
              self.new_os_hvp[os_name][hv_name].update(hv_dict)

    # os parameters
    self.new_osp = objects.FillDict(cluster.osparams, {})
2620 if self.op.osparams:
2621 for os_name, osp in self.op.osparams.items():
2622 if os_name not in self.new_osp:
2623 self.new_osp[os_name] = {}
        self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
                                                  use_none=True)
2628 if not self.new_osp[os_name]:
2629 # we removed all parameters
          del self.new_osp[os_name]
        else:
          # check the parameter validity (remote check)
2633 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
2634 os_name, self.new_osp[os_name])
2636 # changes to the hypervisor list
2637 if self.op.enabled_hypervisors is not None:
2638 self.hv_list = self.op.enabled_hypervisors
2639 for hv in self.hv_list:
2640 # if the hypervisor doesn't already exist in the cluster
2641 # hvparams, we initialize it to empty, and then (in both
2642 # cases) we make sure to fill the defaults, as we might not
2643 # have a complete defaults list if the hypervisor wasn't
      # enabled before
      if hv not in new_hvp:
        new_hvp[hv] = {}
      new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
2648 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
    else:
      self.hv_list = cluster.enabled_hypervisors
2652 if self.op.hvparams or self.op.enabled_hypervisors is not None:
2653 # either the enabled list has changed, or the parameters have, validate
2654 for hv_name, hv_params in self.new_hvparams.items():
2655 if ((self.op.hvparams and hv_name in self.op.hvparams) or
2656 (self.op.enabled_hypervisors and
2657 hv_name in self.op.enabled_hypervisors)):
2658 # either this is a new hypervisor, or its parameters have changed
2659 hv_class = hypervisor.GetHypervisor(hv_name)
2660 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2661 hv_class.CheckParameterSyntax(hv_params)
2662 _CheckHVParams(self, node_list, hv_name, hv_params)
    if self.op.os_hvp:
      # no need to check any newly-enabled hypervisors, since the
2666 # defaults have already been checked in the above code-block
2667 for os_name, os_hvp in self.new_os_hvp.items():
2668 for hv_name, hv_params in os_hvp.items():
2669 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2670 # we need to fill in the new os_hvp on top of the actual hv_p
2671 cluster_defaults = self.new_hvparams.get(hv_name, {})
2672 new_osp = objects.FillDict(cluster_defaults, hv_params)
2673 hv_class = hypervisor.GetHypervisor(hv_name)
2674 hv_class.CheckParameterSyntax(new_osp)
2675 _CheckHVParams(self, node_list, hv_name, new_osp)
2677 if self.op.default_iallocator:
2678 alloc_script = utils.FindFile(self.op.default_iallocator,
2679 constants.IALLOCATOR_SEARCH_PATH,
                                    os.path.isfile)
      if alloc_script is None:
2682 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                   " specified" % self.op.default_iallocator,
                                   errors.ECODE_INVAL)
2686 def Exec(self, feedback_fn):
2687 """Change the parameters of the cluster.
    """
    if self.op.vg_name is not None:
2691 new_volume = self.op.vg_name
      if not new_volume:
        new_volume = None
      if new_volume != self.cfg.GetVGName():
        self.cfg.SetVGName(new_volume)
      else:
        feedback_fn("Cluster LVM configuration already in desired"
2698 " state, not changing")
2699 if self.op.drbd_helper is not None:
2700 new_helper = self.op.drbd_helper
      if not new_helper:
        new_helper = None
      if new_helper != self.cfg.GetDRBDHelper():
        self.cfg.SetDRBDHelper(new_helper)
      else:
        feedback_fn("Cluster DRBD helper already in desired state,"
                    " not changing")
2708 if self.op.hvparams:
2709 self.cluster.hvparams = self.new_hvparams
    if self.op.os_hvp:
      self.cluster.os_hvp = self.new_os_hvp
2712 if self.op.enabled_hypervisors is not None:
2713 self.cluster.hvparams = self.new_hvparams
2714 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
2715 if self.op.beparams:
2716 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
2717 if self.op.nicparams:
2718 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
2719 if self.op.osparams:
2720 self.cluster.osparams = self.new_osp
2722 if self.op.candidate_pool_size is not None:
2723 self.cluster.candidate_pool_size = self.op.candidate_pool_size
2724 # we need to update the pool size here, otherwise the save will fail
2725 _AdjustCandidatePool(self, [])
2727 if self.op.maintain_node_health is not None:
2728 self.cluster.maintain_node_health = self.op.maintain_node_health
2730 if self.op.prealloc_wipe_disks is not None:
2731 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
2733 if self.op.add_uids is not None:
2734 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
2736 if self.op.remove_uids is not None:
2737 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
2739 if self.op.uid_pool is not None:
2740 self.cluster.uid_pool = self.op.uid_pool
2742 if self.op.default_iallocator is not None:
2743 self.cluster.default_iallocator = self.op.default_iallocator
2745 if self.op.reserved_lvs is not None:
2746 self.cluster.reserved_lvs = self.op.reserved_lvs
    def helper_os(aname, mods, desc):
      desc += " OS list"
      lst = getattr(self.cluster, aname)
      for key, val in mods:
        if key == constants.DDM_ADD:
          if val in lst:
            feedback_fn("OS %s already in %s, ignoring", val, desc)
          else:
            lst.append(val)
        elif key == constants.DDM_REMOVE:
          if val in lst:
            lst.remove(val)
          else:
            feedback_fn("OS %s not found in %s, ignoring", val, desc)
        else:
          raise errors.ProgrammerError("Invalid modification '%s'" % key)
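      # "mods" is a list of (action, os_name) pairs as enforced by
      # _OP_PARAMS, e.g. [(constants.DDM_ADD, "debian-image")] to hide
      # the (hypothetical) OS "debian-image"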
2765 if self.op.hidden_os:
2766 helper_os("hidden_os", self.op.hidden_os, "hidden")
2768 if self.op.blacklisted_os:
2769 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
2771 self.cfg.Update(self.cluster, feedback_fn)
2774 def _RedistributeAncillaryFiles(lu, additional_nodes=None):
2775 """Distribute additional files which are part of the cluster configuration.
2777 ConfigWriter takes care of distributing the config and ssconf files, but
2778 there are more files which should be distributed to all nodes. This function
2779 makes sure those are copied.
2781 @param lu: calling logical unit
2782 @param additional_nodes: list of nodes not in the config to distribute to
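
  Example (sketch): when a new node is added it is not yet part of the
  configuration, so LUAddNode passes it explicitly, roughly as
  C{_RedistributeAncillaryFiles(self, additional_nodes=[node])}.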
  """
  # 1. Gather target nodes
2786 myself = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
2787 dist_nodes = lu.cfg.GetOnlineNodeList()
2788 if additional_nodes is not None:
2789 dist_nodes.extend(additional_nodes)
2790 if myself.name in dist_nodes:
2791 dist_nodes.remove(myself.name)
2793 # 2. Gather files to distribute
2794 dist_files = set([constants.ETC_HOSTS,
2795 constants.SSH_KNOWN_HOSTS_FILE,
2796 constants.RAPI_CERT_FILE,
2797 constants.RAPI_USERS_FILE,
2798 constants.CONFD_HMAC_KEY,
2799 constants.CLUSTER_DOMAIN_SECRET_FILE,
                   ])
  enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
2803 for hv_name in enabled_hypervisors:
2804 hv_class = hypervisor.GetHypervisor(hv_name)
2805 dist_files.update(hv_class.GetAncillaryFiles())
2807 # 3. Perform the files upload
2808 for fname in dist_files:
2809 if os.path.exists(fname):
2810 result = lu.rpc.call_upload_file(dist_nodes, fname)
2811 for to_node, to_result in result.items():
2812 msg = to_result.fail_msg
        if msg:
          msg = ("Copy of file %s to node %s failed: %s" %
2815 (fname, to_node, msg))
2816 lu.proc.LogWarning(msg)
2819 class LURedistributeConfig(NoHooksLU):
2820 """Force the redistribution of cluster configuration.
  This is a very simple LU.

  """
  REQ_BGL = False
2827 def ExpandNames(self):
2828 self.needed_locks = {
2829 locking.LEVEL_NODE: locking.ALL_SET,
      }
    self.share_locks[locking.LEVEL_NODE] = 1
2833 def Exec(self, feedback_fn):
2834 """Redistribute the configuration.
    """
    self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
2838 _RedistributeAncillaryFiles(self)
2841 def _WaitForSync(lu, instance, disks=None, oneshot=False):
2842 """Sleep and poll for an instance's disk to sync.
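
  Sketch of the typical call pattern in disk-creating LUs (assumed,
  simplified)::

    disk_abort = not _WaitForSync(lu, instance)
    if disk_abort:
      raise errors.OpExecError("...")

  With oneshot=True the status is polled once instead of looping.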
  """
  if not instance.disks or disks is not None and not disks:
    return True

  disks = _ExpandCheckDisks(instance, disks)

  if not oneshot:
    lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)

  node = instance.primary_node

  for dev in disks:
    lu.cfg.SetDiskID(dev, node)
2858 # TODO: Convert to utils.Retry
  retries = 0
  degr_retries = 10 # in seconds, as we sleep 1 second each time
  while True:
    max_time = 0
    done = True
    cumul_degraded = False
2866 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
2867 msg = rstats.fail_msg
    if msg:
      lu.LogWarning("Can't get any data from node %s: %s", node, msg)
      retries += 1
      if retries >= 10:
        raise errors.RemoteError("Can't contact node %s for mirror data,"
                                 " aborting." % node)
      time.sleep(6)
      continue
    retries = 0
    rstats = rstats.payload
2878 for i, mstat in enumerate(rstats):
      if mstat is None:
        lu.LogWarning("Can't compute data for node %s/%s",
                      node, disks[i].iv_name)
        continue
2884 cumul_degraded = (cumul_degraded or
2885 (mstat.is_degraded and mstat.sync_percent is None))
2886 if mstat.sync_percent is not None:
        done = False
        if mstat.estimated_time is not None:
2889 rem_time = ("%s remaining (estimated)" %
2890 utils.FormatSeconds(mstat.estimated_time))
2891 max_time = mstat.estimated_time
        else:
          rem_time = "no time estimate"
2894 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
2895 (disks[i].iv_name, mstat.sync_percent, rem_time))
2897 # if we're done but degraded, let's do a few small retries, to
2898 # make sure we see a stable and not transient situation; therefore
2899 # we force restart of the loop
2900 if (done or oneshot) and cumul_degraded and degr_retries > 0:
2901 logging.info("Degraded disks found, %d retries left", degr_retries)
      degr_retries -= 1
      time.sleep(1)
      continue

    if done or oneshot:
      break

    time.sleep(min(60, max_time))
  if done:
    lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
2913 return not cumul_degraded
2916 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
2917 """Check that mirrors are not degraded.
2919 The ldisk parameter, if True, will change the test from the
2920 is_degraded attribute (which represents overall non-ok status for
2921 the device(s)) to the ldisk (representing the local storage status).
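
  Sketch of a typical call, checking an instance's disks on its primary
  node (C{inst} being an assumed L{objects.Instance})::

    ok = compat.all(_CheckDiskConsistency(lu, dev, inst.primary_node, True)
                    for dev in inst.disks)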
  """
  lu.cfg.SetDiskID(dev, node)

  result = True
2928 if on_primary or dev.AssembleOnSecondary():
2929 rstats = lu.rpc.call_blockdev_find(node, dev)
    msg = rstats.fail_msg
    if msg:
      lu.LogWarning("Can't find disk on node %s: %s", node, msg)
      result = False
    elif not rstats.payload:
      lu.LogWarning("Can't find disk on node %s", node)
      result = False
    else:
      if ldisk:
        result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
      else:
        result = result and not rstats.payload.is_degraded
  if dev.children:
    for child in dev.children:
      result = result and _CheckDiskConsistency(lu, child, node, on_primary)

  return result
2950 class LUDiagnoseOS(NoHooksLU):
  """Logical unit for OS diagnose/query.

  """
  _OP_PARAMS = [
    _POutputFields,
    ("names", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ]
  REQ_BGL = False
  _HID = "hidden"
  _BLK = "blacklisted"
  _VLD = "valid"
  _FIELDS_STATIC = utils.FieldSet()
2963 _FIELDS_DYNAMIC = utils.FieldSet("name", _VLD, "node_status", "variants",
2964 "parameters", "api_versions", _HID, _BLK)
2966 def CheckArguments(self):
    if self.op.names:
      raise errors.OpPrereqError("Selective OS query not supported",
                                 errors.ECODE_INVAL)
2971 _CheckOutputFields(static=self._FIELDS_STATIC,
2972 dynamic=self._FIELDS_DYNAMIC,
2973 selected=self.op.output_fields)
2975 def ExpandNames(self):
2976 # Lock all nodes, in shared mode
2977 # Temporary removal of locks, should be reverted later
2978 # TODO: reintroduce locks when they are lighter-weight
2979 self.needed_locks = {}
2980 #self.share_locks[locking.LEVEL_NODE] = 1
2981 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
  @staticmethod
  def _DiagnoseByOS(rlist):
    """Remaps a per-node return list into a per-os per-node dictionary
2987 @param rlist: a map with node names as keys and OS objects as values
2990 @return: a dictionary with osnames as keys and as value another
2991 map, with nodes as keys and tuples of (path, status, diagnose,
2992 variants, parameters, api_versions) as values, eg::
2994 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
2995 (/srv/..., False, "invalid api")],
                         "node2": [(/srv/..., True, "", [], [])]}
          }

    """
    all_os = {}
    # we build here the list of nodes that didn't fail the RPC (at RPC
3002 # level), so that nodes with a non-responding node daemon don't
3003 # make all OSes invalid
3004 good_nodes = [node_name for node_name in rlist
3005 if not rlist[node_name].fail_msg]
3006 for node_name, nr in rlist.items():
      if nr.fail_msg or not nr.payload:
        continue
3009 for (name, path, status, diagnose, variants,
3010 params, api_versions) in nr.payload:
3011 if name not in all_os:
3012 # build a list of nodes for this os containing empty lists
3013 # for each node in node_list
          all_os[name] = {}
          for nname in good_nodes:
3016 all_os[name][nname] = []
3017 # convert params from [name, help] to (name, help)
3018 params = [tuple(v) for v in params]
3019 all_os[name][node_name].append((path, status, diagnose,
                                        variants, params, api_versions))

    return all_os
3023 def Exec(self, feedback_fn):
3024 """Compute the list of OSes.
    """
    valid_nodes = [node for node in self.cfg.GetOnlineNodeList()]
3028 node_data = self.rpc.call_os_diagnose(valid_nodes)
3029 pol = self._DiagnoseByOS(node_data)
    output = []
    cluster = self.cfg.GetClusterInfo()
3033 for os_name in utils.NiceSort(pol.keys()):
3034 os_data = pol[os_name]
      valid = True
      (variants, params, api_versions) = null_state = (set(), set(), set())
3038 for idx, osl in enumerate(os_data.values()):
3039 valid = bool(valid and osl and osl[0][1])
        if not valid:
          (variants, params, api_versions) = null_state
          break
        node_variants, node_params, node_api = osl[0][3:6]
3044 if idx == 0: # first entry
3045 variants = set(node_variants)
3046 params = set(node_params)
3047 api_versions = set(node_api)
3048 else: # keep consistency
3049 variants.intersection_update(node_variants)
3050 params.intersection_update(node_params)
3051 api_versions.intersection_update(node_api)
3053 is_hid = os_name in cluster.hidden_os
3054 is_blk = os_name in cluster.blacklisted_os
3055 if ((self._HID not in self.op.output_fields and is_hid) or
3056 (self._BLK not in self.op.output_fields and is_blk) or
          (self._VLD not in self.op.output_fields and not valid)):
        continue

      row = []
      for field in self.op.output_fields:
        if field == "name":
          val = os_name
        elif field == self._VLD:
          val = valid
3065 elif field == "node_status":
          # this is just a copy of the dict
          val = {}
          for node_name, nos_list in os_data.items():
3069 val[node_name] = nos_list
3070 elif field == "variants":
3071 val = utils.NiceSort(list(variants))
        elif field == "parameters":
          val = list(params)
        elif field == "api_versions":
          val = list(api_versions)
        elif field == self._HID:
          val = is_hid
        elif field == self._BLK:
          val = is_blk
        else:
          raise errors.ParameterError(field)
        row.append(val)
      output.append(row)

    return output
3088 class LURemoveNode(LogicalUnit):
  """Logical unit for removing a node.

  """
  HPATH = "node-remove"
  HTYPE = constants.HTYPE_NODE
  _OP_PARAMS = [
    _PNodeName,
    ]
3098 def BuildHooksEnv(self):
    """Build hooks env.

    This doesn't run on the target node in the pre phase as a failed
    node would then be impossible to remove.

    """
    env = {
      "OP_TARGET": self.op.node_name,
      "NODE_NAME": self.op.node_name,
      }
    all_nodes = self.cfg.GetNodeList()
    try:
      all_nodes.remove(self.op.node_name)
    except ValueError:
      logging.warning("Node %s which is about to be removed not found"
3114 " in the all nodes list", self.op.node_name)
3115 return env, all_nodes, all_nodes
3117 def CheckPrereq(self):
3118 """Check prerequisites.
    This checks:
     - the node exists in the configuration
3122 - it does not have primary or secondary instances
3123 - it's not the master
3125 Any errors are signaled by raising errors.OpPrereqError.
    """
    self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3129 node = self.cfg.GetNodeInfo(self.op.node_name)
3130 assert node is not None
3132 instance_list = self.cfg.GetInstanceList()
3134 masternode = self.cfg.GetMasterNode()
3135 if node.name == masternode:
3136 raise errors.OpPrereqError("Node is the master node,"
                                 " you need to failover first.",
                                 errors.ECODE_INVAL)
3140 for instance_name in instance_list:
3141 instance = self.cfg.GetInstanceInfo(instance_name)
3142 if node.name in instance.all_nodes:
3143 raise errors.OpPrereqError("Instance %s is still running on the node,"
                                   " please remove first." % instance_name,
                                   errors.ECODE_INVAL)
    self.op.node_name = node.name
    self.node = node
3149 def Exec(self, feedback_fn):
    """Removes the node from the cluster.

    """
    node = self.node
    logging.info("Stopping the node daemon and removing configs from node %s",
                 node.name)

    modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
3159 # Promote nodes to master candidate as needed
3160 _AdjustCandidatePool(self, exceptions=[node.name])
3161 self.context.RemoveNode(node.name)
3163 # Run post hooks on the node before it's removed
3164 hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
    try:
      hm.RunPhase(constants.HOOKS_PHASE_POST, [node.name])
    except:
      # pylint: disable-msg=W0702
3169 self.LogWarning("Errors occurred running hooks on %s" % node.name)
3171 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
3172 msg = result.fail_msg
    if msg:
      self.LogWarning("Errors encountered on the remote node while leaving"
3175 " the cluster: %s", msg)
3177 # Remove node from our /etc/hosts
3178 if self.cfg.GetClusterInfo().modify_etc_hosts:
3179 master_node = self.cfg.GetMasterNode()
3180 result = self.rpc.call_etc_hosts_modify(master_node,
3181 constants.ETC_HOSTS_REMOVE,
                                               node.name, None)
      result.Raise("Can't update hosts file with new host data")
3184 _RedistributeAncillaryFiles(self)
3187 class LUQueryNodes(NoHooksLU):
  """Logical unit for querying nodes.

  """
  # pylint: disable-msg=W0142
  _OP_PARAMS = [
    _POutputFields,
    ("names", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ("use_locking", False, ht.TBool),
    ]
  REQ_BGL = False
3199 _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
3200 "master_candidate", "offline", "drained"]
  _FIELDS_DYNAMIC = utils.FieldSet(
    "dtotal", "dfree",
    "mtotal", "mnode", "mfree",
    "bootid",
    "ctotal", "cnodes", "csockets",
    )
3209 _FIELDS_STATIC = utils.FieldSet(*[
3210 "pinst_cnt", "sinst_cnt",
3211 "pinst_list", "sinst_list",
3212 "pip", "sip", "tags",
    "master",
    "role"] + _SIMPLE_FIELDS
    )
3217 def CheckArguments(self):
3218 _CheckOutputFields(static=self._FIELDS_STATIC,
3219 dynamic=self._FIELDS_DYNAMIC,
3220 selected=self.op.output_fields)
3222 def ExpandNames(self):
3223 self.needed_locks = {}
3224 self.share_locks[locking.LEVEL_NODE] = 1
    if self.op.names:
      self.wanted = _GetWantedNodes(self, self.op.names)
    else:
      self.wanted = locking.ALL_SET
3231 self.do_node_query = self._FIELDS_STATIC.NonMatching(self.op.output_fields)
3232 self.do_locking = self.do_node_query and self.op.use_locking
    if self.do_locking:
      # if we don't request only static fields, we need to lock the nodes
      self.needed_locks[locking.LEVEL_NODE] = self.wanted
3237 def Exec(self, feedback_fn):
3238 """Computes the list of nodes and their attributes.
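
    For example, with output_fields == ["name", "offline"] each row of
    the result looks like ["node1.example.com", False] (hypothetical
    node name).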
    """
    all_info = self.cfg.GetAllNodesInfo()
    if self.do_locking:
      nodenames = self.acquired_locks[locking.LEVEL_NODE]
3244 elif self.wanted != locking.ALL_SET:
3245 nodenames = self.wanted
3246 missing = set(nodenames).difference(all_info.keys())
      if missing:
        raise errors.OpExecError(
          "Some nodes were removed before retrieving their data: %s" % missing)
    else:
      nodenames = all_info.keys()
3253 nodenames = utils.NiceSort(nodenames)
3254 nodelist = [all_info[name] for name in nodenames]
3256 # begin data gathering
3258 if self.do_node_query:
      live_data = {}
      node_data = self.rpc.call_node_info(nodenames, self.cfg.GetVGName(),
3261 self.cfg.GetHypervisorType())
3262 for name in nodenames:
3263 nodeinfo = node_data[name]
3264 if not nodeinfo.fail_msg and nodeinfo.payload:
3265 nodeinfo = nodeinfo.payload
          fn = utils.TryConvert
          live_data[name] = {
            "mtotal": fn(int, nodeinfo.get('memory_total', None)),
3269 "mnode": fn(int, nodeinfo.get('memory_dom0', None)),
3270 "mfree": fn(int, nodeinfo.get('memory_free', None)),
3271 "dtotal": fn(int, nodeinfo.get('vg_size', None)),
3272 "dfree": fn(int, nodeinfo.get('vg_free', None)),
3273 "ctotal": fn(int, nodeinfo.get('cpu_total', None)),
3274 "bootid": nodeinfo.get('bootid', None),
3275 "cnodes": fn(int, nodeinfo.get('cpu_nodes', None)),
            "csockets": fn(int, nodeinfo.get('cpu_sockets', None)),
            }
        else:
          live_data[name] = {}
    else:
      live_data = dict.fromkeys(nodenames, {})
3283 node_to_primary = dict([(name, set()) for name in nodenames])
3284 node_to_secondary = dict([(name, set()) for name in nodenames])
3286 inst_fields = frozenset(("pinst_cnt", "pinst_list",
3287 "sinst_cnt", "sinst_list"))
3288 if inst_fields & frozenset(self.op.output_fields):
3289 inst_data = self.cfg.GetAllInstancesInfo()
3291 for inst in inst_data.values():
3292 if inst.primary_node in node_to_primary:
3293 node_to_primary[inst.primary_node].add(inst.name)
3294 for secnode in inst.secondary_nodes:
3295 if secnode in node_to_secondary:
3296 node_to_secondary[secnode].add(inst.name)
3298 master_node = self.cfg.GetMasterNode()
3300 # end data gathering
    output = []
    for node in nodelist:
      node_output = []
      for field in self.op.output_fields:
3306 if field in self._SIMPLE_FIELDS:
3307 val = getattr(node, field)
3308 elif field == "pinst_list":
3309 val = list(node_to_primary[node.name])
3310 elif field == "sinst_list":
3311 val = list(node_to_secondary[node.name])
3312 elif field == "pinst_cnt":
3313 val = len(node_to_primary[node.name])
3314 elif field == "sinst_cnt":
3315 val = len(node_to_secondary[node.name])
3316 elif field == "pip":
3317 val = node.primary_ip
3318 elif field == "sip":
3319 val = node.secondary_ip
3320 elif field == "tags":
3321 val = list(node.GetTags())
3322 elif field == "master":
3323 val = node.name == master_node
3324 elif self._FIELDS_DYNAMIC.Matches(field):
3325 val = live_data[node.name].get(field, None)
3326 elif field == "role":
          if node.name == master_node:
            val = "M"
          elif node.master_candidate:
            val = "C"
          elif node.drained:
            val = "D"
          elif node.offline:
            val = "O"
          else:
            val = "R"
        else:
          raise errors.ParameterError(field)
3339 node_output.append(val)
      output.append(node_output)

    return output
3345 class LUQueryNodeVolumes(NoHooksLU):
  """Logical unit for getting volumes on node(s).

  """
  _OP_PARAMS = [
3350 ("nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
    ]
  REQ_BGL = False
3354 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
3355 _FIELDS_STATIC = utils.FieldSet("node")
3357 def CheckArguments(self):
3358 _CheckOutputFields(static=self._FIELDS_STATIC,
3359 dynamic=self._FIELDS_DYNAMIC,
3360 selected=self.op.output_fields)
3362 def ExpandNames(self):
3363 self.needed_locks = {}
3364 self.share_locks[locking.LEVEL_NODE] = 1
3365 if not self.op.nodes:
3366 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3368 self.needed_locks[locking.LEVEL_NODE] = \
3369 _GetWantedNodes(self, self.op.nodes)
3371 def Exec(self, feedback_fn):
    """Computes the list of volumes and their attributes.

    """
    nodenames = self.acquired_locks[locking.LEVEL_NODE]
3376 volumes = self.rpc.call_node_volumes(nodenames)
3378 ilist = [self.cfg.GetInstanceInfo(iname) for iname
3379 in self.cfg.GetInstanceList()]
3381 lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])
    output = []
    for node in nodenames:
3385 nresult = volumes[node]
      if nresult.offline:
        continue
      msg = nresult.fail_msg
      if msg:
        self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
        continue
3393 node_vols = nresult.payload[:]
3394 node_vols.sort(key=lambda vol: vol['dev'])
      for vol in node_vols:
        node_output = []
        for field in self.op.output_fields:
          if field == "node":
            val = node
          elif field == "phys":
            val = vol['dev']
          elif field == "vg":
            val = vol['vg']
          elif field == "name":
            val = vol['name']
          elif field == "size":
            val = int(float(vol['size']))
          elif field == "instance":
            for inst in ilist:
              if node not in lv_by_node[inst]:
                continue
              if vol['name'] in lv_by_node[inst][node]:
                break
            else:
              inst = None
            val = inst and inst.name
          else:
            raise errors.ParameterError(field)
          node_output.append(str(val))

        output.append(node_output)

    return output
3427 class LUQueryNodeStorage(NoHooksLU):
  """Logical unit for getting information on storage units on node(s).

  """
  _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
  _OP_PARAMS = [
3433 ("nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
3434 ("storage_type", ht.NoDefault, _CheckStorageType),
3435 ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
    ("name", None, ht.TMaybeString),
    ]
  REQ_BGL = False
3440 def CheckArguments(self):
3441 _CheckOutputFields(static=self._FIELDS_STATIC,
3442 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
3443 selected=self.op.output_fields)
3445 def ExpandNames(self):
3446 self.needed_locks = {}
3447 self.share_locks[locking.LEVEL_NODE] = 1
    if self.op.nodes:
      self.needed_locks[locking.LEVEL_NODE] = \
        _GetWantedNodes(self, self.op.nodes)
    else:
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3455 def Exec(self, feedback_fn):
    """Computes the list of storage units and their attributes.

    """
    self.nodes = self.acquired_locks[locking.LEVEL_NODE]
3461 # Always get name to sort by
3462 if constants.SF_NAME in self.op.output_fields:
3463 fields = self.op.output_fields[:]
    else:
      fields = [constants.SF_NAME] + self.op.output_fields
3467 # Never ask for node or type as it's only known to the LU
3468 for extra in [constants.SF_NODE, constants.SF_TYPE]:
3469 while extra in fields:
3470 fields.remove(extra)
3472 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
3473 name_idx = field_idx[constants.SF_NAME]
3475 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
3476 data = self.rpc.call_storage_list(self.nodes,
3477 self.op.storage_type, st_args,
3478 self.op.name, fields)
    result = []

    for node in utils.NiceSort(self.nodes):
3483 nresult = data[node]
      if nresult.offline:
        continue
      msg = nresult.fail_msg
      if msg:
        self.LogWarning("Can't get storage data from node %s: %s", node, msg)
        continue
3492 rows = dict([(row[name_idx], row) for row in nresult.payload])
      for name in utils.NiceSort(rows.keys()):
        row = rows[name]
        out = []
        for field in self.op.output_fields:
          if field == constants.SF_NODE:
            val = node
          elif field == constants.SF_TYPE:
            val = self.op.storage_type
          elif field in field_idx:
            val = row[field_idx[field]]
          else:
            raise errors.ParameterError(field)
          out.append(val)
        result.append(out)

    return result
3516 class LUModifyNodeStorage(NoHooksLU):
  """Logical unit for modifying a storage volume on a node.

  """
  _OP_PARAMS = [
    _PNodeName,
    ("storage_type", ht.NoDefault, _CheckStorageType),
    ("name", ht.NoDefault, ht.TNonEmptyString),
    ("changes", ht.NoDefault, ht.TDict),
    ]
  REQ_BGL = False
3528 def CheckArguments(self):
3529 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3531 storage_type = self.op.storage_type
    try:
      modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
    except KeyError:
      raise errors.OpPrereqError("Storage units of type '%s' can not be"
                                 " modified" % storage_type,
                                 errors.ECODE_INVAL)
    diff = set(self.op.changes.keys()) - modifiable
    if diff:
      raise errors.OpPrereqError("The following fields can not be modified for"
3543 " storage units of type '%s': %r" %
                                 (storage_type, list(diff)),
                                 errors.ECODE_INVAL)
3547 def ExpandNames(self):
3548 self.needed_locks = {
      locking.LEVEL_NODE: self.op.node_name,
      }
3552 def Exec(self, feedback_fn):
    """Modifies the given storage unit on the target node.

    """
    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
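    # Example (illustrative): an opcode with storage_type=constants.ST_LVM_PV
    # and changes={constants.SF_ALLOCATABLE: True} would mark the given
    # PV as allocatable, assuming that field is listed as modifiable
    # for this storage type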
3557 result = self.rpc.call_storage_modify(self.op.node_name,
3558 self.op.storage_type, st_args,
3559 self.op.name, self.op.changes)
3560 result.Raise("Failed to modify storage unit '%s' on %s" %
3561 (self.op.name, self.op.node_name))
3564 class LUAddNode(LogicalUnit):
  """Logical unit for adding node to the cluster.

  """
  HPATH = "node-add"
  HTYPE = constants.HTYPE_NODE
  _OP_PARAMS = [
    _PNodeName,
3572 ("primary_ip", None, ht.NoType),
3573 ("secondary_ip", None, ht.TMaybeString),
3574 ("readd", False, ht.TBool),
    ("group", None, ht.TMaybeString)
    ]
3578 def CheckArguments(self):
3579 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
3580 # validate/normalize the node name
3581 self.hostname = netutils.GetHostname(name=self.op.node_name,
3582 family=self.primary_ip_family)
3583 self.op.node_name = self.hostname.name
3584 if self.op.readd and self.op.group:
3585 raise errors.OpPrereqError("Cannot pass a node group when a node is"
3586 " being readded", errors.ECODE_INVAL)
3588 def BuildHooksEnv(self):
    """Build hooks env.

    This will run on all nodes before, and on all nodes + the new node after.

    """
    env = {
      "OP_TARGET": self.op.node_name,
3596 "NODE_NAME": self.op.node_name,
3597 "NODE_PIP": self.op.primary_ip,
      "NODE_SIP": self.op.secondary_ip,
      }
3600 nodes_0 = self.cfg.GetNodeList()
3601 nodes_1 = nodes_0 + [self.op.node_name, ]
3602 return env, nodes_0, nodes_1
3604 def CheckPrereq(self):
3605 """Check prerequisites.
    This checks:
     - the new node is not already in the config
     - it is resolvable
     - its parameters (single/dual homed) matches the cluster
3612 Any errors are signaled by raising errors.OpPrereqError.
    """
    cfg = self.cfg
    hostname = self.hostname
3617 node = hostname.name
3618 primary_ip = self.op.primary_ip = hostname.ip
3619 if self.op.secondary_ip is None:
3620 if self.primary_ip_family == netutils.IP6Address.family:
3621 raise errors.OpPrereqError("When using a IPv6 primary address, a valid"
                                   " IPv4 address must be given as secondary",
                                   errors.ECODE_INVAL)
3624 self.op.secondary_ip = primary_ip
3626 secondary_ip = self.op.secondary_ip
3627 if not netutils.IP4Address.IsValid(secondary_ip):
3628 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
3629 " address" % secondary_ip, errors.ECODE_INVAL)
3631 node_list = cfg.GetNodeList()
3632 if not self.op.readd and node in node_list:
3633 raise errors.OpPrereqError("Node %s is already in the configuration" %
3634 node, errors.ECODE_EXISTS)
3635 elif self.op.readd and node not in node_list:
      raise errors.OpPrereqError("Node %s is not in the configuration" % node,
                                 errors.ECODE_NOENT)
3639 self.changed_primary_ip = False
3641 for existing_node_name in node_list:
3642 existing_node = cfg.GetNodeInfo(existing_node_name)
3644 if self.op.readd and node == existing_node_name:
3645 if existing_node.secondary_ip != secondary_ip:
3646 raise errors.OpPrereqError("Readded node doesn't have the same IP"
                                     " address configuration as before",
                                     errors.ECODE_INVAL)
3649 if existing_node.primary_ip != primary_ip:
          self.changed_primary_ip = True

        continue
3654 if (existing_node.primary_ip == primary_ip or
3655 existing_node.secondary_ip == primary_ip or
3656 existing_node.primary_ip == secondary_ip or
3657 existing_node.secondary_ip == secondary_ip):
3658 raise errors.OpPrereqError("New node ip address(es) conflict with"
3659 " existing node %s" % existing_node.name,
3660 errors.ECODE_NOTUNIQUE)
3662 # check that the type of the node (single versus dual homed) is the
3663 # same as for the master
3664 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
3665 master_singlehomed = myself.secondary_ip == myself.primary_ip
3666 newbie_singlehomed = secondary_ip == primary_ip
3667 if master_singlehomed != newbie_singlehomed:
3668 if master_singlehomed:
3669 raise errors.OpPrereqError("The master has no private ip but the"
                                   " new node has one",
                                   errors.ECODE_INVAL)
      else:
        raise errors.OpPrereqError("The master has a private ip but the"
                                   " new node doesn't have one",
                                   errors.ECODE_INVAL)
3677 # checks reachability
3678 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
3679 raise errors.OpPrereqError("Node not reachable by ping",
3680 errors.ECODE_ENVIRON)
3682 if not newbie_singlehomed:
3683 # check reachability from my secondary ip to newbie's secondary ip
3684 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
3685 source=myself.secondary_ip):
3686 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
3687 " based ping to noded port",
3688 errors.ECODE_ENVIRON)
    if self.op.readd:
      exceptions = [node]
    else:
      exceptions = []

    self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
    if self.op.readd:
      self.new_node = self.cfg.GetNodeInfo(node)
3699 assert self.new_node is not None, "Can't retrieve locked node %s" % node
    else:
      node_group = cfg.LookupNodeGroup(self.op.group)
3702 self.new_node = objects.Node(name=node,
3703 primary_ip=primary_ip,
3704 secondary_ip=secondary_ip,
3705 master_candidate=self.master_candidate,
                                   offline=False, drained=False,
                                   group=node_group)
3709 def Exec(self, feedback_fn):
3710 """Adds the new node to the cluster.
    """
    new_node = self.new_node
3714 node = new_node.name
3716 # for re-adds, reset the offline/drained/master-candidate flags;
3717 # we need to reset here, otherwise offline would prevent RPC calls
3718 # later in the procedure; this also means that if the re-add
3719 # fails, we are left with a non-offlined, broken node
    if self.op.readd:
      new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
3722 self.LogInfo("Readding a node, the offline/drained flags were reset")
3723 # if we demote the node, we do cleanup later in the procedure
3724 new_node.master_candidate = self.master_candidate
3725 if self.changed_primary_ip:
3726 new_node.primary_ip = self.op.primary_ip
3728 # notify the user about any possible mc promotion
3729 if new_node.master_candidate:
3730 self.LogInfo("Node will be a master candidate")
3732 # check connectivity
3733 result = self.rpc.call_version([node])[node]
3734 result.Raise("Can't get version information from node %s" % node)
3735 if constants.PROTOCOL_VERSION == result.payload:
3736 logging.info("Communication to node %s fine, sw version %s match",
3737 node, result.payload)
    else:
      raise errors.OpExecError("Version mismatch master version %s,"
3740 " node version %s" %
3741 (constants.PROTOCOL_VERSION, result.payload))
3743 # Add node to our /etc/hosts, and add key to known_hosts
3744 if self.cfg.GetClusterInfo().modify_etc_hosts:
3745 master_node = self.cfg.GetMasterNode()
3746 result = self.rpc.call_etc_hosts_modify(master_node,
3747 constants.ETC_HOSTS_ADD,
                                               new_node.name,
                                               new_node.primary_ip)
      result.Raise("Can't update hosts file with new host data")
3752 if new_node.secondary_ip != new_node.primary_ip:
3753 result = self.rpc.call_node_has_ip_address(new_node.name,
3754 new_node.secondary_ip)
3755 result.Raise("Failure checking secondary ip on node %s" % new_node.name,
3756 prereq=True, ecode=errors.ECODE_ENVIRON)
3757 if not result.payload:
3758 raise errors.OpExecError("Node claims it doesn't have the secondary ip"
3759 " you gave (%s). Please fix and re-run this"
3760 " command." % new_node.secondary_ip)
3762 node_verify_list = [self.cfg.GetMasterNode()]
3763 node_verify_param = {
3764 constants.NV_NODELIST: [node],
      # TODO: do a node-net-test as well?
      }
3768 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
3769 self.cfg.GetClusterName())
3770 for verifier in node_verify_list:
3771 result[verifier].Raise("Cannot communicate with node %s" % verifier)
3772 nl_payload = result[verifier].payload[constants.NV_NODELIST]
      if nl_payload:
        for failed in nl_payload:
3775 feedback_fn("ssh/hostname verification failed"
3776 " (checking from %s): %s" %
3777 (verifier, nl_payload[failed]))
3778 raise errors.OpExecError("ssh/hostname verification failed.")
    if self.op.readd:
      _RedistributeAncillaryFiles(self)
3782 self.context.ReaddNode(new_node)
3783 # make sure we redistribute the config
3784 self.cfg.Update(new_node, feedback_fn)
3785 # and make sure the new node will not have old files around
3786 if not new_node.master_candidate:
3787 result = self.rpc.call_node_demote_from_mc(new_node.name)
3788 msg = result.fail_msg
3789 if msg:
3790 self.LogWarning("Node failed to demote itself from master"
3791 " candidate status: %s" % msg)
3793 _RedistributeAncillaryFiles(self, additional_nodes=[node])
3794 self.context.AddNode(new_node, self.proc.GetECId())
3797 class LUSetNodeParams(LogicalUnit):
3798 """Modifies the parameters of a node.
3801 HPATH = "node-modify"
3802 HTYPE = constants.HTYPE_NODE
3803 _OP_PARAMS = [
3804 _PNodeName,
3805 ("master_candidate", None, ht.TMaybeBool),
3806 ("offline", None, ht.TMaybeBool),
3807 ("drained", None, ht.TMaybeBool),
3808 ("auto_promote", False, ht.TBool),
3813 def CheckArguments(self):
3814 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3815 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained]
3816 if all_mods.count(None) == 3:
3817 raise errors.OpPrereqError("Please pass at least one modification",
3818 errors.ECODE_INVAL)
3819 if all_mods.count(True) > 1:
3820 raise errors.OpPrereqError("Can't set the node into more than one"
3821 " state at the same time",
3822 errors.ECODE_INVAL)
3824 # Boolean value that tells us whether we're offlining or draining the node
3825 self.offline_or_drain = (self.op.offline == True or
3826 self.op.drained == True)
3827 self.deoffline_or_drain = (self.op.offline == False or
3828 self.op.drained == False)
3829 self.might_demote = (self.op.master_candidate == False or
3830 self.offline_or_drain)
3832 self.lock_all = self.op.auto_promote and self.might_demote
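# Illustrative note (added commentary, not in the original source):
# auto_promote only matters when the requested change can demote this node
# (master_candidate=False, offline=True or drained=True). In that case all
# nodes are locked in ExpandNames below, because promoting a replacement
# candidate via _DecideSelfPromotion/_AdjustCandidatePool needs a
# consistent, cluster-wide view of the candidate pool.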
3835 def ExpandNames(self):
3837 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
3839 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
3841 def BuildHooksEnv(self):
3844 This runs on the master node.
3846 """
3847 env = {
3848 "OP_TARGET": self.op.node_name,
3849 "MASTER_CANDIDATE": str(self.op.master_candidate),
3850 "OFFLINE": str(self.op.offline),
3851 "DRAINED": str(self.op.drained),
3853 nl = [self.cfg.GetMasterNode(),
3854 self.op.node_name]
3855 return env, nl, nl
3857 def CheckPrereq(self):
3858 """Check prerequisites.
3860 This checks the node's state and the validity of the requested changes.
3863 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
3865 if (self.op.master_candidate is not None or
3866 self.op.drained is not None or
3867 self.op.offline is not None):
3868 # we can't change the master's node flags
3869 if self.op.node_name == self.cfg.GetMasterNode():
3870 raise errors.OpPrereqError("The master role can be changed"
3871 " only via master-failover",
3875 if node.master_candidate and self.might_demote and not self.lock_all:
3876 assert not self.op.auto_promote, "auto-promote set but lock_all not"
3877 # check if after removing the current node, we're missing master
3878 # candidates
3879 (mc_remaining, mc_should, _) = \
3880 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
3881 if mc_remaining < mc_should:
3882 raise errors.OpPrereqError("Not enough master candidates, please"
3883 " pass auto_promote to allow promotion",
3886 if (self.op.master_candidate == True and
3887 ((node.offline and not self.op.offline == False) or
3888 (node.drained and not self.op.drained == False))):
3889 raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
3890 " to master_candidate" % node.name,
3893 # If we're being deofflined/drained, we'll MC ourself if needed
3894 if (self.deoffline_or_drain and not self.offline_or_drain and not
3895 self.op.master_candidate == True and not node.master_candidate):
3896 self.op.master_candidate = _DecideSelfPromotion(self)
3897 if self.op.master_candidate:
3898 self.LogInfo("Autopromoting node to master candidate")
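# Illustrative example (added commentary, not in the original source): a
# node that was offlined earlier and thereby auto-demoted from master
# candidate lands here when de-offlined with "offline=False"; if the
# candidate pool is short, _DecideSelfPromotion re-promotes it
# automatically.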
3902 def Exec(self, feedback_fn):
3907 node = self.node
3909 result = []
3911 if self.op.offline is not None:
3912 node.offline = self.op.offline
3913 result.append(("offline", str(self.op.offline)))
3914 if self.op.offline == True:
3915 if node.master_candidate:
3916 node.master_candidate = False
3918 result.append(("master_candidate", "auto-demotion due to offline"))
3919 if node.drained:
3920 node.drained = False
3921 result.append(("drained", "clear drained status due to offline"))
3923 if self.op.master_candidate is not None:
3924 node.master_candidate = self.op.master_candidate
3926 result.append(("master_candidate", str(self.op.master_candidate)))
3927 if self.op.master_candidate == False:
3928 rrc = self.rpc.call_node_demote_from_mc(node.name)
3929 msg = rrc.fail_msg
3930 if msg:
3931 self.LogWarning("Node failed to demote itself: %s" % msg)
3933 if self.op.drained is not None:
3934 node.drained = self.op.drained
3935 result.append(("drained", str(self.op.drained)))
3936 if self.op.drained == True:
3937 if node.master_candidate:
3938 node.master_candidate = False
3940 result.append(("master_candidate", "auto-demotion due to drain"))
3941 rrc = self.rpc.call_node_demote_from_mc(node.name)
3942 msg = rrc.fail_msg
3943 if msg:
3944 self.LogWarning("Node failed to demote itself: %s" % msg)
3945 if node.offline:
3946 node.offline = False
3947 result.append(("offline", "clear offline status due to drain"))
3949 # we locked all nodes, we adjust the CP before updating this node
3950 if self.lock_all:
3951 _AdjustCandidatePool(self, [node.name])
3953 # this will trigger configuration file update, if needed
3954 self.cfg.Update(node, feedback_fn)
3956 # this will trigger job queue propagation or cleanup
3958 self.context.ReaddNode(node)
3960 return result
3963 class LUPowercycleNode(NoHooksLU):
3964 """Powercycles a node.
3973 def CheckArguments(self):
3974 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3975 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
3976 raise errors.OpPrereqError("The node is the master and the force"
3977 " parameter was not set",
3978 errors.ECODE_INVAL)
3980 def ExpandNames(self):
3981 """Locking for PowercycleNode.
3983 This is a last-resort option and shouldn't block on other
3984 jobs. Therefore, we grab no locks.
3987 self.needed_locks = {}
3989 def Exec(self, feedback_fn):
3993 result = self.rpc.call_node_powercycle(self.op.node_name,
3994 self.cfg.GetHypervisorType())
3995 result.Raise("Failed to schedule the reboot")
3996 return result.payload
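# Added note (not in the original source): call_node_powercycle only
# schedules the powercycle on the target node and returns a message; the
# actual reboot happens asynchronously on the node itself, which is why the
# error above talks about failing to *schedule* the reboot.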
3999 class LUQueryClusterInfo(NoHooksLU):
4000 """Query cluster configuration.
4005 def ExpandNames(self):
4006 self.needed_locks = {}
4008 def Exec(self, feedback_fn):
4009 """Return cluster config.
4012 cluster = self.cfg.GetClusterInfo()
4014 os_hvp = {}
4015 # Filter just for enabled hypervisors
4016 for os_name, hv_dict in cluster.os_hvp.items():
4017 os_hvp[os_name] = {}
4018 for hv_name, hv_params in hv_dict.items():
4019 if hv_name in cluster.enabled_hypervisors:
4020 os_hvp[os_name][hv_name] = hv_params
4022 # Convert ip_family to ip_version
4023 primary_ip_version = constants.IP4_VERSION
4024 if cluster.primary_ip_family == netutils.IP6Address.family:
4025 primary_ip_version = constants.IP6_VERSION
4027 result = {
4028 "software_version": constants.RELEASE_VERSION,
4029 "protocol_version": constants.PROTOCOL_VERSION,
4030 "config_version": constants.CONFIG_VERSION,
4031 "os_api_version": max(constants.OS_API_VERSIONS),
4032 "export_version": constants.EXPORT_VERSION,
4033 "architecture": (platform.architecture()[0], platform.machine()),
4034 "name": cluster.cluster_name,
4035 "master": cluster.master_node,
4036 "default_hypervisor": cluster.enabled_hypervisors[0],
4037 "enabled_hypervisors": cluster.enabled_hypervisors,
4038 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
4039 for hypervisor_name in cluster.enabled_hypervisors]),
4040 "os_hvp": os_hvp,
4041 "beparams": cluster.beparams,
4042 "osparams": cluster.osparams,
4043 "nicparams": cluster.nicparams,
4044 "candidate_pool_size": cluster.candidate_pool_size,
4045 "master_netdev": cluster.master_netdev,
4046 "volume_group_name": cluster.volume_group_name,
4047 "drbd_usermode_helper": cluster.drbd_usermode_helper,
4048 "file_storage_dir": cluster.file_storage_dir,
4049 "maintain_node_health": cluster.maintain_node_health,
4050 "ctime": cluster.ctime,
4051 "mtime": cluster.mtime,
4052 "uuid": cluster.uuid,
4053 "tags": list(cluster.GetTags()),
4054 "uid_pool": cluster.uid_pool,
4055 "default_iallocator": cluster.default_iallocator,
4056 "reserved_lvs": cluster.reserved_lvs,
4057 "primary_ip_version": primary_ip_version,
4058 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
4064 class LUQueryConfigValues(NoHooksLU):
4065 """Return configuration values.
4068 _OP_PARAMS = [_POutputFields]
4069 REQ_BGL = False
4070 _FIELDS_DYNAMIC = utils.FieldSet()
4071 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
4072 "watcher_pause", "volume_group_name")
4074 def CheckArguments(self):
4075 _CheckOutputFields(static=self._FIELDS_STATIC,
4076 dynamic=self._FIELDS_DYNAMIC,
4077 selected=self.op.output_fields)
4079 def ExpandNames(self):
4080 self.needed_locks = {}
4082 def Exec(self, feedback_fn):
4083 """Dump a representation of the cluster config to the standard output.
4087 for field in self.op.output_fields:
4088 if field == "cluster_name":
4089 entry = self.cfg.GetClusterName()
4090 elif field == "master_node":
4091 entry = self.cfg.GetMasterNode()
4092 elif field == "drain_flag":
4093 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
4094 elif field == "watcher_pause":
4095 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
4096 elif field == "volume_group_name":
4097 entry = self.cfg.GetVGName()
4098 else:
4099 raise errors.ParameterError(field)
4100 values.append(entry)
4102 return values
4104 class LUActivateInstanceDisks(NoHooksLU):
4105 """Bring up an instance's disks.
4110 ("ignore_size", False, ht.TBool),
4114 def ExpandNames(self):
4115 self._ExpandAndLockInstance()
4116 self.needed_locks[locking.LEVEL_NODE] = []
4117 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4119 def DeclareLocks(self, level):
4120 if level == locking.LEVEL_NODE:
4121 self._LockInstancesNodes()
4123 def CheckPrereq(self):
4124 """Check prerequisites.
4126 This checks that the instance is in the cluster.
4129 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4130 assert self.instance is not None, \
4131 "Cannot retrieve locked instance %s" % self.op.instance_name
4132 _CheckNodeOnline(self, self.instance.primary_node)
4134 def Exec(self, feedback_fn):
4135 """Activate the disks.
4138 disks_ok, disks_info = \
4139 _AssembleInstanceDisks(self, self.instance,
4140 ignore_size=self.op.ignore_size)
4141 if not disks_ok:
4142 raise errors.OpExecError("Cannot activate block devices")
4144 return disks_info
4147 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
4148 ignore_size=False):
4149 """Prepare the block devices for an instance.
4151 This sets up the block devices on all nodes.
4153 @type lu: L{LogicalUnit}
4154 @param lu: the logical unit on whose behalf we execute
4155 @type instance: L{objects.Instance}
4156 @param instance: the instance for whose disks we assemble
4157 @type disks: list of L{objects.Disk} or None
4158 @param disks: which disks to assemble (or all, if None)
4159 @type ignore_secondaries: boolean
4160 @param ignore_secondaries: if true, errors on secondary nodes
4161 won't result in an error return from the function
4162 @type ignore_size: boolean
4163 @param ignore_size: if true, the current known size of the disk
4164 will not be used during the disk activation, useful for cases
4165 when the size is wrong
4166 @return: a tuple of (disks_ok, device_info); device_info is a list of
4167 (host, instance_visible_name, node_visible_name) triples with the
4168 mapping from node devices to instance devices
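Illustrative example (added, not from the original source) of a successful
return value, with hypothetical names:
(True, [("node1.example.com", "disk/0", "/dev/drbd0")])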
4170 """
4171 device_info = []
4172 disks_ok = True
4173 iname = instance.name
4174 disks = _ExpandCheckDisks(instance, disks)
4176 # With the two-pass mechanism we try to reduce the window of
4177 # opportunity for the race condition of switching DRBD to primary
4178 # before handshaking occurred, but we do not eliminate it
4180 # The proper fix would be to wait (with some limits) until the
4181 # connection has been made and drbd transitions from WFConnection
4182 # into any other network-connected state (Connected, SyncTarget,
4183 # SyncSource, etc.)
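# Illustrative walk-through (added commentary, not in the original source):
# for a DRBD disk mirrored between primary A and secondary B, pass 1 calls
# call_blockdev_assemble on both A and B with is_primary=False, giving the
# two DRBD endpoints time to connect; pass 2 then re-assembles only on A
# with is_primary=True. If A were promoted before B is up, DRBD could come
# up primary but unconnected, hence the two passes.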
4185 # 1st pass, assemble on all nodes in secondary mode
4186 for inst_disk in disks:
4187 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4188 if ignore_size:
4189 node_disk = node_disk.Copy()
4190 node_disk.UnsetSize()
4191 lu.cfg.SetDiskID(node_disk, node)
4192 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
4193 msg = result.fail_msg
4194 if msg:
4195 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4196 " (is_primary=False, pass=1): %s",
4197 inst_disk.iv_name, node, msg)
4198 if not ignore_secondaries:
4199 disks_ok = False
4201 # FIXME: race condition on drbd migration to primary
4203 # 2nd pass, do only the primary node
4204 for inst_disk in disks:
4205 dev_path = None
4207 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4208 if node != instance.primary_node:
4209 continue
4210 if ignore_size:
4211 node_disk = node_disk.Copy()
4212 node_disk.UnsetSize()
4213 lu.cfg.SetDiskID(node_disk, node)
4214 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
4215 msg = result.fail_msg
4216 if msg:
4217 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4218 " (is_primary=True, pass=2): %s",
4219 inst_disk.iv_name, node, msg)
4220 disks_ok = False
4221 else:
4222 dev_path = result.payload
4224 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
4226 # leave the disks configured for the primary node
4227 # this is a workaround that would be fixed better by
4228 # improving the logical/physical id handling
4229 for disk in disks:
4230 lu.cfg.SetDiskID(disk, instance.primary_node)
4232 return disks_ok, device_info
4235 def _StartInstanceDisks(lu, instance, force):
4236 """Start the disks of an instance.
4239 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
4240 ignore_secondaries=force)
4241 if not disks_ok:
4242 _ShutdownInstanceDisks(lu, instance)
4243 if force is not None and not force:
4244 lu.proc.LogWarning("", hint="If the message above refers to a"
4245 " secondary node,"
4246 " you can retry the operation using '--force'.")
4247 raise errors.OpExecError("Disk consistency error")
4250 class LUDeactivateInstanceDisks(NoHooksLU):
4251 """Shutdown an instance's disks.
4259 def ExpandNames(self):
4260 self._ExpandAndLockInstance()
4261 self.needed_locks[locking.LEVEL_NODE] = []
4262 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4264 def DeclareLocks(self, level):
4265 if level == locking.LEVEL_NODE:
4266 self._LockInstancesNodes()
4268 def CheckPrereq(self):
4269 """Check prerequisites.
4271 This checks that the instance is in the cluster.
4274 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4275 assert self.instance is not None, \
4276 "Cannot retrieve locked instance %s" % self.op.instance_name
4278 def Exec(self, feedback_fn):
4279 """Deactivate the disks
4282 instance = self.instance
4283 _SafeShutdownInstanceDisks(self, instance)
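# Added note (not in the original source): the "safe" variant defined below
# first verifies via _CheckInstanceDown that the instance is not running, so
# disks are never detached from underneath a live instance.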
4286 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
4287 """Shutdown block devices of an instance.
4289 This function checks if an instance is running, before calling
4290 _ShutdownInstanceDisks.
4293 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
4294 _ShutdownInstanceDisks(lu, instance, disks=disks)
4297 def _ExpandCheckDisks(instance, disks):
4298 """Return the instance disks selected by the disks list
4300 @type disks: list of L{objects.Disk} or None
4301 @param disks: selected disks
4302 @rtype: list of L{objects.Disk}
4303 @return: selected instance disks to act on
4305 """
4306 if disks is None:
4307 return instance.disks
4308 else:
4309 if not set(disks).issubset(instance.disks):
4310 raise errors.ProgrammerError("Can only act on disks belonging to the"
4311 " target instance")
4312 return disks
4315 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
4316 """Shutdown block devices of an instance.
4318 This does the shutdown on all nodes of the instance.
4320 If ignore_primary is false, errors on the primary node are
4321 ignored.
4322 """
4323 all_result = True
4325 disks = _ExpandCheckDisks(instance, disks)
4327 for disk in disks:
4328 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
4329 lu.cfg.SetDiskID(top_disk, node)
4330 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
4331 msg = result.fail_msg
4333 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
4334 disk.iv_name, node, msg)
4335 if not ignore_primary or node != instance.primary_node:
4336 all_result = False
4338 return all_result
4340 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
4341 """Checks if a node has enough free memory.
4343 This function checks if a given node has the needed amount of free
4344 memory. In case the node has less memory or we cannot get the
4345 information from the node, this function raises an OpPrereqError
4346 exception.
4348 @type lu: C{LogicalUnit}
4349 @param lu: a logical unit from which we get configuration data
4350 @type node: C{str}
4351 @param node: the node to check
4352 @type reason: C{str}
4353 @param reason: string to use in the error message
4354 @type requested: C{int}
4355 @param requested: the amount of memory in MiB to check for
4356 @type hypervisor_name: C{str}
4357 @param hypervisor_name: the hypervisor to ask for memory stats
4358 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
4359 we cannot check the node
4362 nodeinfo = lu.rpc.call_node_info([node], lu.cfg.GetVGName(), hypervisor_name)
4363 nodeinfo[node].Raise("Can't get data from node %s" % node,
4364 prereq=True, ecode=errors.ECODE_ENVIRON)
4365 free_mem = nodeinfo[node].payload.get('memory_free', None)
4366 if not isinstance(free_mem, int):
4367 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
4368 " was '%s'" % (node, free_mem),
4369 errors.ECODE_ENVIRON)
4370 if requested > free_mem:
4371 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
4372 " needed %s MiB, available %s MiB" %
4373 (node, reason, requested, free_mem),
4374 errors.ECODE_NORES)
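# Illustrative call (added commentary; hypothetical node name and values):
#   _CheckNodeFreeMemory(self, "node2.example.com",
#                        "failing over instance web1",
#                        2048, "xen-pvm")
# raises OpPrereqError unless node2 reports at least 2048 MiB of free
# memory under the xen-pvm hypervisor.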
4377 def _CheckNodesFreeDisk(lu, nodenames, requested):
4378 """Checks if nodes have enough free disk space in the default VG.
4380 This function checks if all given nodes have the needed amount of
4381 free disk. In case any node has less disk or we cannot get the
4382 information from the node, this function raises an OpPrereqError
4383 exception.
4385 @type lu: C{LogicalUnit}
4386 @param lu: a logical unit from which we get configuration data
4387 @type nodenames: C{list}
4388 @param nodenames: the list of node names to check
4389 @type requested: C{int}
4390 @param requested: the amount of disk in MiB to check for
4391 @raise errors.OpPrereqError: if the node doesn't have enough disk, or
4392 we cannot check the node
4395 nodeinfo = lu.rpc.call_node_info(nodenames, lu.cfg.GetVGName(),
4396 lu.cfg.GetHypervisorType())
4397 for node in nodenames:
4398 info = nodeinfo[node]
4399 info.Raise("Cannot get current information from node %s" % node,
4400 prereq=True, ecode=errors.ECODE_ENVIRON)
4401 vg_free = info.payload.get("vg_free", None)
4402 if not isinstance(vg_free, int):
4403 raise errors.OpPrereqError("Can't compute free disk space on node %s,"
4404 " result was '%s'" % (node, vg_free),
4405 errors.ECODE_ENVIRON)
4406 if requested > vg_free:
4407 raise errors.OpPrereqError("Not enough disk space on target node %s:"
4408 " required %d MiB, available %d MiB" %
4409 (node, requested, vg_free),
4410 errors.ECODE_NORES)
4413 class LUStartupInstance(LogicalUnit):
4414 """Starts an instance.
4417 HPATH = "instance-start"
4418 HTYPE = constants.HTYPE_INSTANCE
4419 _OP_PARAMS = [
4420 _PInstanceName,
4421 _PForce,
4422 _PIgnoreOfflineNodes,
4423 ("hvparams", ht.EmptyDict, ht.TDict),
4424 ("beparams", ht.EmptyDict, ht.TDict),
4428 def CheckArguments(self):
4430 if self.op.beparams:
4431 # fill the beparams dict
4432 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
4434 def ExpandNames(self):
4435 self._ExpandAndLockInstance()
4437 def BuildHooksEnv(self):
4440 This runs on master, primary and secondary nodes of the instance.
4444 "FORCE": self.op.force,
4446 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
4447 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4448 return env, nl, nl
4450 def CheckPrereq(self):
4451 """Check prerequisites.
4453 This checks that the instance is in the cluster.
4456 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4457 assert self.instance is not None, \
4458 "Cannot retrieve locked instance %s" % self.op.instance_name
4461 if self.op.hvparams:
4462 # check hypervisor parameter syntax (locally)
4463 cluster = self.cfg.GetClusterInfo()
4464 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
4465 filled_hvp = cluster.FillHV(instance)
4466 filled_hvp.update(self.op.hvparams)
4467 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
4468 hv_type.CheckParameterSyntax(filled_hvp)
4469 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
4471 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
4473 if self.primary_offline and self.op.ignore_offline_nodes:
4474 self.proc.LogWarning("Ignoring offline primary node")
4476 if self.op.hvparams or self.op.beparams:
4477 self.proc.LogWarning("Overridden parameters are ignored")
4478 else:
4479 _CheckNodeOnline(self, instance.primary_node)
4481 bep = self.cfg.GetClusterInfo().FillBE(instance)
4483 # check bridges existence
4484 _CheckInstanceBridgesExist(self, instance)
4486 remote_info = self.rpc.call_instance_info(instance.primary_node,
4488 instance.hypervisor)
4489 remote_info.Raise("Error checking node %s" % instance.primary_node,
4490 prereq=True, ecode=errors.ECODE_ENVIRON)
4491 if not remote_info.payload: # not running already
4492 _CheckNodeFreeMemory(self, instance.primary_node,
4493 "starting instance %s" % instance.name,
4494 bep[constants.BE_MEMORY], instance.hypervisor)
4496 def Exec(self, feedback_fn):
4497 """Start the instance.
4500 instance = self.instance
4501 force = self.op.force
4503 self.cfg.MarkInstanceUp(instance.name)
4505 if self.primary_offline:
4506 assert self.op.ignore_offline_nodes
4507 self.proc.LogInfo("Primary node offline, marked instance as started")
4508 else:
4509 node_current = instance.primary_node
4511 _StartInstanceDisks(self, instance, force)
4513 result = self.rpc.call_instance_start(node_current, instance,
4514 self.op.hvparams, self.op.beparams)
4515 msg = result.fail_msg
4516 if msg:
4517 _ShutdownInstanceDisks(self, instance)
4518 raise errors.OpExecError("Could not start instance: %s" % msg)
4521 class LURebootInstance(LogicalUnit):
4522 """Reboot an instance.
4525 HPATH = "instance-reboot"
4526 HTYPE = constants.HTYPE_INSTANCE
4527 _OP_PARAMS = [
4528 _PInstanceName,
4529 ("ignore_secondaries", False, ht.TBool),
4530 ("reboot_type", ht.NoDefault, ht.TElemOf(constants.REBOOT_TYPES)),
4535 def ExpandNames(self):
4536 self._ExpandAndLockInstance()
4538 def BuildHooksEnv(self):
4541 This runs on master, primary and secondary nodes of the instance.
4545 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
4546 "REBOOT_TYPE": self.op.reboot_type,
4547 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
4549 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
4550 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4551 return env, nl, nl
4553 def CheckPrereq(self):
4554 """Check prerequisites.
4556 This checks that the instance is in the cluster.
4559 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4560 assert self.instance is not None, \
4561 "Cannot retrieve locked instance %s" % self.op.instance_name
4563 _CheckNodeOnline(self, instance.primary_node)
4565 # check bridges existence
4566 _CheckInstanceBridgesExist(self, instance)
4568 def Exec(self, feedback_fn):
4569 """Reboot the instance.
4572 instance = self.instance
4573 ignore_secondaries = self.op.ignore_secondaries
4574 reboot_type = self.op.reboot_type
4576 node_current = instance.primary_node
4578 if reboot_type in [constants.INSTANCE_REBOOT_SOFT,
4579 constants.INSTANCE_REBOOT_HARD]:
4580 for disk in instance.disks:
4581 self.cfg.SetDiskID(disk, node_current)
4582 result = self.rpc.call_instance_reboot(node_current, instance,
4583 reboot_type,
4584 self.op.shutdown_timeout)
4585 result.Raise("Could not reboot instance")
4586 else:
4587 result = self.rpc.call_instance_shutdown(node_current, instance,
4588 self.op.shutdown_timeout)
4589 result.Raise("Could not shutdown instance for full reboot")
4590 _ShutdownInstanceDisks(self, instance)
4591 _StartInstanceDisks(self, instance, ignore_secondaries)
4592 result = self.rpc.call_instance_start(node_current, instance, None, None)
4593 msg = result.fail_msg
4594 if msg:
4595 _ShutdownInstanceDisks(self, instance)
4596 raise errors.OpExecError("Could not start instance for"
4597 " full reboot: %s" % msg)
4599 self.cfg.MarkInstanceUp(instance.name)
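# Added summary (not in the original source): soft/hard reboots are
# delegated to the hypervisor on the primary node via call_instance_reboot;
# a full reboot is emulated instead as shutdown + disk deactivation + disk
# activation + start, which is why only that branch needs the
# _ShutdownInstanceDisks/_StartInstanceDisks dance above.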
4602 class LUShutdownInstance(LogicalUnit):
4603 """Shutdown an instance.
4606 HPATH = "instance-stop"
4607 HTYPE = constants.HTYPE_INSTANCE
4608 _OP_PARAMS = [
4609 _PInstanceName,
4610 _PIgnoreOfflineNodes,
4611 ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, ht.TPositiveInt),
4615 def ExpandNames(self):
4616 self._ExpandAndLockInstance()
4618 def BuildHooksEnv(self):
4621 This runs on master, primary and secondary nodes of the instance.
4624 env = _BuildInstanceHookEnvByObject(self, self.instance)
4625 env["TIMEOUT"] = self.op.timeout
4626 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4627 return env, nl, nl
4629 def CheckPrereq(self):
4630 """Check prerequisites.
4632 This checks that the instance is in the cluster.
4635 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4636 assert self.instance is not None, \
4637 "Cannot retrieve locked instance %s" % self.op.instance_name
4639 self.primary_offline = \
4640 self.cfg.GetNodeInfo(self.instance.primary_node).offline
4642 if self.primary_offline and self.op.ignore_offline_nodes:
4643 self.proc.LogWarning("Ignoring offline primary node")
4645 _CheckNodeOnline(self, self.instance.primary_node)
4647 def Exec(self, feedback_fn):
4648 """Shutdown the instance.
4651 instance = self.instance
4652 node_current = instance.primary_node
4653 timeout = self.op.timeout
4655 self.cfg.MarkInstanceDown(instance.name)
4657 if self.primary_offline:
4658 assert self.op.ignore_offline_nodes
4659 self.proc.LogInfo("Primary node offline, marked instance as stopped")
4660 else:
4661 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
4662 msg = result.fail_msg
4663 if msg:
4664 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
4666 _ShutdownInstanceDisks(self, instance)
4669 class LUReinstallInstance(LogicalUnit):
4670 """Reinstall an instance.
4673 HPATH = "instance-reinstall"
4674 HTYPE = constants.HTYPE_INSTANCE
4675 _OP_PARAMS = [
4676 _PInstanceName,
4677 ("os_type", None, ht.TMaybeString),
4678 ("force_variant", False, ht.TBool),
4679 ("osparams", None, ht.TOr(ht.TDict, ht.TNone)),
4683 def ExpandNames(self):
4684 self._ExpandAndLockInstance()
4686 def BuildHooksEnv(self):
4689 This runs on master, primary and secondary nodes of the instance.
4692 env = _BuildInstanceHookEnvByObject(self, self.instance)
4693 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4694 return env, nl, nl
4696 def CheckPrereq(self):
4697 """Check prerequisites.
4699 This checks that the instance is in the cluster and is not running.
4702 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4703 assert instance is not None, \
4704 "Cannot retrieve locked instance %s" % self.op.instance_name
4705 _CheckNodeOnline(self, instance.primary_node)
4707 if instance.disk_template == constants.DT_DISKLESS:
4708 raise errors.OpPrereqError("Instance '%s' has no disks" %
4709 self.op.instance_name,
4710 errors.ECODE_INVAL)
4711 _CheckInstanceDown(self, instance, "cannot reinstall")
4713 if self.op.os_type is not None:
4715 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
4716 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
4717 instance_os = self.op.os_type
4718 else:
4719 instance_os = instance.os
4721 nodelist = list(instance.all_nodes)
4723 if self.op.osparams:
4724 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
4725 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
4726 self.os_inst = i_osdict # the new dict (without defaults)
4727 else:
4728 self.os_inst = None
4730 self.instance = instance
4732 def Exec(self, feedback_fn):
4733 """Reinstall the instance.
4736 inst = self.instance
4738 if self.op.os_type is not None:
4739 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
4740 inst.os = self.op.os_type
4741 # Write to configuration
4742 self.cfg.Update(inst, feedback_fn)
4744 _StartInstanceDisks(self, inst, None)
4745 try:
4746 feedback_fn("Running the instance OS create scripts...")
4747 # FIXME: pass debug option from opcode to backend
4748 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
4749 self.op.debug_level,
4750 osparams=self.os_inst)
4751 result.Raise("Could not install OS for instance %s on node %s" %
4752 (inst.name, inst.primary_node))
4753 finally:
4754 _ShutdownInstanceDisks(self, inst)
4757 class LURecreateInstanceDisks(LogicalUnit):
4758 """Recreate an instance's missing disks.
4761 HPATH = "instance-recreate-disks"
4762 HTYPE = constants.HTYPE_INSTANCE
4763 _OP_PARAMS = [
4764 _PInstanceName,
4765 ("disks", ht.EmptyList, ht.TListOf(ht.TPositiveInt)),
4769 def ExpandNames(self):
4770 self._ExpandAndLockInstance()
4772 def BuildHooksEnv(self):
4775 This runs on master, primary and secondary nodes of the instance.
4778 env = _BuildInstanceHookEnvByObject(self, self.instance)
4779 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4780 return env, nl, nl
4782 def CheckPrereq(self):
4783 """Check prerequisites.
4785 This checks that the instance is in the cluster and is not running.
4788 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4789 assert instance is not None, \
4790 "Cannot retrieve locked instance %s" % self.op.instance_name
4791 _CheckNodeOnline(self, instance.primary_node)
4793 if instance.disk_template == constants.DT_DISKLESS:
4794 raise errors.OpPrereqError("Instance '%s' has no disks" %
4795 self.op.instance_name, errors.ECODE_INVAL)
4796 _CheckInstanceDown(self, instance, "cannot recreate disks")
4798 if not self.op.disks:
4799 self.op.disks = range(len(instance.disks))
4800 else:
4801 for idx in self.op.disks:
4802 if idx >= len(instance.disks):
4803 raise errors.OpPrereqError("Invalid disk index passed '%s'" % idx,
4804 errors.ECODE_INVAL)
4806 self.instance = instance
4808 def Exec(self, feedback_fn):
4809 """Recreate the disks.
4813 for idx, _ in enumerate(self.instance.disks):
4814 if idx not in self.op.disks: # disk idx has not been passed in
4815 to_skip.append(idx)
4818 _CreateDisks(self, self.instance, to_skip=to_skip)
4821 class LURenameInstance(LogicalUnit):
4822 """Rename an instance.
4825 HPATH = "instance-rename"
4826 HTYPE = constants.HTYPE_INSTANCE
4827 _OP_PARAMS = [
4828 _PInstanceName,
4829 ("new_name", ht.NoDefault, ht.TNonEmptyString),
4830 ("ip_check", False, ht.TBool),
4831 ("name_check", True, ht.TBool),
4834 def CheckArguments(self):
4838 if self.op.ip_check and not self.op.name_check:
4839 # TODO: make the ip check more flexible and not depend on the name check
4840 raise errors.OpPrereqError("Cannot do ip check without a name check",
4841 errors.ECODE_INVAL)
4843 def BuildHooksEnv(self):
4846 This runs on master, primary and secondary nodes of the instance.
4849 env = _BuildInstanceHookEnvByObject(self, self.instance)
4850 env["INSTANCE_NEW_NAME"] = self.op.new_name
4851 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4852 return env, nl, nl
4854 def CheckPrereq(self):
4855 """Check prerequisites.
4857 This checks that the instance is in the cluster and is not running.
4860 self.op.instance_name = _ExpandInstanceName(self.cfg,
4861 self.op.instance_name)
4862 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4863 assert instance is not None
4864 _CheckNodeOnline(self, instance.primary_node)
4865 _CheckInstanceDown(self, instance, "cannot rename")
4866 self.instance = instance
4868 new_name = self.op.new_name
4869 if self.op.name_check:
4870 hostname = netutils.GetHostname(name=new_name)
4871 new_name = self.op.new_name = hostname.name
4872 if (self.op.ip_check and
4873 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
4874 raise errors.OpPrereqError("IP %s of instance %s already in use" %
4875 (hostname.ip, new_name),
4876 errors.ECODE_NOTUNIQUE)
4878 instance_list = self.cfg.GetInstanceList()
4879 if new_name in instance_list:
4880 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
4881 new_name, errors.ECODE_EXISTS)
4883 def Exec(self, feedback_fn):
4884 """Reinstall the instance.
4887 inst = self.instance
4888 old_name = inst.name
4890 if inst.disk_template == constants.DT_FILE:
4891 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
4893 self.cfg.RenameInstance(inst.name, self.op.new_name)
4894 # Change the instance lock. This is definitely safe while we hold the BGL
4895 self.context.glm.remove(locking.LEVEL_INSTANCE, old_name)
4896 self.context.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
4898 # re-read the instance from the configuration after rename
4899 inst = self.cfg.GetInstanceInfo(self.op.new_name)
4901 if inst.disk_template == constants.DT_FILE:
4902 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
4903 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
4904 old_file_storage_dir,
4905 new_file_storage_dir)
4906 result.Raise("Could not rename on node %s directory '%s' to '%s'"
4907 " (but the instance has been renamed in Ganeti)" %
4908 (inst.primary_node, old_file_storage_dir,
4909 new_file_storage_dir))
4911 _StartInstanceDisks(self, inst, None)
4912 try:
4913 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
4914 old_name, self.op.debug_level)
4915 msg = result.fail_msg
4916 if msg:
4917 msg = ("Could not run OS rename script for instance %s on node %s"
4918 " (but the instance has been renamed in Ganeti): %s" %
4919 (inst.name, inst.primary_node, msg))
4920 self.proc.LogWarning(msg)
4921 finally:
4922 _ShutdownInstanceDisks(self, inst)
4924 return inst.name
4927 class LURemoveInstance(LogicalUnit):
4928 """Remove an instance.
4931 HPATH = "instance-remove"
4932 HTYPE = constants.HTYPE_INSTANCE
4933 _OP_PARAMS = [
4934 _PInstanceName,
4935 ("ignore_failures", False, ht.TBool),
4940 def ExpandNames(self):
4941 self._ExpandAndLockInstance()
4942 self.needed_locks[locking.LEVEL_NODE] = []
4943 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4945 def DeclareLocks(self, level):
4946 if level == locking.LEVEL_NODE:
4947 self._LockInstancesNodes()
4949 def BuildHooksEnv(self):
4952 This runs on master, primary and secondary nodes of the instance.
4955 env = _BuildInstanceHookEnvByObject(self, self.instance)
4956 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
4957 nl = [self.cfg.GetMasterNode()]
4958 nl_post = list(self.instance.all_nodes) + nl
4959 return env, nl, nl_post
4961 def CheckPrereq(self):
4962 """Check prerequisites.
4964 This checks that the instance is in the cluster.
4967 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4968 assert self.instance is not None, \
4969 "Cannot retrieve locked instance %s" % self.op.instance_name
4971 def Exec(self, feedback_fn):
4972 """Remove the instance.
4975 instance = self.instance
4976 logging.info("Shutting down instance %s on node %s",
4977 instance.name, instance.primary_node)
4979 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
4980 self.op.shutdown_timeout)
4981 msg = result.fail_msg
4982 if msg:
4983 if self.op.ignore_failures:
4984 feedback_fn("Warning: can't shutdown instance: %s" % msg)
4986 raise errors.OpExecError("Could not shutdown instance %s on"
4987 " node %s: %s" %
4988 (instance.name, instance.primary_node, msg))
4990 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
4993 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
4994 """Utility function to remove an instance.
4997 logging.info("Removing block devices for instance %s", instance.name)
4999 if not _RemoveDisks(lu, instance):
5000 if not ignore_failures:
5001 raise errors.OpExecError("Can't remove instance's disks")
5002 feedback_fn("Warning: can't remove instance's disks")
5004 logging.info("Removing instance %s out of cluster config", instance.name)
5006 lu.cfg.RemoveInstance(instance.name)
5008 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
5009 "Instance lock removal conflict"
5011 # Remove lock for the instance
5012 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
5015 class LUQueryInstances(NoHooksLU):
5016 """Logical unit for querying instances.
5019 # pylint: disable-msg=W0142
5020 _OP_PARAMS = [
5021 ("output_fields", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
5022 ("names", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
5023 ("use_locking", False, ht.TBool),
5026 _SIMPLE_FIELDS = ["name", "os", "network_port", "hypervisor",
5027 "serial_no", "ctime", "mtime", "uuid"]
5028 _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
5030 "disk_template", "ip", "mac", "bridge",
5031 "nic_mode", "nic_link",
5032 "sda_size", "sdb_size", "vcpus", "tags",
5033 "network_port", "beparams",
5034 r"(disk)\.(size)/([0-9]+)",
5035 r"(disk)\.(sizes)", "disk_usage",
5036 r"(nic)\.(mac|ip|mode|link)/([0-9]+)",
5037 r"(nic)\.(bridge)/([0-9]+)",
5038 r"(nic)\.(macs|ips|modes|links|bridges)",
5039 r"(disk|nic)\.(count)",
5040 "hvparams", "custom_hvparams",
5041 "custom_beparams", "custom_nicparams",
5042 ] + _SIMPLE_FIELDS +
5043 ["hv/%s" % name
5044 for name in constants.HVS_PARAMETERS
5045 if name not in constants.HVC_GLOBALS] +
5046 ["be/%s" % name
5047 for name in constants.BES_PARAMETERS])
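# Illustrative field names accepted by this query (added commentary, not in
# the original source): "disk.size/0" (size of the first disk),
# "nic.mac/1" (MAC of the second NIC), "disk.count", plus "hv/..." and
# "be/..." fields generated by the comprehensions above over
# HVS_PARAMETERS and BES_PARAMETERS (e.g. "be/memory").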
5048 _FIELDS_DYNAMIC = utils.FieldSet("oper_state",
5049 "oper_ram",
5050 "oper_vcpus",
5051 "status")
5054 def CheckArguments(self):
5055 _CheckOutputFields(static=self._FIELDS_STATIC,
5056 dynamic=self._FIELDS_DYNAMIC,
5057 selected=self.op.output_fields)
5059 def ExpandNames(self):
5060 self.needed_locks = {}
5061 self.share_locks[locking.LEVEL_INSTANCE] = 1
5062 self.share_locks[locking.LEVEL_NODE] = 1
5064 if self.op.names:
5065 self.wanted = _GetWantedInstances(self, self.op.names)
5066 else:
5067 self.wanted = locking.ALL_SET
5069 self.do_node_query = self._FIELDS_STATIC.NonMatching(self.op.output_fields)
5070 self.do_locking = self.do_node_query and self.op.use_locking
5071 if self.do_locking:
5072 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
5073 self.needed_locks[locking.LEVEL_NODE] = []
5074 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5076 def DeclareLocks(self, level):
5077 if level == locking.LEVEL_NODE and self.do_locking:
5078 self._LockInstancesNodes()
5080 def Exec(self, feedback_fn):
5081 """Computes the list of nodes and their attributes.
5084 # pylint: disable-msg=R0912
5085 # way too many branches here
5086 all_info = self.cfg.GetAllInstancesInfo()
5087 if self.wanted == locking.ALL_SET:
5088 # caller didn't specify instance names, so ordering is not important
5089 if self.do_locking:
5090 instance_names = self.acquired_locks[locking.LEVEL_INSTANCE]
5091 else:
5092 instance_names = all_info.keys()
5093 instance_names = utils.NiceSort(instance_names)
5094 else:
5095 # caller did specify names, so we must keep the ordering
5096 if self.do_locking:
5097 tgt_set = self.acquired_locks[locking.LEVEL_INSTANCE]
5098 else:
5099 tgt_set = all_info.keys()
5100 missing = set(self.wanted).difference(tgt_set)
5101 if missing:
5102 raise errors.OpExecError("Some instances were removed before"
5103 " retrieving their data: %s" % missing)
5104 instance_names = self.wanted
5106 instance_list = [all_info[iname] for iname in instance_names]
5108 # begin data gathering
5110 nodes = frozenset([inst.primary_node for inst in instance_list])
5111 hv_list = list(set([inst.hypervisor for inst in instance_list]))
5112 bad_nodes = []
5113 off_nodes = []
5115 if self.do_node_query:
5116 live_data = {}
5117 node_data = self.rpc.call_all_instances_info(nodes, hv_list)
5118 for name in nodes:
5119 result = node_data[name]
5120 if result.offline:
5121 # offline nodes will be in both lists
5122 off_nodes.append(name)
5123 if result.fail_msg:
5124 bad_nodes.append(name)
5125 elif result.payload:
5127 live_data.update(result.payload)
5128 # else no instance is alive
5129 else:
5130 live_data = dict([(name, {}) for name in instance_names])
5132 # end data gathering
5134 HVPREFIX = "hv/"
5135 BEPREFIX = "be/"
5136 output = []
5137 cluster = self.cfg.GetClusterInfo()
5138 for instance in instance_list:
5139 iout = []
5140 i_hv = cluster.FillHV(instance, skip_globals=True)
5141 i_be = cluster.FillBE(instance)
5142 i_nicp = [cluster.SimpleFillNIC(nic.nicparams) for nic in instance.nics]
5143 for field in self.op.output_fields:
5144 st_match = self._FIELDS_STATIC.Matches(field)
5145 if field in self._SIMPLE_FIELDS:
5146 val = getattr(instance, field)
5147 elif field == "pnode":
5148 val = instance.primary_node
5149 elif field == "snodes":
5150 val = list(instance.secondary_nodes)
5151 elif field == "admin_state":
5152 val = instance.admin_up
5153 elif field == "oper_state":
5154 if instance.primary_node in bad_nodes:
5155 val = None
5156 else:
5157 val = bool(live_data.get(instance.name))
5158 elif field == "status":
5159 if instance.primary_node in off_nodes:
5160 val = "ERROR_nodeoffline"
5161 elif instance.primary_node in bad_nodes:
5162 val = "ERROR_nodedown"
5163 else:
5164 running = bool(live_data.get(instance.name))
5165 if running:
5166 if instance.admin_up:
5167 val = "running"
5168 else:
5169 val = "ERROR_up"
5170 else:
5171 if instance.admin_up:
5172 val = "ERROR_down"
5173 else:
5174 val = "ADMIN_down"
5175 elif field == "oper_ram":
5176 if instance.primary_node in bad_nodes:
5177 val = None
5178 elif instance.name in live_data:
5179 val = live_data[instance.name].get("memory", "?")
5180 else:
5181 val = "-"
5182 elif field == "oper_vcpus":
5183 if instance.primary_node in bad_nodes:
5184 val = None
5185 elif instance.name in live_data:
5186 val = live_data[instance.name].get("vcpus", "?")
5187 else:
5188 val = "-"
5189 elif field == "vcpus":
5190 val = i_be[constants.BE_VCPUS]
5191 elif field == "disk_template":
5192 val = instance.disk_template
5193 elif field == "ip":
5194 if instance.nics:
5195 val = instance.nics[0].ip
5196 else:
5197 val = None
5198 elif field == "nic_mode":
5199 if instance.nics:
5200 val = i_nicp[0][constants.NIC_MODE]
5201 else:
5202 val = None
5203 elif field == "nic_link":
5204 if instance.nics:
5205 val = i_nicp[0][constants.NIC_LINK]
5206 else:
5207 val = None
5208 elif field == "bridge":
5209 if (instance.nics and
5210 i_nicp[0][constants.NIC_MODE] == constants.NIC_MODE_BRIDGED):
5211 val = i_nicp[0][constants.NIC_LINK]
5212 else:
5213 val = None
5214 elif field == "mac":
5215 if instance.nics:
5216 val = instance.nics[0].mac
5217 else:
5218 val = None
5219 elif field == "custom_nicparams":
5220 val = [nic.nicparams for nic in instance.nics]
5221 elif field == "sda_size" or field == "sdb_size":
5222 idx = ord(field[2]) - ord('a')
5223 try:
5224 val = instance.FindDisk(idx).size
5225 except errors.OpPrereqError:
5226 val = None
5227 elif field == "disk_usage": # total disk usage per node
5228 disk_sizes = [{'size': disk.size} for disk in instance.disks]
5229 val = _ComputeDiskSize(instance.disk_template, disk_sizes)
5230 elif field == "tags":
5231 val = list(instance.GetTags())
5232 elif field == "custom_hvparams":
5233 val = instance.hvparams # not filled!
5234 elif field == "hvparams":
5235 val = i_hv
5236 elif (field.startswith(HVPREFIX) and
5237 field[len(HVPREFIX):] in constants.HVS_PARAMETERS and
5238 field[len(HVPREFIX):] not in constants.HVC_GLOBALS):
5239 val = i_hv.get(field[len(HVPREFIX):], None)
5240 elif field == "custom_beparams":
5241 val = instance.beparams
5242 elif field == "beparams":
5243 val = i_be
5244 elif (field.startswith(BEPREFIX) and
5245 field[len(BEPREFIX):] in constants.BES_PARAMETERS):
5246 val = i_be.get(field[len(BEPREFIX):], None)
5247 elif st_match and st_match.groups():
5248 # matches a variable list
5249 st_groups = st_match.groups()
5250 if st_groups and st_groups[0] == "disk":
5251 if st_groups[1] == "count":
5252 val = len(instance.disks)
5253 elif st_groups[1] == "sizes":
5254 val = [disk.size for disk in instance.disks]
5255 elif st_groups[1] == "size":
5256 try:
5257 val = instance.FindDisk(st_groups[2]).size
5258 except errors.OpPrereqError:
5259 val = None
5260 else:
5261 assert False, "Unhandled disk parameter"
5262 elif st_groups[0] == "nic":
5263 if st_groups[1] == "count":
5264 val = len(instance.nics)
5265 elif st_groups[1] == "macs":
5266 val = [nic.mac for nic in instance.nics]
5267 elif st_groups[1] == "ips":
5268 val = [nic.ip for nic in instance.nics]
5269 elif st_groups[1] == "modes":
5270 val = [nicp[constants.NIC_MODE] for nicp in i_nicp]
5271 elif st_groups[1] == "links":
5272 val = [nicp[constants.NIC_LINK] for nicp in i_nicp]
5273 elif st_groups[1] == "bridges":
5274 val = []
5275 for nicp in i_nicp:
5276 if nicp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
5277 val.append(nicp[constants.NIC_LINK])
5278 else:
5279 val.append(None)
5280 else:
5281 # index-based items
5282 nic_idx = int(st_groups[2])
5283 if nic_idx >= len(instance.nics):
5284 val = "-"
5285 else:
5286 if st_groups[1] == "mac":
5287 val = instance.nics[nic_idx].mac
5288 elif st_groups[1] == "ip":
5289 val = instance.nics[nic_idx].ip
5290 elif st_groups[1] == "mode":
5291 val = i_nicp[nic_idx][constants.NIC_MODE]
5292 elif st_groups[1] == "link":
5293 val = i_nicp[nic_idx][constants.NIC_LINK]
5294 elif st_groups[1] == "bridge":
5295 nic_mode = i_nicp[nic_idx][constants.NIC_MODE]
5296 if nic_mode == constants.NIC_MODE_BRIDGED:
5297 val = i_nicp[nic_idx][constants.NIC_LINK]
5298 else:
5299 val = None
5300 else:
5301 assert False, "Unhandled NIC parameter"
5302 else:
5303 assert False, ("Declared but unhandled variable parameter '%s'" %
5304 field)
5305 else:
5306 assert False, "Declared but unhandled parameter '%s'" % field
5307 iout.append(val)
5308 output.append(iout)
5310 return output
5313 class LUFailoverInstance(LogicalUnit):
5314 """Failover an instance.
5317 HPATH = "instance-failover"
5318 HTYPE = constants.HTYPE_INSTANCE
5319 _OP_PARAMS = [
5320 _PInstanceName,
5321 ("ignore_consistency", False, ht.TBool),
5326 def ExpandNames(self):
5327 self._ExpandAndLockInstance()
5328 self.needed_locks[locking.LEVEL_NODE] = []
5329 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5331 def DeclareLocks(self, level):
5332 if level == locking.LEVEL_NODE:
5333 self._LockInstancesNodes()
5335 def BuildHooksEnv(self):
5338 This runs on master, primary and secondary nodes of the instance.
5341 instance = self.instance
5342 source_node = instance.primary_node
5343 target_node = instance.secondary_nodes[0]
5345 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
5346 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5347 "OLD_PRIMARY": source_node,
5348 "OLD_SECONDARY": target_node,
5349 "NEW_PRIMARY": target_node,
5350 "NEW_SECONDARY": source_node,
5352 env.update(_BuildInstanceHookEnvByObject(self, instance))
5353 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5354 nl_post = list(nl)
5355 nl_post.append(source_node)
5356 return env, nl, nl_post
5358 def CheckPrereq(self):
5359 """Check prerequisites.
5361 This checks that the instance is in the cluster.
5364 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5365 assert self.instance is not None, \
5366 "Cannot retrieve locked instance %s" % self.op.instance_name
5368 bep = self.cfg.GetClusterInfo().FillBE(instance)
5369 if instance.disk_template not in constants.DTS_NET_MIRROR:
5370 raise errors.OpPrereqError("Instance's disk layout is not"
5371 " network mirrored, cannot failover.",
5372 errors.ECODE_STATE)
5374 secondary_nodes = instance.secondary_nodes
5375 if not secondary_nodes:
5376 raise errors.ProgrammerError("no secondary node but using "
5377 "a mirrored disk template")
5379 target_node = secondary_nodes[0]
5380 _CheckNodeOnline(self, target_node)
5381 _CheckNodeNotDrained(self, target_node)
5382 if instance.admin_up:
5383 # check memory requirements on the secondary node
5384 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5385 instance.name, bep[constants.BE_MEMORY],
5386 instance.hypervisor)
5388 self.LogInfo("Not checking memory on the secondary node as"
5389 " instance will not be started")
5391 # check bridge existence
5392 _CheckInstanceBridgesExist(self, instance, node=target_node)
5394 def Exec(self, feedback_fn):
5395 """Failover an instance.
5397 The failover is done by shutting it down on its present node and
5398 starting it on the secondary.
5401 instance = self.instance
5402 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
5404 source_node = instance.primary_node
5405 target_node = instance.secondary_nodes[0]
5407 if instance.admin_up:
5408 feedback_fn("* checking disk consistency between source and target")
5409 for dev in instance.disks:
5410 # for drbd, these are drbd over lvm
5411 if not _CheckDiskConsistency(self, dev, target_node, False):
5412 if not self.op.ignore_consistency:
5413 raise errors.OpExecError("Disk %s is degraded on target node,"
5414 " aborting failover." % dev.iv_name)
5416 feedback_fn("* not checking disk consistency as instance is not running")
5418 feedback_fn("* shutting down instance on source node")
5419 logging.info("Shutting down instance %s on node %s",
5420 instance.name, source_node)
5422 result = self.rpc.call_instance_shutdown(source_node, instance,
5423 self.op.shutdown_timeout)
5424 msg = result.fail_msg
5425 if msg:
5426 if self.op.ignore_consistency or primary_node.offline:
5427 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5428 " Proceeding anyway. Please make sure node"
5429 " %s is down. Error details: %s",
5430 instance.name, source_node, source_node, msg)
5432 raise errors.OpExecError("Could not shutdown instance %s on"
5433 " node %s: %s" %
5434 (instance.name, source_node, msg))
5436 feedback_fn("* deactivating the instance's disks on source node")
5437 if not _ShutdownInstanceDisks(self, instance, ignore_primary=True):
5438 raise errors.OpExecError("Can't shut down the instance's disks.")
5440 instance.primary_node = target_node
5441 # distribute new instance config to the other nodes
5442 self.cfg.Update(instance, feedback_fn)
5444 # Only start the instance if it's marked as up
5445 if instance.admin_up:
5446 feedback_fn("* activating the instance's disks on target node")
5447 logging.info("Starting instance %s on node %s",
5448 instance.name, target_node)
5450 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5451 ignore_secondaries=True)
5452 if not disks_ok:
5453 _ShutdownInstanceDisks(self, instance)
5454 raise errors.OpExecError("Can't activate the instance's disks")
5456 feedback_fn("* starting the instance on the target node")
5457 result = self.rpc.call_instance_start(target_node, instance, None, None)
5458 msg = result.fail_msg
5459 if msg:
5460 _ShutdownInstanceDisks(self, instance)
5461 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5462 (instance.name, target_node, msg))
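# Added summary (not in the original source): failover is strictly
# shutdown-based: verify disk consistency on the secondary, shut the
# instance down on the old primary, deactivate its disks, flip primary_node
# in the config, then reassemble the disks and start the instance on the
# former secondary. Contrast this with LUMigrateInstance below, which keeps
# the instance running.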
5465 class LUMigrateInstance(LogicalUnit):
5466 """Migrate an instance.
5468 This is migration without shutting the instance down, as opposed to
5469 failover, which is done with a shutdown.
5472 HPATH = "instance-migrate"
5473 HTYPE = constants.HTYPE_INSTANCE
5474 _OP_PARAMS = [
5475 _PInstanceName,
5476 _PMigrationMode,
5477 _PMigrationLive,
5478 ("cleanup", False, ht.TBool),
5483 def ExpandNames(self):
5484 self._ExpandAndLockInstance()
5486 self.needed_locks[locking.LEVEL_NODE] = []
5487 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5489 self._migrater = TLMigrateInstance(self, self.op.instance_name,
5490 self.op.cleanup)
5491 self.tasklets = [self._migrater]
5493 def DeclareLocks(self, level):
5494 if level == locking.LEVEL_NODE:
5495 self._LockInstancesNodes()
5497 def BuildHooksEnv(self):
5500 This runs on master, primary and secondary nodes of the instance.
5503 instance = self._migrater.instance
5504 source_node = instance.primary_node
5505 target_node = instance.secondary_nodes[0]
5506 env = _BuildInstanceHookEnvByObject(self, instance)
5507 env["MIGRATE_LIVE"] = self._migrater.live
5508 env["MIGRATE_CLEANUP"] = self.op.cleanup
5510 "OLD_PRIMARY": source_node,
5511 "OLD_SECONDARY": target_node,
5512 "NEW_PRIMARY": target_node,
5513 "NEW_SECONDARY": source_node,
5515 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5516 nl_post = list(nl)
5517 nl_post.append(source_node)
5518 return env, nl, nl_post
5521 class LUMoveInstance(LogicalUnit):
5522 """Move an instance by data-copying.
5525 HPATH = "instance-move"
5526 HTYPE = constants.HTYPE_INSTANCE
5527 _OP_PARAMS = [
5528 _PInstanceName,
5529 ("target_node", ht.NoDefault, ht.TNonEmptyString),
5534 def ExpandNames(self):
5535 self._ExpandAndLockInstance()
5536 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
5537 self.op.target_node = target_node
5538 self.needed_locks[locking.LEVEL_NODE] = [target_node]
5539 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5541 def DeclareLocks(self, level):
5542 if level == locking.LEVEL_NODE:
5543 self._LockInstancesNodes(primary_only=True)
5545 def BuildHooksEnv(self):
5548 This runs on master, primary and secondary nodes of the instance.
5552 "TARGET_NODE": self.op.target_node,
5553 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5555 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5556 nl = [self.cfg.GetMasterNode()] + [self.instance.primary_node,
5557 self.op.target_node]
5558 return env, nl, nl
5560 def CheckPrereq(self):
5561 """Check prerequisites.
5563 This checks that the instance is in the cluster.
5566 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5567 assert self.instance is not None, \
5568 "Cannot retrieve locked instance %s" % self.op.instance_name
5570 node = self.cfg.GetNodeInfo(self.op.target_node)
5571 assert node is not None, \
5572 "Cannot retrieve locked node %s" % self.op.target_node
5574 self.target_node = target_node = node.name
5576 if target_node == instance.primary_node:
5577 raise errors.OpPrereqError("Instance %s is already on the node %s" %
5578 (instance.name, target_node),
5579 errors.ECODE_STATE)
5581 bep = self.cfg.GetClusterInfo().FillBE(instance)
5583 for idx, dsk in enumerate(instance.disks):
5584 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
5585 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
5586 " cannot copy" % idx, errors.ECODE_STATE)
5588 _CheckNodeOnline(self, target_node)
5589 _CheckNodeNotDrained(self, target_node)
5591 if instance.admin_up:
5592 # check memory requirements on the secondary node
5593 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5594 instance.name, bep[constants.BE_MEMORY],
5595 instance.hypervisor)
5597 self.LogInfo("Not checking memory on the secondary node as"
5598 " instance will not be started")
5600 # check bridge existence
5601 _CheckInstanceBridgesExist(self, instance, node=target_node)
5603 def Exec(self, feedback_fn):
5604 """Move an instance.
5606 The move is done by shutting it down on its present node, copying
5607 the data over (slow) and starting it on the new node.
5610 instance = self.instance
5612 source_node = instance.primary_node
5613 target_node = self.target_node
5615 self.LogInfo("Shutting down instance %s on source node %s",
5616 instance.name, source_node)
5618 result = self.rpc.call_instance_shutdown(source_node, instance,
5619 self.op.shutdown_timeout)
5620 msg = result.fail_msg
5621 if msg:
5622 if self.op.ignore_consistency:
5623 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5624 " Proceeding anyway. Please make sure node"
5625 " %s is down. Error details: %s",
5626 instance.name, source_node, source_node, msg)
5628 raise errors.OpExecError("Could not shutdown instance %s on"
5629 " node %s: %s" %
5630 (instance.name, source_node, msg))
5632 # create the target disks
5633 try:
5634 _CreateDisks(self, instance, target_node=target_node)
5635 except errors.OpExecError:
5636 self.LogWarning("Device creation failed, reverting...")
5637 try:
5638 _RemoveDisks(self, instance, target_node=target_node)
5639 finally:
5640 self.cfg.ReleaseDRBDMinors(instance.name)
5641 raise
5643 cluster_name = self.cfg.GetClusterInfo().cluster_name
5645 errs = []
5646 # activate, get path, copy the data over
5647 for idx, disk in enumerate(instance.disks):
5648 self.LogInfo("Copying data for disk %d", idx)
5649 result = self.rpc.call_blockdev_assemble(target_node, disk,
5650 instance.name, True)
5651 if result.fail_msg:
5652 self.LogWarning("Can't assemble newly created disk %d: %s",
5653 idx, result.fail_msg)
5654 errs.append(result.fail_msg)
5656 dev_path = result.payload
5657 result = self.rpc.call_blockdev_export(source_node, disk,
5658 target_node, dev_path,
5659 cluster_name)
5660 if result.fail_msg:
5661 self.LogWarning("Can't copy data over for disk %d: %s",
5662 idx, result.fail_msg)
5663 errs.append(result.fail_msg)
5665 if errs:
5667 self.LogWarning("Some disks failed to copy, aborting")
5668 try:
5669 _RemoveDisks(self, instance, target_node=target_node)
5670 finally:
5671 self.cfg.ReleaseDRBDMinors(instance.name)
5672 raise errors.OpExecError("Errors during disk copy: %s" %
5673 (",".join(errs),))
5675 instance.primary_node = target_node
5676 self.cfg.Update(instance, feedback_fn)
5678 self.LogInfo("Removing the disks on the original node")
5679 _RemoveDisks(self, instance, target_node=source_node)
5681 # Only start the instance if it's marked as up
5682 if instance.admin_up:
5683 self.LogInfo("Starting instance %s on node %s",
5684 instance.name, target_node)
5686 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5687 ignore_secondaries=True)
5688 if not disks_ok:
5689 _ShutdownInstanceDisks(self, instance)
5690 raise errors.OpExecError("Can't activate the instance's disks")
5692 result = self.rpc.call_instance_start(target_node, instance, None, None)
5693 msg = result.fail_msg
5694 if msg:
5695 _ShutdownInstanceDisks(self, instance)
5696 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5697 (instance.name, target_node, msg))
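# Added note (not in the original source): a move is a full data copy done
# via call_blockdev_assemble/call_blockdev_export, which is why CheckPrereq
# above restricts it to plain LV- and file-based disks (LD_LV, LD_FILE);
# DRBD and other complex layouts must be moved with failover or migration
# instead.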
5700 class LUMigrateNode(LogicalUnit):
5701 """Migrate all instances from a node.
5704 HPATH = "node-migrate"
5705 HTYPE = constants.HTYPE_NODE
5706 _OP_PARAMS = [
5707 _PNodeName,
5708 _PMigrationMode,
5709 _PMigrationLive,
5710 ]
5711 REQ_BGL = False
5713 def ExpandNames(self):
5714 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5716 self.needed_locks = {
5717 locking.LEVEL_NODE: [self.op.node_name],
5718 }
5720 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5722 # Create tasklets for migrating instances for all instances on this node
5723 names = []
5724 tasklets = []
5726 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
5727 logging.debug("Migrating instance %s", inst.name)
5728 names.append(inst.name)
5730 tasklets.append(TLMigrateInstance(self, inst.name, False))
5732 self.tasklets = tasklets
5734 # Declare instance locks
5735 self.needed_locks[locking.LEVEL_INSTANCE] = names
5737 def DeclareLocks(self, level):
5738 if level == locking.LEVEL_NODE:
5739 self._LockInstancesNodes()
5741 def BuildHooksEnv(self):
5744 This runs on the master, the primary and all the secondaries.
5748 "NODE_NAME": self.op.node_name,
5751 nl = [self.cfg.GetMasterNode()]
5753 return (env, nl, nl)
5756 class TLMigrateInstance(Tasklet):
5757 """Tasklet class for instance migration.
5760 @ivar live: whether the migration will be done live or non-live;
5761 this variable is initialized only after CheckPrereq has run
5764 def __init__(self, lu, instance_name, cleanup):
5765 """Initializes this class.
5768 Tasklet.__init__(self, lu)
5771 self.instance_name = instance_name
5772 self.cleanup = cleanup
5773 self.live = False # will be overridden later
5775 def CheckPrereq(self):
5776 """Check prerequisites.
5778 This checks that the instance is in the cluster.
5781 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
5782 instance = self.cfg.GetInstanceInfo(instance_name)
5783 assert instance is not None
5785 if instance.disk_template != constants.DT_DRBD8:
5786 raise errors.OpPrereqError("Instance's disk layout is not"
5787 " drbd8, cannot migrate.", errors.ECODE_STATE)
5789 secondary_nodes = instance.secondary_nodes
5790 if not secondary_nodes:
5791 raise errors.ConfigurationError("No secondary node but using"
5792 " drbd8 disk template")
5794 i_be = self.cfg.GetClusterInfo().FillBE(instance)
5796 target_node = secondary_nodes[0]
5797 # check memory requirements on the secondary node
5798 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
5799 instance.name, i_be[constants.BE_MEMORY],
5800 instance.hypervisor)
5802 # check bridge existence
5803 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
5805 if not self.cleanup:
5806 _CheckNodeNotDrained(self.lu, target_node)
5807 result = self.rpc.call_instance_migratable(instance.primary_node,
5809 result.Raise("Can't migrate, please use failover",
5810 prereq=True, ecode=errors.ECODE_STATE)
5812 self.instance = instance
5814 if self.lu.op.live is not None and self.lu.op.mode is not None:
5815 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
5816 " parameters are accepted",
5818 if self.lu.op.live is not None:
5820 self.lu.op.mode = constants.HT_MIGRATION_LIVE
5822 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
5823 # reset the 'live' parameter to None so that repeated
5824 # invocations of CheckPrereq do not raise an exception
5825 self.lu.op.live = None
5826 elif self.lu.op.mode is None:
5827 # read the default value from the hypervisor
5828 i_hv = self.cfg.GetClusterInfo().FillHV(instance, skip_globals=False)
5829 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
5831 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
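# The 'live'/'mode' precedence implemented above, illustrated (hypothetical
# opcode values, shown only as a worked example):
#   op.live=True,  op.mode=None -> mode=HT_MIGRATION_LIVE,    live=True
#   op.live=False, op.mode=None -> mode=HT_MIGRATION_NONLIVE, live=False
#   op.live=None,  op.mode=None -> mode taken from the HV_MIGRATION_MODE default
#   op.live set and op.mode set -> OpPrereqError (mutually exclusive)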
5833 def _WaitUntilSync(self):
5834 """Poll with custom rpc for disk sync.
5836 This uses our own step-based rpc call.
5839 self.feedback_fn("* wait until resync is done")
5843 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
5845 self.instance.disks)
5847 for node, nres in result.items():
5848 nres.Raise("Cannot resync disks on node %s" % node)
5849 node_done, node_percent = nres.payload
5850 all_done = all_done and node_done
5851 if node_percent is not None:
5852 min_percent = min(min_percent, node_percent)
5854 if min_percent < 100:
5855 self.feedback_fn(" - progress: %.1f%%" % min_percent)
5858 def _EnsureSecondary(self, node):
5859 """Demote a node to secondary.
5862 self.feedback_fn("* switching node %s to secondary mode" % node)
5864 for dev in self.instance.disks:
5865 self.cfg.SetDiskID(dev, node)
5867 result = self.rpc.call_blockdev_close(node, self.instance.name,
5868 self.instance.disks)
5869 result.Raise("Cannot change disk to secondary on node %s" % node)
5871 def _GoStandalone(self):
5872 """Disconnect from the network.
5875 self.feedback_fn("* changing into standalone mode")
5876 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
5877 self.instance.disks)
5878 for node, nres in result.items():
5879 nres.Raise("Cannot disconnect disks on node %s" % node)
5881 def _GoReconnect(self, multimaster):
5882 """Reconnect to the network.
5888 msg = "single-master"
5889 self.feedback_fn("* changing disks into %s mode" % msg)
5890 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
5891 self.instance.disks,
5892 self.instance.name, multimaster)
5893 for node, nres in result.items():
5894 nres.Raise("Cannot change disks config on node %s" % node)
5896 def _ExecCleanup(self):
5897 """Try to cleanup after a failed migration.
5899 The cleanup is done by:
5900 - check that the instance is running only on one node
5901 (and update the config if needed)
5902 - change disks on its secondary node to secondary
5903 - wait until disks are fully synchronized
5904 - disconnect from the network
5905 - change disks into single-master mode
5906 - wait again until disks are fully synchronized
5909 instance = self.instance
5910 target_node = self.target_node
5911 source_node = self.source_node
5913 # check running on only one node
5914 self.feedback_fn("* checking where the instance actually runs"
5915 " (if this hangs, the hypervisor might be in a bad state)")
5917 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
5918 for node, result in ins_l.items():
5919 result.Raise("Can't contact node %s" % node)
5921 runningon_source = instance.name in ins_l[source_node].payload
5922 runningon_target = instance.name in ins_l[target_node].payload
5924 if runningon_source and runningon_target:
5925 raise errors.OpExecError("Instance seems to be running on two nodes,"
5926 " or the hypervisor is confused. You will have"
5927 " to ensure manually that it runs only on one"
5928 " and restart this operation.")
5930 if not (runningon_source or runningon_target):
5931 raise errors.OpExecError("Instance does not seem to be running at all."
5932 " In this case, it's safer to repair by"
5933 " running 'gnt-instance stop' to ensure disk"
5934 " shutdown, and then restarting it.")
5936 if runningon_target:
5937 # the migration has actually succeeded, we need to update the config
5938 self.feedback_fn("* instance running on secondary node (%s),"
5939 " updating config" % target_node)
5940 instance.primary_node = target_node
5941 self.cfg.Update(instance, self.feedback_fn)
5942 demoted_node = source_node
5944 self.feedback_fn("* instance confirmed to be running on its"
5945 " primary node (%s)" % source_node)
5946 demoted_node = target_node
5948 self._EnsureSecondary(demoted_node)
5950 self._WaitUntilSync()
5951 except errors.OpExecError:
5952 # we ignore errors here, since if the device is standalone, it
5953 # won't be able to sync
5955 self._GoStandalone()
5956 self._GoReconnect(False)
5957 self._WaitUntilSync()
5959 self.feedback_fn("* done")
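# Cleanup decision sketch (restating the logic above as a worked example):
# running on both nodes or on neither -> abort with OpExecError; running
# only on the target -> adopt it as the new primary and demote the source;
# running only on the source -> demote the target; then resync (errors are
# ignored while standalone) and force the pair back through standalone ->
# single-master connected state.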
5961 def _RevertDiskStatus(self):
5962 """Try to revert the disk status after a failed migration.
5965 target_node = self.target_node
5967 self._EnsureSecondary(target_node)
5968 self._GoStandalone()
5969 self._GoReconnect(False)
5970 self._WaitUntilSync()
5971 except errors.OpExecError, err:
5972 self.lu.LogWarning("Migration failed and I can't reconnect the"
5973 " drives: error '%s'\n"
5974 "Please check and recover the instance status manually" %
5977 def _AbortMigration(self):
5978 """Call the hypervisor code to abort a started migration.
5981 instance = self.instance
5982 target_node = self.target_node
5983 migration_info = self.migration_info
5985 abort_result = self.rpc.call_finalize_migration(target_node,
5989 abort_msg = abort_result.fail_msg
5991 logging.error("Aborting migration failed on target node %s: %s",
5992 target_node, abort_msg)
5993 # Don't raise an exception here, as we still have to try to revert the
5994 # disk status, even if this step failed.
5996 def _ExecMigration(self):
5997 """Migrate an instance.
5999 The migration is done by:
6000 - change the disks into dual-master mode
6001 - wait until disks are fully synchronized again
6002 - migrate the instance
6003 - change disks on the new secondary node (the old primary) to secondary
6004 - wait until disks are fully synchronized
6005 - change disks into single-master mode
6008 instance = self.instance
6009 target_node = self.target_node
6010 source_node = self.source_node
6012 self.feedback_fn("* checking disk consistency between source and target")
6013 for dev in instance.disks:
6014 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
6015 raise errors.OpExecError("Disk %s is degraded or not fully"
6016 " synchronized on target node,"
6017 " aborting migration." % dev.iv_name)
6019 # First get the migration information from the remote node
6020 result = self.rpc.call_migration_info(source_node, instance)
6021 msg = result.fail_msg
6023 log_err = ("Failed fetching source migration information from %s: %s" %
6025 logging.error(log_err)
6026 raise errors.OpExecError(log_err)
6028 self.migration_info = migration_info = result.payload
6030 # Then switch the disks to master/master mode
6031 self._EnsureSecondary(target_node)
6032 self._GoStandalone()
6033 self._GoReconnect(True)
6034 self._WaitUntilSync()
6036 self.feedback_fn("* preparing %s to accept the instance" % target_node)
6037 result = self.rpc.call_accept_instance(target_node,
6040 self.nodes_ip[target_node])
6042 msg = result.fail_msg
6044 logging.error("Instance pre-migration failed, trying to revert"
6045 " disk status: %s", msg)
6046 self.feedback_fn("Pre-migration failed, aborting")
6047 self._AbortMigration()
6048 self._RevertDiskStatus()
6049 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
6050 (instance.name, msg))
6052 self.feedback_fn("* migrating instance to %s" % target_node)
6054 result = self.rpc.call_instance_migrate(source_node, instance,
6055 self.nodes_ip[target_node],
6057 msg = result.fail_msg
6059 logging.error("Instance migration failed, trying to revert"
6060 " disk status: %s", msg)
6061 self.feedback_fn("Migration failed, aborting")
6062 self._AbortMigration()
6063 self._RevertDiskStatus()
6064 raise errors.OpExecError("Could not migrate instance %s: %s" %
6065 (instance.name, msg))
6068 instance.primary_node = target_node
6069 # distribute new instance config to the other nodes
6070 self.cfg.Update(instance, self.feedback_fn)
6072 result = self.rpc.call_finalize_migration(target_node,
6076 msg = result.fail_msg
6078 logging.error("Instance migration succeeded, but finalization failed:"
6080 raise errors.OpExecError("Could not finalize instance migration: %s" %
6083 self._EnsureSecondary(source_node)
6084 self._WaitUntilSync()
6085 self._GoStandalone()
6086 self._GoReconnect(False)
6087 self._WaitUntilSync()
6089 self.feedback_fn("* done")
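# The disk-state sequence driven above, summarized step by step:
#   _EnsureSecondary(target) -> close the device on the target node
#   _GoStandalone()          -> disconnect both sides from the network
#   _GoReconnect(True)       -> reattach in dual-master mode, then resync
#   call_instance_migrate    -> move the running instance
#   _EnsureSecondary(source) -> demote the old primary, resync
#   _GoStandalone(); _GoReconnect(False) -> back to single-master, resync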
6091 def Exec(self, feedback_fn):
6092 """Perform the migration.
6095 feedback_fn("Migrating instance %s" % self.instance.name)
6097 self.feedback_fn = feedback_fn
6099 self.source_node = self.instance.primary_node
6100 self.target_node = self.instance.secondary_nodes[0]
6101 self.all_nodes = [self.source_node, self.target_node]
6103 self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
6104 self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
6108 return self._ExecCleanup()
6110 return self._ExecMigration()
6113 def _CreateBlockDev(lu, node, instance, device, force_create,
6115 """Create a tree of block devices on a given node.
6117 If this device type has to be created on secondaries, create it and all its children.
6120 If not, just recurse to children keeping the same 'force' value.
6122 @param lu: the lu on whose behalf we execute
6123 @param node: the node on which to create the device
6124 @type instance: L{objects.Instance}
6125 @param instance: the instance which owns the device
6126 @type device: L{objects.Disk}
6127 @param device: the device to create
6128 @type force_create: boolean
6129 @param force_create: whether to force creation of this device; this
6130 will be changed to True whenever we find a device that has the
6131 CreateOnSecondary() attribute
6132 @param info: the extra 'metadata' we should attach to the device
6133 (this will be represented as an LVM tag)
6134 @type force_open: boolean
6135 @param force_open: this parameter will be passed to the
6136 L{backend.BlockdevCreate} function where it specifies
6137 whether we run on primary or not, and it affects both
6138 the child assembly and the device's own Open() execution
6141 if device.CreateOnSecondary():
6145 for child in device.children:
6146 _CreateBlockDev(lu, node, instance, child, force_create,
6149 if not force_create:
6152 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
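# Recursion sketch: a device that reports CreateOnSecondary() turns
# force_create on for its whole subtree, so e.g. DRBD-backed volumes get
# created on the secondary as well; otherwise children are visited with the
# caller's force_create, and nothing is created at this level unless it is
# set.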
6155 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
6156 """Create a single block device on a given node.
6158 This will not recurse over children of the device, so they must be created in advance.
6161 @param lu: the lu on whose behalf we execute
6162 @param node: the node on which to create the device
6163 @type instance: L{objects.Instance}
6164 @param instance: the instance which owns the device
6165 @type device: L{objects.Disk}
6166 @param device: the device to create
6167 @param info: the extra 'metadata' we should attach to the device
6168 (this will be represented as an LVM tag)
6169 @type force_open: boolean
6170 @param force_open: this parameter will be passed to the
6171 L{backend.BlockdevCreate} function where it specifies
6172 whether we run on primary or not, and it affects both
6173 the child assembly and the device's own Open() execution
6176 lu.cfg.SetDiskID(device, node)
6177 result = lu.rpc.call_blockdev_create(node, device, device.size,
6178 instance.name, force_open, info)
6179 result.Raise("Can't create block device %s on"
6180 " node %s for instance %s" % (device, node, instance.name))
6181 if device.physical_id is None:
6182 device.physical_id = result.payload
6185 def _GenerateUniqueNames(lu, exts):
6186 """Generate a suitable LV name.
6188 This will generate a logical volume name for the given instance.
6193 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
6194 results.append("%s%s" % (new_id, val))
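# Example of the generated names (illustrative only): for
# exts=[".disk0_data", ".disk0_meta"] this yields ["<uuid>.disk0_data",
# "<uuid>.disk0_meta"], where <uuid> comes from GenerateUniqueID.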
6198 def _GenerateDRBD8Branch(lu, primary, secondary, size, names, iv_name,
6200 """Generate a drbd8 device complete with its children.
6203 port = lu.cfg.AllocatePort()
6204 vgname = lu.cfg.GetVGName()
6205 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
6206 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
6207 logical_id=(vgname, names[0]))
6208 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
6209 logical_id=(vgname, names[1]))
6210 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
6211 logical_id=(primary, secondary, port,
6214 children=[dev_data, dev_meta],
6219 def _GenerateDiskTemplate(lu, template_name,
6220 instance_name, primary_node,
6221 secondary_nodes, disk_info,
6222 file_storage_dir, file_driver,
6224 """Generate the entire disk layout for a given template type.
6227 #TODO: compute space requirements
6229 vgname = lu.cfg.GetVGName()
6230 disk_count = len(disk_info)
6232 if template_name == constants.DT_DISKLESS:
6234 elif template_name == constants.DT_PLAIN:
6235 if len(secondary_nodes) != 0:
6236 raise errors.ProgrammerError("Wrong template configuration")
6238 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6239 for i in range(disk_count)])
6240 for idx, disk in enumerate(disk_info):
6241 disk_index = idx + base_index
6242 disk_dev = objects.Disk(dev_type=constants.LD_LV, size=disk["size"],
6243 logical_id=(vgname, names[idx]),
6244 iv_name="disk/%d" % disk_index,
6246 disks.append(disk_dev)
6247 elif template_name == constants.DT_DRBD8:
6248 if len(secondary_nodes) != 1:
6249 raise errors.ProgrammerError("Wrong template configuration")
6250 remote_node = secondary_nodes[0]
6251 minors = lu.cfg.AllocateDRBDMinor(
6252 [primary_node, remote_node] * len(disk_info), instance_name)
6255 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6256 for i in range(disk_count)]):
6257 names.append(lv_prefix + "_data")
6258 names.append(lv_prefix + "_meta")
6259 for idx, disk in enumerate(disk_info):
6260 disk_index = idx + base_index
6261 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
6262 disk["size"], names[idx*2:idx*2+2],
6263 "disk/%d" % disk_index,
6264 minors[idx*2], minors[idx*2+1])
6265 disk_dev.mode = disk["mode"]
6266 disks.append(disk_dev)
6267 elif template_name == constants.DT_FILE:
6268 if len(secondary_nodes) != 0:
6269 raise errors.ProgrammerError("Wrong template configuration")
6271 _RequireFileStorage()
6273 for idx, disk in enumerate(disk_info):
6274 disk_index = idx + base_index
6275 disk_dev = objects.Disk(dev_type=constants.LD_FILE, size=disk["size"],
6276 iv_name="disk/%d" % disk_index,
6277 logical_id=(file_driver,
6278 "%s/disk%d" % (file_storage_dir,
6281 disks.append(disk_dev)
6283 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
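# Naming sketch for the DRBD8 branch above: each disk index gets one data
# and one meta LV pair, e.g. (illustrative) "<uuid>.disk0_data" and
# "<uuid>.disk0_meta", combined into a single LD_DRBD8 device whose minors
# were allocated on both the primary and the secondary node.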
6287 def _GetInstanceInfoText(instance):
6288 Compute the text that should be added to the disk's metadata.
6291 return "originstname+%s" % instance.name
6294 def _CalcEta(time_taken, written, total_size):
6295 """Calculates the ETA based on size written and total size.
6297 @param time_taken: The time taken so far
6298 @param written: amount written so far
6299 @param total_size: The total size of data to be written
6300 @return: The remaining time in seconds
6303 avg_time = time_taken / float(written)
6304 return (total_size - written) * avg_time
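# Worked example for _CalcEta (hypothetical numbers): with time_taken=30s,
# written=256 MiB and total_size=1024 MiB:
#   avg_time = 30 / 256.0              # ~0.117 s per MiB
#   eta      = (1024 - 256) * avg_time # 90 s remaining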
6307 def _WipeDisks(lu, instance):
6308 """Wipes instance disks.
6310 @type lu: L{LogicalUnit}
6311 @param lu: the logical unit on whose behalf we execute
6312 @type instance: L{objects.Instance}
6313 @param instance: the instance whose disks we should wipe
6314 @return: the success of the wipe
6317 node = instance.primary_node
6318 for idx, device in enumerate(instance.disks):
6319 lu.LogInfo("* Wiping disk %d", idx)
6320 logging.info("Wiping disk %d for instance %s", idx, instance.name)
6322 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk size,
6323 # but at most MAX_WIPE_CHUNK
6324 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
6325 constants.MIN_WIPE_CHUNK_PERCENT)
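# Worked example (assuming MAX_WIPE_CHUNK=1024 MiB and
# MIN_WIPE_CHUNK_PERCENT=10; both values are assumptions here):
#   2048 MiB disk  -> min(1024, 2048 / 100.0 * 10) = 204.8 MiB chunks
#   20480 MiB disk -> min(1024, 2048.0)            = 1024 MiB (capped)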
6330 start_time = time.time()
6332 while offset < size:
6333 wipe_size = min(wipe_chunk_size, size - offset)
6334 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
6335 result.Raise("Could not wipe disk %d at offset %d for size %d" %
6336 (idx, offset, wipe_size))
6339 if now - last_output >= 60:
6340 eta = _CalcEta(now - start_time, offset, size)
6341 lu.LogInfo(" - done: %.1f%% ETA: %s" %
6342 (offset / float(size) * 100, utils.FormatSeconds(eta)))
6346 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
6347 """Create all disks for an instance.
6349 This abstracts away some work from AddInstance.
6351 @type lu: L{LogicalUnit}
6352 @param lu: the logical unit on whose behalf we execute
6353 @type instance: L{objects.Instance}
6354 @param instance: the instance whose disks we should create
6356 @param to_skip: list of indices to skip
6357 @type target_node: string
6358 @param target_node: if passed, overrides the target node for creation
6360 @return: the success of the creation
6363 info = _GetInstanceInfoText(instance)
6364 if target_node is None:
6365 pnode = instance.primary_node
6366 all_nodes = instance.all_nodes
6371 if instance.disk_template == constants.DT_FILE:
6372 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6373 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
6375 result.Raise("Failed to create directory '%s' on"
6376 " node %s" % (file_storage_dir, pnode))
6378 # Note: this needs to be kept in sync with adding of disks in
6379 # LUSetInstanceParams
6380 for idx, device in enumerate(instance.disks):
6381 if to_skip and idx in to_skip:
6383 logging.info("Creating volume %s for instance %s",
6384 device.iv_name, instance.name)
6386 for node in all_nodes:
6387 f_create = node == pnode
6388 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
6391 def _RemoveDisks(lu, instance, target_node=None):
6392 """Remove all disks for an instance.
6394 This abstracts away some work from `AddInstance()` and
6395 `RemoveInstance()`. Note that in case some of the devices couldn't
6396 be removed, the removal will continue with the other ones (compare
6397 with `_CreateDisks()`).
6399 @type lu: L{LogicalUnit}
6400 @param lu: the logical unit on whose behalf we execute
6401 @type instance: L{objects.Instance}
6402 @param instance: the instance whose disks we should remove
6403 @type target_node: string
6404 @param target_node: used to override the node on which to remove the disks
6406 @return: the success of the removal
6409 logging.info("Removing block devices for instance %s", instance.name)
6412 for device in instance.disks:
6414 edata = [(target_node, device)]
6416 edata = device.ComputeNodeTree(instance.primary_node)
6417 for node, disk in edata:
6418 lu.cfg.SetDiskID(disk, node)
6419 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
6421 lu.LogWarning("Could not remove block device %s on node %s,"
6422 " continuing anyway: %s", device.iv_name, node, msg)
6425 if instance.disk_template == constants.DT_FILE:
6426 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6430 tgt = instance.primary_node
6431 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
6433 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
6434 file_storage_dir, instance.primary_node, result.fail_msg)
6440 def _ComputeDiskSize(disk_template, disks):
6441 """Compute disk size requirements in the volume group
6444 # Required free disk space as a function of disk and swap space
6446 constants.DT_DISKLESS: None,
6447 constants.DT_PLAIN: sum(d["size"] for d in disks),
6448 # 128 MB are added for drbd metadata for each disk
6449 constants.DT_DRBD8: sum(d["size"] + 128 for d in disks),
6450 constants.DT_FILE: None,
6453 if disk_template not in req_size_dict:
6454 raise errors.ProgrammerError("Disk template '%s' size requirement"
6455 " is unknown" % disk_template)
6457 return req_size_dict[disk_template]
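# Worked example (hypothetical disks of 512 and 1024 MiB):
#   DT_PLAIN    -> 512 + 1024 = 1536 MiB in the volume group
#   DT_DRBD8    -> (512 + 128) + (1024 + 128) = 1792 MiB (metadata included)
#   DT_DISKLESS / DT_FILE -> None (no volume group space required)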
6460 def _CheckHVParams(lu, nodenames, hvname, hvparams):
6461 """Hypervisor parameter validation.
6463 This function abstracts the hypervisor parameter validation to be
6464 used in both instance create and instance modify.
6466 @type lu: L{LogicalUnit}
6467 @param lu: the logical unit for which we check
6468 @type nodenames: list
6469 @param nodenames: the list of nodes on which we should check
6470 @type hvname: string
6471 @param hvname: the name of the hypervisor we should use
6472 @type hvparams: dict
6473 @param hvparams: the parameters which we need to check
6474 @raise errors.OpPrereqError: if the parameters are not valid
6477 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
6480 for node in nodenames:
6484 info.Raise("Hypervisor parameter validation failed on node %s" % node)
6487 def _CheckOSParams(lu, required, nodenames, osname, osparams):
6488 """OS parameters validation.
6490 @type lu: L{LogicalUnit}
6491 @param lu: the logical unit for which we check
6492 @type required: boolean
6493 @param required: whether the validation should fail if the OS is not found
6495 @type nodenames: list
6496 @param nodenames: the list of nodes on which we should check
6497 @type osname: string
6498 @param osname: the name of the OS we should check
6499 @type osparams: dict
6500 @param osparams: the parameters which we need to check
6501 @raise errors.OpPrereqError: if the parameters are not valid
6504 result = lu.rpc.call_os_validate(required, nodenames, osname,
6505 [constants.OS_VALIDATE_PARAMETERS],
6507 for node, nres in result.items():
6508 # we don't check for offline cases since this should be run only
6509 # against the master node and/or an instance's nodes
6510 nres.Raise("OS Parameters validation failed on node %s" % node)
6511 if not nres.payload:
6512 lu.LogInfo("OS %s not found on node %s, validation skipped",
6516 class LUCreateInstance(LogicalUnit):
6517 """Create an instance.
6520 HPATH = "instance-add"
6521 HTYPE = constants.HTYPE_INSTANCE
6524 ("mode", ht.NoDefault, ht.TElemOf(constants.INSTANCE_CREATE_MODES)),
6525 ("start", True, ht.TBool),
6526 ("wait_for_sync", True, ht.TBool),
6527 ("ip_check", True, ht.TBool),
6528 ("name_check", True, ht.TBool),
6529 ("disks", ht.NoDefault, ht.TListOf(ht.TDict)),
6530 ("nics", ht.NoDefault, ht.TListOf(ht.TDict)),
6531 ("hvparams", ht.EmptyDict, ht.TDict),
6532 ("beparams", ht.EmptyDict, ht.TDict),
6533 ("osparams", ht.EmptyDict, ht.TDict),
6534 ("no_install", None, ht.TMaybeBool),
6535 ("os_type", None, ht.TMaybeString),
6536 ("force_variant", False, ht.TBool),
6537 ("source_handshake", None, ht.TOr(ht.TList, ht.TNone)),
6538 ("source_x509_ca", None, ht.TMaybeString),
6539 ("source_instance_name", None, ht.TMaybeString),
6540 ("src_node", None, ht.TMaybeString),
6541 ("src_path", None, ht.TMaybeString),
6542 ("pnode", None, ht.TMaybeString),
6543 ("snode", None, ht.TMaybeString),
6544 ("iallocator", None, ht.TMaybeString),
6545 ("hypervisor", None, ht.TMaybeString),
6546 ("disk_template", ht.NoDefault, _CheckDiskTemplate),
6547 ("identify_defaults", False, ht.TBool),
6548 ("file_driver", None, ht.TOr(ht.TNone, ht.TElemOf(constants.FILE_DRIVER))),
6549 ("file_storage_dir", None, ht.TMaybeString),
6553 def CheckArguments(self):
6557 # do not require name_check to ease forward/backward compatibility
6559 if self.op.no_install and self.op.start:
6560 self.LogInfo("No-installation mode selected, disabling startup")
6561 self.op.start = False
6562 # validate/normalize the instance name
6563 self.op.instance_name = \
6564 netutils.Hostname.GetNormalizedName(self.op.instance_name)
6566 if self.op.ip_check and not self.op.name_check:
6567 # TODO: make the ip check more flexible and not depend on the name check
6568 raise errors.OpPrereqError("Cannot do ip check without a name check",
6571 # check nics' parameter names
6572 for nic in self.op.nics:
6573 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
6575 # check disks. parameter names and consistent adopt/no-adopt strategy
6576 has_adopt = has_no_adopt = False
6577 for disk in self.op.disks:
6578 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
6583 if has_adopt and has_no_adopt:
6584 raise errors.OpPrereqError("Either all disks are adopted or none is",
6587 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
6588 raise errors.OpPrereqError("Disk adoption is not supported for the"
6589 " '%s' disk template" %
6590 self.op.disk_template,
6592 if self.op.iallocator is not None:
6593 raise errors.OpPrereqError("Disk adoption not allowed with an"
6594 " iallocator script", errors.ECODE_INVAL)
6595 if self.op.mode == constants.INSTANCE_IMPORT:
6596 raise errors.OpPrereqError("Disk adoption not allowed for"
6597 " instance import", errors.ECODE_INVAL)
6599 self.adopt_disks = has_adopt
6601 # instance name verification
6602 if self.op.name_check:
6603 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
6604 self.op.instance_name = self.hostname1.name
6605 # used in CheckPrereq for ip ping check
6606 self.check_ip = self.hostname1.ip
6608 self.check_ip = None
6610 # file storage checks
6611 if (self.op.file_driver and
6612 not self.op.file_driver in constants.FILE_DRIVER):
6613 raise errors.OpPrereqError("Invalid file driver name '%s'" %
6614 self.op.file_driver, errors.ECODE_INVAL)
6616 if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
6617 raise errors.OpPrereqError("File storage directory path not absolute",
6620 ### Node/iallocator related checks
6621 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
6623 if self.op.pnode is not None:
6624 if self.op.disk_template in constants.DTS_NET_MIRROR:
6625 if self.op.snode is None:
6626 raise errors.OpPrereqError("The networked disk templates need"
6627 " a mirror node", errors.ECODE_INVAL)
6629 self.LogWarning("Secondary node will be ignored on non-mirrored disk template")
6631 self.op.snode = None
6633 self._cds = _GetClusterDomainSecret()
6635 if self.op.mode == constants.INSTANCE_IMPORT:
6636 # On import force_variant must be True, because if we forced it at
6637 # initial install, our only chance when importing it back is that it works again
6639 self.op.force_variant = True
6641 if self.op.no_install:
6642 self.LogInfo("No-installation mode has no effect during import")
6644 elif self.op.mode == constants.INSTANCE_CREATE:
6645 if self.op.os_type is None:
6646 raise errors.OpPrereqError("No guest OS specified",
6648 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
6649 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
6650 " installation" % self.op.os_type,
6652 if self.op.disk_template is None:
6653 raise errors.OpPrereqError("No disk template specified",
6656 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
6657 # Check handshake to ensure both clusters have the same domain secret
6658 src_handshake = self.op.source_handshake
6659 if not src_handshake:
6660 raise errors.OpPrereqError("Missing source handshake",
6663 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
6666 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
6669 # Load and check source CA
6670 self.source_x509_ca_pem = self.op.source_x509_ca
6671 if not self.source_x509_ca_pem:
6672 raise errors.OpPrereqError("Missing source X509 CA",
6676 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
6678 except OpenSSL.crypto.Error, err:
6679 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
6680 (err, ), errors.ECODE_INVAL)
6682 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
6683 if errcode is not None:
6684 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
6687 self.source_x509_ca = cert
6689 src_instance_name = self.op.source_instance_name
6690 if not src_instance_name:
6691 raise errors.OpPrereqError("Missing source instance name",
6694 self.source_instance_name = \
6695 netutils.GetHostname(name=src_instance_name).name
6698 raise errors.OpPrereqError("Invalid instance creation mode %r" %
6699 self.op.mode, errors.ECODE_INVAL)
6701 def ExpandNames(self):
6702 """ExpandNames for CreateInstance.
6704 Figure out the right locks for instance creation.
6707 self.needed_locks = {}
6709 instance_name = self.op.instance_name
6710 # this is just a preventive check, but someone might still add this
6711 # instance in the meantime, and creation will fail at lock-add time
6712 if instance_name in self.cfg.GetInstanceList():
6713 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6714 instance_name, errors.ECODE_EXISTS)
6716 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
6718 if self.op.iallocator:
6719 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6721 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
6722 nodelist = [self.op.pnode]
6723 if self.op.snode is not None:
6724 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
6725 nodelist.append(self.op.snode)
6726 self.needed_locks[locking.LEVEL_NODE] = nodelist
6728 # in case of import lock the source node too
6729 if self.op.mode == constants.INSTANCE_IMPORT:
6730 src_node = self.op.src_node
6731 src_path = self.op.src_path
6733 if src_path is None:
6734 self.op.src_path = src_path = self.op.instance_name
6736 if src_node is None:
6737 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6738 self.op.src_node = None
6739 if os.path.isabs(src_path):
6740 raise errors.OpPrereqError("Importing an instance from an absolute"
6741 " path requires a source node option.",
6744 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
6745 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
6746 self.needed_locks[locking.LEVEL_NODE].append(src_node)
6747 if not os.path.isabs(src_path):
6748 self.op.src_path = src_path = \
6749 utils.PathJoin(constants.EXPORT_DIR, src_path)
6751 def _RunAllocator(self):
6752 """Run the allocator based on input opcode.
6755 nics = [n.ToDict() for n in self.nics]
6756 ial = IAllocator(self.cfg, self.rpc,
6757 mode=constants.IALLOCATOR_MODE_ALLOC,
6758 name=self.op.instance_name,
6759 disk_template=self.op.disk_template,
6762 vcpus=self.be_full[constants.BE_VCPUS],
6763 mem_size=self.be_full[constants.BE_MEMORY],
6766 hypervisor=self.op.hypervisor,
6769 ial.Run(self.op.iallocator)
6772 raise errors.OpPrereqError("Can't compute nodes using"
6773 " iallocator '%s': %s" %
6774 (self.op.iallocator, ial.info),
6776 if len(ial.result) != ial.required_nodes:
6777 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
6778 " of nodes (%s), required %s" %
6779 (self.op.iallocator, len(ial.result),
6780 ial.required_nodes), errors.ECODE_FAULT)
6781 self.op.pnode = ial.result[0]
6782 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
6783 self.op.instance_name, self.op.iallocator,
6784 utils.CommaJoin(ial.result))
6785 if ial.required_nodes == 2:
6786 self.op.snode = ial.result[1]
6788 def BuildHooksEnv(self):
6791 This runs on master, primary and secondary nodes of the instance.
6795 "ADD_MODE": self.op.mode,
6797 if self.op.mode == constants.INSTANCE_IMPORT:
6798 env["SRC_NODE"] = self.op.src_node
6799 env["SRC_PATH"] = self.op.src_path
6800 env["SRC_IMAGES"] = self.src_images
6802 env.update(_BuildInstanceHookEnv(
6803 name=self.op.instance_name,
6804 primary_node=self.op.pnode,
6805 secondary_nodes=self.secondaries,
6806 status=self.op.start,
6807 os_type=self.op.os_type,
6808 memory=self.be_full[constants.BE_MEMORY],
6809 vcpus=self.be_full[constants.BE_VCPUS],
6810 nics=_NICListToTuple(self, self.nics),
6811 disk_template=self.op.disk_template,
6812 disks=[(d["size"], d["mode"]) for d in self.disks],
6815 hypervisor_name=self.op.hypervisor,
6818 nl = ([self.cfg.GetMasterNode(), self.op.pnode] +
6822 def _ReadExportInfo(self):
6823 """Reads the export information from disk.
6825 It will override the opcode source node and path with the actual
6826 information, if these two were not specified before.
6828 @return: the export information
6831 assert self.op.mode == constants.INSTANCE_IMPORT
6833 src_node = self.op.src_node
6834 src_path = self.op.src_path
6836 if src_node is None:
6837 locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
6838 exp_list = self.rpc.call_export_list(locked_nodes)
6840 for node in exp_list:
6841 if exp_list[node].fail_msg:
6843 if src_path in exp_list[node].payload:
6845 self.op.src_node = src_node = node
6846 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
6850 raise errors.OpPrereqError("No export found for relative path %s" %
6851 src_path, errors.ECODE_INVAL)
6853 _CheckNodeOnline(self, src_node)
6854 result = self.rpc.call_export_info(src_node, src_path)
6855 result.Raise("No export or invalid export found in dir %s" % src_path)
6857 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
6858 if not export_info.has_section(constants.INISECT_EXP):
6859 raise errors.ProgrammerError("Corrupted export config",
6860 errors.ECODE_ENVIRON)
6862 ei_version = export_info.get(constants.INISECT_EXP, "version")
6863 if (int(ei_version) != constants.EXPORT_VERSION):
6864 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
6865 (ei_version, constants.EXPORT_VERSION),
6866 errors.ECODE_ENVIRON)
6869 def _ReadExportParams(self, einfo):
6870 """Use export parameters as defaults.
6872 In case the opcode doesn't specify (as in override) some instance
6873 parameters, then try to use them from the export information, if available.
6877 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
6879 if self.op.disk_template is None:
6880 if einfo.has_option(constants.INISECT_INS, "disk_template"):
6881 self.op.disk_template = einfo.get(constants.INISECT_INS,
6884 raise errors.OpPrereqError("No disk template specified and the export"
6885 " is missing the disk_template information",
6888 if not self.op.disks:
6889 if einfo.has_option(constants.INISECT_INS, "disk_count"):
6891 # TODO: import the disk iv_name too
6892 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
6893 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
6894 disks.append({"size": disk_sz})
6895 self.op.disks = disks
6897 raise errors.OpPrereqError("No disk info specified and the export"
6898 " is missing the disk information",
6901 if (not self.op.nics and
6902 einfo.has_option(constants.INISECT_INS, "nic_count")):
6904 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
6906 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
6907 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
6912 if (self.op.hypervisor is None and
6913 einfo.has_option(constants.INISECT_INS, "hypervisor")):
6914 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
6915 if einfo.has_section(constants.INISECT_HYP):
6916 # use the export parameters but do not override the ones
6917 # specified by the user
6918 for name, value in einfo.items(constants.INISECT_HYP):
6919 if name not in self.op.hvparams:
6920 self.op.hvparams[name] = value
6922 if einfo.has_section(constants.INISECT_BEP):
6923 # use the parameters, without overriding
6924 for name, value in einfo.items(constants.INISECT_BEP):
6925 if name not in self.op.beparams:
6926 self.op.beparams[name] = value
6928 # try to read the parameters old style, from the main section
6929 for name in constants.BES_PARAMETERS:
6930 if (name not in self.op.beparams and
6931 einfo.has_option(constants.INISECT_INS, name)):
6932 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
6934 if einfo.has_section(constants.INISECT_OSP):
6935 # use the parameters, without overriding
6936 for name, value in einfo.items(constants.INISECT_OSP):
6937 if name not in self.op.osparams:
6938 self.op.osparams[name] = value
6940 def _RevertToDefaults(self, cluster):
6941 """Revert the instance parameters to the default values.
6945 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
6946 for name in self.op.hvparams.keys():
6947 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
6948 del self.op.hvparams[name]
6950 be_defs = cluster.SimpleFillBE({})
6951 for name in self.op.beparams.keys():
6952 if name in be_defs and be_defs[name] == self.op.beparams[name]:
6953 del self.op.beparams[name]
6955 nic_defs = cluster.SimpleFillNIC({})
6956 for nic in self.op.nics:
6957 for name in constants.NICS_PARAMETERS:
6958 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
6961 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
6962 for name in self.op.osparams.keys():
6963 if name in os_defs and os_defs[name] == self.op.osparams[name]:
6964 del self.op.osparams[name]
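# Worked example (hypothetical values): if the cluster default fills
# beparams as {"memory": 128, "vcpus": 1} and the opcode carries
# {"memory": 128, "vcpus": 4}, only {"vcpus": 4} survives here; the
# "memory" entry matches the default and is dropped, keeping the instance
# definition minimal.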
6966 def CheckPrereq(self):
6967 """Check prerequisites.
6970 if self.op.mode == constants.INSTANCE_IMPORT:
6971 export_info = self._ReadExportInfo()
6972 self._ReadExportParams(export_info)
6974 _CheckDiskTemplate(self.op.disk_template)
6976 if (not self.cfg.GetVGName() and
6977 self.op.disk_template not in constants.DTS_NOT_LVM):
6978 raise errors.OpPrereqError("Cluster does not support lvm-based"
6979 " instances", errors.ECODE_STATE)
6981 if self.op.hypervisor is None:
6982 self.op.hypervisor = self.cfg.GetHypervisorType()
6984 cluster = self.cfg.GetClusterInfo()
6985 enabled_hvs = cluster.enabled_hypervisors
6986 if self.op.hypervisor not in enabled_hvs:
6987 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
6988 " cluster (%s)" % (self.op.hypervisor,
6989 ",".join(enabled_hvs)),
6992 # check hypervisor parameter syntax (locally)
6993 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6994 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
6996 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
6997 hv_type.CheckParameterSyntax(filled_hvp)
6998 self.hv_full = filled_hvp
6999 # check that we don't specify global parameters on an instance
7000 _CheckGlobalHvParams(self.op.hvparams)
7002 # fill and remember the beparams dict
7003 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
7004 self.be_full = cluster.SimpleFillBE(self.op.beparams)
7006 # build os parameters
7007 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
7009 # now that hvp/bep are in final format, let's reset to defaults, if told to do so
7011 if self.op.identify_defaults:
7012 self._RevertToDefaults(cluster)
7016 for idx, nic in enumerate(self.op.nics):
7017 nic_mode_req = nic.get("mode", None)
7018 nic_mode = nic_mode_req
7019 if nic_mode is None:
7020 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
7022 # in routed mode, for the first nic, the default ip is 'auto'
7023 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
7024 default_ip_mode = constants.VALUE_AUTO
7026 default_ip_mode = constants.VALUE_NONE
7028 # ip validity checks
7029 ip = nic.get("ip", default_ip_mode)
7030 if ip is None or ip.lower() == constants.VALUE_NONE:
7032 elif ip.lower() == constants.VALUE_AUTO:
7033 if not self.op.name_check:
7034 raise errors.OpPrereqError("IP address set to auto but name checks"
7035 " have been skipped",
7037 nic_ip = self.hostname1.ip
7039 if not netutils.IPAddress.IsValid(ip):
7040 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
7044 # TODO: check the ip address for uniqueness
7045 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
7046 raise errors.OpPrereqError("Routed nic mode requires an ip address",
7049 # MAC address verification
7050 mac = nic.get("mac", constants.VALUE_AUTO)
7051 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7052 mac = utils.NormalizeAndValidateMac(mac)
7055 self.cfg.ReserveMAC(mac, self.proc.GetECId())
7056 except errors.ReservationError:
7057 raise errors.OpPrereqError("MAC address %s already in use"
7058 " in cluster" % mac,
7059 errors.ECODE_NOTUNIQUE)
7061 # bridge verification
7062 bridge = nic.get("bridge", None)
7063 link = nic.get("link", None)
7065 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
7066 " at the same time", errors.ECODE_INVAL)
7067 elif bridge and nic_mode == constants.NIC_MODE_ROUTED:
7068 raise errors.OpPrereqError("Cannot pass 'bridge' on a routed nic",
7075 nicparams[constants.NIC_MODE] = nic_mode_req
7077 nicparams[constants.NIC_LINK] = link
7079 check_params = cluster.SimpleFillNIC(nicparams)
7080 objects.NIC.CheckParameterSyntax(check_params)
7081 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
7083 # disk checks/pre-build
7085 for disk in self.op.disks:
7086 mode = disk.get("mode", constants.DISK_RDWR)
7087 if mode not in constants.DISK_ACCESS_SET:
7088 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
7089 mode, errors.ECODE_INVAL)
7090 size = disk.get("size", None)
7092 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
7095 except (TypeError, ValueError):
7096 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
7098 new_disk = {"size": size, "mode": mode}
7100 new_disk["adopt"] = disk["adopt"]
7101 self.disks.append(new_disk)
7103 if self.op.mode == constants.INSTANCE_IMPORT:
7105 # Check that the new instance doesn't have fewer disks than the export
7106 instance_disks = len(self.disks)
7107 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
7108 if instance_disks < export_disks:
7109 raise errors.OpPrereqError("Not enough disks to import."
7110 " (instance: %d, export: %d)" %
7111 (instance_disks, export_disks),
7115 for idx in range(export_disks):
7116 option = 'disk%d_dump' % idx
7117 if export_info.has_option(constants.INISECT_INS, option):
7118 # FIXME: are the old os-es, disk sizes, etc. useful?
7119 export_name = export_info.get(constants.INISECT_INS, option)
7120 image = utils.PathJoin(self.op.src_path, export_name)
7121 disk_images.append(image)
7123 disk_images.append(False)
7125 self.src_images = disk_images
7127 old_name = export_info.get(constants.INISECT_INS, 'name')
7129 exp_nic_count = export_info.getint(constants.INISECT_INS, 'nic_count')
7130 except (TypeError, ValueError), err:
7131 raise errors.OpPrereqError("Invalid export file, nic_count is not"
7132 " an integer: %s" % str(err),
7134 if self.op.instance_name == old_name:
7135 for idx, nic in enumerate(self.nics):
7136 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
7137 nic_mac_ini = 'nic%d_mac' % idx
7138 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
7140 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
7142 # ip ping checks (we use the same ip that was resolved in ExpandNames)
7143 if self.op.ip_check:
7144 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
7145 raise errors.OpPrereqError("IP %s of instance %s already in use" %
7146 (self.check_ip, self.op.instance_name),
7147 errors.ECODE_NOTUNIQUE)
7149 #### mac address generation
7150 # By generating the mac address here, both the allocator and the hooks get
7151 # the real final mac address rather than the 'auto' or 'generate' value.
7152 # There is a race condition between the generation and the instance object
7153 # creation, which means that we know the mac is valid now, but we're not
7154 # sure it will be when we actually add the instance. If things go bad
7155 # adding the instance will abort because of a duplicate mac, and the
7156 # creation job will fail.
7157 for nic in self.nics:
7158 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7159 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
7163 if self.op.iallocator is not None:
7164 self._RunAllocator()
7166 #### node related checks
7168 # check primary node
7169 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
7170 assert self.pnode is not None, \
7171 "Cannot retrieve locked node %s" % self.op.pnode
7173 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
7174 pnode.name, errors.ECODE_STATE)
7176 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
7177 pnode.name, errors.ECODE_STATE)
7179 self.secondaries = []
7181 # mirror node verification
7182 if self.op.disk_template in constants.DTS_NET_MIRROR:
7183 if self.op.snode == pnode.name:
7184 raise errors.OpPrereqError("The secondary node cannot be the"
7185 " primary node.", errors.ECODE_INVAL)
7186 _CheckNodeOnline(self, self.op.snode)
7187 _CheckNodeNotDrained(self, self.op.snode)
7188 self.secondaries.append(self.op.snode)
7190 nodenames = [pnode.name] + self.secondaries
7192 req_size = _ComputeDiskSize(self.op.disk_template,
7195 # Check lv size requirements, if not adopting
7196 if req_size is not None and not self.adopt_disks:
7197 _CheckNodesFreeDisk(self, nodenames, req_size)
7199 if self.adopt_disks: # instead, we must check the adoption data
7200 all_lvs = set([i["adopt"] for i in self.disks])
7201 if len(all_lvs) != len(self.disks):
7202 raise errors.OpPrereqError("Duplicate volume names given for adoption",
7204 for lv_name in all_lvs:
7206 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
7207 except errors.ReservationError:
7208 raise errors.OpPrereqError("LV named %s used by another instance" %
7209 lv_name, errors.ECODE_NOTUNIQUE)
7211 node_lvs = self.rpc.call_lv_list([pnode.name],
7212 self.cfg.GetVGName())[pnode.name]
7213 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
7214 node_lvs = node_lvs.payload
7215 delta = all_lvs.difference(node_lvs.keys())
7217 raise errors.OpPrereqError("Missing logical volume(s): %s" %
7218 utils.CommaJoin(delta),
7220 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
7222 raise errors.OpPrereqError("Online logical volumes found, cannot"
7223 " adopt: %s" % utils.CommaJoin(online_lvs),
7225 # update the size of each disk based on what is found
7226 for dsk in self.disks:
7227 dsk["size"] = int(float(node_lvs[dsk["adopt"]][0]))
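# (The lv_list payload maps each LV name to a tuple whose first element is
# the size in MiB and whose third is the online flag -- an assumption
# inferred from the two uses above.)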
7229 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
7231 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
7232 # check OS parameters (remotely)
7233 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
7235 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
7237 # memory check on primary node
7239 _CheckNodeFreeMemory(self, self.pnode.name,
7240 "creating instance %s" % self.op.instance_name,
7241 self.be_full[constants.BE_MEMORY],
7244 self.dry_run_result = list(nodenames)
7246 def Exec(self, feedback_fn):
7247 """Create and add the instance to the cluster.
7250 instance = self.op.instance_name
7251 pnode_name = self.pnode.name
7253 ht_kind = self.op.hypervisor
7254 if ht_kind in constants.HTS_REQ_PORT:
7255 network_port = self.cfg.AllocatePort()
7259 if constants.ENABLE_FILE_STORAGE:
7260 # this is needed because os.path.join does not accept None arguments
7261 if self.op.file_storage_dir is None:
7262 string_file_storage_dir = ""
7264 string_file_storage_dir = self.op.file_storage_dir
7266 # build the full file storage dir path
7267 file_storage_dir = utils.PathJoin(self.cfg.GetFileStorageDir(),
7268 string_file_storage_dir, instance)
7270 file_storage_dir = ""
7272 disks = _GenerateDiskTemplate(self,
7273 self.op.disk_template,
7274 instance, pnode_name,
7278 self.op.file_driver,
7281 iobj = objects.Instance(name=instance, os=self.op.os_type,
7282 primary_node=pnode_name,
7283 nics=self.nics, disks=disks,
7284 disk_template=self.op.disk_template,
7286 network_port=network_port,
7287 beparams=self.op.beparams,
7288 hvparams=self.op.hvparams,
7289 hypervisor=self.op.hypervisor,
7290 osparams=self.op.osparams,
7293 if self.adopt_disks:
7294 # rename LVs to the newly-generated names; we need to construct
7295 # 'fake' LV disks with the old data, plus the new unique_id
7296 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
7298 for t_dsk, a_dsk in zip (tmp_disks, self.disks):
7299 rename_to.append(t_dsk.logical_id)
7300 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk["adopt"])
7301 self.cfg.SetDiskID(t_dsk, pnode_name)
7302 result = self.rpc.call_blockdev_rename(pnode_name,
7303 zip(tmp_disks, rename_to))
7304 result.Raise("Failed to rename adopted LVs")
7306 feedback_fn("* creating instance disks...")
7308 _CreateDisks(self, iobj)
7309 except errors.OpExecError:
7310 self.LogWarning("Device creation failed, reverting...")
7312 _RemoveDisks(self, iobj)
7314 self.cfg.ReleaseDRBDMinors(instance)
7317 if self.cfg.GetClusterInfo().prealloc_wipe_disks:
7318 feedback_fn("* wiping instance disks...")
7320 _WipeDisks(self, iobj)
7321 except errors.OpExecError:
7322 self.LogWarning("Device wiping failed, reverting...")
7324 _RemoveDisks(self, iobj)
7326 self.cfg.ReleaseDRBDMinors(instance)
7329 feedback_fn("adding instance %s to cluster config" % instance)
7331 self.cfg.AddInstance(iobj, self.proc.GetECId())
7333 # Declare that we don't want to remove the instance lock anymore, as we've
7334 # added the instance to the config
7335 del self.remove_locks[locking.LEVEL_INSTANCE]
7336 # Unlock all the nodes
7337 if self.op.mode == constants.INSTANCE_IMPORT:
7338 nodes_keep = [self.op.src_node]
7339 nodes_release = [node for node in self.acquired_locks[locking.LEVEL_NODE]
7340 if node != self.op.src_node]
7341 self.context.glm.release(locking.LEVEL_NODE, nodes_release)
7342 self.acquired_locks[locking.LEVEL_NODE] = nodes_keep
7344 self.context.glm.release(locking.LEVEL_NODE)
7345 del self.acquired_locks[locking.LEVEL_NODE]
7347 if self.op.wait_for_sync:
7348 disk_abort = not _WaitForSync(self, iobj)
7349 elif iobj.disk_template in constants.DTS_NET_MIRROR:
7350 # make sure the disks are not degraded (still sync-ing is ok)
7352 feedback_fn("* checking mirrors status")
7353 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
7358 _RemoveDisks(self, iobj)
7359 self.cfg.RemoveInstance(iobj.name)
7360 # Make sure the instance lock gets removed
7361 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
7362 raise errors.OpExecError("There are some degraded disks for"
7365 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
7366 if self.op.mode == constants.INSTANCE_CREATE:
7367 if not self.op.no_install:
7368 feedback_fn("* running the instance OS create scripts...")
7369 # FIXME: pass debug option from opcode to backend
7370 result = self.rpc.call_instance_os_add(pnode_name, iobj, False,
7371 self.op.debug_level)
7372 result.Raise("Could not add os for instance %s"
7373 " on node %s" % (instance, pnode_name))
7375 elif self.op.mode == constants.INSTANCE_IMPORT:
7376 feedback_fn("* running the instance OS import scripts...")
7380 for idx, image in enumerate(self.src_images):
7384 # FIXME: pass debug option from opcode to backend
7385 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
7386 constants.IEIO_FILE, (image, ),
7387 constants.IEIO_SCRIPT,
7388 (iobj.disks[idx], idx),
7390 transfers.append(dt)
7393 masterd.instance.TransferInstanceData(self, feedback_fn,
7394 self.op.src_node, pnode_name,
7395 self.pnode.secondary_ip,
7397 if not compat.all(import_result):
7398 self.LogWarning("Some disks for instance %s on node %s were not"
7399 " imported successfully" % (instance, pnode_name))
7401 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7402 feedback_fn("* preparing remote import...")
7403 connect_timeout = constants.RIE_CONNECT_TIMEOUT
7404 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
7406 disk_results = masterd.instance.RemoteImport(self, feedback_fn, iobj,
7407 self.source_x509_ca,
7408 self._cds, timeouts)
7409 if not compat.all(disk_results):
7410 # TODO: Should the instance still be started, even if some disks
7411 # failed to import (valid for local imports, too)?
7412 self.LogWarning("Some disks for instance %s on node %s were not"
7413 " imported successfully" % (instance, pnode_name))
7415 # Run rename script on newly imported instance
7416 assert iobj.name == instance
7417 feedback_fn("Running rename script for %s" % instance)
7418 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
7419 self.source_instance_name,
7420 self.op.debug_level)
7422 self.LogWarning("Failed to run rename script for %s on node"
7423 " %s: %s" % (instance, pnode_name, result.fail_msg))
7426 # also checked in the prereq part
7427 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
7431 iobj.admin_up = True
7432 self.cfg.Update(iobj, feedback_fn)
7433 logging.info("Starting instance %s on node %s", instance, pnode_name)
7434 feedback_fn("* starting instance...")
7435 result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
7436 result.Raise("Could not start instance")
7438 return list(iobj.all_nodes)
7441 class LUConnectConsole(NoHooksLU):
7442 """Connect to an instance's console.
7444 This is somewhat special in that it returns the command line that
7445 you need to run on the master node in order to connect to the console.
7454 def ExpandNames(self):
7455 self._ExpandAndLockInstance()
7457 def CheckPrereq(self):
7458 """Check prerequisites.
7460 This checks that the instance is in the cluster.
7463 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7464 assert self.instance is not None, \
7465 "Cannot retrieve locked instance %s" % self.op.instance_name
7466 _CheckNodeOnline(self, self.instance.primary_node)
7468 def Exec(self, feedback_fn):
7469 """Connect to the console of an instance
7472 instance = self.instance
7473 node = instance.primary_node
7475 node_insts = self.rpc.call_instance_list([node],
7476 [instance.hypervisor])[node]
7477 node_insts.Raise("Can't get node information from %s" % node)
7479 if instance.name not in node_insts.payload:
7480 if instance.admin_up:
7481 state = "ERROR_down"
7483 state = "ADMIN_down"
7484 raise errors.OpExecError("Instance %s is not running (state %s)" %
7485 (instance.name, state))
7487 logging.debug("Connecting to console of %s on %s", instance.name, node)
7489 hyper = hypervisor.GetHypervisor(instance.hypervisor)
7490 cluster = self.cfg.GetClusterInfo()
7491 # beparams and hvparams are passed separately, to avoid editing the
7492 # instance and then saving the defaults in the instance itself.
7493 hvparams = cluster.FillHV(instance)
7494 beparams = cluster.FillBE(instance)
7495 console_cmd = hyper.GetShellCommandForConsole(instance, hvparams, beparams)
7498 return self.ssh.BuildCmd(node, "root", console_cmd, batch=True, tty=True)
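# Illustrative result (hypothetical hypervisor output): if the hypervisor
# returns a console command such as "xm console inst1.example.com", the LU
# hands back an ssh invocation roughly like
#   ssh -t root@node1.example.com 'xm console inst1.example.com'
# which the client then executes on the master node.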


class LUReplaceDisks(LogicalUnit):
  """Replace the disks of an instance.

  """
  HPATH = "mirrors-replace"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_PARAMS = [
    _PInstanceName,
    ("mode", ht.NoDefault, ht.TElemOf(constants.REPLACE_MODES)),
    ("disks", ht.EmptyList, ht.TListOf(ht.TPositiveInt)),
    ("remote_node", None, ht.TMaybeString),
    ("iallocator", None, ht.TMaybeString),
    ("early_release", False, ht.TBool),
    ]
  REQ_BGL = False

  def CheckArguments(self):
    TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
                                  self.op.iallocator)

  def ExpandNames(self):
    self._ExpandAndLockInstance()

    if self.op.iallocator is not None:
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET

    elif self.op.remote_node is not None:
      remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
      self.op.remote_node = remote_node

      # Warning: do not remove the locking of the new secondary here
      # unless DRBD8.AddChildren is changed to work in parallel;
      # currently it doesn't since parallel invocations of
      # FindUnusedMinor will conflict
      self.needed_locks[locking.LEVEL_NODE] = [remote_node]
      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND

    else:
      self.needed_locks[locking.LEVEL_NODE] = []
      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

    self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
                                   self.op.iallocator, self.op.remote_node,
                                   self.op.disks, False, self.op.early_release)

    self.tasklets = [self.replacer]

  def DeclareLocks(self, level):
    # If we're not already locking all nodes in the set we have to declare the
    # instance's primary/secondary nodes.
    if (level == locking.LEVEL_NODE and
        self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET):
      self._LockInstancesNodes()

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, the primary and all the secondaries.

    """
    instance = self.replacer.instance
    env = {
      "MODE": self.op.mode,
      "NEW_SECONDARY": self.op.remote_node,
      "OLD_SECONDARY": instance.secondary_nodes[0],
      }
    env.update(_BuildInstanceHookEnvByObject(self, instance))
    nl = [
      self.cfg.GetMasterNode(),
      instance.primary_node,
      ]
    if self.op.remote_node is not None:
      nl.append(self.op.remote_node)
    return env, nl


class TLReplaceDisks(Tasklet):
  """Replaces disks for an instance.

  Note: Locking is not within the scope of this class.

  """
  def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
               disks, delay_iallocator, early_release):
    """Initializes this class.

    """
    Tasklet.__init__(self, lu)

    # Parameters
    self.instance_name = instance_name
    self.mode = mode
    self.iallocator_name = iallocator_name
    self.remote_node = remote_node
    self.disks = disks
    self.delay_iallocator = delay_iallocator
    self.early_release = early_release

    # Runtime data
    self.instance = None
    self.new_node = None
    self.target_node = None
    self.other_node = None
    self.remote_node_info = None
    self.node_secondary_ip = None

  @staticmethod
  def CheckArguments(mode, remote_node, iallocator):
    """Helper function for users of this class.

    """
    # check for valid parameter combination
    if mode == constants.REPLACE_DISK_CHG:
      if remote_node is None and iallocator is None:
        raise errors.OpPrereqError("When changing the secondary either an"
                                   " iallocator script must be used or the"
                                   " new node given", errors.ECODE_INVAL)

      if remote_node is not None and iallocator is not None:
        raise errors.OpPrereqError("Give either the iallocator or the new"
                                   " secondary, not both", errors.ECODE_INVAL)

    elif remote_node is not None or iallocator is not None:
      # Not replacing the secondary
      raise errors.OpPrereqError("The iallocator and new node options can"
                                 " only be used when changing the"
                                 " secondary node", errors.ECODE_INVAL)

  @staticmethod
  def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
    """Compute a new secondary node using an IAllocator.

    """
    ial = IAllocator(lu.cfg, lu.rpc,
                     mode=constants.IALLOCATOR_MODE_RELOC,
                     name=instance_name,
                     relocate_from=relocate_from)

    ial.Run(iallocator_name)

    if not ial.success:
      raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
                                 " %s" % (iallocator_name, ial.info),
                                 errors.ECODE_NORES)

    if len(ial.result) != ial.required_nodes:
      raise errors.OpPrereqError("iallocator '%s' returned invalid number"
                                 " of nodes (%s), required %s" %
                                 (iallocator_name,
                                  len(ial.result), ial.required_nodes),
                                 errors.ECODE_FAULT)

    remote_node_name = ial.result[0]

    lu.LogInfo("Selected new secondary for instance '%s': %s",
               instance_name, remote_node_name)

    return remote_node_name
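
  # Clarifying note (added comment): in IALLOCATOR_MODE_RELOC the allocator
  # returns a list of exactly ial.required_nodes node names; the length is
  # validated above and only the first entry is used as the new secondary.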

  def _FindFaultyDisks(self, node_name):
    return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
                                    node_name, True)

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
    assert instance is not None, \
      "Cannot retrieve locked instance %s" % self.instance_name

    if instance.disk_template != constants.DT_DRBD8:
      raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
                                 " instances", errors.ECODE_INVAL)

    if len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("The instance has a strange layout,"
                                 " expected one secondary but found %d" %
                                 len(instance.secondary_nodes),
                                 errors.ECODE_FAULT)

    if not self.delay_iallocator:
      self._CheckPrereq2()

  def _CheckPrereq2(self):
    """Check prerequisites, second part.

    This function should always be part of CheckPrereq. It was separated and is
    now called from Exec because during node evacuation iallocator was only
    called with an unmodified cluster model, not taking planned changes into
    account.

    """
    instance = self.instance
    secondary_node = instance.secondary_nodes[0]

    if self.iallocator_name is None:
      remote_node = self.remote_node
    else:
      remote_node = self._RunAllocator(self.lu, self.iallocator_name,
                                       instance.name, instance.secondary_nodes)

    if remote_node is not None:
      self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
      assert self.remote_node_info is not None, \
        "Cannot retrieve locked node %s" % remote_node
    else:
      self.remote_node_info = None

    if remote_node == self.instance.primary_node:
      raise errors.OpPrereqError("The specified node is the primary node of"
                                 " the instance.", errors.ECODE_INVAL)

    if remote_node == secondary_node:
      raise errors.OpPrereqError("The specified node is already the"
                                 " secondary node of the instance.",
                                 errors.ECODE_INVAL)

    if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
                                    constants.REPLACE_DISK_CHG):
      raise errors.OpPrereqError("Cannot specify disks to be replaced",
                                 errors.ECODE_INVAL)

    if self.mode == constants.REPLACE_DISK_AUTO:
      faulty_primary = self._FindFaultyDisks(instance.primary_node)
      faulty_secondary = self._FindFaultyDisks(secondary_node)

      if faulty_primary and faulty_secondary:
        raise errors.OpPrereqError("Instance %s has faulty disks on more than"
                                   " one node and can not be repaired"
                                   " automatically" % self.instance_name,
                                   errors.ECODE_STATE)

      if faulty_primary:
        self.disks = faulty_primary
        self.target_node = instance.primary_node
        self.other_node = secondary_node
        check_nodes = [self.target_node, self.other_node]
      elif faulty_secondary:
        self.disks = faulty_secondary
        self.target_node = secondary_node
        self.other_node = instance.primary_node
        check_nodes = [self.target_node, self.other_node]
      else:
        self.disks = []
        check_nodes = []

    else:
      # Non-automatic modes
      if self.mode == constants.REPLACE_DISK_PRI:
        self.target_node = instance.primary_node
        self.other_node = secondary_node
        check_nodes = [self.target_node, self.other_node]

      elif self.mode == constants.REPLACE_DISK_SEC:
        self.target_node = secondary_node
        self.other_node = instance.primary_node
        check_nodes = [self.target_node, self.other_node]

      elif self.mode == constants.REPLACE_DISK_CHG:
        self.new_node = remote_node
        self.other_node = instance.primary_node
        self.target_node = secondary_node
        check_nodes = [self.new_node, self.other_node]

        _CheckNodeNotDrained(self.lu, remote_node)

        old_node_info = self.cfg.GetNodeInfo(secondary_node)
        assert old_node_info is not None
        if old_node_info.offline and not self.early_release:
          # doesn't make sense to delay the release
          self.early_release = True
          self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
                          " early-release mode", secondary_node)

      else:
        raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
                                     self.mode)

    # If not specified all disks should be replaced
    if not self.disks:
      self.disks = range(len(self.instance.disks))

    for node in check_nodes:
      _CheckNodeOnline(self.lu, node)

    # Check whether disks are valid
    for disk_idx in self.disks:
      instance.FindDisk(disk_idx)

    # Get secondary node IP addresses
    node_2nd_ip = {}

    for node_name in [self.target_node, self.other_node, self.new_node]:
      if node_name is not None:
        node_2nd_ip[node_name] = self.cfg.GetNodeInfo(node_name).secondary_ip

    self.node_secondary_ip = node_2nd_ip

  def Exec(self, feedback_fn):
    """Execute disk replacement.

    This dispatches the disk replacement to the appropriate handler.

    """
    if self.delay_iallocator:
      self._CheckPrereq2()

    if not self.disks:
      feedback_fn("No disks need replacement")
      return

    feedback_fn("Replacing disk(s) %s for %s" %
                (utils.CommaJoin(self.disks), self.instance.name))

    activate_disks = (not self.instance.admin_up)

    # Activate the instance disks if we're replacing them on a down instance
    if activate_disks:
      _StartInstanceDisks(self.lu, self.instance, True)

    try:
      # Should we replace the secondary node?
      if self.new_node is not None:
        fn = self._ExecDrbd8Secondary
      else:
        fn = self._ExecDrbd8DiskOnly

      return fn(feedback_fn)

    finally:
      # Deactivate the instance disks if we're replacing them on a
      # down instance
      if activate_disks:
        _SafeShutdownInstanceDisks(self.lu, self.instance)

  def _CheckVolumeGroup(self, nodes):
    self.lu.LogInfo("Checking volume groups")

    vgname = self.cfg.GetVGName()

    # Make sure volume group exists on all involved nodes
    results = self.rpc.call_vg_list(nodes)
    if not results:
      raise errors.OpExecError("Can't list volume groups on the nodes")

    for node in nodes:
      res = results[node]
      res.Raise("Error checking node %s" % node)
      if vgname not in res.payload:
        raise errors.OpExecError("Volume group '%s' not found on node %s" %
                                 (vgname, node))

  def _CheckDisksExistence(self, nodes):
    # Check disk existence
    for idx, dev in enumerate(self.instance.disks):
      if idx not in self.disks:
        continue

      for node in nodes:
        self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
        self.cfg.SetDiskID(dev, node)

        result = self.rpc.call_blockdev_find(node, dev)

        msg = result.fail_msg
        if msg or not result.payload:
          if not msg:
            msg = "disk not found"
          raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
                                   (idx, node, msg))

  def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
    for idx, dev in enumerate(self.instance.disks):
      if idx not in self.disks:
        continue

      self.lu.LogInfo("Checking disk/%d consistency on node %s" %
                      (idx, node_name))

      if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
                                   ldisk=ldisk):
        raise errors.OpExecError("Node %s has degraded storage, unsafe to"
                                 " replace disks for instance %s" %
                                 (node_name, self.instance.name))

  def _CreateNewStorage(self, node_name):
    vgname = self.cfg.GetVGName()
    iv_names = {}

    for idx, dev in enumerate(self.instance.disks):
      if idx not in self.disks:
        continue

      self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))

      self.cfg.SetDiskID(dev, node_name)

      lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
      names = _GenerateUniqueNames(self.lu, lv_names)

      lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
                             logical_id=(vgname, names[0]))
      lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
                             logical_id=(vgname, names[1]))

      new_lvs = [lv_data, lv_meta]
      old_lvs = dev.children
      iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)

      # we pass force_create=True to force the LVM creation
      for new_lv in new_lvs:
        _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
                        _GetInstanceInfoText(self.instance), False)

    return iv_names

  def _CheckDevices(self, node_name, iv_names):
    for name, (dev, _, _) in iv_names.iteritems():
      self.cfg.SetDiskID(dev, node_name)

      result = self.rpc.call_blockdev_find(node_name, dev)

      msg = result.fail_msg
      if msg or not result.payload:
        if not msg:
          msg = "disk not found"
        raise errors.OpExecError("Can't find DRBD device %s: %s" %
                                 (name, msg))

      if result.payload.is_degraded:
        raise errors.OpExecError("DRBD device %s is degraded!" % name)

  def _RemoveOldStorage(self, node_name, iv_names):
    for name, (_, old_lvs, _) in iv_names.iteritems():
      self.lu.LogInfo("Remove logical volumes for %s" % name)

      for lv in old_lvs:
        self.cfg.SetDiskID(lv, node_name)

        msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
        if msg:
          self.lu.LogWarning("Can't remove old LV: %s" % msg,
                             hint="remove unused LVs manually")

  def _ReleaseNodeLock(self, node_name):
    """Releases the lock for a given node."""
    self.lu.context.glm.release(locking.LEVEL_NODE, node_name)

  def _ExecDrbd8DiskOnly(self, feedback_fn):
    """Replace a disk on the primary or secondary for DRBD 8.

    The algorithm for replace is quite complicated:

      1. for each disk to be replaced:

        1. create new LVs on the target node with unique names
        1. detach old LVs from the drbd device
        1. rename old LVs to name_replaced.<time_t>
        1. rename new LVs to old LVs
        1. attach the new LVs (with the old names now) to the drbd device

      1. wait for sync across all devices

      1. for each modified disk:

        1. remove old LVs (which have the name name_replaced.<time_t>)

    Failures are not very well handled.

    """
    steps_total = 6
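
    # Worked example of the dance in step 4 below (comment only, with an
    # assumed LV name): if disk/0's data LV is "<uuid>.disk0_data", we first
    # detach it from the drbd device, rename it to
    # "<uuid>.disk0_data_replaced-<time_t>", rename the freshly created LV to
    # "<uuid>.disk0_data", and re-attach that, so the drbd device ends up
    # with fresh storage under the original child names.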

    # Step: check device activation
    self.lu.LogStep(1, steps_total, "Check device existence")
    self._CheckDisksExistence([self.other_node, self.target_node])
    self._CheckVolumeGroup([self.target_node, self.other_node])

    # Step: check other node consistency
    self.lu.LogStep(2, steps_total, "Check peer consistency")
    self._CheckDisksConsistency(self.other_node,
                                self.other_node == self.instance.primary_node,
                                False)

    # Step: create new storage
    self.lu.LogStep(3, steps_total, "Allocate new storage")
    iv_names = self._CreateNewStorage(self.target_node)

    # Step: for each lv, detach+rename*2+attach
    self.lu.LogStep(4, steps_total, "Changing drbd configuration")
    for dev, old_lvs, new_lvs in iv_names.itervalues():
      self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)

      result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
                                                     old_lvs)
      result.Raise("Can't detach drbd from local storage on node"
                   " %s for device %s" % (self.target_node, dev.iv_name))
      #dev.children = []
      #cfg.Update(instance)

      # ok, we created the new LVs, so now we know we have the needed
      # storage; as such, we proceed on the target node to rename
      # old_lv to _old, and new_lv to old_lv; note that we rename LVs
      # using the assumption that logical_id == physical_id (which in
      # turn is the unique_id on that node)

      # FIXME(iustin): use a better name for the replaced LVs
      temp_suffix = int(time.time())
      ren_fn = lambda d, suff: (d.physical_id[0],
                                d.physical_id[1] + "_replaced-%s" % suff)

      # Build the rename list based on what LVs exist on the node
      rename_old_to_new = []
      for to_ren in old_lvs:
        result = self.rpc.call_blockdev_find(self.target_node, to_ren)
        if not result.fail_msg and result.payload:
          # device exists
          rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))

      self.lu.LogInfo("Renaming the old LVs on the target node")
      result = self.rpc.call_blockdev_rename(self.target_node,
                                             rename_old_to_new)
      result.Raise("Can't rename old LVs on node %s" % self.target_node)

      # Now we rename the new LVs to the old LVs
      self.lu.LogInfo("Renaming the new LVs on the target node")
      rename_new_to_old = [(new, old.physical_id)
                           for old, new in zip(old_lvs, new_lvs)]
      result = self.rpc.call_blockdev_rename(self.target_node,
                                             rename_new_to_old)
      result.Raise("Can't rename new LVs on node %s" % self.target_node)

      for old, new in zip(old_lvs, new_lvs):
        new.logical_id = old.logical_id
        self.cfg.SetDiskID(new, self.target_node)

      for disk in old_lvs:
        disk.logical_id = ren_fn(disk, temp_suffix)
        self.cfg.SetDiskID(disk, self.target_node)

      # Now that the new lvs have the old name, we can add them to the device
      self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
      result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
                                                  new_lvs)
      msg = result.fail_msg
      if msg:
        for new_lv in new_lvs:
          msg2 = self.rpc.call_blockdev_remove(self.target_node,
                                               new_lv).fail_msg
          if msg2:
            self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
                               hint=("cleanup manually the unused logical"
                                     " volumes"))
        raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)

      dev.children = new_lvs

      self.cfg.Update(self.instance, feedback_fn)

    cstep = 5
    if self.early_release:
      self.lu.LogStep(cstep, steps_total, "Removing old storage")
      cstep += 1
      self._RemoveOldStorage(self.target_node, iv_names)
      # WARNING: we release both node locks here, do not do other RPCs
      # than WaitForSync to the primary node
      self._ReleaseNodeLock([self.target_node, self.other_node])

    # Wait for sync
    # This can fail as the old devices are degraded and _WaitForSync
    # does a combined result over all disks, so we don't check its return value
    self.lu.LogStep(cstep, steps_total, "Sync devices")
    cstep += 1
    _WaitForSync(self.lu, self.instance)

    # Check all devices manually
    self._CheckDevices(self.instance.primary_node, iv_names)

    # Step: remove old storage
    if not self.early_release:
      self.lu.LogStep(cstep, steps_total, "Removing old storage")
      cstep += 1
      self._RemoveOldStorage(self.target_node, iv_names)

  def _ExecDrbd8Secondary(self, feedback_fn):
    """Replace the secondary node for DRBD 8.

    The algorithm for replace is quite complicated:
      - for all disks of the instance:
        - create new LVs on the new node with same names
        - shutdown the drbd device on the old secondary
        - disconnect the drbd network on the primary
        - create the drbd device on the new secondary
        - network attach the drbd on the primary, using an artifice:
          the drbd code for Attach() will connect to the network if it
          finds a device which is connected to the good local disks but
          not network enabled
      - wait for sync across all devices
      - remove all disks from the old secondary

    Failures are not very well handled.

    """
    steps_total = 6
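
    # Clarifying example (comment only, node names and numbers are
    # illustrative): a DRBD8 logical_id is the 6-tuple
    # (nodeA, nodeB, port, minorA, minorB, secret). With primary "node1",
    # old id ("node1", "node2", 11000, 0, 3, "sec") and new secondary
    # "node3", step 4 builds ("node1", "node3", None, 0, <new_minor>, "sec")
    # for the standalone activation, and the same tuple with port 11000 as
    # new_net_id for the final network attach.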

    # Step: check device activation
    self.lu.LogStep(1, steps_total, "Check device existence")
    self._CheckDisksExistence([self.instance.primary_node])
    self._CheckVolumeGroup([self.instance.primary_node])

    # Step: check other node consistency
    self.lu.LogStep(2, steps_total, "Check peer consistency")
    self._CheckDisksConsistency(self.instance.primary_node, True, True)

    # Step: create new storage
    self.lu.LogStep(3, steps_total, "Allocate new storage")
    for idx, dev in enumerate(self.instance.disks):
      self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
                      (self.new_node, idx))
      # we pass force_create=True to force LVM creation
      for new_lv in dev.children:
        _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
                        _GetInstanceInfoText(self.instance), False)

    # Step 4: dbrd minors and drbd setups changes
    # after this, we must manually remove the drbd minors on both the
    # error and the success paths
    self.lu.LogStep(4, steps_total, "Changing drbd configuration")
    minors = self.cfg.AllocateDRBDMinor([self.new_node
                                         for dev in self.instance.disks],
                                        self.instance.name)
    logging.debug("Allocated minors %r", minors)

    iv_names = {}
    for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
      self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
                      (self.new_node, idx))
      # create new devices on new_node; note that we create two IDs:
      # one without port, so the drbd will be activated without
      # networking information on the new node at this stage, and one
      # with network, for the latter activation in step 4
      (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
      if self.instance.primary_node == o_node1:
        p_minor = o_minor1
      else:
        assert self.instance.primary_node == o_node2, "Three-node instance?"
        p_minor = o_minor2

      new_alone_id = (self.instance.primary_node, self.new_node, None,
                      p_minor, new_minor, o_secret)
      new_net_id = (self.instance.primary_node, self.new_node, o_port,
                    p_minor, new_minor, o_secret)

      iv_names[idx] = (dev, dev.children, new_net_id)
      logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
                    new_net_id)
      new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
                              logical_id=new_alone_id,
                              children=dev.children,
                              size=dev.size)
      try:
        _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
                              _GetInstanceInfoText(self.instance), False)
      except errors.GenericError:
        self.cfg.ReleaseDRBDMinors(self.instance.name)
        raise

    # We have new devices, shutdown the drbd on the old secondary
    for idx, dev in enumerate(self.instance.disks):
      self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
      self.cfg.SetDiskID(dev, self.target_node)
      msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
      if msg:
        self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
                           " node: %s" % (idx, msg),
                           hint=("Please cleanup this device manually as"
                                 " soon as possible"))

    self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
    result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
                                               self.node_secondary_ip,
                                               self.instance.disks)\
                                               [self.instance.primary_node]

    msg = result.fail_msg
    if msg:
      # detaches didn't succeed (unlikely)
      self.cfg.ReleaseDRBDMinors(self.instance.name)
      raise errors.OpExecError("Can't detach the disks from the network on"
                               " old node: %s" % (msg,))

    # if we managed to detach at least one, we update all the disks of
    # the instance to point to the new secondary
    self.lu.LogInfo("Updating instance configuration")
    for dev, _, new_logical_id in iv_names.itervalues():
      dev.logical_id = new_logical_id
      self.cfg.SetDiskID(dev, self.instance.primary_node)

    self.cfg.Update(self.instance, feedback_fn)

    # and now perform the drbd attach
    self.lu.LogInfo("Attaching primary drbds to new secondary"
                    " (standalone => connected)")
    result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
                                            self.new_node],
                                           self.node_secondary_ip,
                                           self.instance.disks,
                                           self.instance.name,
                                           False)
    for to_node, to_result in result.items():
      msg = to_result.fail_msg
      if msg:
        self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
                           to_node, msg,
                           hint=("please do a gnt-instance info to see the"
                                 " status of disks"))
    cstep = 5
    if self.early_release:
      self.lu.LogStep(cstep, steps_total, "Removing old storage")
      cstep += 1
      self._RemoveOldStorage(self.target_node, iv_names)
      # WARNING: we release all node locks here, do not do other RPCs
      # than WaitForSync to the primary node
      self._ReleaseNodeLock([self.instance.primary_node,
                             self.target_node,
                             self.new_node])

    # Wait for sync
    # This can fail as the old devices are degraded and _WaitForSync
    # does a combined result over all disks, so we don't check its return value
    self.lu.LogStep(cstep, steps_total, "Sync devices")
    cstep += 1
    _WaitForSync(self.lu, self.instance)

    # Check all devices manually
    self._CheckDevices(self.instance.primary_node, iv_names)

    # Step: remove old storage
    if not self.early_release:
      self.lu.LogStep(cstep, steps_total, "Removing old storage")
      self._RemoveOldStorage(self.target_node, iv_names)
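
# Usage sketch (illustrative comment, values are hypothetical): changing the
# secondary of an instance via an iallocator script boils down to submitting
# an opcode along these lines, which mcpu dispatches to LUReplaceDisks and
# thus to the TLReplaceDisks tasklet above:
#
#   op = opcodes.OpReplaceDisks(instance_name="instance1.example.com",
#                               mode=constants.REPLACE_DISK_CHG, disks=[],
#                               remote_node=None, iallocator="hail",
#                               early_release=False)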


class LURepairNodeStorage(NoHooksLU):
  """Repairs the volume group on a node.

  """
  _OP_PARAMS = [
    _PNodeName,
    ("storage_type", ht.NoDefault, _CheckStorageType),
    ("name", ht.NoDefault, ht.TNonEmptyString),
    ("ignore_consistency", False, ht.TBool),
    ]
  REQ_BGL = False

  def CheckArguments(self):
    self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)

    storage_type = self.op.storage_type

    if (constants.SO_FIX_CONSISTENCY not in
        constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
      raise errors.OpPrereqError("Storage units of type '%s' can not be"
                                 " repaired" % storage_type,
                                 errors.ECODE_INVAL)

  def ExpandNames(self):
    self.needed_locks = {
      locking.LEVEL_NODE: [self.op.node_name],
      }

  def _CheckFaultyDisks(self, instance, node_name):
    """Ensure faulty disks abort the opcode or at least warn."""
    try:
      if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
                                  node_name, True):
        raise errors.OpPrereqError("Instance '%s' has faulty disks on"
                                   " node '%s'" % (instance.name, node_name),
                                   errors.ECODE_STATE)
    except errors.OpPrereqError, err:
      if self.op.ignore_consistency:
        self.proc.LogWarning(str(err.args[0]))
      else:
        raise

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Check whether any instance on this node has faulty disks
    for inst in _GetNodeInstances(self.cfg, self.op.node_name):
      if not inst.admin_up:
        continue
      check_nodes = set(inst.all_nodes)
      check_nodes.discard(self.op.node_name)
      for inst_node_name in check_nodes:
        self._CheckFaultyDisks(inst, inst_node_name)

  def Exec(self, feedback_fn):
    feedback_fn("Repairing storage unit '%s' on %s ..." %
                (self.op.name, self.op.node_name))

    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
    result = self.rpc.call_storage_execute(self.op.node_name,
                                           self.op.storage_type, st_args,
                                           self.op.name,
                                           constants.SO_FIX_CONSISTENCY)
    result.Raise("Failed to repair storage unit '%s' on %s" %
                 (self.op.name, self.op.node_name))


class LUNodeEvacuationStrategy(NoHooksLU):
  """Computes the node evacuation strategy.

  """
  _OP_PARAMS = [
    ("nodes", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
    ("remote_node", None, ht.TMaybeString),
    ("iallocator", None, ht.TMaybeString),
    ]
  REQ_BGL = False

  def CheckArguments(self):
    _CheckIAllocatorOrNode(self, "iallocator", "remote_node")

  def ExpandNames(self):
    self.op.nodes = _GetWantedNodes(self, self.op.nodes)
    self.needed_locks = locks = {}
    if self.op.remote_node is None:
      locks[locking.LEVEL_NODE] = locking.ALL_SET
    else:
      self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
      locks[locking.LEVEL_NODE] = self.op.nodes + [self.op.remote_node]

  def Exec(self, feedback_fn):
    if self.op.remote_node is not None:
      instances = []
      for node in self.op.nodes:
        instances.extend(_GetNodeSecondaryInstances(self.cfg, node))
      result = []
      for i in instances:
        if i.primary_node == self.op.remote_node:
          raise errors.OpPrereqError("Node %s is the primary node of"
                                     " instance %s, cannot use it as"
                                     " secondary" %
                                     (self.op.remote_node, i.name),
                                     errors.ECODE_INVAL)
        result.append([i.name, self.op.remote_node])
    else:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=constants.IALLOCATOR_MODE_MEVAC,
                       evac_nodes=self.op.nodes)
      ial.Run(self.op.iallocator, validate=True)
      if not ial.success:
        raise errors.OpExecError("No valid evacuation solution: %s" % ial.info,
                                 errors.ECODE_NORES)
      result = ial.result
    return result
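
  # Clarifying note (added comment): the value returned above is a list of
  # [instance_name, new_secondary_node] pairs, built directly when a remote
  # node was given, or taken verbatim from the iallocator's MEVAC result.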


class LUGrowDisk(LogicalUnit):
  """Grow a disk of an instance.

  """
  HPATH = "disk-grow"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_PARAMS = [
    _PInstanceName,
    ("disk", ht.NoDefault, ht.TInt),
    ("amount", ht.NoDefault, ht.TInt),
    ("wait_for_sync", True, ht.TBool),
    ]
  REQ_BGL = False

  def ExpandNames(self):
    self._ExpandAndLockInstance()
    self.needed_locks[locking.LEVEL_NODE] = []
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, the primary and all the secondaries.

    """
    env = {
      "DISK": self.op.disk,
      "AMOUNT": self.op.amount,
      }
    env.update(_BuildInstanceHookEnvByObject(self, self.instance))
    nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
    return env, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance is in the cluster.

    """
    instance = self.cfg.GetInstanceInfo(self.op.instance_name)
    assert instance is not None, \
      "Cannot retrieve locked instance %s" % self.op.instance_name
    nodenames = list(instance.all_nodes)
    for node in nodenames:
      _CheckNodeOnline(self, node)

    self.instance = instance

    if instance.disk_template not in constants.DTS_GROWABLE:
      raise errors.OpPrereqError("Instance's disk layout does not support"
                                 " growing.", errors.ECODE_INVAL)

    self.disk = instance.FindDisk(self.op.disk)

    if instance.disk_template != constants.DT_FILE:
      # TODO: check the free disk space for file, when that feature will be
      # supported
      _CheckNodesFreeDisk(self, nodenames, self.op.amount)

  def Exec(self, feedback_fn):
    """Execute disk grow.

    """
    instance = self.instance
    disk = self.disk

    disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
    if not disks_ok:
      raise errors.OpExecError("Cannot activate block device to grow")

    for node in instance.all_nodes:
      self.cfg.SetDiskID(disk, node)
      result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
      result.Raise("Grow request failed to node %s" % node)

      # TODO: Rewrite code to work properly
      # DRBD goes into sync mode for a short amount of time after executing the
      # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
      # calling "resize" in sync mode fails. Sleeping for a short amount of
      # time is a work-around.
      time.sleep(5)

    disk.RecordGrow(self.op.amount)
    self.cfg.Update(instance, feedback_fn)
    if self.op.wait_for_sync:
      disk_abort = not _WaitForSync(self, instance, disks=[disk])
      if disk_abort:
        self.proc.LogWarning("Warning: disk sync-ing has not returned a good"
                             " status.\nPlease check the instance.")
      if not instance.admin_up:
        _SafeShutdownInstanceDisks(self, instance, disks=[disk])
    elif not instance.admin_up:
      self.proc.LogWarning("Not shutting down the disk even if the instance is"
                           " not supposed to be running because no wait for"
                           " sync mode was requested.")


class LUQueryInstanceData(NoHooksLU):
  """Query runtime instance data.

  """
  _OP_PARAMS = [
    ("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ("static", False, ht.TBool),
    ]
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

    if self.op.instances:
      self.wanted_names = []
      for name in self.op.instances:
        full_name = _ExpandInstanceName(self.cfg, name)
        self.wanted_names.append(full_name)
      self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
    else:
      self.wanted_names = None
      self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET

    self.needed_locks[locking.LEVEL_NODE] = []
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()

  def CheckPrereq(self):
    """Check prerequisites.

    This only checks the optional instance list against the existing names.

    """
    if self.wanted_names is None:
      self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]

    self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
                             in self.wanted_names]

  def _ComputeBlockdevStatus(self, node, instance_name, dev):
    """Returns the status of a block device

    """
    if self.op.static or not node:
      return None

    self.cfg.SetDiskID(dev, node)

    result = self.rpc.call_blockdev_find(node, dev)
    if result.offline:
      return None

    result.Raise("Can't compute disk status for %s" % instance_name)

    status = result.payload
    if status is None:
      return None

    return (status.dev_path, status.major, status.minor,
            status.sync_percent, status.estimated_time,
            status.is_degraded, status.ldisk_status)

  def _ComputeDiskStatus(self, instance, snode, dev):
    """Compute block device status.

    """
    if dev.dev_type in constants.LDS_DRBD:
      # we change the snode then (otherwise we use the one passed in)
      if dev.logical_id[0] == instance.primary_node:
        snode = dev.logical_id[1]
      else:
        snode = dev.logical_id[0]

    dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
                                              instance.name, dev)
    dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)

    if dev.children:
      dev_children = [self._ComputeDiskStatus(instance, snode, child)
                      for child in dev.children]
    else:
      dev_children = []

    data = {
      "iv_name": dev.iv_name,
      "dev_type": dev.dev_type,
      "logical_id": dev.logical_id,
      "physical_id": dev.physical_id,
      "pstatus": dev_pstatus,
      "sstatus": dev_sstatus,
      "children": dev_children,
      "mode": dev.mode,
      "size": dev.size,
      }

    return data

  def Exec(self, feedback_fn):
    """Gather and return data"""
    result = {}

    cluster = self.cfg.GetClusterInfo()

    for instance in self.wanted_instances:
      if not self.op.static:
        remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                  instance.name,
                                                  instance.hypervisor)
        remote_info.Raise("Error checking node %s" % instance.primary_node)
        remote_info = remote_info.payload
        if remote_info and "state" in remote_info:
          remote_state = "up"
        else:
          remote_state = "down"
      else:
        remote_state = None
      if instance.admin_up:
        config_state = "up"
      else:
        config_state = "down"

      disks = [self._ComputeDiskStatus(instance, None, device)
               for device in instance.disks]

      idict = {
        "name": instance.name,
        "config_state": config_state,
        "run_state": remote_state,
        "pnode": instance.primary_node,
        "snodes": instance.secondary_nodes,
        "os": instance.os,
        # this happens to be the same format used for hooks
        "nics": _NICListToTuple(self, instance.nics),
        "disk_template": instance.disk_template,
        "disks": disks,
        "hypervisor": instance.hypervisor,
        "network_port": instance.network_port,
        "hv_instance": instance.hvparams,
        "hv_actual": cluster.FillHV(instance, skip_globals=True),
        "be_instance": instance.beparams,
        "be_actual": cluster.FillBE(instance),
        "os_instance": instance.osparams,
        "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
        "serial_no": instance.serial_no,
        "mtime": instance.mtime,
        "ctime": instance.ctime,
        "uuid": instance.uuid,
        }

      result[instance.name] = idict

    return result
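
  # Example result shape (illustrative values): one entry per wanted
  # instance, keyed by name, e.g.
  #   {"instance1.example.com": {"name": "instance1.example.com",
  #    "config_state": "up", "run_state": "up", "pnode": "node1", ...}}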


class LUSetInstanceParams(LogicalUnit):
  """Modifies an instance's parameters.

  """
  HPATH = "instance-modify"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_PARAMS = [
    _PInstanceName,
    ("nics", ht.EmptyList, ht.TList),
    ("disks", ht.EmptyList, ht.TList),
    ("beparams", ht.EmptyDict, ht.TDict),
    ("hvparams", ht.EmptyDict, ht.TDict),
    ("disk_template", None, ht.TMaybeString),
    ("remote_node", None, ht.TMaybeString),
    ("os_name", None, ht.TMaybeString),
    ("force_variant", False, ht.TBool),
    ("osparams", None, ht.TOr(ht.TDict, ht.TNone)),
    _PForce,
    ]
  REQ_BGL = False

  def CheckArguments(self):
    if not (self.op.nics or self.op.disks or self.op.disk_template or
            self.op.hvparams or self.op.beparams or self.op.os_name):
      raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)

    if self.op.hvparams:
      _CheckGlobalHvParams(self.op.hvparams)

    # Disk validation
    disk_addremove = 0
    for disk_op, disk_dict in self.op.disks:
      utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
      if disk_op == constants.DDM_REMOVE:
        disk_addremove += 1
        continue
      elif disk_op == constants.DDM_ADD:
        disk_addremove += 1
      else:
        if not isinstance(disk_op, int):
          raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
        if not isinstance(disk_dict, dict):
          msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
          raise errors.OpPrereqError(msg, errors.ECODE_INVAL)

      if disk_op == constants.DDM_ADD:
        mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
        if mode not in constants.DISK_ACCESS_SET:
          raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
                                     errors.ECODE_INVAL)
        size = disk_dict.get('size', None)
        if size is None:
          raise errors.OpPrereqError("Required disk parameter size missing",
                                     errors.ECODE_INVAL)
        try:
          size = int(size)
        except (TypeError, ValueError), err:
          raise errors.OpPrereqError("Invalid disk size parameter: %s" %
                                     str(err), errors.ECODE_INVAL)
        disk_dict['size'] = size
      else:
        # modification of disk
        if 'size' in disk_dict:
          raise errors.OpPrereqError("Disk size change not possible, use"
                                     " grow-disk", errors.ECODE_INVAL)

    if disk_addremove > 1:
      raise errors.OpPrereqError("Only one disk add or remove operation"
                                 " supported at a time", errors.ECODE_INVAL)

    if self.op.disks and self.op.disk_template is not None:
      raise errors.OpPrereqError("Disk template conversion and other disk"
                                 " changes not supported at the same time",
                                 errors.ECODE_INVAL)

    if self.op.disk_template:
      _CheckDiskTemplate(self.op.disk_template)
      if (self.op.disk_template in constants.DTS_NET_MIRROR and
          self.op.remote_node is None):
        raise errors.OpPrereqError("Changing the disk template to a mirrored"
                                   " one requires specifying a secondary node",
                                   errors.ECODE_INVAL)

    # NIC validation
    nic_addremove = 0
    for nic_op, nic_dict in self.op.nics:
      utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
      if nic_op == constants.DDM_REMOVE:
        nic_addremove += 1
        continue
      elif nic_op == constants.DDM_ADD:
        nic_addremove += 1
      else:
        if not isinstance(nic_op, int):
          raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
        if not isinstance(nic_dict, dict):
          msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
          raise errors.OpPrereqError(msg, errors.ECODE_INVAL)

      # nic_dict should be a dict
      nic_ip = nic_dict.get('ip', None)
      if nic_ip is not None:
        if nic_ip.lower() == constants.VALUE_NONE:
          nic_dict['ip'] = None
        else:
          if not netutils.IPAddress.IsValid(nic_ip):
            raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
                                       errors.ECODE_INVAL)

      nic_bridge = nic_dict.get('bridge', None)
      nic_link = nic_dict.get('link', None)
      if nic_bridge and nic_link:
        raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
                                   " at the same time", errors.ECODE_INVAL)
      elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
        nic_dict['bridge'] = None
      elif nic_link and nic_link.lower() == constants.VALUE_NONE:
        nic_dict['link'] = None

      if nic_op == constants.DDM_ADD:
        nic_mac = nic_dict.get('mac', None)
        if nic_mac is None:
          nic_dict['mac'] = constants.VALUE_AUTO

      if 'mac' in nic_dict:
        nic_mac = nic_dict['mac']
        if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
          nic_mac = utils.NormalizeAndValidateMac(nic_mac)

        if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
          raise errors.OpPrereqError("'auto' is not a valid MAC address when"
                                     " modifying an existing nic",
                                     errors.ECODE_INVAL)

    if nic_addremove > 1:
      raise errors.OpPrereqError("Only one NIC add or remove operation"
                                 " supported at a time", errors.ECODE_INVAL)

  def ExpandNames(self):
    self._ExpandAndLockInstance()
    self.needed_locks[locking.LEVEL_NODE] = []
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE

  def DeclareLocks(self, level):
    if level == locking.LEVEL_NODE:
      self._LockInstancesNodes()
      if self.op.disk_template and self.op.remote_node:
        self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
        self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)

  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, primary and secondaries.

    """
    args = dict()
    if constants.BE_MEMORY in self.be_new:
      args['memory'] = self.be_new[constants.BE_MEMORY]
    if constants.BE_VCPUS in self.be_new:
      args['vcpus'] = self.be_new[constants.BE_VCPUS]
    # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
    # information at all.
    if self.op.nics:
      args['nics'] = []
      nic_override = dict(self.op.nics)
      for idx, nic in enumerate(self.instance.nics):
        if idx in nic_override:
          this_nic_override = nic_override[idx]
        else:
          this_nic_override = {}
        if 'ip' in this_nic_override:
          ip = this_nic_override['ip']
        else:
          ip = nic.ip
        if 'mac' in this_nic_override:
          mac = this_nic_override['mac']
        else:
          mac = nic.mac
        if idx in self.nic_pnew:
          nicparams = self.nic_pnew[idx]
        else:
          nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
        mode = nicparams[constants.NIC_MODE]
        link = nicparams[constants.NIC_LINK]
        args['nics'].append((ip, mac, mode, link))
      if constants.DDM_ADD in nic_override:
        ip = nic_override[constants.DDM_ADD].get('ip', None)
        mac = nic_override[constants.DDM_ADD]['mac']
        nicparams = self.nic_pnew[constants.DDM_ADD]
        mode = nicparams[constants.NIC_MODE]
        link = nicparams[constants.NIC_LINK]
        args['nics'].append((ip, mac, mode, link))
      elif constants.DDM_REMOVE in nic_override:
        del args['nics'][-1]

    env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
    if self.op.disk_template:
      env["NEW_DISK_TEMPLATE"] = self.op.disk_template
    nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
    return env, nl

  def CheckPrereq(self):
    """Check prerequisites.

    This only checks the instance list against the existing names.

    """
    # checking the new params on the primary/secondary nodes

    instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
    cluster = self.cluster = self.cfg.GetClusterInfo()
    assert self.instance is not None, \
      "Cannot retrieve locked instance %s" % self.op.instance_name
    pnode = instance.primary_node
    nodelist = list(instance.all_nodes)

    # OS change
    if self.op.os_name and not self.op.force:
      _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
                      self.op.force_variant)
      instance_os = self.op.os_name
    else:
      instance_os = instance.os

    if self.op.disk_template:
      if instance.disk_template == self.op.disk_template:
        raise errors.OpPrereqError("Instance already has disk template %s" %
                                   instance.disk_template, errors.ECODE_INVAL)

      if (instance.disk_template,
          self.op.disk_template) not in self._DISK_CONVERSIONS:
        raise errors.OpPrereqError("Unsupported disk template conversion from"
                                   " %s to %s" % (instance.disk_template,
                                                  self.op.disk_template),
                                   errors.ECODE_INVAL)
      _CheckInstanceDown(self, instance, "cannot change disk template")
      if self.op.disk_template in constants.DTS_NET_MIRROR:
        if self.op.remote_node == pnode:
          raise errors.OpPrereqError("Given new secondary node %s is the same"
                                     " as the primary node of the instance" %
                                     self.op.remote_node, errors.ECODE_STATE)
        _CheckNodeOnline(self, self.op.remote_node)
        _CheckNodeNotDrained(self, self.op.remote_node)
        disks = [{"size": d.size} for d in instance.disks]
        required = _ComputeDiskSize(self.op.disk_template, disks)
        _CheckNodesFreeDisk(self, [self.op.remote_node], required)

    # hvparams processing
    if self.op.hvparams:
      hv_type = instance.hypervisor
      i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
      utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
      hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)

      # local check
      hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
      _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
      self.hv_new = hv_new # the new actual values
      self.hv_inst = i_hvdict # the new dict (without defaults)
    else:
      self.hv_new = self.hv_inst = {}

    # beparams processing
    if self.op.beparams:
      i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
                                   use_none=True)
      utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
      be_new = cluster.SimpleFillBE(i_bedict)
      self.be_new = be_new # the new actual values
      self.be_inst = i_bedict # the new dict (without defaults)
    else:
      self.be_new = self.be_inst = {}

    # osparams processing
    if self.op.osparams:
      i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
      _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
      self.os_inst = i_osdict # the new dict (without defaults)
    else:
      self.os_inst = {}

    self.warn = []

    if constants.BE_MEMORY in self.op.beparams and not self.op.force:
      mem_check_list = [pnode]
      if be_new[constants.BE_AUTO_BALANCE]:
        # either we changed auto_balance to yes or it was from before
        mem_check_list.extend(instance.secondary_nodes)
      instance_info = self.rpc.call_instance_info(pnode, instance.name,
                                                  instance.hypervisor)
      nodeinfo = self.rpc.call_node_info(mem_check_list, self.cfg.GetVGName(),
                                         instance.hypervisor)
      pninfo = nodeinfo[pnode]
      msg = pninfo.fail_msg
      if msg:
        # Assume the primary node is unreachable and go ahead
        self.warn.append("Can't get info from primary node %s: %s" %
                         (pnode, msg))
      elif not isinstance(pninfo.payload.get('memory_free', None), int):
        self.warn.append("Node data from primary node %s doesn't contain"
                         " free memory information" % pnode)
      elif instance_info.fail_msg:
        self.warn.append("Can't get instance runtime information: %s" %
                         instance_info.fail_msg)
      else:
        if instance_info.payload:
          current_mem = int(instance_info.payload['memory'])
        else:
          # Assume instance not running
          # (there is a slight race condition here, but it's not very probable,
          # and we have no other way to check)
          current_mem = 0
        miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
                    pninfo.payload['memory_free'])
        if miss_mem > 0:
          raise errors.OpPrereqError("This change will prevent the instance"
                                     " from starting, due to %d MB of memory"
                                     " missing on its primary node" % miss_mem,
                                     errors.ECODE_NORES)

      if be_new[constants.BE_AUTO_BALANCE]:
        for node, nres in nodeinfo.items():
          if node not in instance.secondary_nodes:
            continue
          msg = nres.fail_msg
          if msg:
            self.warn.append("Can't get info from secondary node %s: %s" %
                             (node, msg))
          elif not isinstance(nres.payload.get('memory_free', None), int):
            self.warn.append("Secondary node %s didn't return free"
                             " memory information" % node)
          elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
            self.warn.append("Not enough memory to failover instance to"
                             " secondary node %s" % node)

    # NIC processing
    self.nic_pnew = {}
    self.nic_pinst = {}
    for nic_op, nic_dict in self.op.nics:
      if nic_op == constants.DDM_REMOVE:
        if not instance.nics:
          raise errors.OpPrereqError("Instance has no NICs, cannot remove",
                                     errors.ECODE_INVAL)
        continue
      if nic_op != constants.DDM_ADD:
        # an existing nic
        if not instance.nics:
          raise errors.OpPrereqError("Invalid NIC index %s, instance has"
                                     " no NICs" % nic_op,
                                     errors.ECODE_INVAL)
        if nic_op < 0 or nic_op >= len(instance.nics):
          raise errors.OpPrereqError("Invalid NIC index %s, valid values"
                                     " are 0 to %d" %
                                     (nic_op, len(instance.nics) - 1),
                                     errors.ECODE_INVAL)
        old_nic_params = instance.nics[nic_op].nicparams
        old_nic_ip = instance.nics[nic_op].ip
      else:
        old_nic_params = {}
        old_nic_ip = None

      update_params_dict = dict([(key, nic_dict[key])
                                 for key in constants.NICS_PARAMETERS
                                 if key in nic_dict])

      if 'bridge' in nic_dict:
        update_params_dict[constants.NIC_LINK] = nic_dict['bridge']

      new_nic_params = _GetUpdatedParams(old_nic_params,
                                         update_params_dict)
      utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
      new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
      objects.NIC.CheckParameterSyntax(new_filled_nic_params)
      self.nic_pinst[nic_op] = new_nic_params
      self.nic_pnew[nic_op] = new_filled_nic_params
      new_nic_mode = new_filled_nic_params[constants.NIC_MODE]

      if new_nic_mode == constants.NIC_MODE_BRIDGED:
        nic_bridge = new_filled_nic_params[constants.NIC_LINK]
        msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
        if msg:
          msg = "Error checking bridges on node %s: %s" % (pnode, msg)
          if self.op.force:
            self.warn.append(msg)
          else:
            raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
      if new_nic_mode == constants.NIC_MODE_ROUTED:
        if 'ip' in nic_dict:
          nic_ip = nic_dict['ip']
        else:
          nic_ip = old_nic_ip
        if nic_ip is None:
          raise errors.OpPrereqError('Cannot set the nic ip to None'
                                     ' on a routed nic', errors.ECODE_INVAL)
      if 'mac' in nic_dict:
        nic_mac = nic_dict['mac']
        if nic_mac is None:
          raise errors.OpPrereqError('Cannot set the nic mac to None',
                                     errors.ECODE_INVAL)
        elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
          # otherwise generate the mac address
          nic_dict['mac'] = self.cfg.GenerateMAC(self.proc.GetECId())
        else:
          # or validate/reserve the current one
          try:
            self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
          except errors.ReservationError:
            raise errors.OpPrereqError("MAC address %s already in use"
                                       " in cluster" % nic_mac,
                                       errors.ECODE_NOTUNIQUE)

    # DISK processing
    if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
      raise errors.OpPrereqError("Disk operations not supported for"
                                 " diskless instances",
                                 errors.ECODE_INVAL)
    for disk_op, _ in self.op.disks:
      if disk_op == constants.DDM_REMOVE:
        if len(instance.disks) == 1:
          raise errors.OpPrereqError("Cannot remove the last disk of"
                                     " an instance", errors.ECODE_INVAL)
        _CheckInstanceDown(self, instance, "cannot remove disks")

      if (disk_op == constants.DDM_ADD and
          len(instance.disks) >= constants.MAX_DISKS):
        raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
                                   " add more" % constants.MAX_DISKS,
                                   errors.ECODE_STATE)
      if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
        # an existing disk
        if disk_op < 0 or disk_op >= len(instance.disks):
          raise errors.OpPrereqError("Invalid disk index %s, valid values"
                                     " are 0 to %d" %
                                     (disk_op, len(instance.disks)),
                                     errors.ECODE_INVAL)

    return

  def _ConvertPlainToDrbd(self, feedback_fn):
    """Converts an instance from plain to drbd.

    """
    feedback_fn("Converting template to drbd")
    instance = self.instance
    pnode = instance.primary_node
    snode = self.op.remote_node

    # create a fake disk info for _GenerateDiskTemplate
    disk_info = [{"size": d.size, "mode": d.mode} for d in instance.disks]
    new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
                                      instance.name, pnode, [snode],
                                      disk_info, None, None, 0)
    info = _GetInstanceInfoText(instance)
    feedback_fn("Creating additional volumes...")
    # first, create the missing data and meta devices
    for disk in new_disks:
      # unfortunately this is... not too nice
      _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
                            info, True)
      for child in disk.children:
        _CreateSingleBlockDev(self, snode, instance, child, info, True)
    # at this stage, all new LVs have been created, we can rename the
    # old ones
    feedback_fn("Renaming original volumes...")
    rename_list = [(o, n.children[0].logical_id)
                   for (o, n) in zip(instance.disks, new_disks)]
    result = self.rpc.call_blockdev_rename(pnode, rename_list)
    result.Raise("Failed to rename original LVs")

    feedback_fn("Initializing DRBD devices...")
    # all child devices are in place, we can now create the DRBD devices
    for disk in new_disks:
      for node in [pnode, snode]:
        f_create = node == pnode
        _CreateSingleBlockDev(self, node, instance, disk, info, f_create)

    # at this point, the instance has been modified
    instance.disk_template = constants.DT_DRBD8
    instance.disks = new_disks
    self.cfg.Update(instance, feedback_fn)

    # disks are created, waiting for sync
    disk_abort = not _WaitForSync(self, instance)
    if disk_abort:
      raise errors.OpExecError("There are some degraded disks for"
                               " this instance, please cleanup manually")

  def _ConvertDrbdToPlain(self, feedback_fn):
    """Converts an instance from drbd to plain.

    """
    instance = self.instance
    assert len(instance.secondary_nodes) == 1
    pnode = instance.primary_node
    snode = instance.secondary_nodes[0]
    feedback_fn("Converting template to plain")

    old_disks = instance.disks
    new_disks = [d.children[0] for d in old_disks]

    # copy over size and mode
    for parent, child in zip(old_disks, new_disks):
      child.size = parent.size
      child.mode = parent.mode

    # update instance structure
    instance.disks = new_disks
    instance.disk_template = constants.DT_PLAIN
    self.cfg.Update(instance, feedback_fn)

    feedback_fn("Removing volumes on the secondary node...")
    for disk in old_disks:
      self.cfg.SetDiskID(disk, snode)
      msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
      if msg:
        self.LogWarning("Could not remove block device %s on node %s,"
                        " continuing anyway: %s", disk.iv_name, snode, msg)

    feedback_fn("Removing unneeded volumes on the primary node...")
    for idx, disk in enumerate(old_disks):
      meta = disk.children[1]
      self.cfg.SetDiskID(meta, pnode)
      msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
      if msg:
        self.LogWarning("Could not remove metadata for disk %d on node %s,"
                        " continuing anyway: %s", idx, pnode, msg)

  def Exec(self, feedback_fn):
    """Modifies an instance.

    All parameters take effect only at the next restart of the instance.

    """
    # Process here the warnings from CheckPrereq, as we don't have a
    # feedback_fn there.
    for warn in self.warn:
      feedback_fn("WARNING: %s" % warn)

    result = []
    instance = self.instance
    # disk changes
    for disk_op, disk_dict in self.op.disks:
      if disk_op == constants.DDM_REMOVE:
        # remove the last disk
        device = instance.disks.pop()
        device_idx = len(instance.disks)
        for node, disk in device.ComputeNodeTree(instance.primary_node):
          self.cfg.SetDiskID(disk, node)
          msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
          if msg:
            self.LogWarning("Could not remove disk/%d on node %s: %s,"
                            " continuing anyway", device_idx, node, msg)
        result.append(("disk/%d" % device_idx, "remove"))
      elif disk_op == constants.DDM_ADD:
        # add a new disk
        if instance.disk_template == constants.DT_FILE:
          file_driver, file_path = instance.disks[0].logical_id
          file_path = os.path.dirname(file_path)
        else:
          file_driver = file_path = None
        disk_idx_base = len(instance.disks)
        new_disk = _GenerateDiskTemplate(self,
                                         instance.disk_template,
                                         instance.name, instance.primary_node,
                                         instance.secondary_nodes,
                                         [disk_dict],
                                         file_path,
                                         file_driver,
                                         disk_idx_base)[0]
        instance.disks.append(new_disk)
        info = _GetInstanceInfoText(instance)

        logging.info("Creating volume %s for instance %s",
                     new_disk.iv_name, instance.name)
        # Note: this needs to be kept in sync with _CreateDisks
        #HARDCODE
        for node in instance.all_nodes:
          f_create = node == instance.primary_node
          try:
            _CreateBlockDev(self, node, instance, new_disk,
                            f_create, info, f_create)
          except errors.OpExecError, err:
            self.LogWarning("Failed to create volume %s (%s) on"
                            " node %s: %s",
                            new_disk.iv_name, new_disk, node, err)
        result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
                       (new_disk.size, new_disk.mode)))
      else:
        # change a given disk
        instance.disks[disk_op].mode = disk_dict['mode']
        result.append(("disk.mode/%d" % disk_op, disk_dict['mode']))

    if self.op.disk_template:
      r_shut = _ShutdownInstanceDisks(self, instance)
      if not r_shut:
        raise errors.OpExecError("Cannot shutdown instance disks, unable to"
                                 " proceed with disk template conversion")
      mode = (instance.disk_template, self.op.disk_template)
      try:
        self._DISK_CONVERSIONS[mode](self, feedback_fn)
      except:
        self.cfg.ReleaseDRBDMinors(instance.name)
        raise
      result.append(("disk_template", self.op.disk_template))

    # NIC changes
    for nic_op, nic_dict in self.op.nics:
      if nic_op == constants.DDM_REMOVE:
        # remove the last nic
        del instance.nics[-1]
        result.append(("nic.%d" % len(instance.nics), "remove"))
      elif nic_op == constants.DDM_ADD:
        # mac and bridge should be set, by now
        mac = nic_dict['mac']
        ip = nic_dict.get('ip', None)
        nicparams = self.nic_pinst[constants.DDM_ADD]
        new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
        instance.nics.append(new_nic)
        result.append(("nic.%d" % (len(instance.nics) - 1),
                       "add:mac=%s,ip=%s,mode=%s,link=%s" %
                       (new_nic.mac, new_nic.ip,
                        self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
                        self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
                       )))
      else:
        for key in 'mac', 'ip':
          if key in nic_dict:
            setattr(instance.nics[nic_op], key, nic_dict[key])
        if nic_op in self.nic_pinst:
          instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
        for key, val in nic_dict.iteritems():
          result.append(("nic.%s/%d" % (key, nic_op), val))

    # hvparams changes
    if self.op.hvparams:
      instance.hvparams = self.hv_inst
      for key, val in self.op.hvparams.iteritems():
        result.append(("hv/%s" % key, val))

    # beparams changes
    if self.op.beparams:
      instance.beparams = self.be_inst
      for key, val in self.op.beparams.iteritems():
        result.append(("be/%s" % key, val))

    # OS change
    if self.op.os_name:
      instance.os = self.op.os_name

    # osparams changes
    if self.op.osparams:
      instance.osparams = self.os_inst
      for key, val in self.op.osparams.iteritems():
        result.append(("os/%s" % key, val))

    self.cfg.Update(instance, feedback_fn)

    return result

  _DISK_CONVERSIONS = {
    (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
    (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
    }
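
  # Clarifying note (added comment): _DISK_CONVERSIONS is keyed by
  # (current_template, new_template). Because it is defined at class level
  # after the converter methods, it stores plain function objects, which is
  # why Exec calls self._DISK_CONVERSIONS[mode](self, feedback_fn) and passes
  # "self" explicitly.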


class LUQueryExports(NoHooksLU):
  """Query the exports list

  """
  _OP_PARAMS = [
    ("nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ("use_locking", False, ht.TBool),
    ]
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}
    self.share_locks[locking.LEVEL_NODE] = 1
    if not self.op.nodes:
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
    else:
      self.needed_locks[locking.LEVEL_NODE] = \
        _GetWantedNodes(self, self.op.nodes)

  def Exec(self, feedback_fn):
    """Compute the list of all the exported system images.

    @rtype: dict
    @return: a dictionary with the structure node->(export-list)
        where export-list is a list of the instances exported on
        that node.

    """
    self.nodes = self.acquired_locks[locking.LEVEL_NODE]
    rpcresult = self.rpc.call_export_list(self.nodes)
    result = {}
    for node in rpcresult:
      if rpcresult[node].fail_msg:
        result[node] = False
      else:
        result[node] = rpcresult[node].payload

    return result
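
  # Example result (illustrative): {"node1.example.com": ["instance1"],
  # "node2.example.com": False} -- a value of False marks a node whose
  # export list could not be fetched.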
9329 class LUPrepareExport(NoHooksLU):
9330 """Prepares an instance for an export and returns useful information.
9335 ("mode", ht.NoDefault, ht.TElemOf(constants.EXPORT_MODES)),
9339 def ExpandNames(self):
9340 self._ExpandAndLockInstance()
9342 def CheckPrereq(self):
9343 """Check prerequisites.
9346 instance_name = self.op.instance_name
9348 self.instance = self.cfg.GetInstanceInfo(instance_name)
9349 assert self.instance is not None, \
9350 "Cannot retrieve locked instance %s" % self.op.instance_name
9351 _CheckNodeOnline(self, self.instance.primary_node)
9353 self._cds = _GetClusterDomainSecret()
  def Exec(self, feedback_fn):
    """Prepares an instance for an export.

    """
    instance = self.instance

    if self.op.mode == constants.EXPORT_MODE_REMOTE:
      salt = utils.GenerateSecret(8)

      feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
      result = self.rpc.call_x509_cert_create(instance.primary_node,
                                              constants.RIE_CERT_VALIDITY)
      result.Raise("Can't create X509 key and certificate on %s" % result.node)

      (name, cert_pem) = result.payload

      cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                             cert_pem)

      return {
        "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
        "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
                          salt),
        "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
        }

    return None
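
# A minimal sketch (hypothetical helper name) of how the signed X509 key name
# returned above is meant to be checked later: the same
# utils.Sha1Hmac/utils.VerifySha1Hmac pairing is used by
# LUExportInstance.CheckPrereq below, with the cluster domain secret as key.
def _ExampleVerifyX509KeyName(cds, x509_key_name):
  """Verify a (name, hmac, salt) tuple as produced by LUPrepareExport."""
  (name, hmac_digest, hmac_salt) = x509_key_name
  return utils.VerifySha1Hmac(cds, name, hmac_digest, salt=hmac_salt)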
class LUExportInstance(LogicalUnit):
  """Export an instance to an image in the cluster.

  """
  HPATH = "instance-export"
  HTYPE = constants.HTYPE_INSTANCE
  _OP_PARAMS = [
    _PInstanceName,
    ("target_node", ht.NoDefault, ht.TOr(ht.TNonEmptyString, ht.TList)),
    ("shutdown", True, ht.TBool),
    _PShutdownTimeout,
    ("remove_instance", False, ht.TBool),
    ("ignore_remove_failures", False, ht.TBool),
    ("mode", constants.EXPORT_MODE_LOCAL, ht.TElemOf(constants.EXPORT_MODES)),
    ("x509_key_name", None, ht.TOr(ht.TList, ht.TNone)),
    ("destination_x509_ca", None, ht.TMaybeString),
    ]
  REQ_BGL = False
  def CheckArguments(self):
    """Check the arguments.

    """
    self.x509_key_name = self.op.x509_key_name
    self.dest_x509_ca_pem = self.op.destination_x509_ca

    if self.op.remove_instance and not self.op.shutdown:
      raise errors.OpPrereqError("Can not remove instance without shutting it"
                                 " down before")

    if self.op.mode == constants.EXPORT_MODE_REMOTE:
      if not self.x509_key_name:
        raise errors.OpPrereqError("Missing X509 key name for encryption",
                                   errors.ECODE_INVAL)

      if not self.dest_x509_ca_pem:
        raise errors.OpPrereqError("Missing destination X509 CA",
                                   errors.ECODE_INVAL)
  def ExpandNames(self):
    self._ExpandAndLockInstance()

    # Lock all nodes for local exports
    if self.op.mode == constants.EXPORT_MODE_LOCAL:
      # FIXME: lock only instance primary and destination node
      #
      # Sad but true, for now we have to lock all nodes, as we don't know
      # where the previous export might be, and in this LU we search for it
      # and remove it from its current node. In the future we could fix this
      # by:
      #  - making a tasklet to search (share-lock all), then create the new
      #    one, then one to remove, after
      #  - removing the removal operation altogether
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET

  def DeclareLocks(self, level):
    """Last minute lock declaration."""
    # All nodes are locked anyway, so nothing to do here.
  def BuildHooksEnv(self):
    """Build hooks env.

    This will run on the master, primary node and target node.

    """
    env = {
      "EXPORT_MODE": self.op.mode,
      "EXPORT_NODE": self.op.target_node,
      "EXPORT_DO_SHUTDOWN": self.op.shutdown,
      "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
      # TODO: Generic function for boolean env variables
      "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
      }

    env.update(_BuildInstanceHookEnvByObject(self, self.instance))

    nl = [self.cfg.GetMasterNode(), self.instance.primary_node]

    if self.op.mode == constants.EXPORT_MODE_LOCAL:
      nl.append(self.op.target_node)

    return env, nl, nl
  def CheckPrereq(self):
    """Check prerequisites.

    This checks that the instance and node names are valid.

    """
    instance_name = self.op.instance_name

    self.instance = self.cfg.GetInstanceInfo(instance_name)
    assert self.instance is not None, \
          "Cannot retrieve locked instance %s" % self.op.instance_name
    _CheckNodeOnline(self, self.instance.primary_node)

    if self.op.mode == constants.EXPORT_MODE_LOCAL:
      self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
      self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
      assert self.dst_node is not None

      _CheckNodeOnline(self, self.dst_node.name)
      _CheckNodeNotDrained(self, self.dst_node.name)

      self._cds = None
      self.dest_disk_info = None
      self.dest_x509_ca = None
    elif self.op.mode == constants.EXPORT_MODE_REMOTE:
      self.dst_node = None

      if len(self.op.target_node) != len(self.instance.disks):
        raise errors.OpPrereqError(("Received destination information for %s"
                                    " disks, but instance %s has %s disks") %
                                   (len(self.op.target_node), instance_name,
                                    len(self.instance.disks)),
                                   errors.ECODE_INVAL)

      cds = _GetClusterDomainSecret()

      # Check X509 key name
      try:
        (key_name, hmac_digest, hmac_salt) = self.x509_key_name
      except (TypeError, ValueError), err:
        raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)

      if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
        raise errors.OpPrereqError("HMAC for X509 key name is wrong",
                                   errors.ECODE_INVAL)

      # Load and verify CA
      try:
        (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
      except OpenSSL.crypto.Error, err:
        raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
                                   (err, ), errors.ECODE_INVAL)

      (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
      if errcode is not None:
        raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
                                   (msg, ), errors.ECODE_INVAL)

      self.dest_x509_ca = cert

      # Verify target information
      disk_info = []
      for idx, disk_data in enumerate(self.op.target_node):
        try:
          (host, port, magic) = \
            masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
        except errors.GenericError, err:
          raise errors.OpPrereqError("Target info for disk %s: %s" %
                                     (idx, err), errors.ECODE_INVAL)

        disk_info.append((host, port, magic))

      assert len(disk_info) == len(self.op.target_node)
      self.dest_disk_info = disk_info

    else:
      raise errors.ProgrammerError("Unhandled export mode %r" %
                                   self.op.mode)
    # instance disk type verification
    # TODO: Implement export support for file-based disks
    for disk in self.instance.disks:
      if disk.dev_type == constants.LD_FILE:
        raise errors.OpPrereqError("Export not supported for instances with"
                                   " file-based disks", errors.ECODE_INVAL)
  def _CleanupExports(self, feedback_fn):
    """Removes exports of current instance from all other nodes.

    If an instance in a cluster with nodes A..D was exported to node C, its
    exports will be removed from the nodes A, B and D.

    """
    assert self.op.mode != constants.EXPORT_MODE_REMOTE

    nodelist = self.cfg.GetNodeList()
    nodelist.remove(self.dst_node.name)

    # on one-node clusters nodelist will be empty after the removal
    # if we proceed the backup would be removed because OpQueryExports
    # substitutes an empty list with the full cluster node list.
    iname = self.instance.name
    if nodelist:
      feedback_fn("Removing old exports for instance %s" % iname)
      exportlist = self.rpc.call_export_list(nodelist)
      for node in exportlist:
        if exportlist[node].fail_msg:
          continue
        if iname in exportlist[node].payload:
          msg = self.rpc.call_export_remove(node, iname).fail_msg
          if msg:
            self.LogWarning("Could not remove older export for instance %s"
                            " on node %s: %s", iname, node, msg)
  def Exec(self, feedback_fn):
    """Export an instance to an image in the cluster.

    """
    assert self.op.mode in constants.EXPORT_MODES

    instance = self.instance
    src_node = instance.primary_node

    if self.op.shutdown:
      # shutdown the instance, but not the disks
      feedback_fn("Shutting down instance %s" % instance.name)
      result = self.rpc.call_instance_shutdown(src_node, instance,
                                               self.op.shutdown_timeout)
      # TODO: Maybe ignore failures if ignore_remove_failures is set
      result.Raise("Could not shutdown instance %s on"
                   " node %s" % (instance.name, src_node))

    # set the disks ID correctly since call_instance_start needs the
    # correct drbd minor to create the symlinks
    for disk in instance.disks:
      self.cfg.SetDiskID(disk, src_node)

    activate_disks = (not instance.admin_up)

    if activate_disks:
      # Activate the instance disks if we're exporting a stopped instance
      feedback_fn("Activating disks for %s" % instance.name)
      _StartInstanceDisks(self, instance, None)

    try:
      helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
                                                     instance)

      helper.CreateSnapshots()
      try:
        if (self.op.shutdown and instance.admin_up and
            not self.op.remove_instance):
          assert not activate_disks
          feedback_fn("Starting instance %s" % instance.name)
          result = self.rpc.call_instance_start(src_node, instance, None, None)
          msg = result.fail_msg
          if msg:
            feedback_fn("Failed to start instance: %s" % msg)
            _ShutdownInstanceDisks(self, instance)
            raise errors.OpExecError("Could not start instance: %s" % msg)

        if self.op.mode == constants.EXPORT_MODE_LOCAL:
          (fin_resu, dresults) = helper.LocalExport(self.dst_node)
        elif self.op.mode == constants.EXPORT_MODE_REMOTE:
          connect_timeout = constants.RIE_CONNECT_TIMEOUT
          timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)

          (key_name, _, _) = self.x509_key_name

          dest_ca_pem = \
            OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                            self.dest_x509_ca)

          (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
                                                     key_name, dest_ca_pem,
                                                     timeouts)
      finally:
        helper.Cleanup()

      # Check for backwards compatibility
      assert len(dresults) == len(instance.disks)
      assert compat.all(isinstance(i, bool) for i in dresults), \
             "Not all results are boolean: %r" % dresults

    finally:
      if activate_disks:
        feedback_fn("Deactivating disks for %s" % instance.name)
        _ShutdownInstanceDisks(self, instance)

    if not (compat.all(dresults) and fin_resu):
      failures = []
      if not fin_resu:
        failures.append("export finalization")
      if not compat.all(dresults):
        fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
                               if not dsk)
        failures.append("disk export: disk(s) %s" % fdsk)

      raise errors.OpExecError("Export failed, errors in %s" %
                               utils.CommaJoin(failures))

    # At this point, the export was successful, we can cleanup/finish

    # Remove instance if requested
    if self.op.remove_instance:
      feedback_fn("Removing instance %s" % instance.name)
      _RemoveInstance(self, feedback_fn, instance,
                      self.op.ignore_remove_failures)

    if self.op.mode == constants.EXPORT_MODE_LOCAL:
      self._CleanupExports(feedback_fn)

    return fin_resu, dresults
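
# Illustrative only (hypothetical helper): LUExportInstance.Exec returns
# (fin_resu, dresults), the export finalization status plus one boolean per
# instance disk; a caller could summarize failed disks the same way the
# error path above builds its message.
def _ExampleFailedExportDisks(dresults):
  """Return the indexes of the disks whose export failed."""
  return [idx for (idx, dsk) in enumerate(dresults) if not dsk]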
class LURemoveExport(NoHooksLU):
  """Remove exports related to the named instance.

  """
  _OP_PARAMS = [
    _PInstanceName,
    ]
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}
    # We need all nodes to be locked in order for RemoveExport to work, but we
    # don't need to lock the instance itself, as nothing will happen to it (and
    # we can remove exports also for a removed instance)
    self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
  def Exec(self, feedback_fn):
    """Remove any export.

    """
    instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
    # If the instance was not found we'll try with the name that was passed in.
    # This will only work if it was an FQDN, though.
    fqdn_warn = False
    if not instance_name:
      fqdn_warn = True
      instance_name = self.op.instance_name

    locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
    exportlist = self.rpc.call_export_list(locked_nodes)
    found = False
    for node in exportlist:
      msg = exportlist[node].fail_msg
      if msg:
        self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
        continue
      if instance_name in exportlist[node].payload:
        found = True
        result = self.rpc.call_export_remove(node, instance_name)
        msg = result.fail_msg
        if msg:
          logging.error("Could not remove export for instance %s"
                        " on node %s: %s", instance_name, node, msg)

    if fqdn_warn and not found:
      feedback_fn("Export not found. If trying to remove an export belonging"
                  " to a deleted instance please use its Fully Qualified"
                  " Domain Name.")
class TagsLU(NoHooksLU): # pylint: disable-msg=W0223
  """Generic tags LU.

  This is an abstract class which is the parent of all the other tags LUs.

  """

  def ExpandNames(self):
    self.needed_locks = {}
    if self.op.kind == constants.TAG_NODE:
      self.op.name = _ExpandNodeName(self.cfg, self.op.name)
      self.needed_locks[locking.LEVEL_NODE] = self.op.name
    elif self.op.kind == constants.TAG_INSTANCE:
      self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
      self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name

    # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
    # not possible to acquire the BGL based on opcode parameters)

  def CheckPrereq(self):
    """Check prerequisites.

    """
    if self.op.kind == constants.TAG_CLUSTER:
      self.target = self.cfg.GetClusterInfo()
    elif self.op.kind == constants.TAG_NODE:
      self.target = self.cfg.GetNodeInfo(self.op.name)
    elif self.op.kind == constants.TAG_INSTANCE:
      self.target = self.cfg.GetInstanceInfo(self.op.name)
    else:
      raise errors.OpPrereqError("Wrong tag type requested (%s)" %
                                 str(self.op.kind), errors.ECODE_INVAL)
class LUGetTags(TagsLU):
  """Returns the tags of a given object.

  """
  _OP_PARAMS = [
    ("kind", ht.NoDefault, ht.TElemOf(constants.VALID_TAG_TYPES)),
    # Name is only meaningful for nodes and instances
    ("name", ht.NoDefault, ht.TMaybeString),
    ]
  REQ_BGL = False

  def ExpandNames(self):
    TagsLU.ExpandNames(self)

    # Share locks as this is only a read operation
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

  def Exec(self, feedback_fn):
    """Returns the tag list.

    """
    return list(self.target.GetTags())
class LUSearchTags(NoHooksLU):
  """Searches the tags for a given pattern.

  """
  _OP_PARAMS = [
    ("pattern", ht.NoDefault, ht.TNonEmptyString),
    ]
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}

  def CheckPrereq(self):
    """Check prerequisites.

    This checks the pattern passed for validity by compiling it.

    """
    try:
      self.re = re.compile(self.op.pattern)
    except re.error, err:
      raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
                                 (self.op.pattern, err), errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Returns the tag list.

    """
    cfg = self.cfg
    tgts = [("/cluster", cfg.GetClusterInfo())]
    ilist = cfg.GetAllInstancesInfo().values()
    tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
    nlist = cfg.GetAllNodesInfo().values()
    tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
    results = []
    for path, target in tgts:
      for tag in target.GetTags():
        if self.re.search(tag):
          results.append((path, tag))
    return results
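
# Illustrative only: LUSearchTags.Exec returns (path, tag) pairs for every
# tag matching the pattern; the names below are made up.
def _ExampleTagSearchResult():
  """Return a sample LUSearchTags result (hypothetical data)."""
  # e.g. for the pattern "^web" on a cluster with one matching instance tag
  return [("/instances/inst1.example.com", "webserver")]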
class LUAddTags(TagsLU):
  """Sets a tag on a given object.

  """
  _OP_PARAMS = [
    ("kind", ht.NoDefault, ht.TElemOf(constants.VALID_TAG_TYPES)),
    # Name is only meaningful for nodes and instances
    ("name", ht.NoDefault, ht.TMaybeString),
    ("tags", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
    ]
  REQ_BGL = False

  def CheckPrereq(self):
    """Check prerequisites.

    This checks the type and length of the tag name and value.

    """
    TagsLU.CheckPrereq(self)
    for tag in self.op.tags:
      objects.TaggableObject.ValidateTag(tag)

  def Exec(self, feedback_fn):
    """Sets the tag.

    """
    try:
      for tag in self.op.tags:
        self.target.AddTag(tag)
    except errors.TagError, err:
      raise errors.OpExecError("Error while setting tag: %s" % str(err))
    self.cfg.Update(self.target, feedback_fn)
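
# A small sketch (hypothetical helper) of the validation both LUAddTags and
# LUDelTags apply: each tag goes through objects.TaggableObject.ValidateTag,
# which raises errors.TagError for invalid names.
def _ExampleFindInvalidTags(tags):
  """Return the subset of tags that fail validation."""
  bad = []
  for tag in tags:
    try:
      objects.TaggableObject.ValidateTag(tag)
    except errors.TagError:
      bad.append(tag)
  return bad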
class LUDelTags(TagsLU):
  """Delete a list of tags from a given object.

  """
  _OP_PARAMS = [
    ("kind", ht.NoDefault, ht.TElemOf(constants.VALID_TAG_TYPES)),
    # Name is only meaningful for nodes and instances
    ("name", ht.NoDefault, ht.TMaybeString),
    ("tags", ht.NoDefault, ht.TListOf(ht.TNonEmptyString)),
    ]
  REQ_BGL = False

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that we have the given tag.

    """
    TagsLU.CheckPrereq(self)
    for tag in self.op.tags:
      objects.TaggableObject.ValidateTag(tag)
    del_tags = frozenset(self.op.tags)
    cur_tags = self.target.GetTags()

    diff_tags = del_tags - cur_tags
    if diff_tags:
      diff_names = ("'%s'" % i for i in sorted(diff_tags))
      raise errors.OpPrereqError("Tag(s) %s not found" %
                                 (utils.CommaJoin(diff_names), ),
                                 errors.ECODE_NOENT)

  def Exec(self, feedback_fn):
    """Remove the tag from the object.

    """
    for tag in self.op.tags:
      self.target.RemoveTag(tag)
    self.cfg.Update(self.target, feedback_fn)
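
# A tiny worked example (hypothetical helper) of the set arithmetic in
# LUDelTags.CheckPrereq above: tags requested for removal but absent from
# the object are reported instead of being silently ignored.
def _ExampleMissingTags(requested, current):
  """Return the requested tags that are not in the current tag set."""
  return frozenset(requested) - frozenset(current)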
class LUTestDelay(NoHooksLU):
  """Sleep for a specified amount of time.

  This LU sleeps on the master and/or nodes for a specified amount of
  time.

  """
  _OP_PARAMS = [
    ("duration", ht.NoDefault, ht.TFloat),
    ("on_master", True, ht.TBool),
    ("on_nodes", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ("repeat", 0, ht.TPositiveInt)
    ]
  REQ_BGL = False

  def ExpandNames(self):
    """Expand names and set required locks.

    This expands the node list, if any.

    """
    self.needed_locks = {}
    if self.op.on_nodes:
      # _GetWantedNodes can be used here, but is not always appropriate to use
      # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
      # more information.
      self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
      self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes

  def _TestDelay(self):
    """Do the actual sleep.

    """
    if self.op.on_master:
      if not utils.TestDelay(self.op.duration):
        raise errors.OpExecError("Error during master delay test")
    if self.op.on_nodes:
      result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
      for node, node_result in result.items():
        node_result.Raise("Failure during rpc call to node %s" % node)

  def Exec(self, feedback_fn):
    """Execute the test delay opcode, with the wanted repetitions.

    """
    if self.op.repeat == 0:
      self._TestDelay()
    else:
      top_value = self.op.repeat - 1
      for i in range(self.op.repeat):
        self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
        self._TestDelay()
class LUTestJobqueue(NoHooksLU):
  """Utility LU to test some aspects of the job queue.

  """
  _OP_PARAMS = [
    ("notify_waitlock", False, ht.TBool),
    ("notify_exec", False, ht.TBool),
    ("log_messages", ht.EmptyList, ht.TListOf(ht.TString)),
    ("fail", False, ht.TBool),
    ]
  REQ_BGL = False

  # Must be lower than default timeout for WaitForJobChange to see whether it
  # notices changed jobs
  _CLIENT_CONNECT_TIMEOUT = 20.0
  _CLIENT_CONFIRM_TIMEOUT = 60.0

  @classmethod
  def _NotifyUsingSocket(cls, cb, errcls):
    """Opens a Unix socket and waits for another program to connect.

    @type cb: callable
    @param cb: Callback to send socket name to client
    @type errcls: class
    @param errcls: Exception class to use for errors

    """
    # Using a temporary directory as there's no easy way to create temporary
    # sockets without writing a custom loop around tempfile.mktemp and
    # socket.bind
    tmpdir = tempfile.mkdtemp()
    try:
      tmpsock = utils.PathJoin(tmpdir, "sock")

      logging.debug("Creating temporary socket at %s", tmpsock)
      sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
      try:
        sock.bind(tmpsock)
        sock.listen(1)

        # Send details to client
        cb(tmpsock)

        # Wait for client to connect before continuing
        sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
        try:
          (conn, _) = sock.accept()
        except socket.error, err:
          raise errcls("Client didn't connect in time (%s)" % err)
      finally:
        sock.close()
    finally:
      # Remove as soon as client is connected
      shutil.rmtree(tmpdir)

    # Wait for client to close
    try:
      try:
        # pylint: disable-msg=E1101
        # Instance of '_socketobject' has no ... member
        conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
        conn.recv(1)
      except socket.error, err:
        raise errcls("Client failed to confirm notification (%s)" % err)
    finally:
      conn.close()
  def _SendNotification(self, test, arg, sockname):
    """Sends a notification to the client.

    @type test: string
    @param test: Test name
    @param arg: Test argument (depends on test)
    @type sockname: string
    @param sockname: Socket path

    """
    self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))

  def _Notify(self, prereq, test, arg):
    """Notifies the client of a test.

    @type prereq: bool
    @param prereq: Whether this is a prereq-phase test
    @type test: string
    @param test: Test name
    @param arg: Test argument (depends on test)

    """
    if prereq:
      errcls = errors.OpPrereqError
    else:
      errcls = errors.OpExecError

    return self._NotifyUsingSocket(compat.partial(self._SendNotification,
                                                  test, arg),
                                   errcls)
  def CheckArguments(self):
    self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
    self.expandnames_calls = 0

  def ExpandNames(self):
    checkargs_calls = getattr(self, "checkargs_calls", 0)
    if checkargs_calls < 1:
      raise errors.ProgrammerError("CheckArguments was not called")

    self.expandnames_calls += 1

    if self.op.notify_waitlock:
      self._Notify(True, constants.JQT_EXPANDNAMES, None)

    self.LogInfo("Expanding names")

    # Get lock on master node (just to get a lock, not for a particular reason)
    self.needed_locks = {
      locking.LEVEL_NODE: self.cfg.GetMasterNode(),
      }

  def Exec(self, feedback_fn):
    if self.expandnames_calls < 1:
      raise errors.ProgrammerError("ExpandNames was not called")

    if self.op.notify_exec:
      self._Notify(False, constants.JQT_EXEC, None)

    self.LogInfo("Executing")

    if self.op.log_messages:
      self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
      for idx, msg in enumerate(self.op.log_messages):
        self.LogInfo("Sending log message %s", idx + 1)
        feedback_fn(constants.JQT_MSGPREFIX + msg)
      # Report how many test messages have been sent
      self._Notify(False, constants.JQT_LOGMSG, idx + 1)

    if self.op.fail:
      raise errors.OpExecError("Opcode failure was requested")

    return True
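
# A minimal client-side sketch (hypothetical, not part of the job queue test
# protocol itself) matching _NotifyUsingSocket above: the test client connects
# to the announced socket path, then closes the connection to confirm the
# notification.
def _ExampleNotificationClient(sockname):
  """Connect to a notification socket and confirm receipt."""
  sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  try:
    sock.connect(sockname)
  finally:
    sock.close()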
class IAllocator(object):
  """IAllocator framework.

  An IAllocator instance has four sets of attributes:
    - cfg that is needed to query the cluster
    - input data (all members of the _KEYS class attribute are required)
    - four buffer attributes (in|out_data|text), that represent the
      input (to the external script) in text and data structure format,
      and the output from it, again in two formats
    - the result variables from the script (success, info, result) for
      easy usage

  """
  # pylint: disable-msg=R0902
  # lots of instance attributes
  _ALLO_KEYS = [
    "name", "mem_size", "disks", "disk_template",
    "os", "tags", "nics", "vcpus", "hypervisor",
    ]
  _RELO_KEYS = [
    "name", "relocate_from",
    ]
  _EVAC_KEYS = [
    "evac_nodes",
    ]
  def __init__(self, cfg, rpc, mode, **kwargs):
    self.cfg = cfg
    self.rpc = rpc
    # init buffer variables
    self.in_text = self.out_text = self.in_data = self.out_data = None
    # init all input fields so that pylint is happy
    self.mode = mode
    self.mem_size = self.disks = self.disk_template = None
    self.os = self.tags = self.nics = self.vcpus = None
    self.hypervisor = None
    self.relocate_from = None
    self.name = None
    self.evac_nodes = None
    # computed fields
    self.required_nodes = None
    # init result fields
    self.success = self.info = self.result = None
    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      keyset = self._ALLO_KEYS
      fn = self._AddNewInstance
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      keyset = self._RELO_KEYS
      fn = self._AddRelocateInstance
    elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
      keyset = self._EVAC_KEYS
      fn = self._AddEvacuateNodes
    else:
      raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                   " IAllocator" % self.mode)
    for key in kwargs:
      if key not in keyset:
        raise errors.ProgrammerError("Invalid input parameter '%s' to"
                                     " IAllocator" % key)
      setattr(self, key, kwargs[key])

    for key in keyset:
      if key not in kwargs:
        raise errors.ProgrammerError("Missing input parameter '%s' to"
                                     " IAllocator" % key)
    self._BuildInputData(fn)
  def _ComputeClusterData(self):
    """Compute the generic allocator input data.

    This is the data that is independent of the actual operation.

    """
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    # cluster data
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
      "cluster_tags": list(cluster_info.GetTags()),
      "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
      # we don't have job IDs
      }
    iinfo = cfg.GetAllInstancesInfo().values()
    i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]

    # node data
    node_list = cfg.GetNodeList()
    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      hypervisor_name = self.hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
      hypervisor_name = cluster_info.enabled_hypervisors[0]
    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)

    data["nodegroups"] = self._ComputeNodeGroupData(cfg)

    data["nodes"] = self._ComputeNodeData(cfg, node_data, node_iinfo, i_list)

    data["instances"] = self._ComputeInstanceData(cluster_info, i_list)

    self.in_data = data

  @staticmethod
  def _ComputeNodeGroupData(cfg):
    """Compute node groups data.

    """
    ng = {}
    for guuid, gdata in cfg.GetAllNodeGroupsInfo().items():
      ng[guuid] = { "name": gdata.name }
    return ng
  @staticmethod
  def _ComputeNodeData(cfg, node_data, node_iinfo, i_list):
    """Compute global node data.

    """
    node_results = {}
    for nname, nresult in node_data.items():
      # first fill in static (config-based) values
      ninfo = cfg.GetNodeInfo(nname)
      pnr = {
        "tags": list(ninfo.GetTags()),
        "primary_ip": ninfo.primary_ip,
        "secondary_ip": ninfo.secondary_ip,
        "offline": ninfo.offline,
        "drained": ninfo.drained,
        "master_candidate": ninfo.master_candidate,
        "group": ninfo.group,
        }
      if not (ninfo.offline or ninfo.drained):
        nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
        remote_info = nresult.payload

        for attr in ['memory_total', 'memory_free', 'memory_dom0',
                     'vg_size', 'vg_free', 'cpu_total']:
          if attr not in remote_info:
            raise errors.OpExecError("Node '%s' didn't return attribute"
                                     " '%s'" % (nname, attr))
          if not isinstance(remote_info[attr], int):
            raise errors.OpExecError("Node '%s' returned invalid value"
                                     " for '%s': %s" %
                                     (nname, attr, remote_info[attr]))
        # compute memory used by primary instances
        i_p_mem = i_p_up_mem = 0
        for iinfo, beinfo in i_list:
          if iinfo.primary_node == nname:
            i_p_mem += beinfo[constants.BE_MEMORY]
            if iinfo.name not in node_iinfo[nname].payload:
              i_used_mem = 0
            else:
              i_used_mem = int(node_iinfo[nname].payload[iinfo.name]['memory'])
            i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
            remote_info['memory_free'] -= max(0, i_mem_diff)

            if iinfo.admin_up:
              i_p_up_mem += beinfo[constants.BE_MEMORY]
        # compute memory used by instances
        pnr_dyn = {
          "total_memory": remote_info['memory_total'],
          "reserved_memory": remote_info['memory_dom0'],
          "free_memory": remote_info['memory_free'],
          "total_disk": remote_info['vg_size'],
          "free_disk": remote_info['vg_free'],
          "total_cpus": remote_info['cpu_total'],
          "i_pri_memory": i_p_mem,
          "i_pri_up_memory": i_p_up_mem,
          }
        pnr.update(pnr_dyn)

      node_results[nname] = pnr

    return node_results
  @staticmethod
  def _ComputeInstanceData(cluster_info, i_list):
    """Compute global instance data.

    """
    instance_data = {}
    for iinfo, beinfo in i_list:
      nic_data = []
      for nic in iinfo.nics:
        filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
        nic_dict = {"mac": nic.mac,
                    "ip": nic.ip,
                    "mode": filled_params[constants.NIC_MODE],
                    "link": filled_params[constants.NIC_LINK],
                   }
        if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          nic_dict["bridge"] = filled_params[constants.NIC_LINK]
        nic_data.append(nic_dict)
      pir = {
        "tags": list(iinfo.GetTags()),
        "admin_up": iinfo.admin_up,
        "vcpus": beinfo[constants.BE_VCPUS],
        "memory": beinfo[constants.BE_MEMORY],
        "os": iinfo.os,
        "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
        "nics": nic_data,
        "disks": [{"size": dsk.size, "mode": dsk.mode} for dsk in iinfo.disks],
        "disk_template": iinfo.disk_template,
        "hypervisor": iinfo.hypervisor,
        }
      pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
                                                 pir["disks"])
      instance_data[iinfo.name] = pir

    return instance_data
  def _AddNewInstance(self):
    """Add new instance data to allocator structure.

    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    disk_space = _ComputeDiskSize(self.disk_template, self.disks)

    if self.disk_template in constants.DTS_NET_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1
    request = {
      "name": self.name,
      "disk_template": self.disk_template,
      "tags": self.tags,
      "os": self.os,
      "vcpus": self.vcpus,
      "memory": self.mem_size,
      "disks": self.disks,
      "disk_space_total": disk_space,
      "nics": self.nics,
      "required_nodes": self.required_nodes,
      }
    return request
  def _AddRelocateInstance(self):
    """Add relocate instance data to allocator structure.

    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    instance = self.cfg.GetInstanceInfo(self.name)
    if instance is None:
      raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                   " IAllocator" % self.name)

    if instance.disk_template not in constants.DTS_NET_MIRROR:
      raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                                 errors.ECODE_INVAL)

    if len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)

    self.required_nodes = 1
    disk_sizes = [{'size': disk.size} for disk in instance.disks]
    disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)

    request = {
      "name": self.name,
      "disk_space_total": disk_space,
      "required_nodes": self.required_nodes,
      "relocate_from": self.relocate_from,
      }
    return request
  def _AddEvacuateNodes(self):
    """Add evacuate nodes data to allocator structure.

    """
    request = {
      "evac_nodes": self.evac_nodes
      }
    return request

  def _BuildInputData(self, fn):
    """Build input data structures.

    """
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
    self.in_data["request"] = request

    self.in_text = serializer.Dump(self.in_data)

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()
  def _ValidateResult(self):
    """Process the allocator results.

    This will process and if successful save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not isinstance(rdict["result"], list):
      raise errors.OpExecError("Can't parse iallocator results: 'result' key"
                               " is not a list")
    self.out_data = rdict
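
# Illustrative only: a well-formed iallocator response as _ValidateResult
# expects it. "success", "info" and "result" are the required keys and
# "result" must be a list; the values below are made up.
def _ExampleIAllocatorResponse():
  """Return a sample iallocator response dict (hypothetical data)."""
  return {
    "success": True,
    "info": "allocation successful",
    "result": ["node1.example.com", "node2.example.com"],
    }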
class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  _OP_PARAMS = [
    ("direction", ht.NoDefault,
     ht.TElemOf(constants.VALID_IALLOCATOR_DIRECTIONS)),
    ("mode", ht.NoDefault, ht.TElemOf(constants.VALID_IALLOCATOR_MODES)),
    ("name", ht.NoDefault, ht.TNonEmptyString),
    ("nics", ht.NoDefault, ht.TOr(ht.TNone, ht.TListOf(
      ht.TDictOf(ht.TElemOf(["mac", "ip", "bridge"]),
                 ht.TOr(ht.TNone, ht.TNonEmptyString))))),
    ("disks", ht.NoDefault, ht.TOr(ht.TNone, ht.TList)),
    ("hypervisor", None, ht.TMaybeString),
    ("allocator", None, ht.TMaybeString),
    ("tags", ht.EmptyList, ht.TListOf(ht.TNonEmptyString)),
    ("mem_size", None, ht.TOr(ht.TNone, ht.TPositiveInt)),
    ("vcpus", None, ht.TOr(ht.TNone, ht.TPositiveInt)),
    ("os", None, ht.TMaybeString),
    ("disk_template", None, ht.TMaybeString),
    ("evac_nodes", None, ht.TOr(ht.TNone, ht.TListOf(ht.TNonEmptyString))),
    ]
  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode
    test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["mem_size", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            "size" not in row or
            not isinstance(row["size"], int) or
            "mode" not in row or
            row["mode"] not in ['r', 'w']):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = self.cfg.GetInstanceInfo(fname).secondary_nodes
    elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
      if not hasattr(self.op, "evac_nodes"):
        raise errors.OpPrereqError("Missing attribute 'evac_nodes' on"
                                   " opcode input", errors.ECODE_INVAL)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)
    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)
  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       mem_size=self.op.mem_size,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       evac_nodes=self.op.evac_nodes)
    else:
      raise errors.ProgrammerError("Uncaught mode %s in"
                                   " LUTestAllocator.Exec", self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result
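
# Illustrative only (hypothetical helper): with direction "in" the LU simply
# returns the serialized request; it can be inspected with the same
# serializer module that IAllocator._BuildInputData used to produce it.
def _ExampleInspectAllocatorInput(in_text):
  """Deserialize allocator input text and return the request type."""
  data = serializer.Load(in_text)
  return data["request"]["type"]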