# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Module implementing the master-side code."""

# pylint: disable-msg=W0201,C0302

# W0201 since most LU attributes are defined in CheckPrereq or similar
# functions

# C0302: since we have waaaay too many lines in this module

import copy
import logging

import OpenSSL

from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import hypervisor
from ganeti import locking
from ganeti import constants
from ganeti import objects
from ganeti import serializer
from ganeti import ssconf
from ganeti import uidpool
from ganeti import compat
from ganeti import masterd

import ganeti.masterd.instance # pylint: disable-msg=W0611


# Modifiable default values; need to define these here before the
# actual LUs


def _EmptyList():
  """Returns an empty list.

  """
  return []


def _EmptyDict():
  """Returns an empty dict.

  """
  return {}


#: The without-default default value
_NoDefault = object()


#: The no-type (value too complex to check it in the type system)
_NoType = object()


def _TNotNone(val):
  """Checks if the given value is not None.

  """
  return val is not None


def _TNone(val):
  """Checks if the given value is None.

  """
  return val is None


def _TBool(val):
  """Checks if the given value is a boolean.

  """
  return isinstance(val, bool)


def _TInt(val):
  """Checks if the given value is an integer.

  """
  return isinstance(val, int)


def _TFloat(val):
  """Checks if the given value is a float.

  """
  return isinstance(val, float)


def _TString(val):
  """Checks if the given value is a string.

  """
  return isinstance(val, basestring)


def _TTrue(val):
  """Checks if a given value evaluates to a boolean True value.

  """
  return bool(val)


def _TElemOf(target_list):
  """Builds a function that checks if a given value is a member of a list.

  """
  return lambda val: val in target_list


def _TList(val):
  """Checks if the given value is a list.

  """
  return isinstance(val, list)


def _TDict(val):
  """Checks if the given value is a dictionary.

  """
  return isinstance(val, dict)


def _TAnd(*args):
  """Combine multiple functions using an AND operation.

  """
  def fn(val):
    return compat.all(t(val) for t in args)
  return fn


def _TOr(*args):
  """Combine multiple functions using an OR operation.

  """
  def fn(val):
    return compat.any(t(val) for t in args)
  return fn


#: a non-empty string
_TNonEmptyString = _TAnd(_TString, _TTrue)


#: a maybe non-empty string
_TMaybeString = _TOr(_TNonEmptyString, _TNone)


#: a maybe boolean (bool or none)
_TMaybeBool = _TOr(_TBool, _TNone)


#: a positive integer
_TPositiveInt = _TAnd(_TInt, lambda v: v >= 0)

#: a strictly positive integer
_TStrictPositiveInt = _TAnd(_TInt, lambda v: v > 0)


def _TListOf(my_type):
  """Checks if a given value is a list with all elements of the same type.

  """
  return _TAnd(_TList,
               lambda lst: compat.all(my_type(v) for v in lst))


def _TDictOf(key_type, val_type):
  """Checks a dict type for the type of its key/values.

  """
  return _TAnd(_TDict,
               lambda my_dict: (compat.all(key_type(v) for v in my_dict.keys())
                                and compat.all(val_type(v)
                                               for v in my_dict.values())))
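

# As an illustration (examples only, not module code): the checkers
# above are plain predicates and compose freely, e.g.
#   _TMaybeString(None)                    evaluates to True
#   _TMaybeString("")                      evaluates to False
#   _TListOf(_TPositiveInt)([0, 3])        evaluates to True
#   _TDictOf(_TNonEmptyString, _TBool)({"x": True}) evaluates to True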


# Common opcode attributes

#: output fields for a query operation
_POutputFields = ("output_fields", _NoDefault, _TListOf(_TNonEmptyString))


#: the shutdown timeout
_PShutdownTimeout = ("shutdown_timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT,
                     _TPositiveInt)

#: the force parameter
_PForce = ("force", False, _TBool)

#: a required instance name (for single-instance LUs)
_PInstanceName = ("instance_name", _NoDefault, _TNonEmptyString)

#: a required node name (for single-node LUs)
_PNodeName = ("node_name", _NoDefault, _TNonEmptyString)
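

# For illustration only: a hypothetical LU (not part of Ganeti) would
# declare its opcode parameters as (name, default, check) tuples, mixing
# the predefined _P* attributes with ad-hoc ones; the LogicalUnit
# constructor below fills in defaults and type-checks each parameter
# against this list:
#
#   class LUExample(LogicalUnit):
#     _OP_PARAMS = [
#       _PInstanceName,                 # required non-empty string
#       _PForce,                        # optional bool, defaults to False
#       ("retries", 3, _TPositiveInt),  # ad-hoc parameter with a default
#       ]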


class LogicalUnit(object):
  """Logical Unit base class.

  Subclasses must follow these rules:
    - implement ExpandNames
    - implement CheckPrereq (except when tasklets are used)
    - implement Exec (except when tasklets are used)
    - implement BuildHooksEnv
    - redefine HPATH and HTYPE
    - optionally redefine their run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  Note that all commands require root permissions.

  @ivar dry_run_result: the value (if any) that will be returned to the caller
      in dry-run mode (signalled by opcode dry_run parameter)
  @cvar _OP_PARAMS: a list of opcode attributes, the default values
      they should get if not already defined, and types they must match

  """
  HPATH = None
  HTYPE = None
  _OP_PARAMS = []
  REQ_BGL = True

  def __init__(self, processor, op, context, rpc):
    """Constructor for LogicalUnit.

    This needs to be overridden in derived classes in order to check op
    validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.context = context
    self.rpc = rpc
    # Dicts used to declare locking needs to mcpu
    self.needed_locks = None
    self.acquired_locks = {}
    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
    self.add_locks = {}
    self.remove_locks = {}
    # Used to force good behavior when calling helper functions
    self.recalculate_locks = {}
    self.__ssh = None
    # logging
    self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
    self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
    self.LogStep = processor.LogStep # pylint: disable-msg=C0103
    # support for dry-run
    self.dry_run_result = None
    # support for generic debug attribute
    if (not hasattr(self.op, "debug_level") or
        not isinstance(self.op.debug_level, int)):
      self.op.debug_level = 0

    self.tasklets = None

    # The new kind-of-type-system
    op_id = self.op.OP_ID
    for attr_name, aval, test in self._OP_PARAMS:
      if not hasattr(op, attr_name):
        if aval == _NoDefault:
          raise errors.OpPrereqError("Required parameter '%s.%s' missing" %
                                     (op_id, attr_name), errors.ECODE_INVAL)
        else:
          if callable(aval):
            dval = aval()
          else:
            dval = aval
          setattr(self.op, attr_name, dval)
      attr_val = getattr(op, attr_name)
      if test == _NoType:
        # no tests here
        continue
      if not callable(test):
        raise errors.ProgrammerError("Validation for parameter '%s.%s' failed,"
                                     " given type is not a proper type (%s)" %
                                     (op_id, attr_name, test))
      if not test(attr_val):
        logging.error("OpCode %s, parameter %s, has invalid type %s/value %s",
                      self.op.OP_ID, attr_name, type(attr_val), attr_val)
        raise errors.OpPrereqError("Parameter '%s.%s' fails validation" %
                                   (op_id, attr_name), errors.ECODE_INVAL)

    self.CheckArguments()

  def __GetSSH(self):
    """Returns the SshRunner object.

    """
    if not self.__ssh:
      self.__ssh = ssh.SshRunner(self.cfg.GetClusterName())
    return self.__ssh

  ssh = property(fget=__GetSSH)

  def CheckArguments(self):
    """Check syntactic validity for the opcode arguments.

    This method is for doing a simple syntactic check and ensuring
    validity of opcode parameters, without any cluster-related
    checks. While the same can be accomplished in ExpandNames and/or
    CheckPrereq, doing these separately is better because:

      - ExpandNames is left as purely a lock-related function
      - CheckPrereq is run after we have acquired locks (and possibly
        waited for them)

    The function is allowed to change the self.op attribute so that
    later methods can no longer worry about missing parameters.

    """
    pass

  def ExpandNames(self):
    """Expand names for this LU.

    This method is called before starting to execute the opcode, and it should
    update all the parameters of the opcode to their canonical form (e.g. a
    short node name must be fully expanded after this method has successfully
    completed). This way locking, hooks, logging, etc. can work correctly.

    LUs which implement this method must also populate the self.needed_locks
    member, as a dict with lock levels as keys, and a list of needed lock names
    as values. Rules:

      - use an empty dict if you don't need any lock
      - if you don't need any lock at a particular level omit that level
      - don't put anything for the BGL level
      - if you want all locks at a level use locking.ALL_SET as a value

    If you need to share locks (rather than acquire them exclusively) at one
    level you can modify self.share_locks, setting a true value (usually 1) for
    that level. By default locks are not shared.

    This function can also define a list of tasklets, which then will be
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.tld'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.tld', 'node2.example.tld'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # The implementation of this method is mandatory only if the new LU is
    # concurrent, so that old LUs don't need to be changed all at the same
    # time.
    if self.REQ_BGL:
      self.needed_locks = {} # Exclusive LUs don't need locks.
    else:
      raise NotImplementedError

  def DeclareLocks(self, level):
    """Declare LU locking needs for a level.

    While most LUs can just declare their locking needs at ExpandNames time,
    sometimes there's the need to calculate some locks after having acquired
    the ones before. This function is called just before acquiring locks at a
    particular level, but after acquiring the ones at lower levels, and permits
    such calculations. It can be used to modify self.needed_locks, and by
    default it does nothing.

    This function is only called if you have something already set in
    self.needed_locks for the level.

    @param level: Locking level which is going to be locked
    @type level: member of ganeti.locking.LEVELS

    """

  def CheckPrereq(self):
    """Check prerequisites for this LU.

    This method should check that the prerequisites for the execution
    of this LU are fulfilled. It can do internode communication, but
    it should be idempotent - no cluster or system changes are
    allowed.

    The method should raise errors.OpPrereqError in case something is
    not fulfilled. Its return value is ignored.

    This method should also update all the parameters of the opcode to
    their canonical form if it hasn't been done by ExpandNames before.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Checking prerequisites for tasklet %s/%s",
                      idx + 1, len(self.tasklets))
        tl.CheckPrereq()

  def Exec(self, feedback_fn):
    """Execute the LU.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in
    code, or expected.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
        tl.Exec(feedback_fn)
    else:
      raise NotImplementedError

  def BuildHooksEnv(self):
    """Build hooks environment for this LU.

    This method should return a three-element tuple consisting of: a dict
    containing the environment that will be used for running the
    specific hook for this LU, a list of node names on which the hook
    should run before the execution, and a list of node names on which
    the hook should run after the execution.

    The keys of the dict must not have the 'GANETI_' prefix, as this will
    be added by the hooks runner. Also note additional keys will be
    added by the hooks runner. If the LU doesn't define any
    environment, an empty dict (and not None) should be returned.

    If the hook should run on no nodes, an empty list (and not None)
    should be returned.

    Note that if the HPATH for a LU class is None, this function will
    not be called.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    This method is called every time a hooks phase is executed, and notifies
    the Logical Unit about the hooks' result. The LU can then use it to alter
    its result based on the hooks. By default the method does nothing and the
    previous result is passed back unchanged but any LU can define it if it
    wants to use the local cluster hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None

    @return: the new Exec result, based on the previous result

    """
    # The API must be kept, so we silence the unused-argument and
    # method-could-be-a-function warnings
    # pylint: disable-msg=W0613,R0201
    return lu_result

  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs that work on an instance take its name in self.op.instance_name
    and need to expand it and then declare the expanded name for locking. This
    function does it, and then updates self.op.instance_name to the expanded
    name. It also initializes needed_locks as a dict, if this hasn't been done
    before.

    """
    if self.needed_locks is None:
      self.needed_locks = {}
    else:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    self.op.instance_name = _ExpandInstanceName(self.cfg,
                                                self.op.instance_name)
    self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name

  def _LockInstancesNodes(self, primary_only=False):
    """Helper function to declare instances' nodes for locking.

    This function should be called after locking one or more instances to lock
    their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
    with all primary or secondary nodes for instances already locked and
    present in self.needed_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to just lock some instance's nodes, or
    to just lock primaries or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances

    """
    assert locking.LEVEL_NODE in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we've really been called with the instance locks held

    # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
    # future we might want to have different behaviors depending on the value
    # of self.recalculate_locks[locking.LEVEL_NODE]
    wanted_nodes = []
    for instance_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
      instance = self.context.cfg.GetInstanceInfo(instance_name)
      wanted_nodes.append(instance.primary_node)
      if not primary_only:
        wanted_nodes.extend(instance.secondary_nodes)

    if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
      self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
    elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
      self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)

    del self.recalculate_locks[locking.LEVEL_NODE]


class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLu.

    This just raises an error.

    """
    assert False, "BuildHooksEnv called for NoHooksLUs"


class Tasklet:
  """Tasklet base class.

  Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
  they can mix legacy code with tasklets. Locking needs to be done in the LU,
  tasklets know nothing about locks.

  Subclasses must follow these rules:
    - Implement CheckPrereq
    - Implement Exec

  """
  def __init__(self, lu):
    """Constructor for tasklets.

    """
    self.lu = lu

  def CheckPrereq(self):
    """Check prerequisites for this tasklet.

    This method should check whether the prerequisites for the execution of
    this tasklet are fulfilled. It can do internode communication, but it
    should be idempotent - no cluster or system changes are allowed.

    The method should raise errors.OpPrereqError in case something is not
    fulfilled. Its return value is ignored.

    This method should also update all parameters to their canonical form if it
    hasn't been done before.

    """
    pass

  def Exec(self, feedback_fn):
    """Execute the tasklet.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in code, or
    expected.

    """
    raise NotImplementedError
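

# A minimal tasklet sketch for illustration (hypothetical, not part of
# this module); locking stays in the owning LU, the tasklet only checks
# prerequisites and executes:
#
#   class _ExampleTasklet(Tasklet):
#     def CheckPrereq(self):
#       self.instance = self.lu.cfg.GetInstanceInfo(self.lu.op.instance_name)
#
#     def Exec(self, feedback_fn):
#       feedback_fn("Working on %s" % self.instance.name)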


def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: list of node names or None for all nodes
  @rtype: list
  @return: the list of nodes, sorted
  @raise errors.ProgrammerError: if the nodes parameter is wrong type

  """
  if not nodes:
    raise errors.ProgrammerError("_GetWantedNodes should only be called with a"
                                 " non-empty list of nodes whose name is to be"
                                 " expanded.")

  wanted = [_ExpandNodeName(lu.cfg, name) for name in nodes]
  return utils.NiceSort(wanted)


def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names or None for all instances
  @rtype: list
  @return: the list of instances, sorted
  @raise errors.OpPrereqError: if the instances parameter is wrong type
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if instances:
    wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
  else:
    wanted = utils.NiceSort(lu.cfg.GetInstanceList())
  return wanted


def _GetUpdatedParams(old_params, update_dict,
                      use_default=True, use_none=False):
  """Return the new version of a parameter dictionary.

  @type old_params: dict
  @param old_params: old parameters
  @type update_dict: dict
  @param update_dict: dict containing new parameter values, or
      constants.VALUE_DEFAULT to reset the parameter to its default
      value
  @type use_default: boolean
  @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
      values as 'to be deleted' values
  @type use_none: boolean
  @param use_none: whether to recognise C{None} values as 'to be
      deleted' values
  @rtype: dict
  @return: the new parameter dictionary

  """
  params_copy = copy.deepcopy(old_params)
  for key, val in update_dict.iteritems():
    if ((use_default and val == constants.VALUE_DEFAULT) or
        (use_none and val is None)):
      try:
        del params_copy[key]
      except KeyError:
        pass
    else:
      params_copy[key] = val
  return params_copy
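

# For example (illustrative values):
#   _GetUpdatedParams({"a": 1, "b": 2}, {"b": constants.VALUE_DEFAULT, "c": 3})
# returns {"a": 1, "c": 3}: "b" is removed so it reverts to its default,
# while "c" is added to the overrides.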


def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set

  """
  f = utils.FieldSet()
  f.Extend(static)
  f.Extend(dynamic)

  delta = f.NonMatching(selected)
  if delta:
    raise errors.OpPrereqError("Unknown output fields selected: %s"
                               % ",".join(delta), errors.ECODE_INVAL)


def _CheckBooleanOpField(op, name):
  """Validates boolean opcode parameters.

  This will ensure that an opcode parameter is either a boolean value,
  or None (but that it always exists).

  """
  val = getattr(op, name, None)
  if not (val is None or isinstance(val, bool)):
    raise errors.OpPrereqError("Invalid boolean parameter '%s' (%s)" %
                               (name, str(val)), errors.ECODE_INVAL)
  setattr(op, name, val)


def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.

  This will ensure that instances don't get customised versions of
  global parameters.

  """
  used_globals = constants.HVC_GLOBALS.intersection(params)
  if used_globals:
    msg = ("The following hypervisor parameters are global and cannot"
           " be customized at instance level, please modify them at"
           " cluster level: %s" % utils.CommaJoin(used_globals))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)


def _CheckNodeOnline(lu, node):
  """Ensure that a given node is online.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is offline

  """
  if lu.cfg.GetNodeInfo(node).offline:
    raise errors.OpPrereqError("Can't use offline node %s" % node,
                               errors.ECODE_INVAL)


def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  if lu.cfg.GetNodeInfo(node).drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_INVAL)


def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node does not support the OS

  """
  result = lu.rpc.call_os_get(node, os_name)
  result.Raise("OS '%s' not in supported OS list for node %s" %
               (os_name, node),
               prereq=True, ecode=errors.ECODE_INVAL)
  if not force_variant:
    _CheckOSVariant(result.payload, os_name)


def _RequireFileStorage():
  """Checks that file storage is enabled.

  @raise errors.OpPrereqError: when file storage is disabled

  """
  if not constants.ENABLE_FILE_STORAGE:
    raise errors.OpPrereqError("File storage disabled at configure time",
                               errors.ECODE_INVAL)


def _CheckDiskTemplate(template):
  """Ensure a given disk template is valid.

  """
  if template not in constants.DISK_TEMPLATES:
    msg = ("Invalid disk template name '%s', valid templates are: %s" %
           (template, utils.CommaJoin(constants.DISK_TEMPLATES)))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
  if template == constants.DT_FILE:
    _RequireFileStorage()
  return True


def _CheckStorageType(storage_type):
  """Ensure a given storage type is valid.

  """
  if storage_type not in constants.VALID_STORAGE_TYPES:
    raise errors.OpPrereqError("Unknown storage type: %s" % storage_type,
                               errors.ECODE_INVAL)
  if storage_type == constants.ST_FILE:
    _RequireFileStorage()
  return True


def _GetClusterDomainSecret():
  """Reads the cluster domain secret.

  """
  return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
                               strict=True)


def _CheckInstanceDown(lu, instance, reason):
  """Ensure that an instance is not running."""
  if instance.admin_up:
    raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
                               (instance.name, reason), errors.ECODE_STATE)

  pnode = instance.primary_node
  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
  ins_l.Raise("Can't contact node %s for instance information" % pnode,
              prereq=True, ecode=errors.ECODE_ENVIRON)

  if instance.name in ins_l.payload:
    raise errors.OpPrereqError("Instance %s is running, %s" %
                               (instance.name, reason), errors.ECODE_STATE)


def _ExpandItemName(fn, name, kind):
  """Expand an item name.

  @param fn: the function to use for expansion
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  full_name = fn(name)
  if full_name is None:
    raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                               errors.ECODE_NOENT)
  return full_name


def _ExpandNodeName(cfg, name):
  """Wrapper over L{_ExpandItemName} for nodes."""
  return _ExpandItemName(cfg.ExpandNodeName, name, "Node")


def _ExpandInstanceName(cfg, name):
  """Wrapper over L{_ExpandItemName} for instance."""
  return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")


def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          memory, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name):
  """Builds instance related env variables for hooks.

  This builds the hook environment from individual variables.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: boolean
  @param status: the should_run status of the instance
  @type memory: string
  @param memory: the memory size of the instance
  @type vcpus: string
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @rtype: dict
  @return: the hook environment for this instance

  """
  if status:
    str_status = "up"
  else:
    str_status = "down"
  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": str_status,
    "INSTANCE_MEMORY": memory,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
  }

  if nics:
    nic_count = len(nics)
    for idx, (ip, mac, mode, link) in enumerate(nics):
      if ip is None:
        ip = ""
      env["INSTANCE_NIC%d_IP" % idx] = ip
      env["INSTANCE_NIC%d_MAC" % idx] = mac
      env["INSTANCE_NIC%d_MODE" % idx] = mode
      env["INSTANCE_NIC%d_LINK" % idx] = link
      if mode == constants.NIC_MODE_BRIDGED:
        env["INSTANCE_NIC%d_BRIDGE" % idx] = link
  else:
    nic_count = 0

  env["INSTANCE_NIC_COUNT"] = nic_count

  if disks:
    disk_count = len(disks)
    for idx, (size, mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % idx] = size
      env["INSTANCE_DISK%d_MODE" % idx] = mode
  else:
    disk_count = 0

  env["INSTANCE_DISK_COUNT"] = disk_count

  for source, kind in [(bep, "BE"), (hvp, "HV")]:
    for key, value in source.items():
      env["INSTANCE_%s_%s" % (kind, key)] = value

  return env
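

# Illustratively, for a one-NIC, one-disk instance the resulting dict
# contains keys such as INSTANCE_NAME, INSTANCE_NIC_COUNT,
# INSTANCE_NIC0_MAC, INSTANCE_DISK0_SIZE and INSTANCE_BE_<param>; the
# hooks runner later prefixes every key with "GANETI_" (see
# BuildHooksEnv above).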


def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.

  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
  value in LUQueryInstanceData.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples

  """
  hooks_nics = []
  cluster = lu.cfg.GetClusterInfo()
  for nic in nics:
    ip = nic.ip
    mac = nic.mac
    filled_params = cluster.SimpleFillNIC(nic.nicparams)
    mode = filled_params[constants.NIC_MODE]
    link = filled_params[constants.NIC_LINK]
    hooks_nics.append((ip, mac, mode, link))
  return hooks_nics


def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)
  args = {
    'name': instance.name,
    'primary_node': instance.primary_node,
    'secondary_nodes': instance.secondary_nodes,
    'os_type': instance.os,
    'status': instance.admin_up,
    'memory': bep[constants.BE_MEMORY],
    'vcpus': bep[constants.BE_VCPUS],
    'nics': _NICListToTuple(lu, instance.nics),
    'disk_template': instance.disk_template,
    'disks': [(disk.size, disk.mode) for disk in instance.disks],
    'bep': bep,
    'hvp': hvp,
    'hypervisor_name': instance.hypervisor,
  }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args) # pylint: disable-msg=W0142


def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  """
  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
  if mod_list:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in mod_list))
    for name in mod_list:
      lu.context.ReaddNode(name)
  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))


def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  """
  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  # the new node will increase mc_max by one, so:
  mc_should = min(mc_should + 1, cp_size)
  return mc_now < mc_should
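

# A worked example with illustrative numbers: with a candidate_pool_size
# of 10, 5 current candidates and mc_should reported as 5, the new node
# computes min(5 + 1, 10) = 6; since mc_now (5) < 6, it promotes itself.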


def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Check that the bridges needed by a list of nics exist.

  """
  cluster = lu.cfg.GetClusterInfo()
  paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
  brlist = [params[constants.NIC_LINK] for params in paramslist
            if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
  if brlist:
    result = lu.rpc.call_bridges_exist(target_node, brlist)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)


def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  """
  if node is None:
    node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, node)


def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity

  """
  if not os_obj.supported_variants:
    return
  try:
    variant = name.split("+", 1)[1]
  except IndexError:
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if variant not in os_obj.supported_variants:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)


def _GetNodeInstancesInner(cfg, fn):
  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]


def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)


def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name == inst.primary_node)


def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name in inst.secondary_nodes)


def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  """
  # Special case for file storage
  if storage_type == constants.ST_FILE:
    # storage.FileStorage wants a list of storage directories
    return [[cfg.GetFileStorageDir()]]

  return []


def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
  faulty = []

  for dev in instance.disks:
    cfg.SetDiskID(dev, node_name)

  result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
  result.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  for idx, bdev_status in enumerate(result.payload):
    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
      faulty.append(idx)

  return faulty


class LUPostInitCluster(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {"OP_TARGET": self.cfg.GetClusterName()}
    mn = self.cfg.GetMasterNode()
    return env, [], [mn]

  def Exec(self, feedback_fn):
    """Nothing to do.

    """
    return True


class LUDestroyCluster(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {"OP_TARGET": self.cfg.GetClusterName()}
    return env, [], []

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodelist = self.cfg.GetNodeList()
    if len(nodelist) != 1 or nodelist[0] != master:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodelist) - 1),
                                 errors.ECODE_INVAL)
    instancelist = self.cfg.GetInstanceList()
    if instancelist:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instancelist),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    """
    master = self.cfg.GetMasterNode()
    modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup

    # Run post hooks on master node before it's removed
    hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
    try:
      hm.RunPhase(constants.HOOKS_PHASE_POST, [master])
    except:
      # pylint: disable-msg=W0702
      self.LogWarning("Errors occurred running hooks on %s" % master)

    result = self.rpc.call_node_stop_master(master, False)
    result.Raise("Could not disable the master role")

    if modify_ssh_setup:
      priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
      utils.CreateBackup(priv_key)
      utils.CreateBackup(pub_key)

    return master


def _VerifyCertificate(filename):
  """Verifies a certificate for LUVerifyCluster.

  @type filename: string
  @param filename: Path to PEM file

  """
  try:
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                           utils.ReadFile(filename))
  except Exception, err: # pylint: disable-msg=W0703
    return (LUVerifyCluster.ETYPE_ERROR,
            "Failed to load X509 certificate %s: %s" % (filename, err))

  (errcode, msg) = \
    utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
                                constants.SSL_CERT_EXPIRATION_ERROR)

  if msg:
    fnamemsg = "While verifying %s: %s" % (filename, msg)
  else:
    fnamemsg = None

  if errcode is None:
    return (None, fnamemsg)
  elif errcode == utils.CERT_WARNING:
    return (LUVerifyCluster.ETYPE_WARNING, fnamemsg)
  elif errcode == utils.CERT_ERROR:
    return (LUVerifyCluster.ETYPE_ERROR, fnamemsg)

  raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)


class LUVerifyCluster(LogicalUnit):
  """Verifies the cluster status.

  """
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  _OP_PARAMS = [
    ("skip_checks", _EmptyList,
     _TListOf(_TElemOf(constants.VERIFY_OPTIONAL_CHECKS))),
    ("verbose", False, _TBool),
    ("error_codes", False, _TBool),
    ("debug_simulate_errors", False, _TBool),
    ]
  REQ_BGL = False

  TCLUSTER = "cluster"
  TNODE = "node"
  TINSTANCE = "instance"

  ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
  ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
  EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
  EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
  EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
  EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
  ENODEDRBD = (TNODE, "ENODEDRBD")
  ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
  ENODEHOOKS = (TNODE, "ENODEHOOKS")
  ENODEHV = (TNODE, "ENODEHV")
  ENODELVM = (TNODE, "ENODELVM")
  ENODEN1 = (TNODE, "ENODEN1")
  ENODENET = (TNODE, "ENODENET")
  ENODEOS = (TNODE, "ENODEOS")
  ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
  ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
  ENODERPC = (TNODE, "ENODERPC")
  ENODESSH = (TNODE, "ENODESSH")
  ENODEVERSION = (TNODE, "ENODEVERSION")
  ENODESETUP = (TNODE, "ENODESETUP")
  ENODETIME = (TNODE, "ENODETIME")

  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"

  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type name: string
    @ivar name: the node name to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dict of {secondary-node: list of instances} of all peers
        of this node (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call failed (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS

    """
    def __init__(self, offline=False, name=None):
      """Initializes this class.

      """
      self.name = name
      self.volumes = {}
      self.instances = []
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}

  def ExpandNames(self):
    self.needed_locks = {
      locking.LEVEL_NODE: locking.ALL_SET,
      locking.LEVEL_INSTANCE: locking.ALL_SET,
    }
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt = ecode
    # first complete the msg
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes:
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      if item:
        item = " " + item
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # and finally report it via the feedback_fn
    self._feedback_fn("  - %s" % msg)
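
  # For illustration: with error_codes enabled a reported line looks like
  # "ERROR:ENODELVM:node:node3:unable to check volume groups", while the
  # plain format is "ERROR: node node3: unable to check volume groups".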

  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    cond = bool(cond) or self.op.debug_simulate_errors
    if cond:
      self._Error(*args, **kwargs)
    # do not mark the operation as failed for WARN cases only
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
      self.bad = self.bad or cond

  def _VerifyNode(self, ninfo, nresult):
    """Run multiple tests against a node.

    Test list:

      - compares ganeti version
      - checks vg existence and size > 20G
      - checks config file checksum
      - checks ssh to other nodes

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
        reasonable values in the response)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    _ErrorIf(test, self.ENODERPC, node,
             "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    _ErrorIf(test, self.ENODERPC, node,
             "connection to node returned invalid data")
    if test:
      return False

    test = local_version != remote_version[0]
    _ErrorIf(test, self.ENODEVERSION, node,
             "incompatible protocol versions: master %s,"
             " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  self.ENODEVERSION, node,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        test = hv_result is not None
        _ErrorIf(test, self.ENODEHV, node,
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
             "; ".join(test))

    return True

  def _VerifyNodeTime(self, ninfo, nresult,
                      nvinfo_starttime, nvinfo_endtime):
    """Check the node time.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nvinfo_starttime: the start time of the RPC call
    @param nvinfo_endtime: the end time of the RPC call

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    ntime = nresult.get(constants.NV_TIME, None)
    try:
      ntime_merged = utils.MergeTime(ntime)
    except (ValueError, TypeError):
      _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
      return

    if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
    elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
    else:
      ntime_diff = None

    _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
             "Node time diverges by at least %s from master node time",
             ntime_diff)

  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
    """Check the node LVM data.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param vg_name: the configured VG name

    """
    if vg_name is None:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # checks vg existence and size > 20G
    vglist = nresult.get(constants.NV_VGLIST, None)
    test = not vglist
    _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
    if not test:
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                            constants.MIN_VG_SIZE)
      _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)

    # check pv names
    pvlist = nresult.get(constants.NV_PVLIST, None)
    test = pvlist is None
    _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
    if not test:
      # check that ':' is not present in PV names, since it's a
      # special character for lvcreate (denotes the range of PEs to
      # use on the PV)
      for _, pvname, owner_vg in pvlist:
        test = ":" in pvname
        _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
                 " '%s' of VG '%s'", pvname, owner_vg)

  def _VerifyNodeNetwork(self, ninfo, nresult):
    """Check the node network connectivity.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    test = constants.NV_NODELIST not in nresult
    _ErrorIf(test, self.ENODESSH, node,
             "node hasn't returned node ssh connectivity data")
    if not test:
      if nresult[constants.NV_NODELIST]:
        for a_node, a_msg in nresult[constants.NV_NODELIST].items():
          _ErrorIf(True, self.ENODESSH, node,
                   "ssh communication with node '%s': %s", a_node, a_msg)

    test = constants.NV_NODENETTEST not in nresult
    _ErrorIf(test, self.ENODENET, node,
             "node hasn't returned node tcp connectivity data")
    if not test:
      if nresult[constants.NV_NODENETTEST]:
        nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
        for anode in nlist:
          _ErrorIf(True, self.ENODENET, node,
                   "tcp communication with node '%s': %s",
                   anode, nresult[constants.NV_NODENETTEST][anode])

    test = constants.NV_MASTERIP not in nresult
    _ErrorIf(test, self.ENODENET, node,
             "node hasn't returned node master IP reachability data")
    if not test:
      if not nresult[constants.NV_MASTERIP]:
        if node == self.master_node:
          msg = "the master node cannot reach the master IP (not configured?)"
        else:
          msg = "cannot reach the master IP"
        _ErrorIf(True, self.ENODENET, node, msg)

  def _VerifyInstance(self, instance, instanceconfig, node_image):
    """Verify an instance.

    This function checks to see if the required block devices are
    available on the instance's node.

    """
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
    node_current = instanceconfig.primary_node

    node_vol_should = {}
    instanceconfig.MapLVsByNode(node_vol_should)

    for node in node_vol_should:
      n_img = node_image[node]
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # ignore missing volumes on offline or broken nodes
        continue
      for volume in node_vol_should[node]:
        test = volume not in n_img.volumes
        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
                 "volume %s missing on node %s", volume, node)

    if instanceconfig.admin_up:
      pri_img = node_image[node_current]
      test = instance not in pri_img.instances and not pri_img.offline
      _ErrorIf(test, self.EINSTANCEDOWN, instance,
               "instance not running on its primary node %s",
               node_current)

    for node, n_img in node_image.items():
      if node != node_current:
        test = instance in n_img.instances
        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
                 "instance should not run on node %s", node)

  def _VerifyOrphanVolumes(self, node_vol_should, node_image):
    """Verify if there are any unknown volumes in the cluster.

    The .os, .swap and backup volumes are ignored. All other volumes are
    reported as unknown.

    """
    for node, n_img in node_image.items():
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # skip non-healthy nodes
        continue
      for volume in n_img.volumes:
        test = (node not in node_vol_should or
                volume not in node_vol_should[node])
        self._ErrorIf(test, self.ENODEORPHANLV, node,
                      "volume %s is unknown", volume)

  def _VerifyOrphanInstances(self, instancelist, node_image):
    """Verify the list of running instances.

    This checks what instances are running but unknown to the cluster.

    """
    for node, n_img in node_image.items():
      for o_inst in n_img.instances:
        test = o_inst not in instancelist
        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
                      "instance %s on node %s should not exist", o_inst, node)

  def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the
    instances it was primary for.

    """
    for node, n_img in node_image.items():
      # This code checks that every node which is now listed as
      # secondary has enough memory to host all instances it is
      # supposed to should a single other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
      for prinode, instances in n_img.sbp.items():
        needed_mem = 0
        for instance in instances:
          bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
          if bep[constants.BE_AUTO_BALANCE]:
            needed_mem += bep[constants.BE_MEMORY]
        test = n_img.mfree < needed_mem
        self._ErrorIf(test, self.ENODEN1, node,
                      "not enough memory to accommodate instance failovers"
                      " should peer node %s fail", prinode)

  def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
                       master_files):
    """Verifies and computes the node required file checksums.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param file_list: required list of files
    @param local_cksum: dictionary of local files and their checksums
    @param master_files: list of files that only masters should have

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    remote_cksum = nresult.get(constants.NV_FILELIST, None)
    test = not isinstance(remote_cksum, dict)
    _ErrorIf(test, self.ENODEFILECHECK, node,
             "node hasn't returned file checksum data")
    if test:
      return

    for file_name in file_list:
      node_is_mc = ninfo.master_candidate
      must_have = (file_name not in master_files) or node_is_mc
      # missing
      test1 = file_name not in remote_cksum
      # invalid checksum
      test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
      # existing and good
      test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
      _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
               "file '%s' missing", file_name)
      _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
               "file '%s' has wrong checksum", file_name)
      # not candidate and this is not a must-have file
      _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
               "file '%s' should not exist on non master"
               " candidates (and the file is outdated)", file_name)
      # all good, except non-master/non-must have combination
      _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
               "file '%s' should not exist"
               " on non master candidates", file_name)

  def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_map):
    """Verifies the node DRBD status.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param instanceinfo: the dict of instances
    @param drbd_map: the DRBD map as returned by
        L{ganeti.config.ConfigWriter.ComputeDRBDMap}

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # compute the DRBD minors
    node_drbd = {}
    for minor, instance in drbd_map[node].items():
      test = instance not in instanceinfo
      _ErrorIf(test, self.ECLUSTERCFG, None,
               "ghost instance '%s' in temporary DRBD map", instance)
      # ghost instance should not be running, but otherwise we
      # don't give double warnings (both ghost instance and
      # unallocated minor in use)
      if test:
        node_drbd[minor] = (instance, False)
      else:
        instance = instanceinfo[instance]
        node_drbd[minor] = (instance.name, instance.admin_up)

    # and now check them
    used_minors = nresult.get(constants.NV_DRBDLIST, [])
    test = not isinstance(used_minors, (tuple, list))
    _ErrorIf(test, self.ENODEDRBD, node,
             "cannot parse drbd status file: %s", str(used_minors))
    if test:
      # we cannot check drbd status
      return

    for minor, (iname, must_exist) in node_drbd.items():
      test = minor not in used_minors and must_exist
      _ErrorIf(test, self.ENODEDRBD, node,
               "drbd minor %d of instance %s is not active", minor, iname)
    for minor in used_minors:
      test = minor not in node_drbd
      _ErrorIf(test, self.ENODEDRBD, node,
               "unallocated drbd minor %d is in use", minor)

  def _UpdateNodeOS(self, ninfo, nresult, nimg):
    """Builds the node OS structures.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    remote_os = nresult.get(constants.NV_OSLIST, None)
    test = (not isinstance(remote_os, list) or
            not compat.all(isinstance(v, list) and len(v) == 7
                           for v in remote_os))

    _ErrorIf(test, self.ENODEOS, node,
             "node hasn't returned valid OS data")

    nimg.os_fail = test
    if test:
      return

    os_dict = {}

    for (name, os_path, status, diagnose,
         variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:

      if name not in os_dict:
        os_dict[name] = []

      # parameters is a list of lists instead of list of tuples due to
      # JSON lacking a real tuple type, fix it:
      parameters = [tuple(v) for v in parameters]
      os_dict[name].append((os_path, status, diagnose,
                            set(variants), set(parameters), set(api_ver)))

    nimg.oslist = os_dict

  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
      _ErrorIf(not f_status, self.ENODEOS, node,
               "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
      _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
               "OS '%s' has multiple entries (first one shadows the rest): %s",
               os_name, utils.CommaJoin([v[0] for v in os_data]))
      # this will be caught in the backend too
      _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
               and not f_var, self.ENODEOS, node,
               "OS %s with API at least %d does not declare any variant",
               os_name, constants.OS_API_V15)
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      _ErrorIf(test, self.ENODEOS, node,
               "Extra OS %s not present on reference node (%s)",
               os_name, base.name)
      if test:
        continue
      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue
      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", f_param, b_param)]:
        _ErrorIf(a != b, self.ENODEOS, node,
                 "OS %s %s differs from reference node %s: %s vs. %s",
                 kind, os_name, base.name,
                 utils.CommaJoin(a), utils.CommaJoin(b))

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    _ErrorIf(missing, self.ENODEOS, node,
             "OSes present on reference node %s but missing on this node: %s",
             base.name, utils.CommaJoin(missing))

  def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
    """Verifies and updates the node volume data.

    This function will update a L{NodeImage}'s internal structures
    with data from the remote call.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object
    @param vg_name: the configured VG name

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    nimg.lvm_fail = True
    lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
    if vg_name is None:
      pass
    elif isinstance(lvdata, basestring):
      _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
               utils.SafeEncode(lvdata))
    elif not isinstance(lvdata, dict):
      _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
    else:
      nimg.volumes = lvdata
      nimg.lvm_fail = False

  def _UpdateNodeInstances(self, ninfo, nresult, nimg):
    """Verifies and updates the node instance list.

    If the listing was successful, then updates this node's instance
    list. Otherwise, it marks the RPC call as failed for the instance
    list.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object

    """
    idata = nresult.get(constants.NV_INSTANCELIST, None)
    test = not isinstance(idata, list)
    self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
                  " (instancelist): %s", utils.SafeEncode(str(idata)))
    if test:
      nimg.hyp_fail = True
    else:
      nimg.instances = idata

  def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
    """Verifies and computes a node information map.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object
    @param vg_name: the configured VG name

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # try to read free memory (from the hypervisor)
    hv_info = nresult.get(constants.NV_HVINFO, None)
    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
    _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
    if not test:
      try:
        nimg.mfree = int(hv_info["memory_free"])
      except (ValueError, TypeError):
        _ErrorIf(True, self.ENODERPC, node,
                 "node returned invalid nodeinfo, check hypervisor")

    # FIXME: devise a free space model for file based instances as well
    if vg_name is not None:
      test = (constants.NV_VGLIST not in nresult or
              vg_name not in nresult[constants.NV_VGLIST])
      _ErrorIf(test, self.ENODELVM, node,
               "node didn't return data for the volume group '%s'"
               " - it is either missing or broken", vg_name)
      if not test:
        try:
          nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
        except (ValueError, TypeError):
          _ErrorIf(True, self.ENODERPC, node,
                   "node returned invalid LVM info, check LVM status")
1962 def BuildHooksEnv(self):
1963     """Build hooks env.
1965     Cluster-Verify hooks are run in the post phase; if they fail, their
1966     output is logged in the verify output and the verification fails.
1968     """
1969 all_nodes = self.cfg.GetNodeList()
1970     env = {
1971       "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
1972       }
1973 for node in self.cfg.GetAllNodesInfo().values():
1974 env["NODE_TAGS_%s" % node.name] = " ".join(node.GetTags())
1976 return env, [], all_nodes
1978 def Exec(self, feedback_fn):
1979     """Verify integrity of cluster, performing various tests on nodes.
1981     """
1983 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1984 verbose = self.op.verbose
1985 self._feedback_fn = feedback_fn
1986 feedback_fn("* Verifying global settings")
1987 for msg in self.cfg.VerifyConfig():
1988 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
1990 # Check the cluster certificates
1991 for cert_filename in constants.ALL_CERT_FILES:
1992 (errcode, msg) = _VerifyCertificate(cert_filename)
1993 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
1995 vg_name = self.cfg.GetVGName()
1996 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
1997 cluster = self.cfg.GetClusterInfo()
1998 nodelist = utils.NiceSort(self.cfg.GetNodeList())
1999 nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
2000 instancelist = utils.NiceSort(self.cfg.GetInstanceList())
2001 instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
2002 for iname in instancelist)
2003 i_non_redundant = [] # Non redundant instances
2004 i_non_a_balanced = [] # Non auto-balanced instances
2005 n_offline = 0 # Count of offline nodes
2006 n_drained = 0 # Count of nodes being drained
2007 node_vol_should = {}
2009 # FIXME: verify OS list
2010 # do local checksums
2011 master_files = [constants.CLUSTER_CONF_FILE]
2012 master_node = self.master_node = self.cfg.GetMasterNode()
2013 master_ip = self.cfg.GetMasterIP()
2015 file_names = ssconf.SimpleStore().GetFileList()
2016 file_names.extend(constants.ALL_CERT_FILES)
2017 file_names.extend(master_files)
2018 if cluster.modify_etc_hosts:
2019 file_names.append(constants.ETC_HOSTS)
2021 local_checksums = utils.FingerprintFiles(file_names)
2023 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
2024 node_verify_param = {
2025 constants.NV_FILELIST: file_names,
2026 constants.NV_NODELIST: [node.name for node in nodeinfo
2027 if not node.offline],
2028 constants.NV_HYPERVISOR: hypervisors,
2029 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
2030 node.secondary_ip) for node in nodeinfo
2031 if not node.offline],
2032 constants.NV_INSTANCELIST: hypervisors,
2033 constants.NV_VERSION: None,
2034 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2035 constants.NV_NODESETUP: None,
2036 constants.NV_TIME: None,
2037 constants.NV_MASTERIP: (master_node, master_ip),
2038 constants.NV_OSLIST: None,
2041 if vg_name is not None:
2042 node_verify_param[constants.NV_VGLIST] = None
2043 node_verify_param[constants.NV_LVLIST] = vg_name
2044 node_verify_param[constants.NV_PVLIST] = [vg_name]
2045 node_verify_param[constants.NV_DRBDLIST] = None
2047 # Build our expected cluster state
2048     node_image = dict((node.name, self.NodeImage(offline=node.offline,
2049                                                  name=node.name))
2050 for node in nodeinfo)
2052 for instance in instancelist:
2053 inst_config = instanceinfo[instance]
2055 for nname in inst_config.all_nodes:
2056 if nname not in node_image:
2057           # ghost node
2058           gnode = self.NodeImage(name=nname)
2059           gnode.ghost = True
2060 node_image[nname] = gnode
2062 inst_config.MapLVsByNode(node_vol_should)
2064 pnode = inst_config.primary_node
2065 node_image[pnode].pinst.append(instance)
2067 for snode in inst_config.secondary_nodes:
2068 nimg = node_image[snode]
2069 nimg.sinst.append(instance)
2070 if pnode not in nimg.sbp:
2071 nimg.sbp[pnode] = []
2072 nimg.sbp[pnode].append(instance)
2074 # At this point, we have the in-memory data structures complete,
2075 # except for the runtime information, which we'll gather next
2077 # Due to the way our RPC system works, exact response times cannot be
2078 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2079     # time before and after executing the request, we can at least have a time
2080     # window.
2081 nvinfo_starttime = time.time()
2082 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
2083 self.cfg.GetClusterName())
2084 nvinfo_endtime = time.time()
2086 all_drbd_map = self.cfg.ComputeDRBDMap()
2088 feedback_fn("* Verifying node status")
2090     refos_img = None
2092     for node_i in nodeinfo:
2093       node = node_i.name
2094       nimg = node_image[node]
2096       if node_i.offline:
2097         if verbose:
2098           feedback_fn("* Skipping offline node %s" % (node,))
2099         n_offline += 1
2100         continue
2102       if node == master_node:
2103         ntype = "master"
2104 elif node_i.master_candidate:
2105 ntype = "master candidate"
2106       elif node_i.drained:
2107         ntype = "drained"
2108         n_drained += 1
2109       else:
2110         ntype = "regular"
2111       if verbose:
2112         feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2114 msg = all_nvinfo[node].fail_msg
2115 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2116       if msg:
2117         nimg.rpc_fail = True
2118         continue
2120 nresult = all_nvinfo[node].payload
2122 nimg.call_ok = self._VerifyNode(node_i, nresult)
2123 self._VerifyNodeNetwork(node_i, nresult)
2124 self._VerifyNodeLVM(node_i, nresult, vg_name)
2125         self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
2126                               master_files)
2127 self._VerifyNodeDrbd(node_i, nresult, instanceinfo, all_drbd_map)
2128 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2130 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2131 self._UpdateNodeInstances(node_i, nresult, nimg)
2132 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2133 self._UpdateNodeOS(node_i, nresult, nimg)
2134 if not nimg.os_fail:
2135 if refos_img is None:
2136             refos_img = nimg
2137           self._VerifyNodeOS(node_i, nimg, refos_img)
2139 feedback_fn("* Verifying instance status")
2140 for instance in instancelist:
2141       if verbose:
2142         feedback_fn("* Verifying instance %s" % instance)
2143 inst_config = instanceinfo[instance]
2144 self._VerifyInstance(instance, inst_config, node_image)
2145 inst_nodes_offline = []
2147 pnode = inst_config.primary_node
2148 pnode_img = node_image[pnode]
2149 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2150 self.ENODERPC, pnode, "instance %s, connection to"
2151 " primary node failed", instance)
2153 if pnode_img.offline:
2154 inst_nodes_offline.append(pnode)
2156 # If the instance is non-redundant we cannot survive losing its primary
2157 # node, so we are not N+1 compliant. On the other hand we have no disk
2158       # templates with more than one secondary so that situation is not well
2159       # supported either.
2160 # FIXME: does not support file-backed instances
2161 if not inst_config.secondary_nodes:
2162 i_non_redundant.append(instance)
2163 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2164 instance, "instance has multiple secondary nodes: %s",
2165 utils.CommaJoin(inst_config.secondary_nodes),
2166 code=self.ETYPE_WARNING)
2168 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2169 i_non_a_balanced.append(instance)
2171 for snode in inst_config.secondary_nodes:
2172 s_img = node_image[snode]
2173 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2174 "instance %s, connection to secondary node failed", instance)
2176         if s_img.offline:
2177           inst_nodes_offline.append(snode)
2179 # warn that the instance lives on offline nodes
2180 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2181 "instance lives on offline node(s) %s",
2182 utils.CommaJoin(inst_nodes_offline))
2183 # ... or ghost nodes
2184 for node in inst_config.all_nodes:
2185 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2186 "instance lives on ghost node %s", node)
2188 feedback_fn("* Verifying orphan volumes")
2189 self._VerifyOrphanVolumes(node_vol_should, node_image)
2191 feedback_fn("* Verifying orphan instances")
2192 self._VerifyOrphanInstances(instancelist, node_image)
2194 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2195 feedback_fn("* Verifying N+1 Memory redundancy")
2196 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2198 feedback_fn("* Other Notes")
2199     if i_non_redundant:
2200       feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2201 % len(i_non_redundant))
2203 if i_non_a_balanced:
2204 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2205 % len(i_non_a_balanced))
2207     if n_offline:
2208       feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2210     if n_drained:
2211       feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2213     return not self.bad
2215 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2216 """Analyze the post-hooks' result
2218 This method analyses the hook result, handles it, and sends some
2219 nicely-formatted feedback back to the user.
2221 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2222 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2223 @param hooks_results: the results of the multi-node hooks rpc call
2224     @param feedback_fn: function used to send feedback back to the caller
2225 @param lu_result: previous Exec result
2226 @return: the new Exec result, based on the previous result
2230     # We only really run POST phase hooks, and are only interested in
2231     # their results
2232 if phase == constants.HOOKS_PHASE_POST:
2233 # Used to change hooks' output to proper indentation
2234 indent_re = re.compile('^', re.M)
2235 feedback_fn("* Hooks Results")
2236 assert hooks_results, "invalid result from hooks"
2238 for node_name in hooks_results:
2239 res = hooks_results[node_name]
2240         msg = res.fail_msg
2241         test = msg and not res.offline
2242 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2243 "Communication failure in hooks execution: %s", msg)
2244 if res.offline or msg:
2245 # No need to investigate payload if node is offline or gave an error.
2246           # manually override lu_result here, as _ErrorIf only
2247           # overrides self.bad
2248           lu_result = 1
2249           continue
2250 for script, hkr, output in res.payload:
2251 test = hkr == constants.HKR_FAIL
2252 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2253 "Script %s failed, output:", script)
2254             if test:
2255               output = indent_re.sub(' ', output)
2256               feedback_fn("%s" % output)
2257               lu_result = 0
2259       return lu_result
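# Illustrative sketch (not part of the original module): HooksCallBack above
# re-indents hook output with a multi-line regexp before feeding it back to
# the user. The hypothetical helper below isolates that trick; it relies only
# on the module-level "re" import.

def _ExampleIndentOutput(output, prefix="      "):
  """Return output with every line prefixed, as done for hook scripts."""
  # re.M makes "^" match at the start of every line, so one sub() call
  # indents the whole block at once
  return re.compile("^", re.M).sub(prefix, output)

# _ExampleIndentOutput("out\nerr", "  ") == "  out\n  err"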
2262 class LUVerifyDisks(NoHooksLU):
2263 """Verifies the cluster disks status.
2268 def ExpandNames(self):
2269 self.needed_locks = {
2270 locking.LEVEL_NODE: locking.ALL_SET,
2271 locking.LEVEL_INSTANCE: locking.ALL_SET,
2273 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2275 def Exec(self, feedback_fn):
2276 """Verify integrity of cluster disks.
2278 @rtype: tuple of three items
2279 @return: a tuple of (dict of node-to-node_error, list of instances
2280       which need activate-disks, dict of instance: (node, volume) for
2281       missing volumes
2284 result = res_nodes, res_instances, res_missing = {}, [], {}
2286 vg_name = self.cfg.GetVGName()
2287 nodes = utils.NiceSort(self.cfg.GetNodeList())
2288 instances = [self.cfg.GetInstanceInfo(name)
2289 for name in self.cfg.GetInstanceList()]
2291     nv_dict = {}
2292     for inst in instances:
2293       inst_lvs = {}
2294       if (not inst.admin_up or
2295 inst.disk_template not in constants.DTS_NET_MIRROR):
2296         continue
2297       inst.MapLVsByNode(inst_lvs)
2298 # transform { iname: {node: [vol,],},} to {(node, vol): iname}
2299 for node, vol_list in inst_lvs.iteritems():
2300 for vol in vol_list:
2301 nv_dict[(node, vol)] = inst
2303     if not nv_dict:
2304       return result
2306     node_lvs = self.rpc.call_lv_list(nodes, vg_name)
2308     for node in nodes:
2309       # node_volume
2310       node_res = node_lvs[node]
2311 if node_res.offline:
2312         continue
2313       msg = node_res.fail_msg
2314       if msg:
2315         logging.warning("Error enumerating LVs on node %s: %s", node, msg)
2316         res_nodes[node] = msg
2317         continue
2319 lvs = node_res.payload
2320 for lv_name, (_, _, lv_online) in lvs.items():
2321 inst = nv_dict.pop((node, lv_name), None)
2322 if (not lv_online and inst is not None
2323 and inst.name not in res_instances):
2324 res_instances.append(inst.name)
2326     # any leftover items in nv_dict are missing LVs, let's arrange the
2327     # data better
2328 for key, inst in nv_dict.iteritems():
2329 if inst.name not in res_missing:
2330 res_missing[inst.name] = []
2331       res_missing[inst.name].append(key)
2333     return result
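# Illustrative sketch (not part of the original module): LUVerifyDisks.Exec
# above inverts the per-instance {node: [volume, ...]} maps into one
# {(node, volume): instance-name} dictionary, so each node's LV listing can
# be matched back to the owning instance. A stdlib-only rendition:

def _ExampleBuildNodeVolumeMap(lvs_by_instance):
  """Transform {iname: {node: [vol, ...]}} into {(node, vol): iname}."""
  nv_dict = {}
  for iname, node_vols in lvs_by_instance.items():
    for node, vol_list in node_vols.items():
      for vol in vol_list:
        nv_dict[(node, vol)] = iname
  return nv_dict

# _ExampleBuildNodeVolumeMap({"inst1": {"node1": ["xenvg/lv1"]}})
# == {("node1", "xenvg/lv1"): "inst1"}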
2336 class LURepairDiskSizes(NoHooksLU):
2337   """Verifies the cluster disk sizes.
2340 _OP_PARAMS = [("instances", _EmptyList, _TListOf(_TNonEmptyString))]
2343 def ExpandNames(self):
2344 if self.op.instances:
2345 self.wanted_names = []
2346 for name in self.op.instances:
2347 full_name = _ExpandInstanceName(self.cfg, name)
2348 self.wanted_names.append(full_name)
2349 self.needed_locks = {
2350 locking.LEVEL_NODE: [],
2351 locking.LEVEL_INSTANCE: self.wanted_names,
2353 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
2354     else:
2355       self.wanted_names = None
2356 self.needed_locks = {
2357 locking.LEVEL_NODE: locking.ALL_SET,
2358 locking.LEVEL_INSTANCE: locking.ALL_SET,
2360 self.share_locks = dict(((i, 1) for i in locking.LEVELS))
2362 def DeclareLocks(self, level):
2363 if level == locking.LEVEL_NODE and self.wanted_names is not None:
2364 self._LockInstancesNodes(primary_only=True)
2366 def CheckPrereq(self):
2367 """Check prerequisites.
2369 This only checks the optional instance list against the existing names.
2372 if self.wanted_names is None:
2373 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
2375 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2376 in self.wanted_names]
2378 def _EnsureChildSizes(self, disk):
2379 """Ensure children of the disk have the needed disk size.
2381 This is valid mainly for DRBD8 and fixes an issue where the
2382 children have smaller disk size.
2384 @param disk: an L{ganeti.objects.Disk} object
2387 if disk.dev_type == constants.LD_DRBD8:
2388 assert disk.children, "Empty children for DRBD8?"
2389 fchild = disk.children[0]
2390 mismatch = fchild.size < disk.size
2391     if mismatch:
2392       self.LogInfo("Child disk has size %d, parent %d, fixing",
2393 fchild.size, disk.size)
2394 fchild.size = disk.size
2396 # and we recurse on this child only, not on the metadev
2397       return self._EnsureChildSizes(fchild) or mismatch
2398     else:
2399       return False
2401 def Exec(self, feedback_fn):
2402 """Verify the size of cluster disks.
2405 # TODO: check child disks too
2406 # TODO: check differences in size between primary/secondary nodes
2407     per_node_disks = {}
2408     for instance in self.wanted_instances:
2409 pnode = instance.primary_node
2410 if pnode not in per_node_disks:
2411 per_node_disks[pnode] = []
2412 for idx, disk in enumerate(instance.disks):
2413 per_node_disks[pnode].append((instance, idx, disk))
2415     changed = []
2416     for node, dskl in per_node_disks.items():
2417 newl = [v[2].Copy() for v in dskl]
2418       for dsk in newl:
2419         self.cfg.SetDiskID(dsk, node)
2420 result = self.rpc.call_blockdev_getsizes(node, newl)
2421       if result.fail_msg:
2422         self.LogWarning("Failure in blockdev_getsizes call to node"
2423                         " %s, ignoring", node)
2424         continue
2425 if len(result.data) != len(dskl):
2426         self.LogWarning("Invalid result from node %s, ignoring node results",
2427                         node)
2428         continue
2429 for ((instance, idx, disk), size) in zip(dskl, result.data):
2430         if size is None:
2431           self.LogWarning("Disk %d of instance %s did not return size"
2432                           " information, ignoring", idx, instance.name)
2433           continue
2434 if not isinstance(size, (int, long)):
2435 self.LogWarning("Disk %d of instance %s did not return valid"
2436                           " size information, ignoring", idx, instance.name)
2437           continue
2439 if size != disk.size:
2440 self.LogInfo("Disk %d of instance %s has mismatched size,"
2441 " correcting: recorded %d, actual %d", idx,
2442 instance.name, disk.size, size)
2443           disk.size = size
2444           self.cfg.Update(instance, feedback_fn)
2445 changed.append((instance.name, idx, size))
2446 if self._EnsureChildSizes(disk):
2447 self.cfg.Update(instance, feedback_fn)
2448             changed.append((instance.name, idx, disk.size))
2450     return changed
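# Illustrative sketch (not part of the original module): _EnsureChildSizes
# above grows an undersized data child of a DRBD8 disk to the parent's size
# and recurses on that child only. The same fix-up over plain nested dicts,
# with the device-type check omitted for brevity:

def _ExampleFixChildSizes(disk):
  """Return True if any (grand)child of disk had to be resized."""
  children = disk.get("children", [])
  if not children:
    return False
  fchild = children[0]
  mismatch = fchild["size"] < disk["size"]
  if mismatch:
    fchild["size"] = disk["size"]
  # recurse on the data child only, mirroring the method above
  return _ExampleFixChildSizes(fchild) or mismatch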
2452 class LURenameCluster(LogicalUnit):
2453 """Rename the cluster.
2456 HPATH = "cluster-rename"
2457 HTYPE = constants.HTYPE_CLUSTER
2458 _OP_PARAMS = [("name", _NoDefault, _TNonEmptyString)]
2460 def BuildHooksEnv(self):
2461     """Build hooks env.
2463     """
2464     env = {
2465       "OP_TARGET": self.cfg.GetClusterName(),
2466       "NEW_NAME": self.op.name,
2467       }
2468 mn = self.cfg.GetMasterNode()
2469 all_nodes = self.cfg.GetNodeList()
2470 return env, [mn], all_nodes
2472 def CheckPrereq(self):
2473 """Verify that the passed name is a valid one.
2476 hostname = utils.GetHostInfo(self.op.name)
2478 new_name = hostname.name
2479 self.ip = new_ip = hostname.ip
2480 old_name = self.cfg.GetClusterName()
2481 old_ip = self.cfg.GetMasterIP()
2482 if new_name == old_name and new_ip == old_ip:
2483 raise errors.OpPrereqError("Neither the name nor the IP address of the"
2484 " cluster has changed",
2486 if new_ip != old_ip:
2487 if utils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
2488 raise errors.OpPrereqError("The given cluster IP address (%s) is"
2489 " reachable on the network. Aborting." %
2490 new_ip, errors.ECODE_NOTUNIQUE)
2492 self.op.name = new_name
2494 def Exec(self, feedback_fn):
2495 """Rename the cluster.
2498     clustername = self.op.name
2499     ip = self.ip
2501 # shutdown the master IP
2502 master = self.cfg.GetMasterNode()
2503 result = self.rpc.call_node_stop_master(master, False)
2504 result.Raise("Could not disable the master role")
2506     try:
2507       cluster = self.cfg.GetClusterInfo()
2508 cluster.cluster_name = clustername
2509 cluster.master_ip = ip
2510 self.cfg.Update(cluster, feedback_fn)
2512 # update the known hosts file
2513 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
2514 node_list = self.cfg.GetNodeList()
2515       try:
2516         node_list.remove(master)
2517       except ValueError:
2518         pass
2519 result = self.rpc.call_upload_file(node_list,
2520 constants.SSH_KNOWN_HOSTS_FILE)
2521 for to_node, to_result in result.iteritems():
2522         msg = to_result.fail_msg
2523         if msg:
2524 msg = ("Copy of file %s to node %s failed: %s" %
2525 (constants.SSH_KNOWN_HOSTS_FILE, to_node, msg))
2526 self.proc.LogWarning(msg)
2528     finally:
2529       result = self.rpc.call_node_start_master(master, False, False)
2530       msg = result.fail_msg
2531       if msg:
2532 self.LogWarning("Could not re-enable the master role on"
2533 " the master, please restart manually: %s", msg)
2536 def _RecursiveCheckIfLVMBased(disk):
2537 """Check if the given disk or its children are lvm-based.
2539 @type disk: L{objects.Disk}
2540 @param disk: the disk to check
2542 @return: boolean indicating whether a LD_LV dev_type was found or not
2545   if disk.children:
2546     for chdisk in disk.children:
2547       if _RecursiveCheckIfLVMBased(chdisk):
2548         return True
2549   return disk.dev_type == constants.LD_LV
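# Illustrative sketch (not part of the original module): the function above is
# a depth-first walk over the disk tree. With a minimal stand-in type the
# same logic reads:

class _ExampleDisk(object):
  """Minimal stand-in for objects.Disk, illustration only."""
  def __init__(self, dev_type, children=None):
    self.dev_type = dev_type
    self.children = children or []

def _ExampleIsLVMBased(disk, lv_type="lvm"):
  """Depth-first check mirroring _RecursiveCheckIfLVMBased."""
  for child in disk.children:
    if _ExampleIsLVMBased(child, lv_type):
      return True
  return disk.dev_type == lv_type

# _ExampleIsLVMBased(_ExampleDisk("drbd8", [_ExampleDisk("lvm")])) == True
# _ExampleIsLVMBased(_ExampleDisk("file")) == False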
2552 class LUSetClusterParams(LogicalUnit):
2553 """Change the parameters of the cluster.
2556 HPATH = "cluster-modify"
2557 HTYPE = constants.HTYPE_CLUSTER
2558   _OP_PARAMS = [
2559     ("vg_name", None, _TMaybeString),
2560 ("enabled_hypervisors", None,
2561 _TOr(_TAnd(_TListOf(_TElemOf(constants.HYPER_TYPES)), _TTrue), _TNone)),
2562 ("hvparams", None, _TOr(_TDictOf(_TNonEmptyString, _TDict), _TNone)),
2563 ("beparams", None, _TOr(_TDictOf(_TNonEmptyString, _TDict), _TNone)),
2564 ("os_hvp", None, _TOr(_TDictOf(_TNonEmptyString, _TDict), _TNone)),
2565 ("osparams", None, _TOr(_TDictOf(_TNonEmptyString, _TDict), _TNone)),
2566 ("candidate_pool_size", None, _TOr(_TStrictPositiveInt, _TNone)),
2567 ("uid_pool", None, _NoType),
2568 ("add_uids", None, _NoType),
2569 ("remove_uids", None, _NoType),
2570 ("maintain_node_health", None, _TMaybeBool),
2571     ("nicparams", None, _TOr(_TDict, _TNone)),
2572     ]
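  # Each _OP_PARAMS entry above is a (name, default, check-fn) triple: None
  # as default marks a parameter as "not supplied", and the check function
  # validates whatever value the user did pass. A purely illustrative
  # combination in the same combinator style used throughout this module:
  #
  #   _TMaybePositiveInt = _TOr(_TNone, _TAnd(_TInt, lambda v: v >= 0))
  #   assert _TMaybePositiveInt(None) and _TMaybePositiveInt(3)
  #   assert not _TMaybePositiveInt(-1)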
2575 def CheckArguments(self):
2579 if self.op.uid_pool:
2580 uidpool.CheckUidPool(self.op.uid_pool)
2582 if self.op.add_uids:
2583 uidpool.CheckUidPool(self.op.add_uids)
2585 if self.op.remove_uids:
2586 uidpool.CheckUidPool(self.op.remove_uids)
2588 def ExpandNames(self):
2589 # FIXME: in the future maybe other cluster params won't require checking on
2590 # all nodes to be modified.
2591 self.needed_locks = {
2592 locking.LEVEL_NODE: locking.ALL_SET,
2594 self.share_locks[locking.LEVEL_NODE] = 1
2596 def BuildHooksEnv(self):
2597     """Build hooks env.
2599     """
2600     env = {
2601       "OP_TARGET": self.cfg.GetClusterName(),
2602       "NEW_VG_NAME": self.op.vg_name,
2603       }
2604 mn = self.cfg.GetMasterNode()
2605 return env, [mn], [mn]
2607 def CheckPrereq(self):
2608 """Check prerequisites.
2610 This checks whether the given params don't conflict and
2611 if the given volume group is valid.
2614 if self.op.vg_name is not None and not self.op.vg_name:
2615 instances = self.cfg.GetAllInstancesInfo().values()
2616 for inst in instances:
2617 for disk in inst.disks:
2618 if _RecursiveCheckIfLVMBased(disk):
2619 raise errors.OpPrereqError("Cannot disable lvm storage while"
2620 " lvm-based instances exist",
2623 node_list = self.acquired_locks[locking.LEVEL_NODE]
2625     # if vg_name is not None, check the given volume group on all nodes
2626     if self.op.vg_name:
2627       vglist = self.rpc.call_vg_list(node_list)
2628 for node in node_list:
2629 msg = vglist[node].fail_msg
2630         if msg:
2631           # ignoring down node
2632           self.LogWarning("Error while gathering data on node %s"
2633                           " (ignoring node): %s", node, msg)
2634           continue
2635 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2636                                               self.op.vg_name,
2637                                               constants.MIN_VG_SIZE)
2638         if vgstatus:
2639           raise errors.OpPrereqError("Error on node '%s': %s" %
2640                                      (node, vgstatus), errors.ECODE_ENVIRON)
2642 self.cluster = cluster = self.cfg.GetClusterInfo()
2643 # validate params changes
2644 if self.op.beparams:
2645 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2646 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2648 if self.op.nicparams:
2649 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2650 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2651 objects.NIC.CheckParameterSyntax(self.new_nicparams)
2653       nic_errors = []
2654       # check all instances for consistency
2655 for instance in self.cfg.GetAllInstancesInfo().values():
2656 for nic_idx, nic in enumerate(instance.nics):
2657 params_copy = copy.deepcopy(nic.nicparams)
2658 params_filled = objects.FillDict(self.new_nicparams, params_copy)
2660 # check parameter syntax
2661           try:
2662             objects.NIC.CheckParameterSyntax(params_filled)
2663 except errors.ConfigurationError, err:
2664 nic_errors.append("Instance %s, nic/%d: %s" %
2665 (instance.name, nic_idx, err))
2667 # if we're moving instances to routed, check that they have an ip
2668 target_mode = params_filled[constants.NIC_MODE]
2669 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
2670             nic_errors.append("Instance %s, nic/%d: routed NIC with no IP" %
2671 (instance.name, nic_idx))
2672       if nic_errors:
2673         raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
2674 "\n".join(nic_errors))
2676 # hypervisor list/parameters
2677 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
2678 if self.op.hvparams:
2679 for hv_name, hv_dict in self.op.hvparams.items():
2680 if hv_name not in self.new_hvparams:
2681 self.new_hvparams[hv_name] = hv_dict
2682         else:
2683           self.new_hvparams[hv_name].update(hv_dict)
2685 # os hypervisor parameters
2686 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
2687     if self.op.os_hvp:
2688       for os_name, hvs in self.op.os_hvp.items():
2689 if os_name not in self.new_os_hvp:
2690 self.new_os_hvp[os_name] = hvs
2691         else:
2692           for hv_name, hv_dict in hvs.items():
2693 if hv_name not in self.new_os_hvp[os_name]:
2694 self.new_os_hvp[os_name][hv_name] = hv_dict
2695             else:
2696               self.new_os_hvp[os_name][hv_name].update(hv_dict)
2699 self.new_osp = objects.FillDict(cluster.osparams, {})
2700 if self.op.osparams:
2701 for os_name, osp in self.op.osparams.items():
2702 if os_name not in self.new_osp:
2703 self.new_osp[os_name] = {}
2705 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
2708 if not self.new_osp[os_name]:
2709 # we removed all parameters
2710 del self.new_osp[os_name]
2711         else:
2712           # check the parameter validity (remote check)
2713 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
2714 os_name, self.new_osp[os_name])
2716 # changes to the hypervisor list
2717 if self.op.enabled_hypervisors is not None:
2718 self.hv_list = self.op.enabled_hypervisors
2719 for hv in self.hv_list:
2720 # if the hypervisor doesn't already exist in the cluster
2721 # hvparams, we initialize it to empty, and then (in both
2722 # cases) we make sure to fill the defaults, as we might not
2723         # have a complete defaults list if the hypervisor wasn't
2724         # enabled before
2725         if hv not in new_hvp:
2726           new_hvp[hv] = {}
2727 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
2728 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
2729     else:
2730       self.hv_list = cluster.enabled_hypervisors
2732 if self.op.hvparams or self.op.enabled_hypervisors is not None:
2733 # either the enabled list has changed, or the parameters have, validate
2734 for hv_name, hv_params in self.new_hvparams.items():
2735 if ((self.op.hvparams and hv_name in self.op.hvparams) or
2736 (self.op.enabled_hypervisors and
2737 hv_name in self.op.enabled_hypervisors)):
2738 # either this is a new hypervisor, or its parameters have changed
2739 hv_class = hypervisor.GetHypervisor(hv_name)
2740 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2741 hv_class.CheckParameterSyntax(hv_params)
2742 _CheckHVParams(self, node_list, hv_name, hv_params)
2744     if self.op.os_hvp:
2745       # no need to check any newly-enabled hypervisors, since the
2746 # defaults have already been checked in the above code-block
2747 for os_name, os_hvp in self.new_os_hvp.items():
2748 for hv_name, hv_params in os_hvp.items():
2749 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2750 # we need to fill in the new os_hvp on top of the actual hv_p
2751 cluster_defaults = self.new_hvparams.get(hv_name, {})
2752 new_osp = objects.FillDict(cluster_defaults, hv_params)
2753 hv_class = hypervisor.GetHypervisor(hv_name)
2754 hv_class.CheckParameterSyntax(new_osp)
2755 _CheckHVParams(self, node_list, hv_name, new_osp)
2758 def Exec(self, feedback_fn):
2759 """Change the parameters of the cluster.
2762 if self.op.vg_name is not None:
2763 new_volume = self.op.vg_name
2764       if not new_volume:
2765         new_volume = None
2766       if new_volume != self.cfg.GetVGName():
2767         self.cfg.SetVGName(new_volume)
2768       else:
2769         feedback_fn("Cluster LVM configuration already in desired"
2770 " state, not changing")
2771 if self.op.hvparams:
2772 self.cluster.hvparams = self.new_hvparams
2773     if self.op.os_hvp:
2774       self.cluster.os_hvp = self.new_os_hvp
2775 if self.op.enabled_hypervisors is not None:
2776 self.cluster.hvparams = self.new_hvparams
2777 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
2778 if self.op.beparams:
2779 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
2780 if self.op.nicparams:
2781 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
2782 if self.op.osparams:
2783 self.cluster.osparams = self.new_osp
2785 if self.op.candidate_pool_size is not None:
2786 self.cluster.candidate_pool_size = self.op.candidate_pool_size
2787 # we need to update the pool size here, otherwise the save will fail
2788 _AdjustCandidatePool(self, [])
2790 if self.op.maintain_node_health is not None:
2791 self.cluster.maintain_node_health = self.op.maintain_node_health
2793 if self.op.add_uids is not None:
2794 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
2796 if self.op.remove_uids is not None:
2797 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
2799 if self.op.uid_pool is not None:
2800 self.cluster.uid_pool = self.op.uid_pool
2802 self.cfg.Update(self.cluster, feedback_fn)
2805 def _RedistributeAncillaryFiles(lu, additional_nodes=None):
2806 """Distribute additional files which are part of the cluster configuration.
2808 ConfigWriter takes care of distributing the config and ssconf files, but
2809 there are more files which should be distributed to all nodes. This function
2810 makes sure those are copied.
2812 @param lu: calling logical unit
2813 @param additional_nodes: list of nodes not in the config to distribute to
2816 # 1. Gather target nodes
2817 myself = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
2818 dist_nodes = lu.cfg.GetOnlineNodeList()
2819 if additional_nodes is not None:
2820 dist_nodes.extend(additional_nodes)
2821 if myself.name in dist_nodes:
2822 dist_nodes.remove(myself.name)
2824 # 2. Gather files to distribute
2825 dist_files = set([constants.ETC_HOSTS,
2826 constants.SSH_KNOWN_HOSTS_FILE,
2827 constants.RAPI_CERT_FILE,
2828 constants.RAPI_USERS_FILE,
2829 constants.CONFD_HMAC_KEY,
2830 constants.CLUSTER_DOMAIN_SECRET_FILE,
2833 enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
2834 for hv_name in enabled_hypervisors:
2835 hv_class = hypervisor.GetHypervisor(hv_name)
2836 dist_files.update(hv_class.GetAncillaryFiles())
2838 # 3. Perform the files upload
2839 for fname in dist_files:
2840 if os.path.exists(fname):
2841 result = lu.rpc.call_upload_file(dist_nodes, fname)
2842 for to_node, to_result in result.items():
2843 msg = to_result.fail_msg
2844           if msg:
2845             msg = ("Copy of file %s to node %s failed: %s" %
2846 (fname, to_node, msg))
2847 lu.proc.LogWarning(msg)
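# Illustrative sketch (not part of the original module): the distribution
# above is "for each candidate file that exists locally, push it to every
# node and log, rather than fail on, per-node errors". A skeleton with
# hypothetical upload_fn/log_fn callbacks:

def _ExampleDistributeFiles(filenames, nodes, upload_fn, log_fn):
  """Upload each existing file to all nodes, logging failures."""
  for fname in filenames:
    if not os.path.exists(fname):
      continue  # files not used on this setup are silently skipped
    for node in nodes:
      err = upload_fn(node, fname)  # assumed to return None on success
      if err:
        log_fn("Copy of file %s to node %s failed: %s" % (fname, node, err))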
2850 class LURedistributeConfig(NoHooksLU):
2851 """Force the redistribution of cluster configuration.
2853 This is a very simple LU.
2858 def ExpandNames(self):
2859 self.needed_locks = {
2860 locking.LEVEL_NODE: locking.ALL_SET,
2862 self.share_locks[locking.LEVEL_NODE] = 1
2864 def Exec(self, feedback_fn):
2865 """Redistribute the configuration.
2868 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
2869 _RedistributeAncillaryFiles(self)
2872 def _WaitForSync(lu, instance, disks=None, oneshot=False):
2873 """Sleep and poll for an instance's disk to sync.
2876   if not instance.disks or disks is not None and not disks:
2877     return True
2879 disks = _ExpandCheckDisks(instance, disks)
2881   if not oneshot:
2882     lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
2884 node = instance.primary_node
2886   for dev in disks:
2887     lu.cfg.SetDiskID(dev, node)
2889 # TODO: Convert to utils.Retry
2891   retries = 0
2892   degr_retries = 10 # in seconds, as we sleep 1 second each time
2893   while True:
2894     max_time = 0
2895     done = True
2896     cumul_degraded = False
2897 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
2898 msg = rstats.fail_msg
2899     if msg:
2900       lu.LogWarning("Can't get any data from node %s: %s", node, msg)
2901       retries += 1
2902       if retries >= 10:
2903         raise errors.RemoteError("Can't contact node %s for mirror data,"
2904                                  " aborting." % node)
2905       time.sleep(6)
2906       continue
2907     rstats = rstats.payload
2908     retries = 0
2909 for i, mstat in enumerate(rstats):
2910       if mstat is None:
2911         lu.LogWarning("Can't compute data for node %s/%s",
2912                       node, disks[i].iv_name)
2913         continue
2915 cumul_degraded = (cumul_degraded or
2916 (mstat.is_degraded and mstat.sync_percent is None))
2917       if mstat.sync_percent is not None:
2918         done = False
2919 if mstat.estimated_time is not None:
2920 rem_time = ("%s remaining (estimated)" %
2921 utils.FormatSeconds(mstat.estimated_time))
2922 max_time = mstat.estimated_time
2923         else:
2924           rem_time = "no time estimate"
2925 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
2926 (disks[i].iv_name, mstat.sync_percent, rem_time))
2928 # if we're done but degraded, let's do a few small retries, to
2929 # make sure we see a stable and not transient situation; therefore
2930 # we force restart of the loop
2931 if (done or oneshot) and cumul_degraded and degr_retries > 0:
2932 logging.info("Degraded disks found, %d retries left", degr_retries)
2933       degr_retries -= 1
2934       time.sleep(1)
2935       continue
2937     if done or oneshot:
2938       break
2940     time.sleep(min(60, max_time))
2942   if not oneshot:
2943     lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
2944 return not cumul_degraded
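# Illustrative sketch (not part of the original module): the loop above is a
# poll-with-grace-period pattern -- when the sync looks finished but still
# degraded, it re-checks a few times so a transient degradation is not
# reported as a stable one. A generic, hypothetical rendition:

def _ExamplePollUntilStable(check_fn, degr_retries=10, delay=1.0):
  """Poll check_fn() -> (done, degraded); True if stably non-degraded."""
  while True:
    (done, degraded) = check_fn()
    if done and degraded and degr_retries > 0:
      degr_retries -= 1  # grace period for transient degradation
      time.sleep(delay)
      continue
    if done:
      return not degraded
    time.sleep(delay)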
2947 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
2948 """Check that mirrors are not degraded.
2950 The ldisk parameter, if True, will change the test from the
2951 is_degraded attribute (which represents overall non-ok status for
2952 the device(s)) to the ldisk (representing the local storage status).
2955 lu.cfg.SetDiskID(dev, node)
2957   result = True
2959   if on_primary or dev.AssembleOnSecondary():
2960 rstats = lu.rpc.call_blockdev_find(node, dev)
2961 msg = rstats.fail_msg
2962     if msg:
2963       lu.LogWarning("Can't find disk on node %s: %s", node, msg)
2964       result = False
2965 elif not rstats.payload:
2966       lu.LogWarning("Can't find disk on node %s", node)
2967       result = False
2968     else:
2969       if ldisk:
2970         result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
2971       else:
2972         result = result and not rstats.payload.is_degraded
2974   if dev.children:
2975     for child in dev.children:
2976       result = result and _CheckDiskConsistency(lu, child, node, on_primary)
2978   return result
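# Illustrative sketch (not part of the original module): the ldisk flag above
# merely switches the predicate from overall device health to the local disk
# state. With a hypothetical status object:

def _ExampleDiskOk(status, ldisk, lds_okay="ok"):
  """Mirror the predicate selection in _CheckDiskConsistency."""
  if ldisk:
    return status.ldisk_status == lds_okay
  return not status.is_degraded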
2981 class LUDiagnoseOS(NoHooksLU):
2982 """Logical unit for OS diagnose/query.
2987 ("names", _EmptyList, _TListOf(_TNonEmptyString)),
2990 _FIELDS_STATIC = utils.FieldSet()
2991 _FIELDS_DYNAMIC = utils.FieldSet("name", "valid", "node_status", "variants",
2992 "parameters", "api_versions")
2994 def CheckArguments(self):
2995     if self.op.names:
2996       raise errors.OpPrereqError("Selective OS query not supported",
2997                                  errors.ECODE_INVAL)
2999 _CheckOutputFields(static=self._FIELDS_STATIC,
3000 dynamic=self._FIELDS_DYNAMIC,
3001 selected=self.op.output_fields)
3003 def ExpandNames(self):
3004 # Lock all nodes, in shared mode
3005 # Temporary removal of locks, should be reverted later
3006 # TODO: reintroduce locks when they are lighter-weight
3007 self.needed_locks = {}
3008 #self.share_locks[locking.LEVEL_NODE] = 1
3009 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3011   @staticmethod
3012   def _DiagnoseByOS(rlist):
3013     """Remaps a per-node return list into a per-os per-node dictionary
3015 @param rlist: a map with node names as keys and OS objects as values
3018 @return: a dictionary with osnames as keys and as value another
3019 map, with nodes as keys and tuples of (path, status, diagnose,
3020 variants, parameters, api_versions) as values, eg::
3022 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
3023 (/srv/..., False, "invalid api")],
3024 "node2": [(/srv/..., True, "", [], [])]}
3027     """
3028     all_os = {}
3029     # we build here the list of nodes that didn't fail the RPC (at RPC
3030 # level), so that nodes with a non-responding node daemon don't
3031 # make all OSes invalid
3032 good_nodes = [node_name for node_name in rlist
3033 if not rlist[node_name].fail_msg]
3034 for node_name, nr in rlist.items():
3035 if nr.fail_msg or not nr.payload:
3036         continue
3037       for (name, path, status, diagnose, variants,
3038 params, api_versions) in nr.payload:
3039 if name not in all_os:
3040 # build a list of nodes for this os containing empty lists
3041 # for each node in node_list
3042         all_os[name] = {}
3043         for nname in good_nodes:
3044 all_os[name][nname] = []
3045 # convert params from [name, help] to (name, help)
3046 params = [tuple(v) for v in params]
3047 all_os[name][node_name].append((path, status, diagnose,
3048                                       variants, params, api_versions))
3049     return all_os
3051 def Exec(self, feedback_fn):
3052 """Compute the list of OSes.
3055 valid_nodes = [node for node in self.cfg.GetOnlineNodeList()]
3056 node_data = self.rpc.call_os_diagnose(valid_nodes)
3057 pol = self._DiagnoseByOS(node_data)
3058     output = []
3060     for os_name, os_data in pol.items():
3061       row = []
3062       valid = True
3063       (variants, params, api_versions) = null_state = (set(), set(), set())
3064 for idx, osl in enumerate(os_data.values()):
3065 valid = bool(valid and osl and osl[0][1])
3066         if not valid:
3067           (variants, params, api_versions) = null_state
3068           break
3069 node_variants, node_params, node_api = osl[0][3:6]
3070 if idx == 0: # first entry
3071 variants = set(node_variants)
3072 params = set(node_params)
3073 api_versions = set(node_api)
3074 else: # keep consistency
3075 variants.intersection_update(node_variants)
3076 params.intersection_update(node_params)
3077 api_versions.intersection_update(node_api)
3079 for field in self.op.output_fields:
3080         if field == "name":
3081           val = os_name
3082         elif field == "valid":
3083           val = valid
3084         elif field == "node_status":
3085 # this is just a copy of the dict
3086           val = {}
3087           for node_name, nos_list in os_data.items():
3088 val[node_name] = nos_list
3089 elif field == "variants":
3090 val = list(variants)
3091 elif field == "parameters":
3092           val = list(params)
3093         elif field == "api_versions":
3094 val = list(api_versions)
3095         else:
3096           raise errors.ParameterError(field)
3097         row.append(val)
3098       output.append(row)
3100     return output
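# Illustrative sketch (not part of the original module): LUDiagnoseOS.Exec
# above computes the cluster-wide view of an OS by seeding variant/parameter
# sets from the first node and intersecting every further node in. The core
# of that reduction, stdlib-only:

def _ExampleCommonSet(values_by_node):
  """Intersect per-node value lists into the set all nodes agree on."""
  common = None
  for node_values in values_by_node.values():
    if common is None:
      common = set(node_values)  # first node seeds the result
    else:
      common.intersection_update(node_values)
  return common or set()

# _ExampleCommonSet({"n1": ["lenny", "squeeze"], "n2": ["squeeze"]})
# == set(["squeeze"])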
3103 class LURemoveNode(LogicalUnit):
3104 """Logical unit for removing a node.
3107 HPATH = "node-remove"
3108 HTYPE = constants.HTYPE_NODE
3113 def BuildHooksEnv(self):
3116 This doesn't run on the target node in the pre phase as a failed
3117 node would then be impossible to remove.
3120     env = {
3121       "OP_TARGET": self.op.node_name,
3122       "NODE_NAME": self.op.node_name,
3123       }
3124 all_nodes = self.cfg.GetNodeList()
3125     try:
3126       all_nodes.remove(self.op.node_name)
3127     except ValueError:
3128 logging.warning("Node %s which is about to be removed not found"
3129 " in the all nodes list", self.op.node_name)
3130 return env, all_nodes, all_nodes
3132 def CheckPrereq(self):
3133 """Check prerequisites.
3135     This checks:
3136      - the node exists in the configuration
3137 - it does not have primary or secondary instances
3138 - it's not the master
3140 Any errors are signaled by raising errors.OpPrereqError.
3143 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3144 node = self.cfg.GetNodeInfo(self.op.node_name)
3145 assert node is not None
3147 instance_list = self.cfg.GetInstanceList()
3149 masternode = self.cfg.GetMasterNode()
3150 if node.name == masternode:
3151 raise errors.OpPrereqError("Node is the master node,"
3152                                  " you need to failover first.",
3153                                  errors.ECODE_INVAL)
3155 for instance_name in instance_list:
3156 instance = self.cfg.GetInstanceInfo(instance_name)
3157 if node.name in instance.all_nodes:
3158 raise errors.OpPrereqError("Instance %s is still running on the node,"
3159                                    " please remove first." % instance_name,
3160                                    errors.ECODE_INVAL)
3161     self.op.node_name = node.name
3162     self.node = node
3164 def Exec(self, feedback_fn):
3165 """Removes the node from the cluster.
3168     node = self.node
3169     logging.info("Stopping the node daemon and removing configs from node %s",
3170                  node.name)
3172 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
3174 # Promote nodes to master candidate as needed
3175 _AdjustCandidatePool(self, exceptions=[node.name])
3176 self.context.RemoveNode(node.name)
3178 # Run post hooks on the node before it's removed
3179 hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
3180     try:
3181       hm.RunPhase(constants.HOOKS_PHASE_POST, [node.name])
3182     except:
3183 # pylint: disable-msg=W0702
3184 self.LogWarning("Errors occurred running hooks on %s" % node.name)
3186 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
3187 msg = result.fail_msg
3188     if msg:
3189       self.LogWarning("Errors encountered on the remote node while leaving"
3190 " the cluster: %s", msg)
3192 # Remove node from our /etc/hosts
3193 if self.cfg.GetClusterInfo().modify_etc_hosts:
3194 # FIXME: this should be done via an rpc call to node daemon
3195 utils.RemoveHostFromEtcHosts(node.name)
3196 _RedistributeAncillaryFiles(self)
3199 class LUQueryNodes(NoHooksLU):
3200 """Logical unit for querying nodes.
3203 # pylint: disable-msg=W0142
3206 ("names", _EmptyList, _TListOf(_TNonEmptyString)),
3207 ("use_locking", False, _TBool),
3211 _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
3212 "master_candidate", "offline", "drained"]
3214 _FIELDS_DYNAMIC = utils.FieldSet(
3215     "dtotal", "dfree",
3216     "mtotal", "mnode", "mfree",
3217     "bootid",
3218     "ctotal", "cnodes", "csockets",
3219     )
3221 _FIELDS_STATIC = utils.FieldSet(*[
3222 "pinst_cnt", "sinst_cnt",
3223 "pinst_list", "sinst_list",
3224 "pip", "sip", "tags",
3225     "master",
3226     "role"] + _SIMPLE_FIELDS
3227     )
3229 def CheckArguments(self):
3230 _CheckOutputFields(static=self._FIELDS_STATIC,
3231 dynamic=self._FIELDS_DYNAMIC,
3232 selected=self.op.output_fields)
3234 def ExpandNames(self):
3235 self.needed_locks = {}
3236 self.share_locks[locking.LEVEL_NODE] = 1
3238     if self.op.names:
3239       self.wanted = _GetWantedNodes(self, self.op.names)
3240     else:
3241       self.wanted = locking.ALL_SET
3243 self.do_node_query = self._FIELDS_STATIC.NonMatching(self.op.output_fields)
3244 self.do_locking = self.do_node_query and self.op.use_locking
3245     if self.do_locking:
3246       # if we don't request only static fields, we need to lock the nodes
3247       self.needed_locks[locking.LEVEL_NODE] = self.wanted
3249 def Exec(self, feedback_fn):
3250 """Computes the list of nodes and their attributes.
3253 all_info = self.cfg.GetAllNodesInfo()
3254     if self.do_locking:
3255       nodenames = self.acquired_locks[locking.LEVEL_NODE]
3256 elif self.wanted != locking.ALL_SET:
3257 nodenames = self.wanted
3258 missing = set(nodenames).difference(all_info.keys())
3259       if missing:
3260         raise errors.OpExecError(
3261 "Some nodes were removed before retrieving their data: %s" % missing)
3262     else:
3263       nodenames = all_info.keys()
3265 nodenames = utils.NiceSort(nodenames)
3266 nodelist = [all_info[name] for name in nodenames]
3268 # begin data gathering
3270 if self.do_node_query:
3271       live_data = {}
3272       node_data = self.rpc.call_node_info(nodenames, self.cfg.GetVGName(),
3273 self.cfg.GetHypervisorType())
3274 for name in nodenames:
3275 nodeinfo = node_data[name]
3276 if not nodeinfo.fail_msg and nodeinfo.payload:
3277 nodeinfo = nodeinfo.payload
3278 fn = utils.TryConvert
3279         live_data[name] = {
3280           "mtotal": fn(int, nodeinfo.get('memory_total', None)),
3281 "mnode": fn(int, nodeinfo.get('memory_dom0', None)),
3282 "mfree": fn(int, nodeinfo.get('memory_free', None)),
3283 "dtotal": fn(int, nodeinfo.get('vg_size', None)),
3284 "dfree": fn(int, nodeinfo.get('vg_free', None)),
3285 "ctotal": fn(int, nodeinfo.get('cpu_total', None)),
3286 "bootid": nodeinfo.get('bootid', None),
3287 "cnodes": fn(int, nodeinfo.get('cpu_nodes', None)),
3288           "csockets": fn(int, nodeinfo.get('cpu_sockets', None)),
3289           }
3290         else:
3291           live_data[name] = {}
3292     else:
3293       live_data = dict.fromkeys(nodenames, {})
3295 node_to_primary = dict([(name, set()) for name in nodenames])
3296 node_to_secondary = dict([(name, set()) for name in nodenames])
3298 inst_fields = frozenset(("pinst_cnt", "pinst_list",
3299 "sinst_cnt", "sinst_list"))
3300 if inst_fields & frozenset(self.op.output_fields):
3301 inst_data = self.cfg.GetAllInstancesInfo()
3303 for inst in inst_data.values():
3304 if inst.primary_node in node_to_primary:
3305 node_to_primary[inst.primary_node].add(inst.name)
3306 for secnode in inst.secondary_nodes:
3307 if secnode in node_to_secondary:
3308 node_to_secondary[secnode].add(inst.name)
3310 master_node = self.cfg.GetMasterNode()
3312 # end data gathering
3314     output = []
3315     for node in nodelist:
3316       node_output = []
3317       for field in self.op.output_fields:
3318 if field in self._SIMPLE_FIELDS:
3319 val = getattr(node, field)
3320 elif field == "pinst_list":
3321 val = list(node_to_primary[node.name])
3322 elif field == "sinst_list":
3323 val = list(node_to_secondary[node.name])
3324 elif field == "pinst_cnt":
3325 val = len(node_to_primary[node.name])
3326 elif field == "sinst_cnt":
3327 val = len(node_to_secondary[node.name])
3328 elif field == "pip":
3329 val = node.primary_ip
3330 elif field == "sip":
3331 val = node.secondary_ip
3332 elif field == "tags":
3333 val = list(node.GetTags())
3334 elif field == "master":
3335 val = node.name == master_node
3336 elif self._FIELDS_DYNAMIC.Matches(field):
3337 val = live_data[node.name].get(field, None)
3338 elif field == "role":
3339           if node.name == master_node:
3340             val = "master"
3341           elif node.master_candidate:
3342             val = "master-candidate"
3343           elif node.drained:
3344             val = "drained"
3345           elif node.offline:
3346             val = "offline"
3347           else:
3348             val = "regular"
3349         else:
3350           raise errors.ParameterError(field)
3351 node_output.append(val)
3352       output.append(node_output)
3354     return output
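# Illustrative sketch (not part of the original module): the "fn" alias above
# (utils.TryConvert) is roughly "convert if possible, else hand back the raw
# value", which keeps one broken field from discarding a node's whole row:

def _ExampleTryConvert(fn, val):
  """Return fn(val), or val unchanged if the conversion raises."""
  try:
    return fn(val)
  except (ValueError, TypeError):
    return val

# _ExampleTryConvert(int, "42") == 42
# _ExampleTryConvert(int, None) is None  (the TypeError is swallowed)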
3357 class LUQueryNodeVolumes(NoHooksLU):
3358 """Logical unit for getting volumes on node(s).
3362 ("nodes", _EmptyList, _TListOf(_TNonEmptyString)),
3363 ("output_fields", _NoDefault, _TListOf(_TNonEmptyString)),
3366 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
3367 _FIELDS_STATIC = utils.FieldSet("node")
3369 def CheckArguments(self):
3370 _CheckOutputFields(static=self._FIELDS_STATIC,
3371 dynamic=self._FIELDS_DYNAMIC,
3372 selected=self.op.output_fields)
3374 def ExpandNames(self):
3375 self.needed_locks = {}
3376 self.share_locks[locking.LEVEL_NODE] = 1
3377 if not self.op.nodes:
3378 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3379     else:
3380       self.needed_locks[locking.LEVEL_NODE] = \
3381         _GetWantedNodes(self, self.op.nodes)
3383 def Exec(self, feedback_fn):
3384 """Computes the list of nodes and their attributes.
3387 nodenames = self.acquired_locks[locking.LEVEL_NODE]
3388 volumes = self.rpc.call_node_volumes(nodenames)
3390 ilist = [self.cfg.GetInstanceInfo(iname) for iname
3391 in self.cfg.GetInstanceList()]
3393 lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])
3395     output = []
3396     for node in nodenames:
3397 nresult = volumes[node]
3398       if nresult.offline:
3399         continue
3400       msg = nresult.fail_msg
3401       if msg:
3402         self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
3403         continue
3405 node_vols = nresult.payload[:]
3406 node_vols.sort(key=lambda vol: vol['dev'])
3408 for vol in node_vols:
3409         node_output = []
3410         for field in self.op.output_fields:
3411           if field == "node":
3412             val = node
3413           elif field == "phys":
3414             val = vol['dev']
3415           elif field == "vg":
3416             val = vol['vg']
3417           elif field == "name":
3418             val = vol['name']
3419           elif field == "size":
3420 val = int(float(vol['size']))
3421 elif field == "instance":
3422             for inst in ilist:
3423               if node not in lv_by_node[inst]:
3424                 continue
3425               if vol['name'] in lv_by_node[inst][node]:
3426                 break
3427             else:
3428               inst = None
3429             val = inst and inst.name
3430           else:
3431             raise errors.ParameterError(field)
3432 node_output.append(str(val))
3434         output.append(node_output)
3436     return output
3439 class LUQueryNodeStorage(NoHooksLU):
3440 """Logical unit for getting information on storage units on node(s).
3443 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
3445 ("nodes", _EmptyList, _TListOf(_TNonEmptyString)),
3446 ("storage_type", _NoDefault, _CheckStorageType),
3447 ("output_fields", _NoDefault, _TListOf(_TNonEmptyString)),
3448 ("name", None, _TMaybeString),
3452 def CheckArguments(self):
3453 _CheckOutputFields(static=self._FIELDS_STATIC,
3454 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
3455 selected=self.op.output_fields)
3457 def ExpandNames(self):
3458 self.needed_locks = {}
3459 self.share_locks[locking.LEVEL_NODE] = 1
3461     if self.op.nodes:
3462       self.needed_locks[locking.LEVEL_NODE] = \
3463         _GetWantedNodes(self, self.op.nodes)
3464     else:
3465       self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3467 def Exec(self, feedback_fn):
3468 """Computes the list of nodes and their attributes.
3471 self.nodes = self.acquired_locks[locking.LEVEL_NODE]
3473 # Always get name to sort by
3474 if constants.SF_NAME in self.op.output_fields:
3475 fields = self.op.output_fields[:]
3476     else:
3477       fields = [constants.SF_NAME] + self.op.output_fields
3479 # Never ask for node or type as it's only known to the LU
3480 for extra in [constants.SF_NODE, constants.SF_TYPE]:
3481 while extra in fields:
3482 fields.remove(extra)
3484 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
3485 name_idx = field_idx[constants.SF_NAME]
3487 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
3488 data = self.rpc.call_storage_list(self.nodes,
3489 self.op.storage_type, st_args,
3490 self.op.name, fields)
3492     result = []
3494     for node in utils.NiceSort(self.nodes):
3495 nresult = data[node]
3496       if nresult.offline:
3497         continue
3499       msg = nresult.fail_msg
3500       if msg:
3501         self.LogWarning("Can't get storage data from node %s: %s", node, msg)
3502         continue
3504 rows = dict([(row[name_idx], row) for row in nresult.payload])
3506 for name in utils.NiceSort(rows.keys()):
3507         row = rows[name]
3509         out = []
3511         for field in self.op.output_fields:
3512 if field == constants.SF_NODE:
3513             val = node
3514           elif field == constants.SF_TYPE:
3515 val = self.op.storage_type
3516 elif field in field_idx:
3517 val = row[field_idx[field]]
3518           else:
3519             raise errors.ParameterError(field)
3521           out.append(val)
3523         result.append(out)
3525     return result
3528 class LUModifyNodeStorage(NoHooksLU):
3529 """Logical unit for modifying a storage volume on a node.
3534 ("storage_type", _NoDefault, _CheckStorageType),
3535 ("name", _NoDefault, _TNonEmptyString),
3536 ("changes", _NoDefault, _TDict),
3540 def CheckArguments(self):
3541 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3543 storage_type = self.op.storage_type
3545     try:
3546       modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
3547     except KeyError:
3548       raise errors.OpPrereqError("Storage units of type '%s' can not be"
3549 " modified" % storage_type,
3552 diff = set(self.op.changes.keys()) - modifiable
3553     if diff:
3554       raise errors.OpPrereqError("The following fields can not be modified for"
3555 " storage units of type '%s': %r" %
3556 (storage_type, list(diff)),
3559 def ExpandNames(self):
3560 self.needed_locks = {
3561 locking.LEVEL_NODE: self.op.node_name,
3564 def Exec(self, feedback_fn):
3565 """Computes the list of nodes and their attributes.
3568 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
3569 result = self.rpc.call_storage_modify(self.op.node_name,
3570 self.op.storage_type, st_args,
3571 self.op.name, self.op.changes)
3572 result.Raise("Failed to modify storage unit '%s' on %s" %
3573 (self.op.name, self.op.node_name))
3576 class LUAddNode(LogicalUnit):
3577 """Logical unit for adding node to the cluster.
3581 HTYPE = constants.HTYPE_NODE
3584 ("primary_ip", None, _NoType),
3585 ("secondary_ip", None, _TMaybeString),
3586 ("readd", False, _TBool),
3589 def CheckArguments(self):
3590 # validate/normalize the node name
3591 self.op.node_name = utils.HostInfo.NormalizeName(self.op.node_name)
3593 def BuildHooksEnv(self):
3596 This will run on all nodes before, and on all nodes + the new node after.
3599     env = {
3600       "OP_TARGET": self.op.node_name,
3601 "NODE_NAME": self.op.node_name,
3602 "NODE_PIP": self.op.primary_ip,
3603       "NODE_SIP": self.op.secondary_ip,
3604       }
3605 nodes_0 = self.cfg.GetNodeList()
3606 nodes_1 = nodes_0 + [self.op.node_name, ]
3607 return env, nodes_0, nodes_1
3609 def CheckPrereq(self):
3610 """Check prerequisites.
3612     This checks:
3613      - the new node is not already in the config
3614      - it is resolvable
3615      - its parameters (single/dual homed) match the cluster
3617 Any errors are signaled by raising errors.OpPrereqError.
3620     node_name = self.op.node_name
3621     cfg = self.cfg
3623 dns_data = utils.GetHostInfo(node_name)
3625 node = dns_data.name
3626 primary_ip = self.op.primary_ip = dns_data.ip
3627 if self.op.secondary_ip is None:
3628 self.op.secondary_ip = primary_ip
3629 if not utils.IsValidIP4(self.op.secondary_ip):
3630       raise errors.OpPrereqError("Invalid secondary IP given",
3631                                  errors.ECODE_INVAL)
3632 secondary_ip = self.op.secondary_ip
3634 node_list = cfg.GetNodeList()
3635 if not self.op.readd and node in node_list:
3636 raise errors.OpPrereqError("Node %s is already in the configuration" %
3637 node, errors.ECODE_EXISTS)
3638 elif self.op.readd and node not in node_list:
3639       raise errors.OpPrereqError("Node %s is not in the configuration" % node,
3640                                  errors.ECODE_NOENT)
3642 self.changed_primary_ip = False
3644 for existing_node_name in node_list:
3645 existing_node = cfg.GetNodeInfo(existing_node_name)
3647 if self.op.readd and node == existing_node_name:
3648 if existing_node.secondary_ip != secondary_ip:
3649 raise errors.OpPrereqError("Readded node doesn't have the same IP"
3650                                      " address configuration as before",
3651                                      errors.ECODE_INVAL)
3652 if existing_node.primary_ip != primary_ip:
3653           self.changed_primary_ip = True
3655         continue
3657 if (existing_node.primary_ip == primary_ip or
3658 existing_node.secondary_ip == primary_ip or
3659 existing_node.primary_ip == secondary_ip or
3660 existing_node.secondary_ip == secondary_ip):
3661 raise errors.OpPrereqError("New node ip address(es) conflict with"
3662 " existing node %s" % existing_node.name,
3663 errors.ECODE_NOTUNIQUE)
3665 # check that the type of the node (single versus dual homed) is the
3666 # same as for the master
3667 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
3668 master_singlehomed = myself.secondary_ip == myself.primary_ip
3669 newbie_singlehomed = secondary_ip == primary_ip
3670 if master_singlehomed != newbie_singlehomed:
3671 if master_singlehomed:
3672 raise errors.OpPrereqError("The master has no private ip but the"
3673                                    " new node has one",
3674                                    errors.ECODE_INVAL)
3675       else:
3676 raise errors.OpPrereqError("The master has a private ip but the"
3677                                    " new node doesn't have one",
3678                                    errors.ECODE_INVAL)
3680 # checks reachability
3681 if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
3682 raise errors.OpPrereqError("Node not reachable by ping",
3683 errors.ECODE_ENVIRON)
3685 if not newbie_singlehomed:
3686 # check reachability from my secondary ip to newbie's secondary ip
3687 if not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
3688 source=myself.secondary_ip):
3689 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
3690 " based ping to noded port",
3691 errors.ECODE_ENVIRON)
3693     if self.op.readd:
3694       exceptions = [node]
3695     else:
3696       exceptions = []
3698     self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
3700     if self.op.readd:
3701       self.new_node = self.cfg.GetNodeInfo(node)
3702 assert self.new_node is not None, "Can't retrieve locked node %s" % node
3703     else:
3704       self.new_node = objects.Node(name=node,
3705 primary_ip=primary_ip,
3706 secondary_ip=secondary_ip,
3707 master_candidate=self.master_candidate,
3708 offline=False, drained=False)
3710 def Exec(self, feedback_fn):
3711 """Adds the new node to the cluster.
3714 new_node = self.new_node
3715 node = new_node.name
3717 # for re-adds, reset the offline/drained/master-candidate flags;
3718 # we need to reset here, otherwise offline would prevent RPC calls
3719 # later in the procedure; this also means that if the re-add
3720 # fails, we are left with a non-offlined, broken node
3721     if self.op.readd:
3722       new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
3723 self.LogInfo("Readding a node, the offline/drained flags were reset")
3724 # if we demote the node, we do cleanup later in the procedure
3725 new_node.master_candidate = self.master_candidate
3726 if self.changed_primary_ip:
3727 new_node.primary_ip = self.op.primary_ip
3729 # notify the user about any possible mc promotion
3730 if new_node.master_candidate:
3731 self.LogInfo("Node will be a master candidate")
3733 # check connectivity
3734 result = self.rpc.call_version([node])[node]
3735 result.Raise("Can't get version information from node %s" % node)
3736 if constants.PROTOCOL_VERSION == result.payload:
3737 logging.info("Communication to node %s fine, sw version %s match",
3738 node, result.payload)
3740 raise errors.OpExecError("Version mismatch master version %s,"
3741 " node version %s" %
3742 (constants.PROTOCOL_VERSION, result.payload))
3745 if self.cfg.GetClusterInfo().modify_ssh_setup:
3746 logging.info("Copy ssh key to node %s", node)
3747 priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
3749 keyfiles = [constants.SSH_HOST_DSA_PRIV, constants.SSH_HOST_DSA_PUB,
3750 constants.SSH_HOST_RSA_PRIV, constants.SSH_HOST_RSA_PUB,
3751                   priv_key, pub_key]
3752       keyarray = []
3753       for i in keyfiles:
3754         keyarray.append(utils.ReadFile(i))
3756 result = self.rpc.call_node_add(node, keyarray[0], keyarray[1],
3757 keyarray[2], keyarray[3], keyarray[4],
3758                                       keyarray[5])
3759     result.Raise("Cannot transfer ssh keys to the new node")
3761 # Add node to our /etc/hosts, and add key to known_hosts
3762 if self.cfg.GetClusterInfo().modify_etc_hosts:
3763 # FIXME: this should be done via an rpc call to node daemon
3764 utils.AddHostToEtcHosts(new_node.name)
3766 if new_node.secondary_ip != new_node.primary_ip:
3767 result = self.rpc.call_node_has_ip_address(new_node.name,
3768 new_node.secondary_ip)
3769 result.Raise("Failure checking secondary ip on node %s" % new_node.name,
3770 prereq=True, ecode=errors.ECODE_ENVIRON)
3771 if not result.payload:
3772 raise errors.OpExecError("Node claims it doesn't have the secondary ip"
3773 " you gave (%s). Please fix and re-run this"
3774 " command." % new_node.secondary_ip)
3776 node_verify_list = [self.cfg.GetMasterNode()]
3777 node_verify_param = {
3778 constants.NV_NODELIST: [node],
3779 # TODO: do a node-net-test as well?
3782 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
3783 self.cfg.GetClusterName())
3784 for verifier in node_verify_list:
3785 result[verifier].Raise("Cannot communicate with node %s" % verifier)
3786 nl_payload = result[verifier].payload[constants.NV_NODELIST]
3787       if nl_payload:
3788         for failed in nl_payload:
3789 feedback_fn("ssh/hostname verification failed"
3790 " (checking from %s): %s" %
3791 (verifier, nl_payload[failed]))
3792 raise errors.OpExecError("ssh/hostname verification failed.")
3794     if self.op.readd:
3795       _RedistributeAncillaryFiles(self)
3796 self.context.ReaddNode(new_node)
3797 # make sure we redistribute the config
3798 self.cfg.Update(new_node, feedback_fn)
3799 # and make sure the new node will not have old files around
3800 if not new_node.master_candidate:
3801 result = self.rpc.call_node_demote_from_mc(new_node.name)
3802 msg = result.fail_msg
3804 self.LogWarning("Node failed to demote itself from master"
3805 " candidate status: %s" % msg)
3806     else:
3807       _RedistributeAncillaryFiles(self, additional_nodes=[node])
3808 self.context.AddNode(new_node, self.proc.GetECId())
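# Illustrative sketch (not part of the original module): the single- versus
# dual-homed test in CheckPrereq above reduces to "the new node must use a
# separate secondary IP exactly when the master does". In isolation:

def _ExampleHomingMatches(master_pip, master_sip, new_pip, new_sip):
  """True when the new node and the master are homed the same way."""
  master_singlehomed = master_sip == master_pip
  newbie_singlehomed = new_sip == new_pip
  return master_singlehomed == newbie_singlehomed

# _ExampleHomingMatches("192.0.2.1", "192.0.2.1",
#                       "192.0.2.2", "192.0.2.2") == True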
3811 class LUSetNodeParams(LogicalUnit):
3812 """Modifies the parameters of a node.
3815 HPATH = "node-modify"
3816 HTYPE = constants.HTYPE_NODE
3819 ("master_candidate", None, _TMaybeBool),
3820 ("offline", None, _TMaybeBool),
3821 ("drained", None, _TMaybeBool),
3822 ("auto_promote", False, _TBool),
3827 def CheckArguments(self):
3828 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3829 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained]
3830 if all_mods.count(None) == 3:
3831 raise errors.OpPrereqError("Please pass at least one modification",
3833 if all_mods.count(True) > 1:
3834 raise errors.OpPrereqError("Can't set the node into more than one"
3835 " state at the same time",
3838 # Boolean value that tells us whether we're offlining or draining the node
3839 self.offline_or_drain = (self.op.offline == True or
3840 self.op.drained == True)
3841 self.deoffline_or_drain = (self.op.offline == False or
3842 self.op.drained == False)
3843 self.might_demote = (self.op.master_candidate == False or
3844 self.offline_or_drain)
3846 self.lock_all = self.op.auto_promote and self.might_demote
3849 def ExpandNames(self):
3850     if self.lock_all:
3851       self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
3852     else:
3853       self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
3855 def BuildHooksEnv(self):
3858 This runs on the master node.
3860     """
3861     env = {
3862       "OP_TARGET": self.op.node_name,
3863 "MASTER_CANDIDATE": str(self.op.master_candidate),
3864 "OFFLINE": str(self.op.offline),
3865 "DRAINED": str(self.op.drained),
3866       }
3867     nl = [self.cfg.GetMasterNode(),
3868           self.op.node_name]
3869     return env, nl, nl
3871 def CheckPrereq(self):
3872 """Check prerequisites.
3874 This only checks the instance list against the existing names.
3877 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
3879 if (self.op.master_candidate is not None or
3880 self.op.drained is not None or
3881 self.op.offline is not None):
3882 # we can't change the master's node flags
3883 if self.op.node_name == self.cfg.GetMasterNode():
3884 raise errors.OpPrereqError("The master role can be changed"
3885 " only via masterfailover",
3889 if node.master_candidate and self.might_demote and not self.lock_all:
3890 assert not self.op.auto_promote, "auto-promote set but lock_all not"
3891       # check if after removing the current node, we're missing master
3892       # candidates
3893       (mc_remaining, mc_should, _) = \
3894 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
3895 if mc_remaining < mc_should:
3896 raise errors.OpPrereqError("Not enough master candidates, please"
3897 " pass auto_promote to allow promotion",
3900 if (self.op.master_candidate == True and
3901 ((node.offline and not self.op.offline == False) or
3902 (node.drained and not self.op.drained == False))):
3903 raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
3904 " to master_candidate" % node.name,
3907 # If we're being deofflined/drained, we'll MC ourself if needed
3908 if (self.deoffline_or_drain and not self.offline_or_drain and not
3909 self.op.master_candidate == True and not node.master_candidate):
3910 self.op.master_candidate = _DecideSelfPromotion(self)
3911 if self.op.master_candidate:
3912 self.LogInfo("Autopromoting node to master candidate")
3916 def Exec(self, feedback_fn):
3925 if self.op.offline is not None:
3926 node.offline = self.op.offline
3927 result.append(("offline", str(self.op.offline)))
3928 if self.op.offline == True:
3929 if node.master_candidate:
3930 node.master_candidate = False
3932 result.append(("master_candidate", "auto-demotion due to offline"))
3933 if node.drained:
3934 node.drained = False
3935 result.append(("drained", "clear drained status due to offline"))
3937 if self.op.master_candidate is not None:
3938 node.master_candidate = self.op.master_candidate
3940 result.append(("master_candidate", str(self.op.master_candidate)))
3941 if self.op.master_candidate == False:
3942 rrc = self.rpc.call_node_demote_from_mc(node.name)
3943 msg = rrc.fail_msg
3944 if msg:
3945 self.LogWarning("Node failed to demote itself: %s" % msg)
3947 if self.op.drained is not None:
3948 node.drained = self.op.drained
3949 result.append(("drained", str(self.op.drained)))
3950 if self.op.drained == True:
3951 if node.master_candidate:
3952 node.master_candidate = False
3954 result.append(("master_candidate", "auto-demotion due to drain"))
3955 rrc = self.rpc.call_node_demote_from_mc(node.name)
3956 msg = rrc.fail_msg
3957 if msg:
3958 self.LogWarning("Node failed to demote itself: %s" % msg)
3959 if node.offline:
3960 node.offline = False
3961 result.append(("offline", "clear offline status due to drain"))
3963 # we locked all nodes, we adjust the CP before updating this node
3964 if self.lock_all:
3965 _AdjustCandidatePool(self, [node.name])
3967 # this will trigger configuration file update, if needed
3968 self.cfg.Update(node, feedback_fn)
3970 # this will trigger job queue propagation or cleanup
3972 self.context.ReaddNode(node)
3977 class LUPowercycleNode(NoHooksLU):
3978 """Powercycles a node.
3987 def CheckArguments(self):
3988 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3989 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
3990 raise errors.OpPrereqError("The node is the master and the force"
3991 " parameter was not set",
3994 def ExpandNames(self):
3995 """Locking for PowercycleNode.
3997 This is a last-resort option and shouldn't block on other
3998 jobs. Therefore, we grab no locks.
4001 self.needed_locks = {}
4003 def Exec(self, feedback_fn):
4007 result = self.rpc.call_node_powercycle(self.op.node_name,
4008 self.cfg.GetHypervisorType())
4009 result.Raise("Failed to schedule the reboot")
4010 return result.payload
4013 class LUQueryClusterInfo(NoHooksLU):
4014 """Query cluster configuration.
4019 def ExpandNames(self):
4020 self.needed_locks = {}
4022 def Exec(self, feedback_fn):
4023 """Return cluster config.
4026 cluster = self.cfg.GetClusterInfo()
4028 os_hvp = {}
4029 # Filter just for enabled hypervisors
4030 for os_name, hv_dict in cluster.os_hvp.items():
4031 os_hvp[os_name] = {}
4032 for hv_name, hv_params in hv_dict.items():
4033 if hv_name in cluster.enabled_hypervisors:
4034 os_hvp[os_name][hv_name] = hv_params
4037 "software_version": constants.RELEASE_VERSION,
4038 "protocol_version": constants.PROTOCOL_VERSION,
4039 "config_version": constants.CONFIG_VERSION,
4040 "os_api_version": max(constants.OS_API_VERSIONS),
4041 "export_version": constants.EXPORT_VERSION,
4042 "architecture": (platform.architecture()[0], platform.machine()),
4043 "name": cluster.cluster_name,
4044 "master": cluster.master_node,
4045 "default_hypervisor": cluster.enabled_hypervisors[0],
4046 "enabled_hypervisors": cluster.enabled_hypervisors,
4047 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
4048 for hypervisor_name in cluster.enabled_hypervisors]),
4049 "os_hvp": os_hvp,
4050 "beparams": cluster.beparams,
4051 "osparams": cluster.osparams,
4052 "nicparams": cluster.nicparams,
4053 "candidate_pool_size": cluster.candidate_pool_size,
4054 "master_netdev": cluster.master_netdev,
4055 "volume_group_name": cluster.volume_group_name,
4056 "file_storage_dir": cluster.file_storage_dir,
4057 "maintain_node_health": cluster.maintain_node_health,
4058 "ctime": cluster.ctime,
4059 "mtime": cluster.mtime,
4060 "uuid": cluster.uuid,
4061 "tags": list(cluster.GetTags()),
4062 "uid_pool": cluster.uid_pool,
4068 class LUQueryConfigValues(NoHooksLU):
4069 """Return configuration values.
4072 _OP_PARAMS = [_POutputFields]
4074 _FIELDS_DYNAMIC = utils.FieldSet()
4075 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
4076 "watcher_pause")
4078 def CheckArguments(self):
4079 _CheckOutputFields(static=self._FIELDS_STATIC,
4080 dynamic=self._FIELDS_DYNAMIC,
4081 selected=self.op.output_fields)
4083 def ExpandNames(self):
4084 self.needed_locks = {}
4086 def Exec(self, feedback_fn):
4087 """Dump a representation of the cluster config to the standard output.
4090 values = []
4091 for field in self.op.output_fields:
4092 if field == "cluster_name":
4093 entry = self.cfg.GetClusterName()
4094 elif field == "master_node":
4095 entry = self.cfg.GetMasterNode()
4096 elif field == "drain_flag":
4097 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
4098 elif field == "watcher_pause":
4099 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
4100 else:
4101 raise errors.ParameterError(field)
4102 values.append(entry)
4104 return values
4106 class LUActivateInstanceDisks(NoHooksLU):
4107 """Bring up an instance's disks.
4112 ("ignore_size", False, _TBool),
4116 def ExpandNames(self):
4117 self._ExpandAndLockInstance()
4118 self.needed_locks[locking.LEVEL_NODE] = []
4119 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4121 def DeclareLocks(self, level):
4122 if level == locking.LEVEL_NODE:
4123 self._LockInstancesNodes()
4125 def CheckPrereq(self):
4126 """Check prerequisites.
4128 This checks that the instance is in the cluster.
4131 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4132 assert self.instance is not None, \
4133 "Cannot retrieve locked instance %s" % self.op.instance_name
4134 _CheckNodeOnline(self, self.instance.primary_node)
4136 def Exec(self, feedback_fn):
4137 """Activate the disks.
4140 disks_ok, disks_info = \
4141 _AssembleInstanceDisks(self, self.instance,
4142 ignore_size=self.op.ignore_size)
4143 if not disks_ok:
4144 raise errors.OpExecError("Cannot activate block devices")
4146 return disks_info
4149 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
4150 ignore_size=False):
4151 """Prepare the block devices for an instance.
4153 This sets up the block devices on all nodes.
4155 @type lu: L{LogicalUnit}
4156 @param lu: the logical unit on whose behalf we execute
4157 @type instance: L{objects.Instance}
4158 @param instance: the instance for whose disks we assemble
4159 @type disks: list of L{objects.Disk} or None
4160 @param disks: which disks to assemble (or all, if None)
4161 @type ignore_secondaries: boolean
4162 @param ignore_secondaries: if true, errors on secondary nodes
4163 won't result in an error return from the function
4164 @type ignore_size: boolean
4165 @param ignore_size: if true, the current known size of the disk
4166 will not be used during the disk activation, useful for cases
4167 when the size is wrong
4168 @return: False if the operation failed, otherwise a list of
4169 (host, instance_visible_name, node_visible_name)
4170 with the mapping from node devices to instance devices
4173 device_info = []
4174 disks_ok = True
4175 iname = instance.name
4176 disks = _ExpandCheckDisks(instance, disks)
4178 # With the two-pass mechanism we try to reduce the window of
4179 # opportunity for the race condition of switching DRBD to primary
4180 # before handshaking occurred, but we do not eliminate it
4182 # The proper fix would be to wait (with some limits) until the
4183 # connection has been made and drbd transitions from WFConnection
4184 # into any other network-connected state (Connected, SyncTarget,
4185 # SyncSource, etc.)
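# Summarizing the two passes implemented below: pass 1 runs
# call_blockdev_assemble(..., is_primary=False) on every node of each disk,
# pass 2 repeats the call with is_primary=True on the primary node only, so
# both halves of a DRBD pair exist before one side is switched to primary.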
4187 # 1st pass, assemble on all nodes in secondary mode
4188 for inst_disk in disks:
4189 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4190 if ignore_size:
4191 node_disk = node_disk.Copy()
4192 node_disk.UnsetSize()
4193 lu.cfg.SetDiskID(node_disk, node)
4194 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
4195 msg = result.fail_msg
4196 if msg:
4197 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4198 " (is_primary=False, pass=1): %s",
4199 inst_disk.iv_name, node, msg)
4200 if not ignore_secondaries:
4201 disks_ok = False
4203 # FIXME: race condition on drbd migration to primary
4205 # 2nd pass, do only the primary node
4206 for inst_disk in disks:
4207 dev_path = None
4209 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4210 if node != instance.primary_node:
4211 continue
4212 if ignore_size:
4213 node_disk = node_disk.Copy()
4214 node_disk.UnsetSize()
4215 lu.cfg.SetDiskID(node_disk, node)
4216 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
4217 msg = result.fail_msg
4218 if msg:
4219 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4220 " (is_primary=True, pass=2): %s",
4221 inst_disk.iv_name, node, msg)
4222 disks_ok = False
4223 else:
4224 dev_path = result.payload
4226 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
4228 # leave the disks configured for the primary node
4229 # this is a workaround that would be fixed better by
4230 # improving the logical/physical id handling
4231 for disk in disks:
4232 lu.cfg.SetDiskID(disk, instance.primary_node)
4234 return disks_ok, device_info
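# Typical caller pattern (mirrored by _StartInstanceDisks below): check
# disks_ok and tear the partially-assembled disks down again on failure:
#   disks_ok, _ = _AssembleInstanceDisks(lu, instance)
#   if not disks_ok:
#     _ShutdownInstanceDisks(lu, instance)
#     raise errors.OpExecError("Disk consistency error")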
4237 def _StartInstanceDisks(lu, instance, force):
4238 """Start the disks of an instance.
4241 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
4242 ignore_secondaries=force)
4243 if not disks_ok:
4244 _ShutdownInstanceDisks(lu, instance)
4245 if force is not None and not force:
4246 lu.proc.LogWarning("", hint="If the message above refers to a"
4247 " secondary node,"
4248 " you can retry the operation using '--force'.")
4249 raise errors.OpExecError("Disk consistency error")
4252 class LUDeactivateInstanceDisks(NoHooksLU):
4253 """Shutdown an instance's disks.
4261 def ExpandNames(self):
4262 self._ExpandAndLockInstance()
4263 self.needed_locks[locking.LEVEL_NODE] = []
4264 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4266 def DeclareLocks(self, level):
4267 if level == locking.LEVEL_NODE:
4268 self._LockInstancesNodes()
4270 def CheckPrereq(self):
4271 """Check prerequisites.
4273 This checks that the instance is in the cluster.
4276 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4277 assert self.instance is not None, \
4278 "Cannot retrieve locked instance %s" % self.op.instance_name
4280 def Exec(self, feedback_fn):
4281 """Deactivate the disks
4284 instance = self.instance
4285 _SafeShutdownInstanceDisks(self, instance)
4288 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
4289 """Shutdown block devices of an instance.
4291 This function checks if an instance is running, before calling
4292 _ShutdownInstanceDisks.
4295 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
4296 _ShutdownInstanceDisks(lu, instance, disks=disks)
4299 def _ExpandCheckDisks(instance, disks):
4300 """Return the instance disks selected by the disks list
4302 @type disks: list of L{objects.Disk} or None
4303 @param disks: selected disks
4304 @rtype: list of L{objects.Disk}
4305 @return: selected instance disks to act on
4308 if disks is None:
4309 return instance.disks
4311 if not set(disks).issubset(instance.disks):
4312 raise errors.ProgrammerError("Can only act on disks belonging to the"
4313 " target instance")
4314 return disks
4317 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
4318 """Shutdown block devices of an instance.
4320 This does the shutdown on all nodes of the instance.
4322 If ignore_primary is false, errors on the primary node make the
4323 shutdown be reported as failed; otherwise they are ignored.
4326 all_result = True
4327 disks = _ExpandCheckDisks(instance, disks)
4329 for disk in disks:
4330 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
4331 lu.cfg.SetDiskID(top_disk, node)
4332 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
4333 msg = result.fail_msg
4334 if msg:
4335 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
4336 disk.iv_name, node, msg)
4337 if not ignore_primary or node != instance.primary_node:
4338 all_result = False
4340 return all_result
4342 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
4343 """Checks if a node has enough free memory.
4345 This function checks if a given node has the needed amount of free
4346 memory. In case the node has less memory or we cannot get the
4347 information from the node, this function raises an OpPrereqError
4350 @type lu: C{LogicalUnit}
4351 @param lu: a logical unit from which we get configuration data
4353 @param node: the node to check
4354 @type reason: C{str}
4355 @param reason: string to use in the error message
4356 @type requested: C{int}
4357 @param requested: the amount of memory in MiB to check for
4358 @type hypervisor_name: C{str}
4359 @param hypervisor_name: the hypervisor to ask for memory stats
4360 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
4361 we cannot check the node
4364 nodeinfo = lu.rpc.call_node_info([node], lu.cfg.GetVGName(), hypervisor_name)
4365 nodeinfo[node].Raise("Can't get data from node %s" % node,
4366 prereq=True, ecode=errors.ECODE_ENVIRON)
4367 free_mem = nodeinfo[node].payload.get('memory_free', None)
4368 if not isinstance(free_mem, int):
4369 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
4370 " was '%s'" % (node, free_mem),
4371 errors.ECODE_ENVIRON)
4372 if requested > free_mem:
4373 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
4374 " needed %s MiB, available %s MiB" %
4375 (node, reason, requested, free_mem),
4376 errors.ECODE_NORES)
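# A representative call site is LUStartupInstance.CheckPrereq below, which
# verifies the primary node before starting an instance:
#   _CheckNodeFreeMemory(self, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        bep[constants.BE_MEMORY], instance.hypervisor)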
4379 def _CheckNodesFreeDisk(lu, nodenames, requested):
4380 """Checks if nodes have enough free disk space in the default VG.
4382 This function checks if all given nodes have the needed amount of
4383 free disk. In case any node has less disk or we cannot get the
4384 information from the node, this function raises an OpPrereqError
4387 @type lu: C{LogicalUnit}
4388 @param lu: a logical unit from which we get configuration data
4389 @type nodenames: C{list}
4390 @param nodenames: the list of node names to check
4391 @type requested: C{int}
4392 @param requested: the amount of disk in MiB to check for
4393 @raise errors.OpPrereqError: if the node doesn't have enough disk, or
4394 we cannot check the node
4397 nodeinfo = lu.rpc.call_node_info(nodenames, lu.cfg.GetVGName(),
4398 lu.cfg.GetHypervisorType())
4399 for node in nodenames:
4400 info = nodeinfo[node]
4401 info.Raise("Cannot get current information from node %s" % node,
4402 prereq=True, ecode=errors.ECODE_ENVIRON)
4403 vg_free = info.payload.get("vg_free", None)
4404 if not isinstance(vg_free, int):
4405 raise errors.OpPrereqError("Can't compute free disk space on node %s,"
4406 " result was '%s'" % (node, vg_free),
4407 errors.ECODE_ENVIRON)
4408 if requested > vg_free:
4409 raise errors.OpPrereqError("Not enough disk space on target node %s:"
4410 " required %d MiB, available %d MiB" %
4411 (node, requested, vg_free),
4415 class LUStartupInstance(LogicalUnit):
4416 """Starts an instance.
4419 HPATH = "instance-start"
4420 HTYPE = constants.HTYPE_INSTANCE
4424 ("hvparams", _EmptyDict, _TDict),
4425 ("beparams", _EmptyDict, _TDict),
4429 def CheckArguments(self):
4431 if self.op.beparams:
4432 # fill the beparams dict
4433 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
4435 def ExpandNames(self):
4436 self._ExpandAndLockInstance()
4438 def BuildHooksEnv(self):
4441 This runs on master, primary and secondary nodes of the instance.
4445 "FORCE": self.op.force,
4447 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
4448 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4451 def CheckPrereq(self):
4452 """Check prerequisites.
4454 This checks that the instance is in the cluster.
4457 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4458 assert self.instance is not None, \
4459 "Cannot retrieve locked instance %s" % self.op.instance_name
4462 if self.op.hvparams:
4463 # check hypervisor parameter syntax (locally)
4464 cluster = self.cfg.GetClusterInfo()
4465 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
4466 filled_hvp = cluster.FillHV(instance)
4467 filled_hvp.update(self.op.hvparams)
4468 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
4469 hv_type.CheckParameterSyntax(filled_hvp)
4470 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
4472 _CheckNodeOnline(self, instance.primary_node)
4474 bep = self.cfg.GetClusterInfo().FillBE(instance)
4475 # check bridges existence
4476 _CheckInstanceBridgesExist(self, instance)
4478 remote_info = self.rpc.call_instance_info(instance.primary_node,
4480 instance.hypervisor)
4481 remote_info.Raise("Error checking node %s" % instance.primary_node,
4482 prereq=True, ecode=errors.ECODE_ENVIRON)
4483 if not remote_info.payload: # not running already
4484 _CheckNodeFreeMemory(self, instance.primary_node,
4485 "starting instance %s" % instance.name,
4486 bep[constants.BE_MEMORY], instance.hypervisor)
4488 def Exec(self, feedback_fn):
4489 """Start the instance.
4492 instance = self.instance
4493 force = self.op.force
4495 self.cfg.MarkInstanceUp(instance.name)
4497 node_current = instance.primary_node
4499 _StartInstanceDisks(self, instance, force)
4501 result = self.rpc.call_instance_start(node_current, instance,
4502 self.op.hvparams, self.op.beparams)
4503 msg = result.fail_msg
4504 if msg:
4505 _ShutdownInstanceDisks(self, instance)
4506 raise errors.OpExecError("Could not start instance: %s" % msg)
4509 class LURebootInstance(LogicalUnit):
4510 """Reboot an instance.
4513 HPATH = "instance-reboot"
4514 HTYPE = constants.HTYPE_INSTANCE
4517 ("ignore_secondaries", False, _TBool),
4518 ("reboot_type", _NoDefault, _TElemOf(constants.REBOOT_TYPES)),
4523 def ExpandNames(self):
4524 self._ExpandAndLockInstance()
4526 def BuildHooksEnv(self):
4529 This runs on master, primary and secondary nodes of the instance.
4533 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
4534 "REBOOT_TYPE": self.op.reboot_type,
4535 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
4537 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
4538 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4541 def CheckPrereq(self):
4542 """Check prerequisites.
4544 This checks that the instance is in the cluster.
4547 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4548 assert self.instance is not None, \
4549 "Cannot retrieve locked instance %s" % self.op.instance_name
4551 _CheckNodeOnline(self, instance.primary_node)
4553 # check bridges existence
4554 _CheckInstanceBridgesExist(self, instance)
4556 def Exec(self, feedback_fn):
4557 """Reboot the instance.
4560 instance = self.instance
4561 ignore_secondaries = self.op.ignore_secondaries
4562 reboot_type = self.op.reboot_type
4564 node_current = instance.primary_node
4566 if reboot_type in [constants.INSTANCE_REBOOT_SOFT,
4567 constants.INSTANCE_REBOOT_HARD]:
4568 for disk in instance.disks:
4569 self.cfg.SetDiskID(disk, node_current)
4570 result = self.rpc.call_instance_reboot(node_current, instance,
4572 self.op.shutdown_timeout)
4573 result.Raise("Could not reboot instance")
4574 else:
4575 result = self.rpc.call_instance_shutdown(node_current, instance,
4576 self.op.shutdown_timeout)
4577 result.Raise("Could not shutdown instance for full reboot")
4578 _ShutdownInstanceDisks(self, instance)
4579 _StartInstanceDisks(self, instance, ignore_secondaries)
4580 result = self.rpc.call_instance_start(node_current, instance, None, None)
4581 msg = result.fail_msg
4582 if msg:
4583 _ShutdownInstanceDisks(self, instance)
4584 raise errors.OpExecError("Could not start instance for"
4585 " full reboot: %s" % msg)
4587 self.cfg.MarkInstanceUp(instance.name)
4590 class LUShutdownInstance(LogicalUnit):
4591 """Shutdown an instance.
4594 HPATH = "instance-stop"
4595 HTYPE = constants.HTYPE_INSTANCE
4598 ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, _TPositiveInt),
4602 def ExpandNames(self):
4603 self._ExpandAndLockInstance()
4605 def BuildHooksEnv(self):
4608 This runs on master, primary and secondary nodes of the instance.
4611 env = _BuildInstanceHookEnvByObject(self, self.instance)
4612 env["TIMEOUT"] = self.op.timeout
4613 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4616 def CheckPrereq(self):
4617 """Check prerequisites.
4619 This checks that the instance is in the cluster.
4622 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4623 assert self.instance is not None, \
4624 "Cannot retrieve locked instance %s" % self.op.instance_name
4625 _CheckNodeOnline(self, self.instance.primary_node)
4627 def Exec(self, feedback_fn):
4628 """Shutdown the instance.
4631 instance = self.instance
4632 node_current = instance.primary_node
4633 timeout = self.op.timeout
4634 self.cfg.MarkInstanceDown(instance.name)
4635 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
4636 msg = result.fail_msg
4637 if msg:
4638 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
4640 _ShutdownInstanceDisks(self, instance)
4643 class LUReinstallInstance(LogicalUnit):
4644 """Reinstall an instance.
4647 HPATH = "instance-reinstall"
4648 HTYPE = constants.HTYPE_INSTANCE
4651 ("os_type", None, _TMaybeString),
4652 ("force_variant", False, _TBool),
4656 def ExpandNames(self):
4657 self._ExpandAndLockInstance()
4659 def BuildHooksEnv(self):
4662 This runs on master, primary and secondary nodes of the instance.
4665 env = _BuildInstanceHookEnvByObject(self, self.instance)
4666 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4669 def CheckPrereq(self):
4670 """Check prerequisites.
4672 This checks that the instance is in the cluster and is not running.
4675 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4676 assert instance is not None, \
4677 "Cannot retrieve locked instance %s" % self.op.instance_name
4678 _CheckNodeOnline(self, instance.primary_node)
4680 if instance.disk_template == constants.DT_DISKLESS:
4681 raise errors.OpPrereqError("Instance '%s' has no disks" %
4682 self.op.instance_name,
4684 _CheckInstanceDown(self, instance, "cannot reinstall")
4686 if self.op.os_type is not None:
4688 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
4689 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
4691 self.instance = instance
4693 def Exec(self, feedback_fn):
4694 """Reinstall the instance.
4697 inst = self.instance
4699 if self.op.os_type is not None:
4700 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
4701 inst.os = self.op.os_type
4702 self.cfg.Update(inst, feedback_fn)
4704 _StartInstanceDisks(self, inst, None)
4706 feedback_fn("Running the instance OS create scripts...")
4707 # FIXME: pass debug option from opcode to backend
4708 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
4709 self.op.debug_level)
4710 result.Raise("Could not install OS for instance %s on node %s" %
4711 (inst.name, inst.primary_node))
4713 _ShutdownInstanceDisks(self, inst)
4716 class LURecreateInstanceDisks(LogicalUnit):
4717 """Recreate an instance's missing disks.
4720 HPATH = "instance-recreate-disks"
4721 HTYPE = constants.HTYPE_INSTANCE
4724 ("disks", _EmptyList, _TListOf(_TPositiveInt)),
4728 def ExpandNames(self):
4729 self._ExpandAndLockInstance()
4731 def BuildHooksEnv(self):
4734 This runs on master, primary and secondary nodes of the instance.
4737 env = _BuildInstanceHookEnvByObject(self, self.instance)
4738 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4741 def CheckPrereq(self):
4742 """Check prerequisites.
4744 This checks that the instance is in the cluster and is not running.
4747 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4748 assert instance is not None, \
4749 "Cannot retrieve locked instance %s" % self.op.instance_name
4750 _CheckNodeOnline(self, instance.primary_node)
4752 if instance.disk_template == constants.DT_DISKLESS:
4753 raise errors.OpPrereqError("Instance '%s' has no disks" %
4754 self.op.instance_name, errors.ECODE_INVAL)
4755 _CheckInstanceDown(self, instance, "cannot recreate disks")
4757 if not self.op.disks:
4758 self.op.disks = range(len(instance.disks))
4760 for idx in self.op.disks:
4761 if idx >= len(instance.disks):
4762 raise errors.OpPrereqError("Invalid disk index passed '%s'" % idx,
4765 self.instance = instance
4767 def Exec(self, feedback_fn):
4768 """Recreate the disks.
4772 for idx, _ in enumerate(self.instance.disks):
4773 if idx not in self.op.disks: # disk idx has not been passed in
4777 _CreateDisks(self, self.instance, to_skip=to_skip)
4780 class LURenameInstance(LogicalUnit):
4781 """Rename an instance.
4784 HPATH = "instance-rename"
4785 HTYPE = constants.HTYPE_INSTANCE
4788 ("new_name", _NoDefault, _TNonEmptyString),
4789 ("ignore_ip", False, _TBool),
4790 ("check_name", True, _TBool),
4793 def BuildHooksEnv(self):
4796 This runs on master, primary and secondary nodes of the instance.
4799 env = _BuildInstanceHookEnvByObject(self, self.instance)
4800 env["INSTANCE_NEW_NAME"] = self.op.new_name
4801 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
4804 def CheckPrereq(self):
4805 """Check prerequisites.
4807 This checks that the instance is in the cluster and is not running.
4810 self.op.instance_name = _ExpandInstanceName(self.cfg,
4811 self.op.instance_name)
4812 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4813 assert instance is not None
4814 _CheckNodeOnline(self, instance.primary_node)
4815 _CheckInstanceDown(self, instance, "cannot rename")
4816 self.instance = instance
4818 # new name verification
4819 if self.op.check_name:
4820 name_info = utils.GetHostInfo(self.op.new_name)
4821 self.op.new_name = name_info.name
4823 new_name = self.op.new_name
4825 instance_list = self.cfg.GetInstanceList()
4826 if new_name in instance_list:
4827 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
4828 new_name, errors.ECODE_EXISTS)
4830 if not self.op.ignore_ip:
4831 if utils.TcpPing(name_info.ip, constants.DEFAULT_NODED_PORT):
4832 raise errors.OpPrereqError("IP %s of instance %s already in use" %
4833 (name_info.ip, new_name),
4834 errors.ECODE_NOTUNIQUE)
4836 def Exec(self, feedback_fn):
4837 """Reinstall the instance.
4840 inst = self.instance
4841 old_name = inst.name
4843 if inst.disk_template == constants.DT_FILE:
4844 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
4846 self.cfg.RenameInstance(inst.name, self.op.new_name)
4847 # Change the instance lock. This is definitely safe while we hold the BGL
4848 self.context.glm.remove(locking.LEVEL_INSTANCE, old_name)
4849 self.context.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
4851 # re-read the instance from the configuration after rename
4852 inst = self.cfg.GetInstanceInfo(self.op.new_name)
4854 if inst.disk_template == constants.DT_FILE:
4855 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
4856 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
4857 old_file_storage_dir,
4858 new_file_storage_dir)
4859 result.Raise("Could not rename on node %s directory '%s' to '%s'"
4860 " (but the instance has been renamed in Ganeti)" %
4861 (inst.primary_node, old_file_storage_dir,
4862 new_file_storage_dir))
4864 _StartInstanceDisks(self, inst, None)
4865 try:
4866 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
4867 old_name, self.op.debug_level)
4868 msg = result.fail_msg
4869 if msg:
4870 msg = ("Could not run OS rename script for instance %s on node %s"
4871 " (but the instance has been renamed in Ganeti): %s" %
4872 (inst.name, inst.primary_node, msg))
4873 self.proc.LogWarning(msg)
4874 finally:
4875 _ShutdownInstanceDisks(self, inst)
4878 class LURemoveInstance(LogicalUnit):
4879 """Remove an instance.
4882 HPATH = "instance-remove"
4883 HTYPE = constants.HTYPE_INSTANCE
4886 ("ignore_failures", False, _TBool),
4891 def ExpandNames(self):
4892 self._ExpandAndLockInstance()
4893 self.needed_locks[locking.LEVEL_NODE] = []
4894 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4896 def DeclareLocks(self, level):
4897 if level == locking.LEVEL_NODE:
4898 self._LockInstancesNodes()
4900 def BuildHooksEnv(self):
4903 This runs on master, primary and secondary nodes of the instance.
4906 env = _BuildInstanceHookEnvByObject(self, self.instance)
4907 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
4908 nl = [self.cfg.GetMasterNode()]
4909 nl_post = list(self.instance.all_nodes) + nl
4910 return env, nl, nl_post
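# Note that nl_post also lists the instance's nodes: the post hooks must
# still run on the nodes that hosted the instance, even though by then the
# instance has been removed from the configuration.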
4912 def CheckPrereq(self):
4913 """Check prerequisites.
4915 This checks that the instance is in the cluster.
4918 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4919 assert self.instance is not None, \
4920 "Cannot retrieve locked instance %s" % self.op.instance_name
4922 def Exec(self, feedback_fn):
4923 """Remove the instance.
4926 instance = self.instance
4927 logging.info("Shutting down instance %s on node %s",
4928 instance.name, instance.primary_node)
4930 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
4931 self.op.shutdown_timeout)
4932 msg = result.fail_msg
4933 if msg:
4934 if self.op.ignore_failures:
4935 feedback_fn("Warning: can't shutdown instance: %s" % msg)
4936 else:
4937 raise errors.OpExecError("Could not shutdown instance %s on"
4938 " node %s: %s" %
4939 (instance.name, instance.primary_node, msg))
4941 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
4944 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
4945 """Utility function to remove an instance.
4948 logging.info("Removing block devices for instance %s", instance.name)
4950 if not _RemoveDisks(lu, instance):
4951 if not ignore_failures:
4952 raise errors.OpExecError("Can't remove instance's disks")
4953 feedback_fn("Warning: can't remove instance's disks")
4955 logging.info("Removing instance %s out of cluster config", instance.name)
4957 lu.cfg.RemoveInstance(instance.name)
4959 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
4960 "Instance lock removal conflict"
4962 # Remove lock for the instance
4963 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
4966 class LUQueryInstances(NoHooksLU):
4967 """Logical unit for querying instances.
4970 # pylint: disable-msg=W0142
4972 ("output_fields", _NoDefault, _TListOf(_TNonEmptyString)),
4973 ("names", _EmptyList, _TListOf(_TNonEmptyString)),
4974 ("use_locking", False, _TBool),
4977 _SIMPLE_FIELDS = ["name", "os", "network_port", "hypervisor",
4978 "serial_no", "ctime", "mtime", "uuid"]
4979 _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
4981 "disk_template", "ip", "mac", "bridge",
4982 "nic_mode", "nic_link",
4983 "sda_size", "sdb_size", "vcpus", "tags",
4984 "network_port", "beparams",
4985 r"(disk)\.(size)/([0-9]+)",
4986 r"(disk)\.(sizes)", "disk_usage",
4987 r"(nic)\.(mac|ip|mode|link)/([0-9]+)",
4988 r"(nic)\.(bridge)/([0-9]+)",
4989 r"(nic)\.(macs|ips|modes|links|bridges)",
4990 r"(disk|nic)\.(count)",
4992 ] + _SIMPLE_FIELDS +
4993 ["hv/%s" % name
4994 for name in constants.HVS_PARAMETERS
4995 if name not in constants.HVC_GLOBALS] +
4996 ["be/%s" % name
4997 for name in constants.BES_PARAMETERS])
4998 _FIELDS_DYNAMIC = utils.FieldSet("oper_state", "oper_ram", "status")
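# The regexp-style entries above describe indexed fields; for example the
# output field "disk.size/0" matches r"(disk)\.(size)/([0-9]+)" with groups
# ("disk", "size", "0"), which Exec() below dispatches on via st_match.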
5001 def CheckArguments(self):
5002 _CheckOutputFields(static=self._FIELDS_STATIC,
5003 dynamic=self._FIELDS_DYNAMIC,
5004 selected=self.op.output_fields)
5006 def ExpandNames(self):
5007 self.needed_locks = {}
5008 self.share_locks[locking.LEVEL_INSTANCE] = 1
5009 self.share_locks[locking.LEVEL_NODE] = 1
5011 if self.op.names:
5012 self.wanted = _GetWantedInstances(self, self.op.names)
5013 else:
5014 self.wanted = locking.ALL_SET
5016 self.do_node_query = self._FIELDS_STATIC.NonMatching(self.op.output_fields)
5017 self.do_locking = self.do_node_query and self.op.use_locking
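# Locking is only worthwhile when live data will be fetched from the nodes
# (do_node_query) and the caller explicitly asked for it; purely static
# fields are answered from the configuration without taking any locks.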
5018 if self.do_locking:
5019 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
5020 self.needed_locks[locking.LEVEL_NODE] = []
5021 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5023 def DeclareLocks(self, level):
5024 if level == locking.LEVEL_NODE and self.do_locking:
5025 self._LockInstancesNodes()
5027 def Exec(self, feedback_fn):
5028 """Computes the list of nodes and their attributes.
5031 # pylint: disable-msg=R0912
5032 # way too many branches here
5033 all_info = self.cfg.GetAllInstancesInfo()
5034 if self.wanted == locking.ALL_SET:
5035 # caller didn't specify instance names, so ordering is not important
5036 if self.do_locking:
5037 instance_names = self.acquired_locks[locking.LEVEL_INSTANCE]
5038 else:
5039 instance_names = all_info.keys()
5040 instance_names = utils.NiceSort(instance_names)
5041 else:
5042 # caller did specify names, so we must keep the ordering
5043 if self.do_locking:
5044 tgt_set = self.acquired_locks[locking.LEVEL_INSTANCE]
5045 else:
5046 tgt_set = all_info.keys()
5047 missing = set(self.wanted).difference(tgt_set)
5048 if missing:
5049 raise errors.OpExecError("Some instances were removed before"
5050 " retrieving their data: %s" % missing)
5051 instance_names = self.wanted
5053 instance_list = [all_info[iname] for iname in instance_names]
5055 # begin data gathering
5057 nodes = frozenset([inst.primary_node for inst in instance_list])
5058 hv_list = list(set([inst.hypervisor for inst in instance_list]))
5060 bad_nodes = []
5061 off_nodes = []
5062 if self.do_node_query:
5063 live_data = {}
5064 node_data = self.rpc.call_all_instances_info(nodes, hv_list)
5065 for name in nodes:
5066 result = node_data[name]
5067 if result.offline:
5068 # offline nodes will be in both lists
5069 off_nodes.append(name)
5070 if result.fail_msg:
5071 bad_nodes.append(name)
5072 else:
5073 if result.payload:
5074 live_data.update(result.payload)
5075 # else no instance is alive
5076 else:
5077 live_data = dict([(name, {}) for name in instance_names])
5079 # end data gathering
5081 HVPREFIX = "hv/"
5082 BEPREFIX = "be/"
5083 output = []
5084 cluster = self.cfg.GetClusterInfo()
5085 for instance in instance_list:
5086 iout = []
5087 i_hv = cluster.FillHV(instance, skip_globals=True)
5088 i_be = cluster.FillBE(instance)
5089 i_nicp = [cluster.SimpleFillNIC(nic.nicparams) for nic in instance.nics]
5090 for field in self.op.output_fields:
5091 st_match = self._FIELDS_STATIC.Matches(field)
5092 if field in self._SIMPLE_FIELDS:
5093 val = getattr(instance, field)
5094 elif field == "pnode":
5095 val = instance.primary_node
5096 elif field == "snodes":
5097 val = list(instance.secondary_nodes)
5098 elif field == "admin_state":
5099 val = instance.admin_up
5100 elif field == "oper_state":
5101 if instance.primary_node in bad_nodes:
5102 val = None
5103 else:
5104 val = bool(live_data.get(instance.name))
5105 elif field == "status":
5106 if instance.primary_node in off_nodes:
5107 val = "ERROR_nodeoffline"
5108 elif instance.primary_node in bad_nodes:
5109 val = "ERROR_nodedown"
5110 else:
5111 running = bool(live_data.get(instance.name))
5112 if running:
5113 if instance.admin_up:
5114 val = "running"
5115 else:
5116 val = "ERROR_up"
5117 else:
5118 if instance.admin_up:
5119 val = "ERROR_down"
5120 else:
5121 val = "ADMIN_down"
5122 elif field == "oper_ram":
5123 if instance.primary_node in bad_nodes:
5124 val = None
5125 elif instance.name in live_data:
5126 val = live_data[instance.name].get("memory", "?")
5127 else:
5128 val = "-"
5129 elif field == "vcpus":
5130 val = i_be[constants.BE_VCPUS]
5131 elif field == "disk_template":
5132 val = instance.disk_template
5133 elif field == "ip":
5134 if instance.nics:
5135 val = instance.nics[0].ip
5136 else:
5137 val = None
5138 elif field == "nic_mode":
5139 if instance.nics:
5140 val = i_nicp[0][constants.NIC_MODE]
5141 else:
5142 val = None
5143 elif field == "nic_link":
5144 if instance.nics:
5145 val = i_nicp[0][constants.NIC_LINK]
5146 else:
5147 val = None
5148 elif field == "bridge":
5149 if (instance.nics and
5150 i_nicp[0][constants.NIC_MODE] == constants.NIC_MODE_BRIDGED):
5151 val = i_nicp[0][constants.NIC_LINK]
5152 else:
5153 val = None
5154 elif field == "mac":
5155 if instance.nics:
5156 val = instance.nics[0].mac
5157 else:
5158 val = None
5159 elif field == "sda_size" or field == "sdb_size":
5160 idx = ord(field[2]) - ord('a')
5161 try:
5162 val = instance.FindDisk(idx).size
5163 except errors.OpPrereqError:
5164 val = None
5165 elif field == "disk_usage": # total disk usage per node
5166 disk_sizes = [{'size': disk.size} for disk in instance.disks]
5167 val = _ComputeDiskSize(instance.disk_template, disk_sizes)
5168 elif field == "tags":
5169 val = list(instance.GetTags())
5170 elif field == "hvparams":
5171 val = i_hv
5172 elif (field.startswith(HVPREFIX) and
5173 field[len(HVPREFIX):] in constants.HVS_PARAMETERS and
5174 field[len(HVPREFIX):] not in constants.HVC_GLOBALS):
5175 val = i_hv.get(field[len(HVPREFIX):], None)
5176 elif field == "beparams":
5177 val = i_be
5178 elif (field.startswith(BEPREFIX) and
5179 field[len(BEPREFIX):] in constants.BES_PARAMETERS):
5180 val = i_be.get(field[len(BEPREFIX):], None)
5181 elif st_match and st_match.groups():
5182 # matches a variable list
5183 st_groups = st_match.groups()
5184 if st_groups and st_groups[0] == "disk":
5185 if st_groups[1] == "count":
5186 val = len(instance.disks)
5187 elif st_groups[1] == "sizes":
5188 val = [disk.size for disk in instance.disks]
5189 elif st_groups[1] == "size":
5190 try:
5191 val = instance.FindDisk(st_groups[2]).size
5192 except errors.OpPrereqError:
5193 val = None
5194 else:
5195 assert False, "Unhandled disk parameter"
5196 elif st_groups[0] == "nic":
5197 if st_groups[1] == "count":
5198 val = len(instance.nics)
5199 elif st_groups[1] == "macs":
5200 val = [nic.mac for nic in instance.nics]
5201 elif st_groups[1] == "ips":
5202 val = [nic.ip for nic in instance.nics]
5203 elif st_groups[1] == "modes":
5204 val = [nicp[constants.NIC_MODE] for nicp in i_nicp]
5205 elif st_groups[1] == "links":
5206 val = [nicp[constants.NIC_LINK] for nicp in i_nicp]
5207 elif st_groups[1] == "bridges":
5208 val = []
5209 for nicp in i_nicp:
5210 if nicp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
5211 val.append(nicp[constants.NIC_LINK])
5212 else:
5213 val.append(None)
5214 else:
5215 # index-based item
5216 nic_idx = int(st_groups[2])
5217 if nic_idx >= len(instance.nics):
5218 val = None
5219 else:
5220 if st_groups[1] == "mac":
5221 val = instance.nics[nic_idx].mac
5222 elif st_groups[1] == "ip":
5223 val = instance.nics[nic_idx].ip
5224 elif st_groups[1] == "mode":
5225 val = i_nicp[nic_idx][constants.NIC_MODE]
5226 elif st_groups[1] == "link":
5227 val = i_nicp[nic_idx][constants.NIC_LINK]
5228 elif st_groups[1] == "bridge":
5229 nic_mode = i_nicp[nic_idx][constants.NIC_MODE]
5230 if nic_mode == constants.NIC_MODE_BRIDGED:
5231 val = i_nicp[nic_idx][constants.NIC_LINK]
5232 else:
5233 val = None
5234 else:
5235 assert False, "Unhandled NIC parameter"
5236 else:
5237 assert False, ("Declared but unhandled variable parameter '%s'" %
5238 field)
5239 else:
5240 assert False, "Declared but unhandled parameter '%s'" % field
5241 iout.append(val)
5242 output.append(iout)
5244 return output
5247 class LUFailoverInstance(LogicalUnit):
5248 """Failover an instance.
5251 HPATH = "instance-failover"
5252 HTYPE = constants.HTYPE_INSTANCE
5255 ("ignore_consistency", False, _TBool),
5260 def ExpandNames(self):
5261 self._ExpandAndLockInstance()
5262 self.needed_locks[locking.LEVEL_NODE] = []
5263 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5265 def DeclareLocks(self, level):
5266 if level == locking.LEVEL_NODE:
5267 self._LockInstancesNodes()
5269 def BuildHooksEnv(self):
5272 This runs on master, primary and secondary nodes of the instance.
5275 instance = self.instance
5276 source_node = instance.primary_node
5277 target_node = instance.secondary_nodes[0]
5279 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
5280 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5281 "OLD_PRIMARY": source_node,
5282 "OLD_SECONDARY": target_node,
5283 "NEW_PRIMARY": target_node,
5284 "NEW_SECONDARY": source_node,
5286 env.update(_BuildInstanceHookEnvByObject(self, instance))
5287 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5289 nl_post.append(source_node)
5290 return env, nl, nl_post
5292 def CheckPrereq(self):
5293 """Check prerequisites.
5295 This checks that the instance is in the cluster.
5298 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5299 assert self.instance is not None, \
5300 "Cannot retrieve locked instance %s" % self.op.instance_name
5302 bep = self.cfg.GetClusterInfo().FillBE(instance)
5303 if instance.disk_template not in constants.DTS_NET_MIRROR:
5304 raise errors.OpPrereqError("Instance's disk layout is not"
5305 " network mirrored, cannot failover.",
5308 secondary_nodes = instance.secondary_nodes
5309 if not secondary_nodes:
5310 raise errors.ProgrammerError("no secondary node but using "
5311 "a mirrored disk template")
5313 target_node = secondary_nodes[0]
5314 _CheckNodeOnline(self, target_node)
5315 _CheckNodeNotDrained(self, target_node)
5316 if instance.admin_up:
5317 # check memory requirements on the secondary node
5318 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5319 instance.name, bep[constants.BE_MEMORY],
5320 instance.hypervisor)
5321 else:
5322 self.LogInfo("Not checking memory on the secondary node as"
5323 " instance will not be started")
5325 # check bridge existence
5326 _CheckInstanceBridgesExist(self, instance, node=target_node)
5328 def Exec(self, feedback_fn):
5329 """Failover an instance.
5331 The failover is done by shutting it down on its present node and
5332 starting it on the secondary.
5335 instance = self.instance
5337 source_node = instance.primary_node
5338 target_node = instance.secondary_nodes[0]
5340 if instance.admin_up:
5341 feedback_fn("* checking disk consistency between source and target")
5342 for dev in instance.disks:
5343 # for drbd, these are drbd over lvm
5344 if not _CheckDiskConsistency(self, dev, target_node, False):
5345 if not self.op.ignore_consistency:
5346 raise errors.OpExecError("Disk %s is degraded on target node,"
5347 " aborting failover." % dev.iv_name)
5349 feedback_fn("* not checking disk consistency as instance is not running")
5351 feedback_fn("* shutting down instance on source node")
5352 logging.info("Shutting down instance %s on node %s",
5353 instance.name, source_node)
5355 result = self.rpc.call_instance_shutdown(source_node, instance,
5356 self.op.shutdown_timeout)
5357 msg = result.fail_msg
5359 if self.op.ignore_consistency:
5360 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5361 " Proceeding anyway. Please make sure node"
5362 " %s is down. Error details: %s",
5363 instance.name, source_node, source_node, msg)
5365 raise errors.OpExecError("Could not shutdown instance %s on"
5366 " node %s: %s" %
5367 (instance.name, source_node, msg))
5369 feedback_fn("* deactivating the instance's disks on source node")
5370 if not _ShutdownInstanceDisks(self, instance, ignore_primary=True):
5371 raise errors.OpExecError("Can't shut down the instance's disks.")
5373 instance.primary_node = target_node
5374 # distribute new instance config to the other nodes
5375 self.cfg.Update(instance, feedback_fn)
5377 # Only start the instance if it's marked as up
5378 if instance.admin_up:
5379 feedback_fn("* activating the instance's disks on target node")
5380 logging.info("Starting instance %s on node %s",
5381 instance.name, target_node)
5383 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5384 ignore_secondaries=True)
5385 if not disks_ok:
5386 _ShutdownInstanceDisks(self, instance)
5387 raise errors.OpExecError("Can't activate the instance's disks")
5389 feedback_fn("* starting the instance on the target node")
5390 result = self.rpc.call_instance_start(target_node, instance, None, None)
5391 msg = result.fail_msg
5392 if msg:
5393 _ShutdownInstanceDisks(self, instance)
5394 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5395 (instance.name, target_node, msg))
5398 class LUMigrateInstance(LogicalUnit):
5399 """Migrate an instance.
5401 This is migration without shutting down, compared to the failover,
5402 which is done with shutdown.
5405 HPATH = "instance-migrate"
5406 HTYPE = constants.HTYPE_INSTANCE
5409 ("live", True, _TBool),
5410 ("cleanup", False, _TBool),
5415 def ExpandNames(self):
5416 self._ExpandAndLockInstance()
5418 self.needed_locks[locking.LEVEL_NODE] = []
5419 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5421 self._migrater = TLMigrateInstance(self, self.op.instance_name,
5422 self.op.live, self.op.cleanup)
5423 self.tasklets = [self._migrater]
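# The actual migration work is delegated to the TLMigrateInstance tasklet
# defined further below; the LU itself only sets up locking and hooks,
# while the tasklet's CheckPrereq/Exec are driven by the LU framework.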
5425 def DeclareLocks(self, level):
5426 if level == locking.LEVEL_NODE:
5427 self._LockInstancesNodes()
5429 def BuildHooksEnv(self):
5432 This runs on master, primary and secondary nodes of the instance.
5435 instance = self._migrater.instance
5436 source_node = instance.primary_node
5437 target_node = instance.secondary_nodes[0]
5438 env = _BuildInstanceHookEnvByObject(self, instance)
5439 env["MIGRATE_LIVE"] = self.op.live
5440 env["MIGRATE_CLEANUP"] = self.op.cleanup
5442 "OLD_PRIMARY": source_node,
5443 "OLD_SECONDARY": target_node,
5444 "NEW_PRIMARY": target_node,
5445 "NEW_SECONDARY": source_node,
5447 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5449 nl_post.append(source_node)
5450 return env, nl, nl_post
5453 class LUMoveInstance(LogicalUnit):
5454 """Move an instance by data-copying.
5457 HPATH = "instance-move"
5458 HTYPE = constants.HTYPE_INSTANCE
5461 ("target_node", _NoDefault, _TNonEmptyString),
5466 def ExpandNames(self):
5467 self._ExpandAndLockInstance()
5468 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
5469 self.op.target_node = target_node
5470 self.needed_locks[locking.LEVEL_NODE] = [target_node]
5471 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5473 def DeclareLocks(self, level):
5474 if level == locking.LEVEL_NODE:
5475 self._LockInstancesNodes(primary_only=True)
5477 def BuildHooksEnv(self):
5480 This runs on master, primary and secondary nodes of the instance.
5484 "TARGET_NODE": self.op.target_node,
5485 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5487 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5488 nl = [self.cfg.GetMasterNode()] + [self.instance.primary_node,
5489 self.op.target_node]
5492 def CheckPrereq(self):
5493 """Check prerequisites.
5495 This checks that the instance is in the cluster.
5498 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5499 assert self.instance is not None, \
5500 "Cannot retrieve locked instance %s" % self.op.instance_name
5502 node = self.cfg.GetNodeInfo(self.op.target_node)
5503 assert node is not None, \
5504 "Cannot retrieve locked node %s" % self.op.target_node
5506 self.target_node = target_node = node.name
5508 if target_node == instance.primary_node:
5509 raise errors.OpPrereqError("Instance %s is already on the node %s" %
5510 (instance.name, target_node),
5513 bep = self.cfg.GetClusterInfo().FillBE(instance)
5515 for idx, dsk in enumerate(instance.disks):
5516 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
5517 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
5518 " cannot copy" % idx, errors.ECODE_STATE)
5520 _CheckNodeOnline(self, target_node)
5521 _CheckNodeNotDrained(self, target_node)
5523 if instance.admin_up:
5524 # check memory requirements on the target node
5525 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5526 instance.name, bep[constants.BE_MEMORY],
5527 instance.hypervisor)
5528 else:
5529 self.LogInfo("Not checking memory on the secondary node as"
5530 " instance will not be started")
5532 # check bridge existence
5533 _CheckInstanceBridgesExist(self, instance, node=target_node)
5535 def Exec(self, feedback_fn):
5536 """Move an instance.
5538 The move is done by shutting it down on its present node, copying
5539 the data over (slow) and starting it on the new node.
5542 instance = self.instance
5544 source_node = instance.primary_node
5545 target_node = self.target_node
5547 self.LogInfo("Shutting down instance %s on source node %s",
5548 instance.name, source_node)
5550 result = self.rpc.call_instance_shutdown(source_node, instance,
5551 self.op.shutdown_timeout)
5552 msg = result.fail_msg
5553 if msg:
5554 if self.op.ignore_consistency:
5555 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5556 " Proceeding anyway. Please make sure node"
5557 " %s is down. Error details: %s",
5558 instance.name, source_node, source_node, msg)
5560 raise errors.OpExecError("Could not shutdown instance %s on"
5561 " node %s: %s" %
5562 (instance.name, source_node, msg))
5564 # create the target disks
5565 try:
5566 _CreateDisks(self, instance, target_node=target_node)
5567 except errors.OpExecError:
5568 self.LogWarning("Device creation failed, reverting...")
5569 try:
5570 _RemoveDisks(self, instance, target_node=target_node)
5571 finally:
5572 self.cfg.ReleaseDRBDMinors(instance.name)
5573 raise
5575 cluster_name = self.cfg.GetClusterInfo().cluster_name
5577 errs = []
5578 # activate, get path, copy the data over
5579 for idx, disk in enumerate(instance.disks):
5580 self.LogInfo("Copying data for disk %d", idx)
5581 result = self.rpc.call_blockdev_assemble(target_node, disk,
5582 instance.name, True)
5584 self.LogWarning("Can't assemble newly created disk %d: %s",
5585 idx, result.fail_msg)
5586 errs.append(result.fail_msg)
5587 break
5588 dev_path = result.payload
5589 result = self.rpc.call_blockdev_export(source_node, disk,
5590 target_node, dev_path,
5593 self.LogWarning("Can't copy data over for disk %d: %s",
5594 idx, result.fail_msg)
5595 errs.append(result.fail_msg)
5596 break
5598 if errs:
5599 self.LogWarning("Some disks failed to copy, aborting")
5600 try:
5601 _RemoveDisks(self, instance, target_node=target_node)
5602 finally:
5603 self.cfg.ReleaseDRBDMinors(instance.name)
5604 raise errors.OpExecError("Errors during disk copy: %s" %
5605 (",".join(errs),))
5607 instance.primary_node = target_node
5608 self.cfg.Update(instance, feedback_fn)
5610 self.LogInfo("Removing the disks on the original node")
5611 _RemoveDisks(self, instance, target_node=source_node)
5613 # Only start the instance if it's marked as up
5614 if instance.admin_up:
5615 self.LogInfo("Starting instance %s on node %s",
5616 instance.name, target_node)
5618 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5619 ignore_secondaries=True)
5620 if not disks_ok:
5621 _ShutdownInstanceDisks(self, instance)
5622 raise errors.OpExecError("Can't activate the instance's disks")
5624 result = self.rpc.call_instance_start(target_node, instance, None, None)
5625 msg = result.fail_msg
5626 if msg:
5627 _ShutdownInstanceDisks(self, instance)
5628 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5629 (instance.name, target_node, msg))
5632 class LUMigrateNode(LogicalUnit):
5633 """Migrate all instances from a node.
5636 HPATH = "node-migrate"
5637 HTYPE = constants.HTYPE_NODE
5640 ("live", False, _TBool),
5644 def ExpandNames(self):
5645 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
5647 self.needed_locks = {
5648 locking.LEVEL_NODE: [self.op.node_name],
5649 }
5651 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5653 # Create tasklets for migrating instances for all instances on this node
5655 names = []
5656 tasklets = []
5657 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
5658 logging.debug("Migrating instance %s", inst.name)
5659 names.append(inst.name)
5661 tasklets.append(TLMigrateInstance(self, inst.name, self.op.live, False))
5663 self.tasklets = tasklets
5665 # Declare instance locks
5666 self.needed_locks[locking.LEVEL_INSTANCE] = names
5668 def DeclareLocks(self, level):
5669 if level == locking.LEVEL_NODE:
5670 self._LockInstancesNodes()
5672 def BuildHooksEnv(self):
5675 This runs on the master, the primary and all the secondaries.
5679 "NODE_NAME": self.op.node_name,
5682 nl = [self.cfg.GetMasterNode()]
5684 return (env, nl, nl)
5687 class TLMigrateInstance(Tasklet):
5688 def __init__(self, lu, instance_name, live, cleanup):
5689 """Initializes this class.
5692 Tasklet.__init__(self, lu)
5695 self.instance_name = instance_name
5696 self.live = live
5697 self.cleanup = cleanup
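# "live" selects the hypervisor's live-migration path when the instance is
# moved, while "cleanup" only recovers from a previously failed migration
# (see _ExecCleanup below) instead of starting a new one.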
5699 def CheckPrereq(self):
5700 """Check prerequisites.
5702 This checks that the instance is in the cluster.
5705 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
5706 instance = self.cfg.GetInstanceInfo(instance_name)
5707 assert instance is not None
5709 if instance.disk_template != constants.DT_DRBD8:
5710 raise errors.OpPrereqError("Instance's disk layout is not"
5711 " drbd8, cannot migrate.", errors.ECODE_STATE)
5713 secondary_nodes = instance.secondary_nodes
5714 if not secondary_nodes:
5715 raise errors.ConfigurationError("No secondary node but using"
5716 " drbd8 disk template")
5718 i_be = self.cfg.GetClusterInfo().FillBE(instance)
5720 target_node = secondary_nodes[0]
5721 # check memory requirements on the secondary node
5722 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
5723 instance.name, i_be[constants.BE_MEMORY],
5724 instance.hypervisor)
5726 # check bridge existence
5727 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
5729 if not self.cleanup:
5730 _CheckNodeNotDrained(self.lu, target_node)
5731 result = self.rpc.call_instance_migratable(instance.primary_node,
5732 instance)
5733 result.Raise("Can't migrate, please use failover",
5734 prereq=True, ecode=errors.ECODE_STATE)
5736 self.instance = instance
5738 def _WaitUntilSync(self):
5739 """Poll with custom rpc for disk sync.
5741 This uses our own step-based rpc call.
5744 self.feedback_fn("* wait until resync is done")
5745 all_done = False
5746 while not all_done:
5747 all_done = True
5748 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
5749 self.nodes_ip,
5750 self.instance.disks)
5751 min_percent = 100
5752 for node, nres in result.items():
5753 nres.Raise("Cannot resync disks on node %s" % node)
5754 node_done, node_percent = nres.payload
5755 all_done = all_done and node_done
5756 if node_percent is not None:
5757 min_percent = min(min_percent, node_percent)
5758 if not all_done:
5759 if min_percent < 100:
5760 self.feedback_fn(" - progress: %.1f%%" % min_percent)
5761 time.sleep(2)
5763 def _EnsureSecondary(self, node):
5764 """Demote a node to secondary.
5767 self.feedback_fn("* switching node %s to secondary mode" % node)
5769 for dev in self.instance.disks:
5770 self.cfg.SetDiskID(dev, node)
5772 result = self.rpc.call_blockdev_close(node, self.instance.name,
5773 self.instance.disks)
5774 result.Raise("Cannot change disk to secondary on node %s" % node)
5776 def _GoStandalone(self):
5777 """Disconnect from the network.
5780 self.feedback_fn("* changing into standalone mode")
5781 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
5782 self.instance.disks)
5783 for node, nres in result.items():
5784 nres.Raise("Cannot disconnect disks node %s" % node)
5786 def _GoReconnect(self, multimaster):
5787 """Reconnect to the network.
5790 if multimaster:
5791 msg = "dual-master"
5792 else:
5793 msg = "single-master"
5794 self.feedback_fn("* changing disks into %s mode" % msg)
5795 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
5796 self.instance.disks,
5797 self.instance.name, multimaster)
5798 for node, nres in result.items():
5799 nres.Raise("Cannot change disks config on node %s" % node)
5801 def _ExecCleanup(self):
5802 """Try to cleanup after a failed migration.
5804 The cleanup is done by:
5805 - check that the instance is running only on one node
5806 (and update the config if needed)
5807 - change disks on its secondary node to secondary
5808 - wait until disks are fully synchronized
5809 - disconnect from the network
5810 - change disks into single-master mode
5811 - wait again until disks are fully synchronized
5814 instance = self.instance
5815 target_node = self.target_node
5816 source_node = self.source_node
5818 # check running on only one node
5819 self.feedback_fn("* checking where the instance actually runs"
5820 " (if this hangs, the hypervisor might be in"
5822 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
5823 for node, result in ins_l.items():
5824 result.Raise("Can't contact node %s" % node)
5826 runningon_source = instance.name in ins_l[source_node].payload
5827 runningon_target = instance.name in ins_l[target_node].payload
5829 if runningon_source and runningon_target:
5830 raise errors.OpExecError("Instance seems to be running on two nodes,"
5831 " or the hypervisor is confused. You will have"
5832 " to ensure manually that it runs only on one"
5833 " and restart this operation.")
5835 if not (runningon_source or runningon_target):
5836 raise errors.OpExecError("Instance does not seem to be running at all."
5837 " In this case, it's safer to repair by"
5838 " running 'gnt-instance stop' to ensure disk"
5839 " shutdown, and then restarting it.")
5841 if runningon_target:
5842 # the migration has actually succeeded, we need to update the config
5843 self.feedback_fn("* instance running on secondary node (%s),"
5844 " updating config" % target_node)
5845 instance.primary_node = target_node
5846 self.cfg.Update(instance, self.feedback_fn)
5847 demoted_node = source_node
5849 self.feedback_fn("* instance confirmed to be running on its"
5850 " primary node (%s)" % source_node)
5851 demoted_node = target_node
5853 self._EnsureSecondary(demoted_node)
5854 try:
5855 self._WaitUntilSync()
5856 except errors.OpExecError:
5857 # we ignore here errors, since if the device is standalone, it
5858 # won't be able to sync
5859 pass
5860 self._GoStandalone()
5861 self._GoReconnect(False)
5862 self._WaitUntilSync()
5864 self.feedback_fn("* done")
5866 def _RevertDiskStatus(self):
5867 """Try to revert the disk status after a failed migration.
5870 target_node = self.target_node
5871 try:
5872 self._EnsureSecondary(target_node)
5873 self._GoStandalone()
5874 self._GoReconnect(False)
5875 self._WaitUntilSync()
5876 except errors.OpExecError, err:
5877 self.lu.LogWarning("Migration failed and I can't reconnect the"
5878 " drives: error '%s'\n"
5879 "Please look and recover the instance status" %
5882 def _AbortMigration(self):
5883 """Call the hypervisor code to abort a started migration.
5886 instance = self.instance
5887 target_node = self.target_node
5888 migration_info = self.migration_info
5890 abort_result = self.rpc.call_finalize_migration(target_node,
5891 migration_info,
5892 False)
5894 abort_msg = abort_result.fail_msg
5895 if abort_msg:
5896 logging.error("Aborting migration failed on target node %s: %s",
5897 target_node, abort_msg)
5898 # Don't raise an exception here, as we still have to try to revert the
5899 # disk status, even if this step failed.
5901 def _ExecMigration(self):
5902 """Migrate an instance.
5904 The migration is done by:
5905 - change the disks into dual-master mode
5906 - wait until disks are fully synchronized again
5907 - migrate the instance
5908 - change disks on the new secondary node (the old primary) to secondary
5909 - wait until disks are fully synchronized
5910 - change disks into single-master mode
5912 """
5913 instance = self.instance
5914 target_node = self.target_node
5915 source_node = self.source_node
5917 self.feedback_fn("* checking disk consistency between source and target")
5918 for dev in instance.disks:
5919 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
5920 raise errors.OpExecError("Disk %s is degraded or not fully"
5921 " synchronized on target node,"
5922 " aborting migrate." % dev.iv_name)
5924 # First get the migration information from the remote node
5925 result = self.rpc.call_migration_info(source_node, instance)
5926 msg = result.fail_msg
5927 if msg:
5928 log_err = ("Failed fetching source migration information from %s: %s" %
5929 (source_node, msg))
5930 logging.error(log_err)
5931 raise errors.OpExecError(log_err)
5933 self.migration_info = migration_info = result.payload
5935 # Then switch the disks to master/master mode
5936 self._EnsureSecondary(target_node)
5937 self._GoStandalone()
5938 self._GoReconnect(True)
5939 self._WaitUntilSync()
5941 self.feedback_fn("* preparing %s to accept the instance" % target_node)
5942 result = self.rpc.call_accept_instance(target_node,
5943 instance,
5944 migration_info,
5945 self.nodes_ip[target_node])
5947 msg = result.fail_msg
5948 if msg:
5949 logging.error("Instance pre-migration failed, trying to revert"
5950 " disk status: %s", msg)
5951 self.feedback_fn("Pre-migration failed, aborting")
5952 self._AbortMigration()
5953 self._RevertDiskStatus()
5954 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
5955 (instance.name, msg))
5957 self.feedback_fn("* migrating instance to %s" % target_node)
5959 result = self.rpc.call_instance_migrate(source_node, instance,
5960 self.nodes_ip[target_node],
5961 self.live)
5962 msg = result.fail_msg
5963 if msg:
5964 logging.error("Instance migration failed, trying to revert"
5965 " disk status: %s", msg)
5966 self.feedback_fn("Migration failed, aborting")
5967 self._AbortMigration()
5968 self._RevertDiskStatus()
5969 raise errors.OpExecError("Could not migrate instance %s: %s" %
5970 (instance.name, msg))
5973 instance.primary_node = target_node
5974 # distribute new instance config to the other nodes
5975 self.cfg.Update(instance, self.feedback_fn)
5977 result = self.rpc.call_finalize_migration(target_node,
5978 migration_info,
5979 True)
5981 msg = result.fail_msg
5982 if msg:
5983 logging.error("Instance migration succeeded, but finalization failed:"
5984 " %s", msg)
5985 raise errors.OpExecError("Could not finalize instance migration: %s" %
5986 msg)
5988 self._EnsureSecondary(source_node)
5989 self._WaitUntilSync()
5990 self._GoStandalone()
5991 self._GoReconnect(False)
5992 self._WaitUntilSync()
5994 self.feedback_fn("* done")
5996 def Exec(self, feedback_fn):
5997 """Perform the migration.
6000 feedback_fn("Migrating instance %s" % self.instance.name)
6002 self.feedback_fn = feedback_fn
6004 self.source_node = self.instance.primary_node
6005 self.target_node = self.instance.secondary_nodes[0]
6006 self.all_nodes = [self.source_node, self.target_node]
6007 self.nodes_ip = {
6008 self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
6009 self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
6010 }
6012 if self.cleanup:
6013 return self._ExecCleanup()
6014 else:
6015 return self._ExecMigration()
6018 def _CreateBlockDev(lu, node, instance, device, force_create,
6019 info, force_open):
6020 """Create a tree of block devices on a given node.
6022 If this device type has to be created on secondaries, create it and
6023 all its children.
6025 If not, just recurse to children keeping the same 'force' value.
6027 @param lu: the lu on whose behalf we execute
6028 @param node: the node on which to create the device
6029 @type instance: L{objects.Instance}
6030 @param instance: the instance which owns the device
6031 @type device: L{objects.Disk}
6032 @param device: the device to create
6033 @type force_create: boolean
6034 @param force_create: whether to force creation of this device; this
6035 will be changed to True whenever we find a device which has
6036 CreateOnSecondary() attribute
6037 @param info: the extra 'metadata' we should attach to the device
6038 (this will be represented as a LVM tag)
6039 @type force_open: boolean
6040 @param force_open: this parameter will be passed to the
6041 L{backend.BlockdevCreate} function where it specifies
6042 whether we run on primary or not, and it affects both
6043 the child assembly and the device's own Open() execution
6045 """
6046 if device.CreateOnSecondary():
6047 force_create = True
6049 if device.children:
6050 for child in device.children:
6051 _CreateBlockDev(lu, node, instance, child, force_create,
6052 info, force_open)
6054 if not force_create:
6055 return
6057 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
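# Illustrative trace (a sketch, not part of the module; the names are
# invented): for a DRBD8 disk with two LV children, such as one built by
# _GenerateDRBD8Branch below, the recursion above behaves roughly like:
#   _CreateBlockDev(lu, node, inst, drbd_disk, force_create=False, ...)
#     -> drbd_disk.CreateOnSecondary() is True, so force_create becomes True
#     -> both LV children are created via the recursive calls
#     -> finally the DRBD device itself is created on top of its children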
6060 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
6061 """Create a single block device on a given node.
6063 This will not recurse over children of the device, so they must be
6066 @param lu: the lu on whose behalf we execute
6067 @param node: the node on which to create the device
6068 @type instance: L{objects.Instance}
6069 @param instance: the instance which owns the device
6070 @type device: L{objects.Disk}
6071 @param device: the device to create
6072 @param info: the extra 'metadata' we should attach to the device
6073 (this will be represented as a LVM tag)
6074 @type force_open: boolean
6075 @param force_open: this parameter will be passed to the
6076 L{backend.BlockdevCreate} function where it specifies
6077 whether we run on primary or not, and it affects both
6078 the child assembly and the device's own Open() execution
6080 """
6081 lu.cfg.SetDiskID(device, node)
6082 result = lu.rpc.call_blockdev_create(node, device, device.size,
6083 instance.name, force_open, info)
6084 result.Raise("Can't create block device %s on"
6085 " node %s for instance %s" % (device, node, instance.name))
6086 if device.physical_id is None:
6087 device.physical_id = result.payload
6090 def _GenerateUniqueNames(lu, exts):
6091 """Generate a suitable LV name.
6093 This will generate a logical volume name for the given instance.
6095 """
6096 results = []
6097 for val in exts:
6098 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
6099 results.append("%s%s" % (new_id, val))
6101 return results
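# Illustrative example (the UUID is invented): a call such as
#   _GenerateUniqueNames(lu, [".disk0_data", ".disk0_meta"])
# returns cluster-unique LV names of the form
#   ["4337ec79-4d4b-4f34-a9fe-f7e93507a263.disk0_data",
#    "9a3c61f1-0b2d-4f77-8e3a-2f1d5c4b6a90.disk0_meta"]
# i.e. a generated unique ID with the requested suffix appended.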
6103 def _GenerateDRBD8Branch(lu, primary, secondary, size, names, iv_name,
6104 p_minor, s_minor):
6105 """Generate a drbd8 device complete with its children.
6107 """
6108 port = lu.cfg.AllocatePort()
6109 vgname = lu.cfg.GetVGName()
6110 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
6111 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
6112 logical_id=(vgname, names[0]))
6113 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
6114 logical_id=(vgname, names[1]))
6115 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
6116 logical_id=(primary, secondary, port,
6117 p_minor, s_minor,
6118 shared_secret),
6119 children=[dev_data, dev_meta],
6120 iv_name=iv_name)
6122 return drbd_dev
6124 def _GenerateDiskTemplate(lu, template_name,
6125 instance_name, primary_node,
6126 secondary_nodes, disk_info,
6127 file_storage_dir, file_driver,
6128 base_index):
6129 """Generate the entire disk layout for a given template type.
6131 """
6132 #TODO: compute space requirements
6134 vgname = lu.cfg.GetVGName()
6135 disk_count = len(disk_info)
6136 disks = []
6137 if template_name == constants.DT_DISKLESS:
6138 pass
6139 elif template_name == constants.DT_PLAIN:
6140 if len(secondary_nodes) != 0:
6141 raise errors.ProgrammerError("Wrong template configuration")
6143 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6144 for i in range(disk_count)])
6145 for idx, disk in enumerate(disk_info):
6146 disk_index = idx + base_index
6147 disk_dev = objects.Disk(dev_type=constants.LD_LV, size=disk["size"],
6148 logical_id=(vgname, names[idx]),
6149 iv_name="disk/%d" % disk_index,
6150 mode=disk["mode"])
6151 disks.append(disk_dev)
6152 elif template_name == constants.DT_DRBD8:
6153 if len(secondary_nodes) != 1:
6154 raise errors.ProgrammerError("Wrong template configuration")
6155 remote_node = secondary_nodes[0]
6156 minors = lu.cfg.AllocateDRBDMinor(
6157 [primary_node, remote_node] * len(disk_info), instance_name)
6159 names = []
6160 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6161 for i in range(disk_count)]):
6162 names.append(lv_prefix + "_data")
6163 names.append(lv_prefix + "_meta")
6164 for idx, disk in enumerate(disk_info):
6165 disk_index = idx + base_index
6166 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
6167 disk["size"], names[idx*2:idx*2+2],
6168 "disk/%d" % disk_index,
6169 minors[idx*2], minors[idx*2+1])
6170 disk_dev.mode = disk["mode"]
6171 disks.append(disk_dev)
6172 elif template_name == constants.DT_FILE:
6173 if len(secondary_nodes) != 0:
6174 raise errors.ProgrammerError("Wrong template configuration")
6176 _RequireFileStorage()
6178 for idx, disk in enumerate(disk_info):
6179 disk_index = idx + base_index
6180 disk_dev = objects.Disk(dev_type=constants.LD_FILE, size=disk["size"],
6181 iv_name="disk/%d" % disk_index,
6182 logical_id=(file_driver,
6183 "%s/disk%d" % (file_storage_dir,
6186 disks.append(disk_dev)
6188 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
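# Illustrative sketch (invented values, not part of the module): for
# base_index=0 and
#   disk_info = [{"size": 1024, "mode": "rw"}, {"size": 512, "mode": "rw"}]
# with template_name=constants.DT_DRBD8, the result is two LD_DRBD8 disks
# with iv_names "disk/0" and "disk/1", each backed by a freshly named
# <uuid>.diskN_data / <uuid>.diskN_meta LV pair and carrying a logical_id
# of (primary, secondary, port, p_minor, s_minor, shared_secret).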
6192 def _GetInstanceInfoText(instance):
6193 """Compute that text that should be added to the disk's metadata.
6196 return "originstname+%s" % instance.name
6199 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
6200 """Create all disks for an instance.
6202 This abstracts away some work from AddInstance.
6204 @type lu: L{LogicalUnit}
6205 @param lu: the logical unit on whose behalf we execute
6206 @type instance: L{objects.Instance}
6207 @param instance: the instance whose disks we should create
6209 @param to_skip: list of indices to skip
6210 @type target_node: string
6211 @param target_node: if passed, overrides the target node for creation
6213 @return: the success of the creation
6215 """
6216 info = _GetInstanceInfoText(instance)
6217 if target_node is None:
6218 pnode = instance.primary_node
6219 all_nodes = instance.all_nodes
6220 else:
6221 pnode = target_node
6222 all_nodes = [pnode]
6224 if instance.disk_template == constants.DT_FILE:
6225 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6226 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
6228 result.Raise("Failed to create directory '%s' on"
6229 " node %s" % (file_storage_dir, pnode))
6231 # Note: this needs to be kept in sync with adding of disks in
6232 # LUSetInstanceParams
6233 for idx, device in enumerate(instance.disks):
6234 if to_skip and idx in to_skip:
6235 continue
6236 logging.info("Creating volume %s for instance %s",
6237 device.iv_name, instance.name)
6239 for node in all_nodes:
6240 f_create = node == pnode
6241 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
6244 def _RemoveDisks(lu, instance, target_node=None):
6245 """Remove all disks for an instance.
6247 This abstracts away some work from `AddInstance()` and
6248 `RemoveInstance()`. Note that in case some of the devices couldn't
6249 be removed, the removal will continue with the other ones (compare
6250 with `_CreateDisks()`).
6252 @type lu: L{LogicalUnit}
6253 @param lu: the logical unit on whose behalf we execute
6254 @type instance: L{objects.Instance}
6255 @param instance: the instance whose disks we should remove
6256 @type target_node: string
6257 @param target_node: used to override the node on which to remove the disks
6259 @return: the success of the removal
6261 """
6262 logging.info("Removing block devices for instance %s", instance.name)
6264 all_result = True
6265 for device in instance.disks:
6266 if target_node:
6267 edata = [(target_node, device)]
6268 else:
6269 edata = device.ComputeNodeTree(instance.primary_node)
6270 for node, disk in edata:
6271 lu.cfg.SetDiskID(disk, node)
6272 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
6273 if msg:
6274 lu.LogWarning("Could not remove block device %s on node %s,"
6275 " continuing anyway: %s", device.iv_name, node, msg)
6276 all_result = False
6278 if instance.disk_template == constants.DT_FILE:
6279 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6280 if target_node:
6281 tgt = target_node
6282 else:
6283 tgt = instance.primary_node
6284 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
6285 if result.fail_msg:
6286 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
6287 file_storage_dir, tgt, result.fail_msg)
6288 all_result = False
6290 return all_result
6293 def _ComputeDiskSize(disk_template, disks):
6294 """Compute disk size requirements in the volume group
6297 # Required free disk space as a function of disk and swap space
6298 req_size_dict = {
6299 constants.DT_DISKLESS: None,
6300 constants.DT_PLAIN: sum(d["size"] for d in disks),
6301 # 128 MB are added for drbd metadata for each disk
6302 constants.DT_DRBD8: sum(d["size"] + 128 for d in disks),
6303 constants.DT_FILE: None,
6304 }
6306 if disk_template not in req_size_dict:
6307 raise errors.ProgrammerError("Disk template '%s' size requirement"
6308 " is unknown" % disk_template)
6310 return req_size_dict[disk_template]
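# Worked example (illustrative): for disks = [{"size": 1024}, {"size": 512}]
# the requirements computed above are:
#   DT_PLAIN: 1024 + 512 = 1536 MB in the volume group
#   DT_DRBD8: (1024 + 128) + (512 + 128) = 1792 MB (DRBD metadata per disk)
#   DT_DISKLESS / DT_FILE: None (no volume group space is consumed)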
6313 def _CheckHVParams(lu, nodenames, hvname, hvparams):
6314 """Hypervisor parameter validation.
6316 This function abstracts the hypervisor parameter validation to be
6317 used in both instance create and instance modify.
6319 @type lu: L{LogicalUnit}
6320 @param lu: the logical unit for which we check
6321 @type nodenames: list
6322 @param nodenames: the list of nodes on which we should check
6323 @type hvname: string
6324 @param hvname: the name of the hypervisor we should use
6325 @type hvparams: dict
6326 @param hvparams: the parameters which we need to check
6327 @raise errors.OpPrereqError: if the parameters are not valid
6329 """
6330 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
6331 hvname,
6332 hvparams)
6333 for node in nodenames:
6334 info = hvinfo[node]
6335 if info.offline:
6336 continue
6337 info.Raise("Hypervisor parameter validation failed on node %s" % node)
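# Minimal usage sketch (illustrative; the node names and hypervisor choice
# are assumptions, not taken from this module): from within a LU's
# CheckPrereq one would call something like
#   _CheckHVParams(self, ["node1", "node2"], constants.HT_XEN_PVM,
#                  filled_hvp)
# where filled_hvp is a fully filled parameter dict (compare SimpleFillHV
# usage elsewhere in this module); any online node that rejects the
# parameters aborts the operation via info.Raise().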
6340 def _CheckOSParams(lu, required, nodenames, osname, osparams):
6341 """OS parameters validation.
6343 @type lu: L{LogicalUnit}
6344 @param lu: the logical unit for which we check
6345 @type required: boolean
6346 @param required: whether the validation should fail if the OS is not
6347 found
6348 @type nodenames: list
6349 @param nodenames: the list of nodes on which we should check
6350 @type osname: string
6351 @param osname: the name of the OS we should check
6352 @type osparams: dict
6353 @param osparams: the parameters which we need to check
6354 @raise errors.OpPrereqError: if the parameters are not valid
6356 """
6357 result = lu.rpc.call_os_validate(required, nodenames, osname,
6358 [constants.OS_VALIDATE_PARAMETERS],
6359 osparams)
6360 for node, nres in result.items():
6361 # we don't check for offline cases since this should be run only
6362 # against the master node and/or an instance's nodes
6363 nres.Raise("OS Parameters validation failed on node %s" % node)
6364 if not nres.payload:
6365 lu.LogInfo("OS %s not found on node %s, validation skipped",
6366 osname, node)
6369 class LUCreateInstance(LogicalUnit):
6370 """Create an instance.
6373 HPATH = "instance-add"
6374 HTYPE = constants.HTYPE_INSTANCE
6375 _OP_PARAMS = [
6376 _PInstanceName,
6377 ("mode", _NoDefault, _TElemOf(constants.INSTANCE_CREATE_MODES)),
6378 ("start", True, _TBool),
6379 ("wait_for_sync", True, _TBool),
6380 ("ip_check", True, _TBool),
6381 ("name_check", True, _TBool),
6382 ("disks", _NoDefault, _TListOf(_TDict)),
6383 ("nics", _NoDefault, _TListOf(_TDict)),
6384 ("hvparams", _NoDefault, _TDict),
6385 ("beparams", _NoDefault, _TDict),
6386 ("osparams", _NoDefault, _TDict),
6387 ("no_install", None, _TMaybeBool),
6388 ("os_type", None, _TMaybeString),
6389 ("force_variant", False, _TBool),
6390 ("source_handshake", None, _TOr(_TList, _TNone)),
6391 ("source_x509_ca", None, _TOr(_TList, _TNone)),
6392 ("source_instance_name", None, _TMaybeString),
6393 ("src_node", None, _TMaybeString),
6394 ("src_path", None, _TMaybeString),
6395 ("pnode", None, _TMaybeString),
6396 ("snode", None, _TMaybeString),
6397 ("iallocator", None, _TMaybeString),
6398 ("hypervisor", None, _TMaybeString),
6399 ("disk_template", _NoDefault, _CheckDiskTemplate),
6400 ("identify_defaults", False, _TBool),
6401 ("file_driver", None, _TOr(_TNone, _TElemOf(constants.FILE_DRIVER))),
6402 ("file_storage_dir", None, _TMaybeString),
6403 ("dry_run", False, _TBool),
6407 def CheckArguments(self):
6411 # do not require name_check to ease forward/backward compatibility
6413 if self.op.no_install and self.op.start:
6414 self.LogInfo("No-installation mode selected, disabling startup")
6415 self.op.start = False
6416 # validate/normalize the instance name
6417 self.op.instance_name = utils.HostInfo.NormalizeName(self.op.instance_name)
6418 if self.op.ip_check and not self.op.name_check:
6419 # TODO: make the ip check more flexible and not depend on the name check
6420 raise errors.OpPrereqError("Cannot do ip checks without a name check",
6423 # check nics' parameter names
6424 for nic in self.op.nics:
6425 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
6427 # check disks. parameter names and consistent adopt/no-adopt strategy
6428 has_adopt = has_no_adopt = False
6429 for disk in self.op.disks:
6430 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
6431 if "adopt" in disk:
6432 has_adopt = True
6433 else:
6434 has_no_adopt = True
6435 if has_adopt and has_no_adopt:
6436 raise errors.OpPrereqError("Either all disks are adopted or none is",
6437 errors.ECODE_INVAL)
6438 if has_adopt:
6439 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
6440 raise errors.OpPrereqError("Disk adoption is not supported for the"
6441 " '%s' disk template" %
6442 self.op.disk_template,
6443 errors.ECODE_INVAL)
6444 if self.op.iallocator is not None:
6445 raise errors.OpPrereqError("Disk adoption not allowed with an"
6446 " iallocator script", errors.ECODE_INVAL)
6447 if self.op.mode == constants.INSTANCE_IMPORT:
6448 raise errors.OpPrereqError("Disk adoption not allowed for"
6449 " instance import", errors.ECODE_INVAL)
6451 self.adopt_disks = has_adopt
6453 # instance name verification
6454 if self.op.name_check:
6455 self.hostname1 = utils.GetHostInfo(self.op.instance_name)
6456 self.op.instance_name = self.hostname1.name
6457 # used in CheckPrereq for ip ping check
6458 self.check_ip = self.hostname1.ip
6459 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
6460 raise errors.OpPrereqError("Remote imports require names to be checked",
6461 errors.ECODE_INVAL)
6462 else:
6463 self.check_ip = None
6465 # file storage checks
6466 if (self.op.file_driver and
6467 not self.op.file_driver in constants.FILE_DRIVER):
6468 raise errors.OpPrereqError("Invalid file driver name '%s'" %
6469 self.op.file_driver, errors.ECODE_INVAL)
6471 if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
6472 raise errors.OpPrereqError("File storage directory path must not be"
6473 " absolute", errors.ECODE_INVAL)
6475 ### Node/iallocator related checks
6476 if [self.op.iallocator, self.op.pnode].count(None) != 1:
6477 raise errors.OpPrereqError("One and only one of iallocator and primary"
6478 " node must be given",
6481 self._cds = _GetClusterDomainSecret()
6483 if self.op.mode == constants.INSTANCE_IMPORT:
6484 # On import force_variant must be True, because if we forced it at
6485 # initial install, our only chance when importing it back is that it
6486 # works again!
6487 self.op.force_variant = True
6489 if self.op.no_install:
6490 self.LogInfo("No-installation mode has no effect during import")
6492 elif self.op.mode == constants.INSTANCE_CREATE:
6493 if self.op.os_type is None:
6494 raise errors.OpPrereqError("No guest OS specified",
6496 if self.op.disk_template is None:
6497 raise errors.OpPrereqError("No disk template specified",
6500 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
6501 # Check handshake to ensure both clusters have the same domain secret
6502 src_handshake = self.op.source_handshake
6503 if not src_handshake:
6504 raise errors.OpPrereqError("Missing source handshake",
6507 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
6510 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
6513 # Load and check source CA
6514 self.source_x509_ca_pem = self.op.source_x509_ca
6515 if not self.source_x509_ca_pem:
6516 raise errors.OpPrereqError("Missing source X509 CA",
6520 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
6522 except OpenSSL.crypto.Error, err:
6523 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
6524 (err, ), errors.ECODE_INVAL)
6526 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
6527 if errcode is not None:
6528 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
6531 self.source_x509_ca = cert
6533 src_instance_name = self.op.source_instance_name
6534 if not src_instance_name:
6535 raise errors.OpPrereqError("Missing source instance name",
6538 self.source_instance_name = \
6539 utils.GetHostInfo(utils.HostInfo.NormalizeName(src_instance_name)).name
6541 else:
6542 raise errors.OpPrereqError("Invalid instance creation mode %r" %
6543 self.op.mode, errors.ECODE_INVAL)
6545 def ExpandNames(self):
6546 """ExpandNames for CreateInstance.
6548 Figure out the right locks for instance creation.
6550 """
6551 self.needed_locks = {}
6553 instance_name = self.op.instance_name
6554 # this is just a preventive check, but someone might still add this
6555 # instance in the meantime, and creation will fail at lock-add time
6556 if instance_name in self.cfg.GetInstanceList():
6557 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
6558 instance_name, errors.ECODE_EXISTS)
6560 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
6562 if self.op.iallocator:
6563 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6564 else:
6565 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
6566 nodelist = [self.op.pnode]
6567 if self.op.snode is not None:
6568 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
6569 nodelist.append(self.op.snode)
6570 self.needed_locks[locking.LEVEL_NODE] = nodelist
6572 # in case of import lock the source node too
6573 if self.op.mode == constants.INSTANCE_IMPORT:
6574 src_node = self.op.src_node
6575 src_path = self.op.src_path
6577 if src_path is None:
6578 self.op.src_path = src_path = self.op.instance_name
6580 if src_node is None:
6581 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6582 self.op.src_node = None
6583 if os.path.isabs(src_path):
6584 raise errors.OpPrereqError("Importing an instance from an absolute"
6585 " path requires a source node option.",
6588 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
6589 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
6590 self.needed_locks[locking.LEVEL_NODE].append(src_node)
6591 if not os.path.isabs(src_path):
6592 self.op.src_path = src_path = \
6593 utils.PathJoin(constants.EXPORT_DIR, src_path)
6595 def _RunAllocator(self):
6596 """Run the allocator based on input opcode.
6599 nics = [n.ToDict() for n in self.nics]
6600 ial = IAllocator(self.cfg, self.rpc,
6601 mode=constants.IALLOCATOR_MODE_ALLOC,
6602 name=self.op.instance_name,
6603 disk_template=self.op.disk_template,
6604 tags=[],
6605 os=self.op.os_type,
6606 vcpus=self.be_full[constants.BE_VCPUS],
6607 mem_size=self.be_full[constants.BE_MEMORY],
6608 disks=self.disks,
6609 nics=nics,
6610 hypervisor=self.op.hypervisor,
6611 )
6613 ial.Run(self.op.iallocator)
6615 if not ial.success:
6616 raise errors.OpPrereqError("Can't compute nodes using"
6617 " iallocator '%s': %s" %
6618 (self.op.iallocator, ial.info),
6619 errors.ECODE_NORES)
6620 if len(ial.result) != ial.required_nodes:
6621 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
6622 " of nodes (%s), required %s" %
6623 (self.op.iallocator, len(ial.result),
6624 ial.required_nodes), errors.ECODE_FAULT)
6625 self.op.pnode = ial.result[0]
6626 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
6627 self.op.instance_name, self.op.iallocator,
6628 utils.CommaJoin(ial.result))
6629 if ial.required_nodes == 2:
6630 self.op.snode = ial.result[1]
6632 def BuildHooksEnv(self):
6633 """Build hooks env.
6635 This runs on master, primary and secondary nodes of the instance.
6637 """
6638 env = {
6639 "ADD_MODE": self.op.mode,
6640 }
6641 if self.op.mode == constants.INSTANCE_IMPORT:
6642 env["SRC_NODE"] = self.op.src_node
6643 env["SRC_PATH"] = self.op.src_path
6644 env["SRC_IMAGES"] = self.src_images
6646 env.update(_BuildInstanceHookEnv(
6647 name=self.op.instance_name,
6648 primary_node=self.op.pnode,
6649 secondary_nodes=self.secondaries,
6650 status=self.op.start,
6651 os_type=self.op.os_type,
6652 memory=self.be_full[constants.BE_MEMORY],
6653 vcpus=self.be_full[constants.BE_VCPUS],
6654 nics=_NICListToTuple(self, self.nics),
6655 disk_template=self.op.disk_template,
6656 disks=[(d["size"], d["mode"]) for d in self.disks],
6659 hypervisor_name=self.op.hypervisor,
6660 ))
6662 nl = ([self.cfg.GetMasterNode(), self.op.pnode] +
6663 self.secondaries)
6664 return env, nl, nl
6666 def _ReadExportInfo(self):
6667 """Reads the export information from disk.
6669 It will override the opcode source node and path with the actual
6670 information, if these two were not specified before.
6672 @return: the export information
6674 """
6675 assert self.op.mode == constants.INSTANCE_IMPORT
6677 src_node = self.op.src_node
6678 src_path = self.op.src_path
6680 if src_node is None:
6681 locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
6682 exp_list = self.rpc.call_export_list(locked_nodes)
6683 found = False
6684 for node in exp_list:
6685 if exp_list[node].fail_msg:
6686 continue
6687 if src_path in exp_list[node].payload:
6688 found = True
6689 self.op.src_node = src_node = node
6690 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
6691 src_path)
6692 break
6693 if not found:
6694 raise errors.OpPrereqError("No export found for relative path %s" %
6695 src_path, errors.ECODE_INVAL)
6697 _CheckNodeOnline(self, src_node)
6698 result = self.rpc.call_export_info(src_node, src_path)
6699 result.Raise("No export or invalid export found in dir %s" % src_path)
6701 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
6702 if not export_info.has_section(constants.INISECT_EXP):
6703 raise errors.ProgrammerError("Corrupted export config",
6704 errors.ECODE_ENVIRON)
6706 ei_version = export_info.get(constants.INISECT_EXP, "version")
6707 if (int(ei_version) != constants.EXPORT_VERSION):
6708 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
6709 (ei_version, constants.EXPORT_VERSION),
6710 errors.ECODE_ENVIRON)
6712 return export_info
6713 def _ReadExportParams(self, einfo):
6714 """Use export parameters as defaults.
6716 In case the opcode doesn't specify (as in override) some instance
6717 parameters, then try to use them from the export information, if
6718 they declare them.
6720 """
6721 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
6723 if self.op.disk_template is None:
6724 if einfo.has_option(constants.INISECT_INS, "disk_template"):
6725 self.op.disk_template = einfo.get(constants.INISECT_INS,
6726 "disk_template")
6727 else:
6728 raise errors.OpPrereqError("No disk template specified and the export"
6729 " is missing the disk_template information",
6730 errors.ECODE_INVAL)
6732 if not self.op.disks:
6733 if einfo.has_option(constants.INISECT_INS, "disk_count"):
6734 disks = []
6735 # TODO: import the disk iv_name too
6736 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
6737 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
6738 disks.append({"size": disk_sz})
6739 self.op.disks = disks
6740 else:
6741 raise errors.OpPrereqError("No disk info specified and the export"
6742 " is missing the disk information",
6743 errors.ECODE_INVAL)
6745 if (not self.op.nics and
6746 einfo.has_option(constants.INISECT_INS, "nic_count")):
6747 nics = []
6748 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
6749 ndict = {}
6750 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
6751 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
6752 ndict[name] = v
6753 nics.append(ndict)
6754 self.op.nics = nics
6756 if (self.op.hypervisor is None and
6757 einfo.has_option(constants.INISECT_INS, "hypervisor")):
6758 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
6759 if einfo.has_section(constants.INISECT_HYP):
6760 # use the export parameters but do not override the ones
6761 # specified by the user
6762 for name, value in einfo.items(constants.INISECT_HYP):
6763 if name not in self.op.hvparams:
6764 self.op.hvparams[name] = value
6766 if einfo.has_section(constants.INISECT_BEP):
6767 # use the parameters, without overriding
6768 for name, value in einfo.items(constants.INISECT_BEP):
6769 if name not in self.op.beparams:
6770 self.op.beparams[name] = value
6771 else:
6772 # try to read the parameters old style, from the main section
6773 for name in constants.BES_PARAMETERS:
6774 if (name not in self.op.beparams and
6775 einfo.has_option(constants.INISECT_INS, name)):
6776 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
6778 if einfo.has_section(constants.INISECT_OSP):
6779 # use the parameters, without overriding
6780 for name, value in einfo.items(constants.INISECT_OSP):
6781 if name not in self.op.osparams:
6782 self.op.osparams[name] = value
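# Illustrative export file sketch (invented values; the section names
# follow the constants used above, assumed to map to "export"/"instance"
# etc.): the einfo object parsed here is an INI-style config roughly like
#   [export]
#   version = 0
#   os = debian-image
#   [instance]
#   disk_count = 1
#   disk0_size = 1024
#   nic_count = 1
#   nic0_mac = aa:00:00:12:34:56
# with optional hypervisor/backend/OS sections supplying default
# hvparams, beparams and osparams as read in the loops above.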
6784 def _RevertToDefaults(self, cluster):
6785 """Revert the instance parameters to the default values.
6789 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
6790 for name in self.op.hvparams.keys():
6791 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
6792 del self.op.hvparams[name]
6794 be_defs = cluster.SimpleFillBE({})
6795 for name in self.op.beparams.keys():
6796 if name in be_defs and be_defs[name] == self.op.beparams[name]:
6797 del self.op.beparams[name]
6799 nic_defs = cluster.SimpleFillNIC({})
6800 for nic in self.op.nics:
6801 for name in constants.NICS_PARAMETERS:
6802 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
6803 del nic[name]
6805 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
6806 for name in self.op.osparams.keys():
6807 if name in os_defs and os_defs[name] == self.op.osparams[name]:
6808 del self.op.osparams[name]
6810 def CheckPrereq(self):
6811 """Check prerequisites.
6814 if self.op.mode == constants.INSTANCE_IMPORT:
6815 export_info = self._ReadExportInfo()
6816 self._ReadExportParams(export_info)
6818 _CheckDiskTemplate(self.op.disk_template)
6820 if (not self.cfg.GetVGName() and
6821 self.op.disk_template not in constants.DTS_NOT_LVM):
6822 raise errors.OpPrereqError("Cluster does not support lvm-based"
6823 " instances", errors.ECODE_STATE)
6825 if self.op.hypervisor is None:
6826 self.op.hypervisor = self.cfg.GetHypervisorType()
6828 cluster = self.cfg.GetClusterInfo()
6829 enabled_hvs = cluster.enabled_hypervisors
6830 if self.op.hypervisor not in enabled_hvs:
6831 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
6832 " cluster (%s)" % (self.op.hypervisor,
6833 ",".join(enabled_hvs)),
6836 # check hypervisor parameter syntax (locally)
6837 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
6838 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
6839 self.op.hvparams)
6840 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
6841 hv_type.CheckParameterSyntax(filled_hvp)
6842 self.hv_full = filled_hvp
6843 # check that we don't specify global parameters on an instance
6844 _CheckGlobalHvParams(self.op.hvparams)
6846 # fill and remember the beparams dict
6847 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
6848 self.be_full = cluster.SimpleFillBE(self.op.beparams)
6850 # build os parameters
6851 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
6853 # now that hvp/bep are in final format, let's reset to defaults,
6854 # if told to do so
6855 if self.op.identify_defaults:
6856 self._RevertToDefaults(cluster)
6858 # NIC buildup
6859 self.nics = []
6860 for idx, nic in enumerate(self.op.nics):
6861 nic_mode_req = nic.get("mode", None)
6862 nic_mode = nic_mode_req
6863 if nic_mode is None:
6864 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
6866 # in routed mode, for the first nic, the default ip is 'auto'
6867 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
6868 default_ip_mode = constants.VALUE_AUTO
6870 default_ip_mode = constants.VALUE_NONE
6872 # ip validity checks
6873 ip = nic.get("ip", default_ip_mode)
6874 if ip is None or ip.lower() == constants.VALUE_NONE:
6875 nic_ip = None
6876 elif ip.lower() == constants.VALUE_AUTO:
6877 if not self.op.name_check:
6878 raise errors.OpPrereqError("IP address set to auto but name checks"
6879 " have been skipped. Aborting.",
6880 errors.ECODE_INVAL)
6881 nic_ip = self.hostname1.ip
6882 else:
6883 if not utils.IsValidIP4(ip):
6884 raise errors.OpPrereqError("Given IP address '%s' doesn't look"
6885 " like a valid IP" % ip,
6886 errors.ECODE_INVAL)
6887 nic_ip = ip
6889 # TODO: check the ip address for uniqueness
6890 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
6891 raise errors.OpPrereqError("Routed nic mode requires an ip address",
6894 # MAC address verification
6895 mac = nic.get("mac", constants.VALUE_AUTO)
6896 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
6897 mac = utils.NormalizeAndValidateMac(mac)
6899 try:
6900 self.cfg.ReserveMAC(mac, self.proc.GetECId())
6901 except errors.ReservationError:
6902 raise errors.OpPrereqError("MAC address %s already in use"
6903 " in cluster" % mac,
6904 errors.ECODE_NOTUNIQUE)
6906 # bridge verification
6907 bridge = nic.get("bridge", None)
6908 link = nic.get("link", None)
6909 if bridge and link:
6910 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
6911 " at the same time", errors.ECODE_INVAL)
6912 elif bridge and nic_mode == constants.NIC_MODE_ROUTED:
6913 raise errors.OpPrereqError("Cannot pass 'bridge' on a routed nic",
6914 errors.ECODE_INVAL)
6915 elif bridge:
6916 link = bridge
6918 nicparams = {}
6919 if nic_mode_req:
6920 nicparams[constants.NIC_MODE] = nic_mode_req
6921 if link:
6922 nicparams[constants.NIC_LINK] = link
6924 check_params = cluster.SimpleFillNIC(nicparams)
6925 objects.NIC.CheckParameterSyntax(check_params)
6926 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
6928 # disk checks/pre-build
6929 self.disks = []
6930 for disk in self.op.disks:
6931 mode = disk.get("mode", constants.DISK_RDWR)
6932 if mode not in constants.DISK_ACCESS_SET:
6933 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
6934 mode, errors.ECODE_INVAL)
6935 size = disk.get("size", None)
6936 if size is None:
6937 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
6938 try:
6939 size = int(size)
6940 except (TypeError, ValueError):
6941 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
6942 errors.ECODE_INVAL)
6943 new_disk = {"size": size, "mode": mode}
6944 if "adopt" in disk:
6945 new_disk["adopt"] = disk["adopt"]
6946 self.disks.append(new_disk)
6948 if self.op.mode == constants.INSTANCE_IMPORT:
6950 # Check that the new instance doesn't have less disks than the export
6951 instance_disks = len(self.disks)
6952 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
6953 if instance_disks < export_disks:
6954 raise errors.OpPrereqError("Not enough disks to import."
6955 " (instance: %d, export: %d)" %
6956 (instance_disks, export_disks),
6957 errors.ECODE_INVAL)
6959 disk_images = []
6960 for idx in range(export_disks):
6961 option = 'disk%d_dump' % idx
6962 if export_info.has_option(constants.INISECT_INS, option):
6963 # FIXME: are the old os-es, disk sizes, etc. useful?
6964 export_name = export_info.get(constants.INISECT_INS, option)
6965 image = utils.PathJoin(self.op.src_path, export_name)
6966 disk_images.append(image)
6967 else:
6968 disk_images.append(False)
6970 self.src_images = disk_images
6972 old_name = export_info.get(constants.INISECT_INS, 'name')
6973 try:
6974 exp_nic_count = export_info.getint(constants.INISECT_INS, 'nic_count')
6975 except (TypeError, ValueError), err:
6976 raise errors.OpPrereqError("Invalid export file, nic_count is not"
6977 " an integer: %s" % str(err),
6978 errors.ECODE_INVAL)
6979 if self.op.instance_name == old_name:
6980 for idx, nic in enumerate(self.nics):
6981 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
6982 nic_mac_ini = 'nic%d_mac' % idx
6983 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
6985 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
6987 # ip ping checks (we use the same ip that was resolved in ExpandNames)
6988 if self.op.ip_check:
6989 if utils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
6990 raise errors.OpPrereqError("IP %s of instance %s already in use" %
6991 (self.check_ip, self.op.instance_name),
6992 errors.ECODE_NOTUNIQUE)
6994 #### mac address generation
6995 # By generating here the mac address both the allocator and the hooks get
6996 # the real final mac address rather than the 'auto' or 'generate' value.
6997 # There is a race condition between the generation and the instance object
6998 # creation, which means that we know the mac is valid now, but we're not
6999 # sure it will be when we actually add the instance. If things go bad
7000 # adding the instance will abort because of a duplicate mac, and the
7001 # creation job will fail.
7002 for nic in self.nics:
7003 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7004 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
7006 #### allocator run
7008 if self.op.iallocator is not None:
7009 self._RunAllocator()
7011 #### node related checks
7013 # check primary node
7014 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
7015 assert self.pnode is not None, \
7016 "Cannot retrieve locked node %s" % self.op.pnode
7018 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
7019 pnode.name, errors.ECODE_STATE)
7021 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
7022 pnode.name, errors.ECODE_STATE)
7024 self.secondaries = []
7026 # mirror node verification
7027 if self.op.disk_template in constants.DTS_NET_MIRROR:
7028 if self.op.snode is None:
7029 raise errors.OpPrereqError("The networked disk templates need"
7030 " a mirror node", errors.ECODE_INVAL)
7031 if self.op.snode == pnode.name:
7032 raise errors.OpPrereqError("The secondary node cannot be the"
7033 " primary node.", errors.ECODE_INVAL)
7034 _CheckNodeOnline(self, self.op.snode)
7035 _CheckNodeNotDrained(self, self.op.snode)
7036 self.secondaries.append(self.op.snode)
7038 nodenames = [pnode.name] + self.secondaries
7040 req_size = _ComputeDiskSize(self.op.disk_template,
7041 self.disks)
7043 # Check lv size requirements, if not adopting
7044 if req_size is not None and not self.adopt_disks:
7045 _CheckNodesFreeDisk(self, nodenames, req_size)
7047 if self.adopt_disks: # instead, we must check the adoption data
7048 all_lvs = set([i["adopt"] for i in self.disks])
7049 if len(all_lvs) != len(self.disks):
7050 raise errors.OpPrereqError("Duplicate volume names given for adoption",
7052 for lv_name in all_lvs:
7054 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
7055 except errors.ReservationError:
7056 raise errors.OpPrereqError("LV named %s used by another instance" %
7057 lv_name, errors.ECODE_NOTUNIQUE)
7059 node_lvs = self.rpc.call_lv_list([pnode.name],
7060 self.cfg.GetVGName())[pnode.name]
7061 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
7062 node_lvs = node_lvs.payload
7063 delta = all_lvs.difference(node_lvs.keys())
7064 if delta:
7065 raise errors.OpPrereqError("Missing logical volume(s): %s" %
7066 utils.CommaJoin(delta),
7067 errors.ECODE_INVAL)
7068 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
7069 if online_lvs:
7070 raise errors.OpPrereqError("Online logical volumes found, cannot"
7071 " adopt: %s" % utils.CommaJoin(online_lvs),
7072 errors.ECODE_STATE)
7073 # update the size of disk based on what is found
7074 for dsk in self.disks:
7075 dsk["size"] = int(float(node_lvs[dsk["adopt"]][0]))
7077 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
7079 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
7080 # check OS parameters (remotely)
7081 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
7083 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
7085 # memory check on primary node
7087 _CheckNodeFreeMemory(self, self.pnode.name,
7088 "creating instance %s" % self.op.instance_name,
7089 self.be_full[constants.BE_MEMORY],
7090 self.op.hypervisor)
7092 self.dry_run_result = list(nodenames)
7094 def Exec(self, feedback_fn):
7095 """Create and add the instance to the cluster.
7098 instance = self.op.instance_name
7099 pnode_name = self.pnode.name
7101 ht_kind = self.op.hypervisor
7102 if ht_kind in constants.HTS_REQ_PORT:
7103 network_port = self.cfg.AllocatePort()
7104 else:
7105 network_port = None
7107 if constants.ENABLE_FILE_STORAGE:
7108 # this is needed because os.path.join does not accept None arguments
7109 if self.op.file_storage_dir is None:
7110 string_file_storage_dir = ""
7112 string_file_storage_dir = self.op.file_storage_dir
7114 # build the full file storage dir path
7115 file_storage_dir = utils.PathJoin(self.cfg.GetFileStorageDir(),
7116 string_file_storage_dir, instance)
7118 file_storage_dir = ""
7120 disks = _GenerateDiskTemplate(self,
7121 self.op.disk_template,
7122 instance, pnode_name,
7123 self.secondaries,
7124 self.disks,
7125 file_storage_dir,
7126 self.op.file_driver,
7127 0)
7129 iobj = objects.Instance(name=instance, os=self.op.os_type,
7130 primary_node=pnode_name,
7131 nics=self.nics, disks=disks,
7132 disk_template=self.op.disk_template,
7133 admin_up=False,
7134 network_port=network_port,
7135 beparams=self.op.beparams,
7136 hvparams=self.op.hvparams,
7137 hypervisor=self.op.hypervisor,
7138 osparams=self.op.osparams,
7139 )
7141 if self.adopt_disks:
7142 # rename LVs to the newly-generated names; we need to construct
7143 # 'fake' LV disks with the old data, plus the new unique_id
7144 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
7145 rename_to = []
7146 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
7147 rename_to.append(t_dsk.logical_id)
7148 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk["adopt"])
7149 self.cfg.SetDiskID(t_dsk, pnode_name)
7150 result = self.rpc.call_blockdev_rename(pnode_name,
7151 zip(tmp_disks, rename_to))
7152 result.Raise("Failed to rename adoped LVs")
7154 feedback_fn("* creating instance disks...")
7156 _CreateDisks(self, iobj)
7157 except errors.OpExecError:
7158 self.LogWarning("Device creation failed, reverting...")
7160 _RemoveDisks(self, iobj)
7162 self.cfg.ReleaseDRBDMinors(instance)
7165 feedback_fn("adding instance %s to cluster config" % instance)
7167 self.cfg.AddInstance(iobj, self.proc.GetECId())
7169 # Declare that we don't want to remove the instance lock anymore, as we've
7170 # added the instance to the config
7171 del self.remove_locks[locking.LEVEL_INSTANCE]
7172 # Unlock all the nodes
7173 if self.op.mode == constants.INSTANCE_IMPORT:
7174 nodes_keep = [self.op.src_node]
7175 nodes_release = [node for node in self.acquired_locks[locking.LEVEL_NODE]
7176 if node != self.op.src_node]
7177 self.context.glm.release(locking.LEVEL_NODE, nodes_release)
7178 self.acquired_locks[locking.LEVEL_NODE] = nodes_keep
7180 self.context.glm.release(locking.LEVEL_NODE)
7181 del self.acquired_locks[locking.LEVEL_NODE]
7183 if self.op.wait_for_sync:
7184 disk_abort = not _WaitForSync(self, iobj)
7185 elif iobj.disk_template in constants.DTS_NET_MIRROR:
7186 # make sure the disks are not degraded (still sync-ing is ok)
7187 time.sleep(15)
7188 feedback_fn("* checking mirrors status")
7189 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
7190 else:
7191 disk_abort = False
7193 if disk_abort:
7194 _RemoveDisks(self, iobj)
7195 self.cfg.RemoveInstance(iobj.name)
7196 # Make sure the instance lock gets removed
7197 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
7198 raise errors.OpExecError("There are some degraded disks for"
7201 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
7202 if self.op.mode == constants.INSTANCE_CREATE:
7203 if not self.op.no_install:
7204 feedback_fn("* running the instance OS create scripts...")
7205 # FIXME: pass debug option from opcode to backend
7206 result = self.rpc.call_instance_os_add(pnode_name, iobj, False,
7207 self.op.debug_level)
7208 result.Raise("Could not add os for instance %s"
7209 " on node %s" % (instance, pnode_name))
7211 elif self.op.mode == constants.INSTANCE_IMPORT:
7212 feedback_fn("* running the instance OS import scripts...")
7216 for idx, image in enumerate(self.src_images):
7220 # FIXME: pass debug option from opcode to backend
7221 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
7222 constants.IEIO_FILE, (image, ),
7223 constants.IEIO_SCRIPT,
7224 (iobj.disks[idx], idx),
7225 None)
7226 transfers.append(dt)
7228 import_result = \
7229 masterd.instance.TransferInstanceData(self, feedback_fn,
7230 self.op.src_node, pnode_name,
7231 self.pnode.secondary_ip,
7232 iobj, transfers)
7233 if not compat.all(import_result):
7234 self.LogWarning("Some disks for instance %s on node %s were not"
7235 " imported successfully" % (instance, pnode_name))
7237 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7238 feedback_fn("* preparing remote import...")
7239 connect_timeout = constants.RIE_CONNECT_TIMEOUT
7240 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
7242 disk_results = masterd.instance.RemoteImport(self, feedback_fn, iobj,
7243 self.source_x509_ca,
7244 self._cds, timeouts)
7245 if not compat.all(disk_results):
7246 # TODO: Should the instance still be started, even if some disks
7247 # failed to import (valid for local imports, too)?
7248 self.LogWarning("Some disks for instance %s on node %s were not"
7249 " imported successfully" % (instance, pnode_name))
7251 # Run rename script on newly imported instance
7252 assert iobj.name == instance
7253 feedback_fn("Running rename script for %s" % instance)
7254 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
7255 self.source_instance_name,
7256 self.op.debug_level)
7257 if result.fail_msg:
7258 self.LogWarning("Failed to run rename script for %s on node"
7259 " %s: %s" % (instance, pnode_name, result.fail_msg))
7261 else:
7262 # also checked in the prereq part
7263 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
7264 % self.op.mode)
7266 if self.op.start:
7267 iobj.admin_up = True
7268 self.cfg.Update(iobj, feedback_fn)
7269 logging.info("Starting instance %s on node %s", instance, pnode_name)
7270 feedback_fn("* starting instance...")
7271 result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
7272 result.Raise("Could not start instance")
7274 return list(iobj.all_nodes)
7277 class LUConnectConsole(NoHooksLU):
7278 """Connect to an instance's console.
7280 This is somewhat special in that it returns the command line that
7281 you need to run on the master node in order to connect to the
7282 console.
7284 """
7285 _OP_PARAMS = [
7286 _PInstanceName
7287 ]
7288 REQ_BGL = False
7290 def ExpandNames(self):
7291 self._ExpandAndLockInstance()
7293 def CheckPrereq(self):
7294 """Check prerequisites.
7296 This checks that the instance is in the cluster.
7298 """
7299 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7300 assert self.instance is not None, \
7301 "Cannot retrieve locked instance %s" % self.op.instance_name
7302 _CheckNodeOnline(self, self.instance.primary_node)
7304 def Exec(self, feedback_fn):
7305 """Connect to the console of an instance
7308 instance = self.instance
7309 node = instance.primary_node
7311 node_insts = self.rpc.call_instance_list([node],
7312 [instance.hypervisor])[node]
7313 node_insts.Raise("Can't get node information from %s" % node)
7315 if instance.name not in node_insts.payload:
7316 raise errors.OpExecError("Instance %s is not running." % instance.name)
7318 logging.debug("Connecting to console of %s on %s", instance.name, node)
7320 hyper = hypervisor.GetHypervisor(instance.hypervisor)
7321 cluster = self.cfg.GetClusterInfo()
7322 # beparams and hvparams are passed separately, to avoid editing the
7323 # instance and then saving the defaults in the instance itself.
7324 hvparams = cluster.FillHV(instance)
7325 beparams = cluster.FillBE(instance)
7326 console_cmd = hyper.GetShellCommandForConsole(instance, hvparams, beparams)
7328 # build ssh cmdline
7329 return self.ssh.BuildCmd(node, "root", console_cmd, batch=True, tty=True)
7332 class LUReplaceDisks(LogicalUnit):
7333 """Replace the disks of an instance.
7336 HPATH = "mirrors-replace"
7337 HTYPE = constants.HTYPE_INSTANCE
7338 _OP_PARAMS = [
7339 _PInstanceName,
7340 ("mode", _NoDefault, _TElemOf(constants.REPLACE_MODES)),
7341 ("disks", _EmptyList, _TListOf(_TPositiveInt)),
7342 ("remote_node", None, _TMaybeString),
7343 ("iallocator", None, _TMaybeString),
7344 ("early_release", False, _TBool),
7348 def CheckArguments(self):
7349 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
7350 self.op.iallocator)
7352 def ExpandNames(self):
7353 self._ExpandAndLockInstance()
7355 if self.op.iallocator is not None:
7356 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7358 elif self.op.remote_node is not None:
7359 remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
7360 self.op.remote_node = remote_node
7362 # Warning: do not remove the locking of the new secondary here
7363 # unless DRBD8.AddChildren is changed to work in parallel;
7364 # currently it doesn't since parallel invocations of
7365 # FindUnusedMinor will conflict
7366 self.needed_locks[locking.LEVEL_NODE] = [remote_node]
7367 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
7369 else:
7370 self.needed_locks[locking.LEVEL_NODE] = []
7371 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7373 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
7374 self.op.iallocator, self.op.remote_node,
7375 self.op.disks, False, self.op.early_release)
7377 self.tasklets = [self.replacer]
7379 def DeclareLocks(self, level):
7380 # If we're not already locking all nodes in the set we have to declare the
7381 # instance's primary/secondary nodes.
7382 if (level == locking.LEVEL_NODE and
7383 self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET):
7384 self._LockInstancesNodes()
7386 def BuildHooksEnv(self):
7387 """Build hooks env.
7389 This runs on the master, the primary and all the secondaries.
7391 """
7392 instance = self.replacer.instance
7393 env = {
7394 "MODE": self.op.mode,
7395 "NEW_SECONDARY": self.op.remote_node,
7396 "OLD_SECONDARY": instance.secondary_nodes[0],
7397 }
7398 env.update(_BuildInstanceHookEnvByObject(self, instance))
7399 nl = [
7400 self.cfg.GetMasterNode(),
7401 instance.primary_node,
7402 ]
7403 if self.op.remote_node is not None:
7404 nl.append(self.op.remote_node)
7405 return env, nl, nl
7408 class TLReplaceDisks(Tasklet):
7409 """Replaces disks for an instance.
7411 Note: Locking is not within the scope of this class.
7413 """
7414 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
7415 disks, delay_iallocator, early_release):
7416 """Initializes this class.
7419 Tasklet.__init__(self, lu)
7422 self.instance_name = instance_name
7424 self.iallocator_name = iallocator_name
7425 self.remote_node = remote_node
7427 self.delay_iallocator = delay_iallocator
7428 self.early_release = early_release
7431 self.instance = None
7432 self.new_node = None
7433 self.target_node = None
7434 self.other_node = None
7435 self.remote_node_info = None
7436 self.node_secondary_ip = None
7438 @staticmethod
7439 def CheckArguments(mode, remote_node, iallocator):
7440 """Helper function for users of this class.
7443 # check for valid parameter combination
7444 if mode == constants.REPLACE_DISK_CHG:
7445 if remote_node is None and iallocator is None:
7446 raise errors.OpPrereqError("When changing the secondary either an"
7447 " iallocator script must be used or the"
7448 " new node given", errors.ECODE_INVAL)
7450 if remote_node is not None and iallocator is not None:
7451 raise errors.OpPrereqError("Give either the iallocator or the new"
7452 " secondary, not both", errors.ECODE_INVAL)
7454 elif remote_node is not None or iallocator is not None:
7455 # Not replacing the secondary
7456 raise errors.OpPrereqError("The iallocator and new node options can"
7457 " only be used when changing the"
7458 " secondary node", errors.ECODE_INVAL)
7460 @staticmethod
7461 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
7462 """Compute a new secondary node using an IAllocator.
7465 ial = IAllocator(lu.cfg, lu.rpc,
7466 mode=constants.IALLOCATOR_MODE_RELOC,
7467 name=instance_name,
7468 relocate_from=relocate_from)
7470 ial.Run(iallocator_name)
7472 if not ial.success:
7473 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
7474 " %s" % (iallocator_name, ial.info),
7475 errors.ECODE_NORES)
7477 if len(ial.result) != ial.required_nodes:
7478 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7479 " of nodes (%s), required %s" %
7481 len(ial.result), ial.required_nodes),
7484 remote_node_name = ial.result[0]
7486 lu.LogInfo("Selected new secondary for instance '%s': %s",
7487 instance_name, remote_node_name)
7489 return remote_node_name
7491 def _FindFaultyDisks(self, node_name):
7492 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
7493 node_name, True)
7495 def CheckPrereq(self):
7496 """Check prerequisites.
7498 This checks that the instance is in the cluster.
7500 """
7501 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
7502 assert instance is not None, \
7503 "Cannot retrieve locked instance %s" % self.instance_name
7505 if instance.disk_template != constants.DT_DRBD8:
7506 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
7507 " instances", errors.ECODE_INVAL)
7509 if len(instance.secondary_nodes) != 1:
7510 raise errors.OpPrereqError("The instance has a strange layout,"
7511 " expected one secondary but found %d" %
7512 len(instance.secondary_nodes),
7513 errors.ECODE_FAULT)
7515 if not self.delay_iallocator:
7516 self._CheckPrereq2()
7518 def _CheckPrereq2(self):
7519 """Check prerequisites, second part.
7521 This function should always be part of CheckPrereq. It was separated and is
7522 now called from Exec because during node evacuation iallocator was only
7523 called with an unmodified cluster model, not taking planned changes into
7524 account.
7526 """
7527 instance = self.instance
7528 secondary_node = instance.secondary_nodes[0]
7530 if self.iallocator_name is None:
7531 remote_node = self.remote_node
7532 else:
7533 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
7534 instance.name, instance.secondary_nodes)
7536 if remote_node is not None:
7537 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
7538 assert self.remote_node_info is not None, \
7539 "Cannot retrieve locked node %s" % remote_node
7541 self.remote_node_info = None
7543 if remote_node == self.instance.primary_node:
7544 raise errors.OpPrereqError("The specified node is the primary node of"
7545 " the instance.", errors.ECODE_INVAL)
7547 if remote_node == secondary_node:
7548 raise errors.OpPrereqError("The specified node is already the"
7549 " secondary node of the instance.",
7552 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
7553 constants.REPLACE_DISK_CHG):
7554 raise errors.OpPrereqError("Cannot specify disks to be replaced",
7557 if self.mode == constants.REPLACE_DISK_AUTO:
7558 faulty_primary = self._FindFaultyDisks(instance.primary_node)
7559 faulty_secondary = self._FindFaultyDisks(secondary_node)
7561 if faulty_primary and faulty_secondary:
7562 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
7563 " one node and can not be repaired"
7564 " automatically" % self.instance_name,
7568 self.disks = faulty_primary
7569 self.target_node = instance.primary_node
7570 self.other_node = secondary_node
7571 check_nodes = [self.target_node, self.other_node]
7572 elif faulty_secondary:
7573 self.disks = faulty_secondary
7574 self.target_node = secondary_node
7575 self.other_node = instance.primary_node
7576 check_nodes = [self.target_node, self.other_node]
7577 else:
7578 self.disks = []
7579 check_nodes = []
7581 else:
7582 # Non-automatic modes
7583 if self.mode == constants.REPLACE_DISK_PRI:
7584 self.target_node = instance.primary_node
7585 self.other_node = secondary_node
7586 check_nodes = [self.target_node, self.other_node]
7588 elif self.mode == constants.REPLACE_DISK_SEC:
7589 self.target_node = secondary_node
7590 self.other_node = instance.primary_node
7591 check_nodes = [self.target_node, self.other_node]
7593 elif self.mode == constants.REPLACE_DISK_CHG:
7594 self.new_node = remote_node
7595 self.other_node = instance.primary_node
7596 self.target_node = secondary_node
7597 check_nodes = [self.new_node, self.other_node]
7599 _CheckNodeNotDrained(self.lu, remote_node)
7601 old_node_info = self.cfg.GetNodeInfo(secondary_node)
7602 assert old_node_info is not None
7603 if old_node_info.offline and not self.early_release:
7604 # doesn't make sense to delay the release
7605 self.early_release = True
7606 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
7607 " early-release mode", secondary_node)
7610 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
7613 # If not specified all disks should be replaced
7615 self.disks = range(len(self.instance.disks))
7617 for node in check_nodes:
7618 _CheckNodeOnline(self.lu, node)
7620 # Check whether disks are valid
7621 for disk_idx in self.disks:
7622 instance.FindDisk(disk_idx)
7624 # Get secondary node IP addresses
7625 node_2nd_ip = {}
7627 for node_name in [self.target_node, self.other_node, self.new_node]:
7628 if node_name is not None:
7629 node_2nd_ip[node_name] = self.cfg.GetNodeInfo(node_name).secondary_ip
7631 self.node_secondary_ip = node_2nd_ip
7633 def Exec(self, feedback_fn):
7634 """Execute disk replacement.
7636 This dispatches the disk replacement to the appropriate handler.
7638 """
7639 if self.delay_iallocator:
7640 self._CheckPrereq2()
7642 if not self.disks:
7643 feedback_fn("No disks need replacement")
7644 return
7646 feedback_fn("Replacing disk(s) %s for %s" %
7647 (utils.CommaJoin(self.disks), self.instance.name))
7649 activate_disks = (not self.instance.admin_up)
7651 # Activate the instance disks if we're replacing them on a down instance
7652 if activate_disks:
7653 _StartInstanceDisks(self.lu, self.instance, True)
7655 try:
7656 # Should we replace the secondary node?
7657 if self.new_node is not None:
7658 fn = self._ExecDrbd8Secondary
7659 else:
7660 fn = self._ExecDrbd8DiskOnly
7662 return fn(feedback_fn)
7664 finally:
7665 # Deactivate the instance disks if we're replacing them on a
7666 # down instance
7667 if activate_disks:
7668 _SafeShutdownInstanceDisks(self.lu, self.instance)
7670 def _CheckVolumeGroup(self, nodes):
7671 self.lu.LogInfo("Checking volume groups")
7673 vgname = self.cfg.GetVGName()
7675 # Make sure volume group exists on all involved nodes
7676 results = self.rpc.call_vg_list(nodes)
7677 if not results:
7678 raise errors.OpExecError("Can't list volume groups on the nodes")
7680 for node in nodes:
7681 res = results[node]
7682 res.Raise("Error checking node %s" % node)
7683 if vgname not in res.payload:
7684 raise errors.OpExecError("Volume group '%s' not found on node %s" %
7685 (vgname, node))
7687 def _CheckDisksExistence(self, nodes):
7688 # Check disk existence
7689 for idx, dev in enumerate(self.instance.disks):
7690 if idx not in self.disks:
7691 continue
7693 for node in nodes:
7694 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
7695 self.cfg.SetDiskID(dev, node)
7697 result = self.rpc.call_blockdev_find(node, dev)
7699 msg = result.fail_msg
7700 if msg or not result.payload:
7701 if not msg:
7702 msg = "disk not found"
7703 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
7704 (idx, node, msg))
7706 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
7707 for idx, dev in enumerate(self.instance.disks):
7708 if idx not in self.disks:
7709 continue
7711 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
7712 (idx, node_name))
7714 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
7715 ldisk=ldisk):
7716 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
7717 " replace disks for instance %s" %
7718 (node_name, self.instance.name))
7720 def _CreateNewStorage(self, node_name):
7721 vgname = self.cfg.GetVGName()
7722 iv_names = {}
7724 for idx, dev in enumerate(self.instance.disks):
7725 if idx not in self.disks:
7726 continue
7728 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
7730 self.cfg.SetDiskID(dev, node_name)
7732 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
7733 names = _GenerateUniqueNames(self.lu, lv_names)
7735 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
7736 logical_id=(vgname, names[0]))
7737 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
7738 logical_id=(vgname, names[1]))
7740 new_lvs = [lv_data, lv_meta]
7741 old_lvs = dev.children
7742 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
7744 # we pass force_create=True to force the LVM creation
7745 for new_lv in new_lvs:
7746 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
7747 _GetInstanceInfoText(self.instance), False)
7749 return iv_names
7751 def _CheckDevices(self, node_name, iv_names):
7752 for name, (dev, _, _) in iv_names.iteritems():
7753 self.cfg.SetDiskID(dev, node_name)
7755 result = self.rpc.call_blockdev_find(node_name, dev)
7757 msg = result.fail_msg
7758 if msg or not result.payload:
7759 if not msg:
7760 msg = "disk not found"
7761 raise errors.OpExecError("Can't find DRBD device %s: %s" %
7762 (name, msg))
7764 if result.payload.is_degraded:
7765 raise errors.OpExecError("DRBD device %s is degraded!" % name)
7767 def _RemoveOldStorage(self, node_name, iv_names):
7768 for name, (_, old_lvs, _) in iv_names.iteritems():
7769 self.lu.LogInfo("Remove logical volumes for %s" % name)
7772 self.cfg.SetDiskID(lv, node_name)
7774 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
7776 self.lu.LogWarning("Can't remove old LV: %s" % msg,
7777 hint="remove unused LVs manually")
7779 def _ReleaseNodeLock(self, node_name):
7780 """Releases the lock for a given node."""
7781 self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
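# Note: despite the singular parameter name, the callers below also pass a
# list of node names, which glm.release accepts as well.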
7783 def _ExecDrbd8DiskOnly(self, feedback_fn):
7784 """Replace a disk on the primary or secondary for DRBD 8.
7786 The algorithm for replace is quite complicated:
7788 1. for each disk to be replaced:
7790 1. create new LVs on the target node with unique names
7791 1. detach old LVs from the drbd device
7792 1. rename old LVs to name_replaced.<time_t>
7793 1. rename new LVs to old LVs
7794 1. attach the new LVs (with the old names now) to the drbd device
7796 1. wait for sync across all devices
7798 1. for each modified disk:
7800 1. remove old LVs (which have the name name_replaced.<time_t>)
7802 Failures are not very well handled.
7804 """
7805 steps_total = 6
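# Roughly, the per-disk shuffle performed below (LV names illustrative):
#   <old>_data -> <old>_data_replaced-<time_t>   (free the old name)
#   <new>_data -> <old>_data                     (new LV takes it over)
# after which the old children are detached from the drbd device and the
# renamed new LVs attached in their place.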
7807 # Step: check device activation
7808 self.lu.LogStep(1, steps_total, "Check device existence")
7809 self._CheckDisksExistence([self.other_node, self.target_node])
7810 self._CheckVolumeGroup([self.target_node, self.other_node])
7812 # Step: check other node consistency
7813 self.lu.LogStep(2, steps_total, "Check peer consistency")
7814 self._CheckDisksConsistency(self.other_node,
7815 self.other_node == self.instance.primary_node,
7816 False)
7818 # Step: create new storage
7819 self.lu.LogStep(3, steps_total, "Allocate new storage")
7820 iv_names = self._CreateNewStorage(self.target_node)
7822 # Step: for each lv, detach+rename*2+attach
7823 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
7824 for dev, old_lvs, new_lvs in iv_names.itervalues():
7825 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
7827 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
7828 old_lvs)
7829 result.Raise("Can't detach drbd from local storage on node"
7830 " %s for device %s" % (self.target_node, dev.iv_name))
7832 #cfg.Update(instance)
7834 # ok, we created the new LVs, so now we know we have the needed
7835 # storage; as such, we proceed on the target node to rename
7836 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
7837 # using the assumption that logical_id == physical_id (which in
7838 # turn is the unique_id on that node)
7840 # FIXME(iustin): use a better name for the replaced LVs
7841 temp_suffix = int(time.time())
7842 ren_fn = lambda d, suff: (d.physical_id[0],
7843 d.physical_id[1] + "_replaced-%s" % suff)
7845 # Build the rename list based on what LVs exist on the node
7846 rename_old_to_new = []
7847 for to_ren in old_lvs:
7848 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
7849 if not result.fail_msg and result.payload:
7851 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
7853 self.lu.LogInfo("Renaming the old LVs on the target node")
7854 result = self.rpc.call_blockdev_rename(self.target_node,
7855 rename_old_to_new)
7856 result.Raise("Can't rename old LVs on node %s" % self.target_node)
7858 # Now we rename the new LVs to the old LVs
7859 self.lu.LogInfo("Renaming the new LVs on the target node")
7860 rename_new_to_old = [(new, old.physical_id)
7861 for old, new in zip(old_lvs, new_lvs)]
7862 result = self.rpc.call_blockdev_rename(self.target_node,
7863 rename_new_to_old)
7864 result.Raise("Can't rename new LVs on node %s" % self.target_node)
7866 for old, new in zip(old_lvs, new_lvs):
7867 new.logical_id = old.logical_id
7868 self.cfg.SetDiskID(new, self.target_node)
7870 for disk in old_lvs:
7871 disk.logical_id = ren_fn(disk, temp_suffix)
7872 self.cfg.SetDiskID(disk, self.target_node)
7874 # Now that the new lvs have the old name, we can add them to the device
7875 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
7876 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
7877 new_lvs)
7878 msg = result.fail_msg
7879 if msg:
7880 for new_lv in new_lvs:
7881 msg2 = self.rpc.call_blockdev_remove(self.target_node,
7882 new_lv).fail_msg
7883 if msg2:
7884 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
7885 hint=("cleanup manually the unused logical"
7886 " volumes"))
7887 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
7889 dev.children = new_lvs
7891 self.cfg.Update(self.instance, feedback_fn)
7893 cstep = 5
7894 if self.early_release:
7895 self.lu.LogStep(cstep, steps_total, "Removing old storage")
7896 cstep += 1
7897 self._RemoveOldStorage(self.target_node, iv_names)
7898 # WARNING: we release both node locks here, do not do other RPCs
7899 # than WaitForSync to the primary node
7900 self._ReleaseNodeLock([self.target_node, self.other_node])
7903 # This can fail as the old devices are degraded and _WaitForSync
7904 # does a combined result over all disks, so we don't check its return value
7905 self.lu.LogStep(cstep, steps_total, "Sync devices")
7906 cstep += 1
7907 _WaitForSync(self.lu, self.instance)
7909 # Check all devices manually
7910 self._CheckDevices(self.instance.primary_node, iv_names)
7912 # Step: remove old storage
7913 if not self.early_release:
7914 self.lu.LogStep(cstep, steps_total, "Removing old storage")
7915 cstep += 1
7916 self._RemoveOldStorage(self.target_node, iv_names)
7918 def _ExecDrbd8Secondary(self, feedback_fn):
7919 """Replace the secondary node for DRBD 8.
7921 The algorithm for replace is quite complicated:
7922 - for all disks of the instance:
7923 - create new LVs on the new node with same names
7924 - shutdown the drbd device on the old secondary
7925 - disconnect the drbd network on the primary
7926 - create the drbd device on the new secondary
7927 - network attach the drbd on the primary, using an artifice:
7928 the drbd code for Attach() will connect to the network if it
7929 finds a device which is connected to the good local disks but
7930 not network enabled
7931 - wait for sync across all devices
7932 - remove all disks from the old secondary
7934 Failures are not very well handled.
7936 """
7937 steps_total = 6
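# Sketch of the two logical_ids built per disk below (values illustrative):
# from the old (nodeA, nodeB, port, minorA, minorB, secret), the new drbd is
# first created as (nodeA, new_node, None, minorA, new_minor, secret), i.e.
# without a port so it comes up standalone, and later activated with
# (nodeA, new_node, port, minorA, new_minor, secret).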
7939 # Step: check device activation
7940 self.lu.LogStep(1, steps_total, "Check device existence")
7941 self._CheckDisksExistence([self.instance.primary_node])
7942 self._CheckVolumeGroup([self.instance.primary_node])
7944 # Step: check other node consistency
7945 self.lu.LogStep(2, steps_total, "Check peer consistency")
7946 self._CheckDisksConsistency(self.instance.primary_node, True, True)
7948 # Step: create new storage
7949 self.lu.LogStep(3, steps_total, "Allocate new storage")
7950 for idx, dev in enumerate(self.instance.disks):
7951 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
7952 (self.new_node, idx))
7953 # we pass force_create=True to force LVM creation
7954 for new_lv in dev.children:
7955 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
7956 _GetInstanceInfoText(self.instance), False)
7958 # Step 4: drbd minors and drbd setups changes
7959 # after this, we must manually remove the drbd minors on both the
7960 # error and the success paths
7961 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
7962 minors = self.cfg.AllocateDRBDMinor([self.new_node
7963 for dev in self.instance.disks],
7964 self.instance.name)
7965 logging.debug("Allocated minors %r", minors)
7967 iv_names = {}
7968 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
7969 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
7970 (self.new_node, idx))
7971 # create new devices on new_node; note that we create two IDs:
7972 # one without port, so the drbd will be activated without
7973 # networking information on the new node at this stage, and one
7974 # with network, for the latter activation in step 4
7975 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
7976 if self.instance.primary_node == o_node1:
7977 p_minor = o_minor1
7978 else:
7979 assert self.instance.primary_node == o_node2, "Three-node instance?"
7980 p_minor = o_minor2
7982 new_alone_id = (self.instance.primary_node, self.new_node, None,
7983 p_minor, new_minor, o_secret)
7984 new_net_id = (self.instance.primary_node, self.new_node, o_port,
7985 p_minor, new_minor, o_secret)
7987 iv_names[idx] = (dev, dev.children, new_net_id)
7988 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
7990 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
7991 logical_id=new_alone_id,
7992 children=dev.children,
7993 size=dev.size)
7994 try:
7995 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
7996 _GetInstanceInfoText(self.instance), False)
7997 except errors.GenericError:
7998 self.cfg.ReleaseDRBDMinors(self.instance.name)
7999 raise
8001 # We have new devices, shutdown the drbd on the old secondary
8002 for idx, dev in enumerate(self.instance.disks):
8003 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
8004 self.cfg.SetDiskID(dev, self.target_node)
8005 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
8006 if msg:
8007 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
8008 " node: %s" % (idx, msg),
8009 hint=("Please cleanup this device manually as"
8010 " soon as possible"))
8012 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
8013 result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
8014 self.node_secondary_ip,
8015 self.instance.disks)\
8016 [self.instance.primary_node]
8018 msg = result.fail_msg
8019 if msg:
8020 # detaches didn't succeed (unlikely)
8021 self.cfg.ReleaseDRBDMinors(self.instance.name)
8022 raise errors.OpExecError("Can't detach the disks from the network on"
8023 " old node: %s" % (msg,))
8025 # if we managed to detach at least one, we update all the disks of
8026 # the instance to point to the new secondary
8027 self.lu.LogInfo("Updating instance configuration")
8028 for dev, _, new_logical_id in iv_names.itervalues():
8029 dev.logical_id = new_logical_id
8030 self.cfg.SetDiskID(dev, self.instance.primary_node)
8032 self.cfg.Update(self.instance, feedback_fn)
8034 # and now perform the drbd attach
8035 self.lu.LogInfo("Attaching primary drbds to new secondary"
8036 " (standalone => connected)")
8037 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
8038 self.new_node],
8039 self.node_secondary_ip,
8040 self.instance.disks,
8041 self.instance.name,
8042 False)
8043 for to_node, to_result in result.items():
8044 msg = to_result.fail_msg
8045 if msg:
8046 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
8047 to_node, msg,
8048 hint=("please do a gnt-instance info to see the"
8049 " status of disks"))
8050 cstep = 5
8051 if self.early_release:
8052 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8053 cstep += 1
8054 self._RemoveOldStorage(self.target_node, iv_names)
8055 # WARNING: we release all node locks here, do not do other RPCs
8056 # than WaitForSync to the primary node
8057 self._ReleaseNodeLock([self.instance.primary_node,
8058 self.target_node,
8059 self.new_node])
8062 # This can fail as the old devices are degraded and _WaitForSync
8063 # does a combined result over all disks, so we don't check its return value
8064 self.lu.LogStep(cstep, steps_total, "Sync devices")
8065 cstep += 1
8066 _WaitForSync(self.lu, self.instance)
8068 # Check all devices manually
8069 self._CheckDevices(self.instance.primary_node, iv_names)
8071 # Step: remove old storage
8072 if not self.early_release:
8073 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8074 self._RemoveOldStorage(self.target_node, iv_names)
8077 class LURepairNodeStorage(NoHooksLU):
8078 """Repairs the volume group on a node.
8083 ("storage_type", _NoDefault, _CheckStorageType),
8084 ("name", _NoDefault, _TNonEmptyString),
8085 ("ignore_consistency", False, _TBool),
8089 def CheckArguments(self):
8090 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
8092 storage_type = self.op.storage_type
8094 if (constants.SO_FIX_CONSISTENCY not in
8095 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
8096 raise errors.OpPrereqError("Storage units of type '%s' can not be"
8097 " repaired" % storage_type,
8100 def ExpandNames(self):
8101 self.needed_locks = {
8102 locking.LEVEL_NODE: [self.op.node_name],
8103 }
8105 def _CheckFaultyDisks(self, instance, node_name):
8106 """Ensure faulty disks abort the opcode or at least warn."""
8107 try:
8108 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
8109 node_name, True):
8110 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
8111 " node '%s'" % (instance.name, node_name),
8112 errors.ECODE_STATE)
8113 except errors.OpPrereqError, err:
8114 if self.op.ignore_consistency:
8115 self.proc.LogWarning(str(err.args[0]))
8116 else:
8117 raise
8119 def CheckPrereq(self):
8120 """Check prerequisites.
8123 # Check whether any instance on this node has faulty disks
8124 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
8125 if not inst.admin_up:
8126 continue
8127 check_nodes = set(inst.all_nodes)
8128 check_nodes.discard(self.op.node_name)
8129 for inst_node_name in check_nodes:
8130 self._CheckFaultyDisks(inst, inst_node_name)
8132 def Exec(self, feedback_fn):
8133 feedback_fn("Repairing storage unit '%s' on %s ..." %
8134 (self.op.name, self.op.node_name))
8136 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
8137 result = self.rpc.call_storage_execute(self.op.node_name,
8138 self.op.storage_type, st_args,
8139 self.op.name,
8140 constants.SO_FIX_CONSISTENCY)
8141 result.Raise("Failed to repair storage unit '%s' on %s" %
8142 (self.op.name, self.op.node_name))
8145 class LUNodeEvacuationStrategy(NoHooksLU):
8146 """Computes the node evacuation strategy.
8150 ("nodes", _NoDefault, _TListOf(_TNonEmptyString)),
8151 ("remote_node", None, _TMaybeString),
8152 ("iallocator", None, _TMaybeString),
8156 def CheckArguments(self):
8157 if self.op.remote_node is not None and self.op.iallocator is not None:
8158 raise errors.OpPrereqError("Give either the iallocator or the new"
8159 " secondary, not both", errors.ECODE_INVAL)
8161 def ExpandNames(self):
8162 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
8163 self.needed_locks = locks = {}
8164 if self.op.remote_node is None:
8165 locks[locking.LEVEL_NODE] = locking.ALL_SET
8166 else:
8167 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8168 locks[locking.LEVEL_NODE] = self.op.nodes + [self.op.remote_node]
8170 def Exec(self, feedback_fn):
8171 if self.op.remote_node is not None:
8172 instances = []
8173 for node in self.op.nodes:
8174 instances.extend(_GetNodeSecondaryInstances(self.cfg, node))
8175 result = []
8176 for i in instances:
8177 if i.primary_node == self.op.remote_node:
8178 raise errors.OpPrereqError("Node %s is the primary node of"
8179 " instance %s, cannot use it as"
8181 (self.op.remote_node, i.name),
8183 result.append([i.name, self.op.remote_node])
8185 ial = IAllocator(self.cfg, self.rpc,
8186 mode=constants.IALLOCATOR_MODE_MEVAC,
8187 evac_nodes=self.op.nodes)
8188 ial.Run(self.op.iallocator, validate=True)
8189 if not ial.success:
8190 raise errors.OpExecError("No valid evacuation solution: %s" % ial.info,
8191 errors.ECODE_NORES)
8192 result = ial.result
8194 return result
8196 class LUGrowDisk(LogicalUnit):
8197 """Grow a disk of an instance.
8201 HTYPE = constants.HTYPE_INSTANCE
8204 ("disk", _NoDefault, _TInt),
8205 ("amount", _NoDefault, _TInt),
8206 ("wait_for_sync", True, _TBool),
8210 def ExpandNames(self):
8211 self._ExpandAndLockInstance()
8212 self.needed_locks[locking.LEVEL_NODE] = []
8213 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8215 def DeclareLocks(self, level):
8216 if level == locking.LEVEL_NODE:
8217 self._LockInstancesNodes()
8219 def BuildHooksEnv(self):
8222 This runs on the master, the primary and all the secondaries.
8226 "DISK": self.op.disk,
8227 "AMOUNT": self.op.amount,
8229 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
8230 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
8233 def CheckPrereq(self):
8234 """Check prerequisites.
8236 This checks that the instance is in the cluster.
8239 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
8240 assert instance is not None, \
8241 "Cannot retrieve locked instance %s" % self.op.instance_name
8242 nodenames = list(instance.all_nodes)
8243 for node in nodenames:
8244 _CheckNodeOnline(self, node)
8246 self.instance = instance
8248 if instance.disk_template not in constants.DTS_GROWABLE:
8249 raise errors.OpPrereqError("Instance's disk layout does not support"
8250 " growing.", errors.ECODE_INVAL)
8252 self.disk = instance.FindDisk(self.op.disk)
8254 if instance.disk_template != constants.DT_FILE:
8255 # TODO: check the free disk space for file, when that feature will be
8256 # supported
8257 _CheckNodesFreeDisk(self, nodenames, self.op.amount)
8259 def Exec(self, feedback_fn):
8260 """Execute disk grow.
8263 instance = self.instance
8264 disk = self.disk
8266 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
8267 if not disks_ok:
8268 raise errors.OpExecError("Cannot activate block device to grow")
8270 for node in instance.all_nodes:
8271 self.cfg.SetDiskID(disk, node)
8272 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
8273 result.Raise("Grow request failed to node %s" % node)
8275 # TODO: Rewrite code to work properly
8276 # DRBD goes into sync mode for a short amount of time after executing the
8277 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
8278 # calling "resize" in sync mode fails. Sleeping for a short amount of
8279 # time is a work-around.
8280 time.sleep(5)
8282 disk.RecordGrow(self.op.amount)
8283 self.cfg.Update(instance, feedback_fn)
8284 if self.op.wait_for_sync:
8285 disk_abort = not _WaitForSync(self, instance, disks=[disk])
8286 if disk_abort:
8287 self.proc.LogWarning("Warning: disk sync-ing has not returned a good"
8288 " status.\nPlease check the instance.")
8289 if not instance.admin_up:
8290 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
8291 elif not instance.admin_up:
8292 self.proc.LogWarning("Not shutting down the disk even if the instance is"
8293 " not supposed to be running because no wait for"
8294 " sync mode was requested.")
8297 class LUQueryInstanceData(NoHooksLU):
8298 """Query runtime instance data.
8302 ("instances", _EmptyList, _TListOf(_TNonEmptyString)),
8303 ("static", False, _TBool),
8307 def ExpandNames(self):
8308 self.needed_locks = {}
8309 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
8311 if self.op.instances:
8312 self.wanted_names = []
8313 for name in self.op.instances:
8314 full_name = _ExpandInstanceName(self.cfg, name)
8315 self.wanted_names.append(full_name)
8316 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
8317 else:
8318 self.wanted_names = None
8319 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
8321 self.needed_locks[locking.LEVEL_NODE] = []
8322 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8324 def DeclareLocks(self, level):
8325 if level == locking.LEVEL_NODE:
8326 self._LockInstancesNodes()
8328 def CheckPrereq(self):
8329 """Check prerequisites.
8331 This only checks the optional instance list against the existing names.
8334 if self.wanted_names is None:
8335 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
8337 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
8338 in self.wanted_names]
8340 def _ComputeBlockdevStatus(self, node, instance_name, dev):
8341 """Returns the status of a block device
8344 if self.op.static or not node:
8345 return None
8347 self.cfg.SetDiskID(dev, node)
8349 result = self.rpc.call_blockdev_find(node, dev)
8350 if result.offline:
8351 return None
8353 result.Raise("Can't compute disk status for %s" % instance_name)
8355 status = result.payload
8356 if status is None:
8357 return None
8359 return (status.dev_path, status.major, status.minor,
8360 status.sync_percent, status.estimated_time,
8361 status.is_degraded, status.ldisk_status)
8363 def _ComputeDiskStatus(self, instance, snode, dev):
8364 """Compute block device status.
8367 if dev.dev_type in constants.LDS_DRBD:
8368 # we change the snode then (otherwise we use the one passed in)
8369 if dev.logical_id[0] == instance.primary_node:
8370 snode = dev.logical_id[1]
8371 else:
8372 snode = dev.logical_id[0]
8374 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
8375 instance.name, dev)
8376 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
8378 if dev.children:
8379 dev_children = [self._ComputeDiskStatus(instance, snode, child)
8380 for child in dev.children]
8381 else:
8382 dev_children = []
8384 return {
8385 "iv_name": dev.iv_name,
8386 "dev_type": dev.dev_type,
8387 "logical_id": dev.logical_id,
8388 "physical_id": dev.physical_id,
8389 "pstatus": dev_pstatus,
8390 "sstatus": dev_sstatus,
8391 "children": dev_children,
8398 def Exec(self, feedback_fn):
8399 """Gather and return data"""
8402 cluster = self.cfg.GetClusterInfo()
8404 for instance in self.wanted_instances:
8405 if not self.op.static:
8406 remote_info = self.rpc.call_instance_info(instance.primary_node,
8407 instance.name,
8408 instance.hypervisor)
8409 remote_info.Raise("Error checking node %s" % instance.primary_node)
8410 remote_info = remote_info.payload
8411 if remote_info and "state" in remote_info:
8414 remote_state = "down"
8417 if instance.admin_up:
8420 config_state = "down"
8422 disks = [self._ComputeDiskStatus(instance, None, device)
8423 for device in instance.disks]
8426 "name": instance.name,
8427 "config_state": config_state,
8428 "run_state": remote_state,
8429 "pnode": instance.primary_node,
8430 "snodes": instance.secondary_nodes,
8432 # this happens to be the same format used for hooks
8433 "nics": _NICListToTuple(self, instance.nics),
8434 "disk_template": instance.disk_template,
8436 "hypervisor": instance.hypervisor,
8437 "network_port": instance.network_port,
8438 "hv_instance": instance.hvparams,
8439 "hv_actual": cluster.FillHV(instance, skip_globals=True),
8440 "be_instance": instance.beparams,
8441 "be_actual": cluster.FillBE(instance),
8442 "os_instance": instance.osparams,
8443 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
8444 "serial_no": instance.serial_no,
8445 "mtime": instance.mtime,
8446 "ctime": instance.ctime,
8447 "uuid": instance.uuid,
8450 result[instance.name] = idict
8455 class LUSetInstanceParams(LogicalUnit):
8456 """Modifies an instances's parameters.
8459 HPATH = "instance-modify"
8460 HTYPE = constants.HTYPE_INSTANCE
8463 ("nics", _EmptyList, _TList),
8464 ("disks", _EmptyList, _TList),
8465 ("beparams", _EmptyDict, _TDict),
8466 ("hvparams", _EmptyDict, _TDict),
8467 ("disk_template", None, _TMaybeString),
8468 ("remote_node", None, _TMaybeString),
8469 ("os_name", None, _TMaybeString),
8470 ("force_variant", False, _TBool),
8471 ("osparams", None, _TOr(_TDict, _TNone)),
8476 def CheckArguments(self):
8477 if not (self.op.nics or self.op.disks or self.op.disk_template or
8478 self.op.hvparams or self.op.beparams or self.op.os_name):
8479 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
8481 if self.op.hvparams:
8482 _CheckGlobalHvParams(self.op.hvparams)
8485 disk_addremove = 0
8486 for disk_op, disk_dict in self.op.disks:
8487 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
8488 if disk_op == constants.DDM_REMOVE:
8489 disk_addremove += 1
8490 continue
8491 elif disk_op == constants.DDM_ADD:
8492 disk_addremove += 1
8493 else:
8494 if not isinstance(disk_op, int):
8495 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
8496 if not isinstance(disk_dict, dict):
8497 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
8498 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
8500 if disk_op == constants.DDM_ADD:
8501 mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
8502 if mode not in constants.DISK_ACCESS_SET:
8503 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
8505 size = disk_dict.get('size', None)
8507 raise errors.OpPrereqError("Required disk parameter size missing",
8511 except (TypeError, ValueError), err:
8512 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
8513 str(err), errors.ECODE_INVAL)
8514 disk_dict['size'] = size
8516 # modification of disk
8517 if 'size' in disk_dict:
8518 raise errors.OpPrereqError("Disk size change not possible, use"
8519 " grow-disk", errors.ECODE_INVAL)
8521 if disk_addremove > 1:
8522 raise errors.OpPrereqError("Only one disk add or remove operation"
8523 " supported at a time", errors.ECODE_INVAL)
8525 if self.op.disks and self.op.disk_template is not None:
8526 raise errors.OpPrereqError("Disk template conversion and other disk"
8527 " changes not supported at the same time",
8530 if self.op.disk_template:
8531 _CheckDiskTemplate(self.op.disk_template)
8532 if (self.op.disk_template in constants.DTS_NET_MIRROR and
8533 self.op.remote_node is None):
8534 raise errors.OpPrereqError("Changing the disk template to a mirrored"
8535 " one requires specifying a secondary node",
8540 for nic_op, nic_dict in self.op.nics:
8541 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
8542 if nic_op == constants.DDM_REMOVE:
8543 nic_addremove += 1
8544 continue
8545 elif nic_op == constants.DDM_ADD:
8546 nic_addremove += 1
8547 else:
8548 if not isinstance(nic_op, int):
8549 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
8550 if not isinstance(nic_dict, dict):
8551 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
8552 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
8554 # nic_dict should be a dict
8555 nic_ip = nic_dict.get('ip', None)
8556 if nic_ip is not None:
8557 if nic_ip.lower() == constants.VALUE_NONE:
8558 nic_dict['ip'] = None
8560 if not utils.IsValidIP4(nic_ip):
8561 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
8564 nic_bridge = nic_dict.get('bridge', None)
8565 nic_link = nic_dict.get('link', None)
8566 if nic_bridge and nic_link:
8567 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
8568 " at the same time", errors.ECODE_INVAL)
8569 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
8570 nic_dict['bridge'] = None
8571 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
8572 nic_dict['link'] = None
8574 if nic_op == constants.DDM_ADD:
8575 nic_mac = nic_dict.get('mac', None)
8577 nic_dict['mac'] = constants.VALUE_AUTO
8579 if 'mac' in nic_dict:
8580 nic_mac = nic_dict['mac']
8581 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8582 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
8584 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
8585 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
8586 " modifying an existing nic",
8589 if nic_addremove > 1:
8590 raise errors.OpPrereqError("Only one NIC add or remove operation"
8591 " supported at a time", errors.ECODE_INVAL)
8593 def ExpandNames(self):
8594 self._ExpandAndLockInstance()
8595 self.needed_locks[locking.LEVEL_NODE] = []
8596 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8598 def DeclareLocks(self, level):
8599 if level == locking.LEVEL_NODE:
8600 self._LockInstancesNodes()
8601 if self.op.disk_template and self.op.remote_node:
8602 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8603 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
8605 def BuildHooksEnv(self):
8608 This runs on the master, primary and secondaries.
8611 args = dict()
8612 if constants.BE_MEMORY in self.be_new:
8613 args['memory'] = self.be_new[constants.BE_MEMORY]
8614 if constants.BE_VCPUS in self.be_new:
8615 args['vcpus'] = self.be_new[constants.BE_VCPUS]
8616 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
8617 # information at all.
8618 if self.op.nics:
8619 args['nics'] = []
8620 nic_override = dict(self.op.nics)
8621 for idx, nic in enumerate(self.instance.nics):
8622 if idx in nic_override:
8623 this_nic_override = nic_override[idx]
8625 this_nic_override = {}
8626 if 'ip' in this_nic_override:
8627 ip = this_nic_override['ip']
8630 if 'mac' in this_nic_override:
8631 mac = this_nic_override['mac']
8634 if idx in self.nic_pnew:
8635 nicparams = self.nic_pnew[idx]
8637 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
8638 mode = nicparams[constants.NIC_MODE]
8639 link = nicparams[constants.NIC_LINK]
8640 args['nics'].append((ip, mac, mode, link))
8641 if constants.DDM_ADD in nic_override:
8642 ip = nic_override[constants.DDM_ADD].get('ip', None)
8643 mac = nic_override[constants.DDM_ADD]['mac']
8644 nicparams = self.nic_pnew[constants.DDM_ADD]
8645 mode = nicparams[constants.NIC_MODE]
8646 link = nicparams[constants.NIC_LINK]
8647 args['nics'].append((ip, mac, mode, link))
8648 elif constants.DDM_REMOVE in nic_override:
8649 del args['nics'][-1]
8651 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
8652 if self.op.disk_template:
8653 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
8654 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
8655 return env, nl, nl
8657 def CheckPrereq(self):
8658 """Check prerequisites.
8660 This only checks the instance list against the existing names.
8663 # checking the new params on the primary/secondary nodes
8665 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
8666 cluster = self.cluster = self.cfg.GetClusterInfo()
8667 assert self.instance is not None, \
8668 "Cannot retrieve locked instance %s" % self.op.instance_name
8669 pnode = instance.primary_node
8670 nodelist = list(instance.all_nodes)
8673 if self.op.os_name and not self.op.force:
8674 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
8675 self.op.force_variant)
8676 instance_os = self.op.os_name
8677 else:
8678 instance_os = instance.os
8680 if self.op.disk_template:
8681 if instance.disk_template == self.op.disk_template:
8682 raise errors.OpPrereqError("Instance already has disk template %s" %
8683 instance.disk_template, errors.ECODE_INVAL)
8685 if (instance.disk_template,
8686 self.op.disk_template) not in self._DISK_CONVERSIONS:
8687 raise errors.OpPrereqError("Unsupported disk template conversion from"
8688 " %s to %s" % (instance.disk_template,
8689 self.op.disk_template),
8690 errors.ECODE_INVAL)
8691 _CheckInstanceDown(self, instance, "cannot change disk template")
8692 if self.op.disk_template in constants.DTS_NET_MIRROR:
8693 _CheckNodeOnline(self, self.op.remote_node)
8694 _CheckNodeNotDrained(self, self.op.remote_node)
8695 disks = [{"size": d.size} for d in instance.disks]
8696 required = _ComputeDiskSize(self.op.disk_template, disks)
8697 _CheckNodesFreeDisk(self, [self.op.remote_node], required)
8699 # hvparams processing
8700 if self.op.hvparams:
8701 hv_type = instance.hypervisor
8702 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
8703 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
8704 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
8707 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
8708 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
8709 self.hv_new = hv_new # the new actual values
8710 self.hv_inst = i_hvdict # the new dict (without defaults)
8711 else:
8712 self.hv_new = self.hv_inst = {}
8714 # beparams processing
8715 if self.op.beparams:
8716 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
8717 use_default_values=True)
8718 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
8719 be_new = cluster.SimpleFillBE(i_bedict)
8720 self.be_new = be_new # the new actual values
8721 self.be_inst = i_bedict # the new dict (without defaults)
8722 else:
8723 self.be_new = self.be_inst = {}
8725 # osparams processing
8726 if self.op.osparams:
8727 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
8728 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
8729 self.os_new = cluster.SimpleFillOS(instance_os, i_osdict)
8730 self.os_inst = i_osdict # the new dict (without defaults)
8731 else:
8732 self.os_new = self.os_inst = {}
8734 self.warn = []
8736 if constants.BE_MEMORY in self.op.beparams and not self.op.force:
8737 mem_check_list = [pnode]
8738 if be_new[constants.BE_AUTO_BALANCE]:
8739 # either we changed auto_balance to yes or it was from before
8740 mem_check_list.extend(instance.secondary_nodes)
8741 instance_info = self.rpc.call_instance_info(pnode, instance.name,
8742 instance.hypervisor)
8743 nodeinfo = self.rpc.call_node_info(mem_check_list, self.cfg.GetVGName(),
8744 instance.hypervisor)
8745 pninfo = nodeinfo[pnode]
8746 msg = pninfo.fail_msg
8747 if msg:
8748 # Assume the primary node is unreachable and go ahead
8749 self.warn.append("Can't get info from primary node %s: %s" %
8750 (pnode, msg))
8751 elif not isinstance(pninfo.payload.get('memory_free', None), int):
8752 self.warn.append("Node data from primary node %s doesn't contain"
8753 " free memory information" % pnode)
8754 elif instance_info.fail_msg:
8755 self.warn.append("Can't get instance runtime information: %s" %
8756 instance_info.fail_msg)
8757 else:
8758 if instance_info.payload:
8759 current_mem = int(instance_info.payload['memory'])
8760 else:
8761 # Assume instance not running
8762 # (there is a slight race condition here, but it's not very probable,
8763 # and we have no other way to check)
8764 current_mem = 0
8765 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
8766 pninfo.payload['memory_free'])
8767 if miss_mem > 0:
8768 raise errors.OpPrereqError("This change will prevent the instance"
8769 " from starting, due to %d MB of memory"
8770 " missing on its primary node" % miss_mem,
8773 if be_new[constants.BE_AUTO_BALANCE]:
8774 for node, nres in nodeinfo.items():
8775 if node not in instance.secondary_nodes:
8776 continue
8777 msg = nres.fail_msg
8778 if msg:
8779 self.warn.append("Can't get info from secondary node %s: %s" %
8780 (node, msg))
8781 elif not isinstance(nres.payload.get('memory_free', None), int):
8782 self.warn.append("Secondary node %s didn't return free"
8783 " memory information" % node)
8784 elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
8785 self.warn.append("Not enough memory to failover instance to"
8786 " secondary node %s" % node)
8788 # NIC processing
8789 self.nic_pnew = {}
8790 self.nic_pinst = {}
8791 for nic_op, nic_dict in self.op.nics:
8792 if nic_op == constants.DDM_REMOVE:
8793 if not instance.nics:
8794 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
8797 if nic_op != constants.DDM_ADD:
8799 if not instance.nics:
8800 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
8801 " no NICs" % nic_op,
8803 if nic_op < 0 or nic_op >= len(instance.nics):
8804 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
8806 (nic_op, len(instance.nics) - 1),
8808 old_nic_params = instance.nics[nic_op].nicparams
8809 old_nic_ip = instance.nics[nic_op].ip
8810 else:
8811 old_nic_params = {}
8812 old_nic_ip = None
8814 update_params_dict = dict([(key, nic_dict[key])
8815 for key in constants.NICS_PARAMETERS
8816 if key in nic_dict])
8818 if 'bridge' in nic_dict:
8819 update_params_dict[constants.NIC_LINK] = nic_dict['bridge']
8821 new_nic_params = _GetUpdatedParams(old_nic_params,
8822 update_params_dict)
8823 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
8824 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
8825 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
8826 self.nic_pinst[nic_op] = new_nic_params
8827 self.nic_pnew[nic_op] = new_filled_nic_params
8828 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
8830 if new_nic_mode == constants.NIC_MODE_BRIDGED:
8831 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
8832 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
8833 if msg:
8834 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
8835 if self.op.force:
8836 self.warn.append(msg)
8837 else:
8838 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
8839 if new_nic_mode == constants.NIC_MODE_ROUTED:
8840 if 'ip' in nic_dict:
8841 nic_ip = nic_dict['ip']
8845 raise errors.OpPrereqError('Cannot set the nic ip to None'
8846 ' on a routed nic', errors.ECODE_INVAL)
8847 if 'mac' in nic_dict:
8848 nic_mac = nic_dict['mac']
8850 raise errors.OpPrereqError('Cannot set the nic mac to None',
8852 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8853 # otherwise generate the mac
8854 nic_dict['mac'] = self.cfg.GenerateMAC(self.proc.GetECId())
8856 # or validate/reserve the current one
8857 try:
8858 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
8859 except errors.ReservationError:
8860 raise errors.OpPrereqError("MAC address %s already in use"
8861 " in cluster" % nic_mac,
8862 errors.ECODE_NOTUNIQUE)
8865 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
8866 raise errors.OpPrereqError("Disk operations not supported for"
8867 " diskless instances",
8869 for disk_op, _ in self.op.disks:
8870 if disk_op == constants.DDM_REMOVE:
8871 if len(instance.disks) == 1:
8872 raise errors.OpPrereqError("Cannot remove the last disk of"
8873 " an instance", errors.ECODE_INVAL)
8874 _CheckInstanceDown(self, instance, "cannot remove disks")
8876 if (disk_op == constants.DDM_ADD and
8877 len(instance.disks) >= constants.MAX_DISKS):
8878 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
8879 " add more" % constants.MAX_DISKS,
8881 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
8883 if disk_op < 0 or disk_op >= len(instance.disks):
8884 raise errors.OpPrereqError("Invalid disk index %s, valid values"
8886 (disk_op, len(instance.disks)),
8891 def _ConvertPlainToDrbd(self, feedback_fn):
8892 """Converts an instance from plain to drbd.
8895 feedback_fn("Converting template to drbd")
8896 instance = self.instance
8897 pnode = instance.primary_node
8898 snode = self.op.remote_node
8900 # create a fake disk info for _GenerateDiskTemplate
8901 disk_info = [{"size": d.size, "mode": d.mode} for d in instance.disks]
8902 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
8903 instance.name, pnode, [snode],
8904 disk_info, None, None, 0)
8905 info = _GetInstanceInfoText(instance)
8906 feedback_fn("Creating aditional volumes...")
8907 # first, create the missing data and meta devices
8908 for disk in new_disks:
8909 # unfortunately this is... not too nice
8910 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
8911 info, True)
8912 for child in disk.children:
8913 _CreateSingleBlockDev(self, snode, instance, child, info, True)
8914 # at this stage, all new LVs have been created, we can rename the
8915 # old ones
8916 feedback_fn("Renaming original volumes...")
8917 rename_list = [(o, n.children[0].logical_id)
8918 for (o, n) in zip(instance.disks, new_disks)]
8919 result = self.rpc.call_blockdev_rename(pnode, rename_list)
8920 result.Raise("Failed to rename original LVs")
8922 feedback_fn("Initializing DRBD devices...")
8923 # all child devices are in place, we can now create the DRBD devices
8924 for disk in new_disks:
8925 for node in [pnode, snode]:
8926 f_create = node == pnode
8927 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
8929 # at this point, the instance has been modified
8930 instance.disk_template = constants.DT_DRBD8
8931 instance.disks = new_disks
8932 self.cfg.Update(instance, feedback_fn)
8934 # disks are created, waiting for sync
8935 disk_abort = not _WaitForSync(self, instance)
8936 if disk_abort:
8937 raise errors.OpExecError("There are some degraded disks for"
8938 " this instance, please cleanup manually")
8940 def _ConvertDrbdToPlain(self, feedback_fn):
8941 """Converts an instance from drbd to plain.
8944 instance = self.instance
8945 assert len(instance.secondary_nodes) == 1
8946 pnode = instance.primary_node
8947 snode = instance.secondary_nodes[0]
8948 feedback_fn("Converting template to plain")
8950 old_disks = instance.disks
8951 new_disks = [d.children[0] for d in old_disks]
8953 # copy over size and mode
8954 for parent, child in zip(old_disks, new_disks):
8955 child.size = parent.size
8956 child.mode = parent.mode
8958 # update instance structure
8959 instance.disks = new_disks
8960 instance.disk_template = constants.DT_PLAIN
8961 self.cfg.Update(instance, feedback_fn)
8963 feedback_fn("Removing volumes on the secondary node...")
8964 for disk in old_disks:
8965 self.cfg.SetDiskID(disk, snode)
8966 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
8967 if msg:
8968 self.LogWarning("Could not remove block device %s on node %s,"
8969 " continuing anyway: %s", disk.iv_name, snode, msg)
8971 feedback_fn("Removing unneeded volumes on the primary node...")
8972 for idx, disk in enumerate(old_disks):
8973 meta = disk.children[1]
8974 self.cfg.SetDiskID(meta, pnode)
8975 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
8976 if msg:
8977 self.LogWarning("Could not remove metadata for disk %d on node %s,"
8978 " continuing anyway: %s", idx, pnode, msg)
8981 def Exec(self, feedback_fn):
8982 """Modifies an instance.
8984 All parameters take effect only at the next restart of the instance.
8987 # Process here the warnings from CheckPrereq, as we don't have a
8988 # feedback_fn there.
8989 for warn in self.warn:
8990 feedback_fn("WARNING: %s" % warn)
8993 instance = self.instance
8995 for disk_op, disk_dict in self.op.disks:
8996 if disk_op == constants.DDM_REMOVE:
8997 # remove the last disk
8998 device = instance.disks.pop()
8999 device_idx = len(instance.disks)
9000 for node, disk in device.ComputeNodeTree(instance.primary_node):
9001 self.cfg.SetDiskID(disk, node)
9002 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
9003 if msg:
9004 self.LogWarning("Could not remove disk/%d on node %s: %s,"
9005 " continuing anyway", device_idx, node, msg)
9006 result.append(("disk/%d" % device_idx, "remove"))
9007 elif disk_op == constants.DDM_ADD:
9009 if instance.disk_template == constants.DT_FILE:
9010 file_driver, file_path = instance.disks[0].logical_id
9011 file_path = os.path.dirname(file_path)
9012 else:
9013 file_driver = file_path = None
9014 disk_idx_base = len(instance.disks)
9015 new_disk = _GenerateDiskTemplate(self,
9016 instance.disk_template,
9017 instance.name, instance.primary_node,
9018 instance.secondary_nodes,
9019 [disk_dict],
9020 file_path,
9021 file_driver,
9022 disk_idx_base)[0]
9023 instance.disks.append(new_disk)
9024 info = _GetInstanceInfoText(instance)
9026 logging.info("Creating volume %s for instance %s",
9027 new_disk.iv_name, instance.name)
9028 # Note: this needs to be kept in sync with _CreateDisks
9030 for node in instance.all_nodes:
9031 f_create = node == instance.primary_node
9032 try:
9033 _CreateBlockDev(self, node, instance, new_disk,
9034 f_create, info, f_create)
9035 except errors.OpExecError, err:
9036 self.LogWarning("Failed to create volume %s (%s) on"
9037 " node %s: %s",
9038 new_disk.iv_name, new_disk, node, err)
9039 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
9040 (new_disk.size, new_disk.mode)))
9041 else:
9042 # change a given disk
9043 instance.disks[disk_op].mode = disk_dict['mode']
9044 result.append(("disk.mode/%d" % disk_op, disk_dict['mode']))
9046 if self.op.disk_template:
9047 r_shut = _ShutdownInstanceDisks(self, instance)
9048 if not r_shut:
9049 raise errors.OpExecError("Cannot shut down instance disks, unable to"
9050 " proceed with disk template conversion")
9051 mode = (instance.disk_template, self.op.disk_template)
9052 try:
9053 self._DISK_CONVERSIONS[mode](self, feedback_fn)
9054 except:
9055 self.cfg.ReleaseDRBDMinors(instance.name)
9056 raise
9057 result.append(("disk_template", self.op.disk_template))
9060 for nic_op, nic_dict in self.op.nics:
9061 if nic_op == constants.DDM_REMOVE:
9062 # remove the last nic
9063 del instance.nics[-1]
9064 result.append(("nic.%d" % len(instance.nics), "remove"))
9065 elif nic_op == constants.DDM_ADD:
9066 # mac and bridge should be set, by now
9067 mac = nic_dict['mac']
9068 ip = nic_dict.get('ip', None)
9069 nicparams = self.nic_pinst[constants.DDM_ADD]
9070 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
9071 instance.nics.append(new_nic)
9072 result.append(("nic.%d" % (len(instance.nics) - 1),
9073 "add:mac=%s,ip=%s,mode=%s,link=%s" %
9074 (new_nic.mac, new_nic.ip,
9075 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
9076 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
9077 )))
9078 else:
9079 for key in 'mac', 'ip':
9080 if key in nic_dict:
9081 setattr(instance.nics[nic_op], key, nic_dict[key])
9082 if nic_op in self.nic_pinst:
9083 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
9084 for key, val in nic_dict.iteritems():
9085 result.append(("nic.%s/%d" % (key, nic_op), val))
9088 if self.op.hvparams:
9089 instance.hvparams = self.hv_inst
9090 for key, val in self.op.hvparams.iteritems():
9091 result.append(("hv/%s" % key, val))
9094 if self.op.beparams:
9095 instance.beparams = self.be_inst
9096 for key, val in self.op.beparams.iteritems():
9097 result.append(("be/%s" % key, val))
9100 if self.op.os_name:
9101 instance.os = self.op.os_name
9104 if self.op.osparams:
9105 instance.osparams = self.os_inst
9106 for key, val in self.op.osparams.iteritems():
9107 result.append(("os/%s" % key, val))
9109 self.cfg.Update(instance, feedback_fn)
9111 return result
9113 _DISK_CONVERSIONS = {
9114 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
9115 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
9116 }
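# _DISK_CONVERSIONS is a dispatch table keyed by (current_template,
# new_template); only the plain <-> drbd8 pair is supported, as enforced in
# CheckPrereq above.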
9119 class LUQueryExports(NoHooksLU):
9120 """Query the exports list
9124 ("nodes", _EmptyList, _TListOf(_TNonEmptyString)),
9125 ("use_locking", False, _TBool),
9129 def ExpandNames(self):
9130 self.needed_locks = {}
9131 self.share_locks[locking.LEVEL_NODE] = 1
9132 if not self.op.nodes:
9133 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9134 else:
9135 self.needed_locks[locking.LEVEL_NODE] = \
9136 _GetWantedNodes(self, self.op.nodes)
9138 def Exec(self, feedback_fn):
9139 """Compute the list of all the exported system images.
9142 @return: a dictionary with the structure node->(export-list)
9143 where export-list is a list of the instances exported on
9144 the node.
9146 """
9147 self.nodes = self.acquired_locks[locking.LEVEL_NODE]
9148 rpcresult = self.rpc.call_export_list(self.nodes)
9149 result = {}
9150 for node in rpcresult:
9151 if rpcresult[node].fail_msg:
9152 result[node] = False
9154 result[node] = rpcresult[node].payload
9156 return result
9159 class LUPrepareExport(NoHooksLU):
9160 """Prepares an instance for an export and returns useful information.
9165 ("mode", _NoDefault, _TElemOf(constants.EXPORT_MODES)),
9169 def ExpandNames(self):
9170 self._ExpandAndLockInstance()
9172 def CheckPrereq(self):
9173 """Check prerequisites.
9176 instance_name = self.op.instance_name
9178 self.instance = self.cfg.GetInstanceInfo(instance_name)
9179 assert self.instance is not None, \
9180 "Cannot retrieve locked instance %s" % self.op.instance_name
9181 _CheckNodeOnline(self, self.instance.primary_node)
9183 self._cds = _GetClusterDomainSecret()
9185 def Exec(self, feedback_fn):
9186 """Prepares an instance for an export.
9189 instance = self.instance
9191 if self.op.mode == constants.EXPORT_MODE_REMOTE:
9192 salt = utils.GenerateSecret(8)
9194 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
9195 result = self.rpc.call_x509_cert_create(instance.primary_node,
9196 constants.RIE_CERT_VALIDITY)
9197 result.Raise("Can't create X509 key and certificate on %s" % result.node)
9199 (name, cert_pem) = result.payload
9201 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
9202 cert_pem)
9205 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
9206 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
9208 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
9214 class LUExportInstance(LogicalUnit):
9215 """Export an instance to an image in the cluster.
9218 HPATH = "instance-export"
9219 HTYPE = constants.HTYPE_INSTANCE
9222 ("target_node", _NoDefault, _TOr(_TNonEmptyString, _TList)),
9223 ("shutdown", True, _TBool),
9225 ("remove_instance", False, _TBool),
9226 ("ignore_remove_failures", False, _TBool),
9227 ("mode", constants.EXPORT_MODE_LOCAL, _TElemOf(constants.EXPORT_MODES)),
9228 ("x509_key_name", None, _TOr(_TList, _TNone)),
9229 ("destination_x509_ca", None, _TMaybeString),
9233 def CheckArguments(self):
9234 """Check the arguments.
9237 self.x509_key_name = self.op.x509_key_name
9238 self.dest_x509_ca_pem = self.op.destination_x509_ca
9240 if self.op.remove_instance and not self.op.shutdown:
9241 raise errors.OpPrereqError("Can not remove instance without shutting it"
9244 if self.op.mode == constants.EXPORT_MODE_REMOTE:
9245 if not self.x509_key_name:
9246 raise errors.OpPrereqError("Missing X509 key name for encryption",
9249 if not self.dest_x509_ca_pem:
9250 raise errors.OpPrereqError("Missing destination X509 CA",
9253 def ExpandNames(self):
9254 self._ExpandAndLockInstance()
9256 # Lock all nodes for local exports
9257 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9258 # FIXME: lock only instance primary and destination node
9260 # Sad but true, for now we have do lock all nodes, as we don't know where
9261 # the previous export might be, and in this LU we search for it and
9262 # remove it from its current node. In the future we could fix this by:
9263 # - making a tasklet to search (share-lock all), then create the
9264 # new one, then one to remove, after
9265 # - removing the removal operation altogether
9266 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9268 def DeclareLocks(self, level):
9269 """Last minute lock declaration."""
9270 # All nodes are locked anyway, so nothing to do here.
9272 def BuildHooksEnv(self):
9275 This will run on the master, primary node and target node.
9279 "EXPORT_MODE": self.op.mode,
9280 "EXPORT_NODE": self.op.target_node,
9281 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
9282 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
9283 # TODO: Generic function for boolean env variables
9284 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
9287 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
9289 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
9291 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9292 nl.append(self.op.target_node)
9294 return env, nl, nl
9296 def CheckPrereq(self):
9297 """Check prerequisites.
9299 This checks that the instance and node names are valid.
9302 instance_name = self.op.instance_name
9304 self.instance = self.cfg.GetInstanceInfo(instance_name)
9305 assert self.instance is not None, \
9306 "Cannot retrieve locked instance %s" % self.op.instance_name
9307 _CheckNodeOnline(self, self.instance.primary_node)
9309 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9310 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
9311 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
9312 assert self.dst_node is not None
9314 _CheckNodeOnline(self, self.dst_node.name)
9315 _CheckNodeNotDrained(self, self.dst_node.name)
9318 self.dest_disk_info = None
9319 self.dest_x509_ca = None
9321 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
9322 self.dst_node = None
9324 if len(self.op.target_node) != len(self.instance.disks):
9325 raise errors.OpPrereqError(("Received destination information for %s"
9326 " disks, but instance %s has %s disks") %
9327 (len(self.op.target_node), instance_name,
9328 len(self.instance.disks)),
9329 errors.ECODE_INVAL)
9331 cds = _GetClusterDomainSecret()
9333 # Check X509 key name
9334 try:
9335 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
9336 except (TypeError, ValueError), err:
9337 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
9339 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
9340 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
9343 # Load and verify CA
9344 try:
9345 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
9346 except OpenSSL.crypto.Error, err:
9347 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
9348 (err, ), errors.ECODE_INVAL)
9350 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
9351 if errcode is not None:
9352 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
9353 (msg, ), errors.ECODE_INVAL)
9355 self.dest_x509_ca = cert
9357 # Verify target information
9358 disk_info = []
9359 for idx, disk_data in enumerate(self.op.target_node):
9360 try:
9361 (host, port, magic) = \
9362 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
9363 except errors.GenericError, err:
9364 raise errors.OpPrereqError("Target info for disk %s: %s" %
9365 (idx, err), errors.ECODE_INVAL)
9367 disk_info.append((host, port, magic))
9369 assert len(disk_info) == len(self.op.target_node)
9370 self.dest_disk_info = disk_info
9373 raise errors.ProgrammerError("Unhandled export mode %r" %
9376 # instance disk type verification
9377 # TODO: Implement export support for file-based disks
9378 for disk in self.instance.disks:
9379 if disk.dev_type == constants.LD_FILE:
9380 raise errors.OpPrereqError("Export not supported for instances with"
9381 " file-based disks", errors.ECODE_INVAL)
9383 def _CleanupExports(self, feedback_fn):
9384 """Removes exports of current instance from all other nodes.
9386 If an instance in a cluster with nodes A..D was exported to node C, its
9387 exports will be removed from the nodes A, B and D.
9390 assert self.op.mode != constants.EXPORT_MODE_REMOTE
9392 nodelist = self.cfg.GetNodeList()
9393 nodelist.remove(self.dst_node.name)
9395 # on one-node clusters nodelist will be empty after the removal
9396 # if we proceed the backup would be removed because OpQueryExports
9397 # substitutes an empty list with the full cluster node list.
9398 iname = self.instance.name
9399 if nodelist:
9400 feedback_fn("Removing old exports for instance %s" % iname)
9401 exportlist = self.rpc.call_export_list(nodelist)
9402 for node in exportlist:
9403 if exportlist[node].fail_msg:
9404 continue
9405 if iname in exportlist[node].payload:
9406 msg = self.rpc.call_export_remove(node, iname).fail_msg
9407 if msg:
9408 self.LogWarning("Could not remove older export for instance %s"
9409 " on node %s: %s", iname, node, msg)
9411 def Exec(self, feedback_fn):
9412 """Export an instance to an image in the cluster.
9415 assert self.op.mode in constants.EXPORT_MODES
9417 instance = self.instance
9418 src_node = instance.primary_node
9420 if self.op.shutdown:
9421 # shutdown the instance, but not the disks
9422 feedback_fn("Shutting down instance %s" % instance.name)
9423 result = self.rpc.call_instance_shutdown(src_node, instance,
9424 self.op.shutdown_timeout)
9425 # TODO: Maybe ignore failures if ignore_remove_failures is set
9426 result.Raise("Could not shutdown instance %s on"
9427 " node %s" % (instance.name, src_node))
9429 # set the disks ID correctly since call_instance_start needs the
9430 # correct drbd minor to create the symlinks
9431 for disk in instance.disks:
9432 self.cfg.SetDiskID(disk, src_node)
9434 activate_disks = (not instance.admin_up)
9436 if activate_disks:
9437 # Activate the instance disks if we're exporting a stopped instance
9438 feedback_fn("Activating disks for %s" % instance.name)
9439 _StartInstanceDisks(self, instance, None)
9441 try:
9442 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
9443 instance)
9445 helper.CreateSnapshots()
9446 try:
9447 if (self.op.shutdown and instance.admin_up and
9448 not self.op.remove_instance):
9449 assert not activate_disks
9450 feedback_fn("Starting instance %s" % instance.name)
9451 result = self.rpc.call_instance_start(src_node, instance, None, None)
9452 msg = result.fail_msg
9453 if msg:
9454 feedback_fn("Failed to start instance: %s" % msg)
9455 _ShutdownInstanceDisks(self, instance)
9456 raise errors.OpExecError("Could not start instance: %s" % msg)
9458 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9459 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
9460 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
9461 connect_timeout = constants.RIE_CONNECT_TIMEOUT
9462 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9464 (key_name, _, _) = self.x509_key_name
9466 dest_ca_pem = \
9467 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
9468 self.dest_x509_ca)
9470 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
9471 key_name, dest_ca_pem,
9476 # Check for backwards compatibility
9477 assert len(dresults) == len(instance.disks)
9478 assert compat.all(isinstance(i, bool) for i in dresults), \
9479 "Not all results are boolean: %r" % dresults
9483 feedback_fn("Deactivating disks for %s" % instance.name)
9484 _ShutdownInstanceDisks(self, instance)
9486 # Remove instance if requested
9487 if self.op.remove_instance:
9488 if not (compat.all(dresults) and fin_resu):
9489 feedback_fn("Not removing instance %s as parts of the export failed" %
9492 feedback_fn("Removing instance %s" % instance.name)
9493 _RemoveInstance(self, feedback_fn, instance,
9494 self.op.ignore_remove_failures)
9496 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9497 self._CleanupExports(feedback_fn)
9499 return fin_resu, dresults
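  # Illustrative note: for a two-disk instance whose export succeeded
  # completely, Exec() returns (True, [True, True]); a False entry in
  # dresults marks a disk whose backup failed, while fin_resu reflects
  # the finalization of the export as a whole.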


class LURemoveExport(NoHooksLU):
  """Remove exports related to the named instance.

  """

  def ExpandNames(self):
    self.needed_locks = {}
    # We need all nodes to be locked in order for RemoveExport to work, but we
    # don't need to lock the instance itself, as nothing will happen to it (and
    # we can remove exports also for a removed instance)
    self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET

  def Exec(self, feedback_fn):
    """Remove any export.

    """
    instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
    # If the instance was not found we'll try with the name that was passed in.
    # This will only work if it was an FQDN, though.
    fqdn_warn = False
    if not instance_name:
      fqdn_warn = True
      instance_name = self.op.instance_name

    locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
    exportlist = self.rpc.call_export_list(locked_nodes)
    found = False
    for node in exportlist:
      msg = exportlist[node].fail_msg
      if msg:
        self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
        continue
      if instance_name in exportlist[node].payload:
        found = True
        result = self.rpc.call_export_remove(node, instance_name)
        msg = result.fail_msg
        if msg:
          logging.error("Could not remove export for instance %s"
                        " on node %s: %s", instance_name, node, msg)

    if fqdn_warn and not found:
      feedback_fn("Export not found. If trying to remove an export belonging"
                  " to a deleted instance please use its Fully Qualified"
                  " Domain Name.")


class TagsLU(NoHooksLU): # pylint: disable-msg=W0223
  """Generic tags LU.

  This is an abstract class which is the parent of all the other tags LUs.

  """

  def ExpandNames(self):
    self.needed_locks = {}
    if self.op.kind == constants.TAG_NODE:
      self.op.name = _ExpandNodeName(self.cfg, self.op.name)
      self.needed_locks[locking.LEVEL_NODE] = self.op.name
    elif self.op.kind == constants.TAG_INSTANCE:
      self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
      self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name

  def CheckPrereq(self):
    """Check prerequisites.

    """
    if self.op.kind == constants.TAG_CLUSTER:
      self.target = self.cfg.GetClusterInfo()
    elif self.op.kind == constants.TAG_NODE:
      self.target = self.cfg.GetNodeInfo(self.op.name)
    elif self.op.kind == constants.TAG_INSTANCE:
      self.target = self.cfg.GetInstanceInfo(self.op.name)
    else:
      raise errors.OpPrereqError("Wrong tag type requested (%s)" %
                                 str(self.op.kind), errors.ECODE_INVAL)


class LUGetTags(TagsLU):
  """Returns the tags of a given object.

  """
  _OP_PARAMS = [
    ("kind", _NoDefault, _TElemOf(constants.VALID_TAG_TYPES)),
    ("name", _NoDefault, _TNonEmptyString),
    ]

  def Exec(self, feedback_fn):
    """Returns the tag list.

    """
    return list(self.target.GetTags())


class LUSearchTags(NoHooksLU):
  """Searches the tags for a given pattern.

  """
  _OP_PARAMS = [
    ("pattern", _NoDefault, _TNonEmptyString),
    ]

  def ExpandNames(self):
    self.needed_locks = {}

  def CheckPrereq(self):
    """Check prerequisites.

    This checks the pattern passed for validity by compiling it.

    """
    try:
      self.re = re.compile(self.op.pattern)
    except re.error, err:
      raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
                                 (self.op.pattern, err), errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Returns the tag list.

    """
    cfg = self.cfg
    tgts = [("/cluster", cfg.GetClusterInfo())]
    ilist = cfg.GetAllInstancesInfo().values()
    tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
    nlist = cfg.GetAllNodesInfo().values()
    tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
    results = []
    for path, target in tgts:
      for tag in target.GetTags():
        if self.re.search(tag):
          results.append((path, tag))
    return results
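# Illustrative note (hypothetical data): with pattern "env:.*" and an
# instance "web1.example.com" carrying the tag "env:prod", the Exec()
# method of LUSearchTags above would return
# [("/instances/web1.example.com", "env:prod")].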


class LUAddTags(TagsLU):
  """Sets a tag on a given object.

  """
  _OP_PARAMS = [
    ("kind", _NoDefault, _TElemOf(constants.VALID_TAG_TYPES)),
    ("name", _NoDefault, _TNonEmptyString),
    ("tags", _NoDefault, _TListOf(_TNonEmptyString)),
    ]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks the type and length of the tag name and value.

    """
    TagsLU.CheckPrereq(self)
    for tag in self.op.tags:
      objects.TaggableObject.ValidateTag(tag)

  def Exec(self, feedback_fn):
    """Sets the tag.

    """
    try:
      for tag in self.op.tags:
        self.target.AddTag(tag)
    except errors.TagError, err:
      raise errors.OpExecError("Error while setting tag: %s" % str(err))
    self.cfg.Update(self.target, feedback_fn)


class LUDelTags(TagsLU):
  """Delete a list of tags from a given object.

  """
  _OP_PARAMS = [
    ("kind", _NoDefault, _TElemOf(constants.VALID_TAG_TYPES)),
    ("name", _NoDefault, _TNonEmptyString),
    ("tags", _NoDefault, _TListOf(_TNonEmptyString)),
    ]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that we have the given tag.

    """
    TagsLU.CheckPrereq(self)
    for tag in self.op.tags:
      objects.TaggableObject.ValidateTag(tag)
    del_tags = frozenset(self.op.tags)
    cur_tags = self.target.GetTags()
    if not del_tags <= cur_tags:
      diff_tags = del_tags - cur_tags
      diff_names = ["'%s'" % tag for tag in diff_tags]
      raise errors.OpPrereqError("Tag(s) %s not found" %
                                 (",".join(diff_names)), errors.ECODE_NOENT)

  def Exec(self, feedback_fn):
    """Remove the tag from the object.

    """
    for tag in self.op.tags:
      self.target.RemoveTag(tag)
    self.cfg.Update(self.target, feedback_fn)


class LUTestDelay(NoHooksLU):
  """Sleep for a specified amount of time.

  This LU sleeps on the master and/or nodes for a specified amount of
  time.

  """
  _OP_PARAMS = [
    ("duration", _NoDefault, _TFloat),
    ("on_master", True, _TBool),
    ("on_nodes", _EmptyList, _TListOf(_TNonEmptyString)),
    ("repeat", 0, _TPositiveInt),
    ]

  def ExpandNames(self):
    """Expand names and set required locks.

    This expands the node list, if any.

    """
    self.needed_locks = {}
    if self.op.on_nodes:
      # _GetWantedNodes can be used here, but is not always appropriate to use
      # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
      # more information.
      self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
      self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes

  def _TestDelay(self):
    """Do the actual sleep.

    """
    if self.op.on_master:
      if not utils.TestDelay(self.op.duration):
        raise errors.OpExecError("Error during master delay test")
    if self.op.on_nodes:
      result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
      for node, node_result in result.items():
        node_result.Raise("Failure during rpc call to node %s" % node)

  def Exec(self, feedback_fn):
    """Execute the test delay opcode, with the wanted repetitions.

    """
    if self.op.repeat == 0:
      self._TestDelay()
    else:
      top_value = self.op.repeat - 1
      for i in range(self.op.repeat):
        self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
        self._TestDelay()
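# Illustrative note: an opcode with duration=2.0, on_master=True and
# repeat=3 makes Exec() above sleep three times for two seconds each on
# the master, logging "Test delay iteration 0/2" through "2/2".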


class IAllocator(object):
  """IAllocator framework.

  An IAllocator instance has four sets of attributes:
    - cfg that is needed to query the cluster
    - input data (all members of the _KEYS class attribute are required)
    - four buffer attributes (in|out_data|text), that represent the
      input (to the external script) in text and data structure format,
      and the output from it, again in two formats
    - the result variables from the script (success, info, result) for
      easy usage

  """
  # pylint: disable-msg=R0902
  # lots of instance attributes
  _ALLO_KEYS = [
    "name", "mem_size", "disks", "disk_template",
    "os", "tags", "nics", "vcpus", "hypervisor",
    ]
  _RELO_KEYS = [
    "name", "relocate_from",
    ]
  _EVAC_KEYS = [
    "evac_nodes",
    ]

  def __init__(self, cfg, rpc, mode, **kwargs):
    self.cfg = cfg
    self.rpc = rpc
    # init buffer variables
    self.in_text = self.out_text = self.in_data = self.out_data = None
    # init all input fields so that pylint is happy
    self.mode = mode
    self.mem_size = self.disks = self.disk_template = None
    self.os = self.tags = self.nics = self.vcpus = None
    self.hypervisor = None
    self.name = None
    self.relocate_from = None
    self.evac_nodes = None
    # computed fields
    self.required_nodes = None
    # init result fields
    self.success = self.info = self.result = None
    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      keyset = self._ALLO_KEYS
      fn = self._AddNewInstance
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      keyset = self._RELO_KEYS
      fn = self._AddRelocateInstance
    elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
      keyset = self._EVAC_KEYS
      fn = self._AddEvacuateNodes
    else:
      raise errors.ProgrammerError("Unknown mode '%s' passed to the"
                                   " IAllocator" % self.mode)
    for key in kwargs:
      if key not in keyset:
        raise errors.ProgrammerError("Invalid input parameter '%s' to"
                                     " IAllocator" % key)
      setattr(self, key, kwargs[key])

    for key in keyset:
      if key not in kwargs:
        raise errors.ProgrammerError("Missing input parameter '%s' to"
                                     " IAllocator" % key)
    self._BuildInputData(fn)
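  # Illustrative sketch (hypothetical call site; the real LUs build this
  # from their opcode parameters): relocating "inst1.example.com" away
  # from its current secondary node would be requested as
  #
  #   ial = IAllocator(self.cfg, self.rpc,
  #                    mode=constants.IALLOCATOR_MODE_RELOC,
  #                    name="inst1.example.com",
  #                    relocate_from=["node2.example.com"])
  #   ial.Run("hail")  # allocator name is an example
  #
  # after which ial.success, ial.info and ial.result hold the script's
  # verdict, its message and the chosen node list.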

  def _ComputeClusterData(self):
    """Compute the generic allocator input data.

    This is the data that is independent of the actual operation.

    """
    cfg = self.cfg
    cluster_info = cfg.GetClusterInfo()
    # cluster data
    data = {
      "version": constants.IALLOCATOR_VERSION,
      "cluster_name": cfg.GetClusterName(),
      "cluster_tags": list(cluster_info.GetTags()),
      "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
      # we don't have job IDs
      }
    iinfo = cfg.GetAllInstancesInfo().values()
    i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]

    # node data
    node_list = cfg.GetNodeList()

    if self.mode == constants.IALLOCATOR_MODE_ALLOC:
      hypervisor_name = self.hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_RELOC:
      hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
    elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
      hypervisor_name = cluster_info.enabled_hypervisors[0]

    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
                                        hypervisor_name)
    node_iinfo = \
      self.rpc.call_all_instances_info(node_list,
                                       cluster_info.enabled_hypervisors)

    node_results = {}
    for nname, nresult in node_data.items():
      # first fill in static (config-based) values
      ninfo = cfg.GetNodeInfo(nname)
      pnr = {
        "tags": list(ninfo.GetTags()),
        "primary_ip": ninfo.primary_ip,
        "secondary_ip": ninfo.secondary_ip,
        "offline": ninfo.offline,
        "drained": ninfo.drained,
        "master_candidate": ninfo.master_candidate,
        }

      if not (ninfo.offline or ninfo.drained):
        nresult.Raise("Can't get data for node %s" % nname)
        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
                                nname)
        remote_info = nresult.payload

        for attr in ['memory_total', 'memory_free', 'memory_dom0',
                     'vg_size', 'vg_free', 'cpu_total']:
          if attr not in remote_info:
            raise errors.OpExecError("Node '%s' didn't return attribute"
                                     " '%s'" % (nname, attr))
          if not isinstance(remote_info[attr], int):
            raise errors.OpExecError("Node '%s' returned invalid value"
                                     " for '%s': %s" %
                                     (nname, attr, remote_info[attr]))
        # compute memory used by primary instances
        i_p_mem = i_p_up_mem = 0
        for iinfo, beinfo in i_list:
          if iinfo.primary_node == nname:
            i_p_mem += beinfo[constants.BE_MEMORY]
            if iinfo.name not in node_iinfo[nname].payload:
              i_used_mem = 0
            else:
              i_used_mem = int(node_iinfo[nname].payload[iinfo.name]['memory'])
            i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
            remote_info['memory_free'] -= max(0, i_mem_diff)

            if iinfo.admin_up:
              i_p_up_mem += beinfo[constants.BE_MEMORY]

        # compute memory used by instances
        pnr_dyn = {
          "total_memory": remote_info['memory_total'],
          "reserved_memory": remote_info['memory_dom0'],
          "free_memory": remote_info['memory_free'],
          "total_disk": remote_info['vg_size'],
          "free_disk": remote_info['vg_free'],
          "total_cpus": remote_info['cpu_total'],
          "i_pri_memory": i_p_mem,
          "i_pri_up_memory": i_p_up_mem,
          }
        pnr.update(pnr_dyn)

      node_results[nname] = pnr
    data["nodes"] = node_results

    # instance data
    instance_data = {}
    for iinfo, beinfo in i_list:
      nic_data = []
      for nic in iinfo.nics:
        filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
        nic_dict = {"mac": nic.mac,
                    "ip": nic.ip,
                    "mode": filled_params[constants.NIC_MODE],
                    "link": filled_params[constants.NIC_LINK],
                   }
        if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          nic_dict["bridge"] = filled_params[constants.NIC_LINK]
        nic_data.append(nic_dict)
      pir = {
        "tags": list(iinfo.GetTags()),
        "admin_up": iinfo.admin_up,
        "vcpus": beinfo[constants.BE_VCPUS],
        "memory": beinfo[constants.BE_MEMORY],
        "os": iinfo.os,
        "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
        "nics": nic_data,
        "disks": [{"size": dsk.size, "mode": dsk.mode} for dsk in iinfo.disks],
        "disk_template": iinfo.disk_template,
        "hypervisor": iinfo.hypervisor,
        }
      pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
                                                 pir["disks"])
      instance_data[iinfo.name] = pir

    data["instances"] = instance_data

    self.in_data = data
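    # Illustrative sketch (abridged, hypothetical values): for a one-node
    # cluster the structure assembled above resembles
    #
    #   {"version": constants.IALLOCATOR_VERSION,
    #    "cluster_name": "cluster.example.com",
    #    "cluster_tags": [], "enabled_hypervisors": ["xen-pvm"],
    #    "nodes": {"node1.example.com": {"total_memory": 4096, ...}},
    #    "instances": {"inst1.example.com": {"memory": 512, ...}}}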

  def _AddNewInstance(self):
    """Add new instance data to allocator structure.

    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    disk_space = _ComputeDiskSize(self.disk_template, self.disks)

    if self.disk_template in constants.DTS_NET_MIRROR:
      self.required_nodes = 2
    else:
      self.required_nodes = 1
    request = {
      "name": self.name,
      "disk_template": self.disk_template,
      "tags": self.tags,
      "os": self.os,
      "vcpus": self.vcpus,
      "memory": self.mem_size,
      "disks": self.disks,
      "disk_space_total": disk_space,
      "nics": self.nics,
      "required_nodes": self.required_nodes,
      }
    return request

  def _AddRelocateInstance(self):
    """Add relocate instance data to allocator structure.

    This in combination with _ComputeClusterData will create the
    correct structure needed as input for the allocator.

    The checks for the completeness of the opcode must have already been
    done.

    """
    instance = self.cfg.GetInstanceInfo(self.name)
    if instance is None:
      raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                   " IAllocator" % self.name)

    if instance.disk_template not in constants.DTS_NET_MIRROR:
      raise errors.OpPrereqError("Can't relocate non-mirrored instances",
                                 errors.ECODE_INVAL)

    if len(instance.secondary_nodes) != 1:
      raise errors.OpPrereqError("Instance does not have exactly one"
                                 " secondary node", errors.ECODE_STATE)

    self.required_nodes = 1
    disk_sizes = [{'size': disk.size} for disk in instance.disks]
    disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)

    request = {
      "name": self.name,
      "disk_space_total": disk_space,
      "required_nodes": self.required_nodes,
      "relocate_from": self.relocate_from,
      }
    return request

  def _AddEvacuateNodes(self):
    """Add evacuate nodes data to allocator structure.

    """
    request = {
      "evac_nodes": self.evac_nodes,
      }
    return request

  def _BuildInputData(self, fn):
    """Build input data structures.

    """
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
    self.in_data["request"] = request

    self.in_text = serializer.Dump(self.in_data)
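    # Illustrative sketch: serializer.Dump yields the JSON text handed to
    # the external script, with the mode-specific request embedded under
    # the "request" key, e.g. (abridged)
    #
    #   {"version": ..., "cluster_name": ..., "nodes": {...},
    #    "instances": {...},
    #    "request": {"type": "allocate", "name": ..., ...}}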

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()

  def _ValidateResult(self):
    """Process the allocator results.

    This will process and if successful save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not isinstance(rdict["result"], list):
      raise errors.OpExecError("Can't parse iallocator results: 'result' key"
                               " is not a list")
    self.out_data = rdict
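    # Illustrative sketch (hypothetical reply): a well-formed answer for
    # an allocation request could be
    #
    #   {"success": true, "info": "allocation successful",
    #    "result": ["node2.example.com", "node3.example.com"]}
    #
    # while legacy scripts that return the list under a "nodes" key are
    # still accepted by the compatibility shim above.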


class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  _OP_PARAMS = [
    ("direction", _NoDefault, _TElemOf(constants.VALID_IALLOCATOR_DIRECTIONS)),
    ("mode", _NoDefault, _TElemOf(constants.VALID_IALLOCATOR_MODES)),
    ("name", _NoDefault, _TNonEmptyString),
    ("nics", _NoDefault, _TOr(_TNone, _TListOf(
      _TDictOf(_TElemOf(["mac", "ip", "bridge"]),
               _TOr(_TNone, _TNonEmptyString))))),
    ("disks", _NoDefault, _TOr(_TNone, _TList)),
    ("hypervisor", None, _TMaybeString),
    ("allocator", None, _TMaybeString),
    ("tags", _EmptyList, _TListOf(_TNonEmptyString)),
    ("mem_size", None, _TOr(_TNone, _TPositiveInt)),
    ("vcpus", None, _TOr(_TNone, _TPositiveInt)),
    ("os", None, _TMaybeString),
    ("disk_template", None, _TMaybeString),
    ("evac_nodes", None, _TOr(_TNone, _TListOf(_TNonEmptyString))),
    ]

  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode
    of the test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["mem_size", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            "size" not in row or
            not isinstance(row["size"], int) or
            "mode" not in row or
            row["mode"] not in ['r', 'w']):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = self.cfg.GetInstanceInfo(fname).secondary_nodes
    elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
      if not hasattr(self.op, "evac_nodes"):
        raise errors.OpPrereqError("Missing attribute 'evac_nodes' on"
                                   " opcode input", errors.ECODE_INVAL)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)

    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       mem_size=self.op.mem_size,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       evac_nodes=self.op.evac_nodes)
    else:
      raise errors.ProgrammerError("Unhandled mode %s in"
                                   " LUTestAllocator.Exec" % self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result
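    # Illustrative note: with direction "in" the LU only returns the JSON
    # text that would be fed to the allocator; with direction "out" it
    # actually runs the script named in self.op.allocator (e.g. "hail",
    # as an example) and returns the raw, unvalidated reply text.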