# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
22 """Module implementing the master-side code."""

# pylint: disable-msg=W0201,C0302

# W0201 since most LU attributes are defined in CheckPrereq or similar
# functions

# C0302: since we have waaay too many lines in this module

# Standard library imports needed by the code in this excerpt
import re
import logging
import copy
import itertools

import OpenSSL

from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import hypervisor
from ganeti import locking
from ganeti import constants
from ganeti import objects
from ganeti import serializer
from ganeti import ssconf
from ganeti import uidpool
from ganeti import compat
from ganeti import masterd
from ganeti import netutils
from ganeti import query
from ganeti import qlang
from ganeti import opcodes

import ganeti.masterd.instance # pylint: disable-msg=W0611


def _SupportsOob(cfg, node):
  """Tells if node supports OOB.

  @type cfg: L{config.ConfigWriter}
  @param cfg: The cluster configuration
  @type node: L{objects.Node}
  @param node: The node to check
  @return: The OOB script if supported or an empty string otherwise

  """
  return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
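
# Illustrative usage sketch (not part of the original module): callers
# usually check the result before attempting any out-of-band operation:
#
#   if not _SupportsOob(self.cfg, node):
#     raise errors.OpPrereqError("OOB is not supported for node %s" %
#                                node.name, errors.ECODE_STATE)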


class LogicalUnit(object):
  """Logical Unit base class.

  Subclasses must follow these rules:
    - implement ExpandNames
    - implement CheckPrereq (except when tasklets are used)
    - implement Exec (except when tasklets are used)
    - implement BuildHooksEnv
    - redefine HPATH and HTYPE
    - optionally redefine their run requirements:
        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively

  Note that all commands require root permissions.

  @ivar dry_run_result: the value (if any) that will be returned to the
      caller in dry-run mode (signalled by opcode dry_run parameter)

  """
  HPATH = None
  HTYPE = None
  REQ_BGL = True

  def __init__(self, processor, op, context, rpc):
    """Constructor for LogicalUnit.

    This needs to be overridden in derived classes in order to check op
    validity.

    """
    self.proc = processor
    self.op = op
    self.cfg = context.cfg
    self.context = context
    self.rpc = rpc
    # Dicts used to declare locking needs to mcpu
    self.needed_locks = None
    self.acquired_locks = {}
    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
    self.add_locks = {}
    self.remove_locks = {}
    # Used to force good behavior when calling helper functions
    self.recalculate_locks = {}
    self.__ssh = None
    # logging
    self.Log = processor.Log # pylint: disable-msg=C0103
    self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
    self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
    self.LogStep = processor.LogStep # pylint: disable-msg=C0103
    # support for dry-run
    self.dry_run_result = None
    # support for generic debug attribute
    if (not hasattr(self.op, "debug_level") or
        not isinstance(self.op.debug_level, int)):
      self.op.debug_level = 0

    # Tasklets
    self.tasklets = None

    # Validate opcode parameters and set defaults
    self.op.Validate(True)

    self.CheckArguments()
142 """Returns the SshRunner object
146 self.__ssh = ssh.SshRunner(self.cfg.GetClusterName())
149 ssh = property(fget=__GetSSH)

  def CheckArguments(self):
    """Check syntactic validity for the opcode arguments.

    This method is for doing a simple syntactic check and ensure
    validity of opcode parameters, without any cluster-related
    checks. While the same can be accomplished in ExpandNames and/or
    CheckPrereq, doing these separate is better because:

      - ExpandNames is left as a purely lock-related function
      - CheckPrereq is run after we have acquired locks (and possibly
        waited for them)

    The function is allowed to change the self.op attribute so that
    later methods can no longer worry about missing parameters.

    """
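
  # Illustrative sketch of an override (hypothetical opcode slots): a
  # typical CheckArguments rejects inconsistent argument combinations
  # without touching the cluster:
  #
  #   def CheckArguments(self):
  #     if self.op.force and self.op.dry_run:
  #       raise errors.OpPrereqError("force and dry-run are mutually"
  #                                  " exclusive", errors.ECODE_INVAL)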

  def ExpandNames(self):
    """Expand names for this LU.

    This method is called before starting to execute the opcode, and it should
    update all the parameters of the opcode to their canonical form (e.g. a
    short node name must be fully expanded after this method has successfully
    completed). This way locking, hooks, logging, etc. can work correctly.

    LUs which implement this method must also populate the self.needed_locks
    member, as a dict with lock levels as keys, and a list of needed lock names
    as values. Rules:

      - use an empty dict if you don't need any lock
      - if you don't need any lock at a particular level omit that level
      - don't put anything for the BGL level
      - if you want all locks at a level use locking.ALL_SET as a value

    If you need to share locks (rather than acquire them exclusively) at one
    level you can modify self.share_locks, setting a true value (usually 1) for
    that level. By default locks are not shared.

    This function can also define a list of tasklets, which then will be
    executed in order instead of the usual LU-level CheckPrereq and Exec
    functions, if those are not defined by the LU.

    Examples::

      # Acquire all nodes and one instance
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: ['instance1.example.com'],
      }
      # Acquire just two nodes
      self.needed_locks = {
        locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
      }
      # Acquire no locks
      self.needed_locks = {} # No, you can't leave it to the default value None

    """
    # The implementation of this method is mandatory only if the new LU is
    # concurrent, so that old LUs don't need to be changed all at the same
    # time.
    if self.REQ_BGL:
      self.needed_locks = {} # Exclusive LUs don't need locks.
    else:
      raise NotImplementedError

  def DeclareLocks(self, level):
    """Declare LU locking needs for a level.

    While most LUs can just declare their locking needs at ExpandNames time,
    sometimes there's the need to calculate some locks after having acquired
    the ones before. This function is called just before acquiring locks at a
    particular level, but after acquiring the ones at lower levels, and permits
    such calculations. It can be used to modify self.needed_locks, and by
    default it does nothing.

    This function is only called if you have something already set in
    self.needed_locks for the level.

    @param level: Locking level which is going to be locked
    @type level: member of ganeti.locking.LEVELS

    """

  def CheckPrereq(self):
    """Check prerequisites for this LU.

    This method should check that the prerequisites for the execution
    of this LU are fulfilled. It can do internode communication, but
    it should be idempotent - no cluster or system changes are
    allowed.

    The method should raise errors.OpPrereqError in case something is
    not fulfilled. Its return value is ignored.

    This method should also update all the parameters of the opcode to
    their canonical form if it hasn't been done by ExpandNames before.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Checking prerequisites for tasklet %s/%s",
                      idx + 1, len(self.tasklets))
        tl.CheckPrereq()

  def Exec(self, feedback_fn):
    """Execute the LU.

    This method should implement the actual work. It should raise
    errors.OpExecError for failures that are somewhat dealt with in
    code, or expected.

    """
    if self.tasklets is not None:
      for (idx, tl) in enumerate(self.tasklets):
        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
        tl.Exec(feedback_fn)
    else:
      raise NotImplementedError

  def BuildHooksEnv(self):
    """Build hooks environment for this LU.

    This method should return a three-element tuple consisting of: a dict
    containing the environment that will be used for running the
    specific hook for this LU, a list of node names on which the hook
    should run before the execution, and a list of node names on which
    the hook should run after the execution.

    The keys of the dict must not have the 'GANETI_' prefix, as this will
    be handled in the hooks runner. Also note additional keys will be
    added by the hooks runner. If the LU doesn't define any
    environment, an empty dict (and not None) should be returned.

    If there are no nodes to return, use an empty list (and not None).

    Note that if the HPATH for a LU class is None, this function will
    not be called.

    """
    raise NotImplementedError

  def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
    """Notify the LU about the results of its hooks.

    This method is called every time a hooks phase is executed, and notifies
    the Logical Unit about the hooks' result. The LU can then use it to alter
    its result based on the hooks. By default the method does nothing and the
    previous result is passed back unchanged but any LU can define it if it
    wants to use the local cluster hook-scripts somehow.

    @param phase: one of L{constants.HOOKS_PHASE_POST} or
        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
    @param hook_results: the results of the multi-node hooks rpc call
    @param feedback_fn: function used to send feedback back to the caller
    @param lu_result: the previous Exec result this LU had, or None
        in the PRE phase
    @return: the new Exec result, based on the previous result
        and hook results

    """
    # API must be kept, thus we ignore the unused-argument and
    # could-be-a-function warnings
    # pylint: disable-msg=W0613,R0201
    return lu_result

  def _ExpandAndLockInstance(self):
    """Helper function to expand and lock an instance.

    Many LUs that work on an instance take its name in self.op.instance_name
    and need to expand it and then declare the expanded name for locking. This
    function does it, and then updates self.op.instance_name to the expanded
    name. It also initializes needed_locks as a dict, if this hasn't been done
    before.

    """
    if self.needed_locks is None:
      self.needed_locks = {}
    else:
      assert locking.LEVEL_INSTANCE not in self.needed_locks, \
        "_ExpandAndLockInstance called with instance-level locks set"
    self.op.instance_name = _ExpandInstanceName(self.cfg,
                                                self.op.instance_name)
    self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name

  def _LockInstancesNodes(self, primary_only=False):
    """Helper function to declare instances' nodes for locking.

    This function should be called after locking one or more instances to lock
    their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
    with all primary or secondary nodes for instances already locked and
    present in self.needed_locks[locking.LEVEL_INSTANCE].

    It should be called from DeclareLocks, and for safety only works if
    self.recalculate_locks[locking.LEVEL_NODE] is set.

    In the future it may grow parameters to just lock some instance's nodes, or
    to just lock primaries or secondary nodes, if needed.

    It should be called in DeclareLocks in a way similar to::

      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()

    @type primary_only: boolean
    @param primary_only: only lock primary nodes of locked instances

    """
    assert locking.LEVEL_NODE in self.recalculate_locks, \
      "_LockInstancesNodes helper function called with no nodes to recalculate"

    # TODO: check if we've really been called with the instance locks held

    # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
    # future we might want to have different behaviors depending on the value
    # of self.recalculate_locks[locking.LEVEL_NODE]
    wanted_nodes = []
    for instance_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
      instance = self.context.cfg.GetInstanceInfo(instance_name)
      wanted_nodes.append(instance.primary_node)
      if not primary_only:
        wanted_nodes.extend(instance.secondary_nodes)

    if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
      self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
    elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
      self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)

    del self.recalculate_locks[locking.LEVEL_NODE]


class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
  """Simple LU which runs no hooks.

  This LU is intended as a parent for other LogicalUnits which will
  run no hooks, in order to reduce duplicate code.

  """
  HPATH = None
  HTYPE = None

  def BuildHooksEnv(self):
    """Empty BuildHooksEnv for NoHooksLu.

    This just raises an error.

    """
    assert False, "BuildHooksEnv called for NoHooksLUs"
404 """Tasklet base class.
406 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
407 they can mix legacy code with tasklets. Locking needs to be done in the LU,
408 tasklets know nothing about locks.
410 Subclasses must follow these rules:
411 - Implement CheckPrereq
415 def __init__(self, lu):
422 def CheckPrereq(self):
423 """Check prerequisites for this tasklets.
425 This method should check whether the prerequisites for the execution of
426 this tasklet are fulfilled. It can do internode communication, but it
427 should be idempotent - no cluster or system changes are allowed.
429 The method should raise errors.OpPrereqError in case something is not
430 fulfilled. Its return value is ignored.
432 This method should also update all parameters to their canonical form if it
433 hasn't been done before.
438 def Exec(self, feedback_fn):
439 """Execute the tasklet.
441 This method should implement the actual work. It should raise
442 errors.OpExecError for failures that are somewhat dealt with in code, or
446 raise NotImplementedError
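
# Illustrative sketch (hypothetical names) of a tasklet-based LU: the LU
# handles locking, while the tasklets carry the actual logic:
#
#   class _ExampleTasklet(Tasklet):
#     def CheckPrereq(self):
#       pass # verify preconditions here
#
#     def Exec(self, feedback_fn):
#       feedback_fn("doing the actual work")
#
# and, inside the owning LU's ExpandNames:
#
#   self.tasklets = [_ExampleTasklet(self)]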
450 """Base for query utility classes.
453 #: Attribute holding field definitions
456 def __init__(self, names, fields, use_locking):
457 """Initializes this class.
461 self.use_locking = use_locking
463 self.query = query.Query(self.FIELDS, fields)
464 self.requested_data = self.query.RequestedData()
466 self.do_locking = None
469 def _GetNames(self, lu, all_names, lock_level):
470 """Helper function to determine names asked for in the query.
474 names = lu.acquired_locks[lock_level]
478 if self.wanted == locking.ALL_SET:
479 assert not self.names
480 # caller didn't specify names, so ordering is not important
481 return utils.NiceSort(names)
483 # caller specified names and we must keep the same order
485 assert not self.do_locking or lu.acquired_locks[lock_level]
487 missing = set(self.wanted).difference(names)
489 raise errors.OpExecError("Some items were removed before retrieving"
490 " their data: %s" % missing)
492 # Return expanded names
496 def FieldsQuery(cls, fields):
497 """Returns list of available fields.
499 @return: List of L{objects.QueryFieldDefinition}
502 return query.QueryFields(cls.FIELDS, fields)
504 def ExpandNames(self, lu):
505 """Expand names for this query.
507 See L{LogicalUnit.ExpandNames}.
510 raise NotImplementedError()
512 def DeclareLocks(self, lu, level):
513 """Declare locks for this query.
515 See L{LogicalUnit.DeclareLocks}.
518 raise NotImplementedError()
520 def _GetQueryData(self, lu):
521 """Collects all data for this query.
523 @return: Query data object
526 raise NotImplementedError()
528 def NewStyleQuery(self, lu):
529 """Collect data and execute query.
532 return query.GetQueryResponse(self.query, self._GetQueryData(lu))
534 def OldStyleQuery(self, lu):
535 """Collect data and execute query.
538 return self.query.OldStyleQuery(self._GetQueryData(lu))
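
# Illustrative sketch of a concrete query class (hypothetical and heavily
# simplified; real subclasses also set self.wanted and self.do_locking in
# ExpandNames):
#
#   class _ExampleQuery(_QueryBase):
#     FIELDS = None # field definitions go here
#
#     def ExpandNames(self, lu):
#       lu.needed_locks = {}
#       self.do_locking = False
#
#     def DeclareLocks(self, lu, level):
#       pass
#
#     def _GetQueryData(self, lu):
#       return ... # collect and return the data for self.query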


def _GetWantedNodes(lu, nodes):
  """Returns list of checked and expanded node names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nodes: list
  @param nodes: list of node names or None for all nodes
  @rtype: list
  @return: the list of nodes, sorted
  @raise errors.ProgrammerError: if the nodes parameter is wrong type

  """
  if nodes:
    return [_ExpandNodeName(lu.cfg, name) for name in nodes]

  return utils.NiceSort(lu.cfg.GetNodeList())


def _GetWantedInstances(lu, instances):
  """Returns list of checked and expanded instance names.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instances: list
  @param instances: list of instance names or None for all instances
  @rtype: list
  @return: the list of instances, sorted
  @raise errors.OpPrereqError: if the instances parameter is wrong type
  @raise errors.OpPrereqError: if any of the passed instances is not found

  """
  if instances:
    wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
  else:
    wanted = utils.NiceSort(lu.cfg.GetInstanceList())
  return wanted


def _GetUpdatedParams(old_params, update_dict,
                      use_default=True, use_none=False):
  """Return the new version of a parameter dictionary.

  @type old_params: dict
  @param old_params: old parameters
  @type update_dict: dict
  @param update_dict: dict containing new parameter values, or
      constants.VALUE_DEFAULT to reset the parameter to its default
      value
  @type use_default: boolean
  @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
      values as 'to be deleted' values
  @type use_none: boolean
  @param use_none: whether to recognise C{None} values as 'to be
      deleted' values
  @rtype: dict
  @return: the new parameter dictionary

  """
  params_copy = copy.deepcopy(old_params)
  for key, val in update_dict.iteritems():
    if ((use_default and val == constants.VALUE_DEFAULT) or
        (use_none and val is None)):
      try:
        del params_copy[key]
      except KeyError:
        pass
    else:
      params_copy[key] = val
  return params_copy
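
# Illustrative example of the semantics (values made up):
#
#   old = {"vcpus": 2, "memory": 512}
#   _GetUpdatedParams(old, {"memory": constants.VALUE_DEFAULT, "vcpus": 4})
#   # -> {"vcpus": 4}: "memory" is removed so it reverts to its default,
#   #    while "vcpus" is overridden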


def _CheckOutputFields(static, dynamic, selected):
  """Checks whether all selected fields are valid.

  @type static: L{utils.FieldSet}
  @param static: static fields set
  @type dynamic: L{utils.FieldSet}
  @param dynamic: dynamic fields set

  """
  f = static.Copy()
  f.Extend(dynamic)

  delta = f.NonMatching(selected)
  if delta:
    raise errors.OpPrereqError("Unknown output fields selected: %s"
                               % ",".join(delta), errors.ECODE_INVAL)


def _CheckGlobalHvParams(params):
  """Validates that given hypervisor params are not global ones.

  This will ensure that instances don't get customised versions of
  global params.

  """
  used_globals = constants.HVC_GLOBALS.intersection(params)
  if used_globals:
    msg = ("The following hypervisor parameters are global and cannot"
           " be customized at instance level, please modify them at"
           " cluster level: %s" % utils.CommaJoin(used_globals))
    raise errors.OpPrereqError(msg, errors.ECODE_INVAL)


def _CheckNodeOnline(lu, node, msg=None):
  """Ensure that a given node is online.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param msg: if passed, should be a message to replace the default one
  @raise errors.OpPrereqError: if the node is offline

  """
  if msg is None:
    msg = "Can't use offline node"
  if lu.cfg.GetNodeInfo(node).offline:
    raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)


def _CheckNodeNotDrained(lu, node):
  """Ensure that a given node is not drained.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is drained

  """
  if lu.cfg.GetNodeInfo(node).drained:
    raise errors.OpPrereqError("Can't use drained node %s" % node,
                               errors.ECODE_STATE)


def _CheckNodeVmCapable(lu, node):
  """Ensure that a given node is vm capable.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @raise errors.OpPrereqError: if the node is not vm capable

  """
  if not lu.cfg.GetNodeInfo(node).vm_capable:
    raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
                               errors.ECODE_STATE)


def _CheckNodeHasOS(lu, node, os_name, force_variant):
  """Ensure that a node supports a given OS.

  @param lu: the LU on behalf of which we make the check
  @param node: the node to check
  @param os_name: the OS to query about
  @param force_variant: whether to ignore variant errors
  @raise errors.OpPrereqError: if the node is not supporting the OS

  """
  result = lu.rpc.call_os_get(node, os_name)
  result.Raise("OS '%s' not in supported OS list for node %s" %
               (os_name, node),
               prereq=True, ecode=errors.ECODE_INVAL)
  if not force_variant:
    _CheckOSVariant(result.payload, os_name)


def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
  """Ensure that a node has the given secondary ip.

  @type lu: L{LogicalUnit}
  @param lu: the LU on behalf of which we make the check
  @type node: string
  @param node: the node to check
  @type secondary_ip: string
  @param secondary_ip: the ip to check
  @type prereq: boolean
  @param prereq: whether to throw a prerequisite or an execute error
  @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
  @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False

  """
  result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
  result.Raise("Failure checking secondary ip on node %s" % node,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)
  if not result.payload:
    msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
           " please fix and re-run this command" % secondary_ip)
    if prereq:
      raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
    else:
      raise errors.OpExecError(msg)


def _GetClusterDomainSecret():
  """Reads the cluster domain secret.

  """
  return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
                               strict=True)


def _CheckInstanceDown(lu, instance, reason):
  """Ensure that an instance is not running."""
  if instance.admin_up:
    raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
                               (instance.name, reason), errors.ECODE_STATE)

  pnode = instance.primary_node
  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
  ins_l.Raise("Can't contact node %s for instance information" % pnode,
              prereq=True, ecode=errors.ECODE_ENVIRON)

  if instance.name in ins_l.payload:
    raise errors.OpPrereqError("Instance %s is running, %s" %
                               (instance.name, reason), errors.ECODE_STATE)


def _ExpandItemName(fn, name, kind):
  """Expand an item name.

  @param fn: the function to use for expansion
  @param name: requested item name
  @param kind: text description ('Node' or 'Instance')
  @return: the resolved (full) name
  @raise errors.OpPrereqError: if the item is not found

  """
  full_name = fn(name)
  if full_name is None:
    raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
                               errors.ECODE_NOENT)
  return full_name


def _ExpandNodeName(cfg, name):
  """Wrapper over L{_ExpandItemName} for nodes."""
  return _ExpandItemName(cfg.ExpandNodeName, name, "Node")


def _ExpandInstanceName(cfg, name):
  """Wrapper over L{_ExpandItemName} for instances."""
  return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")


def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
                          memory, vcpus, nics, disk_template, disks,
                          bep, hvp, hypervisor_name):
  """Builds instance related env variables for hooks.

  This builds the hook environment from individual variables.

  @type name: string
  @param name: the name of the instance
  @type primary_node: string
  @param primary_node: the name of the instance's primary node
  @type secondary_nodes: list
  @param secondary_nodes: list of secondary nodes as strings
  @type os_type: string
  @param os_type: the name of the instance's OS
  @type status: boolean
  @param status: the should_run status of the instance
  @type memory: string
  @param memory: the memory size of the instance
  @type vcpus: string
  @param vcpus: the count of VCPUs the instance has
  @type nics: list
  @param nics: list of tuples (ip, mac, mode, link) representing
      the NICs the instance has
  @type disk_template: string
  @param disk_template: the disk template of the instance
  @type disks: list
  @param disks: the list of (size, mode) pairs
  @type bep: dict
  @param bep: the backend parameters for the instance
  @type hvp: dict
  @param hvp: the hypervisor parameters for the instance
  @type hypervisor_name: string
  @param hypervisor_name: the hypervisor for the instance
  @rtype: dict
  @return: the hook environment for this instance

  """
  if status:
    str_status = "up"
  else:
    str_status = "down"

  env = {
    "OP_TARGET": name,
    "INSTANCE_NAME": name,
    "INSTANCE_PRIMARY": primary_node,
    "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
    "INSTANCE_OS_TYPE": os_type,
    "INSTANCE_STATUS": str_status,
    "INSTANCE_MEMORY": memory,
    "INSTANCE_VCPUS": vcpus,
    "INSTANCE_DISK_TEMPLATE": disk_template,
    "INSTANCE_HYPERVISOR": hypervisor_name,
  }

  if nics:
    nic_count = len(nics)
    for idx, (ip, mac, mode, link) in enumerate(nics):
      if ip is None:
        ip = ""
      env["INSTANCE_NIC%d_IP" % idx] = ip
      env["INSTANCE_NIC%d_MAC" % idx] = mac
      env["INSTANCE_NIC%d_MODE" % idx] = mode
      env["INSTANCE_NIC%d_LINK" % idx] = link
      if mode == constants.NIC_MODE_BRIDGED:
        env["INSTANCE_NIC%d_BRIDGE" % idx] = link
  else:
    nic_count = 0

  env["INSTANCE_NIC_COUNT"] = nic_count

  if disks:
    disk_count = len(disks)
    for idx, (size, mode) in enumerate(disks):
      env["INSTANCE_DISK%d_SIZE" % idx] = size
      env["INSTANCE_DISK%d_MODE" % idx] = mode
  else:
    disk_count = 0

  env["INSTANCE_DISK_COUNT"] = disk_count

  for source, kind in [(bep, "BE"), (hvp, "HV")]:
    for key, value in source.items():
      env["INSTANCE_%s_%s" % (kind, key)] = value

  return env


def _NICListToTuple(lu, nics):
  """Build a list of nic information tuples.

  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
  value in LUInstanceQueryData.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type nics: list of L{objects.NIC}
  @param nics: list of nics to convert to hooks tuples

  """
  hooks_nics = []
  cluster = lu.cfg.GetClusterInfo()
  for nic in nics:
    ip = nic.ip
    mac = nic.mac
    filled_params = cluster.SimpleFillNIC(nic.nicparams)
    mode = filled_params[constants.NIC_MODE]
    link = filled_params[constants.NIC_LINK]
    hooks_nics.append((ip, mac, mode, link))
  return hooks_nics


def _BuildInstanceHookEnvByObject(lu, instance, override=None):
  """Builds instance related env variables for hooks from an object.

  @type lu: L{LogicalUnit}
  @param lu: the logical unit on whose behalf we execute
  @type instance: L{objects.Instance}
  @param instance: the instance for which we should build the
      environment
  @type override: dict
  @param override: dictionary with key/values that will override
      our values
  @rtype: dict
  @return: the hook environment dictionary

  """
  cluster = lu.cfg.GetClusterInfo()
  bep = cluster.FillBE(instance)
  hvp = cluster.FillHV(instance)
  args = {
    'name': instance.name,
    'primary_node': instance.primary_node,
    'secondary_nodes': instance.secondary_nodes,
    'os_type': instance.os,
    'status': instance.admin_up,
    'memory': bep[constants.BE_MEMORY],
    'vcpus': bep[constants.BE_VCPUS],
    'nics': _NICListToTuple(lu, instance.nics),
    'disk_template': instance.disk_template,
    'disks': [(disk.size, disk.mode) for disk in instance.disks],
    'bep': bep,
    'hvp': hvp,
    'hypervisor_name': instance.hypervisor,
  }
  if override:
    args.update(override)
  return _BuildInstanceHookEnv(**args) # pylint: disable-msg=W0142


def _AdjustCandidatePool(lu, exceptions):
  """Adjust the candidate pool after node operations.

  """
  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
  if mod_list:
    lu.LogInfo("Promoted nodes to master candidate role: %s",
               utils.CommaJoin(node.name for node in mod_list))
    for name in mod_list:
      lu.context.ReaddNode(name)
  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  if mc_now > mc_max:
    lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
               (mc_now, mc_max))


def _DecideSelfPromotion(lu, exceptions=None):
  """Decide whether I should promote myself as a master candidate.

  """
  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
  # the new node will increase mc_max by one, so:
  mc_should = min(mc_should + 1, cp_size)
  return mc_now < mc_should
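
# Worked example (values made up): with candidate_pool_size = 10,
# mc_now = 3 and mc_should = 3, adding the new node gives
# mc_should = min(3 + 1, 10) = 4; since mc_now (3) < 4, the node
# should promote itself to master candidate.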


def _CheckNicsBridgesExist(lu, target_nics, target_node):
  """Check that the bridges needed by a list of nics exist.

  """
  cluster = lu.cfg.GetClusterInfo()
  paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
  brlist = [params[constants.NIC_LINK] for params in paramslist
            if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
  if brlist:
    result = lu.rpc.call_bridges_exist(target_node, brlist)
    result.Raise("Error checking bridges on destination node '%s'" %
                 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)


def _CheckInstanceBridgesExist(lu, instance, node=None):
  """Check that the bridges needed by an instance exist.

  """
  if node is None:
    node = instance.primary_node
  _CheckNicsBridgesExist(lu, instance.nics, node)


def _CheckOSVariant(os_obj, name):
  """Check whether an OS name conforms to the os variants specification.

  @type os_obj: L{objects.OS}
  @param os_obj: OS object to check
  @type name: string
  @param name: OS name passed by the user, to check for validity

  """
  if not os_obj.supported_variants:
    return
  variant = objects.OS.GetVariant(name)
  if not variant:
    raise errors.OpPrereqError("OS name must include a variant",
                               errors.ECODE_INVAL)

  if variant not in os_obj.supported_variants:
    raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
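
# Illustrative example: for an OS with variants the user passes a name
# such as "debootstrap+default"; objects.OS.GetVariant() extracts the
# "default" part, which must then appear in os_obj.supported_variants.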


def _GetNodeInstancesInner(cfg, fn):
  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]


def _GetNodeInstances(cfg, node_name):
  """Returns a list of all primary and secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)


def _GetNodePrimaryInstances(cfg, node_name):
  """Returns primary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name == inst.primary_node)


def _GetNodeSecondaryInstances(cfg, node_name):
  """Returns secondary instances on a node.

  """
  return _GetNodeInstancesInner(cfg,
                                lambda inst: node_name in inst.secondary_nodes)


def _GetStorageTypeArgs(cfg, storage_type):
  """Returns the arguments for a storage type.

  """
  # Special case for file storage
  if storage_type == constants.ST_FILE:
    # storage.FileStorage wants a list of storage directories
    return [[cfg.GetFileStorageDir()]]

  return []


def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
  faulty = []

  for dev in instance.disks:
    cfg.SetDiskID(dev, node_name)

  result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
  result.Raise("Failed to get disk status from node %s" % node_name,
               prereq=prereq, ecode=errors.ECODE_ENVIRON)

  for idx, bdev_status in enumerate(result.payload):
    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
      faulty.append(idx)

  return faulty


def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
  """Check the sanity of iallocator and node arguments and use the
  cluster-wide iallocator if appropriate.

  Check that at most one of (iallocator, node) is specified. If none is
  specified, then the LU's opcode's iallocator slot is filled with the
  cluster-wide default iallocator.

  @type iallocator_slot: string
  @param iallocator_slot: the name of the opcode iallocator slot
  @type node_slot: string
  @param node_slot: the name of the opcode target node slot

  """
  node = getattr(lu.op, node_slot, None)
  iallocator = getattr(lu.op, iallocator_slot, None)

  if node is not None and iallocator is not None:
    raise errors.OpPrereqError("Do not specify both an iallocator and a node.",
                               errors.ECODE_INVAL)
  elif node is None and iallocator is None:
    default_iallocator = lu.cfg.GetDefaultIAllocator()
    if default_iallocator:
      setattr(lu.op, iallocator_slot, default_iallocator)
    else:
      raise errors.OpPrereqError("No iallocator or node given and no"
                                 " cluster-wide default iallocator found."
                                 " Please specify either an iallocator or a"
                                 " node, or set a cluster-wide default"
                                 " iallocator.")


class LUClusterPostInit(LogicalUnit):
  """Logical unit for running hooks after cluster initialization.

  """
  HPATH = "cluster-init"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {"OP_TARGET": self.cfg.GetClusterName()}
    mn = self.cfg.GetMasterNode()
    return env, [], [mn]

  def Exec(self, feedback_fn):
    """Nothing to do.

    """
    return True


class LUClusterDestroy(LogicalUnit):
  """Logical unit for destroying the cluster.

  """
  HPATH = "cluster-destroy"
  HTYPE = constants.HTYPE_CLUSTER

  def BuildHooksEnv(self):
    """Build hooks env.

    """
    env = {"OP_TARGET": self.cfg.GetClusterName()}
    return env, [], []

  def CheckPrereq(self):
    """Check prerequisites.

    This checks whether the cluster is empty.

    Any errors are signaled by raising errors.OpPrereqError.

    """
    master = self.cfg.GetMasterNode()

    nodelist = self.cfg.GetNodeList()
    if len(nodelist) != 1 or nodelist[0] != master:
      raise errors.OpPrereqError("There are still %d node(s) in"
                                 " this cluster." % (len(nodelist) - 1),
                                 errors.ECODE_INVAL)
    instancelist = self.cfg.GetInstanceList()
    if instancelist:
      raise errors.OpPrereqError("There are still %d instance(s) in"
                                 " this cluster." % len(instancelist),
                                 errors.ECODE_INVAL)

  def Exec(self, feedback_fn):
    """Destroys the cluster.

    """
    master = self.cfg.GetMasterNode()

    # Run post hooks on master node before it's removed
    hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
    try:
      hm.RunPhase(constants.HOOKS_PHASE_POST, [master])
    except:
      # pylint: disable-msg=W0702
      self.LogWarning("Errors occurred running hooks on %s" % master)

    result = self.rpc.call_node_stop_master(master, False)
    result.Raise("Could not disable the master role")

    return master


def _VerifyCertificate(filename):
  """Verifies a certificate for LUClusterVerify.

  @type filename: string
  @param filename: Path to PEM file

  """
  try:
    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                           utils.ReadFile(filename))
  except Exception, err: # pylint: disable-msg=W0703
    return (LUClusterVerify.ETYPE_ERROR,
            "Failed to load X509 certificate %s: %s" % (filename, err))

  (errcode, msg) = \
    utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
                                constants.SSL_CERT_EXPIRATION_ERROR)

  if msg:
    fnamemsg = "While verifying %s: %s" % (filename, msg)
  else:
    fnamemsg = None

  if errcode is None:
    return (None, fnamemsg)
  elif errcode == utils.CERT_WARNING:
    return (LUClusterVerify.ETYPE_WARNING, fnamemsg)
  elif errcode == utils.CERT_ERROR:
    return (LUClusterVerify.ETYPE_ERROR, fnamemsg)

  raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)


class LUClusterVerify(LogicalUnit):
  """Verifies the cluster status.

  """
  HPATH = "cluster-verify"
  HTYPE = constants.HTYPE_CLUSTER
  REQ_BGL = False

  TCLUSTER = "cluster"
  TNODE = "node"
  TINSTANCE = "instance"

  ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
  ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
  EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
  EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
  EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
  EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
  EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
  EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS")
  ENODEDRBD = (TNODE, "ENODEDRBD")
  ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
  ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
  ENODEHOOKS = (TNODE, "ENODEHOOKS")
  ENODEHV = (TNODE, "ENODEHV")
  ENODELVM = (TNODE, "ENODELVM")
  ENODEN1 = (TNODE, "ENODEN1")
  ENODENET = (TNODE, "ENODENET")
  ENODEOS = (TNODE, "ENODEOS")
  ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
  ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
  ENODERPC = (TNODE, "ENODERPC")
  ENODESSH = (TNODE, "ENODESSH")
  ENODEVERSION = (TNODE, "ENODEVERSION")
  ENODESETUP = (TNODE, "ENODESETUP")
  ENODETIME = (TNODE, "ENODETIME")
  ENODEOOBPATH = (TNODE, "ENODEOOBPATH")

  ETYPE_FIELD = "code"
  ETYPE_ERROR = "ERROR"
  ETYPE_WARNING = "WARNING"

  _HOOKS_INDENT_RE = re.compile("^", re.M)

  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type name: string
    @ivar name: the node name to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dict of {primary-node: list of instances} for all
        instances for which this node is secondary (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call failed (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @type oslist: list
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS
    @type vm_capable: boolean
    @ivar vm_capable: whether the node can host instances

    """
    def __init__(self, offline=False, name=None, vm_capable=True):
      # Defaults follow the @ivar documentation above
      self.name = name
      self.volumes = {}
      self.instances = []
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.vm_capable = vm_capable
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}

  def ExpandNames(self):
    self.needed_locks = {
      locking.LEVEL_NODE: locking.ALL_SET,
      locking.LEVEL_INSTANCE: locking.ALL_SET,
    }
    self.share_locks = dict.fromkeys(locking.LEVELS, 1)

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    """
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    itype, etxt = ecode
    # first complete the msg
    if args:
      msg = msg % args
    # then format the whole message
    if self.op.error_codes:
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      if item:
        item = " " + str(item)
      else:
        item = ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    # and finally report it via the feedback_fn
    self._feedback_fn(" - %s" % msg)

  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    """
    cond = bool(cond) or self.op.debug_simulate_errors
    if cond:
      self._Error(*args, **kwargs)
    # do not mark the operation as failed for WARN cases only
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
      self.bad = self.bad or cond

  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
        reasonable values in the response)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    _ErrorIf(test, self.ENODERPC, node,
             "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    _ErrorIf(test, self.ENODERPC, node,
             "connection to node returned invalid data")
    if test:
      return False

    test = local_version != remote_version[0]
    _ErrorIf(test, self.ENODEVERSION, node,
             "incompatible protocol versions: master %s,"
             " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  self.ENODEVERSION, node,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if ninfo.vm_capable and isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        test = hv_result is not None
        _ErrorIf(test, self.ENODEHV, node,
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    hvp_result = nresult.get(constants.NV_HVPARAMS, None)
    if ninfo.vm_capable and isinstance(hvp_result, list):
      for item, hv_name, hv_result in hvp_result:
        _ErrorIf(True, self.ENODEHV, node,
                 "hypervisor %s parameter verify failure (source %s): %s",
                 hv_name, item, hv_result)

    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
             "; ".join(test))

    return True

  def _VerifyNodeTime(self, ninfo, nresult,
                      nvinfo_starttime, nvinfo_endtime):
    """Check the node time.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nvinfo_starttime: the start time of the RPC call
    @param nvinfo_endtime: the end time of the RPC call

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    ntime = nresult.get(constants.NV_TIME, None)
    try:
      ntime_merged = utils.MergeTime(ntime)
    except (ValueError, TypeError):
      _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
      return

    if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
    elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
      ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
    else:
      ntime_diff = None

    _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
             "Node time diverges by at least %s from master node time",
             ntime_diff)

  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
    """Check the node LVM results.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param vg_name: the configured VG name

    """
    if vg_name is None:
      return

    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    # checks vg existence and size > 20G
    vglist = nresult.get(constants.NV_VGLIST, None)
    test = not vglist
    _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
    if not test:
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                            constants.MIN_VG_SIZE)
      _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)

    # check pv names
    pvlist = nresult.get(constants.NV_PVLIST, None)
    test = pvlist is None
    _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
    if not test:
      # check that ':' is not present in PV names, since it's a
      # special character for lvcreate (denotes the range of PEs to
      # use on the PV)
      for _, pvname, owner_vg in pvlist:
        test = ":" in pvname
        _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
                 " '%s' of VG '%s'", pvname, owner_vg)

  def _VerifyNodeNetwork(self, ninfo, nresult):
    """Check the node network connectivity.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    test = constants.NV_NODELIST not in nresult
    _ErrorIf(test, self.ENODESSH, node,
             "node hasn't returned node ssh connectivity data")
    if not test:
      if nresult[constants.NV_NODELIST]:
        for a_node, a_msg in nresult[constants.NV_NODELIST].items():
          _ErrorIf(True, self.ENODESSH, node,
                   "ssh communication with node '%s': %s", a_node, a_msg)

    test = constants.NV_NODENETTEST not in nresult
    _ErrorIf(test, self.ENODENET, node,
             "node hasn't returned node tcp connectivity data")
    if not test:
      if nresult[constants.NV_NODENETTEST]:
        nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
        for anode in nlist:
          _ErrorIf(True, self.ENODENET, node,
                   "tcp communication with node '%s': %s",
                   anode, nresult[constants.NV_NODENETTEST][anode])

    test = constants.NV_MASTERIP not in nresult
    _ErrorIf(test, self.ENODENET, node,
             "node hasn't returned node master IP reachability data")
    if not test:
      if not nresult[constants.NV_MASTERIP]:
        if node == self.master_node:
          msg = "the master node cannot reach the master IP (not configured?)"
        else:
          msg = "cannot reach the master IP"
        _ErrorIf(True, self.ENODENET, node, msg)

  def _VerifyInstance(self, instance, instanceconfig, node_image,
                      diskstatus):
    """Verify an instance.

    This function checks to see if the required block devices are
    available on the instance's node.

    """
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
    node_current = instanceconfig.primary_node

    node_vol_should = {}
    instanceconfig.MapLVsByNode(node_vol_should)

    for node in node_vol_should:
      n_img = node_image[node]
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # ignore missing volumes on offline or broken nodes
        continue
      for volume in node_vol_should[node]:
        test = volume not in n_img.volumes
        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
                 "volume %s missing on node %s", volume, node)

    if instanceconfig.admin_up:
      pri_img = node_image[node_current]
      test = instance not in pri_img.instances and not pri_img.offline
      _ErrorIf(test, self.EINSTANCEDOWN, instance,
               "instance not running on its primary node %s",
               node_current)

    for node, n_img in node_image.items():
      if node != node_current:
        test = instance in n_img.instances
        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
                 "instance should not run on node %s", node)

    diskdata = [(nname, success, status, idx)
                for (nname, disks) in diskstatus.items()
                for idx, (success, status) in enumerate(disks)]

    for nname, success, bdev_status, idx in diskdata:
      # the 'ghost node' construction in Exec() ensures that we have a
      # known node object
      snode = node_image[nname]
      bad_snode = snode.ghost or snode.offline
      _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
               self.EINSTANCEFAULTYDISK, instance,
               "couldn't retrieve status for disk/%s on %s: %s",
               idx, nname, bdev_status)
      _ErrorIf((instanceconfig.admin_up and success and
                bdev_status.ldisk_status == constants.LDS_FAULTY),
               self.EINSTANCEFAULTYDISK, instance,
               "disk/%s on %s is faulty", idx, nname)

  def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
    """Verify if there are any unknown volumes in the cluster.

    The .os, .swap and backup volumes are ignored. All other volumes are
    reported as unknown.

    @type reserved: L{ganeti.utils.FieldSet}
    @param reserved: a FieldSet of reserved volume names

    """
    for node, n_img in node_image.items():
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # skip non-healthy nodes
        continue
      for volume in n_img.volumes:
        test = ((node not in node_vol_should or
                 volume not in node_vol_should[node]) and
                not reserved.Matches(volume))
        self._ErrorIf(test, self.ENODEORPHANLV, node,
                      "volume %s is unknown", volume)

  def _VerifyOrphanInstances(self, instancelist, node_image):
    """Verify the list of running instances.

    This checks what instances are running but unknown to the cluster.

    """
    for node, n_img in node_image.items():
      for o_inst in n_img.instances:
        test = o_inst not in instancelist
        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
                      "instance %s on node %s should not exist", o_inst, node)

  def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the
    instances it was primary for.

    """
    for node, n_img in node_image.items():
      # This code checks that every node which is now listed as
      # secondary has enough memory to host all instances it is
      # supposed to should a single other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
      if n_img.offline:
        # we're skipping offline nodes from the N+1 warning, since
        # most likely we don't have good memory information from them;
        # we already list instances living on such nodes, and that's
        # enough
        continue
      for prinode, instances in n_img.sbp.items():
        needed_mem = 0
        for instance in instances:
          bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
          if bep[constants.BE_AUTO_BALANCE]:
            needed_mem += bep[constants.BE_MEMORY]
        test = n_img.mfree < needed_mem
        self._ErrorIf(test, self.ENODEN1, node,
                      "not enough memory to accommodate instance failovers"
                      " should node %s fail (%dMiB needed, %dMiB available)",
                      prinode, needed_mem, n_img.mfree)
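
  # Worked example (values made up): if this node is secondary for two
  # auto-balanced instances whose primary is "node2", needing 512 and
  # 1024 MiB of memory, then needed_mem for prinode "node2" is 1536 MiB;
  # with n_img.mfree of 1024 MiB the ENODEN1 error above would trigger.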

  def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
                       master_files):
    """Verifies and computes the node required file checksums.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param file_list: required list of files
    @param local_cksum: dictionary of local files and their checksums
    @param master_files: list of files that only masters should have

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    remote_cksum = nresult.get(constants.NV_FILELIST, None)
    test = not isinstance(remote_cksum, dict)
    _ErrorIf(test, self.ENODEFILECHECK, node,
             "node hasn't returned file checksum data")
    if test:
      return

    for file_name in file_list:
      node_is_mc = ninfo.master_candidate
      must_have = (file_name not in master_files) or node_is_mc
      # missing
      test1 = file_name not in remote_cksum
      # invalid checksum
      test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
      # existing and correct
      test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
      _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
               "file '%s' missing", file_name)
      _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
               "file '%s' has wrong checksum", file_name)
      # not candidate and this is not a must-have file
      _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
               "file '%s' should not exist on non master"
               " candidates (and the file is outdated)", file_name)
      # all good, except non-master/non-must have combination
      _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
               "file '%s' should not exist"
               " on non master candidates", file_name)

  def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
                      drbd_map):
    """Verifies the node DRBD status.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param instanceinfo: the dict of instances
    @param drbd_helper: the configured DRBD usermode helper
    @param drbd_map: the DRBD map as returned by
        L{ganeti.config.ConfigWriter.ComputeDRBDMap}

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    if drbd_helper:
      helper_result = nresult.get(constants.NV_DRBDHELPER, None)
      test = (helper_result is None)
      _ErrorIf(test, self.ENODEDRBDHELPER, node,
               "no drbd usermode helper returned")
      if helper_result:
        status, payload = helper_result
        test = not status
        _ErrorIf(test, self.ENODEDRBDHELPER, node,
                 "drbd usermode helper check unsuccessful: %s", payload)
        test = status and (payload != drbd_helper)
        _ErrorIf(test, self.ENODEDRBDHELPER, node,
                 "wrong drbd usermode helper: %s", payload)

    # compute the DRBD minors
    node_drbd = {}
    for minor, instance in drbd_map[node].items():
      test = instance not in instanceinfo
      _ErrorIf(test, self.ECLUSTERCFG, None,
               "ghost instance '%s' in temporary DRBD map", instance)
      # ghost instance should not be running, but otherwise we
      # don't give double warnings (both ghost instance and
      # unallocated minor in use)
      if test:
        node_drbd[minor] = (instance, False)
      else:
        instance = instanceinfo[instance]
        node_drbd[minor] = (instance.name, instance.admin_up)

    # and now check them
    used_minors = nresult.get(constants.NV_DRBDLIST, [])
    test = not isinstance(used_minors, (tuple, list))
    _ErrorIf(test, self.ENODEDRBD, node,
             "cannot parse drbd status file: %s", str(used_minors))
    if test:
      # we cannot check drbd status
      return

    for minor, (iname, must_exist) in node_drbd.items():
      test = minor not in used_minors and must_exist
      _ErrorIf(test, self.ENODEDRBD, node,
               "drbd minor %d of instance %s is not active", minor, iname)
    for minor in used_minors:
      test = minor not in node_drbd
      _ErrorIf(test, self.ENODEDRBD, node,
               "unallocated drbd minor %d is in use", minor)

  def _UpdateNodeOS(self, ninfo, nresult, nimg):
    """Builds the node OS structures.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node
    @param nimg: the node image object

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    remote_os = nresult.get(constants.NV_OSLIST, None)
    test = (not isinstance(remote_os, list) or
            not compat.all(isinstance(v, list) and len(v) == 7
                           for v in remote_os))

    _ErrorIf(test, self.ENODEOS, node,
             "node hasn't returned valid OS data")

    nimg.os_fail = test
    if test:
      return

    os_dict = {}

    for (name, os_path, status, diagnose,
         variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:

      if name not in os_dict:
        os_dict[name] = []

      # parameters is a list of lists instead of list of tuples due to
      # JSON lacking a real tuple type, fix it:
      parameters = [tuple(v) for v in parameters]
      os_dict[name].append((os_path, status, diagnose,
                            set(variants), set(parameters), set(api_ver)))

    nimg.oslist = os_dict

  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    node = ninfo.name
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103

    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
      _ErrorIf(not f_status, self.ENODEOS, node,
               "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
      _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
               "OS '%s' has multiple entries (first one shadows the rest): %s",
               os_name, utils.CommaJoin([v[0] for v in os_data]))
      # this will be caught in the backend too
      _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
               and not f_var, self.ENODEOS, node,
               "OS %s with API at least %d does not declare any variant",
               os_name, constants.OS_API_V15)
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      _ErrorIf(test, self.ENODEOS, node,
               "Extra OS %s not present on reference node (%s)",
               os_name, base.name)
      if test:
        continue

      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue

      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", beautify_params(f_param),
                          beautify_params(b_param))]:
        _ErrorIf(a != b, self.ENODEOS, node,
                 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
                 kind, os_name, base.name,
                 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    _ErrorIf(missing, self.ENODEOS, node,
             "OSes present on reference node %s but missing on this node: %s",
             base.name, utils.CommaJoin(missing))

  def _VerifyOob(self, ninfo, nresult):
    """Verifies out of band functionality of a node.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    node = ninfo.name
    # We just have to verify the paths on master and/or master candidates
    # as the oob helper is invoked on the master
    if ((ninfo.master_candidate or ninfo.master_capable) and
        constants.NV_OOB_PATHS in nresult):
      for path_result in nresult[constants.NV_OOB_PATHS]:
        self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result)
1866 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1867 """Verifies and updates the node volume data.
1869 This function will update a L{NodeImage}'s internal structures
1870 with data from the remote call.
1872 @type ninfo: L{objects.Node}
1873 @param ninfo: the node to check
1874 @param nresult: the remote results for the node
1875 @param nimg: the node image object
1876 @param vg_name: the configured VG name
1880 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1882 nimg.lvm_fail = True
1883 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1886 elif isinstance(lvdata, basestring):
1887 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
1888 utils.SafeEncode(lvdata))
1889 elif not isinstance(lvdata, dict):
1890 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
1892 nimg.volumes = lvdata
1893 nimg.lvm_fail = False
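# nimg.volumes now holds the node's LV map (assumed to be keyed by
# "vg/lv" name, as produced by the backend LV listing)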
1895 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1896 """Verifies and updates the node instance list.
1898 If the listing was successful, then updates this node's instance
1899 list. Otherwise, it marks the RPC call as failed for the instance list.
1902 @type ninfo: L{objects.Node}
1903 @param ninfo: the node to check
1904 @param nresult: the remote results for the node
1905 @param nimg: the node image object
1908 idata = nresult.get(constants.NV_INSTANCELIST, None)
1909 test = not isinstance(idata, list)
1910 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
1911 " (instancelist): %s", utils.SafeEncode(str(idata)))
1913 nimg.hyp_fail = True
1915 nimg.instances = idata
1917 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1918 """Verifies and computes a node information map
1920 @type ninfo: L{objects.Node}
1921 @param ninfo: the node to check
1922 @param nresult: the remote results for the node
1923 @param nimg: the node image object
1924 @param vg_name: the configured VG name
1928 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1930 # try to read free memory (from the hypervisor)
1931 hv_info = nresult.get(constants.NV_HVINFO, None)
1932 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
1933 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
1936 nimg.mfree = int(hv_info["memory_free"])
1937 except (ValueError, TypeError):
1938 _ErrorIf(True, self.ENODERPC, node,
1939 "node returned invalid nodeinfo, check hypervisor")
1941 # FIXME: devise a free space model for file based instances as well
1942 if vg_name is not None:
1943 test = (constants.NV_VGLIST not in nresult or
1944 vg_name not in nresult[constants.NV_VGLIST])
1945 _ErrorIf(test, self.ENODELVM, node,
1946 "node didn't return data for the volume group '%s'"
1947 " - it is either missing or broken", vg_name)
1950 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1951 except (ValueError, TypeError):
1952 _ErrorIf(True, self.ENODERPC, node,
1953 "node returned invalid LVM info, check LVM status")
1955 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
1956 """Gets per-disk status information for all instances.
1958 @type nodelist: list of strings
1959 @param nodelist: Node names
1960 @type node_image: dict of (name, L{objects.Node})
1961 @param node_image: Node objects
1962 @type instanceinfo: dict of (name, L{objects.Instance})
1963 @param instanceinfo: Instance objects
1964 @rtype: {instance: {node: [(success, payload)]}}
1965 @return: a dictionary of per-instance dictionaries with nodes as
1966 keys and disk information as values; the disk information is a
1967 list of tuples (success, payload)
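A sketch of the result for a single-disk DRBD instance, with
illustrative names and opaque status payloads::

  {"inst1": {"nodeA": [(True, st0)], "nodeB": [(True, st0)]}}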
1970 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1973 node_disks_devonly = {}
1974 diskless_instances = set()
1975 diskless = constants.DT_DISKLESS
1977 for nname in nodelist:
1978 node_instances = list(itertools.chain(node_image[nname].pinst,
1979 node_image[nname].sinst))
1980 diskless_instances.update(inst for inst in node_instances
1981 if instanceinfo[inst].disk_template == diskless)
1982 disks = [(inst, disk)
1983 for inst in node_instances
1984 for disk in instanceinfo[inst].disks]
1987 # No need to collect data
1990 node_disks[nname] = disks
1992 # Creating copies as SetDiskID below will modify the objects and that can
1993 # lead to incorrect data returned from nodes
1994 devonly = [dev.Copy() for (_, dev) in disks]
1997 self.cfg.SetDiskID(dev, nname)
1999 node_disks_devonly[nname] = devonly
2001 assert len(node_disks) == len(node_disks_devonly)
2003 # Collect data from all nodes with disks
2004 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2007 assert len(result) == len(node_disks)
2011 for (nname, nres) in result.items():
2012 disks = node_disks[nname]
2015 # No data from this node
2016 data = len(disks) * [(False, "node offline")]
2019 _ErrorIf(msg, self.ENODERPC, nname,
2020 "while getting disk information: %s", msg)
2022 # No data from this node
2023 data = len(disks) * [(False, msg)]
2026 for idx, i in enumerate(nres.payload):
2027 if isinstance(i, (tuple, list)) and len(i) == 2:
2030 logging.warning("Invalid result from node %s, entry %d: %s",
2032 data.append((False, "Invalid result from the remote node"))
2034 for ((inst, _), status) in zip(disks, data):
2035 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2037 # Add empty entries for diskless instances.
2038 for inst in diskless_instances:
2039 assert inst not in instdisk
2042 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2043 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2044 compat.all(isinstance(s, (tuple, list)) and
2045 len(s) == 2 for s in statuses)
2046 for inst, nnames in instdisk.items()
2047 for nname, statuses in nnames.items())
2048 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
2052 def _VerifyHVP(self, hvp_data):
2053 """Verifies locally the syntax of the hypervisor parameters.
2056 for item, hv_name, hv_params in hvp_data:
2057 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
2060 hv_class = hypervisor.GetHypervisor(hv_name)
2061 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2062 hv_class.CheckParameterSyntax(hv_params)
2063 except errors.GenericError, err:
2064 self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
2067 def BuildHooksEnv(self):
2070 Cluster-Verify hooks are run in the post phase only; their failure causes
2071 the hook output to be logged in the verify output and the verification to fail.
2074 all_nodes = self.cfg.GetNodeList()
2076 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags())
2078 for node in self.cfg.GetAllNodesInfo().values():
2079 env["NODE_TAGS_%s" % node.name] = " ".join(node.GetTags())
2081 return env, [], all_nodes
2083 def Exec(self, feedback_fn):
2084 """Verify integrity of cluster, performing various test on nodes.
2087 # This method has too many local variables. pylint: disable-msg=R0914
2089 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2090 verbose = self.op.verbose
2091 self._feedback_fn = feedback_fn
2092 feedback_fn("* Verifying global settings")
2093 for msg in self.cfg.VerifyConfig():
2094 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
2096 # Check the cluster certificates
2097 for cert_filename in constants.ALL_CERT_FILES:
2098 (errcode, msg) = _VerifyCertificate(cert_filename)
2099 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
2101 vg_name = self.cfg.GetVGName()
2102 drbd_helper = self.cfg.GetDRBDHelper()
2103 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
2104 cluster = self.cfg.GetClusterInfo()
2105 nodelist = utils.NiceSort(self.cfg.GetNodeList())
2106 nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
2107 nodeinfo_byname = dict(zip(nodelist, nodeinfo))
2108 instancelist = utils.NiceSort(self.cfg.GetInstanceList())
2109 instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
2110 for iname in instancelist)
2111 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2112 i_non_redundant = [] # Non redundant instances
2113 i_non_a_balanced = [] # Non auto-balanced instances
2114 n_offline = 0 # Count of offline nodes
2115 n_drained = 0 # Count of nodes being drained
2116 node_vol_should = {}
2118 # FIXME: verify OS list
2119 # do local checksums
2120 master_files = [constants.CLUSTER_CONF_FILE]
2121 master_node = self.master_node = self.cfg.GetMasterNode()
2122 master_ip = self.cfg.GetMasterIP()
2124 file_names = ssconf.SimpleStore().GetFileList()
2125 file_names.extend(constants.ALL_CERT_FILES)
2126 file_names.extend(master_files)
2127 if cluster.modify_etc_hosts:
2128 file_names.append(constants.ETC_HOSTS)
2130 local_checksums = utils.FingerprintFiles(file_names)
2132 # Compute the set of hypervisor parameters
2134 for hv_name in hypervisors:
2135 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
2136 for os_name, os_hvp in cluster.os_hvp.items():
2137 for hv_name, hv_params in os_hvp.items():
2140 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
2141 hvp_data.append(("os %s" % os_name, hv_name, full_params))
2142 # TODO: collapse identical parameter values into a single one
2143 for instance in instanceinfo.values():
2144 if not instance.hvparams:
2146 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
2147 cluster.FillHV(instance)))
2148 # and verify them locally
2149 self._VerifyHVP(hvp_data)
2151 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
2152 node_verify_param = {
2153 constants.NV_FILELIST: file_names,
2154 constants.NV_NODELIST: [node.name for node in nodeinfo
2155 if not node.offline],
2156 constants.NV_HYPERVISOR: hypervisors,
2157 constants.NV_HVPARAMS: hvp_data,
2158 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
2159 node.secondary_ip) for node in nodeinfo
2160 if not node.offline],
2161 constants.NV_INSTANCELIST: hypervisors,
2162 constants.NV_VERSION: None,
2163 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2164 constants.NV_NODESETUP: None,
2165 constants.NV_TIME: None,
2166 constants.NV_MASTERIP: (master_node, master_ip),
2167 constants.NV_OSLIST: None,
2168 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2171 if vg_name is not None:
2172 node_verify_param[constants.NV_VGLIST] = None
2173 node_verify_param[constants.NV_LVLIST] = vg_name
2174 node_verify_param[constants.NV_PVLIST] = [vg_name]
2175 node_verify_param[constants.NV_DRBDLIST] = None
2178 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2180 # Build our expected cluster state
2181 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2183 vm_capable=node.vm_capable))
2184 for node in nodeinfo)
2188 for node in nodeinfo:
2189 path = _SupportsOob(self.cfg, node)
2190 if path and path not in oob_paths:
2191 oob_paths.append(path)
2194 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2196 for instance in instancelist:
2197 inst_config = instanceinfo[instance]
2199 for nname in inst_config.all_nodes:
2200 if nname not in node_image:
2202 gnode = self.NodeImage(name=nname)
2204 node_image[nname] = gnode
2206 inst_config.MapLVsByNode(node_vol_should)
2208 pnode = inst_config.primary_node
2209 node_image[pnode].pinst.append(instance)
2211 for snode in inst_config.secondary_nodes:
2212 nimg = node_image[snode]
2213 nimg.sinst.append(instance)
2214 if pnode not in nimg.sbp:
2215 nimg.sbp[pnode] = []
2216 nimg.sbp[pnode].append(instance)
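# nimg.sbp maps each primary node to the list of instances that have
# this node as secondary ("secondary by primary")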
2218 # At this point, we have the in-memory data structures complete,
2219 # except for the runtime information, which we'll gather next
2221 # Due to the way our RPC system works, exact response times cannot be
2222 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2223 # time before and after executing the request, we can at least have a time window
2225 nvinfo_starttime = time.time()
2226 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
2227 self.cfg.GetClusterName())
2228 nvinfo_endtime = time.time()
2230 all_drbd_map = self.cfg.ComputeDRBDMap()
2232 feedback_fn("* Gathering disk information (%s nodes)" % len(nodelist))
2233 instdisk = self._CollectDiskInfo(nodelist, node_image, instanceinfo)
2235 feedback_fn("* Verifying node status")
2239 for node_i in nodeinfo:
2241 nimg = node_image[node]
2245 feedback_fn("* Skipping offline node %s" % (node,))
2249 if node == master_node:
2251 elif node_i.master_candidate:
2252 ntype = "master candidate"
2253 elif node_i.drained:
2259 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2261 msg = all_nvinfo[node].fail_msg
2262 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2264 nimg.rpc_fail = True
2267 nresult = all_nvinfo[node].payload
2269 nimg.call_ok = self._VerifyNode(node_i, nresult)
2270 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2271 self._VerifyNodeNetwork(node_i, nresult)
2272 self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
2275 self._VerifyOob(node_i, nresult)
2278 self._VerifyNodeLVM(node_i, nresult, vg_name)
2279 self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper,
2282 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2283 self._UpdateNodeInstances(node_i, nresult, nimg)
2284 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2285 self._UpdateNodeOS(node_i, nresult, nimg)
2286 if not nimg.os_fail:
2287 if refos_img is None:
2289 self._VerifyNodeOS(node_i, nimg, refos_img)
2291 feedback_fn("* Verifying instance status")
2292 for instance in instancelist:
2294 feedback_fn("* Verifying instance %s" % instance)
2295 inst_config = instanceinfo[instance]
2296 self._VerifyInstance(instance, inst_config, node_image,
2298 inst_nodes_offline = []
2300 pnode = inst_config.primary_node
2301 pnode_img = node_image[pnode]
2302 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2303 self.ENODERPC, pnode, "instance %s, connection to"
2304 " primary node failed", instance)
2306 _ErrorIf(pnode_img.offline, self.EINSTANCEBADNODE, instance,
2307 "instance lives on offline node %s", inst_config.primary_node)
2309 # If the instance is non-redundant we cannot survive losing its primary
2310 # node, so we are not N+1 compliant. On the other hand we have no disk
2311 # templates with more than one secondary, so that situation is not well supported
2313 # FIXME: does not support file-backed instances
2314 if not inst_config.secondary_nodes:
2315 i_non_redundant.append(instance)
2317 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2318 instance, "instance has multiple secondary nodes: %s",
2319 utils.CommaJoin(inst_config.secondary_nodes),
2320 code=self.ETYPE_WARNING)
2322 if inst_config.disk_template in constants.DTS_NET_MIRROR:
2323 pnode = inst_config.primary_node
2324 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2325 instance_groups = {}
2327 for node in instance_nodes:
2328 instance_groups.setdefault(nodeinfo_byname[node].group,
2332 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2333 # Sort so that we always list the primary node first.
2334 for group, nodes in sorted(instance_groups.items(),
2335 key=lambda (_, nodes): pnode in nodes,
2338 self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS,
2339 instance, "instance has primary and secondary nodes in"
2340 " different groups: %s", utils.CommaJoin(pretty_list),
2341 code=self.ETYPE_WARNING)
2343 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2344 i_non_a_balanced.append(instance)
2346 for snode in inst_config.secondary_nodes:
2347 s_img = node_image[snode]
2348 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2349 "instance %s, connection to secondary node failed", instance)
2352 inst_nodes_offline.append(snode)
2354 # warn that the instance lives on offline nodes
2355 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2356 "instance has offline secondary node(s) %s",
2357 utils.CommaJoin(inst_nodes_offline))
2358 # ... or ghost/non-vm_capable nodes
2359 for node in inst_config.all_nodes:
2360 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2361 "instance lives on ghost node %s", node)
2362 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2363 instance, "instance lives on non-vm_capable node %s", node)
2365 feedback_fn("* Verifying orphan volumes")
2366 reserved = utils.FieldSet(*cluster.reserved_lvs)
2367 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2369 feedback_fn("* Verifying orphan instances")
2370 self._VerifyOrphanInstances(instancelist, node_image)
2372 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2373 feedback_fn("* Verifying N+1 Memory redundancy")
2374 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2376 feedback_fn("* Other Notes")
2378 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2379 % len(i_non_redundant))
2381 if i_non_a_balanced:
2382 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2383 % len(i_non_a_balanced))
2386 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2389 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2393 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2394 """Analyze the post-hooks' result
2396 This method analyzes the hook result, handles it, and sends some
2397 nicely-formatted feedback back to the user.
2399 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2400 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2401 @param hooks_results: the results of the multi-node hooks rpc call
2402 @param feedback_fn: function used to send feedback back to the caller
2403 @param lu_result: previous Exec result
2404 @return: the new Exec result, based on the previous result
2408 # We only really run POST phase hooks, and are only interested in their results
2410 if phase == constants.HOOKS_PHASE_POST:
2411 # Used to change hooks' output to proper indentation
2412 feedback_fn("* Hooks Results")
2413 assert hooks_results, "invalid result from hooks"
2415 for node_name in hooks_results:
2416 res = hooks_results[node_name]
2418 test = msg and not res.offline
2419 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2420 "Communication failure in hooks execution: %s", msg)
2421 if res.offline or msg:
2422 # No need to investigate payload if node is offline or gave an error.
2423 # manually override lu_result here, as _ErrorIf only
2424 # overrides self.bad
2427 for script, hkr, output in res.payload:
2428 test = hkr == constants.HKR_FAIL
2429 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2430 "Script %s failed, output:", script)
2432 output = self._HOOKS_INDENT_RE.sub(' ', output)
2433 feedback_fn("%s" % output)
2439 class LUClusterVerifyDisks(NoHooksLU):
2440 """Verifies the cluster disks status.
2445 def ExpandNames(self):
2446 self.needed_locks = {
2447 locking.LEVEL_NODE: locking.ALL_SET,
2448 locking.LEVEL_INSTANCE: locking.ALL_SET,
2450 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2452 def Exec(self, feedback_fn):
2453 """Verify integrity of cluster disks.
2455 @rtype: tuple of three items
2456 @return: a tuple of (dict of node-to-node_error, list of instances
2457 which need activate-disks, dict of instance: (node, volume) for missing volumes)
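A sketch of a possible result, with illustrative values (LV names
assumed to be in "vg/name" form)::

  ({"node3": "rpc error"}, ["inst1"],
   {"inst2": [("node2", "xenvg/disk0")]})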
2461 result = res_nodes, res_instances, res_missing = {}, [], {}
2463 nodes = utils.NiceSort(self.cfg.GetVmCapableNodeList())
2464 instances = self.cfg.GetAllInstancesInfo().values()
2467 for inst in instances:
2469 if not inst.admin_up:
2471 inst.MapLVsByNode(inst_lvs)
2472 # transform { iname: {node: [vol,],},} to {(node, vol): iname}
2473 for node, vol_list in inst_lvs.iteritems():
2474 for vol in vol_list:
2475 nv_dict[(node, vol)] = inst
2480 node_lvs = self.rpc.call_lv_list(nodes, [])
2481 for node, node_res in node_lvs.items():
2482 if node_res.offline:
2484 msg = node_res.fail_msg
2486 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
2487 res_nodes[node] = msg
2490 lvs = node_res.payload
2491 for lv_name, (_, _, lv_online) in lvs.items():
2492 inst = nv_dict.pop((node, lv_name), None)
2493 if (not lv_online and inst is not None
2494 and inst.name not in res_instances):
2495 res_instances.append(inst.name)
2497 # any leftover items in nv_dict are missing LVs, let's arrange the data better
2499 for key, inst in nv_dict.iteritems():
2500 if inst.name not in res_missing:
2501 res_missing[inst.name] = []
2502 res_missing[inst.name].append(key)
2507 class LUClusterRepairDiskSizes(NoHooksLU):
2508 """Verifies the cluster disks sizes.
2513 def ExpandNames(self):
2514 if self.op.instances:
2515 self.wanted_names = []
2516 for name in self.op.instances:
2517 full_name = _ExpandInstanceName(self.cfg, name)
2518 self.wanted_names.append(full_name)
2519 self.needed_locks = {
2520 locking.LEVEL_NODE: [],
2521 locking.LEVEL_INSTANCE: self.wanted_names,
2523 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
2525 self.wanted_names = None
2526 self.needed_locks = {
2527 locking.LEVEL_NODE: locking.ALL_SET,
2528 locking.LEVEL_INSTANCE: locking.ALL_SET,
2530 self.share_locks = dict(((i, 1) for i in locking.LEVELS))
2532 def DeclareLocks(self, level):
2533 if level == locking.LEVEL_NODE and self.wanted_names is not None:
2534 self._LockInstancesNodes(primary_only=True)
2536 def CheckPrereq(self):
2537 """Check prerequisites.
2539 This only checks the optional instance list against the existing names.
2542 if self.wanted_names is None:
2543 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
2545 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2546 in self.wanted_names]
2548 def _EnsureChildSizes(self, disk):
2549 """Ensure children of the disk have the needed disk size.
2551 This is valid mainly for DRBD8 and fixes an issue where the
2552 children have a smaller disk size.
2554 @param disk: an L{ganeti.objects.Disk} object
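@return: True if any child disk size had to be changed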
2557 if disk.dev_type == constants.LD_DRBD8:
2558 assert disk.children, "Empty children for DRBD8?"
2559 fchild = disk.children[0]
2560 mismatch = fchild.size < disk.size
2562 self.LogInfo("Child disk has size %d, parent %d, fixing",
2563 fchild.size, disk.size)
2564 fchild.size = disk.size
2566 # and we recurse on this child only, not on the metadev
2567 return self._EnsureChildSizes(fchild) or mismatch
2571 def Exec(self, feedback_fn):
2572 """Verify the size of cluster disks.
2575 # TODO: check child disks too
2576 # TODO: check differences in size between primary/secondary nodes
2578 for instance in self.wanted_instances:
2579 pnode = instance.primary_node
2580 if pnode not in per_node_disks:
2581 per_node_disks[pnode] = []
2582 for idx, disk in enumerate(instance.disks):
2583 per_node_disks[pnode].append((instance, idx, disk))
2586 for node, dskl in per_node_disks.items():
2587 newl = [v[2].Copy() for v in dskl]
2589 self.cfg.SetDiskID(dsk, node)
2590 result = self.rpc.call_blockdev_getsize(node, newl)
2592 self.LogWarning("Failure in blockdev_getsize call to node"
2593 " %s, ignoring", node)
2595 if len(result.payload) != len(dskl):
2596 logging.warning("Invalid result from node %s: len(dskl)=%d,"
2597 " result.payload=%s", node, len(dskl), result.payload)
2598 self.LogWarning("Invalid result from node %s, ignoring node results",
2601 for ((instance, idx, disk), size) in zip(dskl, result.payload):
2603 self.LogWarning("Disk %d of instance %s did not return size"
2604 " information, ignoring", idx, instance.name)
2606 if not isinstance(size, (int, long)):
2607 self.LogWarning("Disk %d of instance %s did not return valid"
2608 " size information, ignoring", idx, instance.name)
2611 if size != disk.size:
2612 self.LogInfo("Disk %d of instance %s has mismatched size,"
2613 " correcting: recorded %d, actual %d", idx,
2614 instance.name, disk.size, size)
2616 self.cfg.Update(instance, feedback_fn)
2617 changed.append((instance.name, idx, size))
2618 if self._EnsureChildSizes(disk):
2619 self.cfg.Update(instance, feedback_fn)
2620 changed.append((instance.name, idx, disk.size))
2624 class LUClusterRename(LogicalUnit):
2625 """Rename the cluster.
2628 HPATH = "cluster-rename"
2629 HTYPE = constants.HTYPE_CLUSTER
2631 def BuildHooksEnv(self):
2636 "OP_TARGET": self.cfg.GetClusterName(),
2637 "NEW_NAME": self.op.name,
2639 mn = self.cfg.GetMasterNode()
2640 all_nodes = self.cfg.GetNodeList()
2641 return env, [mn], all_nodes
2643 def CheckPrereq(self):
2644 """Verify that the passed name is a valid one.
2647 hostname = netutils.GetHostname(name=self.op.name,
2648 family=self.cfg.GetPrimaryIPFamily())
2650 new_name = hostname.name
2651 self.ip = new_ip = hostname.ip
2652 old_name = self.cfg.GetClusterName()
2653 old_ip = self.cfg.GetMasterIP()
2654 if new_name == old_name and new_ip == old_ip:
2655 raise errors.OpPrereqError("Neither the name nor the IP address of the"
2656 " cluster has changed",
2658 if new_ip != old_ip:
2659 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
2660 raise errors.OpPrereqError("The given cluster IP address (%s) is"
2661 " reachable on the network" %
2662 new_ip, errors.ECODE_NOTUNIQUE)
2664 self.op.name = new_name
2666 def Exec(self, feedback_fn):
2667 """Rename the cluster.
2670 clustername = self.op.name
2673 # shutdown the master IP
2674 master = self.cfg.GetMasterNode()
2675 result = self.rpc.call_node_stop_master(master, False)
2676 result.Raise("Could not disable the master role")
2679 cluster = self.cfg.GetClusterInfo()
2680 cluster.cluster_name = clustername
2681 cluster.master_ip = ip
2682 self.cfg.Update(cluster, feedback_fn)
2684 # update the known hosts file
2685 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
2686 node_list = self.cfg.GetOnlineNodeList()
2688 node_list.remove(master)
2691 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
2693 result = self.rpc.call_node_start_master(master, False, False)
2694 msg = result.fail_msg
2696 self.LogWarning("Could not re-enable the master role on"
2697 " the master, please restart manually: %s", msg)
2702 class LUClusterSetParams(LogicalUnit):
2703 """Change the parameters of the cluster.
2706 HPATH = "cluster-modify"
2707 HTYPE = constants.HTYPE_CLUSTER
2710 def CheckArguments(self):
2714 if self.op.uid_pool:
2715 uidpool.CheckUidPool(self.op.uid_pool)
2717 if self.op.add_uids:
2718 uidpool.CheckUidPool(self.op.add_uids)
2720 if self.op.remove_uids:
2721 uidpool.CheckUidPool(self.op.remove_uids)
2723 def ExpandNames(self):
2724 # FIXME: in the future maybe other cluster params won't require checking on
2725 # all nodes to be modified.
2726 self.needed_locks = {
2727 locking.LEVEL_NODE: locking.ALL_SET,
2729 self.share_locks[locking.LEVEL_NODE] = 1
2731 def BuildHooksEnv(self):
2736 "OP_TARGET": self.cfg.GetClusterName(),
2737 "NEW_VG_NAME": self.op.vg_name,
2739 mn = self.cfg.GetMasterNode()
2740 return env, [mn], [mn]
2742 def CheckPrereq(self):
2743 """Check prerequisites.
2745 This checks whether the given params don't conflict and
2746 if the given volume group is valid.
2749 if self.op.vg_name is not None and not self.op.vg_name:
2750 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
2751 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
2752 " instances exist", errors.ECODE_INVAL)
2754 if self.op.drbd_helper is not None and not self.op.drbd_helper:
2755 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
2756 raise errors.OpPrereqError("Cannot disable drbd helper while"
2757 " drbd-based instances exist",
2760 node_list = self.acquired_locks[locking.LEVEL_NODE]
2762 # if vg_name is not None, check the given volume group on all nodes
2764 vglist = self.rpc.call_vg_list(node_list)
2765 for node in node_list:
2766 msg = vglist[node].fail_msg
2768 # ignoring down node
2769 self.LogWarning("Error while gathering data on node %s"
2770 " (ignoring node): %s", node, msg)
2772 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2774 constants.MIN_VG_SIZE)
2776 raise errors.OpPrereqError("Error on node '%s': %s" %
2777 (node, vgstatus), errors.ECODE_ENVIRON)
2779 if self.op.drbd_helper:
2780 # check the given drbd helper on all nodes
2781 helpers = self.rpc.call_drbd_helper(node_list)
2782 for node in node_list:
2783 ninfo = self.cfg.GetNodeInfo(node)
2785 self.LogInfo("Not checking drbd helper on offline node %s", node)
2787 msg = helpers[node].fail_msg
2789 raise errors.OpPrereqError("Error checking drbd helper on node"
2790 " '%s': %s" % (node, msg),
2791 errors.ECODE_ENVIRON)
2792 node_helper = helpers[node].payload
2793 if node_helper != self.op.drbd_helper:
2794 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
2795 (node, node_helper), errors.ECODE_ENVIRON)
2797 self.cluster = cluster = self.cfg.GetClusterInfo()
2798 # validate params changes
2799 if self.op.beparams:
2800 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2801 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2803 if self.op.ndparams:
2804 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
2805 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
2807 # TODO: we need a more general way to handle resetting
2808 # cluster-level parameters to default values
2809 if self.new_ndparams["oob_program"] == "":
2810 self.new_ndparams["oob_program"] = \
2811 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
2813 if self.op.nicparams:
2814 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2815 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2816 objects.NIC.CheckParameterSyntax(self.new_nicparams)
2819 # check all instances for consistency
2820 for instance in self.cfg.GetAllInstancesInfo().values():
2821 for nic_idx, nic in enumerate(instance.nics):
2822 params_copy = copy.deepcopy(nic.nicparams)
2823 params_filled = objects.FillDict(self.new_nicparams, params_copy)
2825 # check parameter syntax
2827 objects.NIC.CheckParameterSyntax(params_filled)
2828 except errors.ConfigurationError, err:
2829 nic_errors.append("Instance %s, nic/%d: %s" %
2830 (instance.name, nic_idx, err))
2832 # if we're moving instances to routed, check that they have an ip
2833 target_mode = params_filled[constants.NIC_MODE]
2834 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
2835 nic_errors.append("Instance %s, nic/%d: routed NIC with no IP" %
2836 (instance.name, nic_idx))
2838 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
2839 "\n".join(nic_errors))
2841 # hypervisor list/parameters
2842 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
2843 if self.op.hvparams:
2844 for hv_name, hv_dict in self.op.hvparams.items():
2845 if hv_name not in self.new_hvparams:
2846 self.new_hvparams[hv_name] = hv_dict
2848 self.new_hvparams[hv_name].update(hv_dict)
2850 # os hypervisor parameters
2851 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
2853 for os_name, hvs in self.op.os_hvp.items():
2854 if os_name not in self.new_os_hvp:
2855 self.new_os_hvp[os_name] = hvs
2857 for hv_name, hv_dict in hvs.items():
2858 if hv_name not in self.new_os_hvp[os_name]:
2859 self.new_os_hvp[os_name][hv_name] = hv_dict
2861 self.new_os_hvp[os_name][hv_name].update(hv_dict)
2864 self.new_osp = objects.FillDict(cluster.osparams, {})
2865 if self.op.osparams:
2866 for os_name, osp in self.op.osparams.items():
2867 if os_name not in self.new_osp:
2868 self.new_osp[os_name] = {}
2870 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
2873 if not self.new_osp[os_name]:
2874 # we removed all parameters
2875 del self.new_osp[os_name]
2877 # check the parameter validity (remote check)
2878 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
2879 os_name, self.new_osp[os_name])
2881 # changes to the hypervisor list
2882 if self.op.enabled_hypervisors is not None:
2883 self.hv_list = self.op.enabled_hypervisors
2884 for hv in self.hv_list:
2885 # if the hypervisor doesn't already exist in the cluster
2886 # hvparams, we initialize it to empty, and then (in both
2887 # cases) we make sure to fill the defaults, as we might not
2888 # have a complete defaults list if the hypervisor wasn't enabled before
2890 if hv not in new_hvp:
2892 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
2893 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
2895 self.hv_list = cluster.enabled_hypervisors
2897 if self.op.hvparams or self.op.enabled_hypervisors is not None:
2898 # either the enabled list has changed, or the parameters have, validate
2899 for hv_name, hv_params in self.new_hvparams.items():
2900 if ((self.op.hvparams and hv_name in self.op.hvparams) or
2901 (self.op.enabled_hypervisors and
2902 hv_name in self.op.enabled_hypervisors)):
2903 # either this is a new hypervisor, or its parameters have changed
2904 hv_class = hypervisor.GetHypervisor(hv_name)
2905 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2906 hv_class.CheckParameterSyntax(hv_params)
2907 _CheckHVParams(self, node_list, hv_name, hv_params)
2910 # no need to check any newly-enabled hypervisors, since the
2911 # defaults have already been checked in the above code-block
2912 for os_name, os_hvp in self.new_os_hvp.items():
2913 for hv_name, hv_params in os_hvp.items():
2914 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2915 # we need to fill in the new os_hvp on top of the actual hv_p
2916 cluster_defaults = self.new_hvparams.get(hv_name, {})
2917 new_osp = objects.FillDict(cluster_defaults, hv_params)
2918 hv_class = hypervisor.GetHypervisor(hv_name)
2919 hv_class.CheckParameterSyntax(new_osp)
2920 _CheckHVParams(self, node_list, hv_name, new_osp)
2922 if self.op.default_iallocator:
2923 alloc_script = utils.FindFile(self.op.default_iallocator,
2924 constants.IALLOCATOR_SEARCH_PATH,
2926 if alloc_script is None:
2927 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
2928 " specified" % self.op.default_iallocator,
2931 def Exec(self, feedback_fn):
2932 """Change the parameters of the cluster.
2935 if self.op.vg_name is not None:
2936 new_volume = self.op.vg_name
2939 if new_volume != self.cfg.GetVGName():
2940 self.cfg.SetVGName(new_volume)
2942 feedback_fn("Cluster LVM configuration already in desired"
2943 " state, not changing")
2944 if self.op.drbd_helper is not None:
2945 new_helper = self.op.drbd_helper
2948 if new_helper != self.cfg.GetDRBDHelper():
2949 self.cfg.SetDRBDHelper(new_helper)
2951 feedback_fn("Cluster DRBD helper already in desired state,"
2953 if self.op.hvparams:
2954 self.cluster.hvparams = self.new_hvparams
2956 self.cluster.os_hvp = self.new_os_hvp
2957 if self.op.enabled_hypervisors is not None:
2958 self.cluster.hvparams = self.new_hvparams
2959 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
2960 if self.op.beparams:
2961 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
2962 if self.op.nicparams:
2963 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
2964 if self.op.osparams:
2965 self.cluster.osparams = self.new_osp
2966 if self.op.ndparams:
2967 self.cluster.ndparams = self.new_ndparams
2969 if self.op.candidate_pool_size is not None:
2970 self.cluster.candidate_pool_size = self.op.candidate_pool_size
2971 # we need to update the pool size here, otherwise the save will fail
2972 _AdjustCandidatePool(self, [])
2974 if self.op.maintain_node_health is not None:
2975 self.cluster.maintain_node_health = self.op.maintain_node_health
2977 if self.op.prealloc_wipe_disks is not None:
2978 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
2980 if self.op.add_uids is not None:
2981 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
2983 if self.op.remove_uids is not None:
2984 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
2986 if self.op.uid_pool is not None:
2987 self.cluster.uid_pool = self.op.uid_pool
2989 if self.op.default_iallocator is not None:
2990 self.cluster.default_iallocator = self.op.default_iallocator
2992 if self.op.reserved_lvs is not None:
2993 self.cluster.reserved_lvs = self.op.reserved_lvs
2995 def helper_os(aname, mods, desc):
2997 lst = getattr(self.cluster, aname)
2998 for key, val in mods:
2999 if key == constants.DDM_ADD:
3001 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3004 elif key == constants.DDM_REMOVE:
3008 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3010 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3012 if self.op.hidden_os:
3013 helper_os("hidden_os", self.op.hidden_os, "hidden")
3015 if self.op.blacklisted_os:
3016 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3018 if self.op.master_netdev:
3019 master = self.cfg.GetMasterNode()
3020 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3021 self.cluster.master_netdev)
3022 result = self.rpc.call_node_stop_master(master, False)
3023 result.Raise("Could not disable the master ip")
3024 feedback_fn("Changing master_netdev from %s to %s" %
3025 (self.cluster.master_netdev, self.op.master_netdev))
3026 self.cluster.master_netdev = self.op.master_netdev
3028 self.cfg.Update(self.cluster, feedback_fn)
3030 if self.op.master_netdev:
3031 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3032 self.op.master_netdev)
3033 result = self.rpc.call_node_start_master(master, False, False)
3035 self.LogWarning("Could not re-enable the master ip on"
3036 " the master, please restart manually: %s",
3040 def _UploadHelper(lu, nodes, fname):
3041 """Helper for uploading a file and showing warnings.
3044 if os.path.exists(fname):
3045 result = lu.rpc.call_upload_file(nodes, fname)
3046 for to_node, to_result in result.items():
3047 msg = to_result.fail_msg
3049 msg = ("Copy of file %s to node %s failed: %s" %
3050 (fname, to_node, msg))
3051 lu.proc.LogWarning(msg)
3054 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3055 """Distribute additional files which are part of the cluster configuration.
3057 ConfigWriter takes care of distributing the config and ssconf files, but
3058 there are more files which should be distributed to all nodes. This function
3059 makes sure those are copied.
3061 @param lu: calling logical unit
3062 @param additional_nodes: list of nodes not in the config to distribute to
3063 @type additional_vm: boolean
3064 @param additional_vm: whether the additional nodes are vm-capable or not
3067 # 1. Gather target nodes
3068 myself = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3069 dist_nodes = lu.cfg.GetOnlineNodeList()
3070 nvm_nodes = lu.cfg.GetNonVmCapableNodeList()
3071 vm_nodes = [name for name in dist_nodes if name not in nvm_nodes]
3072 if additional_nodes is not None:
3073 dist_nodes.extend(additional_nodes)
3075 vm_nodes.extend(additional_nodes)
3076 if myself.name in dist_nodes:
3077 dist_nodes.remove(myself.name)
3078 if myself.name in vm_nodes:
3079 vm_nodes.remove(myself.name)
3081 # 2. Gather files to distribute
3082 dist_files = set([constants.ETC_HOSTS,
3083 constants.SSH_KNOWN_HOSTS_FILE,
3084 constants.RAPI_CERT_FILE,
3085 constants.RAPI_USERS_FILE,
3086 constants.CONFD_HMAC_KEY,
3087 constants.CLUSTER_DOMAIN_SECRET_FILE,
3091 enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
3092 for hv_name in enabled_hypervisors:
3093 hv_class = hypervisor.GetHypervisor(hv_name)
3094 vm_files.update(hv_class.GetAncillaryFiles())
3096 # 3. Perform the files upload
3097 for fname in dist_files:
3098 _UploadHelper(lu, dist_nodes, fname)
3099 for fname in vm_files:
3100 _UploadHelper(lu, vm_nodes, fname)
3103 class LUClusterRedistConf(NoHooksLU):
3104 """Force the redistribution of cluster configuration.
3106 This is a very simple LU.
3111 def ExpandNames(self):
3112 self.needed_locks = {
3113 locking.LEVEL_NODE: locking.ALL_SET,
3115 self.share_locks[locking.LEVEL_NODE] = 1
3117 def Exec(self, feedback_fn):
3118 """Redistribute the configuration.
3121 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3122 _RedistributeAncillaryFiles(self)
3125 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3126 """Sleep and poll for an instance's disk to sync.
3129 if not instance.disks or (disks is not None and not disks):
3132 disks = _ExpandCheckDisks(instance, disks)
3135 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3137 node = instance.primary_node
3140 lu.cfg.SetDiskID(dev, node)
3142 # TODO: Convert to utils.Retry
3145 degr_retries = 10 # in seconds, as we sleep 1 second each time
3149 cumul_degraded = False
3150 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3151 msg = rstats.fail_msg
3153 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3156 raise errors.RemoteError("Can't contact node %s for mirror data,"
3157 " aborting." % node)
3160 rstats = rstats.payload
3162 for i, mstat in enumerate(rstats):
3164 lu.LogWarning("Can't compute data for node %s/%s",
3165 node, disks[i].iv_name)
3168 cumul_degraded = (cumul_degraded or
3169 (mstat.is_degraded and mstat.sync_percent is None))
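# a device that is degraded but reports no sync progress counts as
# (still) degraded for the retry logic below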
3170 if mstat.sync_percent is not None:
3172 if mstat.estimated_time is not None:
3173 rem_time = ("%s remaining (estimated)" %
3174 utils.FormatSeconds(mstat.estimated_time))
3175 max_time = mstat.estimated_time
3177 rem_time = "no time estimate"
3178 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3179 (disks[i].iv_name, mstat.sync_percent, rem_time))
3181 # if we're done but degraded, let's do a few small retries, to
3182 # make sure we see a stable and not transient situation; therefore
3183 # we force restart of the loop
3184 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3185 logging.info("Degraded disks found, %d retries left", degr_retries)
3193 time.sleep(min(60, max_time))
3196 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3197 return not cumul_degraded
3200 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3201 """Check that mirrors are not degraded.
3203 The ldisk parameter, if True, will change the test from the
3204 is_degraded attribute (which represents overall non-ok status for
3205 the device(s)) to the ldisk (representing the local storage status).
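@return: True if the checked device (and, recursively, its children)
    passes the consistency check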
3208 lu.cfg.SetDiskID(dev, node)
3212 if on_primary or dev.AssembleOnSecondary():
3213 rstats = lu.rpc.call_blockdev_find(node, dev)
3214 msg = rstats.fail_msg
3216 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3218 elif not rstats.payload:
3219 lu.LogWarning("Can't find disk on node %s", node)
3223 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
3225 result = result and not rstats.payload.is_degraded
3228 for child in dev.children:
3229 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
3234 class LUOobCommand(NoHooksLU):
3235 """Logical unit for OOB handling.
3240 def CheckPrereq(self):
3241 """Check prerequisites.
3244 - the node exists in the configuration
3247 Any errors are signaled by raising errors.OpPrereqError.
3251 for node_name in self.op.node_names:
3252 node = self.cfg.GetNodeInfo(node_name)
3255 raise errors.OpPrereqError("Node %s not found" % node_name,
3258 self.nodes.append(node)
3260 if (self.op.command == constants.OOB_POWER_OFF and not node.offline):
3261 raise errors.OpPrereqError(("Cannot power off node %s because it is"
3262 " not marked offline") % node_name,
3265 def ExpandNames(self):
3266 """Gather locks we need.
3269 if self.op.node_names:
3270 self.op.node_names = [_ExpandNodeName(self.cfg, name)
3271 for name in self.op.node_names]
3273 self.op.node_names = self.cfg.GetNodeList()
3275 self.needed_locks = {
3276 locking.LEVEL_NODE: self.op.node_names,
3279 def Exec(self, feedback_fn):
3280 """Execute OOB and return result if we expect any.
3283 master_node = self.cfg.GetMasterNode()
3286 for node in self.nodes:
3287 node_entry = [(constants.RS_NORMAL, node.name)]
3288 ret.append(node_entry)
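# each node entry is a list of (status, data) tuples; the first tuple
# carries the node name, later appends carry the per-command results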
3290 oob_program = _SupportsOob(self.cfg, node)
3293 node_entry.append((constants.RS_UNAVAIL, None))
3296 logging.info("Executing out-of-band command '%s' using '%s' on %s",
3297 self.op.command, oob_program, node.name)
3298 result = self.rpc.call_run_oob(master_node, oob_program,
3299 self.op.command, node.name,
3303 self.LogWarning("On node '%s' out-of-band RPC failed with: %s",
3304 node.name, result.fail_msg)
3305 node_entry.append((constants.RS_NODATA, None))
3308 self._CheckPayload(result)
3309 except errors.OpExecError, err:
3310 self.LogWarning("The payload returned by '%s' is not valid: %s",
3312 node_entry.append((constants.RS_NODATA, None))
3314 if self.op.command == constants.OOB_HEALTH:
3315 # For health we should log important events
3316 for item, status in result.payload:
3317 if status in [constants.OOB_STATUS_WARNING,
3318 constants.OOB_STATUS_CRITICAL]:
3319 self.LogWarning("On node '%s' item '%s' has status '%s'",
3320 node.name, item, status)
3322 if self.op.command == constants.OOB_POWER_ON:
3324 elif self.op.command == constants.OOB_POWER_OFF:
3325 node.powered = False
3326 elif self.op.command == constants.OOB_POWER_STATUS:
3327 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
3328 if powered != node.powered:
3329 logging.warning(("Recorded power state (%s) of node '%s' does not"
3330 " match actual power state (%s)"), node.powered,
3333 # For configuration changing commands we should update the node
3334 if self.op.command in (constants.OOB_POWER_ON,
3335 constants.OOB_POWER_OFF):
3336 self.cfg.Update(node, feedback_fn)
3338 node_entry.append((constants.RS_NORMAL, result.payload))
3342 def _CheckPayload(self, result):
3343 """Checks if the payload is valid.
3345 @param result: RPC result
3346 @raises errors.OpExecError: If payload is not valid
3350 if self.op.command == constants.OOB_HEALTH:
3351 if not isinstance(result.payload, list):
3352 errs.append("command 'health' is expected to return a list but got %s" %
3353 type(result.payload))
3355 for item, status in result.payload:
3356 if status not in constants.OOB_STATUSES:
3357 errs.append("health item '%s' has invalid status '%s'" %
3360 if self.op.command == constants.OOB_POWER_STATUS:
3361 if not isinstance(result.payload, dict):
3362 errs.append("power-status is expected to return a dict but got %s" %
3363 type(result.payload))
3365 if self.op.command in [
3366 constants.OOB_POWER_ON,
3367 constants.OOB_POWER_OFF,
3368 constants.OOB_POWER_CYCLE,
3370 if result.payload is not None:
3371 errs.append("%s is expected to not return payload but got '%s'" %
3372 (self.op.command, result.payload))
3375 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
3376 utils.CommaJoin(errs))
3380 class LUOsDiagnose(NoHooksLU):
3381 """Logical unit for OS diagnose/query.
3386 _BLK = "blacklisted"
3388 _FIELDS_STATIC = utils.FieldSet()
3389 _FIELDS_DYNAMIC = utils.FieldSet("name", _VLD, "node_status", "variants",
3390 "parameters", "api_versions", _HID, _BLK)
3392 def CheckArguments(self):
3394 raise errors.OpPrereqError("Selective OS query not supported",
3397 _CheckOutputFields(static=self._FIELDS_STATIC,
3398 dynamic=self._FIELDS_DYNAMIC,
3399 selected=self.op.output_fields)
3401 def ExpandNames(self):
3402 # Lock all nodes, in shared mode
3403 # Temporary removal of locks, should be reverted later
3404 # TODO: reintroduce locks when they are lighter-weight
3405 self.needed_locks = {}
3406 #self.share_locks[locking.LEVEL_NODE] = 1
3407 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3410 def _DiagnoseByOS(rlist):
3411 """Remaps a per-node return list into an a per-os per-node dictionary
3413 @param rlist: a map with node names as keys and OS objects as values
3416 @return: a dictionary with OS names as keys and, as values, another
3417 map, with nodes as keys and tuples of (path, status, diagnose,
3418 variants, parameters, api_versions) as values, eg::
3420 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
3421 (/srv/..., False, "invalid api")],
3422 "node2": [(/srv/..., True, "", [], [])]}
3427 # we build here the list of nodes that didn't fail the RPC (at RPC
3428 # level), so that nodes with a non-responding node daemon don't
3429 # make all OSes invalid
3430 good_nodes = [node_name for node_name in rlist
3431 if not rlist[node_name].fail_msg]
3432 for node_name, nr in rlist.items():
3433 if nr.fail_msg or not nr.payload:
3435 for (name, path, status, diagnose, variants,
3436 params, api_versions) in nr.payload:
3437 if name not in all_os:
3438 # build a list of nodes for this os containing empty lists
3439 # for each node in node_list
3441 for nname in good_nodes:
3442 all_os[name][nname] = []
3443 # convert params from [name, help] to (name, help)
3444 params = [tuple(v) for v in params]
3445 all_os[name][node_name].append((path, status, diagnose,
3446 variants, params, api_versions))
3449 def Exec(self, feedback_fn):
3450 """Compute the list of OSes.
3453 valid_nodes = [node.name
3454 for node in self.cfg.GetAllNodesInfo().values()
3455 if not node.offline and node.vm_capable]
3456 node_data = self.rpc.call_os_diagnose(valid_nodes)
3457 pol = self._DiagnoseByOS(node_data)
3459 cluster = self.cfg.GetClusterInfo()
3461 for os_name in utils.NiceSort(pol.keys()):
3462 os_data = pol[os_name]
3465 (variants, params, api_versions) = null_state = (set(), set(), set())
3466 for idx, osl in enumerate(os_data.values()):
3467 valid = bool(valid and osl and osl[0][1])
3469 (variants, params, api_versions) = null_state
3471 node_variants, node_params, node_api = osl[0][3:6]
3472 if idx == 0: # first entry
3473 variants = set(node_variants)
3474 params = set(node_params)
3475 api_versions = set(node_api)
3476 else: # keep consistency
3477 variants.intersection_update(node_variants)
3478 params.intersection_update(node_params)
3479 api_versions.intersection_update(node_api)
3481 is_hid = os_name in cluster.hidden_os
3482 is_blk = os_name in cluster.blacklisted_os
3483 if ((self._HID not in self.op.output_fields and is_hid) or
3484 (self._BLK not in self.op.output_fields and is_blk) or
3485 (self._VLD not in self.op.output_fields and not valid)):
3488 for field in self.op.output_fields:
3491 elif field == self._VLD:
3493 elif field == "node_status":
3494 # this is just a copy of the dict
3496 for node_name, nos_list in os_data.items():
3497 val[node_name] = nos_list
3498 elif field == "variants":
3499 val = utils.NiceSort(list(variants))
3500 elif field == "parameters":
3502 elif field == "api_versions":
3503 val = list(api_versions)
3504 elif field == self._HID:
3506 elif field == self._BLK:
3509 raise errors.ParameterError(field)
3516 class LUNodeRemove(LogicalUnit):
3517 """Logical unit for removing a node.
3520 HPATH = "node-remove"
3521 HTYPE = constants.HTYPE_NODE
3523 def BuildHooksEnv(self):
3526 This doesn't run on the target node in the pre phase as a failed
3527 node would then be impossible to remove.
3531 "OP_TARGET": self.op.node_name,
3532 "NODE_NAME": self.op.node_name,
3534 all_nodes = self.cfg.GetNodeList()
3536 all_nodes.remove(self.op.node_name)
3538 logging.warning("Node %s which is about to be removed not found"
3539 " in the all nodes list", self.op.node_name)
3540 return env, all_nodes, all_nodes
3542 def CheckPrereq(self):
3543 """Check prerequisites.
3546 - the node exists in the configuration
3547 - it does not have primary or secondary instances
3548 - it's not the master
3550 Any errors are signaled by raising errors.OpPrereqError.
3553 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3554 node = self.cfg.GetNodeInfo(self.op.node_name)
3555 assert node is not None
3557 instance_list = self.cfg.GetInstanceList()
3559 masternode = self.cfg.GetMasterNode()
3560 if node.name == masternode:
3561 raise errors.OpPrereqError("Node is the master node,"
3562 " you need to failover first.",
3565 for instance_name in instance_list:
3566 instance = self.cfg.GetInstanceInfo(instance_name)
3567 if node.name in instance.all_nodes:
3568 raise errors.OpPrereqError("Instance %s is still running on the node,"
3569 " please remove first." % instance_name,
3571 self.op.node_name = node.name
3574 def Exec(self, feedback_fn):
3575 """Removes the node from the cluster.
3579 logging.info("Stopping the node daemon and removing configs from node %s",
3582 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
3584 # Promote nodes to master candidate as needed
3585 _AdjustCandidatePool(self, exceptions=[node.name])
3586 self.context.RemoveNode(node.name)
3588 # Run post hooks on the node before it's removed
3589 hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
3591 hm.RunPhase(constants.HOOKS_PHASE_POST, [node.name])
3593 # pylint: disable-msg=W0702
3594 self.LogWarning("Errors occurred running hooks on %s" % node.name)
3596 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
3597 msg = result.fail_msg
3599 self.LogWarning("Errors encountered on the remote node while leaving"
3600 " the cluster: %s", msg)
3602 # Remove node from our /etc/hosts
3603 if self.cfg.GetClusterInfo().modify_etc_hosts:
3604 master_node = self.cfg.GetMasterNode()
3605 result = self.rpc.call_etc_hosts_modify(master_node,
3606 constants.ETC_HOSTS_REMOVE,
3608 result.Raise("Can't update hosts file with new host data")
3609 _RedistributeAncillaryFiles(self)
3612 class _NodeQuery(_QueryBase):
3613 FIELDS = query.NODE_FIELDS
3615 def ExpandNames(self, lu):
3616 lu.needed_locks = {}
3617 lu.share_locks[locking.LEVEL_NODE] = 1
3620 self.wanted = _GetWantedNodes(lu, self.names)
3622 self.wanted = locking.ALL_SET
3624 self.do_locking = (self.use_locking and
3625 query.NQ_LIVE in self.requested_data)
3628 # if we don't request only static fields, we need to lock the nodes
3629 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
3631 def DeclareLocks(self, lu, level):
3634 def _GetQueryData(self, lu):
3635 """Computes the list of nodes and their attributes.
3638 all_info = lu.cfg.GetAllNodesInfo()
3640 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
3642 # Gather data as requested
3643 if query.NQ_LIVE in self.requested_data:
3644 # filter out non-vm_capable nodes
3645 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
3647 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
3648 lu.cfg.GetHypervisorType())
3649 live_data = dict((name, nresult.payload)
3650 for (name, nresult) in node_data.items()
3651 if not nresult.fail_msg and nresult.payload)
3655 if query.NQ_INST in self.requested_data:
3656 node_to_primary = dict([(name, set()) for name in nodenames])
3657 node_to_secondary = dict([(name, set()) for name in nodenames])
3659 inst_data = lu.cfg.GetAllInstancesInfo()
3661 for inst in inst_data.values():
3662 if inst.primary_node in node_to_primary:
3663 node_to_primary[inst.primary_node].add(inst.name)
3664 for secnode in inst.secondary_nodes:
3665 if secnode in node_to_secondary:
3666 node_to_secondary[secnode].add(inst.name)
3668 node_to_primary = None
3669 node_to_secondary = None
3671 if query.NQ_OOB in self.requested_data:
3672 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
3673 for name, node in all_info.iteritems())
3677 if query.NQ_GROUP in self.requested_data:
3678 groups = lu.cfg.GetAllNodeGroupsInfo()
3682 return query.NodeQueryData([all_info[name] for name in nodenames],
3683 live_data, lu.cfg.GetMasterNode(),
3684 node_to_primary, node_to_secondary, groups,
3685 oob_support, lu.cfg.GetClusterInfo())
3688 class LUNodeQuery(NoHooksLU):
3689 """Logical unit for querying nodes.
3692 # pylint: disable-msg=W0142
3695 def CheckArguments(self):
3696 self.nq = _NodeQuery(self.op.names, self.op.output_fields,
3697 self.op.use_locking)
3699 def ExpandNames(self):
3700 self.nq.ExpandNames(self)
3702 def Exec(self, feedback_fn):
3703 return self.nq.OldStyleQuery(self)
3706 class LUNodeQueryvols(NoHooksLU):
3707 """Logical unit for getting volumes on node(s).
3711 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
3712 _FIELDS_STATIC = utils.FieldSet("node")
3714 def CheckArguments(self):
3715 _CheckOutputFields(static=self._FIELDS_STATIC,
3716 dynamic=self._FIELDS_DYNAMIC,
3717 selected=self.op.output_fields)
3719 def ExpandNames(self):
3720 self.needed_locks = {}
3721 self.share_locks[locking.LEVEL_NODE] = 1
3722 if not self.op.nodes:
3723 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3725 self.needed_locks[locking.LEVEL_NODE] = \
3726 _GetWantedNodes(self, self.op.nodes)
3728 def Exec(self, feedback_fn):
3729 """Computes the list of nodes and their attributes.
3732 nodenames = self.acquired_locks[locking.LEVEL_NODE]
3733 volumes = self.rpc.call_node_volumes(nodenames)
3735 ilist = [self.cfg.GetInstanceInfo(iname) for iname
3736 in self.cfg.GetInstanceList()]
3738 lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])
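# lv_by_node maps each instance object to its {node: [lv names]} map,
# so a volume found on a node can be attributed to an instance below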
3741 for node in nodenames:
3742 nresult = volumes[node]
3745 msg = nresult.fail_msg
3747 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
3750 node_vols = nresult.payload[:]
3751 node_vols.sort(key=lambda vol: vol['dev'])
3753 for vol in node_vols:
3755 for field in self.op.output_fields:
3758 elif field == "phys":
3762 elif field == "name":
3764 elif field == "size":
3765 val = int(float(vol['size']))
3766 elif field == "instance":
3768 if node not in lv_by_node[inst]:
3770 if vol['name'] in lv_by_node[inst][node]:
3776 raise errors.ParameterError(field)
3777 node_output.append(str(val))
3779 output.append(node_output)
3784 class LUNodeQueryStorage(NoHooksLU):
3785 """Logical unit for getting information on storage units on node(s).
3788 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
3791 def CheckArguments(self):
3792 _CheckOutputFields(static=self._FIELDS_STATIC,
3793 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
3794 selected=self.op.output_fields)
3796 def ExpandNames(self):
3797 self.needed_locks = {}
3798 self.share_locks[locking.LEVEL_NODE] = 1
3800 if self.op.nodes:
3801 self.needed_locks[locking.LEVEL_NODE] = \
3802 _GetWantedNodes(self, self.op.nodes)
3803 else:
3804 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3806 def Exec(self, feedback_fn):
3807 """Computes the list of nodes and their attributes.
3810 self.nodes = self.acquired_locks[locking.LEVEL_NODE]
3812 # Always get name to sort by
3813 if constants.SF_NAME in self.op.output_fields:
3814 fields = self.op.output_fields[:]
3815 else:
3816 fields = [constants.SF_NAME] + self.op.output_fields
3818 # Never ask for node or type as it's only known to the LU
3819 for extra in [constants.SF_NODE, constants.SF_TYPE]:
3820 while extra in fields:
3821 fields.remove(extra)
3823 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
3824 name_idx = field_idx[constants.SF_NAME]
3826 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
3827 data = self.rpc.call_storage_list(self.nodes,
3828 self.op.storage_type, st_args,
3829 self.op.name, fields)
3831 result = []
3833 for node in utils.NiceSort(self.nodes):
3834 nresult = data[node]
3835 if nresult.offline:
3836 continue
3838 msg = nresult.fail_msg
3839 if msg:
3840 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
3841 continue
3843 rows = dict([(row[name_idx], row) for row in nresult.payload])
3845 for name in utils.NiceSort(rows.keys()):
3846 row = rows[name]
3848 out = []
3850 for field in self.op.output_fields:
3851 if field == constants.SF_NODE:
3852 val = node
3853 elif field == constants.SF_TYPE:
3854 val = self.op.storage_type
3855 elif field in field_idx:
3856 val = row[field_idx[field]]
3857 else:
3858 raise errors.ParameterError(field)
3860 out.append(val)
3862 result.append(out)
3864 return result
3867 class _InstanceQuery(_QueryBase):
3868 FIELDS = query.INSTANCE_FIELDS
3870 def ExpandNames(self, lu):
3871 lu.needed_locks = {}
3872 lu.share_locks[locking.LEVEL_INSTANCE] = 1
3873 lu.share_locks[locking.LEVEL_NODE] = 1
3875 if self.names:
3876 self.wanted = _GetWantedInstances(lu, self.names)
3877 else:
3878 self.wanted = locking.ALL_SET
3880 self.do_locking = (self.use_locking and
3881 query.IQ_LIVE in self.requested_data)
3882 if self.do_locking:
3883 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
3884 lu.needed_locks[locking.LEVEL_NODE] = []
3885 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
3887 def DeclareLocks(self, lu, level):
3888 if level == locking.LEVEL_NODE and self.do_locking:
3889 lu._LockInstancesNodes() # pylint: disable-msg=W0212
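# Locking sketch: instance locks are only taken when live data (IQ_LIVE)
# was requested together with use_locking; the node locks are then derived
# from the acquired instance locks via _LockInstancesNodes.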
3891 def _GetQueryData(self, lu):
3892 """Computes the list of instances and their attributes.
3895 cluster = lu.cfg.GetClusterInfo()
3896 all_info = lu.cfg.GetAllInstancesInfo()
3898 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
3900 instance_list = [all_info[name] for name in instance_names]
3901 nodes = frozenset(itertools.chain(*(inst.all_nodes
3902 for inst in instance_list)))
3903 hv_list = list(set([inst.hypervisor for inst in instance_list]))
3904 bad_nodes = []
3905 offline_nodes = []
3906 wrongnode_inst = set()
3908 # Gather data as requested
3909 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
3910 live_data = {}
3911 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
3912 for name in nodes:
3913 result = node_data[name]
3914 if result.offline:
3915 # offline nodes will be in both lists
3916 assert result.fail_msg
3917 offline_nodes.append(name)
3918 if result.fail_msg:
3919 bad_nodes.append(name)
3920 elif result.payload:
3921 for inst in result.payload:
3922 if inst in all_info:
3923 if all_info[inst].primary_node == name:
3924 live_data.update(result.payload)
3925 else:
3926 wrongnode_inst.add(inst)
3927 else:
3928 # orphan instance; we don't list it here as we don't
3929 # handle this case yet in the output of instance listing
3930 logging.warning("Orphan instance '%s' found on node %s",
3931 inst, name)
3932 # else no instance is alive
3933 else:
3934 live_data = None
3936 if query.IQ_DISKUSAGE in self.requested_data:
3937 disk_usage = dict((inst.name,
3938 _ComputeDiskSize(inst.disk_template,
3939 [{"size": disk.size}
3940 for disk in inst.disks]))
3941 for inst in instance_list)
3942 else:
3943 disk_usage = None
3945 if query.IQ_CONSOLE in self.requested_data:
3946 consinfo = {}
3947 for inst in instance_list:
3948 if inst.name in live_data:
3949 # Instance is running
3950 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
3951 else:
3952 consinfo[inst.name] = None
3953 assert set(consinfo.keys()) == set(instance_names)
3954 else:
3955 consinfo = None
3957 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
3958 disk_usage, offline_nodes, bad_nodes,
3959 live_data, wrongnode_inst, consinfo)
3962 class LUQuery(NoHooksLU):
3963 """Query for resources/items of a certain kind.
3966 # pylint: disable-msg=W0142
3967 REQ_BGL = False
3969 def CheckArguments(self):
3970 qcls = _GetQueryImplementation(self.op.what)
3971 names = qlang.ReadSimpleFilter("name", self.op.filter)
3973 self.impl = qcls(names, self.op.fields, False)
3975 def ExpandNames(self):
3976 self.impl.ExpandNames(self)
3978 def DeclareLocks(self, level):
3979 self.impl.DeclareLocks(self, level)
3981 def Exec(self, feedback_fn):
3982 return self.impl.NewStyleQuery(self)
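# Rough distinction (a summary, not new behaviour): OldStyleQuery returns
# plain rows of values for the requested fields, while NewStyleQuery
# returns the richer per-field (status, value) format used by the query2
# interface.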
3985 class LUQueryFields(NoHooksLU):
3986 """Query for resources/items of a certain kind.
3989 # pylint: disable-msg=W0142
3990 REQ_BGL = False
3992 def CheckArguments(self):
3993 self.qcls = _GetQueryImplementation(self.op.what)
3995 def ExpandNames(self):
3996 self.needed_locks = {}
3998 def Exec(self, feedback_fn):
3999 return self.qcls.FieldsQuery(self.op.fields)
4002 class LUNodeModifyStorage(NoHooksLU):
4003 """Logical unit for modifying a storage volume on a node.
4008 def CheckArguments(self):
4009 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4011 storage_type = self.op.storage_type
4013 try:
4014 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4015 except KeyError:
4016 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4017 " modified" % storage_type,
4018 errors.ECODE_INVAL)
4020 diff = set(self.op.changes.keys()) - modifiable
4021 if diff:
4022 raise errors.OpPrereqError("The following fields can not be modified for"
4023 " storage units of type '%s': %r" %
4024 (storage_type, list(diff)),
4025 errors.ECODE_INVAL)
4027 def ExpandNames(self):
4028 self.needed_locks = {
4029 locking.LEVEL_NODE: self.op.node_name,
4032 def Exec(self, feedback_fn):
4033 """Computes the list of nodes and their attributes.
4036 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4037 result = self.rpc.call_storage_modify(self.op.node_name,
4038 self.op.storage_type, st_args,
4039 self.op.name, self.op.changes)
4040 result.Raise("Failed to modify storage unit '%s' on %s" %
4041 (self.op.name, self.op.node_name))
4044 class LUNodeAdd(LogicalUnit):
4045 """Logical unit for adding node to the cluster.
4049 HTYPE = constants.HTYPE_NODE
4050 _NFLAGS = ["master_capable", "vm_capable"]
4052 def CheckArguments(self):
4053 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4054 # validate/normalize the node name
4055 self.hostname = netutils.GetHostname(name=self.op.node_name,
4056 family=self.primary_ip_family)
4057 self.op.node_name = self.hostname.name
4059 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4060 raise errors.OpPrereqError("Cannot readd the master node",
4063 if self.op.readd and self.op.group:
4064 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4065 " being readded", errors.ECODE_INVAL)
4067 def BuildHooksEnv(self):
4068 """Build hooks env.
4070 This will run on all nodes before, and on all nodes + the new node after.
4072 """
4073 env = {
4074 "OP_TARGET": self.op.node_name,
4075 "NODE_NAME": self.op.node_name,
4076 "NODE_PIP": self.op.primary_ip,
4077 "NODE_SIP": self.op.secondary_ip,
4078 "MASTER_CAPABLE": str(self.op.master_capable),
4079 "VM_CAPABLE": str(self.op.vm_capable),
4080 }
4081 nodes_0 = self.cfg.GetNodeList()
4082 nodes_1 = nodes_0 + [self.op.node_name, ]
4083 return env, nodes_0, nodes_1
4085 def CheckPrereq(self):
4086 """Check prerequisites.
4089 - the new node is not already in the config
4091 - its parameters (single/dual homed) matches the cluster
4093 Any errors are signaled by raising errors.OpPrereqError.
4097 hostname = self.hostname
4098 node = hostname.name
4099 primary_ip = self.op.primary_ip = hostname.ip
4100 if self.op.secondary_ip is None:
4101 if self.primary_ip_family == netutils.IP6Address.family:
4102 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
4103 " IPv4 address must be given as secondary",
4105 self.op.secondary_ip = primary_ip
4107 secondary_ip = self.op.secondary_ip
4108 if not netutils.IP4Address.IsValid(secondary_ip):
4109 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4110 " address" % secondary_ip, errors.ECODE_INVAL)
4112 node_list = cfg.GetNodeList()
4113 if not self.op.readd and node in node_list:
4114 raise errors.OpPrereqError("Node %s is already in the configuration" %
4115 node, errors.ECODE_EXISTS)
4116 elif self.op.readd and node not in node_list:
4117 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4118 errors.ECODE_NOENT)
4120 self.changed_primary_ip = False
4122 for existing_node_name in node_list:
4123 existing_node = cfg.GetNodeInfo(existing_node_name)
4125 if self.op.readd and node == existing_node_name:
4126 if existing_node.secondary_ip != secondary_ip:
4127 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4128 " address configuration as before",
4130 if existing_node.primary_ip != primary_ip:
4131 self.changed_primary_ip = True
4133 continue
4135 if (existing_node.primary_ip == primary_ip or
4136 existing_node.secondary_ip == primary_ip or
4137 existing_node.primary_ip == secondary_ip or
4138 existing_node.secondary_ip == secondary_ip):
4139 raise errors.OpPrereqError("New node ip address(es) conflict with"
4140 " existing node %s" % existing_node.name,
4141 errors.ECODE_NOTUNIQUE)
4143 # After this 'if' block, None is no longer a valid value for the
4144 # _capable op attributes
4145 if self.op.readd:
4146 old_node = self.cfg.GetNodeInfo(node)
4147 assert old_node is not None, "Can't retrieve locked node %s" % node
4148 for attr in self._NFLAGS:
4149 if getattr(self.op, attr) is None:
4150 setattr(self.op, attr, getattr(old_node, attr))
4151 else:
4152 for attr in self._NFLAGS:
4153 if getattr(self.op, attr) is None:
4154 setattr(self.op, attr, True)
4156 if self.op.readd and not self.op.vm_capable:
4157 pri, sec = cfg.GetNodeInstances(node)
4158 if pri or sec:
4159 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
4160 " flag set to false, but it already holds"
4161 " instances" % node,
4164 # check that the type of the node (single versus dual homed) is the
4165 # same as for the master
4166 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
4167 master_singlehomed = myself.secondary_ip == myself.primary_ip
4168 newbie_singlehomed = secondary_ip == primary_ip
4169 if master_singlehomed != newbie_singlehomed:
4170 if master_singlehomed:
4171 raise errors.OpPrereqError("The master has no secondary ip but the"
4172 " new node has one",
4175 raise errors.OpPrereqError("The master has a secondary ip but the"
4176 " new node doesn't have one",
4179 # checks reachability
4180 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
4181 raise errors.OpPrereqError("Node not reachable by ping",
4182 errors.ECODE_ENVIRON)
4184 if not newbie_singlehomed:
4185 # check reachability from my secondary ip to newbie's secondary ip
4186 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
4187 source=myself.secondary_ip):
4188 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
4189 " based ping to node daemon port",
4190 errors.ECODE_ENVIRON)
4192 if self.op.readd:
4193 exceptions = [node]
4194 else:
4195 exceptions = []
4197 if self.op.master_capable:
4198 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
4199 else:
4200 self.master_candidate = False
4202 if self.op.readd:
4203 self.new_node = old_node
4204 else:
4205 node_group = cfg.LookupNodeGroup(self.op.group)
4206 self.new_node = objects.Node(name=node,
4207 primary_ip=primary_ip,
4208 secondary_ip=secondary_ip,
4209 master_candidate=self.master_candidate,
4210 offline=False, drained=False,
4211 group=node_group)
4213 if self.op.ndparams:
4214 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
4216 def Exec(self, feedback_fn):
4217 """Adds the new node to the cluster.
4220 new_node = self.new_node
4221 node = new_node.name
4223 # We are adding a new node, so we assume it's powered
4224 new_node.powered = True
4226 # for re-adds, reset the offline/drained/master-candidate flags;
4227 # we need to reset here, otherwise offline would prevent RPC calls
4228 # later in the procedure; this also means that if the re-add
4229 # fails, we are left with a non-offlined, broken node
4230 if self.op.readd:
4231 new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
4232 self.LogInfo("Readding a node, the offline/drained flags were reset")
4233 # if we demote the node, we do cleanup later in the procedure
4234 new_node.master_candidate = self.master_candidate
4235 if self.changed_primary_ip:
4236 new_node.primary_ip = self.op.primary_ip
4238 # copy the master/vm_capable flags
4239 for attr in self._NFLAGS:
4240 setattr(new_node, attr, getattr(self.op, attr))
4242 # notify the user about any possible mc promotion
4243 if new_node.master_candidate:
4244 self.LogInfo("Node will be a master candidate")
4246 if self.op.ndparams:
4247 new_node.ndparams = self.op.ndparams
4249 new_node.ndparams = {}
4251 # check connectivity
4252 result = self.rpc.call_version([node])[node]
4253 result.Raise("Can't get version information from node %s" % node)
4254 if constants.PROTOCOL_VERSION == result.payload:
4255 logging.info("Communication to node %s fine, sw version %s match",
4256 node, result.payload)
4257 else:
4258 raise errors.OpExecError("Version mismatch master version %s,"
4259 " node version %s" %
4260 (constants.PROTOCOL_VERSION, result.payload))
4262 # Add node to our /etc/hosts, and add key to known_hosts
4263 if self.cfg.GetClusterInfo().modify_etc_hosts:
4264 master_node = self.cfg.GetMasterNode()
4265 result = self.rpc.call_etc_hosts_modify(master_node,
4266 constants.ETC_HOSTS_ADD,
4267 self.hostname.name,
4268 self.hostname.ip)
4269 result.Raise("Can't update hosts file with new host data")
4271 if new_node.secondary_ip != new_node.primary_ip:
4272 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
4273 False)
4275 node_verify_list = [self.cfg.GetMasterNode()]
4276 node_verify_param = {
4277 constants.NV_NODELIST: [node],
4278 # TODO: do a node-net-test as well?
4279 }
4281 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
4282 self.cfg.GetClusterName())
4283 for verifier in node_verify_list:
4284 result[verifier].Raise("Cannot communicate with node %s" % verifier)
4285 nl_payload = result[verifier].payload[constants.NV_NODELIST]
4286 if nl_payload:
4287 for failed in nl_payload:
4288 feedback_fn("ssh/hostname verification failed"
4289 " (checking from %s): %s" %
4290 (verifier, nl_payload[failed]))
4291 raise errors.OpExecError("ssh/hostname verification failed")
4294 _RedistributeAncillaryFiles(self)
4295 self.context.ReaddNode(new_node)
4296 # make sure we redistribute the config
4297 self.cfg.Update(new_node, feedback_fn)
4298 # and make sure the new node will not have old files around
4299 if not new_node.master_candidate:
4300 result = self.rpc.call_node_demote_from_mc(new_node.name)
4301 msg = result.fail_msg
4302 if msg:
4303 self.LogWarning("Node failed to demote itself from master"
4304 " candidate status: %s" % msg)
4306 _RedistributeAncillaryFiles(self, additional_nodes=[node],
4307 additional_vm=self.op.vm_capable)
4308 self.context.AddNode(new_node, self.proc.GetECId())
4311 class LUNodeSetParams(LogicalUnit):
4312 """Modifies the parameters of a node.
4314 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
4315 to the node role (as _ROLE_*)
4316 @cvar _R2F: a dictionary from node role to tuples of flags
4317 @cvar _FLAGS: a list of attribute names corresponding to the flags
4319 """
4320 HPATH = "node-modify"
4321 HTYPE = constants.HTYPE_NODE
4323 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
4324 _F2R = {
4325 (True, False, False): _ROLE_CANDIDATE,
4326 (False, True, False): _ROLE_DRAINED,
4327 (False, False, True): _ROLE_OFFLINE,
4328 (False, False, False): _ROLE_REGULAR,
4329 }
4330 _R2F = dict((v, k) for k, v in _F2R.items())
4331 _FLAGS = ["master_candidate", "drained", "offline"]
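# Worked example of the mapping above: flags (True, False, False), i.e. a
# master candidate that is neither drained nor offline, map via _F2R to
# _ROLE_CANDIDATE; _R2F[_ROLE_CANDIDATE] yields (True, False, False) again,
# which Exec assigns back in _FLAGS order to the node's
# master_candidate/drained/offline attributes.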
4333 def CheckArguments(self):
4334 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4335 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
4336 self.op.master_capable, self.op.vm_capable,
4337 self.op.secondary_ip, self.op.ndparams]
4338 if all_mods.count(None) == len(all_mods):
4339 raise errors.OpPrereqError("Please pass at least one modification",
4340 errors.ECODE_INVAL)
4341 if all_mods.count(True) > 1:
4342 raise errors.OpPrereqError("Can't set the node into more than one"
4343 " state at the same time",
4346 # Boolean value that tells us whether we might be demoting from MC
4347 self.might_demote = (self.op.master_candidate == False or
4348 self.op.offline == True or
4349 self.op.drained == True or
4350 self.op.master_capable == False)
4352 if self.op.secondary_ip:
4353 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
4354 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4355 " address" % self.op.secondary_ip,
4358 self.lock_all = self.op.auto_promote and self.might_demote
4359 self.lock_instances = self.op.secondary_ip is not None
4361 def ExpandNames(self):
4362 if self.lock_all:
4363 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
4364 else:
4365 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
4367 if self.lock_instances:
4368 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
4370 def DeclareLocks(self, level):
4371 # If we have locked all instances, before waiting to lock nodes, release
4372 # all the ones living on nodes unrelated to the current operation.
4373 if level == locking.LEVEL_NODE and self.lock_instances:
4374 instances_release = []
4375 instances_keep = []
4376 self.affected_instances = []
4377 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
4378 for instance_name in self.acquired_locks[locking.LEVEL_INSTANCE]:
4379 instance = self.context.cfg.GetInstanceInfo(instance_name)
4380 i_mirrored = instance.disk_template in constants.DTS_NET_MIRROR
4381 if i_mirrored and self.op.node_name in instance.all_nodes:
4382 instances_keep.append(instance_name)
4383 self.affected_instances.append(instance)
4384 else:
4385 instances_release.append(instance_name)
4386 if instances_release:
4387 self.context.glm.release(locking.LEVEL_INSTANCE, instances_release)
4388 self.acquired_locks[locking.LEVEL_INSTANCE] = instances_keep
4390 def BuildHooksEnv(self):
4391 """Build hooks env.
4393 This runs on the master node.
4395 """
4396 env = {
4397 "OP_TARGET": self.op.node_name,
4398 "MASTER_CANDIDATE": str(self.op.master_candidate),
4399 "OFFLINE": str(self.op.offline),
4400 "DRAINED": str(self.op.drained),
4401 "MASTER_CAPABLE": str(self.op.master_capable),
4402 "VM_CAPABLE": str(self.op.vm_capable),
4403 }
4404 nl = [self.cfg.GetMasterNode(),
4405 self.op.node_name]
4406 return env, nl, nl
4408 def CheckPrereq(self):
4409 """Check prerequisites.
4411 This only checks the instance list against the existing names.
4413 """
4414 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
4416 if (self.op.master_candidate is not None or
4417 self.op.drained is not None or
4418 self.op.offline is not None):
4419 # we can't change the master's node flags
4420 if self.op.node_name == self.cfg.GetMasterNode():
4421 raise errors.OpPrereqError("The master role can be changed"
4422 " only via master-failover",
4425 if self.op.master_candidate and not node.master_capable:
4426 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
4427 " it a master candidate" % node.name,
4430 if self.op.vm_capable == False:
4431 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
4432 if ipri or isec:
4433 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
4434 " the vm_capable flag" % node.name,
4437 if node.master_candidate and self.might_demote and not self.lock_all:
4438 assert not self.op.auto_promote, "auto_promote set but lock_all not"
4439 # check if after removing the current node, we're missing master
4440 # candidates
4441 (mc_remaining, mc_should, _) = \
4442 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
4443 if mc_remaining < mc_should:
4444 raise errors.OpPrereqError("Not enough master candidates, please"
4445 " pass auto promote option to allow"
4446 " promotion", errors.ECODE_STATE)
4448 self.old_flags = old_flags = (node.master_candidate,
4449 node.drained, node.offline)
4450 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
4451 self.old_role = old_role = self._F2R[old_flags]
4453 # Check for ineffective changes
4454 for attr in self._FLAGS:
4455 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
4456 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
4457 setattr(self.op, attr, None)
4459 # Past this point, any flag change to False means a transition
4460 # away from the respective state, as only real changes are kept
4462 # TODO: We might query the real power state if it supports OOB
4463 if _SupportsOob(self.cfg, node):
4464 if self.op.offline is False and not (node.powered or
4465 self.op.powered == True):
4466 raise errors.OpPrereqError(("Please power on node %s first before you"
4467 " can reset offline state") %
4469 elif self.op.powered is not None:
4470 raise errors.OpPrereqError(("Unable to change powered state for node %s"
4471 " which does not support out-of-band"
4472 " handling") % self.op.node_name)
4474 # If we're being deofflined/drained, we'll MC ourself if needed
4475 if (self.op.drained == False or self.op.offline == False or
4476 (self.op.master_capable and not node.master_capable)):
4477 if _DecideSelfPromotion(self):
4478 self.op.master_candidate = True
4479 self.LogInfo("Auto-promoting node to master candidate")
4481 # If we're no longer master capable, we'll demote ourselves from MC
4482 if self.op.master_capable == False and node.master_candidate:
4483 self.LogInfo("Demoting from master candidate")
4484 self.op.master_candidate = False
4486 # Compute new role
4487 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
4488 if self.op.master_candidate:
4489 new_role = self._ROLE_CANDIDATE
4490 elif self.op.drained:
4491 new_role = self._ROLE_DRAINED
4492 elif self.op.offline:
4493 new_role = self._ROLE_OFFLINE
4494 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
4495 # False is still in new flags, which means we're un-setting (the
4496 # offline/drained/master-candidate) flag
4497 new_role = self._ROLE_REGULAR
4498 else: # no new flags, nothing, keep old role
4499 new_role = old_role
4501 self.new_role = new_role
4503 if old_role == self._ROLE_OFFLINE and new_role != old_role:
4504 # Trying to transition out of offline status
4505 result = self.rpc.call_version([node.name])[node.name]
4506 if result.fail_msg:
4507 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
4508 " to report its version: %s" %
4509 (node.name, result.fail_msg),
4510 errors.ECODE_ENVIRON)
4512 self.LogWarning("Transitioning node from offline to online state"
4513 " without using re-add. Please make sure the node"
4516 if self.op.secondary_ip:
4517 # Ok even without locking, because this can't be changed by any LU
4518 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
4519 master_singlehomed = master.secondary_ip == master.primary_ip
4520 if master_singlehomed and self.op.secondary_ip:
4521 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
4522 " homed cluster", errors.ECODE_INVAL)
4525 if self.affected_instances:
4526 raise errors.OpPrereqError("Cannot change secondary ip: offline"
4527 " node has instances (%s) configured"
4528 " to use it" % self.affected_instances)
4530 # On online nodes, check that no instances are running, and that
4531 # the node has the new ip and we can reach it.
4532 for instance in self.affected_instances:
4533 _CheckInstanceDown(self, instance, "cannot change secondary ip")
4535 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
4536 if master.name != node.name:
4537 # check reachability from master secondary ip to new secondary ip
4538 if not netutils.TcpPing(self.op.secondary_ip,
4539 constants.DEFAULT_NODED_PORT,
4540 source=master.secondary_ip):
4541 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
4542 " based ping to node daemon port",
4543 errors.ECODE_ENVIRON)
4545 if self.op.ndparams:
4546 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
4547 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
4548 self.new_ndparams = new_ndparams
4550 def Exec(self, feedback_fn):
4551 """Modifies a node.
4553 """
4554 node = self.node
4555 old_role = self.old_role
4556 new_role = self.new_role
4558 result = []
4560 if self.op.ndparams:
4561 node.ndparams = self.new_ndparams
4563 if self.op.powered is not None:
4564 node.powered = self.op.powered
4566 for attr in ["master_capable", "vm_capable"]:
4567 val = getattr(self.op, attr)
4568 if val is not None:
4569 setattr(node, attr, val)
4570 result.append((attr, str(val)))
4572 if new_role != old_role:
4573 # Tell the node to demote itself, if no longer MC and not offline
4574 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
4575 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
4576 if msg:
4577 self.LogWarning("Node failed to demote itself: %s", msg)
4579 new_flags = self._R2F[new_role]
4580 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
4581 if of != nf:
4582 result.append((desc, str(nf)))
4583 (node.master_candidate, node.drained, node.offline) = new_flags
4585 # we locked all nodes, we adjust the CP before updating this node
4586 if self.lock_all:
4587 _AdjustCandidatePool(self, [node.name])
4589 if self.op.secondary_ip:
4590 node.secondary_ip = self.op.secondary_ip
4591 result.append(("secondary_ip", self.op.secondary_ip))
4593 # this will trigger configuration file update, if needed
4594 self.cfg.Update(node, feedback_fn)
4596 # this will trigger job queue propagation or cleanup if the mc
4597 # flag changed
4598 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
4599 self.context.ReaddNode(node)
4601 return result
4604 class LUNodePowercycle(NoHooksLU):
4605 """Powercycles a node.
4610 def CheckArguments(self):
4611 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4612 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
4613 raise errors.OpPrereqError("The node is the master and the force"
4614 " parameter was not set",
4617 def ExpandNames(self):
4618 """Locking for PowercycleNode.
4620 This is a last-resort option and shouldn't block on other
4621 jobs. Therefore, we grab no locks.
4623 """
4624 self.needed_locks = {}
4626 def Exec(self, feedback_fn):
4627 """Reboots a node.
4629 """
4630 result = self.rpc.call_node_powercycle(self.op.node_name,
4631 self.cfg.GetHypervisorType())
4632 result.Raise("Failed to schedule the reboot")
4633 return result.payload
4636 class LUClusterQuery(NoHooksLU):
4637 """Query cluster configuration.
4642 def ExpandNames(self):
4643 self.needed_locks = {}
4645 def Exec(self, feedback_fn):
4646 """Return cluster config.
4649 cluster = self.cfg.GetClusterInfo()
4650 os_hvp = {}
4652 # Filter just for enabled hypervisors
4653 for os_name, hv_dict in cluster.os_hvp.items():
4654 os_hvp[os_name] = {}
4655 for hv_name, hv_params in hv_dict.items():
4656 if hv_name in cluster.enabled_hypervisors:
4657 os_hvp[os_name][hv_name] = hv_params
4659 # Convert ip_family to ip_version
4660 primary_ip_version = constants.IP4_VERSION
4661 if cluster.primary_ip_family == netutils.IP6Address.family:
4662 primary_ip_version = constants.IP6_VERSION
4664 result = {
4665 "software_version": constants.RELEASE_VERSION,
4666 "protocol_version": constants.PROTOCOL_VERSION,
4667 "config_version": constants.CONFIG_VERSION,
4668 "os_api_version": max(constants.OS_API_VERSIONS),
4669 "export_version": constants.EXPORT_VERSION,
4670 "architecture": (platform.architecture()[0], platform.machine()),
4671 "name": cluster.cluster_name,
4672 "master": cluster.master_node,
4673 "default_hypervisor": cluster.enabled_hypervisors[0],
4674 "enabled_hypervisors": cluster.enabled_hypervisors,
4675 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
4676 for hypervisor_name in cluster.enabled_hypervisors]),
4677 "os_hvp": os_hvp,
4678 "beparams": cluster.beparams,
4679 "osparams": cluster.osparams,
4680 "nicparams": cluster.nicparams,
4681 "ndparams": cluster.ndparams,
4682 "candidate_pool_size": cluster.candidate_pool_size,
4683 "master_netdev": cluster.master_netdev,
4684 "volume_group_name": cluster.volume_group_name,
4685 "drbd_usermode_helper": cluster.drbd_usermode_helper,
4686 "file_storage_dir": cluster.file_storage_dir,
4687 "maintain_node_health": cluster.maintain_node_health,
4688 "ctime": cluster.ctime,
4689 "mtime": cluster.mtime,
4690 "uuid": cluster.uuid,
4691 "tags": list(cluster.GetTags()),
4692 "uid_pool": cluster.uid_pool,
4693 "default_iallocator": cluster.default_iallocator,
4694 "reserved_lvs": cluster.reserved_lvs,
4695 "primary_ip_version": primary_ip_version,
4696 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
4697 "hidden_os": cluster.hidden_os,
4698 "blacklisted_os": cluster.blacklisted_os,
4704 class LUClusterConfigQuery(NoHooksLU):
4705 """Return configuration values.
4709 _FIELDS_DYNAMIC = utils.FieldSet()
4710 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
4711 "watcher_pause", "volume_group_name")
4713 def CheckArguments(self):
4714 _CheckOutputFields(static=self._FIELDS_STATIC,
4715 dynamic=self._FIELDS_DYNAMIC,
4716 selected=self.op.output_fields)
4718 def ExpandNames(self):
4719 self.needed_locks = {}
4721 def Exec(self, feedback_fn):
4722 """Dump a representation of the cluster config to the standard output.
4726 for field in self.op.output_fields:
4727 if field == "cluster_name":
4728 entry = self.cfg.GetClusterName()
4729 elif field == "master_node":
4730 entry = self.cfg.GetMasterNode()
4731 elif field == "drain_flag":
4732 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
4733 elif field == "watcher_pause":
4734 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
4735 elif field == "volume_group_name":
4736 entry = self.cfg.GetVGName()
4738 raise errors.ParameterError(field)
4739 values.append(entry)
4741 return values
4743 class LUInstanceActivateDisks(NoHooksLU):
4744 """Bring up an instance's disks.
4749 def ExpandNames(self):
4750 self._ExpandAndLockInstance()
4751 self.needed_locks[locking.LEVEL_NODE] = []
4752 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4754 def DeclareLocks(self, level):
4755 if level == locking.LEVEL_NODE:
4756 self._LockInstancesNodes()
4758 def CheckPrereq(self):
4759 """Check prerequisites.
4761 This checks that the instance is in the cluster.
4763 """
4764 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4765 assert self.instance is not None, \
4766 "Cannot retrieve locked instance %s" % self.op.instance_name
4767 _CheckNodeOnline(self, self.instance.primary_node)
4769 def Exec(self, feedback_fn):
4770 """Activate the disks.
4773 disks_ok, disks_info = \
4774 _AssembleInstanceDisks(self, self.instance,
4775 ignore_size=self.op.ignore_size)
4776 if not disks_ok:
4777 raise errors.OpExecError("Cannot activate block devices")
4782 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
4783 ignore_size=False):
4784 """Prepare the block devices for an instance.
4786 This sets up the block devices on all nodes.
4788 @type lu: L{LogicalUnit}
4789 @param lu: the logical unit on whose behalf we execute
4790 @type instance: L{objects.Instance}
4791 @param instance: the instance for whose disks we assemble
4792 @type disks: list of L{objects.Disk} or None
4793 @param disks: which disks to assemble (or all, if None)
4794 @type ignore_secondaries: boolean
4795 @param ignore_secondaries: if true, errors on secondary nodes
4796 won't result in an error return from the function
4797 @type ignore_size: boolean
4798 @param ignore_size: if true, the current known size of the disk
4799 will not be used during the disk activation, useful for cases
4800 when the size is wrong
4801 @return: False if the operation failed, otherwise a list of
4802 (host, instance_visible_name, node_visible_name)
4803 with the mapping from node devices to instance devices
4805 """
4806 device_info = []
4807 disks_ok = True
4808 iname = instance.name
4809 disks = _ExpandCheckDisks(instance, disks)
4811 # With the two passes mechanism we try to reduce the window of
4812 # opportunity for the race condition of switching DRBD to primary
4813 # before handshaking occurred, but we do not eliminate it
4815 # The proper fix would be to wait (with some limits) until the
4816 # connection has been made and drbd transitions from WFConnection
4817 # into any other network-connected state (Connected, SyncTarget,
4818 # SyncSource, etc.)
4820 # 1st pass, assemble on all nodes in secondary mode
4821 for idx, inst_disk in enumerate(disks):
4822 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4823 if ignore_size:
4824 node_disk = node_disk.Copy()
4825 node_disk.UnsetSize()
4826 lu.cfg.SetDiskID(node_disk, node)
4827 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
4828 msg = result.fail_msg
4829 if msg:
4830 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4831 " (is_primary=False, pass=1): %s",
4832 inst_disk.iv_name, node, msg)
4833 if not ignore_secondaries:
4834 disks_ok = False
4836 # FIXME: race condition on drbd migration to primary
4838 # 2nd pass, do only the primary node
4839 for idx, inst_disk in enumerate(disks):
4840 dev_path = None
4842 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
4843 if node != instance.primary_node:
4844 continue
4845 if ignore_size:
4846 node_disk = node_disk.Copy()
4847 node_disk.UnsetSize()
4848 lu.cfg.SetDiskID(node_disk, node)
4849 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
4850 msg = result.fail_msg
4851 if msg:
4852 lu.proc.LogWarning("Could not prepare block device %s on node %s"
4853 " (is_primary=True, pass=2): %s",
4854 inst_disk.iv_name, node, msg)
4855 disks_ok = False
4856 else:
4857 dev_path = result.payload
4859 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
4861 # leave the disks configured for the primary node
4862 # this is a workaround that would be fixed better by
4863 # improving the logical/physical id handling
4864 for disk in disks:
4865 lu.cfg.SetDiskID(disk, instance.primary_node)
4867 return disks_ok, device_info
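# Net effect of the two passes: every node first assembles the device in
# secondary mode, then only the primary re-assembles with is_primary=True;
# result.payload of that second call provides the device path stored in the
# returned (node, iv_name, dev_path) tuples.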
4870 def _StartInstanceDisks(lu, instance, force):
4871 """Start the disks of an instance.
4874 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
4875 ignore_secondaries=force)
4876 if not disks_ok:
4877 _ShutdownInstanceDisks(lu, instance)
4878 if force is not None and not force:
4879 lu.proc.LogWarning("", hint="If the message above refers to a"
4881 " you can retry the operation using '--force'.")
4882 raise errors.OpExecError("Disk consistency error")
4885 class LUInstanceDeactivateDisks(NoHooksLU):
4886 """Shutdown an instance's disks.
4891 def ExpandNames(self):
4892 self._ExpandAndLockInstance()
4893 self.needed_locks[locking.LEVEL_NODE] = []
4894 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4896 def DeclareLocks(self, level):
4897 if level == locking.LEVEL_NODE:
4898 self._LockInstancesNodes()
4900 def CheckPrereq(self):
4901 """Check prerequisites.
4903 This checks that the instance is in the cluster.
4905 """
4906 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
4907 assert self.instance is not None, \
4908 "Cannot retrieve locked instance %s" % self.op.instance_name
4910 def Exec(self, feedback_fn):
4911 """Deactivate the disks
4914 instance = self.instance
4915 if self.op.force:
4916 _ShutdownInstanceDisks(self, instance)
4917 else:
4918 _SafeShutdownInstanceDisks(self, instance)
4921 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
4922 """Shutdown block devices of an instance.
4924 This function checks if an instance is running, before calling
4925 _ShutdownInstanceDisks.
4927 """
4928 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
4929 _ShutdownInstanceDisks(lu, instance, disks=disks)
4932 def _ExpandCheckDisks(instance, disks):
4933 """Return the instance disks selected by the disks list
4935 @type disks: list of L{objects.Disk} or None
4936 @param disks: selected disks
4937 @rtype: list of L{objects.Disk}
4938 @return: selected instance disks to act on
4940 """
4941 if disks is None:
4942 return instance.disks
4943 else:
4944 if not set(disks).issubset(instance.disks):
4945 raise errors.ProgrammerError("Can only act on disks belonging to the"
4946 " target instance")
4947 return disks
4950 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
4951 """Shutdown block devices of an instance.
4953 This does the shutdown on all nodes of the instance.
4955 If ignore_primary is false, errors on the primary node are
4956 ignored.
4958 """
4959 all_result = True
4960 disks = _ExpandCheckDisks(instance, disks)
4962 for disk in disks:
4963 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
4964 lu.cfg.SetDiskID(top_disk, node)
4965 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
4966 msg = result.fail_msg
4967 if msg:
4968 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
4969 disk.iv_name, node, msg)
4970 if ((node == instance.primary_node and not ignore_primary) or
4971 (node != instance.primary_node and not result.offline)):
4972 all_result = False
4973 return all_result
4976 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
4977 """Checks if a node has enough free memory.
4979 This function checks if a given node has the needed amount of free
4980 memory. In case the node has less memory or we cannot get the
4981 information from the node, this function raises an OpPrereqError
4982 exception.
4984 @type lu: C{LogicalUnit}
4985 @param lu: a logical unit from which we get configuration data
4986 @type node: C{str}
4987 @param node: the node to check
4988 @type reason: C{str}
4989 @param reason: string to use in the error message
4990 @type requested: C{int}
4991 @param requested: the amount of memory in MiB to check for
4992 @type hypervisor_name: C{str}
4993 @param hypervisor_name: the hypervisor to ask for memory stats
4994 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
4995 we cannot check the node
4997 """
4998 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
4999 nodeinfo[node].Raise("Can't get data from node %s" % node,
5000 prereq=True, ecode=errors.ECODE_ENVIRON)
5001 free_mem = nodeinfo[node].payload.get('memory_free', None)
5002 if not isinstance(free_mem, int):
5003 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5004 " was '%s'" % (node, free_mem),
5005 errors.ECODE_ENVIRON)
5006 if requested > free_mem:
5007 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5008 " needed %s MiB, available %s MiB" %
5009 (node, reason, requested, free_mem),
5010 errors.ECODE_NORES)
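# Typical call, as used by LUInstanceStartup later in this module when the
# instance is not already running:
#   _CheckNodeFreeMemory(self, instance.primary_node,
#                        "starting instance %s" % instance.name,
#                        bep[constants.BE_MEMORY], instance.hypervisor)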
5013 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5014 """Checks if nodes have enough free disk space in the all VGs.
5016 This function checks if all given nodes have the needed amount of
5017 free disk. In case any node has less disk or we cannot get the
5018 information from the node, this function raises an OpPrereqError
5019 exception.
5021 @type lu: C{LogicalUnit}
5022 @param lu: a logical unit from which we get configuration data
5023 @type nodenames: C{list}
5024 @param nodenames: the list of node names to check
5025 @type req_sizes: C{dict}
5026 @param req_sizes: the hash of vg and corresponding amount of disk in
5027 MiB required for the given operation
5028 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5029 or we cannot check the node
5031 """
5032 for vg, req_size in req_sizes.items():
5033 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
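# Illustrative req_sizes value (the VG name is only an example):
#   {"xenvg": 10240} checks every node in nodenames for 10 GiB of free
# space in volume group "xenvg" via _CheckNodesFreeDiskOnVG above.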
5036 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5037 """Checks if nodes have enough free disk space in the specified VG.
5039 This function checks if all given nodes have the needed amount of
5040 free disk. In case any node has less disk or we cannot get the
5041 information from the node, this function raises an OpPrereqError
5042 exception.
5044 @type lu: C{LogicalUnit}
5045 @param lu: a logical unit from which we get configuration data
5046 @type nodenames: C{list}
5047 @param nodenames: the list of node names to check
5048 @type vg: C{str}
5049 @param vg: the volume group to check
5050 @type requested: C{int}
5051 @param requested: the amount of disk in MiB to check for
5052 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5053 or we cannot check the node
5055 """
5056 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5057 for node in nodenames:
5058 info = nodeinfo[node]
5059 info.Raise("Cannot get current information from node %s" % node,
5060 prereq=True, ecode=errors.ECODE_ENVIRON)
5061 vg_free = info.payload.get("vg_free", None)
5062 if not isinstance(vg_free, int):
5063 raise errors.OpPrereqError("Can't compute free disk space on node"
5064 " %s for vg %s, result was '%s'" %
5065 (node, vg, vg_free), errors.ECODE_ENVIRON)
5066 if requested > vg_free:
5067 raise errors.OpPrereqError("Not enough disk space on target node %s"
5068 " vg %s: required %d MiB, available %d MiB" %
5069 (node, vg, requested, vg_free),
5070 errors.ECODE_NORES)
5073 class LUInstanceStartup(LogicalUnit):
5074 """Starts an instance.
5077 HPATH = "instance-start"
5078 HTYPE = constants.HTYPE_INSTANCE
5079 REQ_BGL = False
5081 def CheckArguments(self):
5083 if self.op.beparams:
5084 # fill the beparams dict
5085 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5087 def ExpandNames(self):
5088 self._ExpandAndLockInstance()
5090 def BuildHooksEnv(self):
5091 """Build hooks env.
5093 This runs on master, primary and secondary nodes of the instance.
5095 """
5096 env = {
5097 "FORCE": self.op.force,
5098 }
5099 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5100 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5101 return env, nl, nl
5103 def CheckPrereq(self):
5104 """Check prerequisites.
5106 This checks that the instance is in the cluster.
5108 """
5109 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5110 assert self.instance is not None, \
5111 "Cannot retrieve locked instance %s" % self.op.instance_name
5114 if self.op.hvparams:
5115 # check hypervisor parameter syntax (locally)
5116 cluster = self.cfg.GetClusterInfo()
5117 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
5118 filled_hvp = cluster.FillHV(instance)
5119 filled_hvp.update(self.op.hvparams)
5120 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
5121 hv_type.CheckParameterSyntax(filled_hvp)
5122 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
5124 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
5126 if self.primary_offline and self.op.ignore_offline_nodes:
5127 self.proc.LogWarning("Ignoring offline primary node")
5129 if self.op.hvparams or self.op.beparams:
5130 self.proc.LogWarning("Overridden parameters are ignored")
5132 _CheckNodeOnline(self, instance.primary_node)
5134 bep = self.cfg.GetClusterInfo().FillBE(instance)
5136 # check bridges existence
5137 _CheckInstanceBridgesExist(self, instance)
5139 remote_info = self.rpc.call_instance_info(instance.primary_node,
5140 instance.name,
5141 instance.hypervisor)
5142 remote_info.Raise("Error checking node %s" % instance.primary_node,
5143 prereq=True, ecode=errors.ECODE_ENVIRON)
5144 if not remote_info.payload: # not running already
5145 _CheckNodeFreeMemory(self, instance.primary_node,
5146 "starting instance %s" % instance.name,
5147 bep[constants.BE_MEMORY], instance.hypervisor)
5149 def Exec(self, feedback_fn):
5150 """Start the instance.
5153 instance = self.instance
5154 force = self.op.force
5156 self.cfg.MarkInstanceUp(instance.name)
5158 if self.primary_offline:
5159 assert self.op.ignore_offline_nodes
5160 self.proc.LogInfo("Primary node offline, marked instance as started")
5161 else:
5162 node_current = instance.primary_node
5164 _StartInstanceDisks(self, instance, force)
5166 result = self.rpc.call_instance_start(node_current, instance,
5167 self.op.hvparams, self.op.beparams)
5168 msg = result.fail_msg
5169 if msg:
5170 _ShutdownInstanceDisks(self, instance)
5171 raise errors.OpExecError("Could not start instance: %s" % msg)
5174 class LUInstanceReboot(LogicalUnit):
5175 """Reboot an instance.
5178 HPATH = "instance-reboot"
5179 HTYPE = constants.HTYPE_INSTANCE
5180 REQ_BGL = False
5182 def ExpandNames(self):
5183 self._ExpandAndLockInstance()
5185 def BuildHooksEnv(self):
5186 """Build hooks env.
5188 This runs on master, primary and secondary nodes of the instance.
5190 """
5191 env = {
5192 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
5193 "REBOOT_TYPE": self.op.reboot_type,
5194 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5196 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5197 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5198 return env, nl, nl
5200 def CheckPrereq(self):
5201 """Check prerequisites.
5203 This checks that the instance is in the cluster.
5205 """
5206 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5207 assert self.instance is not None, \
5208 "Cannot retrieve locked instance %s" % self.op.instance_name
5210 _CheckNodeOnline(self, instance.primary_node)
5212 # check bridges existence
5213 _CheckInstanceBridgesExist(self, instance)
5215 def Exec(self, feedback_fn):
5216 """Reboot the instance.
5219 instance = self.instance
5220 ignore_secondaries = self.op.ignore_secondaries
5221 reboot_type = self.op.reboot_type
5223 node_current = instance.primary_node
5225 if reboot_type in [constants.INSTANCE_REBOOT_SOFT,
5226 constants.INSTANCE_REBOOT_HARD]:
5227 for disk in instance.disks:
5228 self.cfg.SetDiskID(disk, node_current)
5229 result = self.rpc.call_instance_reboot(node_current, instance,
5230 reboot_type,
5231 self.op.shutdown_timeout)
5232 result.Raise("Could not reboot instance")
5234 result = self.rpc.call_instance_shutdown(node_current, instance,
5235 self.op.shutdown_timeout)
5236 result.Raise("Could not shutdown instance for full reboot")
5237 _ShutdownInstanceDisks(self, instance)
5238 _StartInstanceDisks(self, instance, ignore_secondaries)
5239 result = self.rpc.call_instance_start(node_current, instance, None, None)
5240 msg = result.fail_msg
5241 if msg:
5242 _ShutdownInstanceDisks(self, instance)
5243 raise errors.OpExecError("Could not start instance for"
5244 " full reboot: %s" % msg)
5246 self.cfg.MarkInstanceUp(instance.name)
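# Summary of the branches above: soft/hard reboots are delegated to the
# hypervisor on the primary node, while any other reboot type falls back to
# a full stop (shutdown plus disk deactivation) followed by a cold start.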
5249 class LUInstanceShutdown(LogicalUnit):
5250 """Shutdown an instance.
5253 HPATH = "instance-stop"
5254 HTYPE = constants.HTYPE_INSTANCE
5255 REQ_BGL = False
5257 def ExpandNames(self):
5258 self._ExpandAndLockInstance()
5260 def BuildHooksEnv(self):
5261 """Build hooks env.
5263 This runs on master, primary and secondary nodes of the instance.
5265 """
5266 env = _BuildInstanceHookEnvByObject(self, self.instance)
5267 env["TIMEOUT"] = self.op.timeout
5268 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5269 return env, nl, nl
5271 def CheckPrereq(self):
5272 """Check prerequisites.
5274 This checks that the instance is in the cluster.
5276 """
5277 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5278 assert self.instance is not None, \
5279 "Cannot retrieve locked instance %s" % self.op.instance_name
5281 self.primary_offline = \
5282 self.cfg.GetNodeInfo(self.instance.primary_node).offline
5284 if self.primary_offline and self.op.ignore_offline_nodes:
5285 self.proc.LogWarning("Ignoring offline primary node")
5287 _CheckNodeOnline(self, self.instance.primary_node)
5289 def Exec(self, feedback_fn):
5290 """Shutdown the instance.
5293 instance = self.instance
5294 node_current = instance.primary_node
5295 timeout = self.op.timeout
5297 self.cfg.MarkInstanceDown(instance.name)
5299 if self.primary_offline:
5300 assert self.op.ignore_offline_nodes
5301 self.proc.LogInfo("Primary node offline, marked instance as stopped")
5302 else:
5303 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
5304 msg = result.fail_msg
5305 if msg:
5306 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
5308 _ShutdownInstanceDisks(self, instance)
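# Note the ordering above: the instance is marked down in the configuration
# before the shutdown RPC is attempted, and an RPC failure only produces a
# warning before the disks are deactivated.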
5311 class LUInstanceReinstall(LogicalUnit):
5312 """Reinstall an instance.
5315 HPATH = "instance-reinstall"
5316 HTYPE = constants.HTYPE_INSTANCE
5317 REQ_BGL = False
5319 def ExpandNames(self):
5320 self._ExpandAndLockInstance()
5322 def BuildHooksEnv(self):
5323 """Build hooks env.
5325 This runs on master, primary and secondary nodes of the instance.
5327 """
5328 env = _BuildInstanceHookEnvByObject(self, self.instance)
5329 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5330 return env, nl, nl
5332 def CheckPrereq(self):
5333 """Check prerequisites.
5335 This checks that the instance is in the cluster and is not running.
5337 """
5338 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5339 assert instance is not None, \
5340 "Cannot retrieve locked instance %s" % self.op.instance_name
5341 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
5342 " offline, cannot reinstall")
5343 for node in instance.secondary_nodes:
5344 _CheckNodeOnline(self, node, "Instance secondary node offline,"
5345 " cannot reinstall")
5347 if instance.disk_template == constants.DT_DISKLESS:
5348 raise errors.OpPrereqError("Instance '%s' has no disks" %
5349 self.op.instance_name,
5350 errors.ECODE_INVAL)
5351 _CheckInstanceDown(self, instance, "cannot reinstall")
5353 if self.op.os_type is not None:
5354 # OS verification
5355 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
5356 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
5357 instance_os = self.op.os_type
5358 else:
5359 instance_os = instance.os
5361 nodelist = list(instance.all_nodes)
5363 if self.op.osparams:
5364 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
5365 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
5366 self.os_inst = i_osdict # the new dict (without defaults)
5367 else:
5368 self.os_inst = None
5370 self.instance = instance
5372 def Exec(self, feedback_fn):
5373 """Reinstall the instance.
5376 inst = self.instance
5378 if self.op.os_type is not None:
5379 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
5380 inst.os = self.op.os_type
5381 # Write to configuration
5382 self.cfg.Update(inst, feedback_fn)
5384 _StartInstanceDisks(self, inst, None)
5385 try:
5386 feedback_fn("Running the instance OS create scripts...")
5387 # FIXME: pass debug option from opcode to backend
5388 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
5389 self.op.debug_level,
5390 osparams=self.os_inst)
5391 result.Raise("Could not install OS for instance %s on node %s" %
5392 (inst.name, inst.primary_node))
5393 finally:
5394 _ShutdownInstanceDisks(self, inst)
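# The try/finally pairing around the OS create scripts guarantees the disks
# are deactivated again even when the reinstall fails; rename below uses
# the same pattern around its OS rename script.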
5397 class LUInstanceRecreateDisks(LogicalUnit):
5398 """Recreate an instance's missing disks.
5401 HPATH = "instance-recreate-disks"
5402 HTYPE = constants.HTYPE_INSTANCE
5403 REQ_BGL = False
5405 def ExpandNames(self):
5406 self._ExpandAndLockInstance()
5408 def BuildHooksEnv(self):
5409 """Build hooks env.
5411 This runs on master, primary and secondary nodes of the instance.
5413 """
5414 env = _BuildInstanceHookEnvByObject(self, self.instance)
5415 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5416 return env, nl, nl
5418 def CheckPrereq(self):
5419 """Check prerequisites.
5421 This checks that the instance is in the cluster and is not running.
5423 """
5424 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5425 assert instance is not None, \
5426 "Cannot retrieve locked instance %s" % self.op.instance_name
5427 _CheckNodeOnline(self, instance.primary_node)
5429 if instance.disk_template == constants.DT_DISKLESS:
5430 raise errors.OpPrereqError("Instance '%s' has no disks" %
5431 self.op.instance_name, errors.ECODE_INVAL)
5432 _CheckInstanceDown(self, instance, "cannot recreate disks")
5434 if not self.op.disks:
5435 self.op.disks = range(len(instance.disks))
5437 for idx in self.op.disks:
5438 if idx >= len(instance.disks):
5439 raise errors.OpPrereqError("Invalid disk index passed '%s'" % idx,
5442 self.instance = instance
5444 def Exec(self, feedback_fn):
5445 """Recreate the disks.
5449 for idx, _ in enumerate(self.instance.disks):
5450 if idx not in self.op.disks: # disk idx has not been passed in
5451 to_skip.append(idx)
5454 _CreateDisks(self, self.instance, to_skip=to_skip)
5457 class LUInstanceRename(LogicalUnit):
5458 """Rename an instance.
5461 HPATH = "instance-rename"
5462 HTYPE = constants.HTYPE_INSTANCE
5464 def CheckArguments(self):
5468 if self.op.ip_check and not self.op.name_check:
5469 # TODO: make the ip check more flexible and not depend on the name check
5470 raise errors.OpPrereqError("Cannot do ip check without a name check",
5473 def BuildHooksEnv(self):
5474 """Build hooks env.
5476 This runs on master, primary and secondary nodes of the instance.
5478 """
5479 env = _BuildInstanceHookEnvByObject(self, self.instance)
5480 env["INSTANCE_NEW_NAME"] = self.op.new_name
5481 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5482 return env, nl, nl
5484 def CheckPrereq(self):
5485 """Check prerequisites.
5487 This checks that the instance is in the cluster and is not running.
5489 """
5490 self.op.instance_name = _ExpandInstanceName(self.cfg,
5491 self.op.instance_name)
5492 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5493 assert instance is not None
5494 _CheckNodeOnline(self, instance.primary_node)
5495 _CheckInstanceDown(self, instance, "cannot rename")
5496 self.instance = instance
5498 new_name = self.op.new_name
5499 if self.op.name_check:
5500 hostname = netutils.GetHostname(name=new_name)
5501 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
5503 new_name = self.op.new_name = hostname.name
5504 if (self.op.ip_check and
5505 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
5506 raise errors.OpPrereqError("IP %s of instance %s already in use" %
5507 (hostname.ip, new_name),
5508 errors.ECODE_NOTUNIQUE)
5510 instance_list = self.cfg.GetInstanceList()
5511 if new_name in instance_list and new_name != instance.name:
5512 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
5513 new_name, errors.ECODE_EXISTS)
5515 def Exec(self, feedback_fn):
5516 """Rename the instance.
5519 inst = self.instance
5520 old_name = inst.name
5522 rename_file_storage = False
5523 if (inst.disk_template == constants.DT_FILE and
5524 self.op.new_name != inst.name):
5525 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5526 rename_file_storage = True
5528 self.cfg.RenameInstance(inst.name, self.op.new_name)
5529 # Change the instance lock. This is definitely safe while we hold the BGL
5530 self.context.glm.remove(locking.LEVEL_INSTANCE, old_name)
5531 self.context.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
5533 # re-read the instance from the configuration after rename
5534 inst = self.cfg.GetInstanceInfo(self.op.new_name)
5536 if rename_file_storage:
5537 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5538 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
5539 old_file_storage_dir,
5540 new_file_storage_dir)
5541 result.Raise("Could not rename on node %s directory '%s' to '%s'"
5542 " (but the instance has been renamed in Ganeti)" %
5543 (inst.primary_node, old_file_storage_dir,
5544 new_file_storage_dir))
5546 _StartInstanceDisks(self, inst, None)
5547 try:
5548 result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
5549 old_name, self.op.debug_level)
5550 msg = result.fail_msg
5551 if msg:
5552 msg = ("Could not run OS rename script for instance %s on node %s"
5553 " (but the instance has been renamed in Ganeti): %s" %
5554 (inst.name, inst.primary_node, msg))
5555 self.proc.LogWarning(msg)
5556 finally:
5557 _ShutdownInstanceDisks(self, inst)
5559 return inst.name
5562 class LUInstanceRemove(LogicalUnit):
5563 """Remove an instance.
5566 HPATH = "instance-remove"
5567 HTYPE = constants.HTYPE_INSTANCE
5568 REQ_BGL = False
5570 def ExpandNames(self):
5571 self._ExpandAndLockInstance()
5572 self.needed_locks[locking.LEVEL_NODE] = []
5573 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5575 def DeclareLocks(self, level):
5576 if level == locking.LEVEL_NODE:
5577 self._LockInstancesNodes()
5579 def BuildHooksEnv(self):
5580 """Build hooks env.
5582 This runs on master, primary and secondary nodes of the instance.
5584 """
5585 env = _BuildInstanceHookEnvByObject(self, self.instance)
5586 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
5587 nl = [self.cfg.GetMasterNode()]
5588 nl_post = list(self.instance.all_nodes) + nl
5589 return env, nl, nl_post
5591 def CheckPrereq(self):
5592 """Check prerequisites.
5594 This checks that the instance is in the cluster.
5596 """
5597 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5598 assert self.instance is not None, \
5599 "Cannot retrieve locked instance %s" % self.op.instance_name
5601 def Exec(self, feedback_fn):
5602 """Remove the instance.
5605 instance = self.instance
5606 logging.info("Shutting down instance %s on node %s",
5607 instance.name, instance.primary_node)
5609 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
5610 self.op.shutdown_timeout)
5611 msg = result.fail_msg
5612 if msg:
5613 if self.op.ignore_failures:
5614 feedback_fn("Warning: can't shutdown instance: %s" % msg)
5616 raise errors.OpExecError("Could not shutdown instance %s on"
5618 (instance.name, instance.primary_node, msg))
5620 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
5623 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
5624 """Utility function to remove an instance.
5627 logging.info("Removing block devices for instance %s", instance.name)
5629 if not _RemoveDisks(lu, instance):
5630 if not ignore_failures:
5631 raise errors.OpExecError("Can't remove instance's disks")
5632 feedback_fn("Warning: can't remove instance's disks")
5634 logging.info("Removing instance %s out of cluster config", instance.name)
5636 lu.cfg.RemoveInstance(instance.name)
5638 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
5639 "Instance lock removal conflict"
5641 # Remove lock for the instance
5642 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
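# Registering the lock in lu.remove_locks (instead of releasing it here)
# lets the lock be dropped only once the whole LU has finished; the exact
# cleanup point is handled by the surrounding processor machinery (an
# assumption, not visible in this excerpt).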
5645 class LUInstanceQuery(NoHooksLU):
5646 """Logical unit for querying instances.
5649 # pylint: disable-msg=W0142
5650 REQ_BGL = False
5652 def CheckArguments(self):
5653 self.iq = _InstanceQuery(self.op.names, self.op.output_fields,
5654 self.op.use_locking)
5656 def ExpandNames(self):
5657 self.iq.ExpandNames(self)
5659 def DeclareLocks(self, level):
5660 self.iq.DeclareLocks(self, level)
5662 def Exec(self, feedback_fn):
5663 return self.iq.OldStyleQuery(self)
5666 class LUInstanceFailover(LogicalUnit):
5667 """Failover an instance.
5670 HPATH = "instance-failover"
5671 HTYPE = constants.HTYPE_INSTANCE
5672 REQ_BGL = False
5674 def ExpandNames(self):
5675 self._ExpandAndLockInstance()
5676 self.needed_locks[locking.LEVEL_NODE] = []
5677 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5679 def DeclareLocks(self, level):
5680 if level == locking.LEVEL_NODE:
5681 self._LockInstancesNodes()
5683 def BuildHooksEnv(self):
5684 """Build hooks env.
5686 This runs on master, primary and secondary nodes of the instance.
5688 """
5689 instance = self.instance
5690 source_node = instance.primary_node
5691 target_node = instance.secondary_nodes[0]
5693 "IGNORE_CONSISTENCY": self.op.ignore_consistency,
5694 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5695 "OLD_PRIMARY": source_node,
5696 "OLD_SECONDARY": target_node,
5697 "NEW_PRIMARY": target_node,
5698 "NEW_SECONDARY": source_node,
5700 env.update(_BuildInstanceHookEnvByObject(self, instance))
5701 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5702 nl_post = list(nl)
5703 nl_post.append(source_node)
5704 return env, nl, nl_post
5706 def CheckPrereq(self):
5707 """Check prerequisites.
5709 This checks that the instance is in the cluster.
5711 """
5712 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5713 assert self.instance is not None, \
5714 "Cannot retrieve locked instance %s" % self.op.instance_name
5716 bep = self.cfg.GetClusterInfo().FillBE(instance)
5717 if instance.disk_template not in constants.DTS_NET_MIRROR:
5718 raise errors.OpPrereqError("Instance's disk layout is not"
5719 " network mirrored, cannot failover.",
5722 secondary_nodes = instance.secondary_nodes
5723 if not secondary_nodes:
5724 raise errors.ProgrammerError("no secondary node but using "
5725 "a mirrored disk template")
5727 target_node = secondary_nodes[0]
5728 _CheckNodeOnline(self, target_node)
5729 _CheckNodeNotDrained(self, target_node)
5730 if instance.admin_up:
5731 # check memory requirements on the secondary node
5732 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
5733 instance.name, bep[constants.BE_MEMORY],
5734 instance.hypervisor)
5735 else:
5736 self.LogInfo("Not checking memory on the secondary node as"
5737 " instance will not be started")
5739 # check bridge existence
5740 _CheckInstanceBridgesExist(self, instance, node=target_node)
5742 def Exec(self, feedback_fn):
5743 """Failover an instance.
5745 The failover is done by shutting it down on its present node and
5746 starting it on the secondary.
5748 """
5749 instance = self.instance
5750 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
5752 source_node = instance.primary_node
5753 target_node = instance.secondary_nodes[0]
5755 if instance.admin_up:
5756 feedback_fn("* checking disk consistency between source and target")
5757 for dev in instance.disks:
5758 # for drbd, these are drbd over lvm
5759 if not _CheckDiskConsistency(self, dev, target_node, False):
5760 if not self.op.ignore_consistency:
5761 raise errors.OpExecError("Disk %s is degraded on target node,"
5762 " aborting failover." % dev.iv_name)
5763 else:
5764 feedback_fn("* not checking disk consistency as instance is not running")
5766 feedback_fn("* shutting down instance on source node")
5767 logging.info("Shutting down instance %s on node %s",
5768 instance.name, source_node)
5770 result = self.rpc.call_instance_shutdown(source_node, instance,
5771 self.op.shutdown_timeout)
5772 msg = result.fail_msg
5773 if msg:
5774 if self.op.ignore_consistency or primary_node.offline:
5775 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5776 " Proceeding anyway. Please make sure node"
5777 " %s is down. Error details: %s",
5778 instance.name, source_node, source_node, msg)
5779 else:
5780 raise errors.OpExecError("Could not shutdown instance %s on"
5781 " node %s: %s" %
5782 (instance.name, source_node, msg))
5784 feedback_fn("* deactivating the instance's disks on source node")
5785 if not _ShutdownInstanceDisks(self, instance, ignore_primary=True):
5786 raise errors.OpExecError("Can't shut down the instance's disks.")
5788 instance.primary_node = target_node
5789 # distribute new instance config to the other nodes
5790 self.cfg.Update(instance, feedback_fn)
5792 # Only start the instance if it's marked as up
5793 if instance.admin_up:
5794 feedback_fn("* activating the instance's disks on target node")
5795 logging.info("Starting instance %s on node %s",
5796 instance.name, target_node)
5798 disks_ok, _ = _AssembleInstanceDisks(self, instance,
5799 ignore_secondaries=True)
5800 if not disks_ok:
5801 _ShutdownInstanceDisks(self, instance)
5802 raise errors.OpExecError("Can't activate the instance's disks")
5804 feedback_fn("* starting the instance on the target node")
5805 result = self.rpc.call_instance_start(target_node, instance, None, None)
5806 msg = result.fail_msg
5807 if msg:
5808 _ShutdownInstanceDisks(self, instance)
5809 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
5810 (instance.name, target_node, msg))
5813 class LUInstanceMigrate(LogicalUnit):
5814 """Migrate an instance.
5816 This is migration without shutting down, compared to the failover,
5817 which is done with shutdown.
5819 """
5820 HPATH = "instance-migrate"
5821 HTYPE = constants.HTYPE_INSTANCE
5822 REQ_BGL = False
5824 def ExpandNames(self):
5825 self._ExpandAndLockInstance()
5827 self.needed_locks[locking.LEVEL_NODE] = []
5828 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5830 self._migrater = TLMigrateInstance(self, self.op.instance_name,
5831 self.op.cleanup)
5832 self.tasklets = [self._migrater]
5834 def DeclareLocks(self, level):
5835 if level == locking.LEVEL_NODE:
5836 self._LockInstancesNodes()
5838 def BuildHooksEnv(self):
5839 """Build hooks env.
5841 This runs on master, primary and secondary nodes of the instance.
5843 """
5844 instance = self._migrater.instance
5845 source_node = instance.primary_node
5846 target_node = instance.secondary_nodes[0]
5847 env = _BuildInstanceHookEnvByObject(self, instance)
5848 env["MIGRATE_LIVE"] = self._migrater.live
5849 env["MIGRATE_CLEANUP"] = self.op.cleanup
5851 "OLD_PRIMARY": source_node,
5852 "OLD_SECONDARY": target_node,
5853 "NEW_PRIMARY": target_node,
5854 "NEW_SECONDARY": source_node,
5856 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
5857 nl_post = list(nl)
5858 nl_post.append(source_node)
5859 return env, nl, nl_post
5862 class LUInstanceMove(LogicalUnit):
5863 """Move an instance by data-copying.
5866 HPATH = "instance-move"
5867 HTYPE = constants.HTYPE_INSTANCE
5868 REQ_BGL = False
5870 def ExpandNames(self):
5871 self._ExpandAndLockInstance()
5872 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
5873 self.op.target_node = target_node
5874 self.needed_locks[locking.LEVEL_NODE] = [target_node]
5875 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
5877 def DeclareLocks(self, level):
5878 if level == locking.LEVEL_NODE:
5879 self._LockInstancesNodes(primary_only=True)
5881 def BuildHooksEnv(self):
5882 """Build hooks env.
5884 This runs on master, primary and secondary nodes of the instance.
5886 """
5887 env = {
5888 "TARGET_NODE": self.op.target_node,
5889 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5891 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5892 nl = [self.cfg.GetMasterNode()] + [self.instance.primary_node,
5893 self.op.target_node]
5895 return env, nl, nl
5896 def CheckPrereq(self):
5897 """Check prerequisites.
5899 This checks that the instance is in the cluster.
5901 """
5902 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5903 assert self.instance is not None, \
5904 "Cannot retrieve locked instance %s" % self.op.instance_name
5906 node = self.cfg.GetNodeInfo(self.op.target_node)
5907 assert node is not None, \
5908 "Cannot retrieve locked node %s" % self.op.target_node
5910 self.target_node = target_node = node.name
5912 if target_node == instance.primary_node:
5913 raise errors.OpPrereqError("Instance %s is already on the node %s" %
5914 (instance.name, target_node),
5915 errors.ECODE_STATE)
5917 bep = self.cfg.GetClusterInfo().FillBE(instance)
5919 for idx, dsk in enumerate(instance.disks):
5920 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
5921 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
5922 " cannot copy" % idx, errors.ECODE_STATE)
5924 _CheckNodeOnline(self, target_node)
5925 _CheckNodeNotDrained(self, target_node)
5926 _CheckNodeVmCapable(self, target_node)
5928 if instance.admin_up:
5929 # check memory requirements on the target node
5930 _CheckNodeFreeMemory(self, target_node, "moving instance %s" %
5931 instance.name, bep[constants.BE_MEMORY],
5932 instance.hypervisor)
5933 else:
5934 self.LogInfo("Not checking memory on the secondary node as"
5935 " instance will not be started")
5937 # check bridge existence
5938 _CheckInstanceBridgesExist(self, instance, node=target_node)
5940 def Exec(self, feedback_fn):
5941 """Move an instance.
5943 The move is done by shutting it down on its present node, copying
5944 the data over (slow) and starting it on the new node.
5946 """
5947 instance = self.instance
5949 source_node = instance.primary_node
5950 target_node = self.target_node
5952 self.LogInfo("Shutting down instance %s on source node %s",
5953 instance.name, source_node)
5955 result = self.rpc.call_instance_shutdown(source_node, instance,
5956 self.op.shutdown_timeout)
5957 msg = result.fail_msg
5958 if msg:
5959 if self.op.ignore_consistency:
5960 self.proc.LogWarning("Could not shutdown instance %s on node %s."
5961 " Proceeding anyway. Please make sure node"
5962 " %s is down. Error details: %s",
5963 instance.name, source_node, source_node, msg)
5964 else:
5965 raise errors.OpExecError("Could not shutdown instance %s on"
5966 " node %s: %s" %
5967 (instance.name, source_node, msg))
5969 # create the target disks
5970 try:
5971 _CreateDisks(self, instance, target_node=target_node)
5972 except errors.OpExecError:
5973 self.LogWarning("Device creation failed, reverting...")
5974 try:
5975 _RemoveDisks(self, instance, target_node=target_node)
5976 finally:
5977 self.cfg.ReleaseDRBDMinors(instance.name)
5978 raise
5980 cluster_name = self.cfg.GetClusterInfo().cluster_name
5982 errs = []
5983 # activate, get path, copy the data over
5984 for idx, disk in enumerate(instance.disks):
5985 self.LogInfo("Copying data for disk %d", idx)
5986 result = self.rpc.call_blockdev_assemble(target_node, disk,
5987 instance.name, True, idx)
5988 if result.fail_msg:
5989 self.LogWarning("Can't assemble newly created disk %d: %s",
5990 idx, result.fail_msg)
5991 errs.append(result.fail_msg)
5992 break
5993 dev_path = result.payload
5994 result = self.rpc.call_blockdev_export(source_node, disk,
5995 target_node, dev_path,
5996 cluster_name)
5997 if result.fail_msg:
5998 self.LogWarning("Can't copy data over for disk %d: %s",
5999 idx, result.fail_msg)
6000 errs.append(result.fail_msg)
6001 break
6003 if errs:
6004 self.LogWarning("Some disks failed to copy, aborting")
6005 try:
6006 _RemoveDisks(self, instance, target_node=target_node)
6007 finally:
6008 self.cfg.ReleaseDRBDMinors(instance.name)
6009 raise errors.OpExecError("Errors during disk copy: %s" %
6010 (",".join(errs),))
6012 instance.primary_node = target_node
6013 self.cfg.Update(instance, feedback_fn)
6015 self.LogInfo("Removing the disks on the original node")
6016 _RemoveDisks(self, instance, target_node=source_node)
6018 # Only start the instance if it's marked as up
6019 if instance.admin_up:
6020 self.LogInfo("Starting instance %s on node %s",
6021 instance.name, target_node)
6023 disks_ok, _ = _AssembleInstanceDisks(self, instance,
6024 ignore_secondaries=True)
6026 _ShutdownInstanceDisks(self, instance)
6027 raise errors.OpExecError("Can't activate the instance's disks")
6029 result = self.rpc.call_instance_start(target_node, instance, None, None)
6030 msg = result.fail_msg
6032 _ShutdownInstanceDisks(self, instance)
6033 raise errors.OpExecError("Could not start instance %s on node %s: %s" %
6034 (instance.name, target_node, msg))
6037 class LUNodeMigrate(LogicalUnit):
6038 """Migrate all instances from a node.
6041 HPATH = "node-migrate"
6042 HTYPE = constants.HTYPE_NODE
6043 REQ_BGL = False
6045 def ExpandNames(self):
6046 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
6048 self.needed_locks = {
6049 locking.LEVEL_NODE: [self.op.node_name],
6050 }
6052 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6054 # Create tasklets for migrating instances for all instances on this node
6055 names = []
6056 tasklets = []
6058 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
6059 logging.debug("Migrating instance %s", inst.name)
6060 names.append(inst.name)
6062 tasklets.append(TLMigrateInstance(self, inst.name, False))
6064 self.tasklets = tasklets
6066 # Declare instance locks
6067 self.needed_locks[locking.LEVEL_INSTANCE] = names
6069 def DeclareLocks(self, level):
6070 if level == locking.LEVEL_NODE:
6071 self._LockInstancesNodes()
6073 def BuildHooksEnv(self):
6074 """Build hooks env.
6076 This runs on the master, the primary and all the secondaries.
6078 """
6079 env = {
6080 "NODE_NAME": self.op.node_name,
6083 nl = [self.cfg.GetMasterNode()]
6085 return (env, nl, nl)
6088 class TLMigrateInstance(Tasklet):
6089 """Tasklet class for instance migration.
6092 @ivar live: whether the migration will be done live or non-live;
6093 this variable is initialized only after CheckPrereq has run
6095 """
6096 def __init__(self, lu, instance_name, cleanup):
6097 """Initializes this class.
6100 Tasklet.__init__(self, lu)
6103 self.instance_name = instance_name
6104 self.cleanup = cleanup
6105 self.live = False # will be overridden later
6107 def CheckPrereq(self):
6108 """Check prerequisites.
6110 This checks that the instance is in the cluster.
6112 """
6113 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
6114 instance = self.cfg.GetInstanceInfo(instance_name)
6115 assert instance is not None
6117 if instance.disk_template != constants.DT_DRBD8:
6118 raise errors.OpPrereqError("Instance's disk layout is not"
6119 " drbd8, cannot migrate.", errors.ECODE_STATE)
6121 secondary_nodes = instance.secondary_nodes
6122 if not secondary_nodes:
6123 raise errors.ConfigurationError("No secondary node but using"
6124 " drbd8 disk template")
6126 i_be = self.cfg.GetClusterInfo().FillBE(instance)
6128 target_node = secondary_nodes[0]
6129 # check memory requirements on the secondary node
6130 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
6131 instance.name, i_be[constants.BE_MEMORY],
6132 instance.hypervisor)
6134 # check bridge existence
6135 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
6137 if not self.cleanup:
6138 _CheckNodeNotDrained(self.lu, target_node)
6139 result = self.rpc.call_instance_migratable(instance.primary_node,
6140 instance)
6141 result.Raise("Can't migrate, please use failover",
6142 prereq=True, ecode=errors.ECODE_STATE)
6144 self.instance = instance
6146 if self.lu.op.live is not None and self.lu.op.mode is not None:
6147 raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
6148 " parameters are accepted",
6150 if self.lu.op.live is not None:
6151 if self.lu.op.live:
6152 self.lu.op.mode = constants.HT_MIGRATION_LIVE
6153 else:
6154 self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
6155 # reset the 'live' parameter to None so that repeated
6156 # invocations of CheckPrereq do not raise an exception
6157 self.lu.op.live = None
6158 elif self.lu.op.mode is None:
6159 # read the default value from the hypervisor
6160 i_hv = self.cfg.GetClusterInfo().FillHV(instance, skip_globals=False)
6161 self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]
6163 self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
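# Summary of the mode resolution implemented above (an illustration derived
# from the branches in this method, not part of the original code):
#   op.live=True,  op.mode=None  -> HT_MIGRATION_LIVE
#   op.live=False, op.mode=None  -> HT_MIGRATION_NONLIVE
#   op.live=None,  op.mode set   -> the requested mode is kept
#   op.live=None,  op.mode=None  -> the hypervisor's HV_MIGRATION_MODE default
#   both set                     -> OpPrereqError in CheckPrereq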
6165 def _WaitUntilSync(self):
6166 """Poll with custom rpc for disk sync.
6168 This uses our own step-based rpc call.
6170 """
6171 self.feedback_fn("* wait until resync is done")
6172 all_done = False
6173 while not all_done:
6174 all_done = True
6175 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
6176 self.nodes_ip,
6177 self.instance.disks)
6178 min_percent = 100
6179 for node, nres in result.items():
6180 nres.Raise("Cannot resync disks on node %s" % node)
6181 node_done, node_percent = nres.payload
6182 all_done = all_done and node_done
6183 if node_percent is not None:
6184 min_percent = min(min_percent, node_percent)
6185 if not all_done:
6186 if min_percent < 100:
6187 self.feedback_fn(" - progress: %.1f%%" % min_percent)
6188 time.sleep(2)
6190 def _EnsureSecondary(self, node):
6191 """Demote a node to secondary.
6194 self.feedback_fn("* switching node %s to secondary mode" % node)
6196 for dev in self.instance.disks:
6197 self.cfg.SetDiskID(dev, node)
6199 result = self.rpc.call_blockdev_close(node, self.instance.name,
6200 self.instance.disks)
6201 result.Raise("Cannot change disk to secondary on node %s" % node)
6203 def _GoStandalone(self):
6204 """Disconnect from the network.
6207 self.feedback_fn("* changing into standalone mode")
6208 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
6209 self.instance.disks)
6210 for node, nres in result.items():
6211 nres.Raise("Cannot disconnect disks node %s" % node)
6213 def _GoReconnect(self, multimaster):
6214 """Reconnect to the network.
6220 msg = "single-master"
6221 self.feedback_fn("* changing disks into %s mode" % msg)
6222 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
6223 self.instance.disks,
6224 self.instance.name, multimaster)
6225 for node, nres in result.items():
6226 nres.Raise("Cannot change disks config on node %s" % node)
6228 def _ExecCleanup(self):
6229 """Try to cleanup after a failed migration.
6231 The cleanup is done by:
6232 - check that the instance is running only on one node
6233 (and update the config if needed)
6234 - change disks on its secondary node to secondary
6235 - wait until disks are fully synchronized
6236 - disconnect from the network
6237 - change disks into single-master mode
6238 - wait again until disks are fully synchronized
6240 """
6241 instance = self.instance
6242 target_node = self.target_node
6243 source_node = self.source_node
6245 # check running on only one node
6246 self.feedback_fn("* checking where the instance actually runs"
6247 " (if this hangs, the hypervisor might be in"
6249 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
6250 for node, result in ins_l.items():
6251 result.Raise("Can't contact node %s" % node)
6253 runningon_source = instance.name in ins_l[source_node].payload
6254 runningon_target = instance.name in ins_l[target_node].payload
6256 if runningon_source and runningon_target:
6257 raise errors.OpExecError("Instance seems to be running on two nodes,"
6258 " or the hypervisor is confused. You will have"
6259 " to ensure manually that it runs only on one"
6260 " and restart this operation.")
6262 if not (runningon_source or runningon_target):
6263 raise errors.OpExecError("Instance does not seem to be running at all."
6264 " In this case, it's safer to repair by"
6265 " running 'gnt-instance stop' to ensure disk"
6266 " shutdown, and then restarting it.")
6268 if runningon_target:
6269 # the migration has actually succeeded, we need to update the config
6270 self.feedback_fn("* instance running on secondary node (%s),"
6271 " updating config" % target_node)
6272 instance.primary_node = target_node
6273 self.cfg.Update(instance, self.feedback_fn)
6274 demoted_node = source_node
6275 else:
6276 self.feedback_fn("* instance confirmed to be running on its"
6277 " primary node (%s)" % source_node)
6278 demoted_node = target_node
6280 self._EnsureSecondary(demoted_node)
6281 try:
6282 self._WaitUntilSync()
6283 except errors.OpExecError:
6284 # we ignore here errors, since if the device is standalone, it
6285 # won't be able to sync
6286 pass
6287 self._GoStandalone()
6288 self._GoReconnect(False)
6289 self._WaitUntilSync()
6291 self.feedback_fn("* done")
6293 def _RevertDiskStatus(self):
6294 """Try to revert the disk status after a failed migration.
6297 target_node = self.target_node
6298 try:
6299 self._EnsureSecondary(target_node)
6300 self._GoStandalone()
6301 self._GoReconnect(False)
6302 self._WaitUntilSync()
6303 except errors.OpExecError, err:
6304 self.lu.LogWarning("Migration failed and I can't reconnect the"
6305 " drives: error '%s'\n"
6306 "Please look and recover the instance status" %
6309 def _AbortMigration(self):
6310 """Call the hypervisor code to abort a started migration.
6313 instance = self.instance
6314 target_node = self.target_node
6315 migration_info = self.migration_info
6317 abort_result = self.rpc.call_finalize_migration(target_node,
6318 instance,
6319 migration_info,
6320 False)
6321 abort_msg = abort_result.fail_msg
6322 if abort_msg:
6323 logging.error("Aborting migration failed on target node %s: %s",
6324 target_node, abort_msg)
6325 # Don't raise an exception here, as we still have to try to revert the
6326 # disk status, even if this step failed.
6328 def _ExecMigration(self):
6329 """Migrate an instance.
6331 The migration is done by:
6332 - change the disks into dual-master mode
6333 - wait until disks are fully synchronized again
6334 - migrate the instance
6335 - change disks on the new secondary node (the old primary) to secondary
6336 - wait until disks are fully synchronized
6337 - change disks into single-master mode
6339 """
6340 instance = self.instance
6341 target_node = self.target_node
6342 source_node = self.source_node
6344 self.feedback_fn("* checking disk consistency between source and target")
6345 for dev in instance.disks:
6346 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
6347 raise errors.OpExecError("Disk %s is degraded or not fully"
6348 " synchronized on target node,"
6349 " aborting migrate." % dev.iv_name)
6351 # First get the migration information from the remote node
6352 result = self.rpc.call_migration_info(source_node, instance)
6353 msg = result.fail_msg
6354 if msg:
6355 log_err = ("Failed fetching source migration information from %s: %s" %
6356 (source_node, msg))
6357 logging.error(log_err)
6358 raise errors.OpExecError(log_err)
6360 self.migration_info = migration_info = result.payload
6362 # Then switch the disks to master/master mode
6363 self._EnsureSecondary(target_node)
6364 self._GoStandalone()
6365 self._GoReconnect(True)
6366 self._WaitUntilSync()
6368 self.feedback_fn("* preparing %s to accept the instance" % target_node)
6369 result = self.rpc.call_accept_instance(target_node,
6370 instance,
6371 migration_info,
6372 self.nodes_ip[target_node])
6374 msg = result.fail_msg
6375 if msg:
6376 logging.error("Instance pre-migration failed, trying to revert"
6377 " disk status: %s", msg)
6378 self.feedback_fn("Pre-migration failed, aborting")
6379 self._AbortMigration()
6380 self._RevertDiskStatus()
6381 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
6382 (instance.name, msg))
6384 self.feedback_fn("* migrating instance to %s" % target_node)
6386 result = self.rpc.call_instance_migrate(source_node, instance,
6387 self.nodes_ip[target_node],
6388 self.live)
6389 msg = result.fail_msg
6390 if msg:
6391 logging.error("Instance migration failed, trying to revert"
6392 " disk status: %s", msg)
6393 self.feedback_fn("Migration failed, aborting")
6394 self._AbortMigration()
6395 self._RevertDiskStatus()
6396 raise errors.OpExecError("Could not migrate instance %s: %s" %
6397 (instance.name, msg))
6400 instance.primary_node = target_node
6401 # distribute new instance config to the other nodes
6402 self.cfg.Update(instance, self.feedback_fn)
6404 result = self.rpc.call_finalize_migration(target_node,
6405 instance,
6406 migration_info,
6407 True)
6408 msg = result.fail_msg
6409 if msg:
6410 logging.error("Instance migration succeeded, but finalization failed:"
6411 " %s", msg)
6412 raise errors.OpExecError("Could not finalize instance migration: %s" %
6413 msg)
6415 self._EnsureSecondary(source_node)
6416 self._WaitUntilSync()
6417 self._GoStandalone()
6418 self._GoReconnect(False)
6419 self._WaitUntilSync()
6421 self.feedback_fn("* done")
6423 def Exec(self, feedback_fn):
6424 """Perform the migration.
6427 feedback_fn("Migrating instance %s" % self.instance.name)
6429 self.feedback_fn = feedback_fn
6431 self.source_node = self.instance.primary_node
6432 self.target_node = self.instance.secondary_nodes[0]
6433 self.all_nodes = [self.source_node, self.target_node]
6434 self.nodes_ip = {
6435 self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
6436 self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
6437 }
6439 if self.cleanup:
6440 return self._ExecCleanup()
6441 else:
6442 return self._ExecMigration()
6445 def _CreateBlockDev(lu, node, instance, device, force_create,
6446 info, force_open):
6447 """Create a tree of block devices on a given node.
6449 If this device type has to be created on secondaries, create it and
6450 all its children.
6452 If not, just recurse to children keeping the same 'force' value.
6454 @param lu: the lu on whose behalf we execute
6455 @param node: the node on which to create the device
6456 @type instance: L{objects.Instance}
6457 @param instance: the instance which owns the device
6458 @type device: L{objects.Disk}
6459 @param device: the device to create
6460 @type force_create: boolean
6461 @param force_create: whether to force creation of this device; this
6462 will be changed to True whenever we find a device which has
6463 CreateOnSecondary() attribute
6464 @param info: the extra 'metadata' we should attach to the device
6465 (this will be represented as a LVM tag)
6466 @type force_open: boolean
6467 @param force_open: this parameter will be passed to the
6468 L{backend.BlockdevCreate} function where it specifies
6469 whether we run on primary or not, and it affects both
6470 the child assembly and the device's own Open() execution
6472 """
6473 if device.CreateOnSecondary():
6474 force_create = True
6476 if device.children:
6477 for child in device.children:
6478 _CreateBlockDev(lu, node, instance, child, force_create,
6479 info, force_open)
6481 if not force_create:
6482 return
6484 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
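# Illustrative walk-through (a sketch, assuming CreateOnSecondary() holds for
# the usual LV/DRBD8 device types as defined in objects.py): for a DRBD8
# disk, the two backing LVs (its children) are first created recursively with
# force_create=True, and then the DRBD8 device itself is created on top of
# them via _CreateSingleBlockDev with the same info/force_open values.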
6487 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
6488 """Create a single block device on a given node.
6490 This will not recurse over children of the device, so they must be
6491 created in advance.
6493 @param lu: the lu on whose behalf we execute
6494 @param node: the node on which to create the device
6495 @type instance: L{objects.Instance}
6496 @param instance: the instance which owns the device
6497 @type device: L{objects.Disk}
6498 @param device: the device to create
6499 @param info: the extra 'metadata' we should attach to the device
6500 (this will be represented as a LVM tag)
6501 @type force_open: boolean
6502 @param force_open: this parameter will be passed to the
6503 L{backend.BlockdevCreate} function where it specifies
6504 whether we run on primary or not, and it affects both
6505 the child assembly and the device's own Open() execution
6507 """
6508 lu.cfg.SetDiskID(device, node)
6509 result = lu.rpc.call_blockdev_create(node, device, device.size,
6510 instance.name, force_open, info)
6511 result.Raise("Can't create block device %s on"
6512 " node %s for instance %s" % (device, node, instance.name))
6513 if device.physical_id is None:
6514 device.physical_id = result.payload
6517 def _GenerateUniqueNames(lu, exts):
6518 """Generate a suitable LV name.
6520 This will generate a logical volume name for the given instance.
6522 """
6523 results = []
6524 for val in exts:
6525 new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
6526 results.append("%s%s" % (new_id, val))
6527 return results
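# Illustrative result (the unique ids below are made up): for
# exts == [".disk0", ".disk1"] this could return something like
# ["3b1f9c.disk0", "a77e42.disk1"] -- a fresh unique id from the
# configuration with each extension appended verbatim.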
6530 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
6531 iv_name, p_minor, s_minor):
6532 """Generate a drbd8 device complete with its children.
6535 assert len(vgnames) == len(names) == 2
6536 port = lu.cfg.AllocatePort()
6537 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
6538 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
6539 logical_id=(vgnames[0], names[0]))
6540 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
6541 logical_id=(vgnames[1], names[1]))
6542 drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
6543 logical_id=(primary, secondary, port,
6544 p_minor, s_minor,
6545 shared_secret),
6546 children=[dev_data, dev_meta],
6547 iv_name=iv_name)
6548 return drbd_dev
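# Sketch of the device tree built above (one DRBD8 disk):
#
#   DRBD8  logical_id=(primary, secondary, port, p_minor, s_minor, secret)
#    |- LV data  logical_id=(vgnames[0], names[0]), size=size
#    `- LV meta  logical_id=(vgnames[1], names[1]), size=128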
6551 def _GenerateDiskTemplate(lu, template_name,
6552 instance_name, primary_node,
6553 secondary_nodes, disk_info,
6554 file_storage_dir, file_driver,
6555 base_index, feedback_fn):
6556 """Generate the entire disk layout for a given template type.
6559 #TODO: compute space requirements
6561 vgname = lu.cfg.GetVGName()
6562 disk_count = len(disk_info)
6563 disks = []
6564 if template_name == constants.DT_DISKLESS:
6565 pass
6566 elif template_name == constants.DT_PLAIN:
6567 if len(secondary_nodes) != 0:
6568 raise errors.ProgrammerError("Wrong template configuration")
6570 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6571 for i in range(disk_count)])
6572 for idx, disk in enumerate(disk_info):
6573 disk_index = idx + base_index
6574 vg = disk.get("vg", vgname)
6575 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
6576 disk_dev = objects.Disk(dev_type=constants.LD_LV, size=disk["size"],
6577 logical_id=(vg, names[idx]),
6578 iv_name="disk/%d" % disk_index,
6579 mode=disk["mode"])
6580 disks.append(disk_dev)
6581 elif template_name == constants.DT_DRBD8:
6582 if len(secondary_nodes) != 1:
6583 raise errors.ProgrammerError("Wrong template configuration")
6584 remote_node = secondary_nodes[0]
6585 minors = lu.cfg.AllocateDRBDMinor(
6586 [primary_node, remote_node] * len(disk_info), instance_name)
6588 names = []
6589 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
6590 for i in range(disk_count)]):
6591 names.append(lv_prefix + "_data")
6592 names.append(lv_prefix + "_meta")
6593 for idx, disk in enumerate(disk_info):
6594 disk_index = idx + base_index
6595 vg = disk.get("vg", vgname)
6596 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
6597 disk["size"], [vg, vg],
6598 names[idx*2:idx*2+2],
6599 "disk/%d" % disk_index,
6600 minors[idx*2], minors[idx*2+1])
6601 disk_dev.mode = disk["mode"]
6602 disks.append(disk_dev)
6603 elif template_name == constants.DT_FILE:
6604 if len(secondary_nodes) != 0:
6605 raise errors.ProgrammerError("Wrong template configuration")
6607 opcodes.RequireFileStorage()
6609 for idx, disk in enumerate(disk_info):
6610 disk_index = idx + base_index
6611 disk_dev = objects.Disk(dev_type=constants.LD_FILE, size=disk["size"],
6612 iv_name="disk/%d" % disk_index,
6613 logical_id=(file_driver,
6614 "%s/disk%d" % (file_storage_dir,
6617 disks.append(disk_dev)
6618 else:
6619 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
6621 return disks
6623 def _GetInstanceInfoText(instance):
6624 """Compute that text that should be added to the disk's metadata.
6627 return "originstname+%s" % instance.name
6630 def _CalcEta(time_taken, written, total_size):
6631 """Calculates the ETA based on size written and total size.
6633 @param time_taken: The time taken so far
6634 @param written: amount written so far
6635 @param total_size: The total size of data to be written
6636 @return: The remaining time in seconds
6638 """
6639 avg_time = time_taken / float(written)
6640 return (total_size - written) * avg_time
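# Worked example (illustrative numbers): after writing 200 MiB of a
# 1000 MiB disk in 50 seconds the average cost is 0.25 s/MiB, so
#   _CalcEta(50, 200, 1000) == (1000 - 200) * (50 / 200.0) == 200.0
# i.e. roughly 200 seconds remain.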
6643 def _WipeDisks(lu, instance):
6644 """Wipes instance disks.
6646 @type lu: L{LogicalUnit}
6647 @param lu: the logical unit on whose behalf we execute
6648 @type instance: L{objects.Instance}
6649 @param instance: the instance whose disks we should create
6650 @return: the success of the wipe
6652 """
6653 node = instance.primary_node
6655 for device in instance.disks:
6656 lu.cfg.SetDiskID(device, node)
6658 logging.info("Pause sync of instance %s disks", instance.name)
6659 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
6661 for idx, success in enumerate(result.payload):
6662 if not success:
6663 logging.warn("pause-sync of instance %s for disks %d failed",
6664 instance.name, idx)
6666 try:
6667 for idx, device in enumerate(instance.disks):
6668 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
6669 # at most MAX_WIPE_CHUNK
6670 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
6671 constants.MIN_WIPE_CHUNK_PERCENT)
6672 # we _must_ make this an int, otherwise rounding errors will
6673 # occur
6674 wipe_chunk_size = int(wipe_chunk_size)
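# Illustrative computation (the real values of MAX_WIPE_CHUNK and
# MIN_WIPE_CHUNK_PERCENT live in constants.py; 1024 and 10 are assumed here
# only for the example): a 102400 MiB disk gives
# min(1024, 102400 / 100.0 * 10) = 1024, so the cap applies, while a
# 5120 MiB disk gives min(1024, 512) = 512.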
6676 lu.LogInfo("* Wiping disk %d", idx)
6677 logging.info("Wiping disk %d for instance %s, node %s using"
6678 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
6683 start_time = time.time()
6685 while offset < size:
6686 wipe_size = min(wipe_chunk_size, size - offset)
6687 logging.debug("Wiping disk %d, offset %s, chunk %s",
6688 idx, offset, wipe_size)
6689 result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
6690 result.Raise("Could not wipe disk %d at offset %d for size %d" %
6691 (idx, offset, wipe_size))
6692 offset += wipe_size
6693 now = time.time()
6694 if now - last_output >= 60:
6695 eta = _CalcEta(now - start_time, offset, size)
6696 lu.LogInfo(" - done: %.1f%% ETA: %s" %
6697 (offset / float(size) * 100, utils.FormatSeconds(eta)))
6698 last_output = now
6699 finally:
6700 logging.info("Resume sync of instance %s disks", instance.name)
6702 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
6704 for idx, success in enumerate(result.payload):
6705 if not success:
6706 lu.LogWarning("Warning: Resume sync of disk %d failed. Please have a"
6707 " look at the status and troubleshoot the issue.", idx)
6708 logging.warn("resume-sync of instance %s for disks %d failed",
6709 instance.name, idx)
6712 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
6713 """Create all disks for an instance.
6715 This abstracts away some work from AddInstance.
6717 @type lu: L{LogicalUnit}
6718 @param lu: the logical unit on whose behalf we execute
6719 @type instance: L{objects.Instance}
6720 @param instance: the instance whose disks we should create
6721 @type to_skip: list
6722 @param to_skip: list of indices to skip
6723 @type target_node: string
6724 @param target_node: if passed, overrides the target node for creation
6726 @return: the success of the creation
6728 """
6729 info = _GetInstanceInfoText(instance)
6730 if target_node is None:
6731 pnode = instance.primary_node
6732 all_nodes = instance.all_nodes
6733 else:
6734 pnode = target_node
6735 all_nodes = [pnode]
6737 if instance.disk_template == constants.DT_FILE:
6738 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6739 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
6741 result.Raise("Failed to create directory '%s' on"
6742 " node %s" % (file_storage_dir, pnode))
6744 # Note: this needs to be kept in sync with adding of disks in
6745 # LUInstanceSetParams
6746 for idx, device in enumerate(instance.disks):
6747 if to_skip and idx in to_skip:
6748 continue
6749 logging.info("Creating volume %s for instance %s",
6750 device.iv_name, instance.name)
6752 for node in all_nodes:
6753 f_create = node == pnode
6754 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
6757 def _RemoveDisks(lu, instance, target_node=None):
6758 """Remove all disks for an instance.
6760 This abstracts away some work from `AddInstance()` and
6761 `RemoveInstance()`. Note that in case some of the devices couldn't
6762 be removed, the removal will continue with the other ones (compare
6763 with `_CreateDisks()`).
6765 @type lu: L{LogicalUnit}
6766 @param lu: the logical unit on whose behalf we execute
6767 @type instance: L{objects.Instance}
6768 @param instance: the instance whose disks we should remove
6769 @type target_node: string
6770 @param target_node: used to override the node on which to remove the disks
6772 @return: the success of the removal
6774 """
6775 logging.info("Removing block devices for instance %s", instance.name)
6777 all_result = True
6778 for device in instance.disks:
6779 if target_node:
6780 edata = [(target_node, device)]
6781 else:
6782 edata = device.ComputeNodeTree(instance.primary_node)
6783 for node, disk in edata:
6784 lu.cfg.SetDiskID(disk, node)
6785 msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
6786 if msg:
6787 lu.LogWarning("Could not remove block device %s on node %s,"
6788 " continuing anyway: %s", device.iv_name, node, msg)
6791 if instance.disk_template == constants.DT_FILE:
6792 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
6793 if target_node:
6794 tgt = target_node
6795 else:
6796 tgt = instance.primary_node
6797 result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
6798 if result.fail_msg:
6799 lu.LogWarning("Could not remove directory '%s' on node %s: %s",
6800 file_storage_dir, instance.primary_node, result.fail_msg)
6801 all_result = False
6803 return all_result
6806 def _ComputeDiskSizePerVG(disk_template, disks):
6807 """Compute disk size requirements in the volume group
6810 def _compute(disks, payload):
6811 """Universal algorithm
6816 vgs[disk["vg"]] = vgs.get("vg", 0) + disk["size"] + payload
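# Worked example (illustrative): with the drbd metadata payload of 128 used
# below, _compute([{"vg": "xenvg", "size": 1024},
#                  {"vg": "xenvg", "size": 2048}], 128)
# returns {"xenvg": (1024 + 128) + (2048 + 128)} == {"xenvg": 3328}.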
6820 # Required free disk space as a function of disk and swap space
6821 req_size_dict = {
6822 constants.DT_DISKLESS: {},
6823 constants.DT_PLAIN: _compute(disks, 0),
6824 # 128 MB are added for drbd metadata for each disk
6825 constants.DT_DRBD8: _compute(disks, 128),
6826 constants.DT_FILE: {},
6827 }
6829 if disk_template not in req_size_dict:
6830 raise errors.ProgrammerError("Disk template '%s' size requirement"
6831 " is unknown" % disk_template)
6833 return req_size_dict[disk_template]
6836 def _ComputeDiskSize(disk_template, disks):
6837 """Compute disk size requirements in the volume group
6840 # Required free disk space as a function of disk and swap space
6841 req_size_dict = {
6842 constants.DT_DISKLESS: None,
6843 constants.DT_PLAIN: sum(d["size"] for d in disks),
6844 # 128 MB are added for drbd metadata for each disk
6845 constants.DT_DRBD8: sum(d["size"] + 128 for d in disks),
6846 constants.DT_FILE: None,
6847 }
6849 if disk_template not in req_size_dict:
6850 raise errors.ProgrammerError("Disk template '%s' size requirement"
6851 " is unknown" % disk_template)
6853 return req_size_dict[disk_template]
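# Worked example (illustrative): for two DRBD8 disks of 1024 and 2048 MiB,
# _ComputeDiskSize(constants.DT_DRBD8, [{"size": 1024}, {"size": 2048}])
# == (1024 + 128) + (2048 + 128) == 3328 MiB of required free space.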
6856 def _FilterVmNodes(lu, nodenames):
6857 """Filters out non-vm_capable nodes from a list.
6859 @type lu: L{LogicalUnit}
6860 @param lu: the logical unit for which we check
6861 @type nodenames: list
6862 @param nodenames: the list of nodes on which we should check
6864 @return: the list of vm-capable nodes
6866 """
6867 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
6868 return [name for name in nodenames if name not in vm_nodes]
6871 def _CheckHVParams(lu, nodenames, hvname, hvparams):
6872 """Hypervisor parameter validation.
6874 This function abstracts the hypervisor parameter validation to be
6875 used in both instance create and instance modify.
6877 @type lu: L{LogicalUnit}
6878 @param lu: the logical unit for which we check
6879 @type nodenames: list
6880 @param nodenames: the list of nodes on which we should check
6881 @type hvname: string
6882 @param hvname: the name of the hypervisor we should use
6883 @type hvparams: dict
6884 @param hvparams: the parameters which we need to check
6885 @raise errors.OpPrereqError: if the parameters are not valid
6887 """
6888 nodenames = _FilterVmNodes(lu, nodenames)
6889 hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
6890 hvname,
6891 hvparams)
6892 for node in nodenames:
6893 info = hvinfo[node]
6894 if info.offline:
6895 continue
6896 info.Raise("Hypervisor parameter validation failed on node %s" % node)
6899 def _CheckOSParams(lu, required, nodenames, osname, osparams):
6900 """OS parameters validation.
6902 @type lu: L{LogicalUnit}
6903 @param lu: the logical unit for which we check
6904 @type required: boolean
6905 @param required: whether the validation should fail if the OS is not
6906 found
6907 @type nodenames: list
6908 @param nodenames: the list of nodes on which we should check
6909 @type osname: string
6910 @param osname: the name of the OS we should use
6911 @type osparams: dict
6912 @param osparams: the parameters which we need to check
6913 @raise errors.OpPrereqError: if the parameters are not valid
6915 """
6916 nodenames = _FilterVmNodes(lu, nodenames)
6917 result = lu.rpc.call_os_validate(required, nodenames, osname,
6918 [constants.OS_VALIDATE_PARAMETERS],
6919 osparams)
6920 for node, nres in result.items():
6921 # we don't check for offline cases since this should be run only
6922 # against the master node and/or an instance's nodes
6923 nres.Raise("OS Parameters validation failed on node %s" % node)
6924 if not nres.payload:
6925 lu.LogInfo("OS %s not found on node %s, validation skipped",
6926 osname, node)
6929 class LUInstanceCreate(LogicalUnit):
6930 """Create an instance.
6933 HPATH = "instance-add"
6934 HTYPE = constants.HTYPE_INSTANCE
6935 REQ_BGL = False
6937 def CheckArguments(self):
6938 """Check arguments.
6940 """
6941 # do not require name_check to ease forward/backward compatibility
6943 if self.op.no_install and self.op.start:
6944 self.LogInfo("No-installation mode selected, disabling startup")
6945 self.op.start = False
6946 # validate/normalize the instance name
6947 self.op.instance_name = \
6948 netutils.Hostname.GetNormalizedName(self.op.instance_name)
6950 if self.op.ip_check and not self.op.name_check:
6951 # TODO: make the ip check more flexible and not depend on the name check
6952 raise errors.OpPrereqError("Cannot do ip check without a name check",
6953 errors.ECODE_INVAL)
6955 # check nics' parameter names
6956 for nic in self.op.nics:
6957 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
6959 # check disks. parameter names and consistent adopt/no-adopt strategy
6960 has_adopt = has_no_adopt = False
6961 for disk in self.op.disks:
6962 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
6963 if "adopt" in disk:
6964 has_adopt = True
6965 else:
6966 has_no_adopt = True
6967 if has_adopt and has_no_adopt:
6968 raise errors.OpPrereqError("Either all disks are adopted or none is",
6969 errors.ECODE_INVAL)
6970 if has_adopt:
6971 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
6972 raise errors.OpPrereqError("Disk adoption is not supported for the"
6973 " '%s' disk template" %
6974 self.op.disk_template,
6975 errors.ECODE_INVAL)
6976 if self.op.iallocator is not None:
6977 raise errors.OpPrereqError("Disk adoption not allowed with an"
6978 " iallocator script", errors.ECODE_INVAL)
6979 if self.op.mode == constants.INSTANCE_IMPORT:
6980 raise errors.OpPrereqError("Disk adoption not allowed for"
6981 " instance import", errors.ECODE_INVAL)
6983 self.adopt_disks = has_adopt
6985 # instance name verification
6986 if self.op.name_check:
6987 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
6988 self.op.instance_name = self.hostname1.name
6989 # used in CheckPrereq for ip ping check
6990 self.check_ip = self.hostname1.ip
6992 self.check_ip = None
6994 # file storage checks
6995 if (self.op.file_driver and
6996 not self.op.file_driver in constants.FILE_DRIVER):
6997 raise errors.OpPrereqError("Invalid file driver name '%s'" %
6998 self.op.file_driver, errors.ECODE_INVAL)
7000 if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
7001 raise errors.OpPrereqError("File storage directory path not absolute",
7002 errors.ECODE_INVAL)
7004 ### Node/iallocator related checks
7005 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
7007 if self.op.pnode is not None:
7008 if self.op.disk_template in constants.DTS_NET_MIRROR:
7009 if self.op.snode is None:
7010 raise errors.OpPrereqError("The networked disk templates need"
7011 " a mirror node", errors.ECODE_INVAL)
7012 elif self.op.snode:
7013 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
7014 " template")
7015 self.op.snode = None
7017 self._cds = _GetClusterDomainSecret()
7019 if self.op.mode == constants.INSTANCE_IMPORT:
7020 # On import force_variant must be True, because if we forced it at
7021 # initial install, our only chance when importing it back is that it
7022 # works again!
7023 self.op.force_variant = True
7025 if self.op.no_install:
7026 self.LogInfo("No-installation mode has no effect during import")
7028 elif self.op.mode == constants.INSTANCE_CREATE:
7029 if self.op.os_type is None:
7030 raise errors.OpPrereqError("No guest OS specified",
7031 errors.ECODE_INVAL)
7032 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
7033 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
7034 " installation" % self.op.os_type,
7036 if self.op.disk_template is None:
7037 raise errors.OpPrereqError("No disk template specified",
7038 errors.ECODE_INVAL)
7040 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7041 # Check handshake to ensure both clusters have the same domain secret
7042 src_handshake = self.op.source_handshake
7043 if not src_handshake:
7044 raise errors.OpPrereqError("Missing source handshake",
7045 errors.ECODE_INVAL)
7047 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
7048 src_handshake)
7049 if errmsg:
7050 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
7051 errors.ECODE_INVAL)
7053 # Load and check source CA
7054 self.source_x509_ca_pem = self.op.source_x509_ca
7055 if not self.source_x509_ca_pem:
7056 raise errors.OpPrereqError("Missing source X509 CA",
7057 errors.ECODE_INVAL)
7059 try:
7060 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
7061 self._cds)
7062 except OpenSSL.crypto.Error, err:
7063 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
7064 (err, ), errors.ECODE_INVAL)
7066 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
7067 if errcode is not None:
7068 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
7069 errors.ECODE_INVAL)
7071 self.source_x509_ca = cert
7073 src_instance_name = self.op.source_instance_name
7074 if not src_instance_name:
7075 raise errors.OpPrereqError("Missing source instance name",
7078 self.source_instance_name = \
7079 netutils.GetHostname(name=src_instance_name).name
7081 else:
7082 raise errors.OpPrereqError("Invalid instance creation mode %r" %
7083 self.op.mode, errors.ECODE_INVAL)
7085 def ExpandNames(self):
7086 """ExpandNames for CreateInstance.
7088 Figure out the right locks for instance creation.
7090 """
7091 self.needed_locks = {}
7093 instance_name = self.op.instance_name
7094 # this is just a preventive check, but someone might still add this
7095 # instance in the meantime, and creation will fail at lock-add time
7096 if instance_name in self.cfg.GetInstanceList():
7097 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
7098 instance_name, errors.ECODE_EXISTS)
7100 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
7102 if self.op.iallocator:
7103 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7105 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
7106 nodelist = [self.op.pnode]
7107 if self.op.snode is not None:
7108 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
7109 nodelist.append(self.op.snode)
7110 self.needed_locks[locking.LEVEL_NODE] = nodelist
7112 # in case of import lock the source node too
7113 if self.op.mode == constants.INSTANCE_IMPORT:
7114 src_node = self.op.src_node
7115 src_path = self.op.src_path
7117 if src_path is None:
7118 self.op.src_path = src_path = self.op.instance_name
7120 if src_node is None:
7121 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7122 self.op.src_node = None
7123 if os.path.isabs(src_path):
7124 raise errors.OpPrereqError("Importing an instance from an absolute"
7125 " path requires a source node option.",
7128 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
7129 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
7130 self.needed_locks[locking.LEVEL_NODE].append(src_node)
7131 if not os.path.isabs(src_path):
7132 self.op.src_path = src_path = \
7133 utils.PathJoin(constants.EXPORT_DIR, src_path)
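# Example (the exact EXPORT_DIR value is configuration-dependent and assumed
# here only for illustration): a relative src_path of "web1-export" would be
# resolved to something like utils.PathJoin("/srv/ganeti/export",
# "web1-export") == "/srv/ganeti/export/web1-export".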
7135 def _RunAllocator(self):
7136 """Run the allocator based on input opcode.
7139 nics = [n.ToDict() for n in self.nics]
7140 ial = IAllocator(self.cfg, self.rpc,
7141 mode=constants.IALLOCATOR_MODE_ALLOC,
7142 name=self.op.instance_name,
7143 disk_template=self.op.disk_template,
7146 vcpus=self.be_full[constants.BE_VCPUS],
7147 mem_size=self.be_full[constants.BE_MEMORY],
7150 hypervisor=self.op.hypervisor,
7153 ial.Run(self.op.iallocator)
7155 if not ial.success:
7156 raise errors.OpPrereqError("Can't compute nodes using"
7157 " iallocator '%s': %s" %
7158 (self.op.iallocator, ial.info),
7159 errors.ECODE_NORES)
7160 if len(ial.result) != ial.required_nodes:
7161 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7162 " of nodes (%s), required %s" %
7163 (self.op.iallocator, len(ial.result),
7164 ial.required_nodes), errors.ECODE_FAULT)
7165 self.op.pnode = ial.result[0]
7166 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7167 self.op.instance_name, self.op.iallocator,
7168 utils.CommaJoin(ial.result))
7169 if ial.required_nodes == 2:
7170 self.op.snode = ial.result[1]
7172 def BuildHooksEnv(self):
7173 """Build hooks env.
7175 This runs on master, primary and secondary nodes of the instance.
7177 """
7178 env = {
7179 "ADD_MODE": self.op.mode,
7181 if self.op.mode == constants.INSTANCE_IMPORT:
7182 env["SRC_NODE"] = self.op.src_node
7183 env["SRC_PATH"] = self.op.src_path
7184 env["SRC_IMAGES"] = self.src_images
7186 env.update(_BuildInstanceHookEnv(
7187 name=self.op.instance_name,
7188 primary_node=self.op.pnode,
7189 secondary_nodes=self.secondaries,
7190 status=self.op.start,
7191 os_type=self.op.os_type,
7192 memory=self.be_full[constants.BE_MEMORY],
7193 vcpus=self.be_full[constants.BE_VCPUS],
7194 nics=_NICListToTuple(self, self.nics),
7195 disk_template=self.op.disk_template,
7196 disks=[(d["size"], d["mode"]) for d in self.disks],
7197 bep=self.be_full,
7198 hvp=self.hv_full,
7199 hypervisor_name=self.op.hypervisor,
7200 ))
7202 nl = ([self.cfg.GetMasterNode(), self.op.pnode] +
7203 self.secondaries)
7205 return env, nl, nl
7206 def _ReadExportInfo(self):
7207 """Reads the export information from disk.
7209 It will override the opcode source node and path with the actual
7210 information, if these two were not specified before.
7212 @return: the export information
7214 """
7215 assert self.op.mode == constants.INSTANCE_IMPORT
7217 src_node = self.op.src_node
7218 src_path = self.op.src_path
7220 if src_node is None:
7221 locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
7222 exp_list = self.rpc.call_export_list(locked_nodes)
7223 found = False
7224 for node in exp_list:
7225 if exp_list[node].fail_msg:
7226 continue
7227 if src_path in exp_list[node].payload:
7228 found = True
7229 self.op.src_node = src_node = node
7230 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
7231 src_path)
7232 break
7233 if not found:
7234 raise errors.OpPrereqError("No export found for relative path %s" %
7235 src_path, errors.ECODE_INVAL)
7237 _CheckNodeOnline(self, src_node)
7238 result = self.rpc.call_export_info(src_node, src_path)
7239 result.Raise("No export or invalid export found in dir %s" % src_path)
7241 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
7242 if not export_info.has_section(constants.INISECT_EXP):
7243 raise errors.ProgrammerError("Corrupted export config",
7244 errors.ECODE_ENVIRON)
7246 ei_version = export_info.get(constants.INISECT_EXP, "version")
7247 if (int(ei_version) != constants.EXPORT_VERSION):
7248 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
7249 (ei_version, constants.EXPORT_VERSION),
7250 errors.ECODE_ENVIRON)
7252 return export_info
7253 def _ReadExportParams(self, einfo):
7254 """Use export parameters as defaults.
7256 In case the opcode doesn't specify (as in override) some instance
7257 parameters, then try to use them from the export information, if
7258 they have the appropriate value.
7259 """
7260 if self.op.os_type is None:
7261 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
7263 if self.op.disk_template is None:
7264 if einfo.has_option(constants.INISECT_INS, "disk_template"):
7265 self.op.disk_template = einfo.get(constants.INISECT_INS,
7266 "disk_template")
7267 else:
7268 raise errors.OpPrereqError("No disk template specified and the export"
7269 " is missing the disk_template information",
7272 if not self.op.disks:
7273 if einfo.has_option(constants.INISECT_INS, "disk_count"):
7274 disks = []
7275 # TODO: import the disk iv_name too
7276 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
7277 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
7278 disks.append({"size": disk_sz})
7279 self.op.disks = disks
7280 else:
7281 raise errors.OpPrereqError("No disk info specified and the export"
7282 " is missing the disk information",
7285 if (not self.op.nics and
7286 einfo.has_option(constants.INISECT_INS, "nic_count")):
7287 nics = []
7288 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
7289 ndict = {}
7290 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
7291 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
7292 ndict[name] = v
7293 nics.append(ndict)
7294 self.op.nics = nics
7296 if (self.op.hypervisor is None and
7297 einfo.has_option(constants.INISECT_INS, "hypervisor")):
7298 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
7299 if einfo.has_section(constants.INISECT_HYP):
7300 # use the export parameters but do not override the ones
7301 # specified by the user
7302 for name, value in einfo.items(constants.INISECT_HYP):
7303 if name not in self.op.hvparams:
7304 self.op.hvparams[name] = value
7306 if einfo.has_section(constants.INISECT_BEP):
7307 # use the parameters, without overriding
7308 for name, value in einfo.items(constants.INISECT_BEP):
7309 if name not in self.op.beparams:
7310 self.op.beparams[name] = value
7311 else:
7312 # try to read the parameters old style, from the main section
7313 for name in constants.BES_PARAMETERS:
7314 if (name not in self.op.beparams and
7315 einfo.has_option(constants.INISECT_INS, name)):
7316 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
7318 if einfo.has_section(constants.INISECT_OSP):
7319 # use the parameters, without overriding
7320 for name, value in einfo.items(constants.INISECT_OSP):
7321 if name not in self.op.osparams:
7322 self.op.osparams[name] = value
7324 def _RevertToDefaults(self, cluster):
7325 """Revert the instance parameters to the default values.
7329 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
7330 for name in self.op.hvparams.keys():
7331 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
7332 del self.op.hvparams[name]
7334 be_defs = cluster.SimpleFillBE({})
7335 for name in self.op.beparams.keys():
7336 if name in be_defs and be_defs[name] == self.op.beparams[name]:
7337 del self.op.beparams[name]
7339 nic_defs = cluster.SimpleFillNIC({})
7340 for nic in self.op.nics:
7341 for name in constants.NICS_PARAMETERS:
7342 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
7343 del nic[name]
7345 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
7346 for name in self.op.osparams.keys():
7347 if name in os_defs and os_defs[name] == self.op.osparams[name]:
7348 del self.op.osparams[name]
7350 def CheckPrereq(self):
7351 """Check prerequisites.
7354 if self.op.mode == constants.INSTANCE_IMPORT:
7355 export_info = self._ReadExportInfo()
7356 self._ReadExportParams(export_info)
7358 if (not self.cfg.GetVGName() and
7359 self.op.disk_template not in constants.DTS_NOT_LVM):
7360 raise errors.OpPrereqError("Cluster does not support lvm-based"
7361 " instances", errors.ECODE_STATE)
7363 if self.op.hypervisor is None:
7364 self.op.hypervisor = self.cfg.GetHypervisorType()
7366 cluster = self.cfg.GetClusterInfo()
7367 enabled_hvs = cluster.enabled_hypervisors
7368 if self.op.hypervisor not in enabled_hvs:
7369 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
7370 " cluster (%s)" % (self.op.hypervisor,
7371 ",".join(enabled_hvs)),
7374 # check hypervisor parameter syntax (locally)
7375 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
7376 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
7377 self.op.hvparams)
7378 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
7379 hv_type.CheckParameterSyntax(filled_hvp)
7380 self.hv_full = filled_hvp
7381 # check that we don't specify global parameters on an instance
7382 _CheckGlobalHvParams(self.op.hvparams)
7384 # fill and remember the beparams dict
7385 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
7386 self.be_full = cluster.SimpleFillBE(self.op.beparams)
7388 # build os parameters
7389 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
7391 # now that hvp/bep are in final format, let's reset to defaults,
7392 # if told to do so
7393 if self.op.identify_defaults:
7394 self._RevertToDefaults(cluster)
7396 # NIC buildup
7397 self.nics = []
7398 for idx, nic in enumerate(self.op.nics):
7399 nic_mode_req = nic.get("mode", None)
7400 nic_mode = nic_mode_req
7401 if nic_mode is None:
7402 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
7404 # in routed mode, for the first nic, the default ip is 'auto'
7405 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
7406 default_ip_mode = constants.VALUE_AUTO
7407 else:
7408 default_ip_mode = constants.VALUE_NONE
7410 # ip validity checks
7411 ip = nic.get("ip", default_ip_mode)
7412 if ip is None or ip.lower() == constants.VALUE_NONE:
7413 nic_ip = None
7414 elif ip.lower() == constants.VALUE_AUTO:
7415 if not self.op.name_check:
7416 raise errors.OpPrereqError("IP address set to auto but name checks"
7417 " have been skipped",
7419 nic_ip = self.hostname1.ip
7420 else:
7421 if not netutils.IPAddress.IsValid(ip):
7422 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
7423 errors.ECODE_INVAL)
7424 nic_ip = ip
7426 # TODO: check the ip address for uniqueness
7427 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
7428 raise errors.OpPrereqError("Routed nic mode requires an ip address",
7429 errors.ECODE_INVAL)
7431 # MAC address verification
7432 mac = nic.get("mac", constants.VALUE_AUTO)
7433 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7434 mac = utils.NormalizeAndValidateMac(mac)
7436 try:
7437 self.cfg.ReserveMAC(mac, self.proc.GetECId())
7438 except errors.ReservationError:
7439 raise errors.OpPrereqError("MAC address %s already in use"
7440 " in cluster" % mac,
7441 errors.ECODE_NOTUNIQUE)
7443 # bridge verification
7444 bridge = nic.get("bridge", None)
7445 link = nic.get("link", None)
7446 if bridge and link:
7447 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
7448 " at the same time", errors.ECODE_INVAL)
7449 elif bridge and nic_mode == constants.NIC_MODE_ROUTED:
7450 raise errors.OpPrereqError("Cannot pass 'bridge' on a routed nic",
7451 errors.ECODE_INVAL)
7452 elif bridge:
7453 link = bridge
7455 nicparams = {}
7456 if nic_mode_req:
7457 nicparams[constants.NIC_MODE] = nic_mode_req
7458 if link:
7459 nicparams[constants.NIC_LINK] = link
7461 check_params = cluster.SimpleFillNIC(nicparams)
7462 objects.NIC.CheckParameterSyntax(check_params)
7463 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
7465 # disk checks/pre-build
7466 self.disks = []
7467 for disk in self.op.disks:
7468 mode = disk.get("mode", constants.DISK_RDWR)
7469 if mode not in constants.DISK_ACCESS_SET:
7470 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
7471 mode, errors.ECODE_INVAL)
7472 size = disk.get("size", None)
7473 if size is None:
7474 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
7475 try:
7476 size = int(size)
7477 except (TypeError, ValueError):
7478 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
7479 errors.ECODE_INVAL)
7480 vg = disk.get("vg", self.cfg.GetVGName())
7481 new_disk = {"size": size, "mode": mode, "vg": vg}
7483 new_disk["adopt"] = disk["adopt"]
7484 self.disks.append(new_disk)
7486 if self.op.mode == constants.INSTANCE_IMPORT:
7488 # Check that the new instance doesn't have less disks than the export
7489 instance_disks = len(self.disks)
7490 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
7491 if instance_disks < export_disks:
7492 raise errors.OpPrereqError("Not enough disks to import."
7493 " (instance: %d, export: %d)" %
7494 (instance_disks, export_disks),
7495 errors.ECODE_INVAL)
7497 disk_images = []
7498 for idx in range(export_disks):
7499 option = 'disk%d_dump' % idx
7500 if export_info.has_option(constants.INISECT_INS, option):
7501 # FIXME: are the old os-es, disk sizes, etc. useful?
7502 export_name = export_info.get(constants.INISECT_INS, option)
7503 image = utils.PathJoin(self.op.src_path, export_name)
7504 disk_images.append(image)
7505 else:
7506 disk_images.append(False)
7508 self.src_images = disk_images
7510 old_name = export_info.get(constants.INISECT_INS, 'name')
7511 try:
7512 exp_nic_count = export_info.getint(constants.INISECT_INS, 'nic_count')
7513 except (TypeError, ValueError), err:
7514 raise errors.OpPrereqError("Invalid export file, nic_count is not"
7515 " an integer: %s" % str(err),
7517 if self.op.instance_name == old_name:
7518 for idx, nic in enumerate(self.nics):
7519 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
7520 nic_mac_ini = 'nic%d_mac' % idx
7521 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
7523 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
7525 # ip ping checks (we use the same ip that was resolved in ExpandNames)
7526 if self.op.ip_check:
7527 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
7528 raise errors.OpPrereqError("IP %s of instance %s already in use" %
7529 (self.check_ip, self.op.instance_name),
7530 errors.ECODE_NOTUNIQUE)
7532 #### mac address generation
7533 # By generating the MAC address here, both the allocator and the hooks get
7534 # the real final mac address rather than the 'auto' or 'generate' value.
7535 # There is a race condition between the generation and the instance object
7536 # creation, which means that we know the mac is valid now, but we're not
7537 # sure it will be when we actually add the instance. If things go bad
7538 # adding the instance will abort because of a duplicate mac, and the
7539 # creation job will fail.
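# In other words, GenerateMAC only hands out a MAC that is free *now* and
# reserves it for this execution context (ec_id); the authoritative
# uniqueness check is the one performed when AddInstance commits the
# instance object to the configuration.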
7540 for nic in self.nics:
7541 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
7542 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
7544 #### allocator run
7546 if self.op.iallocator is not None:
7547 self._RunAllocator()
7549 #### node related checks
7551 # check primary node
7552 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
7553 assert self.pnode is not None, \
7554 "Cannot retrieve locked node %s" % self.op.pnode
7556 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
7557 pnode.name, errors.ECODE_STATE)
7558 if pnode.drained:
7559 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
7560 pnode.name, errors.ECODE_STATE)
7561 if not pnode.vm_capable:
7562 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
7563 " '%s'" % pnode.name, errors.ECODE_STATE)
7565 self.secondaries = []
7567 # mirror node verification
7568 if self.op.disk_template in constants.DTS_NET_MIRROR:
7569 if self.op.snode == pnode.name:
7570 raise errors.OpPrereqError("The secondary node cannot be the"
7571 " primary node.", errors.ECODE_INVAL)
7572 _CheckNodeOnline(self, self.op.snode)
7573 _CheckNodeNotDrained(self, self.op.snode)
7574 _CheckNodeVmCapable(self, self.op.snode)
7575 self.secondaries.append(self.op.snode)
7577 nodenames = [pnode.name] + self.secondaries
7579 if not self.adopt_disks:
7580 # Check lv size requirements, if not adopting
7581 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
7582 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
7584 else: # instead, we must check the adoption data
7585 all_lvs = set([i["vg"] + "/" + i["adopt"] for i in self.disks])
7586 if len(all_lvs) != len(self.disks):
7587 raise errors.OpPrereqError("Duplicate volume names given for adoption",
7588 errors.ECODE_INVAL)
7589 for lv_name in all_lvs:
7590 try:
7591 # FIXME: lv_name here is "vg/lv" need to ensure that other calls
7592 # to ReserveLV uses the same syntax
7593 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
7594 except errors.ReservationError:
7595 raise errors.OpPrereqError("LV named %s used by another instance" %
7596 lv_name, errors.ECODE_NOTUNIQUE)
7598 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
7599 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
7601 node_lvs = self.rpc.call_lv_list([pnode.name],
7602 vg_names.payload.keys())[pnode.name]
7603 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
7604 node_lvs = node_lvs.payload
7606 delta = all_lvs.difference(node_lvs.keys())
7607 if delta:
7608 raise errors.OpPrereqError("Missing logical volume(s): %s" %
7609 utils.CommaJoin(delta),
7610 errors.ECODE_INVAL)
7611 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
7612 if online_lvs:
7613 raise errors.OpPrereqError("Online logical volumes found, cannot"
7614 " adopt: %s" % utils.CommaJoin(online_lvs),
7616 # update the size of disk based on what is found
7617 for dsk in self.disks:
7618 dsk["size"] = int(float(node_lvs[dsk["vg"] + "/" + dsk["adopt"]][0]))
7620 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
7622 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
7623 # check OS parameters (remotely)
7624 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
7626 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
7628 # memory check on primary node
7629 if self.op.start:
7630 _CheckNodeFreeMemory(self, self.pnode.name,
7631 "creating instance %s" % self.op.instance_name,
7632 self.be_full[constants.BE_MEMORY],
7633 self.op.hypervisor)
7635 self.dry_run_result = list(nodenames)
7637 def Exec(self, feedback_fn):
7638 """Create and add the instance to the cluster.
7641 instance = self.op.instance_name
7642 pnode_name = self.pnode.name
7644 ht_kind = self.op.hypervisor
7645 if ht_kind in constants.HTS_REQ_PORT:
7646 network_port = self.cfg.AllocatePort()
7647 else:
7648 network_port = None
7650 if constants.ENABLE_FILE_STORAGE:
7651 # this is needed because os.path.join does not accept None arguments
7652 if self.op.file_storage_dir is None:
7653 string_file_storage_dir = ""
7654 else:
7655 string_file_storage_dir = self.op.file_storage_dir
7657 # build the full file storage dir path
7658 file_storage_dir = utils.PathJoin(self.cfg.GetFileStorageDir(),
7659 string_file_storage_dir, instance)
7660 else:
7661 file_storage_dir = ""
7663 disks = _GenerateDiskTemplate(self,
7664 self.op.disk_template,
7665 instance, pnode_name,
7666 self.secondaries,
7667 self.disks,
7668 file_storage_dir,
7669 self.op.file_driver,
7670 0,
7671 feedback_fn)
7673 iobj = objects.Instance(name=instance, os=self.op.os_type,
7674 primary_node=pnode_name,
7675 nics=self.nics, disks=disks,
7676 disk_template=self.op.disk_template,
7677 admin_up=False,
7678 network_port=network_port,
7679 beparams=self.op.beparams,
7680 hvparams=self.op.hvparams,
7681 hypervisor=self.op.hypervisor,
7682 osparams=self.op.osparams,
7683 )
7685 if self.adopt_disks:
7686 # rename LVs to the newly-generated names; we need to construct
7687 # 'fake' LV disks with the old data, plus the new unique_id
7688 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
7689 rename_to = []
7690 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
7691 rename_to.append(t_dsk.logical_id)
7692 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk["adopt"])
7693 self.cfg.SetDiskID(t_dsk, pnode_name)
7694 result = self.rpc.call_blockdev_rename(pnode_name,
7695 zip(tmp_disks, rename_to))
7696 result.Raise("Failed to rename adopted LVs")
7698 feedback_fn("* creating instance disks...")
7699 try:
7700 _CreateDisks(self, iobj)
7701 except errors.OpExecError:
7702 self.LogWarning("Device creation failed, reverting...")
7703 try:
7704 _RemoveDisks(self, iobj)
7705 finally:
7706 self.cfg.ReleaseDRBDMinors(instance)
7707 raise
7709 feedback_fn("adding instance %s to cluster config" % instance)
7711 self.cfg.AddInstance(iobj, self.proc.GetECId())
7713 # Declare that we don't want to remove the instance lock anymore, as we've
7714 # added the instance to the config
7715 del self.remove_locks[locking.LEVEL_INSTANCE]
7716 # Unlock all the nodes
7717 if self.op.mode == constants.INSTANCE_IMPORT:
7718 nodes_keep = [self.op.src_node]
7719 nodes_release = [node for node in self.acquired_locks[locking.LEVEL_NODE]
7720 if node != self.op.src_node]
7721 self.context.glm.release(locking.LEVEL_NODE, nodes_release)
7722 self.acquired_locks[locking.LEVEL_NODE] = nodes_keep
7723 else:
7724 self.context.glm.release(locking.LEVEL_NODE)
7725 del self.acquired_locks[locking.LEVEL_NODE]
7727 disk_abort = False
7728 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
7729 feedback_fn("* wiping instance disks...")
7730 try:
7731 _WipeDisks(self, iobj)
7732 except errors.OpExecError, err:
7733 logging.exception("Wiping disks failed")
7734 self.LogWarning("Wiping instance disks failed (%s)", err)
7735 disk_abort = True
7737 if disk_abort:
7738 # Something is already wrong with the disks, don't do anything else
7739 pass
7740 elif self.op.wait_for_sync:
7741 disk_abort = not _WaitForSync(self, iobj)
7742 elif iobj.disk_template in constants.DTS_NET_MIRROR:
7743 # make sure the disks are not degraded (still sync-ing is ok)
7744 time.sleep(15)
7745 feedback_fn("* checking mirrors status")
7746 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
7747 else:
7748 disk_abort = False
7750 if disk_abort:
7751 _RemoveDisks(self, iobj)
7752 self.cfg.RemoveInstance(iobj.name)
7753 # Make sure the instance lock gets removed
7754 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
7755 raise errors.OpExecError("There are some degraded disks for"
7756 " this instance")
7758 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
7759 if self.op.mode == constants.INSTANCE_CREATE:
7760 if not self.op.no_install:
7761 feedback_fn("* running the instance OS create scripts...")
7762 # FIXME: pass debug option from opcode to backend
7763 result = self.rpc.call_instance_os_add(pnode_name, iobj, False,
7764 self.op.debug_level)
7765 result.Raise("Could not add os for instance %s"
7766 " on node %s" % (instance, pnode_name))
7768 elif self.op.mode == constants.INSTANCE_IMPORT:
7769 feedback_fn("* running the instance OS import scripts...")
7771 transfers = []
7773 for idx, image in enumerate(self.src_images):
7774 if not image:
7775 continue
7777 # FIXME: pass debug option from opcode to backend
7778 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
7779 constants.IEIO_FILE, (image, ),
7780 constants.IEIO_SCRIPT,
7781 (iobj.disks[idx], idx),
7782 None)
7783 transfers.append(dt)
7785 import_result = \
7786 masterd.instance.TransferInstanceData(self, feedback_fn,
7787 self.op.src_node, pnode_name,
7788 self.pnode.secondary_ip,
7789 iobj, transfers)
7790 if not compat.all(import_result):
7791 self.LogWarning("Some disks for instance %s on node %s were not"
7792 " imported successfully" % (instance, pnode_name))
7794 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7795 feedback_fn("* preparing remote import...")
7796 # The source cluster will stop the instance before attempting to make a
7797 # connection. In some cases stopping an instance can take a long time,
7798 # hence the shutdown timeout is added to the connection timeout.
7799 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
7800 self.op.source_shutdown_timeout)
7801 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
7803 assert iobj.primary_node == self.pnode.name
7804 disk_results = \
7805 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
7806 self.source_x509_ca,
7807 self._cds, timeouts)
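# Roughly, RemoteImport sets up one import channel per disk on our side;
# the connection from the source cluster is authenticated with the
# source's X509 CA (self.source_x509_ca) and the shared cluster domain
# secret (self._cds), both prepared during the prereq phase.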
7808 if not compat.all(disk_results):
7809 # TODO: Should the instance still be started, even if some disks
7810 # failed to import (valid for local imports, too)?
7811 self.LogWarning("Some disks for instance %s on node %s were not"
7812 " imported successfully" % (instance, pnode_name))
7814 # Run rename script on newly imported instance
7815 assert iobj.name == instance
7816 feedback_fn("Running rename script for %s" % instance)
7817 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
7818 self.source_instance_name,
7819 self.op.debug_level)
7820 if result.fail_msg:
7821 self.LogWarning("Failed to run rename script for %s on node"
7822 " %s: %s" % (instance, pnode_name, result.fail_msg))
7825 # also checked in the prereq part
7826 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
7827 % self.op.mode)
7829 if self.op.start:
7830 iobj.admin_up = True
7831 self.cfg.Update(iobj, feedback_fn)
7832 logging.info("Starting instance %s on node %s", instance, pnode_name)
7833 feedback_fn("* starting instance...")
7834 result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
7835 result.Raise("Could not start instance")
7837 return list(iobj.all_nodes)
7840 class LUInstanceConsole(NoHooksLU):
7841 """Connect to an instance's console.
7843 This is somewhat special in that it returns the command line that
7844 you need to run on the master node in order to connect to the
7845 console.
7847 """
7848 REQ_BGL = False
7850 def ExpandNames(self):
7851 self._ExpandAndLockInstance()
7853 def CheckPrereq(self):
7854 """Check prerequisites.
7856 This checks that the instance is in the cluster.
7858 """
7859 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
7860 assert self.instance is not None, \
7861 "Cannot retrieve locked instance %s" % self.op.instance_name
7862 _CheckNodeOnline(self, self.instance.primary_node)
7864 def Exec(self, feedback_fn):
7865 """Connect to the console of an instance
7868 instance = self.instance
7869 node = instance.primary_node
7871 node_insts = self.rpc.call_instance_list([node],
7872 [instance.hypervisor])[node]
7873 node_insts.Raise("Can't get node information from %s" % node)
7875 if instance.name not in node_insts.payload:
7876 if instance.admin_up:
7877 state = "ERROR_down"
7878 else:
7879 state = "ADMIN_down"
7880 raise errors.OpExecError("Instance %s is not running (state %s)" %
7881 (instance.name, state))
7883 logging.debug("Connecting to console of %s on %s", instance.name, node)
7885 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
7888 def _GetInstanceConsole(cluster, instance):
7889 """Returns console information for an instance.
7891 @type cluster: L{objects.Cluster}
7892 @type instance: L{objects.Instance}
7893 @rtype: dict
7895 """
7896 hyper = hypervisor.GetHypervisor(instance.hypervisor)
7897 # beparams and hvparams are passed separately, to avoid editing the
7898 # instance and then saving the defaults in the instance itself.
7899 hvparams = cluster.FillHV(instance)
7900 beparams = cluster.FillBE(instance)
7901 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
7903 assert console.instance == instance.name
7904 assert console.Validate()
7906 return console.ToDict()
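# Rough usage sketch (field names are illustrative and depend on the
# hypervisor's console kind):
#
#   con = _GetInstanceConsole(cfg.GetClusterInfo(), inst)
#   # e.g. con["instance"], con["kind"], plus kind-specific data such as
#   # the command to run or the host/port to connect to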
7909 class LUInstanceReplaceDisks(LogicalUnit):
7910 """Replace the disks of an instance.
7913 HPATH = "mirrors-replace"
7914 HTYPE = constants.HTYPE_INSTANCE
7915 REQ_BGL = False
7917 def CheckArguments(self):
7918 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
7919 self.op.iallocator)
7921 def ExpandNames(self):
7922 self._ExpandAndLockInstance()
7924 if self.op.iallocator is not None:
7925 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7927 elif self.op.remote_node is not None:
7928 remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
7929 self.op.remote_node = remote_node
7931 # Warning: do not remove the locking of the new secondary here
7932 # unless DRBD8.AddChildren is changed to work in parallel;
7933 # currently it doesn't since parallel invocations of
7934 # FindUnusedMinor will conflict
7935 self.needed_locks[locking.LEVEL_NODE] = [remote_node]
7936 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
7938 else:
7939 self.needed_locks[locking.LEVEL_NODE] = []
7940 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
7942 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
7943 self.op.iallocator, self.op.remote_node,
7944 self.op.disks, False, self.op.early_release)
7946 self.tasklets = [self.replacer]
7948 def DeclareLocks(self, level):
7949 # If we're not already locking all nodes in the set we have to declare the
7950 # instance's primary/secondary nodes.
7951 if (level == locking.LEVEL_NODE and
7952 self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET):
7953 self._LockInstancesNodes()
7955 def BuildHooksEnv(self):
7956 """Build hooks env.
7958 This runs on the master, the primary and all the secondaries.
7960 """
7961 instance = self.replacer.instance
7962 env = {
7963 "MODE": self.op.mode,
7964 "NEW_SECONDARY": self.op.remote_node,
7965 "OLD_SECONDARY": instance.secondary_nodes[0],
7967 env.update(_BuildInstanceHookEnvByObject(self, instance))
7968 nl = [
7969 self.cfg.GetMasterNode(),
7970 instance.primary_node,
7971 ]
7972 if self.op.remote_node is not None:
7973 nl.append(self.op.remote_node)
7974 return env, nl, nl
7977 class TLReplaceDisks(Tasklet):
7978 """Replaces disks for an instance.
7980 Note: Locking is not within the scope of this class.
7982 """
7983 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
7984 disks, delay_iallocator, early_release):
7985 """Initializes this class.
7988 Tasklet.__init__(self, lu)
7990 # Parameters
7991 self.instance_name = instance_name
7992 self.mode = mode
7993 self.iallocator_name = iallocator_name
7994 self.remote_node = remote_node
7995 self.disks = disks
7996 self.delay_iallocator = delay_iallocator
7997 self.early_release = early_release
7999 # Runtime data
8000 self.instance = None
8001 self.new_node = None
8002 self.target_node = None
8003 self.other_node = None
8004 self.remote_node_info = None
8005 self.node_secondary_ip = None
8007 @staticmethod
8008 def CheckArguments(mode, remote_node, iallocator):
8009 """Helper function for users of this class.
8012 # check for valid parameter combination
8013 if mode == constants.REPLACE_DISK_CHG:
8014 if remote_node is None and iallocator is None:
8015 raise errors.OpPrereqError("When changing the secondary either an"
8016 " iallocator script must be used or the"
8017 " new node given", errors.ECODE_INVAL)
8019 if remote_node is not None and iallocator is not None:
8020 raise errors.OpPrereqError("Give either the iallocator or the new"
8021 " secondary, not both", errors.ECODE_INVAL)
8023 elif remote_node is not None or iallocator is not None:
8024 # Not replacing the secondary
8025 raise errors.OpPrereqError("The iallocator and new node options can"
8026 " only be used when changing the"
8027 " secondary node", errors.ECODE_INVAL)
8030 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
8031 """Compute a new secondary node using an IAllocator.
8034 ial = IAllocator(lu.cfg, lu.rpc,
8035 mode=constants.IALLOCATOR_MODE_RELOC,
8036 name=instance_name,
8037 relocate_from=relocate_from)
8039 ial.Run(iallocator_name)
8041 if not ial.success:
8042 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
8043 " %s" % (iallocator_name, ial.info),
8046 if len(ial.result) != ial.required_nodes:
8047 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8048 " of nodes (%s), required %s" %
8050 len(ial.result), ial.required_nodes),
8051 errors.ECODE_FAULT)
8053 remote_node_name = ial.result[0]
8055 lu.LogInfo("Selected new secondary for instance '%s': %s",
8056 instance_name, remote_node_name)
8058 return remote_node_name
8060 def _FindFaultyDisks(self, node_name):
8061 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
8062 node_name, True)
8064 def CheckPrereq(self):
8065 """Check prerequisites.
8067 This checks that the instance is in the cluster.
8069 """
8070 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
8071 assert instance is not None, \
8072 "Cannot retrieve locked instance %s" % self.instance_name
8074 if instance.disk_template != constants.DT_DRBD8:
8075 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
8076 " instances", errors.ECODE_INVAL)
8078 if len(instance.secondary_nodes) != 1:
8079 raise errors.OpPrereqError("The instance has a strange layout,"
8080 " expected one secondary but found %d" %
8081 len(instance.secondary_nodes),
8082 errors.ECODE_FAULT)
8084 if not self.delay_iallocator:
8085 self._CheckPrereq2()
8087 def _CheckPrereq2(self):
8088 """Check prerequisites, second part.
8090 This function should always be part of CheckPrereq. It was separated and is
8091 now called from Exec because during node evacuation iallocator was only
8092 called with an unmodified cluster model, not taking planned changes into
8093 account.
8095 """
8096 instance = self.instance
8097 secondary_node = instance.secondary_nodes[0]
8099 if self.iallocator_name is None:
8100 remote_node = self.remote_node
8101 else:
8102 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
8103 instance.name, instance.secondary_nodes)
8105 if remote_node is not None:
8106 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
8107 assert self.remote_node_info is not None, \
8108 "Cannot retrieve locked node %s" % remote_node
8110 self.remote_node_info = None
8112 if remote_node == self.instance.primary_node:
8113 raise errors.OpPrereqError("The specified node is the primary node of"
8114 " the instance.", errors.ECODE_INVAL)
8116 if remote_node == secondary_node:
8117 raise errors.OpPrereqError("The specified node is already the"
8118 " secondary node of the instance.",
8121 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
8122 constants.REPLACE_DISK_CHG):
8123 raise errors.OpPrereqError("Cannot specify disks to be replaced",
8124 errors.ECODE_INVAL)
8126 if self.mode == constants.REPLACE_DISK_AUTO:
8127 faulty_primary = self._FindFaultyDisks(instance.primary_node)
8128 faulty_secondary = self._FindFaultyDisks(secondary_node)
8130 if faulty_primary and faulty_secondary:
8131 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
8132 " one node and can not be repaired"
8133 " automatically" % self.instance_name,
8137 self.disks = faulty_primary
8138 self.target_node = instance.primary_node
8139 self.other_node = secondary_node
8140 check_nodes = [self.target_node, self.other_node]
8141 elif faulty_secondary:
8142 self.disks = faulty_secondary
8143 self.target_node = secondary_node
8144 self.other_node = instance.primary_node
8145 check_nodes = [self.target_node, self.other_node]
8146 else:
8147 self.disks = []
8148 check_nodes = []
8150 else:
8151 # Non-automatic modes
8152 if self.mode == constants.REPLACE_DISK_PRI:
8153 self.target_node = instance.primary_node
8154 self.other_node = secondary_node
8155 check_nodes = [self.target_node, self.other_node]
8157 elif self.mode == constants.REPLACE_DISK_SEC:
8158 self.target_node = secondary_node
8159 self.other_node = instance.primary_node
8160 check_nodes = [self.target_node, self.other_node]
8162 elif self.mode == constants.REPLACE_DISK_CHG:
8163 self.new_node = remote_node
8164 self.other_node = instance.primary_node
8165 self.target_node = secondary_node
8166 check_nodes = [self.new_node, self.other_node]
8168 _CheckNodeNotDrained(self.lu, remote_node)
8169 _CheckNodeVmCapable(self.lu, remote_node)
8171 old_node_info = self.cfg.GetNodeInfo(secondary_node)
8172 assert old_node_info is not None
8173 if old_node_info.offline and not self.early_release:
8174 # doesn't make sense to delay the release
8175 self.early_release = True
8176 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
8177 " early-release mode", secondary_node)
8180 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
8181 self.mode)
8183 # If not specified all disks should be replaced
8184 if not self.disks:
8185 self.disks = range(len(self.instance.disks))
8187 for node in check_nodes:
8188 _CheckNodeOnline(self.lu, node)
8190 # Check whether disks are valid
8191 for disk_idx in self.disks:
8192 instance.FindDisk(disk_idx)
8194 # Get secondary node IP addresses
8195 node_2nd_ip = {}
8197 for node_name in [self.target_node, self.other_node, self.new_node]:
8198 if node_name is not None:
8199 node_2nd_ip[node_name] = self.cfg.GetNodeInfo(node_name).secondary_ip
8201 self.node_secondary_ip = node_2nd_ip
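# This mapping is used by the DRBD disconnect/attach RPCs further down:
# DRBD replication runs over the nodes' secondary IPs, not over their
# primary addresses.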
8203 def Exec(self, feedback_fn):
8204 """Execute disk replacement.
8206 This dispatches the disk replacement to the appropriate handler.
8208 """
8209 if self.delay_iallocator:
8210 self._CheckPrereq2()
8212 if not self.disks:
8213 feedback_fn("No disks need replacement")
8214 return
8216 feedback_fn("Replacing disk(s) %s for %s" %
8217 (utils.CommaJoin(self.disks), self.instance.name))
8219 activate_disks = (not self.instance.admin_up)
8221 # Activate the instance disks if we're replacing them on a down instance
8222 if activate_disks:
8223 _StartInstanceDisks(self.lu, self.instance, True)
8225 try:
8226 # Should we replace the secondary node?
8227 if self.new_node is not None:
8228 fn = self._ExecDrbd8Secondary
8229 else:
8230 fn = self._ExecDrbd8DiskOnly
8232 return fn(feedback_fn)
8234 finally:
8235 # Deactivate the instance disks if we're replacing them on a
8236 # down instance
8237 if activate_disks:
8238 _SafeShutdownInstanceDisks(self.lu, self.instance)
8240 def _CheckVolumeGroup(self, nodes):
8241 self.lu.LogInfo("Checking volume groups")
8243 vgname = self.cfg.GetVGName()
8245 # Make sure volume group exists on all involved nodes
8246 results = self.rpc.call_vg_list(nodes)
8247 if not results:
8248 raise errors.OpExecError("Can't list volume groups on the nodes")
8250 for node in nodes:
8251 res = results[node]
8252 res.Raise("Error checking node %s" % node)
8253 if vgname not in res.payload:
8254 raise errors.OpExecError("Volume group '%s' not found on node %s" %
8255 (vgname, node))
8257 def _CheckDisksExistence(self, nodes):
8258 # Check disk existence
8259 for idx, dev in enumerate(self.instance.disks):
8260 if idx not in self.disks:
8261 continue
8263 for node in nodes:
8264 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
8265 self.cfg.SetDiskID(dev, node)
8267 result = self.rpc.call_blockdev_find(node, dev)
8269 msg = result.fail_msg
8270 if msg or not result.payload:
8271 if not msg:
8272 msg = "disk not found"
8273 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
8274 (idx, node, msg))
8276 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
8277 for idx, dev in enumerate(self.instance.disks):
8278 if idx not in self.disks:
8279 continue
8281 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
8282 (idx, node_name))
8284 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
8285 ldisk=ldisk):
8286 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
8287 " replace disks for instance %s" %
8288 (node_name, self.instance.name))
8290 def _CreateNewStorage(self, node_name):
8291 iv_names = {}
8293 for idx, dev in enumerate(self.instance.disks):
8294 if idx not in self.disks:
8295 continue
8297 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
8299 self.cfg.SetDiskID(dev, node_name)
8301 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
8302 names = _GenerateUniqueNames(self.lu, lv_names)
8304 vg_data = dev.children[0].logical_id[0]
8305 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
8306 logical_id=(vg_data, names[0]))
8307 vg_meta = dev.children[1].logical_id[0]
8308 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
8309 logical_id=(vg_meta, names[1]))
8311 new_lvs = [lv_data, lv_meta]
8312 old_lvs = dev.children
8313 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
8315 # we pass force_create=True to force the LVM creation
8316 for new_lv in new_lvs:
8317 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
8318 _GetInstanceInfoText(self.instance), False)
8320 return iv_names
8322 def _CheckDevices(self, node_name, iv_names):
8323 for name, (dev, _, _) in iv_names.iteritems():
8324 self.cfg.SetDiskID(dev, node_name)
8326 result = self.rpc.call_blockdev_find(node_name, dev)
8328 msg = result.fail_msg
8329 if msg or not result.payload:
8330 if not msg:
8331 msg = "disk not found"
8332 raise errors.OpExecError("Can't find DRBD device %s: %s" %
8333 (name, msg))
8335 if result.payload.is_degraded:
8336 raise errors.OpExecError("DRBD device %s is degraded!" % name)
8338 def _RemoveOldStorage(self, node_name, iv_names):
8339 for name, (_, old_lvs, _) in iv_names.iteritems():
8340 self.lu.LogInfo("Remove logical volumes for %s" % name)
8342 for lv in old_lvs:
8343 self.cfg.SetDiskID(lv, node_name)
8345 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
8346 if msg:
8347 self.lu.LogWarning("Can't remove old LV: %s" % msg,
8348 hint="remove unused LVs manually")
8350 def _ReleaseNodeLock(self, node_name):
8351 """Releases the lock for a given node."""
8352 self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
8354 def _ExecDrbd8DiskOnly(self, feedback_fn):
8355 """Replace a disk on the primary or secondary for DRBD 8.
8357 The algorithm for replace is quite complicated:
8359 1. for each disk to be replaced:
8361 1. create new LVs on the target node with unique names
8362 1. detach old LVs from the drbd device
8363 1. rename old LVs to name_replaced.<time_t>
8364 1. rename new LVs to old LVs
8365 1. attach the new LVs (with the old names now) to the drbd device
8367 1. wait for sync across all devices
8369 1. for each modified disk:
8371 1. remove old LVs (which have the name name_replaces.<time_t>)
8373 Failures are not very well handled.
8375 """
8377 steps_total = 6
8378 # Step: check device activation
8379 self.lu.LogStep(1, steps_total, "Check device existence")
8380 self._CheckDisksExistence([self.other_node, self.target_node])
8381 self._CheckVolumeGroup([self.target_node, self.other_node])
8383 # Step: check other node consistency
8384 self.lu.LogStep(2, steps_total, "Check peer consistency")
8385 self._CheckDisksConsistency(self.other_node,
8386 self.other_node == self.instance.primary_node,
8387 False)
8389 # Step: create new storage
8390 self.lu.LogStep(3, steps_total, "Allocate new storage")
8391 iv_names = self._CreateNewStorage(self.target_node)
8393 # Step: for each lv, detach+rename*2+attach
8394 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
8395 for dev, old_lvs, new_lvs in iv_names.itervalues():
8396 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
8398 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
8399 old_lvs)
8400 result.Raise("Can't detach drbd from local storage on node"
8401 " %s for device %s" % (self.target_node, dev.iv_name))
8403 #cfg.Update(instance)
8405 # ok, we created the new LVs, so now we know we have the needed
8406 # storage; as such, we proceed on the target node to rename
8407 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
8408 # using the assumption that logical_id == physical_id (which in
8409 # turn is the unique_id on that node)
8411 # FIXME(iustin): use a better name for the replaced LVs
8412 temp_suffix = int(time.time())
8413 ren_fn = lambda d, suff: (d.physical_id[0],
8414 d.physical_id[1] + "_replaced-%s" % suff)
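# ren_fn assumes an LV's physical_id is a (vg_name, lv_name) pair: only
# the LV name gets the "_replaced-<timestamp>" suffix, the VG is kept.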
8416 # Build the rename list based on what LVs exist on the node
8417 rename_old_to_new = []
8418 for to_ren in old_lvs:
8419 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
8420 if not result.fail_msg and result.payload:
8422 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
8424 self.lu.LogInfo("Renaming the old LVs on the target node")
8425 result = self.rpc.call_blockdev_rename(self.target_node,
8426 rename_old_to_new)
8427 result.Raise("Can't rename old LVs on node %s" % self.target_node)
8429 # Now we rename the new LVs to the old LVs
8430 self.lu.LogInfo("Renaming the new LVs on the target node")
8431 rename_new_to_old = [(new, old.physical_id)
8432 for old, new in zip(old_lvs, new_lvs)]
8433 result = self.rpc.call_blockdev_rename(self.target_node,
8434 rename_new_to_old)
8435 result.Raise("Can't rename new LVs on node %s" % self.target_node)
8437 for old, new in zip(old_lvs, new_lvs):
8438 new.logical_id = old.logical_id
8439 self.cfg.SetDiskID(new, self.target_node)
8441 for disk in old_lvs:
8442 disk.logical_id = ren_fn(disk, temp_suffix)
8443 self.cfg.SetDiskID(disk, self.target_node)
8445 # Now that the new lvs have the old name, we can add them to the device
8446 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
8447 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
8448 new_lvs)
8449 msg = result.fail_msg
8450 if msg:
8451 for new_lv in new_lvs:
8452 msg2 = self.rpc.call_blockdev_remove(self.target_node,
8453 new_lv).fail_msg
8454 if msg2:
8455 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
8456 hint=("cleanup manually the unused logical"
8457 " volumes"))
8458 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
8460 dev.children = new_lvs
8462 self.cfg.Update(self.instance, feedback_fn)
8464 cstep = 5
8465 if self.early_release:
8466 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8467 cstep += 1
8468 self._RemoveOldStorage(self.target_node, iv_names)
8469 # WARNING: we release both node locks here, do not do other RPCs
8470 # than WaitForSync to the primary node
8471 self._ReleaseNodeLock([self.target_node, self.other_node])
8474 # This can fail as the old devices are degraded and _WaitForSync
8475 # does a combined result over all disks, so we don't check its return value
8476 self.lu.LogStep(cstep, steps_total, "Sync devices")
8477 cstep += 1
8478 _WaitForSync(self.lu, self.instance)
8480 # Check all devices manually
8481 self._CheckDevices(self.instance.primary_node, iv_names)
8483 # Step: remove old storage
8484 if not self.early_release:
8485 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8486 cstep += 1
8487 self._RemoveOldStorage(self.target_node, iv_names)
8489 def _ExecDrbd8Secondary(self, feedback_fn):
8490 """Replace the secondary node for DRBD 8.
8492 The algorithm for replace is quite complicated:
8493 - for all disks of the instance:
8494 - create new LVs on the new node with same names
8495 - shutdown the drbd device on the old secondary
8496 - disconnect the drbd network on the primary
8497 - create the drbd device on the new secondary
8498 - network attach the drbd on the primary, using an artifice:
8499 the drbd code for Attach() will connect to the network if it
8500 finds a device which is connected to the good local disks but
8501 not network enabled
8502 - wait for sync across all devices
8503 - remove all disks from the old secondary
8505 Failures are not very well handled.
8507 """
8509 steps_total = 6
8510 # Step: check device activation
8511 self.lu.LogStep(1, steps_total, "Check device existence")
8512 self._CheckDisksExistence([self.instance.primary_node])
8513 self._CheckVolumeGroup([self.instance.primary_node])
8515 # Step: check other node consistency
8516 self.lu.LogStep(2, steps_total, "Check peer consistency")
8517 self._CheckDisksConsistency(self.instance.primary_node, True, True)
8519 # Step: create new storage
8520 self.lu.LogStep(3, steps_total, "Allocate new storage")
8521 for idx, dev in enumerate(self.instance.disks):
8522 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
8523 (self.new_node, idx))
8524 # we pass force_create=True to force LVM creation
8525 for new_lv in dev.children:
8526 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
8527 _GetInstanceInfoText(self.instance), False)
8529 # Step 4: drbd minors and drbd setup changes
8530 # after this, we must manually remove the drbd minors on both the
8531 # error and the success paths
8532 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
8533 minors = self.cfg.AllocateDRBDMinor([self.new_node
8534 for dev in self.instance.disks],
8535 self.instance.name)
8536 logging.debug("Allocated minors %r", minors)
8538 iv_names = {}
8539 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
8540 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
8541 (self.new_node, idx))
8542 # create new devices on new_node; note that we create two IDs:
8543 # one without port, so the drbd will be activated without
8544 # networking information on the new node at this stage, and one
8545 # with network, for the latter activation in step 4
8546 (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
8547 if self.instance.primary_node == o_node1:
8548 p_minor = o_minor1
8549 else:
8550 assert self.instance.primary_node == o_node2, "Three-node instance?"
8551 p_minor = o_minor2
8553 new_alone_id = (self.instance.primary_node, self.new_node, None,
8554 p_minor, new_minor, o_secret)
8555 new_net_id = (self.instance.primary_node, self.new_node, o_port,
8556 p_minor, new_minor, o_secret)
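# For reference, a DRBD8 logical_id is the tuple (nodeA, nodeB, port,
# minorA, minorB, secret): new_alone_id has port=None so the device is
# created standalone, while new_net_id keeps the old port for the
# network attach done further down.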
8558 iv_names[idx] = (dev, dev.children, new_net_id)
8559 logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
8560 new_net_id)
8561 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
8562 logical_id=new_alone_id,
8563 children=dev.children,
8564 size=dev.size)
8565 try:
8566 _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
8567 _GetInstanceInfoText(self.instance), False)
8568 except errors.GenericError:
8569 self.cfg.ReleaseDRBDMinors(self.instance.name)
8570 raise
8572 # We have new devices, shutdown the drbd on the old secondary
8573 for idx, dev in enumerate(self.instance.disks):
8574 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
8575 self.cfg.SetDiskID(dev, self.target_node)
8576 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
8577 if msg:
8578 self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
8579 "node: %s" % (idx, msg),
8580 hint=("Please cleanup this device manually as"
8581 " soon as possible"))
8583 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
8584 result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
8585 self.node_secondary_ip,
8586 self.instance.disks)\
8587 [self.instance.primary_node]
8589 msg = result.fail_msg
8590 if msg:
8591 # detaches didn't succeed (unlikely)
8592 self.cfg.ReleaseDRBDMinors(self.instance.name)
8593 raise errors.OpExecError("Can't detach the disks from the network on"
8594 " old node: %s" % (msg,))
8596 # if we managed to detach at least one, we update all the disks of
8597 # the instance to point to the new secondary
8598 self.lu.LogInfo("Updating instance configuration")
8599 for dev, _, new_logical_id in iv_names.itervalues():
8600 dev.logical_id = new_logical_id
8601 self.cfg.SetDiskID(dev, self.instance.primary_node)
8603 self.cfg.Update(self.instance, feedback_fn)
8605 # and now perform the drbd attach
8606 self.lu.LogInfo("Attaching primary drbds to new secondary"
8607 " (standalone => connected)")
8608 result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
8609 self.new_node],
8610 self.node_secondary_ip,
8611 self.instance.disks,
8612 self.instance.name,
8613 False)
8614 for to_node, to_result in result.items():
8615 msg = to_result.fail_msg
8616 if msg:
8617 self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
8618 to_node, msg,
8619 hint=("please do a gnt-instance info to see the"
8620 " status of disks"))
8622 if self.early_release:
8623 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8624 cstep += 1
8625 self._RemoveOldStorage(self.target_node, iv_names)
8626 # WARNING: we release all node locks here, do not do other RPCs
8627 # than WaitForSync to the primary node
8628 self._ReleaseNodeLock([self.instance.primary_node,
8629 self.target_node,
8630 self.new_node])
8633 # This can fail as the old devices are degraded and _WaitForSync
8634 # does a combined result over all disks, so we don't check its return value
8635 self.lu.LogStep(cstep, steps_total, "Sync devices")
8636 cstep += 1
8637 _WaitForSync(self.lu, self.instance)
8639 # Check all devices manually
8640 self._CheckDevices(self.instance.primary_node, iv_names)
8642 # Step: remove old storage
8643 if not self.early_release:
8644 self.lu.LogStep(cstep, steps_total, "Removing old storage")
8645 self._RemoveOldStorage(self.target_node, iv_names)
8648 class LURepairNodeStorage(NoHooksLU):
8649 """Repairs the volume group on a node.
8654 def CheckArguments(self):
8655 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
8657 storage_type = self.op.storage_type
8659 if (constants.SO_FIX_CONSISTENCY not in
8660 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
8661 raise errors.OpPrereqError("Storage units of type '%s' can not be"
8662 " repaired" % storage_type,
8665 def ExpandNames(self):
8666 self.needed_locks = {
8667 locking.LEVEL_NODE: [self.op.node_name],
8668 }
8670 def _CheckFaultyDisks(self, instance, node_name):
8671 """Ensure faulty disks abort the opcode or at least warn."""
8673 if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
8674 node_name, True):
8675 raise errors.OpPrereqError("Instance '%s' has faulty disks on"
8676 " node '%s'" % (instance.name, node_name),
8678 except errors.OpPrereqError, err:
8679 if self.op.ignore_consistency:
8680 self.proc.LogWarning(str(err.args[0]))
8681 else:
8682 raise
8684 def CheckPrereq(self):
8685 """Check prerequisites.
8688 # Check whether any instance on this node has faulty disks
8689 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
8690 if not inst.admin_up:
8691 continue
8692 check_nodes = set(inst.all_nodes)
8693 check_nodes.discard(self.op.node_name)
8694 for inst_node_name in check_nodes:
8695 self._CheckFaultyDisks(inst, inst_node_name)
8697 def Exec(self, feedback_fn):
8698 feedback_fn("Repairing storage unit '%s' on %s ..." %
8699 (self.op.name, self.op.node_name))
8701 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
8702 result = self.rpc.call_storage_execute(self.op.node_name,
8703 self.op.storage_type, st_args,
8704 self.op.name,
8705 constants.SO_FIX_CONSISTENCY)
8706 result.Raise("Failed to repair storage unit '%s' on %s" %
8707 (self.op.name, self.op.node_name))
8710 class LUNodeEvacStrategy(NoHooksLU):
8711 """Computes the node evacuation strategy.
8716 def CheckArguments(self):
8717 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
8719 def ExpandNames(self):
8720 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
8721 self.needed_locks = locks = {}
8722 if self.op.remote_node is None:
8723 locks[locking.LEVEL_NODE] = locking.ALL_SET
8724 else:
8725 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8726 locks[locking.LEVEL_NODE] = self.op.nodes + [self.op.remote_node]
8728 def Exec(self, feedback_fn):
8729 if self.op.remote_node is not None:
8730 instances = []
8731 for node in self.op.nodes:
8732 instances.extend(_GetNodeSecondaryInstances(self.cfg, node))
8733 result = []
8734 for i in instances:
8735 if i.primary_node == self.op.remote_node:
8736 raise errors.OpPrereqError("Node %s is the primary node of"
8737 " instance %s, cannot use it as"
8739 (self.op.remote_node, i.name),
8740 errors.ECODE_INVAL)
8741 result.append([i.name, self.op.remote_node])
8742 else:
8743 ial = IAllocator(self.cfg, self.rpc,
8744 mode=constants.IALLOCATOR_MODE_MEVAC,
8745 evac_nodes=self.op.nodes)
8746 ial.Run(self.op.iallocator, validate=True)
8747 if not ial.success:
8748 raise errors.OpExecError("No valid evacuation solution: %s" % ial.info,
8749 errors.ECODE_NORES)
8750 result = ial.result
8752 return result
8754 class LUInstanceGrowDisk(LogicalUnit):
8755 """Grow a disk of an instance.
8759 HTYPE = constants.HTYPE_INSTANCE
8760 REQ_BGL = False
8762 def ExpandNames(self):
8763 self._ExpandAndLockInstance()
8764 self.needed_locks[locking.LEVEL_NODE] = []
8765 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8767 def DeclareLocks(self, level):
8768 if level == locking.LEVEL_NODE:
8769 self._LockInstancesNodes()
8771 def BuildHooksEnv(self):
8772 """Build hooks env.
8774 This runs on the master, the primary and all the secondaries.
8776 """
8777 env = {
8778 "DISK": self.op.disk,
8779 "AMOUNT": self.op.amount,
8781 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
8782 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
8783 return env, nl, nl
8785 def CheckPrereq(self):
8786 """Check prerequisites.
8788 This checks that the instance is in the cluster.
8790 """
8791 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
8792 assert instance is not None, \
8793 "Cannot retrieve locked instance %s" % self.op.instance_name
8794 nodenames = list(instance.all_nodes)
8795 for node in nodenames:
8796 _CheckNodeOnline(self, node)
8798 self.instance = instance
8800 if instance.disk_template not in constants.DTS_GROWABLE:
8801 raise errors.OpPrereqError("Instance's disk layout does not support"
8802 " growing.", errors.ECODE_INVAL)
8804 self.disk = instance.FindDisk(self.op.disk)
8806 if instance.disk_template != constants.DT_FILE:
8807 # TODO: check the free disk space for file, when that feature
8808 # will be supported
8809 _CheckNodesFreeDiskPerVG(self, nodenames,
8810 self.disk.ComputeGrowth(self.op.amount))
8812 def Exec(self, feedback_fn):
8813 """Execute disk grow.
8816 instance = self.instance
8817 disk = self.disk
8819 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
8820 if not disks_ok:
8821 raise errors.OpExecError("Cannot activate block device to grow")
8823 for node in instance.all_nodes:
8824 self.cfg.SetDiskID(disk, node)
8825 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
8826 result.Raise("Grow request failed to node %s" % node)
8828 # TODO: Rewrite code to work properly
8829 # DRBD goes into sync mode for a short amount of time after executing the
8830 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
8831 # calling "resize" in sync mode fails. Sleeping for a short amount of
8832 # time is a work-around.
8833 time.sleep(5)
8835 disk.RecordGrow(self.op.amount)
8836 self.cfg.Update(instance, feedback_fn)
8837 if self.op.wait_for_sync:
8838 disk_abort = not _WaitForSync(self, instance, disks=[disk])
8839 if disk_abort:
8840 self.proc.LogWarning("Warning: disk sync-ing has not returned a good"
8841 " status.\nPlease check the instance.")
8842 if not instance.admin_up:
8843 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
8844 elif not instance.admin_up:
8845 self.proc.LogWarning("Not shutting down the disk even if the instance is"
8846 " not supposed to be running because no wait for"
8847 " sync mode was requested.")
8850 class LUInstanceQueryData(NoHooksLU):
8851 """Query runtime instance data.
8856 def ExpandNames(self):
8857 self.needed_locks = {}
8859 # Use locking if requested or when non-static information is wanted
8860 if not (self.op.static or self.op.use_locking):
8861 self.LogWarning("Non-static data requested, locks need to be acquired")
8862 self.op.use_locking = True
8864 if self.op.instances or not self.op.use_locking:
8865 # Expand instance names right here
8866 self.wanted_names = _GetWantedInstances(self, self.op.instances)
8868 # Will use acquired locks
8869 self.wanted_names = None
8871 if self.op.use_locking:
8872 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
8874 if self.wanted_names is None:
8875 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
8876 else:
8877 self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
8879 self.needed_locks[locking.LEVEL_NODE] = []
8880 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
8881 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
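# A share_locks value of 1 means locks at that level are acquired in
# shared (read) rather than exclusive mode, which is sufficient for a
# pure query LU like this one.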
8883 def DeclareLocks(self, level):
8884 if self.op.use_locking and level == locking.LEVEL_NODE:
8885 self._LockInstancesNodes()
8887 def CheckPrereq(self):
8888 """Check prerequisites.
8890 This only checks the optional instance list against the existing names.
8892 """
8893 if self.wanted_names is None:
8894 assert self.op.use_locking, "Locking was not used"
8895 self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
8897 self.wanted_instances = [self.cfg.GetInstanceInfo(name)
8898 for name in self.wanted_names]
8900 def _ComputeBlockdevStatus(self, node, instance_name, dev):
8901 """Returns the status of a block device
8904 if self.op.static or not node:
8905 return None
8907 self.cfg.SetDiskID(dev, node)
8909 result = self.rpc.call_blockdev_find(node, dev)
8910 if result.offline:
8911 return None
8913 result.Raise("Can't compute disk status for %s" % instance_name)
8915 status = result.payload
8916 if status is None:
8917 return None
8919 return (status.dev_path, status.major, status.minor,
8920 status.sync_percent, status.estimated_time,
8921 status.is_degraded, status.ldisk_status)
8923 def _ComputeDiskStatus(self, instance, snode, dev):
8924 """Compute block device status.
8927 if dev.dev_type in constants.LDS_DRBD:
8928 # we change the snode then (otherwise we use the one passed in)
8929 if dev.logical_id[0] == instance.primary_node:
8930 snode = dev.logical_id[1]
8931 else:
8932 snode = dev.logical_id[0]
8934 dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
8935 instance.name, dev)
8936 dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
8938 if dev.children:
8939 dev_children = [self._ComputeDiskStatus(instance, snode, child)
8940 for child in dev.children]
8941 else:
8942 dev_children = []
8944 return {
8945 "iv_name": dev.iv_name,
8946 "dev_type": dev.dev_type,
8947 "logical_id": dev.logical_id,
8948 "physical_id": dev.physical_id,
8949 "pstatus": dev_pstatus,
8950 "sstatus": dev_sstatus,
8951 "children": dev_children,
8956 def Exec(self, feedback_fn):
8957 """Gather and return data"""
8960 cluster = self.cfg.GetClusterInfo()
8962 for instance in self.wanted_instances:
8963 if not self.op.static:
8964 remote_info = self.rpc.call_instance_info(instance.primary_node,
8965 instance.name,
8966 instance.hypervisor)
8967 remote_info.Raise("Error checking node %s" % instance.primary_node)
8968 remote_info = remote_info.payload
8969 if remote_info and "state" in remote_info:
8970 remote_state = "up"
8971 else:
8972 remote_state = "down"
8973 else:
8974 remote_state = None
8975 if instance.admin_up:
8976 config_state = "up"
8977 else:
8978 config_state = "down"
8980 disks = [self._ComputeDiskStatus(instance, None, device)
8981 for device in instance.disks]
8983 result[instance.name] = {
8984 "name": instance.name,
8985 "config_state": config_state,
8986 "run_state": remote_state,
8987 "pnode": instance.primary_node,
8988 "snodes": instance.secondary_nodes,
8990 # this happens to be the same format used for hooks
8991 "nics": _NICListToTuple(self, instance.nics),
8992 "disk_template": instance.disk_template,
8994 "hypervisor": instance.hypervisor,
8995 "network_port": instance.network_port,
8996 "hv_instance": instance.hvparams,
8997 "hv_actual": cluster.FillHV(instance, skip_globals=True),
8998 "be_instance": instance.beparams,
8999 "be_actual": cluster.FillBE(instance),
9000 "os_instance": instance.osparams,
9001 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
9002 "serial_no": instance.serial_no,
9003 "mtime": instance.mtime,
9004 "ctime": instance.ctime,
9005 "uuid": instance.uuid,
9011 class LUInstanceSetParams(LogicalUnit):
9012 """Modifies an instances's parameters.
9015 HPATH = "instance-modify"
9016 HTYPE = constants.HTYPE_INSTANCE
9017 REQ_BGL = False
9019 def CheckArguments(self):
9020 if not (self.op.nics or self.op.disks or self.op.disk_template or
9021 self.op.hvparams or self.op.beparams or self.op.os_name):
9022 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
9024 if self.op.hvparams:
9025 _CheckGlobalHvParams(self.op.hvparams)
9027 # Disk validation
9028 disk_addremove = 0
9029 for disk_op, disk_dict in self.op.disks:
9030 utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
9031 if disk_op == constants.DDM_REMOVE:
9032 disk_addremove += 1
9033 continue
9034 elif disk_op == constants.DDM_ADD:
9035 disk_addremove += 1
9036 else:
9037 if not isinstance(disk_op, int):
9038 raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
9039 if not isinstance(disk_dict, dict):
9040 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
9041 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
9043 if disk_op == constants.DDM_ADD:
9044 mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
9045 if mode not in constants.DISK_ACCESS_SET:
9046 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
9047 errors.ECODE_INVAL)
9048 size = disk_dict.get('size', None)
9049 if size is None:
9050 raise errors.OpPrereqError("Required disk parameter size missing",
9051 errors.ECODE_INVAL)
9052 try:
9053 size = int(size)
9054 except (TypeError, ValueError), err:
9055 raise errors.OpPrereqError("Invalid disk size parameter: %s" %
9056 str(err), errors.ECODE_INVAL)
9057 disk_dict['size'] = size
9058 else:
9059 # modification of disk
9060 if 'size' in disk_dict:
9061 raise errors.OpPrereqError("Disk size change not possible, use"
9062 " grow-disk", errors.ECODE_INVAL)
9064 if disk_addremove > 1:
9065 raise errors.OpPrereqError("Only one disk add or remove operation"
9066 " supported at a time", errors.ECODE_INVAL)
9068 if self.op.disks and self.op.disk_template is not None:
9069 raise errors.OpPrereqError("Disk template conversion and other disk"
9070 " changes not supported at the same time",
9073 if (self.op.disk_template and
9074 self.op.disk_template in constants.DTS_NET_MIRROR and
9075 self.op.remote_node is None):
9076 raise errors.OpPrereqError("Changing the disk template to a mirrored"
9077 " one requires specifying a secondary node",
9082 for nic_op, nic_dict in self.op.nics:
9083 utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
9084 if nic_op == constants.DDM_REMOVE:
9085 nic_addremove += 1
9086 continue
9087 elif nic_op == constants.DDM_ADD:
9088 nic_addremove += 1
9089 else:
9090 if not isinstance(nic_op, int):
9091 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
9092 if not isinstance(nic_dict, dict):
9093 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
9094 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
9096 # nic_dict should be a dict
9097 nic_ip = nic_dict.get('ip', None)
9098 if nic_ip is not None:
9099 if nic_ip.lower() == constants.VALUE_NONE:
9100 nic_dict['ip'] = None
9101 else:
9102 if not netutils.IPAddress.IsValid(nic_ip):
9103 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
9104 errors.ECODE_INVAL)
9106 nic_bridge = nic_dict.get('bridge', None)
9107 nic_link = nic_dict.get('link', None)
9108 if nic_bridge and nic_link:
9109 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
9110 " at the same time", errors.ECODE_INVAL)
9111 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
9112 nic_dict['bridge'] = None
9113 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
9114 nic_dict['link'] = None
9116 if nic_op == constants.DDM_ADD:
9117 nic_mac = nic_dict.get('mac', None)
9118 if nic_mac is None:
9119 nic_dict['mac'] = constants.VALUE_AUTO
9121 if 'mac' in nic_dict:
9122 nic_mac = nic_dict['mac']
9123 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9124 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
9126 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
9127 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
9128 " modifying an existing nic",
9131 if nic_addremove > 1:
9132 raise errors.OpPrereqError("Only one NIC add or remove operation"
9133 " supported at a time", errors.ECODE_INVAL)
9135 def ExpandNames(self):
9136 self._ExpandAndLockInstance()
9137 self.needed_locks[locking.LEVEL_NODE] = []
9138 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9140 def DeclareLocks(self, level):
9141 if level == locking.LEVEL_NODE:
9142 self._LockInstancesNodes()
9143 if self.op.disk_template and self.op.remote_node:
9144 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9145 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
9147 def BuildHooksEnv(self):
9148 """Build hooks env.
9150 This runs on the master, primary and secondaries.
9152 """
9153 args = dict()
9154 if constants.BE_MEMORY in self.be_new:
9155 args['memory'] = self.be_new[constants.BE_MEMORY]
9156 if constants.BE_VCPUS in self.be_new:
9157 args['vcpus'] = self.be_new[constants.BE_VCPUS]
9158 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
9159 # information at all.
9160 if self.op.nics:
9161 args['nics'] = []
9162 nic_override = dict(self.op.nics)
9163 for idx, nic in enumerate(self.instance.nics):
9164 if idx in nic_override:
9165 this_nic_override = nic_override[idx]
9166 else:
9167 this_nic_override = {}
9168 if 'ip' in this_nic_override:
9169 ip = this_nic_override['ip']
9170 else:
9171 ip = nic.ip
9172 if 'mac' in this_nic_override:
9173 mac = this_nic_override['mac']
9174 else:
9175 mac = nic.mac
9176 if idx in self.nic_pnew:
9177 nicparams = self.nic_pnew[idx]
9178 else:
9179 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
9180 mode = nicparams[constants.NIC_MODE]
9181 link = nicparams[constants.NIC_LINK]
9182 args['nics'].append((ip, mac, mode, link))
9183 if constants.DDM_ADD in nic_override:
9184 ip = nic_override[constants.DDM_ADD].get('ip', None)
9185 mac = nic_override[constants.DDM_ADD]['mac']
9186 nicparams = self.nic_pnew[constants.DDM_ADD]
9187 mode = nicparams[constants.NIC_MODE]
9188 link = nicparams[constants.NIC_LINK]
9189 args['nics'].append((ip, mac, mode, link))
9190 elif constants.DDM_REMOVE in nic_override:
9191 del args['nics'][-1]
9193 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
9194 if self.op.disk_template:
9195 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
9196 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
9197 return env, nl, nl
9199 def CheckPrereq(self):
9200 """Check prerequisites.
9202 This only checks the instance list against the existing names.
9204 """
9205 # checking the new params on the primary/secondary nodes
9207 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9208 cluster = self.cluster = self.cfg.GetClusterInfo()
9209 assert self.instance is not None, \
9210 "Cannot retrieve locked instance %s" % self.op.instance_name
9211 pnode = instance.primary_node
9212 nodelist = list(instance.all_nodes)
9215 if self.op.os_name and not self.op.force:
9216 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
9217 self.op.force_variant)
9218 instance_os = self.op.os_name
9219 else:
9220 instance_os = instance.os
9222 if self.op.disk_template:
9223 if instance.disk_template == self.op.disk_template:
9224 raise errors.OpPrereqError("Instance already has disk template %s" %
9225 instance.disk_template, errors.ECODE_INVAL)
9227 if (instance.disk_template,
9228 self.op.disk_template) not in self._DISK_CONVERSIONS:
9229 raise errors.OpPrereqError("Unsupported disk template conversion from"
9230 " %s to %s" % (instance.disk_template,
9231 self.op.disk_template),
9232 errors.ECODE_INVAL)
9233 _CheckInstanceDown(self, instance, "cannot change disk template")
9234 if self.op.disk_template in constants.DTS_NET_MIRROR:
9235 if self.op.remote_node == pnode:
9236 raise errors.OpPrereqError("Given new secondary node %s is the same"
9237 " as the primary node of the instance" %
9238 self.op.remote_node, errors.ECODE_STATE)
9239 _CheckNodeOnline(self, self.op.remote_node)
9240 _CheckNodeNotDrained(self, self.op.remote_node)
9241 # FIXME: here we assume that the old instance type is DT_PLAIN
9242 assert instance.disk_template == constants.DT_PLAIN
9243 disks = [{"size": d.size, "vg": d.logical_id[0]}
9244 for d in instance.disks]
9245 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
9246 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
9248 # hvparams processing
9249 if self.op.hvparams:
9250 hv_type = instance.hypervisor
9251 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
9252 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
9253 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
9256 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
9257 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
9258 self.hv_new = hv_new # the new actual values
9259 self.hv_inst = i_hvdict # the new dict (without defaults)
9260 else:
9261 self.hv_new = self.hv_inst = {}
9263 # beparams processing
9264 if self.op.beparams:
9265 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
9266 use_default_values=True)
9267 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
9268 be_new = cluster.SimpleFillBE(i_bedict)
9269 self.be_new = be_new # the new actual values
9270 self.be_inst = i_bedict # the new dict (without defaults)
9271 else:
9272 self.be_new = self.be_inst = {}
9274 # osparams processing
9275 if self.op.osparams:
9276 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
9277 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
9278 self.os_inst = i_osdict # the new dict (without defaults)
9279 else:
9280 self.os_inst = {}
9284 if constants.BE_MEMORY in self.op.beparams and not self.op.force:
9285 mem_check_list = [pnode]
9286 if be_new[constants.BE_AUTO_BALANCE]:
9287 # either we changed auto_balance to yes or it was from before
9288 mem_check_list.extend(instance.secondary_nodes)
9289 instance_info = self.rpc.call_instance_info(pnode, instance.name,
9290 instance.hypervisor)
9291 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
9292 instance.hypervisor)
9293 pninfo = nodeinfo[pnode]
9294 msg = pninfo.fail_msg
9295 if msg:
9296 # Assume the primary node is unreachable and go ahead
9297 self.warn.append("Can't get info from primary node %s: %s" %
9298 (pnode, msg))
9299 elif not isinstance(pninfo.payload.get('memory_free', None), int):
9300 self.warn.append("Node data from primary node %s doesn't contain"
9301 " free memory information" % pnode)
9302 elif instance_info.fail_msg:
9303 self.warn.append("Can't get instance runtime information: %s" %
9304 instance_info.fail_msg)
9305 else:
9306 if instance_info.payload:
9307 current_mem = int(instance_info.payload['memory'])
9308 else:
9309 # Assume instance not running
9310 # (there is a slight race condition here, but it's not very probable,
9311 # and we have no other way to check)
9312 current_mem = 0
9313 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
9314 pninfo.payload['memory_free'])
9315 if miss_mem > 0:
9316 raise errors.OpPrereqError("This change will prevent the instance"
9317 " from starting, due to %d MB of memory"
9318 " missing on its primary node" % miss_mem,
9319 errors.ECODE_NORES)
9321 if be_new[constants.BE_AUTO_BALANCE]:
9322 for node, nres in nodeinfo.items():
9323 if node not in instance.secondary_nodes:
9324 continue
9325 msg = nres.fail_msg
9326 if msg:
9327 self.warn.append("Can't get info from secondary node %s: %s" %
9328 (node, msg))
9329 elif not isinstance(nres.payload.get('memory_free', None), int):
9330 self.warn.append("Secondary node %s didn't return free"
9331 " memory information" % node)
9332 elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
9333 self.warn.append("Not enough memory to failover instance to"
9334 " secondary node %s" % node)
9339 for nic_op, nic_dict in self.op.nics:
9340 if nic_op == constants.DDM_REMOVE:
9341 if not instance.nics:
9342 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
9343 errors.ECODE_INVAL)
9344 continue
9345 if nic_op != constants.DDM_ADD:
9346 # an existing nic
9347 if not instance.nics:
9348 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
9349 " no NICs" % nic_op,
9350 errors.ECODE_INVAL)
9351 if nic_op < 0 or nic_op >= len(instance.nics):
9352 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
9353 " are 0 to %d" %
9354 (nic_op, len(instance.nics) - 1),
9355 errors.ECODE_INVAL)
9356 old_nic_params = instance.nics[nic_op].nicparams
9357 old_nic_ip = instance.nics[nic_op].ip
9358 else:
9359 old_nic_params = {}
9360 old_nic_ip = None
9362 update_params_dict = dict([(key, nic_dict[key])
9363 for key in constants.NICS_PARAMETERS
9364 if key in nic_dict])
9366 if 'bridge' in nic_dict:
9367 update_params_dict[constants.NIC_LINK] = nic_dict['bridge']
9369 new_nic_params = _GetUpdatedParams(old_nic_params,
9370 update_params_dict)
9371 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
9372 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
9373 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
9374 self.nic_pinst[nic_op] = new_nic_params
9375 self.nic_pnew[nic_op] = new_filled_nic_params
9376 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
9378 if new_nic_mode == constants.NIC_MODE_BRIDGED:
9379 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
9380 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
9381 if msg:
9382 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
9383 if self.op.force:
9384 self.warn.append(msg)
9385 else:
9386 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
9387 if new_nic_mode == constants.NIC_MODE_ROUTED:
9388 if 'ip' in nic_dict:
9389 nic_ip = nic_dict['ip']
9390 else:
9391 nic_ip = old_nic_ip
9392 if nic_ip is None:
9393 raise errors.OpPrereqError('Cannot set the nic ip to None'
9394 ' on a routed nic', errors.ECODE_INVAL)
9395 if 'mac' in nic_dict:
9396 nic_mac = nic_dict['mac']
9397 if nic_mac is None:
9398 raise errors.OpPrereqError('Cannot set the nic mac to None',
9399 errors.ECODE_INVAL)
9400 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9401 # otherwise generate the mac
9402 nic_dict['mac'] = self.cfg.GenerateMAC(self.proc.GetECId())
9403 else:
9404 # or validate/reserve the current one
9405 try:
9406 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
9407 except errors.ReservationError:
9408 raise errors.OpPrereqError("MAC address %s already in use"
9409 " in cluster" % nic_mac,
9410 errors.ECODE_NOTUNIQUE)
9413 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
9414 raise errors.OpPrereqError("Disk operations not supported for"
9415 " diskless instances",
9416 errors.ECODE_INVAL)
9417 for disk_op, _ in self.op.disks:
9418 if disk_op == constants.DDM_REMOVE:
9419 if len(instance.disks) == 1:
9420 raise errors.OpPrereqError("Cannot remove the last disk of"
9421 " an instance", errors.ECODE_INVAL)
9422 _CheckInstanceDown(self, instance, "cannot remove disks")
9424 if (disk_op == constants.DDM_ADD and
9425 len(instance.disks) >= constants.MAX_DISKS):
9426 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
9427 " add more" % constants.MAX_DISKS,
9428 errors.ECODE_STATE)
9429 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
9430 # an existing disk
9431 if disk_op < 0 or disk_op >= len(instance.disks):
9432 raise errors.OpPrereqError("Invalid disk index %s, valid values"
9433 " are 0 to %d" %
9434 (disk_op, len(instance.disks)),
9435 errors.ECODE_INVAL)
9439 def _ConvertPlainToDrbd(self, feedback_fn):
9440 """Converts an instance from plain to drbd.
9443 feedback_fn("Converting template to drbd")
9444 instance = self.instance
9445 pnode = instance.primary_node
9446 snode = self.op.remote_node
9448 # create a fake disk info for _GenerateDiskTemplate
9449 disk_info = [{"size": d.size, "mode": d.mode,
9450 "vg": d.logical_id[0]} for d in instance.disks]
9451 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
9452 instance.name, pnode, [snode],
9453 disk_info, None, None, 0, feedback_fn)
9454 info = _GetInstanceInfoText(instance)
9455 feedback_fn("Creating additional volumes...")
9456 # first, create the missing data and meta devices
9457 for disk in new_disks:
9458 # unfortunately this is... not too nice
9459 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
9460 info, True)
9461 for child in disk.children:
9462 _CreateSingleBlockDev(self, snode, instance, child, info, True)
9463 # at this stage, all new LVs have been created, we can rename the
9464 # old ones
9465 feedback_fn("Renaming original volumes...")
9466 rename_list = [(o, n.children[0].logical_id)
9467 for (o, n) in zip(instance.disks, new_disks)]
9468 result = self.rpc.call_blockdev_rename(pnode, rename_list)
9469 result.Raise("Failed to rename original LVs")
9471 feedback_fn("Initializing DRBD devices...")
9472 # all child devices are in place, we can now create the DRBD devices
9473 for disk in new_disks:
9474 for node in [pnode, snode]:
9475 f_create = node == pnode
9476 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
9478 # at this point, the instance has been modified
9479 instance.disk_template = constants.DT_DRBD8
9480 instance.disks = new_disks
9481 self.cfg.Update(instance, feedback_fn)
9483 # disks are created, waiting for sync
9484 disk_abort = not _WaitForSync(self, instance)
9485 if disk_abort:
9486 raise errors.OpExecError("There are some degraded disks for"
9487 " this instance, please cleanup manually")
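# Usage sketch (illustrative, instance/node names hypothetical): this
# converter backs OpInstanceSetParams with a new disk template, e.g.:
#
#   gnt-instance modify -t drbd -n node2.example.com instance1.example.com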
9489 def _ConvertDrbdToPlain(self, feedback_fn):
9490 """Converts an instance from drbd to plain.
9493 instance = self.instance
9494 assert len(instance.secondary_nodes) == 1
9495 pnode = instance.primary_node
9496 snode = instance.secondary_nodes[0]
9497 feedback_fn("Converting template to plain")
9499 old_disks = instance.disks
9500 new_disks = [d.children[0] for d in old_disks]
9502 # copy over size and mode
9503 for parent, child in zip(old_disks, new_disks):
9504 child.size = parent.size
9505 child.mode = parent.mode
9507 # update instance structure
9508 instance.disks = new_disks
9509 instance.disk_template = constants.DT_PLAIN
9510 self.cfg.Update(instance, feedback_fn)
9512 feedback_fn("Removing volumes on the secondary node...")
9513 for disk in old_disks:
9514 self.cfg.SetDiskID(disk, snode)
9515 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
9516 if msg:
9517 self.LogWarning("Could not remove block device %s on node %s,"
9518 " continuing anyway: %s", disk.iv_name, snode, msg)
9520 feedback_fn("Removing unneeded volumes on the primary node...")
9521 for idx, disk in enumerate(old_disks):
9522 meta = disk.children[1]
9523 self.cfg.SetDiskID(meta, pnode)
9524 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
9525 if msg:
9526 self.LogWarning("Could not remove metadata for disk %d on node %s,"
9527 " continuing anyway: %s", idx, pnode, msg)
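# Usage sketch (illustrative, name hypothetical): the reverse conversion keeps
# only the data LVs on the primary node:
#
#   gnt-instance modify -t plain instance1.example.com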
9529 def Exec(self, feedback_fn):
9530 """Modifies an instance.
9532 All parameters take effect only at the next restart of the instance.
9535 # Process here the warnings from CheckPrereq, as we don't have a
9536 # feedback_fn there.
9537 for warn in self.warn:
9538 feedback_fn("WARNING: %s" % warn)
9540 result = []
9541 instance = self.instance
9543 for disk_op, disk_dict in self.op.disks:
9544 if disk_op == constants.DDM_REMOVE:
9545 # remove the last disk
9546 device = instance.disks.pop()
9547 device_idx = len(instance.disks)
9548 for node, disk in device.ComputeNodeTree(instance.primary_node):
9549 self.cfg.SetDiskID(disk, node)
9550 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
9551 if msg:
9552 self.LogWarning("Could not remove disk/%d on node %s: %s,"
9553 " continuing anyway", device_idx, node, msg)
9554 result.append(("disk/%d" % device_idx, "remove"))
9555 elif disk_op == constants.DDM_ADD:
9556 # add a new disk
9557 if instance.disk_template == constants.DT_FILE:
9558 file_driver, file_path = instance.disks[0].logical_id
9559 file_path = os.path.dirname(file_path)
9560 else:
9561 file_driver = file_path = None
9562 disk_idx_base = len(instance.disks)
9563 new_disk = _GenerateDiskTemplate(self,
9564 instance.disk_template,
9565 instance.name, instance.primary_node,
9566 instance.secondary_nodes,
9567 [disk_dict],
9568 file_path,
9569 file_driver,
9570 disk_idx_base, feedback_fn)[0]
9571 instance.disks.append(new_disk)
9572 info = _GetInstanceInfoText(instance)
9574 logging.info("Creating volume %s for instance %s",
9575 new_disk.iv_name, instance.name)
9576 # Note: this needs to be kept in sync with _CreateDisks
9578 for node in instance.all_nodes:
9579 f_create = node == instance.primary_node
9580 try:
9581 _CreateBlockDev(self, node, instance, new_disk,
9582 f_create, info, f_create)
9583 except errors.OpExecError, err:
9584 self.LogWarning("Failed to create volume %s (%s) on"
9585 " node %s: %s",
9586 new_disk.iv_name, new_disk, node, err)
9587 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
9588 (new_disk.size, new_disk.mode)))
9589 else:
9590 # change a given disk
9591 instance.disks[disk_op].mode = disk_dict['mode']
9592 result.append(("disk.mode/%d" % disk_op, disk_dict['mode']))
9594 if self.op.disk_template:
9595 r_shut = _ShutdownInstanceDisks(self, instance)
9596 if not r_shut:
9597 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
9598 " proceed with disk template conversion")
9599 mode = (instance.disk_template, self.op.disk_template)
9600 try:
9601 self._DISK_CONVERSIONS[mode](self, feedback_fn)
9602 except:
9603 self.cfg.ReleaseDRBDMinors(instance.name)
9604 raise
9605 result.append(("disk_template", self.op.disk_template))
9608 for nic_op, nic_dict in self.op.nics:
9609 if nic_op == constants.DDM_REMOVE:
9610 # remove the last nic
9611 del instance.nics[-1]
9612 result.append(("nic.%d" % len(instance.nics), "remove"))
9613 elif nic_op == constants.DDM_ADD:
9614 # mac and bridge should be set, by now
9615 mac = nic_dict['mac']
9616 ip = nic_dict.get('ip', None)
9617 nicparams = self.nic_pinst[constants.DDM_ADD]
9618 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
9619 instance.nics.append(new_nic)
9620 result.append(("nic.%d" % (len(instance.nics) - 1),
9621 "add:mac=%s,ip=%s,mode=%s,link=%s" %
9622 (new_nic.mac, new_nic.ip,
9623 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
9624 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
9625 )))
9626 else:
9627 for key in 'mac', 'ip':
9628 if key in nic_dict:
9629 setattr(instance.nics[nic_op], key, nic_dict[key])
9630 if nic_op in self.nic_pinst:
9631 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
9632 for key, val in nic_dict.iteritems():
9633 result.append(("nic.%s/%d" % (key, nic_op), val))
9636 if self.op.hvparams:
9637 instance.hvparams = self.hv_inst
9638 for key, val in self.op.hvparams.iteritems():
9639 result.append(("hv/%s" % key, val))
9642 if self.op.beparams:
9643 instance.beparams = self.be_inst
9644 for key, val in self.op.beparams.iteritems():
9645 result.append(("be/%s" % key, val))
9647 # OS change
9648 if self.op.os_name:
9649 instance.os = self.op.os_name
9652 if self.op.osparams:
9653 instance.osparams = self.os_inst
9654 for key, val in self.op.osparams.iteritems():
9655 result.append(("os/%s" % key, val))
9657 self.cfg.Update(instance, feedback_fn)
9659 return result
9661 _DISK_CONVERSIONS = {
9662 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
9663 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
9664 }
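# Design note (editorial): _DISK_CONVERSIONS maps (old_template, new_template)
# pairs to unbound converter methods, which is why Exec() calls them as
# self._DISK_CONVERSIONS[mode](self, feedback_fn), passing the LU explicitly.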
9667 class LUBackupQuery(NoHooksLU):
9668 """Query the exports list
9673 def ExpandNames(self):
9674 self.needed_locks = {}
9675 self.share_locks[locking.LEVEL_NODE] = 1
9676 if not self.op.nodes:
9677 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9679 self.needed_locks[locking.LEVEL_NODE] = \
9680 _GetWantedNodes(self, self.op.nodes)
9682 def Exec(self, feedback_fn):
9683 """Compute the list of all the exported system images.
9686 @return: a dictionary with the structure node->(export-list)
9687 where export-list is a list of the instances exported on
9691 self.nodes = self.acquired_locks[locking.LEVEL_NODE]
9692 rpcresult = self.rpc.call_export_list(self.nodes)
9693 result = {}
9694 for node in rpcresult:
9695 if rpcresult[node].fail_msg:
9696 result[node] = False
9697 else:
9698 result[node] = rpcresult[node].payload
9700 return result
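# Illustrative sketch (hypothetical names): the returned dict maps each node
# to its export list, or False when the RPC to that node failed:
#
#   {"node1.example.com": ["inst1.example.com"], "node2.example.com": False}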
9703 class LUBackupPrepare(NoHooksLU):
9704 """Prepares an instance for an export and returns useful information.
9709 def ExpandNames(self):
9710 self._ExpandAndLockInstance()
9712 def CheckPrereq(self):
9713 """Check prerequisites.
9716 instance_name = self.op.instance_name
9718 self.instance = self.cfg.GetInstanceInfo(instance_name)
9719 assert self.instance is not None, \
9720 "Cannot retrieve locked instance %s" % self.op.instance_name
9721 _CheckNodeOnline(self, self.instance.primary_node)
9723 self._cds = _GetClusterDomainSecret()
9725 def Exec(self, feedback_fn):
9726 """Prepares an instance for an export.
9729 instance = self.instance
9731 if self.op.mode == constants.EXPORT_MODE_REMOTE:
9732 salt = utils.GenerateSecret(8)
9734 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
9735 result = self.rpc.call_x509_cert_create(instance.primary_node,
9736 constants.RIE_CERT_VALIDITY)
9737 result.Raise("Can't create X509 key and certificate on %s" % result.node)
9739 (name, cert_pem) = result.payload
9741 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
9742 cert_pem)
9744 return {
9745 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
9746 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
9747 salt),
9748 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
9749 }
9751 return None
9754 class LUBackupExport(LogicalUnit):
9755 """Export an instance to an image in the cluster.
9758 HPATH = "instance-export"
9759 HTYPE = constants.HTYPE_INSTANCE
9762 def CheckArguments(self):
9763 """Check the arguments.
9766 self.x509_key_name = self.op.x509_key_name
9767 self.dest_x509_ca_pem = self.op.destination_x509_ca
9769 if self.op.mode == constants.EXPORT_MODE_REMOTE:
9770 if not self.x509_key_name:
9771 raise errors.OpPrereqError("Missing X509 key name for encryption",
9772 errors.ECODE_INVAL)
9774 if not self.dest_x509_ca_pem:
9775 raise errors.OpPrereqError("Missing destination X509 CA",
9776 errors.ECODE_INVAL)
9778 def ExpandNames(self):
9779 self._ExpandAndLockInstance()
9781 # Lock all nodes for local exports
9782 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9783 # FIXME: lock only instance primary and destination node
9785 # Sad but true: for now we have to lock all nodes, as we don't know where
9786 # the previous export might be, and in this LU we search for it and
9787 # remove it from its current node. In the future we could fix this by:
9788 # - making a tasklet to search (share-lock all), then create the
9789 # new one, then one to remove, after
9790 # - removing the removal operation altogether
9791 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
9793 def DeclareLocks(self, level):
9794 """Last minute lock declaration."""
9795 # All nodes are locked anyway, so nothing to do here.
9797 def BuildHooksEnv(self):
9798 """Build hooks env.
9800 This will run on the master, primary node and target node.
9802 """
9803 env = {
9804 "EXPORT_MODE": self.op.mode,
9805 "EXPORT_NODE": self.op.target_node,
9806 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
9807 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
9808 # TODO: Generic function for boolean env variables
9809 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
9810 }
9812 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
9814 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
9816 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9817 nl.append(self.op.target_node)
9819 return env, nl, nl
9821 def CheckPrereq(self):
9822 """Check prerequisites.
9824 This checks that the instance and node names are valid.
9827 instance_name = self.op.instance_name
9829 self.instance = self.cfg.GetInstanceInfo(instance_name)
9830 assert self.instance is not None, \
9831 "Cannot retrieve locked instance %s" % self.op.instance_name
9832 _CheckNodeOnline(self, self.instance.primary_node)
9834 if (self.op.remove_instance and self.instance.admin_up and
9835 not self.op.shutdown):
9836 raise errors.OpPrereqError("Cannot remove instance without shutting it"
9837 " down before")
9839 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9840 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
9841 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
9842 assert self.dst_node is not None
9844 _CheckNodeOnline(self, self.dst_node.name)
9845 _CheckNodeNotDrained(self, self.dst_node.name)
9848 self.dest_disk_info = None
9849 self.dest_x509_ca = None
9851 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
9852 self.dst_node = None
9854 if len(self.op.target_node) != len(self.instance.disks):
9855 raise errors.OpPrereqError(("Received destination information for %s"
9856 " disks, but instance %s has %s disks") %
9857 (len(self.op.target_node), instance_name,
9858 len(self.instance.disks)),
9859 errors.ECODE_INVAL)
9861 cds = _GetClusterDomainSecret()
9863 # Check X509 key name
9864 try:
9865 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
9866 except (TypeError, ValueError), err:
9867 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
9869 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
9870 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
9873 # Load and verify CA
9874 try:
9875 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
9876 except OpenSSL.crypto.Error, err:
9877 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
9878 (err, ), errors.ECODE_INVAL)
9880 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
9881 if errcode is not None:
9882 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
9883 (msg, ), errors.ECODE_INVAL)
9885 self.dest_x509_ca = cert
9887 # Verify target information
9888 disk_info = []
9889 for idx, disk_data in enumerate(self.op.target_node):
9890 try:
9891 (host, port, magic) = \
9892 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
9893 except errors.GenericError, err:
9894 raise errors.OpPrereqError("Target info for disk %s: %s" %
9895 (idx, err), errors.ECODE_INVAL)
9897 disk_info.append((host, port, magic))
9899 assert len(disk_info) == len(self.op.target_node)
9900 self.dest_disk_info = disk_info
9902 else:
9903 raise errors.ProgrammerError("Unhandled export mode %r" %
9904 self.op.mode)
9906 # instance disk type verification
9907 # TODO: Implement export support for file-based disks
9908 for disk in self.instance.disks:
9909 if disk.dev_type == constants.LD_FILE:
9910 raise errors.OpPrereqError("Export not supported for instances with"
9911 " file-based disks", errors.ECODE_INVAL)
9913 def _CleanupExports(self, feedback_fn):
9914 """Removes exports of current instance from all other nodes.
9916 If an instance in a cluster with nodes A..D was exported to node C, its
9917 exports will be removed from the nodes A, B and D.
9920 assert self.op.mode != constants.EXPORT_MODE_REMOTE
9922 nodelist = self.cfg.GetNodeList()
9923 nodelist.remove(self.dst_node.name)
9925 # on one-node clusters nodelist will be empty after the removal
9926 # if we proceed the backup would be removed because OpBackupQuery
9927 # substitutes an empty list with the full cluster node list.
9928 iname = self.instance.name
9929 if nodelist:
9930 feedback_fn("Removing old exports for instance %s" % iname)
9931 exportlist = self.rpc.call_export_list(nodelist)
9932 for node in exportlist:
9933 if exportlist[node].fail_msg:
9934 continue
9935 if iname in exportlist[node].payload:
9936 msg = self.rpc.call_export_remove(node, iname).fail_msg
9937 if msg:
9938 self.LogWarning("Could not remove older export for instance %s"
9939 " on node %s: %s", iname, node, msg)
9941 def Exec(self, feedback_fn):
9942 """Export an instance to an image in the cluster.
9945 assert self.op.mode in constants.EXPORT_MODES
9947 instance = self.instance
9948 src_node = instance.primary_node
9950 if self.op.shutdown:
9951 # shutdown the instance, but not the disks
9952 feedback_fn("Shutting down instance %s" % instance.name)
9953 result = self.rpc.call_instance_shutdown(src_node, instance,
9954 self.op.shutdown_timeout)
9955 # TODO: Maybe ignore failures if ignore_remove_failures is set
9956 result.Raise("Could not shutdown instance %s on"
9957 " node %s" % (instance.name, src_node))
9959 # set the disks ID correctly since call_instance_start needs the
9960 # correct drbd minor to create the symlinks
9961 for disk in instance.disks:
9962 self.cfg.SetDiskID(disk, src_node)
9964 activate_disks = (not instance.admin_up)
9966 if activate_disks:
9967 # Activate the instance disks if we're exporting a stopped instance
9968 feedback_fn("Activating disks for %s" % instance.name)
9969 _StartInstanceDisks(self, instance, None)
9971 try:
9972 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
9973 instance)
9975 helper.CreateSnapshots()
9976 try:
9977 if (self.op.shutdown and instance.admin_up and
9978 not self.op.remove_instance):
9979 assert not activate_disks
9980 feedback_fn("Starting instance %s" % instance.name)
9981 result = self.rpc.call_instance_start(src_node, instance, None, None)
9982 msg = result.fail_msg
9983 if msg:
9984 feedback_fn("Failed to start instance: %s" % msg)
9985 _ShutdownInstanceDisks(self, instance)
9986 raise errors.OpExecError("Could not start instance: %s" % msg)
9988 if self.op.mode == constants.EXPORT_MODE_LOCAL:
9989 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
9990 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
9991 connect_timeout = constants.RIE_CONNECT_TIMEOUT
9992 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
9994 (key_name, _, _) = self.x509_key_name
9996 dest_ca_pem = \
9997 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
9998 self.dest_x509_ca)
10000 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
10001 key_name, dest_ca_pem,
10002 timeouts)
10003 finally:
10004 helper.Cleanup()
10006 # Check for backwards compatibility
10007 assert len(dresults) == len(instance.disks)
10008 assert compat.all(isinstance(i, bool) for i in dresults), \
10009 "Not all results are boolean: %r" % dresults
10011 finally:
10012 if activate_disks:
10013 feedback_fn("Deactivating disks for %s" % instance.name)
10014 _ShutdownInstanceDisks(self, instance)
10016 if not (compat.all(dresults) and fin_resu):
10017 failures = []
10018 if not fin_resu:
10019 failures.append("export finalization")
10020 if not compat.all(dresults):
10021 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
10022 if not dsk)
10023 failures.append("disk export: disk(s) %s" % fdsk)
10025 raise errors.OpExecError("Export failed, errors in %s" %
10026 utils.CommaJoin(failures))
10028 # At this point, the export was successful, we can cleanup/finish
10030 # Remove instance if requested
10031 if self.op.remove_instance:
10032 feedback_fn("Removing instance %s" % instance.name)
10033 _RemoveInstance(self, feedback_fn, instance,
10034 self.op.ignore_remove_failures)
10036 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10037 self._CleanupExports(feedback_fn)
10039 return fin_resu, dresults
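# Illustrative note: Exec() returns (fin_resu, dresults), e.g. (True,
# [True, True]) for a successful two-disk export; both the finalization flag
# and every per-disk boolean must be true for the export to count as clean.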
10042 class LUBackupRemove(NoHooksLU):
10043 """Remove exports related to the named instance.
10048 def ExpandNames(self):
10049 self.needed_locks = {}
10050 # We need all nodes to be locked in order for RemoveExport to work, but we
10051 # don't need to lock the instance itself, as nothing will happen to it (and
10052 # we can remove exports also for a removed instance)
10053 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10055 def Exec(self, feedback_fn):
10056 """Remove any export.
10058 """
10059 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
10060 # If the instance was not found we'll try with the name that was passed in.
10061 # This will only work if it was an FQDN, though.
10062 fqdn_warn = False
10063 if not instance_name:
10064 fqdn_warn = True
10065 instance_name = self.op.instance_name
10067 locked_nodes = self.acquired_locks[locking.LEVEL_NODE]
10068 exportlist = self.rpc.call_export_list(locked_nodes)
10069 found = False
10070 for node in exportlist:
10071 msg = exportlist[node].fail_msg
10072 if msg:
10073 self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
10074 continue
10075 if instance_name in exportlist[node].payload:
10076 found = True
10077 result = self.rpc.call_export_remove(node, instance_name)
10078 msg = result.fail_msg
10079 if msg:
10080 logging.error("Could not remove export for instance %s"
10081 " on node %s: %s", instance_name, node, msg)
10083 if fqdn_warn and not found:
10084 feedback_fn("Export not found. If trying to remove an export belonging"
10085 " to a deleted instance please use its Fully Qualified"
10086 " Domain Name.")
10089 class LUGroupAdd(LogicalUnit):
10090 """Logical unit for creating node groups.
10093 HPATH = "group-add"
10094 HTYPE = constants.HTYPE_GROUP
10097 def ExpandNames(self):
10098 # We need the new group's UUID here so that we can create and acquire the
10099 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
10100 # that it should not check whether the UUID exists in the configuration.
10101 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
10102 self.needed_locks = {}
10103 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
10105 def CheckPrereq(self):
10106 """Check prerequisites.
10108 This checks that the given group name is not an existing node group
10109 already.
10111 """
10112 try:
10113 existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10114 except errors.OpPrereqError:
10115 pass
10116 else:
10117 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
10118 " node group (UUID: %s)" %
10119 (self.op.group_name, existing_uuid),
10120 errors.ECODE_EXISTS)
10122 if self.op.ndparams:
10123 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
10125 def BuildHooksEnv(self):
10126 """Build hooks env.
10128 """
10129 env = {
10130 "GROUP_NAME": self.op.group_name,
10131 }
10132 mn = self.cfg.GetMasterNode()
10133 return env, [mn], [mn]
10135 def Exec(self, feedback_fn):
10136 """Add the node group to the cluster.
10139 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
10140 uuid=self.group_uuid,
10141 alloc_policy=self.op.alloc_policy,
10142 ndparams=self.op.ndparams)
10144 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
10145 del self.remove_locks[locking.LEVEL_NODEGROUP]
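# Usage sketch (illustrative, group name hypothetical): this LU backs
# OpGroupAdd, typically reached via:
#
#   gnt-group add --alloc-policy=preferred mygroup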
10148 class LUGroupAssignNodes(NoHooksLU):
10149 """Logical unit for assigning nodes to groups.
10154 def ExpandNames(self):
10155 # These raise errors.OpPrereqError on their own:
10156 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10157 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
10159 # We want to lock all the affected nodes and groups. We have readily
10160 # available the list of nodes, and the *destination* group. To gather the
10161 # list of "source" groups, we need to fetch node information.
10162 self.node_data = self.cfg.GetAllNodesInfo()
10163 affected_groups = set(self.node_data[node].group for node in self.op.nodes)
10164 affected_groups.add(self.group_uuid)
10166 self.needed_locks = {
10167 locking.LEVEL_NODEGROUP: list(affected_groups),
10168 locking.LEVEL_NODE: self.op.nodes,
10169 }
10171 def CheckPrereq(self):
10172 """Check prerequisites.
10175 self.group = self.cfg.GetNodeGroup(self.group_uuid)
10176 instance_data = self.cfg.GetAllInstancesInfo()
10178 if self.group is None:
10179 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
10180 (self.op.group_name, self.group_uuid))
10182 (new_splits, previous_splits) = \
10183 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
10184 for node in self.op.nodes],
10185 self.node_data, instance_data)
10187 if new_splits:
10188 fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))
10190 if not self.op.force:
10191 raise errors.OpExecError("The following instances get split by this"
10192 " change and --force was not given: %s" %
10193 fmt_new_splits)
10194 else:
10195 self.LogWarning("This operation will split the following instances: %s",
10196 fmt_new_splits)
10198 if previous_splits:
10199 self.LogWarning("In addition, these already-split instances continue"
10200 " to be split across groups: %s",
10201 utils.CommaJoin(utils.NiceSort(previous_splits)))
10203 def Exec(self, feedback_fn):
10204 """Assign nodes to a new group.
10207 for node in self.op.nodes:
10208 self.node_data[node].group = self.group_uuid
10210 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
10212 @staticmethod
10213 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
10214 """Check for split instances after a node assignment.
10216 This method considers a series of node assignments as an atomic operation,
10217 and returns information about split instances after applying the set of
10220 In particular, it returns information about newly split instances, and
10221 instances that were already split, and remain so after the change.
10223 Only instances whose disk template is listed in constants.DTS_NET_MIRROR are
10224 considered.
10226 @type changes: list of (node_name, new_group_uuid) pairs.
10227 @param changes: list of node assignments to consider.
10228 @param node_data: a dict with data for all nodes
10229 @param instance_data: a dict with all instances to consider
10230 @rtype: a two-tuple
10231 @return: a list of instances that were previously healthy and become split
10232 as a consequence of this change, and a list of instances that were
10233 previously split and that this change does not fix.
10236 changed_nodes = dict((node, group) for node, group in changes
10237 if node_data[node].group != group)
10239 all_split_instances = set()
10240 previously_split_instances = set()
10242 def InstanceNodes(instance):
10243 return [instance.primary_node] + list(instance.secondary_nodes)
10245 for inst in instance_data.values():
10246 if inst.disk_template not in constants.DTS_NET_MIRROR:
10249 instance_nodes = InstanceNodes(inst)
10251 if len(set(node_data[node].group for node in instance_nodes)) > 1:
10252 previously_split_instances.add(inst.name)
10254 if len(set(changed_nodes.get(node, node_data[node].group)
10255 for node in instance_nodes)) > 1:
10256 all_split_instances.add(inst.name)
10258 return (list(all_split_instances - previously_split_instances),
10259 list(previously_split_instances & all_split_instances))
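# Worked example (hypothetical names): with node_data = {"A": g1, "B": g1} and
# a DRBD instance on nodes (A, B), changes = [("B", g2)] makes the instance
# span {g1, g2}, so it is returned in the first (newly split) list; an
# instance that already spanned two groups before the change, and still does,
# would be returned in the second list instead.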
10262 class _GroupQuery(_QueryBase):
10264 FIELDS = query.GROUP_FIELDS
10266 def ExpandNames(self, lu):
10267 lu.needed_locks = {}
10269 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
10270 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
10272 if not self.names:
10273 self.wanted = [name_to_uuid[name]
10274 for name in utils.NiceSort(name_to_uuid.keys())]
10275 else:
10276 # Accept names to be either names or UUIDs.
10277 missing = []
10278 self.wanted = []
10279 all_uuid = frozenset(self._all_groups.keys())
10281 for name in self.names:
10282 if name in all_uuid:
10283 self.wanted.append(name)
10284 elif name in name_to_uuid:
10285 self.wanted.append(name_to_uuid[name])
10286 else:
10287 missing.append(name)
10289 if missing:
10290 raise errors.OpPrereqError("Some groups do not exist: %s" %
10291 utils.CommaJoin(missing),
10292 errors.ECODE_NOENT)
10294 def DeclareLocks(self, lu, level):
10295 pass
10297 def _GetQueryData(self, lu):
10298 """Computes the list of node groups and their attributes.
10301 do_nodes = query.GQ_NODE in self.requested_data
10302 do_instances = query.GQ_INST in self.requested_data
10304 group_to_nodes = None
10305 group_to_instances = None
10307 # For GQ_NODE, we need to map group->[nodes], and group->[instances] for
10308 # GQ_INST. The former is attainable with just GetAllNodesInfo(), but for the
10309 # latter GetAllInstancesInfo() is not enough, for we have to go through
10310 # instance->node. Hence, we will need to process nodes even if we only need
10311 # instance information.
10312 if do_nodes or do_instances:
10313 all_nodes = lu.cfg.GetAllNodesInfo()
10314 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
10315 node_to_group = {}
10317 for node in all_nodes.values():
10318 if node.group in group_to_nodes:
10319 group_to_nodes[node.group].append(node.name)
10320 node_to_group[node.name] = node.group
10322 if do_instances:
10323 all_instances = lu.cfg.GetAllInstancesInfo()
10324 group_to_instances = dict((uuid, []) for uuid in self.wanted)
10326 for instance in all_instances.values():
10327 node = instance.primary_node
10328 if node in node_to_group:
10329 group_to_instances[node_to_group[node]].append(instance.name)
10331 if not do_nodes:
10332 # Do not pass on node information if it was not requested.
10333 group_to_nodes = None
10335 return query.GroupQueryData([self._all_groups[uuid]
10336 for uuid in self.wanted],
10337 group_to_nodes, group_to_instances)
10340 class LUGroupQuery(NoHooksLU):
10341 """Logical unit for querying node groups.
10346 def CheckArguments(self):
10347 self.gq = _GroupQuery(self.op.names, self.op.output_fields, False)
10349 def ExpandNames(self):
10350 self.gq.ExpandNames(self)
10352 def Exec(self, feedback_fn):
10353 return self.gq.OldStyleQuery(self)
10356 class LUGroupSetParams(LogicalUnit):
10357 """Modifies the parameters of a node group.
10360 HPATH = "group-modify"
10361 HTYPE = constants.HTYPE_GROUP
10364 def CheckArguments(self):
10365 all_changes = [
10366 self.op.ndparams,
10367 self.op.alloc_policy,
10368 ]
10370 if all_changes.count(None) == len(all_changes):
10371 raise errors.OpPrereqError("Please pass at least one modification",
10372 errors.ECODE_INVAL)
10374 def ExpandNames(self):
10375 # This raises errors.OpPrereqError on its own:
10376 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10378 self.needed_locks = {
10379 locking.LEVEL_NODEGROUP: [self.group_uuid],
10380 }
10382 def CheckPrereq(self):
10383 """Check prerequisites.
10386 self.group = self.cfg.GetNodeGroup(self.group_uuid)
10388 if self.group is None:
10389 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
10390 (self.op.group_name, self.group_uuid))
10392 if self.op.ndparams:
10393 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
10394 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
10395 self.new_ndparams = new_ndparams
10397 def BuildHooksEnv(self):
10398 """Build hooks env.
10400 """
10401 env = {
10402 "GROUP_NAME": self.op.group_name,
10403 "NEW_ALLOC_POLICY": self.op.alloc_policy,
10404 }
10405 mn = self.cfg.GetMasterNode()
10406 return env, [mn], [mn]
10408 def Exec(self, feedback_fn):
10409 """Modifies the node group.
10411 """
10412 result = []
10414 if self.op.ndparams:
10415 self.group.ndparams = self.new_ndparams
10416 result.append(("ndparams", str(self.group.ndparams)))
10418 if self.op.alloc_policy:
10419 self.group.alloc_policy = self.op.alloc_policy
10421 self.cfg.Update(self.group, feedback_fn)
10423 return result
10426 class LUGroupRemove(LogicalUnit):
10427 HPATH = "group-remove"
10428 HTYPE = constants.HTYPE_GROUP
10431 def ExpandNames(self):
10432 # This will raises errors.OpPrereqError on its own:
10433 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10434 self.needed_locks = {
10435 locking.LEVEL_NODEGROUP: [self.group_uuid],
10436 }
10438 def CheckPrereq(self):
10439 """Check prerequisites.
10441 This checks that the given group name exists as a node group, that it is
10442 empty (i.e., contains no nodes), and that it is not the last group of the
10443 cluster.
10445 """
10446 # Verify that the group is empty.
10447 group_nodes = [node.name
10448 for node in self.cfg.GetAllNodesInfo().values()
10449 if node.group == self.group_uuid]
10451 if group_nodes:
10452 raise errors.OpPrereqError("Group '%s' not empty, has the following"
10453 " nodes: %s" %
10454 (self.op.group_name,
10455 utils.CommaJoin(utils.NiceSort(group_nodes))),
10456 errors.ECODE_STATE)
10458 # Verify the cluster would not be left group-less.
10459 if len(self.cfg.GetNodeGroupList()) == 1:
10460 raise errors.OpPrereqError("Group '%s' is the only group,"
10461 " cannot be removed" %
10462 self.op.group_name,
10463 errors.ECODE_STATE)
10465 def BuildHooksEnv(self):
10466 """Build hooks env.
10468 """
10469 env = {
10470 "GROUP_NAME": self.op.group_name,
10471 }
10472 mn = self.cfg.GetMasterNode()
10473 return env, [mn], [mn]
10475 def Exec(self, feedback_fn):
10476 """Remove the node group.
10478 """
10479 try:
10480 self.cfg.RemoveNodeGroup(self.group_uuid)
10481 except errors.ConfigurationError:
10482 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
10483 (self.op.group_name, self.group_uuid))
10485 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
10488 class LUGroupRename(LogicalUnit):
10489 HPATH = "group-rename"
10490 HTYPE = constants.HTYPE_GROUP
10493 def ExpandNames(self):
10494 # This raises errors.OpPrereqError on its own:
10495 self.group_uuid = self.cfg.LookupNodeGroup(self.op.old_name)
10497 self.needed_locks = {
10498 locking.LEVEL_NODEGROUP: [self.group_uuid],
10499 }
10501 def CheckPrereq(self):
10502 """Check prerequisites.
10504 This checks that the given old_name exists as a node group, and that
10505 new_name doesn't.
10507 """
10508 try:
10509 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
10510 except errors.OpPrereqError:
10511 pass
10512 else:
10513 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
10514 " node group (UUID: %s)" %
10515 (self.op.new_name, new_name_uuid),
10516 errors.ECODE_EXISTS)
10518 def BuildHooksEnv(self):
10519 """Build hooks env.
10521 """
10522 env = {
10523 "OLD_NAME": self.op.old_name,
10524 "NEW_NAME": self.op.new_name,
10525 }
10527 mn = self.cfg.GetMasterNode()
10528 all_nodes = self.cfg.GetAllNodesInfo()
10529 run_nodes = [mn]
10530 all_nodes.pop(mn, None)
10532 for node in all_nodes.values():
10533 if node.group == self.group_uuid:
10534 run_nodes.append(node.name)
10536 return env, run_nodes, run_nodes
10538 def Exec(self, feedback_fn):
10539 """Rename the node group.
10542 group = self.cfg.GetNodeGroup(self.group_uuid)
10544 if group is None:
10545 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
10546 (self.op.old_name, self.group_uuid))
10548 group.name = self.op.new_name
10549 self.cfg.Update(group, feedback_fn)
10551 return self.op.new_name
10554 class TagsLU(NoHooksLU): # pylint: disable-msg=W0223
10555 """Generic tags LU.
10557 This is an abstract class which is the parent of all the other tags LUs.
10561 def ExpandNames(self):
10562 self.needed_locks = {}
10563 if self.op.kind == constants.TAG_NODE:
10564 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
10565 self.needed_locks[locking.LEVEL_NODE] = self.op.name
10566 elif self.op.kind == constants.TAG_INSTANCE:
10567 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
10568 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
10570 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
10571 # not possible to acquire the BGL based on opcode parameters)
10573 def CheckPrereq(self):
10574 """Check prerequisites.
10577 if self.op.kind == constants.TAG_CLUSTER:
10578 self.target = self.cfg.GetClusterInfo()
10579 elif self.op.kind == constants.TAG_NODE:
10580 self.target = self.cfg.GetNodeInfo(self.op.name)
10581 elif self.op.kind == constants.TAG_INSTANCE:
10582 self.target = self.cfg.GetInstanceInfo(self.op.name)
10584 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
10585 str(self.op.kind), errors.ECODE_INVAL)
10588 class LUTagsGet(TagsLU):
10589 """Returns the tags of a given object.
10594 def ExpandNames(self):
10595 TagsLU.ExpandNames(self)
10597 # Share locks as this is only a read operation
10598 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
10600 def Exec(self, feedback_fn):
10601 """Returns the tag list.
10604 return list(self.target.GetTags())
10607 class LUTagsSearch(NoHooksLU):
10608 """Searches the tags for a given pattern.
10613 def ExpandNames(self):
10614 self.needed_locks = {}
10616 def CheckPrereq(self):
10617 """Check prerequisites.
10619 This checks the pattern passed for validity by compiling it.
10621 """
10622 try:
10623 self.re = re.compile(self.op.pattern)
10624 except re.error, err:
10625 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
10626 (self.op.pattern, err), errors.ECODE_INVAL)
10628 def Exec(self, feedback_fn):
10629 """Returns the tag list.
10631 """
10632 cfg = self.cfg
10633 tgts = [("/cluster", cfg.GetClusterInfo())]
10634 ilist = cfg.GetAllInstancesInfo().values()
10635 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
10636 nlist = cfg.GetAllNodesInfo().values()
10637 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
10638 results = []
10639 for path, target in tgts:
10640 for tag in target.GetTags():
10641 if self.re.search(tag):
10642 results.append((path, tag))
10644 return results
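# Illustrative sketch (hypothetical names/tags): searching for the pattern
# "stag" could return:
#
#   [("/cluster", "staging"), ("/instances/web1.example.com", "staging")]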
10646 class LUTagsSet(TagsLU):
10647 """Sets a tag on a given object.
10652 def CheckPrereq(self):
10653 """Check prerequisites.
10655 This checks the type and length of the tag name and value.
10658 TagsLU.CheckPrereq(self)
10659 for tag in self.op.tags:
10660 objects.TaggableObject.ValidateTag(tag)
10662 def Exec(self, feedback_fn):
10663 """Sets the tag.
10665 """
10666 try:
10667 for tag in self.op.tags:
10668 self.target.AddTag(tag)
10669 except errors.TagError, err:
10670 raise errors.OpExecError("Error while setting tag: %s" % str(err))
10671 self.cfg.Update(self.target, feedback_fn)
10674 class LUTagsDel(TagsLU):
10675 """Delete a list of tags from a given object.
10680 def CheckPrereq(self):
10681 """Check prerequisites.
10683 This checks that we have the given tag.
10686 TagsLU.CheckPrereq(self)
10687 for tag in self.op.tags:
10688 objects.TaggableObject.ValidateTag(tag)
10689 del_tags = frozenset(self.op.tags)
10690 cur_tags = self.target.GetTags()
10692 diff_tags = del_tags - cur_tags
10693 if diff_tags:
10694 diff_names = ("'%s'" % i for i in sorted(diff_tags))
10695 raise errors.OpPrereqError("Tag(s) %s not found" %
10696 (utils.CommaJoin(diff_names), ),
10697 errors.ECODE_NOENT)
10699 def Exec(self, feedback_fn):
10700 """Remove the tag from the object.
10703 for tag in self.op.tags:
10704 self.target.RemoveTag(tag)
10705 self.cfg.Update(self.target, feedback_fn)
10708 class LUTestDelay(NoHooksLU):
10709 """Sleep for a specified amount of time.
10711 This LU sleeps on the master and/or nodes for a specified amount of
10712 time.
10714 """
10717 def ExpandNames(self):
10718 """Expand names and set required locks.
10720 This expands the node list, if any.
10723 self.needed_locks = {}
10724 if self.op.on_nodes:
10725 # _GetWantedNodes can be used here, but is not always appropriate to use
10726 # this way in ExpandNames. Check LogicalUnit.ExpandNames docstring for
10727 # more information.
10728 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
10729 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
10731 def _TestDelay(self):
10732 """Do the actual sleep.
10735 if self.op.on_master:
10736 if not utils.TestDelay(self.op.duration):
10737 raise errors.OpExecError("Error during master delay test")
10738 if self.op.on_nodes:
10739 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
10740 for node, node_result in result.items():
10741 node_result.Raise("Failure during rpc call to node %s" % node)
10743 def Exec(self, feedback_fn):
10744 """Execute the test delay opcode, with the wanted repetitions.
10747 if self.op.repeat == 0:
10748 self._TestDelay()
10749 else:
10750 top_value = self.op.repeat - 1
10751 for i in range(self.op.repeat):
10752 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
10753 self._TestDelay()
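# Usage sketch (illustrative): this LU backs OpTestDelay and is reachable via
# the debug command line, e.g. a three-second sleep on the master:
#
#   gnt-debug delay 3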
10756 class LUTestJqueue(NoHooksLU):
10757 """Utility LU to test some aspects of the job queue.
10762 # Must be lower than default timeout for WaitForJobChange to see whether it
10763 # notices changed jobs
10764 _CLIENT_CONNECT_TIMEOUT = 20.0
10765 _CLIENT_CONFIRM_TIMEOUT = 60.0
10767 @classmethod
10768 def _NotifyUsingSocket(cls, cb, errcls):
10769 """Opens a Unix socket and waits for another program to connect.
10772 @param cb: Callback to send socket name to client
10773 @type errcls: class
10774 @param errcls: Exception class to use for errors
10777 # Using a temporary directory as there's no easy way to create temporary
10778 # sockets without writing a custom loop around tempfile.mktemp and
10780 tmpdir = tempfile.mkdtemp()
10781 try:
10782 tmpsock = utils.PathJoin(tmpdir, "sock")
10784 logging.debug("Creating temporary socket at %s", tmpsock)
10785 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
10786 try:
10787 sock.bind(tmpsock)
10788 sock.listen(1)
10790 # Send details to client
10791 cb(tmpsock)
10793 # Wait for client to connect before continuing
10794 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
10795 try:
10796 (conn, _) = sock.accept()
10797 except socket.error, err:
10798 raise errcls("Client didn't connect in time (%s)" % err)
10799 finally:
10800 sock.close()
10801 finally:
10802 # Remove as soon as client is connected
10803 shutil.rmtree(tmpdir)
10805 # Wait for client to close
10806 try:
10807 try:
10808 # pylint: disable-msg=E1101
10809 # Instance of '_socketobject' has no ... member
10810 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
10811 conn.recv(1)
10812 except socket.error, err:
10813 raise errcls("Client failed to confirm notification (%s)" % err)
10814 finally:
10815 conn.close()
10817 def _SendNotification(self, test, arg, sockname):
10818 """Sends a notification to the client.
10821 @param test: Test name
10822 @param arg: Test argument (depends on test)
10823 @type sockname: string
10824 @param sockname: Socket path
10827 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
10829 def _Notify(self, prereq, test, arg):
10830 """Notifies the client of a test.
10833 @param prereq: Whether this is a prereq-phase test
10835 @param test: Test name
10836 @param arg: Test argument (depends on test)
10839 if prereq:
10840 errcls = errors.OpPrereqError
10841 else:
10842 errcls = errors.OpExecError
10844 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
10845 test, arg),
10846 errcls)
10848 def CheckArguments(self):
10849 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
10850 self.expandnames_calls = 0
10852 def ExpandNames(self):
10853 checkargs_calls = getattr(self, "checkargs_calls", 0)
10854 if checkargs_calls < 1:
10855 raise errors.ProgrammerError("CheckArguments was not called")
10857 self.expandnames_calls += 1
10859 if self.op.notify_waitlock:
10860 self._Notify(True, constants.JQT_EXPANDNAMES, None)
10862 self.LogInfo("Expanding names")
10864 # Get lock on master node (just to get a lock, not for a particular reason)
10865 self.needed_locks = {
10866 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
10867 }
10869 def Exec(self, feedback_fn):
10870 if self.expandnames_calls < 1:
10871 raise errors.ProgrammerError("ExpandNames was not called")
10873 if self.op.notify_exec:
10874 self._Notify(False, constants.JQT_EXEC, None)
10876 self.LogInfo("Executing")
10878 if self.op.log_messages:
10879 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
10880 for idx, msg in enumerate(self.op.log_messages):
10881 self.LogInfo("Sending log message %s", idx + 1)
10882 feedback_fn(constants.JQT_MSGPREFIX + msg)
10883 # Report how many test messages have been sent
10884 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
10886 if self.op.fail:
10887 raise errors.OpExecError("Opcode failure was requested")
10889 return True
10892 class IAllocator(object):
10893 """IAllocator framework.
10895 An IAllocator instance has three sets of attributes:
10896 - cfg that is needed to query the cluster
10897 - input data (all members of the _KEYS class attribute are required)
10898 - four buffer attributes (in|out_data|text), that represent the
10899 input (to the external script) in text and data structure format,
10900 and the output from it, again in two formats
10901 - the result variables from the script (success, info, nodes) for
10902 easy usage
10905 # pylint: disable-msg=R0902
10906 # lots of instance attributes
10907 _ALLO_KEYS = [
10908 "name", "mem_size", "disks", "disk_template",
10909 "os", "tags", "nics", "vcpus", "hypervisor",
10910 ]
10911 _RELO_KEYS = [
10912 "name", "relocate_from",
10913 ]
10914 _EVAC_KEYS = [
10915 "evac_nodes",
10916 ]
10918 def __init__(self, cfg, rpc, mode, **kwargs):
10919 self.cfg = cfg
10920 self.rpc = rpc
10921 # init buffer variables
10922 self.in_text = self.out_text = self.in_data = self.out_data = None
10923 # init all input fields so that pylint is happy
10924 self.mode = mode
10925 self.mem_size = self.disks = self.disk_template = None
10926 self.os = self.tags = self.nics = self.vcpus = None
10927 self.hypervisor = None
10928 self.relocate_from = None
10929 self.name = None
10930 self.evac_nodes = None
10931 # computed fields
10932 self.required_nodes = None
10933 # init result fields
10934 self.success = self.info = self.result = None
10935 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
10936 keyset = self._ALLO_KEYS
10937 fn = self._AddNewInstance
10938 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
10939 keyset = self._RELO_KEYS
10940 fn = self._AddRelocateInstance
10941 elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
10942 keyset = self._EVAC_KEYS
10943 fn = self._AddEvacuateNodes
10945 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
10946 " IAllocator" % self.mode)
10947 for key in kwargs:
10948 if key not in keyset:
10949 raise errors.ProgrammerError("Invalid input parameter '%s' to"
10950 " IAllocator" % key)
10951 setattr(self, key, kwargs[key])
10953 for key in keyset:
10954 if key not in kwargs:
10955 raise errors.ProgrammerError("Missing input parameter '%s' to"
10956 " IAllocator" % key)
10957 self._BuildInputData(fn)
10959 def _ComputeClusterData(self):
10960 """Compute the generic allocator input data.
10962 This is the data that is independent of the actual operation.
10965 cfg = self.cfg
10966 cluster_info = cfg.GetClusterInfo()
10967 # cluster data
10968 data = {
10969 "version": constants.IALLOCATOR_VERSION,
10970 "cluster_name": cfg.GetClusterName(),
10971 "cluster_tags": list(cluster_info.GetTags()),
10972 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
10973 # we don't have job IDs
10974 }
10975 ninfo = cfg.GetAllNodesInfo()
10976 iinfo = cfg.GetAllInstancesInfo().values()
10977 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
10980 node_list = [n.name for n in ninfo.values() if n.vm_capable]
10982 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
10983 hypervisor_name = self.hypervisor
10984 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
10985 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
10986 elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
10987 hypervisor_name = cluster_info.enabled_hypervisors[0]
10989 node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
10990 hypervisor_name)
10991 node_iinfo = \
10992 self.rpc.call_all_instances_info(node_list,
10993 cluster_info.enabled_hypervisors)
10995 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
10997 config_ndata = self._ComputeBasicNodeData(ninfo)
10998 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
10999 i_list, config_ndata)
11000 assert len(data["nodes"]) == len(ninfo), \
11001 "Incomplete node data computed"
11003 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
11005 self.in_data = data
11007 @staticmethod
11008 def _ComputeNodeGroupData(cfg):
11009 """Compute node groups data.
11011 """
11012 ng = {}
11013 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items():
11014 ng[guuid] = {
11015 "name": gdata.name,
11016 "alloc_policy": gdata.alloc_policy,
11017 }
11018 return ng
11020 @staticmethod
11021 def _ComputeBasicNodeData(node_cfg):
11022 """Compute global node data.
11025 @returns: a dict of name: (node dict, node config)
11027 """
11028 node_results = {}
11029 for ninfo in node_cfg.values():
11030 # fill in static (config-based) values
11031 pnr = {
11032 "tags": list(ninfo.GetTags()),
11033 "primary_ip": ninfo.primary_ip,
11034 "secondary_ip": ninfo.secondary_ip,
11035 "offline": ninfo.offline,
11036 "drained": ninfo.drained,
11037 "master_candidate": ninfo.master_candidate,
11038 "group": ninfo.group,
11039 "master_capable": ninfo.master_capable,
11040 "vm_capable": ninfo.vm_capable,
11041 }
11043 node_results[ninfo.name] = pnr
11045 return node_results
11047 @staticmethod
11048 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
11049 node_results):
11050 """Compute global node data.
11052 @param node_results: the basic node structures as filled from the config
11055 # make a copy of the current dict
11056 node_results = dict(node_results)
11057 for nname, nresult in node_data.items():
11058 assert nname in node_results, "Missing basic data for node %s" % nname
11059 ninfo = node_cfg[nname]
11061 if not (ninfo.offline or ninfo.drained):
11062 nresult.Raise("Can't get data for node %s" % nname)
11063 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
11064 nname)
11065 remote_info = nresult.payload
11067 for attr in ['memory_total', 'memory_free', 'memory_dom0',
11068 'vg_size', 'vg_free', 'cpu_total']:
11069 if attr not in remote_info:
11070 raise errors.OpExecError("Node '%s' didn't return attribute"
11071 " '%s'" % (nname, attr))
11072 if not isinstance(remote_info[attr], int):
11073 raise errors.OpExecError("Node '%s' returned invalid value"
11074 " for '%s': %s" %
11075 (nname, attr, remote_info[attr]))
11076 # compute memory used by primary instances
11077 i_p_mem = i_p_up_mem = 0
11078 for iinfo, beinfo in i_list:
11079 if iinfo.primary_node == nname:
11080 i_p_mem += beinfo[constants.BE_MEMORY]
11081 if iinfo.name not in node_iinfo[nname].payload:
11082 i_used_mem = 0
11083 else:
11084 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]['memory'])
11085 i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
11086 remote_info['memory_free'] -= max(0, i_mem_diff)
11088 if iinfo.admin_up:
11089 i_p_up_mem += beinfo[constants.BE_MEMORY]
11091 # compute memory used by instances
11092 pnr_dyn = {
11093 "total_memory": remote_info['memory_total'],
11094 "reserved_memory": remote_info['memory_dom0'],
11095 "free_memory": remote_info['memory_free'],
11096 "total_disk": remote_info['vg_size'],
11097 "free_disk": remote_info['vg_free'],
11098 "total_cpus": remote_info['cpu_total'],
11099 "i_pri_memory": i_p_mem,
11100 "i_pri_up_memory": i_p_up_mem,
11101 }
11102 pnr_dyn.update(node_results[nname])
11103 node_results[nname] = pnr_dyn
11105 return node_results
11107 @staticmethod
11108 def _ComputeInstanceData(cluster_info, i_list):
11109 """Compute global instance data.
11111 """
11112 instance_data = {}
11113 for iinfo, beinfo in i_list:
11114 nic_data = []
11115 for nic in iinfo.nics:
11116 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
11117 nic_dict = {"mac": nic.mac,
11118 "ip": nic.ip,
11119 "mode": filled_params[constants.NIC_MODE],
11120 "link": filled_params[constants.NIC_LINK],
11121 }
11122 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
11123 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
11124 nic_data.append(nic_dict)
11125 pir = {
11126 "tags": list(iinfo.GetTags()),
11127 "admin_up": iinfo.admin_up,
11128 "vcpus": beinfo[constants.BE_VCPUS],
11129 "memory": beinfo[constants.BE_MEMORY],
11130 "os": iinfo.os,
11131 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
11132 "nics": nic_data,
11133 "disks": [{"size": dsk.size, "mode": dsk.mode} for dsk in iinfo.disks],
11134 "disk_template": iinfo.disk_template,
11135 "hypervisor": iinfo.hypervisor,
11136 }
11137 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
11138 iinfo.disks)
11139 instance_data[iinfo.name] = pir
11141 return instance_data
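# Illustrative sketch (hypothetical values, abridged): one entry of the dict
# returned above:
#
#   instance_data["inst1.example.com"] = {
#     "admin_up": True, "vcpus": 1, "memory": 128, "os": "debootstrap",
#     "nodes": ["node1", "node2"], "disks": [{"size": 1024, "mode": "rw"}],
#     "disk_template": "drbd", "hypervisor": "xen-pvm", ...
#   }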
11143 def _AddNewInstance(self):
11144 """Add new instance data to allocator structure.
11146 This in combination with _ComputeClusterData will create the
11147 correct structure needed as input for the allocator.
11149 The checks for the completeness of the opcode must have already been
11150 done.
11149 The checks for the completeness of the opcode must have already been
11153 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
11155 if self.disk_template in constants.DTS_NET_MIRROR:
11156 self.required_nodes = 2
11157 else:
11158 self.required_nodes = 1
11159 request = {
11160 "name": self.name,
11161 "disk_template": self.disk_template,
11162 "tags": self.tags,
11163 "os": self.os,
11164 "vcpus": self.vcpus,
11165 "memory": self.mem_size,
11166 "disks": self.disks,
11167 "disk_space_total": disk_space,
11168 "nics": self.nics,
11169 "required_nodes": self.required_nodes,
11170 }
11171 return request
11173 def _AddRelocateInstance(self):
11174 """Add relocate instance data to allocator structure.
11176 This in combination with _ComputeClusterData will create the
11177 correct structure needed as input for the allocator.
11179 The checks for the completeness of the opcode must have already been
11180 done.
11183 instance = self.cfg.GetInstanceInfo(self.name)
11184 if instance is None:
11185 raise errors.ProgrammerError("Unknown instance '%s' passed to"
11186 " IAllocator" % self.name)
11188 if instance.disk_template not in constants.DTS_NET_MIRROR:
11189 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
11190 errors.ECODE_INVAL)
11192 if len(instance.secondary_nodes) != 1:
11193 raise errors.OpPrereqError("Instance has not exactly one secondary node",
11194 errors.ECODE_STATE)
11196 self.required_nodes = 1
11197 disk_sizes = [{'size': disk.size} for disk in instance.disks]
11198 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
11202 "disk_space_total": disk_space,
11203 "required_nodes": self.required_nodes,
11204 "relocate_from": self.relocate_from,

  def _AddEvacuateNodes(self):
    """Add evacuate nodes data to allocator structure.

    """
    request = {
      "evac_nodes": self.evac_nodes
      }
    return request

  def _BuildInputData(self, fn):
    """Build input data structures.

    This computes the cluster data, adds the request section produced by
    the given fn() callback, and serializes the result to self.in_text.

    """
    self._ComputeClusterData()

    request = fn()
    request["type"] = self.mode
    self.in_data["request"] = request

    self.in_text = serializer.Dump(self.in_data)
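
  # The serialized input text therefore looks roughly like (cluster-data keys
  # abbreviated; their exact contents come from _ComputeClusterData):
  #   {"cluster_name": ..., "instances": {...}, "nodes": {...}, ...,
  #    "request": {"type": <mode>, ...}}
  # where "request" is whatever the _Add*() method passed in as "fn"
  # returned, plus the "type" key set above.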

  def Run(self, name, validate=True, call_fn=None):
    """Run an instance allocator and return the results.

    """
    if call_fn is None:
      call_fn = self.rpc.call_iallocator_runner

    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
    result.Raise("Failure while running the iallocator script")

    self.out_text = result.payload
    if validate:
      self._ValidateResult()
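
  # Hypothetical usage from a logical unit, assuming an iallocator script
  # named "hail" is installed on the master node:
  #   ial = IAllocator(cfg, rpc, mode=constants.IALLOCATOR_MODE_RELOC,
  #                    name=instance.name, relocate_from=[source_node])
  #   ial.Run("hail")
  #   if not ial.success:
  #     raise errors.OpPrereqError("Allocator failed: %s" % ial.info,
  #                                errors.ECODE_NORES)
  # where "success" and "info" are attributes set by _ValidateResult below.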

  def _ValidateResult(self):
    """Process the allocator results.

    This will process and, if successful, save the result in
    self.out_data and the other parameters.

    """
    try:
      rdict = serializer.Load(self.out_text)
    except Exception, err:
      raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

    if not isinstance(rdict, dict):
      raise errors.OpExecError("Can't parse iallocator results: not a dict")

    # TODO: remove backwards compatibility in later versions
    if "nodes" in rdict and "result" not in rdict:
      rdict["result"] = rdict["nodes"]
      del rdict["nodes"]

    for key in "success", "info", "result":
      if key not in rdict:
        raise errors.OpExecError("Can't parse iallocator results:"
                                 " missing key '%s'" % key)
      setattr(self, key, rdict[key])

    if not isinstance(rdict["result"], list):
      raise errors.OpExecError("Can't parse iallocator results: 'result' key"
                               " is not a list")
    self.out_data = rdict
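
  # For reference, a well-formed allocator reply that passes the checks above
  # (illustrative node names):
  #   {"success": true, "info": "allocation successful",
  #    "result": ["node1.example.com", "node2.example.com"]}
  # "success", "info" and "result" also become attributes of the same name
  # on this IAllocator instance.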


class LUTestAllocator(NoHooksLU):
  """Run allocator tests.

  This LU runs the allocator tests.

  """
  def CheckPrereq(self):
    """Check prerequisites.

    This checks the opcode parameters depending on the direction and mode
    of the test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      for attr in ["mem_size", "disks", "disk_template",
                   "os", "tags", "nics", "vcpus"]:
        if not hasattr(self.op, attr):
          raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
                                     attr, errors.ECODE_INVAL)
      iname = self.cfg.ExpandInstanceName(self.op.name)
      if iname is not None:
        raise errors.OpPrereqError("Instance '%s' already in the cluster" %
                                   iname, errors.ECODE_EXISTS)
      if not isinstance(self.op.nics, list):
        raise errors.OpPrereqError("Invalid parameter 'nics'",
                                   errors.ECODE_INVAL)
      if not isinstance(self.op.disks, list):
        raise errors.OpPrereqError("Invalid parameter 'disks'",
                                   errors.ECODE_INVAL)
      for row in self.op.disks:
        if (not isinstance(row, dict) or
            "size" not in row or
            not isinstance(row["size"], int) or
            "mode" not in row or
            row["mode"] not in ['r', 'w']):
          raise errors.OpPrereqError("Invalid contents of the 'disks'"
                                     " parameter", errors.ECODE_INVAL)
      if self.op.hypervisor is None:
        self.op.hypervisor = self.cfg.GetHypervisorType()
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      fname = _ExpandInstanceName(self.cfg, self.op.name)
      self.op.name = fname
      self.relocate_from = self.cfg.GetInstanceInfo(fname).secondary_nodes
    elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
      if not hasattr(self.op, "evac_nodes"):
        raise errors.OpPrereqError("Missing attribute 'evac_nodes' on"
                                   " opcode input", errors.ECODE_INVAL)
    else:
      raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
                                 self.op.mode, errors.ECODE_INVAL)

    if self.op.direction == constants.IALLOCATOR_DIR_OUT:
      if self.op.allocator is None:
        raise errors.OpPrereqError("Missing allocator name",
                                   errors.ECODE_INVAL)
    elif self.op.direction != constants.IALLOCATOR_DIR_IN:
      raise errors.OpPrereqError("Wrong allocator test '%s'" %
                                 self.op.direction, errors.ECODE_INVAL)
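
  # Example of a "disks" value accepted by the checks above (sizes are
  # integers, in MiB by Ganeti convention; mode is one of 'r'/'w'):
  #   [{"size": 1024, "mode": "w"}, {"size": 512, "mode": "r"}]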

  def Exec(self, feedback_fn):
    """Run the allocator test.

    """
    if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       mem_size=self.op.mem_size,
                       disks=self.op.disks,
                       disk_template=self.op.disk_template,
                       os=self.op.os,
                       tags=self.op.tags,
                       nics=self.op.nics,
                       vcpus=self.op.vcpus,
                       hypervisor=self.op.hypervisor,
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       name=self.op.name,
                       relocate_from=list(self.relocate_from),
                       )
    elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=self.op.mode,
                       evac_nodes=self.op.evac_nodes)
    else:
      raise errors.ProgrammerError("Unhandled mode %s in"
                                   " LUTestAllocator.Exec", self.op.mode)

    if self.op.direction == constants.IALLOCATOR_DIR_IN:
      result = ial.in_text
    else:
      ial.Run(self.op.allocator, validate=False)
      result = ial.out_text
    return result


#: Query type implementations
_QUERY_IMPL = {
  constants.QR_INSTANCE: _InstanceQuery,
  constants.QR_NODE: _NodeQuery,
  constants.QR_GROUP: _GroupQuery,
  }


def _GetQueryImplementation(name):
  """Returns the implementation for a query type.

  @param name: Query type, must be one of L{constants.QR_OP_QUERY}

  """
  try:
    return _QUERY_IMPL[name]
  except KeyError:
    raise errors.OpPrereqError("Unknown query resource '%s'" % name,
                               errors.ECODE_INVAL)
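
# Hypothetical usage: resolving the query implementation for node queries;
# unknown resource names raise OpPrereqError instead of a bare KeyError:
#   impl = _GetQueryImplementation(constants.QR_NODE)  # -> _NodeQuery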