4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Module implementing the master-side code."""
24 # pylint: disable-msg=W0201,C0302
26 # W0201 since most LU attributes are defined in CheckPrereq or similar
29 # C0302: since we have waaaay too many lines in this module
44 from ganeti import ssh
45 from ganeti import utils
46 from ganeti import errors
47 from ganeti import hypervisor
48 from ganeti import locking
49 from ganeti import constants
50 from ganeti import objects
51 from ganeti import serializer
52 from ganeti import ssconf
53 from ganeti import uidpool
54 from ganeti import compat
55 from ganeti import masterd
56 from ganeti import netutils
57 from ganeti import query
58 from ganeti import qlang
59 from ganeti import opcodes
61 import ganeti.masterd.instance # pylint: disable-msg=W0611
64 def _SupportsOob(cfg, node):
65 """Tells if node supports OOB.
67 @type cfg: L{config.ConfigWriter}
68 @param cfg: The cluster configuration
69 @type node: L{objects.Node}
71 @return: The OOB script if supported or an empty string otherwise
74 return cfg.GetNdParams(node)[constants.ND_OOB_PROGRAM]
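# Illustrative sketch only (not part of the original module): how a caller
# might rely on _SupportsOob before attempting an out-of-band operation. The
# helper name used here is hypothetical; only _SupportsOob, errors and
# constants come from this module.
def _ExampleRequireOob(cfg, node):
  """Return the node's OOB program, raising OpPrereqError if there is none."""
  oob_program = _SupportsOob(cfg, node)
  if not oob_program:
    raise errors.OpPrereqError("OOB is not supported for node %s" % node.name,
                               errors.ECODE_STATE)
  return oob_program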
78 """Data container for LU results with jobs.
80 Instances of this class returned from L{LogicalUnit.Exec} will be recognized
81 by L{mcpu.Processor._ProcessResult}. The latter will then submit the jobs
82 contained in the C{jobs} attribute and include the job IDs in the opcode
86 def __init__(self, jobs, **kwargs):
87 """Initializes this class.
89 Additional return values can be specified as keyword arguments.
91 @type jobs: list of lists of L{opcodes.OpCode}
92 @param jobs: A list of lists of opcode objects
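# Illustrative sketch only: how an LU's Exec method could hand follow-up jobs
# back to the master daemon via ResultWithJobs, as described above. The opcode
# used (opcodes.OpTestDelay) and the extra "result" keyword are stand-ins:
#
#   def Exec(self, feedback_fn):
#     ...  # do the actual work first
#     return ResultWithJobs([[opcodes.OpTestDelay(duration=1)]],
#                           result="done")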
99 class LogicalUnit(object):
100 """Logical Unit base class.
102 Subclasses must follow these rules:
103 - implement ExpandNames
104 - implement CheckPrereq (except when tasklets are used)
105 - implement Exec (except when tasklets are used)
106 - implement BuildHooksEnv
107 - implement BuildHooksNodes
108 - redefine HPATH and HTYPE
109 - optionally redefine their run requirements:
110 REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
112 Note that all commands require root permissions. A minimal example subclass is sketched right after this class.
114 @ivar dry_run_result: the value (if any) that will be returned to the caller
115 in dry-run mode (signalled by opcode dry_run parameter)
122 def __init__(self, processor, op, context, rpc):
123 """Constructor for LogicalUnit.
125 This needs to be overridden in derived classes in order to check op
129 self.proc = processor
131 self.cfg = context.cfg
132 self.glm = context.glm
133 self.context = context
135 # Dicts used to declare locking needs to mcpu
136 self.needed_locks = None
137 self.share_locks = dict.fromkeys(locking.LEVELS, 0)
139 self.remove_locks = {}
140 # Used to force good behavior when calling helper functions
141 self.recalculate_locks = {}
143 self.Log = processor.Log # pylint: disable-msg=C0103
144 self.LogWarning = processor.LogWarning # pylint: disable-msg=C0103
145 self.LogInfo = processor.LogInfo # pylint: disable-msg=C0103
146 self.LogStep = processor.LogStep # pylint: disable-msg=C0103
147 # support for dry-run
148 self.dry_run_result = None
149 # support for generic debug attribute
150 if (not hasattr(self.op, "debug_level") or
151 not isinstance(self.op.debug_level, int)):
152 self.op.debug_level = 0
157 # Validate opcode parameters and set defaults
158 self.op.Validate(True)
160 self.CheckArguments()
162 def CheckArguments(self):
163 """Check syntactic validity for the opcode arguments.
165 This method is for doing a simple syntactic check and ensuring the
166 validity of opcode parameters, without any cluster-related
167 checks. While the same can be accomplished in ExpandNames and/or
168 CheckPrereq, doing these separately is better because:
170 - ExpandNames is left as purely a lock-related function
171 - CheckPrereq is run after we have acquired locks (and possible
174 The function is allowed to change the self.op attribute so that
175 later methods need not worry about missing parameters.
180 def ExpandNames(self):
181 """Expand names for this LU.
183 This method is called before starting to execute the opcode, and it should
184 update all the parameters of the opcode to their canonical form (e.g. a
185 short node name must be fully expanded after this method has successfully
186 completed). This way locking, hooks, logging, etc. can work correctly.
188 LUs which implement this method must also populate the self.needed_locks
189 member, as a dict with lock levels as keys, and a list of needed lock names
192 - use an empty dict if you don't need any lock
193 - if you don't need any lock at a particular level omit that level
194 - don't put anything for the BGL level
195 - if you want all locks at a level use locking.ALL_SET as a value
197 If you need to share locks (rather than acquire them exclusively) at one
198 level you can modify self.share_locks, setting a true value (usually 1) for
199 that level. By default locks are not shared.
201 This function can also define a list of tasklets, which then will be
202 executed in order instead of the usual LU-level CheckPrereq and Exec
203 functions, if those are not defined by the LU.
207 # Acquire all nodes and one instance
208 self.needed_locks = {
209 locking.LEVEL_NODE: locking.ALL_SET,
210 locking.LEVEL_INSTANCE: ['instance1.example.com'],
212 # Acquire just two nodes
213 self.needed_locks = {
214 locking.LEVEL_NODE: ['node1.example.com', 'node2.example.com'],
217 self.needed_locks = {} # No, you can't leave it to the default value None
220 # The implementation of this method is mandatory only if the new LU is
221 # concurrent, so that old LUs don't need to be changed all at the same
224 self.needed_locks = {} # Exclusive LUs don't need locks.
226 raise NotImplementedError
228 def DeclareLocks(self, level):
229 """Declare LU locking needs for a level
231 While most LUs can just declare their locking needs at ExpandNames time,
232 sometimes there's the need to calculate some locks after having acquired
233 the ones before. This function is called just before acquiring locks at a
234 particular level, but after acquiring the ones at lower levels, and permits
235 such calculations. It can be used to modify self.needed_locks, and by
236 default it does nothing.
238 This function is only called if you have something already set in
239 self.needed_locks for the level.
241 @param level: Locking level which is going to be locked
242 @type level: member of ganeti.locking.LEVELS
246 def CheckPrereq(self):
247 """Check prerequisites for this LU.
249 This method should check that the prerequisites for the execution
250 of this LU are fulfilled. It can do internode communication, but
251 it should be idempotent - no cluster or system changes are
254 The method should raise errors.OpPrereqError in case something is
255 not fulfilled. Its return value is ignored.
257 This method should also update all the parameters of the opcode to
258 their canonical form if it hasn't been done by ExpandNames before.
261 if self.tasklets is not None:
262 for (idx, tl) in enumerate(self.tasklets):
263 logging.debug("Checking prerequisites for tasklet %s/%s",
264 idx + 1, len(self.tasklets))
269 def Exec(self, feedback_fn):
272 This method should implement the actual work. It should raise
273 errors.OpExecError for failures that are somewhat dealt with in
277 if self.tasklets is not None:
278 for (idx, tl) in enumerate(self.tasklets):
279 logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
282 raise NotImplementedError
284 def BuildHooksEnv(self):
285 """Build hooks environment for this LU.
288 @return: Dictionary containing the environment that will be used for
289 running the hooks for this LU. The keys of the dict must not be prefixed
290 with "GANETI_"--that'll be added by the hooks runner. The hooks runner
291 will extend the environment with additional variables. If no environment
292 should be defined, an empty dictionary should be returned (not C{None}).
293 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
297 raise NotImplementedError
299 def BuildHooksNodes(self):
300 """Build list of nodes to run LU's hooks.
302 @rtype: tuple; (list, list)
303 @return: Tuple containing a list of node names on which the hook
304 should run before the execution and a list of node names on which the
305 hook should run after the execution. "No nodes" must be expressed as an
306 empty list (and not None).
307 @note: If the C{HPATH} attribute of the LU class is C{None}, this function
311 raise NotImplementedError
313 def HooksCallBack(self, phase, hook_results, feedback_fn, lu_result):
314 """Notify the LU about the results of its hooks.
316 This method is called every time a hooks phase is executed, and notifies
317 the Logical Unit about the hooks' result. The LU can then use it to alter
318 its result based on the hooks. By default the method does nothing and the
319 previous result is passed back unchanged but any LU can define it if it
320 wants to use the local cluster hook-scripts somehow.
322 @param phase: one of L{constants.HOOKS_PHASE_POST} or
323 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
324 @param hook_results: the results of the multi-node hooks rpc call
325 @param feedback_fn: function used to send feedback back to the caller
326 @param lu_result: the previous Exec result this LU had, or None
328 @return: the new Exec result, based on the previous result
332 # API must be kept, thus we ignore the unused-argument and
333 # could-be-a-function warnings
334 # pylint: disable-msg=W0613,R0201
337 def _ExpandAndLockInstance(self):
338 """Helper function to expand and lock an instance.
340 Many LUs that work on an instance take its name in self.op.instance_name
341 and need to expand it and then declare the expanded name for locking. This
342 function does it, and then updates self.op.instance_name to the expanded
343 name. It also initializes needed_locks as a dict, if this hasn't been done
347 if self.needed_locks is None:
348 self.needed_locks = {}
350 assert locking.LEVEL_INSTANCE not in self.needed_locks, \
351 "_ExpandAndLockInstance called with instance-level locks set"
352 self.op.instance_name = _ExpandInstanceName(self.cfg,
353 self.op.instance_name)
354 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
356 def _LockInstancesNodes(self, primary_only=False):
357 """Helper function to declare instances' nodes for locking.
359 This function should be called after locking one or more instances to lock
360 their nodes. Its effect is populating self.needed_locks[locking.LEVEL_NODE]
361 with all primary or secondary nodes for instances already locked and
362 present in self.needed_locks[locking.LEVEL_INSTANCE].
364 It should be called from DeclareLocks, and for safety only works if
365 self.recalculate_locks[locking.LEVEL_NODE] is set.
367 In the future it may grow parameters to just lock some instances' nodes, or
368 to just lock primary or secondary nodes, if needed.
370 It should be called in DeclareLocks in a way similar to::
372 if level == locking.LEVEL_NODE:
373 self._LockInstancesNodes()
375 @type primary_only: boolean
376 @param primary_only: only lock primary nodes of locked instances
379 assert locking.LEVEL_NODE in self.recalculate_locks, \
380 "_LockInstancesNodes helper function called with no nodes to recalculate"
382 # TODO: check whether we've really been called with the instance locks held
384 # For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
385 # future we might want to have different behaviors depending on the value
386 # of self.recalculate_locks[locking.LEVEL_NODE]
388 for instance_name in self.glm.list_owned(locking.LEVEL_INSTANCE):
389 instance = self.context.cfg.GetInstanceInfo(instance_name)
390 wanted_nodes.append(instance.primary_node)
392 wanted_nodes.extend(instance.secondary_nodes)
394 if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
395 self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
396 elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
397 self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
399 del self.recalculate_locks[locking.LEVEL_NODE]
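# Illustrative sketch only (not part of the original module): a minimal
# LogicalUnit subclass following the rules listed in the LogicalUnit
# docstring above. The hook path and the opcode it would serve are
# hypothetical.
class _ExampleMinimalLU(LogicalUnit):
  """Example LU that merely reports the cluster name."""
  HPATH = "example-minimal"
  HTYPE = constants.HTYPE_CLUSTER

  def ExpandNames(self):
    # No node or instance is touched, so no locks are needed
    self.needed_locks = {}

  def CheckPrereq(self):
    # Nothing to verify for this trivial example
    pass

  def BuildHooksEnv(self):
    # Keys must not carry the "GANETI_" prefix; the hooks runner adds it
    return {
      "OP_TARGET": self.cfg.GetClusterName(),
      }

  def BuildHooksNodes(self):
    # Run pre- and post-hooks on the master node only
    mn = self.cfg.GetMasterNode()
    return ([mn], [mn])

  def Exec(self, feedback_fn):
    feedback_fn("Cluster name is %s" % self.cfg.GetClusterName())
    return self.cfg.GetClusterName()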
402 class NoHooksLU(LogicalUnit): # pylint: disable-msg=W0223
403 """Simple LU which runs no hooks.
405 This LU is intended as a parent for other LogicalUnits which will
406 run no hooks, in order to reduce duplicate code.
412 def BuildHooksEnv(self):
413 """Empty BuildHooksEnv for NoHooksLu.
415 This just raises an error.
418 raise AssertionError("BuildHooksEnv called for NoHooksLUs")
420 def BuildHooksNodes(self):
421 """Empty BuildHooksNodes for NoHooksLU.
424 raise AssertionError("BuildHooksNodes called for NoHooksLU")
428 """Tasklet base class.
430 Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
431 they can mix legacy code with tasklets. Locking needs to be done in the LU,
432 tasklets know nothing about locks. A minimal example subclass is sketched after this class.
434 Subclasses must follow these rules:
435 - Implement CheckPrereq
439 def __init__(self, lu):
446 def CheckPrereq(self):
447 """Check prerequisites for this tasklets.
449 This method should check whether the prerequisites for the execution of
450 this tasklet are fulfilled. It can do internode communication, but it
451 should be idempotent - no cluster or system changes are allowed.
453 The method should raise errors.OpPrereqError in case something is not
454 fulfilled. Its return value is ignored.
456 This method should also update all parameters to their canonical form if it
457 hasn't been done before.
462 def Exec(self, feedback_fn):
463 """Execute the tasklet.
465 This method should implement the actual work. It should raise
466 errors.OpExecError for failures that are somewhat dealt with in code, or
470 raise NotImplementedError
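# Illustrative sketch only: a minimal Tasklet subclass following the rules in
# the Tasklet docstring above. The node_name argument is hypothetical, and the
# owning LU is assumed to have acquired the corresponding node lock already.
class _ExampleNodeOnlineTasklet(Tasklet):
  """Example tasklet that only verifies a node is online."""
  def __init__(self, lu, node_name):
    Tasklet.__init__(self, lu)
    self.node_name = node_name

  def CheckPrereq(self):
    # Raises errors.OpPrereqError if the node is offline
    _CheckNodeOnline(self.lu, self.node_name)

  def Exec(self, feedback_fn):
    feedback_fn("Node %s is online, nothing else to do" % self.node_name)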
474 """Base for query utility classes.
477 #: Attribute holding field definitions
480 def __init__(self, filter_, fields, use_locking):
481 """Initializes this class.
484 self.use_locking = use_locking
486 self.query = query.Query(self.FIELDS, fields, filter_=filter_,
488 self.requested_data = self.query.RequestedData()
489 self.names = self.query.RequestedNames()
491 # Sort only if no names were requested
492 self.sort_by_name = not self.names
494 self.do_locking = None
497 def _GetNames(self, lu, all_names, lock_level):
498 """Helper function to determine names asked for in the query.
502 names = lu.glm.list_owned(lock_level)
506 if self.wanted == locking.ALL_SET:
507 assert not self.names
508 # caller didn't specify names, so ordering is not important
509 return utils.NiceSort(names)
511 # caller specified names and we must keep the same order
513 assert not self.do_locking or lu.glm.is_owned(lock_level)
515 missing = set(self.wanted).difference(names)
517 raise errors.OpExecError("Some items were removed before retrieving"
518 " their data: %s" % missing)
520 # Return expanded names
523 def ExpandNames(self, lu):
524 """Expand names for this query.
526 See L{LogicalUnit.ExpandNames}.
529 raise NotImplementedError()
531 def DeclareLocks(self, lu, level):
532 """Declare locks for this query.
534 See L{LogicalUnit.DeclareLocks}.
537 raise NotImplementedError()
539 def _GetQueryData(self, lu):
540 """Collects all data for this query.
542 @return: Query data object
545 raise NotImplementedError()
547 def NewStyleQuery(self, lu):
548 """Collect data and execute query.
551 return query.GetQueryResponse(self.query, self._GetQueryData(lu),
552 sort_by_name=self.sort_by_name)
554 def OldStyleQuery(self, lu):
555 """Collect data and execute query.
558 return self.query.OldStyleQuery(self._GetQueryData(lu),
559 sort_by_name=self.sort_by_name)
562 def _GetWantedNodes(lu, nodes):
563 """Returns list of checked and expanded node names.
565 @type lu: L{LogicalUnit}
566 @param lu: the logical unit on whose behalf we execute
568 @param nodes: list of node names or None for all nodes
570 @return: the list of nodes, sorted
571 @raise errors.ProgrammerError: if the nodes parameter is wrong type
575 return [_ExpandNodeName(lu.cfg, name) for name in nodes]
577 return utils.NiceSort(lu.cfg.GetNodeList())
580 def _GetWantedInstances(lu, instances):
581 """Returns list of checked and expanded instance names.
583 @type lu: L{LogicalUnit}
584 @param lu: the logical unit on whose behalf we execute
585 @type instances: list
586 @param instances: list of instance names or None for all instances
588 @return: the list of instances, sorted
589 @raise errors.OpPrereqError: if the instances parameter is wrong type
590 @raise errors.OpPrereqError: if any of the passed instances is not found
594 wanted = [_ExpandInstanceName(lu.cfg, name) for name in instances]
596 wanted = utils.NiceSort(lu.cfg.GetInstanceList())
600 def _GetUpdatedParams(old_params, update_dict,
601 use_default=True, use_none=False):
602 """Return the new version of a parameter dictionary.
604 @type old_params: dict
605 @param old_params: old parameters
606 @type update_dict: dict
607 @param update_dict: dict containing new parameter values, or
608 constants.VALUE_DEFAULT to reset the parameter to its default
610 @type use_default: boolean
611 @param use_default: whether to recognise L{constants.VALUE_DEFAULT}
612 values as 'to be deleted' values
613 @type use_none: boolean
614 @param use_none: whether to recognise C{None} values as 'to be
617 @return: the new parameter dictionary
620 params_copy = copy.deepcopy(old_params)
621 for key, val in update_dict.iteritems():
622 if ((use_default and val == constants.VALUE_DEFAULT) or
623 (use_none and val is None)):
629 params_copy[key] = val
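# Illustrative sketch only: a tiny self-check demonstrating the merge
# semantics of _GetUpdatedParams described above. The parameter names
# ("mem", "vcpus", "boot") are made up for the example.
def _ExampleGetUpdatedParams():
  old = {"mem": 512, "vcpus": 2}
  update = {"mem": constants.VALUE_DEFAULT, "vcpus": 4, "boot": True}
  # "mem" is deleted (VALUE_DEFAULT means reset/remove when use_default=True),
  # "vcpus" is overwritten and "boot" is added
  assert _GetUpdatedParams(old, update) == {"vcpus": 4, "boot": True}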
633 def _ReleaseLocks(lu, level, names=None, keep=None):
634 """Releases locks owned by an LU.
636 @type lu: L{LogicalUnit}
637 @param level: Lock level
638 @type names: list or None
639 @param names: Names of locks to release
640 @type keep: list or None
641 @param keep: Names of locks to retain
644 assert not (keep is not None and names is not None), \
645 "Only one of the 'names' and the 'keep' parameters can be given"
647 if names is not None:
648 should_release = names.__contains__
650 should_release = lambda name: name not in keep
652 should_release = None
658 # Determine which locks to release
659 for name in lu.glm.list_owned(level):
660 if should_release(name):
665 assert len(lu.glm.list_owned(level)) == (len(retain) + len(release))
667 # Release just some locks
668 lu.glm.release(level, names=release)
670 assert frozenset(lu.glm.list_owned(level)) == frozenset(retain)
673 lu.glm.release(level)
675 assert not lu.glm.is_owned(level), "No locks should be owned"
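# Illustrative usage sketch for _ReleaseLocks (the LU instance and the
# instance object are hypothetical). Typically called once an LU has narrowed
# down the nodes it really needs:
#
#   # keep only the locks on the instance's remaining nodes
#   _ReleaseLocks(self, locking.LEVEL_NODE,
#                 keep=[instance.primary_node] +
#                      list(instance.secondary_nodes))
#
#   # or release everything at that level
#   _ReleaseLocks(self, locking.LEVEL_NODE)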
678 def _RunPostHook(lu, node_name):
679 """Runs the post-hook for an opcode on a single node.
682 hm = lu.proc.hmclass(lu.rpc.call_hooks_runner, lu)
684 hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
686 # pylint: disable-msg=W0702
687 lu.LogWarning("Errors occurred running hooks on %s" % node_name)
690 def _CheckOutputFields(static, dynamic, selected):
691 """Checks whether all selected fields are valid.
693 @type static: L{utils.FieldSet}
694 @param static: static fields set
695 @type dynamic: L{utils.FieldSet}
696 @param dynamic: dynamic fields set
703 delta = f.NonMatching(selected)
705 raise errors.OpPrereqError("Unknown output fields selected: %s"
706 % ",".join(delta), errors.ECODE_INVAL)
709 def _CheckGlobalHvParams(params):
710 """Validates that given hypervisor params are not global ones.
712 This will ensure that instances don't get customised versions of
716 used_globals = constants.HVC_GLOBALS.intersection(params)
718 msg = ("The following hypervisor parameters are global and cannot"
719 " be customized at instance level, please modify them at"
720 " cluster level: %s" % utils.CommaJoin(used_globals))
721 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
724 def _CheckNodeOnline(lu, node, msg=None):
725 """Ensure that a given node is online.
727 @param lu: the LU on behalf of which we make the check
728 @param node: the node to check
729 @param msg: if passed, should be a message to replace the default one
730 @raise errors.OpPrereqError: if the node is offline
734 msg = "Can't use offline node"
735 if lu.cfg.GetNodeInfo(node).offline:
736 raise errors.OpPrereqError("%s: %s" % (msg, node), errors.ECODE_STATE)
739 def _CheckNodeNotDrained(lu, node):
740 """Ensure that a given node is not drained.
742 @param lu: the LU on behalf of which we make the check
743 @param node: the node to check
744 @raise errors.OpPrereqError: if the node is drained
747 if lu.cfg.GetNodeInfo(node).drained:
748 raise errors.OpPrereqError("Can't use drained node %s" % node,
752 def _CheckNodeVmCapable(lu, node):
753 """Ensure that a given node is vm capable.
755 @param lu: the LU on behalf of which we make the check
756 @param node: the node to check
757 @raise errors.OpPrereqError: if the node is not vm capable
760 if not lu.cfg.GetNodeInfo(node).vm_capable:
761 raise errors.OpPrereqError("Can't use non-vm_capable node %s" % node,
765 def _CheckNodeHasOS(lu, node, os_name, force_variant):
766 """Ensure that a node supports a given OS.
768 @param lu: the LU on behalf of which we make the check
769 @param node: the node to check
770 @param os_name: the OS to query about
771 @param force_variant: whether to ignore variant errors
772 @raise errors.OpPrereqError: if the node is not supporting the OS
775 result = lu.rpc.call_os_get(node, os_name)
776 result.Raise("OS '%s' not in supported OS list for node %s" %
778 prereq=True, ecode=errors.ECODE_INVAL)
779 if not force_variant:
780 _CheckOSVariant(result.payload, os_name)
783 def _CheckNodeHasSecondaryIP(lu, node, secondary_ip, prereq):
784 """Ensure that a node has the given secondary ip.
786 @type lu: L{LogicalUnit}
787 @param lu: the LU on behalf of which we make the check
789 @param node: the node to check
790 @type secondary_ip: string
791 @param secondary_ip: the ip to check
792 @type prereq: boolean
793 @param prereq: whether to throw a prerequisite or an execute error
794 @raise errors.OpPrereqError: if the node doesn't have the ip, and prereq=True
795 @raise errors.OpExecError: if the node doesn't have the ip, and prereq=False
798 result = lu.rpc.call_node_has_ip_address(node, secondary_ip)
799 result.Raise("Failure checking secondary ip on node %s" % node,
800 prereq=prereq, ecode=errors.ECODE_ENVIRON)
801 if not result.payload:
802 msg = ("Node claims it doesn't have the secondary ip you gave (%s),"
803 " please fix and re-run this command" % secondary_ip)
805 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
807 raise errors.OpExecError(msg)
810 def _GetClusterDomainSecret():
811 """Reads the cluster domain secret.
814 return utils.ReadOneLineFile(constants.CLUSTER_DOMAIN_SECRET_FILE,
818 def _CheckInstanceDown(lu, instance, reason):
819 """Ensure that an instance is not running."""
820 if instance.admin_up:
821 raise errors.OpPrereqError("Instance %s is marked to be up, %s" %
822 (instance.name, reason), errors.ECODE_STATE)
824 pnode = instance.primary_node
825 ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
826 ins_l.Raise("Can't contact node %s for instance information" % pnode,
827 prereq=True, ecode=errors.ECODE_ENVIRON)
829 if instance.name in ins_l.payload:
830 raise errors.OpPrereqError("Instance %s is running, %s" %
831 (instance.name, reason), errors.ECODE_STATE)
834 def _ExpandItemName(fn, name, kind):
835 """Expand an item name.
837 @param fn: the function to use for expansion
838 @param name: requested item name
839 @param kind: text description ('Node' or 'Instance')
840 @return: the resolved (full) name
841 @raise errors.OpPrereqError: if the item is not found
845 if full_name is None:
846 raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
851 def _ExpandNodeName(cfg, name):
852 """Wrapper over L{_ExpandItemName} for nodes."""
853 return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
856 def _ExpandInstanceName(cfg, name):
857 """Wrapper over L{_ExpandItemName} for instance."""
858 return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
861 def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
862 memory, vcpus, nics, disk_template, disks,
863 bep, hvp, hypervisor_name):
864 """Builds instance related env variables for hooks
866 This builds the hook environment from individual variables.
869 @param name: the name of the instance
870 @type primary_node: string
871 @param primary_node: the name of the instance's primary node
872 @type secondary_nodes: list
873 @param secondary_nodes: list of secondary nodes as strings
874 @type os_type: string
875 @param os_type: the name of the instance's OS
876 @type status: boolean
877 @param status: the should_run status of the instance
879 @param memory: the memory size of the instance
881 @param vcpus: the count of VCPUs the instance has
883 @param nics: list of tuples (ip, mac, mode, link) representing
884 the NICs the instance has
885 @type disk_template: string
886 @param disk_template: the disk template of the instance
888 @param disks: the list of (size, mode) pairs
890 @param bep: the backend parameters for the instance
892 @param hvp: the hypervisor parameters for the instance
893 @type hypervisor_name: string
894 @param hypervisor_name: the hypervisor for the instance
896 @return: the hook environment for this instance
905 "INSTANCE_NAME": name,
906 "INSTANCE_PRIMARY": primary_node,
907 "INSTANCE_SECONDARIES": " ".join(secondary_nodes),
908 "INSTANCE_OS_TYPE": os_type,
909 "INSTANCE_STATUS": str_status,
910 "INSTANCE_MEMORY": memory,
911 "INSTANCE_VCPUS": vcpus,
912 "INSTANCE_DISK_TEMPLATE": disk_template,
913 "INSTANCE_HYPERVISOR": hypervisor_name,
917 nic_count = len(nics)
918 for idx, (ip, mac, mode, link) in enumerate(nics):
921 env["INSTANCE_NIC%d_IP" % idx] = ip
922 env["INSTANCE_NIC%d_MAC" % idx] = mac
923 env["INSTANCE_NIC%d_MODE" % idx] = mode
924 env["INSTANCE_NIC%d_LINK" % idx] = link
925 if mode == constants.NIC_MODE_BRIDGED:
926 env["INSTANCE_NIC%d_BRIDGE" % idx] = link
930 env["INSTANCE_NIC_COUNT"] = nic_count
933 disk_count = len(disks)
934 for idx, (size, mode) in enumerate(disks):
935 env["INSTANCE_DISK%d_SIZE" % idx] = size
936 env["INSTANCE_DISK%d_MODE" % idx] = mode
940 env["INSTANCE_DISK_COUNT"] = disk_count
942 for source, kind in [(bep, "BE"), (hvp, "HV")]:
943 for key, value in source.items():
944 env["INSTANCE_%s_%s" % (kind, key)] = value
949 def _NICListToTuple(lu, nics):
950 """Build a list of nic information tuples.
952 This list is suitable to be passed to _BuildInstanceHookEnv or as a return
953 value in LUInstanceQueryData.
955 @type lu: L{LogicalUnit}
956 @param lu: the logical unit on whose behalf we execute
957 @type nics: list of L{objects.NIC}
958 @param nics: list of nics to convert to hooks tuples
962 cluster = lu.cfg.GetClusterInfo()
966 filled_params = cluster.SimpleFillNIC(nic.nicparams)
967 mode = filled_params[constants.NIC_MODE]
968 link = filled_params[constants.NIC_LINK]
969 hooks_nics.append((ip, mac, mode, link))
973 def _BuildInstanceHookEnvByObject(lu, instance, override=None):
974 """Builds instance related env variables for hooks from an object.
976 @type lu: L{LogicalUnit}
977 @param lu: the logical unit on whose behalf we execute
978 @type instance: L{objects.Instance}
979 @param instance: the instance for which we should build the
982 @param override: dictionary with key/values that will override
985 @return: the hook environment dictionary
988 cluster = lu.cfg.GetClusterInfo()
989 bep = cluster.FillBE(instance)
990 hvp = cluster.FillHV(instance)
992 'name': instance.name,
993 'primary_node': instance.primary_node,
994 'secondary_nodes': instance.secondary_nodes,
995 'os_type': instance.os,
996 'status': instance.admin_up,
997 'memory': bep[constants.BE_MEMORY],
998 'vcpus': bep[constants.BE_VCPUS],
999 'nics': _NICListToTuple(lu, instance.nics),
1000 'disk_template': instance.disk_template,
1001 'disks': [(disk.size, disk.mode) for disk in instance.disks],
1004 'hypervisor_name': instance.hypervisor,
1007 args.update(override)
1008 return _BuildInstanceHookEnv(**args) # pylint: disable-msg=W0142
1011 def _AdjustCandidatePool(lu, exceptions):
1012 """Adjust the candidate pool after node operations.
1015 mod_list = lu.cfg.MaintainCandidatePool(exceptions)
1017 lu.LogInfo("Promoted nodes to master candidate role: %s",
1018 utils.CommaJoin(node.name for node in mod_list))
1019 for name in mod_list:
1020 lu.context.ReaddNode(name)
1021 mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1023 lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
1027 def _DecideSelfPromotion(lu, exceptions=None):
1028 """Decide whether I should promote myself as a master candidate.
1031 cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
1032 mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
1033 # the new node will increase mc_max by one, so:
1034 mc_should = min(mc_should + 1, cp_size)
1035 return mc_now < mc_should
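# Worked example with made-up numbers: with candidate_pool_size = 10,
# mc_now = 3 current candidates and mc_should = 3 from the stats, adding this
# node gives mc_should = min(3 + 1, 10) = 4; since 3 < 4 the new node decides
# to promote itself to master candidate.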
1038 def _CheckNicsBridgesExist(lu, target_nics, target_node):
1039 """Check that the brigdes needed by a list of nics exist.
1042 cluster = lu.cfg.GetClusterInfo()
1043 paramslist = [cluster.SimpleFillNIC(nic.nicparams) for nic in target_nics]
1044 brlist = [params[constants.NIC_LINK] for params in paramslist
1045 if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
1047 result = lu.rpc.call_bridges_exist(target_node, brlist)
1048 result.Raise("Error checking bridges on destination node '%s'" %
1049 target_node, prereq=True, ecode=errors.ECODE_ENVIRON)
1052 def _CheckInstanceBridgesExist(lu, instance, node=None):
1053 """Check that the brigdes needed by an instance exist.
1057 node = instance.primary_node
1058 _CheckNicsBridgesExist(lu, instance.nics, node)
1061 def _CheckOSVariant(os_obj, name):
1062 """Check whether an OS name conforms to the os variants specification.
1064 @type os_obj: L{objects.OS}
1065 @param os_obj: OS object to check
1067 @param name: OS name passed by the user, to check for validity
1070 if not os_obj.supported_variants:
1072 variant = objects.OS.GetVariant(name)
1074 raise errors.OpPrereqError("OS name must include a variant",
1077 if variant not in os_obj.supported_variants:
1078 raise errors.OpPrereqError("Unsupported OS variant", errors.ECODE_INVAL)
1081 def _GetNodeInstancesInner(cfg, fn):
1082 return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
1085 def _GetNodeInstances(cfg, node_name):
1086 """Returns a list of all primary and secondary instances on a node.
1090 return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
1093 def _GetNodePrimaryInstances(cfg, node_name):
1094 """Returns primary instances on a node.
1097 return _GetNodeInstancesInner(cfg,
1098 lambda inst: node_name == inst.primary_node)
1101 def _GetNodeSecondaryInstances(cfg, node_name):
1102 """Returns secondary instances on a node.
1105 return _GetNodeInstancesInner(cfg,
1106 lambda inst: node_name in inst.secondary_nodes)
1109 def _GetStorageTypeArgs(cfg, storage_type):
1110 """Returns the arguments for a storage type.
1113 # Special case for file storage
1114 if storage_type == constants.ST_FILE:
1115 # storage.FileStorage wants a list of storage directories
1116 return [[cfg.GetFileStorageDir(), cfg.GetSharedFileStorageDir()]]
1121 def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
1124 for dev in instance.disks:
1125 cfg.SetDiskID(dev, node_name)
1127 result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
1128 result.Raise("Failed to get disk status from node %s" % node_name,
1129 prereq=prereq, ecode=errors.ECODE_ENVIRON)
1131 for idx, bdev_status in enumerate(result.payload):
1132 if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
1138 def _CheckIAllocatorOrNode(lu, iallocator_slot, node_slot):
1139 """Check the sanity of iallocator and node arguments and use the
1140 cluster-wide iallocator if appropriate.
1142 Check that at most one of (iallocator, node) is specified. If none is
1143 specified, then the LU's opcode's iallocator slot is filled with the
1144 cluster-wide default iallocator.
1146 @type iallocator_slot: string
1147 @param iallocator_slot: the name of the opcode iallocator slot
1148 @type node_slot: string
1149 @param node_slot: the name of the opcode target node slot
1152 node = getattr(lu.op, node_slot, None)
1153 iallocator = getattr(lu.op, iallocator_slot, None)
1155 if node is not None and iallocator is not None:
1156 raise errors.OpPrereqError("Do not specify both, iallocator and node.",
1158 elif node is None and iallocator is None:
1159 default_iallocator = lu.cfg.GetDefaultIAllocator()
1160 if default_iallocator:
1161 setattr(lu.op, iallocator_slot, default_iallocator)
1163 raise errors.OpPrereqError("No iallocator or node given and no"
1164 " cluster-wide default iallocator found."
1165 " Please specify either an iallocator or a"
1166 " node, or set a cluster-wide default"
1170 class LUClusterPostInit(LogicalUnit):
1171 """Logical unit for running hooks after cluster initialization.
1174 HPATH = "cluster-init"
1175 HTYPE = constants.HTYPE_CLUSTER
1177 def BuildHooksEnv(self):
1182 "OP_TARGET": self.cfg.GetClusterName(),
1185 def BuildHooksNodes(self):
1186 """Build hooks nodes.
1189 return ([], [self.cfg.GetMasterNode()])
1191 def Exec(self, feedback_fn):
1198 class LUClusterDestroy(LogicalUnit):
1199 """Logical unit for destroying the cluster.
1202 HPATH = "cluster-destroy"
1203 HTYPE = constants.HTYPE_CLUSTER
1205 def BuildHooksEnv(self):
1210 "OP_TARGET": self.cfg.GetClusterName(),
1213 def BuildHooksNodes(self):
1214 """Build hooks nodes.
1219 def CheckPrereq(self):
1220 """Check prerequisites.
1222 This checks whether the cluster is empty.
1224 Any errors are signaled by raising errors.OpPrereqError.
1227 master = self.cfg.GetMasterNode()
1229 nodelist = self.cfg.GetNodeList()
1230 if len(nodelist) != 1 or nodelist[0] != master:
1231 raise errors.OpPrereqError("There are still %d node(s) in"
1232 " this cluster." % (len(nodelist) - 1),
1234 instancelist = self.cfg.GetInstanceList()
1236 raise errors.OpPrereqError("There are still %d instance(s) in"
1237 " this cluster." % len(instancelist),
1240 def Exec(self, feedback_fn):
1241 """Destroys the cluster.
1244 master = self.cfg.GetMasterNode()
1246 # Run post hooks on master node before it's removed
1247 _RunPostHook(self, master)
1249 result = self.rpc.call_node_stop_master(master, False)
1250 result.Raise("Could not disable the master role")
1255 def _VerifyCertificate(filename):
1256 """Verifies a certificate for LUClusterVerify.
1258 @type filename: string
1259 @param filename: Path to PEM file
1263 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
1264 utils.ReadFile(filename))
1265 except Exception, err: # pylint: disable-msg=W0703
1266 return (LUClusterVerify.ETYPE_ERROR,
1267 "Failed to load X509 certificate %s: %s" % (filename, err))
1270 utils.VerifyX509Certificate(cert, constants.SSL_CERT_EXPIRATION_WARN,
1271 constants.SSL_CERT_EXPIRATION_ERROR)
1274 fnamemsg = "While verifying %s: %s" % (filename, msg)
1279 return (None, fnamemsg)
1280 elif errcode == utils.CERT_WARNING:
1281 return (LUClusterVerify.ETYPE_WARNING, fnamemsg)
1282 elif errcode == utils.CERT_ERROR:
1283 return (LUClusterVerify.ETYPE_ERROR, fnamemsg)
1285 raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
1288 class LUClusterVerify(LogicalUnit):
1289 """Verifies the cluster status.
1292 HPATH = "cluster-verify"
1293 HTYPE = constants.HTYPE_CLUSTER
1296 TCLUSTER = "cluster"
1298 TINSTANCE = "instance"
1300 ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
1301 ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
1302 ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK")
1303 EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
1304 EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
1305 EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
1306 EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
1307 EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
1308 EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
1309 EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS")
1310 ENODEDRBD = (TNODE, "ENODEDRBD")
1311 ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
1312 ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
1313 ENODEHOOKS = (TNODE, "ENODEHOOKS")
1314 ENODEHV = (TNODE, "ENODEHV")
1315 ENODELVM = (TNODE, "ENODELVM")
1316 ENODEN1 = (TNODE, "ENODEN1")
1317 ENODENET = (TNODE, "ENODENET")
1318 ENODEOS = (TNODE, "ENODEOS")
1319 ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
1320 ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
1321 ENODERPC = (TNODE, "ENODERPC")
1322 ENODESSH = (TNODE, "ENODESSH")
1323 ENODEVERSION = (TNODE, "ENODEVERSION")
1324 ENODESETUP = (TNODE, "ENODESETUP")
1325 ENODETIME = (TNODE, "ENODETIME")
1326 ENODEOOBPATH = (TNODE, "ENODEOOBPATH")
1328 ETYPE_FIELD = "code"
1329 ETYPE_ERROR = "ERROR"
1330 ETYPE_WARNING = "WARNING"
1332 _HOOKS_INDENT_RE = re.compile("^", re.M)
1334 class NodeImage(object):
1335 """A class representing the logical and physical status of a node.
1338 @ivar name: the node name to which this object refers
1339 @ivar volumes: a structure as returned from
1340 L{ganeti.backend.GetVolumeList} (runtime)
1341 @ivar instances: a list of running instances (runtime)
1342 @ivar pinst: list of configured primary instances (config)
1343 @ivar sinst: list of configured secondary instances (config)
1344 @ivar sbp: dictionary of {primary-node: list of instances} for all
1345 instances for which this node is secondary (config)
1346 @ivar mfree: free memory, as reported by hypervisor (runtime)
1347 @ivar dfree: free disk, as reported by the node (runtime)
1348 @ivar offline: the offline status (config)
1349 @type rpc_fail: boolean
1350 @ivar rpc_fail: whether the RPC verify call was successful (overall,
1351 not whether the individual keys were correct) (runtime)
1352 @type lvm_fail: boolean
1353 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1354 @type hyp_fail: boolean
1355 @ivar hyp_fail: whether the RPC call didn't return the instance list
1356 @type ghost: boolean
1357 @ivar ghost: whether this is a known node or not (config)
1358 @type os_fail: boolean
1359 @ivar os_fail: whether the RPC call didn't return valid OS data
1361 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
1362 @type vm_capable: boolean
1363 @ivar vm_capable: whether the node can host instances
1366 def __init__(self, offline=False, name=None, vm_capable=True):
1375 self.offline = offline
1376 self.vm_capable = vm_capable
1377 self.rpc_fail = False
1378 self.lvm_fail = False
1379 self.hyp_fail = False
1381 self.os_fail = False
1384 def ExpandNames(self):
1385 self.needed_locks = {
1386 locking.LEVEL_NODE: locking.ALL_SET,
1387 locking.LEVEL_INSTANCE: locking.ALL_SET,
1389 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
1391 def _Error(self, ecode, item, msg, *args, **kwargs):
1392 """Format an error message.
1394 Based on the opcode's error_codes parameter, either format a
1395 parseable error code, or a simpler error string.
1397 This must be called only from Exec and functions called from Exec.
1400 ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
1402 # first complete the msg
1405 # then format the whole message
1406 if self.op.error_codes:
1407 msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
1413 msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
1414 # and finally report it via the feedback_fn
1415 self._feedback_fn(" - %s" % msg)
1417 def _ErrorIf(self, cond, *args, **kwargs):
1418 """Log an error message if the passed condition is True.
1421 cond = bool(cond) or self.op.debug_simulate_errors
1423 self._Error(*args, **kwargs)
1424 # do not mark the operation as failed for WARN cases only
1425 if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1426 self.bad = self.bad or cond
1428 def _VerifyNode(self, ninfo, nresult):
1429 """Perform some basic validation on data returned from a node.
1431 - check the result data structure is well formed and has all the
1433 - check ganeti version
1435 @type ninfo: L{objects.Node}
1436 @param ninfo: the node to check
1437 @param nresult: the results from the node
1439 @return: whether overall this call was successful (and we can expect
1440 reasonable values in the response)
1444 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1446 # main result, nresult should be a non-empty dict
1447 test = not nresult or not isinstance(nresult, dict)
1448 _ErrorIf(test, self.ENODERPC, node,
1449 "unable to verify node: no data returned")
1453 # compares ganeti version
1454 local_version = constants.PROTOCOL_VERSION
1455 remote_version = nresult.get("version", None)
1456 test = not (remote_version and
1457 isinstance(remote_version, (list, tuple)) and
1458 len(remote_version) == 2)
1459 _ErrorIf(test, self.ENODERPC, node,
1460 "connection to node returned invalid data")
1464 test = local_version != remote_version[0]
1465 _ErrorIf(test, self.ENODEVERSION, node,
1466 "incompatible protocol versions: master %s,"
1467 " node %s", local_version, remote_version[0])
1471 # node seems compatible, we can actually try to look into its results
1473 # full package version
1474 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
1475 self.ENODEVERSION, node,
1476 "software version mismatch: master %s, node %s",
1477 constants.RELEASE_VERSION, remote_version[1],
1478 code=self.ETYPE_WARNING)
1480 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1481 if ninfo.vm_capable and isinstance(hyp_result, dict):
1482 for hv_name, hv_result in hyp_result.iteritems():
1483 test = hv_result is not None
1484 _ErrorIf(test, self.ENODEHV, node,
1485 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1487 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
1488 if ninfo.vm_capable and isinstance(hvp_result, list):
1489 for item, hv_name, hv_result in hvp_result:
1490 _ErrorIf(True, self.ENODEHV, node,
1491 "hypervisor %s parameter verify failure (source %s): %s",
1492 hv_name, item, hv_result)
1494 test = nresult.get(constants.NV_NODESETUP,
1495 ["Missing NODESETUP results"])
1496 _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1501 def _VerifyNodeTime(self, ninfo, nresult,
1502 nvinfo_starttime, nvinfo_endtime):
1503 """Check the node time.
1505 @type ninfo: L{objects.Node}
1506 @param ninfo: the node to check
1507 @param nresult: the remote results for the node
1508 @param nvinfo_starttime: the start time of the RPC call
1509 @param nvinfo_endtime: the end time of the RPC call
1513 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1515 ntime = nresult.get(constants.NV_TIME, None)
1517 ntime_merged = utils.MergeTime(ntime)
1518 except (ValueError, TypeError):
1519 _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1522 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1523 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1524 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1525 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1529 _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1530 "Node time diverges by at least %s from master node time",
1533 def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1534 """Check the node LVM results.
1536 @type ninfo: L{objects.Node}
1537 @param ninfo: the node to check
1538 @param nresult: the remote results for the node
1539 @param vg_name: the configured VG name
1546 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1548 # checks vg existence and size > 20G
1549 vglist = nresult.get(constants.NV_VGLIST, None)
1551 _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1553 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1554 constants.MIN_VG_SIZE)
1555 _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1558 pvlist = nresult.get(constants.NV_PVLIST, None)
1559 test = pvlist is None
1560 _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1562 # check that ':' is not present in PV names, since it's a
1563 # special character for lvcreate (denotes the range of PEs to
1565 for _, pvname, owner_vg in pvlist:
1566 test = ":" in pvname
1567 _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1568 " '%s' of VG '%s'", pvname, owner_vg)
1570 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
1571 """Check the node bridges.
1573 @type ninfo: L{objects.Node}
1574 @param ninfo: the node to check
1575 @param nresult: the remote results for the node
1576 @param bridges: the expected list of bridges
1583 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1585 missing = nresult.get(constants.NV_BRIDGES, None)
1586 test = not isinstance(missing, list)
1587 _ErrorIf(test, self.ENODENET, node,
1588 "did not return valid bridge information")
1590 _ErrorIf(bool(missing), self.ENODENET, node, "missing bridges: %s" %
1591 utils.CommaJoin(sorted(missing)))
1593 def _VerifyNodeNetwork(self, ninfo, nresult):
1594 """Check the node network connectivity results.
1596 @type ninfo: L{objects.Node}
1597 @param ninfo: the node to check
1598 @param nresult: the remote results for the node
1602 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1604 test = constants.NV_NODELIST not in nresult
1605 _ErrorIf(test, self.ENODESSH, node,
1606 "node hasn't returned node ssh connectivity data")
1608 if nresult[constants.NV_NODELIST]:
1609 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1610 _ErrorIf(True, self.ENODESSH, node,
1611 "ssh communication with node '%s': %s", a_node, a_msg)
1613 test = constants.NV_NODENETTEST not in nresult
1614 _ErrorIf(test, self.ENODENET, node,
1615 "node hasn't returned node tcp connectivity data")
1617 if nresult[constants.NV_NODENETTEST]:
1618 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1620 _ErrorIf(True, self.ENODENET, node,
1621 "tcp communication with node '%s': %s",
1622 anode, nresult[constants.NV_NODENETTEST][anode])
1624 test = constants.NV_MASTERIP not in nresult
1625 _ErrorIf(test, self.ENODENET, node,
1626 "node hasn't returned node master IP reachability data")
1628 if not nresult[constants.NV_MASTERIP]:
1629 if node == self.master_node:
1630 msg = "the master node cannot reach the master IP (not configured?)"
1632 msg = "cannot reach the master IP"
1633 _ErrorIf(True, self.ENODENET, node, msg)
1635 def _VerifyInstance(self, instance, instanceconfig, node_image,
1637 """Verify an instance.
1639 This function checks to see if the required block devices are
1640 available on the instance's node.
1643 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1644 node_current = instanceconfig.primary_node
1646 node_vol_should = {}
1647 instanceconfig.MapLVsByNode(node_vol_should)
1649 for node in node_vol_should:
1650 n_img = node_image[node]
1651 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1652 # ignore missing volumes on offline or broken nodes
1654 for volume in node_vol_should[node]:
1655 test = volume not in n_img.volumes
1656 _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
1657 "volume %s missing on node %s", volume, node)
1659 if instanceconfig.admin_up:
1660 pri_img = node_image[node_current]
1661 test = instance not in pri_img.instances and not pri_img.offline
1662 _ErrorIf(test, self.EINSTANCEDOWN, instance,
1663 "instance not running on its primary node %s",
1666 for node, n_img in node_image.items():
1667 if node != node_current:
1668 test = instance in n_img.instances
1669 _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
1670 "instance should not run on node %s", node)
1672 diskdata = [(nname, success, status, idx)
1673 for (nname, disks) in diskstatus.items()
1674 for idx, (success, status) in enumerate(disks)]
1676 for nname, success, bdev_status, idx in diskdata:
1677 # the 'ghost node' construction in Exec() ensures that we have a
1679 snode = node_image[nname]
1680 bad_snode = snode.ghost or snode.offline
1681 _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
1682 self.EINSTANCEFAULTYDISK, instance,
1683 "couldn't retrieve status for disk/%s on %s: %s",
1684 idx, nname, bdev_status)
1685 _ErrorIf((instanceconfig.admin_up and success and
1686 bdev_status.ldisk_status == constants.LDS_FAULTY),
1687 self.EINSTANCEFAULTYDISK, instance,
1688 "disk/%s on %s is faulty", idx, nname)
1690 def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
1691 """Verify if there are any unknown volumes in the cluster.
1693 The .os, .swap and backup volumes are ignored. All other volumes are
1694 reported as unknown.
1696 @type reserved: L{ganeti.utils.FieldSet}
1697 @param reserved: a FieldSet of reserved volume names
1700 for node, n_img in node_image.items():
1701 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1702 # skip non-healthy nodes
1704 for volume in n_img.volumes:
1705 test = ((node not in node_vol_should or
1706 volume not in node_vol_should[node]) and
1707 not reserved.Matches(volume))
1708 self._ErrorIf(test, self.ENODEORPHANLV, node,
1709 "volume %s is unknown", volume)
1711 def _VerifyOrphanInstances(self, instancelist, node_image):
1712 """Verify the list of running instances.
1714 This checks what instances are running but unknown to the cluster.
1717 for node, n_img in node_image.items():
1718 for o_inst in n_img.instances:
1719 test = o_inst not in instancelist
1720 self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
1721 "instance %s on node %s should not exist", o_inst, node)
1723 def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
1724 """Verify N+1 Memory Resilience.
1726 Check that if one single node dies we can still start all the
1727 instances it was primary for.
1730 cluster_info = self.cfg.GetClusterInfo()
1731 for node, n_img in node_image.items():
1732 # This code checks that every node which is now listed as
1733 # secondary has enough memory to host all instances it is
1734 # supposed to, should a single other node in the cluster fail.
1735 # FIXME: not ready for failover to an arbitrary node
1736 # FIXME: does not support file-backed instances
1737 # WARNING: we currently take into account down instances as well
1738 # as up ones, considering that even if they're down someone
1739 # might want to start them even in the event of a node failure.
1741 # we're skipping offline nodes from the N+1 warning, since
1742 # most likely we don't have good memory information from them;
1743 # we already list instances living on such nodes, and that's
1746 for prinode, instances in n_img.sbp.items():
1748 for instance in instances:
1749 bep = cluster_info.FillBE(instance_cfg[instance])
1750 if bep[constants.BE_AUTO_BALANCE]:
1751 needed_mem += bep[constants.BE_MEMORY]
1752 test = n_img.mfree < needed_mem
1753 self._ErrorIf(test, self.ENODEN1, node,
1754 "not enough memory to accomodate instance failovers"
1755 " should node %s fail (%dMiB needed, %dMiB available)",
1756 prinode, needed_mem, n_img.mfree)
1759 def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
1760 (files_all, files_all_opt, files_mc, files_vm)):
1761 """Verifies file checksums collected from all nodes.
1763 @param errorif: Callback for reporting errors
1764 @param nodeinfo: List of L{objects.Node} objects
1765 @param master_node: Name of master node
1766 @param all_nvinfo: RPC results
1769 node_names = frozenset(node.name for node in nodeinfo)
1771 assert master_node in node_names
1772 assert (len(files_all | files_all_opt | files_mc | files_vm) ==
1773 sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
1774 "Found file listed in more than one file list"
1776 # Define functions determining which nodes to consider for a file
1777 file2nodefn = dict([(filename, fn)
1778 for (files, fn) in [(files_all, None),
1779 (files_all_opt, None),
1780 (files_mc, lambda node: (node.master_candidate or
1781 node.name == master_node)),
1782 (files_vm, lambda node: node.vm_capable)]
1783 for filename in files])
1785 fileinfo = dict((filename, {}) for filename in file2nodefn.keys())
1787 for node in nodeinfo:
1788 nresult = all_nvinfo[node.name]
1790 if nresult.fail_msg or not nresult.payload:
1793 node_files = nresult.payload.get(constants.NV_FILELIST, None)
1795 test = not (node_files and isinstance(node_files, dict))
1796 errorif(test, cls.ENODEFILECHECK, node.name,
1797 "Node did not return file checksum data")
1801 for (filename, checksum) in node_files.items():
1802 # Check if the file should be considered for a node
1803 fn = file2nodefn[filename]
1804 if fn is None or fn(node):
1805 fileinfo[filename].setdefault(checksum, set()).add(node.name)
1807 for (filename, checksums) in fileinfo.items():
1808 assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
1810 # Nodes having the file
1811 with_file = frozenset(node_name
1812 for nodes in fileinfo[filename].values()
1813 for node_name in nodes)
1815 # Nodes missing file
1816 missing_file = node_names - with_file
1818 if filename in files_all_opt:
1820 errorif(missing_file and missing_file != node_names,
1821 cls.ECLUSTERFILECHECK, None,
1822 "File %s is optional, but it must exist on all or no nodes (not"
1824 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
1826 errorif(missing_file, cls.ECLUSTERFILECHECK, None,
1827 "File %s is missing from node(s) %s", filename,
1828 utils.CommaJoin(utils.NiceSort(missing_file)))
1830 # See if there are multiple versions of the file
1831 test = len(checksums) > 1
1833 variants = ["variant %s on %s" %
1834 (idx + 1, utils.CommaJoin(utils.NiceSort(nodes)))
1835 for (idx, (checksum, nodes)) in
1836 enumerate(sorted(checksums.items()))]
1840 errorif(test, cls.ECLUSTERFILECHECK, None,
1841 "File %s found with %s different checksums (%s)",
1842 filename, len(checksums), "; ".join(variants))
1844 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_helper,
1846 """Verifies and the node DRBD status.
1848 @type ninfo: L{objects.Node}
1849 @param ninfo: the node to check
1850 @param nresult: the remote results for the node
1851 @param instanceinfo: the dict of instances
1852 @param drbd_helper: the configured DRBD usermode helper
1853 @param drbd_map: the DRBD map as returned by
1854 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1858 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1861 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1862 test = (helper_result is None)
1863 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1864 "no drbd usermode helper returned")
1866 status, payload = helper_result
1868 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1869 "drbd usermode helper check unsuccessful: %s", payload)
1870 test = status and (payload != drbd_helper)
1871 _ErrorIf(test, self.ENODEDRBDHELPER, node,
1872 "wrong drbd usermode helper: %s", payload)
1874 # compute the DRBD minors
1876 for minor, instance in drbd_map[node].items():
1877 test = instance not in instanceinfo
1878 _ErrorIf(test, self.ECLUSTERCFG, None,
1879 "ghost instance '%s' in temporary DRBD map", instance)
1880 # ghost instance should not be running, but otherwise we
1881 # don't give double warnings (both ghost instance and
1882 # unallocated minor in use)
1884 node_drbd[minor] = (instance, False)
1886 instance = instanceinfo[instance]
1887 node_drbd[minor] = (instance.name, instance.admin_up)
1889 # and now check them
1890 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1891 test = not isinstance(used_minors, (tuple, list))
1892 _ErrorIf(test, self.ENODEDRBD, node,
1893 "cannot parse drbd status file: %s", str(used_minors))
1895 # we cannot check drbd status
1898 for minor, (iname, must_exist) in node_drbd.items():
1899 test = minor not in used_minors and must_exist
1900 _ErrorIf(test, self.ENODEDRBD, node,
1901 "drbd minor %d of instance %s is not active", minor, iname)
1902 for minor in used_minors:
1903 test = minor not in node_drbd
1904 _ErrorIf(test, self.ENODEDRBD, node,
1905 "unallocated drbd minor %d is in use", minor)
1907 def _UpdateNodeOS(self, ninfo, nresult, nimg):
1908 """Builds the node OS structures.
1910 @type ninfo: L{objects.Node}
1911 @param ninfo: the node to check
1912 @param nresult: the remote results for the node
1913 @param nimg: the node image object
1917 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1919 remote_os = nresult.get(constants.NV_OSLIST, None)
1920 test = (not isinstance(remote_os, list) or
1921 not compat.all(isinstance(v, list) and len(v) == 7
1922 for v in remote_os))
1924 _ErrorIf(test, self.ENODEOS, node,
1925 "node hasn't returned valid OS data")
1934 for (name, os_path, status, diagnose,
1935 variants, parameters, api_ver) in nresult[constants.NV_OSLIST]:
1937 if name not in os_dict:
1940 # parameters is a list of lists instead of list of tuples due to
1941 # JSON lacking a real tuple type, fix it:
1942 parameters = [tuple(v) for v in parameters]
1943 os_dict[name].append((os_path, status, diagnose,
1944 set(variants), set(parameters), set(api_ver)))
1946 nimg.oslist = os_dict
1948 def _VerifyNodeOS(self, ninfo, nimg, base):
1949 """Verifies the node OS list.
1951 @type ninfo: L{objects.Node}
1952 @param ninfo: the node to check
1953 @param nimg: the node image object
1954 @param base: the 'template' node we match against (e.g. from the master)
1958 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1960 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
1962 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
1963 for os_name, os_data in nimg.oslist.items():
1964 assert os_data, "Empty OS status for OS %s?!" % os_name
1965 f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
1966 _ErrorIf(not f_status, self.ENODEOS, node,
1967 "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
1968 _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
1969 "OS '%s' has multiple entries (first one shadows the rest): %s",
1970 os_name, utils.CommaJoin([v[0] for v in os_data]))
1971 # this will be caught in the backend too
1972 _ErrorIf(compat.any(v >= constants.OS_API_V15 for v in f_api)
1973 and not f_var, self.ENODEOS, node,
1974 "OS %s with API at least %d does not declare any variant",
1975 os_name, constants.OS_API_V15)
1976 # comparisons with the 'base' image
1977 test = os_name not in base.oslist
1978 _ErrorIf(test, self.ENODEOS, node,
1979 "Extra OS %s not present on reference node (%s)",
1983 assert base.oslist[os_name], "Base node has empty OS status?"
1984 _, b_status, _, b_var, b_param, b_api = base.oslist[os_name][0]
1986 # base OS is invalid, skipping
1988 for kind, a, b in [("API version", f_api, b_api),
1989 ("variants list", f_var, b_var),
1990 ("parameters", beautify_params(f_param),
1991 beautify_params(b_param))]:
1992 _ErrorIf(a != b, self.ENODEOS, node,
1993 "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
1994 kind, os_name, base.name,
1995 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
1997 # check any missing OSes
1998 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
1999 _ErrorIf(missing, self.ENODEOS, node,
2000 "OSes present on reference node %s but missing on this node: %s",
2001 base.name, utils.CommaJoin(missing))
2003 def _VerifyOob(self, ninfo, nresult):
2004 """Verifies out of band functionality of a node.
2006 @type ninfo: L{objects.Node}
2007 @param ninfo: the node to check
2008 @param nresult: the remote results for the node
2012 # We just have to verify the paths on master and/or master candidates
2013 # as the oob helper is invoked on the master
2014 if ((ninfo.master_candidate or ninfo.master_capable) and
2015 constants.NV_OOB_PATHS in nresult):
2016 for path_result in nresult[constants.NV_OOB_PATHS]:
2017 self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result)
2019 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
2020 """Verifies and updates the node volume data.
2022 This function will update a L{NodeImage}'s internal structures
2023 with data from the remote call.
2025 @type ninfo: L{objects.Node}
2026 @param ninfo: the node to check
2027 @param nresult: the remote results for the node
2028 @param nimg: the node image object
2029 @param vg_name: the configured VG name
2033 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2035 nimg.lvm_fail = True
2036 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
2039 elif isinstance(lvdata, basestring):
2040 _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
2041 utils.SafeEncode(lvdata))
2042 elif not isinstance(lvdata, dict):
2043 _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
2045 nimg.volumes = lvdata
2046 nimg.lvm_fail = False
2048 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
2049 """Verifies and updates the node instance list.
2051 If the listing was successful, then updates this node's instance
2052 list. Otherwise, it marks the RPC call as failed for the instance list.
2055 @type ninfo: L{objects.Node}
2056 @param ninfo: the node to check
2057 @param nresult: the remote results for the node
2058 @param nimg: the node image object
2061 idata = nresult.get(constants.NV_INSTANCELIST, None)
2062 test = not isinstance(idata, list)
2063 self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
2064 " (instancelist): %s", utils.SafeEncode(str(idata)))
2066 nimg.hyp_fail = True
2068 nimg.instances = idata
2070 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
2071 """Verifies and computes a node information map
2073 @type ninfo: L{objects.Node}
2074 @param ninfo: the node to check
2075 @param nresult: the remote results for the node
2076 @param nimg: the node image object
2077 @param vg_name: the configured VG name
2081 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2083 # try to read free memory (from the hypervisor)
2084 hv_info = nresult.get(constants.NV_HVINFO, None)
2085 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
2086 _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
2089 nimg.mfree = int(hv_info["memory_free"])
2090 except (ValueError, TypeError):
2091 _ErrorIf(True, self.ENODERPC, node,
2092 "node returned invalid nodeinfo, check hypervisor")
2094 # FIXME: devise a free space model for file based instances as well
2095 if vg_name is not None:
2096 test = (constants.NV_VGLIST not in nresult or
2097 vg_name not in nresult[constants.NV_VGLIST])
2098 _ErrorIf(test, self.ENODELVM, node,
2099 "node didn't return data for the volume group '%s'"
2100 " - it is either missing or broken", vg_name)
2103 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
2104 except (ValueError, TypeError):
2105 _ErrorIf(True, self.ENODERPC, node,
2106 "node returned invalid LVM info, check LVM status")
2108 def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
2109 """Gets per-disk status information for all instances.
2111 @type nodelist: list of strings
2112 @param nodelist: Node names
2113 @type node_image: dict of (name, L{objects.Node})
2114 @param node_image: Node objects
2115 @type instanceinfo: dict of (name, L{objects.Instance})
2116 @param instanceinfo: Instance objects
2117 @rtype: {instance: {node: [(success, payload)]}}
2118 @return: a dictionary of per-instance dictionaries with nodes as
2119 keys and disk information as values; the disk information is a
2120 list of tuples (success, payload)
2123 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2126 node_disks_devonly = {}
2127 diskless_instances = set()
2128 diskless = constants.DT_DISKLESS
2130 for nname in nodelist:
2131 node_instances = list(itertools.chain(node_image[nname].pinst,
2132 node_image[nname].sinst))
2133 diskless_instances.update(inst for inst in node_instances
2134 if instanceinfo[inst].disk_template == diskless)
2135 disks = [(inst, disk)
2136 for inst in node_instances
2137 for disk in instanceinfo[inst].disks]
2140 # No need to collect data
2143 node_disks[nname] = disks
2145 # Creating copies as SetDiskID below will modify the objects and that can
2146 # lead to incorrect data returned from nodes
2147 devonly = [dev.Copy() for (_, dev) in disks]
2150 self.cfg.SetDiskID(dev, nname)
2152 node_disks_devonly[nname] = devonly
2154 assert len(node_disks) == len(node_disks_devonly)
2156 # Collect data from all nodes with disks
2157 result = self.rpc.call_blockdev_getmirrorstatus_multi(node_disks.keys(),
2160 assert len(result) == len(node_disks)
2164 for (nname, nres) in result.items():
2165 disks = node_disks[nname]
2168 # No data from this node
2169 data = len(disks) * [(False, "node offline")]
2172 _ErrorIf(msg, self.ENODERPC, nname,
2173 "while getting disk information: %s", msg)
2175 # No data from this node
2176 data = len(disks) * [(False, msg)]
2179 for idx, i in enumerate(nres.payload):
2180 if isinstance(i, (tuple, list)) and len(i) == 2:
2183 logging.warning("Invalid result from node %s, entry %d: %s",
2185 data.append((False, "Invalid result from the remote node"))
2187 for ((inst, _), status) in zip(disks, data):
2188 instdisk.setdefault(inst, {}).setdefault(nname, []).append(status)
2190 # Add empty entries for diskless instances.
2191 for inst in diskless_instances:
2192 assert inst not in instdisk
2195 assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
2196 len(nnames) <= len(instanceinfo[inst].all_nodes) and
2197 compat.all(isinstance(s, (tuple, list)) and
2198 len(s) == 2 for s in statuses)
2199 for inst, nnames in instdisk.items()
2200 for nname, statuses in nnames.items())
2201 assert set(instdisk) == set(instanceinfo), "instdisk consistency failure"
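# Added note (hypothetical names, shape taken from the assertions above):
# the instdisk mapping built by _CollectDiskInfo looks like
#   {"inst1": {"node1.example.com": [(True, payload), (True, payload)],
#              "node2.example.com": [(False, "node offline")]}}
# i.e. one (success, payload) tuple per configured disk, per node, with
# diskless instances getting an empty inner dictionary.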
2205 def _VerifyHVP(self, hvp_data):
2206 """Verifies locally the syntax of the hypervisor parameters.
2209 for item, hv_name, hv_params in hvp_data:
2210 msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
2213 hv_class = hypervisor.GetHypervisor(hv_name)
2214 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
2215 hv_class.CheckParameterSyntax(hv_params)
2216 except errors.GenericError, err:
2217 self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
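# Added note (illustrative, based on how hvp_data is built in Exec below):
# each entry is a (source, hypervisor_name, parameters) tuple, e.g.
#   ("cluster", "xen-pvm", {...}), ("os debian-etch", "xen-pvm", {...}),
#   ("instance inst1.example.com", "kvm", {...})
# so syntax errors can be reported together with the level they come from.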
2219 def BuildHooksEnv(self):
2222 Cluster-Verify hooks are run only in the post phase; if they fail, their
2223 output is logged in the verify output and the verification fails.
2229 "CLUSTER_TAGS": " ".join(cfg.GetClusterInfo().GetTags())
2232 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
2233 for node in cfg.GetAllNodesInfo().values())
2237 def BuildHooksNodes(self):
2238 """Build hooks nodes.
2241 return ([], self.cfg.GetNodeList())
2243 def Exec(self, feedback_fn):
2244 """Verify integrity of cluster, performing various test on nodes.
2247 # This method has too many local variables. pylint: disable-msg=R0914
2249 _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
2250 verbose = self.op.verbose
2251 self._feedback_fn = feedback_fn
2252 feedback_fn("* Verifying global settings")
2253 for msg in self.cfg.VerifyConfig():
2254 _ErrorIf(True, self.ECLUSTERCFG, None, msg)
2256 # Check the cluster certificates
2257 for cert_filename in constants.ALL_CERT_FILES:
2258 (errcode, msg) = _VerifyCertificate(cert_filename)
2259 _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
2261 vg_name = self.cfg.GetVGName()
2262 drbd_helper = self.cfg.GetDRBDHelper()
2263 hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
2264 cluster = self.cfg.GetClusterInfo()
2265 nodeinfo_byname = self.cfg.GetAllNodesInfo()
2266 nodelist = utils.NiceSort(nodeinfo_byname.keys())
2267 nodeinfo = [nodeinfo_byname[nname] for nname in nodelist]
2268 instanceinfo = self.cfg.GetAllInstancesInfo()
2269 instancelist = utils.NiceSort(instanceinfo.keys())
2270 groupinfo = self.cfg.GetAllNodeGroupsInfo()
2271 i_non_redundant = [] # Non redundant instances
2272 i_non_a_balanced = [] # Non auto-balanced instances
2273 n_offline = 0 # Count of offline nodes
2274 n_drained = 0 # Count of nodes being drained
2275 node_vol_should = {}
2277 # FIXME: verify OS list
2280 filemap = _ComputeAncillaryFiles(cluster, False)
2282 # do local checksums
2283 master_node = self.master_node = self.cfg.GetMasterNode()
2284 master_ip = self.cfg.GetMasterIP()
2286 # Compute the set of hypervisor parameters
2288 for hv_name in hypervisors:
2289 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
2290 for os_name, os_hvp in cluster.os_hvp.items():
2291 for hv_name, hv_params in os_hvp.items():
2294 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
2295 hvp_data.append(("os %s" % os_name, hv_name, full_params))
2296 # TODO: collapse identical parameter values into a single one
2297 for instance in instanceinfo.values():
2298 if not instance.hvparams:
2300 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
2301 cluster.FillHV(instance)))
2302 # and verify them locally
2303 self._VerifyHVP(hvp_data)
2305 feedback_fn("* Gathering data (%d nodes)" % len(nodelist))
2306 node_verify_param = {
2307 constants.NV_FILELIST:
2308 utils.UniqueSequence(filename
2309 for files in filemap
2310 for filename in files),
2311 constants.NV_NODELIST: [node.name for node in nodeinfo
2312 if not node.offline],
2313 constants.NV_HYPERVISOR: hypervisors,
2314 constants.NV_HVPARAMS: hvp_data,
2315 constants.NV_NODENETTEST: [(node.name, node.primary_ip,
2316 node.secondary_ip) for node in nodeinfo
2317 if not node.offline],
2318 constants.NV_INSTANCELIST: hypervisors,
2319 constants.NV_VERSION: None,
2320 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
2321 constants.NV_NODESETUP: None,
2322 constants.NV_TIME: None,
2323 constants.NV_MASTERIP: (master_node, master_ip),
2324 constants.NV_OSLIST: None,
2325 constants.NV_VMNODES: self.cfg.GetNonVmCapableNodeList(),
2328 if vg_name is not None:
2329 node_verify_param[constants.NV_VGLIST] = None
2330 node_verify_param[constants.NV_LVLIST] = vg_name
2331 node_verify_param[constants.NV_PVLIST] = [vg_name]
2332 node_verify_param[constants.NV_DRBDLIST] = None
2335 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
2338 # FIXME: this needs to be changed per node-group, not cluster-wide
2340 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2341 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2342 bridges.add(default_nicpp[constants.NIC_LINK])
2343 for instance in instanceinfo.values():
2344 for nic in instance.nics:
2345 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2346 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2347 bridges.add(full_nic[constants.NIC_LINK])
2350 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2352 # Build our expected cluster state
2353 node_image = dict((node.name, self.NodeImage(offline=node.offline,
2355 vm_capable=node.vm_capable))
2356 for node in nodeinfo)
2360 for node in nodeinfo:
2361 path = _SupportsOob(self.cfg, node)
2362 if path and path not in oob_paths:
2363 oob_paths.append(path)
2366 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2368 for instance in instancelist:
2369 inst_config = instanceinfo[instance]
2371 for nname in inst_config.all_nodes:
2372 if nname not in node_image:
2374 gnode = self.NodeImage(name=nname)
2376 node_image[nname] = gnode
2378 inst_config.MapLVsByNode(node_vol_should)
2380 pnode = inst_config.primary_node
2381 node_image[pnode].pinst.append(instance)
2383 for snode in inst_config.secondary_nodes:
2384 nimg = node_image[snode]
2385 nimg.sinst.append(instance)
2386 if pnode not in nimg.sbp:
2387 nimg.sbp[pnode] = []
2388 nimg.sbp[pnode].append(instance)
2390 # At this point, we have the in-memory data structures complete,
2391 # except for the runtime information, which we'll gather next
2393 # Due to the way our RPC system works, exact response times cannot be
2394 # guaranteed (e.g. a broken node could run into a timeout). By keeping the
2395 time before and after executing the request, we can at least have a time window.
2397 nvinfo_starttime = time.time()
2398 all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
2399 self.cfg.GetClusterName())
2400 nvinfo_endtime = time.time()
2402 all_drbd_map = self.cfg.ComputeDRBDMap()
2404 feedback_fn("* Gathering disk information (%s nodes)" % len(nodelist))
2405 instdisk = self._CollectDiskInfo(nodelist, node_image, instanceinfo)
2407 feedback_fn("* Verifying configuration file consistency")
2408 self._VerifyFiles(_ErrorIf, nodeinfo, master_node, all_nvinfo, filemap)
2410 feedback_fn("* Verifying node status")
2414 for node_i in nodeinfo:
2416 nimg = node_image[node]
2420 feedback_fn("* Skipping offline node %s" % (node,))
2424 if node == master_node:
2426 elif node_i.master_candidate:
2427 ntype = "master candidate"
2428 elif node_i.drained:
2434 feedback_fn("* Verifying node %s (%s)" % (node, ntype))
2436 msg = all_nvinfo[node].fail_msg
2437 _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
2439 nimg.rpc_fail = True
2442 nresult = all_nvinfo[node].payload
2444 nimg.call_ok = self._VerifyNode(node_i, nresult)
2445 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2446 self._VerifyNodeNetwork(node_i, nresult)
2447 self._VerifyOob(node_i, nresult)
2450 self._VerifyNodeLVM(node_i, nresult, vg_name)
2451 self._VerifyNodeDrbd(node_i, nresult, instanceinfo, drbd_helper,
2454 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2455 self._UpdateNodeInstances(node_i, nresult, nimg)
2456 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2457 self._UpdateNodeOS(node_i, nresult, nimg)
2458 if not nimg.os_fail:
2459 if refos_img is None:
2461 self._VerifyNodeOS(node_i, nimg, refos_img)
2462 self._VerifyNodeBridges(node_i, nresult, bridges)
2464 feedback_fn("* Verifying instance status")
2465 for instance in instancelist:
2467 feedback_fn("* Verifying instance %s" % instance)
2468 inst_config = instanceinfo[instance]
2469 self._VerifyInstance(instance, inst_config, node_image,
2471 inst_nodes_offline = []
2473 pnode = inst_config.primary_node
2474 pnode_img = node_image[pnode]
2475 _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
2476 self.ENODERPC, pnode, "instance %s, connection to"
2477 " primary node failed", instance)
2479 _ErrorIf(inst_config.admin_up and pnode_img.offline,
2480 self.EINSTANCEBADNODE, instance,
2481 "instance is marked as running and lives on offline node %s",
2482 inst_config.primary_node)
2484 # If the instance is non-redundant we cannot survive losing its primary
2485 # node, so we are not N+1 compliant. On the other hand we have no disk
2486 # templates with more than one secondary so that situation is not well handled.
2488 # FIXME: does not support file-backed instances
2489 if not inst_config.secondary_nodes:
2490 i_non_redundant.append(instance)
2492 _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
2493 instance, "instance has multiple secondary nodes: %s",
2494 utils.CommaJoin(inst_config.secondary_nodes),
2495 code=self.ETYPE_WARNING)
2497 if inst_config.disk_template in constants.DTS_INT_MIRROR:
2498 pnode = inst_config.primary_node
2499 instance_nodes = utils.NiceSort(inst_config.all_nodes)
2500 instance_groups = {}
2502 for node in instance_nodes:
2503 instance_groups.setdefault(nodeinfo_byname[node].group,
2507 "%s (group %s)" % (utils.CommaJoin(nodes), groupinfo[group].name)
2508 # Sort so that we always list the primary node first.
2509 for group, nodes in sorted(instance_groups.items(),
2510 key=lambda (_, nodes): pnode in nodes,
2513 self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS,
2514 instance, "instance has primary and secondary nodes in"
2515 " different groups: %s", utils.CommaJoin(pretty_list),
2516 code=self.ETYPE_WARNING)
2518 if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
2519 i_non_a_balanced.append(instance)
2521 for snode in inst_config.secondary_nodes:
2522 s_img = node_image[snode]
2523 _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
2524 "instance %s, connection to secondary node failed", instance)
2527 inst_nodes_offline.append(snode)
2529 # warn that the instance lives on offline nodes
2530 _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
2531 "instance has offline secondary node(s) %s",
2532 utils.CommaJoin(inst_nodes_offline))
2533 # ... or ghost/non-vm_capable nodes
2534 for node in inst_config.all_nodes:
2535 _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
2536 "instance lives on ghost node %s", node)
2537 _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
2538 instance, "instance lives on non-vm_capable node %s", node)
2540 feedback_fn("* Verifying orphan volumes")
2541 reserved = utils.FieldSet(*cluster.reserved_lvs)
2542 self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
2544 feedback_fn("* Verifying orphan instances")
2545 self._VerifyOrphanInstances(instancelist, node_image)
2547 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2548 feedback_fn("* Verifying N+1 Memory redundancy")
2549 self._VerifyNPlusOneMemory(node_image, instanceinfo)
2551 feedback_fn("* Other Notes")
2553 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
2554 % len(i_non_redundant))
2556 if i_non_a_balanced:
2557 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
2558 % len(i_non_a_balanced))
2561 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
2564 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
2568 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2569 """Analyze the post-hooks' result
2571 This method analyses the hook result, handles it, and sends some
2572 nicely-formatted feedback back to the user.
2574 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2575 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2576 @param hooks_results: the results of the multi-node hooks rpc call
2577 @param feedback_fn: function used to send feedback back to the caller
2578 @param lu_result: previous Exec result
2579 @return: the new Exec result, based on the previous result
2583 # We only really run POST phase hooks, and are only interested in
2585 if phase == constants.HOOKS_PHASE_POST:
2586 # Used to change hooks' output to proper indentation
2587 feedback_fn("* Hooks Results")
2588 assert hooks_results, "invalid result from hooks"
2590 for node_name in hooks_results:
2591 res = hooks_results[node_name]
2593 test = msg and not res.offline
2594 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2595 "Communication failure in hooks execution: %s", msg)
2596 if res.offline or msg:
2597 # No need to investigate payload if node is offline or gave an error.
2598 # override manually lu_result here as _ErrorIf only
2599 # overrides self.bad
2602 for script, hkr, output in res.payload:
2603 test = hkr == constants.HKR_FAIL
2604 self._ErrorIf(test, self.ENODEHOOKS, node_name,
2605 "Script %s failed, output:", script)
2607 output = self._HOOKS_INDENT_RE.sub(' ', output)
2608 feedback_fn("%s" % output)
2614 class LUClusterVerifyDisks(NoHooksLU):
2615 """Verifies the cluster disks status.
2620 def ExpandNames(self):
2621 self.needed_locks = {
2622 locking.LEVEL_NODE: locking.ALL_SET,
2623 locking.LEVEL_INSTANCE: locking.ALL_SET,
2625 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2627 def Exec(self, feedback_fn):
2628 """Verify integrity of cluster disks.
2630 @rtype: tuple of three items
2631 @return: a tuple of (dict of node-to-node_error, list of instances
2632 which need activate-disks, dict of instance: (node, volume) for
2636 result = res_nodes, res_instances, res_missing = {}, [], {}
2638 nodes = utils.NiceSort(self.cfg.GetVmCapableNodeList())
2639 instances = self.cfg.GetAllInstancesInfo().values()
2642 for inst in instances:
2644 if not inst.admin_up:
2646 inst.MapLVsByNode(inst_lvs)
2647 # transform { iname: {node: [vol,],},} to {(node, vol): iname}
2648 for node, vol_list in inst_lvs.iteritems():
2649 for vol in vol_list:
2650 nv_dict[(node, vol)] = inst
2655 node_lvs = self.rpc.call_lv_list(nodes, [])
2656 for node, node_res in node_lvs.items():
2657 if node_res.offline:
2659 msg = node_res.fail_msg
2661 logging.warning("Error enumerating LVs on node %s: %s", node, msg)
2662 res_nodes[node] = msg
2665 lvs = node_res.payload
2666 for lv_name, (_, _, lv_online) in lvs.items():
2667 inst = nv_dict.pop((node, lv_name), None)
2668 if (not lv_online and inst is not None
2669 and inst.name not in res_instances):
2670 res_instances.append(inst.name)
2672 # any leftover items in nv_dict are missing LVs, let's arrange the data better
2674 for key, inst in nv_dict.iteritems():
2675 if inst.name not in res_missing:
2676 res_missing[inst.name] = []
2677 res_missing[inst.name].append(key)
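# Added note (illustrative names): nv_dict maps (node, lv_name) tuples to
# instance objects; entries still present after matching the LVs reported by
# the nodes are recorded as missing, so res_missing ends up looking like
#   {"inst1": [("node1.example.com", "xenvg/disk0_data"), ...]}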
2682 class LUClusterRepairDiskSizes(NoHooksLU):
2683 """Verifies the cluster disks sizes.
2688 def ExpandNames(self):
2689 if self.op.instances:
2690 self.wanted_names = _GetWantedInstances(self, self.op.instances)
2691 self.needed_locks = {
2692 locking.LEVEL_NODE: [],
2693 locking.LEVEL_INSTANCE: self.wanted_names,
2695 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
2697 self.wanted_names = None
2698 self.needed_locks = {
2699 locking.LEVEL_NODE: locking.ALL_SET,
2700 locking.LEVEL_INSTANCE: locking.ALL_SET,
2702 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
2704 def DeclareLocks(self, level):
2705 if level == locking.LEVEL_NODE and self.wanted_names is not None:
2706 self._LockInstancesNodes(primary_only=True)
2708 def CheckPrereq(self):
2709 """Check prerequisites.
2711 This only checks the optional instance list against the existing names.
2714 if self.wanted_names is None:
2715 self.wanted_names = self.glm.list_owned(locking.LEVEL_INSTANCE)
2717 self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
2718 in self.wanted_names]
2720 def _EnsureChildSizes(self, disk):
2721 """Ensure children of the disk have the needed disk size.
2723 This is valid mainly for DRBD8 and fixes an issue where the
2724 children have a smaller disk size than the parent.
2726 @param disk: an L{ganeti.objects.Disk} object
2729 if disk.dev_type == constants.LD_DRBD8:
2730 assert disk.children, "Empty children for DRBD8?"
2731 fchild = disk.children[0]
2732 mismatch = fchild.size < disk.size
2734 self.LogInfo("Child disk has size %d, parent %d, fixing",
2735 fchild.size, disk.size)
2736 fchild.size = disk.size
2738 # and we recurse on this child only, not on the metadev
2739 return self._EnsureChildSizes(fchild) or mismatch
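# Added example (hypothetical sizes): for a DRBD8 disk of size 10240 whose
# first child (the data device) reports only 10112, _EnsureChildSizes grows
# the child to 10240 and returns True so the caller knows the configuration
# needs to be written back.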
2743 def Exec(self, feedback_fn):
2744 """Verify the size of cluster disks.
2747 # TODO: check child disks too
2748 # TODO: check differences in size between primary/secondary nodes
2750 for instance in self.wanted_instances:
2751 pnode = instance.primary_node
2752 if pnode not in per_node_disks:
2753 per_node_disks[pnode] = []
2754 for idx, disk in enumerate(instance.disks):
2755 per_node_disks[pnode].append((instance, idx, disk))
2758 for node, dskl in per_node_disks.items():
2759 newl = [v[2].Copy() for v in dskl]
2761 self.cfg.SetDiskID(dsk, node)
2762 result = self.rpc.call_blockdev_getsize(node, newl)
2764 self.LogWarning("Failure in blockdev_getsize call to node"
2765 " %s, ignoring", node)
2767 if len(result.payload) != len(dskl):
2768 logging.warning("Invalid result from node %s: len(dksl)=%d,"
2769 " result.payload=%s", node, len(dskl), result.payload)
2770 self.LogWarning("Invalid result from node %s, ignoring node results",
2773 for ((instance, idx, disk), size) in zip(dskl, result.payload):
2775 self.LogWarning("Disk %d of instance %s did not return size"
2776 " information, ignoring", idx, instance.name)
2778 if not isinstance(size, (int, long)):
2779 self.LogWarning("Disk %d of instance %s did not return valid"
2780 " size information, ignoring", idx, instance.name)
2783 if size != disk.size:
2784 self.LogInfo("Disk %d of instance %s has mismatched size,"
2785 " correcting: recorded %d, actual %d", idx,
2786 instance.name, disk.size, size)
2788 self.cfg.Update(instance, feedback_fn)
2789 changed.append((instance.name, idx, size))
2790 if self._EnsureChildSizes(disk):
2791 self.cfg.Update(instance, feedback_fn)
2792 changed.append((instance.name, idx, disk.size))
2796 class LUClusterRename(LogicalUnit):
2797 """Rename the cluster.
2800 HPATH = "cluster-rename"
2801 HTYPE = constants.HTYPE_CLUSTER
2803 def BuildHooksEnv(self):
2808 "OP_TARGET": self.cfg.GetClusterName(),
2809 "NEW_NAME": self.op.name,
2812 def BuildHooksNodes(self):
2813 """Build hooks nodes.
2816 return ([self.cfg.GetMasterNode()], self.cfg.GetNodeList())
2818 def CheckPrereq(self):
2819 """Verify that the passed name is a valid one.
2822 hostname = netutils.GetHostname(name=self.op.name,
2823 family=self.cfg.GetPrimaryIPFamily())
2825 new_name = hostname.name
2826 self.ip = new_ip = hostname.ip
2827 old_name = self.cfg.GetClusterName()
2828 old_ip = self.cfg.GetMasterIP()
2829 if new_name == old_name and new_ip == old_ip:
2830 raise errors.OpPrereqError("Neither the name nor the IP address of the"
2831 " cluster has changed",
2833 if new_ip != old_ip:
2834 if netutils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
2835 raise errors.OpPrereqError("The given cluster IP address (%s) is"
2836 " reachable on the network" %
2837 new_ip, errors.ECODE_NOTUNIQUE)
2839 self.op.name = new_name
2841 def Exec(self, feedback_fn):
2842 """Rename the cluster.
2845 clustername = self.op.name
2848 # shutdown the master IP
2849 master = self.cfg.GetMasterNode()
2850 result = self.rpc.call_node_stop_master(master, False)
2851 result.Raise("Could not disable the master role")
2854 cluster = self.cfg.GetClusterInfo()
2855 cluster.cluster_name = clustername
2856 cluster.master_ip = ip
2857 self.cfg.Update(cluster, feedback_fn)
2859 # update the known hosts file
2860 ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
2861 node_list = self.cfg.GetOnlineNodeList()
2863 node_list.remove(master)
2866 _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
2868 result = self.rpc.call_node_start_master(master, False, False)
2869 msg = result.fail_msg
2871 self.LogWarning("Could not re-enable the master role on"
2872 " the master, please restart manually: %s", msg)
2877 class LUClusterSetParams(LogicalUnit):
2878 """Change the parameters of the cluster.
2881 HPATH = "cluster-modify"
2882 HTYPE = constants.HTYPE_CLUSTER
2885 def CheckArguments(self):
2889 if self.op.uid_pool:
2890 uidpool.CheckUidPool(self.op.uid_pool)
2892 if self.op.add_uids:
2893 uidpool.CheckUidPool(self.op.add_uids)
2895 if self.op.remove_uids:
2896 uidpool.CheckUidPool(self.op.remove_uids)
2898 def ExpandNames(self):
2899 # FIXME: in the future maybe other cluster params won't require checking on
2900 # all nodes to be modified.
2901 self.needed_locks = {
2902 locking.LEVEL_NODE: locking.ALL_SET,
2904 self.share_locks[locking.LEVEL_NODE] = 1
2906 def BuildHooksEnv(self):
2911 "OP_TARGET": self.cfg.GetClusterName(),
2912 "NEW_VG_NAME": self.op.vg_name,
2915 def BuildHooksNodes(self):
2916 """Build hooks nodes.
2919 mn = self.cfg.GetMasterNode()
2922 def CheckPrereq(self):
2923 """Check prerequisites.
2925 This checks that the given parameters don't conflict and
2926 that the given volume group is valid.
2929 if self.op.vg_name is not None and not self.op.vg_name:
2930 if self.cfg.HasAnyDiskOfType(constants.LD_LV):
2931 raise errors.OpPrereqError("Cannot disable lvm storage while lvm-based"
2932 " instances exist", errors.ECODE_INVAL)
2934 if self.op.drbd_helper is not None and not self.op.drbd_helper:
2935 if self.cfg.HasAnyDiskOfType(constants.LD_DRBD8):
2936 raise errors.OpPrereqError("Cannot disable drbd helper while"
2937 " drbd-based instances exist",
2940 node_list = self.glm.list_owned(locking.LEVEL_NODE)
2942 # if vg_name not None, checks given volume group on all nodes
2944 vglist = self.rpc.call_vg_list(node_list)
2945 for node in node_list:
2946 msg = vglist[node].fail_msg
2948 # ignoring down node
2949 self.LogWarning("Error while gathering data on node %s"
2950 " (ignoring node): %s", node, msg)
2952 vgstatus = utils.CheckVolumeGroupSize(vglist[node].payload,
2954 constants.MIN_VG_SIZE)
2956 raise errors.OpPrereqError("Error on node '%s': %s" %
2957 (node, vgstatus), errors.ECODE_ENVIRON)
2959 if self.op.drbd_helper:
2960 # checks given drbd helper on all nodes
2961 helpers = self.rpc.call_drbd_helper(node_list)
2962 for node in node_list:
2963 ninfo = self.cfg.GetNodeInfo(node)
2965 self.LogInfo("Not checking drbd helper on offline node %s", node)
2967 msg = helpers[node].fail_msg
2969 raise errors.OpPrereqError("Error checking drbd helper on node"
2970 " '%s': %s" % (node, msg),
2971 errors.ECODE_ENVIRON)
2972 node_helper = helpers[node].payload
2973 if node_helper != self.op.drbd_helper:
2974 raise errors.OpPrereqError("Error on node '%s': drbd helper is %s" %
2975 (node, node_helper), errors.ECODE_ENVIRON)
2977 self.cluster = cluster = self.cfg.GetClusterInfo()
2978 # validate params changes
2979 if self.op.beparams:
2980 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
2981 self.new_beparams = cluster.SimpleFillBE(self.op.beparams)
2983 if self.op.ndparams:
2984 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
2985 self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
2987 # TODO: we need a more general way to handle resetting
2988 # cluster-level parameters to default values
2989 if self.new_ndparams["oob_program"] == "":
2990 self.new_ndparams["oob_program"] = \
2991 constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
2993 if self.op.nicparams:
2994 utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
2995 self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
2996 objects.NIC.CheckParameterSyntax(self.new_nicparams)
2999 # check all instances for consistency
3000 for instance in self.cfg.GetAllInstancesInfo().values():
3001 for nic_idx, nic in enumerate(instance.nics):
3002 params_copy = copy.deepcopy(nic.nicparams)
3003 params_filled = objects.FillDict(self.new_nicparams, params_copy)
3005 # check parameter syntax
3007 objects.NIC.CheckParameterSyntax(params_filled)
3008 except errors.ConfigurationError, err:
3009 nic_errors.append("Instance %s, nic/%d: %s" %
3010 (instance.name, nic_idx, err))
3012 # if we're moving instances to routed, check that they have an ip
3013 target_mode = params_filled[constants.NIC_MODE]
3014 if target_mode == constants.NIC_MODE_ROUTED and not nic.ip:
3015 nic_errors.append("Instance %s, nic/%d: routed NIC with no ip"
3016 " address" % (instance.name, nic_idx))
3018 raise errors.OpPrereqError("Cannot apply the change, errors:\n%s" %
3019 "\n".join(nic_errors))
3021 # hypervisor list/parameters
3022 self.new_hvparams = new_hvp = objects.FillDict(cluster.hvparams, {})
3023 if self.op.hvparams:
3024 for hv_name, hv_dict in self.op.hvparams.items():
3025 if hv_name not in self.new_hvparams:
3026 self.new_hvparams[hv_name] = hv_dict
3028 self.new_hvparams[hv_name].update(hv_dict)
3030 # os hypervisor parameters
3031 self.new_os_hvp = objects.FillDict(cluster.os_hvp, {})
3033 for os_name, hvs in self.op.os_hvp.items():
3034 if os_name not in self.new_os_hvp:
3035 self.new_os_hvp[os_name] = hvs
3037 for hv_name, hv_dict in hvs.items():
3038 if hv_name not in self.new_os_hvp[os_name]:
3039 self.new_os_hvp[os_name][hv_name] = hv_dict
3041 self.new_os_hvp[os_name][hv_name].update(hv_dict)
3044 self.new_osp = objects.FillDict(cluster.osparams, {})
3045 if self.op.osparams:
3046 for os_name, osp in self.op.osparams.items():
3047 if os_name not in self.new_osp:
3048 self.new_osp[os_name] = {}
3050 self.new_osp[os_name] = _GetUpdatedParams(self.new_osp[os_name], osp,
3053 if not self.new_osp[os_name]:
3054 # we removed all parameters
3055 del self.new_osp[os_name]
3057 # check the parameter validity (remote check)
3058 _CheckOSParams(self, False, [self.cfg.GetMasterNode()],
3059 os_name, self.new_osp[os_name])
3061 # changes to the hypervisor list
3062 if self.op.enabled_hypervisors is not None:
3063 self.hv_list = self.op.enabled_hypervisors
3064 for hv in self.hv_list:
3065 # if the hypervisor doesn't already exist in the cluster
3066 # hvparams, we initialize it to empty, and then (in both
3067 # cases) we make sure to fill the defaults, as we might not
3068 # have a complete defaults list if the hypervisor wasn't
3070 if hv not in new_hvp:
3072 new_hvp[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], new_hvp[hv])
3073 utils.ForceDictType(new_hvp[hv], constants.HVS_PARAMETER_TYPES)
3075 self.hv_list = cluster.enabled_hypervisors
3077 if self.op.hvparams or self.op.enabled_hypervisors is not None:
3078 # either the enabled list has changed, or the parameters have, validate
3079 for hv_name, hv_params in self.new_hvparams.items():
3080 if ((self.op.hvparams and hv_name in self.op.hvparams) or
3081 (self.op.enabled_hypervisors and
3082 hv_name in self.op.enabled_hypervisors)):
3083 # either this is a new hypervisor, or its parameters have changed
3084 hv_class = hypervisor.GetHypervisor(hv_name)
3085 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3086 hv_class.CheckParameterSyntax(hv_params)
3087 _CheckHVParams(self, node_list, hv_name, hv_params)
3090 # no need to check any newly-enabled hypervisors, since the
3091 # defaults have already been checked in the above code-block
3092 for os_name, os_hvp in self.new_os_hvp.items():
3093 for hv_name, hv_params in os_hvp.items():
3094 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
3095 # we need to fill in the new os_hvp on top of the actual hv_p
3096 cluster_defaults = self.new_hvparams.get(hv_name, {})
3097 new_osp = objects.FillDict(cluster_defaults, hv_params)
3098 hv_class = hypervisor.GetHypervisor(hv_name)
3099 hv_class.CheckParameterSyntax(new_osp)
3100 _CheckHVParams(self, node_list, hv_name, new_osp)
3102 if self.op.default_iallocator:
3103 alloc_script = utils.FindFile(self.op.default_iallocator,
3104 constants.IALLOCATOR_SEARCH_PATH,
3106 if alloc_script is None:
3107 raise errors.OpPrereqError("Invalid default iallocator script '%s'"
3108 " specified" % self.op.default_iallocator,
3111 def Exec(self, feedback_fn):
3112 """Change the parameters of the cluster.
3115 if self.op.vg_name is not None:
3116 new_volume = self.op.vg_name
3119 if new_volume != self.cfg.GetVGName():
3120 self.cfg.SetVGName(new_volume)
3122 feedback_fn("Cluster LVM configuration already in desired"
3123 " state, not changing")
3124 if self.op.drbd_helper is not None:
3125 new_helper = self.op.drbd_helper
3128 if new_helper != self.cfg.GetDRBDHelper():
3129 self.cfg.SetDRBDHelper(new_helper)
3131 feedback_fn("Cluster DRBD helper already in desired state,"
3133 if self.op.hvparams:
3134 self.cluster.hvparams = self.new_hvparams
3136 self.cluster.os_hvp = self.new_os_hvp
3137 if self.op.enabled_hypervisors is not None:
3138 self.cluster.hvparams = self.new_hvparams
3139 self.cluster.enabled_hypervisors = self.op.enabled_hypervisors
3140 if self.op.beparams:
3141 self.cluster.beparams[constants.PP_DEFAULT] = self.new_beparams
3142 if self.op.nicparams:
3143 self.cluster.nicparams[constants.PP_DEFAULT] = self.new_nicparams
3144 if self.op.osparams:
3145 self.cluster.osparams = self.new_osp
3146 if self.op.ndparams:
3147 self.cluster.ndparams = self.new_ndparams
3149 if self.op.candidate_pool_size is not None:
3150 self.cluster.candidate_pool_size = self.op.candidate_pool_size
3151 # we need to update the pool size here, otherwise the save will fail
3152 _AdjustCandidatePool(self, [])
3154 if self.op.maintain_node_health is not None:
3155 self.cluster.maintain_node_health = self.op.maintain_node_health
3157 if self.op.prealloc_wipe_disks is not None:
3158 self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
3160 if self.op.add_uids is not None:
3161 uidpool.AddToUidPool(self.cluster.uid_pool, self.op.add_uids)
3163 if self.op.remove_uids is not None:
3164 uidpool.RemoveFromUidPool(self.cluster.uid_pool, self.op.remove_uids)
3166 if self.op.uid_pool is not None:
3167 self.cluster.uid_pool = self.op.uid_pool
3169 if self.op.default_iallocator is not None:
3170 self.cluster.default_iallocator = self.op.default_iallocator
3172 if self.op.reserved_lvs is not None:
3173 self.cluster.reserved_lvs = self.op.reserved_lvs
3175 def helper_os(aname, mods, desc):
3177 lst = getattr(self.cluster, aname)
3178 for key, val in mods:
3179 if key == constants.DDM_ADD:
3181 feedback_fn("OS %s already in %s, ignoring" % (val, desc))
3184 elif key == constants.DDM_REMOVE:
3188 feedback_fn("OS %s not found in %s, ignoring" % (val, desc))
3190 raise errors.ProgrammerError("Invalid modification '%s'" % key)
3192 if self.op.hidden_os:
3193 helper_os("hidden_os", self.op.hidden_os, "hidden")
3195 if self.op.blacklisted_os:
3196 helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
3198 if self.op.master_netdev:
3199 master = self.cfg.GetMasterNode()
3200 feedback_fn("Shutting down master ip on the current netdev (%s)" %
3201 self.cluster.master_netdev)
3202 result = self.rpc.call_node_stop_master(master, False)
3203 result.Raise("Could not disable the master ip")
3204 feedback_fn("Changing master_netdev from %s to %s" %
3205 (self.cluster.master_netdev, self.op.master_netdev))
3206 self.cluster.master_netdev = self.op.master_netdev
3208 self.cfg.Update(self.cluster, feedback_fn)
3210 if self.op.master_netdev:
3211 feedback_fn("Starting the master ip on the new master netdev (%s)" %
3212 self.op.master_netdev)
3213 result = self.rpc.call_node_start_master(master, False, False)
3215 self.LogWarning("Could not re-enable the master ip on"
3216 " the master, please restart manually: %s",
3220 def _UploadHelper(lu, nodes, fname):
3221 """Helper for uploading a file and showing warnings.
3224 if os.path.exists(fname):
3225 result = lu.rpc.call_upload_file(nodes, fname)
3226 for to_node, to_result in result.items():
3227 msg = to_result.fail_msg
3229 msg = ("Copy of file %s to node %s failed: %s" %
3230 (fname, to_node, msg))
3231 lu.proc.LogWarning(msg)
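# Added usage sketch (mirroring the call sites in this module): replicate a
# single ancillary file, e.g. the known_hosts file, to a list of node names
# while only warning on per-node failures:
#
#   _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)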
3234 def _ComputeAncillaryFiles(cluster, redist):
3235 """Compute files external to Ganeti which need to be consistent.
3237 @type redist: boolean
3238 @param redist: Whether to include files which need to be redistributed
3241 # Compute files for all nodes
3243 constants.SSH_KNOWN_HOSTS_FILE,
3244 constants.CONFD_HMAC_KEY,
3245 constants.CLUSTER_DOMAIN_SECRET_FILE,
3249 files_all.update(constants.ALL_CERT_FILES)
3250 files_all.update(ssconf.SimpleStore().GetFileList())
3252 if cluster.modify_etc_hosts:
3253 files_all.add(constants.ETC_HOSTS)
3255 # Files which must either exist on all nodes or on none
3256 files_all_opt = set([
3257 constants.RAPI_USERS_FILE,
3260 # Files which should only be on master candidates
3263 files_mc.add(constants.CLUSTER_CONF_FILE)
3265 # Files which should only be on VM-capable nodes
3266 files_vm = set(filename
3267 for hv_name in cluster.enabled_hypervisors
3268 for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles())
3270 # Filenames must be unique
3271 assert (len(files_all | files_all_opt | files_mc | files_vm) ==
3272 sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
3273 "Found file listed in more than one file list"
3275 return (files_all, files_all_opt, files_mc, files_vm)
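# Added usage sketch (unpacking as done by the callers in this module):
#
#   (files_all, files_all_opt, files_mc, files_vm) = \
#     _ComputeAncillaryFiles(cluster, True)
#
#   files_all      - must be identical on every node
#   files_all_opt  - must exist on all nodes or on none
#   files_mc       - only on master candidates
#   files_vm       - only on VM-capable nodes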
3278 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
3279 """Distribute additional files which are part of the cluster configuration.
3281 ConfigWriter takes care of distributing the config and ssconf files, but
3282 there are more files which should be distributed to all nodes. This function
3283 makes sure those are copied.
3285 @param lu: calling logical unit
3286 @param additional_nodes: list of nodes not in the config to distribute to
3287 @type additional_vm: boolean
3288 @param additional_vm: whether the additional nodes are vm-capable or not
3291 # Gather target nodes
3292 cluster = lu.cfg.GetClusterInfo()
3293 master_info = lu.cfg.GetNodeInfo(lu.cfg.GetMasterNode())
3295 online_nodes = lu.cfg.GetOnlineNodeList()
3296 vm_nodes = lu.cfg.GetVmCapableNodeList()
3298 if additional_nodes is not None:
3299 online_nodes.extend(additional_nodes)
3301 vm_nodes.extend(additional_nodes)
3303 # Never distribute to master node
3304 for nodelist in [online_nodes, vm_nodes]:
3305 if master_info.name in nodelist:
3306 nodelist.remove(master_info.name)
3309 (files_all, files_all_opt, files_mc, files_vm) = \
3310 _ComputeAncillaryFiles(cluster, True)
3312 # Never re-distribute the configuration file from here
3313 assert not (constants.CLUSTER_CONF_FILE in files_all or
3314 constants.CLUSTER_CONF_FILE in files_vm)
3315 assert not files_mc, "Master candidates not handled in this function"
3318 (online_nodes, files_all),
3319 (online_nodes, files_all_opt),
3320 (vm_nodes, files_vm),
3324 for (node_list, files) in filemap:
3326 _UploadHelper(lu, node_list, fname)
3329 class LUClusterRedistConf(NoHooksLU):
3330 """Force the redistribution of cluster configuration.
3332 This is a very simple LU.
3337 def ExpandNames(self):
3338 self.needed_locks = {
3339 locking.LEVEL_NODE: locking.ALL_SET,
3341 self.share_locks[locking.LEVEL_NODE] = 1
3343 def Exec(self, feedback_fn):
3344 """Redistribute the configuration.
3347 self.cfg.Update(self.cfg.GetClusterInfo(), feedback_fn)
3348 _RedistributeAncillaryFiles(self)
3351 def _WaitForSync(lu, instance, disks=None, oneshot=False):
3352 """Sleep and poll for an instance's disk to sync.
3355 if not instance.disks or disks is not None and not disks:
3358 disks = _ExpandCheckDisks(instance, disks)
3361 lu.proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
3363 node = instance.primary_node
3366 lu.cfg.SetDiskID(dev, node)
3368 # TODO: Convert to utils.Retry
3371 degr_retries = 10 # in seconds, as we sleep 1 second each time
3375 cumul_degraded = False
3376 rstats = lu.rpc.call_blockdev_getmirrorstatus(node, disks)
3377 msg = rstats.fail_msg
3379 lu.LogWarning("Can't get any data from node %s: %s", node, msg)
3382 raise errors.RemoteError("Can't contact node %s for mirror data,"
3383 " aborting." % node)
3386 rstats = rstats.payload
3388 for i, mstat in enumerate(rstats):
3390 lu.LogWarning("Can't compute data for node %s/%s",
3391 node, disks[i].iv_name)
3394 cumul_degraded = (cumul_degraded or
3395 (mstat.is_degraded and mstat.sync_percent is None))
3396 if mstat.sync_percent is not None:
3398 if mstat.estimated_time is not None:
3399 rem_time = ("%s remaining (estimated)" %
3400 utils.FormatSeconds(mstat.estimated_time))
3401 max_time = mstat.estimated_time
3403 rem_time = "no time estimate"
3404 lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
3405 (disks[i].iv_name, mstat.sync_percent, rem_time))
3407 # if we're done but degraded, let's do a few small retries, to
3408 # make sure we see a stable and not transient situation; therefore
3409 # we force restart of the loop
3410 if (done or oneshot) and cumul_degraded and degr_retries > 0:
3411 logging.info("Degraded disks found, %d retries left", degr_retries)
3419 time.sleep(min(60, max_time))
3422 lu.proc.LogInfo("Instance %s's disks are in sync." % instance.name)
3423 return not cumul_degraded
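# Added note (summarising the loop above, not part of the original
# docstring): _WaitForSync returns True once no disk is left degraded and
# False otherwise; a hypothetical caller would typically do
#   disk_abort = not _WaitForSync(lu, instance)
# and decide whether to continue based on that flag.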
3426 def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
3427 """Check that mirrors are not degraded.
3429 The ldisk parameter, if True, will change the test from the
3430 is_degraded attribute (which represents overall non-ok status for
3431 the device(s)) to the ldisk (representing the local storage status).
3434 lu.cfg.SetDiskID(dev, node)
3438 if on_primary or dev.AssembleOnSecondary():
3439 rstats = lu.rpc.call_blockdev_find(node, dev)
3440 msg = rstats.fail_msg
3442 lu.LogWarning("Can't find disk on node %s: %s", node, msg)
3444 elif not rstats.payload:
3445 lu.LogWarning("Can't find disk on node %s", node)
3449 result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
3451 result = result and not rstats.payload.is_degraded
3454 for child in dev.children:
3455 result = result and _CheckDiskConsistency(lu, child, node, on_primary)
3460 class LUOobCommand(NoHooksLU):
3461 """Logical unit for OOB handling.
3465 _SKIP_MASTER = (constants.OOB_POWER_OFF, constants.OOB_POWER_CYCLE)
3467 def ExpandNames(self):
3468 """Gather locks we need.
3471 if self.op.node_names:
3472 self.op.node_names = _GetWantedNodes(self, self.op.node_names)
3473 lock_names = self.op.node_names
3475 lock_names = locking.ALL_SET
3477 self.needed_locks = {
3478 locking.LEVEL_NODE: lock_names,
3481 def CheckPrereq(self):
3482 """Check prerequisites.
3485 - the node exists in the configuration
3488 Any errors are signaled by raising errors.OpPrereqError.
3492 self.master_node = self.cfg.GetMasterNode()
3494 assert self.op.power_delay >= 0.0
3496 if self.op.node_names:
3497 if (self.op.command in self._SKIP_MASTER and
3498 self.master_node in self.op.node_names):
3499 master_node_obj = self.cfg.GetNodeInfo(self.master_node)
3500 master_oob_handler = _SupportsOob(self.cfg, master_node_obj)
3502 if master_oob_handler:
3503 additional_text = ("run '%s %s %s' if you want to operate on the"
3504 " master regardless") % (master_oob_handler,
3508 additional_text = "it does not support out-of-band operations"
3510 raise errors.OpPrereqError(("Operating on the master node %s is not"
3511 " allowed for %s; %s") %
3512 (self.master_node, self.op.command,
3513 additional_text), errors.ECODE_INVAL)
3515 self.op.node_names = self.cfg.GetNodeList()
3516 if self.op.command in self._SKIP_MASTER:
3517 self.op.node_names.remove(self.master_node)
3519 if self.op.command in self._SKIP_MASTER:
3520 assert self.master_node not in self.op.node_names
3522 for node_name in self.op.node_names:
3523 node = self.cfg.GetNodeInfo(node_name)
3526 raise errors.OpPrereqError("Node %s not found" % node_name,
3529 self.nodes.append(node)
3531 if (not self.op.ignore_status and
3532 (self.op.command == constants.OOB_POWER_OFF and not node.offline)):
3533 raise errors.OpPrereqError(("Cannot power off node %s because it is"
3534 " not marked offline") % node_name,
3537 def Exec(self, feedback_fn):
3538 """Execute OOB and return result if we expect any.
3541 master_node = self.master_node
3544 for idx, node in enumerate(utils.NiceSort(self.nodes,
3545 key=lambda node: node.name)):
3546 node_entry = [(constants.RS_NORMAL, node.name)]
3547 ret.append(node_entry)
3549 oob_program = _SupportsOob(self.cfg, node)
3552 node_entry.append((constants.RS_UNAVAIL, None))
3555 logging.info("Executing out-of-band command '%s' using '%s' on %s",
3556 self.op.command, oob_program, node.name)
3557 result = self.rpc.call_run_oob(master_node, oob_program,
3558 self.op.command, node.name,
3562 self.LogWarning("Out-of-band RPC failed on node '%s': %s",
3563 node.name, result.fail_msg)
3564 node_entry.append((constants.RS_NODATA, None))
3567 self._CheckPayload(result)
3568 except errors.OpExecError, err:
3569 self.LogWarning("Payload returned by node '%s' is not valid: %s",
3571 node_entry.append((constants.RS_NODATA, None))
3573 if self.op.command == constants.OOB_HEALTH:
3574 # For health we should log important events
3575 for item, status in result.payload:
3576 if status in [constants.OOB_STATUS_WARNING,
3577 constants.OOB_STATUS_CRITICAL]:
3578 self.LogWarning("Item '%s' on node '%s' has status '%s'",
3579 item, node.name, status)
3581 if self.op.command == constants.OOB_POWER_ON:
3583 elif self.op.command == constants.OOB_POWER_OFF:
3584 node.powered = False
3585 elif self.op.command == constants.OOB_POWER_STATUS:
3586 powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
3587 if powered != node.powered:
3588 logging.warning(("Recorded power state (%s) of node '%s' does not"
3589 " match actual power state (%s)"), node.powered,
3592 # For configuration changing commands we should update the node
3593 if self.op.command in (constants.OOB_POWER_ON,
3594 constants.OOB_POWER_OFF):
3595 self.cfg.Update(node, feedback_fn)
3597 node_entry.append((constants.RS_NORMAL, result.payload))
3599 if (self.op.command == constants.OOB_POWER_ON and
3600 idx < len(self.nodes) - 1):
3601 time.sleep(self.op.power_delay)
3605 def _CheckPayload(self, result):
3606 """Checks if the payload is valid.
3608 @param result: RPC result
3609 @raises errors.OpExecError: If payload is not valid
3613 if self.op.command == constants.OOB_HEALTH:
3614 if not isinstance(result.payload, list):
3615 errs.append("command 'health' is expected to return a list but got %s" %
3616 type(result.payload))
3618 for item, status in result.payload:
3619 if status not in constants.OOB_STATUSES:
3620 errs.append("health item '%s' has invalid status '%s'" %
3623 if self.op.command == constants.OOB_POWER_STATUS:
3624 if not isinstance(result.payload, dict):
3625 errs.append("power-status is expected to return a dict but got %s" %
3626 type(result.payload))
3628 if self.op.command in [
3629 constants.OOB_POWER_ON,
3630 constants.OOB_POWER_OFF,
3631 constants.OOB_POWER_CYCLE,
3633 if result.payload is not None:
3634 errs.append("%s is expected to not return payload but got '%s'" %
3635 (self.op.command, result.payload))
3638 raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
3639 utils.CommaJoin(errs))
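# Added note (example payloads inferred from the checks above):
#   OOB_HEALTH       -> a list of (item, status) pairs, with status taken
#                       from constants.OOB_STATUSES
#   OOB_POWER_STATUS -> a dict containing constants.OOB_POWER_STATUS_POWERED
#   OOB_POWER_ON/OFF/CYCLE -> no payload at all (None)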
3641 class _OsQuery(_QueryBase):
3642 FIELDS = query.OS_FIELDS
3644 def ExpandNames(self, lu):
3645 # Lock all nodes in shared mode
3646 # Temporary removal of locks, should be reverted later
3647 # TODO: reintroduce locks when they are lighter-weight
3648 lu.needed_locks = {}
3649 #self.share_locks[locking.LEVEL_NODE] = 1
3650 #self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
3652 # The following variables interact with _QueryBase._GetNames
3654 self.wanted = self.names
3656 self.wanted = locking.ALL_SET
3658 self.do_locking = self.use_locking
3660 def DeclareLocks(self, lu, level):
3664 def _DiagnoseByOS(rlist):
3665 """Remaps a per-node return list into an a per-os per-node dictionary
3667 @param rlist: a map with node names as keys and OS objects as values
3670 @return: a dictionary with osnames as keys and as value another
3671 map, with nodes as keys and tuples of (path, status, diagnose,
3672 variants, parameters, api_versions) as values, eg::
3674 {"debian-etch": {"node1": [(/usr/lib/..., True, "", [], []),
3675 (/srv/..., False, "invalid api")],
3676 "node2": [(/srv/..., True, "", [], [])]}
3681 # we build here the list of nodes that didn't fail the RPC (at RPC
3682 # level), so that nodes with a non-responding node daemon don't
3683 # make all OSes invalid
3684 good_nodes = [node_name for node_name in rlist
3685 if not rlist[node_name].fail_msg]
3686 for node_name, nr in rlist.items():
3687 if nr.fail_msg or not nr.payload:
3689 for (name, path, status, diagnose, variants,
3690 params, api_versions) in nr.payload:
3691 if name not in all_os:
3692 # build a list of nodes for this os containing empty lists
3693 # for each node in node_list
3695 for nname in good_nodes:
3696 all_os[name][nname] = []
3697 # convert params from [name, help] to (name, help)
3698 params = [tuple(v) for v in params]
3699 all_os[name][node_name].append((path, status, diagnose,
3700 variants, params, api_versions))
3703 def _GetQueryData(self, lu):
3704 """Computes the list of nodes and their attributes.
3707 # Locking is not used
3708 assert not (compat.any(lu.glm.is_owned(level)
3709 for level in locking.LEVELS
3710 if level != locking.LEVEL_CLUSTER) or
3711 self.do_locking or self.use_locking)
3713 valid_nodes = [node.name
3714 for node in lu.cfg.GetAllNodesInfo().values()
3715 if not node.offline and node.vm_capable]
3716 pol = self._DiagnoseByOS(lu.rpc.call_os_diagnose(valid_nodes))
3717 cluster = lu.cfg.GetClusterInfo()
3721 for (os_name, os_data) in pol.items():
3722 info = query.OsInfo(name=os_name, valid=True, node_status=os_data,
3723 hidden=(os_name in cluster.hidden_os),
3724 blacklisted=(os_name in cluster.blacklisted_os))
3728 api_versions = set()
3730 for idx, osl in enumerate(os_data.values()):
3731 info.valid = bool(info.valid and osl and osl[0][1])
3735 (node_variants, node_params, node_api) = osl[0][3:6]
3738 variants.update(node_variants)
3739 parameters.update(node_params)
3740 api_versions.update(node_api)
3742 # Filter out inconsistent values
3743 variants.intersection_update(node_variants)
3744 parameters.intersection_update(node_params)
3745 api_versions.intersection_update(node_api)
3747 info.variants = list(variants)
3748 info.parameters = list(parameters)
3749 info.api_versions = list(api_versions)
3751 data[os_name] = info
3753 # Prepare data in requested order
3754 return [data[name] for name in self._GetNames(lu, pol.keys(), None)
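# Added note (clarifying the aggregation above): variants, parameters and
# api_versions start from the first node's values and are then narrowed with
# intersection_update, so only values reported consistently by every node end
# up in the resulting OsInfo objects.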
3758 class LUOsDiagnose(NoHooksLU):
3759 """Logical unit for OS diagnose/query.
3765 def _BuildFilter(fields, names):
3766 """Builds a filter for querying OSes.
3769 name_filter = qlang.MakeSimpleFilter("name", names)
3771 # Legacy behaviour: Hide hidden, blacklisted or invalid OSes if the
3772 # respective field is not requested
3773 status_filter = [[qlang.OP_NOT, [qlang.OP_TRUE, fname]]
3774 for fname in ["hidden", "blacklisted"]
3775 if fname not in fields]
3776 if "valid" not in fields:
3777 status_filter.append([qlang.OP_TRUE, "valid"])
3780 status_filter.insert(0, qlang.OP_AND)
3782 status_filter = None
3784 if name_filter and status_filter:
3785 return [qlang.OP_AND, name_filter, status_filter]
3789 return status_filter
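# Illustrative sketch (not part of the original module): the kind of nested
# list filter _BuildFilter composes.  Asking for the names ["lenny",
# "squeeze"] without requesting the "hidden"/"blacklisted"/"valid" fields
# would give roughly the structure below; the operator symbols are written
# out literally here, while the real code uses the qlang.OP_* constants.
def _ExampleOsFilter():
  name_filter = ["|", ["=", "name", "lenny"], ["=", "name", "squeeze"]]
  status_filter = ["&", ["!", ["?", "hidden"]],
                        ["!", ["?", "blacklisted"]],
                        ["?", "valid"]]
  return ["&", name_filter, status_filter]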
3791 def CheckArguments(self):
3792 self.oq = _OsQuery(self._BuildFilter(self.op.output_fields, self.op.names),
3793 self.op.output_fields, False)
3795 def ExpandNames(self):
3796 self.oq.ExpandNames(self)
3798 def Exec(self, feedback_fn):
3799 return self.oq.OldStyleQuery(self)
3802 class LUNodeRemove(LogicalUnit):
3803 """Logical unit for removing a node.
3806 HPATH = "node-remove"
3807 HTYPE = constants.HTYPE_NODE
3809 def BuildHooksEnv(self):
3812 This doesn't run on the target node in the pre phase as a failed
3813 node would then be impossible to remove.
3817 "OP_TARGET": self.op.node_name,
3818 "NODE_NAME": self.op.node_name,
3821 def BuildHooksNodes(self):
3822 """Build hooks nodes.
3825 all_nodes = self.cfg.GetNodeList()
3827 all_nodes.remove(self.op.node_name)
3829 logging.warning("Node '%s', which is about to be removed, was not found"
3830 " in the list of all nodes", self.op.node_name)
3831 return (all_nodes, all_nodes)
3833 def CheckPrereq(self):
3834 """Check prerequisites.
3837 - the node exists in the configuration
3838 - it does not have primary or secondary instances
3839 - it's not the master
3841 Any errors are signaled by raising errors.OpPrereqError.
3844 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
3845 node = self.cfg.GetNodeInfo(self.op.node_name)
3846 assert node is not None
3848 instance_list = self.cfg.GetInstanceList()
3850 masternode = self.cfg.GetMasterNode()
3851 if node.name == masternode:
3852 raise errors.OpPrereqError("Node is the master node, failover to another"
3853 " node is required", errors.ECODE_INVAL)
3855 for instance_name in instance_list:
3856 instance = self.cfg.GetInstanceInfo(instance_name)
3857 if node.name in instance.all_nodes:
3858 raise errors.OpPrereqError("Instance %s is still running on the node,"
3859 " please remove first" % instance_name,
3861 self.op.node_name = node.name
3864 def Exec(self, feedback_fn):
3865 """Removes the node from the cluster.
3869 logging.info("Stopping the node daemon and removing configs from node %s",
3872 modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
3874 # Promote nodes to master candidate as needed
3875 _AdjustCandidatePool(self, exceptions=[node.name])
3876 self.context.RemoveNode(node.name)
3878 # Run post hooks on the node before it's removed
3879 _RunPostHook(self, node.name)
3881 result = self.rpc.call_node_leave_cluster(node.name, modify_ssh_setup)
3882 msg = result.fail_msg
3884 self.LogWarning("Errors encountered on the remote node while leaving"
3885 " the cluster: %s", msg)
3887 # Remove node from our /etc/hosts
3888 if self.cfg.GetClusterInfo().modify_etc_hosts:
3889 master_node = self.cfg.GetMasterNode()
3890 result = self.rpc.call_etc_hosts_modify(master_node,
3891 constants.ETC_HOSTS_REMOVE,
3893 result.Raise("Can't update hosts file with new host data")
3894 _RedistributeAncillaryFiles(self)
3897 class _NodeQuery(_QueryBase):
3898 FIELDS = query.NODE_FIELDS
3900 def ExpandNames(self, lu):
3901 lu.needed_locks = {}
3902 lu.share_locks[locking.LEVEL_NODE] = 1
3905 self.wanted = _GetWantedNodes(lu, self.names)
3907 self.wanted = locking.ALL_SET
3909 self.do_locking = (self.use_locking and
3910 query.NQ_LIVE in self.requested_data)
3913 # if we don't request only static fields, we need to lock the nodes
3914 lu.needed_locks[locking.LEVEL_NODE] = self.wanted
3916 def DeclareLocks(self, lu, level):
3919 def _GetQueryData(self, lu):
3920 """Computes the list of nodes and their attributes.
3923 all_info = lu.cfg.GetAllNodesInfo()
3925 nodenames = self._GetNames(lu, all_info.keys(), locking.LEVEL_NODE)
3927 # Gather data as requested
3928 if query.NQ_LIVE in self.requested_data:
3929 # filter out non-vm_capable nodes
3930 toquery_nodes = [name for name in nodenames if all_info[name].vm_capable]
3932 node_data = lu.rpc.call_node_info(toquery_nodes, lu.cfg.GetVGName(),
3933 lu.cfg.GetHypervisorType())
3934 live_data = dict((name, nresult.payload)
3935 for (name, nresult) in node_data.items()
3936 if not nresult.fail_msg and nresult.payload)
3940 if query.NQ_INST in self.requested_data:
3941 node_to_primary = dict([(name, set()) for name in nodenames])
3942 node_to_secondary = dict([(name, set()) for name in nodenames])
3944 inst_data = lu.cfg.GetAllInstancesInfo()
3946 for inst in inst_data.values():
3947 if inst.primary_node in node_to_primary:
3948 node_to_primary[inst.primary_node].add(inst.name)
3949 for secnode in inst.secondary_nodes:
3950 if secnode in node_to_secondary:
3951 node_to_secondary[secnode].add(inst.name)
3953 node_to_primary = None
3954 node_to_secondary = None
3956 if query.NQ_OOB in self.requested_data:
3957 oob_support = dict((name, bool(_SupportsOob(lu.cfg, node)))
3958 for name, node in all_info.iteritems())
3962 if query.NQ_GROUP in self.requested_data:
3963 groups = lu.cfg.GetAllNodeGroupsInfo()
3967 return query.NodeQueryData([all_info[name] for name in nodenames],
3968 live_data, lu.cfg.GetMasterNode(),
3969 node_to_primary, node_to_secondary, groups,
3970 oob_support, lu.cfg.GetClusterInfo())
3973 class LUNodeQuery(NoHooksLU):
3974 """Logical unit for querying nodes.
3977 # pylint: disable-msg=W0142
3980 def CheckArguments(self):
3981 self.nq = _NodeQuery(qlang.MakeSimpleFilter("name", self.op.names),
3982 self.op.output_fields, self.op.use_locking)
3984 def ExpandNames(self):
3985 self.nq.ExpandNames(self)
3987 def Exec(self, feedback_fn):
3988 return self.nq.OldStyleQuery(self)
3991 class LUNodeQueryvols(NoHooksLU):
3992 """Logical unit for getting volumes on node(s).
3996 _FIELDS_DYNAMIC = utils.FieldSet("phys", "vg", "name", "size", "instance")
3997 _FIELDS_STATIC = utils.FieldSet("node")
3999 def CheckArguments(self):
4000 _CheckOutputFields(static=self._FIELDS_STATIC,
4001 dynamic=self._FIELDS_DYNAMIC,
4002 selected=self.op.output_fields)
4004 def ExpandNames(self):
4005 self.needed_locks = {}
4006 self.share_locks[locking.LEVEL_NODE] = 1
4007 if not self.op.nodes:
4008 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4010 self.needed_locks[locking.LEVEL_NODE] = \
4011 _GetWantedNodes(self, self.op.nodes)
4013 def Exec(self, feedback_fn):
4014 """Computes the list of volumes and their attributes.
4017 nodenames = self.glm.list_owned(locking.LEVEL_NODE)
4018 volumes = self.rpc.call_node_volumes(nodenames)
4020 ilist = [self.cfg.GetInstanceInfo(iname) for iname
4021 in self.cfg.GetInstanceList()]
4023 lv_by_node = dict([(inst, inst.MapLVsByNode()) for inst in ilist])
4026 for node in nodenames:
4027 nresult = volumes[node]
4030 msg = nresult.fail_msg
4032 self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
4035 node_vols = nresult.payload[:]
4036 node_vols.sort(key=lambda vol: vol['dev'])
4038 for vol in node_vols:
4040 for field in self.op.output_fields:
4043 elif field == "phys":
4047 elif field == "name":
4049 elif field == "size":
4050 val = int(float(vol['size']))
4051 elif field == "instance":
4053 if node not in lv_by_node[inst]:
4055 if vol['name'] in lv_by_node[inst][node]:
4061 raise errors.ParameterError(field)
4062 node_output.append(str(val))
4064 output.append(node_output)
4069 class LUNodeQueryStorage(NoHooksLU):
4070 """Logical unit for getting information on storage units on node(s).
4073 _FIELDS_STATIC = utils.FieldSet(constants.SF_NODE)
4076 def CheckArguments(self):
4077 _CheckOutputFields(static=self._FIELDS_STATIC,
4078 dynamic=utils.FieldSet(*constants.VALID_STORAGE_FIELDS),
4079 selected=self.op.output_fields)
4081 def ExpandNames(self):
4082 self.needed_locks = {}
4083 self.share_locks[locking.LEVEL_NODE] = 1
4086 self.needed_locks[locking.LEVEL_NODE] = \
4087 _GetWantedNodes(self, self.op.nodes)
4089 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
4091 def Exec(self, feedback_fn):
4092 """Computes the list of storage units and their attributes.
4095 self.nodes = self.glm.list_owned(locking.LEVEL_NODE)
4097 # Always get name to sort by
4098 if constants.SF_NAME in self.op.output_fields:
4099 fields = self.op.output_fields[:]
4101 fields = [constants.SF_NAME] + self.op.output_fields
4103 # Never ask for node or type as it's only known to the LU
4104 for extra in [constants.SF_NODE, constants.SF_TYPE]:
4105 while extra in fields:
4106 fields.remove(extra)
4108 field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
4109 name_idx = field_idx[constants.SF_NAME]
4111 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4112 data = self.rpc.call_storage_list(self.nodes,
4113 self.op.storage_type, st_args,
4114 self.op.name, fields)
4118 for node in utils.NiceSort(self.nodes):
4119 nresult = data[node]
4123 msg = nresult.fail_msg
4125 self.LogWarning("Can't get storage data from node %s: %s", node, msg)
4128 rows = dict([(row[name_idx], row) for row in nresult.payload])
4130 for name in utils.NiceSort(rows.keys()):
4135 for field in self.op.output_fields:
4136 if field == constants.SF_NODE:
4138 elif field == constants.SF_TYPE:
4139 val = self.op.storage_type
4140 elif field in field_idx:
4141 val = row[field_idx[field]]
4143 raise errors.ParameterError(field)
4152 class _InstanceQuery(_QueryBase):
4153 FIELDS = query.INSTANCE_FIELDS
4155 def ExpandNames(self, lu):
4156 lu.needed_locks = {}
4157 lu.share_locks[locking.LEVEL_INSTANCE] = 1
4158 lu.share_locks[locking.LEVEL_NODE] = 1
4161 self.wanted = _GetWantedInstances(lu, self.names)
4163 self.wanted = locking.ALL_SET
4165 self.do_locking = (self.use_locking and
4166 query.IQ_LIVE in self.requested_data)
4168 lu.needed_locks[locking.LEVEL_INSTANCE] = self.wanted
4169 lu.needed_locks[locking.LEVEL_NODE] = []
4170 lu.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
4172 def DeclareLocks(self, lu, level):
4173 if level == locking.LEVEL_NODE and self.do_locking:
4174 lu._LockInstancesNodes() # pylint: disable-msg=W0212
4176 def _GetQueryData(self, lu):
4177 """Computes the list of instances and their attributes.
4180 cluster = lu.cfg.GetClusterInfo()
4181 all_info = lu.cfg.GetAllInstancesInfo()
4183 instance_names = self._GetNames(lu, all_info.keys(), locking.LEVEL_INSTANCE)
4185 instance_list = [all_info[name] for name in instance_names]
4186 nodes = frozenset(itertools.chain(*(inst.all_nodes
4187 for inst in instance_list)))
4188 hv_list = list(set([inst.hypervisor for inst in instance_list]))
4191 wrongnode_inst = set()
4193 # Gather data as requested
4194 if self.requested_data & set([query.IQ_LIVE, query.IQ_CONSOLE]):
4196 node_data = lu.rpc.call_all_instances_info(nodes, hv_list)
4198 result = node_data[name]
4200 # offline nodes will be in both lists
4201 assert result.fail_msg
4202 offline_nodes.append(name)
4204 bad_nodes.append(name)
4205 elif result.payload:
4206 for inst in result.payload:
4207 if inst in all_info:
4208 if all_info[inst].primary_node == name:
4209 live_data.update(result.payload)
4211 wrongnode_inst.add(inst)
4213 # orphan instance; we don't list it here as we don't
4214 # handle this case yet in the output of instance listing
4215 logging.warning("Orphan instance '%s' found on node %s",
4217 # else no instance is alive
4221 if query.IQ_DISKUSAGE in self.requested_data:
4222 disk_usage = dict((inst.name,
4223 _ComputeDiskSize(inst.disk_template,
4224 [{constants.IDISK_SIZE: disk.size}
4225 for disk in inst.disks]))
4226 for inst in instance_list)
4230 if query.IQ_CONSOLE in self.requested_data:
4232 for inst in instance_list:
4233 if inst.name in live_data:
4234 # Instance is running
4235 consinfo[inst.name] = _GetInstanceConsole(cluster, inst)
4237 consinfo[inst.name] = None
4238 assert set(consinfo.keys()) == set(instance_names)
4242 return query.InstanceQueryData(instance_list, lu.cfg.GetClusterInfo(),
4243 disk_usage, offline_nodes, bad_nodes,
4244 live_data, wrongnode_inst, consinfo)
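# Illustrative sketch (not part of the original module): the classification
# applied to the all-instances RPC results above.  Instances reported by a
# node that is not their configured primary are flagged as "wrong node";
# instances unknown to the configuration are orphans.  Names below are
# hypothetical.
def _ExampleClassifyLiveInstances():
  configured_primary = {"inst1": "node1", "inst2": "node2"}  # from the config
  reported = {"node1": ["inst1", "inst2", "ghost"]}          # from the RPC
  wrongnode, orphans, live = set(), set(), set()
  for node, instances in reported.items():
    for inst in instances:
      if inst not in configured_primary:
        orphans.add(inst)                  # known to the node, not to us
      elif configured_primary[inst] != node:
        wrongnode.add(inst)                # running on the wrong node
      else:
        live.add(inst)
  return live, wrongnode, orphans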
4247 class LUQuery(NoHooksLU):
4248 """Query for resources/items of a certain kind.
4251 # pylint: disable-msg=W0142
4254 def CheckArguments(self):
4255 qcls = _GetQueryImplementation(self.op.what)
4257 self.impl = qcls(self.op.filter, self.op.fields, False)
4259 def ExpandNames(self):
4260 self.impl.ExpandNames(self)
4262 def DeclareLocks(self, level):
4263 self.impl.DeclareLocks(self, level)
4265 def Exec(self, feedback_fn):
4266 return self.impl.NewStyleQuery(self)
4269 class LUQueryFields(NoHooksLU):
4270 """Query for resources/items of a certain kind.
4273 # pylint: disable-msg=W0142
4276 def CheckArguments(self):
4277 self.qcls = _GetQueryImplementation(self.op.what)
4279 def ExpandNames(self):
4280 self.needed_locks = {}
4282 def Exec(self, feedback_fn):
4283 return query.QueryFields(self.qcls.FIELDS, self.op.fields)
4286 class LUNodeModifyStorage(NoHooksLU):
4287 """Logical unit for modifying a storage volume on a node.
4292 def CheckArguments(self):
4293 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4295 storage_type = self.op.storage_type
4298 modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
4300 raise errors.OpPrereqError("Storage units of type '%s' can not be"
4301 " modified" % storage_type,
4304 diff = set(self.op.changes.keys()) - modifiable
4306 raise errors.OpPrereqError("The following fields can not be modified for"
4307 " storage units of type '%s': %r" %
4308 (storage_type, list(diff)),
4311 def ExpandNames(self):
4312 self.needed_locks = {
4313 locking.LEVEL_NODE: self.op.node_name,
4316 def Exec(self, feedback_fn):
4317 """Modifies the requested storage unit on the node.
4320 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
4321 result = self.rpc.call_storage_modify(self.op.node_name,
4322 self.op.storage_type, st_args,
4323 self.op.name, self.op.changes)
4324 result.Raise("Failed to modify storage unit '%s' on %s" %
4325 (self.op.name, self.op.node_name))
4328 class LUNodeAdd(LogicalUnit):
4329 """Logical unit for adding a node to the cluster.
4333 HTYPE = constants.HTYPE_NODE
4334 _NFLAGS = ["master_capable", "vm_capable"]
4336 def CheckArguments(self):
4337 self.primary_ip_family = self.cfg.GetPrimaryIPFamily()
4338 # validate/normalize the node name
4339 self.hostname = netutils.GetHostname(name=self.op.node_name,
4340 family=self.primary_ip_family)
4341 self.op.node_name = self.hostname.name
4343 if self.op.readd and self.op.node_name == self.cfg.GetMasterNode():
4344 raise errors.OpPrereqError("Cannot readd the master node",
4347 if self.op.readd and self.op.group:
4348 raise errors.OpPrereqError("Cannot pass a node group when a node is"
4349 " being readded", errors.ECODE_INVAL)
4351 def BuildHooksEnv(self):
4354 This will run on all nodes before, and on all nodes + the new node after.
4358 "OP_TARGET": self.op.node_name,
4359 "NODE_NAME": self.op.node_name,
4360 "NODE_PIP": self.op.primary_ip,
4361 "NODE_SIP": self.op.secondary_ip,
4362 "MASTER_CAPABLE": str(self.op.master_capable),
4363 "VM_CAPABLE": str(self.op.vm_capable),
4366 def BuildHooksNodes(self):
4367 """Build hooks nodes.
4370 # Exclude added node
4371 pre_nodes = list(set(self.cfg.GetNodeList()) - set([self.op.node_name]))
4372 post_nodes = pre_nodes + [self.op.node_name, ]
4374 return (pre_nodes, post_nodes)
4376 def CheckPrereq(self):
4377 """Check prerequisites.
4380 - the new node is not already in the config
4382 - its parameters (single/dual homed) match the cluster
4384 Any errors are signaled by raising errors.OpPrereqError.
4388 hostname = self.hostname
4389 node = hostname.name
4390 primary_ip = self.op.primary_ip = hostname.ip
4391 if self.op.secondary_ip is None:
4392 if self.primary_ip_family == netutils.IP6Address.family:
4393 raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
4394 " IPv4 address must be given as secondary",
4396 self.op.secondary_ip = primary_ip
4398 secondary_ip = self.op.secondary_ip
4399 if not netutils.IP4Address.IsValid(secondary_ip):
4400 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4401 " address" % secondary_ip, errors.ECODE_INVAL)
4403 node_list = cfg.GetNodeList()
4404 if not self.op.readd and node in node_list:
4405 raise errors.OpPrereqError("Node %s is already in the configuration" %
4406 node, errors.ECODE_EXISTS)
4407 elif self.op.readd and node not in node_list:
4408 raise errors.OpPrereqError("Node %s is not in the configuration" % node,
4411 self.changed_primary_ip = False
4413 for existing_node_name in node_list:
4414 existing_node = cfg.GetNodeInfo(existing_node_name)
4416 if self.op.readd and node == existing_node_name:
4417 if existing_node.secondary_ip != secondary_ip:
4418 raise errors.OpPrereqError("Readded node doesn't have the same IP"
4419 " address configuration as before",
4421 if existing_node.primary_ip != primary_ip:
4422 self.changed_primary_ip = True
4426 if (existing_node.primary_ip == primary_ip or
4427 existing_node.secondary_ip == primary_ip or
4428 existing_node.primary_ip == secondary_ip or
4429 existing_node.secondary_ip == secondary_ip):
4430 raise errors.OpPrereqError("New node ip address(es) conflict with"
4431 " existing node %s" % existing_node.name,
4432 errors.ECODE_NOTUNIQUE)
4434 # After this 'if' block, None is no longer a valid value for the
4435 # _capable op attributes
4437 old_node = self.cfg.GetNodeInfo(node)
4438 assert old_node is not None, "Can't retrieve locked node %s" % node
4439 for attr in self._NFLAGS:
4440 if getattr(self.op, attr) is None:
4441 setattr(self.op, attr, getattr(old_node, attr))
4443 for attr in self._NFLAGS:
4444 if getattr(self.op, attr) is None:
4445 setattr(self.op, attr, True)
4447 if self.op.readd and not self.op.vm_capable:
4448 pri, sec = cfg.GetNodeInstances(node)
4450 raise errors.OpPrereqError("Node %s being re-added with vm_capable"
4451 " flag set to false, but it already holds"
4452 " instances" % node,
4455 # check that the type of the node (single versus dual homed) is the
4456 # same as for the master
4457 myself = cfg.GetNodeInfo(self.cfg.GetMasterNode())
4458 master_singlehomed = myself.secondary_ip == myself.primary_ip
4459 newbie_singlehomed = secondary_ip == primary_ip
4460 if master_singlehomed != newbie_singlehomed:
4461 if master_singlehomed:
4462 raise errors.OpPrereqError("The master has no secondary ip but the"
4463 " new node has one",
4466 raise errors.OpPrereqError("The master has a secondary ip but the"
4467 " new node doesn't have one",
4470 # checks reachability
4471 if not netutils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
4472 raise errors.OpPrereqError("Node not reachable by ping",
4473 errors.ECODE_ENVIRON)
4475 if not newbie_singlehomed:
4476 # check reachability from my secondary ip to newbie's secondary ip
4477 if not netutils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
4478 source=myself.secondary_ip):
4479 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
4480 " based ping to node daemon port",
4481 errors.ECODE_ENVIRON)
4488 if self.op.master_capable:
4489 self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
4491 self.master_candidate = False
4494 self.new_node = old_node
4496 node_group = cfg.LookupNodeGroup(self.op.group)
4497 self.new_node = objects.Node(name=node,
4498 primary_ip=primary_ip,
4499 secondary_ip=secondary_ip,
4500 master_candidate=self.master_candidate,
4501 offline=False, drained=False,
4504 if self.op.ndparams:
4505 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
4507 def Exec(self, feedback_fn):
4508 """Adds the new node to the cluster.
4511 new_node = self.new_node
4512 node = new_node.name
4514 # We are adding a new node, so we assume it is powered on
4515 new_node.powered = True
4517 # for re-adds, reset the offline/drained/master-candidate flags;
4518 # we need to reset here, otherwise offline would prevent RPC calls
4519 # later in the procedure; this also means that if the re-add
4520 # fails, we are left with a non-offlined, broken node
4522 new_node.drained = new_node.offline = False # pylint: disable-msg=W0201
4523 self.LogInfo("Readding a node, the offline/drained flags were reset")
4524 # if we demote the node, we do cleanup later in the procedure
4525 new_node.master_candidate = self.master_candidate
4526 if self.changed_primary_ip:
4527 new_node.primary_ip = self.op.primary_ip
4529 # copy the master/vm_capable flags
4530 for attr in self._NFLAGS:
4531 setattr(new_node, attr, getattr(self.op, attr))
4533 # notify the user about any possible mc promotion
4534 if new_node.master_candidate:
4535 self.LogInfo("Node will be a master candidate")
4537 if self.op.ndparams:
4538 new_node.ndparams = self.op.ndparams
4540 new_node.ndparams = {}
4542 # check connectivity
4543 result = self.rpc.call_version([node])[node]
4544 result.Raise("Can't get version information from node %s" % node)
4545 if constants.PROTOCOL_VERSION == result.payload:
4546 logging.info("Communication to node %s fine, sw version %s match",
4547 node, result.payload)
4549 raise errors.OpExecError("Version mismatch master version %s,"
4550 " node version %s" %
4551 (constants.PROTOCOL_VERSION, result.payload))
4553 # Add node to our /etc/hosts, and add key to known_hosts
4554 if self.cfg.GetClusterInfo().modify_etc_hosts:
4555 master_node = self.cfg.GetMasterNode()
4556 result = self.rpc.call_etc_hosts_modify(master_node,
4557 constants.ETC_HOSTS_ADD,
4560 result.Raise("Can't update hosts file with new host data")
4562 if new_node.secondary_ip != new_node.primary_ip:
4563 _CheckNodeHasSecondaryIP(self, new_node.name, new_node.secondary_ip,
4566 node_verify_list = [self.cfg.GetMasterNode()]
4567 node_verify_param = {
4568 constants.NV_NODELIST: [node],
4569 # TODO: do a node-net-test as well?
4572 result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
4573 self.cfg.GetClusterName())
4574 for verifier in node_verify_list:
4575 result[verifier].Raise("Cannot communicate with node %s" % verifier)
4576 nl_payload = result[verifier].payload[constants.NV_NODELIST]
4578 for failed in nl_payload:
4579 feedback_fn("ssh/hostname verification failed"
4580 " (checking from %s): %s" %
4581 (verifier, nl_payload[failed]))
4582 raise errors.OpExecError("ssh/hostname verification failed")
4585 _RedistributeAncillaryFiles(self)
4586 self.context.ReaddNode(new_node)
4587 # make sure we redistribute the config
4588 self.cfg.Update(new_node, feedback_fn)
4589 # and make sure the new node will not have old files around
4590 if not new_node.master_candidate:
4591 result = self.rpc.call_node_demote_from_mc(new_node.name)
4592 msg = result.fail_msg
4594 self.LogWarning("Node failed to demote itself from master"
4595 " candidate status: %s" % msg)
4597 _RedistributeAncillaryFiles(self, additional_nodes=[node],
4598 additional_vm=self.op.vm_capable)
4599 self.context.AddNode(new_node, self.proc.GetECId())
4602 class LUNodeSetParams(LogicalUnit):
4603 """Modifies the parameters of a node.
4605 @cvar _F2R: a dictionary from tuples of flags (mc, drained, offline)
4606 to the node role (as _ROLE_*)
4607 @cvar _R2F: a dictionary from node role to tuples of flags
4608 @cvar _FLAGS: a list of attribute names corresponding to the flags
4611 HPATH = "node-modify"
4612 HTYPE = constants.HTYPE_NODE
4614 (_ROLE_CANDIDATE, _ROLE_DRAINED, _ROLE_OFFLINE, _ROLE_REGULAR) = range(4)
4616 (True, False, False): _ROLE_CANDIDATE,
4617 (False, True, False): _ROLE_DRAINED,
4618 (False, False, True): _ROLE_OFFLINE,
4619 (False, False, False): _ROLE_REGULAR,
4621 _R2F = dict((v, k) for k, v in _F2R.items())
4622 _FLAGS = ["master_candidate", "drained", "offline"]
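# Illustrative sketch (not part of the original module): how the
# (master_candidate, drained, offline) flag tuple maps to a single role and
# back, mirroring _F2R/_R2F above with plain literals.
def _ExampleRoleMapping():
  ROLE_CANDIDATE, ROLE_DRAINED, ROLE_OFFLINE, ROLE_REGULAR = range(4)
  flags_to_role = {
    (True, False, False): ROLE_CANDIDATE,
    (False, True, False): ROLE_DRAINED,
    (False, False, True): ROLE_OFFLINE,
    (False, False, False): ROLE_REGULAR,
  }
  role_to_flags = dict((role, flags) for flags, role in flags_to_role.items())
  # a drained node has exactly the "drained" flag set
  assert role_to_flags[flags_to_role[(False, True, False)]] == (False, True, False)
  return flags_to_role, role_to_flags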
4624 def CheckArguments(self):
4625 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4626 all_mods = [self.op.offline, self.op.master_candidate, self.op.drained,
4627 self.op.master_capable, self.op.vm_capable,
4628 self.op.secondary_ip, self.op.ndparams]
4629 if all_mods.count(None) == len(all_mods):
4630 raise errors.OpPrereqError("Please pass at least one modification",
4632 if all_mods.count(True) > 1:
4633 raise errors.OpPrereqError("Can't set the node into more than one"
4634 " state at the same time",
4637 # Boolean value that tells us whether we might be demoting from MC
4638 self.might_demote = (self.op.master_candidate == False or
4639 self.op.offline == True or
4640 self.op.drained == True or
4641 self.op.master_capable == False)
4643 if self.op.secondary_ip:
4644 if not netutils.IP4Address.IsValid(self.op.secondary_ip):
4645 raise errors.OpPrereqError("Secondary IP (%s) needs to be a valid IPv4"
4646 " address" % self.op.secondary_ip,
4649 self.lock_all = self.op.auto_promote and self.might_demote
4650 self.lock_instances = self.op.secondary_ip is not None
4652 def ExpandNames(self):
4654 self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
4656 self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
4658 if self.lock_instances:
4659 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
4661 def DeclareLocks(self, level):
4662 # If we have locked all instances, before waiting to lock nodes, release
4663 # all the ones living on nodes unrelated to the current operation.
4664 if level == locking.LEVEL_NODE and self.lock_instances:
4665 self.affected_instances = []
4666 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
4669 # Build list of instances to release
4670 for instance_name in self.glm.list_owned(locking.LEVEL_INSTANCE):
4671 instance = self.context.cfg.GetInstanceInfo(instance_name)
4672 if (instance.disk_template in constants.DTS_INT_MIRROR and
4673 self.op.node_name in instance.all_nodes):
4674 instances_keep.append(instance_name)
4675 self.affected_instances.append(instance)
4677 _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
4679 assert (set(self.glm.list_owned(locking.LEVEL_INSTANCE)) ==
4680 set(instances_keep))
4682 def BuildHooksEnv(self):
4685 This runs on the master node.
4689 "OP_TARGET": self.op.node_name,
4690 "MASTER_CANDIDATE": str(self.op.master_candidate),
4691 "OFFLINE": str(self.op.offline),
4692 "DRAINED": str(self.op.drained),
4693 "MASTER_CAPABLE": str(self.op.master_capable),
4694 "VM_CAPABLE": str(self.op.vm_capable),
4697 def BuildHooksNodes(self):
4698 """Build hooks nodes.
4701 nl = [self.cfg.GetMasterNode(), self.op.node_name]
4704 def CheckPrereq(self):
4705 """Check prerequisites.
4707 This checks the requested parameter changes against the node's current state and the cluster configuration.
4710 node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
4712 if (self.op.master_candidate is not None or
4713 self.op.drained is not None or
4714 self.op.offline is not None):
4715 # we can't change the master's node flags
4716 if self.op.node_name == self.cfg.GetMasterNode():
4717 raise errors.OpPrereqError("The master role can be changed"
4718 " only via master-failover",
4721 if self.op.master_candidate and not node.master_capable:
4722 raise errors.OpPrereqError("Node %s is not master capable, cannot make"
4723 " it a master candidate" % node.name,
4726 if self.op.vm_capable == False:
4727 (ipri, isec) = self.cfg.GetNodeInstances(self.op.node_name)
4729 raise errors.OpPrereqError("Node %s hosts instances, cannot unset"
4730 " the vm_capable flag" % node.name,
4733 if node.master_candidate and self.might_demote and not self.lock_all:
4734 assert not self.op.auto_promote, "auto_promote set but lock_all not"
4735 # check if, after removing the current node, we're missing master candidates
4737 (mc_remaining, mc_should, _) = \
4738 self.cfg.GetMasterCandidateStats(exceptions=[node.name])
4739 if mc_remaining < mc_should:
4740 raise errors.OpPrereqError("Not enough master candidates, please"
4741 " pass auto promote option to allow"
4742 " promotion", errors.ECODE_STATE)
4744 self.old_flags = old_flags = (node.master_candidate,
4745 node.drained, node.offline)
4746 assert old_flags in self._F2R, "Un-handled old flags %s" % str(old_flags)
4747 self.old_role = old_role = self._F2R[old_flags]
4749 # Check for ineffective changes
4750 for attr in self._FLAGS:
4751 if (getattr(self.op, attr) == False and getattr(node, attr) == False):
4752 self.LogInfo("Ignoring request to unset flag %s, already unset", attr)
4753 setattr(self.op, attr, None)
4755 # Past this point, any flag change to False means a transition
4756 # away from the respective state, as only real changes are kept
4758 # TODO: We might query the real power state if it supports OOB
4759 if _SupportsOob(self.cfg, node):
4760 if self.op.offline is False and not (node.powered or
4761 self.op.powered == True):
4762 raise errors.OpPrereqError(("Node %s needs to be turned on before its"
4763 " offline status can be reset") %
4765 elif self.op.powered is not None:
4766 raise errors.OpPrereqError(("Unable to change powered state for node %s"
4767 " as it does not support out-of-band"
4768 " handling") % self.op.node_name)
4770 # If we're being de-offlined or drained, we'll promote ourselves to MC if needed
4771 if (self.op.drained == False or self.op.offline == False or
4772 (self.op.master_capable and not node.master_capable)):
4773 if _DecideSelfPromotion(self):
4774 self.op.master_candidate = True
4775 self.LogInfo("Auto-promoting node to master candidate")
4777 # If we're no longer master capable, we'll demote ourselves from MC
4778 if self.op.master_capable == False and node.master_candidate:
4779 self.LogInfo("Demoting from master candidate")
4780 self.op.master_candidate = False
4783 assert [getattr(self.op, attr) for attr in self._FLAGS].count(True) <= 1
4784 if self.op.master_candidate:
4785 new_role = self._ROLE_CANDIDATE
4786 elif self.op.drained:
4787 new_role = self._ROLE_DRAINED
4788 elif self.op.offline:
4789 new_role = self._ROLE_OFFLINE
4790 elif False in [self.op.master_candidate, self.op.drained, self.op.offline]:
4791 # False is still among the new flags, which means we're un-setting the current role
4793 new_role = self._ROLE_REGULAR
4794 else: # no new flags, nothing, keep old role
4797 self.new_role = new_role
4799 if old_role == self._ROLE_OFFLINE and new_role != old_role:
4800 # Trying to transition out of offline status
4801 result = self.rpc.call_version([node.name])[node.name]
4803 raise errors.OpPrereqError("Node %s is being de-offlined but fails"
4804 " to report its version: %s" %
4805 (node.name, result.fail_msg),
4808 self.LogWarning("Transitioning node from offline to online state"
4809 " without using re-add. Please make sure the node is healthy")
4812 if self.op.secondary_ip:
4813 # Ok even without locking, because this can't be changed by any LU
4814 master = self.cfg.GetNodeInfo(self.cfg.GetMasterNode())
4815 master_singlehomed = master.secondary_ip == master.primary_ip
4816 if master_singlehomed and self.op.secondary_ip:
4817 raise errors.OpPrereqError("Cannot change the secondary ip on a single"
4818 " homed cluster", errors.ECODE_INVAL)
4821 if self.affected_instances:
4822 raise errors.OpPrereqError("Cannot change secondary ip: offline"
4823 " node has instances (%s) configured"
4824 " to use it" % self.affected_instances)
4826 # On online nodes, check that no instances are running, and that
4827 # the node has the new ip and we can reach it.
4828 for instance in self.affected_instances:
4829 _CheckInstanceDown(self, instance, "cannot change secondary ip")
4831 _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
4832 if master.name != node.name:
4833 # check reachability from master secondary ip to new secondary ip
4834 if not netutils.TcpPing(self.op.secondary_ip,
4835 constants.DEFAULT_NODED_PORT,
4836 source=master.secondary_ip):
4837 raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
4838 " based ping to node daemon port",
4839 errors.ECODE_ENVIRON)
4841 if self.op.ndparams:
4842 new_ndparams = _GetUpdatedParams(self.node.ndparams, self.op.ndparams)
4843 utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
4844 self.new_ndparams = new_ndparams
4846 def Exec(self, feedback_fn):
4851 old_role = self.old_role
4852 new_role = self.new_role
4856 if self.op.ndparams:
4857 node.ndparams = self.new_ndparams
4859 if self.op.powered is not None:
4860 node.powered = self.op.powered
4862 for attr in ["master_capable", "vm_capable"]:
4863 val = getattr(self.op, attr)
4865 setattr(node, attr, val)
4866 result.append((attr, str(val)))
4868 if new_role != old_role:
4869 # Tell the node to demote itself, if no longer MC and not offline
4870 if old_role == self._ROLE_CANDIDATE and new_role != self._ROLE_OFFLINE:
4871 msg = self.rpc.call_node_demote_from_mc(node.name).fail_msg
4873 self.LogWarning("Node failed to demote itself: %s", msg)
4875 new_flags = self._R2F[new_role]
4876 for of, nf, desc in zip(self.old_flags, new_flags, self._FLAGS):
4878 result.append((desc, str(nf)))
4879 (node.master_candidate, node.drained, node.offline) = new_flags
4881 # we locked all nodes, we adjust the CP before updating this node
4883 _AdjustCandidatePool(self, [node.name])
4885 if self.op.secondary_ip:
4886 node.secondary_ip = self.op.secondary_ip
4887 result.append(("secondary_ip", self.op.secondary_ip))
4889 # this will trigger configuration file update, if needed
4890 self.cfg.Update(node, feedback_fn)
4892 # this will trigger job queue propagation or cleanup if the mc flag changed
4894 if [old_role, new_role].count(self._ROLE_CANDIDATE) == 1:
4895 self.context.ReaddNode(node)
4900 class LUNodePowercycle(NoHooksLU):
4901 """Powercycles a node.
4906 def CheckArguments(self):
4907 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
4908 if self.op.node_name == self.cfg.GetMasterNode() and not self.op.force:
4909 raise errors.OpPrereqError("The node is the master and the force"
4910 " parameter was not set",
4913 def ExpandNames(self):
4914 """Locking for PowercycleNode.
4916 This is a last-resort option and shouldn't block on other
4917 jobs. Therefore, we grab no locks.
4920 self.needed_locks = {}
4922 def Exec(self, feedback_fn):
4926 result = self.rpc.call_node_powercycle(self.op.node_name,
4927 self.cfg.GetHypervisorType())
4928 result.Raise("Failed to schedule the reboot")
4929 return result.payload
4932 class LUClusterQuery(NoHooksLU):
4933 """Query cluster configuration.
4938 def ExpandNames(self):
4939 self.needed_locks = {}
4941 def Exec(self, feedback_fn):
4942 """Return cluster config.
4945 cluster = self.cfg.GetClusterInfo()
4948 # Filter just for enabled hypervisors
4949 for os_name, hv_dict in cluster.os_hvp.items():
4950 os_hvp[os_name] = {}
4951 for hv_name, hv_params in hv_dict.items():
4952 if hv_name in cluster.enabled_hypervisors:
4953 os_hvp[os_name][hv_name] = hv_params
4955 # Convert ip_family to ip_version
4956 primary_ip_version = constants.IP4_VERSION
4957 if cluster.primary_ip_family == netutils.IP6Address.family:
4958 primary_ip_version = constants.IP6_VERSION
4961 "software_version": constants.RELEASE_VERSION,
4962 "protocol_version": constants.PROTOCOL_VERSION,
4963 "config_version": constants.CONFIG_VERSION,
4964 "os_api_version": max(constants.OS_API_VERSIONS),
4965 "export_version": constants.EXPORT_VERSION,
4966 "architecture": (platform.architecture()[0], platform.machine()),
4967 "name": cluster.cluster_name,
4968 "master": cluster.master_node,
4969 "default_hypervisor": cluster.enabled_hypervisors[0],
4970 "enabled_hypervisors": cluster.enabled_hypervisors,
4971 "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
4972 for hypervisor_name in cluster.enabled_hypervisors]),
4974 "beparams": cluster.beparams,
4975 "osparams": cluster.osparams,
4976 "nicparams": cluster.nicparams,
4977 "ndparams": cluster.ndparams,
4978 "candidate_pool_size": cluster.candidate_pool_size,
4979 "master_netdev": cluster.master_netdev,
4980 "volume_group_name": cluster.volume_group_name,
4981 "drbd_usermode_helper": cluster.drbd_usermode_helper,
4982 "file_storage_dir": cluster.file_storage_dir,
4983 "shared_file_storage_dir": cluster.shared_file_storage_dir,
4984 "maintain_node_health": cluster.maintain_node_health,
4985 "ctime": cluster.ctime,
4986 "mtime": cluster.mtime,
4987 "uuid": cluster.uuid,
4988 "tags": list(cluster.GetTags()),
4989 "uid_pool": cluster.uid_pool,
4990 "default_iallocator": cluster.default_iallocator,
4991 "reserved_lvs": cluster.reserved_lvs,
4992 "primary_ip_version": primary_ip_version,
4993 "prealloc_wipe_disks": cluster.prealloc_wipe_disks,
4994 "hidden_os": cluster.hidden_os,
4995 "blacklisted_os": cluster.blacklisted_os,
5001 class LUClusterConfigQuery(NoHooksLU):
5002 """Return configuration values.
5006 _FIELDS_DYNAMIC = utils.FieldSet()
5007 _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
5008 "watcher_pause", "volume_group_name")
5010 def CheckArguments(self):
5011 _CheckOutputFields(static=self._FIELDS_STATIC,
5012 dynamic=self._FIELDS_DYNAMIC,
5013 selected=self.op.output_fields)
5015 def ExpandNames(self):
5016 self.needed_locks = {}
5018 def Exec(self, feedback_fn):
5019 """Dump a representation of the cluster config to the standard output.
5023 for field in self.op.output_fields:
5024 if field == "cluster_name":
5025 entry = self.cfg.GetClusterName()
5026 elif field == "master_node":
5027 entry = self.cfg.GetMasterNode()
5028 elif field == "drain_flag":
5029 entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
5030 elif field == "watcher_pause":
5031 entry = utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
5032 elif field == "volume_group_name":
5033 entry = self.cfg.GetVGName()
5035 raise errors.ParameterError(field)
5036 values.append(entry)
5040 class LUInstanceActivateDisks(NoHooksLU):
5041 """Bring up an instance's disks.
5046 def ExpandNames(self):
5047 self._ExpandAndLockInstance()
5048 self.needed_locks[locking.LEVEL_NODE] = []
5049 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5051 def DeclareLocks(self, level):
5052 if level == locking.LEVEL_NODE:
5053 self._LockInstancesNodes()
5055 def CheckPrereq(self):
5056 """Check prerequisites.
5058 This checks that the instance is in the cluster.
5061 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5062 assert self.instance is not None, \
5063 "Cannot retrieve locked instance %s" % self.op.instance_name
5064 _CheckNodeOnline(self, self.instance.primary_node)
5066 def Exec(self, feedback_fn):
5067 """Activate the disks.
5070 disks_ok, disks_info = \
5071 _AssembleInstanceDisks(self, self.instance,
5072 ignore_size=self.op.ignore_size)
5074 raise errors.OpExecError("Cannot activate block devices")
5079 def _AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
5081 """Prepare the block devices for an instance.
5083 This sets up the block devices on all nodes.
5085 @type lu: L{LogicalUnit}
5086 @param lu: the logical unit on whose behalf we execute
5087 @type instance: L{objects.Instance}
5088 @param instance: the instance for whose disks we assemble
5089 @type disks: list of L{objects.Disk} or None
5090 @param disks: which disks to assemble (or all, if None)
5091 @type ignore_secondaries: boolean
5092 @param ignore_secondaries: if true, errors on secondary nodes
5093 won't result in an error return from the function
5094 @type ignore_size: boolean
5095 @param ignore_size: if true, the current known size of the disk
5096 will not be used during the disk activation, useful for cases
5097 when the size is wrong
5098 @return: False if the operation failed, otherwise a list of
5099 (host, instance_visible_name, node_visible_name)
5100 with the mapping from node devices to instance devices
5105 iname = instance.name
5106 disks = _ExpandCheckDisks(instance, disks)
5108 # With the two-pass mechanism we try to reduce the window of
5109 # opportunity for the race condition of switching DRBD to primary
5110 # before the handshake has occurred, but we do not eliminate it
5112 # The proper fix would be to wait (with some limits) until the
5113 # connection has been made and drbd transitions from WFConnection
5114 # into any other network-connected state (Connected, SyncTarget, SyncSource, etc.)
5117 # 1st pass, assemble on all nodes in secondary mode
5118 for idx, inst_disk in enumerate(disks):
5119 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5121 node_disk = node_disk.Copy()
5122 node_disk.UnsetSize()
5123 lu.cfg.SetDiskID(node_disk, node)
5124 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False, idx)
5125 msg = result.fail_msg
5127 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5128 " (is_primary=False, pass=1): %s",
5129 inst_disk.iv_name, node, msg)
5130 if not ignore_secondaries:
5133 # FIXME: race condition on drbd migration to primary
5135 # 2nd pass, do only the primary node
5136 for idx, inst_disk in enumerate(disks):
5139 for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
5140 if node != instance.primary_node:
5143 node_disk = node_disk.Copy()
5144 node_disk.UnsetSize()
5145 lu.cfg.SetDiskID(node_disk, node)
5146 result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True, idx)
5147 msg = result.fail_msg
5149 lu.proc.LogWarning("Could not prepare block device %s on node %s"
5150 " (is_primary=True, pass=2): %s",
5151 inst_disk.iv_name, node, msg)
5154 dev_path = result.payload
5156 device_info.append((instance.primary_node, inst_disk.iv_name, dev_path))
5158 # leave the disks configured for the primary node
5159 # this is a workaround that would be fixed better by
5160 # improving the logical/physical id handling
5162 lu.cfg.SetDiskID(disk, instance.primary_node)
5164 return disks_ok, device_info
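# Illustrative sketch (not part of the original module): how a caller might
# consume the (disks_ok, device_info) pair returned above.  The function name
# is hypothetical; real callers include _StartInstanceDisks below.
def _ExampleUseAssembleResult(disks_ok, device_info):
  if not disks_ok:
    raise RuntimeError("could not assemble all block devices")
  # device_info maps node devices to instance-visible devices
  return ["%s on %s is visible as %s" % (iv_name, node, dev_path)
          for (node, iv_name, dev_path) in device_info]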
5167 def _StartInstanceDisks(lu, instance, force):
5168 """Start the disks of an instance.
5171 disks_ok, _ = _AssembleInstanceDisks(lu, instance,
5172 ignore_secondaries=force)
5174 _ShutdownInstanceDisks(lu, instance)
5175 if force is not None and not force:
5176 lu.proc.LogWarning("", hint="If the message above refers to a"
5178 " you can retry the operation using '--force'.")
5179 raise errors.OpExecError("Disk consistency error")
5182 class LUInstanceDeactivateDisks(NoHooksLU):
5183 """Shutdown an instance's disks.
5188 def ExpandNames(self):
5189 self._ExpandAndLockInstance()
5190 self.needed_locks[locking.LEVEL_NODE] = []
5191 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5193 def DeclareLocks(self, level):
5194 if level == locking.LEVEL_NODE:
5195 self._LockInstancesNodes()
5197 def CheckPrereq(self):
5198 """Check prerequisites.
5200 This checks that the instance is in the cluster.
5203 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5204 assert self.instance is not None, \
5205 "Cannot retrieve locked instance %s" % self.op.instance_name
5207 def Exec(self, feedback_fn):
5208 """Deactivate the disks
5211 instance = self.instance
5213 _ShutdownInstanceDisks(self, instance)
5215 _SafeShutdownInstanceDisks(self, instance)
5218 def _SafeShutdownInstanceDisks(lu, instance, disks=None):
5219 """Shutdown block devices of an instance.
5221 This function checks if an instance is running, before calling
5222 _ShutdownInstanceDisks.
5225 _CheckInstanceDown(lu, instance, "cannot shutdown disks")
5226 _ShutdownInstanceDisks(lu, instance, disks=disks)
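# Illustrative sketch (not part of the original module): the guard pattern
# used above -- refuse to tear down disks while the instance is still
# reported as running.  Both arguments are hypothetical stand-ins for
# _CheckInstanceDown and _ShutdownInstanceDisks.
def _ExampleSafeShutdown(is_running, shutdown_fn):
  if is_running:
    raise RuntimeError("cannot shutdown disks: instance is still running")
  return shutdown_fn()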
5229 def _ExpandCheckDisks(instance, disks):
5230 """Return the instance disks selected by the disks list
5232 @type disks: list of L{objects.Disk} or None
5233 @param disks: selected disks
5234 @rtype: list of L{objects.Disk}
5235 @return: selected instance disks to act on
5239 return instance.disks
5241 if not set(disks).issubset(instance.disks):
5242 raise errors.ProgrammerError("Can only act on disks belonging to the"
5247 def _ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
5248 """Shutdown block devices of an instance.
5250 This does the shutdown on all nodes of the instance.
5252 If ignore_primary is false, errors on the primary node make the call report failure.
5257 disks = _ExpandCheckDisks(instance, disks)
5260 for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
5261 lu.cfg.SetDiskID(top_disk, node)
5262 result = lu.rpc.call_blockdev_shutdown(node, top_disk)
5263 msg = result.fail_msg
5265 lu.LogWarning("Could not shutdown block device %s on node %s: %s",
5266 disk.iv_name, node, msg)
5267 if ((node == instance.primary_node and not ignore_primary) or
5268 (node != instance.primary_node and not result.offline)):
5273 def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
5274 """Checks if a node has enough free memory.
5276 This function checks if a given node has the needed amount of free
5277 memory. In case the node has less memory than requested, or we cannot get the
5278 information from the node, this function raises an OpPrereqError exception.
5281 @type lu: C{LogicalUnit}
5282 @param lu: a logical unit from which we get configuration data
5284 @param node: the node to check
5285 @type reason: C{str}
5286 @param reason: string to use in the error message
5287 @type requested: C{int}
5288 @param requested: the amount of memory in MiB to check for
5289 @type hypervisor_name: C{str}
5290 @param hypervisor_name: the hypervisor to ask for memory stats
5291 @raise errors.OpPrereqError: if the node doesn't have enough memory, or
5292 we cannot check the node
5295 nodeinfo = lu.rpc.call_node_info([node], None, hypervisor_name)
5296 nodeinfo[node].Raise("Can't get data from node %s" % node,
5297 prereq=True, ecode=errors.ECODE_ENVIRON)
5298 free_mem = nodeinfo[node].payload.get('memory_free', None)
5299 if not isinstance(free_mem, int):
5300 raise errors.OpPrereqError("Can't compute free memory on node %s, result"
5301 " was '%s'" % (node, free_mem),
5302 errors.ECODE_ENVIRON)
5303 if requested > free_mem:
5304 raise errors.OpPrereqError("Not enough memory on node %s for %s:"
5305 " needed %s MiB, available %s MiB" %
5306 (node, reason, requested, free_mem),
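# Illustrative sketch (not part of the original module): the check above in
# isolation -- compare the requested amount against what the node reports and
# fail loudly when it does not fit.  The numbers below are hypothetical MiB
# values.
def _ExampleMemoryCheck(requested_mib=2048, reported_free_mib=1024):
  if not isinstance(reported_free_mib, int):
    raise ValueError("node did not report a usable free-memory value")
  if requested_mib > reported_free_mib:
    raise ValueError("not enough memory: needed %d MiB, available %d MiB" %
                     (requested_mib, reported_free_mib))
  return True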
5310 def _CheckNodesFreeDiskPerVG(lu, nodenames, req_sizes):
5311 """Checks if nodes have enough free disk space in the all VGs.
5313 This function checks if all given nodes have the needed amount of
5314 free disk. In case any node has less disk than required, or we cannot get the
5315 information from the node, this function raises an OpPrereqError exception.
5318 @type lu: C{LogicalUnit}
5319 @param lu: a logical unit from which we get configuration data
5320 @type nodenames: C{list}
5321 @param nodenames: the list of node names to check
5322 @type req_sizes: C{dict}
5323 @param req_sizes: the hash of vg and corresponding amount of disk in MiB to check for
5325 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5326 or we cannot check the node
5329 for vg, req_size in req_sizes.items():
5330 _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
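# Illustrative sketch (not part of the original module): the shape of the
# req_sizes argument -- one entry per volume group, each giving the total
# disk space (in MiB) that has to be available on every node.  The VG names
# below are hypothetical.
def _ExampleReqSizes():
  req_sizes = {"xenvg": 10240, "fastvg": 2048}
  for vg, req_size in sorted(req_sizes.items()):
    # the real code calls _CheckNodesFreeDiskOnVG(lu, nodenames, vg, req_size)
    assert req_size > 0, "requested size for %s must be positive" % vg
  return req_sizes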
5333 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
5334 """Checks if nodes have enough free disk space in the specified VG.
5336 This function checks if all given nodes have the needed amount of
5337 free disk. In case any node has less disk than required, or we cannot get the
5338 information from the node, this function raises an OpPrereqError exception.
5341 @type lu: C{LogicalUnit}
5342 @param lu: a logical unit from which we get configuration data
5343 @type nodenames: C{list}
5344 @param nodenames: the list of node names to check
5346 @param vg: the volume group to check
5347 @type requested: C{int}
5348 @param requested: the amount of disk in MiB to check for
5349 @raise errors.OpPrereqError: if the node doesn't have enough disk,
5350 or we cannot check the node
5353 nodeinfo = lu.rpc.call_node_info(nodenames, vg, None)
5354 for node in nodenames:
5355 info = nodeinfo[node]
5356 info.Raise("Cannot get current information from node %s" % node,
5357 prereq=True, ecode=errors.ECODE_ENVIRON)
5358 vg_free = info.payload.get("vg_free", None)
5359 if not isinstance(vg_free, int):
5360 raise errors.OpPrereqError("Can't compute free disk space on node"
5361 " %s for vg %s, result was '%s'" %
5362 (node, vg, vg_free), errors.ECODE_ENVIRON)
5363 if requested > vg_free:
5364 raise errors.OpPrereqError("Not enough disk space on target node %s"
5365 " vg %s: required %d MiB, available %d MiB" %
5366 (node, vg, requested, vg_free),
5370 class LUInstanceStartup(LogicalUnit):
5371 """Starts an instance.
5374 HPATH = "instance-start"
5375 HTYPE = constants.HTYPE_INSTANCE
5378 def CheckArguments(self):
5380 if self.op.beparams:
5381 # fill the beparams dict
5382 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
5384 def ExpandNames(self):
5385 self._ExpandAndLockInstance()
5387 def BuildHooksEnv(self):
5390 This runs on master, primary and secondary nodes of the instance.
5394 "FORCE": self.op.force,
5397 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5401 def BuildHooksNodes(self):
5402 """Build hooks nodes.
5405 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5408 def CheckPrereq(self):
5409 """Check prerequisites.
5411 This checks that the instance is in the cluster.
5414 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5415 assert self.instance is not None, \
5416 "Cannot retrieve locked instance %s" % self.op.instance_name
5419 if self.op.hvparams:
5420 # check hypervisor parameter syntax (locally)
5421 cluster = self.cfg.GetClusterInfo()
5422 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
5423 filled_hvp = cluster.FillHV(instance)
5424 filled_hvp.update(self.op.hvparams)
5425 hv_type = hypervisor.GetHypervisor(instance.hypervisor)
5426 hv_type.CheckParameterSyntax(filled_hvp)
5427 _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
5429 self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
5431 if self.primary_offline and self.op.ignore_offline_nodes:
5432 self.proc.LogWarning("Ignoring offline primary node")
5434 if self.op.hvparams or self.op.beparams:
5435 self.proc.LogWarning("Overridden parameters are ignored")
5437 _CheckNodeOnline(self, instance.primary_node)
5439 bep = self.cfg.GetClusterInfo().FillBE(instance)
5441 # check bridges existence
5442 _CheckInstanceBridgesExist(self, instance)
5444 remote_info = self.rpc.call_instance_info(instance.primary_node,
5446 instance.hypervisor)
5447 remote_info.Raise("Error checking node %s" % instance.primary_node,
5448 prereq=True, ecode=errors.ECODE_ENVIRON)
5449 if not remote_info.payload: # not running already
5450 _CheckNodeFreeMemory(self, instance.primary_node,
5451 "starting instance %s" % instance.name,
5452 bep[constants.BE_MEMORY], instance.hypervisor)
5454 def Exec(self, feedback_fn):
5455 """Start the instance.
5458 instance = self.instance
5459 force = self.op.force
5461 self.cfg.MarkInstanceUp(instance.name)
5463 if self.primary_offline:
5464 assert self.op.ignore_offline_nodes
5465 self.proc.LogInfo("Primary node offline, marked instance as started")
5467 node_current = instance.primary_node
5469 _StartInstanceDisks(self, instance, force)
5471 result = self.rpc.call_instance_start(node_current, instance,
5472 self.op.hvparams, self.op.beparams)
5473 msg = result.fail_msg
5475 _ShutdownInstanceDisks(self, instance)
5476 raise errors.OpExecError("Could not start instance: %s" % msg)
5479 class LUInstanceReboot(LogicalUnit):
5480 """Reboot an instance.
5483 HPATH = "instance-reboot"
5484 HTYPE = constants.HTYPE_INSTANCE
5487 def ExpandNames(self):
5488 self._ExpandAndLockInstance()
5490 def BuildHooksEnv(self):
5493 This runs on master, primary and secondary nodes of the instance.
5497 "IGNORE_SECONDARIES": self.op.ignore_secondaries,
5498 "REBOOT_TYPE": self.op.reboot_type,
5499 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
5502 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
5506 def BuildHooksNodes(self):
5507 """Build hooks nodes.
5510 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5513 def CheckPrereq(self):
5514 """Check prerequisites.
5516 This checks that the instance is in the cluster.
5519 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5520 assert self.instance is not None, \
5521 "Cannot retrieve locked instance %s" % self.op.instance_name
5523 _CheckNodeOnline(self, instance.primary_node)
5525 # check bridges existence
5526 _CheckInstanceBridgesExist(self, instance)
5528 def Exec(self, feedback_fn):
5529 """Reboot the instance.
5532 instance = self.instance
5533 ignore_secondaries = self.op.ignore_secondaries
5534 reboot_type = self.op.reboot_type
5536 remote_info = self.rpc.call_instance_info(instance.primary_node,
5538 instance.hypervisor)
5539 remote_info.Raise("Error checking node %s" % instance.primary_node)
5540 instance_running = bool(remote_info.payload)
5542 node_current = instance.primary_node
5544 if instance_running and reboot_type in [constants.INSTANCE_REBOOT_SOFT,
5545 constants.INSTANCE_REBOOT_HARD]:
5546 for disk in instance.disks:
5547 self.cfg.SetDiskID(disk, node_current)
5548 result = self.rpc.call_instance_reboot(node_current, instance,
5550 self.op.shutdown_timeout)
5551 result.Raise("Could not reboot instance")
5553 if instance_running:
5554 result = self.rpc.call_instance_shutdown(node_current, instance,
5555 self.op.shutdown_timeout)
5556 result.Raise("Could not shutdown instance for full reboot")
5557 _ShutdownInstanceDisks(self, instance)
5559 self.LogInfo("Instance %s was already stopped, starting now",
5561 _StartInstanceDisks(self, instance, ignore_secondaries)
5562 result = self.rpc.call_instance_start(node_current, instance, None, None)
5563 msg = result.fail_msg
5565 _ShutdownInstanceDisks(self, instance)
5566 raise errors.OpExecError("Could not start instance for"
5567 " full reboot: %s" % msg)
5569 self.cfg.MarkInstanceUp(instance.name)
5572 class LUInstanceShutdown(LogicalUnit):
5573 """Shutdown an instance.
5576 HPATH = "instance-stop"
5577 HTYPE = constants.HTYPE_INSTANCE
5580 def ExpandNames(self):
5581 self._ExpandAndLockInstance()
5583 def BuildHooksEnv(self):
5586 This runs on master, primary and secondary nodes of the instance.
5589 env = _BuildInstanceHookEnvByObject(self, self.instance)
5590 env["TIMEOUT"] = self.op.timeout
5593 def BuildHooksNodes(self):
5594 """Build hooks nodes.
5597 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5600 def CheckPrereq(self):
5601 """Check prerequisites.
5603 This checks that the instance is in the cluster.
5606 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5607 assert self.instance is not None, \
5608 "Cannot retrieve locked instance %s" % self.op.instance_name
5610 self.primary_offline = \
5611 self.cfg.GetNodeInfo(self.instance.primary_node).offline
5613 if self.primary_offline and self.op.ignore_offline_nodes:
5614 self.proc.LogWarning("Ignoring offline primary node")
5616 _CheckNodeOnline(self, self.instance.primary_node)
5618 def Exec(self, feedback_fn):
5619 """Shutdown the instance.
5622 instance = self.instance
5623 node_current = instance.primary_node
5624 timeout = self.op.timeout
5626 self.cfg.MarkInstanceDown(instance.name)
5628 if self.primary_offline:
5629 assert self.op.ignore_offline_nodes
5630 self.proc.LogInfo("Primary node offline, marked instance as stopped")
5632 result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
5633 msg = result.fail_msg
5635 self.proc.LogWarning("Could not shutdown instance: %s" % msg)
5637 _ShutdownInstanceDisks(self, instance)
5640 class LUInstanceReinstall(LogicalUnit):
5641 """Reinstall an instance.
5644 HPATH = "instance-reinstall"
5645 HTYPE = constants.HTYPE_INSTANCE
5648 def ExpandNames(self):
5649 self._ExpandAndLockInstance()
5651 def BuildHooksEnv(self):
5654 This runs on master, primary and secondary nodes of the instance.
5657 return _BuildInstanceHookEnvByObject(self, self.instance)
5659 def BuildHooksNodes(self):
5660 """Build hooks nodes.
5663 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5666 def CheckPrereq(self):
5667 """Check prerequisites.
5669 This checks that the instance is in the cluster and is not running.
5672 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5673 assert instance is not None, \
5674 "Cannot retrieve locked instance %s" % self.op.instance_name
5675 _CheckNodeOnline(self, instance.primary_node, "Instance primary node"
5676 " offline, cannot reinstall")
5677 for node in instance.secondary_nodes:
5678 _CheckNodeOnline(self, node, "Instance secondary node offline,"
5679 " cannot reinstall")
5681 if instance.disk_template == constants.DT_DISKLESS:
5682 raise errors.OpPrereqError("Instance '%s' has no disks" %
5683 self.op.instance_name,
5685 _CheckInstanceDown(self, instance, "cannot reinstall")
5687 if self.op.os_type is not None:
5689 pnode = _ExpandNodeName(self.cfg, instance.primary_node)
5690 _CheckNodeHasOS(self, pnode, self.op.os_type, self.op.force_variant)
5691 instance_os = self.op.os_type
5693 instance_os = instance.os
5695 nodelist = list(instance.all_nodes)
5697 if self.op.osparams:
5698 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
5699 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
5700 self.os_inst = i_osdict # the new dict (without defaults)
5704 self.instance = instance
5706 def Exec(self, feedback_fn):
5707 """Reinstall the instance.
5710 inst = self.instance
5712 if self.op.os_type is not None:
5713 feedback_fn("Changing OS to '%s'..." % self.op.os_type)
5714 inst.os = self.op.os_type
5715 # Write to configuration
5716 self.cfg.Update(inst, feedback_fn)
5718 _StartInstanceDisks(self, inst, None)
5720 feedback_fn("Running the instance OS create scripts...")
5721 # FIXME: pass debug option from opcode to backend
5722 result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
5723 self.op.debug_level,
5724 osparams=self.os_inst)
5725 result.Raise("Could not install OS for instance %s on node %s" %
5726 (inst.name, inst.primary_node))
5728 _ShutdownInstanceDisks(self, inst)
5731 class LUInstanceRecreateDisks(LogicalUnit):
5732 """Recreate an instance's missing disks.
5735 HPATH = "instance-recreate-disks"
5736 HTYPE = constants.HTYPE_INSTANCE
5739 def CheckArguments(self):
5740 # normalise the disk list
5741 self.op.disks = sorted(frozenset(self.op.disks))
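    # Illustrative example (values invented): an input of [2, 0, 2] is
    # normalised to the sorted, de-duplicated list [0, 2]; the indices are
    # validated against the instance's disks later, in CheckPrereq.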
  def ExpandNames(self):
    self._ExpandAndLockInstance()
    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
    if self.op.nodes:
      self.op.nodes = [_ExpandNodeName(self.cfg, n) for n in self.op.nodes]
      self.needed_locks[locking.LEVEL_NODE] = list(self.op.nodes)
    else:
      self.needed_locks[locking.LEVEL_NODE] = []
5752 def DeclareLocks(self, level):
5753 if level == locking.LEVEL_NODE:
5754 # if we replace the nodes, we only need to lock the old primary,
5755 # otherwise we need to lock all nodes for disk re-creation
5756 primary_only = bool(self.op.nodes)
5757 self._LockInstancesNodes(primary_only=primary_only)
5759 def BuildHooksEnv(self):
5762 This runs on master, primary and secondary nodes of the instance.
5765 return _BuildInstanceHookEnvByObject(self, self.instance)
5767 def BuildHooksNodes(self):
5768 """Build hooks nodes.
5771 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5774 def CheckPrereq(self):
5775 """Check prerequisites.
5777 This checks that the instance is in the cluster and is not running.
5780 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5781 assert instance is not None, \
5782 "Cannot retrieve locked instance %s" % self.op.instance_name
    if self.op.nodes:
      if len(self.op.nodes) != len(instance.all_nodes):
        raise errors.OpPrereqError("Instance %s currently has %d nodes, but"
                                   " %d replacement nodes were specified" %
                                   (instance.name, len(instance.all_nodes),
                                    len(self.op.nodes)),
                                   errors.ECODE_INVAL)
      assert instance.disk_template != constants.DT_DRBD8 or \
          len(self.op.nodes) == 2
      assert instance.disk_template != constants.DT_PLAIN or \
          len(self.op.nodes) == 1
      primary_node = self.op.nodes[0]
    else:
      primary_node = instance.primary_node
5797 _CheckNodeOnline(self, primary_node)
5799 if instance.disk_template == constants.DT_DISKLESS:
5800 raise errors.OpPrereqError("Instance '%s' has no disks" %
5801 self.op.instance_name, errors.ECODE_INVAL)
5802 # if we replace nodes *and* the old primary is offline, we don't
5804 assert instance.primary_node in self.needed_locks[locking.LEVEL_NODE]
5805 old_pnode = self.cfg.GetNodeInfo(instance.primary_node)
5806 if not (self.op.nodes and old_pnode.offline):
5807 _CheckInstanceDown(self, instance, "cannot recreate disks")
    if not self.op.disks:
      self.op.disks = range(len(instance.disks))
    else:
      for idx in self.op.disks:
        if idx >= len(instance.disks):
          raise errors.OpPrereqError("Invalid disk index '%s'" % idx,
                                     errors.ECODE_INVAL)

    if self.op.disks != range(len(instance.disks)) and self.op.nodes:
      raise errors.OpPrereqError("Can't recreate disks partially and"
                                 " change the nodes at the same time",
                                 errors.ECODE_INVAL)
5820 self.instance = instance
  def Exec(self, feedback_fn):
    """Recreate the disks.

    """
    # change primary node, if needed
    if self.op.nodes:
      self.instance.primary_node = self.op.nodes[0]
      self.LogWarning("Changing the instance's nodes, you will have to"
                      " remove any disks left on the older nodes manually")

    to_skip = []
    for idx, disk in enumerate(self.instance.disks):
      if idx not in self.op.disks: # disk idx has not been passed in
        to_skip.append(idx)
        continue
      # update secondaries for disks, if needed
      if self.op.nodes:
        if disk.dev_type == constants.LD_DRBD8:
          # need to update the nodes
          assert len(self.op.nodes) == 2
          logical_id = list(disk.logical_id)
          logical_id[0] = self.op.nodes[0]
          logical_id[1] = self.op.nodes[1]
          disk.logical_id = tuple(logical_id)

    if self.op.nodes:
      self.cfg.Update(self.instance, feedback_fn)

    _CreateDisks(self, self.instance, to_skip=to_skip)
5853 class LUInstanceRename(LogicalUnit):
5854 """Rename an instance.
5857 HPATH = "instance-rename"
5858 HTYPE = constants.HTYPE_INSTANCE
5860 def CheckArguments(self):
5864 if self.op.ip_check and not self.op.name_check:
5865 # TODO: make the ip check more flexible and not depend on the name check
5866 raise errors.OpPrereqError("IP address check requires a name check",
5869 def BuildHooksEnv(self):
5872 This runs on master, primary and secondary nodes of the instance.
5875 env = _BuildInstanceHookEnvByObject(self, self.instance)
5876 env["INSTANCE_NEW_NAME"] = self.op.new_name
5879 def BuildHooksNodes(self):
5880 """Build hooks nodes.
5883 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
5886 def CheckPrereq(self):
5887 """Check prerequisites.
5889 This checks that the instance is in the cluster and is not running.
5892 self.op.instance_name = _ExpandInstanceName(self.cfg,
5893 self.op.instance_name)
5894 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
5895 assert instance is not None
5896 _CheckNodeOnline(self, instance.primary_node)
5897 _CheckInstanceDown(self, instance, "cannot rename")
5898 self.instance = instance
5900 new_name = self.op.new_name
5901 if self.op.name_check:
5902 hostname = netutils.GetHostname(name=new_name)
5903 if hostname != new_name:
5904 self.LogInfo("Resolved given name '%s' to '%s'", new_name,
5906 if not utils.MatchNameComponent(self.op.new_name, [hostname.name]):
5907 raise errors.OpPrereqError(("Resolved hostname '%s' does not look the"
5908 " same as given hostname '%s'") %
5909 (hostname.name, self.op.new_name),
5911 new_name = self.op.new_name = hostname.name
5912 if (self.op.ip_check and
5913 netutils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT)):
5914 raise errors.OpPrereqError("IP %s of instance %s already in use" %
5915 (hostname.ip, new_name),
5916 errors.ECODE_NOTUNIQUE)
5918 instance_list = self.cfg.GetInstanceList()
5919 if new_name in instance_list and new_name != instance.name:
5920 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
5921 new_name, errors.ECODE_EXISTS)
5923 def Exec(self, feedback_fn):
5924 """Rename the instance.
5927 inst = self.instance
5928 old_name = inst.name
5930 rename_file_storage = False
5931 if (inst.disk_template in (constants.DT_FILE, constants.DT_SHARED_FILE) and
5932 self.op.new_name != inst.name):
5933 old_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5934 rename_file_storage = True
5936 self.cfg.RenameInstance(inst.name, self.op.new_name)
5937 # Change the instance lock. This is definitely safe while we hold the BGL.
5938 # Otherwise the new lock would have to be added in acquired mode.
5940 self.glm.remove(locking.LEVEL_INSTANCE, old_name)
5941 self.glm.add(locking.LEVEL_INSTANCE, self.op.new_name)
5943 # re-read the instance from the configuration after rename
5944 inst = self.cfg.GetInstanceInfo(self.op.new_name)
5946 if rename_file_storage:
5947 new_file_storage_dir = os.path.dirname(inst.disks[0].logical_id[1])
5948 result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
5949 old_file_storage_dir,
5950 new_file_storage_dir)
5951 result.Raise("Could not rename on node %s directory '%s' to '%s'"
5952 " (but the instance has been renamed in Ganeti)" %
5953 (inst.primary_node, old_file_storage_dir,
5954 new_file_storage_dir))
    _StartInstanceDisks(self, inst, None)
    try:
      result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
                                                 old_name, self.op.debug_level)
      msg = result.fail_msg
      if msg:
        msg = ("Could not run OS rename script for instance %s on node %s"
               " (but the instance has been renamed in Ganeti): %s" %
               (inst.name, inst.primary_node, msg))
        self.proc.LogWarning(msg)
    finally:
      _ShutdownInstanceDisks(self, inst)

    return inst.name
5972 class LUInstanceRemove(LogicalUnit):
5973 """Remove an instance.
5976 HPATH = "instance-remove"
5977 HTYPE = constants.HTYPE_INSTANCE
5980 def ExpandNames(self):
5981 self._ExpandAndLockInstance()
5982 self.needed_locks[locking.LEVEL_NODE] = []
5983 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
5985 def DeclareLocks(self, level):
5986 if level == locking.LEVEL_NODE:
5987 self._LockInstancesNodes()
5989 def BuildHooksEnv(self):
5992 This runs on master, primary and secondary nodes of the instance.
5995 env = _BuildInstanceHookEnvByObject(self, self.instance)
5996 env["SHUTDOWN_TIMEOUT"] = self.op.shutdown_timeout
5999 def BuildHooksNodes(self):
6000 """Build hooks nodes.
6003 nl = [self.cfg.GetMasterNode()]
6004 nl_post = list(self.instance.all_nodes) + nl
6005 return (nl, nl_post)
6007 def CheckPrereq(self):
6008 """Check prerequisites.
6010 This checks that the instance is in the cluster.
6013 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6014 assert self.instance is not None, \
6015 "Cannot retrieve locked instance %s" % self.op.instance_name
6017 def Exec(self, feedback_fn):
6018 """Remove the instance.
6021 instance = self.instance
6022 logging.info("Shutting down instance %s on node %s",
6023 instance.name, instance.primary_node)
6025 result = self.rpc.call_instance_shutdown(instance.primary_node, instance,
6026 self.op.shutdown_timeout)
    msg = result.fail_msg
    if msg:
      if self.op.ignore_failures:
        feedback_fn("Warning: can't shutdown instance: %s" % msg)
      else:
        raise errors.OpExecError("Could not shutdown instance %s on"
                                 " node %s: %s" %
                                 (instance.name, instance.primary_node, msg))
6036 _RemoveInstance(self, feedback_fn, instance, self.op.ignore_failures)
6039 def _RemoveInstance(lu, feedback_fn, instance, ignore_failures):
6040 """Utility function to remove an instance.
6043 logging.info("Removing block devices for instance %s", instance.name)
6045 if not _RemoveDisks(lu, instance):
6046 if not ignore_failures:
6047 raise errors.OpExecError("Can't remove instance's disks")
6048 feedback_fn("Warning: can't remove instance's disks")
6050 logging.info("Removing instance %s out of cluster config", instance.name)
6052 lu.cfg.RemoveInstance(instance.name)
6054 assert not lu.remove_locks.get(locking.LEVEL_INSTANCE), \
6055 "Instance lock removal conflict"
6057 # Remove lock for the instance
6058 lu.remove_locks[locking.LEVEL_INSTANCE] = instance.name
6061 class LUInstanceQuery(NoHooksLU):
6062 """Logical unit for querying instances.
6065 # pylint: disable-msg=W0142
6068 def CheckArguments(self):
6069 self.iq = _InstanceQuery(qlang.MakeSimpleFilter("name", self.op.names),
6070 self.op.output_fields, self.op.use_locking)
6072 def ExpandNames(self):
6073 self.iq.ExpandNames(self)
6075 def DeclareLocks(self, level):
6076 self.iq.DeclareLocks(self, level)
6078 def Exec(self, feedback_fn):
6079 return self.iq.OldStyleQuery(self)
6082 class LUInstanceFailover(LogicalUnit):
6083 """Failover an instance.
6086 HPATH = "instance-failover"
6087 HTYPE = constants.HTYPE_INSTANCE
6090 def CheckArguments(self):
6091 """Check the arguments.
6094 self.iallocator = getattr(self.op, "iallocator", None)
6095 self.target_node = getattr(self.op, "target_node", None)
6097 def ExpandNames(self):
6098 self._ExpandAndLockInstance()
6100 if self.op.target_node is not None:
6101 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6103 self.needed_locks[locking.LEVEL_NODE] = []
6104 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6106 ignore_consistency = self.op.ignore_consistency
6107 shutdown_timeout = self.op.shutdown_timeout
    self._migrater = TLMigrateInstance(self, self.op.instance_name,
                                       cleanup=False,
                                       failover=True,
                                       ignore_consistency=ignore_consistency,
                                       shutdown_timeout=shutdown_timeout)
6113 self.tasklets = [self._migrater]
6115 def DeclareLocks(self, level):
6116 if level == locking.LEVEL_NODE:
6117 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6118 if instance.disk_template in constants.DTS_EXT_MIRROR:
6119 if self.op.target_node is None:
6120 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6122 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6123 self.op.target_node]
6124 del self.recalculate_locks[locking.LEVEL_NODE]
6126 self._LockInstancesNodes()
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    instance = self._migrater.instance
    source_node = instance.primary_node
    target_node = self.op.target_node
    env = {
      "IGNORE_CONSISTENCY": self.op.ignore_consistency,
      "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
      "OLD_PRIMARY": source_node,
      "NEW_PRIMARY": target_node,
      }

    if instance.disk_template in constants.DTS_INT_MIRROR:
      env["OLD_SECONDARY"] = instance.secondary_nodes[0]
      env["NEW_SECONDARY"] = source_node
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = ""

    env.update(_BuildInstanceHookEnvByObject(self, instance))

    return env
6154 def BuildHooksNodes(self):
6155 """Build hooks nodes.
6158 instance = self._migrater.instance
6159 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6160 return (nl, nl + [instance.primary_node])
6163 class LUInstanceMigrate(LogicalUnit):
6164 """Migrate an instance.
6166 This is migration without shutting down, compared to the failover,
6167 which is done with shutdown.
6170 HPATH = "instance-migrate"
6171 HTYPE = constants.HTYPE_INSTANCE
6174 def ExpandNames(self):
6175 self._ExpandAndLockInstance()
6177 if self.op.target_node is not None:
6178 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6180 self.needed_locks[locking.LEVEL_NODE] = []
6181 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
6183 self._migrater = TLMigrateInstance(self, self.op.instance_name,
6184 cleanup=self.op.cleanup,
6186 fallback=self.op.allow_failover)
6187 self.tasklets = [self._migrater]
6189 def DeclareLocks(self, level):
6190 if level == locking.LEVEL_NODE:
6191 instance = self.context.cfg.GetInstanceInfo(self.op.instance_name)
6192 if instance.disk_template in constants.DTS_EXT_MIRROR:
6193 if self.op.target_node is None:
6194 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6196 self.needed_locks[locking.LEVEL_NODE] = [instance.primary_node,
6197 self.op.target_node]
6198 del self.recalculate_locks[locking.LEVEL_NODE]
6200 self._LockInstancesNodes()
6202 def BuildHooksEnv(self):
6205 This runs on master, primary and secondary nodes of the instance.
6208 instance = self._migrater.instance
6209 source_node = instance.primary_node
6210 target_node = self.op.target_node
    env = _BuildInstanceHookEnvByObject(self, instance)
    env.update({
      "MIGRATE_LIVE": self._migrater.live,
      "MIGRATE_CLEANUP": self.op.cleanup,
      "OLD_PRIMARY": source_node,
      "NEW_PRIMARY": target_node,
      })

    if instance.disk_template in constants.DTS_INT_MIRROR:
      env["OLD_SECONDARY"] = target_node
      env["NEW_SECONDARY"] = source_node
    else:
      env["OLD_SECONDARY"] = env["NEW_SECONDARY"] = None

    return env
6227 def BuildHooksNodes(self):
6228 """Build hooks nodes.
6231 instance = self._migrater.instance
6232 nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
6233 return (nl, nl + [instance.primary_node])
6236 class LUInstanceMove(LogicalUnit):
6237 """Move an instance by data-copying.
6240 HPATH = "instance-move"
6241 HTYPE = constants.HTYPE_INSTANCE
6244 def ExpandNames(self):
6245 self._ExpandAndLockInstance()
6246 target_node = _ExpandNodeName(self.cfg, self.op.target_node)
6247 self.op.target_node = target_node
6248 self.needed_locks[locking.LEVEL_NODE] = [target_node]
6249 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6251 def DeclareLocks(self, level):
6252 if level == locking.LEVEL_NODE:
6253 self._LockInstancesNodes(primary_only=True)
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on master, primary and secondary nodes of the instance.

    """
    env = {
      "TARGET_NODE": self.op.target_node,
      "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
      }
    env.update(_BuildInstanceHookEnvByObject(self, self.instance))
    return env

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    nl = [
      self.cfg.GetMasterNode(),
      self.instance.primary_node,
      self.op.target_node,
      ]
    return (nl, nl)
6279 def CheckPrereq(self):
6280 """Check prerequisites.
6282 This checks that the instance is in the cluster.
6285 self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
6286 assert self.instance is not None, \
6287 "Cannot retrieve locked instance %s" % self.op.instance_name
6289 node = self.cfg.GetNodeInfo(self.op.target_node)
6290 assert node is not None, \
6291 "Cannot retrieve locked node %s" % self.op.target_node
6293 self.target_node = target_node = node.name
6295 if target_node == instance.primary_node:
6296 raise errors.OpPrereqError("Instance %s is already on the node %s" %
6297 (instance.name, target_node),
6300 bep = self.cfg.GetClusterInfo().FillBE(instance)
6302 for idx, dsk in enumerate(instance.disks):
6303 if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
6304 raise errors.OpPrereqError("Instance disk %d has a complex layout,"
6305 " cannot copy" % idx, errors.ECODE_STATE)
6307 _CheckNodeOnline(self, target_node)
6308 _CheckNodeNotDrained(self, target_node)
6309 _CheckNodeVmCapable(self, target_node)
6311 if instance.admin_up:
6312 # check memory requirements on the secondary node
6313 _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
6314 instance.name, bep[constants.BE_MEMORY],
6315 instance.hypervisor)
    else:
      self.LogInfo("Not checking memory on the secondary node as"
                   " instance will not be started")
6320 # check bridge existance
6321 _CheckInstanceBridgesExist(self, instance, node=target_node)
6323 def Exec(self, feedback_fn):
6324 """Move an instance.
6326 The move is done by shutting it down on its present node, copying
6327 the data over (slow) and starting it on the new node.
6330 instance = self.instance
6332 source_node = instance.primary_node
6333 target_node = self.target_node
6335 self.LogInfo("Shutting down instance %s on source node %s",
6336 instance.name, source_node)
6338 result = self.rpc.call_instance_shutdown(source_node, instance,
6339 self.op.shutdown_timeout)
    msg = result.fail_msg
    if msg:
      if self.op.ignore_consistency:
        self.proc.LogWarning("Could not shutdown instance %s on node %s."
                             " Proceeding anyway. Please make sure node"
                             " %s is down. Error details: %s",
                             instance.name, source_node, source_node, msg)
      else:
        raise errors.OpExecError("Could not shutdown instance %s on"
                                 " node %s: %s" %
                                 (instance.name, source_node, msg))

    # create the target disks
    try:
      _CreateDisks(self, instance, target_node=target_node)
    except errors.OpExecError:
      self.LogWarning("Device creation failed, reverting...")
      try:
        _RemoveDisks(self, instance, target_node=target_node)
      finally:
        self.cfg.ReleaseDRBDMinors(instance.name)
        raise

    cluster_name = self.cfg.GetClusterInfo().cluster_name
6366 # activate, get path, copy the data over
6367 for idx, disk in enumerate(instance.disks):
6368 self.LogInfo("Copying data for disk %d", idx)
6369 result = self.rpc.call_blockdev_assemble(target_node, disk,
6370 instance.name, True, idx)
6372 self.LogWarning("Can't assemble newly created disk %d: %s",
6373 idx, result.fail_msg)
6374 errs.append(result.fail_msg)
6376 dev_path = result.payload
6377 result = self.rpc.call_blockdev_export(source_node, disk,
6378 target_node, dev_path,
6381 self.LogWarning("Can't copy data over for disk %d: %s",
6382 idx, result.fail_msg)
6383 errs.append(result.fail_msg)
6387 self.LogWarning("Some disks failed to copy, aborting")
6389 _RemoveDisks(self, instance, target_node=target_node)
6391 self.cfg.ReleaseDRBDMinors(instance.name)
6392 raise errors.OpExecError("Errors during disk copy: %s" %
6395 instance.primary_node = target_node
6396 self.cfg.Update(instance, feedback_fn)
6398 self.LogInfo("Removing the disks on the original node")
6399 _RemoveDisks(self, instance, target_node=source_node)
6401 # Only start the instance if it's marked as up
6402 if instance.admin_up:
6403 self.LogInfo("Starting instance %s on node %s",
6404 instance.name, target_node)
      disks_ok, _ = _AssembleInstanceDisks(self, instance,
                                           ignore_secondaries=True)
      if not disks_ok:
        _ShutdownInstanceDisks(self, instance)
        raise errors.OpExecError("Can't activate the instance's disks")

      result = self.rpc.call_instance_start(target_node, instance, None, None)
      msg = result.fail_msg
      if msg:
        _ShutdownInstanceDisks(self, instance)
        raise errors.OpExecError("Could not start instance %s on node %s: %s" %
                                 (instance.name, target_node, msg))
6420 class LUNodeMigrate(LogicalUnit):
6421 """Migrate all instances from a node.
6424 HPATH = "node-migrate"
6425 HTYPE = constants.HTYPE_NODE
6428 def CheckArguments(self):
6429 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
6431 def ExpandNames(self):
6432 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
6434 self.needed_locks = {}
    # Create tasklets for migrating instances for all instances on this node
    names = []
    tasklets = []

    self.lock_all_nodes = False
6442 for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
6443 logging.debug("Migrating instance %s", inst.name)
6444 names.append(inst.name)
6446 tasklets.append(TLMigrateInstance(self, inst.name, cleanup=False))
6448 if inst.disk_template in constants.DTS_EXT_MIRROR:
6449 # We need to lock all nodes, as the iallocator will choose the
6450 # destination nodes afterwards
6451 self.lock_all_nodes = True
6453 self.tasklets = tasklets
6455 # Declare node locks
6456 if self.lock_all_nodes:
6457 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
6459 self.needed_locks[locking.LEVEL_NODE] = [self.op.node_name]
6460 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
6462 # Declare instance locks
6463 self.needed_locks[locking.LEVEL_INSTANCE] = names
6465 def DeclareLocks(self, level):
6466 if level == locking.LEVEL_NODE and not self.lock_all_nodes:
6467 self._LockInstancesNodes()
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, the primary and all the secondaries.

    """
    return {
      "NODE_NAME": self.op.node_name,
      }

  def BuildHooksNodes(self):
    """Build hooks nodes.

    """
    nl = [self.cfg.GetMasterNode()]
    return (nl, nl)
6487 class TLMigrateInstance(Tasklet):
6488 """Tasklet class for instance migration.
  @type live: boolean
  @ivar live: whether the migration will be done live or non-live;
      this variable is initialized only after CheckPrereq has run
  @type cleanup: boolean
  @ivar cleanup: Whether we cleanup from a failed migration
  @type iallocator: string
  @ivar iallocator: The iallocator used to determine target_node
  @type target_node: string
  @ivar target_node: If given, the target_node to reallocate the instance to
  @type failover: boolean
  @ivar failover: Whether operation results in failover or migration
  @type fallback: boolean
  @ivar fallback: Whether fallback to failover is allowed if migration not
                  possible
  @type ignore_consistency: boolean
  @ivar ignore_consistency: Whether we should ignore consistency between source
                            and target node
  @type shutdown_timeout: int
  @ivar shutdown_timeout: In case of failover timeout of the shutdown

  """
6511 def __init__(self, lu, instance_name, cleanup=False,
6512 failover=False, fallback=False,
6513 ignore_consistency=False,
6514 shutdown_timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT):
6515 """Initializes this class.
6518 Tasklet.__init__(self, lu)
6521 self.instance_name = instance_name
6522 self.cleanup = cleanup
6523 self.live = False # will be overridden later
6524 self.failover = failover
6525 self.fallback = fallback
6526 self.ignore_consistency = ignore_consistency
6527 self.shutdown_timeout = shutdown_timeout
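    # Illustrative construction sketch (instance name and timeout invented):
    # LUInstanceFailover builds this tasklet roughly as
    #   TLMigrateInstance(self, "inst1.example.com", cleanup=False,
    #                     failover=True, ignore_consistency=False,
    #                     shutdown_timeout=120)
    # while LUInstanceMigrate passes cleanup=self.op.cleanup and
    # fallback=self.op.allow_failover instead.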
6529 def CheckPrereq(self):
6530 """Check prerequisites.
6532 This checks that the instance is in the cluster.
6535 instance_name = _ExpandInstanceName(self.lu.cfg, self.instance_name)
6536 instance = self.cfg.GetInstanceInfo(instance_name)
6537 assert instance is not None
6538 self.instance = instance
    if (not self.cleanup and not instance.admin_up and not self.failover and
        self.fallback):
      self.lu.LogInfo("Instance is marked down, fallback allowed, switching"
                      " to failover")
      self.failover = True

    if instance.disk_template not in constants.DTS_MIRRORED:
      if self.failover:
        text = "failovers"
      else:
        text = "migrations"
      raise errors.OpPrereqError("Instance's disk layout '%s' does not allow"
                                 " %s" % (instance.disk_template, text),
                                 errors.ECODE_STATE)
6555 if instance.disk_template in constants.DTS_EXT_MIRROR:
6556 _CheckIAllocatorOrNode(self.lu, "iallocator", "target_node")
      if self.lu.op.iallocator:
        self._RunAllocator()
      else:
        # We set self.target_node as it is required by
        # BuildHooksEnv
        self.target_node = self.lu.op.target_node

      # self.target_node is already populated, either directly or by the
      # iallocator run
      target_node = self.target_node
      if len(self.lu.tasklets) == 1:
        # It is safe to release locks only when we're the only tasklet
        # in this LU
        _ReleaseLocks(self.lu, locking.LEVEL_NODE,
                      keep=[instance.primary_node, self.target_node])

    else:
      secondary_nodes = instance.secondary_nodes
6577 if not secondary_nodes:
6578 raise errors.ConfigurationError("No secondary node but using"
6579 " %s disk template" %
6580 instance.disk_template)
6581 target_node = secondary_nodes[0]
      if self.lu.op.iallocator or (self.lu.op.target_node and
                                   self.lu.op.target_node != target_node):
        if self.failover:
          text = "failed over"
        else:
          text = "migrated"
        raise errors.OpPrereqError("Instances with disk template %s cannot"
                                   " be %s to arbitrary nodes"
                                   " (neither an iallocator nor a target"
                                   " node can be passed)" %
                                   (instance.disk_template, text),
                                   errors.ECODE_INVAL)
6595 i_be = self.cfg.GetClusterInfo().FillBE(instance)
6597 # check memory requirements on the secondary node
6598 if not self.failover or instance.admin_up:
6599 _CheckNodeFreeMemory(self.lu, target_node, "migrating instance %s" %
6600 instance.name, i_be[constants.BE_MEMORY],
6601 instance.hypervisor)
    else:
      self.lu.LogInfo("Not checking memory on the secondary node as"
                      " instance will not be started")
6606 # check bridge existance
6607 _CheckInstanceBridgesExist(self.lu, instance, node=target_node)
6609 if not self.cleanup:
6610 _CheckNodeNotDrained(self.lu, target_node)
      if not self.failover:
        result = self.rpc.call_instance_migratable(instance.primary_node,
                                                   instance)
        if result.fail_msg and self.fallback:
          self.lu.LogInfo("Can't migrate, instance offline, fallback to"
                          " failover")
          self.failover = True
        else:
          result.Raise("Can't migrate, please use failover",
                       prereq=True, ecode=errors.ECODE_STATE)
6622 assert not (self.failover and self.cleanup)
    if not self.failover:
      if self.lu.op.live is not None and self.lu.op.mode is not None:
        raise errors.OpPrereqError("Only one of the 'live' and 'mode'"
                                   " parameters are accepted",
                                   errors.ECODE_INVAL)
      if self.lu.op.live is not None:
        if self.lu.op.live:
          self.lu.op.mode = constants.HT_MIGRATION_LIVE
        else:
          self.lu.op.mode = constants.HT_MIGRATION_NONLIVE
        # reset the 'live' parameter to None so that repeated
        # invocations of CheckPrereq do not raise an exception
        self.lu.op.live = None
      elif self.lu.op.mode is None:
        # read the default value from the hypervisor
        i_hv = self.cfg.GetClusterInfo().FillHV(self.instance,
                                                skip_globals=False)
        self.lu.op.mode = i_hv[constants.HV_MIGRATION_MODE]

      self.live = self.lu.op.mode == constants.HT_MIGRATION_LIVE
    else:
      # Failover is never live
      self.live = False
6648 def _RunAllocator(self):
6649 """Run the allocator based on input opcode.
6652 ial = IAllocator(self.cfg, self.rpc,
6653 mode=constants.IALLOCATOR_MODE_RELOC,
6654 name=self.instance_name,
6655 # TODO See why hail breaks with a single node below
6656 relocate_from=[self.instance.primary_node,
6657 self.instance.primary_node],
6660 ial.Run(self.lu.op.iallocator)
6663 raise errors.OpPrereqError("Can't compute nodes using"
6664 " iallocator '%s': %s" %
6665 (self.lu.op.iallocator, ial.info),
6667 if len(ial.result) != ial.required_nodes:
6668 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
6669 " of nodes (%s), required %s" %
6670 (self.lu.op.iallocator, len(ial.result),
6671 ial.required_nodes), errors.ECODE_FAULT)
6672 self.target_node = ial.result[0]
6673 self.lu.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
6674 self.instance_name, self.lu.op.iallocator,
6675 utils.CommaJoin(ial.result))
6677 def _WaitUntilSync(self):
6678 """Poll with custom rpc for disk sync.
6680 This uses our own step-based rpc call.
6683 self.feedback_fn("* wait until resync is done")
6687 result = self.rpc.call_drbd_wait_sync(self.all_nodes,
6689 self.instance.disks)
6691 for node, nres in result.items():
6692 nres.Raise("Cannot resync disks on node %s" % node)
6693 node_done, node_percent = nres.payload
6694 all_done = all_done and node_done
6695 if node_percent is not None:
6696 min_percent = min(min_percent, node_percent)
6698 if min_percent < 100:
6699 self.feedback_fn(" - progress: %.1f%%" % min_percent)
6702 def _EnsureSecondary(self, node):
6703 """Demote a node to secondary.
6706 self.feedback_fn("* switching node %s to secondary mode" % node)
6708 for dev in self.instance.disks:
6709 self.cfg.SetDiskID(dev, node)
6711 result = self.rpc.call_blockdev_close(node, self.instance.name,
6712 self.instance.disks)
6713 result.Raise("Cannot change disk to secondary on node %s" % node)
6715 def _GoStandalone(self):
6716 """Disconnect from the network.
6719 self.feedback_fn("* changing into standalone mode")
6720 result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
6721 self.instance.disks)
6722 for node, nres in result.items():
6723 nres.Raise("Cannot disconnect disks node %s" % node)
6725 def _GoReconnect(self, multimaster):
6726 """Reconnect to the network.
6732 msg = "single-master"
6733 self.feedback_fn("* changing disks into %s mode" % msg)
6734 result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
6735 self.instance.disks,
6736 self.instance.name, multimaster)
6737 for node, nres in result.items():
6738 nres.Raise("Cannot change disks config on node %s" % node)
6740 def _ExecCleanup(self):
6741 """Try to cleanup after a failed migration.
6743 The cleanup is done by:
6744 - check that the instance is running only on one node
6745 (and update the config if needed)
6746 - change disks on its secondary node to secondary
6747 - wait until disks are fully synchronized
6748 - disconnect from the network
6749 - change disks into single-master mode
6750 - wait again until disks are fully synchronized
6753 instance = self.instance
6754 target_node = self.target_node
6755 source_node = self.source_node
6757 # check running on only one node
6758 self.feedback_fn("* checking where the instance actually runs"
6759 " (if this hangs, the hypervisor might be in"
6761 ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
6762 for node, result in ins_l.items():
6763 result.Raise("Can't contact node %s" % node)
6765 runningon_source = instance.name in ins_l[source_node].payload
6766 runningon_target = instance.name in ins_l[target_node].payload
6768 if runningon_source and runningon_target:
6769 raise errors.OpExecError("Instance seems to be running on two nodes,"
6770 " or the hypervisor is confused; you will have"
6771 " to ensure manually that it runs only on one"
6772 " and restart this operation")
6774 if not (runningon_source or runningon_target):
6775 raise errors.OpExecError("Instance does not seem to be running at all;"
6776 " in this case it's safer to repair by"
6777 " running 'gnt-instance stop' to ensure disk"
6778 " shutdown, and then restarting it")
6780 if runningon_target:
6781 # the migration has actually succeeded, we need to update the config
6782 self.feedback_fn("* instance running on secondary node (%s),"
6783 " updating config" % target_node)
6784 instance.primary_node = target_node
6785 self.cfg.Update(instance, self.feedback_fn)
6786 demoted_node = source_node
6788 self.feedback_fn("* instance confirmed to be running on its"
6789 " primary node (%s)" % source_node)
6790 demoted_node = target_node
6792 if instance.disk_template in constants.DTS_INT_MIRROR:
6793 self._EnsureSecondary(demoted_node)
6795 self._WaitUntilSync()
6796 except errors.OpExecError:
6797 # we ignore here errors, since if the device is standalone, it
6798 # won't be able to sync
6800 self._GoStandalone()
6801 self._GoReconnect(False)
6802 self._WaitUntilSync()
6804 self.feedback_fn("* done")
6806 def _RevertDiskStatus(self):
6807 """Try to revert the disk status after a failed migration.
6810 target_node = self.target_node
    if self.instance.disk_template in constants.DTS_EXT_MIRROR:
      return

    try:
      self._EnsureSecondary(target_node)
6816 self._GoStandalone()
6817 self._GoReconnect(False)
6818 self._WaitUntilSync()
6819 except errors.OpExecError, err:
6820 self.lu.LogWarning("Migration failed and I can't reconnect the drives,"
6821 " please try to recover the instance manually;"
6822 " error '%s'" % str(err))
6824 def _AbortMigration(self):
6825 """Call the hypervisor code to abort a started migration.
6828 instance = self.instance
6829 target_node = self.target_node
6830 migration_info = self.migration_info
    abort_result = self.rpc.call_finalize_migration(target_node,
                                                    instance,
                                                    migration_info,
                                                    False)
    abort_msg = abort_result.fail_msg
    if abort_msg:
      logging.error("Aborting migration failed on target node %s: %s",
6839 target_node, abort_msg)
6840 # Don't raise an exception here, as we stil have to try to revert the
6841 # disk status, even if this step failed.
6843 def _ExecMigration(self):
6844 """Migrate an instance.
6846 The migrate is done by:
6847 - change the disks into dual-master mode
6848 - wait until disks are fully synchronized again
6849 - migrate the instance
6850 - change disks on the new secondary node (the old primary) to secondary
6851 - wait until disks are fully synchronized
6852 - change disks into single-master mode
6855 instance = self.instance
6856 target_node = self.target_node
6857 source_node = self.source_node
6859 self.feedback_fn("* checking disk consistency between source and target")
6860 for dev in instance.disks:
6861 if not _CheckDiskConsistency(self.lu, dev, target_node, False):
6862 raise errors.OpExecError("Disk %s is degraded or not fully"
6863 " synchronized on target node,"
6864 " aborting migration" % dev.iv_name)
6866 # First get the migration information from the remote node
6867 result = self.rpc.call_migration_info(source_node, instance)
6868 msg = result.fail_msg
6870 log_err = ("Failed fetching source migration information from %s: %s" %
6872 logging.error(log_err)
6873 raise errors.OpExecError(log_err)
6875 self.migration_info = migration_info = result.payload
6877 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
6878 # Then switch the disks to master/master mode
6879 self._EnsureSecondary(target_node)
6880 self._GoStandalone()
6881 self._GoReconnect(True)
6882 self._WaitUntilSync()
6884 self.feedback_fn("* preparing %s to accept the instance" % target_node)
    result = self.rpc.call_accept_instance(target_node,
                                           instance,
                                           migration_info,
                                           self.nodes_ip[target_node])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
6893 " disk status: %s", msg)
6894 self.feedback_fn("Pre-migration failed, aborting")
6895 self._AbortMigration()
6896 self._RevertDiskStatus()
6897 raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
6898 (instance.name, msg))
6900 self.feedback_fn("* migrating instance to %s" % target_node)
6901 result = self.rpc.call_instance_migrate(source_node, instance,
                                            self.nodes_ip[target_node],
                                            self.live)
    msg = result.fail_msg
    if msg:
6906 logging.error("Instance migration failed, trying to revert"
6907 " disk status: %s", msg)
6908 self.feedback_fn("Migration failed, aborting")
6909 self._AbortMigration()
6910 self._RevertDiskStatus()
6911 raise errors.OpExecError("Could not migrate instance %s: %s" %
6912 (instance.name, msg))
6914 instance.primary_node = target_node
6915 # distribute new instance config to the other nodes
6916 self.cfg.Update(instance, self.feedback_fn)
6918 result = self.rpc.call_finalize_migration(target_node,
6922 msg = result.fail_msg
6924 logging.error("Instance migration succeeded, but finalization failed:"
6926 raise errors.OpExecError("Could not finalize instance migration: %s" %
6929 if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
6930 self._EnsureSecondary(source_node)
6931 self._WaitUntilSync()
6932 self._GoStandalone()
6933 self._GoReconnect(False)
6934 self._WaitUntilSync()
6936 self.feedback_fn("* done")
6938 def _ExecFailover(self):
6939 """Failover an instance.
6941 The failover is done by shutting it down on its present node and
6942 starting it on the secondary.
6945 instance = self.instance
6946 primary_node = self.cfg.GetNodeInfo(instance.primary_node)
6948 source_node = instance.primary_node
6949 target_node = self.target_node
6951 if instance.admin_up:
6952 self.feedback_fn("* checking disk consistency between source and target")
6953 for dev in instance.disks:
6954 # for drbd, these are drbd over lvm
6955 if not _CheckDiskConsistency(self, dev, target_node, False):
6956 if not self.ignore_consistency:
6957 raise errors.OpExecError("Disk %s is degraded on target node,"
6958 " aborting failover" % dev.iv_name)
6960 self.feedback_fn("* not checking disk consistency as instance is not"
6963 self.feedback_fn("* shutting down instance on source node")
6964 logging.info("Shutting down instance %s on node %s",
6965 instance.name, source_node)
6967 result = self.rpc.call_instance_shutdown(source_node, instance,
6968 self.shutdown_timeout)
6969 msg = result.fail_msg
6971 if self.ignore_consistency or primary_node.offline:
6972 self.lu.LogWarning("Could not shutdown instance %s on node %s,"
6973 " proceeding anyway; please make sure node"
6974 " %s is down; error details: %s",
6975 instance.name, source_node, source_node, msg)
6977 raise errors.OpExecError("Could not shutdown instance %s on"
6979 (instance.name, source_node, msg))
6981 self.feedback_fn("* deactivating the instance's disks on source node")
6982 if not _ShutdownInstanceDisks(self, instance, ignore_primary=True):
6983 raise errors.OpExecError("Can't shut down the instance's disks.")
6985 instance.primary_node = target_node
6986 # distribute new instance config to the other nodes
6987 self.cfg.Update(instance, self.feedback_fn)
6989 # Only start the instance if it's marked as up
6990 if instance.admin_up:
6991 self.feedback_fn("* activating the instance's disks on target node")
6992 logging.info("Starting instance %s on node %s",
6993 instance.name, target_node)
      disks_ok, _ = _AssembleInstanceDisks(self, instance,
                                           ignore_secondaries=True)
      if not disks_ok:
        _ShutdownInstanceDisks(self, instance)
        raise errors.OpExecError("Can't activate the instance's disks")

      self.feedback_fn("* starting the instance on the target node")
      result = self.rpc.call_instance_start(target_node, instance, None, None)
      msg = result.fail_msg
      if msg:
        _ShutdownInstanceDisks(self, instance)
        raise errors.OpExecError("Could not start instance %s on node %s: %s" %
                                 (instance.name, target_node, msg))
7009 def Exec(self, feedback_fn):
7010 """Perform the migration.
7013 self.feedback_fn = feedback_fn
7014 self.source_node = self.instance.primary_node
7016 # FIXME: if we implement migrate-to-any in DRBD, this needs fixing
7017 if self.instance.disk_template in constants.DTS_INT_MIRROR:
7018 self.target_node = self.instance.secondary_nodes[0]
7019 # Otherwise self.target_node has been populated either
7020 # directly, or through an iallocator.
7022 self.all_nodes = [self.source_node, self.target_node]
7024 self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
7025 self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
7029 feedback_fn("Failover instance %s" % self.instance.name)
7030 self._ExecFailover()
7032 feedback_fn("Migrating instance %s" % self.instance.name)
7035 return self._ExecCleanup()
7037 return self._ExecMigration()
7040 def _CreateBlockDev(lu, node, instance, device, force_create,
7042 """Create a tree of block devices on a given node.
7044 If this device type has to be created on secondaries, create it and
7047 If not, just recurse to children keeping the same 'force' value.
7049 @param lu: the lu on whose behalf we execute
7050 @param node: the node on which to create the device
7051 @type instance: L{objects.Instance}
7052 @param instance: the instance which owns the device
7053 @type device: L{objects.Disk}
7054 @param device: the device to create
7055 @type force_create: boolean
7056 @param force_create: whether to force creation of this device; this
7057 will be change to True whenever we find a device which has
7058 CreateOnSecondary() attribute
7059 @param info: the extra 'metadata' we should attach to the device
7060 (this will be represented as a LVM tag)
7061 @type force_open: boolean
7062 @param force_open: this parameter will be passes to the
7063 L{backend.BlockdevCreate} function where it specifies
7064 whether we run on primary or not, and it affects both
7065 the child assembly and the device own Open() execution
7068 if device.CreateOnSecondary():
7072 for child in device.children:
7073 _CreateBlockDev(lu, node, instance, child, force_create,
7076 if not force_create:
7079 _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
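# Illustrative walk-through (assumed DRBD8-over-LVM disk, names invented): for
# a device tree of LD_DRBD8 -> [LD_LV data, LD_LV meta], _CreateBlockDev first
# recurses into the two LV children, raising force_create to True whenever the
# current device reports CreateOnSecondary(), and only afterwards creates the
# device itself through _CreateSingleBlockDev; if nothing forced creation on
# this node, it simply returns after the recursion.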
7082 def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
7083 """Create a single block device on a given node.
7085 This will not recurse over children of the device, so they must be
7088 @param lu: the lu on whose behalf we execute
7089 @param node: the node on which to create the device
7090 @type instance: L{objects.Instance}
7091 @param instance: the instance which owns the device
7092 @type device: L{objects.Disk}
7093 @param device: the device to create
7094 @param info: the extra 'metadata' we should attach to the device
7095 (this will be represented as a LVM tag)
7096 @type force_open: boolean
7097 @param force_open: this parameter will be passes to the
7098 L{backend.BlockdevCreate} function where it specifies
7099 whether we run on primary or not, and it affects both
7100 the child assembly and the device own Open() execution
7103 lu.cfg.SetDiskID(device, node)
7104 result = lu.rpc.call_blockdev_create(node, device, device.size,
7105 instance.name, force_open, info)
7106 result.Raise("Can't create block device %s on"
7107 " node %s for instance %s" % (device, node, instance.name))
7108 if device.physical_id is None:
7109 device.physical_id = result.payload
7112 def _GenerateUniqueNames(lu, exts):
7113 """Generate a suitable LV name.
  This will generate a logical volume name for the given instance.

  """
  results = []
  for val in exts:
    new_id = lu.cfg.GenerateUniqueID(lu.proc.GetECId())
    results.append("%s%s" % (new_id, val))
  return results
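# Illustrative example (placeholder IDs): with exts == [".disk0", ".disk1"]
# this returns one "<unique-id><ext>" string per extension, conceptually
# ["<uuid>.disk0", "<uuid>.disk1"]; the DRBD8 branch of _GenerateDiskTemplate
# below then appends "_data"/"_meta" to such prefixes for the backing LVs.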
7125 def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
7126 iv_name, p_minor, s_minor):
7127 """Generate a drbd8 device complete with its children.
7130 assert len(vgnames) == len(names) == 2
7131 port = lu.cfg.AllocatePort()
7132 shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
7133 dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
7134 logical_id=(vgnames[0], names[0]))
7135 dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
7136 logical_id=(vgnames[1], names[1]))
  drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
                          logical_id=(primary, secondary, port,
                                      p_minor, s_minor,
                                      shared_secret),
                          children=[dev_data, dev_meta],
                          iv_name=iv_name)
  return drbd_dev
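# Illustrative result (all values invented): the returned objects.Disk looks
# roughly like
#   Disk(dev_type=LD_DRBD8, size=10240,
#        logical_id=("node1.example.com", "node2.example.com", 11000,
#                    0, 1, "<shared secret>"),
#        children=[<10240 MiB data LV>, <128 MiB metadata LV>],
#        iv_name="disk/0")
# i.e. the logical_id carries both nodes, the DRBD port, the two minors and
# the shared secret, while the backing LVs live in the given volume groups.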
7146 def _GenerateDiskTemplate(lu, template_name,
7147 instance_name, primary_node,
7148 secondary_nodes, disk_info,
7149 file_storage_dir, file_driver,
7150 base_index, feedback_fn):
7151 """Generate the entire disk layout for a given template type.
7154 #TODO: compute space requirements
7156 vgname = lu.cfg.GetVGName()
7157 disk_count = len(disk_info)
7159 if template_name == constants.DT_DISKLESS:
7161 elif template_name == constants.DT_PLAIN:
7162 if len(secondary_nodes) != 0:
7163 raise errors.ProgrammerError("Wrong template configuration")
7165 names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7166 for i in range(disk_count)])
7167 for idx, disk in enumerate(disk_info):
7168 disk_index = idx + base_index
7169 vg = disk.get(constants.IDISK_VG, vgname)
7170 feedback_fn("* disk %i, vg %s, name %s" % (idx, vg, names[idx]))
7171 disk_dev = objects.Disk(dev_type=constants.LD_LV,
7172 size=disk[constants.IDISK_SIZE],
7173 logical_id=(vg, names[idx]),
7174 iv_name="disk/%d" % disk_index,
7175 mode=disk[constants.IDISK_MODE])
7176 disks.append(disk_dev)
7177 elif template_name == constants.DT_DRBD8:
7178 if len(secondary_nodes) != 1:
7179 raise errors.ProgrammerError("Wrong template configuration")
7180 remote_node = secondary_nodes[0]
7181 minors = lu.cfg.AllocateDRBDMinor(
7182 [primary_node, remote_node] * len(disk_info), instance_name)
7185 for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
7186 for i in range(disk_count)]):
7187 names.append(lv_prefix + "_data")
7188 names.append(lv_prefix + "_meta")
7189 for idx, disk in enumerate(disk_info):
7190 disk_index = idx + base_index
7191 data_vg = disk.get(constants.IDISK_VG, vgname)
7192 meta_vg = disk.get(constants.IDISK_METAVG, data_vg)
7193 disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
7194 disk[constants.IDISK_SIZE],
7196 names[idx * 2:idx * 2 + 2],
7197 "disk/%d" % disk_index,
7198 minors[idx * 2], minors[idx * 2 + 1])
7199 disk_dev.mode = disk[constants.IDISK_MODE]
7200 disks.append(disk_dev)
7201 elif template_name == constants.DT_FILE:
7202 if len(secondary_nodes) != 0:
7203 raise errors.ProgrammerError("Wrong template configuration")
7205 opcodes.RequireFileStorage()
7207 for idx, disk in enumerate(disk_info):
7208 disk_index = idx + base_index
7209 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7210 size=disk[constants.IDISK_SIZE],
7211 iv_name="disk/%d" % disk_index,
7212 logical_id=(file_driver,
7213 "%s/disk%d" % (file_storage_dir,
7215 mode=disk[constants.IDISK_MODE])
7216 disks.append(disk_dev)
7217 elif template_name == constants.DT_SHARED_FILE:
7218 if len(secondary_nodes) != 0:
7219 raise errors.ProgrammerError("Wrong template configuration")
7221 opcodes.RequireSharedFileStorage()
7223 for idx, disk in enumerate(disk_info):
7224 disk_index = idx + base_index
7225 disk_dev = objects.Disk(dev_type=constants.LD_FILE,
7226 size=disk[constants.IDISK_SIZE],
7227 iv_name="disk/%d" % disk_index,
7228 logical_id=(file_driver,
7229 "%s/disk%d" % (file_storage_dir,
7231 mode=disk[constants.IDISK_MODE])
7232 disks.append(disk_dev)
7233 elif template_name == constants.DT_BLOCK:
7234 if len(secondary_nodes) != 0:
7235 raise errors.ProgrammerError("Wrong template configuration")
7237 for idx, disk in enumerate(disk_info):
7238 disk_index = idx + base_index
7239 disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV,
7240 size=disk[constants.IDISK_SIZE],
7241 logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
7242 disk[constants.IDISK_ADOPT]),
7243 iv_name="disk/%d" % disk_index,
7244 mode=disk[constants.IDISK_MODE])
7245 disks.append(disk_dev)
7248 raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
7252 def _GetInstanceInfoText(instance):
7253 """Compute that text that should be added to the disk's metadata.
7256 return "originstname+%s" % instance.name
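# Illustrative example: for an instance named "inst1.example.com" the disks
# are tagged with the string "originstname+inst1.example.com".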
7259 def _CalcEta(time_taken, written, total_size):
7260 """Calculates the ETA based on size written and total size.
7262 @param time_taken: The time taken so far
7263 @param written: amount written so far
7264 @param total_size: The total size of data to be written
7265 @return: The remaining time in seconds
7268 avg_time = time_taken / float(written)
7269 return (total_size - written) * avg_time
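# Illustrative calculation (numbers invented): if 512 MiB of a 2048 MiB disk
# were written in 30 seconds, the average is 30 / 512.0 seconds per MiB and
# the returned ETA is (2048 - 512) * (30 / 512.0) == 90.0 seconds.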
7272 def _WipeDisks(lu, instance):
7273 """Wipes instance disks.
7275 @type lu: L{LogicalUnit}
7276 @param lu: the logical unit on whose behalf we execute
7277 @type instance: L{objects.Instance}
7278 @param instance: the instance whose disks we should create
7279 @return: the success of the wipe
7282 node = instance.primary_node
7284 for device in instance.disks:
7285 lu.cfg.SetDiskID(device, node)
7287 logging.info("Pause sync of instance %s disks", instance.name)
7288 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, True)
7290 for idx, success in enumerate(result.payload):
7292 logging.warn("pause-sync of instance %s for disks %d failed",
7296 for idx, device in enumerate(instance.disks):
7297 # The wipe size is MIN_WIPE_CHUNK_PERCENT % of the instance disk but
7298 # MAX_WIPE_CHUNK at max
7299 wipe_chunk_size = min(constants.MAX_WIPE_CHUNK, device.size / 100.0 *
7300 constants.MIN_WIPE_CHUNK_PERCENT)
7301 # we _must_ make this an int, otherwise rounding errors will
7303 wipe_chunk_size = int(wipe_chunk_size)
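    # Illustrative calculation (sizes invented, assuming a
    # MIN_WIPE_CHUNK_PERCENT of 10): for a 102400 MiB disk the percentage-based
    # chunk would be 10240 MiB, so the value used is min(MAX_WIPE_CHUNK, 10240),
    # i.e. large disks are wiped in chunks capped at MAX_WIPE_CHUNK.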
7305 lu.LogInfo("* Wiping disk %d", idx)
7306 logging.info("Wiping disk %d for instance %s, node %s using"
7307 " chunk size %s", idx, instance.name, node, wipe_chunk_size)
    offset = 0
    size = device.size
    last_output = 0
    start_time = time.time()

    while offset < size:
      wipe_size = min(wipe_chunk_size, size - offset)
      logging.debug("Wiping disk %d, offset %s, chunk %s",
                    idx, offset, wipe_size)
      result = lu.rpc.call_blockdev_wipe(node, device, offset, wipe_size)
      result.Raise("Could not wipe disk %d at offset %d for size %d" %
                   (idx, offset, wipe_size))
      now = time.time()
      offset += wipe_size
      if now - last_output >= 60:
        eta = _CalcEta(now - start_time, offset, size)
        lu.LogInfo(" - done: %.1f%% ETA: %s" %
                   (offset / float(size) * 100, utils.FormatSeconds(eta)))
        last_output = now
7329 logging.info("Resume sync of instance %s disks", instance.name)
7331 result = lu.rpc.call_blockdev_pause_resume_sync(node, instance.disks, False)
7333 for idx, success in enumerate(result.payload):
7335 lu.LogWarning("Resume sync of disk %d failed, please have a"
7336 " look at the status and troubleshoot the issue", idx)
7337 logging.warn("resume-sync of instance %s for disks %d failed",
7341 def _CreateDisks(lu, instance, to_skip=None, target_node=None):
7342 """Create all disks for an instance.
7344 This abstracts away some work from AddInstance.
7346 @type lu: L{LogicalUnit}
7347 @param lu: the logical unit on whose behalf we execute
7348 @type instance: L{objects.Instance}
7349 @param instance: the instance whose disks we should create
7351 @param to_skip: list of indices to skip
7352 @type target_node: string
7353 @param target_node: if passed, overrides the target node for creation
7355 @return: the success of the creation
7358 info = _GetInstanceInfoText(instance)
  if target_node is None:
    pnode = instance.primary_node
    all_nodes = instance.all_nodes
  else:
    pnode = target_node
    all_nodes = [pnode]
7366 if instance.disk_template in (constants.DT_FILE, constants.DT_SHARED_FILE):
7367 file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
7368 result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
7370 result.Raise("Failed to create directory '%s' on"
7371 " node %s" % (file_storage_dir, pnode))
7373 # Note: this needs to be kept in sync with adding of disks in
7374 # LUInstanceSetParams
7375 for idx, device in enumerate(instance.disks):
    if to_skip and idx in to_skip:
      continue
7378 logging.info("Creating volume %s for instance %s",
7379 device.iv_name, instance.name)
7381 for node in all_nodes:
7382 f_create = node == pnode
7383 _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
7386 def _RemoveDisks(lu, instance, target_node=None):
7387 """Remove all disks for an instance.
7389 This abstracts away some work from `AddInstance()` and
7390 `RemoveInstance()`. Note that in case some of the devices couldn't
7391 be removed, the removal will continue with the other ones (compare
7392 with `_CreateDisks()`).
7394 @type lu: L{LogicalUnit}
7395 @param lu: the logical unit on whose behalf we execute
7396 @type instance: L{objects.Instance}
7397 @param instance: the instance whose disks we should remove
7398 @type target_node: string
7399 @param target_node: used to override the node on which to remove the disks
7401 @return: the success of the removal
7404 logging.info("Removing block devices for instance %s", instance.name)
  all_result = True
  for device in instance.disks:
    if target_node:
      edata = [(target_node, device)]
    else:
      edata = device.ComputeNodeTree(instance.primary_node)
    for node, disk in edata:
      lu.cfg.SetDiskID(disk, node)
      msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
      if msg:
        lu.LogWarning("Could not remove block device %s on node %s,"
                      " continuing anyway: %s", device.iv_name, node, msg)
        all_result = False

  if instance.disk_template == constants.DT_FILE:
    file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
    if target_node:
      tgt = target_node
    else:
      tgt = instance.primary_node
    result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
    if result.fail_msg:
      lu.LogWarning("Could not remove directory '%s' on node %s: %s",
                    file_storage_dir, instance.primary_node, result.fail_msg)
      all_result = False

  return all_result
7435 def _ComputeDiskSizePerVG(disk_template, disks):
7436 """Compute disk size requirements in the volume group

  """
  def _compute(disks, payload):
    """Universal algorithm.

    """
    vgs = {}
    for disk in disks:
      vgs[disk[constants.IDISK_VG]] = \
        vgs.get(disk[constants.IDISK_VG], 0) + \
        disk[constants.IDISK_SIZE] + payload

    return vgs
7450 # Required free disk space as a function of disk and swap space
  req_size_dict = {
    constants.DT_DISKLESS: {},
    constants.DT_PLAIN: _compute(disks, 0),
    # 128 MB are added for drbd metadata for each disk
    constants.DT_DRBD8: _compute(disks, 128),
    constants.DT_FILE: {},
    constants.DT_SHARED_FILE: {},
    }
7460 if disk_template not in req_size_dict:
7461 raise errors.ProgrammerError("Disk template '%s' size requirement"
7462 " is unknown" % disk_template)
7464 return req_size_dict[disk_template]
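# Illustrative example (sizes and VG names invented): for a DRBD8 instance
# with two 1024 MiB disks in "xenvg" and one 512 MiB disk in "altvg", this
# returns {"xenvg": (1024 + 128) + (1024 + 128), "altvg": 512 + 128}, the
# extra 128 MiB per disk accounting for the DRBD metadata noted above.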
7467 def _ComputeDiskSize(disk_template, disks):
7468 """Compute disk size requirements in the volume group
7471 # Required free disk space as a function of disk and swap space
7473 constants.DT_DISKLESS: None,
7474 constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
7475 # 128 MB are added for drbd metadata for each disk
7476 constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks),
7477 constants.DT_FILE: None,
7478 constants.DT_SHARED_FILE: 0,
7479 constants.DT_BLOCK: 0,
7482 if disk_template not in req_size_dict:
7483 raise errors.ProgrammerError("Disk template '%s' size requirement"
7484 " is unknown" % disk_template)
7486 return req_size_dict[disk_template]
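# Illustrative example (sizes invented): for two disks of 1024 MiB and 512 MiB,
# DT_PLAIN requires 1024 + 512 == 1536 MiB while DT_DRBD8 requires
# (1024 + 128) + (512 + 128) == 1792 MiB; diskless and file-based templates
# have no volume group requirement here.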
7489 def _FilterVmNodes(lu, nodenames):
7490 """Filters out non-vm_capable nodes from a list.
7492 @type lu: L{LogicalUnit}
7493 @param lu: the logical unit for which we check
7494 @type nodenames: list
7495 @param nodenames: the list of nodes on which we should check
7497 @return: the list of vm-capable nodes
7500 vm_nodes = frozenset(lu.cfg.GetNonVmCapableNodeList())
7501 return [name for name in nodenames if name not in vm_nodes]
7504 def _CheckHVParams(lu, nodenames, hvname, hvparams):
7505 """Hypervisor parameter validation.
7507 This function abstract the hypervisor parameter validation to be
7508 used in both instance create and instance modify.
7510 @type lu: L{LogicalUnit}
7511 @param lu: the logical unit for which we check
7512 @type nodenames: list
7513 @param nodenames: the list of nodes on which we should check
7514 @type hvname: string
7515 @param hvname: the name of the hypervisor we should use
7516 @type hvparams: dict
7517 @param hvparams: the parameters which we need to check
7518 @raise errors.OpPrereqError: if the parameters are not valid
7521 nodenames = _FilterVmNodes(lu, nodenames)
  hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
                                                  hvname,
                                                  hvparams)
  for node in nodenames:
    info = hvinfo[node]
    if info.offline:
      continue
    info.Raise("Hypervisor parameter validation failed on node %s" % node)
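# Hedged usage sketch (caller and variable names assumed, not taken from this
# module): an LU that modified hypervisor parameters could validate the merged
# dict before committing it, e.g.
#   _CheckHVParams(self, [instance.primary_node], instance.hypervisor,
#                  cluster.FillHV(instance))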
7532 def _CheckOSParams(lu, required, nodenames, osname, osparams):
7533 """OS parameters validation.
7535 @type lu: L{LogicalUnit}
7536 @param lu: the logical unit for which we check
7537 @type required: boolean
7538 @param required: whether the validation should fail if the OS is not
7540 @type nodenames: list
7541 @param nodenames: the list of nodes on which we should check
7542 @type osname: string
7543 @param osname: the name of the hypervisor we should use
7544 @type osparams: dict
7545 @param osparams: the parameters which we need to check
7546 @raise errors.OpPrereqError: if the parameters are not valid
7549 nodenames = _FilterVmNodes(lu, nodenames)
7550 result = lu.rpc.call_os_validate(required, nodenames, osname,
7551 [constants.OS_VALIDATE_PARAMETERS],
7553 for node, nres in result.items():
7554 # we don't check for offline cases since this should be run only
7555 # against the master node and/or an instance's nodes
7556 nres.Raise("OS Parameters validation failed on node %s" % node)
7557 if not nres.payload:
7558 lu.LogInfo("OS %s not found on node %s, validation skipped",
7562 class LUInstanceCreate(LogicalUnit):
7563 """Create an instance.
7566 HPATH = "instance-add"
7567 HTYPE = constants.HTYPE_INSTANCE
7570 def CheckArguments(self):
7574 # do not require name_check to ease forward/backward compatibility
7576 if self.op.no_install and self.op.start:
7577 self.LogInfo("No-installation mode selected, disabling startup")
7578 self.op.start = False
7579 # validate/normalize the instance name
7580 self.op.instance_name = \
7581 netutils.Hostname.GetNormalizedName(self.op.instance_name)
7583 if self.op.ip_check and not self.op.name_check:
7584 # TODO: make the ip check more flexible and not depend on the name check
7585 raise errors.OpPrereqError("Cannot do IP address check without a name"
7586 " check", errors.ECODE_INVAL)
7588 # check nics' parameter names
7589 for nic in self.op.nics:
7590 utils.ForceDictType(nic, constants.INIC_PARAMS_TYPES)
7592 # check disks. parameter names and consistent adopt/no-adopt strategy
7593 has_adopt = has_no_adopt = False
7594 for disk in self.op.disks:
7595 utils.ForceDictType(disk, constants.IDISK_PARAMS_TYPES)
7596 if constants.IDISK_ADOPT in disk:
7600 if has_adopt and has_no_adopt:
7601 raise errors.OpPrereqError("Either all disks are adopted or none is",
7604 if self.op.disk_template not in constants.DTS_MAY_ADOPT:
7605 raise errors.OpPrereqError("Disk adoption is not supported for the"
7606 " '%s' disk template" %
7607 self.op.disk_template,
7609 if self.op.iallocator is not None:
7610 raise errors.OpPrereqError("Disk adoption not allowed with an"
7611 " iallocator script", errors.ECODE_INVAL)
7612 if self.op.mode == constants.INSTANCE_IMPORT:
7613 raise errors.OpPrereqError("Disk adoption not allowed for"
7614 " instance import", errors.ECODE_INVAL)
7616 if self.op.disk_template in constants.DTS_MUST_ADOPT:
7617 raise errors.OpPrereqError("Disk template %s requires disk adoption,"
7618 " but no 'adopt' parameter given" %
7619 self.op.disk_template,
7622 self.adopt_disks = has_adopt
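# Illustrative sketch of the two accepted disk specification styles checked
# above, not part of the original code (sizes and the LV name are assumptions):
#
#   non-adopting: [{constants.IDISK_SIZE: 10240}]
#   adopting:     [{constants.IDISK_SIZE: 10240,
#                   constants.IDISK_ADOPT: "xenvg/existing-data-lv"}]
#
# Mixing the two styles in a single request is rejected.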
7624 # instance name verification
7625 if self.op.name_check:
7626 self.hostname1 = netutils.GetHostname(name=self.op.instance_name)
7627 self.op.instance_name = self.hostname1.name
7628 # used in CheckPrereq for ip ping check
7629 self.check_ip = self.hostname1.ip
7631 self.check_ip = None
7633 # file storage checks
7634 if (self.op.file_driver and
7635 not self.op.file_driver in constants.FILE_DRIVER):
7636 raise errors.OpPrereqError("Invalid file driver name '%s'" %
7637 self.op.file_driver, errors.ECODE_INVAL)
7639 if self.op.file_storage_dir and os.path.isabs(self.op.file_storage_dir):
7640 raise errors.OpPrereqError("File storage directory path not absolute",
7643 ### Node/iallocator related checks
7644 _CheckIAllocatorOrNode(self, "iallocator", "pnode")
7646 if self.op.pnode is not None:
7647 if self.op.disk_template in constants.DTS_INT_MIRROR:
7648 if self.op.snode is None:
7649 raise errors.OpPrereqError("The networked disk templates need"
7650 " a mirror node", errors.ECODE_INVAL)
7652 self.LogWarning("Secondary node will be ignored on non-mirrored disk"
7654 self.op.snode = None
7656 self._cds = _GetClusterDomainSecret()
7658 if self.op.mode == constants.INSTANCE_IMPORT:
7659 # On import, force_variant must be True: if the variant was forced at the
7660 # initial install, our only chance when importing it back is to force it again.
7662 self.op.force_variant = True
7664 if self.op.no_install:
7665 self.LogInfo("No-installation mode has no effect during import")
7667 elif self.op.mode == constants.INSTANCE_CREATE:
7668 if self.op.os_type is None:
7669 raise errors.OpPrereqError("No guest OS specified",
7671 if self.op.os_type in self.cfg.GetClusterInfo().blacklisted_os:
7672 raise errors.OpPrereqError("Guest OS '%s' is not allowed for"
7673 " installation" % self.op.os_type,
7675 if self.op.disk_template is None:
7676 raise errors.OpPrereqError("No disk template specified",
7679 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
7680 # Check handshake to ensure both clusters have the same domain secret
7681 src_handshake = self.op.source_handshake
7682 if not src_handshake:
7683 raise errors.OpPrereqError("Missing source handshake",
7686 errmsg = masterd.instance.CheckRemoteExportHandshake(self._cds,
7689 raise errors.OpPrereqError("Invalid handshake: %s" % errmsg,
7692 # Load and check source CA
7693 self.source_x509_ca_pem = self.op.source_x509_ca
7694 if not self.source_x509_ca_pem:
7695 raise errors.OpPrereqError("Missing source X509 CA",
7699 (cert, _) = utils.LoadSignedX509Certificate(self.source_x509_ca_pem,
7701 except OpenSSL.crypto.Error, err:
7702 raise errors.OpPrereqError("Unable to load source X509 CA (%s)" %
7703 (err, ), errors.ECODE_INVAL)
7705 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
7706 if errcode is not None:
7707 raise errors.OpPrereqError("Invalid source X509 CA (%s)" % (msg, ),
7710 self.source_x509_ca = cert
7712 src_instance_name = self.op.source_instance_name
7713 if not src_instance_name:
7714 raise errors.OpPrereqError("Missing source instance name",
7717 self.source_instance_name = \
7718 netutils.GetHostname(name=src_instance_name).name
7721 raise errors.OpPrereqError("Invalid instance creation mode %r" %
7722 self.op.mode, errors.ECODE_INVAL)
7724 def ExpandNames(self):
7725 """ExpandNames for CreateInstance.
7727 Figure out the right locks for instance creation.
7730 self.needed_locks = {}
7732 instance_name = self.op.instance_name
7733 # this is just a preventive check, but someone might still add this
7734 # instance in the meantime, and creation will fail at lock-add time
7735 if instance_name in self.cfg.GetInstanceList():
7736 raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
7737 instance_name, errors.ECODE_EXISTS)
7739 self.add_locks[locking.LEVEL_INSTANCE] = instance_name
7741 if self.op.iallocator:
7742 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7744 self.op.pnode = _ExpandNodeName(self.cfg, self.op.pnode)
7745 nodelist = [self.op.pnode]
7746 if self.op.snode is not None:
7747 self.op.snode = _ExpandNodeName(self.cfg, self.op.snode)
7748 nodelist.append(self.op.snode)
7749 self.needed_locks[locking.LEVEL_NODE] = nodelist
7751 # in case of import lock the source node too
7752 if self.op.mode == constants.INSTANCE_IMPORT:
7753 src_node = self.op.src_node
7754 src_path = self.op.src_path
7756 if src_path is None:
7757 self.op.src_path = src_path = self.op.instance_name
7759 if src_node is None:
7760 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
7761 self.op.src_node = None
7762 if os.path.isabs(src_path):
7763 raise errors.OpPrereqError("Importing an instance from an absolute"
7764 " path requires a source node option",
7767 self.op.src_node = src_node = _ExpandNodeName(self.cfg, src_node)
7768 if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
7769 self.needed_locks[locking.LEVEL_NODE].append(src_node)
7770 if not os.path.isabs(src_path):
7771 self.op.src_path = src_path = \
7772 utils.PathJoin(constants.EXPORT_DIR, src_path)
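# Rough sketch of the node locks this method ends up requesting, not part of
# the original code (node names are assumptions):
#
#   iallocator given:          needed_locks[locking.LEVEL_NODE] == locking.ALL_SET
#   explicit pnode/snode:      needed_locks[locking.LEVEL_NODE] == ["node1", "node2"]
#   import with known src_node: the source node is appended to the list above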
7774 def _RunAllocator(self):
7775 """Run the allocator based on input opcode.
7778 nics = [n.ToDict() for n in self.nics]
7779 ial = IAllocator(self.cfg, self.rpc,
7780 mode=constants.IALLOCATOR_MODE_ALLOC,
7781 name=self.op.instance_name,
7782 disk_template=self.op.disk_template,
7785 vcpus=self.be_full[constants.BE_VCPUS],
7786 mem_size=self.be_full[constants.BE_MEMORY],
7789 hypervisor=self.op.hypervisor,
7792 ial.Run(self.op.iallocator)
7795 raise errors.OpPrereqError("Can't compute nodes using"
7796 " iallocator '%s': %s" %
7797 (self.op.iallocator, ial.info),
7799 if len(ial.result) != ial.required_nodes:
7800 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
7801 " of nodes (%s), required %s" %
7802 (self.op.iallocator, len(ial.result),
7803 ial.required_nodes), errors.ECODE_FAULT)
7804 self.op.pnode = ial.result[0]
7805 self.LogInfo("Selected nodes for instance %s via iallocator %s: %s",
7806 self.op.instance_name, self.op.iallocator,
7807 utils.CommaJoin(ial.result))
7808 if ial.required_nodes == 2:
7809 self.op.snode = ial.result[1]
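# Illustrative sketch of a successful allocator run, not part of the original
# code (node names are assumptions): for a DRBD8 instance two nodes are
# required, so afterwards we would typically have
#
#   ial.result == ["node1.example.com", "node2.example.com"]
#   self.op.pnode == "node1.example.com"
#   self.op.snode == "node2.example.com"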
7811 def BuildHooksEnv(self):
7814 This runs on master, primary and secondary nodes of the instance.
7818 "ADD_MODE": self.op.mode,
7820 if self.op.mode == constants.INSTANCE_IMPORT:
7821 env["SRC_NODE"] = self.op.src_node
7822 env["SRC_PATH"] = self.op.src_path
7823 env["SRC_IMAGES"] = self.src_images
7825 env.update(_BuildInstanceHookEnv(
7826 name=self.op.instance_name,
7827 primary_node=self.op.pnode,
7828 secondary_nodes=self.secondaries,
7829 status=self.op.start,
7830 os_type=self.op.os_type,
7831 memory=self.be_full[constants.BE_MEMORY],
7832 vcpus=self.be_full[constants.BE_VCPUS],
7833 nics=_NICListToTuple(self, self.nics),
7834 disk_template=self.op.disk_template,
7835 disks=[(d[constants.IDISK_SIZE], d[constants.IDISK_MODE])
7836 for d in self.disks],
7839 hypervisor_name=self.op.hypervisor,
7844 def BuildHooksNodes(self):
7845 """Build hooks nodes.
7848 nl = [self.cfg.GetMasterNode(), self.op.pnode] + self.secondaries
7851 def _ReadExportInfo(self):
7852 """Reads the export information from disk.
7854 It will override the opcode source node and path with the actual
7855 information, if these two were not specified before.
7857 @return: the export information
7860 assert self.op.mode == constants.INSTANCE_IMPORT
7862 src_node = self.op.src_node
7863 src_path = self.op.src_path
7865 if src_node is None:
7866 locked_nodes = self.glm.list_owned(locking.LEVEL_NODE)
7867 exp_list = self.rpc.call_export_list(locked_nodes)
7869 for node in exp_list:
7870 if exp_list[node].fail_msg:
7872 if src_path in exp_list[node].payload:
7874 self.op.src_node = src_node = node
7875 self.op.src_path = src_path = utils.PathJoin(constants.EXPORT_DIR,
7879 raise errors.OpPrereqError("No export found for relative path %s" %
7880 src_path, errors.ECODE_INVAL)
7882 _CheckNodeOnline(self, src_node)
7883 result = self.rpc.call_export_info(src_node, src_path)
7884 result.Raise("No export or invalid export found in dir %s" % src_path)
7886 export_info = objects.SerializableConfigParser.Loads(str(result.payload))
7887 if not export_info.has_section(constants.INISECT_EXP):
7888 raise errors.ProgrammerError("Corrupted export config",
7889 errors.ECODE_ENVIRON)
7891 ei_version = export_info.get(constants.INISECT_EXP, "version")
7892 if (int(ei_version) != constants.EXPORT_VERSION):
7893 raise errors.OpPrereqError("Wrong export version %s (wanted %d)" %
7894 (ei_version, constants.EXPORT_VERSION),
7895 errors.ECODE_ENVIRON)
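# Rough sketch of the export file layout read here and in _ReadExportParams,
# not part of the original code; the concrete section headers and values are
# assumptions, only the option names come from the code itself:
#
#   [export]                        ; constants.INISECT_EXP
#   version = 0                     ; must match constants.EXPORT_VERSION
#
#   [instance]                      ; constants.INISECT_INS
#   name = instance1.example.com
#   os = debootstrap
#   disk_template = drbd
#   disk_count = 1
#   disk0_size = 10240
#   disk0_dump = disk0.snap
#   nic_count = 1
#   nic0_mac = aa:00:00:12:34:56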
7898 def _ReadExportParams(self, einfo):
7899 """Use export parameters as defaults.
7901 If the opcode does not override some instance parameters, try to take
7902 them from the export information, where available.
7906 self.op.os_type = einfo.get(constants.INISECT_EXP, "os")
7908 if self.op.disk_template is None:
7909 if einfo.has_option(constants.INISECT_INS, "disk_template"):
7910 self.op.disk_template = einfo.get(constants.INISECT_INS,
7913 raise errors.OpPrereqError("No disk template specified and the export"
7914 " is missing the disk_template information",
7917 if not self.op.disks:
7918 if einfo.has_option(constants.INISECT_INS, "disk_count"):
7920 # TODO: import the disk iv_name too
7921 for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
7922 disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
7923 disks.append({constants.IDISK_SIZE: disk_sz})
7924 self.op.disks = disks
7926 raise errors.OpPrereqError("No disk info specified and the export"
7927 " is missing the disk information",
7930 if (not self.op.nics and
7931 einfo.has_option(constants.INISECT_INS, "nic_count")):
7933 for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
7935 for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
7936 v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
7941 if (self.op.hypervisor is None and
7942 einfo.has_option(constants.INISECT_INS, "hypervisor")):
7943 self.op.hypervisor = einfo.get(constants.INISECT_INS, "hypervisor")
7944 if einfo.has_section(constants.INISECT_HYP):
7945 # use the export parameters but do not override the ones
7946 # specified by the user
7947 for name, value in einfo.items(constants.INISECT_HYP):
7948 if name not in self.op.hvparams:
7949 self.op.hvparams[name] = value
7951 if einfo.has_section(constants.INISECT_BEP):
7952 # use the parameters, without overriding
7953 for name, value in einfo.items(constants.INISECT_BEP):
7954 if name not in self.op.beparams:
7955 self.op.beparams[name] = value
7957 # try to read the parameters old style, from the main section
7958 for name in constants.BES_PARAMETERS:
7959 if (name not in self.op.beparams and
7960 einfo.has_option(constants.INISECT_INS, name)):
7961 self.op.beparams[name] = einfo.get(constants.INISECT_INS, name)
7963 if einfo.has_section(constants.INISECT_OSP):
7964 # use the parameters, without overriding
7965 for name, value in einfo.items(constants.INISECT_OSP):
7966 if name not in self.op.osparams:
7967 self.op.osparams[name] = value
7969 def _RevertToDefaults(self, cluster):
7970 """Revert the instance parameters to the default values.
7974 hv_defs = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type, {})
7975 for name in self.op.hvparams.keys():
7976 if name in hv_defs and hv_defs[name] == self.op.hvparams[name]:
7977 del self.op.hvparams[name]
7979 be_defs = cluster.SimpleFillBE({})
7980 for name in self.op.beparams.keys():
7981 if name in be_defs and be_defs[name] == self.op.beparams[name]:
7982 del self.op.beparams[name]
7984 nic_defs = cluster.SimpleFillNIC({})
7985 for nic in self.op.nics:
7986 for name in constants.NICS_PARAMETERS:
7987 if name in nic and name in nic_defs and nic[name] == nic_defs[name]:
7990 os_defs = cluster.SimpleFillOS(self.op.os_type, {})
7991 for name in self.op.osparams.keys():
7992 if name in os_defs and os_defs[name] == self.op.osparams[name]:
7993 del self.op.osparams[name]
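# Illustrative sketch of the effect, not part of the original code (values
# are assumptions): with a cluster default of memory=128, an imported
#
#   self.op.beparams == {"memory": 128, "vcpus": 4}
#
# is reduced to {"vcpus": 4}, so the instance keeps following the cluster
# default for memory instead of pinning the value at import time.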
7995 def CheckPrereq(self):
7996 """Check prerequisites.
7999 if self.op.mode == constants.INSTANCE_IMPORT:
8000 export_info = self._ReadExportInfo()
8001 self._ReadExportParams(export_info)
8003 if (not self.cfg.GetVGName() and
8004 self.op.disk_template not in constants.DTS_NOT_LVM):
8005 raise errors.OpPrereqError("Cluster does not support lvm-based"
8006 " instances", errors.ECODE_STATE)
8008 if self.op.hypervisor is None:
8009 self.op.hypervisor = self.cfg.GetHypervisorType()
8011 cluster = self.cfg.GetClusterInfo()
8012 enabled_hvs = cluster.enabled_hypervisors
8013 if self.op.hypervisor not in enabled_hvs:
8014 raise errors.OpPrereqError("Selected hypervisor (%s) not enabled in the"
8015 " cluster (%s)" % (self.op.hypervisor,
8016 ",".join(enabled_hvs)),
8019 # check hypervisor parameter syntax (locally)
8020 utils.ForceDictType(self.op.hvparams, constants.HVS_PARAMETER_TYPES)
8021 filled_hvp = cluster.SimpleFillHV(self.op.hypervisor, self.op.os_type,
8023 hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
8024 hv_type.CheckParameterSyntax(filled_hvp)
8025 self.hv_full = filled_hvp
8026 # check that we don't specify global parameters on an instance
8027 _CheckGlobalHvParams(self.op.hvparams)
8029 # fill and remember the beparams dict
8030 utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
8031 self.be_full = cluster.SimpleFillBE(self.op.beparams)
8033 # build os parameters
8034 self.os_full = cluster.SimpleFillOS(self.op.os_type, self.op.osparams)
8036 # now that hvp/bep are in final format, let's reset to defaults, if requested
8038 if self.op.identify_defaults:
8039 self._RevertToDefaults(cluster)
8043 for idx, nic in enumerate(self.op.nics):
8044 nic_mode_req = nic.get(constants.INIC_MODE, None)
8045 nic_mode = nic_mode_req
8046 if nic_mode is None:
8047 nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
8049 # in routed mode, for the first nic, the default ip is 'auto'
8050 if nic_mode == constants.NIC_MODE_ROUTED and idx == 0:
8051 default_ip_mode = constants.VALUE_AUTO
8053 default_ip_mode = constants.VALUE_NONE
8055 # ip validity checks
8056 ip = nic.get(constants.INIC_IP, default_ip_mode)
8057 if ip is None or ip.lower() == constants.VALUE_NONE:
8059 elif ip.lower() == constants.VALUE_AUTO:
8060 if not self.op.name_check:
8061 raise errors.OpPrereqError("IP address set to auto but name checks"
8062 " have been skipped",
8064 nic_ip = self.hostname1.ip
8066 if not netutils.IPAddress.IsValid(ip):
8067 raise errors.OpPrereqError("Invalid IP address '%s'" % ip,
8071 # TODO: check the ip address for uniqueness
8072 if nic_mode == constants.NIC_MODE_ROUTED and not nic_ip:
8073 raise errors.OpPrereqError("Routed nic mode requires an ip address",
8076 # MAC address verification
8077 mac = nic.get(constants.INIC_MAC, constants.VALUE_AUTO)
8078 if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8079 mac = utils.NormalizeAndValidateMac(mac)
8082 self.cfg.ReserveMAC(mac, self.proc.GetECId())
8083 except errors.ReservationError:
8084 raise errors.OpPrereqError("MAC address %s already in use"
8085 " in cluster" % mac,
8086 errors.ECODE_NOTUNIQUE)
8088 # Build nic parameters
8089 link = nic.get(constants.INIC_LINK, None)
8092 nicparams[constants.NIC_MODE] = nic_mode_req
8094 nicparams[constants.NIC_LINK] = link
8096 check_params = cluster.SimpleFillNIC(nicparams)
8097 objects.NIC.CheckParameterSyntax(check_params)
8098 self.nics.append(objects.NIC(mac=mac, ip=nic_ip, nicparams=nicparams))
8100 # disk checks/pre-build
8101 default_vg = self.cfg.GetVGName()
8103 for disk in self.op.disks:
8104 mode = disk.get(constants.IDISK_MODE, constants.DISK_RDWR)
8105 if mode not in constants.DISK_ACCESS_SET:
8106 raise errors.OpPrereqError("Invalid disk access mode '%s'" %
8107 mode, errors.ECODE_INVAL)
8108 size = disk.get(constants.IDISK_SIZE, None)
8110 raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
8113 except (TypeError, ValueError):
8114 raise errors.OpPrereqError("Invalid disk size '%s'" % size,
8117 data_vg = disk.get(constants.IDISK_VG, default_vg)
8119 constants.IDISK_SIZE: size,
8120 constants.IDISK_MODE: mode,
8121 constants.IDISK_VG: data_vg,
8122 constants.IDISK_METAVG: disk.get(constants.IDISK_METAVG, data_vg),
8124 if constants.IDISK_ADOPT in disk:
8125 new_disk[constants.IDISK_ADOPT] = disk[constants.IDISK_ADOPT]
8126 self.disks.append(new_disk)
8128 if self.op.mode == constants.INSTANCE_IMPORT:
8130 # Check that the new instance doesn't have fewer disks than the export
8131 instance_disks = len(self.disks)
8132 export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
8133 if instance_disks < export_disks:
8134 raise errors.OpPrereqError("Not enough disks to import."
8135 " (instance: %d, export: %d)" %
8136 (instance_disks, export_disks),
8140 for idx in range(export_disks):
8141 option = 'disk%d_dump' % idx
8142 if export_info.has_option(constants.INISECT_INS, option):
8143 # FIXME: are the old OSes, disk sizes, etc. useful?
8144 export_name = export_info.get(constants.INISECT_INS, option)
8145 image = utils.PathJoin(self.op.src_path, export_name)
8146 disk_images.append(image)
8148 disk_images.append(False)
8150 self.src_images = disk_images
8152 old_name = export_info.get(constants.INISECT_INS, 'name')
8154 exp_nic_count = export_info.getint(constants.INISECT_INS, 'nic_count')
8155 except (TypeError, ValueError), err:
8156 raise errors.OpPrereqError("Invalid export file, nic_count is not"
8157 " an integer: %s" % str(err),
8159 if self.op.instance_name == old_name:
8160 for idx, nic in enumerate(self.nics):
8161 if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
8162 nic_mac_ini = 'nic%d_mac' % idx
8163 nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
8165 # ENDIF: self.op.mode == constants.INSTANCE_IMPORT
8167 # ip ping checks (we use the same ip that was resolved in ExpandNames)
8168 if self.op.ip_check:
8169 if netutils.TcpPing(self.check_ip, constants.DEFAULT_NODED_PORT):
8170 raise errors.OpPrereqError("IP %s of instance %s already in use" %
8171 (self.check_ip, self.op.instance_name),
8172 errors.ECODE_NOTUNIQUE)
8174 #### mac address generation
8175 # By generating the MAC address here, both the allocator and the hooks get
8176 # the real final mac address rather than the 'auto' or 'generate' value.
8177 # There is a race condition between the generation and the instance object
8178 # creation, which means that we know the mac is valid now, but we're not
8179 # sure it will be when we actually add the instance. If things go bad
8180 # adding the instance will abort because of a duplicate mac, and the
8181 # creation job will fail.
8182 for nic in self.nics:
8183 if nic.mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
8184 nic.mac = self.cfg.GenerateMAC(self.proc.GetECId())
8188 if self.op.iallocator is not None:
8189 self._RunAllocator()
8191 #### node related checks
8193 # check primary node
8194 self.pnode = pnode = self.cfg.GetNodeInfo(self.op.pnode)
8195 assert self.pnode is not None, \
8196 "Cannot retrieve locked node %s" % self.op.pnode
8198 raise errors.OpPrereqError("Cannot use offline primary node '%s'" %
8199 pnode.name, errors.ECODE_STATE)
8201 raise errors.OpPrereqError("Cannot use drained primary node '%s'" %
8202 pnode.name, errors.ECODE_STATE)
8203 if not pnode.vm_capable:
8204 raise errors.OpPrereqError("Cannot use non-vm_capable primary node"
8205 " '%s'" % pnode.name, errors.ECODE_STATE)
8207 self.secondaries = []
8209 # mirror node verification
8210 if self.op.disk_template in constants.DTS_INT_MIRROR:
8211 if self.op.snode == pnode.name:
8212 raise errors.OpPrereqError("The secondary node cannot be the"
8213 " primary node", errors.ECODE_INVAL)
8214 _CheckNodeOnline(self, self.op.snode)
8215 _CheckNodeNotDrained(self, self.op.snode)
8216 _CheckNodeVmCapable(self, self.op.snode)
8217 self.secondaries.append(self.op.snode)
8219 nodenames = [pnode.name] + self.secondaries
8221 if not self.adopt_disks:
8222 # Check lv size requirements, if not adopting
8223 req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
8224 _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
8226 elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
8227 all_lvs = set(["%s/%s" % (disk[constants.IDISK_VG],
8228 disk[constants.IDISK_ADOPT])
8229 for disk in self.disks])
8230 if len(all_lvs) != len(self.disks):
8231 raise errors.OpPrereqError("Duplicate volume names given for adoption",
8233 for lv_name in all_lvs:
8235 # FIXME: lv_name here is "vg/lv"; we need to ensure that other calls
8236 # to ReserveLV use the same syntax
8237 self.cfg.ReserveLV(lv_name, self.proc.GetECId())
8238 except errors.ReservationError:
8239 raise errors.OpPrereqError("LV named %s used by another instance" %
8240 lv_name, errors.ECODE_NOTUNIQUE)
8242 vg_names = self.rpc.call_vg_list([pnode.name])[pnode.name]
8243 vg_names.Raise("Cannot get VG information from node %s" % pnode.name)
8245 node_lvs = self.rpc.call_lv_list([pnode.name],
8246 vg_names.payload.keys())[pnode.name]
8247 node_lvs.Raise("Cannot get LV information from node %s" % pnode.name)
8248 node_lvs = node_lvs.payload
8250 delta = all_lvs.difference(node_lvs.keys())
8252 raise errors.OpPrereqError("Missing logical volume(s): %s" %
8253 utils.CommaJoin(delta),
8255 online_lvs = [lv for lv in all_lvs if node_lvs[lv][2]]
8257 raise errors.OpPrereqError("Online logical volumes found, cannot"
8258 " adopt: %s" % utils.CommaJoin(online_lvs),
8260 # update the size of disk based on what is found
8261 for dsk in self.disks:
8262 dsk[constants.IDISK_SIZE] = \
8263 int(float(node_lvs["%s/%s" % (dsk[constants.IDISK_VG],
8264 dsk[constants.IDISK_ADOPT])][0]))
8266 elif self.op.disk_template == constants.DT_BLOCK:
8267 # Normalize and de-duplicate device paths
8268 all_disks = set([os.path.abspath(disk[constants.IDISK_ADOPT])
8269 for disk in self.disks])
8270 if len(all_disks) != len(self.disks):
8271 raise errors.OpPrereqError("Duplicate disk names given for adoption",
8273 baddisks = [d for d in all_disks
8274 if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
8276 raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
8277 " cannot be adopted" %
8278 (", ".join(baddisks),
8279 constants.ADOPTABLE_BLOCKDEV_ROOT),
8282 node_disks = self.rpc.call_bdev_sizes([pnode.name],
8283 list(all_disks))[pnode.name]
8284 node_disks.Raise("Cannot get block device information from node %s" %
8286 node_disks = node_disks.payload
8287 delta = all_disks.difference(node_disks.keys())
8289 raise errors.OpPrereqError("Missing block device(s): %s" %
8290 utils.CommaJoin(delta),
8292 for dsk in self.disks:
8293 dsk[constants.IDISK_SIZE] = \
8294 int(float(node_disks[dsk[constants.IDISK_ADOPT]]))
8296 _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
8298 _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
8299 # check OS parameters (remotely)
8300 _CheckOSParams(self, True, nodenames, self.op.os_type, self.os_full)
8302 _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
8304 # memory check on primary node
8306 _CheckNodeFreeMemory(self, self.pnode.name,
8307 "creating instance %s" % self.op.instance_name,
8308 self.be_full[constants.BE_MEMORY],
8311 self.dry_run_result = list(nodenames)
8313 def Exec(self, feedback_fn):
8314 """Create and add the instance to the cluster.
8317 instance = self.op.instance_name
8318 pnode_name = self.pnode.name
8320 ht_kind = self.op.hypervisor
8321 if ht_kind in constants.HTS_REQ_PORT:
8322 network_port = self.cfg.AllocatePort()
8326 if constants.ENABLE_FILE_STORAGE or constants.ENABLE_SHARED_FILE_STORAGE:
8327 # this is needed because os.path.join does not accept None arguments
8328 if self.op.file_storage_dir is None:
8329 string_file_storage_dir = ""
8331 string_file_storage_dir = self.op.file_storage_dir
8333 # build the full file storage dir path
8334 if self.op.disk_template == constants.DT_SHARED_FILE:
8335 get_fsd_fn = self.cfg.GetSharedFileStorageDir
8337 get_fsd_fn = self.cfg.GetFileStorageDir
8339 file_storage_dir = utils.PathJoin(get_fsd_fn(),
8340 string_file_storage_dir, instance)
8342 file_storage_dir = ""
8344 disks = _GenerateDiskTemplate(self,
8345 self.op.disk_template,
8346 instance, pnode_name,
8350 self.op.file_driver,
8354 iobj = objects.Instance(name=instance, os=self.op.os_type,
8355 primary_node=pnode_name,
8356 nics=self.nics, disks=disks,
8357 disk_template=self.op.disk_template,
8359 network_port=network_port,
8360 beparams=self.op.beparams,
8361 hvparams=self.op.hvparams,
8362 hypervisor=self.op.hypervisor,
8363 osparams=self.op.osparams,
8366 if self.adopt_disks:
8367 if self.op.disk_template == constants.DT_PLAIN:
8368 # rename LVs to the newly-generated names; we need to construct
8369 # 'fake' LV disks with the old data, plus the new unique_id
8370 tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
8372 for t_dsk, a_dsk in zip(tmp_disks, self.disks):
8373 rename_to.append(t_dsk.logical_id)
8374 t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk[constants.IDISK_ADOPT])
8375 self.cfg.SetDiskID(t_dsk, pnode_name)
8376 result = self.rpc.call_blockdev_rename(pnode_name,
8377 zip(tmp_disks, rename_to))
8378 result.Raise("Failed to rename adopted LVs")
8380 feedback_fn("* creating instance disks...")
8382 _CreateDisks(self, iobj)
8383 except errors.OpExecError:
8384 self.LogWarning("Device creation failed, reverting...")
8386 _RemoveDisks(self, iobj)
8388 self.cfg.ReleaseDRBDMinors(instance)
8391 feedback_fn("adding instance %s to cluster config" % instance)
8393 self.cfg.AddInstance(iobj, self.proc.GetECId())
8395 # Declare that we don't want to remove the instance lock anymore, as we've
8396 # added the instance to the config
8397 del self.remove_locks[locking.LEVEL_INSTANCE]
8399 if self.op.mode == constants.INSTANCE_IMPORT:
8400 # Release unused nodes
8401 _ReleaseLocks(self, locking.LEVEL_NODE, keep=[self.op.src_node])
8404 _ReleaseLocks(self, locking.LEVEL_NODE)
8407 if not self.adopt_disks and self.cfg.GetClusterInfo().prealloc_wipe_disks:
8408 feedback_fn("* wiping instance disks...")
8410 _WipeDisks(self, iobj)
8411 except errors.OpExecError, err:
8412 logging.exception("Wiping disks failed")
8413 self.LogWarning("Wiping instance disks failed (%s)", err)
8417 # Something is already wrong with the disks, don't do anything else
8419 elif self.op.wait_for_sync:
8420 disk_abort = not _WaitForSync(self, iobj)
8421 elif iobj.disk_template in constants.DTS_INT_MIRROR:
8422 # make sure the disks are not degraded (still sync-ing is ok)
8424 feedback_fn("* checking mirrors status")
8425 disk_abort = not _WaitForSync(self, iobj, oneshot=True)
8430 _RemoveDisks(self, iobj)
8431 self.cfg.RemoveInstance(iobj.name)
8432 # Make sure the instance lock gets removed
8433 self.remove_locks[locking.LEVEL_INSTANCE] = iobj.name
8434 raise errors.OpExecError("There are some degraded disks for"
8437 if iobj.disk_template != constants.DT_DISKLESS and not self.adopt_disks:
8438 if self.op.mode == constants.INSTANCE_CREATE:
8439 if not self.op.no_install:
8440 feedback_fn("* running the instance OS create scripts...")
8441 # FIXME: pass debug option from opcode to backend
8442 result = self.rpc.call_instance_os_add(pnode_name, iobj, False,
8443 self.op.debug_level)
8444 result.Raise("Could not add os for instance %s"
8445 " on node %s" % (instance, pnode_name))
8447 elif self.op.mode == constants.INSTANCE_IMPORT:
8448 feedback_fn("* running the instance OS import scripts...")
8452 for idx, image in enumerate(self.src_images):
8456 # FIXME: pass debug option from opcode to backend
8457 dt = masterd.instance.DiskTransfer("disk/%s" % idx,
8458 constants.IEIO_FILE, (image, ),
8459 constants.IEIO_SCRIPT,
8460 (iobj.disks[idx], idx),
8462 transfers.append(dt)
8465 masterd.instance.TransferInstanceData(self, feedback_fn,
8466 self.op.src_node, pnode_name,
8467 self.pnode.secondary_ip,
8469 if not compat.all(import_result):
8470 self.LogWarning("Some disks for instance %s on node %s were not"
8471 " imported successfully" % (instance, pnode_name))
8473 elif self.op.mode == constants.INSTANCE_REMOTE_IMPORT:
8474 feedback_fn("* preparing remote import...")
8475 # The source cluster will stop the instance before attempting to make a
8476 # connection. In some cases stopping an instance can take a long time,
8477 # hence the shutdown timeout is added to the connection timeout.
8478 connect_timeout = (constants.RIE_CONNECT_TIMEOUT +
8479 self.op.source_shutdown_timeout)
8480 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
8482 assert iobj.primary_node == self.pnode.name
8484 masterd.instance.RemoteImport(self, feedback_fn, iobj, self.pnode,
8485 self.source_x509_ca,
8486 self._cds, timeouts)
8487 if not compat.all(disk_results):
8488 # TODO: Should the instance still be started, even if some disks
8489 # failed to import (valid for local imports, too)?
8490 self.LogWarning("Some disks for instance %s on node %s were not"
8491 " imported successfully" % (instance, pnode_name))
8493 # Run rename script on newly imported instance
8494 assert iobj.name == instance
8495 feedback_fn("Running rename script for %s" % instance)
8496 result = self.rpc.call_instance_run_rename(pnode_name, iobj,
8497 self.source_instance_name,
8498 self.op.debug_level)
8500 self.LogWarning("Failed to run rename script for %s on node"
8501 " %s: %s" % (instance, pnode_name, result.fail_msg))
8504 # also checked in the prereq part
8505 raise errors.ProgrammerError("Unknown OS initialization mode '%s'"
8509 iobj.admin_up = True
8510 self.cfg.Update(iobj, feedback_fn)
8511 logging.info("Starting instance %s on node %s", instance, pnode_name)
8512 feedback_fn("* starting instance...")
8513 result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
8514 result.Raise("Could not start instance")
8516 return list(iobj.all_nodes)
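# Illustrative sketch, not part of the original module: this LU typically runs
# as the result of an OpInstanceCreate opcode, e.g. submitted via the CLI along
# the lines of (exact flags depend on the installed version):
#
#   gnt-instance add -t drbd -n node1:node2 -o debootstrap \
#     --disk 0:size=10g -B memory=512 instance1.example.com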
8519 class LUInstanceConsole(NoHooksLU):
8520 """Connect to an instance's console.
8522 This is somewhat special in that it returns the command line that
8523 you need to run on the master node in order to connect to the console.
8529 def ExpandNames(self):
8530 self._ExpandAndLockInstance()
8532 def CheckPrereq(self):
8533 """Check prerequisites.
8535 This checks that the instance is in the cluster.
8538 self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
8539 assert self.instance is not None, \
8540 "Cannot retrieve locked instance %s" % self.op.instance_name
8541 _CheckNodeOnline(self, self.instance.primary_node)
8543 def Exec(self, feedback_fn):
8544 """Connect to the console of an instance
8547 instance = self.instance
8548 node = instance.primary_node
8550 node_insts = self.rpc.call_instance_list([node],
8551 [instance.hypervisor])[node]
8552 node_insts.Raise("Can't get node information from %s" % node)
8554 if instance.name not in node_insts.payload:
8555 if instance.admin_up:
8556 state = constants.INSTST_ERRORDOWN
8558 state = constants.INSTST_ADMINDOWN
8559 raise errors.OpExecError("Instance %s is not running (state %s)" %
8560 (instance.name, state))
8562 logging.debug("Connecting to console of %s on %s", instance.name, node)
8564 return _GetInstanceConsole(self.cfg.GetClusterInfo(), instance)
8567 def _GetInstanceConsole(cluster, instance):
8568 """Returns console information for an instance.
8570 @type cluster: L{objects.Cluster}
8571 @type instance: L{objects.Instance}
8575 hyper = hypervisor.GetHypervisor(instance.hypervisor)
8576 # beparams and hvparams are passed separately, to avoid editing the
8577 # instance and then saving the defaults in the instance itself.
8578 hvparams = cluster.FillHV(instance)
8579 beparams = cluster.FillBE(instance)
8580 console = hyper.GetInstanceConsole(instance, hvparams, beparams)
8582 assert console.instance == instance.name
8583 assert console.Validate()
8585 return console.ToDict()
8588 class LUInstanceReplaceDisks(LogicalUnit):
8589 """Replace the disks of an instance.
8592 HPATH = "mirrors-replace"
8593 HTYPE = constants.HTYPE_INSTANCE
8596 def CheckArguments(self):
8597 TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
8600 def ExpandNames(self):
8601 self._ExpandAndLockInstance()
8603 assert locking.LEVEL_NODE not in self.needed_locks
8604 assert locking.LEVEL_NODEGROUP not in self.needed_locks
8606 assert self.op.iallocator is None or self.op.remote_node is None, \
8607 "Conflicting options"
8609 if self.op.remote_node is not None:
8610 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
8612 # Warning: do not remove the locking of the new secondary here
8613 # unless DRBD8.AddChildren is changed to work in parallel;
8614 # currently it doesn't since parallel invocations of
8615 # FindUnusedMinor will conflict
8616 self.needed_locks[locking.LEVEL_NODE] = [self.op.remote_node]
8617 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
8619 self.needed_locks[locking.LEVEL_NODE] = []
8620 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
8622 if self.op.iallocator is not None:
8623 # iallocator will select a new node in the same group
8624 self.needed_locks[locking.LEVEL_NODEGROUP] = []
8626 self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
8627 self.op.iallocator, self.op.remote_node,
8628 self.op.disks, False, self.op.early_release)
8630 self.tasklets = [self.replacer]
8632 def DeclareLocks(self, level):
8633 if level == locking.LEVEL_NODEGROUP:
8634 assert self.op.remote_node is None
8635 assert self.op.iallocator is not None
8636 assert not self.needed_locks[locking.LEVEL_NODEGROUP]
8638 self.share_locks[locking.LEVEL_NODEGROUP] = 1
8639 self.needed_locks[locking.LEVEL_NODEGROUP] = \
8640 self.cfg.GetInstanceNodeGroups(self.op.instance_name)
8642 elif level == locking.LEVEL_NODE:
8643 if self.op.iallocator is not None:
8644 assert self.op.remote_node is None
8645 assert not self.needed_locks[locking.LEVEL_NODE]
8647 # Lock member nodes of all locked groups
8648 self.needed_locks[locking.LEVEL_NODE] = [node_name
8649 for group_uuid in self.glm.list_owned(locking.LEVEL_NODEGROUP)
8650 for node_name in self.cfg.GetNodeGroup(group_uuid).members]
8652 self._LockInstancesNodes()
8654 def BuildHooksEnv(self):
8657 This runs on the master, the primary and all the secondaries.
8660 instance = self.replacer.instance
8662 "MODE": self.op.mode,
8663 "NEW_SECONDARY": self.op.remote_node,
8664 "OLD_SECONDARY": instance.secondary_nodes[0],
8666 env.update(_BuildInstanceHookEnvByObject(self, instance))
8669 def BuildHooksNodes(self):
8670 """Build hooks nodes.
8673 instance = self.replacer.instance
8675 self.cfg.GetMasterNode(),
8676 instance.primary_node,
8678 if self.op.remote_node is not None:
8679 nl.append(self.op.remote_node)
8682 def CheckPrereq(self):
8683 """Check prerequisites.
8686 assert (self.glm.is_owned(locking.LEVEL_NODEGROUP) or
8687 self.op.iallocator is None)
8689 owned_groups = self.glm.list_owned(locking.LEVEL_NODEGROUP)
8691 groups = self.cfg.GetInstanceNodeGroups(self.op.instance_name)
8692 if owned_groups != groups:
8693 raise errors.OpExecError("Node groups used by instance '%s' changed"
8694 " since lock was acquired, current list is %r,"
8695 " used to be '%s'" %
8696 (self.op.instance_name,
8697 utils.CommaJoin(groups),
8698 utils.CommaJoin(owned_groups)))
8700 return LogicalUnit.CheckPrereq(self)
8703 class TLReplaceDisks(Tasklet):
8704 """Replaces disks for an instance.
8706 Note: Locking is not within the scope of this class.
8709 def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
8710 disks, delay_iallocator, early_release):
8711 """Initializes this class.
8714 Tasklet.__init__(self, lu)
8717 self.instance_name = instance_name
8719 self.iallocator_name = iallocator_name
8720 self.remote_node = remote_node
8722 self.delay_iallocator = delay_iallocator
8723 self.early_release = early_release
8726 self.instance = None
8727 self.new_node = None
8728 self.target_node = None
8729 self.other_node = None
8730 self.remote_node_info = None
8731 self.node_secondary_ip = None
8734 def CheckArguments(mode, remote_node, iallocator):
8735 """Helper function for users of this class.
8738 # check for valid parameter combination
8739 if mode == constants.REPLACE_DISK_CHG:
8740 if remote_node is None and iallocator is None:
8741 raise errors.OpPrereqError("When changing the secondary either an"
8742 " iallocator script must be used or the"
8743 " new node given", errors.ECODE_INVAL)
8745 if remote_node is not None and iallocator is not None:
8746 raise errors.OpPrereqError("Give either the iallocator or the new"
8747 " secondary, not both", errors.ECODE_INVAL)
8749 elif remote_node is not None or iallocator is not None:
8750 # Not replacing the secondary
8751 raise errors.OpPrereqError("The iallocator and new node options can"
8752 " only be used when changing the"
8753 " secondary node", errors.ECODE_INVAL)
8756 def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
8757 """Compute a new secondary node using an IAllocator.
8760 ial = IAllocator(lu.cfg, lu.rpc,
8761 mode=constants.IALLOCATOR_MODE_RELOC,
8763 relocate_from=relocate_from)
8765 ial.Run(iallocator_name)
8768 raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
8769 " %s" % (iallocator_name, ial.info),
8772 if len(ial.result) != ial.required_nodes:
8773 raise errors.OpPrereqError("iallocator '%s' returned invalid number"
8774 " of nodes (%s), required %s" %
8776 len(ial.result), ial.required_nodes),
8779 remote_node_name = ial.result[0]
8781 lu.LogInfo("Selected new secondary for instance '%s': %s",
8782 instance_name, remote_node_name)
8784 return remote_node_name
8786 def _FindFaultyDisks(self, node_name):
8787 return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
8790 def _CheckDisksActivated(self, instance):
8791 """Checks if the instance disks are activated.
8793 @param instance: The instance whose disks we should check
8794 @return: True if they are activated, False otherwise
8797 nodes = instance.all_nodes
8799 for idx, dev in enumerate(instance.disks):
8801 self.lu.LogInfo("Checking disk/%d on %s", idx, node)
8802 self.cfg.SetDiskID(dev, node)
8804 result = self.rpc.call_blockdev_find(node, dev)
8808 elif result.fail_msg or not result.payload:
8813 def CheckPrereq(self):
8814 """Check prerequisites.
8816 This checks that the instance is in the cluster.
8819 self.instance = instance = self.cfg.GetInstanceInfo(self.instance_name)
8820 assert instance is not None, \
8821 "Cannot retrieve locked instance %s" % self.instance_name
8823 if instance.disk_template != constants.DT_DRBD8:
8824 raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
8825 " instances", errors.ECODE_INVAL)
8827 if len(instance.secondary_nodes) != 1:
8828 raise errors.OpPrereqError("The instance has a strange layout,"
8829 " expected one secondary but found %d" %
8830 len(instance.secondary_nodes),
8833 if not self.delay_iallocator:
8834 self._CheckPrereq2()
8836 def _CheckPrereq2(self):
8837 """Check prerequisites, second part.
8839 This function should always be part of CheckPrereq. It was separated and is
8840 now called from Exec because during node evacuation the iallocator was only
8841 called with an unmodified cluster model, not taking planned changes into account.
8845 instance = self.instance
8846 secondary_node = instance.secondary_nodes[0]
8848 if self.iallocator_name is None:
8849 remote_node = self.remote_node
8851 remote_node = self._RunAllocator(self.lu, self.iallocator_name,
8852 instance.name, instance.secondary_nodes)
8854 if remote_node is None:
8855 self.remote_node_info = None
8857 assert remote_node in self.lu.glm.list_owned(locking.LEVEL_NODE), \
8858 "Remote node '%s' is not locked" % remote_node
8860 self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
8861 assert self.remote_node_info is not None, \
8862 "Cannot retrieve locked node %s" % remote_node
8864 if remote_node == self.instance.primary_node:
8865 raise errors.OpPrereqError("The specified node is the primary node of"
8866 " the instance", errors.ECODE_INVAL)
8868 if remote_node == secondary_node:
8869 raise errors.OpPrereqError("The specified node is already the"
8870 " secondary node of the instance",
8873 if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
8874 constants.REPLACE_DISK_CHG):
8875 raise errors.OpPrereqError("Cannot specify disks to be replaced",
8878 if self.mode == constants.REPLACE_DISK_AUTO:
8879 if not self._CheckDisksActivated(instance):
8880 raise errors.OpPrereqError("Please run activate-disks on instance %s"
8881 " first" % self.instance_name,
8883 faulty_primary = self._FindFaultyDisks(instance.primary_node)
8884 faulty_secondary = self._FindFaultyDisks(secondary_node)
8886 if faulty_primary and faulty_secondary:
8887 raise errors.OpPrereqError("Instance %s has faulty disks on more than"
8888 " one node and can not be repaired"
8889 " automatically" % self.instance_name,
8893 self.disks = faulty_primary
8894 self.target_node = instance.primary_node
8895 self.other_node = secondary_node
8896 check_nodes = [self.target_node, self.other_node]
8897 elif faulty_secondary:
8898 self.disks = faulty_secondary
8899 self.target_node = secondary_node
8900 self.other_node = instance.primary_node
8901 check_nodes = [self.target_node, self.other_node]
8907 # Non-automatic modes
8908 if self.mode == constants.REPLACE_DISK_PRI:
8909 self.target_node = instance.primary_node
8910 self.other_node = secondary_node
8911 check_nodes = [self.target_node, self.other_node]
8913 elif self.mode == constants.REPLACE_DISK_SEC:
8914 self.target_node = secondary_node
8915 self.other_node = instance.primary_node
8916 check_nodes = [self.target_node, self.other_node]
8918 elif self.mode == constants.REPLACE_DISK_CHG:
8919 self.new_node = remote_node
8920 self.other_node = instance.primary_node
8921 self.target_node = secondary_node
8922 check_nodes = [self.new_node, self.other_node]
8924 _CheckNodeNotDrained(self.lu, remote_node)
8925 _CheckNodeVmCapable(self.lu, remote_node)
8927 old_node_info = self.cfg.GetNodeInfo(secondary_node)
8928 assert old_node_info is not None
8929 if old_node_info.offline and not self.early_release:
8930 # doesn't make sense to delay the release
8931 self.early_release = True
8932 self.lu.LogInfo("Old secondary %s is offline, automatically enabling"
8933 " early-release mode", secondary_node)
8936 raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
8939 # If not specified, all disks should be replaced
8941 self.disks = range(len(self.instance.disks))
8943 for node in check_nodes:
8944 _CheckNodeOnline(self.lu, node)
8946 touched_nodes = frozenset(node_name for node_name in [self.new_node,
8949 if node_name is not None)
8951 # Release unneeded node locks
8952 _ReleaseLocks(self.lu, locking.LEVEL_NODE, keep=touched_nodes)
8954 # Release any owned node group
8955 if self.lu.glm.is_owned(locking.LEVEL_NODEGROUP):
8956 _ReleaseLocks(self.lu, locking.LEVEL_NODEGROUP)
8958 # Check whether disks are valid
8959 for disk_idx in self.disks:
8960 instance.FindDisk(disk_idx)
8962 # Get secondary node IP addresses
8963 self.node_secondary_ip = \
8964 dict((node_name, self.cfg.GetNodeInfo(node_name).secondary_ip)
8965 for node_name in touched_nodes)
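# Illustrative sketch of the mapping built above, not part of the original
# code (names and addresses are assumptions):
#
#   self.node_secondary_ip == {"node1.example.com": "192.0.2.1",
#                              "node3.example.com": "192.0.2.3"}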
8967 def Exec(self, feedback_fn):
8968 """Execute disk replacement.
8970 This dispatches the disk replacement to the appropriate handler.
8973 if self.delay_iallocator:
8974 self._CheckPrereq2()
8977 # Verify owned locks before starting operation
8978 owned_locks = self.lu.glm.list_owned(locking.LEVEL_NODE)
8979 assert set(owned_locks) == set(self.node_secondary_ip), \
8980 ("Incorrect node locks, owning %s, expected %s" %
8981 (owned_locks, self.node_secondary_ip.keys()))
8983 owned_locks = self.lu.glm.list_owned(locking.LEVEL_INSTANCE)
8984 assert list(owned_locks) == [self.instance_name], \
8985 "Instance '%s' not locked" % self.instance_name
8987 assert not self.lu.glm.is_owned(locking.LEVEL_NODEGROUP), \
8988 "Should not own any node group lock at this point"
8991 feedback_fn("No disks need replacement")
8994 feedback_fn("Replacing disk(s) %s for %s" %
8995 (utils.CommaJoin(self.disks), self.instance.name))
8997 activate_disks = (not self.instance.admin_up)
8999 # Activate the instance disks if we're replacing them on a down instance
9001 _StartInstanceDisks(self.lu, self.instance, True)
9004 # Should we replace the secondary node?
9005 if self.new_node is not None:
9006 fn = self._ExecDrbd8Secondary
9008 fn = self._ExecDrbd8DiskOnly
9010 result = fn(feedback_fn)
9012 # Deactivate the instance disks if we're replacing them on a down instance
9015 _SafeShutdownInstanceDisks(self.lu, self.instance)
9018 # Verify owned locks
9019 owned_locks = self.lu.glm.list_owned(locking.LEVEL_NODE)
9020 nodes = frozenset(self.node_secondary_ip)
9021 assert ((self.early_release and not owned_locks) or
9022 (not self.early_release and not (set(owned_locks) - nodes))), \
9023 ("Not owning the correct locks, early_release=%s, owned=%r,"
9024 " nodes=%r" % (self.early_release, owned_locks, nodes))
9028 def _CheckVolumeGroup(self, nodes):
9029 self.lu.LogInfo("Checking volume groups")
9031 vgname = self.cfg.GetVGName()
9033 # Make sure volume group exists on all involved nodes
9034 results = self.rpc.call_vg_list(nodes)
9036 raise errors.OpExecError("Can't list volume groups on the nodes")
9040 res.Raise("Error checking node %s" % node)
9041 if vgname not in res.payload:
9042 raise errors.OpExecError("Volume group '%s' not found on node %s" %
9045 def _CheckDisksExistence(self, nodes):
9046 # Check disk existence
9047 for idx, dev in enumerate(self.instance.disks):
9048 if idx not in self.disks:
9052 self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
9053 self.cfg.SetDiskID(dev, node)
9055 result = self.rpc.call_blockdev_find(node, dev)
9057 msg = result.fail_msg
9058 if msg or not result.payload:
9060 msg = "disk not found"
9061 raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
9064 def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
9065 for idx, dev in enumerate(self.instance.disks):
9066 if idx not in self.disks:
9069 self.lu.LogInfo("Checking disk/%d consistency on node %s" %
9072 if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
9074 raise errors.OpExecError("Node %s has degraded storage, unsafe to"
9075 " replace disks for instance %s" %
9076 (node_name, self.instance.name))
9078 def _CreateNewStorage(self, node_name):
9081 for idx, dev in enumerate(self.instance.disks):
9082 if idx not in self.disks:
9085 self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
9087 self.cfg.SetDiskID(dev, node_name)
9089 lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
9090 names = _GenerateUniqueNames(self.lu, lv_names)
9092 vg_data = dev.children[0].logical_id[0]
9093 lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
9094 logical_id=(vg_data, names[0]))
9095 vg_meta = dev.children[1].logical_id[0]
9096 lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
9097 logical_id=(vg_meta, names[1]))
9099 new_lvs = [lv_data, lv_meta]
9100 old_lvs = dev.children
9101 iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
9103 # we pass force_create=True to force the LVM creation
9104 for new_lv in new_lvs:
9105 _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
9106 _GetInstanceInfoText(self.instance), False)
9110 def _CheckDevices(self, node_name, iv_names):
9111 for name, (dev, _, _) in iv_names.iteritems():
9112 self.cfg.SetDiskID(dev, node_name)
9114 result = self.rpc.call_blockdev_find(node_name, dev)
9116 msg = result.fail_msg
9117 if msg or not result.payload:
9119 msg = "disk not found"
9120 raise errors.OpExecError("Can't find DRBD device %s: %s" %
9123 if result.payload.is_degraded:
9124 raise errors.OpExecError("DRBD device %s is degraded!" % name)
9126 def _RemoveOldStorage(self, node_name, iv_names):
9127 for name, (_, old_lvs, _) in iv_names.iteritems():
9128 self.lu.LogInfo("Remove logical volumes for %s" % name)
9131 self.cfg.SetDiskID(lv, node_name)
9133 msg = self.rpc.call_blockdev_remove(node_name, lv).fail_msg
9135 self.lu.LogWarning("Can't remove old LV: %s" % msg,
9136 hint="remove unused LVs manually")
9138 def _ExecDrbd8DiskOnly(self, feedback_fn):
9139 """Replace a disk on the primary or secondary for DRBD 8.
9141 The algorithm for replace is quite complicated:
9143 1. for each disk to be replaced:
9145 1. create new LVs on the target node with unique names
9146 1. detach old LVs from the drbd device
9147 1. rename old LVs to name_replaced.<time_t>
9148 1. rename new LVs to old LVs
9149 1. attach the new LVs (with the old names now) to the drbd device
9151 1. wait for sync across all devices
9153 1. for each modified disk:
9155 1. remove old LVs (which have the name name_replaced.<time_t>)
9157 Failures are not very well handled.
9162 # Step: check device activation
9163 self.lu.LogStep(1, steps_total, "Check device existence")
9164 self._CheckDisksExistence([self.other_node, self.target_node])
9165 self._CheckVolumeGroup([self.target_node, self.other_node])
9167 # Step: check other node consistency
9168 self.lu.LogStep(2, steps_total, "Check peer consistency")
9169 self._CheckDisksConsistency(self.other_node,
9170 self.other_node == self.instance.primary_node,
9173 # Step: create new storage
9174 self.lu.LogStep(3, steps_total, "Allocate new storage")
9175 iv_names = self._CreateNewStorage(self.target_node)
9177 # Step: for each lv, detach+rename*2+attach
9178 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9179 for dev, old_lvs, new_lvs in iv_names.itervalues():
9180 self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
9182 result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
9184 result.Raise("Can't detach drbd from local storage on node"
9185 " %s for device %s" % (self.target_node, dev.iv_name))
9187 #cfg.Update(instance)
9189 # ok, we created the new LVs, so now we know we have the needed
9190 # storage; as such, we proceed on the target node to rename
9191 # old_lv to _old, and new_lv to old_lv; note that we rename LVs
9192 # using the assumption that logical_id == physical_id (which in
9193 # turn is the unique_id on that node)
9195 # FIXME(iustin): use a better name for the replaced LVs
9196 temp_suffix = int(time.time())
9197 ren_fn = lambda d, suff: (d.physical_id[0],
9198 d.physical_id[1] + "_replaced-%s" % suff)
9200 # Build the rename list based on what LVs exist on the node
9201 rename_old_to_new = []
9202 for to_ren in old_lvs:
9203 result = self.rpc.call_blockdev_find(self.target_node, to_ren)
9204 if not result.fail_msg and result.payload:
9206 rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
9208 self.lu.LogInfo("Renaming the old LVs on the target node")
9209 result = self.rpc.call_blockdev_rename(self.target_node,
9211 result.Raise("Can't rename old LVs on node %s" % self.target_node)
9213 # Now we rename the new LVs to the old LVs
9214 self.lu.LogInfo("Renaming the new LVs on the target node")
9215 rename_new_to_old = [(new, old.physical_id)
9216 for old, new in zip(old_lvs, new_lvs)]
9217 result = self.rpc.call_blockdev_rename(self.target_node,
9219 result.Raise("Can't rename new LVs on node %s" % self.target_node)
9221 for old, new in zip(old_lvs, new_lvs):
9222 new.logical_id = old.logical_id
9223 self.cfg.SetDiskID(new, self.target_node)
9225 for disk in old_lvs:
9226 disk.logical_id = ren_fn(disk, temp_suffix)
9227 self.cfg.SetDiskID(disk, self.target_node)
9229 # Now that the new lvs have the old name, we can add them to the device
9230 self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
9231 result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
9233 msg = result.fail_msg
9235 for new_lv in new_lvs:
9236 msg2 = self.rpc.call_blockdev_remove(self.target_node,
9239 self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
9240 hint=("cleanup manually the unused logical"
9242 raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
9244 dev.children = new_lvs
9246 self.cfg.Update(self.instance, feedback_fn)
9249 if self.early_release:
9250 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9252 self._RemoveOldStorage(self.target_node, iv_names)
9253 # WARNING: we release both node locks here, do not do other RPCs
9254 # than WaitForSync to the primary node
9255 _ReleaseLocks(self.lu, locking.LEVEL_NODE,
9256 names=[self.target_node, self.other_node])
9259 # This can fail as the old devices are degraded and _WaitForSync
9260 # does a combined result over all disks, so we don't check its return value
9261 self.lu.LogStep(cstep, steps_total, "Sync devices")
9263 _WaitForSync(self.lu, self.instance)
9265 # Check all devices manually
9266 self._CheckDevices(self.instance.primary_node, iv_names)
9268 # Step: remove old storage
9269 if not self.early_release:
9270 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9272 self._RemoveOldStorage(self.target_node, iv_names)
9274 def _ExecDrbd8Secondary(self, feedback_fn):
9275 """Replace the secondary node for DRBD 8.
9277 The algorithm for replace is quite complicated:
9278 - for all disks of the instance:
9279 - create new LVs on the new node with same names
9280 - shutdown the drbd device on the old secondary
9281 - disconnect the drbd network on the primary
9282 - create the drbd device on the new secondary
9283 - network attach the drbd on the primary, using an artifice:
9284 the drbd code for Attach() will connect to the network if it
9285 finds a device which is connected to the good local disks but not network enabled
9287 - wait for sync across all devices
9288 - remove all disks from the old secondary
9290 Failures are not very well handled.
9295 # Step: check device activation
9296 self.lu.LogStep(1, steps_total, "Check device existence")
9297 self._CheckDisksExistence([self.instance.primary_node])
9298 self._CheckVolumeGroup([self.instance.primary_node])
9300 # Step: check other node consistency
9301 self.lu.LogStep(2, steps_total, "Check peer consistency")
9302 self._CheckDisksConsistency(self.instance.primary_node, True, True)
9304 # Step: create new storage
9305 self.lu.LogStep(3, steps_total, "Allocate new storage")
9306 for idx, dev in enumerate(self.instance.disks):
9307 self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
9308 (self.new_node, idx))
9309 # we pass force_create=True to force LVM creation
9310 for new_lv in dev.children:
9311 _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
9312 _GetInstanceInfoText(self.instance), False)
9314 # Step 4: drbd minors and drbd setup changes
9315 # after this, we must manually remove the drbd minors on both the
9316 # error and the success paths
9317 self.lu.LogStep(4, steps_total, "Changing drbd configuration")
9318 minors = self.cfg.AllocateDRBDMinor([self.new_node
9319 for dev in self.instance.disks],
9321 logging.debug("Allocated minors %r", minors)
9324 for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
9325 self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
9326 (self.new_node, idx))
9327 # create new devices on new_node; note that we create two IDs:
9328 # one without port, so the drbd will be activated without
9329 # networking information on the new node at this stage, and one
9330 # with network, for the latter activation in step 4
      (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
      if self.instance.primary_node == o_node1:
        p_minor = o_minor1
      else:
        assert self.instance.primary_node == o_node2, "Three-node instance?"
        p_minor = o_minor2
9338 new_alone_id = (self.instance.primary_node, self.new_node, None,
9339 p_minor, new_minor, o_secret)
9340 new_net_id = (self.instance.primary_node, self.new_node, o_port,
9341 p_minor, new_minor, o_secret)
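      # For reference, a DRBD8 logical_id is the 6-tuple
      #   (node_A, node_B, port, minor_A, minor_B, shared_secret);
      # new_alone_id deliberately carries None instead of the port so the
      # device can be brought up without networking first, while new_net_id
      # is the fully networked variant used later.  Illustrative example only
      # (host names and numbers made up):
      #   ("node1.example.com", "node2.example.com", 11000, 0, 1, "secret")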
9343 iv_names[idx] = (dev, dev.children, new_net_id)
      logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
                    new_net_id)
9346 new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
9347 logical_id=new_alone_id,
                              children=dev.children,
                              size=dev.size)
      try:
        _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
                              _GetInstanceInfoText(self.instance), False)
      except errors.GenericError:
        self.cfg.ReleaseDRBDMinors(self.instance.name)
        raise
9357 # We have new devices, shutdown the drbd on the old secondary
9358 for idx, dev in enumerate(self.instance.disks):
9359 self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
9360 self.cfg.SetDiskID(dev, self.target_node)
9361 msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
      if msg:
        self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
                           " node: %s" % (idx, msg),
9365 hint=("Please cleanup this device manually as"
9366 " soon as possible"))
9368 self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
9369 result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
9370 self.node_secondary_ip,
9371 self.instance.disks)\
9372 [self.instance.primary_node]
    msg = result.fail_msg
    if msg:
      # detaches didn't succeed (unlikely)
      self.cfg.ReleaseDRBDMinors(self.instance.name)
      raise errors.OpExecError("Can't detach the disks from the network on"
                               " old node: %s" % (msg,))
9381 # if we managed to detach at least one, we update all the disks of
9382 # the instance to point to the new secondary
9383 self.lu.LogInfo("Updating instance configuration")
9384 for dev, _, new_logical_id in iv_names.itervalues():
9385 dev.logical_id = new_logical_id
9386 self.cfg.SetDiskID(dev, self.instance.primary_node)
9388 self.cfg.Update(self.instance, feedback_fn)
9390 # and now perform the drbd attach
9391 self.lu.LogInfo("Attaching primary drbds to new secondary"
9392 " (standalone => connected)")
    result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
                                            self.new_node],
                                           self.node_secondary_ip,
                                           self.instance.disks,
                                           self.instance.name,
                                           False)
    for to_node, to_result in result.items():
      msg = to_result.fail_msg
      if msg:
        self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
                           to_node, msg,
                           hint=("please do a gnt-instance info to see the"
                                 " status of disks"))
    cstep = 5
    if self.early_release:
      self.lu.LogStep(cstep, steps_total, "Removing old storage")
      cstep += 1
9410 self._RemoveOldStorage(self.target_node, iv_names)
9411 # WARNING: we release all node locks here, do not do other RPCs
9412 # than WaitForSync to the primary node
      _ReleaseLocks(self.lu, locking.LEVEL_NODE,
                    names=[self.instance.primary_node,
                           self.target_node,
                           self.new_node])
9419 # This can fail as the old devices are degraded and _WaitForSync
9420 # does a combined result over all disks, so we don't check its return value
    self.lu.LogStep(cstep, steps_total, "Sync devices")
    cstep += 1
9423 _WaitForSync(self.lu, self.instance)
9425 # Check all devices manually
9426 self._CheckDevices(self.instance.primary_node, iv_names)
9428 # Step: remove old storage
9429 if not self.early_release:
9430 self.lu.LogStep(cstep, steps_total, "Removing old storage")
9431 self._RemoveOldStorage(self.target_node, iv_names)
9434 class LURepairNodeStorage(NoHooksLU):
9435 """Repairs the volume group on a node.
9440 def CheckArguments(self):
9441 self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
9443 storage_type = self.op.storage_type
9445 if (constants.SO_FIX_CONSISTENCY not in
9446 constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
      raise errors.OpPrereqError("Storage units of type '%s' can not be"
                                 " repaired" % storage_type,
                                 errors.ECODE_INVAL)
9451 def ExpandNames(self):
9452 self.needed_locks = {
      locking.LEVEL_NODE: [self.op.node_name],
      }
9456 def _CheckFaultyDisks(self, instance, node_name):
9457 """Ensure faulty disks abort the opcode or at least warn."""
    try:
      if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
                                  node_name, True):
        raise errors.OpPrereqError("Instance '%s' has faulty disks on"
                                   " node '%s'" % (instance.name, node_name),
                                   errors.ECODE_STATE)
    except errors.OpPrereqError, err:
      if self.op.ignore_consistency:
        self.proc.LogWarning(str(err.args[0]))
      else:
        raise
9470 def CheckPrereq(self):
9471 """Check prerequisites.
9474 # Check whether any instance on this node has faulty disks
9475 for inst in _GetNodeInstances(self.cfg, self.op.node_name):
      if not inst.admin_up:
        continue
9478 check_nodes = set(inst.all_nodes)
9479 check_nodes.discard(self.op.node_name)
9480 for inst_node_name in check_nodes:
9481 self._CheckFaultyDisks(inst, inst_node_name)
9483 def Exec(self, feedback_fn):
9484 feedback_fn("Repairing storage unit '%s' on %s ..." %
9485 (self.op.name, self.op.node_name))
9487 st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
9488 result = self.rpc.call_storage_execute(self.op.node_name,
                                           self.op.storage_type, st_args,
                                           self.op.name,
                                           constants.SO_FIX_CONSISTENCY)
9492 result.Raise("Failed to repair storage unit '%s' on %s" %
9493 (self.op.name, self.op.node_name))
9496 class LUNodeEvacStrategy(NoHooksLU):
9497 """Computes the node evacuation strategy.
9502 def CheckArguments(self):
9503 _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
9505 def ExpandNames(self):
9506 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
9507 self.needed_locks = locks = {}
9508 if self.op.remote_node is None:
9509 locks[locking.LEVEL_NODE] = locking.ALL_SET
9511 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9512 locks[locking.LEVEL_NODE] = self.op.nodes + [self.op.remote_node]
  def Exec(self, feedback_fn):
    if self.op.remote_node is not None:
      instances = []
      for node in self.op.nodes:
        instances.extend(_GetNodeSecondaryInstances(self.cfg, node))

      result = []
      for i in instances:
        if i.primary_node == self.op.remote_node:
          raise errors.OpPrereqError("Node %s is the primary node of"
                                     " instance %s, cannot use it as"
                                     " secondary" %
                                     (self.op.remote_node, i.name),
                                     errors.ECODE_INVAL)
        result.append([i.name, self.op.remote_node])
    else:
      ial = IAllocator(self.cfg, self.rpc,
                       mode=constants.IALLOCATOR_MODE_MEVAC,
                       evac_nodes=self.op.nodes)
      ial.Run(self.op.iallocator, validate=True)
      if not ial.success:
        raise errors.OpExecError("No valid evacuation solution: %s" % ial.info,
                                 errors.ECODE_NORES)
      result = ial.result
    return result
9540 class LUInstanceGrowDisk(LogicalUnit):
9541 """Grow a disk of an instance.
9545 HTYPE = constants.HTYPE_INSTANCE
9548 def ExpandNames(self):
9549 self._ExpandAndLockInstance()
9550 self.needed_locks[locking.LEVEL_NODE] = []
9551 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9553 def DeclareLocks(self, level):
9554 if level == locking.LEVEL_NODE:
9555 self._LockInstancesNodes()
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, the primary and all the secondaries.

    """
    env = {
      "DISK": self.op.disk,
      "AMOUNT": self.op.amount,
      }
    env.update(_BuildInstanceHookEnvByObject(self, self.instance))
    return env
9570 def BuildHooksNodes(self):
9571 """Build hooks nodes.
9574 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
9577 def CheckPrereq(self):
9578 """Check prerequisites.
9580 This checks that the instance is in the cluster.
9583 instance = self.cfg.GetInstanceInfo(self.op.instance_name)
9584 assert instance is not None, \
9585 "Cannot retrieve locked instance %s" % self.op.instance_name
9586 nodenames = list(instance.all_nodes)
9587 for node in nodenames:
9588 _CheckNodeOnline(self, node)
9590 self.instance = instance
9592 if instance.disk_template not in constants.DTS_GROWABLE:
9593 raise errors.OpPrereqError("Instance's disk layout does not support"
9594 " growing", errors.ECODE_INVAL)
9596 self.disk = instance.FindDisk(self.op.disk)
9598 if instance.disk_template not in (constants.DT_FILE,
9599 constants.DT_SHARED_FILE):
      # TODO: check the free disk space for file, when that feature will be
      # supported
9602 _CheckNodesFreeDiskPerVG(self, nodenames,
9603 self.disk.ComputeGrowth(self.op.amount))
9605 def Exec(self, feedback_fn):
9606 """Execute disk grow.
9609 instance = self.instance
9612 disks_ok, _ = _AssembleInstanceDisks(self, self.instance, disks=[disk])
9614 raise errors.OpExecError("Cannot activate block device to grow")
9616 # First run all grow ops in dry-run mode
9617 for node in instance.all_nodes:
9618 self.cfg.SetDiskID(disk, node)
9619 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, True)
9620 result.Raise("Grow request failed to node %s" % node)
9622 # We know that (as far as we can test) operations across different
9623 # nodes will succeed, time to run it for real
9624 for node in instance.all_nodes:
9625 self.cfg.SetDiskID(disk, node)
9626 result = self.rpc.call_blockdev_grow(node, disk, self.op.amount, False)
9627 result.Raise("Grow request failed to node %s" % node)
9629 # TODO: Rewrite code to work properly
9630 # DRBD goes into sync mode for a short amount of time after executing the
9631 # "resize" command. DRBD 8.x below version 8.0.13 contains a bug whereby
9632 # calling "resize" in sync mode fails. Sleeping for a short amount of
9633 # time is a work-around.
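    # The work-around itself is simply a short pause before recording the
    # grow below; the five-second value here is an assumed duration, any
    # small delay that lets DRBD leave its post-resize sync mode would do.
    time.sleep(5)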
9636 disk.RecordGrow(self.op.amount)
9637 self.cfg.Update(instance, feedback_fn)
9638 if self.op.wait_for_sync:
      disk_abort = not _WaitForSync(self, instance, disks=[disk])
      if disk_abort:
9641 self.proc.LogWarning("Disk sync-ing has not returned a good"
9642 " status; please check the instance")
9643 if not instance.admin_up:
9644 _SafeShutdownInstanceDisks(self, instance, disks=[disk])
9645 elif not instance.admin_up:
9646 self.proc.LogWarning("Not shutting down the disk even if the instance is"
9647 " not supposed to be running because no wait for"
9648 " sync mode was requested")
9651 class LUInstanceQueryData(NoHooksLU):
9652 """Query runtime instance data.
9657 def ExpandNames(self):
9658 self.needed_locks = {}
9660 # Use locking if requested or when non-static information is wanted
9661 if not (self.op.static or self.op.use_locking):
9662 self.LogWarning("Non-static data requested, locks need to be acquired")
9663 self.op.use_locking = True
9665 if self.op.instances or not self.op.use_locking:
9666 # Expand instance names right here
9667 self.wanted_names = _GetWantedInstances(self, self.op.instances)
    else:
      # Will use acquired locks
9670 self.wanted_names = None
9672 if self.op.use_locking:
9673 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
9675 if self.wanted_names is None:
9676 self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
    else:
      self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
9680 self.needed_locks[locking.LEVEL_NODE] = []
9681 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
9682 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9684 def DeclareLocks(self, level):
9685 if self.op.use_locking and level == locking.LEVEL_NODE:
9686 self._LockInstancesNodes()
9688 def CheckPrereq(self):
9689 """Check prerequisites.
9691 This only checks the optional instance list against the existing names.
9694 if self.wanted_names is None:
9695 assert self.op.use_locking, "Locking was not used"
9696 self.wanted_names = self.glm.list_owned(locking.LEVEL_INSTANCE)
9698 self.wanted_instances = [self.cfg.GetInstanceInfo(name)
9699 for name in self.wanted_names]
9701 def _ComputeBlockdevStatus(self, node, instance_name, dev):
9702 """Returns the status of a block device
9705 if self.op.static or not node:
9708 self.cfg.SetDiskID(dev, node)
9710 result = self.rpc.call_blockdev_find(node, dev)
9714 result.Raise("Can't compute disk status for %s" % instance_name)
9716 status = result.payload
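    # The tuple below mirrors the remote block device status fields; an
    # illustrative (made-up) value would be
    #   ("/dev/drbd0", 147, 0, 95.3, 10, False, <ldisk status constant>)
    # i.e. device path, major, minor, sync percentage, estimated seconds
    # left, degraded flag and local-disk status.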
9720 return (status.dev_path, status.major, status.minor,
9721 status.sync_percent, status.estimated_time,
9722 status.is_degraded, status.ldisk_status)
9724 def _ComputeDiskStatus(self, instance, snode, dev):
9725 """Compute block device status.
9728 if dev.dev_type in constants.LDS_DRBD:
9729 # we change the snode then (otherwise we use the one passed in)
9730 if dev.logical_id[0] == instance.primary_node:
9731 snode = dev.logical_id[1]
      else:
        snode = dev.logical_id[0]
    dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
                                              instance.name, dev)
    dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)

    if dev.children:
      dev_children = [self._ComputeDiskStatus(instance, snode, child)
                      for child in dev.children]
    else:
      dev_children = []

    return {
9746 "iv_name": dev.iv_name,
9747 "dev_type": dev.dev_type,
9748 "logical_id": dev.logical_id,
9749 "physical_id": dev.physical_id,
9750 "pstatus": dev_pstatus,
9751 "sstatus": dev_sstatus,
9752 "children": dev_children,
9757 def Exec(self, feedback_fn):
9758 """Gather and return data"""
9761 cluster = self.cfg.GetClusterInfo()
9763 for instance in self.wanted_instances:
9764 if not self.op.static:
        remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                  instance.name,
                                                  instance.hypervisor)
9768 remote_info.Raise("Error checking node %s" % instance.primary_node)
9769 remote_info = remote_info.payload
        if remote_info and "state" in remote_info:
          remote_state = "up"
        else:
          remote_state = "down"
      else:
        remote_state = None
      if instance.admin_up:
        config_state = "up"
      else:
        config_state = "down"
9781 disks = [self._ComputeDiskStatus(instance, None, device)
9782 for device in instance.disks]
9784 result[instance.name] = {
9785 "name": instance.name,
9786 "config_state": config_state,
9787 "run_state": remote_state,
9788 "pnode": instance.primary_node,
9789 "snodes": instance.secondary_nodes,
9791 # this happens to be the same format used for hooks
9792 "nics": _NICListToTuple(self, instance.nics),
9793 "disk_template": instance.disk_template,
9795 "hypervisor": instance.hypervisor,
9796 "network_port": instance.network_port,
9797 "hv_instance": instance.hvparams,
9798 "hv_actual": cluster.FillHV(instance, skip_globals=True),
9799 "be_instance": instance.beparams,
9800 "be_actual": cluster.FillBE(instance),
9801 "os_instance": instance.osparams,
9802 "os_actual": cluster.SimpleFillOS(instance.os, instance.osparams),
9803 "serial_no": instance.serial_no,
9804 "mtime": instance.mtime,
9805 "ctime": instance.ctime,
9806 "uuid": instance.uuid,
9812 class LUInstanceSetParams(LogicalUnit):
9813 """Modifies an instances's parameters.
9816 HPATH = "instance-modify"
9817 HTYPE = constants.HTYPE_INSTANCE
9820 def CheckArguments(self):
9821 if not (self.op.nics or self.op.disks or self.op.disk_template or
9822 self.op.hvparams or self.op.beparams or self.op.os_name):
9823 raise errors.OpPrereqError("No changes submitted", errors.ECODE_INVAL)
9825 if self.op.hvparams:
9826 _CheckGlobalHvParams(self.op.hvparams)
    disk_addremove = 0
    for disk_op, disk_dict in self.op.disks:
      utils.ForceDictType(disk_dict, constants.IDISK_PARAMS_TYPES)
      if disk_op == constants.DDM_REMOVE:
        disk_addremove += 1
        continue
      elif disk_op == constants.DDM_ADD:
        disk_addremove += 1
      else:
        if not isinstance(disk_op, int):
          raise errors.OpPrereqError("Invalid disk index", errors.ECODE_INVAL)
9840 if not isinstance(disk_dict, dict):
9841 msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
9842 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
9844 if disk_op == constants.DDM_ADD:
9845 mode = disk_dict.setdefault(constants.IDISK_MODE, constants.DISK_RDWR)
9846 if mode not in constants.DISK_ACCESS_SET:
9847 raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode,
        size = disk_dict.get(constants.IDISK_SIZE, None)
        if size is None:
          raise errors.OpPrereqError("Required disk parameter size missing",
                                     errors.ECODE_INVAL)
        try:
          size = int(size)
        except (TypeError, ValueError), err:
          raise errors.OpPrereqError("Invalid disk size parameter: %s" %
                                     str(err), errors.ECODE_INVAL)
9858 disk_dict[constants.IDISK_SIZE] = size
9860 # modification of disk
9861 if constants.IDISK_SIZE in disk_dict:
9862 raise errors.OpPrereqError("Disk size change not possible, use"
9863 " grow-disk", errors.ECODE_INVAL)
9865 if disk_addremove > 1:
9866 raise errors.OpPrereqError("Only one disk add or remove operation"
9867 " supported at a time", errors.ECODE_INVAL)
9869 if self.op.disks and self.op.disk_template is not None:
9870 raise errors.OpPrereqError("Disk template conversion and other disk"
9871 " changes not supported at the same time",
9874 if (self.op.disk_template and
9875 self.op.disk_template in constants.DTS_INT_MIRROR and
9876 self.op.remote_node is None):
9877 raise errors.OpPrereqError("Changing the disk template to a mirrored"
9878 " one requires specifying a secondary node",
    nic_addremove = 0
    for nic_op, nic_dict in self.op.nics:
      utils.ForceDictType(nic_dict, constants.INIC_PARAMS_TYPES)
      if nic_op == constants.DDM_REMOVE:
        nic_addremove += 1
        continue
      elif nic_op == constants.DDM_ADD:
        nic_addremove += 1
      else:
        if not isinstance(nic_op, int):
9892 raise errors.OpPrereqError("Invalid nic index", errors.ECODE_INVAL)
9893 if not isinstance(nic_dict, dict):
9894 msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
9895 raise errors.OpPrereqError(msg, errors.ECODE_INVAL)
9897 # nic_dict should be a dict
9898 nic_ip = nic_dict.get(constants.INIC_IP, None)
9899 if nic_ip is not None:
9900 if nic_ip.lower() == constants.VALUE_NONE:
9901 nic_dict[constants.INIC_IP] = None
9903 if not netutils.IPAddress.IsValid(nic_ip):
9904 raise errors.OpPrereqError("Invalid IP address '%s'" % nic_ip,
9907 nic_bridge = nic_dict.get('bridge', None)
9908 nic_link = nic_dict.get(constants.INIC_LINK, None)
9909 if nic_bridge and nic_link:
9910 raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
9911 " at the same time", errors.ECODE_INVAL)
9912 elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
9913 nic_dict['bridge'] = None
9914 elif nic_link and nic_link.lower() == constants.VALUE_NONE:
9915 nic_dict[constants.INIC_LINK] = None
9917 if nic_op == constants.DDM_ADD:
9918 nic_mac = nic_dict.get(constants.INIC_MAC, None)
9920 nic_dict[constants.INIC_MAC] = constants.VALUE_AUTO
9922 if constants.INIC_MAC in nic_dict:
9923 nic_mac = nic_dict[constants.INIC_MAC]
9924 if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
9925 nic_mac = utils.NormalizeAndValidateMac(nic_mac)
9927 if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
9928 raise errors.OpPrereqError("'auto' is not a valid MAC address when"
9929 " modifying an existing nic",
9932 if nic_addremove > 1:
9933 raise errors.OpPrereqError("Only one NIC add or remove operation"
9934 " supported at a time", errors.ECODE_INVAL)
9936 def ExpandNames(self):
9937 self._ExpandAndLockInstance()
9938 self.needed_locks[locking.LEVEL_NODE] = []
9939 self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
9941 def DeclareLocks(self, level):
9942 if level == locking.LEVEL_NODE:
9943 self._LockInstancesNodes()
9944 if self.op.disk_template and self.op.remote_node:
9945 self.op.remote_node = _ExpandNodeName(self.cfg, self.op.remote_node)
9946 self.needed_locks[locking.LEVEL_NODE].append(self.op.remote_node)
  def BuildHooksEnv(self):
    """Build hooks env.

    This runs on the master, primary and secondaries.

    """
    args = dict()
    if constants.BE_MEMORY in self.be_new:
      args['memory'] = self.be_new[constants.BE_MEMORY]
    if constants.BE_VCPUS in self.be_new:
      args['vcpus'] = self.be_new[constants.BE_VCPUS]
9959 # TODO: export disk changes. Note: _BuildInstanceHookEnv* don't export disk
9960 # information at all.
    args['nics'] = []
    nic_override = dict(self.op.nics)
9964 for idx, nic in enumerate(self.instance.nics):
9965 if idx in nic_override:
9966 this_nic_override = nic_override[idx]
9968 this_nic_override = {}
        if constants.INIC_IP in this_nic_override:
          ip = this_nic_override[constants.INIC_IP]
        else:
          ip = nic.ip
        if constants.INIC_MAC in this_nic_override:
          mac = this_nic_override[constants.INIC_MAC]
        else:
          mac = nic.mac
9977 if idx in self.nic_pnew:
9978 nicparams = self.nic_pnew[idx]
9980 nicparams = self.cluster.SimpleFillNIC(nic.nicparams)
9981 mode = nicparams[constants.NIC_MODE]
9982 link = nicparams[constants.NIC_LINK]
9983 args['nics'].append((ip, mac, mode, link))
9984 if constants.DDM_ADD in nic_override:
9985 ip = nic_override[constants.DDM_ADD].get(constants.INIC_IP, None)
9986 mac = nic_override[constants.DDM_ADD][constants.INIC_MAC]
9987 nicparams = self.nic_pnew[constants.DDM_ADD]
9988 mode = nicparams[constants.NIC_MODE]
9989 link = nicparams[constants.NIC_LINK]
9990 args['nics'].append((ip, mac, mode, link))
9991 elif constants.DDM_REMOVE in nic_override:
9992 del args['nics'][-1]
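    # Each entry in args['nics'] is an (ip, mac, mode, link) tuple, e.g.
    # ("192.0.2.5", "aa:00:00:11:22:33", "bridged", "xen-br0") - purely
    # illustrative values; they override the per-NIC data that
    # _BuildInstanceHookEnvByObject exports to the hooks environment.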
9994 env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
9995 if self.op.disk_template:
9996 env["NEW_DISK_TEMPLATE"] = self.op.disk_template
10000 def BuildHooksNodes(self):
10001 """Build hooks nodes.
10004 nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
10007 def CheckPrereq(self):
10008 """Check prerequisites.
10010 This only checks the instance list against the existing names.
10013 # checking the new params on the primary/secondary nodes
10015 instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
10016 cluster = self.cluster = self.cfg.GetClusterInfo()
10017 assert self.instance is not None, \
10018 "Cannot retrieve locked instance %s" % self.op.instance_name
10019 pnode = instance.primary_node
10020 nodelist = list(instance.all_nodes)
10023 if self.op.os_name and not self.op.force:
10024 _CheckNodeHasOS(self, instance.primary_node, self.op.os_name,
10025 self.op.force_variant)
10026 instance_os = self.op.os_name
10028 instance_os = instance.os
10030 if self.op.disk_template:
10031 if instance.disk_template == self.op.disk_template:
10032 raise errors.OpPrereqError("Instance already has disk template %s" %
10033 instance.disk_template, errors.ECODE_INVAL)
10035 if (instance.disk_template,
10036 self.op.disk_template) not in self._DISK_CONVERSIONS:
10037 raise errors.OpPrereqError("Unsupported disk template conversion from"
10038 " %s to %s" % (instance.disk_template,
10039 self.op.disk_template),
10040 errors.ECODE_INVAL)
10041 _CheckInstanceDown(self, instance, "cannot change disk template")
10042 if self.op.disk_template in constants.DTS_INT_MIRROR:
10043 if self.op.remote_node == pnode:
10044 raise errors.OpPrereqError("Given new secondary node %s is the same"
10045 " as the primary node of the instance" %
10046 self.op.remote_node, errors.ECODE_STATE)
10047 _CheckNodeOnline(self, self.op.remote_node)
10048 _CheckNodeNotDrained(self, self.op.remote_node)
10049 # FIXME: here we assume that the old instance type is DT_PLAIN
10050 assert instance.disk_template == constants.DT_PLAIN
10051 disks = [{constants.IDISK_SIZE: d.size,
10052 constants.IDISK_VG: d.logical_id[0]}
10053 for d in instance.disks]
10054 required = _ComputeDiskSizePerVG(self.op.disk_template, disks)
10055 _CheckNodesFreeDiskPerVG(self, [self.op.remote_node], required)
10057 # hvparams processing
10058 if self.op.hvparams:
10059 hv_type = instance.hypervisor
10060 i_hvdict = _GetUpdatedParams(instance.hvparams, self.op.hvparams)
10061 utils.ForceDictType(i_hvdict, constants.HVS_PARAMETER_TYPES)
10062 hv_new = cluster.SimpleFillHV(hv_type, instance.os, i_hvdict)
10065 hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
10066 _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
10067 self.hv_new = hv_new # the new actual values
10068 self.hv_inst = i_hvdict # the new dict (without defaults)
10070 self.hv_new = self.hv_inst = {}
10072 # beparams processing
10073 if self.op.beparams:
10074 i_bedict = _GetUpdatedParams(instance.beparams, self.op.beparams,
10076 utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
10077 be_new = cluster.SimpleFillBE(i_bedict)
10078 self.be_new = be_new # the new actual values
10079 self.be_inst = i_bedict # the new dict (without defaults)
10081 self.be_new = self.be_inst = {}
10083 # osparams processing
10084 if self.op.osparams:
10085 i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
10086 _CheckOSParams(self, True, nodelist, instance_os, i_osdict)
10087 self.os_inst = i_osdict # the new dict (without defaults)
10093 if constants.BE_MEMORY in self.op.beparams and not self.op.force:
10094 mem_check_list = [pnode]
10095 if be_new[constants.BE_AUTO_BALANCE]:
10096 # either we changed auto_balance to yes or it was from before
10097 mem_check_list.extend(instance.secondary_nodes)
10098 instance_info = self.rpc.call_instance_info(pnode, instance.name,
10099 instance.hypervisor)
10100 nodeinfo = self.rpc.call_node_info(mem_check_list, None,
10101 instance.hypervisor)
10102 pninfo = nodeinfo[pnode]
10103 msg = pninfo.fail_msg
10105 # Assume the primary node is unreachable and go ahead
10106 self.warn.append("Can't get info from primary node %s: %s" %
10108 elif not isinstance(pninfo.payload.get('memory_free', None), int):
10109 self.warn.append("Node data from primary node %s doesn't contain"
10110 " free memory information" % pnode)
10111 elif instance_info.fail_msg:
10112 self.warn.append("Can't get instance runtime information: %s" %
10113 instance_info.fail_msg)
10115 if instance_info.payload:
10116 current_mem = int(instance_info.payload['memory'])
10118 # Assume instance not running
10119 # (there is a slight race condition here, but it's not very probable,
10120 # and we have no other way to check)
10122 miss_mem = (be_new[constants.BE_MEMORY] - current_mem -
10123 pninfo.payload['memory_free'])
10125 raise errors.OpPrereqError("This change will prevent the instance"
10126 " from starting, due to %d MB of memory"
10127 " missing on its primary node" % miss_mem,
10128 errors.ECODE_NORES)
10130 if be_new[constants.BE_AUTO_BALANCE]:
10131 for node, nres in nodeinfo.items():
10132 if node not in instance.secondary_nodes:
10134 msg = nres.fail_msg
10136 self.warn.append("Can't get info from secondary node %s: %s" %
10138 elif not isinstance(nres.payload.get('memory_free', None), int):
10139 self.warn.append("Secondary node %s didn't return free"
10140 " memory information" % node)
10141 elif be_new[constants.BE_MEMORY] > nres.payload['memory_free']:
10142 self.warn.append("Not enough memory to failover instance to"
10143 " secondary node %s" % node)
10147 self.nic_pinst = {}
10148 for nic_op, nic_dict in self.op.nics:
10149 if nic_op == constants.DDM_REMOVE:
10150 if not instance.nics:
10151 raise errors.OpPrereqError("Instance has no NICs, cannot remove",
10152 errors.ECODE_INVAL)
10154 if nic_op != constants.DDM_ADD:
10156 if not instance.nics:
10157 raise errors.OpPrereqError("Invalid NIC index %s, instance has"
10158 " no NICs" % nic_op,
10159 errors.ECODE_INVAL)
10160 if nic_op < 0 or nic_op >= len(instance.nics):
10161 raise errors.OpPrereqError("Invalid NIC index %s, valid values"
10163 (nic_op, len(instance.nics) - 1),
10164 errors.ECODE_INVAL)
10165 old_nic_params = instance.nics[nic_op].nicparams
10166 old_nic_ip = instance.nics[nic_op].ip
10168 old_nic_params = {}
10171 update_params_dict = dict([(key, nic_dict[key])
10172 for key in constants.NICS_PARAMETERS
10173 if key in nic_dict])
10175 if 'bridge' in nic_dict:
10176 update_params_dict[constants.NIC_LINK] = nic_dict['bridge']
10178 new_nic_params = _GetUpdatedParams(old_nic_params,
10179 update_params_dict)
10180 utils.ForceDictType(new_nic_params, constants.NICS_PARAMETER_TYPES)
10181 new_filled_nic_params = cluster.SimpleFillNIC(new_nic_params)
10182 objects.NIC.CheckParameterSyntax(new_filled_nic_params)
10183 self.nic_pinst[nic_op] = new_nic_params
10184 self.nic_pnew[nic_op] = new_filled_nic_params
10185 new_nic_mode = new_filled_nic_params[constants.NIC_MODE]
10187 if new_nic_mode == constants.NIC_MODE_BRIDGED:
10188 nic_bridge = new_filled_nic_params[constants.NIC_LINK]
10189 msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
10191 msg = "Error checking bridges on node %s: %s" % (pnode, msg)
10193 self.warn.append(msg)
10195 raise errors.OpPrereqError(msg, errors.ECODE_ENVIRON)
10196 if new_nic_mode == constants.NIC_MODE_ROUTED:
10197 if constants.INIC_IP in nic_dict:
10198 nic_ip = nic_dict[constants.INIC_IP]
10200 nic_ip = old_nic_ip
10202 raise errors.OpPrereqError('Cannot set the nic ip to None'
10203 ' on a routed nic', errors.ECODE_INVAL)
10204 if constants.INIC_MAC in nic_dict:
10205 nic_mac = nic_dict[constants.INIC_MAC]
10206 if nic_mac is None:
10207 raise errors.OpPrereqError('Cannot set the nic mac to None',
10208 errors.ECODE_INVAL)
10209 elif nic_mac in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
10210 # otherwise generate the mac
10211 nic_dict[constants.INIC_MAC] = \
10212 self.cfg.GenerateMAC(self.proc.GetECId())
10214 # or validate/reserve the current one
10216 self.cfg.ReserveMAC(nic_mac, self.proc.GetECId())
10217 except errors.ReservationError:
10218 raise errors.OpPrereqError("MAC address %s already in use"
10219 " in cluster" % nic_mac,
10220 errors.ECODE_NOTUNIQUE)
10223 if self.op.disks and instance.disk_template == constants.DT_DISKLESS:
10224 raise errors.OpPrereqError("Disk operations not supported for"
10225 " diskless instances",
10226 errors.ECODE_INVAL)
10227 for disk_op, _ in self.op.disks:
10228 if disk_op == constants.DDM_REMOVE:
10229 if len(instance.disks) == 1:
10230 raise errors.OpPrereqError("Cannot remove the last disk of"
10231 " an instance", errors.ECODE_INVAL)
10232 _CheckInstanceDown(self, instance, "cannot remove disks")
10234 if (disk_op == constants.DDM_ADD and
10235 len(instance.disks) >= constants.MAX_DISKS):
10236 raise errors.OpPrereqError("Instance has too many disks (%d), cannot"
10237 " add more" % constants.MAX_DISKS,
10238 errors.ECODE_STATE)
10239 if disk_op not in (constants.DDM_ADD, constants.DDM_REMOVE):
10241 if disk_op < 0 or disk_op >= len(instance.disks):
10242 raise errors.OpPrereqError("Invalid disk index %s, valid values"
10244 (disk_op, len(instance.disks)),
10245 errors.ECODE_INVAL)
10249 def _ConvertPlainToDrbd(self, feedback_fn):
10250 """Converts an instance from plain to drbd.
10253 feedback_fn("Converting template to drbd")
10254 instance = self.instance
10255 pnode = instance.primary_node
10256 snode = self.op.remote_node
10258 # create a fake disk info for _GenerateDiskTemplate
10259 disk_info = [{constants.IDISK_SIZE: d.size, constants.IDISK_MODE: d.mode,
10260 constants.IDISK_VG: d.logical_id[0]}
10261 for d in instance.disks]
10262 new_disks = _GenerateDiskTemplate(self, self.op.disk_template,
10263 instance.name, pnode, [snode],
10264 disk_info, None, None, 0, feedback_fn)
10265 info = _GetInstanceInfoText(instance)
10266 feedback_fn("Creating aditional volumes...")
10267 # first, create the missing data and meta devices
10268 for disk in new_disks:
10269 # unfortunately this is... not too nice
10270 _CreateSingleBlockDev(self, pnode, instance, disk.children[1],
10272 for child in disk.children:
10273 _CreateSingleBlockDev(self, snode, instance, child, info, True)
10274 # at this stage, all new LVs have been created, we can rename the
10276 feedback_fn("Renaming original volumes...")
10277 rename_list = [(o, n.children[0].logical_id)
10278 for (o, n) in zip(instance.disks, new_disks)]
10279 result = self.rpc.call_blockdev_rename(pnode, rename_list)
10280 result.Raise("Failed to rename original LVs")
10282 feedback_fn("Initializing DRBD devices...")
10283 # all child devices are in place, we can now create the DRBD devices
10284 for disk in new_disks:
10285 for node in [pnode, snode]:
10286 f_create = node == pnode
10287 _CreateSingleBlockDev(self, node, instance, disk, info, f_create)
10289 # at this point, the instance has been modified
10290 instance.disk_template = constants.DT_DRBD8
10291 instance.disks = new_disks
10292 self.cfg.Update(instance, feedback_fn)
10294 # disks are created, waiting for sync
10295 disk_abort = not _WaitForSync(self, instance,
10296 oneshot=not self.op.wait_for_sync)
10298 raise errors.OpExecError("There are some degraded disks for"
10299 " this instance, please cleanup manually")
10301 def _ConvertDrbdToPlain(self, feedback_fn):
10302 """Converts an instance from drbd to plain.
10305 instance = self.instance
10306 assert len(instance.secondary_nodes) == 1
10307 pnode = instance.primary_node
10308 snode = instance.secondary_nodes[0]
10309 feedback_fn("Converting template to plain")
10311 old_disks = instance.disks
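    # A DRBD8 disk has exactly two children: children[0] is the data LV and
    # children[1] the metadata LV, so the conversion keeps the former as the
    # new plain disk and later discards the latter.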
10312 new_disks = [d.children[0] for d in old_disks]
10314 # copy over size and mode
10315 for parent, child in zip(old_disks, new_disks):
10316 child.size = parent.size
10317 child.mode = parent.mode
10319 # update instance structure
10320 instance.disks = new_disks
10321 instance.disk_template = constants.DT_PLAIN
10322 self.cfg.Update(instance, feedback_fn)
10324 feedback_fn("Removing volumes on the secondary node...")
10325 for disk in old_disks:
10326 self.cfg.SetDiskID(disk, snode)
10327 msg = self.rpc.call_blockdev_remove(snode, disk).fail_msg
10329 self.LogWarning("Could not remove block device %s on node %s,"
10330 " continuing anyway: %s", disk.iv_name, snode, msg)
10332 feedback_fn("Removing unneeded volumes on the primary node...")
10333 for idx, disk in enumerate(old_disks):
10334 meta = disk.children[1]
10335 self.cfg.SetDiskID(meta, pnode)
10336 msg = self.rpc.call_blockdev_remove(pnode, meta).fail_msg
10338 self.LogWarning("Could not remove metadata for disk %d on node %s,"
10339 " continuing anyway: %s", idx, pnode, msg)
10341 def Exec(self, feedback_fn):
10342 """Modifies an instance.
10344 All parameters take effect only at the next restart of the instance.
10347 # Process here the warnings from CheckPrereq, as we don't have a
10348 # feedback_fn there.
10349 for warn in self.warn:
10350 feedback_fn("WARNING: %s" % warn)
10353 instance = self.instance
10355 for disk_op, disk_dict in self.op.disks:
10356 if disk_op == constants.DDM_REMOVE:
10357 # remove the last disk
10358 device = instance.disks.pop()
10359 device_idx = len(instance.disks)
10360 for node, disk in device.ComputeNodeTree(instance.primary_node):
10361 self.cfg.SetDiskID(disk, node)
10362 msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
10364 self.LogWarning("Could not remove disk/%d on node %s: %s,"
10365 " continuing anyway", device_idx, node, msg)
10366 result.append(("disk/%d" % device_idx, "remove"))
10367 elif disk_op == constants.DDM_ADD:
10369 if instance.disk_template in (constants.DT_FILE,
10370 constants.DT_SHARED_FILE):
10371 file_driver, file_path = instance.disks[0].logical_id
10372 file_path = os.path.dirname(file_path)
10374 file_driver = file_path = None
10375 disk_idx_base = len(instance.disks)
10376 new_disk = _GenerateDiskTemplate(self,
10377 instance.disk_template,
10378 instance.name, instance.primary_node,
10379 instance.secondary_nodes,
10383 disk_idx_base, feedback_fn)[0]
10384 instance.disks.append(new_disk)
10385 info = _GetInstanceInfoText(instance)
10387 logging.info("Creating volume %s for instance %s",
10388 new_disk.iv_name, instance.name)
10389 # Note: this needs to be kept in sync with _CreateDisks
10391 for node in instance.all_nodes:
10392 f_create = node == instance.primary_node
10394 _CreateBlockDev(self, node, instance, new_disk,
10395 f_create, info, f_create)
10396 except errors.OpExecError, err:
10397 self.LogWarning("Failed to create volume %s (%s) on"
10399 new_disk.iv_name, new_disk, node, err)
10400 result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
10401 (new_disk.size, new_disk.mode)))
10403 # change a given disk
10404 instance.disks[disk_op].mode = disk_dict[constants.IDISK_MODE]
10405 result.append(("disk.mode/%d" % disk_op,
10406 disk_dict[constants.IDISK_MODE]))
10408 if self.op.disk_template:
10409 r_shut = _ShutdownInstanceDisks(self, instance)
10411 raise errors.OpExecError("Cannot shutdown instance disks, unable to"
10412 " proceed with disk template conversion")
10413 mode = (instance.disk_template, self.op.disk_template)
10415 self._DISK_CONVERSIONS[mode](self, feedback_fn)
10417 self.cfg.ReleaseDRBDMinors(instance.name)
10419 result.append(("disk_template", self.op.disk_template))
10422 for nic_op, nic_dict in self.op.nics:
10423 if nic_op == constants.DDM_REMOVE:
10424 # remove the last nic
10425 del instance.nics[-1]
10426 result.append(("nic.%d" % len(instance.nics), "remove"))
10427 elif nic_op == constants.DDM_ADD:
10428 # mac and bridge should be set, by now
10429 mac = nic_dict[constants.INIC_MAC]
10430 ip = nic_dict.get(constants.INIC_IP, None)
10431 nicparams = self.nic_pinst[constants.DDM_ADD]
10432 new_nic = objects.NIC(mac=mac, ip=ip, nicparams=nicparams)
10433 instance.nics.append(new_nic)
10434 result.append(("nic.%d" % (len(instance.nics) - 1),
10435 "add:mac=%s,ip=%s,mode=%s,link=%s" %
10436 (new_nic.mac, new_nic.ip,
10437 self.nic_pnew[constants.DDM_ADD][constants.NIC_MODE],
10438 self.nic_pnew[constants.DDM_ADD][constants.NIC_LINK]
10441 for key in (constants.INIC_MAC, constants.INIC_IP):
10442 if key in nic_dict:
10443 setattr(instance.nics[nic_op], key, nic_dict[key])
10444 if nic_op in self.nic_pinst:
10445 instance.nics[nic_op].nicparams = self.nic_pinst[nic_op]
10446 for key, val in nic_dict.iteritems():
10447 result.append(("nic.%s/%d" % (key, nic_op), val))
10450 if self.op.hvparams:
10451 instance.hvparams = self.hv_inst
10452 for key, val in self.op.hvparams.iteritems():
10453 result.append(("hv/%s" % key, val))
10456 if self.op.beparams:
10457 instance.beparams = self.be_inst
10458 for key, val in self.op.beparams.iteritems():
10459 result.append(("be/%s" % key, val))
10462 if self.op.os_name:
10463 instance.os = self.op.os_name
10466 if self.op.osparams:
10467 instance.osparams = self.os_inst
10468 for key, val in self.op.osparams.iteritems():
10469 result.append(("os/%s" % key, val))
    self.cfg.Update(instance, feedback_fn)

    return result
10475 _DISK_CONVERSIONS = {
10476 (constants.DT_PLAIN, constants.DT_DRBD8): _ConvertPlainToDrbd,
10477 (constants.DT_DRBD8, constants.DT_PLAIN): _ConvertDrbdToPlain,
10481 class LUBackupQuery(NoHooksLU):
10482 """Query the exports list
10487 def ExpandNames(self):
10488 self.needed_locks = {}
10489 self.share_locks[locking.LEVEL_NODE] = 1
10490 if not self.op.nodes:
10491 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10493 self.needed_locks[locking.LEVEL_NODE] = \
10494 _GetWantedNodes(self, self.op.nodes)
10496 def Exec(self, feedback_fn):
10497 """Compute the list of all the exported system images.
10500 @return: a dictionary with the structure node->(export-list)
10501 where export-list is a list of the instances exported on
10505 self.nodes = self.glm.list_owned(locking.LEVEL_NODE)
10506 rpcresult = self.rpc.call_export_list(self.nodes)
10508 for node in rpcresult:
10509 if rpcresult[node].fail_msg:
10510 result[node] = False
10512 result[node] = rpcresult[node].payload
10517 class LUBackupPrepare(NoHooksLU):
10518 """Prepares an instance for an export and returns useful information.
10523 def ExpandNames(self):
10524 self._ExpandAndLockInstance()
10526 def CheckPrereq(self):
10527 """Check prerequisites.
10530 instance_name = self.op.instance_name
10532 self.instance = self.cfg.GetInstanceInfo(instance_name)
10533 assert self.instance is not None, \
10534 "Cannot retrieve locked instance %s" % self.op.instance_name
10535 _CheckNodeOnline(self, self.instance.primary_node)
10537 self._cds = _GetClusterDomainSecret()
10539 def Exec(self, feedback_fn):
10540 """Prepares an instance for an export.
10543 instance = self.instance
10545 if self.op.mode == constants.EXPORT_MODE_REMOTE:
10546 salt = utils.GenerateSecret(8)
10548 feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
10549 result = self.rpc.call_x509_cert_create(instance.primary_node,
10550 constants.RIE_CERT_VALIDITY)
10551 result.Raise("Can't create X509 key and certificate on %s" % result.node)
10553 (name, cert_pem) = result.payload
10555 cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
10559 "handshake": masterd.instance.ComputeRemoteExportHandshake(self._cds),
10560 "x509_key_name": (name, utils.Sha1Hmac(self._cds, name, salt=salt),
10562 "x509_ca": utils.SignX509Certificate(cert, self._cds, salt),
10568 class LUBackupExport(LogicalUnit):
10569 """Export an instance to an image in the cluster.
10572 HPATH = "instance-export"
10573 HTYPE = constants.HTYPE_INSTANCE
10576 def CheckArguments(self):
10577 """Check the arguments.
10580 self.x509_key_name = self.op.x509_key_name
10581 self.dest_x509_ca_pem = self.op.destination_x509_ca
10583 if self.op.mode == constants.EXPORT_MODE_REMOTE:
10584 if not self.x509_key_name:
10585 raise errors.OpPrereqError("Missing X509 key name for encryption",
10586 errors.ECODE_INVAL)
10588 if not self.dest_x509_ca_pem:
10589 raise errors.OpPrereqError("Missing destination X509 CA",
10590 errors.ECODE_INVAL)
10592 def ExpandNames(self):
10593 self._ExpandAndLockInstance()
10595 # Lock all nodes for local exports
10596 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10597 # FIXME: lock only instance primary and destination node
      # Sad but true, for now we have to lock all nodes, as we don't know where
10600 # the previous export might be, and in this LU we search for it and
10601 # remove it from its current node. In the future we could fix this by:
10602 # - making a tasklet to search (share-lock all), then create the
10603 # new one, then one to remove, after
10604 # - removing the removal operation altogether
10605 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10607 def DeclareLocks(self, level):
10608 """Last minute lock declaration."""
10609 # All nodes are locked anyway, so nothing to do here.
10611 def BuildHooksEnv(self):
10612 """Build hooks env.
10614 This will run on the master, primary node and target node.
10618 "EXPORT_MODE": self.op.mode,
10619 "EXPORT_NODE": self.op.target_node,
10620 "EXPORT_DO_SHUTDOWN": self.op.shutdown,
10621 "SHUTDOWN_TIMEOUT": self.op.shutdown_timeout,
10622 # TODO: Generic function for boolean env variables
10623 "REMOVE_INSTANCE": str(bool(self.op.remove_instance)),
10626 env.update(_BuildInstanceHookEnvByObject(self, self.instance))
10630 def BuildHooksNodes(self):
10631 """Build hooks nodes.
10634 nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
10636 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10637 nl.append(self.op.target_node)
10641 def CheckPrereq(self):
10642 """Check prerequisites.
10644 This checks that the instance and node names are valid.
10647 instance_name = self.op.instance_name
10649 self.instance = self.cfg.GetInstanceInfo(instance_name)
10650 assert self.instance is not None, \
10651 "Cannot retrieve locked instance %s" % self.op.instance_name
10652 _CheckNodeOnline(self, self.instance.primary_node)
10654 if (self.op.remove_instance and self.instance.admin_up and
10655 not self.op.shutdown):
10656 raise errors.OpPrereqError("Can not remove instance without shutting it"
10659 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10660 self.op.target_node = _ExpandNodeName(self.cfg, self.op.target_node)
10661 self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
10662 assert self.dst_node is not None
10664 _CheckNodeOnline(self, self.dst_node.name)
10665 _CheckNodeNotDrained(self, self.dst_node.name)
10668 self.dest_disk_info = None
10669 self.dest_x509_ca = None
10671 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
10672 self.dst_node = None
10674 if len(self.op.target_node) != len(self.instance.disks):
10675 raise errors.OpPrereqError(("Received destination information for %s"
10676 " disks, but instance %s has %s disks") %
10677 (len(self.op.target_node), instance_name,
10678 len(self.instance.disks)),
10679 errors.ECODE_INVAL)
10681 cds = _GetClusterDomainSecret()
10683 # Check X509 key name
10685 (key_name, hmac_digest, hmac_salt) = self.x509_key_name
10686 except (TypeError, ValueError), err:
10687 raise errors.OpPrereqError("Invalid data for X509 key name: %s" % err)
10689 if not utils.VerifySha1Hmac(cds, key_name, hmac_digest, salt=hmac_salt):
10690 raise errors.OpPrereqError("HMAC for X509 key name is wrong",
10691 errors.ECODE_INVAL)
10693 # Load and verify CA
10695 (cert, _) = utils.LoadSignedX509Certificate(self.dest_x509_ca_pem, cds)
10696 except OpenSSL.crypto.Error, err:
10697 raise errors.OpPrereqError("Unable to load destination X509 CA (%s)" %
10698 (err, ), errors.ECODE_INVAL)
10700 (errcode, msg) = utils.VerifyX509Certificate(cert, None, None)
10701 if errcode is not None:
10702 raise errors.OpPrereqError("Invalid destination X509 CA (%s)" %
10703 (msg, ), errors.ECODE_INVAL)
10705 self.dest_x509_ca = cert
10707 # Verify target information
10709 for idx, disk_data in enumerate(self.op.target_node):
10711 (host, port, magic) = \
10712 masterd.instance.CheckRemoteExportDiskInfo(cds, idx, disk_data)
10713 except errors.GenericError, err:
10714 raise errors.OpPrereqError("Target info for disk %s: %s" %
10715 (idx, err), errors.ECODE_INVAL)
10717 disk_info.append((host, port, magic))
10719 assert len(disk_info) == len(self.op.target_node)
10720 self.dest_disk_info = disk_info
10723 raise errors.ProgrammerError("Unhandled export mode %r" %
10726 # instance disk type verification
10727 # TODO: Implement export support for file-based disks
10728 for disk in self.instance.disks:
10729 if disk.dev_type == constants.LD_FILE:
10730 raise errors.OpPrereqError("Export not supported for instances with"
10731 " file-based disks", errors.ECODE_INVAL)
10733 def _CleanupExports(self, feedback_fn):
10734 """Removes exports of current instance from all other nodes.
10736 If an instance in a cluster with nodes A..D was exported to node C, its
10737 exports will be removed from the nodes A, B and D.
10740 assert self.op.mode != constants.EXPORT_MODE_REMOTE
10742 nodelist = self.cfg.GetNodeList()
10743 nodelist.remove(self.dst_node.name)
10745 # on one-node clusters nodelist will be empty after the removal
10746 # if we proceed the backup would be removed because OpBackupQuery
10747 # substitutes an empty list with the full cluster node list.
10748 iname = self.instance.name
10750 feedback_fn("Removing old exports for instance %s" % iname)
10751 exportlist = self.rpc.call_export_list(nodelist)
10752 for node in exportlist:
10753 if exportlist[node].fail_msg:
10755 if iname in exportlist[node].payload:
10756 msg = self.rpc.call_export_remove(node, iname).fail_msg
10758 self.LogWarning("Could not remove older export for instance %s"
10759 " on node %s: %s", iname, node, msg)
10761 def Exec(self, feedback_fn):
10762 """Export an instance to an image in the cluster.
10765 assert self.op.mode in constants.EXPORT_MODES
10767 instance = self.instance
10768 src_node = instance.primary_node
10770 if self.op.shutdown:
10771 # shutdown the instance, but not the disks
10772 feedback_fn("Shutting down instance %s" % instance.name)
10773 result = self.rpc.call_instance_shutdown(src_node, instance,
10774 self.op.shutdown_timeout)
10775 # TODO: Maybe ignore failures if ignore_remove_failures is set
10776 result.Raise("Could not shutdown instance %s on"
10777 " node %s" % (instance.name, src_node))
10779 # set the disks ID correctly since call_instance_start needs the
10780 # correct drbd minor to create the symlinks
10781 for disk in instance.disks:
10782 self.cfg.SetDiskID(disk, src_node)
10784 activate_disks = (not instance.admin_up)
      # Activate the instance disks if we're exporting a stopped instance
10788 feedback_fn("Activating disks for %s" % instance.name)
10789 _StartInstanceDisks(self, instance, None)
10792 helper = masterd.instance.ExportInstanceHelper(self, feedback_fn,
10795 helper.CreateSnapshots()
10797 if (self.op.shutdown and instance.admin_up and
10798 not self.op.remove_instance):
10799 assert not activate_disks
10800 feedback_fn("Starting instance %s" % instance.name)
10801 result = self.rpc.call_instance_start(src_node, instance, None, None)
10802 msg = result.fail_msg
10804 feedback_fn("Failed to start instance: %s" % msg)
10805 _ShutdownInstanceDisks(self, instance)
10806 raise errors.OpExecError("Could not start instance: %s" % msg)
10808 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10809 (fin_resu, dresults) = helper.LocalExport(self.dst_node)
10810 elif self.op.mode == constants.EXPORT_MODE_REMOTE:
10811 connect_timeout = constants.RIE_CONNECT_TIMEOUT
10812 timeouts = masterd.instance.ImportExportTimeouts(connect_timeout)
10814 (key_name, _, _) = self.x509_key_name
10817 OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM,
10820 (fin_resu, dresults) = helper.RemoteExport(self.dest_disk_info,
10821 key_name, dest_ca_pem,
10826 # Check for backwards compatibility
10827 assert len(dresults) == len(instance.disks)
10828 assert compat.all(isinstance(i, bool) for i in dresults), \
10829 "Not all results are boolean: %r" % dresults
10833 feedback_fn("Deactivating disks for %s" % instance.name)
10834 _ShutdownInstanceDisks(self, instance)
10836 if not (compat.all(dresults) and fin_resu):
10839 failures.append("export finalization")
10840 if not compat.all(dresults):
10841 fdsk = utils.CommaJoin(idx for (idx, dsk) in enumerate(dresults)
10843 failures.append("disk export: disk(s) %s" % fdsk)
10845 raise errors.OpExecError("Export failed, errors in %s" %
10846 utils.CommaJoin(failures))
10848 # At this point, the export was successful, we can cleanup/finish
10850 # Remove instance if requested
10851 if self.op.remove_instance:
10852 feedback_fn("Removing instance %s" % instance.name)
10853 _RemoveInstance(self, feedback_fn, instance,
10854 self.op.ignore_remove_failures)
10856 if self.op.mode == constants.EXPORT_MODE_LOCAL:
10857 self._CleanupExports(feedback_fn)
10859 return fin_resu, dresults
10862 class LUBackupRemove(NoHooksLU):
10863 """Remove exports related to the named instance.
10868 def ExpandNames(self):
10869 self.needed_locks = {}
10870 # We need all nodes to be locked in order for RemoveExport to work, but we
10871 # don't need to lock the instance itself, as nothing will happen to it (and
10872 # we can remove exports also for a removed instance)
10873 self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
10875 def Exec(self, feedback_fn):
10876 """Remove any export.
10879 instance_name = self.cfg.ExpandInstanceName(self.op.instance_name)
10880 # If the instance was not found we'll try with the name that was passed in.
10881 # This will only work if it was an FQDN, though.
    fqdn_warn = False
    if not instance_name:
      fqdn_warn = True
      instance_name = self.op.instance_name
10887 locked_nodes = self.glm.list_owned(locking.LEVEL_NODE)
    exportlist = self.rpc.call_export_list(locked_nodes)
    found = False
10890 for node in exportlist:
      msg = exportlist[node].fail_msg
      if msg:
        self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
        continue
      if instance_name in exportlist[node].payload:
        found = True
10897 result = self.rpc.call_export_remove(node, instance_name)
        msg = result.fail_msg
        if msg:
10900 logging.error("Could not remove export for instance %s"
10901 " on node %s: %s", instance_name, node, msg)
10903 if fqdn_warn and not found:
10904 feedback_fn("Export not found. If trying to remove an export belonging"
10905 " to a deleted instance please use its Fully Qualified"
10909 class LUGroupAdd(LogicalUnit):
10910 """Logical unit for creating node groups.
10913 HPATH = "group-add"
10914 HTYPE = constants.HTYPE_GROUP
10917 def ExpandNames(self):
10918 # We need the new group's UUID here so that we can create and acquire the
10919 # corresponding lock. Later, in Exec(), we'll indicate to cfg.AddNodeGroup
10920 # that it should not check whether the UUID exists in the configuration.
10921 self.group_uuid = self.cfg.GenerateUniqueID(self.proc.GetECId())
10922 self.needed_locks = {}
10923 self.add_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
10925 def CheckPrereq(self):
10926 """Check prerequisites.
    This checks that the given group name is not an existing node group
    already.

    """
    try:
      existing_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
    except errors.OpPrereqError:
      pass
    else:
10937 raise errors.OpPrereqError("Desired group name '%s' already exists as a"
10938 " node group (UUID: %s)" %
10939 (self.op.group_name, existing_uuid),
10940 errors.ECODE_EXISTS)
10942 if self.op.ndparams:
10943 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
10945 def BuildHooksEnv(self):
10946 """Build hooks env.
10950 "GROUP_NAME": self.op.group_name,
10953 def BuildHooksNodes(self):
10954 """Build hooks nodes.
10957 mn = self.cfg.GetMasterNode()
10958 return ([mn], [mn])
10960 def Exec(self, feedback_fn):
10961 """Add the node group to the cluster.
10964 group_obj = objects.NodeGroup(name=self.op.group_name, members=[],
10965 uuid=self.group_uuid,
10966 alloc_policy=self.op.alloc_policy,
10967 ndparams=self.op.ndparams)
10969 self.cfg.AddNodeGroup(group_obj, self.proc.GetECId(), check_uuid=False)
10970 del self.remove_locks[locking.LEVEL_NODEGROUP]
10973 class LUGroupAssignNodes(NoHooksLU):
10974 """Logical unit for assigning nodes to groups.
10979 def ExpandNames(self):
10980 # These raise errors.OpPrereqError on their own:
10981 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
10982 self.op.nodes = _GetWantedNodes(self, self.op.nodes)
10984 # We want to lock all the affected nodes and groups. We have readily
10985 # available the list of nodes, and the *destination* group. To gather the
10986 # list of "source" groups, we need to fetch node information later on.
10987 self.needed_locks = {
10988 locking.LEVEL_NODEGROUP: set([self.group_uuid]),
      locking.LEVEL_NODE: self.op.nodes,
      }
10992 def DeclareLocks(self, level):
10993 if level == locking.LEVEL_NODEGROUP:
10994 assert len(self.needed_locks[locking.LEVEL_NODEGROUP]) == 1
10996 # Try to get all affected nodes' groups without having the group or node
10997 # lock yet. Needs verification later in the code flow.
10998 groups = self.cfg.GetNodeGroupsFromNodes(self.op.nodes)
11000 self.needed_locks[locking.LEVEL_NODEGROUP].update(groups)
11002 def CheckPrereq(self):
11003 """Check prerequisites.
11006 assert self.needed_locks[locking.LEVEL_NODEGROUP]
11007 assert (frozenset(self.glm.list_owned(locking.LEVEL_NODE)) ==
11008 frozenset(self.op.nodes))
11010 expected_locks = (set([self.group_uuid]) |
11011 self.cfg.GetNodeGroupsFromNodes(self.op.nodes))
11012 actual_locks = self.glm.list_owned(locking.LEVEL_NODEGROUP)
11013 if actual_locks != expected_locks:
11014 raise errors.OpExecError("Nodes changed groups since locks were acquired,"
11015 " current groups are '%s', used to be '%s'" %
11016 (utils.CommaJoin(expected_locks),
11017 utils.CommaJoin(actual_locks)))
11019 self.node_data = self.cfg.GetAllNodesInfo()
11020 self.group = self.cfg.GetNodeGroup(self.group_uuid)
11021 instance_data = self.cfg.GetAllInstancesInfo()
11023 if self.group is None:
11024 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
11025 (self.op.group_name, self.group_uuid))
11027 (new_splits, previous_splits) = \
11028 self.CheckAssignmentForSplitInstances([(node, self.group_uuid)
11029 for node in self.op.nodes],
11030 self.node_data, instance_data)
    if new_splits:
      fmt_new_splits = utils.CommaJoin(utils.NiceSort(new_splits))

      if not self.op.force:
        raise errors.OpExecError("The following instances get split by this"
                                 " change and --force was not given: %s" %
                                 fmt_new_splits)
      else:
        self.LogWarning("This operation will split the following instances: %s",
                        fmt_new_splits)
11043 if previous_splits:
11044 self.LogWarning("In addition, these already-split instances continue"
11045 " to be split across groups: %s",
11046 utils.CommaJoin(utils.NiceSort(previous_splits)))
11048 def Exec(self, feedback_fn):
11049 """Assign nodes to a new group.
11052 for node in self.op.nodes:
11053 self.node_data[node].group = self.group_uuid
11055 # FIXME: Depends on side-effects of modifying the result of
11056 # C{cfg.GetAllNodesInfo}
11058 self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
11061 def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
11062 """Check for split instances after a node assignment.
11064 This method considers a series of node assignments as an atomic operation,
11065 and returns information about split instances after applying the set of
11068 In particular, it returns information about newly split instances, and
11069 instances that were already split, and remain so after the change.
11071 Only instances whose disk template is listed in constants.DTS_INT_MIRROR are
11074 @type changes: list of (node_name, new_group_uuid) pairs.
11075 @param changes: list of node assignments to consider.
11076 @param node_data: a dict with data for all nodes
11077 @param instance_data: a dict with all instances to consider
11078 @rtype: a two-tuple
11079 @return: a list of instances that were previously okay and result split as a
11080 consequence of this change, and a list of instances that were previously
      split and this change does not fix.

    """
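    # A hypothetical call: changes=[("node1.example.com", "target-group-uuid"),
    #                               ("node2.example.com", "target-group-uuid")]
    # together with node_data/instance_data as returned by
    # cfg.GetAllNodesInfo() and cfg.GetAllInstancesInfo(); the names above
    # are made up for illustration.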
11084 changed_nodes = dict((node, group) for node, group in changes
11085 if node_data[node].group != group)
11087 all_split_instances = set()
11088 previously_split_instances = set()
11090 def InstanceNodes(instance):
11091 return [instance.primary_node] + list(instance.secondary_nodes)
11093 for inst in instance_data.values():
11094 if inst.disk_template not in constants.DTS_INT_MIRROR:
11097 instance_nodes = InstanceNodes(inst)
11099 if len(set(node_data[node].group for node in instance_nodes)) > 1:
11100 previously_split_instances.add(inst.name)
11102 if len(set(changed_nodes.get(node, node_data[node].group)
11103 for node in instance_nodes)) > 1:
11104 all_split_instances.add(inst.name)
11106 return (list(all_split_instances - previously_split_instances),
11107 list(previously_split_instances & all_split_instances))
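# Worked example of the split check above (hypothetical data, not taken from
# any cluster): nodes "n1" and "n2" both live in group "g1" and host a DRBD
# instance "inst1" (primary on "n1", secondary on "n2").  For the assignment
# changes = [("n1", "g2")]:
#
#   changed_nodes                -> {"n1": "g2"}
#   groups of inst1 before       -> {"g1"}        (not previously split)
#   groups of inst1 after change -> {"g1", "g2"}  (newly split)
#   return value                 -> (["inst1"], [])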
11110 class _GroupQuery(_QueryBase):
11111 FIELDS = query.GROUP_FIELDS
11113 def ExpandNames(self, lu):
11114 lu.needed_locks = {}
11116 self._all_groups = lu.cfg.GetAllNodeGroupsInfo()
11117 name_to_uuid = dict((g.name, g.uuid) for g in self._all_groups.values())
11120 self.wanted = [name_to_uuid[name]
11121 for name in utils.NiceSort(name_to_uuid.keys())]
11123 # Accept names to be either names or UUIDs.
11126 all_uuid = frozenset(self._all_groups.keys())
11128 for name in self.names:
11129 if name in all_uuid:
11130 self.wanted.append(name)
11131 elif name in name_to_uuid:
11132 self.wanted.append(name_to_uuid[name])
11134 missing.append(name)
11137 raise errors.OpPrereqError("Some groups do not exist: %s" %
11138 utils.CommaJoin(missing),
11139 errors.ECODE_NOENT)
11141 def DeclareLocks(self, lu, level):
11144 def _GetQueryData(self, lu):
11145 """Computes the list of node groups and their attributes.
11148 do_nodes = query.GQ_NODE in self.requested_data
11149 do_instances = query.GQ_INST in self.requested_data
11151 group_to_nodes = None
11152 group_to_instances = None
11154 # For GQ_NODE, we need to map group->[nodes], and for GQ_INST we need to map
11155 # group->[instances]. The former can be built from GetAllNodesInfo() alone,
11156 # but for the latter GetAllInstancesInfo() is not enough, as we have to go
11157 # through instance->node. Hence, we need to process nodes even if only
11158 # instance information was requested.
11159 if do_nodes or do_instances:
11160 all_nodes = lu.cfg.GetAllNodesInfo()
11161 group_to_nodes = dict((uuid, []) for uuid in self.wanted)
11164 for node in all_nodes.values():
11165 if node.group in group_to_nodes:
11166 group_to_nodes[node.group].append(node.name)
11167 node_to_group[node.name] = node.group
11170 all_instances = lu.cfg.GetAllInstancesInfo()
11171 group_to_instances = dict((uuid, []) for uuid in self.wanted)
11173 for instance in all_instances.values():
11174 node = instance.primary_node
11175 if node in node_to_group:
11176 group_to_instances[node_to_group[node]].append(instance.name)
11179 # Do not pass on node information if it was not requested.
11180 group_to_nodes = None
11182 return query.GroupQueryData([self._all_groups[uuid]
11183 for uuid in self.wanted],
11184 group_to_nodes, group_to_instances)
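# Illustrative shape of the maps built above (hypothetical names/UUIDs): with
# nodes "n1" and "n2" in group "uuid-a", node "n3" in group "uuid-b", and an
# instance "inst1" whose primary node is "n2", a query for both groups gets:
#
#   group_to_nodes     -> {"uuid-a": ["n1", "n2"], "uuid-b": ["n3"]}
#   group_to_instances -> {"uuid-a": ["inst1"], "uuid-b": []}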
11187 class LUGroupQuery(NoHooksLU):
11188 """Logical unit for querying node groups.
11193 def CheckArguments(self):
11194 self.gq = _GroupQuery(qlang.MakeSimpleFilter("name", self.op.names),
11195 self.op.output_fields, False)
11197 def ExpandNames(self):
11198 self.gq.ExpandNames(self)
11200 def Exec(self, feedback_fn):
11201 return self.gq.OldStyleQuery(self)
11204 class LUGroupSetParams(LogicalUnit):
11205 """Modifies the parameters of a node group.
11208 HPATH = "group-modify"
11209 HTYPE = constants.HTYPE_GROUP
11212 def CheckArguments(self):
11215 self.op.alloc_policy,
11218 if all_changes.count(None) == len(all_changes):
11219 raise errors.OpPrereqError("Please pass at least one modification",
11220 errors.ECODE_INVAL)
11222 def ExpandNames(self):
11223 # This raises errors.OpPrereqError on its own:
11224 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11226 self.needed_locks = {
11227 locking.LEVEL_NODEGROUP: [self.group_uuid],
11230 def CheckPrereq(self):
11231 """Check prerequisites.
11234 self.group = self.cfg.GetNodeGroup(self.group_uuid)
11236 if self.group is None:
11237 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
11238 (self.op.group_name, self.group_uuid))
11240 if self.op.ndparams:
11241 new_ndparams = _GetUpdatedParams(self.group.ndparams, self.op.ndparams)
11242 utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
11243 self.new_ndparams = new_ndparams
11245 def BuildHooksEnv(self):
11246 """Build hooks env.
11250 "GROUP_NAME": self.op.group_name,
11251 "NEW_ALLOC_POLICY": self.op.alloc_policy,
11254 def BuildHooksNodes(self):
11255 """Build hooks nodes.
11258 mn = self.cfg.GetMasterNode()
11259 return ([mn], [mn])
11261 def Exec(self, feedback_fn):
11262 """Modifies the node group.
11267 if self.op.ndparams:
11268 self.group.ndparams = self.new_ndparams
11269 result.append(("ndparams", str(self.group.ndparams)))
11271 if self.op.alloc_policy:
11272 self.group.alloc_policy = self.op.alloc_policy
11274 self.cfg.Update(self.group, feedback_fn)
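# Illustrative return value of this Exec (hypothetical values): the list of
# (parameter, new value) pairs collected above, e.g.
#
#   [("ndparams", "{'oob_program': '/usr/sbin/my-oob'}"),
#    ("alloc_policy", "preferred")]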
11279 class LUGroupRemove(LogicalUnit):
11280 HPATH = "group-remove"
11281 HTYPE = constants.HTYPE_GROUP
11284 def ExpandNames(self):
11285 # This raises errors.OpPrereqError on its own:
11286 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11287 self.needed_locks = {
11288 locking.LEVEL_NODEGROUP: [self.group_uuid],
11291 def CheckPrereq(self):
11292 """Check prerequisites.
11294 This checks that the given group name exists as a node group, that it is
11295 empty (i.e., contains no nodes), and that it is not the last group of the cluster.
11299 # Verify that the group is empty.
11300 group_nodes = [node.name
11301 for node in self.cfg.GetAllNodesInfo().values()
11302 if node.group == self.group_uuid]
11305 raise errors.OpPrereqError("Group '%s' not empty, has the following"
11307 (self.op.group_name,
11308 utils.CommaJoin(utils.NiceSort(group_nodes))),
11309 errors.ECODE_STATE)
11311 # Verify the cluster would not be left group-less.
11312 if len(self.cfg.GetNodeGroupList()) == 1:
11313 raise errors.OpPrereqError("Group '%s' is the only group,"
11314 " cannot be removed" %
11315 self.op.group_name,
11316 errors.ECODE_STATE)
11318 def BuildHooksEnv(self):
11319 """Build hooks env.
11323 "GROUP_NAME": self.op.group_name,
11326 def BuildHooksNodes(self):
11327 """Build hooks nodes.
11330 mn = self.cfg.GetMasterNode()
11331 return ([mn], [mn])
11333 def Exec(self, feedback_fn):
11334 """Remove the node group.
11338 self.cfg.RemoveNodeGroup(self.group_uuid)
11339 except errors.ConfigurationError:
11340 raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
11341 (self.op.group_name, self.group_uuid))
11343 self.remove_locks[locking.LEVEL_NODEGROUP] = self.group_uuid
11346 class LUGroupRename(LogicalUnit):
11347 HPATH = "group-rename"
11348 HTYPE = constants.HTYPE_GROUP
11351 def ExpandNames(self):
11352 # This raises errors.OpPrereqError on its own:
11353 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
11355 self.needed_locks = {
11356 locking.LEVEL_NODEGROUP: [self.group_uuid],
11359 def CheckPrereq(self):
11360 """Check prerequisites.
11362 Ensures requested new name is not yet used.
11366 new_name_uuid = self.cfg.LookupNodeGroup(self.op.new_name)
11367 except errors.OpPrereqError:
11370 raise errors.OpPrereqError("Desired new name '%s' clashes with existing"
11371 " node group (UUID: %s)" %
11372 (self.op.new_name, new_name_uuid),
11373 errors.ECODE_EXISTS)
11375 def BuildHooksEnv(self):
11376 """Build hooks env.
11380 "OLD_NAME": self.op.group_name,
11381 "NEW_NAME": self.op.new_name,
11384 def BuildHooksNodes(self):
11385 """Build hooks nodes.
11388 mn = self.cfg.GetMasterNode()
11390 all_nodes = self.cfg.GetAllNodesInfo()
11391 all_nodes.pop(mn, None)
11394 run_nodes.extend(node.name for node in all_nodes.values()
11395 if node.group == self.group_uuid)
11397 return (run_nodes, run_nodes)
11399 def Exec(self, feedback_fn):
11400 """Rename the node group.
11403 group = self.cfg.GetNodeGroup(self.group_uuid)
11406 raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
11407 (self.op.group_name, self.group_uuid))
11409 group.name = self.op.new_name
11410 self.cfg.Update(group, feedback_fn)
11412 return self.op.new_name
11415 class TagsLU(NoHooksLU): # pylint: disable-msg=W0223
11416 """Generic tags LU.
11418 This is an abstract class which is the parent of all the other tags LUs.
11421 def ExpandNames(self):
11422 self.group_uuid = None
11423 self.needed_locks = {}
11424 if self.op.kind == constants.TAG_NODE:
11425 self.op.name = _ExpandNodeName(self.cfg, self.op.name)
11426 self.needed_locks[locking.LEVEL_NODE] = self.op.name
11427 elif self.op.kind == constants.TAG_INSTANCE:
11428 self.op.name = _ExpandInstanceName(self.cfg, self.op.name)
11429 self.needed_locks[locking.LEVEL_INSTANCE] = self.op.name
11430 elif self.op.kind == constants.TAG_NODEGROUP:
11431 self.group_uuid = self.cfg.LookupNodeGroup(self.op.name)
11433 # FIXME: Acquire BGL for cluster tag operations (as of this writing it's
11434 # not possible to acquire the BGL based on opcode parameters)
11436 def CheckPrereq(self):
11437 """Check prerequisites.
11440 if self.op.kind == constants.TAG_CLUSTER:
11441 self.target = self.cfg.GetClusterInfo()
11442 elif self.op.kind == constants.TAG_NODE:
11443 self.target = self.cfg.GetNodeInfo(self.op.name)
11444 elif self.op.kind == constants.TAG_INSTANCE:
11445 self.target = self.cfg.GetInstanceInfo(self.op.name)
11446 elif self.op.kind == constants.TAG_NODEGROUP:
11447 self.target = self.cfg.GetNodeGroup(self.group_uuid)
11449 raise errors.OpPrereqError("Wrong tag type requested (%s)" %
11450 str(self.op.kind), errors.ECODE_INVAL)
11453 class LUTagsGet(TagsLU):
11454 """Returns the tags of a given object.
11459 def ExpandNames(self):
11460 TagsLU.ExpandNames(self)
11462 # Share locks as this is only a read operation
11463 self.share_locks = dict.fromkeys(locking.LEVELS, 1)
11465 def Exec(self, feedback_fn):
11466 """Returns the tag list.
11469 return list(self.target.GetTags())
11472 class LUTagsSearch(NoHooksLU):
11473 """Searches the tags for a given pattern.
11478 def ExpandNames(self):
11479 self.needed_locks = {}
11481 def CheckPrereq(self):
11482 """Check prerequisites.
11484 This checks the pattern passed for validity by compiling it.
11488 self.re = re.compile(self.op.pattern)
11489 except re.error, err:
11490 raise errors.OpPrereqError("Invalid search pattern '%s': %s" %
11491 (self.op.pattern, err), errors.ECODE_INVAL)
11493 def Exec(self, feedback_fn):
11494 """Returns the tag list.
11498 tgts = [("/cluster", cfg.GetClusterInfo())]
11499 ilist = cfg.GetAllInstancesInfo().values()
11500 tgts.extend([("/instances/%s" % i.name, i) for i in ilist])
11501 nlist = cfg.GetAllNodesInfo().values()
11502 tgts.extend([("/nodes/%s" % n.name, n) for n in nlist])
11503 tgts.extend(("/nodegroup/%s" % n.name, n)
11504 for n in cfg.GetAllNodeGroupsInfo().values())
11506 for path, target in tgts:
11507 for tag in target.GetTags():
11508 if self.re.search(tag):
11509 results.append((path, tag))
11513 class LUTagsSet(TagsLU):
11514 """Sets a tag on a given object.
11519 def CheckPrereq(self):
11520 """Check prerequisites.
11522 This checks the type and length of the tag name and value.
11525 TagsLU.CheckPrereq(self)
11526 for tag in self.op.tags:
11527 objects.TaggableObject.ValidateTag(tag)
11529 def Exec(self, feedback_fn):
11534 for tag in self.op.tags:
11535 self.target.AddTag(tag)
11536 except errors.TagError, err:
11537 raise errors.OpExecError("Error while setting tag: %s" % str(err))
11538 self.cfg.Update(self.target, feedback_fn)
11541 class LUTagsDel(TagsLU):
11542 """Delete a list of tags from a given object.
11547 def CheckPrereq(self):
11548 """Check prerequisites.
11550 This checks that we have the given tag.
11553 TagsLU.CheckPrereq(self)
11554 for tag in self.op.tags:
11555 objects.TaggableObject.ValidateTag(tag)
11556 del_tags = frozenset(self.op.tags)
11557 cur_tags = self.target.GetTags()
11559 diff_tags = del_tags - cur_tags
11561 diff_names = ("'%s'" % i for i in sorted(diff_tags))
11562 raise errors.OpPrereqError("Tag(s) %s not found" %
11563 (utils.CommaJoin(diff_names), ),
11564 errors.ECODE_NOENT)
11566 def Exec(self, feedback_fn):
11567 """Remove the tag from the object.
11570 for tag in self.op.tags:
11571 self.target.RemoveTag(tag)
11572 self.cfg.Update(self.target, feedback_fn)
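# Worked example of the prerequisite check above (hypothetical tags): if the
# target currently carries the tags {"web", "prod"} and the opcode asks to
# delete ["prod", "staging"], then del_tags - cur_tags == {"staging"}, so an
# OpPrereqError ("Tag(s) 'staging' not found") is raised before any tag is
# removed; deletion only proceeds when every requested tag is present.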
11575 class LUTestDelay(NoHooksLU):
11576 """Sleep for a specified amount of time.
11578 This LU sleeps on the master and/or nodes for a specified amount of time.
11584 def ExpandNames(self):
11585 """Expand names and set required locks.
11587 This expands the node list, if any.
11590 self.needed_locks = {}
11591 if self.op.on_nodes:
11592 # _GetWantedNodes can be used here, but it is not always appropriate to use
11593 # it this way in ExpandNames; check the LogicalUnit.ExpandNames docstring for
11594 # more information.
11595 self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
11596 self.needed_locks[locking.LEVEL_NODE] = self.op.on_nodes
11598 def _TestDelay(self):
11599 """Do the actual sleep.
11602 if self.op.on_master:
11603 if not utils.TestDelay(self.op.duration):
11604 raise errors.OpExecError("Error during master delay test")
11605 if self.op.on_nodes:
11606 result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
11607 for node, node_result in result.items():
11608 node_result.Raise("Failure during rpc call to node %s" % node)
11610 def Exec(self, feedback_fn):
11611 """Execute the test delay opcode, with the wanted repetitions.
11614 if self.op.repeat == 0:
11617 top_value = self.op.repeat - 1
11618 for i in range(self.op.repeat):
11619 self.LogInfo("Test delay iteration %d/%d" % (i, top_value))
11623 class LUTestJqueue(NoHooksLU):
11624 """Utility LU to test some aspects of the job queue.
11629 # Must be lower than default timeout for WaitForJobChange to see whether it
11630 # notices changed jobs
11631 _CLIENT_CONNECT_TIMEOUT = 20.0
11632 _CLIENT_CONFIRM_TIMEOUT = 60.0
11635 def _NotifyUsingSocket(cls, cb, errcls):
11636 """Opens a Unix socket and waits for another program to connect.
11639 @param cb: Callback to send socket name to client
11640 @type errcls: class
11641 @param errcls: Exception class to use for errors
11644 # Using a temporary directory as there's no easy way to create temporary
11645 # sockets without writing a custom loop around tempfile.mktemp and socket.bind.
11647 tmpdir = tempfile.mkdtemp()
11649 tmpsock = utils.PathJoin(tmpdir, "sock")
11651 logging.debug("Creating temporary socket at %s", tmpsock)
11652 sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
11657 # Send details to client
11660 # Wait for client to connect before continuing
11661 sock.settimeout(cls._CLIENT_CONNECT_TIMEOUT)
11663 (conn, _) = sock.accept()
11664 except socket.error, err:
11665 raise errcls("Client didn't connect in time (%s)" % err)
11669 # Remove as soon as client is connected
11670 shutil.rmtree(tmpdir)
11672 # Wait for client to close
11675 # pylint: disable-msg=E1101
11676 # Instance of '_socketobject' has no ... member
11677 conn.settimeout(cls._CLIENT_CONFIRM_TIMEOUT)
11679 except socket.error, err:
11680 raise errcls("Client failed to confirm notification (%s)" % err)
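# A minimal sketch of the client side this helper expects (hypothetical test
# driver code, not part of this module): the client learns the socket path
# from the ELOG_JQUEUE_TEST feedback message, connects to acknowledge the
# notification, and closes the connection to confirm it:
#
#   import socket
#
#   def _AckNotification(sockname):
#     sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
#     sock.connect(sockname)  # unblocks the accept() above
#     sock.close()            # unblocks the confirmation wait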
11684 def _SendNotification(self, test, arg, sockname):
11685 """Sends a notification to the client.
11688 @param test: Test name
11689 @param arg: Test argument (depends on test)
11690 @type sockname: string
11691 @param sockname: Socket path
11694 self.Log(constants.ELOG_JQUEUE_TEST, (sockname, test, arg))
11696 def _Notify(self, prereq, test, arg):
11697 """Notifies the client of a test.
11700 @param prereq: Whether this is a prereq-phase test
11702 @param test: Test name
11703 @param arg: Test argument (depends on test)
11707 errcls = errors.OpPrereqError
11709 errcls = errors.OpExecError
11711 return self._NotifyUsingSocket(compat.partial(self._SendNotification,
11715 def CheckArguments(self):
11716 self.checkargs_calls = getattr(self, "checkargs_calls", 0) + 1
11717 self.expandnames_calls = 0
11719 def ExpandNames(self):
11720 checkargs_calls = getattr(self, "checkargs_calls", 0)
11721 if checkargs_calls < 1:
11722 raise errors.ProgrammerError("CheckArguments was not called")
11724 self.expandnames_calls += 1
11726 if self.op.notify_waitlock:
11727 self._Notify(True, constants.JQT_EXPANDNAMES, None)
11729 self.LogInfo("Expanding names")
11731 # Get lock on master node (just to get a lock, not for a particular reason)
11732 self.needed_locks = {
11733 locking.LEVEL_NODE: self.cfg.GetMasterNode(),
11736 def Exec(self, feedback_fn):
11737 if self.expandnames_calls < 1:
11738 raise errors.ProgrammerError("ExpandNames was not called")
11740 if self.op.notify_exec:
11741 self._Notify(False, constants.JQT_EXEC, None)
11743 self.LogInfo("Executing")
11745 if self.op.log_messages:
11746 self._Notify(False, constants.JQT_STARTMSG, len(self.op.log_messages))
11747 for idx, msg in enumerate(self.op.log_messages):
11748 self.LogInfo("Sending log message %s", idx + 1)
11749 feedback_fn(constants.JQT_MSGPREFIX + msg)
11750 # Report how many test messages have been sent
11751 self._Notify(False, constants.JQT_LOGMSG, idx + 1)
11754 raise errors.OpExecError("Opcode failure was requested")
11759 class IAllocator(object):
11760 """IAllocator framework.
11762 An IAllocator instance has four sets of attributes:
11763 - cfg that is needed to query the cluster
11764 - input data (all members of the _KEYS class attribute are required)
11765 - four buffer attributes (in_data, in_text, out_data, out_text), that
11766 represent the input (to the external script) in text and data structure
11767 format, and the output from it, again in both formats
11768 - the result variables from the script (success, info, result) for easy usage
11772 # pylint: disable-msg=R0902
11773 # lots of instance attributes
11775 def __init__(self, cfg, rpc, mode, **kwargs):
11778 # init buffer variables
11779 self.in_text = self.out_text = self.in_data = self.out_data = None
11780 # init all input fields so that pylint is happy
11782 self.mem_size = self.disks = self.disk_template = None
11783 self.os = self.tags = self.nics = self.vcpus = None
11784 self.hypervisor = None
11785 self.relocate_from = None
11787 self.evac_nodes = None
11789 self.required_nodes = None
11790 # init result fields
11791 self.success = self.info = self.result = None
11794 (fn, keyset) = self._MODE_DATA[self.mode]
11796 raise errors.ProgrammerError("Unknown mode '%s' passed to the"
11797 " IAllocator" % self.mode)
11800 if key not in keyset:
11801 raise errors.ProgrammerError("Invalid input parameter '%s' to"
11802 " IAllocator" % key)
11803 setattr(self, key, kwargs[key])
11806 if key not in kwargs:
11807 raise errors.ProgrammerError("Missing input parameter '%s' to"
11808 " IAllocator" % key)
11809 self._BuildInputData(compat.partial(fn, self))
11811 def _ComputeClusterData(self):
11812 """Compute the generic allocator input data.
11814 This is the data that is independent of the actual operation.
11818 cluster_info = cfg.GetClusterInfo()
11821 "version": constants.IALLOCATOR_VERSION,
11822 "cluster_name": cfg.GetClusterName(),
11823 "cluster_tags": list(cluster_info.GetTags()),
11824 "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
11825 # we don't have job IDs
11827 ninfo = cfg.GetAllNodesInfo()
11828 iinfo = cfg.GetAllInstancesInfo().values()
11829 i_list = [(inst, cluster_info.FillBE(inst)) for inst in iinfo]
11832 node_list = [n.name for n in ninfo.values() if n.vm_capable]
11834 if self.mode == constants.IALLOCATOR_MODE_ALLOC:
11835 hypervisor_name = self.hypervisor
11836 elif self.mode == constants.IALLOCATOR_MODE_RELOC:
11837 hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
11838 elif self.mode == constants.IALLOCATOR_MODE_MEVAC:
11839 hypervisor_name = cluster_info.enabled_hypervisors[0]
11841 node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
11844 self.rpc.call_all_instances_info(node_list,
11845 cluster_info.enabled_hypervisors)
11847 data["nodegroups"] = self._ComputeNodeGroupData(cfg)
11849 config_ndata = self._ComputeBasicNodeData(ninfo)
11850 data["nodes"] = self._ComputeDynamicNodeData(ninfo, node_data, node_iinfo,
11851 i_list, config_ndata)
11852 assert len(data["nodes"]) == len(ninfo), \
11853 "Incomplete node data computed"
11855 data["instances"] = self._ComputeInstanceData(cluster_info, i_list)
11857 self.in_data = data
11860 def _ComputeNodeGroupData(cfg):
11861 """Compute node groups data.
11865 for guuid, gdata in cfg.GetAllNodeGroupsInfo().items():
11867 "name": gdata.name,
11868 "alloc_policy": gdata.alloc_policy,
11873 def _ComputeBasicNodeData(node_cfg):
11874 """Compute global node data.
11877 @returns: a dict mapping node names to dicts of static node attributes
11881 for ninfo in node_cfg.values():
11882 # fill in static (config-based) values
11884 "tags": list(ninfo.GetTags()),
11885 "primary_ip": ninfo.primary_ip,
11886 "secondary_ip": ninfo.secondary_ip,
11887 "offline": ninfo.offline,
11888 "drained": ninfo.drained,
11889 "master_candidate": ninfo.master_candidate,
11890 "group": ninfo.group,
11891 "master_capable": ninfo.master_capable,
11892 "vm_capable": ninfo.vm_capable,
11895 node_results[ninfo.name] = pnr
11897 return node_results
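# Illustrative shape of one static node entry built above (hypothetical
# values):
#
#   "node1.example.com": {
#     "tags": [], "primary_ip": "192.0.2.10", "secondary_ip": "198.51.100.10",
#     "offline": False, "drained": False, "master_candidate": True,
#     "group": "<group UUID>", "master_capable": True, "vm_capable": True,
#   }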
11900 def _ComputeDynamicNodeData(node_cfg, node_data, node_iinfo, i_list,
11902 """Compute global node data.
11904 @param node_results: the basic node structures as filled from the config
11907 # make a copy of the current dict
11908 node_results = dict(node_results)
11909 for nname, nresult in node_data.items():
11910 assert nname in node_results, "Missing basic data for node %s" % nname
11911 ninfo = node_cfg[nname]
11913 if not (ninfo.offline or ninfo.drained):
11914 nresult.Raise("Can't get data for node %s" % nname)
11915 node_iinfo[nname].Raise("Can't get node instance info from node %s" %
11917 remote_info = nresult.payload
11919 for attr in ['memory_total', 'memory_free', 'memory_dom0',
11920 'vg_size', 'vg_free', 'cpu_total']:
11921 if attr not in remote_info:
11922 raise errors.OpExecError("Node '%s' didn't return attribute"
11923 " '%s'" % (nname, attr))
11924 if not isinstance(remote_info[attr], int):
11925 raise errors.OpExecError("Node '%s' returned invalid value"
11927 (nname, attr, remote_info[attr]))
11928 # compute memory used by primary instances
11929 i_p_mem = i_p_up_mem = 0
11930 for iinfo, beinfo in i_list:
11931 if iinfo.primary_node == nname:
11932 i_p_mem += beinfo[constants.BE_MEMORY]
11933 if iinfo.name not in node_iinfo[nname].payload:
11936 i_used_mem = int(node_iinfo[nname].payload[iinfo.name]['memory'])
11937 i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
11938 remote_info['memory_free'] -= max(0, i_mem_diff)
11941 i_p_up_mem += beinfo[constants.BE_MEMORY]
11943 # compute memory used by instances
11945 "total_memory": remote_info['memory_total'],
11946 "reserved_memory": remote_info['memory_dom0'],
11947 "free_memory": remote_info['memory_free'],
11948 "total_disk": remote_info['vg_size'],
11949 "free_disk": remote_info['vg_free'],
11950 "total_cpus": remote_info['cpu_total'],
11951 "i_pri_memory": i_p_mem,
11952 "i_pri_up_memory": i_p_up_mem,
11954 pnr_dyn.update(node_results[nname])
11955 node_results[nname] = pnr_dyn
11957 return node_results
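# Worked example of the free-memory correction above (hypothetical numbers):
# an instance configured with BE_MEMORY = 1024 MiB whose hypervisor report
# shows only 256 MiB in use gives i_mem_diff = 768, so 768 MiB are subtracted
# from the node's reported memory_free; the figure then reflects what remains
# once the instance grows back to its configured size.  If an instance uses
# more than its configured memory, max(0, i_mem_diff) keeps the correction at
# zero.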
11960 def _ComputeInstanceData(cluster_info, i_list):
11961 """Compute global instance data.
11965 for iinfo, beinfo in i_list:
11967 for nic in iinfo.nics:
11968 filled_params = cluster_info.SimpleFillNIC(nic.nicparams)
11969 nic_dict = {"mac": nic.mac,
11971 "mode": filled_params[constants.NIC_MODE],
11972 "link": filled_params[constants.NIC_LINK],
11974 if filled_params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
11975 nic_dict["bridge"] = filled_params[constants.NIC_LINK]
11976 nic_data.append(nic_dict)
11978 "tags": list(iinfo.GetTags()),
11979 "admin_up": iinfo.admin_up,
11980 "vcpus": beinfo[constants.BE_VCPUS],
11981 "memory": beinfo[constants.BE_MEMORY],
11983 "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
11985 "disks": [{constants.IDISK_SIZE: dsk.size,
11986 constants.IDISK_MODE: dsk.mode}
11987 for dsk in iinfo.disks],
11988 "disk_template": iinfo.disk_template,
11989 "hypervisor": iinfo.hypervisor,
11991 pir["disk_space_total"] = _ComputeDiskSize(iinfo.disk_template,
11993 instance_data[iinfo.name] = pir
11995 return instance_data
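# Illustrative shape of one "instances" entry produced above (hypothetical
# values; only fields built in this method are shown):
#
#   "inst1.example.com": {
#     "tags": [], "admin_up": True, "vcpus": 2, "memory": 1024,
#     "nics": [{"mac": "aa:00:00:12:34:56", "mode": "bridged",
#               "link": "xen-br0", "bridge": "xen-br0"}],
#     "nodes": ["node1.example.com"],
#     "disks": [{"size": 10240, "mode": "rw"}],
#     "disk_template": "plain", "hypervisor": "xen-pvm",
#     "disk_space_total": 10240,  # as computed by _ComputeDiskSize
#   }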
11997 def _AddNewInstance(self):
11998 """Add new instance data to allocator structure.
12000 This, in combination with _ComputeClusterData, will create the
12001 correct structure needed as input for the allocator.
12003 The checks for the completeness of the opcode must have already been done.
12007 disk_space = _ComputeDiskSize(self.disk_template, self.disks)
12009 if self.disk_template in constants.DTS_INT_MIRROR:
12010 self.required_nodes = 2
12012 self.required_nodes = 1
12015 "disk_template": self.disk_template,
12018 "vcpus": self.vcpus,
12019 "memory": self.mem_size,
12020 "disks": self.disks,
12021 "disk_space_total": disk_space,
12023 "required_nodes": self.required_nodes,
12027 def _AddRelocateInstance(self):
12028 """Add relocate instance data to allocator structure.
12030 This, in combination with _ComputeClusterData, will create the
12031 correct structure needed as input for the allocator.
12033 The checks for the completeness of the opcode must have already been done.
12037 instance = self.cfg.GetInstanceInfo(self.name)
12038 if instance is None:
12039 raise errors.ProgrammerError("Unknown instance '%s' passed to"
12040 " IAllocator" % self.name)
12042 if instance.disk_template not in constants.DTS_MIRRORED:
12043 raise errors.OpPrereqError("Can't relocate non-mirrored instances",
12044 errors.ECODE_INVAL)
12046 if instance.disk_template in constants.DTS_INT_MIRROR and \
12047 len(instance.secondary_nodes) != 1:
12048 raise errors.OpPrereqError("Instance does not have exactly one secondary node",
12049 errors.ECODE_STATE)
12051 self.required_nodes = 1
12052 disk_sizes = [{constants.IDISK_SIZE: disk.size} for disk in instance.disks]
12053 disk_space = _ComputeDiskSize(instance.disk_template, disk_sizes)
12057 "disk_space_total": disk_space,
12058 "required_nodes": self.required_nodes,
12059 "relocate_from": self.relocate_from,
12063 def _AddEvacuateNodes(self):
12064 """Add evacuate nodes data to allocator structure.
12068 "evac_nodes": self.evac_nodes
12072 def _BuildInputData(self, fn):
12073 """Build input data structures.
12076 self._ComputeClusterData()
12079 request["type"] = self.mode
12080 self.in_data["request"] = request
12082 self.in_text = serializer.Dump(self.in_data)
12085 constants.IALLOCATOR_MODE_ALLOC:
12087 ["name", "mem_size", "disks", "disk_template", "os", "tags", "nics",
12088 "vcpus", "hypervisor"]),
12089 constants.IALLOCATOR_MODE_RELOC:
12090 (_AddRelocateInstance, ["name", "relocate_from"]),
12091 constants.IALLOCATOR_MODE_MEVAC:
12092 (_AddEvacuateNodes, ["evac_nodes"]),
12095 def Run(self, name, validate=True, call_fn=None):
12096 """Run an instance allocator and return the results.
12099 if call_fn is None:
12100 call_fn = self.rpc.call_iallocator_runner
12102 result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
12103 result.Raise("Failure while running the iallocator script")
12105 self.out_text = result.payload
12107 self._ValidateResult()
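# A minimal usage sketch (assumptions: called from within an LU that owns cfg
# and rpc, and "hail" is the installed allocator script; this is not a
# verbatim excerpt of any LU):
#
#   ial = IAllocator(self.cfg, self.rpc,
#                    mode=constants.IALLOCATOR_MODE_RELOC,
#                    name=instance_name,
#                    relocate_from=list(old_secondaries))
#   ial.Run("hail")
#   if not ial.success:
#     raise errors.OpPrereqError("Can't compute nodes: %s" % ial.info,
#                                errors.ECODE_NORES)
#   new_nodes = ial.result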
12109 def _ValidateResult(self):
12110 """Process the allocator results.
12112 This will process and, if successful, save the result in
12113 self.out_data and the other parameters.
12117 rdict = serializer.Load(self.out_text)
12118 except Exception, err:
12119 raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))
12121 if not isinstance(rdict, dict):
12122 raise errors.OpExecError("Can't parse iallocator results: not a dict")
12124 # TODO: remove backwards compatibility in later versions
12125 if "nodes" in rdict and "result" not in rdict:
12126 rdict["result"] = rdict["nodes"]
12129 for key in "success", "info", "result":
12130 if key not in rdict:
12131 raise errors.OpExecError("Can't parse iallocator results:"
12132 " missing key '%s'" % key)
12133 setattr(self, key, rdict[key])
12135 if not isinstance(rdict["result"], list):
12136 raise errors.OpExecError("Can't parse iallocator results: 'result' key"
12139 if self.mode == constants.IALLOCATOR_MODE_RELOC:
12140 assert self.relocate_from is not None
12141 assert self.required_nodes == 1
12143 node2group = dict((name, ndata["group"])
12144 for (name, ndata) in self.in_data["nodes"].items())
12146 fn = compat.partial(self._NodesToGroups, node2group,
12147 self.in_data["nodegroups"])
12149 request_groups = fn(self.relocate_from)
12150 result_groups = fn(rdict["result"])
12152 if result_groups != request_groups:
12153 raise errors.OpExecError("Groups of nodes returned by iallocator (%s)"
12154 " differ from original groups (%s)" %
12155 (utils.CommaJoin(result_groups),
12156 utils.CommaJoin(request_groups)))
12158 self.out_data = rdict
12161 def _NodesToGroups(node2group, groups, nodes):
12162 """Returns a list of unique group names for a list of nodes.
12164 @type node2group: dict
12165 @param node2group: Map from node name to group UUID
12167 @param groups: Group information
12169 @param nodes: Node names
12176 group_uuid = node2group[node]
12178 # Ignore unknown node
12182 group = groups[group_uuid]
12184 # Can't find group, let's use UUID
12185 group_name = group_uuid
12187 group_name = group["name"]
12189 result.add(group_name)
12191 return sorted(result)
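# Worked example (hypothetical data): with
#   node2group = {"n1": "uuid-a", "n2": "uuid-b", "n3": "uuid-a"}
#   groups     = {"uuid-a": {"name": "default"}}
# _NodesToGroups(node2group, groups, ["n1", "n2", "n3", "n4"]) returns
# ["default", "uuid-b"]: the unknown node "n4" is skipped, the group missing
# from the map falls back to its UUID, and the result is unique and sorted.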
12194 class LUTestAllocator(NoHooksLU):
12195 """Run allocator tests.
12197 This LU runs the allocator tests
12200 def CheckPrereq(self):
12201 """Check prerequisites.
12203 This checks the opcode parameters depending on the direction and mode of the test.
12206 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
12207 for attr in ["mem_size", "disks", "disk_template",
12208 "os", "tags", "nics", "vcpus"]:
12209 if not hasattr(self.op, attr):
12210 raise errors.OpPrereqError("Missing attribute '%s' on opcode input" %
12211 attr, errors.ECODE_INVAL)
12212 iname = self.cfg.ExpandInstanceName(self.op.name)
12213 if iname is not None:
12214 raise errors.OpPrereqError("Instance '%s' already in the cluster" %
12215 iname, errors.ECODE_EXISTS)
12216 if not isinstance(self.op.nics, list):
12217 raise errors.OpPrereqError("Invalid parameter 'nics'",
12218 errors.ECODE_INVAL)
12219 if not isinstance(self.op.disks, list):
12220 raise errors.OpPrereqError("Invalid parameter 'disks'",
12221 errors.ECODE_INVAL)
12222 for row in self.op.disks:
12223 if (not isinstance(row, dict) or
12224 "size" not in row or
12225 not isinstance(row["size"], int) or
12226 "mode" not in row or
12227 row["mode"] not in ['r', 'w']):
12228 raise errors.OpPrereqError("Invalid contents of the 'disks'"
12229 " parameter", errors.ECODE_INVAL)
12230 if self.op.hypervisor is None:
12231 self.op.hypervisor = self.cfg.GetHypervisorType()
12232 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
12233 fname = _ExpandInstanceName(self.cfg, self.op.name)
12234 self.op.name = fname
12235 self.relocate_from = self.cfg.GetInstanceInfo(fname).secondary_nodes
12236 elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
12237 if not hasattr(self.op, "evac_nodes"):
12238 raise errors.OpPrereqError("Missing attribute 'evac_nodes' on"
12239 " opcode input", errors.ECODE_INVAL)
12241 raise errors.OpPrereqError("Invalid test allocator mode '%s'" %
12242 self.op.mode, errors.ECODE_INVAL)
12244 if self.op.direction == constants.IALLOCATOR_DIR_OUT:
12245 if self.op.allocator is None:
12246 raise errors.OpPrereqError("Missing allocator name",
12247 errors.ECODE_INVAL)
12248 elif self.op.direction != constants.IALLOCATOR_DIR_IN:
12249 raise errors.OpPrereqError("Wrong allocator test '%s'" %
12250 self.op.direction, errors.ECODE_INVAL)
12252 def Exec(self, feedback_fn):
12253 """Run the allocator test.
12256 if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
12257 ial = IAllocator(self.cfg, self.rpc,
12260 mem_size=self.op.mem_size,
12261 disks=self.op.disks,
12262 disk_template=self.op.disk_template,
12266 vcpus=self.op.vcpus,
12267 hypervisor=self.op.hypervisor,
12269 elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
12270 ial = IAllocator(self.cfg, self.rpc,
12273 relocate_from=list(self.relocate_from),
12275 elif self.op.mode == constants.IALLOCATOR_MODE_MEVAC:
12276 ial = IAllocator(self.cfg, self.rpc,
12278 evac_nodes=self.op.evac_nodes)
12280 raise errors.ProgrammerError("Unhandled mode %s in"
12281 " LUTestAllocator.Exec" % self.op.mode)
12283 if self.op.direction == constants.IALLOCATOR_DIR_IN:
12284 result = ial.in_text
12286 ial.Run(self.op.allocator, validate=False)
12287 result = ial.out_text
12291 #: Query type implementations
12293 constants.QR_INSTANCE: _InstanceQuery,
12294 constants.QR_NODE: _NodeQuery,
12295 constants.QR_GROUP: _GroupQuery,
12296 constants.QR_OS: _OsQuery,
12299 assert set(_QUERY_IMPL.keys()) == constants.QR_VIA_OP
12302 def _GetQueryImplementation(name):
12303 """Returns the implemtnation for a query type.
12305 @param name: Query type, must be one of L{constants.QR_VIA_OP}
12309 return _QUERY_IMPL[name]
12311 raise errors.OpPrereqError("Unknown query resource '%s'" % name,
12312 errors.ECODE_INVAL)
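# Illustrative use (a sketch): query LUs look up their backend by resource
# type, e.g. _GetQueryImplementation(constants.QR_GROUP) returns _GroupQuery;
# an unknown resource name raises OpPrereqError(ECODE_INVAL) instead of
# leaking a KeyError to the caller.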