4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
"""Configuration management for Ganeti

This module provides the interface to the Ganeti cluster configuration.

The configuration data is stored on every node but is updated on the master
only. After each update, the master distributes the data to the other nodes.

Currently, the data storage format is JSON. YAML was slow and consuming too
much memory.

"""
34 # pylint: disable=R0904
35 # R0904: Too many public methods
43 from ganeti import errors
44 from ganeti import locking
45 from ganeti import utils
46 from ganeti import constants
47 from ganeti import rpc
48 from ganeti import objects
49 from ganeti import serializer
50 from ganeti import uidpool
51 from ganeti import netutils
52 from ganeti import runtime
# Module-wide lock: ConfigWriter stores it as its instance lock and the
# ssynchronized decorators on its methods reference it.
_config_lock = locking.SharedLock("ConfigWriter")

# job id used for resource management at config upgrade time
_UPGRADE_CONFIG_JID = "jid-cfg-upgrade"
def _ValidateConfig(data):
  """Check that a configuration object carries the expected version.

  Only the version field is inspected; nothing else is validated.

  @param data: configuration object exposing a C{version} attribute
  @raise errors.ConfigVersionMismatch: if C{data.version} differs from
      L{constants.CONFIG_VERSION}

  """
  expected = constants.CONFIG_VERSION
  if data.version == expected:
    return
  raise errors.ConfigVersionMismatch(expected, data.version)
class TemporaryReservationManager:
  """A temporary resource reservation manager.

  This is used to reserve resources in a job, before using them, making sure
  other jobs cannot get them in the meantime.

  Reservations are kept per holder (execution context id) in a dict mapping
  the holder id to the set of resources it reserved.

  """
  # Upper bound on the attempts made by Generate before giving up
  _MAX_GENERATE_RETRIES = 64

  def __init__(self):
    self._ec_reserved = {}

  def Reserved(self, resource):
    """Tell whether any holder currently reserves the given resource.

    @rtype: boolean

    """
    for holder_reserved in self._ec_reserved.values():
      if resource in holder_reserved:
        return True
    return False

  def Reserve(self, ec_id, resource):
    """Reserve a resource on behalf of a holder.

    @param ec_id: id of the holder the reservation is made for
    @param resource: the (hashable) resource to reserve
    @raise errors.ReservationError: if any holder already reserved it

    """
    if self.Reserved(resource):
      raise errors.ReservationError("Duplicate reservation for resource '%s'"
                                    % str(resource))
    self._ec_reserved.setdefault(ec_id, set()).add(resource)

  def DropECReservations(self, ec_id):
    """Forget all reservations of a holder; unknown holders are ignored.

    """
    self._ec_reserved.pop(ec_id, None)

  def GetReserved(self):
    """Return a new set with the resources reserved by all holders.

    """
    all_reserved = set()
    for holder_reserved in self._ec_reserved.values():
      all_reserved.update(holder_reserved)
    return all_reserved

  def Generate(self, existing, generate_one_fn, ec_id):
    """Generate a new resource of this type and reserve it.

    @param existing: iterable of resources already in use
    @param generate_one_fn: callable without arguments returning a candidate
        resource (or None when it could not produce one)
    @param ec_id: id of the holder the new resource is reserved for
    @return: the newly generated and reserved resource
    @raise errors.ConfigurationError: if no unused resource was produced
        within the allowed number of attempts

    """
    assert callable(generate_one_fn)

    all_elems = self.GetReserved()
    all_elems.update(existing)
    new_resource = None
    # A bounded "for" is used so that a generator which keeps returning
    # colliding values cannot make this method spin forever
    for _ in range(self._MAX_GENERATE_RETRIES):
      new_resource = generate_one_fn()
      if new_resource is not None and new_resource not in all_elems:
        break
    else:
      raise errors.ConfigurationError("Not able generate new resource"
                                      " (last tried: %s)" % new_resource)
    self.Reserve(ec_id, new_resource)
    return new_resource
def _MatchNameComponentIgnoreCase(short_name, names):
  """Case-insensitive front-end for L{utils.text.MatchNameComponent}.

  Both arguments are handed over unchanged; only C{case_sensitive=False} is
  added.

  """
  matcher = utils.MatchNameComponent
  return matcher(short_name, names, case_sensitive=False)
137 """The interface to the cluster configuration.
139 @ivar _temporary_lvs: reservation manager for temporary LVs
140 @ivar _all_rms: a list of all temporary reservation managers
def __init__(self, cfg_file=None, offline=False, _getents=runtime.GetEnts,
             accept_foreign=False):
  """Set up the in-memory state and load the configuration.

  @param cfg_file: path of the configuration file; when None,
      L{constants.CLUSTER_CONF_FILE} is used
  @param offline: stored as C{self._offline}
  @param _getents: stored as C{self._getents}
  @param accept_foreign: passed on to C{_OpenConfig}

  """
  # NOTE(review): this assignment is not present in the extracted text (one
  # line is missing here); restored from upstream Ganeti -- confirm
  self.write_count = 0
  self._lock = _config_lock
  self._config_data = None
  self._offline = offline
  if cfg_file is None:
    self._cfg_file = constants.CLUSTER_CONF_FILE
  else:
    self._cfg_file = cfg_file
  self._getents = _getents
  self._temporary_ids = TemporaryReservationManager()
  self._temporary_drbds = {}
  self._temporary_macs = TemporaryReservationManager()
  self._temporary_secrets = TemporaryReservationManager()
  self._temporary_lvs = TemporaryReservationManager()
  self._all_rms = [self._temporary_ids, self._temporary_macs,
                   self._temporary_secrets, self._temporary_lvs]
  # Note: in order to prevent errors when resolving our name in
  # _DistributeConfig, we compute it here once and reuse it; it's
  # better to raise an error before starting to modify the config
  # file than after it was modified
  self._my_hostname = netutils.Hostname.GetSysName()
  self._last_cluster_serial = -1
  # NOTE(review): two lines are missing here in the extracted text; restored
  # from upstream Ganeti -- confirm. _context is what _GetRpc reads and
  # SetContext writes.
  self._cfg_id = None
  self._context = None
  self._OpenConfig(accept_foreign)
def _GetRpc(self, address_list):
  """Build an RPC runner for configuration purposes.

  @param address_list: passed unchanged to L{rpc.ConfigRunner}
  @return: a L{rpc.ConfigRunner} created with the stored context

  """
  context = self._context
  return rpc.ConfigRunner(context, address_list)
def SetContext(self, context):
  """Remember the Ganeti context.

  The value is stored as C{self._context}, which C{_GetRpc} hands to the
  RPC runner.

  """
  self._context = context
# this method needs to be static, so that we can call it on the class
# NOTE(review): the decorator and the "def" line are missing from the
# extracted text; restored from the comment above and the body -- confirm
@staticmethod
def IsCluster():
  """Check if the cluster is configured.

  @rtype: boolean
  @return: whether L{constants.CLUSTER_CONF_FILE} exists

  """
  return os.path.exists(constants.CLUSTER_CONF_FILE)
def _GenerateOneMAC(self):
  """Generate one mac address

  The cluster-wide MAC prefix is followed by three random bytes, each
  rendered as two lowercase hex digits.

  @rtype: string
  @return: the candidate MAC address (not checked for uniqueness here)

  """
  prefix = self._config_data.cluster.mac_prefix
  octets = tuple([random.randrange(0, 256) for _ in range(3)])
  return "%s:%02x:%02x:%02x" % ((prefix,) + octets)
@locking.ssynchronized(_config_lock, shared=1)
def GetNdParams(self, node):
  """Return the node parameters with defaults filled in.

  @type node: L{objects.Node}
  @param node: The node we want to know the params for
  @return: the result of the cluster's C{FillND} for the node and its
      node group

  """
  group_obj = self._UnlockedGetNodeGroup(node.group)
  cluster = self._config_data.cluster
  return cluster.FillND(node, group_obj)
@locking.ssynchronized(_config_lock, shared=1)
def GenerateMAC(self, ec_id):
  """Generate a MAC for an instance.

  The candidate is checked against the MACs of all configured instances and
  against the temporary MAC reservations, then reserved for C{ec_id}.

  @param ec_id: id of the job the new MAC is reserved for
  @return: the generated MAC address

  """
  existing = self._AllMACs()
  # Use the MAC reservation manager (the same one ReserveMAC uses) so that
  # generated and explicitly reserved MACs are checked against each other
  return self._temporary_macs.Generate(existing, self._GenerateOneMAC, ec_id)
@locking.ssynchronized(_config_lock, shared=1)
def ReserveMAC(self, mac, ec_id):
  """Reserve a MAC for an instance.

  This only checks instances managed by this cluster, it does not
  check for potential collisions elsewhere.

  @param mac: the MAC address to reserve
  @param ec_id: id of the job the MAC is reserved for
  @raise errors.ReservationError: if the MAC is used by a configured
      instance or is already temporarily reserved

  """
  if mac in self._AllMACs():
    raise errors.ReservationError("mac already in use")

  self._temporary_macs.Reserve(ec_id, mac)
@locking.ssynchronized(_config_lock, shared=1)
def ReserveLV(self, lv_name, ec_id):
  """Temporarily reserve a VG/LV pair for an instance.

  @type lv_name: string
  @param lv_name: the logical volume name to reserve
  @param ec_id: id of the job the name is reserved for
  @raise errors.ReservationError: if the name is used by a configured
      instance or is already temporarily reserved

  """
  known_lvs = self._AllLVs()
  if lv_name not in known_lvs:
    self._temporary_lvs.Reserve(ec_id, lv_name)
    return
  raise errors.ReservationError("LV already in use")
@locking.ssynchronized(_config_lock, shared=1)
def GenerateDRBDSecret(self, ec_id):
  """Generate a DRBD secret.

  This checks the current disks for duplicates.

  @param ec_id: id of the job the secret is reserved for
  @return: the generated secret

  """
  return self._temporary_secrets.Generate(self._AllDRBDSecrets(),
                                          utils.GenerateSecret,
                                          ec_id)
def _AllLVs(self):
  """Compute the list of all LVs.

  @rtype: set
  @return: the LV names reported by C{MapLVsByNode} of every configured
      instance, for all nodes

  """
  lvnames = set()
  for instance in self._config_data.instances.values():
    for lv_list in instance.MapLVsByNode().values():
      lvnames.update(lv_list)
  return lvnames
def _AllIDs(self, include_temporary):
  """Compute the list of all UUIDs and names we have.

  The result combines LV names, instance names, node names and the UUIDs of
  all objects returned by C{_AllUUIDObjects}.

  @type include_temporary: boolean
  @param include_temporary: whether to include the _temporary_ids set
  @rtype: set
  @return: a set of IDs

  """
  existing = set()
  if include_temporary:
    existing.update(self._temporary_ids.GetReserved())
  existing.update(self._AllLVs())
  existing.update(self._config_data.instances.keys())
  existing.update(self._config_data.nodes.keys())
  # objects without a UUID (None or empty) are skipped
  existing.update([i.uuid for i in self._AllUUIDObjects() if i.uuid])
  return existing
def _GenerateUniqueID(self, ec_id):
  """Produce a UUID not used anywhere in the configuration.

  Candidates come from L{utils.NewUUID} and are checked against the IDs
  returned by C{_AllIDs} (without temporaries) and against the temporary ID
  reservations; the result is reserved for C{ec_id}.

  @param ec_id: id of the job the new id is reserved for
  @return: the unique id

  """
  taken = self._AllIDs(include_temporary=False)
  manager = self._temporary_ids
  return manager.Generate(taken, utils.NewUUID, ec_id)
@locking.ssynchronized(_config_lock, shared=1)
def GenerateUniqueID(self, ec_id):
  """Generate an unique ID while holding the config lock in shared mode.

  Delegates to C{_GenerateUniqueID}.

  @param ec_id: unique id for the job to reserve the id to
  @return: the unique id

  """
  new_id = self._GenerateUniqueID(ec_id)
  return new_id
def _AllMACs(self):
  """Return all MACs present in the config.

  @rtype: list
  @return: the list of all MACs, one entry per NIC of every configured
      instance (duplicates are not removed)

  """
  return [nic.mac
          for instance in self._config_data.instances.values()
          for nic in instance.nics]
def _AllDRBDSecrets(self):
  """Return all DRBD secrets present in the config.

  @rtype: list
  @return: the list of all DRBD secrets

  """
  def helper(disk, result):
    """Recursively gather secrets from this disk."""
    if disk.dev_type == constants.DT_DRBD8:
      # the secret is read from position 5 of the logical id
      result.append(disk.logical_id[5])
    if disk.children:
      for child in disk.children:
        helper(child, result)

  result = []
  for instance in self._config_data.instances.values():
    for disk in instance.disks:
      helper(disk, result)

  return result
def _CheckDiskIDs(self, disk, l_ids, p_ids):
  """Compute duplicate disk IDs

  Both ID lists are updated in place with the IDs seen for the first time,
  so that they can be shared across several calls.

  @type disk: L{objects.Disk}
  @param disk: the disk at which to start searching
  @type l_ids: list
  @param l_ids: list of current logical ids
  @type p_ids: list
  @param p_ids: list of current physical ids
  @rtype: list
  @return: a list of error messages

  """
  result = []
  if disk.logical_id is not None:
    if disk.logical_id in l_ids:
      result.append("duplicate logical id %s" % str(disk.logical_id))
    else:
      l_ids.append(disk.logical_id)
  if disk.physical_id is not None:
    if disk.physical_id in p_ids:
      result.append("duplicate physical id %s" % str(disk.physical_id))
    else:
      p_ids.append(disk.physical_id)

  if disk.children:
    for child in disk.children:
      result.extend(self._CheckDiskIDs(child, l_ids, p_ids))
  return result
383 def _UnlockedVerifyConfig(self):
387 @return: a list of error messages; a non-empty list signifies
391 # pylint: disable=R0914
395 data = self._config_data
396 cluster = data.cluster
400 # global cluster checks
401 if not cluster.enabled_hypervisors:
402 result.append("enabled hypervisors list doesn't have any entries")
403 invalid_hvs = set(cluster.enabled_hypervisors) - constants.HYPER_TYPES
405 result.append("enabled hypervisors contains invalid entries: %s" %
407 missing_hvp = (set(cluster.enabled_hypervisors) -
408 set(cluster.hvparams.keys()))
410 result.append("hypervisor parameters missing for the enabled"
411 " hypervisor(s) %s" % utils.CommaJoin(missing_hvp))
413 if cluster.master_node not in data.nodes:
414 result.append("cluster has invalid primary node '%s'" %
417 def _helper(owner, attr, value, template):
419 utils.ForceDictType(value, template)
420 except errors.GenericError, err:
421 result.append("%s has invalid %s: %s" % (owner, attr, err))
423 def _helper_nic(owner, params):
425 objects.NIC.CheckParameterSyntax(params)
426 except errors.ConfigurationError, err:
427 result.append("%s has invalid nicparams: %s" % (owner, err))
429 def _helper_ipolicy(owner, params):
431 objects.InstancePolicy.CheckParameterSyntax(params)
432 except errors.ConfigurationError, err:
433 result.append("%s has invalid instance policy: %s" % (owner, err))
435 def _helper_ispecs(owner, params):
436 for key, value in params.items():
437 if key in constants.IPOLICY_PARAMETERS:
438 fullkey = "ipolicy/" + key
439 _helper(owner, fullkey, value, constants.ISPECS_PARAMETER_TYPES)
441 # FIXME: assuming list type
442 if not isinstance(value, list):
443 result.append("%s has invalid instance policy: for %s,"
444 " expecting list, got %s" %
445 (owner, key, type(value)))
447 # check cluster parameters
448 _helper("cluster", "beparams", cluster.SimpleFillBE({}),
449 constants.BES_PARAMETER_TYPES)
450 _helper("cluster", "nicparams", cluster.SimpleFillNIC({}),
451 constants.NICS_PARAMETER_TYPES)
452 _helper_nic("cluster", cluster.SimpleFillNIC({}))
453 _helper("cluster", "ndparams", cluster.SimpleFillND({}),
454 constants.NDS_PARAMETER_TYPES)
455 _helper_ipolicy("cluster", cluster.SimpleFillIPolicy({}))
456 _helper_ispecs("cluster", cluster.SimpleFillIPolicy({}))
458 # per-instance checks
459 for instance_name in data.instances:
460 instance = data.instances[instance_name]
461 if instance.name != instance_name:
462 result.append("instance '%s' is indexed by wrong name '%s'" %
463 (instance.name, instance_name))
464 if instance.primary_node not in data.nodes:
465 result.append("instance '%s' has invalid primary node '%s'" %
466 (instance_name, instance.primary_node))
467 for snode in instance.secondary_nodes:
468 if snode not in data.nodes:
469 result.append("instance '%s' has invalid secondary node '%s'" %
470 (instance_name, snode))
471 for idx, nic in enumerate(instance.nics):
472 if nic.mac in seen_macs:
473 result.append("instance '%s' has NIC %d mac %s duplicate" %
474 (instance_name, idx, nic.mac))
476 seen_macs.append(nic.mac)
478 filled = cluster.SimpleFillNIC(nic.nicparams)
479 owner = "instance %s nic %d" % (instance.name, idx)
480 _helper(owner, "nicparams",
481 filled, constants.NICS_PARAMETER_TYPES)
482 _helper_nic(owner, filled)
485 if instance.beparams:
486 _helper("instance %s" % instance.name, "beparams",
487 cluster.FillBE(instance), constants.BES_PARAMETER_TYPES)
489 # gather the drbd ports for duplicate checks
490 for dsk in instance.disks:
491 if dsk.dev_type in constants.LDS_DRBD:
492 tcp_port = dsk.logical_id[2]
493 if tcp_port not in ports:
495 ports[tcp_port].append((instance.name, "drbd disk %s" % dsk.iv_name))
496 # gather network port reservation
497 net_port = getattr(instance, "network_port", None)
498 if net_port is not None:
499 if net_port not in ports:
501 ports[net_port].append((instance.name, "network port"))
503 # instance disk verify
504 for idx, disk in enumerate(instance.disks):
505 result.extend(["instance '%s' disk %d error: %s" %
506 (instance.name, idx, msg) for msg in disk.Verify()])
507 result.extend(self._CheckDiskIDs(disk, seen_lids, seen_pids))
509 # cluster-wide pool of free ports
510 for free_port in cluster.tcpudp_port_pool:
511 if free_port not in ports:
512 ports[free_port] = []
513 ports[free_port].append(("cluster", "port marked as free"))
515 # compute tcp/udp duplicate ports
521 txt = utils.CommaJoin(["%s/%s" % val for val in pdata])
522 result.append("tcp/udp port %s has duplicates: %s" % (pnum, txt))
524 # highest used tcp port check
526 if keys[-1] > cluster.highest_used_port:
527 result.append("Highest used port mismatch, saved %s, computed %s" %
528 (cluster.highest_used_port, keys[-1]))
530 if not data.nodes[cluster.master_node].master_candidate:
531 result.append("Master node is not a master candidate")
533 # master candidate checks
534 mc_now, mc_max, _ = self._UnlockedGetMasterCandidateStats()
536 result.append("Not enough master candidates: actual %d, target %d" %
540 for node_name, node in data.nodes.items():
541 if node.name != node_name:
542 result.append("Node '%s' is indexed by wrong name '%s'" %
543 (node.name, node_name))
544 if [node.master_candidate, node.drained, node.offline].count(True) > 1:
545 result.append("Node %s state is invalid: master_candidate=%s,"
546 " drain=%s, offline=%s" %
547 (node.name, node.master_candidate, node.drained,
549 if node.group not in data.nodegroups:
550 result.append("Node '%s' has invalid group '%s'" %
551 (node.name, node.group))
553 _helper("node %s" % node.name, "ndparams",
554 cluster.FillND(node, data.nodegroups[node.group]),
555 constants.NDS_PARAMETER_TYPES)
558 nodegroups_names = set()
559 for nodegroup_uuid in data.nodegroups:
560 nodegroup = data.nodegroups[nodegroup_uuid]
561 if nodegroup.uuid != nodegroup_uuid:
562 result.append("node group '%s' (uuid: '%s') indexed by wrong uuid '%s'"
563 % (nodegroup.name, nodegroup.uuid, nodegroup_uuid))
564 if utils.UUID_RE.match(nodegroup.name.lower()):
565 result.append("node group '%s' (uuid: '%s') has uuid-like name" %
566 (nodegroup.name, nodegroup.uuid))
567 if nodegroup.name in nodegroups_names:
568 result.append("duplicate node group name '%s'" % nodegroup.name)
570 nodegroups_names.add(nodegroup.name)
571 group_name = "group %s" % nodegroup.name
572 _helper_ipolicy(group_name, cluster.SimpleFillIPolicy(nodegroup.ipolicy))
573 _helper_ispecs(group_name, cluster.SimpleFillIPolicy(nodegroup.ipolicy))
574 if nodegroup.ndparams:
575 _helper(group_name, "ndparams",
576 cluster.SimpleFillND(nodegroup.ndparams),
577 constants.NDS_PARAMETER_TYPES)
580 _, duplicates = self._UnlockedComputeDRBDMap()
581 for node, minor, instance_a, instance_b in duplicates:
582 result.append("DRBD minor %d on node %s is assigned twice to instances"
583 " %s and %s" % (minor, node, instance_a, instance_b))
586 default_nicparams = cluster.nicparams[constants.PP_DEFAULT]
589 def _AddIpAddress(ip, name):
590 ips.setdefault(ip, []).append(name)
592 _AddIpAddress(cluster.master_ip, "cluster_ip")
594 for node in data.nodes.values():
595 _AddIpAddress(node.primary_ip, "node:%s/primary" % node.name)
596 if node.secondary_ip != node.primary_ip:
597 _AddIpAddress(node.secondary_ip, "node:%s/secondary" % node.name)
599 for instance in data.instances.values():
600 for idx, nic in enumerate(instance.nics):
604 nicparams = objects.FillDict(default_nicparams, nic.nicparams)
605 nic_mode = nicparams[constants.NIC_MODE]
606 nic_link = nicparams[constants.NIC_LINK]
608 if nic_mode == constants.NIC_MODE_BRIDGED:
609 link = "bridge:%s" % nic_link
610 elif nic_mode == constants.NIC_MODE_ROUTED:
611 link = "route:%s" % nic_link
613 raise errors.ProgrammerError("NIC mode '%s' not handled" % nic_mode)
615 _AddIpAddress("%s/%s" % (link, nic.ip),
616 "instance:%s/nic:%d" % (instance.name, idx))
618 for ip, owners in ips.items():
620 result.append("IP address %s is used by multiple owners: %s" %
621 (ip, utils.CommaJoin(owners)))
@locking.ssynchronized(_config_lock, shared=1)
def VerifyConfig(self):
  """Verify the configuration while holding the config lock in shared mode.

  This is just a wrapper over L{_UnlockedVerifyConfig}.

  @rtype: list
  @return: a list of error messages; a non-empty list signifies
      configuration errors

  """
  problems = self._UnlockedVerifyConfig()
  return problems
def _UnlockedSetDiskID(self, disk, node_name):
  """Convert the unique ID to the ID needed on the target nodes.

  This is used only for drbd, which needs ip/port configuration.

  The routine descends down and updates its children also, because
  this helps when only the top device is passed to the remote
  node.

  This function is for internal use, when the config lock is already held.

  @param disk: the disk whose C{physical_id} is (re)computed in place
  @param node_name: name of the node the physical id is computed for
  @raise errors.ConfigurationError: if a DRBD disk does not involve the
      given node, or one of its nodes cannot be found

  """
  if disk.children:
    for child in disk.children:
      self._UnlockedSetDiskID(child, node_name)

  # only a physical id is known: nothing to derive it from
  if disk.logical_id is None and disk.physical_id is not None:
    return

  if disk.dev_type != constants.LD_DRBD8:
    disk.physical_id = disk.logical_id
    return

  pnode, snode, port, pminor, sminor, secret = disk.logical_id
  if node_name not in (pnode, snode):
    raise errors.ConfigurationError("DRBD device not knowing node %s" %
                                    node_name)
  pnode_info = self._UnlockedGetNodeInfo(pnode)
  snode_info = self._UnlockedGetNodeInfo(snode)
  if pnode_info is None or snode_info is None:
    raise errors.ConfigurationError("Can't find primary or secondary node"
                                    " for %s" % str(disk))
  p_data = (pnode_info.secondary_ip, port)
  s_data = (snode_info.secondary_ip, port)
  # the local endpoint comes first, followed by the remote one
  if pnode == node_name:
    disk.physical_id = p_data + s_data + (pminor, secret)
  else: # it must be secondary, we tested above
    disk.physical_id = s_data + p_data + (sminor, secret)
@locking.ssynchronized(_config_lock)
def SetDiskID(self, disk, node_name):
  """Convert the unique ID to the ID needed on the target nodes.

  Locked front-end for C{_UnlockedSetDiskID}; used only for drbd, which
  needs ip/port configuration. Children of the disk are updated as well.

  @param disk: the disk to update
  @param node_name: name of the target node

  """
  outcome = self._UnlockedSetDiskID(disk, node_name)
  return outcome
@locking.ssynchronized(_config_lock)
def AddTcpUdpPort(self, port):
  """Adds a new port to the available port pool.

  @type port: int
  @param port: the port to add to the cluster's TCP/UDP port pool
  @raise errors.ProgrammerError: if C{port} is not an integer

  """
  if not isinstance(port, int):
    raise errors.ProgrammerError("Invalid type passed for port")

  self._config_data.cluster.tcpudp_port_pool.add(port)
  # NOTE(review): one statement is missing here in the extracted text;
  # restored as the persisting call from upstream Ganeti -- confirm
  self._WriteConfig()
@locking.ssynchronized(_config_lock, shared=1)
def GetPortList(self):
  """Return a copy of the cluster's TCP/UDP port pool.

  """
  pool = self._config_data.cluster.tcpudp_port_pool
  return pool.copy()
@locking.ssynchronized(_config_lock)
def AllocatePort(self):
  """Allocate a port.

  The port will be taken from the available port pool or from the
  default port range (and in this case we increase
  highest_used_port).

  @return: the allocated port
  @raise errors.ConfigurationError: if the pool is empty and the next port
      would reach L{constants.LAST_DRBD_PORT}

  """
  cluster = self._config_data.cluster
  # If there are TCP/IP ports configured, we use them first.
  if cluster.tcpudp_port_pool:
    port = cluster.tcpudp_port_pool.pop()
  else:
    port = cluster.highest_used_port + 1
    if port >= constants.LAST_DRBD_PORT:
      raise errors.ConfigurationError("The highest used port is greater"
                                      " than %s. Aborting." %
                                      constants.LAST_DRBD_PORT)
    cluster.highest_used_port = port

  # NOTE(review): the statement before the final return is missing from the
  # extracted text; restored as the persisting call from upstream Ganeti --
  # confirm
  self._WriteConfig()
  return port
def _UnlockedComputeDRBDMap(self):
  """Compute the used DRBD minor/nodes.

  Minors come from the disks of all configured instances and from the
  temporary DRBD reservations.

  @rtype: (dict, list)
  @return: dictionary of node_name: dict of minor: instance_name;
      the returned dict will have all the nodes in it (even if with
      an empty list), and a list of duplicates; if the duplicates
      list is not empty, the configuration is corrupted and its caller
      should raise an exception

  """
  def _AppendUsedPorts(instance_name, disk, used):
    """Record the minors of a disk tree and return the duplicates found."""
    duplicates = []
    if disk.dev_type == constants.LD_DRBD8 and len(disk.logical_id) >= 5:
      node_a, node_b, _, minor_a, minor_b = disk.logical_id[:5]
      for node, port in ((node_a, minor_a), (node_b, minor_b)):
        assert node in used, ("Node '%s' of instance '%s' not found"
                              " in node list" % (node, instance_name))
        if port in used[node]:
          duplicates.append((node, port, instance_name, used[node][port]))
        else:
          used[node][port] = instance_name
    if disk.children:
      for child in disk.children:
        duplicates.extend(_AppendUsedPorts(instance_name, child, used))
    return duplicates

  duplicates = []
  my_dict = dict((node, {}) for node in self._config_data.nodes)
  for instance in self._config_data.instances.values():
    for disk in instance.disks:
      duplicates.extend(_AppendUsedPorts(instance.name, disk, my_dict))
  for (node, minor), instance in self._temporary_drbds.items():
    # a reservation for the instance already owning the minor is fine
    if minor in my_dict[node] and my_dict[node][minor] != instance:
      duplicates.append((node, minor, instance, my_dict[node][minor]))
    else:
      my_dict[node][minor] = instance
  return my_dict, duplicates
@locking.ssynchronized(_config_lock)
def ComputeDRBDMap(self):
  """Compute the used DRBD minor/nodes.

  This is just a wrapper over L{_UnlockedComputeDRBDMap}.

  @return: dictionary of node_name: dict of minor: instance_name;
      the returned dict will have all the nodes in it (even if with
      an empty list).
  @raise errors.ConfigurationError: if duplicate minors were found

  """
  d_map, duplicates = self._UnlockedComputeDRBDMap()
  if duplicates:
    raise errors.ConfigurationError("Duplicate DRBD ports detected: %s" %
                                    str(duplicates))
  return d_map
@locking.ssynchronized(_config_lock)
def AllocateDRBDMinor(self, nodes, instance):
  """Allocate a drbd minor.

  The free minor will be automatically computed from the existing
  devices. A node can be given multiple times in order to allocate
  multiple minors. The result is the list of minors, in the same
  order as the passed nodes.

  Every allocated minor is recorded as a temporary reservation for the
  instance.

  @param nodes: the node names to allocate minors on
  @type instance: string
  @param instance: the instance for which we allocate minors
  @rtype: list
  @return: the allocated minors
  @raise errors.ConfigurationError: if the current configuration already
      contains duplicate minors

  """
  assert isinstance(instance, basestring), \
         "Invalid argument '%s' passed to AllocateDRBDMinor" % instance

  d_map, duplicates = self._UnlockedComputeDRBDMap()
  if duplicates:
    raise errors.ConfigurationError("Duplicate DRBD ports detected: %s" %
                                    str(duplicates))
  result = []
  for nname in nodes:
    ndata = d_map[nname]
    if not ndata:
      # no minors used, we can start at 0
      result.append(0)
      ndata[0] = instance
      self._temporary_drbds[(nname, 0)] = instance
      continue
    keys = sorted(ndata.keys())
    ffree = utils.FirstFree(keys)
    if ffree is None:
      # return the next minor
      # TODO: implement high-limit check
      minor = keys[-1] + 1
    else:
      minor = ffree
    # double-check minor against current instances
    assert minor not in d_map[nname], \
           ("Attempt to reuse allocated DRBD minor %d on node %s,"
            " already allocated to instance %s" %
            (minor, nname, d_map[nname][minor]))
    ndata[minor] = instance
    # double-check minor against reservation
    r_key = (nname, minor)
    assert r_key not in self._temporary_drbds, \
           ("Attempt to reuse reserved DRBD minor %d on node %s,"
            " reserved for instance %s" %
            (minor, nname, self._temporary_drbds[r_key]))
    self._temporary_drbds[r_key] = instance
    result.append(minor)
  logging.debug("Request to allocate drbd minors, input: %s, returning %s",
                nodes, result)
  return result
def _UnlockedReleaseDRBDMinors(self, instance):
  """Release temporary drbd minors allocated for a given instance.

  Only the reservations recorded for this instance are dropped.

  @type instance: string
  @param instance: the instance for which temporary minors should be
                   released

  """
  assert isinstance(instance, basestring), \
         "Invalid argument passed to ReleaseDRBDMinors"
  # iterate over a snapshot, as entries are deleted while looping
  for key, name in list(self._temporary_drbds.items()):
    if name == instance:
      del self._temporary_drbds[key]
@locking.ssynchronized(_config_lock)
def ReleaseDRBDMinors(self, instance):
  """Drop the temporary drbd minors reserved for an instance.

  This should be called on the error paths, on the success paths
  it's automatically called by the ConfigWriter add and update
  functions.

  Locked front-end for L{_UnlockedReleaseDRBDMinors}.

  @type instance: string
  @param instance: the instance for which temporary minors should be
                   released

  """
  release = self._UnlockedReleaseDRBDMinors
  release(instance)
@locking.ssynchronized(_config_lock, shared=1)
def GetConfigVersion(self):
  """Return the version stored in the loaded configuration.

  @return: Config version

  """
  config = self._config_data
  return config.version
@locking.ssynchronized(_config_lock, shared=1)
def GetClusterName(self):
  """Return the name of the cluster.

  @return: Cluster name

  """
  cluster = self._config_data.cluster
  return cluster.cluster_name
@locking.ssynchronized(_config_lock, shared=1)
def GetMasterNode(self):
  """Return the hostname of the cluster's master node.

  @return: Master hostname

  """
  cluster = self._config_data.cluster
  return cluster.master_node
@locking.ssynchronized(_config_lock, shared=1)
def GetMasterIP(self):
  """Return the master IP stored in the cluster configuration.

  """
  cluster = self._config_data.cluster
  return cluster.master_ip
@locking.ssynchronized(_config_lock, shared=1)
def GetMasterNetdev(self):
  """Return the master network device of the cluster.

  """
  cluster = self._config_data.cluster
  return cluster.master_netdev
@locking.ssynchronized(_config_lock, shared=1)
def GetMasterNetmask(self):
  """Return the master netmask stored in the cluster configuration.

  """
  cluster = self._config_data.cluster
  return cluster.master_netmask
@locking.ssynchronized(_config_lock, shared=1)
def GetUseExternalMipScript(self):
  """Return the flag selecting the external master IP setup script.

  """
  cluster = self._config_data.cluster
  return cluster.use_external_mip_script
@locking.ssynchronized(_config_lock, shared=1)
def GetFileStorageDir(self):
  """Return the cluster's file storage directory.

  """
  cluster = self._config_data.cluster
  return cluster.file_storage_dir
@locking.ssynchronized(_config_lock, shared=1)
def GetSharedFileStorageDir(self):
  """Return the cluster's shared file storage directory.

  """
  cluster = self._config_data.cluster
  return cluster.shared_file_storage_dir
@locking.ssynchronized(_config_lock, shared=1)
def GetHypervisorType(self):
  """Return the hypervisor type for this cluster.

  This is the first entry of the cluster's enabled hypervisors list.

  """
  enabled = self._config_data.cluster.enabled_hypervisors
  return enabled[0]
@locking.ssynchronized(_config_lock, shared=1)
def GetHostKey(self):
  """Return the public rsa hostkey stored in the config.

  @return: the rsa hostkey

  """
  cluster = self._config_data.cluster
  return cluster.rsahostkeypub
@locking.ssynchronized(_config_lock, shared=1)
def GetDefaultIAllocator(self):
  """Return the cluster's default instance allocator.

  """
  cluster = self._config_data.cluster
  return cluster.default_iallocator
@locking.ssynchronized(_config_lock, shared=1)
def GetPrimaryIPFamily(self):
  """Return the cluster's primary ip family.

  @return: primary ip family

  """
  cluster = self._config_data.cluster
  return cluster.primary_ip_family
@locking.ssynchronized(_config_lock, shared=1)
def GetMasterNetworkParameters(self):
  """Get network parameters of the master node.

  The values (node name, IP, netmask, network device and IP family) are all
  read from the cluster object.

  @rtype: L{objects.MasterNetworkParameters}
  @return: network parameters of the master node

  """
  cluster = self._config_data.cluster
  return objects.MasterNetworkParameters(name=cluster.master_node,
                                         ip=cluster.master_ip,
                                         netmask=cluster.master_netmask,
                                         netdev=cluster.master_netdev,
                                         ip_family=cluster.primary_ip_family)
@locking.ssynchronized(_config_lock)
def AddNodeGroup(self, group, ec_id, check_uuid=True):
  """Add a node group to the configuration.

  This method calls group.UpgradeConfig() to fill any missing attributes
  according to their default values.

  @type group: L{objects.NodeGroup}
  @param group: the NodeGroup object to add
  @type ec_id: string
  @param ec_id: unique id for the job to use when creating a missing UUID
  @type check_uuid: bool
  @param check_uuid: add an UUID to the group if it doesn't have one or, if
                     it does, ensure that it does not exist in the
                     configuration already

  """
  self._UnlockedAddNodeGroup(group, ec_id, check_uuid)
  # NOTE(review): one statement is missing here in the extracted text;
  # restored as the persisting call from upstream Ganeti -- confirm
  self._WriteConfig()
def _UnlockedAddNodeGroup(self, group, ec_id, check_uuid):
  """Add a node group to the configuration.

  For internal use, when the config lock is already held. The group is
  stored under its UUID and the cluster serial number is increased.

  @param group: the node group object to add
  @param ec_id: id of the job, used when a UUID has to be created
  @param check_uuid: whether to run C{_EnsureUUID} on the group
  @raise errors.OpPrereqError: if a group with the same name exists

  """
  logging.info("Adding node group %s to configuration", group.name)

  # Some code might need to add a node group with a pre-populated UUID
  # generated with ConfigWriter.GenerateUniqueID(). We allow them to bypass
  # the "does this UUID exist already" check.
  if check_uuid:
    self._EnsureUUID(group, ec_id)

  try:
    existing_uuid = self._UnlockedLookupNodeGroup(group.name)
  except errors.OpPrereqError:
    # the lookup failing means the name is still free
    pass
  else:
    raise errors.OpPrereqError("Desired group name '%s' already exists as a"
                               " node group (UUID: %s)" %
                               (group.name, existing_uuid),
                               errors.ECODE_EXISTS)

  # NOTE(review): one statement is missing here in the extracted text;
  # restored from upstream Ganeti (AddInstance initialises serial_no the
  # same way) -- confirm
  group.serial_no = 1
  group.ctime = group.mtime = time.time()
  group.UpgradeConfig()

  self._config_data.nodegroups[group.uuid] = group
  self._config_data.cluster.serial_no += 1
@locking.ssynchronized(_config_lock)
def RemoveNodeGroup(self, group_uuid):
  """Remove a node group from the configuration.

  The cluster serial number is increased after the removal.

  @type group_uuid: string
  @param group_uuid: the UUID of the node group to remove
  @raise errors.ConfigurationError: if no group with this UUID exists

  """
  logging.info("Removing node group %s from configuration", group_uuid)

  if group_uuid not in self._config_data.nodegroups:
    raise errors.ConfigurationError("Unknown node group '%s'" % group_uuid)

  assert len(self._config_data.nodegroups) != 1, \
         "Group '%s' is the only group, cannot be removed" % group_uuid

  del self._config_data.nodegroups[group_uuid]
  self._config_data.cluster.serial_no += 1
  # NOTE(review): one statement is missing here in the extracted text;
  # restored as the persisting call from upstream Ganeti -- confirm
  self._WriteConfig()
def _UnlockedLookupNodeGroup(self, target):
  """Lookup a node group's UUID.

  For internal use, when the config lock is already held.

  @type target: string or None
  @param target: group name or UUID or None to look for the default
  @rtype: string
  @return: nodegroup UUID
  @raises errors.OpPrereqError: when the target group cannot be found

  """
  nodegroups = self._config_data.nodegroups
  if target is None:
    # without a target, the answer is only unambiguous with a single group
    if len(nodegroups) != 1:
      raise errors.OpPrereqError("More than one node group exists. Target"
                                 " group must be specified explicitely.")
    else:
      return list(nodegroups.keys())[0]
  # UUIDs take precedence over names
  if target in nodegroups:
    return target
  for nodegroup in nodegroups.values():
    if nodegroup.name == target:
      return nodegroup.uuid
  raise errors.OpPrereqError("Node group '%s' not found" % target,
                             errors.ECODE_NOENT)
@locking.ssynchronized(_config_lock, shared=1)
def LookupNodeGroup(self, target):
  """Resolve a node group name or UUID to its UUID.

  Locked front-end for L{_UnlockedLookupNodeGroup}.

  @type target: string or None
  @param target: group name or UUID or None to look for the default
  @rtype: string
  @return: nodegroup UUID

  """
  group_uuid = self._UnlockedLookupNodeGroup(target)
  return group_uuid
def _UnlockedGetNodeGroup(self, uuid):
  """Lookup a node group.

  For internal use, when the config lock is already held.

  @type uuid: string
  @param uuid: group UUID
  @rtype: L{objects.NodeGroup} or None
  @return: nodegroup object, or None if not found

  """
  if uuid not in self._config_data.nodegroups:
    return None

  return self._config_data.nodegroups[uuid]
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeGroup(self, uuid):
  """Return the node group with the given UUID.

  Locked front-end for C{_UnlockedGetNodeGroup}.

  @type uuid: string
  @param uuid: group UUID
  @rtype: L{objects.NodeGroup} or None
  @return: nodegroup object, or None if not found

  """
  group = self._UnlockedGetNodeGroup(uuid)
  return group
@locking.ssynchronized(_config_lock, shared=1)
def GetAllNodeGroupsInfo(self):
  """Get the configuration of all node groups.

  @rtype: dict
  @return: a new dictionary built from the configuration's node group
      mapping (the group objects themselves are not copied)

  """
  groups = self._config_data.nodegroups
  return dict(groups)
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeGroupList(self):
  """Get a list of node groups.

  @return: the keys of the configuration's node group mapping

  """
  groups = self._config_data.nodegroups
  return groups.keys()
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeGroupMembersByNodes(self, nodes):
  """Get nodes which are member in the same nodegroups as the given nodes.

  @param nodes: iterable of node names
  @rtype: frozenset
  @return: names of all members of the groups the given nodes belong to

  """
  members = set()
  for node_name in nodes:
    # Map the node to its group, then collect that group's members
    group_uuid = self._UnlockedGetNodeInfo(node_name).group
    members.update(self._UnlockedGetNodeGroup(group_uuid).members)
  return frozenset(members)
@locking.ssynchronized(_config_lock)
def AddInstance(self, instance, ec_id):
  """Register a newly created instance in the configuration.

  The instance gets a UUID if it has none yet, its serial number and
  timestamps are initialised, and the configuration is written out.

  @type instance: L{objects.Instance}
  @param instance: the instance object
  @param ec_id: the execution context id for the uuid reservation
  @raise errors.ProgrammerError: if C{instance} is not an
      L{objects.Instance}
  @raise errors.ConfigurationError: if one of the instance's MAC
      addresses is already in use

  """
  if not isinstance(instance, objects.Instance):
    raise errors.ProgrammerError("Invalid type passed to AddInstance")

  # The disk layout is only logged for instances that have disks
  if instance.disk_template != constants.DT_DISKLESS:
    lv_map = instance.MapLVsByNode()
    logging.info("Instance '%s' DISK_LAYOUT: %s", instance.name, lv_map)

  used_macs = self._AllMACs()
  for nic in instance.nics:
    if nic.mac in used_macs:
      raise errors.ConfigurationError("Cannot add instance %s:"
                                      " MAC address '%s' already in use." %
                                      (instance.name, nic.mac))

  self._EnsureUUID(instance, ec_id)

  instance.serial_no = 1
  now = time.time()
  instance.ctime = now
  instance.mtime = now

  cfg = self._config_data
  cfg.instances[instance.name] = instance
  cfg.cluster.serial_no += 1
  self._UnlockedReleaseDRBDMinors(instance.name)
  self._WriteConfig()
1185 def _EnsureUUID(self, item, ec_id):
1186 """Ensures a given object has a valid UUID.
1188 @param item: the instance or node to be checked
1189 @param ec_id: the execution context id for the uuid reservation
1193 item.uuid = self._GenerateUniqueID(ec_id)
1194 elif item.uuid in self._AllIDs(include_temporary=True):
1195 raise errors.ConfigurationError("Cannot add '%s': UUID %s already"
1196 " in use" % (item.name, item.uuid))
def _SetInstanceStatus(self, instance_name, status):
  """Set the instance's admin state to a given value.

  The configuration is only modified and written when the state actually
  changes.

  @param instance_name: name of the instance to modify
  @param status: one of the values in C{constants.ADMINST_ALL}
  @raise errors.ConfigurationError: if the instance is not known

  """
  assert status in constants.ADMINST_ALL, \
         "Invalid status '%s' passed to SetInstanceStatus" % (status,)

  instances = self._config_data.instances
  if instance_name not in instances:
    raise errors.ConfigurationError("Unknown instance '%s'" %
                                    instance_name)

  instance = instances[instance_name]
  if instance.admin_state == status:
    # Nothing to change, so do not bump serials or write the config
    return

  instance.admin_state = status
  instance.serial_no += 1
  instance.mtime = time.time()
  self._WriteConfig()
@locking.ssynchronized(_config_lock)
def MarkInstanceUp(self, instance_name):
  """Mark the instance status to up in the config.

  @param instance_name: name of the instance to modify

  """
  new_state = constants.ADMINST_UP
  self._SetInstanceStatus(instance_name, new_state)
@locking.ssynchronized(_config_lock)
def MarkInstanceOffline(self, instance_name):
  """Mark the instance status to offline in the config.

  @param instance_name: name of the instance to modify

  """
  new_state = constants.ADMINST_OFFLINE
  self._SetInstanceStatus(instance_name, new_state)
@locking.ssynchronized(_config_lock)
def RemoveInstance(self, instance_name):
  """Remove the instance from the configuration.

  @param instance_name: name of the instance to remove
  @raise errors.ConfigurationError: if the instance is not known

  """
  cfg = self._config_data
  if instance_name not in cfg.instances:
    raise errors.ConfigurationError("Unknown instance '%s'" % instance_name)

  # If a network port has been allocated to the instance,
  # return it to the pool of free ports.
  inst = cfg.instances[instance_name]
  port = getattr(inst, "network_port", None)
  if port is not None:
    cfg.cluster.tcpudp_port_pool.add(port)

  del cfg.instances[instance_name]
  cfg.cluster.serial_no += 1
  self._WriteConfig()
@locking.ssynchronized(_config_lock)
def RenameInstance(self, old_name, new_name):
  """Rename an instance.

  This needs to be done in ConfigWriter and not by RemoveInstance
  combined with AddInstance as only we can guarantee an atomic
  rename.

  @param old_name: current name of the instance
  @param new_name: name the instance should have afterwards
  @raise errors.ConfigurationError: if C{old_name} is not known

  """
  instances = self._config_data.instances
  if old_name not in instances:
    raise errors.ConfigurationError("Unknown instance '%s'" % old_name)

  inst = instances[old_name]
  del instances[old_name]
  inst.name = new_name

  for disk in inst.disks:
    if disk.dev_type != constants.LD_FILE:
      continue
    # rename the file paths in logical and physical id: the path is rebuilt
    # as <two levels above the old file>/<new instance name>/disk<N>
    old_path = disk.logical_id[1]
    file_storage_dir = os.path.dirname(os.path.dirname(old_path))
    disk_fname = "disk%s" % disk.iv_name.split("/")[1]
    new_id = (disk.logical_id[0],
              utils.PathJoin(file_storage_dir, inst.name, disk_fname))
    disk.physical_id = new_id
    disk.logical_id = new_id

  # Force update of ssconf files
  self._config_data.cluster.serial_no += 1

  instances[inst.name] = inst
  self._WriteConfig()
@locking.ssynchronized(_config_lock)
def MarkInstanceDown(self, instance_name):
  """Mark the status of an instance to down in the configuration.

  @param instance_name: name of the instance to modify

  """
  new_state = constants.ADMINST_DOWN
  self._SetInstanceStatus(instance_name, new_state)
1286 def _UnlockedGetInstanceList(self):
1287 """Get the list of instances.
1289 This function is for internal use, when the config lock is already held.
1292 return self._config_data.instances.keys()
@locking.ssynchronized(_config_lock, shared=1)
def GetInstanceList(self):
  """Get the list of instances.

  Locked wrapper around L{_UnlockedGetInstanceList}.

  @return: array of instances, ex. ['instance2.example.com',
      'instance1.example.com']

  """
  names = self._UnlockedGetInstanceList()
  return names
def ExpandInstanceName(self, short_name):
  """Attempt to expand an incomplete instance name.

  @param short_name: the (possibly abbreviated) instance name
  @return: whatever L{_MatchNameComponentIgnoreCase} returns for the
      given name and the list of known instances

  """
  # Locking is done in L{ConfigWriter.GetInstanceList}
  candidates = self.GetInstanceList()
  return _MatchNameComponentIgnoreCase(short_name, candidates)
1311 def _UnlockedGetInstanceInfo(self, instance_name):
1312 """Returns information about an instance.
1314 This function is for internal use, when the config lock is already held.
1317 if instance_name not in self._config_data.instances:
1320 return self._config_data.instances[instance_name]
@locking.ssynchronized(_config_lock, shared=1)
def GetInstanceInfo(self, instance_name):
  """Returns information about an instance.

  It takes the information from the configuration file. Other information
  of an instance is taken from the live systems.

  @param instance_name: name of the instance, e.g.
      I{instance1.example.com}

  @rtype: L{objects.Instance}
  @return: the instance object (None if the instance is unknown, as
      returned by L{_UnlockedGetInstanceInfo})

  """
  info = self._UnlockedGetInstanceInfo(instance_name)
  return info
@locking.ssynchronized(_config_lock, shared=1)
def GetInstanceNodeGroups(self, instance_name, primary_only=False):
  """Returns set of node group UUIDs for instance's nodes.

  @param instance_name: name of the instance
  @param primary_only: whether to look only at the primary node instead
      of all the instance's nodes
  @rtype: frozenset
  @raise errors.ConfigurationError: if the instance is not known

  """
  instance = self._UnlockedGetInstanceInfo(instance_name)
  if not instance:
    raise errors.ConfigurationError("Unknown instance '%s'" % instance_name)

  if not primary_only:
    node_names = instance.all_nodes
  else:
    node_names = [instance.primary_node]

  group_uuids = set()
  for node_name in node_names:
    group_uuids.add(self._UnlockedGetNodeInfo(node_name).group)
  return frozenset(group_uuids)
@locking.ssynchronized(_config_lock, shared=1)
def GetMultiInstanceInfo(self, instances):
  """Get the configuration of multiple instances.

  @param instances: list of instance names
  @rtype: list
  @return: list of tuples (instance, instance_info), where
      instance_info is what would GetInstanceInfo return for the
      instance, while keeping the original order

  """
  result = []
  for name in instances:
    result.append((name, self._UnlockedGetInstanceInfo(name)))
  return result
@locking.ssynchronized(_config_lock, shared=1)
def GetAllInstancesInfo(self):
  """Get the configuration of all instances.

  @rtype: dict
  @return: dict of (instance, instance_info), where instance_info is what
      would GetInstanceInfo return for the instance

  """
  info_by_name = {}
  for name in self._UnlockedGetInstanceList():
    info_by_name[name] = self._UnlockedGetInstanceInfo(name)
  return info_by_name
@locking.ssynchronized(_config_lock, shared=1)
def GetInstancesInfoByFilter(self, filter_fn):
  """Get instance configuration with a filter.

  @type filter_fn: callable
  @param filter_fn: Filter function receiving instance object as parameter,
    returning boolean. Important: this function is called while the
    configuration lock is held. It must not do any complex work or call
    functions potentially leading to a deadlock. Ideally it doesn't call any
    other functions and just compares instance attributes.
  @rtype: dict
  @return: mapping of instance name to instance object for all instances
    accepted by the filter

  """
  selected = {}
  for (name, inst) in self._config_data.instances.items():
    if filter_fn(inst):
      selected[name] = inst
  return selected
@locking.ssynchronized(_config_lock)
def AddNode(self, node, ec_id):
  """Add a node to the configuration.

  @type node: L{objects.Node}
  @param node: a Node instance
  @param ec_id: the execution context id for the uuid reservation

  """
  logging.info("Adding node %s to configuration", node.name)

  self._EnsureUUID(node, ec_id)

  node.serial_no = 1
  now = time.time()
  node.ctime = now
  node.mtime = now

  # Register the node with its group before storing the node itself
  self._UnlockedAddNodeToGroup(node.name, node.group)
  cfg = self._config_data
  cfg.nodes[node.name] = node
  cfg.cluster.serial_no += 1
  self._WriteConfig()
@locking.ssynchronized(_config_lock)
def RemoveNode(self, node_name):
  """Remove a node from the configuration.

  @param node_name: name of the node to remove
  @raise errors.ConfigurationError: if the node is not known

  """
  logging.info("Removing node %s from configuration", node_name)

  cfg = self._config_data
  if node_name not in cfg.nodes:
    raise errors.ConfigurationError("Unknown node '%s'" % node_name)

  # Drop the group membership first, then the node entry itself
  node = cfg.nodes[node_name]
  self._UnlockedRemoveNodeFromGroup(node)
  del cfg.nodes[node_name]
  cfg.cluster.serial_no += 1
  self._WriteConfig()
def ExpandNodeName(self, short_name):
  """Attempt to expand an incomplete node name.

  @param short_name: the (possibly abbreviated) node name
  @return: whatever L{_MatchNameComponentIgnoreCase} returns for the
      given name and the list of known nodes

  """
  # Locking is done in L{ConfigWriter.GetNodeList}
  candidates = self.GetNodeList()
  return _MatchNameComponentIgnoreCase(short_name, candidates)
1440 def _UnlockedGetNodeInfo(self, node_name):
1441 """Get the configuration of a node, as stored in the config.
1443 This function is for internal use, when the config lock is already
1446 @param node_name: the node name, e.g. I{node1.example.com}
1448 @rtype: L{objects.Node}
1449 @return: the node object
1452 if node_name not in self._config_data.nodes:
1455 return self._config_data.nodes[node_name]
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeInfo(self, node_name):
  """Get the configuration of a node, as stored in the config.

  This is just a locked wrapper over L{_UnlockedGetNodeInfo}.

  @param node_name: the node name, e.g. I{node1.example.com}

  @rtype: L{objects.Node}
  @return: the node object

  """
  info = self._UnlockedGetNodeInfo(node_name)
  return info
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeInstances(self, node_name):
  """Get the instances of a node, as stored in the config.

  @param node_name: the node name, e.g. I{node1.example.com}

  @rtype: (list, list)
  @return: a tuple with two lists: the names of the instances having the
      node as primary node, and of those having it as secondary node

  """
  all_instances = self._config_data.instances.values()
  pri = [inst.name for inst in all_instances
         if inst.primary_node == node_name]
  sec = [inst.name for inst in all_instances
         if node_name in inst.secondary_nodes]
  return (pri, sec)
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeGroupInstances(self, uuid, primary_only=False):
  """Get the instances of a node group.

  @param uuid: Node group UUID
  @param primary_only: Whether to only consider primary nodes
  @rtype: frozenset
  @return: Names of the instances having at least one considered node in
      the node group

  """
  matching = set()
  for inst in self._config_data.instances.values():
    if primary_only:
      considered = [inst.primary_node]
    else:
      considered = inst.all_nodes
    # Every considered node is looked up, even after a first match
    for node_name in considered:
      if self._UnlockedGetNodeInfo(node_name).group == uuid:
        matching.add(inst.name)
  return frozenset(matching)
1510 def _UnlockedGetNodeList(self):
1511 """Return the list of nodes which are in the configuration.
1513 This function is for internal use, when the config lock is already
1519 return self._config_data.nodes.keys()
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeList(self):
  """Return the list of nodes which are in the configuration.

  Locked wrapper around L{_UnlockedGetNodeList}.

  """
  names = self._UnlockedGetNodeList()
  return names
1528 def _UnlockedGetOnlineNodeList(self):
1529 """Return the list of nodes which are online.
1532 all_nodes = [self._UnlockedGetNodeInfo(node)
1533 for node in self._UnlockedGetNodeList()]
1534 return [node.name for node in all_nodes if not node.offline]
@locking.ssynchronized(_config_lock, shared=1)
def GetOnlineNodeList(self):
  """Return the list of nodes which are online.

  Locked wrapper around L{_UnlockedGetOnlineNodeList}.

  """
  names = self._UnlockedGetOnlineNodeList()
  return names
@locking.ssynchronized(_config_lock, shared=1)
def GetVmCapableNodeList(self):
  """Return the list of nodes which are vm capable.

  @rtype: list
  @return: names of all nodes whose C{vm_capable} flag is set

  """
  capable = []
  for name in self._UnlockedGetNodeList():
    node = self._UnlockedGetNodeInfo(name)
    if node.vm_capable:
      capable.append(node.name)
  return capable
@locking.ssynchronized(_config_lock, shared=1)
def GetNonVmCapableNodeList(self):
  """Return the list of nodes which are not vm capable.

  @rtype: list
  @return: names of all nodes whose C{vm_capable} flag is not set

  """
  not_capable = []
  for name in self._UnlockedGetNodeList():
    node = self._UnlockedGetNodeInfo(name)
    if not node.vm_capable:
      not_capable.append(node.name)
  return not_capable
@locking.ssynchronized(_config_lock, shared=1)
def GetMultiNodeInfo(self, nodes):
  """Get the configuration of multiple nodes.

  @param nodes: list of node names
  @rtype: list
  @return: list of tuples of (node, node_info), where node_info is
      what would GetNodeInfo return for the node, in the original
      order

  """
  result = []
  for name in nodes:
    result.append((name, self._UnlockedGetNodeInfo(name)))
  return result
@locking.ssynchronized(_config_lock, shared=1)
def GetAllNodesInfo(self):
  """Get the configuration of all nodes.

  Locked wrapper around L{_UnlockedGetAllNodesInfo}.

  @rtype: dict
  @return: dict of (node, node_info), where node_info is what
      would GetNodeInfo return for the node

  """
  all_info = self._UnlockedGetAllNodesInfo()
  return all_info
1585 def _UnlockedGetAllNodesInfo(self):
1586 """Gets configuration of all nodes.
1588 @note: See L{GetAllNodesInfo}
1591 return dict([(node, self._UnlockedGetNodeInfo(node))
1592 for node in self._UnlockedGetNodeList()])
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeGroupsFromNodes(self, nodes):
  """Returns groups for a list of nodes.

  @type nodes: list of string
  @param nodes: List of node names
  @rtype: frozenset
  @return: the C{group} attribute of every given node

  """
  group_uuids = set()
  for name in nodes:
    group_uuids.add(self._UnlockedGetNodeInfo(name).group)
  return frozenset(group_uuids)
1605 def _UnlockedGetMasterCandidateStats(self, exceptions=None):
1606 """Get the number of current and maximum desired and possible candidates.
1608 @type exceptions: list
1609 @param exceptions: if passed, list of nodes that should be ignored
1611 @return: tuple of (current, desired and possible, possible)
1614 mc_now = mc_should = mc_max = 0
1615 for node in self._config_data.nodes.values():
1616 if exceptions and node.name in exceptions:
1618 if not (node.offline or node.drained) and node.master_capable:
1620 if node.master_candidate:
1622 mc_should = min(mc_max, self._config_data.cluster.candidate_pool_size)
1623 return (mc_now, mc_should, mc_max)
@locking.ssynchronized(_config_lock, shared=1)
def GetMasterCandidateStats(self, exceptions=None):
  """Get the number of current, desired and possible candidates.

  This is just a wrapper over L{_UnlockedGetMasterCandidateStats}.

  @type exceptions: list
  @param exceptions: if passed, list of nodes that should be ignored
  @rtype: tuple
  @return: tuple of (current, desired and possible, possible)

  """
  stats = self._UnlockedGetMasterCandidateStats(exceptions)
  return stats
@locking.ssynchronized(_config_lock)
def MaintainCandidatePool(self, exceptions):
  """Try to grow the candidate pool to the desired size.

  Nodes are promoted in random order until the desired number of master
  candidates is reached. The configuration is written only if at least one
  node was promoted.

  @type exceptions: list
  @param exceptions: if passed, list of nodes that should be ignored; None
      is treated like an empty list
  @rtype: list
  @return: list with the adjusted nodes (L{objects.Node} instances)

  """
  # The membership test below must not fail when no exceptions were given
  ignored = exceptions or []

  # The second value returned by the stats helper is the desired pool size
  mc_now, mc_max, _ = self._UnlockedGetMasterCandidateStats(exceptions)
  mod_list = []
  if mc_now < mc_max:
    node_list = self._config_data.nodes.keys()
    random.shuffle(node_list)
    for name in node_list:
      if mc_now >= mc_max:
        break
      node = self._config_data.nodes[name]
      if (node.master_candidate or node.offline or node.drained or
          node.name in ignored or not node.master_capable):
        continue
      mod_list.append(node)
      node.master_candidate = True
      node.serial_no += 1
      mc_now += 1
    if mc_now != mc_max:
      # this should not happen
      logging.warning("Warning: MaintainCandidatePool didn't manage to"
                      " fill the candidate pool (%d/%d)", mc_now, mc_max)
    if mod_list:
      self._config_data.cluster.serial_no += 1
      self._WriteConfig()

  return mod_list
1675 def _UnlockedAddNodeToGroup(self, node_name, nodegroup_uuid):
1676 """Add a given node to the specified group.
1679 if nodegroup_uuid not in self._config_data.nodegroups:
1680 # This can happen if a node group gets deleted between its lookup and
1681 # when we're adding the first node to it, since we don't keep a lock in
1682 # the meantime. It's ok though, as we'll fail cleanly if the node group
1683 # is not found anymore.
1684 raise errors.OpExecError("Unknown node group: %s" % nodegroup_uuid)
1685 if node_name not in self._config_data.nodegroups[nodegroup_uuid].members:
1686 self._config_data.nodegroups[nodegroup_uuid].members.append(node_name)
1688 def _UnlockedRemoveNodeFromGroup(self, node):
1689 """Remove a given node from its group.
1692 nodegroup = node.group
1693 if nodegroup not in self._config_data.nodegroups:
1694 logging.warning("Warning: node '%s' has unknown node group '%s'"
1695 " (while being removed from it)", node.name, nodegroup)
1696 nodegroup_obj = self._config_data.nodegroups[nodegroup]
1697 if node.name not in nodegroup_obj.members:
1698 logging.warning("Warning: node '%s' not a member of its node group '%s'"
1699 " (while being removed from it)", node.name, nodegroup)
1701 nodegroup_obj.members.remove(node.name)
@locking.ssynchronized(_config_lock)
def AssignGroupNodes(self, mods):
  """Changes the group of a number of nodes.

  All modifications are resolved and checked first; changes are applied
  only after every entry has been resolved successfully.

  @type mods: list of tuples; (node name, new group UUID)
  @param mods: Node membership modifications
  @raise errors.ConfigurationError: if a node, its current group or its
      new group cannot be found

  """
  groups = self._config_data.nodegroups
  nodes = self._config_data.nodes

  resolved = []

  # Try to resolve names/UUIDs first
  for (node_name, new_group_uuid) in mods:
    if node_name not in nodes:
      raise errors.ConfigurationError("Unable to find node '%s'" % node_name)
    node = nodes[node_name]

    if node.group == new_group_uuid:
      # Node is being assigned to its current group
      logging.debug("Node '%s' was assigned to its current group (%s)",
                    node_name, node.group)
      continue

    # Try to find current group of node
    if node.group not in groups:
      raise errors.ConfigurationError("Unable to find old group '%s'" %
                                      node.group)
    old_group = groups[node.group]

    # Try to find new group for node
    if new_group_uuid not in groups:
      raise errors.ConfigurationError("Unable to find new group '%s'" %
                                      new_group_uuid)
    new_group = groups[new_group_uuid]

    assert node.name in old_group.members, \
      ("Inconsistent configuration: node '%s' not listed in members for its"
       " old group '%s'" % (node.name, old_group.uuid))
    assert node.name not in new_group.members, \
      ("Inconsistent configuration: node '%s' already listed in members for"
       " its new group '%s'" % (node.name, new_group.uuid))

    resolved.append((node, old_group, new_group))

  # Apply changes
  for (node, old_group, new_group) in resolved:
    assert node.uuid != new_group.uuid and old_group.uuid != new_group.uuid, \
      "Assigning to current group is not possible"

    node.group = new_group.uuid

    # Update members of involved groups
    if node.name in old_group.members:
      old_group.members.remove(node.name)
    if node.name not in new_group.members:
      new_group.members.append(node.name)

  # Update timestamps and serials (only once per node/group object)
  now = time.time()
  touched = frozenset(itertools.chain(*resolved)) # pylint: disable=W0142
  for obj in touched:
    obj.serial_no += 1
    obj.mtime = now

  # Force ssconf update
  self._config_data.cluster.serial_no += 1

  self._WriteConfig()
1776 def _BumpSerialNo(self):
1777 """Bump up the serial number of the config.
1780 self._config_data.serial_no += 1
1781 self._config_data.mtime = time.time()
1783 def _AllUUIDObjects(self):
1784 """Returns all objects with uuid attributes.
1787 return (self._config_data.instances.values() +
1788 self._config_data.nodes.values() +
1789 self._config_data.nodegroups.values() +
1790 [self._config_data.cluster])
1792 def _OpenConfig(self, accept_foreign):
1793 """Read the config data from disk.
1796 raw_data = utils.ReadFile(self._cfg_file)
1799 data = objects.ConfigData.FromDict(serializer.Load(raw_data))
1800 except Exception, err:
1801 raise errors.ConfigurationError(err)
1803 # Make sure the configuration has the right version
1804 _ValidateConfig(data)
1806 if (not hasattr(data, 'cluster') or
1807 not hasattr(data.cluster, 'rsahostkeypub')):
1808 raise errors.ConfigurationError("Incomplete configuration"
1809 " (missing cluster.rsahostkeypub)")
1811 if data.cluster.master_node != self._my_hostname and not accept_foreign:
1812 msg = ("The configuration denotes node %s as master, while my"
1813 " hostname is %s; opening a foreign configuration is only"
1814 " possible in accept_foreign mode" %
1815 (data.cluster.master_node, self._my_hostname))
1816 raise errors.ConfigurationError(msg)
1818 # Upgrade configuration if needed
1819 data.UpgradeConfig()
1821 self._config_data = data
1822 # reset the last serial as -1 so that the next write will cause
1824 self._last_cluster_serial = -1
1826 # And finally run our (custom) config upgrade sequence
1827 self._UpgradeConfig()
1829 self._cfg_id = utils.GetFileID(path=self._cfg_file)
def _UpgradeConfig(self):
  """Run upgrade steps that cannot be done purely in the objects.

  This is because some data elements need uniqueness across the
  whole configuration, etc.

  @warning: this function will call L{_WriteConfig()}, but also
      L{DropECReservations} so it needs to be called only from a
      "safe" place (the constructor). If one wanted to call it with
      the lock held, a DropECReservationUnlocked would need to be
      created first, to avoid causing deadlock.

  """
  modified = False

  # Every object must have a UUID
  for item in self._AllUUIDObjects():
    if item.uuid is None:
      item.uuid = self._GenerateUniqueID(_UPGRADE_CONFIG_JID)
      modified = True

  # There must be at least one node group
  if not self._config_data.nodegroups:
    default_nodegroup_name = constants.INITIAL_NODE_GROUP_NAME
    default_nodegroup = objects.NodeGroup(name=default_nodegroup_name,
                                          members=[])
    self._UnlockedAddNodeGroup(default_nodegroup, _UPGRADE_CONFIG_JID, True)
    modified = True

  for node in self._config_data.nodes.values():
    if not node.group:
      node.group = self.LookupNodeGroup(None)
      modified = True
    # This is technically *not* an upgrade, but needs to be done both when
    # nodegroups are being added, and upon normally loading the config,
    # because the members list of a node group is discarded upon
    # serializing/deserializing the object.
    self._UnlockedAddNodeToGroup(node.name, node.group)

  if modified:
    self._WriteConfig()
    # This is ok even if it acquires the internal lock, as _UpgradeConfig is
    # only called at config init time, without the lock held
    self.DropECReservations(_UPGRADE_CONFIG_JID)
1870 def _DistributeConfig(self, feedback_fn):
1871 """Distribute the configuration to the other nodes.
1873 Currently, this only copies the configuration file. In the future,
1874 it could be used to encapsulate the 2/3-phase update mechanism.
1884 myhostname = self._my_hostname
1885 # we can skip checking whether _UnlockedGetNodeInfo returns None
1886 # since the node list comes from _UnlocketGetNodeList, and we are
1887 # called with the lock held, so no modifications should take place
1889 for node_name in self._UnlockedGetNodeList():
1890 if node_name == myhostname:
1892 node_info = self._UnlockedGetNodeInfo(node_name)
1893 if not node_info.master_candidate:
1895 node_list.append(node_info.name)
1896 addr_list.append(node_info.primary_ip)
1898 # TODO: Use dedicated resolver talking to config writer for name resolution
1900 self._GetRpc(addr_list).call_upload_file(node_list, self._cfg_file)
1901 for to_node, to_result in result.items():
1902 msg = to_result.fail_msg
1904 msg = ("Copy of file %s to node %s failed: %s" %
1905 (self._cfg_file, to_node, msg))
1915 def _WriteConfig(self, destination=None, feedback_fn=None):
1916 """Write the configuration data to persistent storage.
1919 assert feedback_fn is None or callable(feedback_fn)
1921 # Warn on config errors, but don't abort the save - the
1922 # configuration has already been modified, and we can't revert;
1923 # the best we can do is to warn the user and save as is, leaving
1924 # recovery to the user
1925 config_errors = self._UnlockedVerifyConfig()
1927 errmsg = ("Configuration data is not consistent: %s" %
1928 (utils.CommaJoin(config_errors)))
1929 logging.critical(errmsg)
1933 if destination is None:
1934 destination = self._cfg_file
1935 self._BumpSerialNo()
1936 txt = serializer.Dump(self._config_data.ToDict())
1938 getents = self._getents()
1940 fd = utils.SafeWriteFile(destination, self._cfg_id, data=txt,
1941 close=False, gid=getents.confd_gid, mode=0640)
1942 except errors.LockError:
1943 raise errors.ConfigurationError("The configuration file has been"
1944 " modified since the last write, cannot"
1947 self._cfg_id = utils.GetFileID(fd=fd)
1951 self.write_count += 1
1953 # and redistribute the config file to master candidates
1954 self._DistributeConfig(feedback_fn)
1956 # Write ssconf files on all nodes (including locally)
1957 if self._last_cluster_serial < self._config_data.cluster.serial_no:
1958 if not self._offline:
1959 result = self._GetRpc(None).call_write_ssconf_files(
1960 self._UnlockedGetOnlineNodeList(),
1961 self._UnlockedGetSsconfValues())
1963 for nname, nresu in result.items():
1964 msg = nresu.fail_msg
1966 errmsg = ("Error while uploading ssconf files to"
1967 " node %s: %s" % (nname, msg))
1968 logging.warning(errmsg)
1973 self._last_cluster_serial = self._config_data.cluster.serial_no
def _UnlockedGetSsconfValues(self):
  """Return the values needed by ssconf.

  List-like values are joined into newline-separated strings.

  @rtype: dict
  @return: a dictionary with keys the ssconf names and values their
      associated value
  @raise errors.ConfigurationError: if any value is not a string

  """
  join_lines = "\n".join
  cluster = self._config_data.cluster

  instance_names = utils.NiceSort(self._UnlockedGetInstanceList())
  node_names = utils.NiceSort(self._UnlockedGetNodeList())
  node_info = [self._UnlockedGetNodeInfo(name) for name in node_names]

  # "<name> <ip>" lines for primary and secondary addresses
  node_pri_ips = ["%s %s" % (ninfo.name, ninfo.primary_ip)
                  for ninfo in node_info]
  node_snd_ips = ["%s %s" % (ninfo.name, ninfo.secondary_ip)
                  for ninfo in node_info]

  candidates = [node for node in node_info if node.master_candidate]

  instance_data = join_lines(instance_names)
  off_data = join_lines(node.name for node in node_info if node.offline)
  on_data = join_lines(node.name for node in node_info if not node.offline)
  mc_data = join_lines(node.name for node in candidates)
  mc_ips_data = join_lines(node.primary_ip for node in candidates)
  node_data = join_lines(node_names)
  node_pri_ips_data = join_lines(node_pri_ips)
  node_snd_ips_data = join_lines(node_snd_ips)

  cluster_tags = join_lines(cluster.GetTags())

  hypervisor_list = join_lines(cluster.enabled_hypervisors)

  uid_pool = uidpool.FormatUidPool(cluster.uid_pool, separator="\n")

  # "<uuid> <name>" lines for all node groups
  nodegroups = ["%s %s" % (nodegroup.uuid, nodegroup.name)
                for nodegroup in self._config_data.nodegroups.values()]
  nodegroups_data = join_lines(utils.NiceSort(nodegroups))

  ssconf_values = {
    constants.SS_CLUSTER_NAME: cluster.cluster_name,
    constants.SS_CLUSTER_TAGS: cluster_tags,
    constants.SS_FILE_STORAGE_DIR: cluster.file_storage_dir,
    constants.SS_SHARED_FILE_STORAGE_DIR: cluster.shared_file_storage_dir,
    constants.SS_MASTER_CANDIDATES: mc_data,
    constants.SS_MASTER_CANDIDATES_IPS: mc_ips_data,
    constants.SS_MASTER_IP: cluster.master_ip,
    constants.SS_MASTER_NETDEV: cluster.master_netdev,
    constants.SS_MASTER_NETMASK: str(cluster.master_netmask),
    constants.SS_MASTER_NODE: cluster.master_node,
    constants.SS_NODE_LIST: node_data,
    constants.SS_NODE_PRIMARY_IPS: node_pri_ips_data,
    constants.SS_NODE_SECONDARY_IPS: node_snd_ips_data,
    constants.SS_OFFLINE_NODES: off_data,
    constants.SS_ONLINE_NODES: on_data,
    constants.SS_PRIMARY_IP_FAMILY: str(cluster.primary_ip_family),
    constants.SS_INSTANCE_LIST: instance_data,
    constants.SS_RELEASE_VERSION: constants.RELEASE_VERSION,
    constants.SS_HYPERVISOR_LIST: hypervisor_list,
    constants.SS_MAINTAIN_NODE_HEALTH: str(cluster.maintain_node_health),
    constants.SS_UID_POOL: uid_pool,
    constants.SS_NODEGROUPS: nodegroups_data,
    }

  # Every ssconf value must be a string
  bad_values = [(k, v) for k, v in ssconf_values.items()
                if not isinstance(v, (str, basestring))]
  if bad_values:
    err = utils.CommaJoin("%s=%s" % (k, v) for k, v in bad_values)
    raise errors.ConfigurationError("Some ssconf key(s) have non-string"
                                    " values: %s" % err)
  return ssconf_values
@locking.ssynchronized(_config_lock, shared=1)
def GetSsconfValues(self):
  """Wrapper using lock around L{_UnlockedGetSsconfValues}.

  @rtype: dict
  @return: the dictionary of ssconf names and values

  """
  values = self._UnlockedGetSsconfValues()
  return values
@locking.ssynchronized(_config_lock, shared=1)
def GetVGName(self):
  """Return the volume group name.

  @return: the cluster's C{volume_group_name} attribute

  """
  cluster = self._config_data.cluster
  return cluster.volume_group_name
@locking.ssynchronized(_config_lock)
def SetVGName(self, vg_name):
  """Set the volume group name.

  The cluster serial number is increased and the configuration written.

  @param vg_name: the new volume group name

  """
  cluster = self._config_data.cluster
  cluster.volume_group_name = vg_name
  cluster.serial_no += 1
  self._WriteConfig()
@locking.ssynchronized(_config_lock, shared=1)
def GetDRBDHelper(self):
  """Return DRBD usermode helper.

  @return: the cluster's C{drbd_usermode_helper} attribute

  """
  cluster = self._config_data.cluster
  return cluster.drbd_usermode_helper
@locking.ssynchronized(_config_lock)
def SetDRBDHelper(self, drbd_helper):
  """Set DRBD usermode helper.

  The cluster serial number is increased and the configuration written.

  @param drbd_helper: the new DRBD usermode helper

  """
  cluster = self._config_data.cluster
  cluster.drbd_usermode_helper = drbd_helper
  cluster.serial_no += 1
  self._WriteConfig()
@locking.ssynchronized(_config_lock, shared=1)
def GetMACPrefix(self):
  """Return the mac prefix.

  @return: the cluster's C{mac_prefix} attribute

  """
  cluster = self._config_data.cluster
  return cluster.mac_prefix
@locking.ssynchronized(_config_lock, shared=1)
def GetClusterInfo(self):
  """Returns information about the cluster

  @rtype: L{objects.Cluster}
  @return: the cluster object stored in the configuration (not a copy)

  """
  cluster = self._config_data.cluster
  return cluster
@locking.ssynchronized(_config_lock, shared=1)
def HasAnyDiskOfType(self, dev_type):
  """Check if there is at least one disk of the given type in the
  configuration.

  @param dev_type: the disk type to look for
  @return: the result of the configuration data's own C{HasAnyDiskOfType}

  """
  found = self._config_data.HasAnyDiskOfType(dev_type)
  return found
@locking.ssynchronized(_config_lock)
def Update(self, target, feedback_fn):
  """Notify function to be called after updates.

  This function must be called when an object (as returned by
  GetInstanceInfo, GetNodeInfo, GetClusterInfo) has been updated and the
  caller wants the modifications saved to the backing store. Note
  that all modified objects will be saved, but the target argument
  is the one the caller wants to ensure that it's saved.

  @param target: an instance of either L{objects.Cluster},
      L{objects.Node}, L{objects.Instance} or L{objects.NodeGroup} which
      is existing in the cluster
  @param feedback_fn: Callable feedback function
  @raise errors.ProgrammerError: if no configuration is loaded or the
      target has an unsupported type
  @raise errors.ConfigurationError: if the target is not part of the
      current configuration

  """
  cfg = self._config_data
  if cfg is None:
    raise errors.ProgrammerError("Configuration file not read,"
                                 " cannot save.")

  # Only node updates also bump the cluster serial number
  update_serial = False
  if isinstance(target, objects.Cluster):
    known = target == cfg.cluster
  elif isinstance(target, objects.Node):
    known = target in cfg.nodes.values()
    update_serial = True
  elif isinstance(target, objects.Instance):
    known = target in cfg.instances.values()
  elif isinstance(target, objects.NodeGroup):
    known = target in cfg.nodegroups.values()
  else:
    raise errors.ProgrammerError("Invalid object type (%s) passed to"
                                 " ConfigWriter.Update" % type(target))
  if not known:
    raise errors.ConfigurationError("Configuration updated since object"
                                    " has been read or unknown object")

  now = time.time()
  target.serial_no += 1
  target.mtime = now

  if update_serial:
    # for node updates, we need to increase the cluster serial too
    cfg.cluster.serial_no += 1
    cfg.cluster.mtime = now

  if isinstance(target, objects.Instance):
    self._UnlockedReleaseDRBDMinors(target.name)

  self._WriteConfig(feedback_fn=feedback_fn)
@locking.ssynchronized(_config_lock)
def DropECReservations(self, ec_id):
  """Drop per-execution-context reservations

  The request is forwarded to every reservation manager in
  C{self._all_rms}.

  @param ec_id: the execution context id whose reservations are dropped

  """
  for manager in self._all_rms:
    manager.DropECReservations(ec_id)