4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 """Configuration management for Ganeti
24 This module provides the interface to the Ganeti cluster configuration.
26 The configuration data is stored on every node but is updated on the master
27 only. After each update, the master distributes the data to the other nodes.
Currently, the data storage format is JSON. YAML was slow and consuming too
much memory.
39 from ganeti import errors
40 from ganeti import locking
41 from ganeti import utils
42 from ganeti import constants
43 from ganeti import rpc
44 from ganeti import objects
45 from ganeti import serializer
46 from ganeti import ssconf
# Single module-level lock serialising all ConfigWriter operations.
_config_lock = locking.SharedLock()

# NOTE(review): the enclosing "def ValidateConfig():" header is not visible
# in this copy -- the lines below read like the body of a config-version
# validation helper; confirm against the original file.
sstore = ssconf.SimpleStore()

# Refuse to run against a configuration written by a different code version.
if sstore.GetConfigVersion() != constants.CONFIG_VERSION:
  raise errors.ConfigurationError("Cluster configuration version"
                                  " mismatch, got %s instead of %s" %
                                  (sstore.GetConfigVersion(),
                                   constants.CONFIG_VERSION))
63 """The interface to the cluster configuration.
def __init__(self, cfg_file=None, offline=False):
  """Initialise the ConfigWriter state.

  @param cfg_file: path to the configuration file, or None to use the
      cluster default (constants.CLUSTER_CONF_FILE)
  @param offline: if True, configuration changes are not distributed to
      the other nodes

  """
  self._lock = _config_lock
  # cached parsed configuration plus the stat() triple (mtime/size/inode)
  # used by _OpenConfig to detect whether the on-disk file changed
  self._config_data = None
  self._config_time = None
  self._config_size = None
  self._config_inode = None
  self._offline = offline
  # NOTE(review): an "if cfg_file is None: ... else: ..." appears to be
  # missing from this copy -- as written, the second assignment always
  # overrides the default. Confirm against the original file.
  self._cfg_file = constants.CLUSTER_CONF_FILE
  self._cfg_file = cfg_file
  # IDs handed out by GenerateUniqueID but not yet committed to disk
  self._temporary_ids = set()
  # (node, minor) -> instance reservations from AllocateDRBDMinor
  self._temporary_drbds = {}
  # Note: in order to prevent errors when resolving our name in
  # _DistributeConfig, we compute it here once and reuse it; it's
  # better to raise an error before starting to modify the config
  # file than after it was modified
  self._my_hostname = utils.HostInfo().name
# this method needs to be static, so that we can call it on the class
# NOTE(review): the "@staticmethod" / "def IsCluster():" header lines
# appear to be missing from this copy -- confirm against the original.
  """Check if the cluster is configured.

  """
  # the cluster counts as configured iff the config file exists locally
  return os.path.exists(constants.CLUSTER_CONF_FILE)
@locking.ssynchronized(_config_lock, shared=1)
def GenerateMAC(self):
  """Generate a MAC for an instance.

  This should check the current instances for duplicates.

  """
  prefix = self._config_data.cluster.mac_prefix
  all_macs = self._AllMACs()
  # NOTE(review): the retry loop around the candidate generation and the
  # successful "return mac" path appear to be missing from this copy; as
  # shown, a unique MAC would never be returned. Confirm against the
  # original file.
  byte1 = random.randrange(0, 256)
  byte2 = random.randrange(0, 256)
  byte3 = random.randrange(0, 256)
  # candidate MAC: cluster-wide prefix plus three random bytes
  mac = "%s:%02x:%02x:%02x" % (prefix, byte1, byte2, byte3)
  if mac not in all_macs:
  raise errors.ConfigurationError("Can't generate unique MAC")
@locking.ssynchronized(_config_lock, shared=1)
def IsMacInUse(self, mac):
  """Predicate: check if the specified MAC is in use in the Ganeti cluster.

  This only checks instances managed by this cluster, it does not
  check for potential collisions elsewhere.

  """
  # membership test against every NIC MAC currently in the configuration
  return mac in self._AllMACs()
def _ComputeAllLVs(self):
  """Compute the list of all LVs.

  """
  # NOTE(review): the accumulator initialisation ("lvnames = set()") and
  # the final "return lvnames" appear to be missing from this copy --
  # confirm against the original file.
  for instance in self._config_data.instances.values():
    # node name -> list of LV names used by this instance's disks
    node_data = instance.MapLVsByNode()
    for lv_list in node_data.values():
      lvnames.update(lv_list)
@locking.ssynchronized(_config_lock, shared=1)
def GenerateUniqueID(self, exceptions=None):
  """Generate an unique disk name.

  This checks the current node, instances and disk names for
  duplicates.

  Args:
    - exceptions: a list with some other names which should be checked
                  for uniqueness (used for example when you want to get
                  more than one id at one time without adding each one in
                  turn to the config file)

  Returns: the unique id as a string

  """
  # NOTE(review): the initialisation of "existing" (a set), the retry
  # loop around utils.NewUUID() and the final "return unique_id" appear
  # to be missing from this copy -- confirm against the original file.
  existing.update(self._temporary_ids)
  existing.update(self._ComputeAllLVs())
  existing.update(self._config_data.instances.keys())
  existing.update(self._config_data.nodes.keys())
  if exceptions is not None:
    existing.update(exceptions)
  unique_id = utils.NewUUID()
  if unique_id not in existing and unique_id is not None:
  raise errors.ConfigurationError("Not able generate an unique ID"
                                  " (last tried ID: %s" % unique_id)
  # reserve the ID so concurrent calls cannot hand it out again before it
  # is committed to the configuration file
  self._temporary_ids.add(unique_id)
# NOTE(review): the "def _AllMACs(self):" header plus the "result = []"
# initialisation and "return result" appear to be missing from this copy.
  """Return all MACs present in the config.

  """
  # collect the MAC of every NIC of every instance
  for instance in self._config_data.instances.values():
    for nic in instance.nics:
      result.append(nic.mac)
@locking.ssynchronized(_config_lock, shared=1)
def VerifyConfig(self):
  """Stub verify function.

  """
  # NOTE(review): the initialisation of the "result" and "seen_macs"
  # lists, the "else:" before recording a seen MAC, and the final
  # "return result" appear to be missing from this copy.
  data = self._config_data
  for instance_name in data.instances:
    instance = data.instances[instance_name]
    # the primary node must be a known node
    if instance.primary_node not in data.nodes:
      result.append("instance '%s' has invalid primary node '%s'" %
                    (instance_name, instance.primary_node))
    # every secondary node must be known too
    for snode in instance.secondary_nodes:
      if snode not in data.nodes:
        result.append("instance '%s' has invalid secondary node '%s'" %
                      (instance_name, snode))
    # MAC addresses must be unique across the whole cluster
    for idx, nic in enumerate(instance.nics):
      if nic.mac in seen_macs:
        result.append("instance '%s' has NIC %d mac %s duplicate" %
                      (instance_name, idx, nic.mac))
      seen_macs.append(nic.mac)
def _UnlockedSetDiskID(self, disk, node_name):
  """Convert the unique ID to the ID needed on the target nodes.

  This is used only for drbd, which needs ip/port configuration.

  The routine descends down and updates its children also, because
  this helps when only the top device is passed to the remote node.

  This function is for internal use, when the config lock is already held.

  """
  # NOTE(review): several lines appear missing from this copy: the guard
  # around recursing into children, the early return when physical_id is
  # already valid, the string continuation of the "not knowing node"
  # raise, and the non-DRBD "else:" branch before the last assignment.
  # Confirm against the original file.
  for child in disk.children:
    self._UnlockedSetDiskID(child, node_name)

  if disk.logical_id is None and disk.physical_id is not None:
  if disk.dev_type == constants.LD_DRBD8:
    # logical_id of a DRBD8 device: (primary, secondary, port, pminor, sminor)
    pnode, snode, port, pminor, sminor = disk.logical_id
    if node_name not in (pnode, snode):
      raise errors.ConfigurationError("DRBD device not knowing node %s" %
    pnode_info = self._UnlockedGetNodeInfo(pnode)
    snode_info = self._UnlockedGetNodeInfo(snode)
    if pnode_info is None or snode_info is None:
      raise errors.ConfigurationError("Can't find primary or secondary node"
                                      " for %s" % str(disk))
    # (secondary_ip, port) endpoint for each side of the DRBD link
    p_data = (pnode_info.secondary_ip, port)
    s_data = (snode_info.secondary_ip, port)
    if pnode == node_name:
      # local endpoint first, then remote endpoint, then our minor
      disk.physical_id = p_data + s_data + (pminor,)
    else: # it must be secondary, we tested above
      disk.physical_id = s_data + p_data + (sminor,)
  disk.physical_id = disk.logical_id
@locking.ssynchronized(_config_lock)
def SetDiskID(self, disk, node_name):
  """Convert the unique ID to the ID needed on the target nodes.

  This is used only for drbd, which needs ip/port configuration.

  The routine descends down and updates its children also, because
  this helps when only the top device is passed to the remote node.

  Locked wrapper around L{_UnlockedSetDiskID}.

  """
  result = self._UnlockedSetDiskID(disk, node_name)
  return result
@locking.ssynchronized(_config_lock)
def AddTcpUdpPort(self, port):
  """Adds a new port to the available port pool.

  """
  # only plain ints are valid port numbers
  if not isinstance(port, int):
    raise errors.ProgrammerError("Invalid type passed for port")

  self._config_data.cluster.tcpudp_port_pool.add(port)
  # NOTE(review): the config save (self._WriteConfig()) is not visible in
  # this copy -- confirm against the original file.
@locking.ssynchronized(_config_lock, shared=1)
def GetPortList(self):
  """Returns a copy of the current port list.

  """
  # return a copy so callers cannot mutate the pool behind our back
  pool = self._config_data.cluster.tcpudp_port_pool
  return pool.copy()
@locking.ssynchronized(_config_lock)
def AllocatePort(self):
  """Allocate a port.

  The port will be taken from the available port pool or from the
  default port range (and in this case we increase
  highest_used_port).

  """
  # If there are TCP/IP ports configured, we use them first.
  if self._config_data.cluster.tcpudp_port_pool:
    port = self._config_data.cluster.tcpudp_port_pool.pop()
  # NOTE(review): an "else:" appears to be missing here; as written the
  # branch below unconditionally re-assigns port. Also the config save
  # and the final "return port" are not visible in this copy.
  port = self._config_data.cluster.highest_used_port + 1
  if port >= constants.LAST_DRBD_PORT:
    raise errors.ConfigurationError("The highest used port is greater"
                                    " than %s. Aborting." %
                                    constants.LAST_DRBD_PORT)
  self._config_data.cluster.highest_used_port = port
def _ComputeDRBDMap(self, instance):
  """Compute the used DRBD minor/nodes.

  Return: dictionary of node_name: dict of minor: instance_name. The
  returned dict will have all the nodes in it (even if with an empty
  minor map).

  """
  def _AppendUsedPorts(instance_name, disk, used):
    # only DRBD8 devices with a full 5-tuple logical_id carry minors
    if disk.dev_type == constants.LD_DRBD8 and len(disk.logical_id) == 5:
      nodeA, nodeB, dummy, minorA, minorB = disk.logical_id
      for node, port in ((nodeA, minorA), (nodeB, minorB)):
        assert node in used, "Instance node not found in node list"
        # NOTE(review): the format-string continuation of the raise below
        # appears truncated in this copy.
        if port in used[node]:
          raise errors.ProgrammerError("DRBD minor already used:"
                                       (node, port, instance_name,
        used[node][port] = instance_name
    # recurse into child devices (data/meta children of the drbd device)
    for child in disk.children:
      _AppendUsedPorts(instance_name, child, used)

  # start with an (empty) entry for every known node
  my_dict = dict((node, {}) for node in self._config_data.nodes)
  # reserved-but-uncommitted minors count as used too
  for (node, minor), instance in self._temporary_drbds.iteritems():
    my_dict[node][minor] = instance
  for instance in self._config_data.instances.itervalues():
    for disk in instance.disks:
      _AppendUsedPorts(instance.name, disk, my_dict)
  # NOTE(review): the final "return my_dict" is not visible in this copy.
@locking.ssynchronized(_config_lock)
def AllocateDRBDMinor(self, nodes, instance):
  """Allocate a drbd minor.

  The free minor will be automatically computed from the existing
  devices. A node can be given multiple times in order to allocate
  multiple minors. The result is the list of minors, in the same
  order as the passed nodes.

  """
  d_map = self._ComputeDRBDMap(instance)
  # NOTE(review): large parts of this function (the per-node loop that
  # picks the first free minor via utils.FirstFree, the result list
  # handling and the final return) are not visible in this copy --
  # confirm against the original file.
  # no minors used, we can start at 0
  ffree = utils.FirstFree(keys)
  # return the next minor
  # TODO: implement high-limit check
  ndata[minor] = instance
  # a minor must not be handed out twice before being committed
  assert (nname, minor) not in self._temporary_drbds, \
    "Attempt to reuse reserved DRBD minor"
  self._temporary_drbds[(nname, minor)] = instance
  logging.debug("Request to allocate drbd minors, input: %s, returning %s",
@locking.ssynchronized(_config_lock)
def ReleaseDRBDMinors(self, instance):
  """Release temporary drbd minors allocated for a given instance.

  This should be called on both the error paths and on the success
  paths (after the instance has been added or updated).

  @type instance: string
  @param instance: the instance for which temporary minors should be
                   released

  """
  # iterate over a snapshot (items()) since we delete while walking
  for key, name in self._temporary_drbds.items():
    # NOTE(review): an "if name == instance:" guard appears to be missing
    # from this copy; as written every reservation would be deleted.
    del self._temporary_drbds[key]
@locking.ssynchronized(_config_lock, shared=1)
def GetHostKey(self):
  """Return the rsa hostkey from the config.

  Returns: rsa hostkey

  """
  cluster = self._config_data.cluster
  return cluster.rsahostkeypub
@locking.ssynchronized(_config_lock)
def AddInstance(self, instance):
  """Add an instance to the config.

  This should be used after creating a new instance.

  Args:
    instance: the instance object

  """
  if not isinstance(instance, objects.Instance):
    raise errors.ProgrammerError("Invalid type passed to AddInstance")

  if instance.disk_template != constants.DT_DISKLESS:
    all_lvs = instance.MapLVsByNode()
    logging.info("Instance '%s' DISK_LAYOUT: %s", instance.name, all_lvs)

  # new objects start their serial number at 1
  instance.serial_no = 1
  self._config_data.instances[instance.name] = instance
  # any topology change bumps the cluster serial number
  self._config_data.cluster.serial_no += 1
  # NOTE(review): the config save (self._WriteConfig()) is not visible in
  # this copy -- confirm against the original file.
def _SetInstanceStatus(self, instance_name, status):
  """Set the instance's status to a given value.

  This function is for internal use, when the config lock is already held.

  """
  # NOTE(review): the string continuations of both raises below and the
  # trailing config save appear to be missing from this copy -- confirm
  # against the original file.
  if status not in ("up", "down"):
    raise errors.ProgrammerError("Invalid status '%s' passed to"
                                 " ConfigWriter._SetInstanceStatus()" %
  if instance_name not in self._config_data.instances:
    raise errors.ConfigurationError("Unknown instance '%s'" %
  instance = self._config_data.instances[instance_name]
  # only bump the serial number on an actual change
  if instance.status != status:
    instance.status = status
    instance.serial_no += 1
@locking.ssynchronized(_config_lock)
def MarkInstanceUp(self, instance_name):
  """Mark the instance status to up in the config.

  Delegates to the shared status setter.

  """
  new_status = "up"
  self._SetInstanceStatus(instance_name, new_status)
@locking.ssynchronized(_config_lock)
def RemoveInstance(self, instance_name):
  """Remove the instance from the configuration.

  """
  if instance_name not in self._config_data.instances:
    raise errors.ConfigurationError("Unknown instance '%s'" % instance_name)
  del self._config_data.instances[instance_name]
  # topology change: bump the cluster serial number
  self._config_data.cluster.serial_no += 1
  # NOTE(review): the config save (self._WriteConfig()) is not visible in
  # this copy -- confirm against the original file.
@locking.ssynchronized(_config_lock)
def RenameInstance(self, old_name, new_name):
  """Rename an instance.

  This needs to be done in ConfigWriter and not by RemoveInstance
  combined with AddInstance as only we can guarantee an atomic
  rename.

  """
  if old_name not in self._config_data.instances:
    raise errors.ConfigurationError("Unknown instance '%s'" % old_name)
  inst = self._config_data.instances[old_name]
  del self._config_data.instances[old_name]
  # NOTE(review): the "inst.name = new_name" assignment appears to be
  # missing from this copy -- confirm against the original file.
  for disk in inst.disks:
    if disk.dev_type == constants.LD_FILE:
      # rename the file paths in logical and physical id
      file_storage_dir = os.path.dirname(os.path.dirname(disk.logical_id[1]))
      # NOTE(review): the remaining os.path.join arguments (the per-disk
      # file name under the new instance directory) are truncated here.
      disk.physical_id = disk.logical_id = (disk.logical_id[0],
                                            os.path.join(file_storage_dir,
  # re-insert under the (new) name and bump the cluster serial number
  self._config_data.instances[inst.name] = inst
  self._config_data.cluster.serial_no += 1
  # NOTE(review): the config save is not visible in this copy.
@locking.ssynchronized(_config_lock)
def MarkInstanceDown(self, instance_name):
  """Mark the status of an instance to down in the configuration.

  Delegates to the shared status setter.

  """
  new_status = "down"
  self._SetInstanceStatus(instance_name, new_status)
def _UnlockedGetInstanceList(self):
  """Get the list of instances.

  This function is for internal use, when the config lock is already held.

  """
  data = self._config_data
  return data.instances.keys()
@locking.ssynchronized(_config_lock, shared=1)
def GetInstanceList(self):
  """Get the list of instances.

  Returns:
    array of instances, ex. ['instance2.example.com','instance1.example.com']
    these contains all the instances, also the ones in Admin_down state

  """
  names = self._UnlockedGetInstanceList()
  return names
@locking.ssynchronized(_config_lock, shared=1)
def ExpandInstanceName(self, short_name):
  """Attempt to expand an incomplete instance name.

  """
  candidates = self._config_data.instances.keys()
  return utils.MatchNameComponent(short_name, candidates)
def _UnlockedGetInstanceInfo(self, instance_name):
  """Returns informations about an instance.

  This function is for internal use, when the config lock is already held.

  """
  if instance_name not in self._config_data.instances:
  # NOTE(review): the "return None" body of the guard above appears to be
  # missing from this copy -- confirm against the original file.
  return self._config_data.instances[instance_name]
@locking.ssynchronized(_config_lock, shared=1)
def GetInstanceInfo(self, instance_name):
  """Returns informations about an instance.

  It takes the information from the configuration file. Other informations of
  an instance are taken from the live systems.

  Args:
    instance_name: name of the instance, ex instance1.example.com

  """
  info = self._UnlockedGetInstanceInfo(instance_name)
  return info
@locking.ssynchronized(_config_lock, shared=1)
def GetAllInstancesInfo(self):
  """Get the configuration of all instances.

  @returns: dict of (instance, instance_info), where instance_info is what
            would GetInstanceInfo return for the node

  """
  my_dict = dict([(instance, self._UnlockedGetInstanceInfo(instance))
                  for instance in self._UnlockedGetInstanceList()])
  # NOTE(review): the final "return my_dict" is not visible in this copy.
@locking.ssynchronized(_config_lock)
def AddNode(self, node):
  """Add a node to the configuration.

  Args:
    node: an object.Node instance

  """
  # NOTE(review): prefer lazy args, logging.info("...", node.name)
  logging.info("Adding node %s to configuration" % node.name)

  # NOTE(review): the "node.serial_no = 1" initialisation and the config
  # save appear to be missing from this copy.
  self._config_data.nodes[node.name] = node
  self._config_data.cluster.serial_no += 1
@locking.ssynchronized(_config_lock)
def RemoveNode(self, node_name):
  """Remove a node from the configuration.

  """
  # NOTE(review): prefer lazy args, logging.info("...", node_name)
  logging.info("Removing node %s from configuration" % node_name)

  if node_name not in self._config_data.nodes:
    raise errors.ConfigurationError("Unknown node '%s'" % node_name)

  del self._config_data.nodes[node_name]
  # topology change: bump the cluster serial number
  self._config_data.cluster.serial_no += 1
  # NOTE(review): the config save is not visible in this copy.
@locking.ssynchronized(_config_lock, shared=1)
def ExpandNodeName(self, short_name):
  """Attempt to expand an incomplete node name.

  """
  candidates = self._config_data.nodes.keys()
  return utils.MatchNameComponent(short_name, candidates)
def _UnlockedGetNodeInfo(self, node_name):
  """Get the configuration of a node, as stored in the config.

  This function is for internal use, when the config lock is already held.

  Args: node_name: the name of the node

  Returns: the node object

  """
  if node_name not in self._config_data.nodes:
  # NOTE(review): the "return None" body of the guard above appears to be
  # missing from this copy -- confirm against the original file.
  return self._config_data.nodes[node_name]
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeInfo(self, node_name):
  """Get the configuration of a node, as stored in the config.

  Args: node_name: the name of the node

  Returns: the node object

  """
  info = self._UnlockedGetNodeInfo(node_name)
  return info
def _UnlockedGetNodeList(self):
  """Return the list of nodes which are in the configuration.

  This function is for internal use, when the config lock is already held.

  """
  data = self._config_data
  return data.nodes.keys()
@locking.ssynchronized(_config_lock, shared=1)
def GetNodeList(self):
  """Return the list of nodes which are in the configuration.

  """
  names = self._UnlockedGetNodeList()
  return names
@locking.ssynchronized(_config_lock, shared=1)
def GetAllNodesInfo(self):
  """Get the configuration of all nodes.

  @returns: dict of (node, node_info), where node_info is what
            would GetNodeInfo return for the node

  """
  my_dict = dict([(node, self._UnlockedGetNodeInfo(node))
                  for node in self._UnlockedGetNodeList()])
  # NOTE(review): the final "return my_dict" is not visible in this copy.
@locking.ssynchronized(_config_lock, shared=1)
def DumpConfig(self):
  """Return the entire configuration of the cluster.

  """
  # shared-lock read of the in-memory configuration object
  cfg = self._config_data
  return cfg
def _BumpSerialNo(self):
  """Bump up the serial number of the config.

  """
  data = self._config_data
  data.serial_no = data.serial_no + 1
def _OpenConfig(self):
  """Read the config data from disk.

  In case we already have configuration data and the config file has
  the same mtime as when we read it, we skip the parsing of the
  file, since de-serialisation could be slow.

  """
  # NOTE(review): several lines appear missing from this copy: the
  # try/except around os.stat, the early "return" of the cache-hit
  # branch, the version validation call, and the try/finally that closes
  # the file handle. Confirm against the original file.
  st = os.stat(self._cfg_file)
  raise errors.ConfigurationError("Can't stat config file: %s" % err)
  # cache hit: same mtime, size and inode as the last read
  if (self._config_data is not None and
      self._config_time is not None and
      self._config_time == st.st_mtime and
      self._config_size == st.st_size and
      self._config_inode == st.st_ino):
    # data is current, so skip loading of config file
  # Make sure the configuration has the right version
  f = open(self._cfg_file, 'r')
  data = objects.ConfigData.FromDict(serializer.Load(f.read()))
  except Exception, err:
    raise errors.ConfigurationError(err)
  # sanity check: a config without the cluster RSA key is unusable
  if (not hasattr(data, 'cluster') or
      not hasattr(data.cluster, 'rsahostkeypub')):
    raise errors.ConfigurationError("Incomplete configuration"
                                    " (missing cluster.rsahostkeypub)")
  self._config_data = data
  # remember the stat info so the next _OpenConfig can skip re-parsing
  self._config_time = st.st_mtime
  self._config_size = st.st_size
  self._config_inode = st.st_ino
def _DistributeConfig(self):
  """Distribute the configuration to the other nodes.

  Currently, this only copies the configuration file. In the future,
  it could be used to encapsulate the 2/3-phase update mechanism.

  """
  # NOTE(review): the offline early-return, the success flag, the
  # try/except around nodelist.remove and the per-node failure check
  # before the logging call appear to be missing from this copy.
  nodelist = self._UnlockedGetNodeList()
  myhostname = self._my_hostname
  # do not try to upload the file to ourselves
  nodelist.remove(myhostname)
  result = rpc.call_upload_file(nodelist, self._cfg_file)
  for node in nodelist:
    logging.error("copy of file %s to node %s failed",
                  self._cfg_file, node)
def _WriteConfig(self, destination=None):
  """Write the configuration data to persistent storage.

  @param destination: target file path, or None for the default
      configuration file

  """
  if destination is None:
    destination = self._cfg_file
  # serialise first, so an error leaves the old file untouched
  txt = serializer.Dump(self._config_data.ToDict())
  dir_name, file_name = os.path.split(destination)
  # write to a temp file in the same directory, then rename into place
  fd, name = tempfile.mkstemp('.newconfig', file_name, dir_name)
  f = os.fdopen(fd, 'w')
  # NOTE(review): the try/finally that writes txt and closes f, and the
  # try/except around the later os.stat, appear to be missing from this
  # copy -- confirm against the original file.
  # we don't need to do os.close(fd) as f.close() did it
  os.rename(name, destination)
  self.write_count += 1
  # re-set our cache as not to re-read the config file
  st = os.stat(destination)
  raise errors.ConfigurationError("Can't stat config file: %s" % err)
  self._config_time = st.st_mtime
  self._config_size = st.st_size
  self._config_inode = st.st_ino
  # and redistribute the config file
  self._DistributeConfig()
@locking.ssynchronized(_config_lock)
def InitConfig(self, node, primary_ip, secondary_ip,
               hostkeypub, mac_prefix, vg_name, def_bridge):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances or operating systems.

  Args:
    node: the nodename of the initial node
    primary_ip: the IP address of the current host
    secondary_ip: the secondary IP of the current host or None
    hostkeypub: the public hostkey of this host
    mac_prefix: the prefix for instance MAC addresses
    vg_name: the LVM volume group name
    def_bridge: the default network bridge name

  """
  # DRBD ports are allocated upward starting from FIRST_DRBD_PORT
  hu_port = constants.FIRST_DRBD_PORT - 1
  globalconfig = objects.Cluster(serial_no=1,
                                 rsahostkeypub=hostkeypub,
                                 highest_used_port=hu_port,
                                 mac_prefix=mac_prefix,
                                 volume_group_name=vg_name,
                                 default_bridge=def_bridge,
                                 tcpudp_port_pool=set())
  # a single-homed node uses its primary IP as its secondary as well
  if secondary_ip is None:
    secondary_ip = primary_ip
  nodeconfig = objects.Node(name=node, primary_ip=primary_ip,
                            secondary_ip=secondary_ip, serial_no=1)
  # NOTE(review): the remaining ConfigData keyword arguments (instances,
  # serial number) and the config save appear to be missing from this
  # copy -- confirm against the original file.
  self._config_data = objects.ConfigData(nodes={node: nodeconfig},
                                         cluster=globalconfig,
@locking.ssynchronized(_config_lock, shared=1)
# NOTE(review): the "def GetVGName(self):" header appears to be missing
# from this copy -- confirm against the original file.
  """Return the volume group name.

  """
  return self._config_data.cluster.volume_group_name
@locking.ssynchronized(_config_lock)
def SetVGName(self, vg_name):
  """Set the volume group name.

  """
  self._config_data.cluster.volume_group_name = vg_name
  # cluster-level change: bump the cluster serial number
  self._config_data.cluster.serial_no += 1
  # NOTE(review): the config save (self._WriteConfig()) is not visible in
  # this copy -- confirm against the original file.
@locking.ssynchronized(_config_lock, shared=1)
def GetDefBridge(self):
  """Return the default bridge.

  """
  cluster = self._config_data.cluster
  return cluster.default_bridge
@locking.ssynchronized(_config_lock, shared=1)
def GetMACPrefix(self):
  """Return the mac prefix.

  """
  cluster = self._config_data.cluster
  return cluster.mac_prefix
@locking.ssynchronized(_config_lock, shared=1)
def GetClusterInfo(self):
  """Returns informations about the cluster

  Returns:
    the cluster object

  """
  cluster = self._config_data.cluster
  return cluster
@locking.ssynchronized(_config_lock)
def Update(self, target):
  """Notify function to be called after updates.

  This function must be called when an object (as returned by
  GetInstanceInfo, GetNodeInfo, GetCluster) has been updated and the
  caller wants the modifications saved to the backing store. Note
  that all modified objects will be saved, but the target argument
  is the one the caller wants to ensure that it's saved.

  """
  # NOTE(review): the string continuation of the first raise, the final
  # "else:" before the invalid-type raise, the "if not test:" guard
  # before the stale-object raise, and the config save all appear to be
  # missing from this copy -- confirm against the original file.
  if self._config_data is None:
    raise errors.ProgrammerError("Configuration file not read,"
  # membership check: the caller must hand back an object we gave out
  if isinstance(target, objects.Cluster):
    test = target == self._config_data.cluster
  elif isinstance(target, objects.Node):
    test = target in self._config_data.nodes.values()
  elif isinstance(target, objects.Instance):
    test = target in self._config_data.instances.values()
  raise errors.ProgrammerError("Invalid object type (%s) passed to"
                               " ConfigWriter.Update" % type(target))
  raise errors.ConfigurationError("Configuration updated since object"
                                  " has been read or unknown object")
  # per-object serial number tracks modifications
  target.serial_no += 1