4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Functions used by the node daemon"""
36 from ganeti import errors
37 from ganeti import utils
38 from ganeti import ssh
39 from ganeti import hypervisor
40 from ganeti import constants
41 from ganeti import bdev
42 from ganeti import objects
43 from ganeti import ssconf
47 return ssh.SshRunner()
51 """Activate local node as master node.
53 There are two needed steps for this:
54 - run the master script
55 - register the cron script
58 result = utils.RunCmd([constants.MASTER_SCRIPT, "-d", "start"])
61 logging.error("could not activate cluster interface with command %s,"
62 " error: '%s'", result.cmd, result.output)
69 """Deactivate this node as master.
71 This runs the master stop script.
74 result = utils.RunCmd([constants.MASTER_SCRIPT, "-d", "stop"])
77 logging.error("could not deactivate cluster interface with command %s,"
78 " error: '%s'", result.cmd, result.output)
84 def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub):
85 """Joins this node to the cluster.
87 This does the following:
88 - updates the hostkeys of the machine (rsa and dsa)
89 - adds the ssh private key to the user
90 - adds the ssh public key to the users' authorized_keys file
93 sshd_keys = [(constants.SSH_HOST_RSA_PRIV, rsa, 0600),
94 (constants.SSH_HOST_RSA_PUB, rsapub, 0644),
95 (constants.SSH_HOST_DSA_PRIV, dsa, 0600),
96 (constants.SSH_HOST_DSA_PUB, dsapub, 0644)]
97 for name, content, mode in sshd_keys:
98 utils.WriteFile(name, data=content, mode=mode)
101 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
103 except errors.OpExecError, err:
104 logging.exception("Error while processing user ssh files")
107 for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
108 utils.WriteFile(name, data=content, mode=0600)
110 utils.AddAuthorizedKey(auth_keys, sshpub)
112 utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"])
118 """Cleans up the current node and prepares it to be removed from the cluster.
121 if os.path.isdir(constants.DATA_DIR):
122 for rel_name in utils.ListVisibleFiles(constants.DATA_DIR):
123 full_name = os.path.join(constants.DATA_DIR, rel_name)
124 if os.path.isfile(full_name) and not os.path.islink(full_name):
125 utils.RemoveFile(full_name)
128 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)
129 except errors.OpExecError:
130 logging.exception("Error while processing ssh files")
133 f = open(pub_key, 'r')
135 utils.RemoveAuthorizedKey(auth_keys, f.read(8192))
139 utils.RemoveFile(priv_key)
140 utils.RemoveFile(pub_key)
142 # Return a reassuring string to the caller, and quit
143 raise errors.QuitGanetiException(False, 'Shutdown scheduled')
146 def GetNodeInfo(vgname):
147 """Gives back a hash with different informations about the node.
150 { 'vg_size' : xxx, 'vg_free' : xxx, 'memory_domain0': xxx,
151 'memory_free' : xxx, 'memory_total' : xxx }
153 vg_size is the size of the configured volume group in MiB
154 vg_free is the free size of the volume group in MiB
155 memory_dom0 is the memory allocated for domain0 in MiB
156 memory_free is the currently available (free) ram in MiB
157 memory_total is the total number of ram in MiB
161 vginfo = _GetVGInfo(vgname)
162 outputarray['vg_size'] = vginfo['vg_size']
163 outputarray['vg_free'] = vginfo['vg_free']
165 hyper = hypervisor.GetHypervisor()
166 hyp_info = hyper.GetNodeInfo()
167 if hyp_info is not None:
168 outputarray.update(hyp_info)
170 f = open("/proc/sys/kernel/random/boot_id", 'r')
172 outputarray["bootid"] = f.read(128).rstrip("\n")
179 def VerifyNode(what):
180 """Verify the status of the local node.
183 what - a dictionary of things to check:
184 'filelist' : list of files for which to compute checksums
185 'nodelist' : list of nodes we should check communication with
186 'hypervisor': run the hypervisor-specific verify
188 Requested files on local node are checksummed and the result returned.
190 The nodelist is traversed, with the following checks being made
192 - known_hosts key correct
193 - correct resolving of node name (target node returns its own hostname
194 by ssh-execution of 'hostname', result compared against name in list.
199 if 'hypervisor' in what:
200 result['hypervisor'] = hypervisor.GetHypervisor().Verify()
202 if 'filelist' in what:
203 result['filelist'] = utils.FingerprintFiles(what['filelist'])
205 if 'nodelist' in what:
206 result['nodelist'] = {}
207 random.shuffle(what['nodelist'])
208 for node in what['nodelist']:
209 success, message = _GetSshRunner().VerifyNodeHostname(node)
211 result['nodelist'][node] = message
212 if 'node-net-test' in what:
213 result['node-net-test'] = {}
214 my_name = utils.HostInfo().name
215 my_pip = my_sip = None
216 for name, pip, sip in what['node-net-test']:
222 result['node-net-test'][my_name] = ("Can't find my own"
223 " primary/secondary IP"
226 port = ssconf.SimpleStore().GetNodeDaemonPort()
227 for name, pip, sip in what['node-net-test']:
229 if not utils.TcpPing(pip, port, source=my_pip):
230 fail.append("primary")
232 if not utils.TcpPing(sip, port, source=my_sip):
233 fail.append("secondary")
235 result['node-net-test'][name] = ("failure using the %s"
242 def GetVolumeList(vg_name):
243 """Compute list of logical volumes and their size.
246 dictionary of all partions (key) with their size (in MiB), inactive
248 {'test1': ('20.06', True, True)}
253 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
254 "--separator=%s" % sep,
255 "-olv_name,lv_size,lv_attr", vg_name])
257 logging.error("Failed to list logical volumes, lvs output: %s",
261 valid_line_re = re.compile("^ *([^|]+)\|([0-9.]+)\|([^|]{6})\|?$")
262 for line in result.stdout.splitlines():
264 match = valid_line_re.match(line)
266 logging.error("Invalid line returned from lvs output: '%s'", line)
268 name, size, attr = match.groups()
269 inactive = attr[4] == '-'
270 online = attr[5] == 'o'
271 lvs[name] = (size, inactive, online)
276 def ListVolumeGroups():
277 """List the volume groups and their size.
280 Dictionary with keys volume name and values the size of the volume
283 return utils.ListVolumeGroups()
287 """List all volumes on this node.
290 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
292 "--options=lv_name,lv_size,devices,vg_name"])
294 logging.error("Failed to list logical volumes, lvs output: %s",
300 return dev.split('(')[0]
306 'name': line[0].strip(),
307 'size': line[1].strip(),
308 'dev': parse_dev(line[2].strip()),
309 'vg': line[3].strip(),
312 return [map_line(line.split('|')) for line in result.stdout.splitlines()
313 if line.count('|') >= 3]
316 def BridgesExist(bridges_list):
317 """Check if a list of bridges exist on the current node.
320 True if all of them exist, false otherwise
323 for bridge in bridges_list:
324 if not utils.BridgeExists(bridge):
330 def GetInstanceList():
331 """Provides a list of instances.
334 A list of all running instances on the current node
335 - instance1.example.com
336 - instance2.example.com
340 names = hypervisor.GetHypervisor().ListInstances()
341 except errors.HypervisorError, err:
342 logging.exception("Error enumerating instances")
348 def GetInstanceInfo(instance):
349 """Gives back the informations about an instance as a dictionary.
352 instance: name of the instance (ex. instance1.example.com)
355 { 'memory' : 511, 'state' : '-b---', 'time' : 3188.8, }
357 memory: memory size of instance (int)
358 state: xen state of instance (string)
359 time: cpu time of instance (float)
364 iinfo = hypervisor.GetHypervisor().GetInstanceInfo(instance)
365 if iinfo is not None:
366 output['memory'] = iinfo[2]
367 output['state'] = iinfo[4]
368 output['time'] = iinfo[5]
373 def GetAllInstancesInfo():
374 """Gather data about all instances.
376 This is the equivalent of `GetInstanceInfo()`, except that it
377 computes data for all instances at once, thus being faster if one
378 needs data about more than one instance.
380 Returns: a dictionary of dictionaries, keys being the instance name,
382 { 'memory' : 511, 'state' : '-b---', 'time' : 3188.8, }
384 memory: memory size of instance (int)
385 state: xen state of instance (string)
386 time: cpu time of instance (float)
387 vcpus: the number of cpus
392 iinfo = hypervisor.GetHypervisor().GetAllInstancesInfo()
394 for name, inst_id, memory, vcpus, state, times in iinfo:
405 def AddOSToInstance(instance, os_disk, swap_disk):
406 """Add an OS to an instance.
409 instance: the instance object
410 os_disk: the instance-visible name of the os device
411 swap_disk: the instance-visible name of the swap device
414 inst_os = OSFromDisk(instance.os)
416 create_script = inst_os.create_script
418 os_device = instance.FindDisk(os_disk)
419 if os_device is None:
420 logging.error("Can't find this device-visible name '%s'", os_disk)
423 swap_device = instance.FindDisk(swap_disk)
424 if swap_device is None:
425 logging.error("Can't find this device-visible name '%s'", swap_disk)
428 real_os_dev = _RecursiveFindBD(os_device)
429 if real_os_dev is None:
430 raise errors.BlockDeviceError("Block device '%s' is not set up" %
434 real_swap_dev = _RecursiveFindBD(swap_device)
435 if real_swap_dev is None:
436 raise errors.BlockDeviceError("Block device '%s' is not set up" %
440 logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
441 instance.name, int(time.time()))
442 if not os.path.exists(constants.LOG_OS_DIR):
443 os.mkdir(constants.LOG_OS_DIR, 0750)
445 command = utils.BuildShellCmd("cd %s && %s -i %s -b %s -s %s &>%s",
446 inst_os.path, create_script, instance.name,
447 real_os_dev.dev_path, real_swap_dev.dev_path,
450 result = utils.RunCmd(command)
452 logging.error("os create command '%s' returned error: %s, logfile: %s,"
453 " output: %s", command, result.fail_reason, logfile,
460 def RunRenameInstance(instance, old_name, os_disk, swap_disk):
461 """Run the OS rename script for an instance.
464 instance: the instance object
465 old_name: the old name of the instance
466 os_disk: the instance-visible name of the os device
467 swap_disk: the instance-visible name of the swap device
470 inst_os = OSFromDisk(instance.os)
472 script = inst_os.rename_script
474 os_device = instance.FindDisk(os_disk)
475 if os_device is None:
476 logging.error("Can't find this device-visible name '%s'", os_disk)
479 swap_device = instance.FindDisk(swap_disk)
480 if swap_device is None:
481 logging.error("Can't find this device-visible name '%s'", swap_disk)
484 real_os_dev = _RecursiveFindBD(os_device)
485 if real_os_dev is None:
486 raise errors.BlockDeviceError("Block device '%s' is not set up" %
490 real_swap_dev = _RecursiveFindBD(swap_device)
491 if real_swap_dev is None:
492 raise errors.BlockDeviceError("Block device '%s' is not set up" %
496 logfile = "%s/rename-%s-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
498 instance.name, int(time.time()))
499 if not os.path.exists(constants.LOG_OS_DIR):
500 os.mkdir(constants.LOG_OS_DIR, 0750)
502 command = utils.BuildShellCmd("cd %s && %s -o %s -n %s -b %s -s %s &>%s",
503 inst_os.path, script, old_name, instance.name,
504 real_os_dev.dev_path, real_swap_dev.dev_path,
507 result = utils.RunCmd(command)
510 logging.error("os create command '%s' returned error: %s output: %s",
511 command, result.fail_reason, result.output)
517 def _GetVGInfo(vg_name):
518 """Get informations about the volume group.
521 vg_name: the volume group
524 { 'vg_size' : xxx, 'vg_free' : xxx, 'pv_count' : xxx }
526 vg_size is the total size of the volume group in MiB
527 vg_free is the free size of the volume group in MiB
528 pv_count are the number of physical disks in that vg
530 If an error occurs during gathering of data, we return the same dict
531 with keys all set to None.
534 retdic = dict.fromkeys(["vg_size", "vg_free", "pv_count"])
536 retval = utils.RunCmd(["vgs", "-ovg_size,vg_free,pv_count", "--noheadings",
537 "--nosuffix", "--units=m", "--separator=:", vg_name])
540 logging.error("volume group %s not present", vg_name)
542 valarr = retval.stdout.strip().rstrip(':').split(':')
546 "vg_size": int(round(float(valarr[0]), 0)),
547 "vg_free": int(round(float(valarr[1]), 0)),
548 "pv_count": int(valarr[2]),
550 except ValueError, err:
551 logging.exception("Fail to parse vgs output")
553 logging.error("vgs output has the wrong number of fields (expected"
554 " three): %s", str(valarr))
558 def _GatherBlockDevs(instance):
559 """Set up an instance's block device(s).
561 This is run on the primary node at instance startup. The block
562 devices must be already assembled.
566 for disk in instance.disks:
567 device = _RecursiveFindBD(disk)
569 raise errors.BlockDeviceError("Block device '%s' is not set up." %
572 block_devices.append((disk, device))
576 def StartInstance(instance, extra_args):
577 """Start an instance.
580 instance - name of instance to start.
583 running_instances = GetInstanceList()
585 if instance.name in running_instances:
588 block_devices = _GatherBlockDevs(instance)
589 hyper = hypervisor.GetHypervisor()
592 hyper.StartInstance(instance, block_devices, extra_args)
593 except errors.HypervisorError, err:
594 logging.exception("Failed to start instance")
600 def ShutdownInstance(instance):
601 """Shut an instance down.
604 instance - name of instance to shutdown.
607 running_instances = GetInstanceList()
609 if instance.name not in running_instances:
612 hyper = hypervisor.GetHypervisor()
614 hyper.StopInstance(instance)
615 except errors.HypervisorError, err:
616 logging.error("Failed to stop instance")
619 # test every 10secs for 2min
623 for dummy in range(11):
624 if instance.name not in GetInstanceList():
628 # the shutdown did not succeed
629 logging.error("shutdown of '%s' unsuccessful, using destroy", instance)
632 hyper.StopInstance(instance, force=True)
633 except errors.HypervisorError, err:
634 logging.exception("Failed to stop instance")
638 if instance.name in GetInstanceList():
639 logging.error("could not shutdown instance '%s' even by destroy",
646 def RebootInstance(instance, reboot_type, extra_args):
647 """Reboot an instance.
650 instance - name of instance to reboot
651 reboot_type - how to reboot [soft,hard,full]
654 running_instances = GetInstanceList()
656 if instance.name not in running_instances:
657 logging.error("Cannot reboot instance that is not running")
660 hyper = hypervisor.GetHypervisor()
661 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
663 hyper.RebootInstance(instance)
664 except errors.HypervisorError, err:
665 logging.exception("Failed to soft reboot instance")
667 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
669 ShutdownInstance(instance)
670 StartInstance(instance, extra_args)
671 except errors.HypervisorError, err:
672 logging.exception("Failed to hard reboot instance")
675 raise errors.ParameterError("reboot_type invalid")
681 def MigrateInstance(instance, target, live):
682 """Migrates an instance to another node.
685 hyper = hypervisor.GetHypervisor()
688 hyper.MigrateInstance(instance, target, live)
689 except errors.HypervisorError, err:
690 msg = "Failed to migrate instance: %s" % str(err)
693 return (True, "Migration successfull")
696 def CreateBlockDevice(disk, size, owner, on_primary, info):
697 """Creates a block device for an instance.
700 disk: a ganeti.objects.Disk object
701 size: the size of the physical underlying device
702 owner: a string with the name of the instance
703 on_primary: a boolean indicating if it is the primary node or not
704 info: string that will be sent to the physical device creation
707 the new unique_id of the device (this can sometime be
708 computed only after creation), or None. On secondary nodes,
709 it's not required to return anything.
714 for child in disk.children:
715 crdev = _RecursiveAssembleBD(child, owner, on_primary)
716 if on_primary or disk.AssembleOnSecondary():
717 # we need the children open in case the device itself has to
722 device = bdev.FindDevice(disk.dev_type, disk.physical_id, clist)
723 if device is not None:
724 logging.info("removing existing device %s", disk)
726 except errors.BlockDeviceError, err:
729 device = bdev.Create(disk.dev_type, disk.physical_id,
732 raise ValueError("Can't create child device for %s, %s" %
734 if on_primary or disk.AssembleOnSecondary():
735 if not device.Assemble():
736 errorstring = "Can't assemble device after creation"
737 logging.error(errorstring)
738 raise errors.BlockDeviceError("%s, very unusual event - check the node"
739 " daemon logs" % errorstring)
740 device.SetSyncSpeed(constants.SYNC_SPEED)
741 if on_primary or disk.OpenOnSecondary():
742 device.Open(force=True)
743 DevCacheManager.UpdateCache(device.dev_path, owner,
744 on_primary, disk.iv_name)
748 physical_id = device.unique_id
752 def RemoveBlockDevice(disk):
753 """Remove a block device.
755 This is intended to be called recursively.
759 # since we are removing the device, allow a partial match
760 # this allows removal of broken mirrors
761 rdev = _RecursiveFindBD(disk, allow_partial=True)
762 except errors.BlockDeviceError, err:
763 # probably can't attach
764 logging.info("Can't attach to device %s in remove", disk)
767 r_path = rdev.dev_path
768 result = rdev.Remove()
770 DevCacheManager.RemoveCache(r_path)
774 for child in disk.children:
775 result = result and RemoveBlockDevice(child)
779 def _RecursiveAssembleBD(disk, owner, as_primary):
780 """Activate a block device for an instance.
782 This is run on the primary and secondary nodes for an instance.
784 This function is called recursively.
787 disk: a objects.Disk object
788 as_primary: if we should make the block device read/write
791 the assembled device or None (in case no device was assembled)
793 If the assembly is not successful, an exception is raised.
798 mcn = disk.ChildrenNeeded()
800 mcn = 0 # max number of Nones allowed
802 mcn = len(disk.children) - mcn # max number of Nones
803 for chld_disk in disk.children:
805 cdev = _RecursiveAssembleBD(chld_disk, owner, as_primary)
806 except errors.BlockDeviceError, err:
807 if children.count(None) >= mcn:
810 logging.debug("Error in child activation: %s", str(err))
811 children.append(cdev)
813 if as_primary or disk.AssembleOnSecondary():
814 r_dev = bdev.AttachOrAssemble(disk.dev_type, disk.physical_id, children)
815 r_dev.SetSyncSpeed(constants.SYNC_SPEED)
817 if as_primary or disk.OpenOnSecondary():
819 DevCacheManager.UpdateCache(r_dev.dev_path, owner,
820 as_primary, disk.iv_name)
827 def AssembleBlockDevice(disk, owner, as_primary):
828 """Activate a block device for an instance.
830 This is a wrapper over _RecursiveAssembleBD.
833 a /dev path for primary nodes
834 True for secondary nodes
837 result = _RecursiveAssembleBD(disk, owner, as_primary)
838 if isinstance(result, bdev.BlockDev):
839 result = result.dev_path
843 def ShutdownBlockDevice(disk):
844 """Shut down a block device.
846 First, if the device is assembled (can `Attach()`), then the device
847 is shutdown. Then the children of the device are shutdown.
849 This function is called recursively. Note that we don't cache the
850 children or such, as oppossed to assemble, shutdown of different
851 devices doesn't require that the upper device was active.
854 r_dev = _RecursiveFindBD(disk)
855 if r_dev is not None:
856 r_path = r_dev.dev_path
857 result = r_dev.Shutdown()
859 DevCacheManager.RemoveCache(r_path)
863 for child in disk.children:
864 result = result and ShutdownBlockDevice(child)
868 def MirrorAddChildren(parent_cdev, new_cdevs):
869 """Extend a mirrored block device.
872 parent_bdev = _RecursiveFindBD(parent_cdev, allow_partial=True)
873 if parent_bdev is None:
874 logging.error("Can't find parent device")
876 new_bdevs = [_RecursiveFindBD(disk) for disk in new_cdevs]
877 if new_bdevs.count(None) > 0:
878 logging.error("Can't find new device(s) to add: %s:%s",
879 new_bdevs, new_cdevs)
881 parent_bdev.AddChildren(new_bdevs)
885 def MirrorRemoveChildren(parent_cdev, new_cdevs):
886 """Shrink a mirrored block device.
889 parent_bdev = _RecursiveFindBD(parent_cdev)
890 if parent_bdev is None:
891 logging.error("Can't find parent in remove children: %s", parent_cdev)
894 for disk in new_cdevs:
895 rpath = disk.StaticDevPath()
897 bd = _RecursiveFindBD(disk)
899 logging.error("Can't find dynamic device %s while removing children",
903 devs.append(bd.dev_path)
906 parent_bdev.RemoveChildren(devs)
910 def GetMirrorStatus(disks):
911 """Get the mirroring status of a list of devices.
914 disks: list of `objects.Disk`
917 list of (mirror_done, estimated_time) tuples, which
918 are the result of bdev.BlockDevice.CombinedSyncStatus()
923 rbd = _RecursiveFindBD(dsk)
925 raise errors.BlockDeviceError("Can't find device %s" % str(dsk))
926 stats.append(rbd.CombinedSyncStatus())
930 def _RecursiveFindBD(disk, allow_partial=False):
931 """Check if a device is activated.
933 If so, return informations about the real device.
936 disk: the objects.Disk instance
937 allow_partial: don't abort the find if a child of the
938 device can't be found; this is intended to be
939 used when repairing mirrors
942 None if the device can't be found
943 otherwise the device instance
948 for chdisk in disk.children:
949 children.append(_RecursiveFindBD(chdisk))
951 return bdev.FindDevice(disk.dev_type, disk.physical_id, children)
954 def FindBlockDevice(disk):
955 """Check if a device is activated.
957 If so, return informations about the real device.
960 disk: the objects.Disk instance
962 None if the device can't be found
963 (device_path, major, minor, sync_percent, estimated_time, is_degraded)
966 rbd = _RecursiveFindBD(disk)
969 return (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus()
972 def UploadFile(file_name, data, mode, uid, gid, atime, mtime):
973 """Write a file to the filesystem.
975 This allows the master to overwrite(!) a file. It will only perform
976 the operation if the file belongs to a list of configuration files.
979 if not os.path.isabs(file_name):
980 logging.error("Filename passed to UploadFile is not absolute: '%s'",
985 constants.CLUSTER_CONF_FILE,
987 constants.SSH_KNOWN_HOSTS_FILE,
988 constants.VNC_PASSWORD_FILE,
990 allowed_files.extend(ssconf.SimpleStore().GetFileList())
991 if file_name not in allowed_files:
992 logging.error("Filename passed to UploadFile not in allowed"
993 " upload targets: '%s'", file_name)
996 utils.WriteFile(file_name, data=data, mode=mode, uid=uid, gid=gid,
997 atime=atime, mtime=mtime)
1001 def _ErrnoOrStr(err):
1002 """Format an EnvironmentError exception.
1004 If the `err` argument has an errno attribute, it will be looked up
1005 and converted into a textual EXXXX description. Otherwise the string
1006 representation of the error will be returned.
1009 if hasattr(err, 'errno'):
1010 detail = errno.errorcode[err.errno]
1016 def _OSOndiskVersion(name, os_dir):
1017 """Compute and return the API version of a given OS.
1019 This function will try to read the API version of the os given by
1020 the 'name' parameter and residing in the 'os_dir' directory.
1022 Return value will be either an integer denoting the version or None in the
1023 case when this is not a valid OS name.
1026 api_file = os.path.sep.join([os_dir, "ganeti_api_version"])
1029 st = os.stat(api_file)
1030 except EnvironmentError, err:
1031 raise errors.InvalidOS(name, os_dir, "'ganeti_api_version' file not"
1032 " found (%s)" % _ErrnoOrStr(err))
1034 if not stat.S_ISREG(stat.S_IFMT(st.st_mode)):
1035 raise errors.InvalidOS(name, os_dir, "'ganeti_api_version' file is not"
1041 api_version = f.read(256)
1044 except EnvironmentError, err:
1045 raise errors.InvalidOS(name, os_dir, "error while reading the"
1046 " API version (%s)" % _ErrnoOrStr(err))
1048 api_version = api_version.strip()
1050 api_version = int(api_version)
1051 except (TypeError, ValueError), err:
1052 raise errors.InvalidOS(name, os_dir,
1053 "API version is not integer (%s)" % str(err))
1058 def DiagnoseOS(top_dirs=None):
1059 """Compute the validity for all OSes.
1061 Returns an OS object for each name in all the given top directories
1062 (if not given defaults to constants.OS_SEARCH_PATH)
1068 if top_dirs is None:
1069 top_dirs = constants.OS_SEARCH_PATH
1072 for dir_name in top_dirs:
1073 if os.path.isdir(dir_name):
1075 f_names = utils.ListVisibleFiles(dir_name)
1076 except EnvironmentError, err:
1077 logging.exception("Can't list the OS directory %s", dir_name)
1079 for name in f_names:
1081 os_inst = OSFromDisk(name, base_dir=dir_name)
1082 result.append(os_inst)
1083 except errors.InvalidOS, err:
1084 result.append(objects.OS.FromInvalidOS(err))
1089 def OSFromDisk(name, base_dir=None):
1090 """Create an OS instance from disk.
1092 This function will return an OS instance if the given name is a
1093 valid OS name. Otherwise, it will raise an appropriate
1094 `errors.InvalidOS` exception, detailing why this is not a valid
1098 os_dir: Directory containing the OS scripts. Defaults to a search
1099 in all the OS_SEARCH_PATH directories.
1103 if base_dir is None:
1104 os_dir = utils.FindFile(name, constants.OS_SEARCH_PATH, os.path.isdir)
1106 raise errors.InvalidOS(name, None, "OS dir not found in search path")
1108 os_dir = os.path.sep.join([base_dir, name])
1110 api_version = _OSOndiskVersion(name, os_dir)
1112 if api_version != constants.OS_API_VERSION:
1113 raise errors.InvalidOS(name, os_dir, "API version mismatch"
1114 " (found %s want %s)"
1115 % (api_version, constants.OS_API_VERSION))
1117 # OS Scripts dictionary, we will populate it with the actual script names
1118 os_scripts = {'create': '', 'export': '', 'import': '', 'rename': ''}
1120 for script in os_scripts:
1121 os_scripts[script] = os.path.sep.join([os_dir, script])
1124 st = os.stat(os_scripts[script])
1125 except EnvironmentError, err:
1126 raise errors.InvalidOS(name, os_dir, "'%s' script missing (%s)" %
1127 (script, _ErrnoOrStr(err)))
1129 if stat.S_IMODE(st.st_mode) & stat.S_IXUSR != stat.S_IXUSR:
1130 raise errors.InvalidOS(name, os_dir, "'%s' script not executable" %
1133 if not stat.S_ISREG(stat.S_IFMT(st.st_mode)):
1134 raise errors.InvalidOS(name, os_dir, "'%s' is not a regular file" %
1138 return objects.OS(name=name, path=os_dir, status=constants.OS_VALID_STATUS,
1139 create_script=os_scripts['create'],
1140 export_script=os_scripts['export'],
1141 import_script=os_scripts['import'],
1142 rename_script=os_scripts['rename'],
1143 api_version=api_version)
1146 def GrowBlockDevice(disk, amount):
1147 """Grow a stack of block devices.
1149 This function is called recursively, with the childrens being the
1153 disk: the disk to be grown
1155 Returns: a tuple of (status, result), with:
1156 status: the result (true/false) of the operation
1157 result: the error message if the operation failed, otherwise not used
1160 r_dev = _RecursiveFindBD(disk)
1162 return False, "Cannot find block device %s" % (disk,)
1166 except errors.BlockDeviceError, err:
1167 return False, str(err)
1172 def SnapshotBlockDevice(disk):
1173 """Create a snapshot copy of a block device.
1175 This function is called recursively, and the snapshot is actually created
1176 just for the leaf lvm backend device.
1179 disk: the disk to be snapshotted
1182 a config entry for the actual lvm device snapshotted.
1186 if len(disk.children) == 1:
1187 # only one child, let's recurse on it
1188 return SnapshotBlockDevice(disk.children[0])
1190 # more than one child, choose one that matches
1191 for child in disk.children:
1192 if child.size == disk.size:
1193 # return implies breaking the loop
1194 return SnapshotBlockDevice(child)
1195 elif disk.dev_type == constants.LD_LV:
1196 r_dev = _RecursiveFindBD(disk)
1197 if r_dev is not None:
1198 # let's stay on the safe side and ask for the full size, for now
1199 return r_dev.Snapshot(disk.size)
1203 raise errors.ProgrammerError("Cannot snapshot non-lvm block device"
1204 " '%s' of type '%s'" %
1205 (disk.unique_id, disk.dev_type))
1208 def ExportSnapshot(disk, dest_node, instance):
1209 """Export a block device snapshot to a remote node.
1212 disk: the snapshot block device
1213 dest_node: the node to send the image to
1214 instance: instance being exported
1217 True if successful, False otherwise.
1220 inst_os = OSFromDisk(instance.os)
1221 export_script = inst_os.export_script
1223 logfile = "%s/exp-%s-%s-%s.log" % (constants.LOG_OS_DIR, inst_os.name,
1224 instance.name, int(time.time()))
1225 if not os.path.exists(constants.LOG_OS_DIR):
1226 os.mkdir(constants.LOG_OS_DIR, 0750)
1228 real_os_dev = _RecursiveFindBD(disk)
1229 if real_os_dev is None:
1230 raise errors.BlockDeviceError("Block device '%s' is not set up" %
1234 destdir = os.path.join(constants.EXPORT_DIR, instance.name + ".new")
1235 destfile = disk.physical_id[1]
1237 # the target command is built out of three individual commands,
1238 # which are joined by pipes; we check each individual command for
1241 expcmd = utils.BuildShellCmd("cd %s; %s -i %s -b %s 2>%s", inst_os.path,
1242 export_script, instance.name,
1243 real_os_dev.dev_path, logfile)
1247 destcmd = utils.BuildShellCmd("mkdir -p %s && cat > %s/%s",
1248 destdir, destdir, destfile)
1249 remotecmd = _GetSshRunner().BuildCmd(dest_node, constants.GANETI_RUNAS,
1252 # all commands have been checked, so we're safe to combine them
1253 command = '|'.join([expcmd, comprcmd, utils.ShellQuoteArgs(remotecmd)])
1255 result = utils.RunCmd(command)
1258 logging.error("os snapshot export command '%s' returned error: %s"
1259 " output: %s", command, result.fail_reason, result.output)
# FinalizeExport: serialize an instance's export metadata into an ini-style
# config file under EXPORT_DIR/<instance>.new, then move it over the final
# EXPORT_DIR/<instance> directory.
# NOTE(review): this listing has gaps in the original line numbering (e.g.
# 1267-1268, 1310, 1315-1319), so some statements are not visible here.
1265 def FinalizeExport(instance, snap_disks):
1266 """Write out the export configuration information.
1269 instance: instance configuration
1270 snap_disks: snapshot block devices
1273 False in case of error, True otherwise.
# Build in a ".new" staging dir so a partial write never clobbers an
# existing export for the same instance.
1276 destdir = os.path.join(constants.EXPORT_DIR, instance.name + ".new")
1277 finaldestdir = os.path.join(constants.EXPORT_DIR, instance.name)
1279 config = objects.SerializableConfigParser()
# [export] section: metadata about the export itself.
1281 config.add_section(constants.INISECT_EXP)
1282 config.set(constants.INISECT_EXP, 'version', '0')
1283 config.set(constants.INISECT_EXP, 'timestamp', '%d' % int(time.time()))
1284 config.set(constants.INISECT_EXP, 'source', instance.primary_node)
1285 config.set(constants.INISECT_EXP, 'os', instance.os)
1286 config.set(constants.INISECT_EXP, 'compression', 'gzip')
# [instance] section: the parameters needed to re-create the instance.
1288 config.add_section(constants.INISECT_INS)
1289 config.set(constants.INISECT_INS, 'name', instance.name)
1290 config.set(constants.INISECT_INS, 'memory', '%d' % instance.memory)
1291 config.set(constants.INISECT_INS, 'vcpus', '%d' % instance.vcpus)
1292 config.set(constants.INISECT_INS, 'disk_template', instance.disk_template)
1295 for nic_count, nic in enumerate(instance.nics):
1296 config.set(constants.INISECT_INS, 'nic%d_mac' %
1297 nic_count, '%s' % nic.mac)
1298 config.set(constants.INISECT_INS, 'nic%d_ip' % nic_count, '%s' % nic.ip)
1299 config.set(constants.INISECT_INS, 'nic%d_bridge' % nic_count, '%s' % nic.bridge)
1300 # TODO: redundant: on load can read nics until it doesn't exist
# NOTE(review): 'nic_count' is set to the LAST enumerate index, i.e.
# len(nics) - 1, not the count; 'disk_count' below has the same quirk.
# The import side must compensate - confirm before changing either.
1301 config.set(constants.INISECT_INS, 'nic_count' , '%d' % nic_count)
1304 for disk_count, disk in enumerate(snap_disks):
1305 config.set(constants.INISECT_INS, 'disk%d_ivname' % disk_count,
1306 ('%s' % disk.iv_name))
1307 config.set(constants.INISECT_INS, 'disk%d_dump' % disk_count,
1308 ('%s' % disk.physical_id[1]))
1309 config.set(constants.INISECT_INS, 'disk%d_size' % disk_count,
1311 config.set(constants.INISECT_INS, 'disk_count' , '%d' % disk_count)
# Write the assembled config into the staging dir (the config.write/close
# statements fall in the elided lines 1315-1319 of this listing).
1313 cff = os.path.join(destdir, constants.EXPORT_CONF_FILE)
1314 cfo = open(cff, 'w')
# Replace any previous export: remove the old final dir best-effort
# (ignore_errors=True), then rename the staging dir into place.
1320 shutil.rmtree(finaldestdir, True)
1321 shutil.move(destdir, finaldestdir)
# ExportInfo: load and sanity-check the config file of an existing export.
# NOTE(review): the config.read() call and the return statements fall in
# elided lines (1337, 1339-1346) of this listing.
1326 def ExportInfo(dest):
1327 """Get export configuration information.
1330 dest: directory containing the export
1333 A serializable config file containing the export info.
1336 cff = os.path.join(dest, constants.EXPORT_CONF_FILE)
1338 config = objects.SerializableConfigParser()
# A valid export must contain both the [export] and [instance] sections
# written by FinalizeExport.
1341 if (not config.has_section(constants.INISECT_EXP) or
1342 not config.has_section(constants.INISECT_INS)):
# ImportOSIntoInstance: pull a compressed disk image from src_node over
# ssh, decompress it locally and hand it to the instance OS's import
# script as a shell pipeline.
# NOTE(review): elided lines in this listing hold the "return False"
# branches after the logging.error calls, the comprcmd definition and the
# final return.
1348 def ImportOSIntoInstance(instance, os_disk, swap_disk, src_node, src_image):
1349 """Import an os image into an instance.
1352 instance: the instance object
1353 os_disk: the instance-visible name of the os device
1354 swap_disk: the instance-visible name of the swap device
1355 src_node: node holding the source image
1356 src_image: path to the source image on src_node
1359 False in case of error, True otherwise.
# Load the OS definition; its import script does the real work.
1362 inst_os = OSFromDisk(instance.os)
1363 import_script = inst_os.import_script
# Map the instance-visible names to configuration disk objects.
1365 os_device = instance.FindDisk(os_disk)
1366 if os_device is None:
1367 logging.error("Can't find this device-visible name '%s'", os_disk)
1370 swap_device = instance.FindDisk(swap_disk)
1371 if swap_device is None:
1372 logging.error("Can't find this device-visible name '%s'", swap_disk)
# Resolve to the live local block devices; a device that is not set up is
# fatal (raises rather than returning False).
1375 real_os_dev = _RecursiveFindBD(os_device)
1376 if real_os_dev is None:
1377 raise errors.BlockDeviceError("Block device '%s' is not set up" %
1381 real_swap_dev = _RecursiveFindBD(swap_device)
1382 if real_swap_dev is None:
1383 raise errors.BlockDeviceError("Block device '%s' is not set up" %
# Activate the swap device so the import script can use it.
1385 real_swap_dev.Open()
# Per-run OS log file: import-<os>-<instance>-<timestamp>.log.
1387 logfile = "%s/import-%s-%s-%s.log" % (constants.LOG_OS_DIR, instance.os,
1388 instance.name, int(time.time()))
1389 if not os.path.exists(constants.LOG_OS_DIR):
1390 os.mkdir(constants.LOG_OS_DIR, 0750)
# Remote half of the pipeline: cat the image on src_node via ssh (the
# remaining BuildCmd arguments are on elided lines 1394-1396).
1392 destcmd = utils.BuildShellCmd('cat %s', src_image)
1393 remotecmd = _GetSshRunner().BuildCmd(src_node, constants.GANETI_RUNAS,
# NOTE(review): "&>" is bash-specific redirection; the shell running this
# pipeline must be bash for stdout+stderr to land in the logfile.
1397 impcmd = utils.BuildShellCmd("(cd %s; %s -i %s -b %s -s %s &>%s)",
1398 inst_os.path, import_script, instance.name,
1399 real_os_dev.dev_path, real_swap_dev.dev_path,
# ssh-cat | decompress | import, run as one shell command (comprcmd is
# defined in an elided line; the export side records compression=gzip, so
# presumably this is gunzip - confirm).
1402 command = '|'.join([utils.ShellQuoteArgs(remotecmd), comprcmd, impcmd])
1404 result = utils.RunCmd(command)
1407 logging.error("os import command '%s' returned error: %s"
1408 " output: %s", command, result.fail_reason, result.output)
# ListExports: list the entries under EXPORT_DIR, one per available export.
# NOTE(review): the "def ListExports():" line itself (original line 1414)
# and the fallback branch for a missing EXPORT_DIR are elided from this
# listing.
1415 """Return a list of exports currently available on this machine.
1418 if os.path.isdir(constants.EXPORT_DIR):
1419 return utils.ListVisibleFiles(constants.EXPORT_DIR)
# RemoveExport: recursively delete EXPORT_DIR/<export>.
# NOTE(review): the try: statement around rmtree and the return statements
# are in elided lines of this listing.
1424 def RemoveExport(export):
1425 """Remove an existing export from the node.
1428 export: the name of the export to remove
1431 False in case of error, True otherwise.
# NOTE(review): "export" comes from the caller; if it can contain path
# separators this join could escape EXPORT_DIR - confirm callers validate
# the name before trusting this with rmtree.
1434 target = os.path.join(constants.EXPORT_DIR, export)
1436 shutil.rmtree(target)
1437 # TODO: catch some of the relevant exceptions and provide a pretty
1438 # error message if rmtree fails.
# RenameBlockDevices: rename each block device in devlist to its new
# unique_id, dropping stale cache entries when the device path changes.
# NOTE(review): the result accumulator, the missing-device branch and the
# try: statement fall in elided lines of this listing.
1443 def RenameBlockDevices(devlist):
1444 """Rename a list of block devices.
1446 The devlist argument is a list of tuples (disk, unique_id), where
1447 disk is the configuration disk object and unique_id the new id to
1448 rename it to. The return value is a combined boolean (True only if
# all renames succeeded).
1452 for disk, unique_id in devlist:
1453 dev = _RecursiveFindBD(disk)
# Remember the old device path so a changed path can be detected below.
1458 old_rpath = dev.dev_path
1459 dev.Rename(unique_id)
1460 new_rpath = dev.dev_path
# A changed device path invalidates the on-disk bdev cache entry.
1461 if old_rpath != new_rpath:
1462 DevCacheManager.RemoveCache(old_rpath)
1463 # FIXME: we should add the new cache information here, like:
1464 # DevCacheManager.UpdateCache(new_rpath, owner, ...)
1465 # but we don't have the owner here - maybe parse from existing
1466 # cache? for now, we only lose lvm data when we rename, which
1467 # is less critical than DRBD or MD
# A failed rename is logged but does not abort the remaining renames.
1468 except errors.BlockDeviceError, err:
1469 logging.exception("Can't rename device '%s' to '%s'", dev, unique_id)
def _TransformFileStorageDir(file_storage_dir):
  """Checks whether given file_storage_dir is valid.

  Checks whether the given file_storage_dir is within the cluster-wide
  default file_storage_dir stored in SimpleStore. Only paths under that
  directory are allowed.

  Args:
    file_storage_dir: string with path

  Returns:
    normalized file_storage_dir (string) if valid, None otherwise

  """
  file_storage_dir = os.path.normpath(file_storage_dir)
  base_file_storage_dir = ssconf.SimpleStore().GetFileStorageDir()
  # Compare path components, not raw characters: the previous
  # os.path.commonprefix() check was character-wise, so a sibling such as
  # "<base>-evil" would wrongly pass as being "under" "<base>". Accept the
  # base directory itself, or any path that continues with a separator
  # right after the base prefix.
  if (file_storage_dir != base_file_storage_dir and
      not file_storage_dir.startswith(base_file_storage_dir + os.sep)):
    logging.error("file storage directory '%s' is not under base file"
                  " storage directory '%s'",
                  file_storage_dir, base_file_storage_dir)
    return None
  return file_storage_dir
# CreateFileStorageDir: create a directory for file-based disk storage,
# after validating it lies under the cluster's base file storage dir.
# NOTE(review): the early return branches and the try: statement are in
# elided lines of this listing.
1499 def CreateFileStorageDir(file_storage_dir):
1500 """Create file storage directory.
1503 file_storage_dir: string containing the path
1506 tuple with first element a boolean indicating whether dir
1507 creation was successful or not
# _TransformFileStorageDir returns None for paths outside the allowed base.
1510 file_storage_dir = _TransformFileStorageDir(file_storage_dir)
1512 if not file_storage_dir:
# An already-existing path is acceptable only if it is a directory.
1515 if os.path.exists(file_storage_dir):
1516 if not os.path.isdir(file_storage_dir):
1517 logging.error("'%s' is not a directory", file_storage_dir)
# makedirs creates intermediate directories too, mode 0750.
1521 os.makedirs(file_storage_dir, 0750)
1522 except OSError, err:
1523 logging.error("Cannot create file storage directory '%s': %s",
1524 file_storage_dir, err)
# RemoveFileStorageDir: remove a file storage directory, but only if it is
# empty (os.rmdir refuses to delete non-empty directories).
# NOTE(review): the return statements and the try: statement are in elided
# lines of this listing.
1529 def RemoveFileStorageDir(file_storage_dir):
1530 """Remove file storage directory.
1532 Remove it only if it's empty. If not log an error and return.
1535 file_storage_dir: string containing the path
1538 tuple with first element a boolean indicating whether dir
1539 removal was successful or not
# Validate the path against the cluster's base file storage dir first.
1542 file_storage_dir = _TransformFileStorageDir(file_storage_dir)
1544 if not file_storage_dir:
1547 if os.path.exists(file_storage_dir):
1548 if not os.path.isdir(file_storage_dir):
1549 logging.error("'%s' is not a directory", file_storage_dir)
1551 # deletes dir only if empty, otherwise we want to return False
1553 os.rmdir(file_storage_dir)
1554 except OSError, err:
1555 logging.exception("Cannot remove file storage directory '%s'",
# RenameFileStorageDir: rename one validated file storage directory to
# another; refuses when the destination already exists.
# NOTE(review): the return statements and the try: statement are in elided
# lines of this listing.
1561 def RenameFileStorageDir(old_file_storage_dir, new_file_storage_dir):
1562 """Rename the file storage directory.
1565 old_file_storage_dir: string containing the old path
1566 new_file_storage_dir: string containing the new path
1569 tuple with first element a boolean indicating whether dir
1570 rename was successful or not
# Both endpoints must lie inside the cluster's base file storage dir.
1573 old_file_storage_dir = _TransformFileStorageDir(old_file_storage_dir)
1574 new_file_storage_dir = _TransformFileStorageDir(new_file_storage_dir)
1576 if not old_file_storage_dir or not new_file_storage_dir:
# Only rename when the destination does not exist yet and the source is a
# real directory.
1579 if not os.path.exists(new_file_storage_dir):
1580 if os.path.isdir(old_file_storage_dir):
1582 os.rename(old_file_storage_dir, new_file_storage_dir)
1583 except OSError, err:
1584 logging.exception("Cannot rename '%s' to '%s'",
1585 old_file_storage_dir, new_file_storage_dir)
1588 logging.error("'%s' is not a directory", old_file_storage_dir)
# If both source and destination still exist the rename is refused.
1591 if os.path.exists(old_file_storage_dir):
1592 logging.error("Cannot rename '%s' to '%s'. Both locations exist.",
1593 old_file_storage_dir, new_file_storage_dir)
# CloseBlockDevices: switch each disk's backing device to secondary mode
# (meaningful for DRBD); returns a (bool, message) tuple.
# NOTE(review): the msg-list initialization, the "for ... in disks:" loop
# header and the per-device Close() call fall in elided lines of this
# listing.
1598 def CloseBlockDevices(disks):
1599 """Closes the given block devices.
1601 This means they will be switched to secondary mode (in case of DRBD).
# Resolve the configuration disk object to the live block device.
1606 rd = _RecursiveFindBD(cf)
1608 return (False, "Can't find device %s" % cf)
# Collect per-device failures and report them joined if any occurred.
1615 except errors.BlockDeviceError, err:
1616 msg.append(str(err))
1618 return (False, "Can't make devices secondary: %s" % ",".join(msg))
1620 return (True, "All devices secondary")
# HooksRunner: executes the hook scripts configured for a given hooks path
# and phase; instantiated on the node side (ganeti-noded).
# NOTE(review): this listing elides several original lines (docstring
# text, try: statements, suffix assignments, continue statements); the
# numbering gaps mark them.
1623 class HooksRunner(object):
1626 This class is instantiated on the node side (ganeti-noded) and not on
# Only files whose names match this pattern are run (run-parts style:
# letters, digits, underscore and dash only).
1630 RE_MASK = re.compile("^[a-zA-Z0-9_-]+$")
1632 def __init__(self, hooks_base_dir=None):
1633 """Constructor for hooks runner.
1636 - hooks_base_dir: if not None, this overrides the
1637 constants.HOOKS_BASE_DIR (useful for unittests)
1640 if hooks_base_dir is None:
1641 hooks_base_dir = constants.HOOKS_BASE_DIR
# Base directory under which the per-hook "<hpath>-<suffix>.d" dirs live.
1642 self._BASE_DIR = hooks_base_dir
# NOTE(review): no self/cls parameter - presumably decorated with
# @staticmethod on the elided preceding line (1644); confirm.
1645 def ExecHook(script, env):
1646 """Exec one hook script.
1649 - script: the full path to the script
1650 - env: the environment with which to exec the script
1653 # exec the process using subprocess and log the output
# Run the hook with stdin from /dev/null, stderr folded into stdout, from
# the filesystem root and with the supplied environment.
1656 fdstdin = open("/dev/null", "r")
1657 child = subprocess.Popen([script], stdin=fdstdin, stdout=subprocess.PIPE,
1658 stderr=subprocess.STDOUT, close_fds=True,
1659 shell=False, cwd="/", env=env)
# Capture at most 4 KiB of combined output from the hook.
1662 output = child.stdout.read(4096)
1663 child.stdout.close()
1664 except EnvironmentError, err:
1665 output += "Hook script error: %s" % str(err)
# Wait for the child; an EINTR-interrupted wait is handled below.
1669 result = child.wait()
1671 except EnvironmentError, err:
1672 if err.errno == errno.EINTR:
1676 # try not to leak fds
1677 for fd in (fdstdin, ):
1681 except EnvironmentError, err:
1682 # just log the error
1683 #logging.exception("Error while closing fd %s", fd)
# Success is exit status 0; the captured output is returned alongside.
1686 return result == 0, output
1688 def RunHooks(self, hpath, phase, env):
1689 """Run the scripts in the hooks directory.
1691 This method will not be usually overriden by child opcodes.
# Map the phase constant to a directory-name suffix (the suffix
# assignments are on elided lines 1695/1697); unknown phases are a
# programmer error.
1694 if phase == constants.HOOKS_PHASE_PRE:
1696 elif phase == constants.HOOKS_PHASE_POST:
1699 raise errors.ProgrammerError("Unknown hooks phase: '%s'" % phase)
1702 subdir = "%s-%s.d" % (hpath, suffix)
1703 dir_name = "%s/%s" % (self._BASE_DIR, subdir)
1705 dir_contents = utils.ListVisibleFiles(dir_name)
# A missing/unreadable hooks directory is handled, not propagated.
1706 except OSError, err:
1710 # we use the standard python sort order,
1711 # so 00name is the recommended naming scheme
1713 for relname in dir_contents:
1714 fname = os.path.join(dir_name, relname)
# Skip anything that is not a regular, executable file with a safe name.
1715 if not (os.path.isfile(fname) and os.access(fname, os.X_OK) and
1716 self.RE_MASK.match(relname) is not None):
1717 rrval = constants.HKR_SKIP
1720 result, output = self.ExecHook(fname, env)
1722 rrval = constants.HKR_FAIL
1724 rrval = constants.HKR_SUCCESS
# Accumulate (script, status, output) triples as the method's result.
1725 rr.append(("%s/%s" % (subdir, relname), rrval, output))
# IAllocatorRunner: runs an external instance-allocator plugin against an
# input file and collects its output; node-side helper.
1730 class IAllocatorRunner(object):
1731 """IAllocator runner.
1733 This class is instantiated on the node side (ganeti-noded) and not on
1737 def Run(self, name, idata):
1738 """Run an iallocator script.
1740 Return value: tuple of:
1741 - run status (one of the IARUN_ constants)
1744 - fail reason (as from utils.RunResult)
# Locate the allocator by name in the configured search path (the
# remaining FindFile arguments are on the elided line 1748).
1747 alloc_script = utils.FindFile(name, constants.IALLOCATOR_SEARCH_PATH,
1749 if alloc_script is None:
1750 return (constants.IARUN_NOTFOUND, None, None, None)
# The input data goes into a temp file that is passed to the script as
# its only argument (the write/close and cleanup statements fall in the
# elided lines 1753-1762 of this listing).
1752 fd, fin_name = tempfile.mkstemp(prefix="ganeti-iallocator.")
1756 result = utils.RunCmd([alloc_script, fin_name])
1758 return (constants.IARUN_FAILURE, result.stdout, result.stderr,
1763 return (constants.IARUN_SUCCESS, result.stdout, result.stderr, None)
# DevCacheManager: persists one small "owner state iv_name" record per
# block device under BDEV_CACHE_DIR; the cache is strictly best-effort
# (all filesystem errors are logged, never propagated).
1766 class DevCacheManager(object):
1767 """Simple class for managing a cache of block device information.
1770 _DEV_PREFIX = "/dev/"
1771 _ROOT_DIR = constants.BDEV_CACHE_DIR
# NOTE(review): takes cls - presumably decorated @classmethod on the
# elided preceding line (1773); same for the two methods below.
1774 def _ConvertPath(cls, dev_path):
1775 """Converts a /dev/name path to the cache file name.
1777 This replaces slashes with underscores and strips the /dev
1778 prefix. It then returns the full path to the cache file
# e.g. "/dev/md/1" -> "<_ROOT_DIR>/bdev_md_1"
1781 if dev_path.startswith(cls._DEV_PREFIX):
1782 dev_path = dev_path[len(cls._DEV_PREFIX):]
1783 dev_path = dev_path.replace("/", "_")
1784 fpath = "%s/bdev_%s" % (cls._ROOT_DIR, dev_path)
1788 def UpdateCache(cls, dev_path, owner, on_primary, iv_name):
1789 """Updates the cache information for a given device.
# A None path is logged and ignored rather than raising.
1792 if dev_path is None:
1793 logging.error("DevCacheManager.UpdateCache got a None dev_path")
1795 fpath = cls._ConvertPath(dev_path)
# "state" is derived from on_primary in the elided lines 1796-1800; a
# missing iv_name is recorded with the sentinel below.
1801 iv_name = "not_visible"
1802 fdata = "%s %s %s\n" % (str(owner), state, iv_name)
1804 utils.WriteFile(fpath, data=fdata)
1805 except EnvironmentError, err:
1806 logging.exception("Can't update bdev cache for %s", dev_path)
1809 def RemoveCache(cls, dev_path):
1810 """Remove data for a dev_path.
1813 if dev_path is None:
1814 logging.error("DevCacheManager.RemoveCache got a None dev_path")
1816 fpath = cls._ConvertPath(dev_path)
1818 utils.RemoveFile(fpath)
1819 except EnvironmentError, err:
# NOTE(review): this message says "update" on the remove path - it looks
# copy-pasted from UpdateCache; changing it alters a runtime string, so
# it is only flagged here.
1820 logging.exception("Can't update bdev cache for %s", dev_path)