4 # Copyright (C) 2006, 2007, 2011 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Program which configures LVM on the Ganeti nodes.
24 This program wipes disks and creates a volume group on top of them. It
25 can also show disk information to help you decide which disks you want
28 The error handling is done by raising our own exceptions from most of
29 the functions; these exceptions are then handled globally in the main()
30 function. The exceptions that each function can raise are not
31 documented individually, since almost every error path ends in a
34 Another two exceptions that are handled globally are IOError and
35 OSError. The idea behind this is, since we run as root, we should
36 usually not get these errors, but if we do it's most probably a system
37 error, so they should be handled and the user instructed to report
50 from ganeti.utils import RunCmd, ReadFile
51 from ganeti import constants
52 from ganeti import cli
53 from ganeti import compat
55 USAGE = ("\tlvmstrap diskinfo\n"
56 "\tlvmstrap [--vg-name=NAME] [--allow-removable]"
57 " { --alldisks | --disks DISKLIST } [--use-sfdisk]"
62 #: Supported disk types (as prefixes)
70 #: Excluded filesystem types
71 EXCLUDED_FS = frozenset([
#: A regular expression that matches partition names and captures the
#: name of the disk they belong to (must be kept in sync with
#: L{SUPPORTED_TYPES})
PART_RE = re.compile(r"^((?:h|s|m|ub)d[a-z]{1,2})[0-9]+$")

#: Minimum partition size to be considered (1 GiB, expressed in bytes)
PART_MINSIZE = 1024 ** 3

#: Disk size (in bytes) above which an MBR partition table is not used
MBR_MAX_SIZE = 2 * 1000 ** 4
class Error(Exception):
  """Base class for the exceptions defined by this program."""
class ProgrammingError(Error):
  """Signals that an assumption made by this program does not hold.

  This covers changes in the sysfs tree layout and any other wrong
  assumption about the contents of the /sys/block/... directories.

  """
class SysconfigError(Error):
  """Signals a broken system configuration.

  Raised when the system is set up in an unexpected way, e.g. device
  files are missing from /dev, or they carry major/minor numbers that
  differ from what the /sys/block devices report.

  This usually means that some step of the Xen node installation
  failed.

  """
class PrereqError(Error):
  """Signals that the prerequisites for running are not met.

  Raised when the node does not satisfy the requirements for cluster
  membership: a wrong kernel version, no free disks and similar.

  This usually means that the build steps for the Xen node were not
  followed correctly.

  """
class OperationalError(Error):
  """Signals an actual failure while doing the work.

  Errors happening during the bootstrapping itself are reported with
  this exception.

  """
class ParameterError(Error):
  """Signals invalid input from the user.

  Wrong disks given as parameters are reported with this exception.

  """
153 """Shows program usage information and exits the program.
156 print >> sys.stderr, "Usage:"
157 print >> sys.stderr, USAGE
162 """Parses the command line options.
164 In case of command line errors, it will show the usage and exit the
168 @return: a tuple of (options, args), as returned by
169 OptionParser.parse_args
172 global verbose_flag # pylint: disable-msg=W0603
174 parser = optparse.OptionParser(usage="\n%s" % USAGE,
175 version="%%prog (ganeti) %s" %
176 constants.RELEASE_VERSION)
178 parser.add_option("--alldisks", dest="alldisks",
179 help="erase ALL disks", action="store_true",
181 parser.add_option("-d", "--disks", dest="disks",
182 help="Choose disks (e.g. hda,hdg)",
184 parser.add_option(cli.VERBOSE_OPT)
185 parser.add_option("-r", "--allow-removable",
186 action="store_true", dest="removable_ok", default=False,
187 help="allow and use removable devices too")
188 parser.add_option("-g", "--vg-name", type="string",
189 dest="vgname", default="xenvg", metavar="NAME",
190 help="the volume group to be created [default: xenvg]")
191 parser.add_option("--use-sfdisk", dest="use_sfdisk",
192 action="store_true", default=False,
193 help="use sfdisk instead of parted")
196 options, args = parser.parse_args()
200 verbose_flag = options.verbose
def IsPartitioned(disk):
  """Tells whether a device should be used partitioned or as-is.

  Devices whose name starts with C{md}, and devices whose name matches
  L{PART_RE} (i.e. which are partitions themselves), are used as-is;
  every other device is used partitioned.

  @param disk: the device name, e.g. sda
  @rtype: boolean
  @return: False if the device is used as-is, True otherwise

  """
  if disk.startswith('md'):
    return False
  return not PART_RE.match(disk)
def DeviceName(disk):
  """Builds the /dev path to be used for a disk.

  Devices which are used partitioned (see L{IsPartitioned}) are
  represented by their first partition; all the others by the device
  itself.

  @param disk: the device name, e.g. sda
  @return: the path under /dev

  """
  if IsPartitioned(disk):
    return '/dev/%s1' % disk
  return '/dev/%s' % disk
229 """Returns the sysfs name for a disk or partition.
232 match = PART_RE.match(disk)
234 # this is a partition, which resides in /sys/block under a different name
235 disk = "%s/%s" % (match.group(1), disk)
236 return "/sys/block/%s" % disk
def ExecCommand(command):
  """Executes a command.

  This is just a wrapper around L{RunCmd}, with the difference that if
  the command line argument -v has been given, it will print the
  command line and the command output on stdout.

  @param command: the command line to be executed
  @return: the result object returned by L{RunCmd}; the callers in
      this file read its C{failed}, C{stdout}, C{stderr} and C{output}
      attributes

  """
  if verbose_flag:
    print(command)
  result = RunCmd(command)
  if verbose_flag:
    print(result.output)
  return result
261 """Check the prerequisites of this program.
263 It checks that it runs on Linux 2.6, and that /sys is mounted and the
264 fact that /sys/block is a directory.
268 raise PrereqError("This tool runs as root only. Really.")
270 osname, _, release, _, _ = os.uname()
271 if osname != 'Linux':
272 raise PrereqError("This tool only runs on Linux"
273 " (detected OS: %s)." % osname)
275 if not release.startswith("2.6."):
276 raise PrereqError("Wrong major kernel version (detected %s, needs"
279 if not os.path.ismount("/sys"):
280 raise PrereqError("Can't find a filesystem mounted at /sys."
281 " Please mount /sys.")
283 if not os.path.isdir("/sys/block"):
284 raise SysconfigError("Can't find /sys/block directory. Has the"
285 " layout of /sys changed?")
287 if not os.path.ismount("/proc"):
288 raise PrereqError("Can't find a filesystem mounted at /proc."
289 " Please mount /proc.")
291 if not os.path.exists("/proc/mounts"):
292 raise SysconfigError("Can't find /proc/mounts")
def CheckVGExists(vgname):
  """Looks up a volume group and reports its basic figures.

  @param vgname: the volume group name

  @return: a four-tuple (exists, lv_count, vg_size, vg_free), where:
      - exists: True if the volume group exists, otherwise False; if
        False, all other members of the tuple are None
      - lv_count: the number of logical volumes in the volume group
      - vg_size: the total size of the volume group (in gibibytes)
      - vg_free: the available space in the volume group
  @raise PrereqError: if the output of vgs cannot be parsed

  """
  cmd = ("vgs --nohead -o lv_count,vg_size,vg_free"
         " --nosuffix --units g"
         " --ignorelockingfailure %s" % vgname)
  result = ExecCommand(cmd)
  if result.failed:
    return False, None, None, None

  fields = result.stdout.strip().split()
  if len(fields) != 3:
    # This means the output of vgs can't be parsed
    raise PrereqError("cannot parse output of vgs (%s)" % result.stdout)

  lv_count, vg_size, vg_free = fields
  return True, lv_count, vg_size, vg_free
323 def CheckSysDev(name, devnum):
324 """Checks consistency between /sys and /dev trees.
326 In /sys/block/<name>/dev and /sys/block/<name>/<part>/dev are the
327 kernel-known device numbers. The /dev/<name> block/char devices are
328 created by userspace and thus could differ from the kernel
329 view. This function checks the consistency between the device number
330 read from /sys and the actual device number in /dev.
332 Note that since the system could be using udev which removes and
333 recreates the device nodes on partition table rescan, we need to do
334 some retries here. Since we only do a stat, we can afford to do many
337 @param name: the device name, e.g. 'sda'
338 @param devnum: the device number, e.g. 0x803 (2051 in decimal) for sda3
339 @raises L{SysconfigError}: in case of failure of the check
342 path = "/dev/%s" % name
344 if os.path.exists(path):
348 raise SysconfigError("the device file %s does not exist, but the block"
349 " device exists in the /sys/block tree" % path)
350 rdev = os.stat(path).st_rdev
352 raise SysconfigError("For device %s, the major:minor in /dev is %04x"
353 " while the major:minor in sysfs is %s" %
354 (path, rdev, devnum))
def ReadDev(syspath):
  """Reads the device number from a sysfs path.

  The device number is given in sysfs under a block device directory
  in a file named 'dev' which contains major:minor (in ASCII). This
  function reads that file and converts the major:minor pair to a dev
  number.

  @type syspath: string
  @param syspath: the path to a block device dir in sysfs,
      e.g. C{/sys/block/sda}
  @return: the device number
  @raise ProgrammingError: if the 'dev' file does not exist, or if its
      contents are not a major:minor pair

  """
  devpath = "%s/dev" % syspath
  if not os.path.exists(devpath):
    raise ProgrammingError("Invalid path passed to ReadDev: %s" % syspath)

  f = open(devpath)
  try:
    data = f.read().strip()
  finally:
    # do not leak the file handle if the read fails
    f.close()

  try:
    major, minor = [int(part) for part in data.split(":", 1)]
  except ValueError:
    # unexpected file contents mean our assumptions about sysfs are wrong;
    # report them through our own exception so they are handled globally
    raise ProgrammingError("Cannot parse device number '%s' read from %s" %
                           (data, devpath))
  return os.makedev(major, minor)
def ReadSize(syspath):
  """Reads the size from a sysfs path.

  The size is given in sysfs under a block device directory in a file
  named 'size' which contains the number of sectors (in ASCII). This
  function reads that file and converts the number of sectors to the
  size in bytes, counting 512 bytes per sector.

  @type syspath: string
  @param syspath: the path to a block device dir in sysfs,
      e.g. C{/sys/block/sda}
  @return: the device size in bytes
  @raise ProgrammingError: if the 'size' file does not exist, or if its
      contents are not a number

  """
  sizepath = "%s/size" % syspath
  if not os.path.exists(sizepath):
    raise ProgrammingError("Invalid path passed to ReadSize: %s" % syspath)

  f = open(sizepath)
  try:
    data = f.read().strip()
  finally:
    # do not leak the file handle if the read fails
    f.close()

  try:
    sectors = int(data)
  except ValueError:
    # unexpected file contents mean our assumptions about sysfs are wrong;
    # report them through our own exception so they are handled globally
    raise ProgrammingError("Cannot parse size '%s' read from %s" %
                           (data, sizepath))
  return 512 * sectors
411 """Reads physical volume information.
413 This function tries to see if a block device is a physical volume.
416 @param name: the device name (e.g. sda)
418 @return: the name of the volume group to which this PV belongs, or
419 "" if this PV is not in use, or None if this is not a PV
422 result = ExecCommand("pvdisplay -c /dev/%s" % name)
425 vgname = result.stdout.strip().split(":")[1]
429 def GetDiskList(opts):
430 """Computes the block device list for this system.
432 This function examines the /sys/block tree and using information
433 therein, computes the status of the block device.
435 @return: a list like [(name, size, dev, partitions, inuse), ...], where:
436 - name is the block device name (e.g. sda)
437 - size the size in bytes
438 - dev is the device number (e.g. 8704 for hdg)
439 - partitions is [(name, size, dev), ...] mirroring the disk list
440 data inuse is a boolean showing the in-use status of the disk,
441 computed as the possibility of re-reading the partition table
442 (the meaning of the operation varies with the kernel version,
443 but is usually accurate; a mounted disk/partition or swap-area
444 or PV with active LVs on it is busy)
448 for name in os.listdir("/sys/block"):
449 if not compat.any([name.startswith(pfx) for pfx in SUPPORTED_TYPES]):
452 disksysfsname = "/sys/block/%s" % name
453 size = ReadSize(disksysfsname)
455 f = open("/sys/block/%s/removable" % name)
456 removable = int(f.read().strip())
459 if removable and not opts.removable_ok:
462 dev = ReadDev(disksysfsname)
463 CheckSysDev(name, dev)
465 # Enumerate partitions of the block device
467 for partname in os.listdir(disksysfsname):
468 if not partname.startswith(name):
470 partsysfsname = "%s/%s" % (disksysfsname, partname)
471 partdev = ReadDev(partsysfsname)
472 partsize = ReadSize(partsysfsname)
473 if partsize >= PART_MINSIZE:
474 CheckSysDev(partname, partdev)
475 partinuse = InUse(partname)
476 partitions.append((partname, partsize, partdev, partinuse))
478 dlist.append((name, size, dev, partitions, inuse))
484 """Reads /proc/mounts and computes the mountpoint-devnum mapping.
486 This function reads /proc/mounts, finds the mounted filesystems
487 (excepting a hard-coded blacklist of network and virtual
488 filesystems) and does a stat on these mountpoints. The st_dev number
489 of the results is memorised for later matching against the
493 @return: a {mountpoint: device number} dictionary
496 mountlines = ReadFile("/proc/mounts").splitlines()
498 for line in mountlines:
499 _, mountpoint, fstype, _ = line.split(None, 3)
501 if fstype in EXCLUDED_FS:
504 dev = os.stat(mountpoint).st_dev
506 # this should be a fairly rare error, since we are blacklisting
507 # network filesystems; with this in mind, we'll ignore it,
508 # since the rereadpt check catches in-use filesystems,
509 # and this is used for disk information only
510 print >> sys.stderr, ("Can't stat mountpoint '%s': %s" %
512 print >> sys.stderr, "Ignoring."
514 mounts[dev] = mountpoint
519 """Reads /proc/swaps and returns the list of swap backing stores.
522 swaplines = ReadFile("/proc/swaps").splitlines()[1:]
523 return [line.split(None, 1)[0] for line in swaplines]
def DevInfo(name, dev, mountinfo):
  """Computes miscellaneous information about a block device.

  @type name: string
  @param name: the device name, e.g. sda
  @param dev: the device number, used as lookup key in C{mountinfo}
  @param mountinfo: a mapping of device numbers to mount paths

  @return: a tuple (mpath, whatvg, fileinfo), where:
      - mpath is the mount path where this device is mounted or None
      - whatvg is the result of the ReadPV function
      - fileinfo is the first 45 characters of the output of file -bs
        on the device, or an error marker if that command failed

  """
  if dev in mountinfo:
    mpath = mountinfo[dev]
  else:
    mpath = None

  whatvg = ReadPV(name)

  result = ExecCommand("file -bs /dev/%s" % name)
  if result.failed:
    fileinfo = "<error: %s>" % result.stderr
  else:
    # only use the command output on success, otherwise the error marker
    # computed above would be overwritten
    fileinfo = result.stdout[:45]
  return mpath, whatvg, fileinfo
552 def ShowDiskInfo(opts):
553 """Shows a nicely formatted block device list for this system.
555 This function shows the user a table with the information gathered
556 by the other functions defined, in order to help the user make a
557 choice about which disks should be allocated to our volume group.
566 mounts = GetMountInfo()
567 dlist = GetDiskList(opts)
569 print "------- Disk information -------"
578 fields = ["name", "size", "used", "mount", "lvm", "info"]
581 # Flatten the [(disk, [partition,...]), ...] list
582 for name, size, dev, parts, inuse in dlist:
583 flatlist.append((name, size, dev, _inuse(inuse)))
584 for partname, partsize, partdev, partinuse in parts:
585 flatlist.append((partname, partsize, partdev, _inuse(partinuse)))
588 for name, size, dev, in_use in flatlist:
589 mp, vgname, fileinfo = DevInfo(name, dev, mounts)
597 lvminfo = "in %s" % vgname
603 strlist.append([name, "%.2f" % (float(size) / 1024 / 1024),
604 in_use, mp, lvminfo, fileinfo])
606 data = cli.GenerateTable(headers, fields, None,
607 strlist, numfields=["size"])
def CheckSysfsHolders(name):
  """Checks whether a device is held by another one at sysfs level.

  This is usually the case for Physical Volumes under LVM.

  @param name: the device name, e.g. sda
  @rtype: boolean
  @return: true if the device is available according to sysfs, i.e. its
      C{holders} directory is empty or does not exist

  """
  try:
    holders = os.listdir("%s/holders/" % SysfsName(name))
  except OSError:
    # the exception object is fetched via sys.exc_info() so that this
    # does not depend on a particular "except" binding syntax
    if sys.exc_info()[1].errno != errno.ENOENT:
      raise
    # a missing directory means that nothing holds the device
    return True
  return not holders
632 def CheckReread(name):
633 """Check to see if a block device is in use.
635 Uses blockdev to reread the partition table of a block device (or
636 fuser if the device is not partitionable), and thus compute the
637 in-use status. See the discussion in GetDiskList about the meaning
641 @return: the in-use status of the device
644 use_blockdev = IsPartitioned(name)
646 cmd = "blockdev --rereadpt /dev/%s" % name
648 cmd = "fuser -vam /dev/%s" % name
651 result = ExecCommand(cmd)
652 if not use_blockdev and result.failed:
654 elif use_blockdev and not result.failed:
659 return not result.failed
def CheckMounted(name):
  """Checks that a block device is not mounted.

  In recent distros/kernels, this is reported directly via fuser, but
  on older ones not, so we do an additional check here (manually).

  @param name: the device name, e.g. sda
  @rtype: boolean
  @return: True if the device number of the device is not among the
      ones computed by L{GetMountInfo}, False if it is

  """
  mounted_devs = GetMountInfo()
  devnum = ReadDev(SysfsName(name))
  if devnum in mounted_devs:
    return False
  return True
677 """Check to see if a block device is being used as swap.
680 name = "/dev/%s" % name
681 return name not in GetSwapInfo()
685 """Returns if a disk is in use or not.
688 return not (CheckSysfsHolders(name) and CheckReread(name) and
689 CheckMounted(name) and CheckSwap(name))
693 """Wipes a block device.
695 This function wipes a block device, by clearing and re-reading the
696 partition table. If not successful, it writes back the old partition
697 data, and leaves the cleanup to the user.
699 @param name: the device name (e.g. sda)
704 raise OperationalError("CRITICAL: disk %s you selected seems to be in"
705 " use. ABORTING!" % name)
707 fd = os.open("/dev/%s" % name, os.O_RDWR | os.O_SYNC)
708 olddata = os.read(fd, 512)
709 if len(olddata) != 512:
710 raise OperationalError("CRITICAL: Can't read partition table information"
711 " from /dev/%s (needed 512 bytes, got %d" %
712 (name, len(olddata)))
715 bytes_written = os.write(fd, newdata)
717 if bytes_written != 512:
718 raise OperationalError("CRITICAL: Can't write partition table information"
719 " to /dev/%s (tried to write 512 bytes, written"
720 " %d. I don't know how to cleanup. Sorry." %
721 (name, bytes_written))
724 # try to restore the data
725 fd = os.open("/dev/%s" % name, os.O_RDWR | os.O_SYNC)
726 os.write(fd, olddata)
728 raise OperationalError("CRITICAL: disk %s which I have just wiped cannot"
729 " reread partition table. Most likely, it is"
730 " in use. You have to clean after this yourself."
731 " I tried to restore the old partition table,"
732 " but I cannot guarantee nothing has broken." %
736 def PartitionDisk(name, use_sfdisk):
737 """Partitions a disk.
739 This function creates a single partition spanning the entire disk,
742 @param name: the device name, e.g. sda
746 # Check that parted exists
747 result = ExecCommand("parted --help")
750 print >> sys.stderr, ("Unable to execute \"parted --help\","
751 " falling back to sfdisk.")
753 # Check disk size - over 2TB means we need to use GPT
754 size = ReadSize("/sys/block/%s" % name)
755 if size > MBR_MAX_SIZE:
758 raise OperationalError("Critical: Disk larger than 2TB detected, but"
759 " parted is either not installed or --use-sfdisk"
760 " has been specified")
765 result = ExecCommand(
766 "echo ,,8e, | sfdisk /dev/%s" % name)
768 raise OperationalError("CRITICAL: disk %s which I have just partitioned"
769 " cannot reread its partition table, or there"
770 " is some other sfdisk error. Likely, it is in"
771 " use. You have to clean this yourself. Error"
772 " message from sfdisk: %s" %
773 (name, result.output))
776 result = ExecCommand("parted -s /dev/%s mklabel %s" % (name, label_type))
778 raise OperationalError("Critical: failed to create %s label on %s" %
780 result = ExecCommand("parted -s /dev/%s mkpart pri ext2 1 100%%" % name)
782 raise OperationalError("Critical: failed to create partition on %s" %
784 result = ExecCommand("parted -s /dev/%s set 1 lvm on" % name)
786 raise OperationalError("Critical: failed to set partition on %s to LVM" %
def CreatePVOnDisk(name):
  """Turns a block device into a physical volume.

  The physical volume is created overriding all warnings, so this can
  wipe existing PVs, including PVs which are part of a VG.

  @param name: the device name, e.g. sda
  @raise OperationalError: if pvcreate fails

  """
  pv_device = DeviceName(name)
  result = ExecCommand("pvcreate -yff %s" % pv_device)
  if not result.failed:
    return
  raise OperationalError("I cannot create a physical volume on"
                         " %s. Error message: %s."
                         " Please clean up yourself." %
                         (pv_device, result.output))
def CreateVG(vgname, disks):
  """Creates the volume group.

  The volume group named C{vgname} is built on top of the devices
  computed by L{DeviceName} for each of the given disks. The physical
  extent size is set to 64MB.

  @param vgname: the name of the volume group to create
  @param disks: a list of disk names, e.g. ['sda','sdb']
  @raise OperationalError: if vgcreate fails

  """
  pv_devices = " ".join([DeviceName(disk) for disk in disks])
  result = ExecCommand("vgcreate -s 64MB '%s' %s" % (vgname, pv_devices))
  if not result.failed:
    return
  raise OperationalError("I cannot create the volume group %s from"
                         " disks %s. Error message: %s. Please clean up"
                         " yourself." %
                         (vgname, " ".join(disks), result.output))
826 def ValidateDiskList(options):
827 """Validates or computes the disk list for create.
829 This function either computes the available disk list (if the user
830 gave --alldisks option), or validates the user-given disk list (by
831 using the --disks option) such that all given disks are present and
834 @param options: the options returned from OptParser.parse_options
836 @return: a list of disk names, e.g. ['sda', 'sdb']
839 sysdisks = GetDiskList(options)
841 raise PrereqError("no disks found (I looked for"
842 " non-removable block devices).")
845 for name, _, _, parts, used in sysdisks:
847 sysd_used.append(name)
848 for partname, _, _, partused in parts:
850 sysd_used.append(partname)
852 sysd_free.append(partname)
854 sysd_free.append(name)
857 raise PrereqError("no free disks found! (%d in-use disks)" %
862 disklist = options.disks.split(",")
863 for name in disklist:
864 if name in sysd_used:
865 raise ParameterError("disk %s is in use, cannot wipe!" % name)
866 if name not in sysd_free:
867 raise ParameterError("cannot find disk %s!" % name)
869 raise ParameterError("Please use either --alldisks or --disks!")
875 """Actual main routine.
880 options, args = ParseOptions()
881 vgname = options.vgname
882 command = args.pop(0)
883 if command == "diskinfo":
884 ShowDiskInfo(options)
886 if command != "create":
889 exists, lv_count, vg_size, vg_free = CheckVGExists(vgname)
891 raise PrereqError("It seems volume group '%s' already exists:\n"
892 " LV count: %s, size: %s, free: %s." %
893 (vgname, lv_count, vg_size, vg_free))
896 disklist = ValidateDiskList(options)
898 for disk in disklist:
900 if IsPartitioned(disk):
901 PartitionDisk(disk, options.use_sfdisk)
902 for disk in disklist:
904 CreateVG(vgname, disklist)
906 status, lv_count, size, _ = CheckVGExists(vgname)
908 print "Done! %s: size %s GiB, disks: %s" % (vgname, size,
911 raise OperationalError("Although everything seemed ok, the volume"
912 " group did not get created.")
916 """Application entry point.
918 This is just a wrapper over BootStrap, to handle our own exceptions.
923 except PrereqError, err:
924 print >> sys.stderr, "The prerequisites for running this tool are not met."
925 print >> sys.stderr, ("Please make sure you followed all the steps in"
926 " the build document.")
927 print >> sys.stderr, "Description: %s" % str(err)
929 except SysconfigError, err:
930 print >> sys.stderr, ("This system's configuration seems wrong, at"
931 " least is not what I expect.")
932 print >> sys.stderr, ("Please check that the installation didn't fail"
934 print >> sys.stderr, "Description: %s" % str(err)
936 except ParameterError, err:
937 print >> sys.stderr, ("Some parameters you gave to the program or the"
938 " invocation is wrong. ")
939 print >> sys.stderr, "Description: %s" % str(err)
941 except OperationalError, err:
942 print >> sys.stderr, ("A serious error has happened while modifying"
943 " the system's configuration.")
944 print >> sys.stderr, ("Please review the error message below and make"
945 " sure you clean up yourself.")
946 print >> sys.stderr, ("It is most likely that the system configuration"
947 " has been partially altered.")
948 print >> sys.stderr, str(err)
950 except ProgrammingError, err:
951 print >> sys.stderr, ("Internal application error. Please report this"
952 " to the Ganeti developer list.")
953 print >> sys.stderr, "Error description: %s" % str(err)
956 print >> sys.stderr, "Unhandled application error: %s" % err
958 except (IOError, OSError), err:
959 print >> sys.stderr, "I/O error detected, please report."
960 print >> sys.stderr, "Description: %s" % str(err)
964 if __name__ == "__main__":