Shared block storage support
authorApollon Oikonomopoulos <apollon@noc.grnet.gr>
Fri, 4 Mar 2011 14:35:23 +0000 (16:35 +0200)
committerIustin Pop <iustin@google.com>
Tue, 8 Mar 2011 12:00:39 +0000 (13:00 +0100)
This patch introduces basic shared block storage support.

It introduces a new storage backend, bdev.PersistentBlockDevice, to
use as a backend for shared block storage. The new bdev requires a new
BLOCKDEV_DRIVER_MANUAL constant with the value "manual" and uses it as
the first part of the block device unique_id.

A new disk template, DT_BLOCK is introduced as well and added to
DTS_EXT_MIRROR and DTS_MAY_ADOPT. Also added DTS_MUST_ADOPT constant
and use it to check for the presence of the adopt keyword during LU
invocation. We enforce the /dev/disk limitation upon adoption, but we
allow block devices to reside anywhere under /dev.

This is very basic support and includes no storage manipulation (provisioning,
resizing, renaming) which will have to be implemented through a "driver"
framework.

Signed-off-by: Apollon Oikonomopoulos <apollon@noc.grnet.gr>
[iustin@google.com: slight changes to bdev.py]
Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>

lib/bdev.py
lib/cmdlib.py
lib/constants.py
lib/objects.py

index 69797bc..2430b1d 100644 (file)
@@ -24,6 +24,7 @@
 import re
 import time
 import errno
+import stat
 import pyparsing as pyp
 import os
 import logging
@@ -2069,9 +2070,120 @@ class FileStorage(BlockDev):
     return FileStorage(unique_id, children, size)
 
 
+class PersistentBlockDevice(BlockDev):
+  """A block device with persistent node
+
+  May be either directly attached, or exposed through DM (e.g. dm-multipath).
+  udev helpers are probably required to give persistent, human-friendly
+  names.
+
+  For the time being, pathnames are required to lie under /dev.
+
+  """
+  def __init__(self, unique_id, children, size):
+    """Attaches to a static block device.
+
+    The unique_id is a path under /dev.
+
+    """
+    super(PersistentBlockDevice, self).__init__(unique_id, children, size)
+    if not isinstance(unique_id, (tuple, list)) or len(unique_id) != 2:
+      raise ValueError("Invalid configuration data %s" % str(unique_id))
+    self.dev_path = unique_id[1]
+    if not os.path.realpath(self.dev_path).startswith('/dev/'):
+      raise ValueError("Full path '%s' lies outside /dev" %
+                              os.path.realpath(self.dev_path))
+    # TODO: this is just a safety guard checking that we only deal with devices
+    # we know how to handle. In the future this will be integrated with
+    # external storage backends and possible values will probably be collected
+    # from the cluster configuration.
+    if unique_id[0] != constants.BLOCKDEV_DRIVER_MANUAL:
+      raise ValueError("Got persistent block device of invalid type: %s" %
+                       unique_id[0])
+
+    self.major = self.minor = None
+    self.Attach()
+
+  @classmethod
+  def Create(cls, unique_id, children, size):
+    """Create a new device
+
+    This is a noop, we only return a PersistentBlockDevice instance
+
+    """
+    return PersistentBlockDevice(unique_id, children, 0)
+
+  def Remove(self):
+    """Remove a device
+
+    This is a noop
+
+    """
+    pass
+
+  def Rename(self, new_id):
+    """Rename this device.
+
+    """
+    _ThrowError("Rename is not supported for PersistentBlockDev storage")
+
+  def Attach(self):
+    """Attach to an existing block device.
+
+
+    """
+    self.attached = False
+    try:
+      st = os.stat(self.dev_path)
+    except OSError, err:
+      logging.error("Error stat()'ing %s: %s", self.dev_path, str(err))
+      return False
+
+    if not stat.S_ISBLK(st.st_mode):
+      logging.error("%s is not a block device", self.dev_path)
+      return False
+
+    self.major = os.major(st.st_rdev)
+    self.minor = os.minor(st.st_rdev)
+    self.attached = True
+
+    return True
+
+  def Assemble(self):
+    """Assemble the device.
+
+    """
+    pass
+
+  def Shutdown(self):
+    """Shutdown the device.
+
+    """
+    pass
+
+  def Open(self, force=False):
+    """Make the device ready for I/O.
+
+    """
+    pass
+
+  def Close(self):
+    """Notifies that the device will no longer be used for I/O.
+
+    """
+    pass
+
+  def Grow(self, amount):
+    """Grow the logical volume.
+
+    """
+    _ThrowError("Grow is not supported for PersistentBlockDev storage")
+
+
 DEV_MAP = {
   constants.LD_LV: LogicalVolume,
   constants.LD_DRBD8: DRBD8,
+  constants.LD_BLOCKDEV: PersistentBlockDevice,
   }
 
 if constants.ENABLE_FILE_STORAGE or constants.ENABLE_SHARED_FILE_STORAGE:
index 29ed69a..be8e70c 100644 (file)
@@ -6661,6 +6661,19 @@ def _GenerateDiskTemplate(lu, template_name,
                                                          disk_index)),
                               mode=disk["mode"])
       disks.append(disk_dev)
+  elif template_name == constants.DT_BLOCK:
+    if len(secondary_nodes) != 0:
+      raise errors.ProgrammerError("Wrong template configuration")
+
+    for idx, disk in enumerate(disk_info):
+      disk_index = idx + base_index
+      disk_dev = objects.Disk(dev_type=constants.LD_BLOCKDEV, size=disk["size"],
+                              logical_id=(constants.BLOCKDEV_DRIVER_MANUAL,
+                                          disk["adopt"]),
+                              iv_name="disk/%d" % disk_index,
+                              mode=disk["mode"])
+      disks.append(disk_dev)
+
   else:
     raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
   return disks
@@ -6887,6 +6900,7 @@ def _ComputeDiskSize(disk_template, disks):
     constants.DT_DRBD8: sum(d["size"] + 128 for d in disks),
     constants.DT_FILE: None,
     constants.DT_SHARED_FILE: 0,
+    constants.DT_BLOCK: 0,
   }
 
   if disk_template not in req_size_dict:
@@ -7022,6 +7036,12 @@ class LUInstanceCreate(LogicalUnit):
       if self.op.mode == constants.INSTANCE_IMPORT:
         raise errors.OpPrereqError("Disk adoption not allowed for"
                                    " instance import", errors.ECODE_INVAL)
+    else:
+      if self.op.disk_template in constants.DTS_MUST_ADOPT:
+        raise errors.OpPrereqError("Disk template %s requires disk adoption,"
+                                   " but no 'adopt' parameter given" %
+                                   self.op.disk_template,
+                                   errors.ECODE_INVAL)
 
     self.adopt_disks = has_adopt
 
@@ -7614,7 +7634,7 @@ class LUInstanceCreate(LogicalUnit):
       req_sizes = _ComputeDiskSizePerVG(self.op.disk_template, self.disks)
       _CheckNodesFreeDiskPerVG(self, nodenames, req_sizes)
 
-    else: # instead, we must check the adoption data
+    elif self.op.disk_template == constants.DT_PLAIN: # Check the adoption data
       all_lvs = set([i["vg"] + "/" + i["adopt"] for i in self.disks])
       if len(all_lvs) != len(self.disks):
         raise errors.OpPrereqError("Duplicate volume names given for adoption",
@@ -7650,6 +7670,34 @@ class LUInstanceCreate(LogicalUnit):
       for dsk in self.disks:
         dsk["size"] = int(float(node_lvs[dsk["vg"] + "/" + dsk["adopt"]][0]))
 
+    elif self.op.disk_template == constants.DT_BLOCK:
+      # Normalize and de-duplicate device paths
+      all_disks = set([os.path.abspath(i["adopt"]) for i in self.disks])
+      if len(all_disks) != len(self.disks):
+        raise errors.OpPrereqError("Duplicate disk names given for adoption",
+                                   errors.ECODE_INVAL)
+      baddisks = [d for d in all_disks
+                  if not d.startswith(constants.ADOPTABLE_BLOCKDEV_ROOT)]
+      if baddisks:
+        raise errors.OpPrereqError("Device node(s) %s lie outside %s and"
+                                   " cannot be adopted" %
+                                   (", ".join(baddisks),
+                                    constants.ADOPTABLE_BLOCKDEV_ROOT),
+                                   errors.ECODE_INVAL)
+
+      node_disks = self.rpc.call_bdev_sizes([pnode.name],
+                                            list(all_disks))[pnode.name]
+      node_disks.Raise("Cannot get block device information from node %s" %
+                       pnode.name)
+      node_disks = node_disks.payload
+      delta = all_disks.difference(node_disks.keys())
+      if delta:
+        raise errors.OpPrereqError("Missing block device(s): %s" %
+                                   utils.CommaJoin(delta),
+                                   errors.ECODE_INVAL)
+      for dsk in self.disks:
+        dsk["size"] = int(float(node_disks[dsk["adopt"]]))
+
     _CheckHVParams(self, nodenames, self.op.hypervisor, self.op.hvparams)
 
     _CheckNodeHasOS(self, pnode.name, self.op.os_type, self.op.force_variant)
@@ -7721,17 +7769,18 @@ class LUInstanceCreate(LogicalUnit):
                             )
 
     if self.adopt_disks:
-      # rename LVs to the newly-generated names; we need to construct
-      # 'fake' LV disks with the old data, plus the new unique_id
-      tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
-      rename_to = []
-      for t_dsk, a_dsk in zip (tmp_disks, self.disks):
-        rename_to.append(t_dsk.logical_id)
-        t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk["adopt"])
-        self.cfg.SetDiskID(t_dsk, pnode_name)
-      result = self.rpc.call_blockdev_rename(pnode_name,
-                                             zip(tmp_disks, rename_to))
-      result.Raise("Failed to rename adoped LVs")
+      if self.op.disk_template == constants.DT_PLAIN:
+        # rename LVs to the newly-generated names; we need to construct
+        # 'fake' LV disks with the old data, plus the new unique_id
+        tmp_disks = [objects.Disk.FromDict(v.ToDict()) for v in disks]
+        rename_to = []
+        for t_dsk, a_dsk in zip (tmp_disks, self.disks):
+          rename_to.append(t_dsk.logical_id)
+          t_dsk.logical_id = (t_dsk.logical_id[0], a_dsk["adopt"])
+          self.cfg.SetDiskID(t_dsk, pnode_name)
+        result = self.rpc.call_blockdev_rename(pnode_name,
+                                               zip(tmp_disks, rename_to))
+        result.Raise("Failed to rename adoped LVs")
     else:
       feedback_fn("* creating instance disks...")
       try:
index 033de03..816ad17 100644 (file)
@@ -120,6 +120,7 @@ CRYPTO_KEYS_DIR = RUN_GANETI_DIR + "/crypto"
 CRYPTO_KEYS_DIR_MODE = SECURE_DIR_MODE
 IMPORT_EXPORT_DIR = RUN_GANETI_DIR + "/import-export"
 IMPORT_EXPORT_DIR_MODE = 0755
+ADOPTABLE_BLOCKDEV_ROOT = "/dev/disk/"
 # keep RUN_GANETI_DIR first here, to make sure all get created when the node
 # daemon is started (this takes care of RUN_DIR being tmpfs)
 SUB_RUN_DIRS = [ RUN_GANETI_DIR, BDEV_CACHE_DIR, DISK_LINKS_DIR ]
@@ -363,21 +364,25 @@ DT_PLAIN = "plain"
 DT_DRBD8 = "drbd"
 DT_FILE = "file"
 DT_SHARED_FILE = "sharedfile"
+DT_BLOCK = "blockdev"
 
 # the set of network-mirrored disk templates
 DTS_NET_MIRROR = frozenset([DT_DRBD8])
 
-# the set of externally mirrored disk templates
-DTS_EXT_MIRROR = frozenset([DT_SHARED_FILE])
+# the set of externally-mirrored disk templates (e.g. SAN, NAS)
+DTS_EXT_MIRROR = frozenset([DT_SHARED_FILE, DT_BLOCK])
 
 # the set of non-lvm-based disk templates
-DTS_NOT_LVM = frozenset([DT_DISKLESS, DT_FILE, DT_SHARED_FILE])
+DTS_NOT_LVM = frozenset([DT_DISKLESS, DT_FILE, DT_SHARED_FILE, DT_BLOCK])
 
 # the set of disk templates which can be grown
 DTS_GROWABLE = frozenset([DT_PLAIN, DT_DRBD8, DT_FILE, DT_SHARED_FILE])
 
 # the set of disk templates that allow adoption
-DTS_MAY_ADOPT = frozenset([DT_PLAIN])
+DTS_MAY_ADOPT = frozenset([DT_PLAIN, DT_BLOCK])
+
+# the set of disk templates that *must* use adoption
+DTS_MUST_ADOPT = frozenset([DT_BLOCK])
 
 # the set of disk templates that allow migrations
 DTS_MIRRORED = frozenset.union(DTS_NET_MIRROR, DTS_EXT_MIRROR)
@@ -387,7 +392,8 @@ DTS_MIRRORED = frozenset.union(DTS_NET_MIRROR, DTS_EXT_MIRROR)
 LD_LV = "lvm"
 LD_DRBD8 = "drbd8"
 LD_FILE = "file"
-LDS_BLOCK = frozenset([LD_LV, LD_DRBD8])
+LD_BLOCKDEV = "blockdev"
+LDS_BLOCK = frozenset([LD_LV, LD_DRBD8, LD_BLOCKDEV])
 
 # drbd constants
 DRBD_HMAC_ALG = "md5"
@@ -460,7 +466,7 @@ RIE_CONNECT_RETRIES = 10
 CHILD_LINGER_TIMEOUT = 5.0
 
 DISK_TEMPLATES = frozenset([DT_DISKLESS, DT_PLAIN, DT_DRBD8,
-                            DT_FILE, DT_SHARED_FILE])
+                            DT_FILE, DT_SHARED_FILE, DT_BLOCK])
 
 FILE_DRIVER = frozenset([FD_LOOP, FD_BLKTAP])
 
@@ -1312,3 +1318,6 @@ VALID_ALLOC_POLICIES = [
   ALLOC_POLICY_LAST_RESORT,
   ALLOC_POLICY_UNALLOCABLE,
   ]
+
+# Temporary external/shared storage parameters
+BLOCKDEV_DRIVER_MANUAL = "manual"
index ef99349..fb7e323 100644 (file)
@@ -441,6 +441,8 @@ class Disk(ConfigObject):
     """
     if self.dev_type == constants.LD_LV:
       return "/dev/%s/%s" % (self.logical_id[0], self.logical_id[1])
+    elif self.dev_type == constants.LD_BLOCKDEV:
+      return self.logical_id[1]
     return None
 
   def ChildrenNeeded(self):
@@ -483,7 +485,8 @@ class Disk(ConfigObject):
     devices needs to (or can) be assembled.
 
     """
-    if self.dev_type in [constants.LD_LV, constants.LD_FILE]:
+    if self.dev_type in [constants.LD_LV, constants.LD_FILE,
+                         constants.LD_BLOCKDEV]:
       result = [node]
     elif self.dev_type in constants.LDS_DRBD:
       result = [self.logical_id[0], self.logical_id[1]]