@return: Decompressed data
"""
+ assert isinstance(data, (list, tuple))
assert len(data) == 2
(encoding, content) = data
if encoding == constants.RPC_ENCODING_NONE:
raise AssertionError("Unknown data encoding")
-def _CleanDirectory(path, exclude=[]):
+def _CleanDirectory(path, exclude=None):
"""Removes all regular files in a directory.
@type path: str
@type exclude: list
@param exclude: list of files to be excluded, defaults
to the empty list
- @rtype: None
"""
if not os.path.isdir(path):
return
-
- # Normalize excluded paths
- exclude = [os.path.normpath(i) for i in exclude]
+ if exclude is None:
+ exclude = []
+ else:
+ # Normalize excluded paths
+ exclude = [os.path.normpath(i) for i in exclude]
for rel_name in utils.ListVisibleFiles(path):
full_name = os.path.normpath(os.path.join(path, rel_name))
priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
mkdir=True)
except errors.OpExecError, err:
- logging.exception("Error while processing user ssh files")
- return False
+ msg = "Error while processing user ssh files"
+ logging.exception(msg)
+ return (False, "%s: %s" % (msg, err))
for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
utils.WriteFile(name, data=content, mode=0600)
utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"])
- return True
+ return (True, "Node added successfully")
def LeaveCluster():
from the cluster.
If processing is successful, then it raises an
- L{errors.GanetiQuitException} which is used as a special case to
+ L{errors.QuitGanetiException} which is used as a special case to
shutdown the node daemon.
"""
result[constants.NV_VGLIST] = ListVolumeGroups()
if constants.NV_VERSION in what:
- result[constants.NV_VERSION] = constants.PROTOCOL_VERSION
+ result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
+ constants.RELEASE_VERSION)
if constants.NV_HVINFO in what:
hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO])
result[constants.NV_HVINFO] = hyper.GetNodeInfo()
+ if constants.NV_DRBDLIST in what:
+ try:
+ used_minors = bdev.DRBD8.GetUsedDevs().keys()
+ except errors.BlockDeviceError, err:
+ logging.warning("Can't get used minors list", exc_info=True)
+ used_minors = str(err)
+ result[constants.NV_DRBDLIST] = used_minors
+
return result
results.extend(names)
except errors.HypervisorError, err:
logging.exception("Error enumerating instances for hypevisor %s", hname)
- # FIXME: should we somehow not propagate this to the master?
raise
return results
return output
+def GetInstanceMigratable(instance):
+ """Returns whether an instance can be migrated.
+
+ @type instance: L{objects.Instance}
+ @param instance: object representing the instance to be checked.
+
+ @rtype: tuple
+ @return: tuple of (result, description) where:
+ - result: whether the instance can be migrated or not
+ - description: a description of the issue, if relevant
+
+ """
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ if instance.name not in hyper.ListInstances():
+ return (False, 'not running')
+
+ for idx in range(len(instance.disks)):
+ link_name = _GetBlockDevSymlinkPath(instance.name, idx)
+ if not os.path.islink(link_name):
+ return (False, 'not restarted since ganeti 1.2.5')
+
+ return (True, '')
+
+
def GetAllInstancesInfo(hypervisor_list):
"""Gather data about all instances.
'state': state,
'time': times,
}
- if name in output and output[name] != value:
- raise errors.HypervisorError("Instance %s running duplicate"
- " with different parameters" % name)
+ if name in output:
+ # we only check static parameters, like memory and vcpus,
+ # and not state and time which can change between the
+ # invocations of the different hypervisors
+ for key in 'memory', 'vcpus':
+ if value[key] != output[name][key]:
+ raise errors.HypervisorError("Instance %s is running twice"
+ " with different parameters" % name)
output[name] = value
return output
-def AddOSToInstance(instance):
+def InstanceOsAdd(instance, reinstall):
"""Add an OS to an instance.
@type instance: L{objects.Instance}
@param instance: Instance whose OS is to be installed
+ @type reinstall: boolean
+ @param reinstall: whether this is an instance reinstall
@rtype: boolean
@return: the success of the operation
"""
- inst_os = OSFromDisk(instance.os)
+ try:
+ inst_os = OSFromDisk(instance.os)
+ except errors.InvalidOS, err:
+ os_name, os_dir, os_err = err.args
+ if os_dir is None:
+ return (False, "Can't find OS '%s': %s" % (os_name, os_err))
+ else:
+ return (False, "Error parsing OS '%s' in directory %s: %s" %
+ (os_name, os_dir, os_err))
create_env = OSEnvironment(instance)
+ if reinstall:
+ create_env['INSTANCE_REINSTALL'] = "1"
logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
instance.name, int(time.time()))
logging.error("os create command '%s' returned error: %s, logfile: %s,"
" output: %s", result.cmd, result.fail_reason, logfile,
result.output)
- return False
+ lines = [utils.SafeEncode(val)
+ for val in utils.TailFile(logfile, lines=20)]
+ return (False, "OS create script failed (%s), last lines in the"
+ " log file:\n%s" % (result.fail_reason, "\n".join(lines)))
- return True
+ return (True, "Successfully installed")
def RunRenameInstance(instance, old_name):
if result.failed:
logging.error("os create command '%s' returned error: %s output: %s",
result.cmd, result.fail_reason, result.output)
- return False
+ lines = [utils.SafeEncode(val)
+ for val in utils.TailFile(logfile, lines=20)]
+ return (False, "OS rename script failed (%s), last lines in the"
+ " log file:\n%s" % (result.fail_reason, "\n".join(lines)))
- return True
+ return (True, "Rename successful")
def _GetVGInfo(vg_name):
return retdic
-def _GatherBlockDevs(instance):
+def _GetBlockDevSymlinkPath(instance_name, idx):
+ return os.path.join(constants.DISK_LINKS_DIR,
+ "%s:%d" % (instance_name, idx))
+
+
+def _SymlinkBlockDev(instance_name, device_path, idx):
+ """Set up symlinks to an instance's block device.
+
+ This is an auxiliary function run when an instance is started (on the primary
+ node) or when an instance is migrated (on the target node).
+
+
+ @param instance_name: the name of the target instance
+ @param device_path: path of the physical block device, on the node
+ @param idx: the disk index
+ @return: absolute path to the disk's symlink
+
+ """
+ link_name = _GetBlockDevSymlinkPath(instance_name, idx)
+ try:
+ os.symlink(device_path, link_name)
+ except OSError, err:
+ if err.errno == errno.EEXIST:
+ if (not os.path.islink(link_name) or
+ os.readlink(link_name) != device_path):
+ os.remove(link_name)
+ os.symlink(device_path, link_name)
+ else:
+ raise
+
+ return link_name
+
+
+def _RemoveBlockDevLinks(instance_name, disks):
+ """Remove the block device symlinks belonging to the given instance.
+
+ """
+ for idx, disk in enumerate(disks):
+ link_name = _GetBlockDevSymlinkPath(instance_name, idx)
+ if os.path.islink(link_name):
+ try:
+ os.remove(link_name)
+ except OSError:
+ logging.exception("Can't remove symlink '%s'", link_name)
+
+
+def _GatherAndLinkBlockDevs(instance):
"""Set up an instance's block device(s).
This is run on the primary node at instance startup. The block
@type instance: L{objects.Instance}
@param instance: the instance whose disks we shoul assemble
- @rtype: list of L{bdev.BlockDev}
- @return: list of the block devices
+ @rtype: list
+ @return: list of (disk_object, device_path)
"""
block_devices = []
- for disk in instance.disks:
+ for idx, disk in enumerate(instance.disks):
device = _RecursiveFindBD(disk)
if device is None:
raise errors.BlockDeviceError("Block device '%s' is not set up." %
str(disk))
device.Open()
- block_devices.append((disk, device))
+ try:
+ link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
+ except OSError, e:
+ raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
+ e.strerror)
+
+ block_devices.append((disk, link_name))
+
return block_devices
-def StartInstance(instance, extra_args):
+def StartInstance(instance):
"""Start an instance.
@type instance: L{objects.Instance}
running_instances = GetInstanceList([instance.hypervisor])
if instance.name in running_instances:
- return True
-
- block_devices = _GatherBlockDevs(instance)
- hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ return (True, "Already running")
try:
- hyper.StartInstance(instance, block_devices, extra_args)
+ block_devices = _GatherAndLinkBlockDevs(instance)
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ hyper.StartInstance(instance, block_devices)
+ except errors.BlockDeviceError, err:
+ logging.exception("Failed to start instance")
+ return (False, "Block device error: %s" % str(err))
except errors.HypervisorError, err:
logging.exception("Failed to start instance")
- return False
+ _RemoveBlockDevLinks(instance.name, instance.disks)
+ return (False, "Hypervisor error: %s" % str(err))
- return True
+ return (True, "Instance started successfully")
-def ShutdownInstance(instance):
+def InstanceShutdown(instance):
"""Shut an instance down.
@note: this functions uses polling with a hardcoded timeout.
running_instances = GetInstanceList([hv_name])
if instance.name not in running_instances:
- return True
+ return (True, "Instance already stopped")
hyper = hypervisor.GetHypervisor(hv_name)
try:
hyper.StopInstance(instance)
except errors.HypervisorError, err:
- logging.error("Failed to stop instance")
- return False
+ msg = "Failed to stop instance %s: %s" % (instance.name, err)
+ logging.error(msg)
+ return (False, msg)
# test every 10secs for 2min
- shutdown_ok = False
time.sleep(1)
for dummy in range(11):
time.sleep(10)
else:
# the shutdown did not succeed
- logging.error("shutdown of '%s' unsuccessful, using destroy", instance)
+ logging.error("Shutdown of '%s' unsuccessful, using destroy",
+ instance.name)
try:
hyper.StopInstance(instance, force=True)
except errors.HypervisorError, err:
- logging.exception("Failed to stop instance")
- return False
+ msg = "Failed to force stop instance %s: %s" % (instance.name, err)
+ logging.error(msg)
+ return (False, msg)
time.sleep(1)
if instance.name in GetInstanceList([hv_name]):
- logging.error("could not shutdown instance '%s' even by destroy",
- instance.name)
- return False
+ msg = ("Could not shutdown instance %s even by destroy" %
+ instance.name)
+ logging.error(msg)
+ return (False, msg)
- return True
+ _RemoveBlockDevLinks(instance.name, instance.disks)
+
+ return (True, "Instance has been shutdown successfully")
-def RebootInstance(instance, reboot_type, extra_args):
+def InstanceReboot(instance, reboot_type):
"""Reboot an instance.
@type instance: L{objects.Instance}
running_instances = GetInstanceList([instance.hypervisor])
if instance.name not in running_instances:
- logging.error("Cannot reboot instance that is not running")
- return False
+ msg = "Cannot reboot instance %s that is not running" % instance.name
+ logging.error(msg)
+ return (False, msg)
hyper = hypervisor.GetHypervisor(instance.hypervisor)
if reboot_type == constants.INSTANCE_REBOOT_SOFT:
try:
hyper.RebootInstance(instance)
except errors.HypervisorError, err:
- logging.exception("Failed to soft reboot instance")
- return False
+ msg = "Failed to soft reboot instance %s: %s" % (instance.name, err)
+ logging.error(msg)
+ return (False, msg)
elif reboot_type == constants.INSTANCE_REBOOT_HARD:
try:
- ShutdownInstance(instance)
- StartInstance(instance, extra_args)
+ stop_result = InstanceShutdown(instance)
+ if not stop_result[0]:
+ return stop_result
+ return StartInstance(instance)
except errors.HypervisorError, err:
- logging.exception("Failed to hard reboot instance")
- return False
+ msg = "Failed to hard reboot instance %s: %s" % (instance.name, err)
+ logging.error(msg)
+ return (False, msg)
else:
- raise errors.ParameterError("reboot_type invalid")
+ return (False, "Invalid reboot_type received: %s" % (reboot_type,))
- return True
+ return (True, "Reboot successful")
+
+
+def MigrationInfo(instance):
+ """Gather information about an instance to be migrated.
+
+ @type instance: L{objects.Instance}
+ @param instance: the instance definition
+
+ """
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ try:
+ info = hyper.MigrationInfo(instance)
+ except errors.HypervisorError, err:
+ msg = "Failed to fetch migration information"
+ logging.exception(msg)
+ return (False, '%s: %s' % (msg, err))
+ return (True, info)
+
+
+def AcceptInstance(instance, info, target):
+ """Prepare the node to accept an instance.
+
+ @type instance: L{objects.Instance}
+ @param instance: the instance definition
+ @type info: string/data (opaque)
+ @param info: migration information, from the source node
+ @type target: string
+ @param target: target host (usually ip), on this node
+
+ """
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ try:
+ hyper.AcceptInstance(instance, info, target)
+ except errors.HypervisorError, err:
+ msg = "Failed to accept instance"
+ logging.exception(msg)
+ return (False, '%s: %s' % (msg, err))
+ return (True, "Accept successfull")
+
+
+def FinalizeMigration(instance, info, success):
+ """Finalize any preparation to accept an instance.
+
+ @type instance: L{objects.Instance}
+ @param instance: the instance definition
+ @type info: string/data (opaque)
+ @param info: migration information, from the source node
+ @type success: boolean
+ @param success: whether the migration was a success or a failure
+
+ """
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ try:
+ hyper.FinalizeMigration(instance, info, success)
+ except errors.HypervisorError, err:
+ msg = "Failed to finalize migration"
+ logging.exception(msg)
+ return (False, '%s: %s' % (msg, err))
+ return (True, "Migration Finalized")
def MigrateInstance(instance, target, live):
- msg is a string with details in case of failure
"""
- hyper = hypervisor.GetHypervisor(instance.hypervisor_name)
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
try:
hyper.MigrateInstance(instance.name, target, live)
except errors.HypervisorError, err:
- msg = "Failed to migrate instance: %s" % str(err)
- logging.error(msg)
- return (False, msg)
+ msg = "Failed to migrate instance"
+ logging.exception(msg)
+ return (False, "%s: %s" % (msg, err))
return (True, "Migration successfull")
-def CreateBlockDevice(disk, size, owner, on_primary, info):
+def BlockdevCreate(disk, size, owner, on_primary, info):
"""Creates a block device for an instance.
@type disk: L{objects.Disk}
clist = []
if disk.children:
for child in disk.children:
- crdev = _RecursiveAssembleBD(child, owner, on_primary)
+ try:
+ crdev = _RecursiveAssembleBD(child, owner, on_primary)
+ except errors.BlockDeviceError, err:
+ errmsg = "Can't assemble device %s: %s" % (child, err)
+ logging.error(errmsg)
+ return False, errmsg
if on_primary or disk.AssembleOnSecondary():
# we need the children open in case the device itself has to
# be assembled
- crdev.Open()
+ try:
+ crdev.Open()
+ except errors.BlockDeviceError, err:
+ errmsg = "Can't make child '%s' read-write: %s" % (child, err)
+ logging.error(errmsg)
+ return False, errmsg
clist.append(crdev)
+
try:
- device = bdev.FindDevice(disk.dev_type, disk.physical_id, clist)
- if device is not None:
- logging.info("removing existing device %s", disk)
- device.Remove()
+ device = bdev.Create(disk.dev_type, disk.physical_id, clist, size)
except errors.BlockDeviceError, err:
- pass
+ return False, "Can't create block device: %s" % str(err)
- device = bdev.Create(disk.dev_type, disk.physical_id,
- clist, size)
- if device is None:
- raise ValueError("Can't create child device for %s, %s" %
- (disk, size))
if on_primary or disk.AssembleOnSecondary():
- if not device.Assemble():
- errorstring = "Can't assemble device after creation"
- logging.error(errorstring)
- raise errors.BlockDeviceError("%s, very unusual event - check the node"
- " daemon logs" % errorstring)
+ try:
+ device.Assemble()
+ except errors.BlockDeviceError, err:
+ errmsg = ("Can't assemble device after creation, very"
+ " unusual event: %s" % str(err))
+ logging.error(errmsg)
+ return False, errmsg
device.SetSyncSpeed(constants.SYNC_SPEED)
if on_primary or disk.OpenOnSecondary():
- device.Open(force=True)
+ try:
+ device.Open(force=True)
+ except errors.BlockDeviceError, err:
+ errmsg = ("Can't make device r/w after creation, very"
+ " unusual event: %s" % str(err))
+ logging.error(errmsg)
+ return False, errmsg
DevCacheManager.UpdateCache(device.dev_path, owner,
on_primary, disk.iv_name)
device.SetInfo(info)
physical_id = device.unique_id
- return physical_id
+ return True, physical_id
-def RemoveBlockDevice(disk):
+def BlockdevRemove(disk):
"""Remove a block device.
@note: This is intended to be called recursively.
- @type disk: L{objects.disk}
+ @type disk: L{objects.Disk}
@param disk: the disk object we should remove
@rtype: boolean
@return: the success of the operation
"""
+ msgs = []
+ result = True
try:
- # since we are removing the device, allow a partial match
- # this allows removal of broken mirrors
- rdev = _RecursiveFindBD(disk, allow_partial=True)
+ rdev = _RecursiveFindBD(disk)
except errors.BlockDeviceError, err:
# probably can't attach
logging.info("Can't attach to device %s in remove", disk)
rdev = None
if rdev is not None:
r_path = rdev.dev_path
- result = rdev.Remove()
+ try:
+ rdev.Remove()
+ except errors.BlockDeviceError, err:
+ msgs.append(str(err))
+ result = False
if result:
DevCacheManager.RemoveCache(r_path)
- else:
- result = True
+
if disk.children:
for child in disk.children:
- result = result and RemoveBlockDevice(child)
- return result
+ c_status, c_msg = BlockdevRemove(child)
+ result = result and c_status
+ if c_msg: # not an empty message
+ msgs.append(c_msg)
+
+ return (result, "; ".join(msgs))
def _RecursiveAssembleBD(disk, owner, as_primary):
if children.count(None) >= mcn:
raise
cdev = None
- logging.debug("Error in child activation: %s", str(err))
+ logging.error("Error in child activation (but continuing): %s",
+ str(err))
children.append(cdev)
if as_primary or disk.AssembleOnSecondary():
- r_dev = bdev.AttachOrAssemble(disk.dev_type, disk.physical_id, children)
+ r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children)
r_dev.SetSyncSpeed(constants.SYNC_SPEED)
result = r_dev
if as_primary or disk.OpenOnSecondary():
return result
-def AssembleBlockDevice(disk, owner, as_primary):
+def BlockdevAssemble(disk, owner, as_primary):
"""Activate a block device for an instance.
This is a wrapper over _RecursiveAssembleBD.
C{True} for secondary nodes
"""
- result = _RecursiveAssembleBD(disk, owner, as_primary)
- if isinstance(result, bdev.BlockDev):
- result = result.dev_path
- return result
+ status = True
+ result = "no error information"
+ try:
+ result = _RecursiveAssembleBD(disk, owner, as_primary)
+ if isinstance(result, bdev.BlockDev):
+ result = result.dev_path
+ except errors.BlockDeviceError, err:
+ result = "Error while assembling disk: %s" % str(err)
+ status = False
+ return (status, result)
-def ShutdownBlockDevice(disk):
+def BlockdevShutdown(disk):
"""Shut down a block device.
- First, if the device is assembled (can L{Attach()}), then the device
- is shutdown. Then the children of the device are shutdown.
+ First, if the device is assembled (Attach() is successfull), then
+ the device is shutdown. Then the children of the device are
+ shutdown.
This function is called recursively. Note that we don't cache the
children or such, as oppossed to assemble, shutdown of different
@return: the success of the operation
"""
+ msgs = []
+ result = True
r_dev = _RecursiveFindBD(disk)
if r_dev is not None:
r_path = r_dev.dev_path
- result = r_dev.Shutdown()
- if result:
+ try:
+ r_dev.Shutdown()
DevCacheManager.RemoveCache(r_path)
- else:
- result = True
+ except errors.BlockDeviceError, err:
+ msgs.append(str(err))
+ result = False
+
if disk.children:
for child in disk.children:
- result = result and ShutdownBlockDevice(child)
- return result
+ c_status, c_msg = BlockdevShutdown(child)
+ result = result and c_status
+ if c_msg: # not an empty message
+ msgs.append(c_msg)
+
+ return (result, "; ".join(msgs))
-def MirrorAddChildren(parent_cdev, new_cdevs):
+def BlockdevAddchildren(parent_cdev, new_cdevs):
"""Extend a mirrored block device.
@type parent_cdev: L{objects.Disk}
@return: the success of the operation
"""
- parent_bdev = _RecursiveFindBD(parent_cdev, allow_partial=True)
+ parent_bdev = _RecursiveFindBD(parent_cdev)
if parent_bdev is None:
- logging.error("Can't find parent device")
- return False
+ msg = "Can't find parent device %s" % str(parent_cdev)
+ logging.error("BlockdevAddchildren: %s", msg)
+ return (False, msg)
new_bdevs = [_RecursiveFindBD(disk) for disk in new_cdevs]
if new_bdevs.count(None) > 0:
- logging.error("Can't find new device(s) to add: %s:%s",
- new_bdevs, new_cdevs)
- return False
+ msg = "Can't find new device(s) to add: %s:%s" % (new_bdevs, new_cdevs)
+ logging.error(msg)
+ return (False, msg)
parent_bdev.AddChildren(new_bdevs)
- return True
+ return (True, None)
-def MirrorRemoveChildren(parent_cdev, new_cdevs):
+def BlockdevRemovechildren(parent_cdev, new_cdevs):
"""Shrink a mirrored block device.
@type parent_cdev: L{objects.Disk}
return True
-def GetMirrorStatus(disks):
+def BlockdevGetmirrorstatus(disks):
"""Get the mirroring status of a list of devices.
@type disks: list of L{objects.Disk}
@rtype: disk
@return:
a list of (mirror_done, estimated_time) tuples, which
- are the result of L{bdev.BlockDevice.CombinedSyncStatus}
+ are the result of L{bdev.BlockDev.CombinedSyncStatus}
@raise errors.BlockDeviceError: if any of the disks cannot be
found
return stats
-def _RecursiveFindBD(disk, allow_partial=False):
+def _RecursiveFindBD(disk):
"""Check if a device is activated.
If so, return informations about the real device.
@type disk: L{objects.Disk}
@param disk: the disk object we need to find
- @type allow_partial: boolean
- @param allow_partial: if true, don't abort the find if a
- child of the device can't be found; this is intended
- to be used when repairing mirrors
@return: None if the device can't be found,
otherwise the device instance
return bdev.FindDevice(disk.dev_type, disk.physical_id, children)
-def FindBlockDevice(disk):
+def BlockdevFind(disk):
"""Check if a device is activated.
If it is, return informations about the real device.
estimated_time, is_degraded)
"""
- rbd = _RecursiveFindBD(disk)
+ try:
+ rbd = _RecursiveFindBD(disk)
+ except errors.BlockDeviceError, err:
+ return (False, str(err))
if rbd is None:
- return rbd
- return (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus()
+ return (True, None)
+ return (True, (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus())
def UploadFile(file_name, data, mode, uid, gid, atime, mtime):
"""
if not os.path.isabs(file_name):
- logging.error("Filename passed to UploadFile is not absolute: '%s'",
- file_name)
- return False
+ err = "Filename passed to UploadFile is not absolute: '%s'" % file_name
+ logging.error(err)
+ return (False, err)
- allowed_files = [
+ allowed_files = set([
constants.CLUSTER_CONF_FILE,
constants.ETC_HOSTS,
constants.SSH_KNOWN_HOSTS_FILE,
constants.VNC_PASSWORD_FILE,
- ]
+ constants.RAPI_CERT_FILE,
+ constants.RAPI_USERS_FILE,
+ ])
+
+ for hv_name in constants.HYPER_TYPES:
+ hv_class = hypervisor.GetHypervisor(hv_name)
+ allowed_files.update(hv_class.GetAncillaryFiles())
if file_name not in allowed_files:
- logging.error("Filename passed to UploadFile not in allowed"
- " upload targets: '%s'", file_name)
- return False
+ err = "Filename passed to UploadFile not in allowed upload targets: '%s'" \
+ % file_name
+ logging.error(err)
+ return (False, err)
raw_data = _Decompress(data)
utils.WriteFile(file_name, data=raw_data, mode=mode, uid=uid, gid=gid,
atime=atime, mtime=mtime)
- return True
+ return (True, "success")
def WriteSsconfFiles(values):
result = {}
result['OS_API_VERSION'] = '%d' % constants.OS_API_VERSION
result['INSTANCE_NAME'] = instance.name
+ result['INSTANCE_OS'] = instance.os
result['HYPERVISOR'] = instance.hypervisor
result['DISK_COUNT'] = '%d' % len(instance.disks)
result['NIC_COUNT'] = '%d' % len(instance.nics)
str(disk))
real_disk.Open()
result['DISK_%d_PATH' % idx] = real_disk.dev_path
- # FIXME: When disks will have read-only mode, populate this
- result['DISK_%d_ACCESS' % idx] = 'W'
+ result['DISK_%d_ACCESS' % idx] = disk.mode
if constants.HV_DISK_TYPE in instance.hvparams:
result['DISK_%d_FRONTEND_TYPE' % idx] = \
instance.hvparams[constants.HV_DISK_TYPE]
return result
-def GrowBlockDevice(disk, amount):
+def BlockdevGrow(disk, amount):
"""Grow a stack of block devices.
This function is called recursively, with the childrens being the
return True, None
-def SnapshotBlockDevice(disk):
+def BlockdevSnapshot(disk):
"""Create a snapshot copy of a block device.
This function is called recursively, and the snapshot is actually created
if disk.children:
if len(disk.children) == 1:
# only one child, let's recurse on it
- return SnapshotBlockDevice(disk.children[0])
+ return BlockdevSnapshot(disk.children[0])
else:
# more than one child, choose one that matches
for child in disk.children:
if child.size == disk.size:
# return implies breaking the loop
- return SnapshotBlockDevice(child)
+ return BlockdevSnapshot(child)
elif disk.dev_type == constants.LD_LV:
r_dev = _RecursiveFindBD(disk)
if r_dev is not None:
instance.beparams[constants.BE_VCPUS])
config.set(constants.INISECT_INS, 'disk_template', instance.disk_template)
- nic_count = 0
+ nic_total = 0
for nic_count, nic in enumerate(instance.nics):
+ nic_total += 1
config.set(constants.INISECT_INS, 'nic%d_mac' %
nic_count, '%s' % nic.mac)
config.set(constants.INISECT_INS, 'nic%d_ip' % nic_count, '%s' % nic.ip)
config.set(constants.INISECT_INS, 'nic%d_bridge' % nic_count,
'%s' % nic.bridge)
# TODO: redundant: on load can read nics until it doesn't exist
- config.set(constants.INISECT_INS, 'nic_count' , '%d' % nic_count)
+ config.set(constants.INISECT_INS, 'nic_count' , '%d' % nic_total)
disk_total = 0
for disk_count, disk in enumerate(snap_disks):
return True
-def RenameBlockDevices(devlist):
+def BlockdevRename(devlist):
"""Rename a list of block devices.
@type devlist: list of tuples
@return: True if all renames succeeded, False otherwise
"""
+ msgs = []
result = True
for disk, unique_id in devlist:
dev = _RecursiveFindBD(disk)
if dev is None:
+ msgs.append("Can't find device %s in rename" % str(disk))
result = False
continue
try:
# cache? for now, we only lose lvm data when we rename, which
# is less critical than DRBD or MD
except errors.BlockDeviceError, err:
+ msgs.append("Can't rename device '%s' to '%s': %s" %
+ (dev, unique_id, err))
logging.exception("Can't rename device '%s' to '%s'", dev, unique_id)
result = False
- return result
+ return (result, "; ".join(msgs))
def _TransformFileStorageDir(file_storage_dir):
def JobQueueRename(old, new):
"""Renames a job queue file.
- This is just a wrapper over L{os.rename} with proper checking.
+ This is just a wrapper over os.rename with proper checking.
@type old: str
@param old: the old (actual) file name
if not (_IsJobQueueFile(old) and _IsJobQueueFile(new)):
return False
- os.rename(old, new)
+ utils.RenameFile(old, new, mkdir=True)
return True
return True
-def CloseBlockDevices(disks):
+def BlockdevClose(instance_name, disks):
"""Closes the given block devices.
This means they will be switched to secondary mode (in case of
DRBD).
+ @param instance_name: if the argument is not empty, the symlinks
+ of this instance will be removed
@type disks: list of L{objects.Disk}
@param disks: the list of disks to be closed
@rtype: tuple (success, message)
if msg:
return (False, "Can't make devices secondary: %s" % ",".join(msg))
else:
+ if instance_name:
+ _RemoveBlockDevLinks(instance_name, disks)
return (True, "All devices secondary")
return (True, "Done")
+def _FindDisks(nodes_ip, disks):
+ """Sets the physical ID on disks and returns the block devices.
+
+ """
+ # set the correct physical ID
+ my_name = utils.HostInfo().name
+ for cf in disks:
+ cf.SetPhysicalID(my_name, nodes_ip)
+
+ bdevs = []
+
+ for cf in disks:
+ rd = _RecursiveFindBD(cf)
+ if rd is None:
+ return (False, "Can't find device %s" % cf)
+ bdevs.append(rd)
+ return (True, bdevs)
+
+
+def DrbdDisconnectNet(nodes_ip, disks):
+ """Disconnects the network on a list of drbd devices.
+
+ """
+ status, bdevs = _FindDisks(nodes_ip, disks)
+ if not status:
+ return status, bdevs
+
+ # disconnect disks
+ for rd in bdevs:
+ try:
+ rd.DisconnectNet()
+ except errors.BlockDeviceError, err:
+ logging.exception("Failed to go into standalone mode")
+ return (False, "Can't change network configuration: %s" % str(err))
+ return (True, "All disks are now disconnected")
+
+
+def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
+ """Attaches the network on a list of drbd devices.
+
+ """
+ status, bdevs = _FindDisks(nodes_ip, disks)
+ if not status:
+ return status, bdevs
+
+ if multimaster:
+ for idx, rd in enumerate(bdevs):
+ try:
+ _SymlinkBlockDev(instance_name, rd.dev_path, idx)
+ except EnvironmentError, err:
+ return (False, "Can't create symlink: %s" % str(err))
+ # reconnect disks, switch to new master configuration and if
+ # needed primary mode
+ for rd in bdevs:
+ try:
+ rd.AttachNet(multimaster)
+ except errors.BlockDeviceError, err:
+ return (False, "Can't change network configuration: %s" % str(err))
+ # wait until the disks are connected; we need to retry the re-attach
+ # if the device becomes standalone, as this might happen if the one
+ # node disconnects and reconnects in a different mode before the
+ # other node reconnects; in this case, one or both of the nodes will
+ # decide it has wrong configuration and switch to standalone
+ RECONNECT_TIMEOUT = 2 * 60
+ sleep_time = 0.100 # start with 100 milliseconds
+ timeout_limit = time.time() + RECONNECT_TIMEOUT
+ while time.time() < timeout_limit:
+ all_connected = True
+ for rd in bdevs:
+ stats = rd.GetProcStatus()
+ if not (stats.is_connected or stats.is_in_resync):
+ all_connected = False
+ if stats.is_standalone:
+ # peer had different config info and this node became
+ # standalone, even though this should not happen with the
+ # new staged way of changing disk configs
+ try:
+ rd.ReAttachNet(multimaster)
+ except errors.BlockDeviceError, err:
+ return (False, "Can't change network configuration: %s" % str(err))
+ if all_connected:
+ break
+ time.sleep(sleep_time)
+ sleep_time = min(5, sleep_time * 1.5)
+ if not all_connected:
+ return (False, "Timeout in disk reconnecting")
+ if multimaster:
+ # change to primary mode
+ for rd in bdevs:
+ try:
+ rd.Open()
+ except errors.BlockDeviceError, err:
+ return (False, "Can't change to primary mode: %s" % str(err))
+ if multimaster:
+ msg = "multi-master and primary"
+ else:
+ msg = "single-master"
+ return (True, "Disks are now configured as %s" % msg)
+
+
+def DrbdWaitSync(nodes_ip, disks):
+ """Wait until DRBDs have synchronized.
+
+ """
+ status, bdevs = _FindDisks(nodes_ip, disks)
+ if not status:
+ return status, bdevs
+
+ min_resync = 100
+ alldone = True
+ failure = False
+ for rd in bdevs:
+ stats = rd.GetProcStatus()
+ if not (stats.is_connected or stats.is_in_resync):
+ failure = True
+ break
+ alldone = alldone and (not stats.is_in_resync)
+ if stats.sync_percent is not None:
+ min_resync = min(min_resync, stats.sync_percent)
+ return (not failure, (alldone, min_resync))
+
+
+def PowercycleNode(hypervisor_type):
+ """Hard-powercycle the node.
+
+ Because we need to return first, and schedule the powercycle in the
+ background, we won't be able to report failures nicely.
+
+ """
+ hyper = hypervisor.GetHypervisor(hypervisor_type)
+ try:
+ pid = os.fork()
+ except OSError, err:
+ # if we can't fork, we'll pretend that we're in the child process
+ pid = 0
+ if pid > 0:
+ return (True, "Reboot scheduled in 5 seconds")
+ time.sleep(5)
+ hyper.PowercycleNode()
+
+
class HooksRunner(object):
"""Hook runner.
#logging.exception("Error while closing fd %s", fd)
pass
- return result == 0, output
+ return result == 0, utils.SafeEncode(output.strip())
def RunHooks(self, hpath, phase, env):
"""Run the scripts in the hooks directory.
node nor not
@type iv_name: str
@param iv_name: the instance-visible name of the
- device, as in L{objects.Disk.iv_name}
+ device, as in objects.Disk.iv_name
@rtype: None