OSFromDisk: handle variants when loading os

[ganeti-local] / lib / cmdlib.py
diff --git a/lib/cmdlib.py b/lib/cmdlib.py

index ff4aee2..c7e1ec1 100644 (file)
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -26,12 +26,10 @@
  import os
  import os.path
  import time
  import os
  import os.path
  import time
-import tempfile
  import re
  import platform
  import logging
  import copy
  import re
  import platform
  import logging
  import copy
-import random
  
  from ganeti import ssh
  from ganeti import utils
  
  from ganeti import ssh
  from ganeti import utils
@@ -40,7 +38,6 @@ from ganeti import hypervisor
  from ganeti import locking
  from ganeti import constants
  from ganeti import objects
  from ganeti import locking
  from ganeti import constants
  from ganeti import objects
-from ganeti import opcodes
  from ganeti import serializer
  from ganeti import ssconf
  
  from ganeti import serializer
  from ganeti import ssconf
  
@@ -50,8 +47,8 @@ class LogicalUnit(object):
  
    Subclasses must follow these rules:
      - implement ExpandNames
  
    Subclasses must follow these rules:
      - implement ExpandNames
-    - implement CheckPrereq
-    - implement Exec
+    - implement CheckPrereq (except when tasklets are used)
+    - implement Exec (except when tasklets are used)
      - implement BuildHooksEnv
      - redefine HPATH and HTYPE
      - optionally redefine their run requirements:
      - implement BuildHooksEnv
      - redefine HPATH and HTYPE
      - optionally redefine their run requirements:
@@ -59,6 +56,9 @@ class LogicalUnit(object):
  
    Note that all commands require root permissions.
  
  
    Note that all commands require root permissions.
  
+  @ivar dry_run_result: the value (if any) that will be returned to the caller
+      in dry-run mode (signaled by opcode dry_run parameter)
+
    """
    HPATH = None
    HTYPE = None
    """
    HPATH = None
    HTYPE = None
@@ -68,7 +68,7 @@ class LogicalUnit(object):
    def __init__(self, processor, op, context, rpc):
      """Constructor for LogicalUnit.
  
    def __init__(self, processor, op, context, rpc):
      """Constructor for LogicalUnit.
  
-    This needs to be overriden in derived classes in order to check op
+    This needs to be overridden in derived classes in order to check op
      validity.
  
      """
      validity.
  
      """
@@ -80,7 +80,7 @@ class LogicalUnit(object):
      # Dicts used to declare locking needs to mcpu
      self.needed_locks = None
      self.acquired_locks = {}
      # Dicts used to declare locking needs to mcpu
      self.needed_locks = None
      self.acquired_locks = {}
-    self.share_locks = dict(((i, 0) for i in locking.LEVELS))
+    self.share_locks = dict.fromkeys(locking.LEVELS, 0)
      self.add_locks = {}
      self.remove_locks = {}
      # Used to force good behavior when calling helper functions
      self.add_locks = {}
      self.remove_locks = {}
      # Used to force good behavior when calling helper functions
@@ -89,12 +89,19 @@ class LogicalUnit(object):
      # logging
      self.LogWarning = processor.LogWarning
      self.LogInfo = processor.LogInfo
      # logging
      self.LogWarning = processor.LogWarning
      self.LogInfo = processor.LogInfo
+    self.LogStep = processor.LogStep
+    # support for dry-run
+    self.dry_run_result = None
+
+    # Tasklets
+    self.tasklets = None
  
      for attr_name in self._OP_REQP:
        attr_val = getattr(op, attr_name, None)
        if attr_val is None:
          raise errors.OpPrereqError("Required parameter '%s' missing" %
                                     attr_name)
  
      for attr_name in self._OP_REQP:
        attr_val = getattr(op, attr_name, None)
        if attr_val is None:
          raise errors.OpPrereqError("Required parameter '%s' missing" %
                                     attr_name)
+
      self.CheckArguments()
  
    def __GetSSH(self):
      self.CheckArguments()
  
    def __GetSSH(self):
@@ -116,7 +123,7 @@ class LogicalUnit(object):
      CheckPrereq, doing these separate is better because:
  
        - ExpandNames is left as as purely a lock-related function
      CheckPrereq, doing these separate is better because:
  
        - ExpandNames is left as as purely a lock-related function
-      - CheckPrereq is run after we have aquired locks (and possible
+      - CheckPrereq is run after we have acquired locks (and possibly
          waited for them)
  
      The function is allowed to change the self.op attribute so that
          waited for them)
  
      The function is allowed to change the self.op attribute so that
@@ -146,6 +153,10 @@ class LogicalUnit(object):
      level you can modify self.share_locks, setting a true value (usually 1) for
      that level. By default locks are not shared.
  
      level you can modify self.share_locks, setting a true value (usually 1) for
      that level. By default locks are not shared.
  
+    This function can also define a list of tasklets, which then will be
+    executed in order instead of the usual LU-level CheckPrereq and Exec
+    functions, if those are not defined by the LU.
+
      Examples::
  
        # Acquire all nodes and one instance
      Examples::
  
        # Acquire all nodes and one instance
@@ -202,7 +213,13 @@ class LogicalUnit(object):
      their canonical form if it hasn't been done by ExpandNames before.
  
      """
      their canonical form if it hasn't been done by ExpandNames before.
  
      """
-    raise NotImplementedError
+    if self.tasklets is not None:
+      for (idx, tl) in enumerate(self.tasklets):
+        logging.debug("Checking prerequisites for tasklet %s/%s",
+                      idx + 1, len(self.tasklets))
+        tl.CheckPrereq()
+    else:
+      raise NotImplementedError
  
    def Exec(self, feedback_fn):
      """Execute the LU.
  
    def Exec(self, feedback_fn):
      """Execute the LU.
@@ -212,7 +229,12 @@ class LogicalUnit(object):
      code, or expected.
  
      """
      code, or expected.
  
      """
-    raise NotImplementedError
+    if self.tasklets is not None:
+      for (idx, tl) in enumerate(self.tasklets):
+        logging.debug("Executing tasklet %s/%s", idx + 1, len(self.tasklets))
+        tl.Exec(feedback_fn)
+    else:
+      raise NotImplementedError
  
    def BuildHooksEnv(self):
      """Build hooks environment for this LU.
  
    def BuildHooksEnv(self):
      """Build hooks environment for this LU.
@@ -336,6 +358,52 @@ class NoHooksLU(LogicalUnit):
    HTYPE = None
  
  
    HTYPE = None
  
  
+class Tasklet:
+  """Tasklet base class.
+
+  Tasklets are subcomponents for LUs. LUs can consist entirely of tasklets or
+  they can mix legacy code with tasklets. Locking needs to be done in the LU,
+  tasklets know nothing about locks.
+
+  Subclasses must follow these rules:
+    - Implement CheckPrereq
+    - Implement Exec
+
+  """
+  def __init__(self, lu):
+    self.lu = lu
+
+    # Shortcuts
+    self.cfg = lu.cfg
+    self.rpc = lu.rpc
+
+  def CheckPrereq(self):
+    """Check prerequisites for this tasklet.
+
+    This method should check whether the prerequisites for the execution of
+    this tasklet are fulfilled. It can do internode communication, but it
+    should be idempotent - no cluster or system changes are allowed.
+
+    The method should raise errors.OpPrereqError in case something is not
+    fulfilled. Its return value is ignored.
+
+    This method should also update all parameters to their canonical form if it
+    hasn't been done before.
+
+    """
+    raise NotImplementedError
+
+  def Exec(self, feedback_fn):
+    """Execute the tasklet.
+
+    This method should implement the actual work. It should raise
+    errors.OpExecError for failures that are somewhat dealt with in code, or
+    expected.
+
+    """
+    raise NotImplementedError
+
+
  def _GetWantedNodes(lu, nodes):
    """Returns list of checked and expanded node names.
  
  def _GetWantedNodes(lu, nodes):
    """Returns list of checked and expanded node names.
  
@@ -453,7 +521,8 @@ def _CheckNodeNotDrained(lu, node):
  
  
  def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
  
  
  def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
-                          memory, vcpus, nics, disk_template, disks):
+                          memory, vcpus, nics, disk_template, disks,
+                          bep, hvp, hypervisor_name):
    """Builds instance related env variables for hooks
  
    This builds the hook environment from individual variables.
    """Builds instance related env variables for hooks
  
    This builds the hook environment from individual variables.
@@ -473,12 +542,18 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
    @type vcpus: string
    @param vcpus: the count of VCPUs the instance has
    @type nics: list
    @type vcpus: string
    @param vcpus: the count of VCPUs the instance has
    @type nics: list
-  @param nics: list of tuples (ip, bridge, mac) representing
-      the NICs the instance  has
+  @param nics: list of tuples (ip, mac, mode, link) representing
+      the NICs the instance has
    @type disk_template: string
    @type disk_template: string
-  @param disk_template: the distk template of the instance
+  @param disk_template: the disk template of the instance
    @type disks: list
    @param disks: the list of (size, mode) pairs
    @type disks: list
    @param disks: the list of (size, mode) pairs
+  @type bep: dict
+  @param bep: the backend parameters for the instance
+  @type hvp: dict
+  @param hvp: the hypervisor parameters for the instance
+  @type hypervisor_name: string
+  @param hypervisor_name: the hypervisor for the instance
    @rtype: dict
    @return: the hook environment for this instance
  
    @rtype: dict
    @return: the hook environment for this instance
  
@@ -497,6 +572,7 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
      "INSTANCE_MEMORY": memory,
      "INSTANCE_VCPUS": vcpus,
      "INSTANCE_DISK_TEMPLATE": disk_template,
      "INSTANCE_MEMORY": memory,
      "INSTANCE_VCPUS": vcpus,
      "INSTANCE_DISK_TEMPLATE": disk_template,
+    "INSTANCE_HYPERVISOR": hypervisor_name,
    }
  
    if nics:
    }
  
    if nics:
@@ -525,12 +601,18 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
  
    env["INSTANCE_DISK_COUNT"] = disk_count
  
  
    env["INSTANCE_DISK_COUNT"] = disk_count
  
+  for source, kind in [(bep, "BE"), (hvp, "HV")]:
+    for key, value in source.items():
+      env["INSTANCE_%s_%s" % (kind, key)] = value
+
    return env
  
    return env
  
-def _PreBuildNICHooksList(lu, nics):
+
+def _NICListToTuple(lu, nics):
    """Build a list of nic information tuples.
  
    """Build a list of nic information tuples.
  
-  This list is suitable to be passed to _BuildInstanceHookEnv.
+  This list is suitable to be passed to _BuildInstanceHookEnv or as a return
+  value in LUQueryInstanceData.
  
    @type lu:  L{LogicalUnit}
    @param lu: the logical unit on whose behalf we execute
  
    @type lu:  L{LogicalUnit}
    @param lu: the logical unit on whose behalf we execute
@@ -549,6 +631,7 @@ def _PreBuildNICHooksList(lu, nics):
      hooks_nics.append((ip, mac, mode, link))
    return hooks_nics
  
      hooks_nics.append((ip, mac, mode, link))
    return hooks_nics
  
+
  def _BuildInstanceHookEnvByObject(lu, instance, override=None):
    """Builds instance related env variables for hooks from an object.
  
  def _BuildInstanceHookEnvByObject(lu, instance, override=None):
    """Builds instance related env variables for hooks from an object.
  
@@ -564,7 +647,9 @@ def _BuildInstanceHookEnvByObject(lu, instance, override=None):
    @return: the hook environment dictionary
  
    """
    @return: the hook environment dictionary
  
    """
-  bep = lu.cfg.GetClusterInfo().FillBE(instance)
+  cluster = lu.cfg.GetClusterInfo()
+  bep = cluster.FillBE(instance)
+  hvp = cluster.FillHV(instance)
    args = {
      'name': instance.name,
      'primary_node': instance.primary_node,
    args = {
      'name': instance.name,
      'primary_node': instance.primary_node,
@@ -573,31 +658,45 @@ def _BuildInstanceHookEnvByObject(lu, instance, override=None):
      'status': instance.admin_up,
      'memory': bep[constants.BE_MEMORY],
      'vcpus': bep[constants.BE_VCPUS],
      'status': instance.admin_up,
      'memory': bep[constants.BE_MEMORY],
      'vcpus': bep[constants.BE_VCPUS],
-    'nics': _PreBuildNICHooksList(lu, instance.nics),
+    'nics': _NICListToTuple(lu, instance.nics),
      'disk_template': instance.disk_template,
      'disks': [(disk.size, disk.mode) for disk in instance.disks],
      'disk_template': instance.disk_template,
      'disks': [(disk.size, disk.mode) for disk in instance.disks],
+    'bep': bep,
+    'hvp': hvp,
+    'hypervisor_name': instance.hypervisor,
    }
    if override:
      args.update(override)
    return _BuildInstanceHookEnv(**args)
  
  
    }
    if override:
      args.update(override)
    return _BuildInstanceHookEnv(**args)
  
  
-def _AdjustCandidatePool(lu):
+def _AdjustCandidatePool(lu, exceptions):
    """Adjust the candidate pool after node operations.
  
    """
    """Adjust the candidate pool after node operations.
  
    """
-  mod_list = lu.cfg.MaintainCandidatePool()
+  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
    if mod_list:
      lu.LogInfo("Promoted nodes to master candidate role: %s",
                 ", ".join(node.name for node in mod_list))
      for name in mod_list:
        lu.context.ReaddNode(name)
    if mod_list:
      lu.LogInfo("Promoted nodes to master candidate role: %s",
                 ", ".join(node.name for node in mod_list))
      for name in mod_list:
        lu.context.ReaddNode(name)
-  mc_now, mc_max = lu.cfg.GetMasterCandidateStats()
+  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
    if mc_now > mc_max:
      lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
                 (mc_now, mc_max))
  
  
    if mc_now > mc_max:
      lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
                 (mc_now, mc_max))
  
  
+def _DecideSelfPromotion(lu, exceptions=None):
+  """Decide whether I should promote myself as a master candidate.
+
+  """
+  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
+  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
+  # the new node will increase mc_max by one, so:
+  mc_should = min(mc_should + 1, cp_size)
+  return mc_now < mc_should
+
+
  def _CheckNicsBridgesExist(lu, target_nics, target_node,
                                 profile=constants.PP_DEFAULT):
    """Check that the brigdes needed by a list of nics exist.
  def _CheckNicsBridgesExist(lu, target_nics, target_node,
                                 profile=constants.PP_DEFAULT):
    """Check that the brigdes needed by a list of nics exist.
@@ -610,10 +709,8 @@ def _CheckNicsBridgesExist(lu, target_nics, target_node,
              if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
    if brlist:
      result = lu.rpc.call_bridges_exist(target_node, brlist)
              if params[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED]
    if brlist:
      result = lu.rpc.call_bridges_exist(target_node, brlist)
-    msg = result.RemoteFailMsg()
-    if msg:
-      raise errors.OpPrereqError("Error checking bridges on destination node"
-                                 " '%s': %s" % (target_node, msg))
+    result.Raise("Error checking bridges on destination node '%s'" %
+                 target_node, prereq=True)
  
  
  def _CheckInstanceBridgesExist(lu, instance, node=None):
  
  
  def _CheckInstanceBridgesExist(lu, instance, node=None):
@@ -621,22 +718,117 @@ def _CheckInstanceBridgesExist(lu, instance, node=None):
  
    """
    if node is None:
  
    """
    if node is None:
-    node=instance.primary_node
+    node = instance.primary_node
    _CheckNicsBridgesExist(lu, instance.nics, node)
  
  
    _CheckNicsBridgesExist(lu, instance.nics, node)
  
  
-class LUDestroyCluster(NoHooksLU):
+def _GetNodeInstancesInner(cfg, fn):
+  return [i for i in cfg.GetAllInstancesInfo().values() if fn(i)]
+
+
+def _GetNodeInstances(cfg, node_name):
+  """Returns a list of all primary and secondary instances on a node.
+
+  """
+
+  return _GetNodeInstancesInner(cfg, lambda inst: node_name in inst.all_nodes)
+
+
+def _GetNodePrimaryInstances(cfg, node_name):
+  """Returns primary instances on a node.
+
+  """
+  return _GetNodeInstancesInner(cfg,
+                                lambda inst: node_name == inst.primary_node)
+
+
+def _GetNodeSecondaryInstances(cfg, node_name):
+  """Returns secondary instances on a node.
+
+  """
+  return _GetNodeInstancesInner(cfg,
+                                lambda inst: node_name in inst.secondary_nodes)
+
+
+def _GetStorageTypeArgs(cfg, storage_type):
+  """Returns the arguments for a storage type.
+
+  """
+  # Special case for file storage
+  if storage_type == constants.ST_FILE:
+    # storage.FileStorage wants a list of storage directories
+    return [[cfg.GetFileStorageDir()]]
+
+  return []
+
+
+def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
+  faulty = []
+
+  for dev in instance.disks:
+    cfg.SetDiskID(dev, node_name)
+
+  result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
+  result.Raise("Failed to get disk status from node %s" % node_name,
+               prereq=prereq)
+
+  for idx, bdev_status in enumerate(result.payload):
+    if bdev_status and bdev_status.ldisk_status == constants.LDS_FAULTY:
+      faulty.append(idx)
+
+  return faulty
+
+
+class LUPostInitCluster(LogicalUnit):
+  """Logical unit for running hooks after cluster initialization.
+
+  """
+  HPATH = "cluster-init"
+  HTYPE = constants.HTYPE_CLUSTER
+  _OP_REQP = []
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    """
+    env = {"OP_TARGET": self.cfg.GetClusterName()}
+    mn = self.cfg.GetMasterNode()
+    return env, [], [mn]
+
+  def CheckPrereq(self):
+    """No prerequisites to check.
+
+    """
+    return True
+
+  def Exec(self, feedback_fn):
+    """Nothing to do.
+
+    """
+    return True
+
+
+class LUDestroyCluster(LogicalUnit):
    """Logical unit for destroying the cluster.
  
    """
    """Logical unit for destroying the cluster.
  
    """
+  HPATH = "cluster-destroy"
+  HTYPE = constants.HTYPE_CLUSTER
    _OP_REQP = []
  
    _OP_REQP = []
  
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    """
+    env = {"OP_TARGET": self.cfg.GetClusterName()}
+    return env, [], []
+
    def CheckPrereq(self):
      """Check prerequisites.
  
      This checks whether the cluster is empty.
  
    def CheckPrereq(self):
      """Check prerequisites.
  
      This checks whether the cluster is empty.
  
-    Any errors are signalled by raising errors.OpPrereqError.
+    Any errors are signaled by raising errors.OpPrereqError.
  
      """
      master = self.cfg.GetMasterNode()
  
      """
      master = self.cfg.GetMasterNode()
@@ -655,10 +847,16 @@ class LUDestroyCluster(NoHooksLU):
  
      """
      master = self.cfg.GetMasterNode()
  
      """
      master = self.cfg.GetMasterNode()
+
+    # Run post hooks on master node before it's removed
+    hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
+    try:
+      hm.RunPhase(constants.HOOKS_PHASE_POST, [master])
+    except:
+      self.LogWarning("Errors occurred running hooks on %s" % master)
+
      result = self.rpc.call_node_stop_master(master, False)
      result = self.rpc.call_node_stop_master(master, False)
-    msg = result.RemoteFailMsg()
-    if msg:
-      raise errors.OpExecError("Could not disable the master role: %s" % msg)
+    result.Raise("Could not disable the master role")
      priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
      utils.CreateBackup(priv_key)
      utils.CreateBackup(pub_key)
      priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
      utils.CreateBackup(priv_key)
      utils.CreateBackup(pub_key)
@@ -671,25 +869,89 @@ class LUVerifyCluster(LogicalUnit):
    """
    HPATH = "cluster-verify"
    HTYPE = constants.HTYPE_CLUSTER
    """
    HPATH = "cluster-verify"
    HTYPE = constants.HTYPE_CLUSTER
-  _OP_REQP = ["skip_checks"]
+  _OP_REQP = ["skip_checks", "verbose", "error_codes", "debug_simulate_errors"]
    REQ_BGL = False
  
    REQ_BGL = False
  
+  TCLUSTER = "cluster"
+  TNODE = "node"
+  TINSTANCE = "instance"
+
+  ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
+  EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
+  EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
+  EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
+  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
+  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
+  EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
+  ENODEDRBD = (TNODE, "ENODEDRBD")
+  ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
+  ENODEHOOKS = (TNODE, "ENODEHOOKS")
+  ENODEHV = (TNODE, "ENODEHV")
+  ENODELVM = (TNODE, "ENODELVM")
+  ENODEN1 = (TNODE, "ENODEN1")
+  ENODENET = (TNODE, "ENODENET")
+  ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
+  ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
+  ENODERPC = (TNODE, "ENODERPC")
+  ENODESSH = (TNODE, "ENODESSH")
+  ENODEVERSION = (TNODE, "ENODEVERSION")
+
+  ETYPE_FIELD = "code"
+  ETYPE_ERROR = "ERROR"
+  ETYPE_WARNING = "WARNING"
+
    def ExpandNames(self):
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: locking.ALL_SET,
      }
    def ExpandNames(self):
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: locking.ALL_SET,
      }
-    self.share_locks = dict(((i, 1) for i in locking.LEVELS))
+    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
+
+  def _Error(self, ecode, item, msg, *args, **kwargs):
+    """Format an error message.
+
+    Based on the opcode's error_codes parameter, either format a
+    parseable error code, or a simpler error string.
+
+    This must be called only from Exec and functions called from Exec.
+
+    """
+    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
+    itype, etxt = ecode
+    # first complete the msg
+    if args:
+      msg = msg % args
+    # then format the whole message
+    if self.op.error_codes:
+      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
+    else:
+      if item:
+        item = " " + item
+      else:
+        item = ""
+      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
+    # and finally report it via the feedback_fn
+    self._feedback_fn("  - %s" % msg)
+
+  def _ErrorIf(self, cond, *args, **kwargs):
+    """Log an error message if the passed condition is True.
+
+    """
+    cond = bool(cond) or self.op.debug_simulate_errors
+    if cond:
+      self._Error(*args, **kwargs)
+    # only ERROR-type entries mark the operation as failed, not warnings
+    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
+      self.bad = self.bad or cond
  
    def _VerifyNode(self, nodeinfo, file_list, local_cksum,
  
    def _VerifyNode(self, nodeinfo, file_list, local_cksum,
-                  node_result, feedback_fn, master_files,
-                  drbd_map, vg_name):
+                  node_result, master_files, drbd_map, vg_name):
      """Run multiple tests against a node.
  
      Test list:
  
        - compares ganeti version
      """Run multiple tests against a node.
  
      Test list:
  
        - compares ganeti version
-      - checks vg existance and size > 20G
+      - checks vg existence and size > 20G
        - checks config file checksum
        - checks ssh to other nodes
  
        - checks config file checksum
        - checks ssh to other nodes
  
@@ -698,7 +960,6 @@ class LUVerifyCluster(LogicalUnit):
      @param file_list: required list of files
      @param local_cksum: dictionary of local files and their checksums
      @param node_result: the results from the node
      @param file_list: required list of files
      @param local_cksum: dictionary of local files and their checksums
      @param node_result: the results from the node
-    @param feedback_fn: function used to accumulate results
      @param master_files: list of files that only masters should have
      @param drbd_map: the useddrbd minors for this node, in
          form of minor: (instance, must_exist) which correspond to instances
      @param master_files: list of files that only masters should have
      @param drbd_map: the useddrbd minors for this node, in
          form of minor: (instance, must_exist) which correspond to instances
@@ -707,138 +968,136 @@ class LUVerifyCluster(LogicalUnit):
  
      """
      node = nodeinfo.name
  
      """
      node = nodeinfo.name
+    _ErrorIf = self._ErrorIf
  
      # main result, node_result should be a non-empty dict
  
      # main result, node_result should be a non-empty dict
-    if not node_result or not isinstance(node_result, dict):
-      feedback_fn("  - ERROR: unable to verify node %s." % (node,))
-      return True
+    test = not node_result or not isinstance(node_result, dict)
+    _ErrorIf(test, self.ENODERPC, node,
+                  "unable to verify node: no data returned")
+    if test:
+      return
  
      # compares ganeti version
      local_version = constants.PROTOCOL_VERSION
      remote_version = node_result.get('version', None)
  
      # compares ganeti version
      local_version = constants.PROTOCOL_VERSION
      remote_version = node_result.get('version', None)
-    if not (remote_version and isinstance(remote_version, (list, tuple)) and
-            len(remote_version) == 2):
-      feedback_fn("  - ERROR: connection to %s failed" % (node))
-      return True
-
-    if local_version != remote_version[0]:
-      feedback_fn("  - ERROR: incompatible protocol versions: master %s,"
-                  " node %s %s" % (local_version, node, remote_version[0]))
-      return True
+    test = not (remote_version and
+                isinstance(remote_version, (list, tuple)) and
+                len(remote_version) == 2)
+    _ErrorIf(test, self.ENODERPC, node,
+             "connection to node returned invalid data")
+    if test:
+      return
+
+    test = local_version != remote_version[0]
+    _ErrorIf(test, self.ENODEVERSION, node,
+             "incompatible protocol versions: master %s,"
+             " node %s", local_version, remote_version[0])
+    if test:
+      return
  
      # node seems compatible, we can actually try to look into its results
  
  
      # node seems compatible, we can actually try to look into its results
  
-    bad = False
-
      # full package version
      # full package version
-    if constants.RELEASE_VERSION != remote_version[1]:
-      feedback_fn("  - WARNING: software version mismatch: master %s,"
-                  " node %s %s" %
-                  (constants.RELEASE_VERSION, node, remote_version[1]))
+    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
+                  self.ENODEVERSION, node,
+                  "software version mismatch: master %s, node %s",
+                  constants.RELEASE_VERSION, remote_version[1],
+                  code=self.ETYPE_WARNING)
  
      # checks vg existence and size > 20G
      if vg_name is not None:
        vglist = node_result.get(constants.NV_VGLIST, None)
  
      # checks vg existence and size > 20G
      if vg_name is not None:
        vglist = node_result.get(constants.NV_VGLIST, None)
-      if not vglist:
-        feedback_fn("  - ERROR: unable to check volume groups on node %s." %
-                        (node,))
-        bad = True
-      else:
+      test = not vglist
+      _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
+      if not test:
          vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                                constants.MIN_VG_SIZE)
          vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                                constants.MIN_VG_SIZE)
-        if vgstatus:
-          feedback_fn("  - ERROR: %s on node %s" % (vgstatus, node))
-          bad = True
+        _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
  
      # checks config file checksum
  
      remote_cksum = node_result.get(constants.NV_FILELIST, None)
  
      # checks config file checksum
  
      remote_cksum = node_result.get(constants.NV_FILELIST, None)
-    if not isinstance(remote_cksum, dict):
-      bad = True
-      feedback_fn("  - ERROR: node hasn't returned file checksum data")
-    else:
+    test = not isinstance(remote_cksum, dict)
+    _ErrorIf(test, self.ENODEFILECHECK, node,
+             "node hasn't returned file checksum data")
+    if not test:
        for file_name in file_list:
          node_is_mc = nodeinfo.master_candidate
        for file_name in file_list:
          node_is_mc = nodeinfo.master_candidate
-        must_have_file = file_name not in master_files
-        if file_name not in remote_cksum:
-          if node_is_mc or must_have_file:
-            bad = True
-            feedback_fn("  - ERROR: file '%s' missing" % file_name)
-        elif remote_cksum[file_name] != local_cksum[file_name]:
-          if node_is_mc or must_have_file:
-            bad = True
-            feedback_fn("  - ERROR: file '%s' has wrong checksum" % file_name)
-          else:
-            # not candidate and this is not a must-have file
-            bad = True
-            feedback_fn("  - ERROR: non master-candidate has old/wrong file"
-                        " '%s'" % file_name)
-        else:
-          # all good, except non-master/non-must have combination
-          if not node_is_mc and not must_have_file:
-            feedback_fn("  - ERROR: file '%s' should not exist on non master"
-                        " candidates" % file_name)
+        must_have = (file_name not in master_files) or node_is_mc
+        # missing
+        test1 = file_name not in remote_cksum
+        # invalid checksum
+        test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
+        # existing and good
+        test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
+        _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
+                 "file '%s' missing", file_name)
+        _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
+                 "file '%s' has wrong checksum", file_name)
+        # not candidate and this is not a must-have file
+        _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
+                 "file '%s' should not exist on non master"
+                 " candidates (and the file is outdated)", file_name)
+        # all good, except non-master/non-must have combination
+        _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
+                 "file '%s' should not exist"
+                 " on non master candidates", file_name)
  
      # checks ssh to any
  
  
      # checks ssh to any
  
-    if constants.NV_NODELIST not in node_result:
-      bad = True
-      feedback_fn("  - ERROR: node hasn't returned node ssh connectivity data")
-    else:
+    test = constants.NV_NODELIST not in node_result
+    _ErrorIf(test, self.ENODESSH, node,
+             "node hasn't returned node ssh connectivity data")
+    if not test:
        if node_result[constants.NV_NODELIST]:
        if node_result[constants.NV_NODELIST]:
-        bad = True
-        for node in node_result[constants.NV_NODELIST]:
-          feedback_fn("  - ERROR: ssh communication with node '%s': %s" %
-                          (node, node_result[constants.NV_NODELIST][node]))
-
-    if constants.NV_NODENETTEST not in node_result:
-      bad = True
-      feedback_fn("  - ERROR: node hasn't returned node tcp connectivity data")
-    else:
+        for a_node, a_msg in node_result[constants.NV_NODELIST].items():
+          _ErrorIf(True, self.ENODESSH, node,
+                   "ssh communication with node '%s': %s", a_node, a_msg)
+
+    test = constants.NV_NODENETTEST not in node_result
+    _ErrorIf(test, self.ENODENET, node,
+             "node hasn't returned node tcp connectivity data")
+    if not test:
        if node_result[constants.NV_NODENETTEST]:
        if node_result[constants.NV_NODENETTEST]:
-        bad = True
          nlist = utils.NiceSort(node_result[constants.NV_NODENETTEST].keys())
          nlist = utils.NiceSort(node_result[constants.NV_NODENETTEST].keys())
-        for node in nlist:
-          feedback_fn("  - ERROR: tcp communication with node '%s': %s" %
-                          (node, node_result[constants.NV_NODENETTEST][node]))
+        for anode in nlist:
+          _ErrorIf(True, self.ENODENET, node,
+                   "tcp communication with node '%s': %s",
+                   anode, node_result[constants.NV_NODENETTEST][anode])
  
      hyp_result = node_result.get(constants.NV_HYPERVISOR, None)
      if isinstance(hyp_result, dict):
        for hv_name, hv_result in hyp_result.iteritems():
  
      hyp_result = node_result.get(constants.NV_HYPERVISOR, None)
      if isinstance(hyp_result, dict):
        for hv_name, hv_result in hyp_result.iteritems():
-        if hv_result is not None:
-          feedback_fn("  - ERROR: hypervisor %s verify failure: '%s'" %
-                      (hv_name, hv_result))
+        test = hv_result is not None
+        _ErrorIf(test, self.ENODEHV, node,
+                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
  
      # check used drbd list
      if vg_name is not None:
        used_minors = node_result.get(constants.NV_DRBDLIST, [])
  
      # check used drbd list
      if vg_name is not None:
        used_minors = node_result.get(constants.NV_DRBDLIST, [])
-      if not isinstance(used_minors, (tuple, list)):
-        feedback_fn("  - ERROR: cannot parse drbd status file: %s" %
-                    str(used_minors))
-      else:
+      test = not isinstance(used_minors, (tuple, list))
+      _ErrorIf(test, self.ENODEDRBD, node,
+               "cannot parse drbd status file: %s", str(used_minors))
+      if not test:
          for minor, (iname, must_exist) in drbd_map.items():
          for minor, (iname, must_exist) in drbd_map.items():
-          if minor not in used_minors and must_exist:
-            feedback_fn("  - ERROR: drbd minor %d of instance %s is"
-                        " not active" % (minor, iname))
-            bad = True
+          test = minor not in used_minors and must_exist
+          _ErrorIf(test, self.ENODEDRBD, node,
+                   "drbd minor %d of instance %s is not active",
+                   minor, iname)
          for minor in used_minors:
          for minor in used_minors:
-          if minor not in drbd_map:
-            feedback_fn("  - ERROR: unallocated drbd minor %d is in use" %
-                        minor)
-            bad = True
-
-    return bad
+          test = minor not in drbd_map
+          _ErrorIf(test, self.ENODEDRBD, node,
+                   "unallocated drbd minor %d is in use", minor)
  
    def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
  
    def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
-                      node_instance, feedback_fn, n_offline):
+                      node_instance, n_offline):
      """Verify an instance.
  
      This function checks to see if the required block devices are
      available on the instance's node.
  
      """
      """Verify an instance.
  
      This function checks to see if the required block devices are
      available on the instance's node.
  
      """
-    bad = False
-
+    _ErrorIf = self._ErrorIf
      node_current = instanceconfig.primary_node
  
      node_vol_should = {}
      node_current = instanceconfig.primary_node
  
      node_vol_should = {}
@@ -849,69 +1108,57 @@ class LUVerifyCluster(LogicalUnit):
          # ignore missing volumes on offline nodes
          continue
        for volume in node_vol_should[node]:
          # ignore missing volumes on offline nodes
          continue
        for volume in node_vol_should[node]:
-        if node not in node_vol_is or volume not in node_vol_is[node]:
-          feedback_fn("  - ERROR: volume %s missing on node %s" %
-                          (volume, node))
-          bad = True
+        test = node not in node_vol_is or volume not in node_vol_is[node]
+        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
+                 "volume %s missing on node %s", volume, node)
  
      if instanceconfig.admin_up:
  
      if instanceconfig.admin_up:
-      if ((node_current not in node_instance or
-          not instance in node_instance[node_current]) and
-          node_current not in n_offline):
-        feedback_fn("  - ERROR: instance %s not running on node %s" %
-                        (instance, node_current))
-        bad = True
+      test = ((node_current not in node_instance or
+               not instance in node_instance[node_current]) and
+              node_current not in n_offline)
+      _ErrorIf(test, self.EINSTANCEDOWN, instance,
+               "instance not running on its primary node %s",
+               node_current)
  
      for node in node_instance:
        if (not node == node_current):
  
      for node in node_instance:
        if (not node == node_current):
-        if instance in node_instance[node]:
-          feedback_fn("  - ERROR: instance %s should not run on node %s" %
-                          (instance, node))
-          bad = True
+        test = instance in node_instance[node]
+        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
+                 "instance should not run on node %s", node)
  
  
-    return bad
-
-  def _VerifyOrphanVolumes(self, node_vol_should, node_vol_is, feedback_fn):
+  def _VerifyOrphanVolumes(self, node_vol_should, node_vol_is):
      """Verify if there are any unknown volumes in the cluster.
  
      The .os, .swap and backup volumes are ignored. All other volumes are
      reported as unknown.
  
      """
      """Verify if there are any unknown volumes in the cluster.
  
      The .os, .swap and backup volumes are ignored. All other volumes are
      reported as unknown.
  
      """
-    bad = False
-
      for node in node_vol_is:
        for volume in node_vol_is[node]:
      for node in node_vol_is:
        for volume in node_vol_is[node]:
-        if node not in node_vol_should or volume not in node_vol_should[node]:
-          feedback_fn("  - ERROR: volume %s on node %s should not exist" %
-                      (volume, node))
-          bad = True
-    return bad
+        test = (node not in node_vol_should or
+                volume not in node_vol_should[node])
+        self._ErrorIf(test, self.ENODEORPHANLV, node,
+                      "volume %s is unknown", volume)
  
  
-  def _VerifyOrphanInstances(self, instancelist, node_instance, feedback_fn):
+  def _VerifyOrphanInstances(self, instancelist, node_instance):
      """Verify the list of running instances.
  
      This checks what instances are running but unknown to the cluster.
  
      """
      """Verify the list of running instances.
  
      This checks what instances are running but unknown to the cluster.
  
      """
-    bad = False
      for node in node_instance:
      for node in node_instance:
-      for runninginstance in node_instance[node]:
-        if runninginstance not in instancelist:
-          feedback_fn("  - ERROR: instance %s on node %s should not exist" %
-                          (runninginstance, node))
-          bad = True
-    return bad
-
-  def _VerifyNPlusOneMemory(self, node_info, instance_cfg, feedback_fn):
+      for o_inst in node_instance[node]:
+        test = o_inst not in instancelist
+        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
+                      "instance %s on node %s should not exist", o_inst, node)
+
+  def _VerifyNPlusOneMemory(self, node_info, instance_cfg):
      """Verify N+1 Memory Resilience.
  
      Check that if one single node dies we can still start all the instances it
      was primary for.
  
      """
      """Verify N+1 Memory Resilience.
  
      Check that if one single node dies we can still start all the instances it
      was primary for.
  
      """
-    bad = False
-
      for node, nodeinfo in node_info.iteritems():
        # This code checks that every node which is now listed as secondary has
        # enough memory to host all instances it is supposed to should a single
      for node, nodeinfo in node_info.iteritems():
        # This code checks that every node which is now listed as secondary has
        # enough memory to host all instances it is supposed to should a single
@@ -927,11 +1174,10 @@ class LUVerifyCluster(LogicalUnit):
            bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
            if bep[constants.BE_AUTO_BALANCE]:
              needed_mem += bep[constants.BE_MEMORY]
            bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
            if bep[constants.BE_AUTO_BALANCE]:
              needed_mem += bep[constants.BE_MEMORY]
-        if nodeinfo['mfree'] < needed_mem:
-          feedback_fn("  - ERROR: not enough memory on node %s to accomodate"
-                      " failovers should node %s fail" % (node, prinode))
-          bad = True
-    return bad
+        test = nodeinfo['mfree'] < needed_mem
+        self._ErrorIf(test, self.ENODEN1, node,
+                      "not enough memory on to accommodate"
+                      " failovers should peer node %s fail", prinode)
  
    def CheckPrereq(self):
      """Check prerequisites.
  
    def CheckPrereq(self):
      """Check prerequisites.
@@ -947,7 +1193,7 @@ class LUVerifyCluster(LogicalUnit):
    def BuildHooksEnv(self):
      """Build hooks env.
  
    def BuildHooksEnv(self):
      """Build hooks env.
  
-    Cluster-Verify hooks just rone in the post phase and their failure makes
+    Cluster-Verify hooks run only in the post phase and their failure makes
      the output be logged in the verify output and the verification to fail.
  
      """
      the output be logged in the verify output and the verification to fail.
  
      """
@@ -964,10 +1210,13 @@ class LUVerifyCluster(LogicalUnit):
      """Verify integrity of cluster, performing various test on nodes.
  
      """
      """Verify integrity of cluster, performing various test on nodes.
  
      """
-    bad = False
+    self.bad = False
+    _ErrorIf = self._ErrorIf
+    verbose = self.op.verbose
+    self._feedback_fn = feedback_fn
      feedback_fn("* Verifying global settings")
      for msg in self.cfg.VerifyConfig():
      feedback_fn("* Verifying global settings")
      for msg in self.cfg.VerifyConfig():
-      feedback_fn("  - ERROR: %s" % msg)
+      _ErrorIf(True, self.ECLUSTERCFG, None, msg)
  
      vg_name = self.cfg.GetVGName()
      hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
  
      vg_name = self.cfg.GetVGName()
      hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
@@ -1020,11 +1269,13 @@ class LUVerifyCluster(LogicalUnit):
      master_node = self.cfg.GetMasterNode()
      all_drbd_map = self.cfg.ComputeDRBDMap()
  
      master_node = self.cfg.GetMasterNode()
      all_drbd_map = self.cfg.ComputeDRBDMap()
  
+    feedback_fn("* Verifying node status")
      for node_i in nodeinfo:
        node = node_i.name
  
        if node_i.offline:
      for node_i in nodeinfo:
        node = node_i.name
  
        if node_i.offline:
-        feedback_fn("* Skipping offline node %s" % (node,))
+        if verbose:
+          feedback_fn("* Skipping offline node %s" % (node,))
          n_offline.append(node)
          continue
  
          n_offline.append(node)
          continue
  
@@ -1037,62 +1288,59 @@ class LUVerifyCluster(LogicalUnit):
          n_drained.append(node)
        else:
          ntype = "regular"
          n_drained.append(node)
        else:
          ntype = "regular"
-      feedback_fn("* Verifying node %s (%s)" % (node, ntype))
+      if verbose:
+        feedback_fn("* Verifying node %s (%s)" % (node, ntype))
  
  
-      msg = all_nvinfo[node].RemoteFailMsg()
+      msg = all_nvinfo[node].fail_msg
+      _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
        if msg:
        if msg:
-        feedback_fn("  - ERROR: while contacting node %s: %s" % (node, msg))
-        bad = True
          continue
  
        nresult = all_nvinfo[node].payload
        node_drbd = {}
        for minor, instance in all_drbd_map[node].items():
          continue
  
        nresult = all_nvinfo[node].payload
        node_drbd = {}
        for minor, instance in all_drbd_map[node].items():
-        if instance not in instanceinfo:
-          feedback_fn("  - ERROR: ghost instance '%s' in temporary DRBD map" %
-                      instance)
+        test = instance not in instanceinfo
+        _ErrorIf(test, self.ECLUSTERCFG, None,
+                 "ghost instance '%s' in temporary DRBD map", instance)
            # ghost instance should not be running, but otherwise we
            # don't give double warnings (both ghost instance and
            # unallocated minor in use)
            # ghost instance should not be running, but otherwise we
            # don't give double warnings (both ghost instance and
            # unallocated minor in use)
+        if test:
            node_drbd[minor] = (instance, False)
          else:
            instance = instanceinfo[instance]
            node_drbd[minor] = (instance.name, instance.admin_up)
            node_drbd[minor] = (instance, False)
          else:
            instance = instanceinfo[instance]
            node_drbd[minor] = (instance.name, instance.admin_up)
-      result = self._VerifyNode(node_i, file_names, local_checksums,
-                                nresult, feedback_fn, master_files,
-                                node_drbd, vg_name)
-      bad = bad or result
+      self._VerifyNode(node_i, file_names, local_checksums,
+                       nresult, master_files, node_drbd, vg_name)
  
        lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
        if vg_name is None:
          node_volume[node] = {}
        elif isinstance(lvdata, basestring):
  
        lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
        if vg_name is None:
          node_volume[node] = {}
        elif isinstance(lvdata, basestring):
-        feedback_fn("  - ERROR: LVM problem on node %s: %s" %
-                    (node, utils.SafeEncode(lvdata)))
-        bad = True
+        _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
+                 utils.SafeEncode(lvdata))
          node_volume[node] = {}
        elif not isinstance(lvdata, dict):
          node_volume[node] = {}
        elif not isinstance(lvdata, dict):
-        feedback_fn("  - ERROR: connection to %s failed (lvlist)" % (node,))
-        bad = True
+        _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
          continue
        else:
          node_volume[node] = lvdata
  
        # node_instance
        idata = nresult.get(constants.NV_INSTANCELIST, None)
          continue
        else:
          node_volume[node] = lvdata
  
        # node_instance
        idata = nresult.get(constants.NV_INSTANCELIST, None)
-      if not isinstance(idata, list):
-        feedback_fn("  - ERROR: connection to %s failed (instancelist)" %
-                    (node,))
-        bad = True
+      test = not isinstance(idata, list)
+      _ErrorIf(test, self.ENODEHV, node,
+               "rpc call to node failed (instancelist)")
+      if test:
          continue
  
        node_instance[node] = idata
  
        # node_info
        nodeinfo = nresult.get(constants.NV_HVINFO, None)
          continue
  
        node_instance[node] = idata
  
        # node_info
        nodeinfo = nresult.get(constants.NV_HVINFO, None)
-      if not isinstance(nodeinfo, dict):
-        feedback_fn("  - ERROR: connection to %s failed (hvinfo)" % (node,))
-        bad = True
+      test = not isinstance(nodeinfo, dict)
+      _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
+      if test:
          continue
  
        try:
          continue
  
        try:
@@ -1110,28 +1358,28 @@ class LUVerifyCluster(LogicalUnit):
          }
          # FIXME: devise a free space model for file based instances as well
          if vg_name is not None:
          }
          # FIXME: devise a free space model for file based instances as well
          if vg_name is not None:
-          if (constants.NV_VGLIST not in nresult or
-              vg_name not in nresult[constants.NV_VGLIST]):
-            feedback_fn("  - ERROR: node %s didn't return data for the"
-                        " volume group '%s' - it is either missing or broken" %
-                        (node, vg_name))
-            bad = True
+          test = (constants.NV_VGLIST not in nresult or
+                  vg_name not in nresult[constants.NV_VGLIST])
+          _ErrorIf(test, self.ENODELVM, node,
+                   "node didn't return data for the volume group '%s'"
+                   " - it is either missing or broken", vg_name)
+          if test:
              continue
            node_info[node]["dfree"] = int(nresult[constants.NV_VGLIST][vg_name])
        except (ValueError, KeyError):
              continue
            node_info[node]["dfree"] = int(nresult[constants.NV_VGLIST][vg_name])
        except (ValueError, KeyError):
-        feedback_fn("  - ERROR: invalid nodeinfo value returned"
-                    " from node %s" % (node,))
-        bad = True
+        _ErrorIf(True, self.ENODERPC, node,
+                 "node returned invalid nodeinfo, check lvm/hypervisor")
          continue
  
      node_vol_should = {}
  
          continue
  
      node_vol_should = {}
  
+    feedback_fn("* Verifying instance status")
      for instance in instancelist:
      for instance in instancelist:
-      feedback_fn("* Verifying instance %s" % instance)
+      if verbose:
+        feedback_fn("* Verifying instance %s" % instance)
        inst_config = instanceinfo[instance]
        inst_config = instanceinfo[instance]
-      result =  self._VerifyInstance(instance, inst_config, node_volume,
-                                     node_instance, feedback_fn, n_offline)
-      bad = bad or result
+      self._VerifyInstance(instance, inst_config, node_volume,
+                           node_instance, n_offline)
        inst_nodes_offline = []
  
        inst_config.MapLVsByNode(node_vol_should)
        inst_nodes_offline = []
  
        inst_config.MapLVsByNode(node_vol_should)
@@ -1139,12 +1387,11 @@ class LUVerifyCluster(LogicalUnit):
        instance_cfg[instance] = inst_config
  
        pnode = inst_config.primary_node
        instance_cfg[instance] = inst_config
  
        pnode = inst_config.primary_node
+      _ErrorIf(pnode not in node_info and pnode not in n_offline,
+               self.ENODERPC, pnode, "instance %s, connection to"
+               " primary node failed", instance)
        if pnode in node_info:
          node_info[pnode]['pinst'].append(instance)
        if pnode in node_info:
          node_info[pnode]['pinst'].append(instance)
-      elif pnode not in n_offline:
-        feedback_fn("  - ERROR: instance %s, connection to primary node"
-                    " %s failed" % (instance, pnode))
-        bad = True
  
        if pnode in n_offline:
          inst_nodes_offline.append(pnode)
  
        if pnode in n_offline:
          inst_nodes_offline.append(pnode)
@@ -1156,46 +1403,42 @@ class LUVerifyCluster(LogicalUnit):
        # FIXME: does not support file-backed instances
        if len(inst_config.secondary_nodes) == 0:
          i_non_redundant.append(instance)
        # FIXME: does not support file-backed instances
        if len(inst_config.secondary_nodes) == 0:
          i_non_redundant.append(instance)
-      elif len(inst_config.secondary_nodes) > 1:
-        feedback_fn("  - WARNING: multiple secondaries for instance %s"
-                    % instance)
+      _ErrorIf(len(inst_config.secondary_nodes) > 1,
+               self.EINSTANCELAYOUT, instance,
+               "instance has multiple secondary nodes", code="WARNING")
  
        if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
          i_non_a_balanced.append(instance)
  
        for snode in inst_config.secondary_nodes:
  
        if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
          i_non_a_balanced.append(instance)
  
        for snode in inst_config.secondary_nodes:
+        _ErrorIf(snode not in node_info and snode not in n_offline,
+                 self.ENODERPC, snode,
+                 "instance %s, connection to secondary node"
+                 " failed", instance)
+
          if snode in node_info:
            node_info[snode]['sinst'].append(instance)
            if pnode not in node_info[snode]['sinst-by-pnode']:
              node_info[snode]['sinst-by-pnode'][pnode] = []
            node_info[snode]['sinst-by-pnode'][pnode].append(instance)
          if snode in node_info:
            node_info[snode]['sinst'].append(instance)
            if pnode not in node_info[snode]['sinst-by-pnode']:
              node_info[snode]['sinst-by-pnode'][pnode] = []
            node_info[snode]['sinst-by-pnode'][pnode].append(instance)
-        elif snode not in n_offline:
-          feedback_fn("  - ERROR: instance %s, connection to secondary node"
-                      " %s failed" % (instance, snode))
-          bad = True
+
          if snode in n_offline:
            inst_nodes_offline.append(snode)
  
          if snode in n_offline:
            inst_nodes_offline.append(snode)
  
-      if inst_nodes_offline:
-        # warn that the instance lives on offline nodes, and set bad=True
-        feedback_fn("  - ERROR: instance lives on offline node(s) %s" %
-                    ", ".join(inst_nodes_offline))
-        bad = True
+      # warn that the instance lives on offline nodes
+      _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
+               "instance lives on offline node(s) %s",
+               ", ".join(inst_nodes_offline))
  
      feedback_fn("* Verifying orphan volumes")
  
      feedback_fn("* Verifying orphan volumes")
-    result = self._VerifyOrphanVolumes(node_vol_should, node_volume,
-                                       feedback_fn)
-    bad = bad or result
+    self._VerifyOrphanVolumes(node_vol_should, node_volume)
  
      feedback_fn("* Verifying remaining instances")
  
      feedback_fn("* Verifying remaining instances")
-    result = self._VerifyOrphanInstances(instancelist, node_instance,
-                                         feedback_fn)
-    bad = bad or result
+    self._VerifyOrphanInstances(instancelist, node_instance)
  
      if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
        feedback_fn("* Verifying N+1 Memory redundancy")
  
      if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
        feedback_fn("* Verifying N+1 Memory redundancy")
-      result = self._VerifyNPlusOneMemory(node_info, instance_cfg, feedback_fn)
-      bad = bad or result
+      self._VerifyNPlusOneMemory(node_info, instance_cfg)
  
      feedback_fn("* Other Notes")
      if i_non_redundant:
  
      feedback_fn("* Other Notes")
      if i_non_redundant:
@@ -1212,10 +1455,10 @@ class LUVerifyCluster(LogicalUnit):
      if n_drained:
        feedback_fn("  - NOTICE: %d drained node(s) found." % len(n_drained))
  
      if n_drained:
        feedback_fn("  - NOTICE: %d drained node(s) found." % len(n_drained))
  
-    return not bad
+    return not self.bad
  
    def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
  
    def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
-    """Analize the post-hooks' result
+    """Analyze the post-hooks' result
  
      This method analyses the hook result, handles it, and sends some
      nicely-formatted feedback back to the user.
  
      This method analyses the hook result, handles it, and sends some
      nicely-formatted feedback back to the user.
@@ -1235,31 +1478,28 @@ class LUVerifyCluster(LogicalUnit):
        # Used to change hooks' output to proper indentation
        indent_re = re.compile('^', re.M)
        feedback_fn("* Hooks Results")
        # Used to change hooks' output to proper indentation
        indent_re = re.compile('^', re.M)
        feedback_fn("* Hooks Results")
-      if not hooks_results:
-        feedback_fn("  - ERROR: general communication failure")
-        lu_result = 1
-      else:
-        for node_name in hooks_results:
-          show_node_header = True
-          res = hooks_results[node_name]
-          if res.failed or res.data is False or not isinstance(res.data, list):
-            if res.offline:
-              # no need to warn or set fail return value
-              continue
-            feedback_fn("    Communication failure in hooks execution")
+      assert hooks_results, "invalid result from hooks"
+
+      for node_name in hooks_results:
+        show_node_header = True
+        res = hooks_results[node_name]
+        msg = res.fail_msg
+        test = msg and not res.offline
+        self._ErrorIf(test, self.ENODEHOOKS, node_name,
+                      "Communication failure in hooks execution: %s", msg)
+        if test:
+          # override manually lu_result here as _ErrorIf only
+          # overrides self.bad
+          lu_result = 1
+          continue
+        for script, hkr, output in res.payload:
+          test = hkr == constants.HKR_FAIL
+          self._ErrorIf(test, self.ENODEHOOKS, node_name,
+                        "Script %s failed, output:", script)
+          if test:
+            output = indent_re.sub('      ', output)
+            feedback_fn("%s" % output)
              lu_result = 1
              lu_result = 1
-            continue
-          for script, hkr, output in res.data:
-            if hkr == constants.HKR_FAIL:
-              # The node header is only shown once, if there are
-              # failing hooks on that node
-              if show_node_header:
-                feedback_fn("  Node %s:" % node_name)
-                show_node_header = False
-              feedback_fn("    ERROR: Script %s failed, output:" % script)
-              output = indent_re.sub('      ', output)
-              feedback_fn("%s" % output)
-              lu_result = 1
  
        return lu_result
  
  
        return lu_result
  
@@ -1276,7 +1516,7 @@ class LUVerifyDisks(NoHooksLU):
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: locking.ALL_SET,
      }
        locking.LEVEL_NODE: locking.ALL_SET,
        locking.LEVEL_INSTANCE: locking.ALL_SET,
      }
-    self.share_locks = dict(((i, 1) for i in locking.LEVELS))
+    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
  
    def CheckPrereq(self):
      """Check prerequisites.
  
    def CheckPrereq(self):
      """Check prerequisites.
@@ -1317,15 +1557,14 @@ class LUVerifyDisks(NoHooksLU):
      if not nv_dict:
        return result
  
      if not nv_dict:
        return result
  
-    node_lvs = self.rpc.call_volume_list(nodes, vg_name)
+    node_lvs = self.rpc.call_lv_list(nodes, vg_name)
  
  
-    to_act = set()
      for node in nodes:
        # node_volume
        node_res = node_lvs[node]
        if node_res.offline:
          continue
      for node in nodes:
        # node_volume
        node_res = node_lvs[node]
        if node_res.offline:
          continue
-      msg = node_res.RemoteFailMsg()
+      msg = node_res.fail_msg
        if msg:
          logging.warning("Error enumerating LVs on node %s: %s", node, msg)
          res_nodes[node] = msg
        if msg:
          logging.warning("Error enumerating LVs on node %s: %s", node, msg)
          res_nodes[node] = msg
@@ -1348,6 +1587,127 @@ class LUVerifyDisks(NoHooksLU):
      return result
  
  
      return result
  
  
+class LURepairDiskSizes(NoHooksLU):
+  """Verify and repair the recorded sizes of instance disks.
+
+  For each selected instance (or for all instances, if none are given)
+  the disk sizes recorded in the configuration are compared with the
+  sizes reported by the instance's primary node, and mismatches are
+  corrected in the configuration.
+
+  """
+  _OP_REQP = ["instances"]
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    """Expand the requested instance names and declare the needed locks.
+
+    @raise errors.OpPrereqError: if the 'instances' parameter is not a
+        list or if one of the given names cannot be expanded
+
+    """
+    if not isinstance(self.op.instances, list):
+      raise errors.OpPrereqError("Invalid argument type 'instances'")
+
+    if self.op.instances:
+      self.wanted_names = []
+      for name in self.op.instances:
+        full_name = self.cfg.ExpandInstanceName(name)
+        if full_name is None:
+          raise errors.OpPrereqError("Instance '%s' not known" % name)
+        self.wanted_names.append(full_name)
+      # node locks are left empty here and filled in by DeclareLocks
+      self.needed_locks = {
+        locking.LEVEL_NODE: [],
+        locking.LEVEL_INSTANCE: self.wanted_names,
+        }
+      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
+    else:
+      # no explicit list given: wanted_names is resolved in CheckPrereq
+      self.wanted_names = None
+      self.needed_locks = {
+        locking.LEVEL_NODE: locking.ALL_SET,
+        locking.LEVEL_INSTANCE: locking.ALL_SET,
+        }
+    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
+
+  def DeclareLocks(self, level):
+    """Compute the node locks when an explicit instance list was given.
+
+    @param level: the locking level being declared
+
+    """
+    if level == locking.LEVEL_NODE and self.wanted_names is not None:
+      self._LockInstancesNodes(primary_only=True)
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This resolves the final list of instance names (falling back to the
+    acquired instance locks if no names were given) and loads the
+    corresponding instance objects into C{self.wanted_instances}.
+
+    """
+    if self.wanted_names is None:
+      self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
+
+    self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
+                             in self.wanted_names]
+
+  def _EnsureChildSizes(self, disk):
+    """Ensure children of the disk have the needed disk size.
+
+    This is valid mainly for DRBD8 and fixes an issue where the
+    children have smaller disk size.
+
+    @param disk: an L{ganeti.objects.Disk} object
+    @rtype: boolean
+    @return: True if the size of any child was changed, False otherwise
+
+    """
+    if disk.dev_type == constants.LD_DRBD8:
+      assert disk.children, "Empty children for DRBD8?"
+      fchild = disk.children[0]
+      mismatch = fchild.size < disk.size
+      if mismatch:
+        self.LogInfo("Child disk has size %d, parent %d, fixing",
+                     fchild.size, disk.size)
+        fchild.size = disk.size
+
+      # and we recurse on this child only, not on the metadev
+      return self._EnsureChildSizes(fchild) or mismatch
+    else:
+      return False
+
+  def Exec(self, feedback_fn):
+    """Verify the size of cluster disks.
+
+    @param feedback_fn: feedback function (not used by this LU)
+    @rtype: list
+    @return: list of (instance name, disk index, size) tuples, one for
+        each correction written to the configuration
+
+    """
+    # TODO: check child disks too
+    # TODO: check differences in size between primary/secondary nodes
+    # group the disks by primary node, so that each node is queried once
+    per_node_disks = {}
+    for instance in self.wanted_instances:
+      pnode = instance.primary_node
+      node_disks = per_node_disks.setdefault(pnode, [])
+      for idx, disk in enumerate(instance.disks):
+        node_disks.append((instance, idx, disk))
+
+    changed = []
+    for node, dskl in per_node_disks.items():
+      # the RPC call is made on copies; presumably so that SetDiskID does
+      # not alter the configuration objects (confirm against SetDiskID)
+      newl = [v[2].Copy() for v in dskl]
+      for dsk in newl:
+        self.cfg.SetDiskID(dsk, node)
+      result = self.rpc.call_blockdev_getsizes(node, newl)
+      if result.fail_msg:
+        self.LogWarning("Failure in blockdev_getsizes call to node"
+                        " %s, ignoring", node)
+        continue
+      # as elsewhere in this module, the answer of a successful call is
+      # read from the payload attribute once fail_msg has been checked
+      sizes = result.payload
+      if not isinstance(sizes, (list, tuple)) or len(sizes) != len(dskl):
+        self.LogWarning("Invalid result from node %s, ignoring node results",
+                        node)
+        continue
+      for ((instance, idx, disk), size) in zip(dskl, sizes):
+        if size is None:
+          self.LogWarning("Disk %d of instance %s did not return size"
+                          " information, ignoring", idx, instance.name)
+          continue
+        if not isinstance(size, (int, long)):
+          self.LogWarning("Disk %d of instance %s did not return valid"
+                          " size information, ignoring", idx, instance.name)
+          continue
+        # divide by 2**20 (presumably bytes to MiB, the unit of disk.size;
+        # confirm against blockdev_getsizes)
+        size = size >> 20
+        if size != disk.size:
+          self.LogInfo("Disk %d of instance %s has mismatched size,"
+                       " correcting: recorded %d, actual %d", idx,
+                       instance.name, disk.size, size)
+          disk.size = size
+          self.cfg.Update(instance)
+          changed.append((instance.name, idx, size))
+        if self._EnsureChildSizes(disk):
+          self.cfg.Update(instance)
+          changed.append((instance.name, idx, disk.size))
+    return changed
+
+
  class LURenameCluster(LogicalUnit):
    """Rename the cluster.
  
  class LURenameCluster(LogicalUnit):
    """Rename the cluster.
  
@@ -1398,9 +1758,7 @@ class LURenameCluster(LogicalUnit):
      # shutdown the master IP
      master = self.cfg.GetMasterNode()
      result = self.rpc.call_node_stop_master(master, False)
      # shutdown the master IP
      master = self.cfg.GetMasterNode()
      result = self.rpc.call_node_stop_master(master, False)
-    msg = result.RemoteFailMsg()
-    if msg:
-      raise errors.OpExecError("Could not disable the master role: %s" % msg)
+    result.Raise("Could not disable the master role")
  
      try:
        cluster = self.cfg.GetClusterInfo()
  
      try:
        cluster = self.cfg.GetClusterInfo()
@@ -1418,15 +1776,15 @@ class LURenameCluster(LogicalUnit):
        result = self.rpc.call_upload_file(node_list,
                                           constants.SSH_KNOWN_HOSTS_FILE)
        for to_node, to_result in result.iteritems():
        result = self.rpc.call_upload_file(node_list,
                                           constants.SSH_KNOWN_HOSTS_FILE)
        for to_node, to_result in result.iteritems():
-         msg = to_result.RemoteFailMsg()
-         if msg:
-           msg = ("Copy of file %s to node %s failed: %s" %
-                   (constants.SSH_KNOWN_HOSTS_FILE, to_node, msg))
-           self.proc.LogWarning(msg)
+        msg = to_result.fail_msg
+        if msg:
+          msg = ("Copy of file %s to node %s failed: %s" %
+                 (constants.SSH_KNOWN_HOSTS_FILE, to_node, msg))
+          self.proc.LogWarning(msg)
  
      finally:
  
      finally:
-      result = self.rpc.call_node_start_master(master, False)
-      msg = result.RemoteFailMsg()
+      result = self.rpc.call_node_start_master(master, False, False)
+      msg = result.fail_msg
        if msg:
          self.LogWarning("Could not re-enable the master role on"
                          " the master, please restart manually: %s", msg)
        if msg:
          self.LogWarning("Could not re-enable the master role on"
                          " the master, please restart manually: %s", msg)
@@ -1437,7 +1795,7 @@ def _RecursiveCheckIfLVMBased(disk):
  
    @type disk: L{objects.Disk}
    @param disk: the disk to check
  
    @type disk: L{objects.Disk}
    @param disk: the disk to check
-  @rtype: booleean
+  @rtype: boolean
    @return: boolean indicating whether a LD_LV dev_type was found or not
  
    """
    @return: boolean indicating whether a LD_LV dev_type was found or not
  
    """
@@ -1512,7 +1870,7 @@ class LUSetClusterParams(LogicalUnit):
      if self.op.vg_name:
        vglist = self.rpc.call_vg_list(node_list)
        for node in node_list:
      if self.op.vg_name:
        vglist = self.rpc.call_vg_list(node_list)
        for node in node_list:
-        msg = vglist[node].RemoteFailMsg()
+        msg = vglist[node].fail_msg
          if msg:
            # ignoring down node
            self.LogWarning("Error while gathering data on node %s"
          if msg:
            # ignoring down node
            self.LogWarning("Error while gathering data on node %s"
@@ -1551,6 +1909,14 @@ class LUSetClusterParams(LogicalUnit):
  
      if self.op.enabled_hypervisors is not None:
        self.hv_list = self.op.enabled_hypervisors
  
      if self.op.enabled_hypervisors is not None:
        self.hv_list = self.op.enabled_hypervisors
+      if not self.hv_list:
+        raise errors.OpPrereqError("Enabled hypervisors list must contain at"
+                                   " least one member")
+      invalid_hvs = set(self.hv_list) - constants.HYPER_TYPES
+      if invalid_hvs:
+        raise errors.OpPrereqError("Enabled hypervisors contains invalid"
+                                   " entries: %s" %
+                                   utils.CommaJoin(invalid_hvs))
      else:
        self.hv_list = cluster.enabled_hypervisors
  
      else:
        self.hv_list = cluster.enabled_hypervisors
  
@@ -1590,14 +1956,11 @@ class LUSetClusterParams(LogicalUnit):
  
      if self.op.candidate_pool_size is not None:
        self.cluster.candidate_pool_size = self.op.candidate_pool_size
  
      if self.op.candidate_pool_size is not None:
        self.cluster.candidate_pool_size = self.op.candidate_pool_size
+      # we need to update the pool size here, otherwise the save will fail
+      _AdjustCandidatePool(self, [])
  
      self.cfg.Update(self.cluster)
  
  
      self.cfg.Update(self.cluster)
  
-    # we want to update nodes after the cluster so that if any errors
-    # happen, we have recorded and saved the cluster info
-    if self.op.candidate_pool_size is not None:
-      _AdjustCandidatePool(self)
-
  
  def _RedistributeAncillaryFiles(lu, additional_nodes=None):
    """Distribute additional files which are part of the cluster configuration.
  
  def _RedistributeAncillaryFiles(lu, additional_nodes=None):
    """Distribute additional files which are part of the cluster configuration.
@@ -1622,6 +1985,7 @@ def _RedistributeAncillaryFiles(lu, additional_nodes=None):
                      constants.SSH_KNOWN_HOSTS_FILE,
                      constants.RAPI_CERT_FILE,
                      constants.RAPI_USERS_FILE,
                      constants.SSH_KNOWN_HOSTS_FILE,
                      constants.RAPI_CERT_FILE,
                      constants.RAPI_USERS_FILE,
+                    constants.HMAC_CLUSTER_KEY,
                     ])
  
    enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
                     ])
  
    enabled_hypervisors = lu.cfg.GetClusterInfo().enabled_hypervisors
@@ -1634,11 +1998,11 @@ def _RedistributeAncillaryFiles(lu, additional_nodes=None):
      if os.path.exists(fname):
        result = lu.rpc.call_upload_file(dist_nodes, fname)
        for to_node, to_result in result.items():
      if os.path.exists(fname):
        result = lu.rpc.call_upload_file(dist_nodes, fname)
        for to_node, to_result in result.items():
-         msg = to_result.RemoteFailMsg()
-         if msg:
-           msg = ("Copy of file %s to node %s failed: %s" %
-                   (fname, to_node, msg))
-           lu.proc.LogWarning(msg)
+        msg = to_result.fail_msg
+        if msg:
+          msg = ("Copy of file %s to node %s failed: %s" %
+                 (fname, to_node, msg))
+          lu.proc.LogWarning(msg)
  
  
  class LURedistributeConfig(NoHooksLU):
  
  
  class LURedistributeConfig(NoHooksLU):
@@ -1685,12 +2049,13 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False):
      lu.cfg.SetDiskID(dev, node)
  
    retries = 0
      lu.cfg.SetDiskID(dev, node)
  
    retries = 0
+  degr_retries = 10 # in seconds, as we sleep 1 second each time
    while True:
      max_time = 0
      done = True
      cumul_degraded = False
      rstats = lu.rpc.call_blockdev_getmirrorstatus(node, instance.disks)
    while True:
      max_time = 0
      done = True
      cumul_degraded = False
      rstats = lu.rpc.call_blockdev_getmirrorstatus(node, instance.disks)
-    msg = rstats.RemoteFailMsg()
+    msg = rstats.fail_msg
      if msg:
        lu.LogWarning("Can't get any data from node %s: %s", node, msg)
        retries += 1
      if msg:
        lu.LogWarning("Can't get any data from node %s: %s", node, msg)
        retries += 1
@@ -1706,18 +2071,29 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False):
          lu.LogWarning("Can't compute data for node %s/%s",
                             node, instance.disks[i].iv_name)
          continue
          lu.LogWarning("Can't compute data for node %s/%s",
                             node, instance.disks[i].iv_name)
          continue
-      # we ignore the ldisk parameter
-      perc_done, est_time, is_degraded, _ = mstat
-      cumul_degraded = cumul_degraded or (is_degraded and perc_done is None)
-      if perc_done is not None:
+
+      cumul_degraded = (cumul_degraded or
+                        (mstat.is_degraded and mstat.sync_percent is None))
+      if mstat.sync_percent is not None:
          done = False
          done = False
-        if est_time is not None:
-          rem_time = "%d estimated seconds remaining" % est_time
-          max_time = est_time
+        if mstat.estimated_time is not None:
+          rem_time = "%d estimated seconds remaining" % mstat.estimated_time
+          max_time = mstat.estimated_time
          else:
            rem_time = "no time estimate"
          lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
          else:
            rem_time = "no time estimate"
          lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
-                        (instance.disks[i].iv_name, perc_done, rem_time))
+                        (instance.disks[i].iv_name, mstat.sync_percent,
+                         rem_time))
+
+    # if we're done but degraded, let's do a few small retries, to
+    # make sure we see a stable and not transient situation; therefore
+    # we force restart of the loop
+    if (done or oneshot) and cumul_degraded and degr_retries > 0:
+      logging.info("Degraded disks found, %d retries left", degr_retries)
+      degr_retries -= 1
+      time.sleep(1)
+      continue
+
      if done or oneshot:
        break
  
      if done or oneshot:
        break
  
@@ -1737,15 +2113,12 @@ def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
  
    """
    lu.cfg.SetDiskID(dev, node)
  
    """
    lu.cfg.SetDiskID(dev, node)
-  if ldisk:
-    idx = 6
-  else:
-    idx = 5
  
    result = True
  
    result = True
+
    if on_primary or dev.AssembleOnSecondary():
      rstats = lu.rpc.call_blockdev_find(node, dev)
    if on_primary or dev.AssembleOnSecondary():
      rstats = lu.rpc.call_blockdev_find(node, dev)
-    msg = rstats.RemoteFailMsg()
+    msg = rstats.fail_msg
      if msg:
        lu.LogWarning("Can't find disk on node %s: %s", node, msg)
        result = False
      if msg:
        lu.LogWarning("Can't find disk on node %s: %s", node, msg)
        result = False
@@ -1753,7 +2126,11 @@ def _CheckDiskConsistency(lu, dev, node, on_primary, ldisk=False):
        lu.LogWarning("Can't find disk on node %s", node)
        result = False
      else:
        lu.LogWarning("Can't find disk on node %s", node)
        result = False
      else:
-      result = result and (not rstats.payload[idx])
+      if ldisk:
+        result = result and rstats.payload.ldisk_status == constants.LDS_OKAY
+      else:
+        result = result and not rstats.payload.is_degraded
+
    if dev.children:
      for child in dev.children:
        result = result and _CheckDiskConsistency(lu, child, node, on_primary)
    if dev.children:
      for child in dev.children:
        result = result and _CheckDiskConsistency(lu, child, node, on_primary)
@@ -1799,10 +2176,11 @@ class LUDiagnoseOS(NoHooksLU):
  
      @rtype: dict
      @return: a dictionary with osnames as keys and as value another map, with
  
      @rtype: dict
      @return: a dictionary with osnames as keys and as value another map, with
-        nodes as keys and list of OS objects as values, eg::
+        nodes as keys and lists of (path, status, diagnose) tuples as
+        values, eg::
  
  
-          {"debian-etch": {"node1": [<object>,...],
-                           "node2": [<object>,]}
+          {"debian-etch": {"node1": [(/usr/lib/..., True, ""),
+                                     (/srv/..., False, "invalid api")],
+                           "node2": [(/srv/..., True, "")]}
            }
  
      """
            }
  
      """
@@ -1811,18 +2189,18 @@ class LUDiagnoseOS(NoHooksLU):
      # level), so that nodes with a non-responding node daemon don't
      # make all OSes invalid
      good_nodes = [node_name for node_name in rlist
      # level), so that nodes with a non-responding node daemon don't
      # make all OSes invalid
      good_nodes = [node_name for node_name in rlist
-                  if not rlist[node_name].failed]
-    for node_name, nr in rlist.iteritems():
-      if nr.failed or not nr.data:
+                  if not rlist[node_name].fail_msg]
+    for node_name, nr in rlist.items():
+      if nr.fail_msg or not nr.payload:
          continue
          continue
-      for os_obj in nr.data:
-        if os_obj.name not in all_os:
+      for name, path, status, diagnose in nr.payload:
+        if name not in all_os:
            # build a list of nodes for this os containing empty lists
            # for each node in node_list
            # build a list of nodes for this os containing empty lists
            # for each node in node_list
-          all_os[os_obj.name] = {}
+          all_os[name] = {}
            for nname in good_nodes:
            for nname in good_nodes:
-            all_os[os_obj.name][nname] = []
-        all_os[os_obj.name][node_name].append(os_obj)
+            all_os[name][nname] = []
+        all_os[name][node_name].append((path, status, diagnose))
      return all_os
  
    def Exec(self, feedback_fn):
      return all_os
  
    def Exec(self, feedback_fn):
@@ -1831,21 +2209,20 @@ class LUDiagnoseOS(NoHooksLU):
      """
      valid_nodes = [node for node in self.cfg.GetOnlineNodeList()]
      node_data = self.rpc.call_os_diagnose(valid_nodes)
      """
      valid_nodes = [node for node in self.cfg.GetOnlineNodeList()]
      node_data = self.rpc.call_os_diagnose(valid_nodes)
-    if node_data == False:
-      raise errors.OpExecError("Can't gather the list of OSes")
      pol = self._DiagnoseByOS(valid_nodes, node_data)
      output = []
      pol = self._DiagnoseByOS(valid_nodes, node_data)
      output = []
-    for os_name, os_data in pol.iteritems():
+    for os_name, os_data in pol.items():
        row = []
        for field in self.op.output_fields:
          if field == "name":
            val = os_name
          elif field == "valid":
        row = []
        for field in self.op.output_fields:
          if field == "name":
            val = os_name
          elif field == "valid":
-          val = utils.all([osl and osl[0] for osl in os_data.values()])
+          val = utils.all([osl and osl[0][1] for osl in os_data.values()])
          elif field == "node_status":
          elif field == "node_status":
+          # this is just a copy of the dict
            val = {}
            val = {}
-          for node_name, nos_list in os_data.iteritems():
-            val[node_name] = [(v.status, v.path) for v in nos_list]
+          for node_name, nos_list in os_data.items():
+            val[node_name] = nos_list
          else:
            raise errors.ParameterError(field)
          row.append(val)
          else:
            raise errors.ParameterError(field)
          row.append(val)
@@ -1874,7 +2251,8 @@ class LURemoveNode(LogicalUnit):
        "NODE_NAME": self.op.node_name,
        }
      all_nodes = self.cfg.GetNodeList()
        "NODE_NAME": self.op.node_name,
        }
      all_nodes = self.cfg.GetNodeList()
-    all_nodes.remove(self.op.node_name)
+    if self.op.node_name in all_nodes:
+      all_nodes.remove(self.op.node_name)
      return env, all_nodes, all_nodes
  
    def CheckPrereq(self):
      return env, all_nodes, all_nodes
  
    def CheckPrereq(self):
@@ -1885,7 +2263,7 @@ class LURemoveNode(LogicalUnit):
       - it does not have primary or secondary instances
       - it's not the master
  
       - it does not have primary or secondary instances
       - it's not the master
  
-    Any errors are signalled by raising errors.OpPrereqError.
+    Any errors are signaled by raising errors.OpPrereqError.
  
      """
      node = self.cfg.GetNodeInfo(self.cfg.ExpandNodeName(self.op.node_name))
  
      """
      node = self.cfg.GetNodeInfo(self.cfg.ExpandNodeName(self.op.node_name))
@@ -1915,17 +2293,23 @@ class LURemoveNode(LogicalUnit):
      logging.info("Stopping the node daemon and removing configs from node %s",
                   node.name)
  
      logging.info("Stopping the node daemon and removing configs from node %s",
                   node.name)
  
+    # Promote nodes to master candidate as needed
+    _AdjustCandidatePool(self, exceptions=[node.name])
      self.context.RemoveNode(node.name)
  
      self.context.RemoveNode(node.name)
  
+    # Run post hooks on the node before it's removed
+    hm = self.proc.hmclass(self.rpc.call_hooks_runner, self)
+    try:
+      h_results = hm.RunPhase(constants.HOOKS_PHASE_POST, [node.name])
+    except:
+      self.LogWarning("Errors occurred running hooks on %s" % node.name)
+
      result = self.rpc.call_node_leave_cluster(node.name)
      result = self.rpc.call_node_leave_cluster(node.name)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        self.LogWarning("Errors encountered on the remote node while leaving"
                        " the cluster: %s", msg)
  
      if msg:
        self.LogWarning("Errors encountered on the remote node while leaving"
                        " the cluster: %s", msg)
  
-    # Promote nodes to master candidate as needed
-    _AdjustCandidatePool(self)
-
  
  class LUQueryNodes(NoHooksLU):
    """Logical unit for querying nodes.
  
  class LUQueryNodes(NoHooksLU):
    """Logical unit for querying nodes.
@@ -1933,6 +2317,10 @@ class LUQueryNodes(NoHooksLU):
    """
    _OP_REQP = ["output_fields", "names", "use_locking"]
    REQ_BGL = False
    """
    _OP_REQP = ["output_fields", "names", "use_locking"]
    REQ_BGL = False
+
+  _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
+                    "master_candidate", "offline", "drained"]
+
    _FIELDS_DYNAMIC = utils.FieldSet(
      "dtotal", "dfree",
      "mtotal", "mnode", "mfree",
    _FIELDS_DYNAMIC = utils.FieldSet(
      "dtotal", "dfree",
      "mtotal", "mnode", "mfree",
@@ -1940,15 +2328,12 @@ class LUQueryNodes(NoHooksLU):
      "ctotal", "cnodes", "csockets",
      )
  
      "ctotal", "cnodes", "csockets",
      )
  
-  _FIELDS_STATIC = utils.FieldSet(
-    "name", "pinst_cnt", "sinst_cnt",
+  _FIELDS_STATIC = utils.FieldSet(*[
+    "pinst_cnt", "sinst_cnt",
      "pinst_list", "sinst_list",
      "pip", "sip", "tags",
      "pinst_list", "sinst_list",
      "pip", "sip", "tags",
-    "serial_no",
-    "master_candidate",
      "master",
      "master",
-    "offline",
-    "drained",
+    "role"] + _SIMPLE_FIELDS
      )
  
    def ExpandNames(self):
      )
  
    def ExpandNames(self):
@@ -2006,7 +2391,7 @@ class LUQueryNodes(NoHooksLU):
                                            self.cfg.GetHypervisorType())
        for name in nodenames:
          nodeinfo = node_data[name]
                                            self.cfg.GetHypervisorType())
        for name in nodenames:
          nodeinfo = node_data[name]
-        if not nodeinfo.RemoteFailMsg() and nodeinfo.payload:
+        if not nodeinfo.fail_msg and nodeinfo.payload:
            nodeinfo = nodeinfo.payload
            fn = utils.TryConvert
            live_data[name] = {
            nodeinfo = nodeinfo.payload
            fn = utils.TryConvert
            live_data[name] = {
@@ -2049,8 +2434,8 @@ class LUQueryNodes(NoHooksLU):
      for node in nodelist:
        node_output = []
        for field in self.op.output_fields:
      for node in nodelist:
        node_output = []
        for field in self.op.output_fields:
-        if field == "name":
-          val = node.name
+        if field in self._SIMPLE_FIELDS:
+          val = getattr(node, field)
          elif field == "pinst_list":
            val = list(node_to_primary[node.name])
          elif field == "sinst_list":
          elif field == "pinst_list":
            val = list(node_to_primary[node.name])
          elif field == "sinst_list":
@@ -2065,18 +2450,21 @@ class LUQueryNodes(NoHooksLU):
            val = node.secondary_ip
          elif field == "tags":
            val = list(node.GetTags())
            val = node.secondary_ip
          elif field == "tags":
            val = list(node.GetTags())
-        elif field == "serial_no":
-          val = node.serial_no
-        elif field == "master_candidate":
-          val = node.master_candidate
          elif field == "master":
            val = node.name == master_node
          elif field == "master":
            val = node.name == master_node
-        elif field == "offline":
-          val = node.offline
-        elif field == "drained":
-          val = node.drained
          elif self._FIELDS_DYNAMIC.Matches(field):
            val = live_data[node.name].get(field, None)
          elif self._FIELDS_DYNAMIC.Matches(field):
            val = live_data[node.name].get(field, None)
+        elif field == "role":
+          if node.name == master_node:
+            val = "M"
+          elif node.master_candidate:
+            val = "C"
+          elif node.drained:
+            val = "D"
+          elif node.offline:
+            val = "O"
+          else:
+            val = "R"
          else:
            raise errors.ParameterError(field)
          node_output.append(val)
          else:
            raise errors.ParameterError(field)
          node_output.append(val)
@@ -2129,10 +2517,15 @@ class LUQueryNodeVolumes(NoHooksLU):
  
      output = []
      for node in nodenames:
  
      output = []
      for node in nodenames:
-      if node not in volumes or volumes[node].failed or not volumes[node].data:
+      nresult = volumes[node]
+      if nresult.offline:
+        continue
+      msg = nresult.fail_msg
+      if msg:
+        self.LogWarning("Can't compute volume data on node %s: %s", node, msg)
          continue
  
          continue
  
-      node_vols = volumes[node].data[:]
+      node_vols = nresult.payload[:]
        node_vols.sort(key=lambda vol: vol['dev'])
  
        for vol in node_vols:
        node_vols.sort(key=lambda vol: vol['dev'])
  
        for vol in node_vols:
@@ -2166,42 +2559,190 @@ class LUQueryNodeVolumes(NoHooksLU):
      return output
  
  
      return output
  
  
-class LUAddNode(LogicalUnit):
-  """Logical unit for adding node to the cluster.
+class LUQueryNodeStorage(NoHooksLU):
+  """Logical unit for getting information on storage units on node(s).
  
    """
  
    """
-  HPATH = "node-add"
-  HTYPE = constants.HTYPE_NODE
-  _OP_REQP = ["node_name"]
+  _OP_REQP = ["nodes", "storage_type", "output_fields"]
+  REQ_BGL = False
+  _FIELDS_STATIC = utils.FieldSet("node")
  
  
-  def BuildHooksEnv(self):
-    """Build hooks env.
+  def ExpandNames(self):
+    storage_type = self.op.storage_type
  
  
-    This will run on all nodes before, and on all nodes + the new node after.
+    if storage_type not in constants.VALID_STORAGE_FIELDS:
+      raise errors.OpPrereqError("Unknown storage type: %s" % storage_type)
  
  
-    """
-    env = {
-      "OP_TARGET": self.op.node_name,
-      "NODE_NAME": self.op.node_name,
-      "NODE_PIP": self.op.primary_ip,
-      "NODE_SIP": self.op.secondary_ip,
-      }
-    nodes_0 = self.cfg.GetNodeList()
-    nodes_1 = nodes_0 + [self.op.node_name, ]
-    return env, nodes_0, nodes_1
+    dynamic_fields = constants.VALID_STORAGE_FIELDS[storage_type]
+
+    _CheckOutputFields(static=self._FIELDS_STATIC,
+                       dynamic=utils.FieldSet(*dynamic_fields),
+                       selected=self.op.output_fields)
+
+    self.needed_locks = {}
+    self.share_locks[locking.LEVEL_NODE] = 1
+
+    if self.op.nodes:
+      self.needed_locks[locking.LEVEL_NODE] = \
+        _GetWantedNodes(self, self.op.nodes)
+    else:
+      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
  
    def CheckPrereq(self):
      """Check prerequisites.
  
  
    def CheckPrereq(self):
      """Check prerequisites.
  
-    This checks:
-     - the new node is not already in the config
-     - it is resolvable
-     - its parameters (single/dual homed) matches the cluster
-
-    Any errors are signalled by raising errors.OpPrereqError.
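+    This defaults the optional C{name} opcode parameter to None and records
+    the nodes whose locks were acquired; the output fields are validated
+    earlier, in ExpandNames.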
  
      """
  
      """
-    node_name = self.op.node_name
+    self.op.name = getattr(self.op, "name", None)
+
+    self.nodes = self.acquired_locks[locking.LEVEL_NODE]
+
+  def Exec(self, feedback_fn):
+    """Computes the list of storage units and their attributes.
+
+    The storage data for all nodes in C{self.nodes} is requested through
+    one C{call_storage_list} call. Nodes flagged as offline in the result
+    are skipped silently; nodes whose call failed are skipped after logging
+    a warning.
+
+    @param feedback_fn: not used by this method
+    @rtype: list
+    @return: one row per storage unit, each row being a list of values in
+        the order of C{self.op.output_fields}; rows are grouped by node,
+        with nodes and unit names both ordered via C{utils.NiceSort}
+    @raise errors.ParameterError: if a requested field is neither "node"
+        nor one of the fields sent to the nodes
+
+    """
+    # Always get name to sort by
+    if constants.SF_NAME in self.op.output_fields:
+      # work on a copy, as "node" entries are removed from the list below
+      fields = self.op.output_fields[:]
+    else:
+      fields = [constants.SF_NAME] + self.op.output_fields
+
+    # Never ask for node as it's only known to the LU
+    while "node" in fields:
+      fields.remove("node")
+
+    # position of each requested field; used below to index the rows
+    # found in the per-node payload
+    field_idx = dict([(name, idx) for (idx, name) in enumerate(fields)])
+    name_idx = field_idx[constants.SF_NAME]
+
+    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
+    data = self.rpc.call_storage_list(self.nodes,
+                                      self.op.storage_type, st_args,
+                                      self.op.name, fields)
+
+    result = []
+
+    for node in utils.NiceSort(self.nodes):
+      nresult = data[node]
+      if nresult.offline:
+        continue
+
+      msg = nresult.fail_msg
+      if msg:
+        self.LogWarning("Can't get storage data from node %s: %s", node, msg)
+        continue
+
+      # index the rows by unit name so they can be emitted in sorted order;
+      # rows sharing the same name on one node collapse into a single entry
+      rows = dict([(row[name_idx], row) for row in nresult.payload])
+
+      for name in utils.NiceSort(rows.keys()):
+        row = rows[name]
+
+        out = []
+
+        for field in self.op.output_fields:
+          if field == "node":
+            # filled in locally, it was never requested from the node
+            val = node
+          elif field in field_idx:
+            val = row[field_idx[field]]
+          else:
+            # every output field other than "node" was added to field_idx
+            # above, so this branch is only a safety net
+            raise errors.ParameterError(field)
+
+          out.append(val)
+
+        result.append(out)
+
+    return result
+
+
+class LUModifyNodeStorage(NoHooksLU):
+  """Logical unit for modifying a storage volume on a node.
+
+  """
+  _OP_REQP = ["node_name", "storage_type", "name", "changes"]
+  REQ_BGL = False
+
+  def CheckArguments(self):
+    """Expands the node name and validates the storage type.
+
+    The expanded node name is written back to C{self.op.node_name}.
+
+    @raise errors.OpPrereqError: if the node name cannot be expanded or the
+        storage type is not a key of C{constants.VALID_STORAGE_FIELDS}
+
+    """
+    node_name = self.cfg.ExpandNodeName(self.op.node_name)
+    if node_name is None:
+      raise errors.OpPrereqError("Invalid node name '%s'" % self.op.node_name)
+
+    self.op.node_name = node_name
+
+    storage_type = self.op.storage_type
+    if storage_type not in constants.VALID_STORAGE_FIELDS:
+      raise errors.OpPrereqError("Unknown storage type: %s" % storage_type)
+
+  def ExpandNames(self):
+    """Declares the node-level lock on the node being modified.
+
+    """
+    self.needed_locks = {
+      locking.LEVEL_NODE: self.op.node_name,
+      }
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks that the storage type has modifiable fields and that every
+    key of C{self.op.changes} is one of them.
+
+    @raise errors.OpPrereqError: if the storage type has no entry in
+        C{constants.MODIFIABLE_STORAGE_FIELDS} or a requested change is
+        for a field not listed there
+
+    """
+    storage_type = self.op.storage_type
+
+    try:
+      modifiable = constants.MODIFIABLE_STORAGE_FIELDS[storage_type]
+    except KeyError:
+      raise errors.OpPrereqError("Storage units of type '%s' can not be"
+                                 " modified" % storage_type)
+
+    # NOTE(review): the set difference below requires "modifiable" to be
+    # set-like (presumably a set/frozenset of field names) - confirm
+    diff = set(self.op.changes.keys()) - modifiable
+    if diff:
+      raise errors.OpPrereqError("The following fields can not be modified for"
+                                 " storage units of type '%s': %r" %
+                                 (storage_type, list(diff)))
+
+  def Exec(self, feedback_fn):
+    """Applies the requested changes to the storage unit.
+
+    The changes are sent to the node via C{call_storage_modify}; nothing is
+    returned explicitly.
+
+    @param feedback_fn: not used by this method
+
+    """
+    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
+    result = self.rpc.call_storage_modify(self.op.node_name,
+                                          self.op.storage_type, st_args,
+                                          self.op.name, self.op.changes)
+    # presumably raises, using this message, if the RPC call failed -
+    # verify against the RPC result class
+    result.Raise("Failed to modify storage unit '%s' on %s" %
+                 (self.op.name, self.op.node_name))
+
+
+class LUAddNode(LogicalUnit):
+  """Logical unit for adding node to the cluster.
+
+  """
+  HPATH = "node-add"
+  HTYPE = constants.HTYPE_NODE
+  _OP_REQP = ["node_name"]
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    This will run on all nodes before, and on all nodes + the new node after.
+
+    @rtype: tuple
+    @return: (env, nodes_0, nodes_1): the hooks environment dict, the node
+        list currently in the configuration, and that list extended with
+        the node named in the opcode
+
+    """
+    env = {
+      "OP_TARGET": self.op.node_name,
+      "NODE_NAME": self.op.node_name,
+      "NODE_PIP": self.op.primary_ip,
+      "NODE_SIP": self.op.secondary_ip,
+      }
+    nodes_0 = self.cfg.GetNodeList()
+    # the concatenation builds a new list, so nodes_0 itself is not changed
+    nodes_1 = nodes_0 + [self.op.node_name, ]
+    return env, nodes_0, nodes_1
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks:
+     - the new node is not already in the config
+     - it is resolvable
+     - its parameters (single/dual homed) matches the cluster
+
+    Any errors are signaled by raising errors.OpPrereqError.
+
+    """
+    node_name = self.op.node_name
      cfg = self.cfg
  
      dns_data = utils.HostInfo(node_name)
      cfg = self.cfg
  
      dns_data = utils.HostInfo(node_name)
@@ -2252,7 +2793,7 @@ class LUAddNode(LogicalUnit):
          raise errors.OpPrereqError("The master has a private ip but the"
                                     " new node doesn't have one")
  
          raise errors.OpPrereqError("The master has a private ip but the"
                                     " new node doesn't have one")
  
-    # checks reachablity
+    # checks reachability
      if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
        raise errors.OpPrereqError("Node not reachable by ping")
  
      if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
        raise errors.OpPrereqError("Node not reachable by ping")
  
@@ -2263,15 +2804,22 @@ class LUAddNode(LogicalUnit):
          raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                     " based ping to noded port")
  
          raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                     " based ping to noded port")
  
-    cp_size = self.cfg.GetClusterInfo().candidate_pool_size
-    mc_now, _ = self.cfg.GetMasterCandidateStats()
-    master_candidate = mc_now < cp_size
+    if self.op.readd:
+      exceptions = [node]
+    else:
+      exceptions = []
+
+    self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
  
  
-    self.new_node = objects.Node(name=node,
-                                 primary_ip=primary_ip,
-                                 secondary_ip=secondary_ip,
-                                 master_candidate=master_candidate,
-                                 offline=False, drained=False)
+    if self.op.readd:
+      self.new_node = self.cfg.GetNodeInfo(node)
+      assert self.new_node is not None, "Can't retrieve locked node %s" % node
+    else:
+      self.new_node = objects.Node(name=node,
+                                   primary_ip=primary_ip,
+                                   secondary_ip=secondary_ip,
+                                   master_candidate=self.master_candidate,
+                                   offline=False, drained=False)
  
    def Exec(self, feedback_fn):
      """Adds the new node to the cluster.
  
    def Exec(self, feedback_fn):
      """Adds the new node to the cluster.
@@ -2280,19 +2828,30 @@ class LUAddNode(LogicalUnit):
      new_node = self.new_node
      node = new_node.name
  
      new_node = self.new_node
      node = new_node.name
  
+    # for re-adds, reset the offline/drained/master-candidate flags;
+    # we need to reset here, otherwise offline would prevent RPC calls
+    # later in the procedure; this also means that if the re-add
+    # fails, we are left with a non-offlined, broken node
+    if self.op.readd:
+      new_node.drained = new_node.offline = False
+      self.LogInfo("Readding a node, the offline/drained flags were reset")
+      # if we demote the node, we do cleanup later in the procedure
+      new_node.master_candidate = self.master_candidate
+
+    # notify the user about any possible mc promotion
+    if new_node.master_candidate:
+      self.LogInfo("Node will be a master candidate")
+
      # check connectivity
      result = self.rpc.call_version([node])[node]
      # check connectivity
      result = self.rpc.call_version([node])[node]
-    result.Raise()
-    if result.data:
-      if constants.PROTOCOL_VERSION == result.data:
-        logging.info("Communication to node %s fine, sw version %s match",
-                     node, result.data)
-      else:
-        raise errors.OpExecError("Version mismatch master version %s,"
-                                 " node version %s" %
-                                 (constants.PROTOCOL_VERSION, result.data))
+    result.Raise("Can't get version information from node %s" % node)
+    if constants.PROTOCOL_VERSION == result.payload:
+      logging.info("Communication to node %s fine, sw version %s match",
+                   node, result.payload)
      else:
      else:
-      raise errors.OpExecError("Cannot get version from the new node")
+      raise errors.OpExecError("Version mismatch master version %s,"
+                               " node version %s" %
+                               (constants.PROTOCOL_VERSION, result.payload))
  
      # setup ssh on node
      logging.info("Copy ssh key to node %s", node)
  
      # setup ssh on node
      logging.info("Copy ssh key to node %s", node)
@@ -2303,20 +2862,12 @@ class LUAddNode(LogicalUnit):
                  priv_key, pub_key]
  
      for i in keyfiles:
                  priv_key, pub_key]
  
      for i in keyfiles:
-      f = open(i, 'r')
-      try:
-        keyarray.append(f.read())
-      finally:
-        f.close()
+      keyarray.append(utils.ReadFile(i))
  
      result = self.rpc.call_node_add(node, keyarray[0], keyarray[1],
                                      keyarray[2],
                                      keyarray[3], keyarray[4], keyarray[5])
  
      result = self.rpc.call_node_add(node, keyarray[0], keyarray[1],
                                      keyarray[2],
                                      keyarray[3], keyarray[4], keyarray[5])
-
-    msg = result.RemoteFailMsg()
-    if msg:
-      raise errors.OpExecError("Cannot transfer ssh keys to the"
-                               " new node: %s" % msg)
+    result.Raise("Cannot transfer ssh keys to the new node")
  
      # Add node to our /etc/hosts, and add key to known_hosts
      if self.cfg.GetClusterInfo().modify_etc_hosts:
  
      # Add node to our /etc/hosts, and add key to known_hosts
      if self.cfg.GetClusterInfo().modify_etc_hosts:
@@ -2325,10 +2876,8 @@ class LUAddNode(LogicalUnit):
      if new_node.secondary_ip != new_node.primary_ip:
        result = self.rpc.call_node_has_ip_address(new_node.name,
                                                   new_node.secondary_ip)
      if new_node.secondary_ip != new_node.primary_ip:
        result = self.rpc.call_node_has_ip_address(new_node.name,
                                                   new_node.secondary_ip)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpPrereqError("Failure checking secondary ip"
-                                   " on node %s: %s" % (new_node.name, msg))
+      result.Raise("Failure checking secondary ip on node %s" % new_node.name,
+                   prereq=True)
        if not result.payload:
          raise errors.OpExecError("Node claims it doesn't have the secondary ip"
                                   " you gave (%s). Please fix and re-run this"
        if not result.payload:
          raise errors.OpExecError("Node claims it doesn't have the secondary ip"
                                   " you gave (%s). Please fix and re-run this"
@@ -2336,27 +2885,34 @@ class LUAddNode(LogicalUnit):
  
      node_verify_list = [self.cfg.GetMasterNode()]
      node_verify_param = {
  
      node_verify_list = [self.cfg.GetMasterNode()]
      node_verify_param = {
-      'nodelist': [node],
+      constants.NV_NODELIST: [node],
        # TODO: do a node-net-test as well?
      }
  
      result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
                                         self.cfg.GetClusterName())
      for verifier in node_verify_list:
        # TODO: do a node-net-test as well?
      }
  
      result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
                                         self.cfg.GetClusterName())
      for verifier in node_verify_list:
-      msg = result[verifier].RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Cannot communicate with node %s: %s" %
-                                 (verifier, msg))
-      nl_payload = result[verifier].payload['nodelist']
+      result[verifier].Raise("Cannot communicate with node %s" % verifier)
+      nl_payload = result[verifier].payload[constants.NV_NODELIST]
        if nl_payload:
          for failed in nl_payload:
        if nl_payload:
          for failed in nl_payload:
-          feedback_fn("ssh/hostname verification failed %s -> %s" %
+          feedback_fn("ssh/hostname verification failed"
+                      " (checking from %s): %s" %
                        (verifier, nl_payload[failed]))
          raise errors.OpExecError("ssh/hostname verification failed.")
  
      if self.op.readd:
        _RedistributeAncillaryFiles(self)
        self.context.ReaddNode(new_node)
                        (verifier, nl_payload[failed]))
          raise errors.OpExecError("ssh/hostname verification failed.")
  
      if self.op.readd:
        _RedistributeAncillaryFiles(self)
        self.context.ReaddNode(new_node)
+      # make sure we redistribute the config
+      self.cfg.Update(new_node)
+      # and make sure the new node will not have old files around
+      if not new_node.master_candidate:
+        result = self.rpc.call_node_demote_from_mc(new_node.name)
+        msg = result.fail_msg
+        if msg:
+          self.LogWarning("Node failed to demote itself from master"
+                          " candidate status: %s" % msg)
      else:
        _RedistributeAncillaryFiles(self, additional_nodes=[node])
        self.context.AddNode(new_node)
      else:
        _RedistributeAncillaryFiles(self, additional_nodes=[node])
        self.context.AddNode(new_node)
@@ -2413,18 +2969,30 @@ class LUSetNodeParams(LogicalUnit):
      """
      node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
  
      """
      node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
  
-    if ((self.op.master_candidate == False or self.op.offline == True or
-         self.op.drained == True) and node.master_candidate):
-      # we will demote the node from master_candidate
+    if (self.op.master_candidate is not None or
+        self.op.drained is not None or
+        self.op.offline is not None):
+      # we can't change the master's node flags
        if self.op.node_name == self.cfg.GetMasterNode():
        if self.op.node_name == self.cfg.GetMasterNode():
-        raise errors.OpPrereqError("The master node has to be a"
-                                   " master candidate, online and not drained")
+        raise errors.OpPrereqError("The master role can be changed"
+                                   " only via masterfailover")
+
+    # Boolean value that tells us whether we're offlining or draining the node
+    offline_or_drain = self.op.offline == True or self.op.drained == True
+    deoffline_or_drain = self.op.offline == False or self.op.drained == False
+
+    if (node.master_candidate and
+        (self.op.master_candidate == False or offline_or_drain)):
        cp_size = self.cfg.GetClusterInfo().candidate_pool_size
        cp_size = self.cfg.GetClusterInfo().candidate_pool_size
-      num_candidates, _ = self.cfg.GetMasterCandidateStats()
-      if num_candidates <= cp_size:
+      mc_now, mc_should, mc_max = self.cfg.GetMasterCandidateStats()
+      if mc_now <= cp_size:
          msg = ("Not enough master candidates (desired"
          msg = ("Not enough master candidates (desired"
-               " %d, new value will be %d)" % (cp_size, num_candidates-1))
-        if self.op.force:
+               " %d, new value will be %d)" % (cp_size, mc_now-1))
+        # Only allow forcing the operation if it's an offline/drain operation,
+        # and we could not possibly promote more nodes.
+        # FIXME: this can still lead to issues if in any way another node which
+        # could be promoted appears in the meantime.
+        if self.op.force and offline_or_drain and mc_should == mc_max:
            self.LogWarning(msg)
          else:
            raise errors.OpPrereqError(msg)
            self.LogWarning(msg)
          else:
            raise errors.OpPrereqError(msg)
@@ -2435,6 +3003,13 @@ class LUSetNodeParams(LogicalUnit):
        raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
                                   " to master_candidate" % node.name)
  
        raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
                                   " to master_candidate" % node.name)
  
+    # If we're being deofflined/drained, we'll MC ourself if needed
+    if (deoffline_or_drain and not offline_or_drain and not
+        self.op.master_candidate == True):
+      self.op.master_candidate = _DecideSelfPromotion(self)
+      if self.op.master_candidate:
+        self.LogInfo("Autopromoting node to master candidate")
+
      return
  
    def Exec(self, feedback_fn):
      return
  
    def Exec(self, feedback_fn):
@@ -2464,7 +3039,7 @@ class LUSetNodeParams(LogicalUnit):
        result.append(("master_candidate", str(self.op.master_candidate)))
        if self.op.master_candidate == False:
          rrc = self.rpc.call_node_demote_from_mc(node.name)
        result.append(("master_candidate", str(self.op.master_candidate)))
        if self.op.master_candidate == False:
          rrc = self.rpc.call_node_demote_from_mc(node.name)
-        msg = rrc.RemoteFailMsg()
+        msg = rrc.fail_msg
          if msg:
            self.LogWarning("Node failed to demote itself: %s" % msg)
  
          if msg:
            self.LogWarning("Node failed to demote itself: %s" % msg)
  
@@ -2476,6 +3051,10 @@ class LUSetNodeParams(LogicalUnit):
            node.master_candidate = False
            changed_mc = True
            result.append(("master_candidate", "auto-demotion due to drain"))
            node.master_candidate = False
            changed_mc = True
            result.append(("master_candidate", "auto-demotion due to drain"))
+          rrc = self.rpc.call_node_demote_from_mc(node.name)
+          msg = rrc.fail_msg
+          if msg:
+            self.LogWarning("Node failed to demote itself: %s" % msg)
          if node.offline:
            node.offline = False
            result.append(("offline", "clear offline status due to drain"))
          if node.offline:
            node.offline = False
            result.append(("offline", "clear offline status due to drain"))
@@ -2508,7 +3087,7 @@ class LUPowercycleNode(NoHooksLU):
    def ExpandNames(self):
      """Locking for PowercycleNode.
  
    def ExpandNames(self):
      """Locking for PowercycleNode.
  
-    This is a last-resource option and shouldn't block on other
+    This is a last-resort option and shouldn't block on other
      jobs. Therefore, we grab no locks.
  
      """
      jobs. Therefore, we grab no locks.
  
      """
@@ -2528,9 +3107,7 @@ class LUPowercycleNode(NoHooksLU):
      """
      result = self.rpc.call_node_powercycle(self.op.node_name,
                                             self.cfg.GetHypervisorType())
      """
      result = self.rpc.call_node_powercycle(self.op.node_name,
                                             self.cfg.GetHypervisorType())
-    msg = result.RemoteFailMsg()
-    if msg:
-      raise errors.OpExecError("Failed to schedule the reboot: %s" % msg)
+    result.Raise("Failed to schedule the reboot")
      return result.payload
  
  
      return result.payload
  
  
@@ -2559,21 +3136,25 @@ class LUQueryClusterInfo(NoHooksLU):
        "software_version": constants.RELEASE_VERSION,
        "protocol_version": constants.PROTOCOL_VERSION,
        "config_version": constants.CONFIG_VERSION,
        "software_version": constants.RELEASE_VERSION,
        "protocol_version": constants.PROTOCOL_VERSION,
        "config_version": constants.CONFIG_VERSION,
-      "os_api_version": constants.OS_API_VERSION,
+      "os_api_version": max(constants.OS_API_VERSIONS),
        "export_version": constants.EXPORT_VERSION,
        "architecture": (platform.architecture()[0], platform.machine()),
        "name": cluster.cluster_name,
        "master": cluster.master_node,
        "export_version": constants.EXPORT_VERSION,
        "architecture": (platform.architecture()[0], platform.machine()),
        "name": cluster.cluster_name,
        "master": cluster.master_node,
-      "default_hypervisor": cluster.default_hypervisor,
+      "default_hypervisor": cluster.enabled_hypervisors[0],
        "enabled_hypervisors": cluster.enabled_hypervisors,
        "enabled_hypervisors": cluster.enabled_hypervisors,
-      "hvparams": dict([(hypervisor, cluster.hvparams[hypervisor])
-                        for hypervisor in cluster.enabled_hypervisors]),
+      "hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor_name])
+                        for hypervisor_name in cluster.enabled_hypervisors]),
        "beparams": cluster.beparams,
        "nicparams": cluster.nicparams,
        "candidate_pool_size": cluster.candidate_pool_size,
        "master_netdev": cluster.master_netdev,
        "volume_group_name": cluster.volume_group_name,
        "file_storage_dir": cluster.file_storage_dir,
        "beparams": cluster.beparams,
        "nicparams": cluster.nicparams,
        "candidate_pool_size": cluster.candidate_pool_size,
        "master_netdev": cluster.master_netdev,
        "volume_group_name": cluster.volume_group_name,
        "file_storage_dir": cluster.file_storage_dir,
+      "ctime": cluster.ctime,
+      "mtime": cluster.mtime,
+      "uuid": cluster.uuid,
+      "tags": list(cluster.GetTags()),
        }
  
      return result
        }
  
      return result
@@ -2586,7 +3167,8 @@ class LUQueryConfigValues(NoHooksLU):
    _OP_REQP = []
    REQ_BGL = False
    _FIELDS_DYNAMIC = utils.FieldSet()
    _OP_REQP = []
    REQ_BGL = False
    _FIELDS_DYNAMIC = utils.FieldSet()
-  _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag")
+  _FIELDS_STATIC = utils.FieldSet("cluster_name", "master_node", "drain_flag",
+                                  "watcher_pause")
  
    def ExpandNames(self):
      self.needed_locks = {}
  
    def ExpandNames(self):
      self.needed_locks = {}
@@ -2613,6 +3195,8 @@ class LUQueryConfigValues(NoHooksLU):
          entry = self.cfg.GetMasterNode()
        elif field == "drain_flag":
          entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
          entry = self.cfg.GetMasterNode()
        elif field == "drain_flag":
          entry = os.path.exists(constants.JOB_QUEUE_DRAIN_FILE)
+      elif field == "watcher_pause":
+        return utils.ReadWatcherPauseFile(constants.WATCHER_PAUSEFILE)
        else:
          raise errors.ParameterError(field)
        values.append(entry)
        else:
          raise errors.ParameterError(field)
        values.append(entry)
@@ -2645,19 +3229,24 @@ class LUActivateInstanceDisks(NoHooksLU):
      assert self.instance is not None, \
        "Cannot retrieve locked instance %s" % self.op.instance_name
      _CheckNodeOnline(self, self.instance.primary_node)
      assert self.instance is not None, \
        "Cannot retrieve locked instance %s" % self.op.instance_name
      _CheckNodeOnline(self, self.instance.primary_node)
+    if not hasattr(self.op, "ignore_size"):
+      self.op.ignore_size = False
  
    def Exec(self, feedback_fn):
      """Activate the disks.
  
      """
  
    def Exec(self, feedback_fn):
      """Activate the disks.
  
      """
-    disks_ok, disks_info = _AssembleInstanceDisks(self, self.instance)
+    disks_ok, disks_info = \
+              _AssembleInstanceDisks(self, self.instance,
+                                     ignore_size=self.op.ignore_size)
      if not disks_ok:
        raise errors.OpExecError("Cannot activate block devices")
  
      return disks_info
  
  
      if not disks_ok:
        raise errors.OpExecError("Cannot activate block devices")
  
      return disks_info
  
  
-def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
+def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False,
+                           ignore_size=False):
    """Prepare the block devices for an instance.
  
    This sets up the block devices on all nodes.
    """Prepare the block devices for an instance.
  
    This sets up the block devices on all nodes.
@@ -2669,6 +3258,10 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
    @type ignore_secondaries: boolean
    @param ignore_secondaries: if true, errors on secondary nodes
        won't result in an error return from the function
    @type ignore_secondaries: boolean
    @param ignore_secondaries: if true, errors on secondary nodes
        won't result in an error return from the function
+  @type ignore_size: boolean
+  @param ignore_size: if true, the current known size of the disk
+      will not be used during the disk activation, useful for cases
+      when the size is wrong
    @return: False if the operation failed, otherwise a list of
        (host, instance_visible_name, node_visible_name)
        with the mapping from node devices to instance devices
    @return: False if the operation failed, otherwise a list of
        (host, instance_visible_name, node_visible_name)
        with the mapping from node devices to instance devices
@@ -2689,9 +3282,12 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
    # 1st pass, assemble on all nodes in secondary mode
    for inst_disk in instance.disks:
      for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
    # 1st pass, assemble on all nodes in secondary mode
    for inst_disk in instance.disks:
      for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
+      if ignore_size:
+        node_disk = node_disk.Copy()
+        node_disk.UnsetSize()
        lu.cfg.SetDiskID(node_disk, node)
        result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
        lu.cfg.SetDiskID(node_disk, node)
        result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
-      msg = result.RemoteFailMsg()
+      msg = result.fail_msg
        if msg:
          lu.proc.LogWarning("Could not prepare block device %s on node %s"
                             " (is_primary=False, pass=1): %s",
        if msg:
          lu.proc.LogWarning("Could not prepare block device %s on node %s"
                             " (is_primary=False, pass=1): %s",
@@ -2706,9 +3302,12 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
      for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
        if node != instance.primary_node:
          continue
      for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
        if node != instance.primary_node:
          continue
+      if ignore_size:
+        node_disk = node_disk.Copy()
+        node_disk.UnsetSize()
        lu.cfg.SetDiskID(node_disk, node)
        result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
        lu.cfg.SetDiskID(node_disk, node)
        result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
-      msg = result.RemoteFailMsg()
+      msg = result.fail_msg
        if msg:
          lu.proc.LogWarning("Could not prepare block device %s on node %s"
                             " (is_primary=True, pass=2): %s",
        if msg:
          lu.proc.LogWarning("Could not prepare block device %s on node %s"
                             " (is_primary=True, pass=2): %s",
@@ -2730,7 +3329,7 @@ def _StartInstanceDisks(lu, instance, force):
    """Start the disks of an instance.
  
    """
    """Start the disks of an instance.
  
    """
-  disks_ok, dummy = _AssembleInstanceDisks(lu, instance,
+  disks_ok, _ = _AssembleInstanceDisks(lu, instance,
                                             ignore_secondaries=force)
    if not disks_ok:
      _ShutdownInstanceDisks(lu, instance)
                                             ignore_secondaries=force)
    if not disks_ok:
      _ShutdownInstanceDisks(lu, instance)
@@ -2783,11 +3382,8 @@ def _SafeShutdownInstanceDisks(lu, instance):
  
    """
    pnode = instance.primary_node
  
    """
    pnode = instance.primary_node
-  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])
-  ins_l = ins_l[pnode]
-  msg = ins_l.RemoteFailMsg()
-  if msg:
-    raise errors.OpExecError("Can't contact node %s: %s" % (pnode, msg))
+  ins_l = lu.rpc.call_instance_list([pnode], [instance.hypervisor])[pnode]
+  ins_l.Raise("Can't contact node %s" % pnode)
  
    if instance.name in ins_l.payload:
      raise errors.OpExecError("Instance is running, can't shutdown"
  
    if instance.name in ins_l.payload:
      raise errors.OpExecError("Instance is running, can't shutdown"
@@ -2810,7 +3406,7 @@ def _ShutdownInstanceDisks(lu, instance, ignore_primary=False):
      for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
        lu.cfg.SetDiskID(top_disk, node)
        result = lu.rpc.call_blockdev_shutdown(node, top_disk)
      for node, top_disk in disk.ComputeNodeTree(instance.primary_node):
        lu.cfg.SetDiskID(top_disk, node)
        result = lu.rpc.call_blockdev_shutdown(node, top_disk)
-      msg = result.RemoteFailMsg()
+      msg = result.fail_msg
        if msg:
          lu.LogWarning("Could not shutdown block device %s on node %s: %s",
                        disk.iv_name, node, msg)
        if msg:
          lu.LogWarning("Could not shutdown block device %s on node %s: %s",
                        disk.iv_name, node, msg)
@@ -2842,9 +3438,7 @@ def _CheckNodeFreeMemory(lu, node, reason, requested, hypervisor_name):
  
    """
    nodeinfo = lu.rpc.call_node_info([node], lu.cfg.GetVGName(), hypervisor_name)
  
    """
    nodeinfo = lu.rpc.call_node_info([node], lu.cfg.GetVGName(), hypervisor_name)
-  msg = nodeinfo[node].RemoteFailMsg()
-  if msg:
-    raise errors.OpPrereqError("Can't get data from node %s: %s" % (node, msg))
+  nodeinfo[node].Raise("Can't get data from node %s" % node, prereq=True)
    free_mem = nodeinfo[node].payload.get('memory_free', None)
    if not isinstance(free_mem, int):
      raise errors.OpPrereqError("Can't compute free memory on node %s, result"
    free_mem = nodeinfo[node].payload.get('memory_free', None)
    if not isinstance(free_mem, int):
      raise errors.OpPrereqError("Can't compute free memory on node %s, result"
@@ -2921,16 +3515,14 @@ class LUStartupInstance(LogicalUnit):
      _CheckNodeOnline(self, instance.primary_node)
  
      bep = self.cfg.GetClusterInfo().FillBE(instance)
      _CheckNodeOnline(self, instance.primary_node)
  
      bep = self.cfg.GetClusterInfo().FillBE(instance)
-    # check bridges existance
+    # check bridges existence
      _CheckInstanceBridgesExist(self, instance)
  
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
      _CheckInstanceBridgesExist(self, instance)
  
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
-    msg = remote_info.RemoteFailMsg()
-    if msg:
-      raise errors.OpPrereqError("Error checking node %s: %s" %
-                                 (instance.primary_node, msg))
+    remote_info.Raise("Error checking node %s" % instance.primary_node,
+                      prereq=True)
      if not remote_info.payload: # not running already
        _CheckNodeFreeMemory(self, instance.primary_node,
                             "starting instance %s" % instance.name,
      if not remote_info.payload: # not running already
        _CheckNodeFreeMemory(self, instance.primary_node,
                             "starting instance %s" % instance.name,
@@ -2951,7 +3543,7 @@ class LUStartupInstance(LogicalUnit):
  
      result = self.rpc.call_instance_start(node_current, instance,
                                            self.hvparams, self.beparams)
  
      result = self.rpc.call_instance_start(node_current, instance,
                                            self.hvparams, self.beparams)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        _ShutdownInstanceDisks(self, instance)
        raise errors.OpExecError("Could not start instance: %s" % msg)
      if msg:
        _ShutdownInstanceDisks(self, instance)
        raise errors.OpExecError("Could not start instance: %s" % msg)
@@ -3002,7 +3594,7 @@ class LURebootInstance(LogicalUnit):
  
      _CheckNodeOnline(self, instance.primary_node)
  
  
      _CheckNodeOnline(self, instance.primary_node)
  
-    # check bridges existance
+    # check bridges existence
      _CheckInstanceBridgesExist(self, instance)
  
    def Exec(self, feedback_fn):
      _CheckInstanceBridgesExist(self, instance)
  
    def Exec(self, feedback_fn):
@@ -3021,19 +3613,14 @@ class LURebootInstance(LogicalUnit):
          self.cfg.SetDiskID(disk, node_current)
        result = self.rpc.call_instance_reboot(node_current, instance,
                                               reboot_type)
          self.cfg.SetDiskID(disk, node_current)
        result = self.rpc.call_instance_reboot(node_current, instance,
                                               reboot_type)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Could not reboot instance: %s" % msg)
+      result.Raise("Could not reboot instance")
      else:
        result = self.rpc.call_instance_shutdown(node_current, instance)
      else:
        result = self.rpc.call_instance_shutdown(node_current, instance)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Could not shutdown instance for"
-                                 " full reboot: %s" % msg)
+      result.Raise("Could not shutdown instance for full reboot")
        _ShutdownInstanceDisks(self, instance)
        _StartInstanceDisks(self, instance, ignore_secondaries)
        result = self.rpc.call_instance_start(node_current, instance, None, None)
        _ShutdownInstanceDisks(self, instance)
        _StartInstanceDisks(self, instance, ignore_secondaries)
        result = self.rpc.call_instance_start(node_current, instance, None, None)
-      msg = result.RemoteFailMsg()
+      msg = result.fail_msg
        if msg:
          _ShutdownInstanceDisks(self, instance)
          raise errors.OpExecError("Could not start instance for"
        if msg:
          _ShutdownInstanceDisks(self, instance)
          raise errors.OpExecError("Could not start instance for"
@@ -3083,7 +3670,7 @@ class LUShutdownInstance(LogicalUnit):
      node_current = instance.primary_node
      self.cfg.MarkInstanceDown(instance.name)
      result = self.rpc.call_instance_shutdown(node_current, instance)
      node_current = instance.primary_node
      self.cfg.MarkInstanceDown(instance.name)
      result = self.rpc.call_instance_shutdown(node_current, instance)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        self.proc.LogWarning("Could not shutdown instance: %s" % msg)
  
      if msg:
        self.proc.LogWarning("Could not shutdown instance: %s" % msg)
  
@@ -3132,10 +3719,8 @@ class LUReinstallInstance(LogicalUnit):
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
-    msg = remote_info.RemoteFailMsg()
-    if msg:
-      raise errors.OpPrereqError("Error checking node %s: %s" %
-                                 (instance.primary_node, msg))
+    remote_info.Raise("Error checking node %s" % instance.primary_node,
+                      prereq=True)
      if remote_info.payload:
        raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
                                   (self.op.instance_name,
      if remote_info.payload:
        raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
                                   (self.op.instance_name,
@@ -3150,10 +3735,8 @@ class LUReinstallInstance(LogicalUnit):
          raise errors.OpPrereqError("Primary node '%s' is unknown" %
                                     self.op.pnode)
        result = self.rpc.call_os_get(pnode.name, self.op.os_type)
          raise errors.OpPrereqError("Primary node '%s' is unknown" %
                                     self.op.pnode)
        result = self.rpc.call_os_get(pnode.name, self.op.os_type)
-      result.Raise()
-      if not isinstance(result.data, objects.OS):
-        raise errors.OpPrereqError("OS '%s' not in supported OS list for"
-                                   " primary node"  % self.op.os_type)
+      result.Raise("OS '%s' not in supported OS list for primary node %s" %
+                   (self.op.os_type, pnode.name), prereq=True)
  
      self.instance = instance
  
  
      self.instance = instance
  
@@ -3172,15 +3755,95 @@ class LUReinstallInstance(LogicalUnit):
      try:
        feedback_fn("Running the instance OS create scripts...")
        result = self.rpc.call_instance_os_add(inst.primary_node, inst, True)
      try:
        feedback_fn("Running the instance OS create scripts...")
        result = self.rpc.call_instance_os_add(inst.primary_node, inst, True)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Could not install OS for instance %s"
-                                 " on node %s: %s" %
-                                 (inst.name, inst.primary_node, msg))
+      result.Raise("Could not install OS for instance %s on node %s" %
+                   (inst.name, inst.primary_node))
      finally:
        _ShutdownInstanceDisks(self, inst)
  
  
      finally:
        _ShutdownInstanceDisks(self, inst)
  
  
+class LURecreateInstanceDisks(LogicalUnit):
+  """Logical unit recreating selected disks of a stopped instance.
+
+  The opcode carries the instance name and a list of disk indices; an
+  empty list is expanded to all disk indices of the instance during
+  the prerequisite check.
+
+  """
+  HPATH = "instance-recreate-disks"
+  HTYPE = constants.HTYPE_INSTANCE
+  _OP_REQP = ["instance_name", "disks"]
+  REQ_BGL = False
+
+  def CheckArguments(self):
+    """Validate the shape of the C{disks} opcode parameter.
+
+    @raise errors.OpPrereqError: if C{disks} is not a list, or if one
+        of its items is not a non-negative integer
+
+    """
+    if not isinstance(self.op.disks, list):
+      raise errors.OpPrereqError("Invalid disks parameter")
+    for disk_idx in self.op.disks:
+      # well-formed entries are skipped; the first malformed one aborts
+      if isinstance(disk_idx, int) and disk_idx >= 0:
+        continue
+      raise errors.OpPrereqError("Invalid disk specification '%s'" %
+                                 str(disk_idx))
+
+  def ExpandNames(self):
+    """Delegate name expansion and locking to the instance helper.
+
+    """
+    self._ExpandAndLockInstance()
+
+  def BuildHooksEnv(self):
+    """Build the hooks environment and node lists.
+
+    The same node list (master node followed by all nodes of the
+    instance) is returned for both hook node slots.
+
+    """
+    hook_env = _BuildInstanceHookEnvByObject(self, self.instance)
+    hook_nodes = [self.cfg.GetMasterNode()]
+    hook_nodes.extend(self.instance.all_nodes)
+    return hook_env, hook_nodes, hook_nodes
+
+  def CheckPrereq(self):
+    """Verify that the disks of the instance can be recreated.
+
+    The instance must not be diskless, must not be marked up and must
+    not be reported as running by its primary node; any requested disk
+    index must be smaller than the number of disks of the instance.
+
+    @raise errors.OpPrereqError: if one of the checks above fails
+
+    """
+    inst = self.cfg.GetInstanceInfo(self.op.instance_name)
+    assert inst is not None, \
+      "Cannot retrieve locked instance %s" % self.op.instance_name
+    pnode = inst.primary_node
+    _CheckNodeOnline(self, pnode)
+
+    if inst.disk_template == constants.DT_DISKLESS:
+      raise errors.OpPrereqError("Instance '%s' has no disks" %
+                                 self.op.instance_name)
+    if inst.admin_up:
+      raise errors.OpPrereqError("Instance '%s' is marked to be up" %
+                                 self.op.instance_name)
+
+    # ask the primary node about the instance; a non-empty payload is
+    # treated as "the instance is running"
+    remote_info = self.rpc.call_instance_info(pnode, inst.name,
+                                              inst.hypervisor)
+    remote_info.Raise("Error checking node %s" % pnode, prereq=True)
+    if remote_info.payload:
+      raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
+                                 (self.op.instance_name, pnode))
+
+    disk_count = len(inst.disks)
+    if self.op.disks:
+      for idx in self.op.disks:
+        if idx >= disk_count:
+          raise errors.OpPrereqError("Invalid disk index passed '%s'" % idx)
+    else:
+      # no explicit selection means every disk of the instance
+      self.op.disks = range(disk_count)
+
+    self.instance = inst
+
+  def Exec(self, feedback_fn):
+    """Recreate the requested disks.
+
+    Every disk index of the instance that is not listed in the opcode
+    is handed to C{_CreateDisks} via its C{to_skip} argument.
+
+    """
+    to_skip = [idx for idx, _ in enumerate(self.instance.disks)
+               if idx not in self.op.disks]
+
+    _CreateDisks(self, self.instance, to_skip=to_skip)
+
+
+
  class LURenameInstance(LogicalUnit):
    """Rename an instance.
  
  class LURenameInstance(LogicalUnit):
    """Rename an instance.
  
@@ -3219,10 +3882,8 @@ class LURenameInstance(LogicalUnit):
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
      remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                instance.name,
                                                instance.hypervisor)
-    msg = remote_info.RemoteFailMsg()
-    if msg:
-      raise errors.OpPrereqError("Error checking node %s: %s" %
-                                 (instance.primary_node, msg))
+    remote_info.Raise("Error checking node %s" % instance.primary_node,
+                      prereq=True)
      if remote_info.payload:
        raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
                                   (self.op.instance_name,
      if remote_info.payload:
        raise errors.OpPrereqError("Instance '%s' is running on the node %s" %
                                   (self.op.instance_name,
@@ -3267,25 +3928,16 @@ class LURenameInstance(LogicalUnit):
        result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
                                                       old_file_storage_dir,
                                                       new_file_storage_dir)
        result = self.rpc.call_file_storage_dir_rename(inst.primary_node,
                                                       old_file_storage_dir,
                                                       new_file_storage_dir)
-      result.Raise()
-      if not result.data:
-        raise errors.OpExecError("Could not connect to node '%s' to rename"
-                                 " directory '%s' to '%s' (but the instance"
-                                 " has been renamed in Ganeti)" % (
-                                 inst.primary_node, old_file_storage_dir,
-                                 new_file_storage_dir))
-
-      if not result.data[0]:
-        raise errors.OpExecError("Could not rename directory '%s' to '%s'"
-                                 " (but the instance has been renamed in"
-                                 " Ganeti)" % (old_file_storage_dir,
-                                               new_file_storage_dir))
+      result.Raise("Could not rename on node %s directory '%s' to '%s'"
+                   " (but the instance has been renamed in Ganeti)" %
+                   (inst.primary_node, old_file_storage_dir,
+                    new_file_storage_dir))
  
      _StartInstanceDisks(self, inst, None)
      try:
        result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
                                                   old_name)
  
      _StartInstanceDisks(self, inst, None)
      try:
        result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
                                                   old_name)
-      msg = result.RemoteFailMsg()
+      msg = result.fail_msg
        if msg:
          msg = ("Could not run OS rename script for instance %s on node %s"
                 " (but the instance has been renamed in Ganeti): %s" %
        if msg:
          msg = ("Could not run OS rename script for instance %s on node %s"
                 " (but the instance has been renamed in Ganeti): %s" %
@@ -3342,7 +3994,7 @@ class LURemoveInstance(LogicalUnit):
                   instance.name, instance.primary_node)
  
      result = self.rpc.call_instance_shutdown(instance.primary_node, instance)
                   instance.name, instance.primary_node)
  
      result = self.rpc.call_instance_shutdown(instance.primary_node, instance)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        if self.op.ignore_failures:
          feedback_fn("Warning: can't shutdown instance: %s" % msg)
      if msg:
        if self.op.ignore_failures:
          feedback_fn("Warning: can't shutdown instance: %s" % msg)
@@ -3371,17 +4023,22 @@ class LUQueryInstances(NoHooksLU):
    """
    _OP_REQP = ["output_fields", "names", "use_locking"]
    REQ_BGL = False
    """
    _OP_REQP = ["output_fields", "names", "use_locking"]
    REQ_BGL = False
+  _SIMPLE_FIELDS = ["name", "os", "network_port", "hypervisor",
+                    "serial_no", "ctime", "mtime", "uuid"]
    _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
                                      "admin_state",
                                      "disk_template", "ip", "mac", "bridge",
    _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
                                      "admin_state",
                                      "disk_template", "ip", "mac", "bridge",
+                                    "nic_mode", "nic_link",
                                      "sda_size", "sdb_size", "vcpus", "tags",
                                      "network_port", "beparams",
                                      r"(disk)\.(size)/([0-9]+)",
                                      r"(disk)\.(sizes)", "disk_usage",
                                      "sda_size", "sdb_size", "vcpus", "tags",
                                      "network_port", "beparams",
                                      r"(disk)\.(size)/([0-9]+)",
                                      r"(disk)\.(sizes)", "disk_usage",
-                                    r"(nic)\.(mac|ip|bridge)/([0-9]+)",
-                                    r"(nic)\.(macs|ips|bridges)",
+                                    r"(nic)\.(mac|ip|mode|link)/([0-9]+)",
+                                    r"(nic)\.(bridge)/([0-9]+)",
+                                    r"(nic)\.(macs|ips|modes|links|bridges)",
                                      r"(disk|nic)\.(count)",
                                      r"(disk|nic)\.(count)",
-                                    "serial_no", "hypervisor", "hvparams",] +
+                                    "hvparams",
+                                    ] + _SIMPLE_FIELDS +
                                    ["hv/%s" % name
                                     for name in constants.HVS_PARAMETERS] +
                                    ["be/%s" % name
                                    ["hv/%s" % name
                                     for name in constants.HVS_PARAMETERS] +
                                    ["be/%s" % name
@@ -3461,7 +4118,7 @@ class LUQueryInstances(NoHooksLU):
          if result.offline:
            # offline nodes will be in both lists
            off_nodes.append(name)
          if result.offline:
            # offline nodes will be in both lists
            off_nodes.append(name)
-        if result.failed or result.RemoteFailMsg():
+        if result.fail_msg:
            bad_nodes.append(name)
          else:
            if result.payload:
            bad_nodes.append(name)
          else:
            if result.payload:
@@ -3475,16 +4132,17 @@ class LUQueryInstances(NoHooksLU):
      HVPREFIX = "hv/"
      BEPREFIX = "be/"
      output = []
      HVPREFIX = "hv/"
      BEPREFIX = "be/"
      output = []
+    cluster = self.cfg.GetClusterInfo()
      for instance in instance_list:
        iout = []
      for instance in instance_list:
        iout = []
-      i_hv = self.cfg.GetClusterInfo().FillHV(instance)
-      i_be = self.cfg.GetClusterInfo().FillBE(instance)
+      i_hv = cluster.FillHV(instance)
+      i_be = cluster.FillBE(instance)
+      i_nicp = [objects.FillDict(cluster.nicparams[constants.PP_DEFAULT],
+                                 nic.nicparams) for nic in instance.nics]
        for field in self.op.output_fields:
          st_match = self._FIELDS_STATIC.Matches(field)
        for field in self.op.output_fields:
          st_match = self._FIELDS_STATIC.Matches(field)
-        if field == "name":
-          val = instance.name
-        elif field == "os":
-          val = instance.os
+        if field in self._SIMPLE_FIELDS:
+          val = getattr(instance, field)
          elif field == "pnode":
            val = instance.primary_node
          elif field == "snodes":
          elif field == "pnode":
            val = instance.primary_node
          elif field == "snodes":
@@ -3520,14 +4178,36 @@ class LUQueryInstances(NoHooksLU):
              val = live_data[instance.name].get("memory", "?")
            else:
              val = "-"
              val = live_data[instance.name].get("memory", "?")
            else:
              val = "-"
+        elif field == "vcpus":
+          val = i_be[constants.BE_VCPUS]
          elif field == "disk_template":
            val = instance.disk_template
          elif field == "ip":
          elif field == "disk_template":
            val = instance.disk_template
          elif field == "ip":
-          val = instance.nics[0].ip
+          if instance.nics:
+            val = instance.nics[0].ip
+          else:
+            val = None
+        elif field == "nic_mode":
+          if instance.nics:
+            val = i_nicp[0][constants.NIC_MODE]
+          else:
+            val = None
+        elif field == "nic_link":
+          if instance.nics:
+            val = i_nicp[0][constants.NIC_LINK]
+          else:
+            val = None
          elif field == "bridge":
          elif field == "bridge":
-          val = instance.nics[0].bridge
+          if (instance.nics and
+              i_nicp[0][constants.NIC_MODE] == constants.NIC_MODE_BRIDGED):
+            val = i_nicp[0][constants.NIC_LINK]
+          else:
+            val = None
          elif field == "mac":
          elif field == "mac":
-          val = instance.nics[0].mac
+          if instance.nics:
+            val = instance.nics[0].mac
+          else:
+            val = None
          elif field == "sda_size" or field == "sdb_size":
            idx = ord(field[2]) - ord('a')
            try:
          elif field == "sda_size" or field == "sdb_size":
            idx = ord(field[2]) - ord('a')
            try:
@@ -3539,12 +4219,6 @@ class LUQueryInstances(NoHooksLU):
            val = _ComputeDiskSize(instance.disk_template, disk_sizes)
          elif field == "tags":
            val = list(instance.GetTags())
            val = _ComputeDiskSize(instance.disk_template, disk_sizes)
          elif field == "tags":
            val = list(instance.GetTags())
-        elif field == "serial_no":
-          val = instance.serial_no
-        elif field == "network_port":
-          val = instance.network_port
-        elif field == "hypervisor":
-          val = instance.hypervisor
          elif field == "hvparams":
            val = i_hv
          elif (field.startswith(HVPREFIX) and
          elif field == "hvparams":
            val = i_hv
          elif (field.startswith(HVPREFIX) and
@@ -3577,8 +4251,17 @@ class LUQueryInstances(NoHooksLU):
                val = [nic.mac for nic in instance.nics]
              elif st_groups[1] == "ips":
                val = [nic.ip for nic in instance.nics]
                val = [nic.mac for nic in instance.nics]
              elif st_groups[1] == "ips":
                val = [nic.ip for nic in instance.nics]
+            elif st_groups[1] == "modes":
+              val = [nicp[constants.NIC_MODE] for nicp in i_nicp]
+            elif st_groups[1] == "links":
+              val = [nicp[constants.NIC_LINK] for nicp in i_nicp]
              elif st_groups[1] == "bridges":
              elif st_groups[1] == "bridges":
-              val = [nic.bridge for nic in instance.nics]
+              val = []
+              for nicp in i_nicp:
+                if nicp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
+                  val.append(nicp[constants.NIC_LINK])
+                else:
+                  val.append(None)
              else:
                # index-based item
                nic_idx = int(st_groups[2])
              else:
                # index-based item
                nic_idx = int(st_groups[2])
@@ -3589,14 +4272,23 @@ class LUQueryInstances(NoHooksLU):
                    val = instance.nics[nic_idx].mac
                  elif st_groups[1] == "ip":
                    val = instance.nics[nic_idx].ip
                    val = instance.nics[nic_idx].mac
                  elif st_groups[1] == "ip":
                    val = instance.nics[nic_idx].ip
+                elif st_groups[1] == "mode":
+                  val = i_nicp[nic_idx][constants.NIC_MODE]
+                elif st_groups[1] == "link":
+                  val = i_nicp[nic_idx][constants.NIC_LINK]
                  elif st_groups[1] == "bridge":
                  elif st_groups[1] == "bridge":
-                  val = instance.nics[nic_idx].bridge
+                  nic_mode = i_nicp[nic_idx][constants.NIC_MODE]
+                  if nic_mode == constants.NIC_MODE_BRIDGED:
+                    val = i_nicp[nic_idx][constants.NIC_LINK]
+                  else:
+                    val = None
                  else:
                    assert False, "Unhandled NIC parameter"
            else:
                  else:
                    assert False, "Unhandled NIC parameter"
            else:
-            assert False, "Unhandled variable parameter"
+            assert False, ("Declared but unhandled variable parameter '%s'" %
+                           field)
          else:
          else:
-          raise errors.ParameterError(field)
+          assert False, "Declared but unhandled parameter '%s'" % field
          iout.append(val)
        output.append(iout)
  
          iout.append(val)
        output.append(iout)
  
@@ -3657,10 +4349,15 @@ class LUFailoverInstance(LogicalUnit):
      target_node = secondary_nodes[0]
      _CheckNodeOnline(self, target_node)
      _CheckNodeNotDrained(self, target_node)
      target_node = secondary_nodes[0]
      _CheckNodeOnline(self, target_node)
      _CheckNodeNotDrained(self, target_node)
-    # check memory requirements on the secondary node
-    _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
-                         instance.name, bep[constants.BE_MEMORY],
-                         instance.hypervisor)
+    if instance.admin_up:
+      # check memory requirements on the secondary node
+      _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
+                           instance.name, bep[constants.BE_MEMORY],
+                           instance.hypervisor)
+    else:
+      self.LogInfo("Not checking memory on the secondary node as"
+                   " instance will not be started")
+
      # check bridge existance
      _CheckInstanceBridgesExist(self, instance, node=target_node)
  
      # check bridge existance
      _CheckInstanceBridgesExist(self, instance, node=target_node)
  
@@ -3689,7 +4386,7 @@ class LUFailoverInstance(LogicalUnit):
                   instance.name, source_node)
  
      result = self.rpc.call_instance_shutdown(source_node, instance)
                   instance.name, source_node)
  
      result = self.rpc.call_instance_shutdown(source_node, instance)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        if self.op.ignore_consistency:
          self.proc.LogWarning("Could not shutdown instance %s on node %s."
      if msg:
        if self.op.ignore_consistency:
          self.proc.LogWarning("Could not shutdown instance %s on node %s."
@@ -3715,7 +4412,7 @@ class LUFailoverInstance(LogicalUnit):
        logging.info("Starting instance %s on node %s",
                     instance.name, target_node)
  
        logging.info("Starting instance %s on node %s",
                     instance.name, target_node)
  
-      disks_ok, dummy = _AssembleInstanceDisks(self, instance,
+      disks_ok, _ = _AssembleInstanceDisks(self, instance,
                                                 ignore_secondaries=True)
        if not disks_ok:
          _ShutdownInstanceDisks(self, instance)
                                                 ignore_secondaries=True)
        if not disks_ok:
          _ShutdownInstanceDisks(self, instance)
@@ -3723,7 +4420,7 @@ class LUFailoverInstance(LogicalUnit):
  
        feedback_fn("* starting the instance on the target node")
        result = self.rpc.call_instance_start(target_node, instance, None, None)
  
        feedback_fn("* starting the instance on the target node")
        result = self.rpc.call_instance_start(target_node, instance, None, None)
-      msg = result.RemoteFailMsg()
+      msg = result.fail_msg
        if msg:
          _ShutdownInstanceDisks(self, instance)
          raise errors.OpExecError("Could not start instance %s on node %s: %s" %
        if msg:
          _ShutdownInstanceDisks(self, instance)
          raise errors.OpExecError("Could not start instance %s on node %s: %s" %
@@ -3745,9 +4442,14 @@ class LUMigrateInstance(LogicalUnit):
  
    def ExpandNames(self):
      self._ExpandAndLockInstance()
  
    def ExpandNames(self):
      self._ExpandAndLockInstance()
+
      self.needed_locks[locking.LEVEL_NODE] = []
      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
  
      self.needed_locks[locking.LEVEL_NODE] = []
      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
  
+    self._migrater = TLMigrateInstance(self, self.op.instance_name,
+                                       self.op.live, self.op.cleanup)
+    self.tasklets = [self._migrater]
+
    def DeclareLocks(self, level):
      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()
    def DeclareLocks(self, level):
      if level == locking.LEVEL_NODE:
        self._LockInstancesNodes()
@@ -3758,10 +4460,49 @@ class LUMigrateInstance(LogicalUnit):
      This runs on master, primary and secondary nodes of the instance.
  
      """
      This runs on master, primary and secondary nodes of the instance.
  
      """
-    env = _BuildInstanceHookEnvByObject(self, self.instance)
+    instance = self._migrater.instance
+    env = _BuildInstanceHookEnvByObject(self, instance)
      env["MIGRATE_LIVE"] = self.op.live
      env["MIGRATE_CLEANUP"] = self.op.cleanup
      env["MIGRATE_LIVE"] = self.op.live
      env["MIGRATE_CLEANUP"] = self.op.cleanup
-    nl = [self.cfg.GetMasterNode()] + list(self.instance.secondary_nodes)
+    nl = [self.cfg.GetMasterNode()] + list(instance.secondary_nodes)
+    return env, nl, nl
+
+
+class LUMoveInstance(LogicalUnit):
+  """Move an instance by data-copying.
+
+  """
+  HPATH = "instance-move"
+  HTYPE = constants.HTYPE_INSTANCE
+  _OP_REQP = ["instance_name", "target_node"]
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    """Expand the instance and target node names and set up node locks.
+
+    The expanded target node name is written back into the opcode. The
+    node lock list initially holds only the target node and is flagged
+    with LOCKS_APPEND for the later recalculation.
+
+    """
+    self._ExpandAndLockInstance()
+
+    full_name = self.cfg.ExpandNodeName(self.op.target_node)
+    if full_name is None:
+      # the opcode still carries the name as given by the caller here
+      err_msg = "Node '%s' not known" % self.op.target_node
+      raise errors.OpPrereqError(err_msg)
+
+    self.op.target_node = full_name
+    self.needed_locks[locking.LEVEL_NODE] = [full_name]
+    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
+
+  def DeclareLocks(self, level):
+    """Declare the locks for the given level.
+
+    Only the node level is handled: _LockInstancesNodes is called with
+    primary_only set. All other levels are left untouched.
+
+    """
+    if level != locking.LEVEL_NODE:
+      return
+    self._LockInstancesNodes(primary_only=True)
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    The node list consists of the master node, the instance's primary
+    node and the target node of the move; the same list is returned twice.
+    The environment holds TARGET_NODE plus whatever
+    _BuildInstanceHookEnvByObject provides for the instance.
+
+    @return: tuple of (environment dict, node list, node list)
+
+    """
+    env = {
+      "TARGET_NODE": self.op.target_node,
+      }
+    env.update(_BuildInstanceHookEnvByObject(self, self.instance))
+    nl = [self.cfg.GetMasterNode()] + [self.instance.primary_node,
+                                       self.op.target_node]
      return env, nl, nl
  
    def CheckPrereq(self):
      return env, nl, nl
  
    def CheckPrereq(self):
@@ -3770,11 +4511,215 @@ class LUMigrateInstance(LogicalUnit):
      This checks that the instance is in the cluster.
  
      """
      This checks that the instance is in the cluster.
  
      """
+    # Prerequisites for moving an instance by data copy: the instance and
+    # the target node must exist, the target must differ from the current
+    # primary node, every disk must be a plain LV or file device, and the
+    # target node must be online and not drained.
+    self.instance = instance = self.cfg.GetInstanceInfo(self.op.instance_name)
+    assert self.instance is not None, \
+      "Cannot retrieve locked instance %s" % self.op.instance_name
+
+    node = self.cfg.GetNodeInfo(self.op.target_node)
+    assert node is not None, \
+      "Cannot retrieve locked node %s" % self.op.target_node
+
+    self.target_node = target_node = node.name
+
+    if target_node == instance.primary_node:
+      raise errors.OpPrereqError("Instance %s is already on the node %s" %
+                                 (instance.name, target_node))
+
+    bep = self.cfg.GetClusterInfo().FillBE(instance)
+
+    for idx, dsk in enumerate(instance.disks):
+      if dsk.dev_type not in (constants.LD_LV, constants.LD_FILE):
+        # the message has a %d placeholder, so the disk index must be
+        # interpolated; otherwise the user sees a literal "%d"
+        raise errors.OpPrereqError("Instance disk %d has a complex layout,"
+                                   " cannot copy" % idx)
+
+    _CheckNodeOnline(self, target_node)
+    _CheckNodeNotDrained(self, target_node)
+
+    if instance.admin_up:
+      # check memory requirements on the target node; only needed when the
+      # instance is going to be started there
+      _CheckNodeFreeMemory(self, target_node, "failing over instance %s" %
+                           instance.name, bep[constants.BE_MEMORY],
+                           instance.hypervisor)
+    else:
+      self.LogInfo("Not checking memory on the secondary node as"
+                   " instance will not be started")
+
+    # check bridge existence
+    _CheckInstanceBridgesExist(self, instance, node=target_node)
+
+  def Exec(self, feedback_fn):
+    """Move an instance.
+
+    The move is done by shutting it down on its present node, copying
+    the data over (slow) and starting it on the new node.
+
+    The steps are: shut the instance down on the source node, create the
+    disks on the target node, copy each disk over, switch the primary
+    node in the configuration, remove the disks from the source node and,
+    if the instance is marked up, start it on the target node.
+
+    @param feedback_fn: not used by this method
+    @raise errors.OpExecError: if the shutdown fails (unless the opcode
+        sets ignore_consistency), or if disk creation, disk copy, disk
+        activation or the instance start fails
+
+    """
+    instance = self.instance
+
+    source_node = instance.primary_node
+    target_node = self.target_node
+
+    self.LogInfo("Shutting down instance %s on source node %s",
+                 instance.name, source_node)
+
+    result = self.rpc.call_instance_shutdown(source_node, instance)
+    msg = result.fail_msg
+    if msg:
+      # a failed shutdown is fatal unless the caller asked to ignore it
+      if self.op.ignore_consistency:
+        self.proc.LogWarning("Could not shutdown instance %s on node %s."
+                             " Proceeding anyway. Please make sure node"
+                             " %s is down. Error details: %s",
+                             instance.name, source_node, source_node, msg)
+      else:
+        raise errors.OpExecError("Could not shutdown instance %s on"
+                                 " node %s: %s" %
+                                 (instance.name, source_node, msg))
+
+    # create the target disks
+    try:
+      _CreateDisks(self, instance, target_node=target_node)
+    except errors.OpExecError:
+      self.LogWarning("Device creation failed, reverting...")
+      try:
+        _RemoveDisks(self, instance, target_node=target_node)
+      finally:
+        # the minors are released even if the removal itself raised; the
+        # bare raise then propagates the error to the caller
+        self.cfg.ReleaseDRBDMinors(instance.name)
+        raise
+
+    cluster_name = self.cfg.GetClusterInfo().cluster_name
+
+    errs = []
+    # activate, get path, copy the data over
+    for idx, disk in enumerate(instance.disks):
+      self.LogInfo("Copying data for disk %d", idx)
+      result = self.rpc.call_blockdev_assemble(target_node, disk,
+                                               instance.name, True)
+      if result.fail_msg:
+        self.LogWarning("Can't assemble newly created disk %d: %s",
+                        idx, result.fail_msg)
+        errs.append(result.fail_msg)
+        # stop at the first failure; the remaining disks are not attempted
+        break
+      # the assemble call's payload is passed on as the destination path
+      # of the export
+      dev_path = result.payload
+      result = self.rpc.call_blockdev_export(source_node, disk,
+                                             target_node, dev_path,
+                                             cluster_name)
+      if result.fail_msg:
+        self.LogWarning("Can't copy data over for disk %d: %s",
+                        idx, result.fail_msg)
+        errs.append(result.fail_msg)
+        break
+
+    if errs:
+      self.LogWarning("Some disks failed to copy, aborting")
+      try:
+        _RemoveDisks(self, instance, target_node=target_node)
+      finally:
+        # raised from the finally clause, so the copy error is what the
+        # caller sees even when the disk removal failed as well
+        self.cfg.ReleaseDRBDMinors(instance.name)
+        raise errors.OpExecError("Errors during disk copy: %s" %
+                                 (",".join(errs),))
+
+    # all disks were copied: record the target node as the new primary
+    instance.primary_node = target_node
+    self.cfg.Update(instance)
+
+    self.LogInfo("Removing the disks on the original node")
+    _RemoveDisks(self, instance, target_node=source_node)
+
+    # Only start the instance if it's marked as up
+    if instance.admin_up:
+      self.LogInfo("Starting instance %s on node %s",
+                   instance.name, target_node)
+
+      disks_ok, _ = _AssembleInstanceDisks(self, instance,
+                                           ignore_secondaries=True)
+      if not disks_ok:
+        _ShutdownInstanceDisks(self, instance)
+        raise errors.OpExecError("Can't activate the instance's disks")
+
+      result = self.rpc.call_instance_start(target_node, instance, None, None)
+      msg = result.fail_msg
+      if msg:
+        # shut the disks down again before reporting the failed start
+        _ShutdownInstanceDisks(self, instance)
+        raise errors.OpExecError("Could not start instance %s on node %s: %s" %
+                                 (instance.name, target_node, msg))
+
+
+class LUMigrateNode(LogicalUnit):
+  """Migrate all instances from a node.
+
+  One TLMigrateInstance tasklet is created for every instance returned by
+  _GetNodePrimaryInstances for the given node.
+
+  """
+  HPATH = "node-migrate"
+  HTYPE = constants.HTYPE_NODE
+  _OP_REQP = ["node_name", "live"]
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    """Expand the node name, create the tasklets and declare the locks.
+
+    @raise errors.OpPrereqError: if the node name cannot be expanded
+
+    """
+    # keep the expansion result in a local until it is known to be valid,
+    # so that the error message shows the name the caller passed in and
+    # not "None"
+    node_name = self.cfg.ExpandNodeName(self.op.node_name)
+    if node_name is None:
+      raise errors.OpPrereqError("Node '%s' not known" % self.op.node_name)
+    self.op.node_name = node_name
+
+    self.needed_locks = {
+      locking.LEVEL_NODE: [self.op.node_name],
+      }
+
+    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
+
+    # Create tasklets for migrating instances for all instances on this node
+    names = []
+    tasklets = []
+
+    for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
+      logging.debug("Migrating instance %s", inst.name)
+      names.append(inst.name)
+
+      # the last argument is the tasklet's "cleanup" flag
+      tasklets.append(TLMigrateInstance(self, inst.name, self.op.live, False))
+
+    self.tasklets = tasklets
+
+    # Declare instance locks
+    self.needed_locks[locking.LEVEL_INSTANCE] = names
+
+  def DeclareLocks(self, level):
+    """Declare the locks for the given level.
+
+    Only the node level is handled, by calling _LockInstancesNodes.
+
+    """
+    if level == locking.LEVEL_NODE:
+      self._LockInstancesNodes()
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    The node list contains only the master node; the same list is
+    returned twice.
+
+    @return: tuple of (environment dict, node list, node list)
+
+    """
+    env = {
+      "NODE_NAME": self.op.node_name,
+      }
+
+    nl = [self.cfg.GetMasterNode()]
+
+    return (env, nl, nl)
+
+
+class TLMigrateInstance(Tasklet):
+  def __init__(self, lu, instance_name, live, cleanup):
+    """Set up the tasklet and remember the migration parameters.
+
+    @param lu: passed on to Tasklet.__init__
+    @param instance_name: name of the instance; expanded and looked up
+        in CheckPrereq
+    @param live: stored as C{self.live}
+    @param cleanup: stored as C{self.cleanup}; when false, CheckPrereq
+        additionally checks that the target node is not drained and that
+        the instance is migratable
+
+    """
+    Tasklet.__init__(self, lu)
+
+    # Arguments kept for the later phases of the tasklet
+    self.cleanup = cleanup
+    self.live = live
+    self.instance_name = instance_name
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks that the instance is in the cluster.
+
+    """
      instance = self.cfg.GetInstanceInfo(
      instance = self.cfg.GetInstanceInfo(
-      self.cfg.ExpandInstanceName(self.op.instance_name))
+      self.cfg.ExpandInstanceName(self.instance_name))
      if instance is None:
        raise errors.OpPrereqError("Instance '%s' not known" %
      if instance is None:
        raise errors.OpPrereqError("Instance '%s' not known" %
-                                 self.op.instance_name)
+                                 self.instance_name)
  
      if instance.disk_template != constants.DT_DRBD8:
        raise errors.OpPrereqError("Instance's disk layout is not"
  
      if instance.disk_template != constants.DT_DRBD8:
        raise errors.OpPrereqError("Instance's disk layout is not"
@@ -3796,14 +4741,11 @@ class LUMigrateInstance(LogicalUnit):
      # check bridge existance
      _CheckInstanceBridgesExist(self, instance, node=target_node)
  
      # check bridge existance
      _CheckInstanceBridgesExist(self, instance, node=target_node)
  
-    if not self.op.cleanup:
+    if not self.cleanup:
        _CheckNodeNotDrained(self, target_node)
        result = self.rpc.call_instance_migratable(instance.primary_node,
                                                   instance)
        _CheckNodeNotDrained(self, target_node)
        result = self.rpc.call_instance_migratable(instance.primary_node,
                                                   instance)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
-                                   msg)
+      result.Raise("Can't migrate, please use failover", prereq=True)
  
      self.instance = instance
  
  
      self.instance = instance
  
@@ -3822,10 +4764,7 @@ class LUMigrateInstance(LogicalUnit):
                                              self.instance.disks)
        min_percent = 100
        for node, nres in result.items():
                                              self.instance.disks)
        min_percent = 100
        for node, nres in result.items():
-        msg = nres.RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Cannot resync disks on node %s: %s" %
-                                   (node, msg))
+        nres.Raise("Cannot resync disks on node %s" % node)
          node_done, node_percent = nres.payload
          all_done = all_done and node_done
          if node_percent is not None:
          node_done, node_percent = nres.payload
          all_done = all_done and node_done
          if node_percent is not None:
@@ -3846,10 +4785,7 @@ class LUMigrateInstance(LogicalUnit):
  
      result = self.rpc.call_blockdev_close(node, self.instance.name,
                                            self.instance.disks)
  
      result = self.rpc.call_blockdev_close(node, self.instance.name,
                                            self.instance.disks)
-    msg = result.RemoteFailMsg()
-    if msg:
-      raise errors.OpExecError("Cannot change disk to secondary on node %s,"
-                               " error %s" % (node, msg))
+    result.Raise("Cannot change disk to secondary on node %s" % node)
  
    def _GoStandalone(self):
      """Disconnect from the network.
  
    def _GoStandalone(self):
      """Disconnect from the network.
@@ -3859,10 +4795,7 @@ class LUMigrateInstance(LogicalUnit):
      result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
                                                 self.instance.disks)
      for node, nres in result.items():
      result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
                                                 self.instance.disks)
      for node, nres in result.items():
-      msg = nres.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Cannot disconnect disks node %s,"
-                                 " error %s" % (node, msg))
+      nres.Raise("Cannot disconnect disks node %s" % node)
  
    def _GoReconnect(self, multimaster):
      """Reconnect to the network.
  
    def _GoReconnect(self, multimaster):
      """Reconnect to the network.
@@ -3877,10 +4810,7 @@ class LUMigrateInstance(LogicalUnit):
                                             self.instance.disks,
                                             self.instance.name, multimaster)
      for node, nres in result.items():
                                             self.instance.disks,
                                             self.instance.name, multimaster)
      for node, nres in result.items():
-      msg = nres.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Cannot change disks config on node %s,"
-                                 " error: %s" % (node, msg))
+      nres.Raise("Cannot change disks config on node %s" % node)
  
    def _ExecCleanup(self):
      """Try to cleanup after a failed migration.
  
    def _ExecCleanup(self):
      """Try to cleanup after a failed migration.
@@ -3905,9 +4835,7 @@ class LUMigrateInstance(LogicalUnit):
                       " a bad state)")
      ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
      for node, result in ins_l.items():
                       " a bad state)")
      ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
      for node, result in ins_l.items():
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Can't contact node %s: %s" % (node, msg))
+      result.Raise("Can't contact node %s" % node)
  
      runningon_source = instance.name in ins_l[source_node].payload
      runningon_target = instance.name in ins_l[target_node].payload
  
      runningon_source = instance.name in ins_l[source_node].payload
      runningon_target = instance.name in ins_l[target_node].payload
@@ -3960,10 +4888,10 @@ class LUMigrateInstance(LogicalUnit):
        self._GoReconnect(False)
        self._WaitUntilSync()
      except errors.OpExecError, err:
        self._GoReconnect(False)
        self._WaitUntilSync()
      except errors.OpExecError, err:
-      self.LogWarning("Migration failed and I can't reconnect the"
-                      " drives: error '%s'\n"
-                      "Please look and recover the instance status" %
-                      str(err))
+      self.lu.LogWarning("Migration failed and I can't reconnect the"
+                         " drives: error '%s'\n"
+                         "Please look and recover the instance status" %
+                         str(err))
  
    def _AbortMigration(self):
      """Call the hypervisor code to abort a started migration.
  
    def _AbortMigration(self):
      """Call the hypervisor code to abort a started migration.
@@ -3977,7 +4905,7 @@ class LUMigrateInstance(LogicalUnit):
                                                      instance,
                                                      migration_info,
                                                      False)
                                                      instance,
                                                      migration_info,
                                                      False)
-    abort_msg = abort_result.RemoteFailMsg()
+    abort_msg = abort_result.fail_msg
      if abort_msg:
        logging.error("Aborting migration failed on target node %s: %s" %
                      (target_node, abort_msg))
      if abort_msg:
        logging.error("Aborting migration failed on target node %s: %s" %
                      (target_node, abort_msg))
@@ -4009,7 +4937,7 @@ class LUMigrateInstance(LogicalUnit):
  
      # First get the migration information from the remote node
      result = self.rpc.call_migration_info(source_node, instance)
  
      # First get the migration information from the remote node
      result = self.rpc.call_migration_info(source_node, instance)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        log_err = ("Failed fetching source migration information from %s: %s" %
                   (source_node, msg))
      if msg:
        log_err = ("Failed fetching source migration information from %s: %s" %
                   (source_node, msg))
@@ -4030,7 +4958,7 @@ class LUMigrateInstance(LogicalUnit):
                                             migration_info,
                                             self.nodes_ip[target_node])
  
                                             migration_info,
                                             self.nodes_ip[target_node])
  
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        logging.error("Instance pre-migration failed, trying to revert"
                      " disk status: %s", msg)
      if msg:
        logging.error("Instance pre-migration failed, trying to revert"
                      " disk status: %s", msg)
@@ -4043,8 +4971,8 @@ class LUMigrateInstance(LogicalUnit):
      time.sleep(10)
      result = self.rpc.call_instance_migrate(source_node, instance,
                                              self.nodes_ip[target_node],
      time.sleep(10)
      result = self.rpc.call_instance_migrate(source_node, instance,
                                              self.nodes_ip[target_node],
-                                            self.op.live)
-    msg = result.RemoteFailMsg()
+                                            self.live)
+    msg = result.fail_msg
      if msg:
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
      if msg:
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
@@ -4062,7 +4990,7 @@ class LUMigrateInstance(LogicalUnit):
                                                instance,
                                                migration_info,
                                                True)
                                                instance,
                                                migration_info,
                                                True)
-    msg = result.RemoteFailMsg()
+    msg = result.fail_msg
      if msg:
        logging.error("Instance migration succeeded, but finalization failed:"
                      " %s" % msg)
      if msg:
        logging.error("Instance migration succeeded, but finalization failed:"
                      " %s" % msg)
@@ -4081,6 +5009,8 @@ class LUMigrateInstance(LogicalUnit):
      """Perform the migration.
  
      """
      """Perform the migration.
  
      """
+    feedback_fn("Migrating instance %s" % self.instance.name)
+
      self.feedback_fn = feedback_fn
  
      self.source_node = self.instance.primary_node
      self.feedback_fn = feedback_fn
  
      self.source_node = self.instance.primary_node
@@ -4090,7 +5020,8 @@ class LUMigrateInstance(LogicalUnit):
        self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
        self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
        }
        self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
        self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
        }
-    if self.op.cleanup:
+
+    if self.cleanup:
        return self._ExecCleanup()
      else:
        return self._ExecMigration()
        return self._ExecCleanup()
      else:
        return self._ExecMigration()
@@ -4162,11 +5093,8 @@ def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
    lu.cfg.SetDiskID(device, node)
    result = lu.rpc.call_blockdev_create(node, device, device.size,
                                         instance.name, force_open, info)
    lu.cfg.SetDiskID(device, node)
    result = lu.rpc.call_blockdev_create(node, device, device.size,
                                         instance.name, force_open, info)
-  msg = result.RemoteFailMsg()
-  if msg:
-    raise errors.OpExecError("Can't create block device %s on"
-                             " node %s for instance %s: %s" %
-                             (device, node, instance.name, msg))
+  result.Raise("Can't create block device %s on"
+               " node %s for instance %s" % (device, node, instance.name))
    if device.physical_id is None:
      device.physical_id = result.payload
  
    if device.physical_id is None:
      device.physical_id = result.payload
  
@@ -4224,7 +5152,7 @@ def _GenerateDiskTemplate(lu, template_name,
      if len(secondary_nodes) != 0:
        raise errors.ProgrammerError("Wrong template configuration")
  
      if len(secondary_nodes) != 0:
        raise errors.ProgrammerError("Wrong template configuration")
  
-    names = _GenerateUniqueNames(lu, [".disk%d" % i
+    names = _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
                                        for i in range(disk_count)])
      for idx, disk in enumerate(disk_info):
        disk_index = idx + base_index
                                        for i in range(disk_count)])
      for idx, disk in enumerate(disk_info):
        disk_index = idx + base_index
@@ -4241,7 +5169,7 @@ def _GenerateDiskTemplate(lu, template_name,
        [primary_node, remote_node] * len(disk_info), instance_name)
  
      names = []
        [primary_node, remote_node] * len(disk_info), instance_name)
  
      names = []
-    for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % i
+    for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % (base_index + i)
                                                 for i in range(disk_count)]):
        names.append(lv_prefix + "_data")
        names.append(lv_prefix + "_meta")
                                                 for i in range(disk_count)]):
        names.append(lv_prefix + "_data")
        names.append(lv_prefix + "_meta")
@@ -4278,7 +5206,7 @@ def _GetInstanceInfoText(instance):
    return "originstname+%s" % instance.name
  
  
    return "originstname+%s" % instance.name
  
  
-def _CreateDisks(lu, instance):
+def _CreateDisks(lu, instance, to_skip=None, target_node=None):
    """Create all disks for an instance.
  
    This abstracts away some work from AddInstance.
    """Create all disks for an instance.
  
    This abstracts away some work from AddInstance.
@@ -4287,36 +5215,43 @@ def _CreateDisks(lu, instance):
    @param lu: the logical unit on whose behalf we execute
    @type instance: L{objects.Instance}
    @param instance: the instance whose disks we should create
    @param lu: the logical unit on whose behalf we execute
    @type instance: L{objects.Instance}
    @param instance: the instance whose disks we should create
+  @type to_skip: list
+  @param to_skip: list of indices to skip
+  @type target_node: string
+  @param target_node: if passed, overrides the target node for creation
    @rtype: boolean
    @return: the success of the creation
  
    """
    info = _GetInstanceInfoText(instance)
    @rtype: boolean
    @return: the success of the creation
  
    """
    info = _GetInstanceInfoText(instance)
-  pnode = instance.primary_node
+  if target_node is None:
+    pnode = instance.primary_node
+    all_nodes = instance.all_nodes
+  else:
+    pnode = target_node
+    all_nodes = [pnode]
  
    if instance.disk_template == constants.DT_FILE:
      file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
      result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
  
  
    if instance.disk_template == constants.DT_FILE:
      file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
      result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
  
-    if result.failed or not result.data:
-      raise errors.OpExecError("Could not connect to node '%s'" % pnode)
-
-    if not result.data[0]:
-      raise errors.OpExecError("Failed to create directory '%s'" %
-                               file_storage_dir)
+    result.Raise("Failed to create directory '%s' on"
+                 " node %s" % (file_storage_dir, pnode))
  
    # Note: this needs to be kept in sync with adding of disks in
    # LUSetInstanceParams
  
    # Note: this needs to be kept in sync with adding of disks in
    # LUSetInstanceParams
-  for device in instance.disks:
+  for idx, device in enumerate(instance.disks):
+    if to_skip and idx in to_skip:
+      continue
      logging.info("Creating volume %s for instance %s",
                   device.iv_name, instance.name)
      #HARDCODE
      logging.info("Creating volume %s for instance %s",
                   device.iv_name, instance.name)
      #HARDCODE
-    for node in instance.all_nodes:
+    for node in all_nodes:
        f_create = node == pnode
        _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
  
  
        f_create = node == pnode
        _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
  
  
-def _RemoveDisks(lu, instance):
+def _RemoveDisks(lu, instance, target_node=None):
    """Remove all disks for an instance.
  
    This abstracts away some work from `AddInstance()` and
    """Remove all disks for an instance.
  
    This abstracts away some work from `AddInstance()` and
@@ -4328,6 +5263,8 @@ def _RemoveDisks(lu, instance):
    @param lu: the logical unit on whose behalf we execute
    @type instance: L{objects.Instance}
    @param instance: the instance whose disks we should remove
    @param lu: the logical unit on whose behalf we execute
    @type instance: L{objects.Instance}
    @param instance: the instance whose disks we should remove
+  @type target_node: string
+  @param target_node: used to override the node on which to remove the disks
    @rtype: boolean
    @return: the success of the removal
  
    @rtype: boolean
    @return: the success of the removal
  
@@ -4336,9 +5273,13 @@ def _RemoveDisks(lu, instance):
  
    all_result = True
    for device in instance.disks:
  
    all_result = True
    for device in instance.disks:
-    for node, disk in device.ComputeNodeTree(instance.primary_node):
+    if target_node:
+      edata = [(target_node, device)]
+    else:
+      edata = device.ComputeNodeTree(instance.primary_node)
+    for node, disk in edata:
        lu.cfg.SetDiskID(disk, node)
        lu.cfg.SetDiskID(disk, node)
-      msg = lu.rpc.call_blockdev_remove(node, disk).RemoteFailMsg()
+      msg = lu.rpc.call_blockdev_remove(node, disk).fail_msg
        if msg:
          lu.LogWarning("Could not remove block device %s on node %s,"
                        " continuing anyway: %s", device.iv_name, node, msg)
        if msg:
          lu.LogWarning("Could not remove block device %s on node %s,"
                        " continuing anyway: %s", device.iv_name, node, msg)
@@ -4346,10 +5287,14 @@ def _RemoveDisks(lu, instance):
  
    if instance.disk_template == constants.DT_FILE:
      file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
  
    if instance.disk_template == constants.DT_FILE:
      file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
-    result = lu.rpc.call_file_storage_dir_remove(instance.primary_node,
-                                                 file_storage_dir)
-    if result.failed or not result.data:
-      logging.error("Could not remove directory '%s'", file_storage_dir)
+    if target_node:
+      tgt = target_node
+    else:
+      tgt = instance.primary_node
+    result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
+    if result.fail_msg:
+      lu.LogWarning("Could not remove directory '%s' on node %s: %s",
+                    file_storage_dir, tgt, result.fail_msg)
        all_result = False
  
    return all_result
        all_result = False
  
    return all_result
@@ -4399,10 +5344,7 @@ def _CheckHVParams(lu, nodenames, hvname, hvparams):
      info = hvinfo[node]
      if info.offline:
        continue
      info = hvinfo[node]
      if info.offline:
        continue
-    msg = info.RemoteFailMsg()
-    if msg:
-      raise errors.OpPrereqError("Hypervisor parameter validation"
-                                 " failed on node %s: %s" % (node, msg))
+    info.Raise("Hypervisor parameter validation failed on node %s" % node)
  
  
  class LUCreateInstance(LogicalUnit):
  
  
  class LUCreateInstance(LogicalUnit):
@@ -4467,6 +5409,7 @@ class LUCreateInstance(LogicalUnit):
                                    self.op.hvparams)
      hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
      hv_type.CheckParameterSyntax(filled_hvp)
                                    self.op.hvparams)
      hv_type = hypervisor.GetHypervisor(self.op.hypervisor)
      hv_type.CheckParameterSyntax(filled_hvp)
+    self.hv_full = filled_hvp
  
      # fill and remember the beparams dict
      utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
  
      # fill and remember the beparams dict
      utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
@@ -4523,11 +5466,18 @@ class LUCreateInstance(LogicalUnit):
          if not utils.IsValidMac(mac.lower()):
            raise errors.OpPrereqError("Invalid MAC address specified: %s" %
                                       mac)
          if not utils.IsValidMac(mac.lower()):
            raise errors.OpPrereqError("Invalid MAC address specified: %s" %
                                       mac)
+        else:
+          # or validate/reserve the current one
+          if self.cfg.IsMacInUse(mac):
+            raise errors.OpPrereqError("MAC address %s already in use"
+                                       " in cluster" % mac)
+
        # bridge verification
        bridge = nic.get("bridge", None)
        link = nic.get("link", None)
        if bridge and link:
        # bridge verification
        bridge = nic.get("bridge", None)
        link = nic.get("link", None)
        if bridge and link:
-        raise errors.OpPrereqError("Cannot pass 'bridge' and 'link' at the same time")
+        raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
+                                   " at the same time")
        elif bridge and nic_mode == constants.NIC_MODE_ROUTED:
          raise errors.OpPrereqError("Cannot pass 'bridge' on a routed nic")
        elif bridge:
        elif bridge and nic_mode == constants.NIC_MODE_ROUTED:
          raise errors.OpPrereqError("Cannot pass 'bridge' on a routed nic")
        elif bridge:
@@ -4618,7 +5568,7 @@ class LUCreateInstance(LogicalUnit):
  
      """
      nics = [n.ToDict() for n in self.nics]
  
      """
      nics = [n.ToDict() for n in self.nics]
-    ial = IAllocator(self,
+    ial = IAllocator(self.cfg, self.rpc,
                       mode=constants.IALLOCATOR_MODE_ALLOC,
                       name=self.op.instance_name,
                       disk_template=self.op.disk_template,
                       mode=constants.IALLOCATOR_MODE_ALLOC,
                       name=self.op.instance_name,
                       disk_template=self.op.disk_template,
@@ -4671,9 +5621,12 @@ class LUCreateInstance(LogicalUnit):
        os_type=self.op.os_type,
        memory=self.be_full[constants.BE_MEMORY],
        vcpus=self.be_full[constants.BE_VCPUS],
        os_type=self.op.os_type,
        memory=self.be_full[constants.BE_MEMORY],
        vcpus=self.be_full[constants.BE_VCPUS],
-      nics=_PreBuildNICHooksList(self, self.nics),
+      nics=_NICListToTuple(self, self.nics),
        disk_template=self.op.disk_template,
        disks=[(d["size"], d["mode"]) for d in self.disks],
        disk_template=self.op.disk_template,
        disks=[(d["size"], d["mode"]) for d in self.disks],
+      bep=self.be_full,
+      hvp=self.hv_full,
+      hypervisor_name=self.op.hypervisor,
      ))
  
      nl = ([self.cfg.GetMasterNode(), self.op.pnode] +
      ))
  
      nl = ([self.cfg.GetMasterNode(), self.op.pnode] +
@@ -4699,7 +5652,7 @@ class LUCreateInstance(LogicalUnit):
          exp_list = self.rpc.call_export_list(locked_nodes)
          found = False
          for node in exp_list:
          exp_list = self.rpc.call_export_list(locked_nodes)
          found = False
          for node in exp_list:
-          if exp_list[node].RemoteFailMsg():
+          if exp_list[node].fail_msg:
              continue
            if src_path in exp_list[node].payload:
              found = True
              continue
            if src_path in exp_list[node].payload:
              found = True
@@ -4713,10 +5666,7 @@ class LUCreateInstance(LogicalUnit):
  
        _CheckNodeOnline(self, src_node)
        result = self.rpc.call_export_info(src_node, src_path)
  
        _CheckNodeOnline(self, src_node)
        result = self.rpc.call_export_info(src_node, src_path)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpPrereqError("No export or invalid export found in"
-                                   " dir %s: %s" % (src_path, msg))
+      result.Raise("No export or invalid export found in dir %s" % src_path)
  
        export_info = objects.SerializableConfigParser.Loads(str(result.payload))
        if not export_info.has_section(constants.INISECT_EXP):
  
        export_info = objects.SerializableConfigParser.Loads(str(result.payload))
        if not export_info.has_section(constants.INISECT_EXP):
@@ -4824,10 +5774,7 @@ class LUCreateInstance(LogicalUnit):
                                           self.op.hypervisor)
        for node in nodenames:
          info = nodeinfo[node]
                                           self.op.hypervisor)
        for node in nodenames:
          info = nodeinfo[node]
-        msg = info.RemoteFailMsg()
-        if msg:
-          raise errors.OpPrereqError("Cannot get current information"
-                                     " from node %s: %s" % (node, msg))
+        info.Raise("Cannot get current information from node %s" % node)
          info = info.payload
          vg_free = info.get('vg_free', None)
          if not isinstance(vg_free, int):
          info = info.payload
          vg_free = info.get('vg_free', None)
          if not isinstance(vg_free, int):
@@ -4842,10 +5789,8 @@ class LUCreateInstance(LogicalUnit):
  
      # os verification
      result = self.rpc.call_os_get(pnode.name, self.op.os_type)
  
      # os verification
      result = self.rpc.call_os_get(pnode.name, self.op.os_type)
-    result.Raise()
-    if not isinstance(result.data, objects.OS):
-      raise errors.OpPrereqError("OS '%s' not in supported os list for"
-                                 " primary node"  % self.op.os_type)
+    result.Raise("OS '%s' not in supported os list for primary node %s" %
+                 (self.op.os_type, pnode.name), prereq=True)
  
      _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
  
  
      _CheckNicsBridgesExist(self, self.nics, self.pnode.name)
  
@@ -4856,6 +5801,8 @@ class LUCreateInstance(LogicalUnit):
                             self.be_full[constants.BE_MEMORY],
                             self.op.hypervisor)
  
                             self.be_full[constants.BE_MEMORY],
                             self.op.hypervisor)
  
+    self.dry_run_result = list(nodenames)
+
    def Exec(self, feedback_fn):
      """Create and add the instance to the cluster.
  
    def Exec(self, feedback_fn):
      """Create and add the instance to the cluster.
  
@@ -4957,11 +5904,8 @@ class LUCreateInstance(LogicalUnit):
        if self.op.mode == constants.INSTANCE_CREATE:
          feedback_fn("* running the instance OS create scripts...")
          result = self.rpc.call_instance_os_add(pnode_name, iobj, False)
        if self.op.mode == constants.INSTANCE_CREATE:
          feedback_fn("* running the instance OS create scripts...")
          result = self.rpc.call_instance_os_add(pnode_name, iobj, False)
-        msg = result.RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Could not add os for instance %s"
-                                   " on node %s: %s" %
-                                   (instance, pnode_name, msg))
+        result.Raise("Could not add os for instance %s"
+                     " on node %s" % (instance, pnode_name))
  
        elif self.op.mode == constants.INSTANCE_IMPORT:
          feedback_fn("* running the instance OS import scripts...")
  
        elif self.op.mode == constants.INSTANCE_IMPORT:
          feedback_fn("* running the instance OS import scripts...")
@@ -4971,7 +5915,7 @@ class LUCreateInstance(LogicalUnit):
          import_result = self.rpc.call_instance_os_import(pnode_name, iobj,
                                                           src_node, src_images,
                                                           cluster_name)
          import_result = self.rpc.call_instance_os_import(pnode_name, iobj,
                                                           src_node, src_images,
                                                           cluster_name)
-        msg = import_result.RemoteFailMsg()
+        msg = import_result.fail_msg
          if msg:
            self.LogWarning("Error while importing the disk images for instance"
                            " %s on node %s: %s" % (instance, pnode_name, msg))
          if msg:
            self.LogWarning("Error while importing the disk images for instance"
                            " %s on node %s: %s" % (instance, pnode_name, msg))
@@ -4986,9 +5930,9 @@ class LUCreateInstance(LogicalUnit):
        logging.info("Starting instance %s on node %s", instance, pnode_name)
        feedback_fn("* starting instance...")
        result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
        logging.info("Starting instance %s on node %s", instance, pnode_name)
        feedback_fn("* starting instance...")
        result = self.rpc.call_instance_start(pnode_name, iobj, None, None)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Could not start instance: %s" % msg)
+      result.Raise("Could not start instance")
+
+    return list(iobj.all_nodes)
  
  
  class LUConnectConsole(NoHooksLU):
  
  
  class LUConnectConsole(NoHooksLU):
@@ -5025,10 +5969,7 @@ class LUConnectConsole(NoHooksLU):
  
      node_insts = self.rpc.call_instance_list([node],
                                               [instance.hypervisor])[node]
  
      node_insts = self.rpc.call_instance_list([node],
                                               [instance.hypervisor])[node]
-    msg = node_insts.RemoteFailMsg()
-    if msg:
-      raise errors.OpExecError("Can't get node information from %s: %s" %
-                               (node, msg))
+    node_insts.Raise("Can't get node information from %s" % node)
  
      if instance.name not in node_insts.payload:
        raise errors.OpExecError("Instance %s is not running." % instance.name)
  
      if instance.name not in node_insts.payload:
        raise errors.OpExecError("Instance %s is not running." % instance.name)
@@ -5062,92 +6003,243 @@ class LUReplaceDisks(LogicalUnit):
      if not hasattr(self.op, "iallocator"):
        self.op.iallocator = None
  
      if not hasattr(self.op, "iallocator"):
        self.op.iallocator = None
  
-    # check for valid parameter combination
-    cnt = [self.op.remote_node, self.op.iallocator].count(None)
-    if self.op.mode == constants.REPLACE_DISK_CHG:
-      if cnt == 2:
-        raise errors.OpPrereqError("When changing the secondary either an"
-                                   " iallocator script must be used or the"
-                                   " new node given")
-      elif cnt == 0:
-        raise errors.OpPrereqError("Give either the iallocator or the new"
-                                   " secondary, not both")
-    else: # not replacing the secondary
-      if cnt != 2:
-        raise errors.OpPrereqError("The iallocator and new node options can"
-                                   " be used only when changing the"
-                                   " secondary node")
+    TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
+                                  self.op.iallocator)
  
    def ExpandNames(self):
      self._ExpandAndLockInstance()
  
      if self.op.iallocator is not None:
        self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
  
    def ExpandNames(self):
      self._ExpandAndLockInstance()
  
      if self.op.iallocator is not None:
        self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
+
      elif self.op.remote_node is not None:
        remote_node = self.cfg.ExpandNodeName(self.op.remote_node)
        if remote_node is None:
          raise errors.OpPrereqError("Node '%s' not known" %
                                     self.op.remote_node)
      elif self.op.remote_node is not None:
        remote_node = self.cfg.ExpandNodeName(self.op.remote_node)
        if remote_node is None:
          raise errors.OpPrereqError("Node '%s' not known" %
                                     self.op.remote_node)
+
        self.op.remote_node = remote_node
        self.op.remote_node = remote_node
+
        # Warning: do not remove the locking of the new secondary here
        # unless DRBD8.AddChildren is changed to work in parallel;
        # currently it doesn't since parallel invocations of
        # FindUnusedMinor will conflict
        self.needed_locks[locking.LEVEL_NODE] = [remote_node]
        self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
        # Warning: do not remove the locking of the new secondary here
        # unless DRBD8.AddChildren is changed to work in parallel;
        # currently it doesn't since parallel invocations of
        # FindUnusedMinor will conflict
        self.needed_locks[locking.LEVEL_NODE] = [remote_node]
        self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
+
      else:
        self.needed_locks[locking.LEVEL_NODE] = []
        self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
  
      else:
        self.needed_locks[locking.LEVEL_NODE] = []
        self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
  
-  def DeclareLocks(self, level):
-    # If we're not already locking all nodes in the set we have to declare the
-    # instance's primary/secondary nodes.
-    if (level == locking.LEVEL_NODE and
-        self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET):
-      self._LockInstancesNodes()
+    self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
+                                   self.op.iallocator, self.op.remote_node,
+                                   self.op.disks)
+
+    self.tasklets = [self.replacer]
+
+  def DeclareLocks(self, level):
+    """Declare the node locks needed for the instance.
+
+    Only the node level is handled here; all other levels are ignored.
+
+    """
+    if level != locking.LEVEL_NODE:
+      return
+
+    # Unless the whole node set is already being locked, the instance's
+    # primary/secondary nodes have to be declared.
+    if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
+      self._LockInstancesNodes()
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    The environment carries the replacement mode, the new and the old
+    secondary node plus the generic per-instance variables. The node list
+    consists of the master, the instance's primary node and, if one was
+    given, the new secondary node.
+
+    """
+    inst = self.replacer.instance
+
+    env = {}
+    env["MODE"] = self.op.mode
+    env["NEW_SECONDARY"] = self.op.remote_node
+    env["OLD_SECONDARY"] = inst.secondary_nodes[0]
+    env.update(_BuildInstanceHookEnvByObject(self, inst))
+
+    hook_nodes = [self.cfg.GetMasterNode(), inst.primary_node]
+    if self.op.remote_node is not None:
+      hook_nodes.append(self.op.remote_node)
+
+    # The very same list object is returned for both node lists
+    return (env, hook_nodes, hook_nodes)
+
+
+class LUEvacuateNode(LogicalUnit):
+  """Relocate the secondary instances from a node.
+
+  For every instance returned by C{_GetNodeSecondaryInstances} for the given
+  node, one L{TLReplaceDisks} tasklet in C{REPLACE_DISK_CHG} mode is created.
+  The new secondary is either the explicitly given remote node or is chosen
+  by the given iallocator.
+
+  """
+  HPATH = "node-evacuate"
+  HTYPE = constants.HTYPE_NODE
+  _OP_REQP = ["node_name"]
+  REQ_BGL = False
+
+  def CheckArguments(self):
+    """Default the optional parameters and validate their combination.
+
+    """
+    if not hasattr(self.op, "remote_node"):
+      self.op.remote_node = None
+    if not hasattr(self.op, "iallocator"):
+      self.op.iallocator = None
+
+    TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG,
+                                  self.op.remote_node,
+                                  self.op.iallocator)
+
+  def ExpandNames(self):
+    """Expand the node names, declare the locks and create the tasklets.
+
+    """
+    # Keep the name as given by the caller until the expansion has succeeded,
+    # otherwise the error message would report "None" instead of the unknown
+    # node name
+    node_name = self.cfg.ExpandNodeName(self.op.node_name)
+    if node_name is None:
+      raise errors.OpPrereqError("Node '%s' not known" % self.op.node_name)
+
+    self.op.node_name = node_name
+
+    self.needed_locks = {}
+
+    # Declare node locks
+    if self.op.iallocator is not None:
+      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
+
+    elif self.op.remote_node is not None:
+      remote_node = self.cfg.ExpandNodeName(self.op.remote_node)
+      if remote_node is None:
+        raise errors.OpPrereqError("Node '%s' not known" %
+                                   self.op.remote_node)
+
+      self.op.remote_node = remote_node
+
+      # Warning: do not remove the locking of the new secondary here
+      # unless DRBD8.AddChildren is changed to work in parallel;
+      # currently it doesn't since parallel invocations of
+      # FindUnusedMinor will conflict
+      self.needed_locks[locking.LEVEL_NODE] = [remote_node]
+      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
+
+    else:
+      raise errors.OpPrereqError("Invalid parameters")
+
+    # Create tasklets for replacing disks for all secondary instances on this
+    # node
+    names = []
+    tasklets = []
+
+    for inst in _GetNodeSecondaryInstances(self.cfg, self.op.node_name):
+      logging.debug("Replacing disks for instance %s", inst.name)
+      names.append(inst.name)
+
+      replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG,
+                                self.op.iallocator, self.op.remote_node, [])
+      tasklets.append(replacer)
+
+    self.tasklets = tasklets
+    self.instance_names = names
+
+    # Declare instance locks
+    self.needed_locks[locking.LEVEL_INSTANCE] = self.instance_names
+
+  def DeclareLocks(self, level):
+    """Declare the node locks of the affected instances.
+
+    """
+    # If we're not already locking all nodes in the set we have to declare the
+    # instance's primary/secondary nodes.
+    if (level == locking.LEVEL_NODE and
+        self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET):
+      self._LockInstancesNodes()
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    The node list contains the master and, if one was given, the new
+    secondary node.
+
+    """
+    env = {
+      "NODE_NAME": self.op.node_name,
+      }
+
+    nl = [self.cfg.GetMasterNode()]
+
+    if self.op.remote_node is not None:
+      env["NEW_SECONDARY"] = self.op.remote_node
+      nl.append(self.op.remote_node)
+
+    return (env, nl, nl)
+
+
+class TLReplaceDisks(Tasklet):
+  """Replaces disks for an instance.
+
+  Note: Locking is not within the scope of this class.
+
+  """
+  def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
+               disks):
+    """Initializes this class.
+
+    @param lu: the logical unit this tasklet is created for (passed on to
+        C{Tasklet.__init__})
+    @param instance_name: name of the instance whose disks are replaced
+    @param mode: the replacement mode; compared against the
+        C{constants.REPLACE_DISK_*} values in L{CheckPrereq}
+    @param iallocator_name: name of the iallocator used to compute the new
+        secondary node, or None
+    @param remote_node: name of the new secondary node, or None
+    @param disks: indices of the disks to replace; may be empty
+
+    """
+    Tasklet.__init__(self, lu)
+
+    # Parameters
+    self.instance_name = instance_name
+    self.mode = mode
+    self.iallocator_name = iallocator_name
+    self.remote_node = remote_node
+    self.disks = disks
+
+    # Runtime data (assigned later by CheckPrereq)
+    self.instance = None
+    self.new_node = None
+    self.target_node = None
+    self.other_node = None
+    self.remote_node_info = None
+    self.node_secondary_ip = None
+
+  @staticmethod
+  def CheckArguments(mode, remote_node, iallocator):
+    """Helper function for users of this class.
+
+    """
+    # check for valid parameter combination
+    if mode == constants.REPLACE_DISK_CHG:
+      if remote_node is None and iallocator is None:
+        raise errors.OpPrereqError("When changing the secondary either an"
+                                   " iallocator script must be used or the"
+                                   " new node given")
+
+      if remote_node is not None and iallocator is not None:
+        raise errors.OpPrereqError("Give either the iallocator or the new"
+                                   " secondary, not both")
  
  
-  def _RunAllocator(self):
+    elif remote_node is not None or iallocator is not None:
+      # Not replacing the secondary
+      raise errors.OpPrereqError("The iallocator and new node options can"
+                                 " only be used when changing the"
+                                 " secondary node")
+
+  @staticmethod
+  def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
      """Compute a new secondary node using an IAllocator.
  
      """
      """Compute a new secondary node using an IAllocator.
  
      """
-    ial = IAllocator(self,
+    ial = IAllocator(lu.cfg, lu.rpc,
                       mode=constants.IALLOCATOR_MODE_RELOC,
                       mode=constants.IALLOCATOR_MODE_RELOC,
-                     name=self.op.instance_name,
-                     relocate_from=[self.sec_node])
+                     name=instance_name,
+                     relocate_from=relocate_from)
  
  
-    ial.Run(self.op.iallocator)
+    ial.Run(iallocator_name)
  
      if not ial.success:
  
      if not ial.success:
-      raise errors.OpPrereqError("Can't compute nodes using"
-                                 " iallocator '%s': %s" % (self.op.iallocator,
-                                                           ial.info))
+      raise errors.OpPrereqError("Can't compute nodes using iallocator '%s':"
+                                 " %s" % (iallocator_name, ial.info))
+
      if len(ial.nodes) != ial.required_nodes:
      if len(ial.nodes) != ial.required_nodes:
-      raise errors.OpPrereqError("iallocator '%s' returned invalid number"
-                                 " of nodes (%s), required %s" %
-                                 (len(ial.nodes), ial.required_nodes))
+      # The message has three placeholders, hence the allocator name has to
+      # be passed as well; with only two values the formatting itself fails
+      raise errors.OpPrereqError("iallocator '%s' returned invalid number"
+                                 " of nodes (%s), required %s" %
+                                 (iallocator_name, len(ial.nodes),
+                                  ial.required_nodes))
-    self.op.remote_node = ial.nodes[0]
-    self.LogInfo("Selected new secondary for the instance: %s",
-                 self.op.remote_node)
  
  
-  def BuildHooksEnv(self):
-    """Build hooks env.
+    remote_node_name = ial.nodes[0]
  
  
-    This runs on the master, the primary and all the secondaries.
+    lu.LogInfo("Selected new secondary for instance '%s': %s",
+               instance_name, remote_node_name)
  
  
-    """
-    env = {
-      "MODE": self.op.mode,
-      "NEW_SECONDARY": self.op.remote_node,
-      "OLD_SECONDARY": self.instance.secondary_nodes[0],
-      }
-    env.update(_BuildInstanceHookEnvByObject(self, self.instance))
-    nl = [
-      self.cfg.GetMasterNode(),
-      self.instance.primary_node,
-      ]
-    if self.op.remote_node is not None:
-      nl.append(self.op.remote_node)
-    return env, nl, nl
+    return remote_node_name
+
+  def _FindFaultyDisks(self, node_name):
+    """Return the faulty disks of the instance on the given node.
+
+    Thin wrapper around C{_FindFaultyInstanceDisks} using this tasklet's
+    configuration, RPC runner and instance.
+
+    @param node_name: name of the node to check
+
+    """
+    # NOTE(review): the meaning of the trailing True argument is defined by
+    # _FindFaultyInstanceDisks, which is not visible here - confirm
+    return _FindFaultyInstanceDisks(self.cfg, self.rpc, self.instance,
+                                    node_name, True)
  
    def CheckPrereq(self):
      """Check prerequisites.
  
    def CheckPrereq(self):
      """Check prerequisites.
@@ -5155,169 +6247,301 @@ class LUReplaceDisks(LogicalUnit):
      This checks that the instance is in the cluster.
  
      """
      This checks that the instance is in the cluster.
  
      """
-    instance = self.cfg.GetInstanceInfo(self.op.instance_name)
-    assert instance is not None, \
-      "Cannot retrieve locked instance %s" % self.op.instance_name
-    self.instance = instance
+    self.instance = self.cfg.GetInstanceInfo(self.instance_name)
+    assert self.instance is not None, \
+      "Cannot retrieve locked instance %s" % self.instance_name
  
  
-    if instance.disk_template != constants.DT_DRBD8:
+    if self.instance.disk_template != constants.DT_DRBD8:
        raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
                                   " instances")
  
        raise errors.OpPrereqError("Can only run replace disks for DRBD8-based"
                                   " instances")
  
-    if len(instance.secondary_nodes) != 1:
+    if len(self.instance.secondary_nodes) != 1:
        raise errors.OpPrereqError("The instance has a strange layout,"
                                   " expected one secondary but found %d" %
        raise errors.OpPrereqError("The instance has a strange layout,"
                                   " expected one secondary but found %d" %
-                                 len(instance.secondary_nodes))
+                                 len(self.instance.secondary_nodes))
  
  
-    self.sec_node = instance.secondary_nodes[0]
+    secondary_node = self.instance.secondary_nodes[0]
  
  
-    if self.op.iallocator is not None:
-      self._RunAllocator()
+    if self.iallocator_name is None:
+      remote_node = self.remote_node
+    else:
+      remote_node = self._RunAllocator(self.lu, self.iallocator_name,
+                                       self.instance.name, secondary_node)
  
  
-    remote_node = self.op.remote_node
      if remote_node is not None:
        self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
        assert self.remote_node_info is not None, \
          "Cannot retrieve locked node %s" % remote_node
      else:
        self.remote_node_info = None
      if remote_node is not None:
        self.remote_node_info = self.cfg.GetNodeInfo(remote_node)
        assert self.remote_node_info is not None, \
          "Cannot retrieve locked node %s" % remote_node
      else:
        self.remote_node_info = None
-    if remote_node == instance.primary_node:
+
+    if remote_node == self.instance.primary_node:
        raise errors.OpPrereqError("The specified node is the primary node of"
                                   " the instance.")
        raise errors.OpPrereqError("The specified node is the primary node of"
                                   " the instance.")
-    elif remote_node == self.sec_node:
+
+    if remote_node == secondary_node:
        raise errors.OpPrereqError("The specified node is already the"
                                   " secondary node of the instance.")
  
        raise errors.OpPrereqError("The specified node is already the"
                                   " secondary node of the instance.")
  
-    if self.op.mode == constants.REPLACE_DISK_PRI:
-      n1 = self.tgt_node = instance.primary_node
-      n2 = self.oth_node = self.sec_node
-    elif self.op.mode == constants.REPLACE_DISK_SEC:
-      n1 = self.tgt_node = self.sec_node
-      n2 = self.oth_node = instance.primary_node
-    elif self.op.mode == constants.REPLACE_DISK_CHG:
-      n1 = self.new_node = remote_node
-      n2 = self.oth_node = instance.primary_node
-      self.tgt_node = self.sec_node
-      _CheckNodeNotDrained(self, remote_node)
+    if self.disks and self.mode in (constants.REPLACE_DISK_AUTO,
+                                    constants.REPLACE_DISK_CHG):
+      raise errors.OpPrereqError("Cannot specify disks to be replaced")
+
+    if self.mode == constants.REPLACE_DISK_AUTO:
+      faulty_primary = self._FindFaultyDisks(self.instance.primary_node)
+      faulty_secondary = self._FindFaultyDisks(secondary_node)
+
+      if faulty_primary and faulty_secondary:
+        raise errors.OpPrereqError("Instance %s has faulty disks on more than"
+                                   " one node and can not be repaired"
+                                   " automatically" % self.instance_name)
+
+      if faulty_primary:
+        self.disks = faulty_primary
+        self.target_node = self.instance.primary_node
+        self.other_node = secondary_node
+        check_nodes = [self.target_node, self.other_node]
+      elif faulty_secondary:
+        self.disks = faulty_secondary
+        self.target_node = secondary_node
+        self.other_node = self.instance.primary_node
+        check_nodes = [self.target_node, self.other_node]
+      else:
+        self.disks = []
+        check_nodes = []
+
      else:
      else:
-      raise errors.ProgrammerError("Unhandled disk replace mode")
+      # Non-automatic modes
+      if self.mode == constants.REPLACE_DISK_PRI:
+        self.target_node = self.instance.primary_node
+        self.other_node = secondary_node
+        check_nodes = [self.target_node, self.other_node]
  
  
-    _CheckNodeOnline(self, n1)
-    _CheckNodeOnline(self, n2)
+      elif self.mode == constants.REPLACE_DISK_SEC:
+        self.target_node = secondary_node
+        self.other_node = self.instance.primary_node
+        check_nodes = [self.target_node, self.other_node]
  
  
-    if not self.op.disks:
-      self.op.disks = range(len(instance.disks))
+      elif self.mode == constants.REPLACE_DISK_CHG:
+        self.new_node = remote_node
+        self.other_node = self.instance.primary_node
+        self.target_node = secondary_node
+        check_nodes = [self.new_node, self.other_node]
  
  
-    for disk_idx in self.op.disks:
-      instance.FindDisk(disk_idx)
+        _CheckNodeNotDrained(self.lu, remote_node)
  
  
-  def _ExecD8DiskOnly(self, feedback_fn):
-    """Replace a disk on the primary or secondary for dbrd8.
+      else:
+        raise errors.ProgrammerError("Unhandled disk replace mode (%s)" %
+                                     self.mode)
  
  
-    The algorithm for replace is quite complicated:
+      # If not specified all disks should be replaced
+      if not self.disks:
+        self.disks = range(len(self.instance.disks))
  
  
-      1. for each disk to be replaced:
+    for node in check_nodes:
+      _CheckNodeOnline(self.lu, node)
  
  
-        1. create new LVs on the target node with unique names
-        1. detach old LVs from the drbd device
-        1. rename old LVs to name_replaced.<time_t>
-        1. rename new LVs to old LVs
-        1. attach the new LVs (with the old names now) to the drbd device
+    # Check whether disks are valid
+    for disk_idx in self.disks:
+      self.instance.FindDisk(disk_idx)
  
  
-      1. wait for sync across all devices
+    # Get secondary node IP addresses
+    node_2nd_ip = {}
  
  
-      1. for each modified disk:
+    for node_name in [self.target_node, self.other_node, self.new_node]:
+      if node_name is not None:
+        node_2nd_ip[node_name] = self.cfg.GetNodeInfo(node_name).secondary_ip
  
  
-        1. remove old LVs (which have the name name_replaces.<time_t>)
+    self.node_secondary_ip = node_2nd_ip
  
  
-    Failures are not very well handled.
+  def Exec(self, feedback_fn):
+    """Execute disk replacement.
+
+    This dispatches the disk replacement to the appropriate handler.
  
      """
  
      """
-    steps_total = 6
-    warning, info = (self.proc.LogWarning, self.proc.LogInfo)
-    instance = self.instance
-    iv_names = {}
+    if not self.disks:
+      feedback_fn("No disks need replacement")
+      return
+
+    feedback_fn("Replacing disk(s) %s for %s" %
+                (", ".join([str(i) for i in self.disks]), self.instance.name))
+
+    activate_disks = (not self.instance.admin_up)
+
+    # Activate the instance disks if we're replacing them on a down instance
+    if activate_disks:
+      _StartInstanceDisks(self.lu, self.instance, True)
+
+    try:
+      # Should we replace the secondary node?
+      if self.new_node is not None:
+        return self._ExecDrbd8Secondary()
+      else:
+        return self._ExecDrbd8DiskOnly()
+
+    finally:
+      # Deactivate the instance disks if we're replacing them on a down instance
+      if activate_disks:
+        _SafeShutdownInstanceDisks(self.lu, self.instance)
+
+  def _CheckVolumeGroup(self, nodes):
+    self.lu.LogInfo("Checking volume groups")
+
      vgname = self.cfg.GetVGName()
      vgname = self.cfg.GetVGName()
-    # start of work
-    cfg = self.cfg
-    tgt_node = self.tgt_node
-    oth_node = self.oth_node
  
  
-    # Step: check device activation
-    self.proc.LogStep(1, steps_total, "check device existence")
-    info("checking volume groups")
-    my_vg = cfg.GetVGName()
-    results = self.rpc.call_vg_list([oth_node, tgt_node])
+    # Make sure volume group exists on all involved nodes
+    results = self.rpc.call_vg_list(nodes)
      if not results:
        raise errors.OpExecError("Can't list volume groups on the nodes")
      if not results:
        raise errors.OpExecError("Can't list volume groups on the nodes")
-    for node in oth_node, tgt_node:
+
+    for node in nodes:
        res = results[node]
        res = results[node]
-      msg = res.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Error checking node %s: %s" % (node, msg))
-      if my_vg not in res.payload:
-        raise errors.OpExecError("Volume group '%s' not found on %s" %
-                                 (my_vg, node))
-    for idx, dev in enumerate(instance.disks):
-      if idx not in self.op.disks:
+      res.Raise("Error checking node %s" % node)
+      if vgname not in res.payload:
+        raise errors.OpExecError("Volume group '%s' not found on node %s" %
+                                 (vgname, node))
+
+  def _CheckDisksExistence(self, nodes):
+    # Check disk existence
+    for idx, dev in enumerate(self.instance.disks):
+      if idx not in self.disks:
          continue
          continue
-      for node in tgt_node, oth_node:
-        info("checking disk/%d on %s" % (idx, node))
-        cfg.SetDiskID(dev, node)
+
+      for node in nodes:
+        self.lu.LogInfo("Checking disk/%d on %s" % (idx, node))
+        self.cfg.SetDiskID(dev, node)
+
          result = self.rpc.call_blockdev_find(node, dev)
          result = self.rpc.call_blockdev_find(node, dev)
-        msg = result.RemoteFailMsg()
-        if not msg and not result.payload:
-          msg = "disk not found"
-        if msg:
+
+        msg = result.fail_msg
+        if msg or not result.payload:
+          if not msg:
+            msg = "disk not found"
            raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
                                     (idx, node, msg))
  
            raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
                                     (idx, node, msg))
  
-    # Step: check other node consistency
-    self.proc.LogStep(2, steps_total, "check peer consistency")
-    for idx, dev in enumerate(instance.disks):
-      if idx not in self.op.disks:
+  def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
+    for idx, dev in enumerate(self.instance.disks):
+      if idx not in self.disks:
          continue
          continue
-      info("checking disk/%d consistency on %s" % (idx, oth_node))
-      if not _CheckDiskConsistency(self, dev, oth_node,
-                                   oth_node==instance.primary_node):
-        raise errors.OpExecError("Peer node (%s) has degraded storage, unsafe"
-                                 " to replace disks on this node (%s)" %
-                                 (oth_node, tgt_node))
  
  
-    # Step: create new storage
-    self.proc.LogStep(3, steps_total, "allocate new storage")
-    for idx, dev in enumerate(instance.disks):
-      if idx not in self.op.disks:
+      self.lu.LogInfo("Checking disk/%d consistency on node %s" %
+                      (idx, node_name))
+
+      if not _CheckDiskConsistency(self.lu, dev, node_name, on_primary,
+                                   ldisk=ldisk):
+        raise errors.OpExecError("Node %s has degraded storage, unsafe to"
+                                 " replace disks for instance %s" %
+                                 (node_name, self.instance.name))
+
+  def _CreateNewStorage(self, node_name):
+    vgname = self.cfg.GetVGName()
+    iv_names = {}
+
+    for idx, dev in enumerate(self.instance.disks):
+      if idx not in self.disks:
          continue
          continue
-      size = dev.size
-      cfg.SetDiskID(dev, tgt_node)
-      lv_names = [".disk%d_%s" % (idx, suf)
-                  for suf in ["data", "meta"]]
-      names = _GenerateUniqueNames(self, lv_names)
-      lv_data = objects.Disk(dev_type=constants.LD_LV, size=size,
+
+      self.lu.LogInfo("Adding storage on %s for disk/%d" % (node_name, idx))
+
+      self.cfg.SetDiskID(dev, node_name)
+
+      lv_names = [".disk%d_%s" % (idx, suffix) for suffix in ["data", "meta"]]
+      names = _GenerateUniqueNames(self.lu, lv_names)
+
+      lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
                               logical_id=(vgname, names[0]))
        lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
                               logical_id=(vgname, names[1]))
                               logical_id=(vgname, names[0]))
        lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
                               logical_id=(vgname, names[1]))
+
        new_lvs = [lv_data, lv_meta]
        old_lvs = dev.children
        iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
        new_lvs = [lv_data, lv_meta]
        old_lvs = dev.children
        iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
-      info("creating new local storage on %s for %s" %
-           (tgt_node, dev.iv_name))
+
        # we pass force_create=True to force the LVM creation
        for new_lv in new_lvs:
        # we pass force_create=True to force the LVM creation
        for new_lv in new_lvs:
-        _CreateBlockDev(self, tgt_node, instance, new_lv, True,
-                        _GetInstanceInfoText(instance), False)
+        _CreateBlockDev(self.lu, node_name, self.instance, new_lv, True,
+                        _GetInstanceInfoText(self.instance), False)
+
+    return iv_names
+
+  def _CheckDevices(self, node_name, iv_names):
+    """Check that the given devices are found and not degraded.
+
+    @param node_name: name of the node on which the devices are looked up
+    @param iv_names: dict mapping a device name to a
+        (device, old LVs, new LVs) tuple
+    @raise errors.OpExecError: if a device can't be found or is degraded
+
+    """
+    for name, (dev, _, _) in iv_names.iteritems():
+      self.cfg.SetDiskID(dev, node_name)
+      found = self.rpc.call_blockdev_find(node_name, dev)
+
+      failure = found.fail_msg
+      if not failure and not found.payload:
+        # The call itself succeeded but returned nothing
+        failure = "disk not found"
+
+      if failure:
+        raise errors.OpExecError("Can't find DRBD device %s: %s" %
+                                 (name, failure))
+
+      if found.payload.is_degraded:
+        raise errors.OpExecError("DRBD device %s is degraded!" % name)
+
+  def _RemoveOldStorage(self, node_name, iv_names):
+    """Remove the old logical volumes of the given devices.
+
+    A failed removal is only reported as a warning.
+
+    @param node_name: name of the node holding the old logical volumes
+    @param iv_names: dict mapping a device name to a
+        (device, old LVs, new LVs) tuple
+
+    """
+    for name, (_, stale_lvs, _) in iv_names.iteritems():
+      self.lu.LogInfo("Remove logical volumes for %s" % name)
+
+      for stale_lv in stale_lvs:
+        self.cfg.SetDiskID(stale_lv, node_name)
+
+        removal = self.rpc.call_blockdev_remove(node_name, stale_lv)
+        failure = removal.fail_msg
+        if not failure:
+          continue
+
+        self.lu.LogWarning("Can't remove old LV: %s" % failure,
+                           hint="remove unused LVs manually")
+
+  def _ExecDrbd8DiskOnly(self):
+    """Replace a disk on the primary or secondary for DRBD 8.
+
+    The algorithm for replace is quite complicated:
+
+      1. for each disk to be replaced:
+
+        1. create new LVs on the target node with unique names
+        1. detach old LVs from the drbd device
+        1. rename old LVs to name_replaced.<time_t>
+        1. rename new LVs to old LVs
+        1. attach the new LVs (with the old names now) to the drbd device
+
+      1. wait for sync across all devices
+
+      1. for each modified disk:
+
+        1. remove old LVs (which have the name name_replaces.<time_t>)
+
+    Failures are not very well handled.
+
+    """
+    steps_total = 6
+
+    # Step: check device activation
+    self.lu.LogStep(1, steps_total, "Check device existence")
+    self._CheckDisksExistence([self.other_node, self.target_node])
+    self._CheckVolumeGroup([self.target_node, self.other_node])
+
+    # Step: check other node consistency
+    self.lu.LogStep(2, steps_total, "Check peer consistency")
+    self._CheckDisksConsistency(self.other_node,
+                                self.other_node == self.instance.primary_node,
+                                False)
+
+    # Step: create new storage
+    self.lu.LogStep(3, steps_total, "Allocate new storage")
+    iv_names = self._CreateNewStorage(self.target_node)
  
      # Step: for each lv, detach+rename*2+attach
  
      # Step: for each lv, detach+rename*2+attach
-    self.proc.LogStep(4, steps_total, "change drbd configuration")
+    self.lu.LogStep(4, steps_total, "Changing drbd configuration")
      for dev, old_lvs, new_lvs in iv_names.itervalues():
      for dev, old_lvs, new_lvs in iv_names.itervalues():
-      info("detaching %s drbd from local storage" % dev.iv_name)
-      result = self.rpc.call_blockdev_removechildren(tgt_node, dev, old_lvs)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Can't detach drbd from local storage on node"
-                                 " %s for device %s: %s" %
-                                 (tgt_node, dev.iv_name, msg))
+      self.lu.LogInfo("Detaching %s drbd from local storage" % dev.iv_name)
+
+      result = self.rpc.call_blockdev_removechildren(self.target_node, dev,
+                                                     old_lvs)
+      result.Raise("Can't detach drbd from local storage on node"
+                   " %s for device %s" % (self.target_node, dev.iv_name))
        #dev.children = []
        #cfg.Update(instance)
  
        #dev.children = []
        #cfg.Update(instance)
  
@@ -5331,87 +6555,70 @@ class LUReplaceDisks(LogicalUnit):
        temp_suffix = int(time.time())
        ren_fn = lambda d, suff: (d.physical_id[0],
                                  d.physical_id[1] + "_replaced-%s" % suff)
        temp_suffix = int(time.time())
        ren_fn = lambda d, suff: (d.physical_id[0],
                                  d.physical_id[1] + "_replaced-%s" % suff)
-      # build the rename list based on what LVs exist on the node
-      rlist = []
+
+      # Build the rename list based on what LVs exist on the node
+      rename_old_to_new = []
        for to_ren in old_lvs:
        for to_ren in old_lvs:
-        result = self.rpc.call_blockdev_find(tgt_node, to_ren)
-        if not result.RemoteFailMsg() and result.payload:
+        result = self.rpc.call_blockdev_find(self.target_node, to_ren)
+        if not result.fail_msg and result.payload:
            # device exists
            # device exists
-          rlist.append((to_ren, ren_fn(to_ren, temp_suffix)))
+          rename_old_to_new.append((to_ren, ren_fn(to_ren, temp_suffix)))
  
  
-      info("renaming the old LVs on the target node")
-      result = self.rpc.call_blockdev_rename(tgt_node, rlist)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Can't rename old LVs on node %s: %s" %
-                                 (tgt_node, msg))
-      # now we rename the new LVs to the old LVs
-      info("renaming the new LVs on the target node")
-      rlist = [(new, old.physical_id) for old, new in zip(old_lvs, new_lvs)]
-      result = self.rpc.call_blockdev_rename(tgt_node, rlist)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Can't rename new LVs on node %s: %s" %
-                                 (tgt_node, msg))
+      self.lu.LogInfo("Renaming the old LVs on the target node")
+      result = self.rpc.call_blockdev_rename(self.target_node,
+                                             rename_old_to_new)
+      result.Raise("Can't rename old LVs on node %s" % self.target_node)
+
+      # Now we rename the new LVs to the old LVs
+      self.lu.LogInfo("Renaming the new LVs on the target node")
+      rename_new_to_old = [(new, old.physical_id)
+                           for old, new in zip(old_lvs, new_lvs)]
+      result = self.rpc.call_blockdev_rename(self.target_node,
+                                             rename_new_to_old)
+      result.Raise("Can't rename new LVs on node %s" % self.target_node)
  
        for old, new in zip(old_lvs, new_lvs):
          new.logical_id = old.logical_id
  
        for old, new in zip(old_lvs, new_lvs):
          new.logical_id = old.logical_id
-        cfg.SetDiskID(new, tgt_node)
+        self.cfg.SetDiskID(new, self.target_node)
  
        for disk in old_lvs:
          disk.logical_id = ren_fn(disk, temp_suffix)
  
        for disk in old_lvs:
          disk.logical_id = ren_fn(disk, temp_suffix)
-        cfg.SetDiskID(disk, tgt_node)
+        self.cfg.SetDiskID(disk, self.target_node)
  
  
-      # now that the new lvs have the old name, we can add them to the device
-      info("adding new mirror component on %s" % tgt_node)
-      result = self.rpc.call_blockdev_addchildren(tgt_node, dev, new_lvs)
-      msg = result.RemoteFailMsg()
+      # Now that the new lvs have the old name, we can add them to the device
+      self.lu.LogInfo("Adding new mirror component on %s" % self.target_node)
+      result = self.rpc.call_blockdev_addchildren(self.target_node, dev,
+                                                  new_lvs)
+      msg = result.fail_msg
        if msg:
          for new_lv in new_lvs:
        if msg:
          for new_lv in new_lvs:
-          msg = self.rpc.call_blockdev_remove(tgt_node, new_lv).RemoteFailMsg()
-          if msg:
-            warning("Can't rollback device %s: %s", dev, msg,
-                    hint="cleanup manually the unused logical volumes")
+          msg2 = self.rpc.call_blockdev_remove(self.target_node,
+                                               new_lv).fail_msg
+          if msg2:
+            self.lu.LogWarning("Can't rollback device %s: %s", dev, msg2,
+                               hint=("cleanup manually the unused logical"
+                                     " volumes"))
          raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
  
        dev.children = new_lvs
          raise errors.OpExecError("Can't add local storage to drbd: %s" % msg)
  
        dev.children = new_lvs
-      cfg.Update(instance)
  
  
-    # Step: wait for sync
+      self.cfg.Update(self.instance)
  
  
-    # this can fail as the old devices are degraded and _WaitForSync
-    # does a combined result over all disks, so we don't check its
-    # return value
-    self.proc.LogStep(5, steps_total, "sync devices")
-    _WaitForSync(self, instance, unlock=True)
+    # Wait for sync
+    # This can fail as the old devices are degraded and _WaitForSync
+    # does a combined result over all disks, so we don't check its return value
+    self.lu.LogStep(5, steps_total, "Sync devices")
+    _WaitForSync(self.lu, self.instance, unlock=True)
  
  
-    # so check manually all the devices
-    for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
-      cfg.SetDiskID(dev, instance.primary_node)
-      result = self.rpc.call_blockdev_find(instance.primary_node, dev)
-      msg = result.RemoteFailMsg()
-      if not msg and not result.payload:
-        msg = "disk not found"
-      if msg:
-        raise errors.OpExecError("Can't find DRBD device %s: %s" %
-                                 (name, msg))
-      if result.payload[5]:
-        raise errors.OpExecError("DRBD device %s is degraded!" % name)
+    # Check all devices manually
+    self._CheckDevices(self.instance.primary_node, iv_names)
  
      # Step: remove old storage
  
      # Step: remove old storage
-    self.proc.LogStep(6, steps_total, "removing old storage")
-    for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
-      info("remove logical volumes for %s" % name)
-      for lv in old_lvs:
-        cfg.SetDiskID(lv, tgt_node)
-        msg = self.rpc.call_blockdev_remove(tgt_node, lv).RemoteFailMsg()
-        if msg:
-          warning("Can't remove old LV: %s" % msg,
-                  hint="manually remove unused LVs")
-          continue
+    self.lu.LogStep(6, steps_total, "Removing old storage")
+    self._RemoveOldStorage(self.target_node, iv_names)
  
  
-  def _ExecD8Secondary(self, feedback_fn):
-    """Replace the secondary node for drbd8.
+  def _ExecDrbd8Secondary(self):
+    """Replace the secondary node for DRBD 8.
  
      The algorithm for replace is quite complicated:
        - for all disks of the instance:
  
      The algorithm for replace is quite complicated:
        - for all disks of the instance:
@@ -5430,197 +6637,186 @@ class LUReplaceDisks(LogicalUnit):
  
      """
      steps_total = 6
  
      """
      steps_total = 6
-    warning, info = (self.proc.LogWarning, self.proc.LogInfo)
-    instance = self.instance
-    iv_names = {}
-    # start of work
-    cfg = self.cfg
-    old_node = self.tgt_node
-    new_node = self.new_node
-    pri_node = instance.primary_node
-    nodes_ip = {
-      old_node: self.cfg.GetNodeInfo(old_node).secondary_ip,
-      new_node: self.cfg.GetNodeInfo(new_node).secondary_ip,
-      pri_node: self.cfg.GetNodeInfo(pri_node).secondary_ip,
-      }
  
      # Step: check device activation
  
      # Step: check device activation
-    self.proc.LogStep(1, steps_total, "check device existence")
-    info("checking volume groups")
-    my_vg = cfg.GetVGName()
-    results = self.rpc.call_vg_list([pri_node, new_node])
-    for node in pri_node, new_node:
-      res = results[node]
-      msg = res.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Error checking node %s: %s" % (node, msg))
-      if my_vg not in res.payload:
-        raise errors.OpExecError("Volume group '%s' not found on %s" %
-                                 (my_vg, node))
-    for idx, dev in enumerate(instance.disks):
-      if idx not in self.op.disks:
-        continue
-      info("checking disk/%d on %s" % (idx, pri_node))
-      cfg.SetDiskID(dev, pri_node)
-      result = self.rpc.call_blockdev_find(pri_node, dev)
-      msg = result.RemoteFailMsg()
-      if not msg and not result.payload:
-        msg = "disk not found"
-      if msg:
-        raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
-                                 (idx, pri_node, msg))
+    self.lu.LogStep(1, steps_total, "Check device existence")
+    self._CheckDisksExistence([self.instance.primary_node])
+    self._CheckVolumeGroup([self.instance.primary_node])
  
      # Step: check other node consistency
  
      # Step: check other node consistency
-    self.proc.LogStep(2, steps_total, "check peer consistency")
-    for idx, dev in enumerate(instance.disks):
-      if idx not in self.op.disks:
-        continue
-      info("checking disk/%d consistency on %s" % (idx, pri_node))
-      if not _CheckDiskConsistency(self, dev, pri_node, True, ldisk=True):
-        raise errors.OpExecError("Primary node (%s) has degraded storage,"
-                                 " unsafe to replace the secondary" %
-                                 pri_node)
+    self.lu.LogStep(2, steps_total, "Check peer consistency")
+    self._CheckDisksConsistency(self.instance.primary_node, True, True)
  
      # Step: create new storage
  
      # Step: create new storage
-    self.proc.LogStep(3, steps_total, "allocate new storage")
-    for idx, dev in enumerate(instance.disks):
-      info("adding new local storage on %s for disk/%d" %
-           (new_node, idx))
+    self.lu.LogStep(3, steps_total, "Allocate new storage")
+    for idx, dev in enumerate(self.instance.disks):
+      self.lu.LogInfo("Adding new local storage on %s for disk/%d" %
+                      (self.new_node, idx))
        # we pass force_create=True to force LVM creation
        for new_lv in dev.children:
        # we pass force_create=True to force LVM creation
        for new_lv in dev.children:
-        _CreateBlockDev(self, new_node, instance, new_lv, True,
-                        _GetInstanceInfoText(instance), False)
+        _CreateBlockDev(self.lu, self.new_node, self.instance, new_lv, True,
+                        _GetInstanceInfoText(self.instance), False)
  
      # Step 4: dbrd minors and drbd setups changes
      # after this, we must manually remove the drbd minors on both the
      # error and the success paths
  
      # Step 4: dbrd minors and drbd setups changes
      # after this, we must manually remove the drbd minors on both the
      # error and the success paths
-    minors = cfg.AllocateDRBDMinor([new_node for dev in instance.disks],
-                                   instance.name)
-    logging.debug("Allocated minors %s" % (minors,))
-    self.proc.LogStep(4, steps_total, "changing drbd configuration")
-    for idx, (dev, new_minor) in enumerate(zip(instance.disks, minors)):
-      size = dev.size
-      info("activating a new drbd on %s for disk/%d" % (new_node, idx))
+    self.lu.LogStep(4, steps_total, "Changing drbd configuration")
+    minors = self.cfg.AllocateDRBDMinor([self.new_node
+                                         for dev in self.instance.disks],
+                                        self.instance.name)
+    logging.debug("Allocated minors %r" % (minors,))
+
+    iv_names = {}
+    for idx, (dev, new_minor) in enumerate(zip(self.instance.disks, minors)):
+      self.lu.LogInfo("activating a new drbd on %s for disk/%d" %
+                      (self.new_node, idx))
        # create new devices on new_node; note that we create two IDs:
        # one without port, so the drbd will be activated without
        # networking information on the new node at this stage, and one
        # with network, for the latter activation in step 4
        (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
        # create new devices on new_node; note that we create two IDs:
        # one without port, so the drbd will be activated without
        # networking information on the new node at this stage, and one
        # with network, for the latter activation in step 4
        (o_node1, o_node2, o_port, o_minor1, o_minor2, o_secret) = dev.logical_id
-      if pri_node == o_node1:
+      if self.instance.primary_node == o_node1:
          p_minor = o_minor1
        else:
          p_minor = o_minor2
  
          p_minor = o_minor1
        else:
          p_minor = o_minor2
  
-      new_alone_id = (pri_node, new_node, None, p_minor, new_minor, o_secret)
-      new_net_id = (pri_node, new_node, o_port, p_minor, new_minor, o_secret)
+      new_alone_id = (self.instance.primary_node, self.new_node, None,
+                      p_minor, new_minor, o_secret)
+      new_net_id = (self.instance.primary_node, self.new_node, o_port,
+                    p_minor, new_minor, o_secret)
  
        iv_names[idx] = (dev, dev.children, new_net_id)
        logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
                      new_net_id)
        new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
                                logical_id=new_alone_id,
  
        iv_names[idx] = (dev, dev.children, new_net_id)
        logging.debug("Allocated new_minor: %s, new_logical_id: %s", new_minor,
                      new_net_id)
        new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
                                logical_id=new_alone_id,
-                              children=dev.children)
+                              children=dev.children,
+                              size=dev.size)
        try:
        try:
-        _CreateSingleBlockDev(self, new_node, instance, new_drbd,
-                              _GetInstanceInfoText(instance), False)
+        _CreateSingleBlockDev(self.lu, self.new_node, self.instance, new_drbd,
+                              _GetInstanceInfoText(self.instance), False)
        except errors.GenericError:
        except errors.GenericError:
-        self.cfg.ReleaseDRBDMinors(instance.name)
+        self.cfg.ReleaseDRBDMinors(self.instance.name)
          raise
  
          raise
  
-    for idx, dev in enumerate(instance.disks):
-      # we have new devices, shutdown the drbd on the old secondary
-      info("shutting down drbd for disk/%d on old node" % idx)
-      cfg.SetDiskID(dev, old_node)
-      msg = self.rpc.call_blockdev_shutdown(old_node, dev).RemoteFailMsg()
+    # We have new devices, shutdown the drbd on the old secondary
+    for idx, dev in enumerate(self.instance.disks):
+      self.lu.LogInfo("Shutting down drbd for disk/%d on old node" % idx)
+      self.cfg.SetDiskID(dev, self.target_node)
+      msg = self.rpc.call_blockdev_shutdown(self.target_node, dev).fail_msg
        if msg:
        if msg:
-        warning("Failed to shutdown drbd for disk/%d on old node: %s" %
-                (idx, msg),
-                hint="Please cleanup this device manually as soon as possible")
-
-    info("detaching primary drbds from the network (=> standalone)")
-    result = self.rpc.call_drbd_disconnect_net([pri_node], nodes_ip,
-                                               instance.disks)[pri_node]
-
-    msg = result.RemoteFailMsg()
+        self.lu.LogWarning("Failed to shutdown drbd for disk/%d on old"
+                           " node: %s" % (idx, msg),
+                           hint=("Please cleanup this device manually as"
+                                 " soon as possible"))
+
+    self.lu.LogInfo("Detaching primary drbds from the network (=> standalone)")
+    result = self.rpc.call_drbd_disconnect_net([self.instance.primary_node],
+                                               self.node_secondary_ip,
+                                               self.instance.disks)\
+                                              [self.instance.primary_node]
+
+    msg = result.fail_msg
      if msg:
        # detaches didn't succeed (unlikely)
      if msg:
        # detaches didn't succeed (unlikely)
-      self.cfg.ReleaseDRBDMinors(instance.name)
+      self.cfg.ReleaseDRBDMinors(self.instance.name)
        raise errors.OpExecError("Can't detach the disks from the network on"
                                 " old node: %s" % (msg,))
  
      # if we managed to detach at least one, we update all the disks of
      # the instance to point to the new secondary
        raise errors.OpExecError("Can't detach the disks from the network on"
                                 " old node: %s" % (msg,))
  
      # if we managed to detach at least one, we update all the disks of
      # the instance to point to the new secondary
-    info("updating instance configuration")
+    self.lu.LogInfo("Updating instance configuration")
      for dev, _, new_logical_id in iv_names.itervalues():
        dev.logical_id = new_logical_id
      for dev, _, new_logical_id in iv_names.itervalues():
        dev.logical_id = new_logical_id
-      cfg.SetDiskID(dev, pri_node)
-    cfg.Update(instance)
+      self.cfg.SetDiskID(dev, self.instance.primary_node)
+
+    self.cfg.Update(self.instance)
  
      # and now perform the drbd attach
  
      # and now perform the drbd attach
-    info("attaching primary drbds to new secondary (standalone => connected)")
-    result = self.rpc.call_drbd_attach_net([pri_node, new_node], nodes_ip,
-                                           instance.disks, instance.name,
+    self.lu.LogInfo("Attaching primary drbds to new secondary"
+                    " (standalone => connected)")
+    result = self.rpc.call_drbd_attach_net([self.instance.primary_node,
+                                            self.new_node],
+                                           self.node_secondary_ip,
+                                           self.instance.disks,
+                                           self.instance.name,
                                             False)
      for to_node, to_result in result.items():
                                             False)
      for to_node, to_result in result.items():
-      msg = to_result.RemoteFailMsg()
+      msg = to_result.fail_msg
        if msg:
        if msg:
-        warning("can't attach drbd disks on node %s: %s", to_node, msg,
-                hint="please do a gnt-instance info to see the"
-                " status of disks")
-
-    # this can fail as the old devices are degraded and _WaitForSync
-    # does a combined result over all disks, so we don't check its
-    # return value
-    self.proc.LogStep(5, steps_total, "sync devices")
-    _WaitForSync(self, instance, unlock=True)
-
-    # so check manually all the devices
-    for idx, (dev, old_lvs, _) in iv_names.iteritems():
-      cfg.SetDiskID(dev, pri_node)
-      result = self.rpc.call_blockdev_find(pri_node, dev)
-      msg = result.RemoteFailMsg()
-      if not msg and not result.payload:
-        msg = "disk not found"
-      if msg:
-        raise errors.OpExecError("Can't find DRBD device disk/%d: %s" %
-                                 (idx, msg))
-      if result.payload[5]:
-        raise errors.OpExecError("DRBD device disk/%d is degraded!" % idx)
-
-    self.proc.LogStep(6, steps_total, "removing old storage")
-    for idx, (dev, old_lvs, _) in iv_names.iteritems():
-      info("remove logical volumes for disk/%d" % idx)
-      for lv in old_lvs:
-        cfg.SetDiskID(lv, old_node)
-        msg = self.rpc.call_blockdev_remove(old_node, lv).RemoteFailMsg()
-        if msg:
-          warning("Can't remove LV on old secondary: %s", msg,
-                  hint="Cleanup stale volumes by hand")
+        self.lu.LogWarning("Can't attach drbd disks on node %s: %s",
+                           to_node, msg,
+                           hint=("please do a gnt-instance info to see the"
+                                 " status of disks"))
  
  
-  def Exec(self, feedback_fn):
-    """Execute disk replacement.
+    # Wait for sync
+    # This can fail as the old devices are degraded and _WaitForSync
+    # does a combined result over all disks, so we don't check its return value
+    self.lu.LogStep(5, steps_total, "Sync devices")
+    _WaitForSync(self.lu, self.instance, unlock=True)
  
  
-    This dispatches the disk replacement to the appropriate handler.
+    # Check all devices manually
+    self._CheckDevices(self.instance.primary_node, iv_names)
  
  
-    """
-    instance = self.instance
+    # Step: remove old storage
+    self.lu.LogStep(6, steps_total, "Removing old storage")
+    self._RemoveOldStorage(self.target_node, iv_names)
  
  
-    # Activate the instance disks if we're replacing them on a down instance
-    if not instance.admin_up:
-      _StartInstanceDisks(self, instance, True)
  
  
-    if self.op.mode == constants.REPLACE_DISK_CHG:
-      fn = self._ExecD8Secondary
-    else:
-      fn = self._ExecD8DiskOnly
+class LURepairNodeStorage(NoHooksLU):
+  """Repairs the volume group on a node.
+
+  """
+  _OP_REQP = ["node_name"]
+  REQ_BGL = False
+
+  def CheckArguments(self):
+    node_name = self.cfg.ExpandNodeName(self.op.node_name)
+    if node_name is None:
+      raise errors.OpPrereqError("Invalid node name '%s'" % self.op.node_name)
+
+    self.op.node_name = node_name
+
+  def ExpandNames(self):
+    self.needed_locks = {
+      locking.LEVEL_NODE: [self.op.node_name],
+      }
+
+  def _CheckFaultyDisks(self, instance, node_name):
+    if _FindFaultyInstanceDisks(self.cfg, self.rpc, instance,
+                                node_name, True):
+      raise errors.OpPrereqError("Instance '%s' has faulty disks on"
+                                 " node '%s'" % (instance.name, node_name))
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    """
+    storage_type = self.op.storage_type
  
  
-    ret = fn(feedback_fn)
+    if (constants.SO_FIX_CONSISTENCY not in
+        constants.VALID_STORAGE_OPERATIONS.get(storage_type, [])):
+      raise errors.OpPrereqError("Storage units of type '%s' can not be"
+                                 " repaired" % storage_type)
  
  
-    # Deactivate the instance disks if we're replacing them on a down instance
-    if not instance.admin_up:
-      _SafeShutdownInstanceDisks(self, instance)
+    # Check whether any instance on this node has faulty disks
+    for inst in _GetNodeInstances(self.cfg, self.op.node_name):
+      check_nodes = set(inst.all_nodes)
+      check_nodes.discard(self.op.node_name)
+      for inst_node_name in check_nodes:
+        self._CheckFaultyDisks(inst, inst_node_name)
  
  
-    return ret
+  def Exec(self, feedback_fn):
+    feedback_fn("Repairing storage unit '%s' on %s ..." %
+                (self.op.name, self.op.node_name))
+
+    st_args = _GetStorageTypeArgs(self.cfg, self.op.storage_type)
+    result = self.rpc.call_storage_execute(self.op.node_name,
+                                           self.op.storage_type, st_args,
+                                           self.op.name,
+                                           constants.SO_FIX_CONSISTENCY)
+    result.Raise("Failed to repair storage unit '%s' on %s" %
+                 (self.op.name, self.op.node_name))
  
  
  class LUGrowDisk(LogicalUnit):
  
  
  class LUGrowDisk(LogicalUnit):
@@ -5684,10 +6880,7 @@ class LUGrowDisk(LogicalUnit):
                                         instance.hypervisor)
      for node in nodenames:
        info = nodeinfo[node]
                                         instance.hypervisor)
      for node in nodenames:
        info = nodeinfo[node]
-      msg = info.RemoteFailMsg()
-      if msg:
-        raise errors.OpPrereqError("Cannot get current information"
-                                   " from node %s:" % (node, msg))
+      info.Raise("Cannot get current information from node %s" % node)
        vg_free = info.payload.get('vg_free', None)
        if not isinstance(vg_free, int):
          raise errors.OpPrereqError("Can't compute free disk space on"
        vg_free = info.payload.get('vg_free', None)
        if not isinstance(vg_free, int):
          raise errors.OpPrereqError("Can't compute free disk space on"
@@ -5706,10 +6899,7 @@ class LUGrowDisk(LogicalUnit):
      for node in instance.all_nodes:
        self.cfg.SetDiskID(disk, node)
        result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
      for node in instance.all_nodes:
        self.cfg.SetDiskID(disk, node)
        result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Grow request failed to node %s: %s" %
-                                 (node, msg))
+      result.Raise("Grow request failed to node %s" % node)
      disk.RecordGrow(self.op.amount)
      self.cfg.Update(instance)
      if self.op.wait_for_sync:
      disk.RecordGrow(self.op.amount)
      self.cfg.Update(instance)
      if self.op.wait_for_sync:
@@ -5728,7 +6918,7 @@ class LUQueryInstanceData(NoHooksLU):
  
    def ExpandNames(self):
      self.needed_locks = {}
  
    def ExpandNames(self):
      self.needed_locks = {}
-    self.share_locks = dict(((i, 1) for i in locking.LEVELS))
+    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
  
      if not isinstance(self.op.instances, list):
        raise errors.OpPrereqError("Invalid argument type 'instances'")
  
      if not isinstance(self.op.instances, list):
        raise errors.OpPrereqError("Invalid argument type 'instances'")
@@ -5765,25 +6955,33 @@ class LUQueryInstanceData(NoHooksLU):
                               in self.wanted_names]
      return
  
                               in self.wanted_names]
      return
  
+  def _ComputeBlockdevStatus(self, node, instance_name, dev):
+    """Returns the status of a block device.
+
+    """
+    if self.op.static or not node:
+      return None
+
+    self.cfg.SetDiskID(dev, node)
+
+    result = self.rpc.call_blockdev_find(node, dev)
+    if result.offline:
+      return None
+
+    result.Raise("Can't compute disk status for %s" % instance_name)
+
+    status = result.payload
+    if status is None:
+      return None
+
+    return (status.dev_path, status.major, status.minor,
+            status.sync_percent, status.estimated_time,
+            status.is_degraded, status.ldisk_status)
+
    def _ComputeDiskStatus(self, instance, snode, dev):
      """Compute block device status.
  
      """
    def _ComputeDiskStatus(self, instance, snode, dev):
      """Compute block device status.
  
      """
-    static = self.op.static
-    if not static:
-      self.cfg.SetDiskID(dev, instance.primary_node)
-      dev_pstatus = self.rpc.call_blockdev_find(instance.primary_node, dev)
-      if dev_pstatus.offline:
-        dev_pstatus = None
-      else:
-        msg = dev_pstatus.RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Can't compute disk status for %s: %s" %
-                                   (instance.name, msg))
-        dev_pstatus = dev_pstatus.payload
-    else:
-      dev_pstatus = None
-
      if dev.dev_type in constants.LDS_DRBD:
        # we change the snode then (otherwise we use the one passed in)
        if dev.logical_id[0] == instance.primary_node:
      if dev.dev_type in constants.LDS_DRBD:
        # we change the snode then (otherwise we use the one passed in)
        if dev.logical_id[0] == instance.primary_node:
@@ -5791,19 +6989,9 @@ class LUQueryInstanceData(NoHooksLU):
        else:
          snode = dev.logical_id[0]
  
        else:
          snode = dev.logical_id[0]
  
-    if snode and not static:
-      self.cfg.SetDiskID(dev, snode)
-      dev_sstatus = self.rpc.call_blockdev_find(snode, dev)
-      if dev_sstatus.offline:
-        dev_sstatus = None
-      else:
-        msg = dev_sstatus.RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Can't compute disk status for %s: %s" %
-                                   (instance.name, msg))
-        dev_sstatus = dev_sstatus.payload
-    else:
-      dev_sstatus = None
+    dev_pstatus = self._ComputeBlockdevStatus(instance.primary_node,
+                                              instance.name, dev)
+    dev_sstatus = self._ComputeBlockdevStatus(snode, instance.name, dev)
  
      if dev.children:
        dev_children = [self._ComputeDiskStatus(instance, snode, child)
  
      if dev.children:
        dev_children = [self._ComputeDiskStatus(instance, snode, child)
@@ -5820,6 +7008,7 @@ class LUQueryInstanceData(NoHooksLU):
        "sstatus": dev_sstatus,
        "children": dev_children,
        "mode": dev.mode,
        "sstatus": dev_sstatus,
        "children": dev_children,
        "mode": dev.mode,
+      "size": dev.size,
        }
  
      return data
        }
  
      return data
@@ -5835,10 +7024,7 @@ class LUQueryInstanceData(NoHooksLU):
          remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                    instance.name,
                                                    instance.hypervisor)
          remote_info = self.rpc.call_instance_info(instance.primary_node,
                                                    instance.name,
                                                    instance.hypervisor)
-        msg = remote_info.RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Error checking node %s: %s" %
-                                   (instance.primary_node, msg))
+        remote_info.Raise("Error checking node %s" % instance.primary_node)
          remote_info = remote_info.payload
          if remote_info and "state" in remote_info:
            remote_state = "up"
          remote_info = remote_info.payload
          if remote_info and "state" in remote_info:
            remote_state = "up"
@@ -5861,7 +7047,8 @@ class LUQueryInstanceData(NoHooksLU):
          "pnode": instance.primary_node,
          "snodes": instance.secondary_nodes,
          "os": instance.os,
          "pnode": instance.primary_node,
          "snodes": instance.secondary_nodes,
          "os": instance.os,
-        "nics": [(nic.mac, nic.ip, nic.bridge) for nic in instance.nics],
+        # this happens to be the same format used for hooks
+        "nics": _NICListToTuple(self, instance.nics),
          "disks": disks,
          "hypervisor": instance.hypervisor,
          "network_port": instance.network_port,
          "disks": disks,
          "hypervisor": instance.hypervisor,
          "network_port": instance.network_port,
@@ -5869,6 +7056,10 @@ class LUQueryInstanceData(NoHooksLU):
          "hv_actual": cluster.FillHV(instance),
          "be_instance": instance.beparams,
          "be_actual": cluster.FillBE(instance),
          "hv_actual": cluster.FillHV(instance),
          "be_instance": instance.beparams,
          "be_actual": cluster.FillBE(instance),
+        "serial_no": instance.serial_no,
+        "mtime": instance.mtime,
+        "ctime": instance.ctime,
+        "uuid": instance.uuid,
          }
  
        result[instance.name] = idict
          }
  
        result[instance.name] = idict
@@ -5910,6 +7101,10 @@ class LUSetInstanceParams(LogicalUnit):
        else:
          if not isinstance(disk_op, int):
            raise errors.OpPrereqError("Invalid disk index")
        else:
          if not isinstance(disk_op, int):
            raise errors.OpPrereqError("Invalid disk index")
+        if not isinstance(disk_dict, dict):
+          msg = "Invalid disk value: expected dict, got '%s'" % disk_dict
+          raise errors.OpPrereqError(msg)
+
        if disk_op == constants.DDM_ADD:
          mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
          if mode not in constants.DISK_ACCESS_SET:
        if disk_op == constants.DDM_ADD:
          mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
          if mode not in constants.DISK_ACCESS_SET:
@@ -5944,6 +7139,9 @@ class LUSetInstanceParams(LogicalUnit):
        else:
          if not isinstance(nic_op, int):
            raise errors.OpPrereqError("Invalid nic index")
+        if not isinstance(nic_dict, dict):
+          msg = "Invalid nic value: expected dict, got '%s'" % nic_dict
+          raise errors.OpPrereqError(msg)
  
        # nic_dict should be a dict
        nic_ip = nic_dict.get('ip', None)
@@ -5957,7 +7155,8 @@ class LUSetInstanceParams(LogicalUnit):
        nic_bridge = nic_dict.get('bridge', None)
        nic_link = nic_dict.get('link', None)
        if nic_bridge and nic_link:
-        raise errors.OpPrereqError("Cannot pass 'bridge' and 'link' at the same time")
+        raise errors.OpPrereqError("Cannot pass 'bridge' and 'link'"
+                                   " at the same time")
        elif nic_bridge and nic_bridge.lower() == constants.VALUE_NONE:
          nic_dict['bridge'] = None
        elif nic_link and nic_link.lower() == constants.VALUE_NONE:
@@ -6046,11 +7245,11 @@ class LUSetInstanceParams(LogicalUnit):
      """Return the new params dict for the given params.
  
      @type old_params: dict
-    @type old_params: old parameters
+    @param old_params: old parameters
      @type update_dict: dict
-    @type update_dict: dict containing new parameter values,
-                       or constants.VALUE_DEFAULT to reset the
-                       parameter to its default value
+    @param update_dict: dict containing new parameter values,
+                        or constants.VALUE_DEFAULT to reset the
+                        parameter to its default value
      @type default_values: dict
      @param default_values: default values for the filled parameters
      @type parameter_types: dict
@@ -6079,7 +7278,7 @@ class LUSetInstanceParams(LogicalUnit):
      This only checks the instance list against the existing names.
  
      """
-    force = self.force = self.op.force
+    self.force = self.op.force
  
      # checking the new params on the primary/secondary nodes
  
@@ -6128,7 +7327,7 @@ class LUSetInstanceParams(LogicalUnit):
        nodeinfo = self.rpc.call_node_info(mem_check_list, self.cfg.GetVGName(),
                                           instance.hypervisor)
        pninfo = nodeinfo[pnode]
-      msg = pninfo.RemoteFailMsg()
+      msg = pninfo.fail_msg
        if msg:
          # Assume the primary node is unreachable and go ahead
          self.warn.append("Can't get info from primary node %s: %s" %
@@ -6136,9 +7335,9 @@ class LUSetInstanceParams(LogicalUnit):
        elif not isinstance(pninfo.payload.get('memory_free', None), int):
          self.warn.append("Node data from primary node %s doesn't contain"
                           " free memory information" % pnode)
-      elif instance_info.RemoteFailMsg():
+      elif instance_info.fail_msg:
          self.warn.append("Can't get instance runtime information: %s" %
-                        instance_info.RemoteFailMsg())
+                        instance_info.fail_msg)
        else:
          if instance_info.payload:
            current_mem = int(instance_info.payload['memory'])
@@ -6158,7 +7357,7 @@ class LUSetInstanceParams(LogicalUnit):
          for node, nres in nodeinfo.items():
            if node not in instance.secondary_nodes:
              continue
-          msg = nres.RemoteFailMsg()
+          msg = nres.fail_msg
            if msg:
              self.warn.append("Can't get info from secondary node %s: %s" %
                               (node, msg))
@@ -6207,8 +7406,7 @@ class LUSetInstanceParams(LogicalUnit):
  
        if new_nic_mode == constants.NIC_MODE_BRIDGED:
          nic_bridge = new_filled_nic_params[constants.NIC_LINK]
-        result = self.rpc.call_bridges_exist(pnode, [nic_bridge])
-        msg = result.RemoteFailMsg()
+        msg = self.rpc.call_bridges_exist(pnode, [nic_bridge]).fail_msg
          if msg:
            msg = "Error checking bridges on node %s: %s" % (pnode, msg)
            if self.force:
@@ -6247,7 +7445,7 @@ class LUSetInstanceParams(LogicalUnit):
                                       " an instance")
          ins_l = self.rpc.call_instance_list([pnode], [instance.hypervisor])
          ins_l = ins_l[pnode]
-        msg = ins_l.RemoteFailMsg()
+        msg = ins_l.fail_msg
          if msg:
            raise errors.OpPrereqError("Can't contact node %s: %s" %
                                       (pnode, msg))
@@ -6290,7 +7488,7 @@ class LUSetInstanceParams(LogicalUnit):
          device_idx = len(instance.disks)
          for node, disk in device.ComputeNodeTree(instance.primary_node):
            self.cfg.SetDiskID(disk, node)
-          msg = self.rpc.call_blockdev_remove(node, disk).RemoteFailMsg()
+          msg = self.rpc.call_blockdev_remove(node, disk).fail_msg
            if msg:
              self.LogWarning("Could not remove disk/%d on node %s: %s,"
                              " continuing anyway", device_idx, node, msg)
@@ -6412,7 +7610,7 @@ class LUQueryExports(NoHooksLU):
      rpcresult = self.rpc.call_export_list(self.nodes)
      result = {}
      for node in rpcresult:
-      if rpcresult[node].RemoteFailMsg():
+      if rpcresult[node].fail_msg:
          result[node] = False
        else:
          result[node] = rpcresult[node].payload
@@ -6438,7 +7636,7 @@ class LUExportInstance(LogicalUnit):
      # remove it from its current node. In the future we could fix this by:
      #  - making a tasklet to search (share-lock all), then create the new one,
      #    then one to remove, after
-    #  - removing the removal operation altoghether
+    #  - removing the removal operation altogether
      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
  
    def DeclareLocks(self, level):
@@ -6494,14 +7692,13 @@ class LUExportInstance(LogicalUnit):
      instance = self.instance
      dst_node = self.dst_node
      src_node = instance.primary_node
+
      if self.op.shutdown:
        # shutdown the instance, but not the disks
+      feedback_fn("Shutting down instance %s" % instance.name)
        result = self.rpc.call_instance_shutdown(src_node, instance)
-      msg = result.RemoteFailMsg()
-      if msg:
-        raise errors.OpExecError("Could not shutdown instance %s on"
-                                 " node %s: %s" %
-                                 (instance.name, src_node, msg))
+      result.Raise("Could not shutdown instance %s on"
+                   " node %s" % (instance.name, src_node))
  
      vgname = self.cfg.GetVGName()
  
@@ -6512,14 +7709,19 @@ class LUExportInstance(LogicalUnit):
      for disk in instance.disks:
        self.cfg.SetDiskID(disk, src_node)
  
+    # per-disk results
+    dresults = []
      try:
-      for disk in instance.disks:
+      for idx, disk in enumerate(instance.disks):
+        feedback_fn("Creating a snapshot of disk/%s on node %s" %
+                    (idx, src_node))
+
          # result.payload will be a snapshot of an lvm leaf of the one we passed
          result = self.rpc.call_blockdev_snapshot(src_node, disk)
-        msg = result.RemoteFailMsg()
+        msg = result.fail_msg
          if msg:
-          self.LogWarning("Could not snapshot block device %s on node %s: %s",
-                          disk.logical_id[1], src_node, msg)
+          self.LogWarning("Could not snapshot disk/%s on node %s: %s",
+                          idx, src_node, msg)
            snap_disks.append(False)
          else:
            disk_id = (vgname, result.payload)
@@ -6530,8 +7732,9 @@ class LUExportInstance(LogicalUnit):
  
      finally:
        if self.op.shutdown and instance.admin_up:
+        feedback_fn("Starting instance %s" % instance.name)
          result = self.rpc.call_instance_start(src_node, instance, None, None)
-        msg = result.RemoteFailMsg()
+        msg = result.fail_msg
          if msg:
            _ShutdownInstanceDisks(self, instance)
            raise errors.OpExecError("Could not start instance: %s" % msg)
@@ -6540,24 +7743,33 @@ class LUExportInstance(LogicalUnit):
  
      cluster_name = self.cfg.GetClusterName()
      for idx, dev in enumerate(snap_disks):
+      feedback_fn("Exporting snapshot %s from %s to %s" %
+                  (idx, src_node, dst_node.name))
        if dev:
          result = self.rpc.call_snapshot_export(src_node, dev, dst_node.name,
                                                 instance, cluster_name, idx)
-        msg = result.RemoteFailMsg()
+        msg = result.fail_msg
          if msg:
-          self.LogWarning("Could not export block device %s from node %s to"
-                          " node %s: %s", dev.logical_id[1], src_node,
-                          dst_node.name, msg)
-        msg = self.rpc.call_blockdev_remove(src_node, dev).RemoteFailMsg()
+          self.LogWarning("Could not export disk/%s from node %s to"
+                          " node %s: %s", idx, src_node, dst_node.name, msg)
+          dresults.append(False)
+        else:
+          dresults.append(True)
+        msg = self.rpc.call_blockdev_remove(src_node, dev).fail_msg
          if msg:
-          self.LogWarning("Could not remove snapshot block device %s from node"
-                          " %s: %s", dev.logical_id[1], src_node, msg)
+          self.LogWarning("Could not remove snapshot for disk/%d from node"
+                          " %s: %s", idx, src_node, msg)
+      else:
+        dresults.append(False)
  
+    feedback_fn("Finalizing export on %s" % dst_node.name)
      result = self.rpc.call_finalize_export(dst_node.name, instance, snap_disks)
-    msg = result.RemoteFailMsg()
+    fin_resu = True
+    msg = result.fail_msg
      if msg:
        self.LogWarning("Could not finalize export for instance %s"
                        " on node %s: %s", instance.name, dst_node.name, msg)
+      fin_resu = False
  
      nodelist = self.cfg.GetNodeList()
      nodelist.remove(dst_node.name)
@@ -6567,15 +7779,17 @@ class LUExportInstance(LogicalUnit):
      # substitutes an empty list with the full cluster node list.
      iname = instance.name
      if nodelist:
+      feedback_fn("Removing old exports for instance %s" % iname)
        exportlist = self.rpc.call_export_list(nodelist)
        for node in exportlist:
-        if exportlist[node].RemoteFailMsg():
+        if exportlist[node].fail_msg:
            continue
          if iname in exportlist[node].payload:
-          msg = self.rpc.call_export_remove(node, iname).RemoteFailMsg()
+          msg = self.rpc.call_export_remove(node, iname).fail_msg
            if msg:
              self.LogWarning("Could not remove older export for instance %s"
                              " on node %s: %s", iname, node, msg)
+    return fin_resu, dresults
  
  
  class LURemoveExport(NoHooksLU):
@@ -6613,14 +7827,14 @@ class LURemoveExport(NoHooksLU):
      exportlist = self.rpc.call_export_list(locked_nodes)
      found = False
      for node in exportlist:
-      msg = exportlist[node].RemoteFailMsg()
+      msg = exportlist[node].fail_msg
        if msg:
          self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
          continue
        if instance_name in exportlist[node].payload:
          found = True
          result = self.rpc.call_export_remove(node, instance_name)
-        msg = result.RemoteFailMsg()
+        msg = result.fail_msg
          if msg:
            logging.error("Could not remove export for instance %s"
                          " on node %s: %s", instance_name, node, msg)
@@ -6835,13 +8049,8 @@ class LUTestDelay(NoHooksLU):
          raise errors.OpExecError("Error during master delay test")
      if self.op.on_nodes:
        result = self.rpc.call_test_delay(self.op.on_nodes, self.op.duration)
-      if not result:
-        raise errors.OpExecError("Complete failure from rpc call")
        for node, node_result in result.items():
-        node_result.Raise()
-        if not node_result.data:
-          raise errors.OpExecError("Failure during rpc call to node %s,"
-                                   " result: %s" % (node, node_result.data))
+        node_result.Raise("Failure during rpc call to node %s" % node)
  
  
  class IAllocator(object):
@@ -6865,8 +8074,9 @@ class IAllocator(object):
      "relocate_from",
      ]
  
-  def __init__(self, lu, mode, name, **kwargs):
-    self.lu = lu
+  def __init__(self, cfg, rpc, mode, name, **kwargs):
+    self.cfg = cfg
+    self.rpc = rpc
      # init buffer variables
      self.in_text = self.out_text = self.in_data = self.out_data = None
      # init all input fields so that pylint is happy
@@ -6904,7 +8114,7 @@ class IAllocator(object):
      This is the data that is independent of the actual operation.
  
      """
-    cfg = self.lu.cfg
+    cfg = self.cfg
      cluster_info = cfg.GetClusterInfo()
      # cluster data
      data = {
@@ -6926,10 +8136,11 @@ class IAllocator(object):
      elif self.mode == constants.IALLOCATOR_MODE_RELOC:
        hypervisor_name = cfg.GetInstanceInfo(self.name).hypervisor
  
-    node_data = self.lu.rpc.call_node_info(node_list, cfg.GetVGName(),
-                                           hypervisor_name)
-    node_iinfo = self.lu.rpc.call_all_instances_info(node_list,
-                       cluster_info.enabled_hypervisors)
+    node_data = self.rpc.call_node_info(node_list, cfg.GetVGName(),
+                                        hypervisor_name)
+    node_iinfo = \
+      self.rpc.call_all_instances_info(node_list,
+                                       cluster_info.enabled_hypervisors)
      for nname, nresult in node_data.items():
        # first fill in static (config-based) values
        ninfo = cfg.GetNodeInfo(nname)
@@ -6942,16 +8153,12 @@ class IAllocator(object):
          "master_candidate": ninfo.master_candidate,
          }
  
-      if not ninfo.offline:
-        msg = nresult.RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Can't get data for node %s: %s" %
-                                   (nname, msg))
-        msg = node_iinfo[nname].RemoteFailMsg()
-        if msg:
-          raise errors.OpExecError("Can't get node instance info"
-                                   " from node %s: %s" % (nname, msg))
+      if not (ninfo.offline or ninfo.drained):
+        nresult.Raise("Can't get data for node %s" % nname)
+        node_iinfo[nname].Raise("Can't get node instance info from node %s" %
+                                nname)
          remote_info = nresult.payload
+
          for attr in ['memory_total', 'memory_free', 'memory_dom0',
                       'vg_size', 'vg_free', 'cpu_total']:
            if attr not in remote_info:
@@ -7071,7 +8278,7 @@ class IAllocator(object):
      done.
  
      """
-    instance = self.lu.cfg.GetInstanceInfo(self.name)
+    instance = self.cfg.GetInstanceInfo(self.name)
      if instance is None:
        raise errors.ProgrammerError("Unknown instance '%s' passed to"
                                     " IAllocator" % self.name)
@@ -7113,23 +8320,12 @@ class IAllocator(object):
  
      """
      if call_fn is None:
-      call_fn = self.lu.rpc.call_iallocator_runner
-    data = self.in_text
-
-    result = call_fn(self.lu.cfg.GetMasterNode(), name, self.in_text)
-    result.Raise()
-
-    if not isinstance(result.data, (list, tuple)) or len(result.data) != 4:
-      raise errors.OpExecError("Invalid result from master iallocator runner")
+      call_fn = self.rpc.call_iallocator_runner
  
-    rcode, stdout, stderr, fail = result.data
+    result = call_fn(self.cfg.GetMasterNode(), name, self.in_text)
+    result.Raise("Failure while running the iallocator script")
  
-    if rcode == constants.IARUN_NOTFOUND:
-      raise errors.OpExecError("Can't find allocator '%s'" % name)
-    elif rcode == constants.IARUN_FAILURE:
-      raise errors.OpExecError("Instance allocator call failed: %s,"
-                               " output: %s" % (fail, stdout+stderr))
-    self.out_text = stdout
+    self.out_text = result.payload
      if validate:
        self._ValidateResult()
  
@@ -7230,7 +8426,7 @@ class LUTestAllocator(NoHooksLU):
  
      """
      if self.op.mode == constants.IALLOCATOR_MODE_ALLOC:
-      ial = IAllocator(self,
+      ial = IAllocator(self.cfg, self.rpc,
                         mode=self.op.mode,
                         name=self.op.name,
                         mem_size=self.op.mem_size,
@@ -7243,7 +8439,7 @@ class LUTestAllocator(NoHooksLU):
                         hypervisor=self.op.hypervisor,
                         )
      else:
-      ial = IAllocator(self,
+      ial = IAllocator(self.cfg, self.rpc,
                         mode=self.op.mode,
                         name=self.op.name,
                         relocate_from=list(self.relocate_from),