Relax replace_disks_all meaning for drbd8

[ganeti-local] / lib / cmdlib.py
diff --git a/lib/cmdlib.py b/lib/cmdlib.py

index 6bd0ed0..f338da5 100644 (file)
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -69,7 +69,7 @@ class LogicalUnit(object):
      validity.
  
      """
-    self.processor = processor
+    self.proc = processor
      self.op = op
      self.cfg = cfg
      self.sstore = sstore
@@ -163,6 +163,15 @@ class NoHooksLU(LogicalUnit):
      return {}, [], []
  
  
+def _RemoveHostFromEtcHosts(hostname):
+  """Wrapper around utils.RemoveEtcHostsEntry.
+
+  """
+  hi = utils.HostInfo(name=hostname)
+  utils.RemoveEtcHostsEntry(constants.ETC_HOSTS, hi.name)
+  utils.RemoveEtcHostsEntry(constants.ETC_HOSTS, hi.ShortName())
+
+
  def _GetWantedNodes(lu, nodes):
    """Returns list of checked and expanded node names.
  
@@ -285,86 +294,6 @@ def _BuildInstanceHookEnvByObject(instance, override=None):
    return _BuildInstanceHookEnv(**args)
  
  
-def _UpdateEtcHosts(fullnode, ip):
-  """Ensure a node has a correct entry in /etc/hosts.
-
-  Args:
-    fullnode - Fully qualified domain name of host. (str)
-    ip       - IPv4 address of host (str)
-
-  """
-  node = fullnode.split(".", 1)[0]
-
-  f = open('/etc/hosts', 'r+')
-
-  inthere = False
-
-  save_lines = []
-  add_lines = []
-  removed = False
-
-  while True:
-    rawline = f.readline()
-
-    if not rawline:
-      # End of file
-      break
-
-    line = rawline.split('\n')[0]
-
-    # Strip off comments
-    line = line.split('#')[0]
-
-    if not line:
-      # Entire line was comment, skip
-      save_lines.append(rawline)
-      continue
-
-    fields = line.split()
-
-    haveall = True
-    havesome = False
-    for spec in [ ip, fullnode, node ]:
-      if spec not in fields:
-        haveall = False
-      if spec in fields:
-        havesome = True
-
-    if haveall:
-      inthere = True
-      save_lines.append(rawline)
-      continue
-
-    if havesome and not haveall:
-      # Line (old, or manual?) which is missing some.  Remove.
-      removed = True
-      continue
-
-    save_lines.append(rawline)
-
-  if not inthere:
-    add_lines.append('%s\t%s %s\n' % (ip, fullnode, node))
-
-  if removed:
-    if add_lines:
-      save_lines = save_lines + add_lines
-
-    # We removed a line, write a new file and replace old.
-    fd, tmpname = tempfile.mkstemp('tmp', 'hosts_', '/etc')
-    newfile = os.fdopen(fd, 'w')
-    newfile.write(''.join(save_lines))
-    newfile.close()
-    os.rename(tmpname, '/etc/hosts')
-
-  elif add_lines:
-    # Simply appending a new line will do the trick.
-    f.seek(0, 2)
-    for add in add_lines:
-      f.write(add)
-
-  f.close()
-
-
  def _UpdateKnownHosts(fullnode, ip, pubkey):
    """Ensure a node has a correct known_hosts entry.
  
@@ -611,6 +540,11 @@ class LUInitCluster(LogicalUnit):
                                   (self.op.master_netdev,
                                    result.output.strip()))
  
+    if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
+            os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
+      raise errors.OpPrereqError("Init.d script '%s' missing or not "
+                                 "executable." % constants.NODE_INITD_SCRIPT)
+
    def Exec(self, feedback_fn):
      """Initialize the cluster.
  
@@ -640,7 +574,10 @@ class LUInitCluster(LogicalUnit):
        f.close()
      sshkey = sshline.split(" ")[1]
  
-    _UpdateEtcHosts(hostname.name, hostname.ip)
+    hi = utils.HostInfo(name=hostname.name)
+    utils.AddEtcHostsEntry(constants.ETC_HOSTS, hostname.name, hi.ip)
+    utils.AddEtcHostsEntry(constants.ETC_HOSTS, hi.ShortName(), hi.ip)
+    del hi
  
      _UpdateKnownHosts(hostname.name, hostname.ip, sshkey)
  
@@ -682,10 +619,12 @@ class LUDestroyCluster(NoHooksLU):
      """Destroys the cluster.
  
      """
+    master = self.sstore.GetMasterNode()
      priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
      utils.CreateBackup(priv_key)
      utils.CreateBackup(pub_key)
-    rpc.call_node_leave_cluster(self.sstore.GetMasterNode())
+    rpc.call_node_leave_cluster(master)
+    _RemoveHostFromEtcHosts(master)
  
  
  class LUVerifyCluster(NoHooksLU):
@@ -1010,7 +949,7 @@ class LURenameCluster(LogicalUnit):
                       "please restart manually.")
  
  
-def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
+def _WaitForSync(cfgw, instance, proc, oneshot=False, unlock=False):
    """Sleep and poll for an instance's disk to sync.
  
    """
@@ -1018,7 +957,7 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
      return True
  
    if not oneshot:
-    logger.ToStdout("Waiting for instance %s to sync disks." % instance.name)
+    proc.LogInfo("Waiting for instance %s to sync disks." % instance.name)
  
    node = instance.primary_node
  
@@ -1032,7 +971,7 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
      cumul_degraded = False
      rstats = rpc.call_blockdev_getmirrorstatus(node, instance.disks)
      if not rstats:
-      logger.ToStderr("Can't get any data from node %s" % node)
+      proc.LogWarning("Can't get any data from node %s" % node)
        retries += 1
        if retries >= 10:
          raise errors.RemoteError("Can't contact node %s for mirror data,"
@@ -1043,10 +982,11 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
      for i in range(len(rstats)):
        mstat = rstats[i]
        if mstat is None:
-        logger.ToStderr("Can't compute data for node %s/%s" %
+        proc.LogWarning("Can't compute data for node %s/%s" %
                          (node, instance.disks[i].iv_name))
          continue
-      perc_done, est_time, is_degraded = mstat
+      # we ignore the ldisk parameter
+      perc_done, est_time, is_degraded, _ = mstat
        cumul_degraded = cumul_degraded or (is_degraded and perc_done is None)
        if perc_done is not None:
          done = False
@@ -1055,8 +995,8 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
            max_time = est_time
          else:
            rem_time = "no time estimate"
-        logger.ToStdout("- device %s: %5.2f%% done, %s" %
-                        (instance.disks[i].iv_name, perc_done, rem_time))
+        proc.LogInfo("- device %s: %5.2f%% done, %s" %
+                     (instance.disks[i].iv_name, perc_done, rem_time))
      if done or oneshot:
        break
  
@@ -1069,15 +1009,23 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
          utils.Lock('cmd')
  
    if done:
-    logger.ToStdout("Instance %s's disks are in sync." % instance.name)
+    proc.LogInfo("Instance %s's disks are in sync." % instance.name)
    return not cumul_degraded
  
  
-def _CheckDiskConsistency(cfgw, dev, node, on_primary):
+def _CheckDiskConsistency(cfgw, dev, node, on_primary, ldisk=False):
    """Check that mirrors are not degraded.
  
+  The ldisk parameter, if True, will change the test from the
+  is_degraded attribute (which represents overall non-ok status for
+  the device(s)) to the ldisk (representing the local storage status).
+
    """
    cfgw.SetDiskID(dev, node)
+  if ldisk:
+    idx = 6
+  else:
+    idx = 5
  
    result = True
    if on_primary or dev.AssembleOnSecondary():
@@ -1086,7 +1034,7 @@ def _CheckDiskConsistency(cfgw, dev, node, on_primary):
        logger.ToStderr("Can't get any data from node %s" % node)
        result = False
      else:
-      result = result and (not rstats[5])
+      result = result and (not rstats[idx])
    if dev.children:
      for child in dev.children:
        result = result and _CheckDiskConsistency(cfgw, child, node, on_primary)
@@ -1191,6 +1139,8 @@ class LURemoveNode(LogicalUnit):
  
      self.cfg.RemoveNode(node.name)
  
+    _RemoveHostFromEtcHosts(node.name)
+
  
  class LUQueryNodes(NoHooksLU):
    """Logical unit for querying nodes.
@@ -1534,7 +1484,11 @@ class LUAddNode(LogicalUnit):
        raise errors.OpExecError("Cannot transfer ssh keys to the new node")
  
      # Add node to our /etc/hosts, and add key to known_hosts
-    _UpdateEtcHosts(new_node.name, new_node.primary_ip)
+    hi = utils.HostInfo(name=new_node.name)
+    utils.AddEtcHostsEntry(constants.ETC_HOSTS, new_node.name, hi.ip)
+    utils.AddEtcHostsEntry(constants.ETC_HOSTS, hi.ShortName(), hi.ip)
+    del hi
+
      _UpdateKnownHosts(new_node.name, new_node.primary_ip,
                        self.cfg.GetHostKey())
  
@@ -3036,12 +2990,12 @@ class LUCreateInstance(LogicalUnit):
      self.cfg.AddInstance(iobj)
  
      if self.op.wait_for_sync:
-      disk_abort = not _WaitForSync(self.cfg, iobj)
+      disk_abort = not _WaitForSync(self.cfg, iobj, self.proc)
      elif iobj.disk_template in constants.DTS_NET_MIRROR:
        # make sure the disks are not degraded (still sync-ing is ok)
        time.sleep(15)
        feedback_fn("* checking mirrors status")
-      disk_abort = not _WaitForSync(self.cfg, iobj, oneshot=True)
+      disk_abort = not _WaitForSync(self.cfg, iobj, self.proc, oneshot=True)
      else:
        disk_abort = False
  
@@ -3243,7 +3197,7 @@ class LUAddMDDRBDComponent(LogicalUnit):
  
      self.cfg.AddInstance(instance)
  
-    _WaitForSync(self.cfg, instance)
+    _WaitForSync(self.cfg, instance, self.proc)
  
      return 0
  
@@ -3355,8 +3309,12 @@ class LUReplaceDisks(LogicalUnit):
        "OLD_SECONDARY": self.instance.secondary_nodes[0],
        }
      env.update(_BuildInstanceHookEnvByObject(self.instance))
-    nl = [self.sstore.GetMasterNode(),
-          self.instance.primary_node] + list(self.instance.secondary_nodes)
+    nl = [
+      self.sstore.GetMasterNode(),
+      self.instance.primary_node,
+      ]
+    if self.op.remote_node is not None:
+      nl.append(self.op.remote_node)
      return env, nl, nl
  
    def CheckPrereq(self):
@@ -3371,6 +3329,7 @@ class LUReplaceDisks(LogicalUnit):
        raise errors.OpPrereqError("Instance '%s' not known" %
                                   self.op.instance_name)
      self.instance = instance
+    self.op.instance_name = instance.name
  
      if instance.disk_template not in constants.DTS_NET_MIRROR:
        raise errors.OpPrereqError("Instance's disk layout is not"
@@ -3396,14 +3355,24 @@ class LUReplaceDisks(LogicalUnit):
        raise errors.OpPrereqError("The specified node is the primary node of"
                                   " the instance.")
      elif remote_node == self.sec_node:
+      if self.op.mode == constants.REPLACE_DISK_SEC:
+        # this is for DRBD8, where we can't execute the same mode of
+        # replacement as for drbd7 (no different port allocated)
+        raise errors.OpPrereqError("Same secondary given, cannot execute"
+                                   " replacement")
        # the user gave the current secondary, switch to
-      # 'no-replace-secondary' mode
+      # 'no-replace-secondary' mode for drbd7
        remote_node = None
      if (instance.disk_template == constants.DT_REMOTE_RAID1 and
          self.op.mode != constants.REPLACE_DISK_ALL):
        raise errors.OpPrereqError("Template 'remote_raid1' only allows all"
                                   " disks replacement, not individual ones")
      if instance.disk_template == constants.DT_DRBD8:
+      if (self.op.mode == constants.REPLACE_DISK_ALL and
+          remote_node is not None):
+        # switch to replace secondary mode
+        self.op.mode = constants.REPLACE_DISK_SEC
+
        if self.op.mode == constants.REPLACE_DISK_ALL:
          raise errors.OpPrereqError("Template 'drbd8' only allows primary or"
                                     " secondary disk replacement, not"
@@ -3414,10 +3383,12 @@ class LUReplaceDisks(LogicalUnit):
                                       " the secondary while doing a primary"
                                       " node disk replacement")
          self.tgt_node = instance.primary_node
+        self.oth_node = instance.secondary_nodes[0]
        elif self.op.mode == constants.REPLACE_DISK_SEC:
          self.new_node = remote_node # this can be None, in which case
                                      # we don't change the secondary
          self.tgt_node = instance.secondary_nodes[0]
+        self.oth_node = instance.primary_node
        else:
          raise errors.ProgrammerError("Unhandled disk replace mode")
  
@@ -3488,7 +3459,7 @@ class LUReplaceDisks(LogicalUnit):
      # this can fail as the old devices are degraded and _WaitForSync
      # does a combined result over all disks, so we don't check its
      # return value
-    _WaitForSync(cfg, instance, unlock=True)
+    _WaitForSync(cfg, instance, self.proc, unlock=True)
  
      # so check manually all the devices
      for name in iv_names:
@@ -3538,13 +3509,54 @@ class LUReplaceDisks(LogicalUnit):
          - remove old LVs (which have the name name_replaces.<time_t>)
  
      Failures are not very well handled.
+
      """
+    steps_total = 6
+    warning, info = (self.proc.LogWarning, self.proc.LogInfo)
      instance = self.instance
      iv_names = {}
      vgname = self.cfg.GetVGName()
      # start of work
      cfg = self.cfg
      tgt_node = self.tgt_node
+    oth_node = self.oth_node
+
+    # Step: check device activation
+    self.proc.LogStep(1, steps_total, "check device existence")
+    info("checking volume groups")
+    my_vg = cfg.GetVGName()
+    results = rpc.call_vg_list([oth_node, tgt_node])
+    if not results:
+      raise errors.OpExecError("Can't list volume groups on the nodes")
+    for node in oth_node, tgt_node:
+      res = results.get(node, False)
+      if not res or my_vg not in res:
+        raise errors.OpExecError("Volume group '%s' not found on %s" %
+                                 (my_vg, node))
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      for node in tgt_node, oth_node:
+        info("checking %s on %s" % (dev.iv_name, node))
+        cfg.SetDiskID(dev, node)
+        if not rpc.call_blockdev_find(node, dev):
+          raise errors.OpExecError("Can't find device %s on node %s" %
+                                   (dev.iv_name, node))
+
+    # Step: check other node consistency
+    self.proc.LogStep(2, steps_total, "check peer consistency")
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      info("checking %s consistency on %s" % (dev.iv_name, oth_node))
+      if not _CheckDiskConsistency(self.cfg, dev, oth_node,
+                                   oth_node==instance.primary_node):
+        raise errors.OpExecError("Peer node (%s) has degraded storage, unsafe"
+                                 " to replace disks on this node (%s)" %
+                                 (oth_node, tgt_node))
+
+    # Step: create new storage
+    self.proc.LogStep(3, steps_total, "allocate new storage")
      for dev in instance.disks:
        if not dev.iv_name in self.op.disks:
          continue
@@ -3559,8 +3571,8 @@ class LUReplaceDisks(LogicalUnit):
        new_lvs = [lv_data, lv_meta]
        old_lvs = dev.children
        iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
-      logger.Info("adding new local storage on %s for %s" %
-                  (tgt_node, dev.iv_name))
+      info("creating new local storage on %s for %s" %
+           (tgt_node, dev.iv_name))
        # since we *always* want to create this LV, we use the
        # _Create...OnPrimary (which forces the creation), even if we
        # are talking about the secondary node
@@ -3571,60 +3583,69 @@ class LUReplaceDisks(LogicalUnit):
                                     " node '%s'" %
                                     (new_lv.logical_id[1], tgt_node))
  
+    # Step: for each lv, detach+rename*2+attach
+    self.proc.LogStep(4, steps_total, "change drbd configuration")
+    for dev, old_lvs, new_lvs in iv_names.itervalues():
+      info("detaching %s drbd from local storage" % dev.iv_name)
        if not rpc.call_blockdev_removechildren(tgt_node, dev, old_lvs):
          raise errors.OpExecError("Can't detach drbd from local storage on node"
                                   " %s for device %s" % (tgt_node, dev.iv_name))
-      dev.children = []
-      cfg.Update(instance)
+      #dev.children = []
+      #cfg.Update(instance)
  
        # ok, we created the new LVs, so now we know we have the needed
        # storage; as such, we proceed on the target node to rename
        # old_lv to _old, and new_lv to old_lv; note that we rename LVs
        # using the assumption than logical_id == physical_id (which in
        # turn is the unique_id on that node)
+
+      # FIXME(iustin): use a better name for the replaced LVs
        temp_suffix = int(time.time())
-      logger.Info("renaming the old LVs on the target node")
        ren_fn = lambda d, suff: (d.physical_id[0],
                                  d.physical_id[1] + "_replaced-%s" % suff)
-      rlist = [(disk, ren_fn(disk, temp_suffix)) for disk in old_lvs]
+      # build the rename list based on what LVs exist on the node
+      rlist = []
+      for to_ren in old_lvs:
+        find_res = rpc.call_blockdev_find(tgt_node, to_ren)
+        if find_res is not None: # device exists
+          rlist.append((to_ren, ren_fn(to_ren, temp_suffix)))
+
+      info("renaming the old LVs on the target node")
        if not rpc.call_blockdev_rename(tgt_node, rlist):
-        logger.Error("Can't rename old LVs on node %s" % tgt_node)
-        do_change_old = False
-      else:
-        do_change_old = True
+        raise errors.OpExecError("Can't rename old LVs on node %s" % tgt_node)
        # now we rename the new LVs to the old LVs
-      logger.Info("renaming the new LVs on the target node")
+      info("renaming the new LVs on the target node")
        rlist = [(new, old.physical_id) for old, new in zip(old_lvs, new_lvs)]
        if not rpc.call_blockdev_rename(tgt_node, rlist):
-        logger.Error("Can't rename new LVs on node %s" % tgt_node)
-      else:
-        for old, new in zip(old_lvs, new_lvs):
-          new.logical_id = old.logical_id
-          cfg.SetDiskID(new, tgt_node)
+        raise errors.OpExecError("Can't rename new LVs on node %s" % tgt_node)
+
+      for old, new in zip(old_lvs, new_lvs):
+        new.logical_id = old.logical_id
+        cfg.SetDiskID(new, tgt_node)
  
-      if do_change_old:
-        for disk in old_lvs:
-          disk.logical_id = ren_fn(disk, temp_suffix)
-          cfg.SetDiskID(disk, tgt_node)
+      for disk in old_lvs:
+        disk.logical_id = ren_fn(disk, temp_suffix)
+        cfg.SetDiskID(disk, tgt_node)
  
        # now that the new lvs have the old name, we can add them to the device
-      logger.Info("adding new mirror component on %s" % tgt_node)
+      info("adding new mirror component on %s" % tgt_node)
        if not rpc.call_blockdev_addchildren(tgt_node, dev, new_lvs):
-        logger.Error("Can't add local storage to drbd!")
          for new_lv in new_lvs:
            if not rpc.call_blockdev_remove(tgt_node, new_lv):
-            logger.Error("Can't rollback device %s")
-        return
+            warning("Can't rollback device %s", "manually cleanup unused"
+                    " logical volumes")
+        raise errors.OpExecError("Can't add local storage to drbd")
  
        dev.children = new_lvs
        cfg.Update(instance)
  
+    # Step: wait for sync
  
      # this can fail as the old devices are degraded and _WaitForSync
      # does a combined result over all disks, so we don't check its
      # return value
-    logger.Info("Done changing drbd configs, waiting for sync")
-    _WaitForSync(cfg, instance, unlock=True)
+    self.proc.LogStep(5, steps_total, "sync devices")
+    _WaitForSync(cfg, instance, self.proc, unlock=True)
  
      # so check manually all the devices
      for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
@@ -3633,13 +3654,14 @@ class LUReplaceDisks(LogicalUnit):
        if is_degr:
          raise errors.OpExecError("DRBD device %s is degraded!" % name)
  
+    # Step: remove old storage
+    self.proc.LogStep(6, steps_total, "removing old storage")
      for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
-      logger.Info("remove logical volumes for %s" % name)
+      info("remove logical volumes for %s" % name)
        for lv in old_lvs:
          cfg.SetDiskID(lv, tgt_node)
          if not rpc.call_blockdev_remove(tgt_node, lv):
-          logger.Error("Can't cleanup child device, skipping. You need to"
-                       " fix manually!")
+          warning("Can't remove old LV", "manually remove unused LVs")
            continue
  
    def _ExecD8Secondary(self, feedback_fn):
@@ -3659,7 +3681,10 @@ class LUReplaceDisks(LogicalUnit):
        - remove all disks from the old secondary
  
      Failures are not very well handled.
+
      """
+    steps_total = 6
+    warning, info = (self.proc.LogWarning, self.proc.LogInfo)
      instance = self.instance
      iv_names = {}
      vgname = self.cfg.GetVGName()
@@ -3668,10 +3693,44 @@ class LUReplaceDisks(LogicalUnit):
      old_node = self.tgt_node
      new_node = self.new_node
      pri_node = instance.primary_node
+
+    # Step: check device activation
+    self.proc.LogStep(1, steps_total, "check device existence")
+    info("checking volume groups")
+    my_vg = cfg.GetVGName()
+    results = rpc.call_vg_list([pri_node, new_node])
+    if not results:
+      raise errors.OpExecError("Can't list volume groups on the nodes")
+    for node in pri_node, new_node:
+      res = results.get(node, False)
+      if not res or my_vg not in res:
+        raise errors.OpExecError("Volume group '%s' not found on %s" %
+                                 (my_vg, node))
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      info("checking %s on %s" % (dev.iv_name, pri_node))
+      cfg.SetDiskID(dev, pri_node)
+      if not rpc.call_blockdev_find(pri_node, dev):
+        raise errors.OpExecError("Can't find device %s on node %s" %
+                                 (dev.iv_name, pri_node))
+
+    # Step: check other node consistency
+    self.proc.LogStep(2, steps_total, "check peer consistency")
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      info("checking %s consistency on %s" % (dev.iv_name, pri_node))
+      if not _CheckDiskConsistency(self.cfg, dev, pri_node, True, ldisk=True):
+        raise errors.OpExecError("Primary node (%s) has degraded storage,"
+                                 " unsafe to replace the secondary" %
+                                 pri_node)
+
+    # Step: create new storage
+    self.proc.LogStep(3, steps_total, "allocate new storage")
      for dev in instance.disks:
        size = dev.size
-      logger.Info("adding new local storage on %s for %s" %
-                  (new_node, dev.iv_name))
+      info("adding new local storage on %s for %s" % (new_node, dev.iv_name))
        # since we *always* want to create this LV, we use the
        # _Create...OnPrimary (which forces the creation), even if we
        # are talking about the secondary node
@@ -3682,6 +3741,12 @@ class LUReplaceDisks(LogicalUnit):
                                     " node '%s'" %
                                     (new_lv.logical_id[1], new_node))
  
+      iv_names[dev.iv_name] = (dev, dev.children)
+
+    self.proc.LogStep(4, steps_total, "changing drbd configuration")
+    for dev in instance.disks:
+      size = dev.size
+      info("activating a new drbd on %s for %s" % (new_node, dev.iv_name))
        # create new devices on new_node
        new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
                                logical_id=(pri_node, new_node,
@@ -3693,32 +3758,35 @@ class LUReplaceDisks(LogicalUnit):
          raise errors.OpExecError("Failed to create new DRBD on"
                                   " node '%s'" % new_node)
  
+    for dev in instance.disks:
        # we have new devices, shutdown the drbd on the old secondary
+      info("shutting down drbd for %s on old node" % dev.iv_name)
        cfg.SetDiskID(dev, old_node)
        if not rpc.call_blockdev_shutdown(old_node, dev):
-        raise errors.OpExecError("Failed to shutdown DRBD on old node")
+        warning("Failed to shutdown drbd for %s on old node" % dev.iv_name,
+                "Please cleanup this device manuall as soon as possible")
  
        # we have new storage, we 'rename' the network on the primary
+      info("switching primary drbd for %s to new secondary node" % dev.iv_name)
        cfg.SetDiskID(dev, pri_node)
        # rename to the ip of the new node
        new_uid = list(dev.physical_id)
        new_uid[2] = self.remote_node_info.secondary_ip
        rlist = [(dev, tuple(new_uid))]
        if not rpc.call_blockdev_rename(pri_node, rlist):
-        raise errors.OpExecError("Can't detach re-attach drbd %s on node"
+        raise errors.OpExecError("Can't detach & re-attach drbd %s on node"
                                   " %s from %s to %s" %
                                   (dev.iv_name, pri_node, old_node, new_node))
        dev.logical_id = (pri_node, new_node, dev.logical_id[2])
        cfg.SetDiskID(dev, pri_node)
        cfg.Update(instance)
  
-      iv_names[dev.iv_name] = (dev, dev.children)
  
      # this can fail as the old devices are degraded and _WaitForSync
      # does a combined result over all disks, so we don't check its
      # return value
-    logger.Info("Done changing drbd configs, waiting for sync")
-    _WaitForSync(cfg, instance, unlock=True)
+    self.proc.LogStep(5, steps_total, "sync devices")
+    _WaitForSync(cfg, instance, self.proc, unlock=True)
  
      # so check manually all the devices
      for name, (dev, old_lvs) in iv_names.iteritems():
@@ -3727,14 +3795,14 @@ class LUReplaceDisks(LogicalUnit):
        if is_degr:
          raise errors.OpExecError("DRBD device %s is degraded!" % name)
  
+    self.proc.LogStep(6, steps_total, "removing old storage")
      for name, (dev, old_lvs) in iv_names.iteritems():
-      logger.Info("remove logical volumes for %s" % name)
+      info("remove logical volumes for %s" % name)
        for lv in old_lvs:
          cfg.SetDiskID(lv, old_node)
          if not rpc.call_blockdev_remove(old_node, lv):
-          logger.Error("Can't cleanup child device, skipping. You need to"
-                       " fix manually!")
-          continue
+          warning("Can't remove LV on old secondary",
+                  "Cleanup stale volumes by hand")
  
    def Exec(self, feedback_fn):
      """Execute disk replacement.
@@ -4035,7 +4103,7 @@ class LUExportInstance(LogicalUnit):
      # shutdown the instance, unless requested not to do so
      if self.op.shutdown:
        op = opcodes.OpShutdownInstance(instance_name=instance.name)
-      self.processor.ChainOpCode(op)
+      self.proc.ChainOpCode(op)
  
      vgname = self.cfg.GetVGName()
  
@@ -4061,7 +4129,7 @@ class LUExportInstance(LogicalUnit):
        if self.op.shutdown:
          op = opcodes.OpStartupInstance(instance_name=instance.name,
                                         force=False)
-        self.processor.ChainOpCode(op)
+        self.proc.ChainOpCode(op)
  
      # TODO: check for size
  
@@ -4087,7 +4155,7 @@ class LUExportInstance(LogicalUnit):
      # substitutes an empty list with the full cluster node list.
      if nodelist:
        op = opcodes.OpQueryExports(nodes=nodelist)
-      exportlist = self.processor.ChainOpCode(op)
+      exportlist = self.proc.ChainOpCode(op)
        for node in exportlist:
          if instance.name in exportlist[node]:
            if not rpc.call_export_remove(node, instance.name):