Revision cff90b79

b/lib/cmdlib.py
                                       " the secondary while doing a primary"
                                       " node disk replacement")
         self.tgt_node = instance.primary_node
+        self.oth_node = instance.secondary_nodes[0]
       elif self.op.mode == constants.REPLACE_DISK_SEC:
         self.new_node = remote_node # this can be None, in which case
                                     # we don't change the secondary
         self.tgt_node = instance.secondary_nodes[0]
+        self.oth_node = instance.primary_node
       else:
         raise errors.ProgrammerError("Unhandled disk replace mode")

......
         - remove old LVs (which have the name name_replaces.<time_t>)

     Failures are not very well handled.
+
     """
+    steps_total = 6
+    warning, info = (self.processor.LogWarning, self.processor.LogInfo)
     instance = self.instance
     iv_names = {}
     vgname = self.cfg.GetVGName()
     # start of work
     cfg = self.cfg
     tgt_node = self.tgt_node
+    oth_node = self.oth_node
+
+    # Step: check device activation
+    self.processor.LogStep(1, steps_total, "check device existence")
+    info("checking volume groups")
+    my_vg = cfg.GetVGName()
+    results = rpc.call_vg_list([oth_node, tgt_node])
+    if not results:
+      raise errors.OpExecError("Can't list volume groups on the nodes")
+    for node in oth_node, tgt_node:
+      res = results.get(node, False)
+      if not res or my_vg not in res:
+        raise errors.OpExecError("Volume group '%s' not found on %s" %
+                                 (my_vg, node))
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      for node in tgt_node, oth_node:
+        info("checking %s on %s" % (dev.iv_name, node))
+        cfg.SetDiskID(dev, node)
+        if not rpc.call_blockdev_find(node, dev):
+          raise errors.OpExecError("Can't find device %s on node %s" %
+                                   (dev.iv_name, node))
+
+    # Step: check other node consistency
+    self.processor.LogStep(2, steps_total, "check peer consistency")
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      info("checking %s consistency on %s" % (dev.iv_name, oth_node))
+      if not _CheckDiskConsistency(self.cfg, dev, oth_node,
+                                   oth_node==instance.primary_node):
+        raise errors.OpExecError("Peer node (%s) has degraded storage, unsafe"
+                                 " to replace disks on this node (%s)" %
+                                 (oth_node, tgt_node))
+
+    # Step: create new storage
+    self.processor.LogStep(3, steps_total, "allocate new storage")
     for dev in instance.disks:
       if not dev.iv_name in self.op.disks:
         continue
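
The hunk above threads step-numbered user feedback through the disk replacement: steps_total = 6, local warning/info aliases for self.processor.LogWarning/LogInfo, and a LogStep(n, steps_total, text) call at the start of each phase. Below is a rough, self-contained sketch of that pattern; the Processor class is a hypothetical stand-in rather than Ganeti's real processor, and the two-argument warning signature is only inferred from the calls visible in this diff.

# Minimal sketch of the step-logging pattern; Processor is a hypothetical
# stand-in for the real cluster processor object.
class Processor(object):
  def LogStep(self, current, total, message):
    print("STEP %d/%d %s" % (current, total, message))

  def LogInfo(self, message):
    print("INFO %s" % message)

  def LogWarning(self, message, hint=None):
    # second argument assumed to be a manual-recovery hint, as in the diff
    print("WARNING %s" % message)
    if hint:
      print("  hint: %s" % hint)


def replace_disks_feedback_demo(processor):
  steps_total = 6
  warning, info = (processor.LogWarning, processor.LogInfo)

  processor.LogStep(1, steps_total, "check device existence")
  info("checking volume groups")
  # ... the real checks, allocation, sync and cleanup would go here ...
  processor.LogStep(6, steps_total, "removing old storage")
  warning("Can't remove old LV", "manually remove unused LVs")


replace_disks_feedback_demo(Processor())
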
......
       new_lvs = [lv_data, lv_meta]
       old_lvs = dev.children
       iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
-      logger.Info("adding new local storage on %s for %s" %
-                  (tgt_node, dev.iv_name))
+      info("creating new local storage on %s for %s" %
+           (tgt_node, dev.iv_name))
       # since we *always* want to create this LV, we use the
       # _Create...OnPrimary (which forces the creation), even if we
       # are talking about the secondary node
......
                                    " node '%s'" %
                                    (new_lv.logical_id[1], tgt_node))

+    # Step: for each lv, detach+rename*2+attach
+    self.processor.LogStep(4, steps_total, "change drbd configuration")
+    for dev, old_lvs, new_lvs in iv_names.itervalues():
+      info("detaching %s drbd from local storage" % dev.iv_name)
       if not rpc.call_blockdev_removechildren(tgt_node, dev, old_lvs):
         raise errors.OpExecError("Can't detach drbd from local storage on node"
                                  " %s for device %s" % (tgt_node, dev.iv_name))
-      dev.children = []
-      cfg.Update(instance)
+      #dev.children = []
+      #cfg.Update(instance)

       # ok, we created the new LVs, so now we know we have the needed
       # storage; as such, we proceed on the target node to rename
       # old_lv to _old, and new_lv to old_lv; note that we rename LVs
       # using the assumption than logical_id == physical_id (which in
       # turn is the unique_id on that node)
+
+      # FIXME(iustin): use a better name for the replaced LVs
       temp_suffix = int(time.time())
-      logger.Info("renaming the old LVs on the target node")
       ren_fn = lambda d, suff: (d.physical_id[0],
                                 d.physical_id[1] + "_replaced-%s" % suff)
-      rlist = [(disk, ren_fn(disk, temp_suffix)) for disk in old_lvs]
+      # build the rename list based on what LVs exist on the node
+      rlist = []
+      for to_ren in old_lvs:
+        find_res = rpc.call_blockdev_find(tgt_node, to_ren)
+        if find_res is not None: # device exists
+          rlist.append((to_ren, ren_fn(to_ren, temp_suffix)))
+
+      info("renaming the old LVs on the target node")
       if not rpc.call_blockdev_rename(tgt_node, rlist):
-        logger.Error("Can't rename old LVs on node %s" % tgt_node)
-        do_change_old = False
-      else:
-        do_change_old = True
+        raise errors.OpExecError("Can't rename old LVs on node %s" % tgt_node)
       # now we rename the new LVs to the old LVs
-      logger.Info("renaming the new LVs on the target node")
+      info("renaming the new LVs on the target node")
       rlist = [(new, old.physical_id) for old, new in zip(old_lvs, new_lvs)]
       if not rpc.call_blockdev_rename(tgt_node, rlist):
-        logger.Error("Can't rename new LVs on node %s" % tgt_node)
-      else:
-        for old, new in zip(old_lvs, new_lvs):
-          new.logical_id = old.logical_id
-          cfg.SetDiskID(new, tgt_node)
+        raise errors.OpExecError("Can't rename new LVs on node %s" % tgt_node)
+
+      for old, new in zip(old_lvs, new_lvs):
+        new.logical_id = old.logical_id
+        cfg.SetDiskID(new, tgt_node)

-      if do_change_old:
-        for disk in old_lvs:
-          disk.logical_id = ren_fn(disk, temp_suffix)
-          cfg.SetDiskID(disk, tgt_node)
+      for disk in old_lvs:
+        disk.logical_id = ren_fn(disk, temp_suffix)
+        cfg.SetDiskID(disk, tgt_node)

       # now that the new lvs have the old name, we can add them to the device
-      logger.Info("adding new mirror component on %s" % tgt_node)
+      info("adding new mirror component on %s" % tgt_node)
       if not rpc.call_blockdev_addchildren(tgt_node, dev, new_lvs):
-        logger.Error("Can't add local storage to drbd!")
         for new_lv in new_lvs:
           if not rpc.call_blockdev_remove(tgt_node, new_lv):
-            logger.Error("Can't rollback device %s")
-        return
+            warning("Can't rollback device %s", "manually cleanup unused"
+                    " logical volumes")
+        raise errors.OpExecError("Can't add local storage to drbd")

       dev.children = new_lvs
       cfg.Update(instance)

+    # Step: wait for sync

     # this can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its
     # return value
-    logger.Info("Done changing drbd configs, waiting for sync")
+    self.processor.LogStep(5, steps_total, "sync devices")
     _WaitForSync(cfg, instance, unlock=True)

     # so check manually all the devices
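
The detach/rename/attach hunk above swaps the storage underneath the DRBD device without changing the names it refers to: the surviving old LVs are renamed with a _replaced-<time_t> suffix, the freshly created LVs are renamed to the old names, and only then are they re-attached as the device's children. A minimal sketch of that rename bookkeeping follows, using plain LV name strings instead of Ganeti disk objects and a hypothetical exists() callback in place of rpc.call_blockdev_find.

# Standalone sketch of the rename-swap bookkeeping (pure data shuffling;
# the real code drives it through rpc.call_blockdev_rename and friends).
import time

def build_rename_plan(old_lvs, new_lvs, exists=lambda lv: True):
  """Return (old_renames, new_renames) as lists of (current_name, new_name)."""
  temp_suffix = int(time.time())
  ren_fn = lambda name, suff: "%s_replaced-%s" % (name, suff)
  # 1. move the old LVs that still exist out of the way, under a timestamped name
  old_renames = [(lv, ren_fn(lv, temp_suffix)) for lv in old_lvs if exists(lv)]
  # 2. give the new LVs the old names, so the DRBD device keeps the same children names
  new_renames = list(zip(new_lvs, old_lvs))
  return old_renames, new_renames

old_moves, new_moves = build_rename_plan(["disk0_data", "disk0_meta"],
                                         ["disk0_data.new", "disk0_meta.new"])
print(old_moves)  # e.g. [('disk0_data', 'disk0_data_replaced-1700000000'), ...]
print(new_moves)  # [('disk0_data.new', 'disk0_data'), ('disk0_meta.new', 'disk0_meta')]

In the actual change, both rename lists are applied via rpc.call_blockdev_rename and the disk objects' logical_id/physical_id are updated afterwards, which is why step 6 can later remove the old LVs by their _replaced-<time_t> names.
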
......
       if is_degr:
         raise errors.OpExecError("DRBD device %s is degraded!" % name)

+    # Step: remove old storage
+    self.processor.LogStep(6, steps_total, "removing old storage")
     for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
-      logger.Info("remove logical volumes for %s" % name)
+      info("remove logical volumes for %s" % name)
       for lv in old_lvs:
         cfg.SetDiskID(lv, tgt_node)
         if not rpc.call_blockdev_remove(tgt_node, lv):
-          logger.Error("Can't cleanup child device, skipping. You need to"
-                       " fix manually!")
+          warning("Can't remove old LV", "manually remove unused LVs")
           continue

   def _ExecD8Secondary(self, feedback_fn):
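
The revision also tightens the failure model: conditions that leave the replacement unusable (missing volume group, failed rename, failed attach) now raise errors.OpExecError instead of logging and returning, while best-effort cleanup in steps 4 and 6 is downgraded to a warning with a manual-recovery hint and continues. The sketch below illustrates that split; the local OpExecError and log_warning are stand-ins for errors.OpExecError and the processor's LogWarning, not the real Ganeti APIs.

# Schematic sketch of the fail-fast vs. warn-and-continue split.
class OpExecError(Exception):
  """Local stand-in for errors.OpExecError."""

def log_warning(message, hint):
  print("WARNING %s (%s)" % (message, hint))

def finish_replace(rename_ok, old_lvs, remove_lv):
  if not rename_ok:
    # hard failure: abort the whole operation with an exception
    raise OpExecError("Can't rename old LVs on the target node")
  for lv in old_lvs:
    if not remove_lv(lv):
      # best-effort cleanup: warn with a recovery hint and keep going
      log_warning("Can't remove old LV %s" % lv, "manually remove unused LVs")
      continue

finish_replace(True, ["disk0_data", "disk0_meta"], lambda lv: lv != "disk0_meta")
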
