Revision cff90b79
b/lib/cmdlib.py
 old |  new |
-----|------|-------------------------------------------------------------------
3419 | 3419 |                                    " the secondary while doing a primary"
3420 | 3420 |                                    " node disk replacement")
3421 | 3421 |         self.tgt_node = instance.primary_node
     | 3422 |         self.oth_node = instance.secondary_nodes[0]
3422 | 3423 |       elif self.op.mode == constants.REPLACE_DISK_SEC:
3423 | 3424 |         self.new_node = remote_node # this can be None, in which case
3424 | 3425 |                                     # we don't change the secondary
3425 | 3426 |         self.tgt_node = instance.secondary_nodes[0]
     | 3427 |         self.oth_node = instance.primary_node
3426 | 3428 |       else:
3427 | 3429 |         raise errors.ProgrammerError("Unhandled disk replace mode")
3428 | 3430 |
 ... |  ... |
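The hunk above extends the prerequisite check so that each replacement mode also records the *other* node of the DRBD8 pair: when replacing disks on the primary, oth_node is the secondary, and vice versa; the peer-consistency check added later in this revision runs against that node. A minimal standalone sketch of the pairing, with made-up mode strings and node names standing in for the real constants and instance attributes:

# Sketch only (not part of the revision): how tgt_node/oth_node pair up
# for the two DRBD8 replacement modes shown in the hunk above.
def pick_nodes(mode, primary, secondary):
  if mode == "replace-primary":        # cf. constants.REPLACE_DISK_PRI
    return primary, secondary          # replace disks here, check the peer there
  elif mode == "replace-secondary":    # cf. constants.REPLACE_DISK_SEC
    return secondary, primary
  else:
    raise ValueError("Unhandled disk replace mode")

tgt_node, oth_node = pick_nodes("replace-primary", "node1.example.com", "node2.example.com")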
3543 | 3545 |       - remove old LVs (which have the name name_replaces.<time_t>)
3544 | 3546 |
3545 | 3547 |     Failures are not very well handled.
     | 3548 |
3546 | 3549 |     """
     | 3550 |     steps_total = 6
     | 3551 |     warning, info = (self.processor.LogWarning, self.processor.LogInfo)
3547 | 3552 |     instance = self.instance
3548 | 3553 |     iv_names = {}
3549 | 3554 |     vgname = self.cfg.GetVGName()
3550 | 3555 |     # start of work
3551 | 3556 |     cfg = self.cfg
3552 | 3557 |     tgt_node = self.tgt_node
     | 3558 |     oth_node = self.oth_node
     | 3559 |
     | 3560 |     # Step: check device activation
     | 3561 |     self.processor.LogStep(1, steps_total, "check device existence")
     | 3562 |     info("checking volume groups")
     | 3563 |     my_vg = cfg.GetVGName()
     | 3564 |     results = rpc.call_vg_list([oth_node, tgt_node])
     | 3565 |     if not results:
     | 3566 |       raise errors.OpExecError("Can't list volume groups on the nodes")
     | 3567 |     for node in oth_node, tgt_node:
     | 3568 |       res = results.get(node, False)
     | 3569 |       if not res or my_vg not in res:
     | 3570 |         raise errors.OpExecError("Volume group '%s' not found on %s" %
     | 3571 |                                  (my_vg, node))
     | 3572 |     for dev in instance.disks:
     | 3573 |       if not dev.iv_name in self.op.disks:
     | 3574 |         continue
     | 3575 |       for node in tgt_node, oth_node:
     | 3576 |         info("checking %s on %s" % (dev.iv_name, node))
     | 3577 |         cfg.SetDiskID(dev, node)
     | 3578 |         if not rpc.call_blockdev_find(node, dev):
     | 3579 |           raise errors.OpExecError("Can't find device %s on node %s" %
     | 3580 |                                    (dev.iv_name, node))
     | 3581 |
     | 3582 |     # Step: check other node consistency
     | 3583 |     self.processor.LogStep(2, steps_total, "check peer consistency")
     | 3584 |     for dev in instance.disks:
     | 3585 |       if not dev.iv_name in self.op.disks:
     | 3586 |         continue
     | 3587 |       info("checking %s consistency on %s" % (dev.iv_name, oth_node))
     | 3588 |       if not _CheckDiskConsistency(self.cfg, dev, oth_node,
     | 3589 |                                    oth_node==instance.primary_node):
     | 3590 |         raise errors.OpExecError("Peer node (%s) has degraded storage, unsafe"
     | 3591 |                                  " to replace disks on this node (%s)" %
     | 3592 |                                  (oth_node, tgt_node))
     | 3593 |
     | 3594 |     # Step: create new storage
     | 3595 |     self.processor.LogStep(3, steps_total, "allocate new storage")
3553 | 3596 |     for dev in instance.disks:
3554 | 3597 |       if not dev.iv_name in self.op.disks:
3555 | 3598 |         continue
 ... |  ... |
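The bulk of this hunk introduces the step framework for the rewritten method: steps_total = 6 and self.processor.LogStep(n, steps_total, ...) bracket each phase, and the new pre-checks (volume groups, device lookup, peer consistency) abort with OpExecError before any storage is touched. For orientation, the six phases as named by the LogStep() calls and comments in this revision (a plain summary sketch, not code from the patch):

# Summary sketch (not part of the revision): the six phases of
# _ExecD8DiskOnly as introduced by this patch, in execution order.
STEPS = [
  (1, "check device existence"),    # vg list + blockdev_find on both nodes
  (2, "check peer consistency"),    # _CheckDiskConsistency on the other node
  (3, "allocate new storage"),      # create new data/meta LVs on tgt_node
  (4, "change drbd configuration"), # detach old LVs, rename old/new, re-attach
  (5, "sync devices"),              # _WaitForSync, then per-device degradation check
  (6, "removing old storage"),      # blockdev_remove the renamed old LVs
]
for num, name in STEPS:
  print("STEP %d/%d %s" % (num, len(STEPS), name))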
3564 | 3607 |       new_lvs = [lv_data, lv_meta]
3565 | 3608 |       old_lvs = dev.children
3566 | 3609 |       iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
3567 |      |       logger.Info("adding new local storage on %s for %s" %
3568 |      |                   (tgt_node, dev.iv_name))
     | 3610 |       info("creating new local storage on %s for %s" %
     | 3611 |            (tgt_node, dev.iv_name))
3569 | 3612 |       # since we *always* want to create this LV, we use the
3570 | 3613 |       # _Create...OnPrimary (which forces the creation), even if we
3571 | 3614 |       # are talking about the secondary node
 ... |  ... |
3576 | 3619 |                                    " node '%s'" %
3577 | 3620 |                                    (new_lv.logical_id[1], tgt_node))
3578 | 3621 |
     | 3622 |     # Step: for each lv, detach+rename*2+attach
     | 3623 |     self.processor.LogStep(4, steps_total, "change drbd configuration")
     | 3624 |     for dev, old_lvs, new_lvs in iv_names.itervalues():
     | 3625 |       info("detaching %s drbd from local storage" % dev.iv_name)
3579 | 3626 |       if not rpc.call_blockdev_removechildren(tgt_node, dev, old_lvs):
3580 | 3627 |         raise errors.OpExecError("Can't detach drbd from local storage on node"
3581 | 3628 |                                  " %s for device %s" % (tgt_node, dev.iv_name))
3582 |      |       dev.children = []
3583 |      |       cfg.Update(instance)
     | 3629 |       #dev.children = []
     | 3630 |       #cfg.Update(instance)
3584 | 3631 |
3585 | 3632 |       # ok, we created the new LVs, so now we know we have the needed
3586 | 3633 |       # storage; as such, we proceed on the target node to rename
3587 | 3634 |       # old_lv to _old, and new_lv to old_lv; note that we rename LVs
3588 | 3635 |       # using the assumption than logical_id == physical_id (which in
3589 | 3636 |       # turn is the unique_id on that node)
     | 3637 |
     | 3638 |       # FIXME(iustin): use a better name for the replaced LVs
3590 | 3639 |       temp_suffix = int(time.time())
3591 |      |       logger.Info("renaming the old LVs on the target node")
3592 | 3640 |       ren_fn = lambda d, suff: (d.physical_id[0],
3593 | 3641 |                                 d.physical_id[1] + "_replaced-%s" % suff)
3594 |      |       rlist = [(disk, ren_fn(disk, temp_suffix)) for disk in old_lvs]
     | 3642 |       # build the rename list based on what LVs exist on the node
     | 3643 |       rlist = []
     | 3644 |       for to_ren in old_lvs:
     | 3645 |         find_res = rpc.call_blockdev_find(tgt_node, to_ren)
     | 3646 |         if find_res is not None: # device exists
     | 3647 |           rlist.append((to_ren, ren_fn(to_ren, temp_suffix)))
     | 3648 |
     | 3649 |       info("renaming the old LVs on the target node")
3595 | 3650 |       if not rpc.call_blockdev_rename(tgt_node, rlist):
3596 |      |         logger.Error("Can't rename old LVs on node %s" % tgt_node)
3597 |      |         do_change_old = False
3598 |      |       else:
3599 |      |         do_change_old = True
     | 3651 |         raise errors.OpExecError("Can't rename old LVs on node %s" % tgt_node)
3600 | 3652 |       # now we rename the new LVs to the old LVs
3601 |      |       logger.Info("renaming the new LVs on the target node")
     | 3653 |       info("renaming the new LVs on the target node")
3602 | 3654 |       rlist = [(new, old.physical_id) for old, new in zip(old_lvs, new_lvs)]
3603 | 3655 |       if not rpc.call_blockdev_rename(tgt_node, rlist):
3604 |      |         logger.Error("Can't rename new LVs on node %s" % tgt_node)
3605 |      |       else:
3606 |      |         for old, new in zip(old_lvs, new_lvs):
3607 |      |           new.logical_id = old.logical_id
3608 |      |           cfg.SetDiskID(new, tgt_node)
     | 3656 |         raise errors.OpExecError("Can't rename new LVs on node %s" % tgt_node)
     | 3657 |
     | 3658 |       for old, new in zip(old_lvs, new_lvs):
     | 3659 |         new.logical_id = old.logical_id
     | 3660 |         cfg.SetDiskID(new, tgt_node)
3609 | 3661 |
3610 |      |       if do_change_old:
3611 |      |         for disk in old_lvs:
3612 |      |           disk.logical_id = ren_fn(disk, temp_suffix)
3613 |      |           cfg.SetDiskID(disk, tgt_node)
     | 3662 |       for disk in old_lvs:
     | 3663 |         disk.logical_id = ren_fn(disk, temp_suffix)
     | 3664 |         cfg.SetDiskID(disk, tgt_node)
3614 | 3665 |
3615 | 3666 |       # now that the new lvs have the old name, we can add them to the device
3616 |      |       logger.Info("adding new mirror component on %s" % tgt_node)
     | 3667 |       info("adding new mirror component on %s" % tgt_node)
3617 | 3668 |       if not rpc.call_blockdev_addchildren(tgt_node, dev, new_lvs):
3618 |      |         logger.Error("Can't add local storage to drbd!")
3619 | 3669 |         for new_lv in new_lvs:
3620 | 3670 |           if not rpc.call_blockdev_remove(tgt_node, new_lv):
3621 |      |             logger.Error("Can't rollback device %s")
3622 |      |         return
     | 3671 |             warning("Can't rollback device %s", "manually cleanup unused"
     | 3672 |                     " logical volumes")
     | 3673 |         raise errors.OpExecError("Can't add local storage to drbd")
3623 | 3674 |
3624 | 3675 |       dev.children = new_lvs
3625 | 3676 |       cfg.Update(instance)
3626 | 3677 |
     | 3678 |     # Step: wait for sync
3627 | 3679 |
3628 | 3680 |     # this can fail as the old devices are degraded and _WaitForSync
3629 | 3681 |     # does a combined result over all disks, so we don't check its
3630 | 3682 |     # return value
3631 |      |     logger.Info("Done changing drbd configs, waiting for sync")
     | 3683 |     self.processor.LogStep(5, steps_total, "sync devices")
3632 | 3684 |     _WaitForSync(cfg, instance, unlock=True)
3633 | 3685 |
3634 | 3686 |     # so check manually all the devices
 ... |  ... |
3638 | 3690 |       if is_degr:
3639 | 3691 |         raise errors.OpExecError("DRBD device %s is degraded!" % name)
3640 | 3692 |
     | 3693 |     # Step: remove old storage
     | 3694 |     self.processor.LogStep(6, steps_total, "removing old storage")
3641 | 3695 |     for name, (dev, old_lvs, new_lvs) in iv_names.iteritems():
3642 |      |       logger.Info("remove logical volumes for %s" % name)
     | 3696 |       info("remove logical volumes for %s" % name)
3643 | 3697 |       for lv in old_lvs:
3644 | 3698 |         cfg.SetDiskID(lv, tgt_node)
3645 | 3699 |         if not rpc.call_blockdev_remove(tgt_node, lv):
3646 |      |           logger.Error("Can't cleanup child device, skipping. You need to"
3647 |      |                        " fix manually!")
     | 3700 |           warning("Can't remove old LV", "manually remove unused LVs")
3648 | 3701 |           continue
3649 | 3702 |
3650 | 3703 |   def _ExecD8Secondary(self, feedback_fn):
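Besides the step numbering, the substantive change in the last hunk is the rename handling: the old code renamed every old LV and merely logged failures (the removed do_change_old bookkeeping), while the new code builds the rename list only from the LVs that rpc.call_blockdev_find reports present and raises OpExecError on any failure, so a run that stopped half-way can be retried. A condensed, self-contained sketch of that flow; find_lv and rename_lvs are hypothetical stand-ins for the blockdev RPCs, and LV names are plain strings rather than disk objects:

import time

# Sketch only (not the revision's code): the rename half of the
# detach/rename*2/attach dance, under the assumptions stated above.
def swap_lvs(find_lv, rename_lvs, old_lvs, new_lvs):
  suffix = int(time.time())
  # rename only the old LVs that still exist, so a run that failed part-way
  # through can be repeated without tripping over already-renamed volumes
  rlist = [(lv, "%s_replaced-%s" % (lv, suffix))
           for lv in old_lvs if find_lv(lv) is not None]
  if not rename_lvs(rlist):
    raise RuntimeError("Can't rename old LVs")
  # give the freshly created LVs the names the DRBD device expects
  if not rename_lvs(list(zip(new_lvs, old_lvs))):
    raise RuntimeError("Can't rename new LVs")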