Revision 0834c866 lib/cmdlib.py
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1051,7 +1051,8 @@
         logger.ToStderr("Can't compute data for node %s/%s" %
                         (node, instance.disks[i].iv_name))
         continue
-      perc_done, est_time, is_degraded = mstat
+      # we ignore the ldisk parameter
+      perc_done, est_time, is_degraded, _ = mstat
       cumul_degraded = cumul_degraded or (is_degraded and perc_done is None)
       if perc_done is not None:
         done = False
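For context, the mirror-status entry unpacked above now carries a fourth field, the local-disk (ldisk) status, which _WaitForSync deliberately ignores. A tiny sketch with an invented status tuple:

```python
# Invented example values; the field order (perc_done, est_time,
# is_degraded, ldisk) follows the unpacking in the hunk above.
mstat = (80.0, 120, True, False)
perc_done, est_time, is_degraded, _ = mstat  # the new ldisk field is discarded here
print(perc_done, is_degraded)
```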
@@ -1078,11 +1079,19 @@
   return not cumul_degraded
 
 
-def _CheckDiskConsistency(cfgw, dev, node, on_primary):
+def _CheckDiskConsistency(cfgw, dev, node, on_primary, ldisk=False):
   """Check that mirrors are not degraded.
 
+  The ldisk parameter, if True, will change the test from the
+  is_degraded attribute (which represents overall non-ok status for
+  the device(s)) to the ldisk (representing the local storage status).
+
   """
   cfgw.SetDiskID(dev, node)
+  if ldisk:
+    idx = 6
+  else:
+    idx = 5
 
   result = True
   if on_primary or dev.AssembleOnSecondary():
@@ -1091,7 +1100,7 @@
       logger.ToStderr("Can't get any data from node %s" % node)
       result = False
     else:
-      result = result and (not rstats[5])
+      result = result and (not rstats[idx])
   if dev.children:
     for child in dev.children:
       result = result and _CheckDiskConsistency(cfgw, child, node, on_primary)
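The new ldisk flag only changes which field of the block-device status result is tested: index 5 (the overall is_degraded flag) by default, index 6 (the local-storage status) when ldisk=True. A minimal sketch of that selection, using a made-up status tuple rather than the real RPC result:

```python
# Illustrative only: the status tuple is invented; just the index
# convention (5 = is_degraded, 6 = local-disk status) is taken from the
# idx selection in _CheckDiskConsistency above.
def device_ok(status, ldisk=False):
  """Return True if the field selected by `ldisk` reports a healthy device."""
  if ldisk:
    idx = 6  # local storage status only
  else:
    idx = 5  # overall is_degraded flag
  return not status[idx]

# A device whose mirror is degraded (still syncing) but whose local disk is fine:
status = ("name", "major:minor", "sync", 80.0, 120, True, False)
print(device_ok(status))              # False: overall state is degraded
print(device_ok(status, ldisk=True))  # True: local storage itself is OK
```

This is what step 2 of the new secondary-replacement code relies on: with ldisk=True a primary that is merely out of sync still passes, but one whose own storage is broken does not.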
@@ -3360,8 +3369,12 @@
       "OLD_SECONDARY": self.instance.secondary_nodes[0],
       }
     env.update(_BuildInstanceHookEnvByObject(self.instance))
-    nl = [self.sstore.GetMasterNode(),
-          self.instance.primary_node] + list(self.instance.secondary_nodes)
+    nl = [
+      self.sstore.GetMasterNode(),
+      self.instance.primary_node,
+      ]
+    if self.op.remote_node is not None:
+      nl.append(self.op.remote_node)
     return env, nl, nl
 
   def CheckPrereq(self):
@@ -3401,8 +3414,13 @@
       raise errors.OpPrereqError("The specified node is the primary node of"
                                  " the instance.")
     elif remote_node == self.sec_node:
+      if self.op.mode == constants.REPLACE_DISK_SEC:
+        # this is for DRBD8, where we can't execute the same mode of
+        # replacement as for drbd7 (no different port allocated)
+        raise errors.OpPrereqError("Same secondary given, cannot execute"
+                                   " replacement")
       # the user gave the current secondary, switch to
-      # 'no-replace-secondary' mode
+      # 'no-replace-secondary' mode for drbd7
       remote_node = None
     if (instance.disk_template == constants.DT_REMOTE_RAID1 and
         self.op.mode != constants.REPLACE_DISK_ALL):
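The comment explains why the same-secondary case is now fatal: with DRBD8 no different port can be allocated for the replacement, so the operation cannot run against the current secondary, whereas the drbd7 path simply falls back to 'no-replace-secondary' mode. A standalone sketch of that decision (the helper, constant and exception below are illustrative, not the Ganeti API):

```python
# Illustrative sketch of the CheckPrereq decision above; names are invented.
REPLACE_DISK_SEC = "replace_secondary"

def resolve_remote_node(remote_node, primary, secondary, mode):
  if remote_node == primary:
    raise ValueError("The specified node is the primary node of the instance.")
  if remote_node == secondary:
    if mode == REPLACE_DISK_SEC:
      # DRBD8: the current secondary cannot be reused, since no different
      # port can be allocated for the replacement device
      raise ValueError("Same secondary given, cannot execute replacement")
    # drbd7: the user gave the current secondary, switch to
    # 'no-replace-secondary' mode
    return None
  return remote_node

print(resolve_remote_node("node3", "node1", "node2", "replace_all"))  # node3
print(resolve_remote_node("node2", "node1", "node2", "replace_all"))  # None
```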
@@ -3717,7 +3735,10 @@
       - remove all disks from the old secondary
 
     Failures are not very well handled.
+
     """
+    steps_total = 6
+    warning, info = (self.processor.LogWarning, self.processor.LogInfo)
     instance = self.instance
     iv_names = {}
     vgname = self.cfg.GetVGName()
@@ -3726,10 +3747,44 @@
     old_node = self.tgt_node
     new_node = self.new_node
     pri_node = instance.primary_node
+
+    # Step: check device activation
+    self.processor.LogStep(1, steps_total, "check device existence")
+    info("checking volume groups")
+    my_vg = cfg.GetVGName()
+    results = rpc.call_vg_list([pri_node, new_node])
+    if not results:
+      raise errors.OpExecError("Can't list volume groups on the nodes")
+    for node in pri_node, new_node:
+      res = results.get(node, False)
+      if not res or my_vg not in res:
+        raise errors.OpExecError("Volume group '%s' not found on %s" %
+                                 (my_vg, node))
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      info("checking %s on %s" % (dev.iv_name, pri_node))
+      cfg.SetDiskID(dev, pri_node)
+      if not rpc.call_blockdev_find(pri_node, dev):
+        raise errors.OpExecError("Can't find device %s on node %s" %
+                                 (dev.iv_name, pri_node))
+
+    # Step: check other node consistency
+    self.processor.LogStep(2, steps_total, "check peer consistency")
+    for dev in instance.disks:
+      if not dev.iv_name in self.op.disks:
+        continue
+      info("checking %s consistency on %s" % (dev.iv_name, pri_node))
+      if not _CheckDiskConsistency(self.cfg, dev, pri_node, True, ldisk=True):
+        raise errors.OpExecError("Primary node (%s) has degraded storage,"
+                                 " unsafe to replace the secondary" %
+                                 pri_node)
+
+    # Step: create new storage
+    self.processor.LogStep(3, steps_total, "allocate new storage")
     for dev in instance.disks:
       size = dev.size
-      logger.Info("adding new local storage on %s for %s" %
-                  (new_node, dev.iv_name))
+      info("adding new local storage on %s for %s" % (new_node, dev.iv_name))
       # since we *always* want to create this LV, we use the
       # _Create...OnPrimary (which forces the creation), even if we
       # are talking about the secondary node
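Steps 1-3 above establish the progress-reporting pattern used through the rest of the method: LogStep(current, total, message) marks each of the six phases, while warning and info are local aliases for the processor's LogWarning and LogInfo. Judging from the call sites in this revision, LogWarning takes an optional second hint argument; the toy processor below only sketches that behaviour and is not the real class:

```python
# Toy stand-in; method behaviour is inferred from the call sites in this
# revision, not taken from the real Ganeti processor implementation.
class DemoProcessor(object):
  def LogStep(self, current, total, message):
    print("STEP %d/%d %s" % (current, total, message))

  def LogInfo(self, message):
    print("  info: %s" % message)

  def LogWarning(self, message, hint=None):
    print("  warning: %s" % message)
    if hint:
      print("  hint: %s" % hint)

processor = DemoProcessor()
steps_total = 6
warning, info = (processor.LogWarning, processor.LogInfo)

processor.LogStep(1, steps_total, "check device existence")
info("checking volume groups")
warning("Failed to shutdown drbd for sda on old node",
        "Please cleanup this device manually as soon as possible")
```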
@@ -3740,6 +3795,12 @@
                                  " node '%s'" %
                                  (new_lv.logical_id[1], new_node))
 
+      iv_names[dev.iv_name] = (dev, dev.children)
+
+    self.processor.LogStep(4, steps_total, "changing drbd configuration")
+    for dev in instance.disks:
+      size = dev.size
+      info("activating a new drbd on %s for %s" % (new_node, dev.iv_name))
       # create new devices on new_node
       new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
                               logical_id=(pri_node, new_node,
@@ -3751,31 +3812,34 @@
         raise errors.OpExecError("Failed to create new DRBD on"
                                  " node '%s'" % new_node)
 
+    for dev in instance.disks:
       # we have new devices, shutdown the drbd on the old secondary
+      info("shutting down drbd for %s on old node" % dev.iv_name)
       cfg.SetDiskID(dev, old_node)
       if not rpc.call_blockdev_shutdown(old_node, dev):
-        raise errors.OpExecError("Failed to shutdown DRBD on old node")
+        warning("Failed to shutdown drbd for %s on old node" % dev.iv_name,
+                "Please cleanup this device manuall as soon as possible")
 
       # we have new storage, we 'rename' the network on the primary
+      info("switching primary drbd for %s to new secondary node" % dev.iv_name)
       cfg.SetDiskID(dev, pri_node)
       # rename to the ip of the new node
       new_uid = list(dev.physical_id)
       new_uid[2] = self.remote_node_info.secondary_ip
       rlist = [(dev, tuple(new_uid))]
       if not rpc.call_blockdev_rename(pri_node, rlist):
-        raise errors.OpExecError("Can't detach re-attach drbd %s on node"
+        raise errors.OpExecError("Can't detach & re-attach drbd %s on node"
                                  " %s from %s to %s" %
                                  (dev.iv_name, pri_node, old_node, new_node))
       dev.logical_id = (pri_node, new_node, dev.logical_id[2])
       cfg.SetDiskID(dev, pri_node)
       cfg.Update(instance)
 
-      iv_names[dev.iv_name] = (dev, dev.children)
 
     # this can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its
     # return value
-    logger.Info("Done changing drbd configs, waiting for sync")
+    self.processor.LogStep(5, steps_total, "sync devices")
     _WaitForSync(cfg, instance, unlock=True)
 
     # so check manually all the devices
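The 'rename' in step 4 re-points the primary's DRBD network endpoint at the new secondary by rewriting the IP field of the device's physical id (index 2, as the assignment to new_uid[2] shows) and passing (device, new id) pairs to the rename call. A sketch with invented values; only the index-2 convention comes from the code above:

```python
# Invented addresses and device name; only "index 2 holds the peer IP"
# is taken from the new_uid[2] assignment in the hunk above.
physical_id = ("198.51.100.11", 11000, "198.51.100.12", 11000)
new_secondary_ip = "198.51.100.13"

new_uid = list(physical_id)
new_uid[2] = new_secondary_ip       # switch the peer address to the new node
rlist = [("sda", tuple(new_uid))]   # (device, new physical id) pairs
print(rlist)
```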
@@ -3785,14 +3849,14 @@
       if is_degr:
         raise errors.OpExecError("DRBD device %s is degraded!" % name)
 
+    self.processor.LogStep(6, steps_total, "removing old storage")
     for name, (dev, old_lvs) in iv_names.iteritems():
-      logger.Info("remove logical volumes for %s" % name)
+      info("remove logical volumes for %s" % name)
       for lv in old_lvs:
         cfg.SetDiskID(lv, old_node)
         if not rpc.call_blockdev_remove(old_node, lv):
-          logger.Error("Can't cleanup child device, skipping. You need to"
-                       " fix manually!")
-          continue
+          warning("Can't remove LV on old secondary",
+                  "Cleanup stale volumes by hand")
 
   def Exec(self, feedback_fn):
     """Execute disk replacement.