From fbafd7a864cc1c47587f6c4746589d07847b61ae Mon Sep 17 00:00:00 2001 From: Iustin Pop Date: Wed, 3 Jun 2009 14:20:03 +0200 Subject: [PATCH] Wait for a while in failed resyncs This patch is an attempt at fixing some very rare occurrences of messages like: - "There are some degraded disks for this instance", or: - "Cannot resync disks on node node3.example.com: [True, 100]" What I believe happens is that DRBD has finished syncing, but not all of its status fields have yet been updated to reflect the 'Connected' state; maybe it's in WFBitmap[ST], or in some other transient state we don't handle well. The patch will change the _WaitForSync method to recheck up to a hardcoded number of times if we're finished syncing but we're degraded (using the same condition as the 'break' clause of the loop). The downside of this change is that a genuinely degraded instance (due to a network or disk failure) will incur an extra delay before the operation aborts. For this, I'm happy to choose other values. A better, long-term fix is to handle more DRBD states correctly (see the bdev.DRBD8Status class). 
Signed-off-by: Iustin Pop Reviewed-by: Guido Trotter --- lib/cmdlib.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/cmdlib.py b/lib/cmdlib.py index e491163..87f1452 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -1608,6 +1608,7 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False): lu.cfg.SetDiskID(dev, node) retries = 0 + degr_retries = 10 # in seconds, as we sleep 1 second each time while True: max_time = 0 done = True @@ -1640,6 +1641,16 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False): rem_time = "no time estimate" lu.proc.LogInfo("- device %s: %5.2f%% done, %s" % (instance.disks[i].iv_name, perc_done, rem_time)) + + # if we're done but degraded, let's do a few small retries, to + # make sure we see a stable and not transient situation; therefore + # we force restart of the loop + if (done or oneshot) and cumul_degraded and degr_retries > 0: + logging.info("Degraded disks found, %d retries left", degr_retries) + degr_retries -= 1 + time.sleep(1) + continue + if done or oneshot: break -- 1.7.10.4