Add early release of locks/storage for disk replace
author Iustin Pop <iustin@google.com>
Tue, 9 Feb 2010 08:57:50 +0000 (09:57 +0100)
committer Iustin Pop <iustin@google.com>
Tue, 9 Feb 2010 10:04:07 +0000 (11:04 +0100)
This patch adds an early_release parameter to the OpReplaceDisks and
OpEvacuateNode opcodes, allowing earlier release of storage and, more
importantly, of internal Ganeti locks.

With early release, any locks and storage on all secondary nodes are
released early. This applies both to change secondary (where we remove
the storage on the old secondary and release the locks on the old and
new secondary) and to replace on secondary (where we remove the old
storage and release the lock on the secondary node).

Using this, on a three-node setup:

- instance1 on nodes A:B
- instance2 on nodes C:B

It is possible to run replace-disks -s (on secondary) for instances 1
and 2 in parallel.
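
For example (a hypothetical invocation; all three options appear in the
synopsis changes below, and --submit queues the jobs so they can run
concurrently):

  gnt-instance replace-disks -s --early-release --submit instance1
  gnt-instance replace-disks -s --early-release --submit instance2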

Replace on primary will remove the storage, but not the locks, as we use
the primary node later in the LU to check consistency.

It is debatable whether to also release the locks on the primary node,
which would make replace-disks hold zero locks during the sync. While
this would allow greatly enhanced parallelism, let's first see how the
removal of secondary locks works.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>

lib/cli.py
lib/cmdlib.py
lib/opcodes.py
man/gnt-instance.sgml
man/gnt-node.sgml
scripts/gnt-instance
tools/burnin

diff --git a/lib/cli.py b/lib/cli.py
index d9c2dbb..523cab7 100644
@@ -56,6 +56,7 @@ __all__ = [
   "DISK_OPT",
   "DISK_TEMPLATE_OPT",
   "DRAINED_OPT",
+  "EARLY_RELEASE_OPT",
   "ENABLED_HV_OPT",
   "ERROR_CODES_OPT",
   "FIELDS_OPT",
@@ -837,6 +838,12 @@ SHUTDOWN_TIMEOUT_OPT = cli_option("--shutdown-timeout",
                          default=constants.DEFAULT_SHUTDOWN_TIMEOUT,
                          help="Maximum time to wait for instance shutdown")
 
+EARLY_RELEASE_OPT = cli_option("--early-release",
+                               dest="early_release", default=False,
+                               action="store_true",
+                               help="Release the locks on the secondary"
+                               " node(s) early")
+
 
 def _ParseArgs(argv, commands, aliases):
   """Parser for the command line arguments.
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index cb922eb..5cf76ad 100644
@@ -6332,6 +6332,8 @@ class LUReplaceDisks(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
                                   self.op.iallocator)
@@ -6363,7 +6365,7 @@ class LUReplaceDisks(LogicalUnit):
 
     self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
                                    self.op.iallocator, self.op.remote_node,
-                                   self.op.disks, False)
+                                   self.op.disks, False, self.op.early_release)
 
     self.tasklets = [self.replacer]
 
@@ -6410,6 +6412,8 @@ class LUEvacuateNode(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG,
                                   self.op.remote_node,
@@ -6456,7 +6460,7 @@ class LUEvacuateNode(LogicalUnit):
 
       replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG,
                                 self.op.iallocator, self.op.remote_node, [],
-                                True)
+                                True, self.op.early_release)
       tasklets.append(replacer)
 
     self.tasklets = tasklets
@@ -6498,7 +6502,7 @@ class TLReplaceDisks(Tasklet):
 
   """
   def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
-               disks, delay_iallocator):
+               disks, delay_iallocator, early_release):
     """Initializes this class.
 
     """
@@ -6511,6 +6515,7 @@ class TLReplaceDisks(Tasklet):
     self.remote_node = remote_node
     self.disks = disks
     self.delay_iallocator = delay_iallocator
+    self.early_release = early_release
 
     # Runtime data
     self.instance = None
@@ -6853,6 +6858,10 @@ class TLReplaceDisks(Tasklet):
           self.lu.LogWarning("Can't remove old LV: %s" % msg,
                              hint="remove unused LVs manually")
 
+  def _ReleaseNodeLock(self, node_name):
+    """Releases the lock for a given node."""
+    self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
+
   def _ExecDrbd8DiskOnly(self, feedback_fn):
     """Replace a disk on the primary or secondary for DRBD 8.
 
@@ -6963,18 +6972,31 @@ class TLReplaceDisks(Tasklet):
 
       self.cfg.Update(self.instance, feedback_fn)
 
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      # only release the lock if we're doing secondary replace, since
+      # we use the primary node later
+      if self.target_node != self.instance.primary_node:
+        self._ReleaseNodeLock(self.target_node)
+
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
 
   def _ExecDrbd8Secondary(self, feedback_fn):
     """Replace the secondary node for DRBD 8.
@@ -7108,19 +7130,27 @@ class TLReplaceDisks(Tasklet):
                            to_node, msg,
                            hint=("please do a gnt-instance info to see the"
                                  " status of disks"))
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      self._ReleaseNodeLock([self.target_node, self.new_node])
 
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      self._RemoveOldStorage(self.target_node, iv_names)
 
 
 class LURepairNodeStorage(NoHooksLU):
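
The step renumbering above condenses into a standalone sketch (plain
Python, not Ganeti code; the callables stand in for the tasklet's own
helpers):

  def replace_disks_steps(early_release, target_is_primary, log_step,
                          remove_old_storage, release_node_lock,
                          wait_for_sync, check_devices):
      steps_total = 6
      cstep = 5
      if early_release:
          # old storage is removed before the resync even starts
          log_step(cstep, steps_total, "Removing old storage")
          cstep += 1
          remove_old_storage()
          if not target_is_primary:
              # keep the primary lock: that node is used again below
              release_node_lock()
      log_step(cstep, steps_total, "Sync devices")
      cstep += 1
      wait_for_sync()
      check_devices()
      if not early_release:
          # classic behaviour: remove the old storage only after sync
          log_step(cstep, steps_total, "Removing old storage")
          remove_old_storage()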
diff --git a/lib/opcodes.py b/lib/opcodes.py
index 906542e..3aed41e 100644
@@ -419,7 +419,7 @@ class OpEvacuateNode(OpCode):
   OP_ID = "OP_NODE_EVACUATE"
   OP_DSC_FIELD = "node_name"
   __slots__ = [
-    "node_name", "remote_node", "iallocator",
+    "node_name", "remote_node", "iallocator", "early_release",
     ]
 
 
@@ -509,6 +509,7 @@ class OpReplaceDisks(OpCode):
   OP_DSC_FIELD = "instance_name"
   __slots__ = [
     "instance_name", "remote_node", "mode", "disks", "iallocator",
+    "early_release",
     ]
 
 
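
Callers then just fill in the new slot; a minimal sketch of building the
opcode (the mode constant and empty disks list mirror the
scripts/gnt-instance and tools/burnin changes below):

  from ganeti import constants, opcodes

  op = opcodes.OpReplaceDisks(instance_name="instance1",
                              mode=constants.REPLACE_DISK_SEC,
                              disks=[], remote_node=None, iallocator=None,
                              early_release=True)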
diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml
index 2cbb67e..64ec961 100644
@@ -1828,6 +1828,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <arg choice="req">-p</arg>
           <arg>--disks <replaceable>idx</replaceable></arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1836,6 +1837,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <arg choice="req">-s</arg>
           <arg>--disks <replaceable>idx</replaceable></arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1844,6 +1846,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <group choice="req">
             <arg>--iallocator <replaceable>name</replaceable></arg>
             <arg>--new-secondary <replaceable>NODE</replaceable></arg>
@@ -1855,6 +1858,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <arg choice="req">--auto</arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
         </cmdsynopsis>
@@ -1906,6 +1910,19 @@ instance5: 11225
         </para>
 
         <para>
+          The <option>--early-release</option> option changes the
+          behaviour so that the old storage on the secondary node(s)
+          is removed early (before the resync is completed) and the
+          internal Ganeti locks for the current (and new, if any)
+          secondary node are also released, thus allowing more
+          parallelism in the cluster operation. This should be used
+          only when recovering from a disk failure on the current
+          secondary (thus the old storage is already broken) or when
+          the storage on the primary node is known to be fine (thus
+          we won't need the old storage for potential recovery).
+        </para>
+
+        <para>
           Note that it is not possible to select an offline or drained
           node as a new secondary.
         </para>
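
A usage example in the style of the man page's existing samples would be
(hypothetical instance name):

  # gnt-instance replace-disks --early-release -s instance1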
diff --git a/man/gnt-node.sgml b/man/gnt-node.sgml
index b16a4a7..c1980f2 100644
       <cmdsynopsis>
         <command>evacuate</command>
         <arg>-f</arg>
+        <arg>--early-release</arg>
         <group>
           <arg>--iallocator <replaceable>NAME</replaceable></arg>
           <arg>--new-secondary <replaceable>destination_node</replaceable></arg>
       </para>
 
       <para>
+        The <option>--early-release</option> option changes the
+        behaviour so that the old storage on the node being evacuated
+        is removed early (before the resync is completed) and the
+        internal Ganeti locks are also released for both the current
+        and the new secondary node, thus allowing more parallelism in
+        the cluster operation. This should be used only when
+        recovering from a disk failure on the current secondary (thus
+        the old storage is already broken) or when the storage on the
+        primary node is known to be fine (thus we won't need the old
+        storage for potential recovery).
+      </para>
+
+      <para>
         Example:
         <screen>
           # gnt-node evacuate -I dumb node3.example.com
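
The early-release variant of this sample would then be (same
hypothetical node):

  # gnt-node evacuate --early-release -I dumb node3.example.com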
diff --git a/scripts/gnt-instance b/scripts/gnt-instance
index 0c02b9e..51f892a 100755
@@ -807,7 +807,8 @@ def ReplaceDisks(opts, args):
 
   op = opcodes.OpReplaceDisks(instance_name=args[0], disks=disks,
                               remote_node=new_2ndary, mode=mode,
-                              iallocator=iallocator)
+                              iallocator=iallocator,
+                              early_release=opts.early_release)
   SubmitOrSend(op, opts)
   return 0
 
@@ -1400,7 +1401,7 @@ commands = {
     "<instance> <new_name>", "Rename the instance"),
   'replace-disks': (
     ReplaceDisks, ARGS_ONE_INSTANCE,
-    [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT,
+    [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT, EARLY_RELEASE_OPT,
      NEW_SECONDARY_OPT, ON_PRIMARY_OPT, ON_SECONDARY_OPT, SUBMIT_OPT],
     "[-s|-p|-n NODE|-I NAME] <instance>",
     "Replaces all disks for the instance"),
diff --git a/tools/burnin b/tools/burnin
index f0e4113..8bd4281 100755
@@ -120,6 +120,7 @@ OPTIONS = [
   cli.VERBOSE_OPT,
   cli.NOIPCHECK_OPT,
   cli.NONAMECHECK_OPT,
+  cli.EARLY_RELEASE_OPT,
   cli.cli_option("--no-replace1", dest="do_replace1",
                  help="Skip disk replacement with the same secondary",
                  action="store_false", default=True),
@@ -544,7 +545,8 @@ class Burner(object):
       for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
         op = opcodes.OpReplaceDisks(instance_name=instance,
                                     mode=mode,
-                                    disks=[i for i in range(self.disk_count)])
+                                    disks=[i for i in range(self.disk_count)],
+                                    early_release=self.opts.early_release)
         Log("run %s" % mode, indent=2)
         ops.append(op)
       self.ExecOrQueue(instance, *ops) # pylint: disable-msg=W0142
@@ -568,7 +570,8 @@ class Burner(object):
                                   mode=mode,
                                   remote_node=tnode,
                                   iallocator=self.opts.iallocator,
-                                  disks=[])
+                                  disks=[],
+                                  early_release=self.opts.early_release)
       Log("run %s %s" % (mode, msg), indent=2)
       self.ExecOrQueue(instance, op)