Add early release of locks/storage for disk replace
author Iustin Pop <iustin@google.com>
Tue, 9 Feb 2010 08:57:50 +0000 (09:57 +0100)
committer Iustin Pop <iustin@google.com>
Tue, 9 Feb 2010 10:04:07 +0000 (11:04 +0100)
This patch adds an early_release parameter to the OpReplaceDisks and
OpEvacuateNode opcodes, allowing earlier release of storage and, more
importantly, of internal Ganeti locks.

With early release, any locks and storage on all secondary nodes are
released early. This applies both to change secondary (where we remove
the storage on the old secondary and release the locks on the old and
new secondary) and to replace on secondary (where we remove the old
storage and release the lock on the secondary node).

Using this, on a three-node setup:

- instance1 on nodes A:B
- instance2 on nodes C:B

It is possible to run replace-disks -s (on secondary) for instances 1
and 2 in parallel.
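
For example (a hypothetical invocation; all three options appear in the
synopsis changes below, and --submit queues the jobs so they can run
concurrently):

  gnt-instance replace-disks -s --early-release --submit instance1
  gnt-instance replace-disks -s --early-release --submit instance2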

Replace on primary will remove the storage, but not the locks, as we use
the primary node later in the LU to check consistency.

It is debatable whether to also release the locks on the primary node,
which would make replace-disks hold zero locks during the sync. While
this would allow greatly enhanced parallelism, let's first see how the
removal of secondary locks works.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>

lib/cli.py
lib/cmdlib.py
lib/opcodes.py
man/gnt-instance.sgml
man/gnt-node.sgml
scripts/gnt-instance
tools/burnin

diff --git a/lib/cli.py b/lib/cli.py
index d9c2dbb..523cab7 100644
@@ -56,6 +56,7 @@ __all__ = [
   "DISK_OPT",
   "DISK_TEMPLATE_OPT",
   "DRAINED_OPT",
+  "EARLY_RELEASE_OPT",
   "ENABLED_HV_OPT",
   "ERROR_CODES_OPT",
   "FIELDS_OPT",
@@ -837,6 +838,12 @@ SHUTDOWN_TIMEOUT_OPT = cli_option("--shutdown-timeout",
                          default=constants.DEFAULT_SHUTDOWN_TIMEOUT,
                          help="Maximum time to wait for instance shutdown")
 
+EARLY_RELEASE_OPT = cli_option("--early-release",
+                               dest="early_release", default=False,
+                               action="store_true",
+                               help="Release the locks on the secondary"
+                               " node(s) early")
+
 
 def _ParseArgs(argv, commands, aliases):
   """Parser for the command line arguments.
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index cb922eb..5cf76ad 100644
@@ -6332,6 +6332,8 @@ class LUReplaceDisks(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
                                   self.op.iallocator)
@@ -6363,7 +6365,7 @@ class LUReplaceDisks(LogicalUnit):
 
     self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
                                    self.op.iallocator, self.op.remote_node,
-                                   self.op.disks, False)
+                                   self.op.disks, False, self.op.early_release)
 
     self.tasklets = [self.replacer]
 
@@ -6410,6 +6412,8 @@ class LUEvacuateNode(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG,
                                   self.op.remote_node,
@@ -6456,7 +6460,7 @@ class LUEvacuateNode(LogicalUnit):
 
       replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG,
                                 self.op.iallocator, self.op.remote_node, [],
-                                True)
+                                True, self.op.early_release)
       tasklets.append(replacer)
 
     self.tasklets = tasklets
@@ -6498,7 +6502,7 @@ class TLReplaceDisks(Tasklet):
 
   """
   def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
-               disks, delay_iallocator):
+               disks, delay_iallocator, early_release):
     """Initializes this class.
 
     """
@@ -6511,6 +6515,7 @@ class TLReplaceDisks(Tasklet):
     self.remote_node = remote_node
     self.disks = disks
     self.delay_iallocator = delay_iallocator
+    self.early_release = early_release
 
     # Runtime data
     self.instance = None
@@ -6853,6 +6858,10 @@ class TLReplaceDisks(Tasklet):
           self.lu.LogWarning("Can't remove old LV: %s" % msg,
                              hint="remove unused LVs manually")
 
+  def _ReleaseNodeLock(self, node_name):
+    """Releases the lock for a given node."""
+    self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
+
   def _ExecDrbd8DiskOnly(self, feedback_fn):
     """Replace a disk on the primary or secondary for DRBD 8.
 
@@ -6963,18 +6972,31 @@ class TLReplaceDisks(Tasklet):
 
       self.cfg.Update(self.instance, feedback_fn)
 
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      # only release the lock if we're doing secondary replace, since
+      # we use the primary node later
+      if self.target_node != self.instance.primary_node:
+        self._ReleaseNodeLock(self.target_node)
+
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
 
   def _ExecDrbd8Secondary(self, feedback_fn):
     """Replace the secondary node for DRBD 8.
@@ -7108,19 +7130,27 @@ class TLReplaceDisks(Tasklet):
                            to_node, msg,
                            hint=("please do a gnt-instance info to see the"
                                  " status of disks"))
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      self._ReleaseNodeLock([self.target_node, self.new_node])
 
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      self._RemoveOldStorage(self.target_node, iv_names)
 
 
 class LURepairNodeStorage(NoHooksLU):
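
The step renumbering above condenses into a standalone sketch (plain
Python, not Ganeti code; the callables stand in for the tasklet's own
helpers):

  def replace_disks_steps(early_release, target_is_primary, log_step,
                          remove_old_storage, release_node_lock,
                          wait_for_sync, check_devices):
      steps_total = 6
      cstep = 5
      if early_release:
          # old storage is removed before the resync even starts
          log_step(cstep, steps_total, "Removing old storage")
          cstep += 1
          remove_old_storage()
          if not target_is_primary:
              # keep the primary lock: that node is used again below
              release_node_lock()
      log_step(cstep, steps_total, "Sync devices")
      cstep += 1
      wait_for_sync()
      check_devices()
      if not early_release:
          # classic behaviour: remove the old storage only after sync
          log_step(cstep, steps_total, "Removing old storage")
          remove_old_storage()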
diff --git a/lib/opcodes.py b/lib/opcodes.py
index 906542e..3aed41e 100644
@@ -419,7 +419,7 @@ class OpEvacuateNode(OpCode):
   OP_ID = "OP_NODE_EVACUATE"
   OP_DSC_FIELD = "node_name"
   __slots__ = [
-    "node_name", "remote_node", "iallocator",
+    "node_name", "remote_node", "iallocator", "early_release",
     ]
 
 
@@ -509,6 +509,7 @@ class OpReplaceDisks(OpCode):
   OP_DSC_FIELD = "instance_name"
   __slots__ = [
     "instance_name", "remote_node", "mode", "disks", "iallocator",
+    "early_release",
     ]
 
 
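
Callers then just fill in the new slot; a minimal sketch of building the
opcode (the mode constant and empty disks list mirror the
scripts/gnt-instance and tools/burnin changes below):

  from ganeti import constants, opcodes

  op = opcodes.OpReplaceDisks(instance_name="instance1",
                              mode=constants.REPLACE_DISK_SEC,
                              disks=[], remote_node=None, iallocator=None,
                              early_release=True)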
diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml
index 2cbb67e..64ec961 100644
@@ -1828,6 +1828,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <arg choice="req">-p</arg>
           <arg>--disks <replaceable>idx</replaceable></arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1836,6 +1837,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <arg choice="req">-s</arg>
           <arg>--disks <replaceable>idx</replaceable></arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1844,6 +1846,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <group choice="req">
             <arg>--iallocator <replaceable>name</replaceable></arg>
             <arg>--new-secondary <replaceable>NODE</replaceable></arg>
@@ -1855,6 +1858,7 @@ instance5: 11225
         <cmdsynopsis>
           <command>replace-disks</command>
           <arg>--submit</arg>
+          <arg>--early-release</arg>
           <arg choice="req">--auto</arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
         </cmdsynopsis>
@@ -1906,6 +1910,19 @@ instance5: 11225
         </para>
 
         <para>
+          The <option>--early-release</option> option changes the
+          behaviour so that the old storage on the secondary node(s)
+          is removed early (before the resync is completed) and the
+          internal Ganeti locks for the current (and new, if any)
+          secondary node are also released, thus allowing more
+          parallelism in the cluster operation. This should be used
+          only when recovering from a disk failure on the current
+          secondary (thus the old storage is already broken) or when
+          the storage on the primary node is known to be fine (thus
+          we won't need the old storage for potential recovery).
+        </para>
+
+        <para>
           Note that it is not possible to select an offline or drained
           node as a new secondary.
         </para>
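
A usage example in the style of the man page's existing samples would be
(hypothetical instance name):

  # gnt-instance replace-disks --early-release -s instance1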
diff --git a/man/gnt-node.sgml b/man/gnt-node.sgml
index b16a4a7..c1980f2 100644
       <cmdsynopsis>
         <command>evacuate</command>
         <arg>-f</arg>
+        <arg>--early-release</arg>
         <group>
           <arg>--iallocator <replaceable>NAME</replaceable></arg>
           <arg>--new-secondary <replaceable>destination_node</replaceable></arg>
       </para>
 
       <para>
+        The <option>--early-release</option> option changes the
+        behaviour so that the old storage on the node being evacuated
+        is removed early (before the resync is completed) and the
+        internal Ganeti locks are also released for both the current
+        and the new secondary node, thus allowing more parallelism in
+        the cluster operation. This should be used only when
+        recovering from a disk failure on the current secondary (thus
+        the old storage is already broken) or when the storage on the
+        primary node is known to be fine (thus we won't need the old
+        storage for potential recovery).
+      </para>
+
+      <para>
         Example:
         <screen>
           # gnt-node evacuate -I dumb node3.example.com
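
The early-release variant of this sample would then be (same
hypothetical node):

  # gnt-node evacuate --early-release -I dumb node3.example.com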
diff --git a/scripts/gnt-instance b/scripts/gnt-instance
index 0c02b9e..51f892a 100755
@@ -807,7 +807,8 @@ def ReplaceDisks(opts, args):
 
   op = opcodes.OpReplaceDisks(instance_name=args[0], disks=disks,
                               remote_node=new_2ndary, mode=mode,
-                              iallocator=iallocator)
+                              iallocator=iallocator,
+                              early_release=opts.early_release)
   SubmitOrSend(op, opts)
   return 0
 
@@ -1400,7 +1401,7 @@ commands = {
     "<instance> <new_name>", "Rename the instance"),
   'replace-disks': (
     ReplaceDisks, ARGS_ONE_INSTANCE,
-    [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT,
+    [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT, EARLY_RELEASE_OPT,
      NEW_SECONDARY_OPT, ON_PRIMARY_OPT, ON_SECONDARY_OPT, SUBMIT_OPT],
     "[-s|-p|-n NODE|-I NAME] <instance>",
     "Replaces all disks for the instance"),
diff --git a/tools/burnin b/tools/burnin
index f0e4113..8bd4281 100755
@@ -120,6 +120,7 @@ OPTIONS = [
   cli.VERBOSE_OPT,
   cli.NOIPCHECK_OPT,
   cli.NONAMECHECK_OPT,
+  cli.EARLY_RELEASE_OPT,
   cli.cli_option("--no-replace1", dest="do_replace1",
                  help="Skip disk replacement with the same secondary",
                  action="store_false", default=True),
@@ -544,7 +545,8 @@ class Burner(object):
       for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
         op = opcodes.OpReplaceDisks(instance_name=instance,
                                     mode=mode,
-                                    disks=[i for i in range(self.disk_count)])
+                                    disks=[i for i in range(self.disk_count)],
+                                    early_release=self.opts.early_release)
         Log("run %s" % mode, indent=2)
         ops.append(op)
       self.ExecOrQueue(instance, *ops) # pylint: disable-msg=W0142
@@ -568,7 +570,8 @@ class Burner(object):
                                   mode=mode,
                                   remote_node=tnode,
                                   iallocator=self.opts.iallocator,
-                                  disks=[])
+                                  disks=[],
+                                  early_release=self.opts.early_release)
       Log("run %s %s" % (mode, msg), indent=2)
       self.ExecOrQueue(instance, op)