Allow force removal of instances
author Iustin Pop <iustin@google.com>
Tue, 16 Oct 2007 13:51:51 +0000 (13:51 +0000)
committer Iustin Pop <iustin@google.com>
Tue, 16 Oct 2007 13:51:51 +0000 (13:51 +0000)
This patch adds a new option to the instance removal command
"--ignore-failures" that forces the removal of the instance from the
configuration even if the removal process encounters errors.

To be able to do this even when one or more of the remote nodes are
down, we need to restrict the execution of the instance removal hook to
the master only. I think this is a reasonable trade-off (but I'm not sure).

Reviewed-by: imsnah

doc/hooks.sgml
lib/cmdlib.py
lib/opcodes.py
man/gnt-instance.sgml
scripts/gnt-instance

index daa1d07..b6cf2e9 100644 (file)
               <entry>Remove an instance</entry>
               <entry><computeroutput>gnt-instance remove</computeroutput></entry>
               <entry><constant>INSTANCE_NAME</constant>, <constant>INSTANCE_PRIMARY</constant>, <constant>INSTANCE_SECONDARIES</constant></entry>
+              <entry spanname="bothhooks">master node</entry>
             </row>
             <row>
               <entry>OP_INSTANCE_ADD_MDDRBD</entry>
index ea56345..d41ed5a 100644 (file)
@@ -2193,8 +2193,7 @@ class LURemoveInstance(LogicalUnit):
 
     """
     env = _BuildInstanceHookEnvByObject(self.instance)
-    nl = ([self.sstore.GetMasterNode(), self.instance.primary_node] +
-          list(self.instance.secondary_nodes))
+    nl = [self.sstore.GetMasterNode()]
     return env, nl, nl
 
   def CheckPrereq(self):
@@ -2219,12 +2218,19 @@ class LURemoveInstance(LogicalUnit):
                 (instance.name, instance.primary_node))
 
     if not rpc.call_instance_shutdown(instance.primary_node, instance):
-      raise errors.OpExecError("Could not shutdown instance %s on node %s" %
-                               (instance.name, instance.primary_node))
+      if self.op.ignore_failures:
+        feedback_fn("Warning: can't shutdown instance")
+      else:
+        raise errors.OpExecError("Could not shutdown instance %s on node %s" %
+                                 (instance.name, instance.primary_node))
 
     logger.Info("removing block devices for instance %s" % instance.name)
 
-    _RemoveDisks(instance, self.cfg)
+    if not _RemoveDisks(instance, self.cfg):
+      if self.op.ignore_failures:
+        feedback_fn("Warning: can't remove instance's disks")
+      else:
+        raise errors.OpExecError("Can't remove instance's disks")
 
     logger.Info("removing instance %s out of cluster config" % instance.name)
 
@@ -2649,7 +2655,7 @@ def _RemoveDisks(instance, cfg):
 
   This abstracts away some work from `AddInstance()` and
   `RemoveInstance()`. Note that in case some of the devices couldn't
-  be remove, the removal will continue with the other ones (compare
+  be removed, the removal will continue with the other ones (compare
   with `_CreateDisks()`).
 
   Args:
index 7beb299..5e11fba 100644 (file)
@@ -150,7 +150,7 @@ class OpReinstallInstance(OpCode):
 class OpRemoveInstance(OpCode):
   """Remove an instance."""
   OP_ID = "OP_INSTANCE_REMOVE"
-  __slots__ = ["instance_name"]
+  __slots__ = ["instance_name", "ignore_failures"]
 
 
 class OpRenameInstance(OpCode):
index 9ada7c7..f31dca2 100644 (file)
 
         <cmdsynopsis>
           <command>remove</command>
+          <arg>--ignore-failures</arg>
           <arg choice="req"><replaceable>instance</replaceable></arg>
         </cmdsynopsis>
 
           you are not sure if you use an instance again, use
           <command>shutdown</command> first and leave it in the
           shutdown state for a while.
+
+        </para>
+
+        <para>
+          The <option>--ignore-failures</option> option will cause the
+          removal to proceed even in the presence of errors during the
+          removal of the instance (e.g. during the shutdown or the
+          disk removal). If this option is not given, the command will
+          stop at the first error.
         </para>
 
         <para>
index 5768b44..943f489 100755 (executable)
@@ -270,7 +270,8 @@ def RemoveInstance(opts, args):
     if not AskUser(usertext):
       return 1
 
-  op = opcodes.OpRemoveInstance(instance_name=instance_name)
+  op = opcodes.OpRemoveInstance(instance_name=instance_name,
+                                ignore_failures=opts.ignore_failures)
   SubmitOpCode(op)
   return 0
 
@@ -689,7 +690,14 @@ commands = {
            "", "Lists the instances and their status"),
   'reinstall': (ReinstallInstance, ARGS_ONE, [DEBUG_OPT, FORCE_OPT, os_opt],
                 "[-f] <instance>", "Reinstall the instance"),
-  'remove': (RemoveInstance, ARGS_ONE, [DEBUG_OPT, FORCE_OPT],
+  'remove': (RemoveInstance, ARGS_ONE,
+             [DEBUG_OPT, FORCE_OPT,
+              make_option("--ignore-failures", dest="ignore_failures",
+                          action="store_true", default=False,
+                          help=("Remove the instance from the cluster even"
+                                " if there are failures during the removal"
+                                " process (shutdown, disk removal, etc.)")),
+              ],
              "[-f] <instance>", "Shuts down the instance and removes it"),
   'remove-mirror': (RemoveMDDRBDComponent, ARGS_ONE,
                    [DEBUG_OPT, node_opt,