Add early release of locks/storage for disk replace
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 3bb6e1a..5cf76ad 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1603,7 +1603,8 @@ class LUVerifyCluster(LogicalUnit):
         test = msg and not res.offline
         self._ErrorIf(test, self.ENODEHOOKS, node_name,
                       "Communication failure in hooks execution: %s", msg)
-        if test:
+        if res.offline or msg:
+          # No need to investigate payload if node is offline or gave an error.
           # override manually lu_result here as _ErrorIf only
           # overrides self.bad
           lu_result = 1
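
For context, _ErrorIf is a thin wrapper around the verifier's error reporting; a minimal sketch of the pattern (simplified, with the body assumed rather than copied from ganeti):

  def _ErrorIf(self, cond, ecode, item, msg, *args):
    # Simplified sketch: log the problem and remember that the cluster
    # is bad, but leave lu_result alone -- which is why the hunk above
    # must set lu_result = 1 explicitly.
    if bool(cond):
      self._Error(ecode, item, msg, *args)
      self.bad = True
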
@@ -1843,7 +1844,8 @@ class LURenameCluster(LogicalUnit):
       "NEW_NAME": self.op.name,
       }
     mn = self.cfg.GetMasterNode()
-    return env, [mn], [mn]
+    all_nodes = self.cfg.GetNodeList()
+    return env, [mn], all_nodes
 
   def CheckPrereq(self):
     """Verify that the passed name is a valid one.
@@ -2423,8 +2425,11 @@ class LURemoveNode(LogicalUnit):
       "NODE_NAME": self.op.node_name,
       }
     all_nodes = self.cfg.GetNodeList()
-    if self.op.node_name in all_nodes:
+    try:
       all_nodes.remove(self.op.node_name)
+    except ValueError:
+      logging.warning("Node %s, which is about to be removed, was not"
+                      " found in the list of all nodes", self.op.node_name)
     return env, all_nodes, all_nodes
 
   def CheckPrereq(self):
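
The rewrite swaps a look-before-you-leap membership test for the EAFP idiom: list.remove() already performs the lookup, so the try/except does one scan instead of two and still surfaces the unexpected case in the logs. Compare:

  # before: two linear scans over all_nodes
  if name in all_nodes:
    all_nodes.remove(name)

  # after: one scan, and the surprising case is logged
  try:
    all_nodes.remove(name)
  except ValueError:
    logging.warning("Node %s not found in node list", name)
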
@@ -3202,7 +3207,7 @@ class LUSetNodeParams(LogicalUnit):
 
     # If we're being deofflined/drained, we'll MC ourself if needed
     if (deoffline_or_drain and not offline_or_drain and not
-        self.op.master_candidate == True):
+        self.op.master_candidate == True and not node.master_candidate):
       self.op.master_candidate = _DecideSelfPromotion(self)
       if self.op.master_candidate:
         self.LogInfo("Autopromoting node to master candidate")
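
The extra "not node.master_candidate" guard skips the promotion logic when the node already is a master candidate. For reference, a plausible sketch of what a self-promotion decision checks, assuming the config exposes master-candidate statistics (the real _DecideSelfPromotion may differ in detail):

  def _DecideSelfPromotion(lu, exceptions=None):
    # promote only if the cluster currently has fewer master
    # candidates than the candidate pool asks for
    mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
    return mc_now < mc_should
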
@@ -3988,7 +3993,8 @@ class LUReinstallInstance(LogicalUnit):
     _StartInstanceDisks(self, inst, None)
     try:
       feedback_fn("Running the instance OS create scripts...")
-      result = self.rpc.call_instance_os_add(inst.primary_node, inst, True)
+      # FIXME: pass debug option from opcode to backend
+      result = self.rpc.call_instance_os_add(inst.primary_node, inst, True, 0)
       result.Raise("Could not install OS for instance %s on node %s" %
                    (inst.name, inst.primary_node))
     finally:
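
This is one of several call sites in the commit that gain a trailing debug argument, hard-coded to 0 until the opcode grows a matching field (hence the FIXME). On the node side such a parameter would typically end up in the environment of the OS scripts; a hedged sketch of that backend half, with names assumed rather than taken from ganeti's backend.py:

  def InstanceOsAdd(instance, reinstall, debug_level):
    inst_os = OSFromDisk(instance.os)
    # assumption: OSEnvironment exports DEBUG_LEVEL to the scripts, so
    # a non-zero value can switch the create script to verbose mode
    create_env = OSEnvironment(instance, inst_os, debug_level)
    return utils.RunCmd([inst_os.create_script], env=create_env,
                        cwd=inst_os.path)
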
@@ -4172,7 +4178,7 @@ class LURenameInstance(LogicalUnit):
     _StartInstanceDisks(self, inst, None)
     try:
       result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
-                                                 old_name)
+                                                 old_name, 0)
       msg = result.fail_msg
       if msg:
         msg = ("Could not run OS rename script for instance %s on node %s"
@@ -5769,16 +5775,14 @@ class LUCreateInstance(LogicalUnit):
       # MAC address verification
       mac = nic.get("mac", constants.VALUE_AUTO)
       if mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
-        if not utils.IsValidMac(mac.lower()):
-          raise errors.OpPrereqError("Invalid MAC address specified: %s" %
-                                     mac, errors.ECODE_INVAL)
-        else:
-          try:
-            self.cfg.ReserveMAC(mac, self.proc.GetECId())
-          except errors.ReservationError:
-            raise errors.OpPrereqError("MAC address %s already in use"
-                                       " in cluster" % mac,
-                                       errors.ECODE_NOTUNIQUE)
+        mac = utils.NormalizeAndValidateMac(mac)
+
+        try:
+          self.cfg.ReserveMAC(mac, self.proc.GetECId())
+        except errors.ReservationError:
+          raise errors.OpPrereqError("MAC address %s already in use"
+                                     " in cluster" % mac,
+                                     errors.ECODE_NOTUNIQUE)
 
       # bridge verification
       bridge = nic.get("bridge", None)
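
utils.NormalizeAndValidateMac folds the old IsValidMac check and the lower-casing into one helper, so ReserveMAC always sees the canonical form and the duplicated raise disappears. A plausible sketch of such a helper (the real one lives in lib/utils.py and may differ):

  import re

  from ganeti import errors  # as imported by the rest of cmdlib.py

  _MAC_RE = re.compile(r"^([0-9a-f]{2}:){5}[0-9a-f]{2}$", re.I)

  def NormalizeAndValidateMac(mac):
    if not _MAC_RE.match(mac):
      raise errors.OpPrereqError("Invalid MAC address specified: %s" % mac,
                                 errors.ECODE_INVAL)
    return mac.lower()
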
@@ -5815,7 +5819,7 @@ class LUCreateInstance(LogicalUnit):
         raise errors.OpPrereqError("Missing disk size", errors.ECODE_INVAL)
       try:
         size = int(size)
-      except ValueError:
+      except (TypeError, ValueError):
         raise errors.OpPrereqError("Invalid disk size '%s'" % size,
                                    errors.ECODE_INVAL)
       self.disks.append({"size": size, "mode": mode})
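
Widening the except clause matters because int() raises different exceptions depending on what it is fed; a missing size (None) would previously have escaped as an unhandled TypeError:

  >>> int("abc")   # malformed string
  ValueError: invalid literal for int() with base 10: 'abc'
  >>> int(None)    # size key present but unset
  TypeError: int() argument must be a string or a number, not 'NoneType'
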
@@ -6227,7 +6231,8 @@ class LUCreateInstance(LogicalUnit):
     if iobj.disk_template != constants.DT_DISKLESS:
       if self.op.mode == constants.INSTANCE_CREATE:
         feedback_fn("* running the instance OS create scripts...")
-        result = self.rpc.call_instance_os_add(pnode_name, iobj, False)
+        # FIXME: pass debug option from opcode to backend
+        result = self.rpc.call_instance_os_add(pnode_name, iobj, False, 0)
         result.Raise("Could not add os for instance %s"
                      " on node %s" % (instance, pnode_name))
 
@@ -6236,9 +6241,10 @@ class LUCreateInstance(LogicalUnit):
         src_node = self.op.src_node
         src_images = self.src_images
         cluster_name = self.cfg.GetClusterName()
+        # FIXME: pass debug option from opcode to backend
         import_result = self.rpc.call_instance_os_import(pnode_name, iobj,
                                                          src_node, src_images,
-                                                         cluster_name)
+                                                         cluster_name, 0)
         msg = import_result.fail_msg
         if msg:
           self.LogWarning("Error while importing the disk images for instance"
@@ -6326,6 +6332,8 @@ class LUReplaceDisks(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
                                   self.op.iallocator)
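
The hasattr dance gives a freshly introduced opcode field a default, so opcodes serialized by older clients (which lack early_release) keep working; the same three lines are repeated in LUEvacuateNode below. A hypothetical helper form of the idiom:

  def _SetOpDefault(op, attr, default):
    # hypothetical helper: tolerate opcodes serialized by clients
    # that predate the new attribute
    if not hasattr(op, attr):
      setattr(op, attr, default)

  # equivalent to the added lines above, called from CheckArguments:
  _SetOpDefault(self.op, "early_release", False)
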
@@ -6357,7 +6365,7 @@ class LUReplaceDisks(LogicalUnit):
 
     self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
                                    self.op.iallocator, self.op.remote_node,
-                                   self.op.disks)
+                                   self.op.disks, False, self.op.early_release)
 
     self.tasklets = [self.replacer]
 
@@ -6404,6 +6412,8 @@ class LUEvacuateNode(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG,
                                   self.op.remote_node,
@@ -6449,7 +6459,8 @@ class LUEvacuateNode(LogicalUnit):
       names.append(inst.name)
 
       replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG,
-                                self.op.iallocator, self.op.remote_node, [])
+                                self.op.iallocator, self.op.remote_node, [],
+                                True, self.op.early_release)
       tasklets.append(replacer)
 
     self.tasklets = tasklets
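
Evacuation builds one TLReplaceDisks tasklet per instance, and the LU machinery runs all CheckPrereq methods before any Exec. That ordering is exactly why delay_iallocator=True is passed here: the allocator decision must wait until earlier tasklets have already changed the cluster. Roughly (a simplified sketch of the driver loop, not ganeti's mcpu code):

  for tl in lu.tasklets:
    tl.CheckPrereq()      # iallocator part skipped when delayed
  for tl in lu.tasklets:
    tl.Exec(feedback_fn)  # runs _CheckPrereq2 first when delayed
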
@@ -6491,7 +6502,7 @@ class TLReplaceDisks(Tasklet):
 
   """
   def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
-               disks):
+               disks, delay_iallocator, early_release):
     """Initializes this class.
 
     """
@@ -6503,6 +6514,8 @@ class TLReplaceDisks(Tasklet):
     self.iallocator_name = iallocator_name
     self.remote_node = remote_node
     self.disks = disks
+    self.delay_iallocator = delay_iallocator
+    self.early_release = early_release
 
     # Runtime data
     self.instance = None
@@ -6589,6 +6602,19 @@ class TLReplaceDisks(Tasklet):
                                  len(instance.secondary_nodes),
                                  errors.ECODE_FAULT)
 
+    if not self.delay_iallocator:
+      self._CheckPrereq2()
+
+  def _CheckPrereq2(self):
+    """Check prerequisites, second part.
+
+    This function should always be part of CheckPrereq. It was separated and
+    is now called from Exec because, during node evacuation, the iallocator
+    used to be called with an unmodified cluster model, not taking planned
+    changes into account.
+
+    """
+    instance = self.instance
     secondary_node = instance.secondary_nodes[0]
 
     if self.iallocator_name is None:
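
_CheckPrereq2 is where the new secondary is actually chosen when an iallocator is used. A condensed sketch of that call in the spirit of the surrounding code, using ganeti's IAllocator relocation mode (details may differ from the real _RunAllocator helper):

  def _RunAllocator(lu, iallocator_name, instance_name, relocate_from):
    ial = IAllocator(lu.cfg, lu.rpc,
                     mode=constants.IALLOCATOR_MODE_RELOC,
                     name=instance_name,
                     relocate_from=relocate_from)
    ial.Run(iallocator_name)
    if not ial.success:
      raise errors.OpPrereqError("Can't compute nodes using iallocator"
                                 " '%s': %s" % (iallocator_name, ial.info),
                                 errors.ECODE_NORES)
    return ial.result[0]
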
@@ -6692,6 +6718,9 @@ class TLReplaceDisks(Tasklet):
     This dispatches the disk replacement to the appropriate handler.
 
     """
+    if self.delay_iallocator:
+      self._CheckPrereq2()
+
     if not self.disks:
       feedback_fn("No disks need replacement")
       return
@@ -6829,6 +6858,10 @@ class TLReplaceDisks(Tasklet):
           self.lu.LogWarning("Can't remove old LV: %s" % msg,
                              hint="remove unused LVs manually")
 
+  def _ReleaseNodeLock(self, node_name):
+    """Releases the lock for a given node."""
+    self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
+
   def _ExecDrbd8DiskOnly(self, feedback_fn):
     """Replace a disk on the primary or secondary for DRBD 8.
 
@@ -6939,18 +6972,31 @@ class TLReplaceDisks(Tasklet):
 
       self.cfg.Update(self.instance, feedback_fn)
 
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      # only release the lock if we're doing secondary replace, since
+      # we use the primary node later
+      if self.target_node != self.instance.primary_node:
+        self._ReleaseNodeLock(self.target_node)
+
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
 
   def _ExecDrbd8Secondary(self, feedback_fn):
     """Replace the secondary node for DRBD 8.
@@ -7084,19 +7130,27 @@ class TLReplaceDisks(Tasklet):
                            to_node, msg,
                            hint=("please do a gnt-instance info to see the"
                                  " status of disks"))
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      self._ReleaseNodeLock([self.target_node, self.new_node])
 
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      self._RemoveOldStorage(self.target_node, iv_names)
 
 
 class LURepairNodeStorage(NoHooksLU):
@@ -7478,7 +7532,7 @@ class LUSetInstanceParams(LogicalUnit):
                                      errors.ECODE_INVAL)
         try:
           size = int(size)
-        except ValueError, err:
+        except (TypeError, ValueError), err:
           raise errors.OpPrereqError("Invalid disk size parameter: %s" %
                                      str(err), errors.ECODE_INVAL)
         disk_dict['size'] = size
@@ -7535,9 +7589,8 @@ class LUSetInstanceParams(LogicalUnit):
       if 'mac' in nic_dict:
         nic_mac = nic_dict['mac']
         if nic_mac not in (constants.VALUE_AUTO, constants.VALUE_GENERATE):
-          if not utils.IsValidMac(nic_mac):
-            raise errors.OpPrereqError("Invalid MAC address %s" % nic_mac,
-                                       errors.ECODE_INVAL)
+          nic_mac = utils.NormalizeAndValidateMac(nic_mac)
+
         if nic_op != constants.DDM_ADD and nic_mac == constants.VALUE_AUTO:
           raise errors.OpPrereqError("'auto' is not a valid MAC address when"
                                      " modifying an existing nic",
@@ -8147,8 +8200,10 @@ class LUExportInstance(LogicalUnit):
         feedback_fn("Exporting snapshot %s from %s to %s" %
                     (idx, src_node, dst_node.name))
         if dev:
+          # FIXME: pass debug option from opcode to backend
           result = self.rpc.call_snapshot_export(src_node, dev, dst_node.name,
-                                                 instance, cluster_name, idx)
+                                                 instance, cluster_name,
+                                                 idx, 0)
           msg = result.fail_msg
           if msg:
             self.LogWarning("Could not export disk/%s from node %s to"