Makefile: Fix list of directories

[ganeti-local] / lib / mcpu.py
diff --git a/lib/mcpu.py b/lib/mcpu.py

index a7ea80c..594e16e 100644 (file)
--- a/lib/mcpu.py
+++ b/lib/mcpu.py
@@ -28,10 +28,12 @@ are two kinds of classes defined:
  
  """
  
+import sys
  import logging
  import random
  import time
  import itertools
+import traceback
  
  from ganeti import opcodes
  from ganeti import constants
@@ -46,6 +48,19 @@ from ganeti import pathutils
  _OP_PREFIX = "Op"
  _LU_PREFIX = "LU"
  
+#: LU classes which don't need to acquire the node allocation lock
+#: (L{locking.NAL}) when they acquire all node or node resource locks
+_NODE_ALLOC_WHITELIST = frozenset([])
+
+#: LU classes which don't need to acquire the node allocation lock
+#: (L{locking.NAL}) in the same mode (shared/exclusive) as the node
+#: or node resource locks
+_NODE_ALLOC_MODE_WHITELIST = compat.UniqueFrozenset([
+  cmdlib.LUBackupExport,
+  cmdlib.LUBackupRemove,
+  cmdlib.LUOobCommand,
+  ])
+
  
  class LockAcquireTimeout(Exception):
    """Exception to report timeouts on acquiring locks.
@@ -140,10 +155,11 @@ class OpExecCbBase: # pylint: disable=W0232
  
      """
  
-  def CheckCancel(self):
-    """Check whether job has been cancelled.
+  def CurrentPriority(self): # pylint: disable=R0201
+    """Returns current priority or C{None}.
  
      """
+    return None
  
    def SubmitManyJobs(self, jobs):
      """Submits jobs for processing.
@@ -243,6 +259,44 @@ def _RpcResultsToHooksResults(rpc_results):
                for (node, rpc_res) in rpc_results.items())
  
  
+def _VerifyLocks(lu, glm, _mode_whitelist=_NODE_ALLOC_MODE_WHITELIST,
+                 _nal_whitelist=_NODE_ALLOC_WHITELIST):
+  """Performs consistency checks on locks acquired by a logical unit.
+
+  @type lu: L{cmdlib.LogicalUnit}
+  @param lu: Logical unit instance
+  @type glm: L{locking.GanetiLockManager}
+  @param glm: Lock manager
+
+  """
+  if not __debug__:
+    return
+
+  have_nal = glm.check_owned(locking.LEVEL_NODE_ALLOC, locking.NAL)
+
+  for level in [locking.LEVEL_NODE, locking.LEVEL_NODE_RES]:
+    # TODO: Verify using actual lock mode, not using LU variables
+    if level in lu.needed_locks:
+      share_node_alloc = lu.share_locks[locking.LEVEL_NODE_ALLOC]
+      share_level = lu.share_locks[level]
+
+      if lu.__class__ in _mode_whitelist:
+        assert share_node_alloc != share_level, \
+          "LU is whitelisted to use different modes for node allocation lock"
+      else:
+        assert bool(share_node_alloc) == bool(share_level), \
+          ("Node allocation lock must be acquired using the same mode as nodes"
+           " and node resources")
+
+      if lu.__class__ in _nal_whitelist:
+        assert not have_nal, \
+          "LU is whitelisted for not acquiring the node allocation lock"
+      elif lu.needed_locks[level] == locking.ALL_SET or glm.owning_all(level):
+        assert have_nal, \
+          ("Node allocation lock must be used if an LU acquires all nodes"
+           " or node resources")
+
+
  class Processor(object):
    """Object which runs OpCodes"""
    DISPATCH_TABLE = _ComputeDispatchTable()
@@ -272,7 +326,7 @@ class Processor(object):
      if not self._enable_locks:
        raise errors.ProgrammerError("Attempted to use disabled locks")
  
-  def _AcquireLocks(self, level, names, shared, timeout, priority):
+  def _AcquireLocks(self, level, names, shared, opportunistic, timeout):
      """Acquires locks via the Ganeti lock manager.
  
      @type level: int
@@ -281,6 +335,8 @@ class Processor(object):
      @param names: Lock names
      @type shared: bool
      @param shared: Whether the locks should be acquired in shared mode
+    @type opportunistic: bool
+    @param opportunistic: Whether to acquire opportunistically
      @type timeout: None or float
      @param timeout: Timeout for acquiring the locks
      @raise LockAcquireTimeout: In case locks couldn't be acquired in specified
@@ -290,10 +346,13 @@ class Processor(object):
      self._CheckLocksEnabled()
  
      if self._cbs:
-      self._cbs.CheckCancel()
+      priority = self._cbs.CurrentPriority()
+    else:
+      priority = None
  
      acquired = self.context.glm.acquire(level, names, shared=shared,
-                                        timeout=timeout, priority=priority)
+                                        timeout=timeout, priority=priority,
+                                        opportunistic=opportunistic)
  
      if acquired is None:
        raise LockAcquireTimeout()
@@ -340,7 +399,7 @@ class Processor(object):
    def BuildHooksManager(self, lu):
      return self.hmclass.BuildFromLu(lu.rpc.call_hooks_runner, lu)
  
-  def _LockAndExecLU(self, lu, level, calc_timeout, priority):
+  def _LockAndExecLU(self, lu, level, calc_timeout):
      """Execute a Logical Unit, with the needed locks.
  
      This is a recursive function that starts locking the given level, and
@@ -348,13 +407,29 @@ class Processor(object):
      given LU and its opcodes.
  
      """
+    glm = self.context.glm
      adding_locks = level in lu.add_locks
      acquiring_locks = level in lu.needed_locks
+
      if level not in locking.LEVELS:
+      _VerifyLocks(lu, glm)
+
        if self._cbs:
          self._cbs.NotifyStart()
  
-      result = self._ExecLU(lu)
+      try:
+        result = self._ExecLU(lu)
+      except AssertionError, err:
+        # this is a bit ugly, as we don't know from which phase
+        # (prereq, exec) this comes; but it's better than an exception
+        # with no information
+        (_, _, tb) = sys.exc_info()
+        err_info = traceback.format_tb(tb)
+        del tb
+        logging.exception("Detected AssertionError")
+        raise errors.OpExecError("Internal assertion error: please report"
+                                 " this as a bug.\nError message: '%s';"
+                                 " location:\n%s" % (str(err), err_info[-1]))
  
      elif adding_locks and acquiring_locks:
        # We could both acquire and add locks at the same level, but for now we
@@ -367,6 +442,7 @@ class Processor(object):
  
        lu.DeclareLocks(level)
        share = lu.share_locks[level]
+      opportunistic = lu.opportunistic_locks[level]
  
        try:
          assert adding_locks ^ acquiring_locks, \
@@ -376,36 +452,38 @@ class Processor(object):
            # Acquiring locks
            needed_locks = lu.needed_locks[level]
  
-          self._AcquireLocks(level, needed_locks, share,
-                             calc_timeout(), priority)
+          self._AcquireLocks(level, needed_locks, share, opportunistic,
+                             calc_timeout())
          else:
            # Adding locks
            add_locks = lu.add_locks[level]
            lu.remove_locks[level] = add_locks
  
            try:
-            self.context.glm.add(level, add_locks, acquired=1, shared=share)
+            glm.add(level, add_locks, acquired=1, shared=share)
            except errors.LockError:
+            logging.exception("Detected lock error in level %s for locks"
+                              " %s, shared=%s", level, add_locks, share)
              raise errors.OpPrereqError(
-              "Couldn't add locks (%s), probably because of a race condition"
-              " with another job, who added them first" % add_locks,
-              errors.ECODE_FAULT)
+              "Couldn't add locks (%s), most likely because of another"
+              " job who added them first" % add_locks,
+              errors.ECODE_NOTUNIQUE)
  
          try:
-          result = self._LockAndExecLU(lu, level + 1, calc_timeout, priority)
+          result = self._LockAndExecLU(lu, level + 1, calc_timeout)
          finally:
            if level in lu.remove_locks:
-            self.context.glm.remove(level, lu.remove_locks[level])
+            glm.remove(level, lu.remove_locks[level])
        finally:
-        if self.context.glm.is_owned(level):
-          self.context.glm.release(level)
+        if glm.is_owned(level):
+          glm.release(level)
  
      else:
-      result = self._LockAndExecLU(lu, level + 1, calc_timeout, priority)
+      result = self._LockAndExecLU(lu, level + 1, calc_timeout)
  
      return result
  
-  def ExecOpCode(self, op, cbs, timeout=None, priority=None):
+  def ExecOpCode(self, op, cbs, timeout=None):
      """Execute an opcode.
  
      @type op: an OpCode instance
@@ -414,8 +492,6 @@ class Processor(object):
      @param cbs: Runtime callbacks
      @type timeout: float or None
      @param timeout: Maximum time to acquire all locks, None for no timeout
-    @type priority: number or None
-    @param priority: Priority for acquiring lock(s)
      @raise LockAcquireTimeout: In case locks couldn't be acquired in specified
          amount of time
  
@@ -440,8 +516,7 @@ class Processor(object):
          # and in a shared fashion otherwise (to prevent concurrent run with
          # an exclusive LU.
          self._AcquireLocks(locking.LEVEL_CLUSTER, locking.BGL,
-                            not lu_class.REQ_BGL, calc_timeout(),
-                            priority)
+                            not lu_class.REQ_BGL, False, calc_timeout())
        elif lu_class.REQ_BGL:
          raise errors.ProgrammerError("Opcode '%s' requires BGL, but locks are"
                                       " disabled" % op.OP_ID)
@@ -452,8 +527,8 @@ class Processor(object):
          assert lu.needed_locks is not None, "needed_locks not set by LU"
  
          try:
-          result = self._LockAndExecLU(lu, locking.LEVEL_INSTANCE, calc_timeout,
-                                       priority)
+          result = self._LockAndExecLU(lu, locking.LEVEL_CLUSTER + 1,
+                                       calc_timeout)
          finally:
            if self._ec_id:
              self.context.cfg.DropECReservations(self._ec_id)
@@ -469,8 +544,12 @@ class Processor(object):
      if not (resultcheck_fn is None or resultcheck_fn(result)):
        logging.error("Expected opcode result matching %s, got %s",
                      resultcheck_fn, result)
-      raise errors.OpResultError("Opcode result does not match %s: %s" %
-                                 (resultcheck_fn, utils.Truncate(result, 80)))
+      if not getattr(op, "dry_run", False):
+        # FIXME: LUs should still behave in dry_run mode, or
+        # alternately we should have OP_DRYRUN_RESULT; in the
+        # meantime, we simply skip the OP_RESULT check in dry-run mode
+        raise errors.OpResultError("Opcode result does not match %s: %s" %
+                                   (resultcheck_fn, utils.Truncate(result, 80)))
  
      return result