Add and remove instance/node locks

[ganeti-local] / lib / cmdlib.py
diff --git a/lib/cmdlib.py b/lib/cmdlib.py

index e3c1a86..c339933 100644 (file)
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -57,6 +57,7 @@ class LogicalUnit(object):
      - optionally redefine their run requirements:
          REQ_MASTER: the LU needs to run on the master node
          REQ_WSSTORE: the LU needs a writable SimpleStore
+        REQ_BGL: the LU needs to hold the Big Ganeti Lock exclusively
  
    Note that all commands require root permissions.
  
@@ -66,8 +67,9 @@ class LogicalUnit(object):
    _OP_REQP = []
    REQ_MASTER = True
    REQ_WSSTORE = False
+  REQ_BGL = True
  
-  def __init__(self, processor, op, cfg, sstore):
+  def __init__(self, processor, op, context, sstore):
      """Constructor for LogicalUnit.
  
      This needs to be overriden in derived classes in order to check op
@@ -76,8 +78,9 @@ class LogicalUnit(object):
      """
      self.proc = processor
      self.op = op
-    self.cfg = cfg
+    self.cfg = context.cfg
      self.sstore = sstore
+    self.context = context
      self.__ssh = None
  
      for attr_name in self._OP_REQP:
@@ -1243,6 +1246,8 @@ class LURemoveNode(LogicalUnit):
      logger.Info("Removing node %s from config" % node.name)
  
      self.cfg.RemoveNode(node.name)
+    # Remove the node from the Ganeti Lock Manager
+    self.context.glm.remove(locking.LEVEL_NODE, node.name)
  
      utils.RemoveHostFromEtcHosts(node.name)
  
@@ -1529,46 +1534,7 @@ class LUAddNode(LogicalUnit):
      new_node = self.new_node
      node = new_node.name
  
-    # set up inter-node password and certificate and restarts the node daemon
-    gntpass = self.sstore.GetNodeDaemonPassword()
-    if not re.match('^[a-zA-Z0-9.]{1,64}$', gntpass):
-      raise errors.OpExecError("ganeti password corruption detected")
-    f = open(constants.SSL_CERT_FILE)
-    try:
-      gntpem = f.read(8192)
-    finally:
-      f.close()
-    # in the base64 pem encoding, neither '!' nor '.' are valid chars,
-    # so we use this to detect an invalid certificate; as long as the
-    # cert doesn't contain this, the here-document will be correctly
-    # parsed by the shell sequence below
-    if re.search('^!EOF\.', gntpem, re.MULTILINE):
-      raise errors.OpExecError("invalid PEM encoding in the SSL certificate")
-    if not gntpem.endswith("\n"):
-      raise errors.OpExecError("PEM must end with newline")
-    logger.Info("copy cluster pass to %s and starting the node daemon" % node)
-
-    # and then connect with ssh to set password and start ganeti-noded
-    # note that all the below variables are sanitized at this point,
-    # either by being constants or by the checks above
-    ss = self.sstore
-    mycommand = ("umask 077 && "
-                 "echo '%s' > '%s' && "
-                 "cat > '%s' << '!EOF.' && \n"
-                 "%s!EOF.\n%s restart" %
-                 (gntpass, ss.KeyToFilename(ss.SS_NODED_PASS),
-                  constants.SSL_CERT_FILE, gntpem,
-                  constants.NODE_INITD_SCRIPT))
-
-    result = self.ssh.Run(node, 'root', mycommand, batch=False, ask_key=True)
-    if result.failed:
-      raise errors.OpExecError("Remote command on node %s, error: %s,"
-                               " output: %s" %
-                               (node, result.fail_reason, result.output))
-
      # check connectivity
-    time.sleep(4)
-
      result = rpc.call_version([node])[node]
      if result:
        if constants.PROTOCOL_VERSION == result:
@@ -1615,12 +1581,22 @@ class LUAddNode(LogicalUnit):
                                   " you gave (%s). Please fix and re-run this"
                                   " command." % new_node.secondary_ip)
  
-    success, msg = self.ssh.VerifyNodeHostname(node)
-    if not success:
-      raise errors.OpExecError("Node '%s' claims it has a different hostname"
-                               " than the one the resolver gives: %s."
-                               " Please fix and re-run this command." %
-                               (node, msg))
+    node_verify_list = [self.sstore.GetMasterNode()]
+    node_verify_param = {
+      'nodelist': [node],
+      # TODO: do a node-net-test as well?
+    }
+
+    result = rpc.call_node_verify(node_verify_list, node_verify_param)
+    for verifier in node_verify_list:
+      if not result[verifier]:
+        raise errors.OpExecError("Cannot communicate with %s's node daemon"
+                                 " for remote verification" % verifier)
+      if result[verifier]['nodelist']:
+        for failed in result[verifier]['nodelist']:
+          feedback_fn("ssh/hostname verification failed %s -> %s" %
+                      (verifier, result[verifier]['nodelist'][failed]))
+        raise errors.OpExecError("ssh/hostname verification failed.")
  
      # Distribute updated /etc/hosts and known_hosts to all nodes,
      # including the node just added
@@ -1639,7 +1615,7 @@ class LUAddNode(LogicalUnit):
            logger.Error("copy of file %s to node %s failed" %
                         (fname, to_node))
  
-    to_copy = ss.GetFileList()
+    to_copy = self.sstore.GetFileList()
      if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
        to_copy.append(constants.VNC_PASSWORD_FILE)
      for fname in to_copy:
@@ -1650,6 +1626,8 @@ class LUAddNode(LogicalUnit):
      if not self.op.readd:
        logger.Info("adding node %s to cluster.conf" % node)
        self.cfg.AddNode(new_node)
+      # Add the new node to the Ganeti Lock Manager
+      self.context.glm.add(locking.LEVEL_NODE, node)
  
  
  class LUMasterFailover(LogicalUnit):
@@ -2392,6 +2370,8 @@ class LURemoveInstance(LogicalUnit):
      logger.Info("removing instance %s out of cluster config" % instance.name)
  
      self.cfg.RemoveInstance(instance.name)
+    # Remove the instance from the Ganeti Lock Manager
+    self.context.glm.remove(locking.LEVEL_INSTANCE, instance.name)
  
  
  class LUQueryInstances(NoHooksLU):
@@ -3250,6 +3230,8 @@ class LUCreateInstance(LogicalUnit):
      feedback_fn("adding instance %s to cluster config" % instance)
  
      self.cfg.AddInstance(iobj)
+    # Add the new instance to the Ganeti Lock Manager
+    self.context.glm.add(locking.LEVEL_INSTANCE, instance)
  
      if self.op.wait_for_sync:
        disk_abort = not _WaitForSync(self.cfg, iobj, self.proc)
@@ -3264,6 +3246,8 @@ class LUCreateInstance(LogicalUnit):
      if disk_abort:
        _RemoveDisks(iobj, self.cfg)
        self.cfg.RemoveInstance(iobj.name)
+      # Remove the new instance from the Ganeti Lock Manager
+      self.context.glm.remove(locking.LEVEL_INSTANCE, iobj.name)
        raise errors.OpExecError("There are some degraded disks for"
                                 " this instance")
  
@@ -4577,7 +4561,7 @@ class LUDelTags(TagsLU):
  class LUTestDelay(NoHooksLU):
    """Sleep for a specified amount of time.
  
-  This LU sleeps on the master and/or nodes for a specified amoutn of
+  This LU sleeps on the master and/or nodes for a specified amount of
    time.
  
    """