Add output of job/opcode timestamps
diff --git a/daemons/ganeti-masterd b/daemons/ganeti-masterd
old mode 100644
new mode 100755
index b18a8bb..a1dc48d
--- a/daemons/ganeti-masterd
+++ b/daemons/ganeti-masterd
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python -u
 #
 
 # Copyright (C) 2006, 2007 Google Inc.
@@ -27,24 +27,53 @@ inheritance from parent classes requires it.
 """
 
 
+import sys
 import SocketServer
-import threading
 import time
 import collections
 import Queue
 import random
 import signal
 import simplejson
-
+import logging
 
 from cStringIO import StringIO
+from optparse import OptionParser
 
+from ganeti import config
 from ganeti import constants
 from ganeti import mcpu
 from ganeti import opcodes
 from ganeti import jqueue
+from ganeti import locking
 from ganeti import luxi
 from ganeti import utils
+from ganeti import errors
+from ganeti import ssconf
+from ganeti import logger
+from ganeti import workerpool
+from ganeti import rpc
+
+
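+# Number of worker threads processing client requests in parallel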
+CLIENT_REQUEST_WORKERS = 16
+
+EXIT_NOTMASTER = constants.EXIT_NOTMASTER
+EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR
+
+
+class ClientRequestWorker(workerpool.BaseWorker):
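+  """Worker thread for processing a single client request.
+
+  """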
+  def RunTask(self, server, request, client_address):
+    """Process the request.
+
+    This is copied from the code in ThreadingMixIn.
+
+    """
+    try:
+      server.finish_request(request, client_address)
+      server.close_request(request)
+    except:
+      server.handle_error(request, client_address)
+      server.close_request(request)
 
 
 class IOServer(SocketServer.UnixStreamServer):
@@ -55,54 +84,54 @@ class IOServer(SocketServer.UnixStreamServer):
   cleanup at shutdown.
 
   """
-  QUEUE_PROCESSOR_SIZE = 1
-
   def __init__(self, address, rqhandler):
-    SocketServer.UnixStreamServer.__init__(self, address, rqhandler)
-    self.do_quit = False
-    self.queue = jqueue.QueueManager()
-    self.processors = []
-    for i in range(self.QUEUE_PROCESSOR_SIZE):
-      self.processors.append(threading.Thread(target=PoolWorker,
-                                              args=(i, self.queue.new_queue)))
-    for t in self.processors:
-      t.start()
-    signal.signal(signal.SIGINT, self.handle_sigint)
-
-  def process_request_thread(self, request, client_address):
-    """Process the request.
+    """IOServer constructor
 
-    This is copied from the code in ThreadingMixIn.
+    Args:
+      address: the address to bind this IOServer to
+      rqhandler: RequestHandler type object
 
     """
-    try:
-      self.finish_request(request, client_address)
-      self.close_request(request)
-    except:
-      self.handle_error(request, client_address)
-      self.close_request(request)
+    SocketServer.UnixStreamServer.__init__(self, address, rqhandler)
 
-  def process_request(self, request, client_address):
-    """Start a new thread to process the request.
+    # We'll only start threads once we've forked.
+    self.context = None
+    self.request_workers = None
 
-    This is copied from the coode in ThreadingMixIn.
+  def setup_queue(self):
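+    """Initializes the shared context and the request worker pool.
+
+    This is deferred until after the daemonization fork, since threads
+    must not be started before forking.
+
+    """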
+    self.context = GanetiContext()
+    self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS,
+                                                 ClientRequestWorker)
 
-    """
-    t = threading.Thread(target=self.process_request_thread,
-                         args=(request, client_address))
-    t.start()
+  def process_request(self, request, client_address):
+    """Add task to workerpool to process request.
 
-  def handle_sigint(self, signum, frame):
-    print "received %s in %s" % (signum, frame)
-    self.do_quit = True
-    self.server_close()
-    for i in range(self.QUEUE_PROCESSOR_SIZE):
-      self.queue.new_queue.put(None)
+    """
+    self.request_workers.AddTask(self, request, client_address)
 
   def serve_forever(self):
     """Handle one request at a time until told to quit."""
-    while not self.do_quit:
-      self.handle_request()
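+    # The signal handler only records that a signal arrived; the loop
+    # checks the flag after every request and exits cleanly.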
+    sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM])
+    try:
+      while not sighandler.called:
+        self.handle_request()
+    finally:
+      sighandler.Reset()
+
+  def server_cleanup(self):
+    """Cleanup the server.
+
+    This involves shutting down the processor threads and the master
+    socket.
+
+    """
+    try:
+      self.server_close()
+    finally:
+      if self.request_workers:
+        self.request_workers.TerminateWorkers()
+      if self.context:
+        self.context.jobqueue.Shutdown()
 
 
 class ClientRqHandler(SocketServer.BaseRequestHandler):
@@ -119,21 +148,36 @@ class ClientRqHandler(SocketServer.BaseRequestHandler):
     while True:
       msg = self.read_message()
       if msg is None:
-        print "client closed connection"
+        logging.info("client closed connection")
         break
+
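+      # Every message is a JSON-encoded dictionary holding the method
+      # name and arguments under the luxi.KEY_METHOD and luxi.KEY_ARGS keys.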
       request = simplejson.loads(msg)
+      logging.debug("request: %s", request)
       if not isinstance(request, dict):
-        print "wrong request received: %s" % msg
+        logging.error("wrong request received: %s", msg)
         break
-      method = request.get('request', None)
-      data = request.get('data', None)
-      if method is None or data is None:
-        print "no method or data in request"
+
+      method = request.get(luxi.KEY_METHOD, None)
+      args = request.get(luxi.KEY_ARGS, None)
+      if method is None or args is None:
+        logging.error("no method or args in request")
         break
-      print "request:", method, data
-      result = self._ops.handle_request(method, data)
-      print "result:", result
-      self.send_message(simplejson.dumps({'success': True, 'result': result}))
+
+      success = False
+      try:
+        result = self._ops.handle_request(method, args)
+        success = True
+      except:
+        logging.error("Unexpected exception", exc_info=True)
+        err = sys.exc_info()
+        result = "Caught exception: %s" % str(err[1])
+
+      response = {
+        luxi.KEY_SUCCESS: success,
+        luxi.KEY_RESULT: result,
+        }
+      logging.debug("response: %s", response)
+      self.send_message(simplejson.dumps(response))
 
   def read_message(self):
     while not self._msgs:
@@ -154,92 +198,253 @@ class ClientOps:
   """Class holding high-level client operations."""
   def __init__(self, server):
     self.server = server
-    self._cpu = None
-
-  def _getcpu(self):
-    if self._cpu is None:
-      self._cpu = mcpu.Processor(lambda x: None)
-    return self._cpu
-
-  def handle_request(self, operation, args):
-    print operation, args
-    if operation == "submit":
-      return self.put(args)
-    elif operation == "query":
-      return self.query(args)
+
+  def handle_request(self, method, args):
+    queue = self.server.context.jobqueue
+
+    # TODO: Parameter validation
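+    # Job manipulation requests go directly to the job queue; instance,
+    # node and export queries are wrapped in opcodes and executed
+    # synchronously through _Query below.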
+
+    if method == luxi.REQ_SUBMIT_JOB:
+      ops = [opcodes.OpCode.LoadOpCode(state) for state in args]
+      return queue.SubmitJob(ops)
+
+    elif method == luxi.REQ_CANCEL_JOB:
+      job_id = args
+      return queue.CancelJob(job_id)
+
+    elif method == luxi.REQ_ARCHIVE_JOB:
+      job_id = args
+      return queue.ArchiveJob(job_id)
+
+    elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE:
+      (job_id, fields, prev_job_info, prev_log_serial, timeout) = args
+      return queue.WaitForJobChanges(job_id, fields, prev_job_info,
+                                     prev_log_serial, timeout)
+
+    elif method == luxi.REQ_QUERY_JOBS:
+      (job_ids, fields) = args
+      return queue.QueryJobs(job_ids, fields)
+
+    elif method == luxi.REQ_QUERY_INSTANCES:
+      (names, fields) = args
+      op = opcodes.OpQueryInstances(names=names, output_fields=fields)
+      return self._Query(op)
+
+    elif method == luxi.REQ_QUERY_NODES:
+      (names, fields) = args
+      op = opcodes.OpQueryNodes(names=names, output_fields=fields)
+      return self._Query(op)
+
+    elif method == luxi.REQ_QUERY_EXPORTS:
+      nodes = args
+      op = opcodes.OpQueryExports(nodes=nodes)
+      return self._Query(op)
+
     else:
       raise ValueError("Invalid operation")
 
-  def put(self, args):
-    job = luxi.UnserializeJob(args)
-    rid = self.server.queue.put(job)
-    return rid
-
-  def query(self, args):
-    path = args["object"]
-    fields = args["fields"]
-    names = args["names"]
-    if path == "instances":
-      opclass = opcodes.OpQueryInstances
-    elif path == "jobs":
-      # early exit because job query-ing is special (not via opcodes)
-      return self.query_jobs(fields, names)
-    else:
-      raise ValueError("Invalid object %s" % path)
+  def _DummyLog(self, *args):
+    pass
 
-    op = opclass(output_fields = fields, names=names)
-    cpu = self._getcpu()
-    result = cpu.ExecOpCode(op)
-    return result
+  def _Query(self, op):
+    """Runs the specified opcode and returns the result.
 
-  def query_jobs(self, fields, names):
-    return self.server.queue.query_jobs(fields, names)
+    """
+    proc = mcpu.Processor(self.server.context)
+    # TODO: Where should log messages go?
+    return proc.ExecOpCode(op, self._DummyLog)
 
 
-def JobRunner(proc, job):
-  """Job executor.
+class GanetiContext(object):
+  """Context common to all ganeti threads.
 
-  This functions processes a single job in the context of given
-  processor instance.
+  This class creates and holds common objects shared by all threads.
 
   """
-  job.SetStatus(opcodes.Job.STATUS_RUNNING)
-  for op in job.data.op_list:
-    proc.ExecOpCode(op)
-  job.SetStatus(opcodes.Job.STATUS_FINISHED, result=opcodes.Job.RESULT_OK)
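+  # Singleton instance holder, enforced in __init__ and __setattr__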
+  _instance = None
+
+  def __init__(self):
+    """Constructs a new GanetiContext object.
+
+    There should be only one GanetiContext object at any time, so this
+    function raises an error if this is not the case.
+
+    """
+    assert self.__class__._instance is None, "double GanetiContext instance"
+
+    # Create global configuration object
+    self.cfg = config.ConfigWriter()
+
+    # Locking manager
+    self.glm = locking.GanetiLockManager(
+                self.cfg.GetNodeList(),
+                self.cfg.GetInstanceList())
+
+    # Job queue
+    self.jobqueue = jqueue.JobQueue(self)
+
+    # setting this also locks the class against attribute modifications
+    self.__class__._instance = self
+
+  def __setattr__(self, name, value):
+    """Setting GanetiContext attributes is forbidden after initialization.
+
+    """
+    assert self.__class__._instance is None, "Attempt to modify Ganeti Context"
+    object.__setattr__(self, name, value)
+
+  def AddNode(self, node):
+    """Adds a node to the configuration and lock manager.
+
+    """
+    # Add it to the configuration
+    self.cfg.AddNode(node)
+
+    # If preseeding fails it'll not be added
+    self.jobqueue.AddNode(node.name)
+
+    # Add the new node to the Ganeti Lock Manager
+    self.glm.add(locking.LEVEL_NODE, node.name)
+
+  def ReaddNode(self, node):
+    """Updates a node that's already in the configuration
+
+    """
+    # Synchronize the queue again
+    self.jobqueue.AddNode(node.name)
+
+  def RemoveNode(self, name):
+    """Removes a node from the configuration and lock manager.
+
+    """
+    # Remove node from configuration
+    self.cfg.RemoveNode(name)
+
+    # Notify job queue
+    self.jobqueue.RemoveNode(name)
+
+    # Remove the node from the Ganeti Lock Manager
+    self.glm.remove(locking.LEVEL_NODE, name)
 
 
-def PoolWorker(worker_id, incoming_queue):
-  """A worker thread function.
+def ParseOptions():
+  """Parse the command line options.
 
-  This is the actual processor of a single thread of Job execution.
+  Returns:
+    (options, args) as from OptionParser.parse_args()
 
   """
-  while True:
-    print "worker %s sleeping" % worker_id
-    item = incoming_queue.get(True)
-    if item is None:
-      break
-    print "worker %s processing job %s" % (worker_id, item.data.job_id)
-    utils.Lock('cmd')
-    try:
-      proc = mcpu.Processor(feedback=lambda x: None)
-      try:
-        JobRunner(proc, item)
-      except errors.GenericError, err:
-        print "ganeti exception %s" % err
-    finally:
-      utils.Unlock('cmd')
-      utils.LockCleanup()
-    print "worker %s finish job %s" % (worker_id, item.data.job_id)
-  print "worker %s exiting" % worker_id
+  parser = OptionParser(description="Ganeti master daemon",
+                        usage="%prog [-f] [-d]",
+                        version="%%prog (ganeti) %s" %
+                        constants.RELEASE_VERSION)
+
+  parser.add_option("-f", "--foreground", dest="fork",
+                    help="Don't detach from the current terminal",
+                    default=True, action="store_false")
+  parser.add_option("-d", "--debug", dest="debug",
+                    help="Enable some debug messages",
+                    default=False, action="store_true")
+  options, args = parser.parse_args()
+  return options, args
+
+
+def CheckAgreement():
+  """Check the agreement on who is the master.
+
+  The function uses a very simple algorithm: we must get more positive
+  than negative answers. Since in most cases we are the master,
+  we'll use our own config file for getting the node list. In the
+  future we could collect the current node list from our (possibly
+  obsolete) known nodes.
+
+  """
+  myself = utils.HostInfo().name
+  #temp instantiation of a config writer, used only to get the node list
+  cfg = config.ConfigWriter()
+  node_list = cfg.GetNodeList()
+  del cfg
+  try:
+    node_list.remove(myself)
+  except ValueError:
+    pass
+  if not node_list:
+    # either single node cluster, or a misconfiguration, but I won't
+    # break any other node, so I can proceed
+    return True
+  results = rpc.call_master_info(node_list)
+  if not isinstance(results, dict):
+    # this should not happen (unless internal error in rpc)
+    logging.critical("Can't complete rpc call, aborting master startup")
+    return False
+  positive = negative = 0
+  other_masters = {}
+  for node in results:
+    if not isinstance(results[node], (tuple, list)) or len(results[node]) < 3:
+      logging.warning("Can't contact node %s", node)
+      continue
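+    # the third field of the reply is that node's view of the current master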
+    master_node = results[node][2]
+    if master_node == myself:
+      positive += 1
+    else:
+      negative += 1
+      if master_node not in other_masters:
+        other_masters[master_node] = 0
+      other_masters[master_node] += 1
+  if positive <= negative:
+    # bad!
+    logging.critical("It seems we are not the master (%d votes for,"
+                     " %d votes against)", positive, negative)
+    if len(other_masters) > 1:
+      logging.critical("The other nodes do not agree on a single master")
+    elif other_masters:
+      # TODO: resync my files from the master
+      logging.critical("It seems the real master is %s",
+                       other_masters.keys()[0])
+    else:
+      logging.critical("Can't contact any node for data, aborting startup")
+    return False
+  return True
 
 
 def main():
   """Main function"""
 
+  options, args = ParseOptions()
+  utils.debug = options.debug
+  utils.no_fork = True
+
+  ssconf.CheckMaster(options.debug)
+
+  # we believe we are the master, let's ask the other nodes...
+  if not CheckAgreement():
+    return
+
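+  # Create the server socket before daemonizing so that its file
+  # descriptor can be kept open across Daemonize (noclose_fds below).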
   master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)
-  master.serve_forever()
+
+  # become a daemon
+  if options.fork:
+    utils.Daemonize(logfile=constants.LOG_MASTERDAEMON,
+                    noclose_fds=[master.fileno()])
+
+  utils.WritePidFile(constants.MASTERD_PID)
+
+  logger.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug,
+                      stderr_logging=not options.fork)
+
+  logging.info("ganeti master daemon startup")
+
+  # activate ip
+  master_node = ssconf.SimpleStore().GetMasterNode()
+  if not rpc.call_node_start_master(master_node, False):
+    logging.error("Can't activate master IP address")
+
+  master.setup_queue()
+  try:
+    master.serve_forever()
+  finally:
+    master.server_cleanup()
+    utils.RemovePidFile(constants.MASTERD_PID)
 
 
 if __name__ == "__main__":