from ganeti import ssconf
from ganeti import logger
from ganeti import workerpool
+from ganeti import rpc
CLIENT_REQUEST_WORKERS = 16
cleanup at shutdown.
"""
- def __init__(self, address, rqhandler, context):
+ def __init__(self, address, rqhandler):
"""IOServer constructor
Args:
address: the address to bind this IOServer to
rqhandler: RequestHandler type object
- context: Context Object common to all worker threads
"""
SocketServer.UnixStreamServer.__init__(self, address, rqhandler)
- self.do_quit = False
- self.context = context
# We'll only start threads once we've forked.
- self.jobqueue = None
+ self.context = None
self.request_workers = None
- signal.signal(signal.SIGINT, self.handle_quit_signals)
- signal.signal(signal.SIGTERM, self.handle_quit_signals)
-
def setup_queue(self):
- self.jobqueue = jqueue.JobQueue(self.context)
+ self.context = GanetiContext()
self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS,
ClientRequestWorker)
"""
self.request_workers.AddTask(self, request, client_address)
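For context, setup_queue builds a standard worker pool: each incoming connection becomes a task, and one of CLIENT_REQUEST_WORKERS threads serves it. The elided ClientRequestWorker presumably looks roughly like this sketch (inferred from the AddTask call above, assuming workerpool.BaseWorker hands each task's arguments to RunTask; not the actual implementation):

  class ClientRequestWorker(workerpool.BaseWorker):
    def RunTask(self, server, request, client_address):
      # finish_request instantiates ClientRqHandler, which reads the
      # request from the socket and writes the reply
      server.finish_request(request, client_address)
      server.close_request(request)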
- def handle_quit_signals(self, signum, frame):
- print "received %s in %s" % (signum, frame)
- self.do_quit = True
-
def serve_forever(self):
"""Handle one request at a time until told to quit."""
- while not self.do_quit:
- self.handle_request()
- print "served request, quit=%s" % (self.do_quit)
+ sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM])
+ try:
+ while not sighandler.called:
+ self.handle_request()
+ finally:
+ sighandler.Reset()
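utils.SignalHandler replaces the removed do_quit flag and handler. A minimal stand-in for the assumed contract (install handlers that latch a called flag, restore the previous handlers on Reset()) would be:

  import signal

  class SignalHandler(object):
    """Latches receipt of the given signals (illustrative stand-in)."""

    def __init__(self, signums):
      self.called = False
      self._previous = {}
      for signum in signums:
        # remember the old handler so Reset() can restore it
        self._previous[signum] = signal.signal(signum, self._Handle)

    def _Handle(self, signum, frame):
      self.called = True

    def Reset(self):
      for signum, old in self._previous.items():
        signal.signal(signum, old)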
def server_cleanup(self):
"""Cleanup the server.
"""
try:
self.server_close()
- utils.RemoveFile(constants.MASTER_SOCKET)
finally:
if self.request_workers:
self.request_workers.TerminateWorkers()
- if self.jobqueue:
- self.jobqueue.Shutdown()
+ if self.context:
+ self.context.jobqueue.Shutdown()
class ClientRqHandler(SocketServer.BaseRequestHandler):
self.server = server
def handle_request(self, method, args):
- queue = self.server.jobqueue
+ queue = self.server.context.jobqueue
# TODO: Parameter validation
return queue.SubmitJob(ops)
elif method == luxi.REQ_CANCEL_JOB:
- (job_id, ) = args
+ job_id = args
return queue.CancelJob(job_id)
elif method == luxi.REQ_ARCHIVE_JOB:
- (job_id, ) = args
+ job_id = args
return queue.ArchiveJob(job_id)
+ elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE:
+ (job_id, fields, prev_job_info, prev_log_serial, timeout) = args
+ return queue.WaitForJobChanges(job_id, fields, prev_job_info,
+ prev_log_serial, timeout)
+
elif method == luxi.REQ_QUERY_JOBS:
(job_ids, fields) = args
return queue.QueryJobs(job_ids, fields)
+ elif method == luxi.REQ_QUERY_INSTANCES:
+ (names, fields) = args
+ op = opcodes.OpQueryInstances(names=names, output_fields=fields)
+ return self._Query(op)
+
+ elif method == luxi.REQ_QUERY_NODES:
+ (names, fields) = args
+ op = opcodes.OpQueryNodes(names=names, output_fields=fields)
+ return self._Query(op)
+
+ elif method == luxi.REQ_QUERY_EXPORTS:
+ nodes = args
+ op = opcodes.OpQueryExports(nodes=nodes)
+ return self._Query(op)
+
else:
raise ValueError("Invalid operation")
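Each branch above maps to one client-side call: a method constant plus an args tuple travels over the master socket, and the return value comes back. Hypothetical usage (the Client constructor and CallMethod name are assumptions about ganeti.luxi, not verified against it):

  client = luxi.Client(address=constants.MASTER_SOCKET)  # hypothetical ctor
  # args must match the unpacking above; REQ_QUERY_JOBS expects
  # a (job_ids, fields) pair:
  jobs = client.CallMethod(luxi.REQ_QUERY_JOBS, (None, ["id", "status"]))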
+ def _DummyLog(self, *args):
+ pass
+
+ def _Query(self, op):
+ """Runs the specified opcode and returns the result.
+
+ """
+ proc = mcpu.Processor(self.server.context)
+ # TODO: Where should log messages go?
+ return proc.ExecOpCode(op, self._DummyLog)
+
class GanetiContext(object):
"""Context common to all ganeti threads.
"""
assert self.__class__._instance is None, "double GanetiContext instance"
- # Create a ConfigWriter...
+ # Create global configuration object
self.cfg = config.ConfigWriter()
- # And a GanetiLockingManager...
+
+ # Locking manager
self.glm = locking.GanetiLockManager(
self.cfg.GetNodeList(),
self.cfg.GetInstanceList())
+ # Job queue
+ self.jobqueue = jqueue.JobQueue(self)
+
# setting this also locks the class against attribute modifications
self.__class__._instance = self
assert self.__class__._instance is None, "Attempt to modify Ganeti Context"
object.__setattr__(self, name, value)
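The context lines above show only fragments of the write-once pattern; spelled out as a self-contained toy (not the Ganeti code), it works like this:

  class WriteOnce(object):
    _instance = None

    def __init__(self):
      assert self.__class__._instance is None, "double instance"
      self.data = {}                  # allowed: _instance is still None
      # this assigns on the class, so the guarded __setattr__ is bypassed
      self.__class__._instance = self

    def __setattr__(self, name, value):
      # once _instance is set, any further attribute write trips this
      assert self.__class__._instance is None, "attempt to modify"
      object.__setattr__(self, name, value)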
-def CheckMaster(debug):
-  """Checks the node setup.
-  If this is the master, the function will return. Otherwise it will
-  exit with an exit code based on the node status.
-  """
-  try:
-    ss = ssconf.SimpleStore()
-    master_name = ss.GetMasterNode()
-  except errors.ConfigurationError, err:
-    print "Cluster configuration incomplete: '%s'" % str(err)
-    sys.exit(EXIT_NODESETUP_ERROR)
-  try:
-    myself = utils.HostInfo()
-  except errors.ResolverError, err:
-    sys.stderr.write("Cannot resolve my own name (%s)\n" % err.args[0])
-    sys.exit(EXIT_NODESETUP_ERROR)
-  if myself.name != master_name:
-    if debug:
-      sys.stderr.write("Not master, exiting.\n")
-    sys.exit(EXIT_NOTMASTER)
+  def AddNode(self, node):
+    """Adds a node to the configuration and lock manager.
+
+    """
+    # Add it to the configuration
+    self.cfg.AddNode(node)
+
+    # If preseeding fails it'll not be added
+    self.jobqueue.AddNode(node.name)
+
+    # Add the new node to the Ganeti Lock Manager
+    self.glm.add(locking.LEVEL_NODE, node.name)
+
+  def ReaddNode(self, node):
+    """Updates a node that's already in the configuration.
+
+    """
+    # Synchronize the queue again
+    self.jobqueue.AddNode(node.name)
+
+  def RemoveNode(self, name):
+    """Removes a node from the configuration and lock manager.
+
+    """
+    # Remove node from configuration
+    self.cfg.RemoveNode(name)
+
+    # Notify job queue
+    self.jobqueue.RemoveNode(name)
+
+    # Remove the node from the Ganeti Lock Manager
+    self.glm.remove(locking.LEVEL_NODE, name)
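Together the three hooks give logical units a single entry point that keeps configuration, job queue and lock manager consistent. A hedged usage sketch (the node name and the objects.Node call are illustrative only, and assume ganeti.objects is imported):

  context = GanetiContext()
  node = objects.Node(name="node6.example.com")   # illustrative
  context.AddNode(node)     # cfg, job queue and glm updated together
  # and later, the mirror image:
  context.RemoveNode(node.name)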
def ParseOptions():
return options, args
+def CheckAgreement():
+ """Check the agreement on who is the master.
+
+  The function uses a very simple algorithm: we must get more positive
+  than negative answers. Since in most cases we are the master, we'll
+  use our own config file for getting the node list. In the future we
+  could collect the current node list from our (possibly obsolete)
+  known nodes.
+
+ """
+ myself = utils.HostInfo().name
+  # Temporary config writer instantiation, just to read the node list
+ cfg = config.ConfigWriter()
+ node_list = cfg.GetNodeList()
+ del cfg
+ try:
+ node_list.remove(myself)
+  except ValueError:
+ pass
+ if not node_list:
+ # either single node cluster, or a misconfiguration, but I won't
+ # break any other node, so I can proceed
+ return True
+ results = rpc.call_master_info(node_list)
+ if not isinstance(results, dict):
+ # this should not happen (unless internal error in rpc)
+ logging.critical("Can't complete rpc call, aborting master startup")
+ return False
+ positive = negative = 0
+ other_masters = {}
+ for node in results:
+ if not isinstance(results[node], (tuple, list)) or len(results[node]) < 3:
+ logging.warning("Can't contact node %s", node)
+ continue
+ master_node = results[node][2]
+ if master_node == myself:
+ positive += 1
+ else:
+ negative += 1
+      if master_node not in other_masters:
+ other_masters[master_node] = 0
+ other_masters[master_node] += 1
+ if positive <= negative:
+ # bad!
+ logging.critical("It seems we are not the master (%d votes for,"
+ " %d votes against)", positive, negative)
+ if len(other_masters) > 1:
+ logging.critical("The other nodes do not agree on a single master")
+ elif other_masters:
+ # TODO: resync my files from the master
+ logging.critical("It seems the real master is %s",
+ other_masters.keys()[0])
+ else:
+ logging.critical("Can't contact any node for data, aborting startup")
+ return False
+ return True
+
+
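To make the tally concrete, here is the counting logic rehashed standalone with made-up replies; only index 2 of each reply (the claimed master) matters, so the other fields are left as None:

  def _CountVotes(results, myself):
    # sketch of the tally in CheckAgreement, not the real function
    positive = negative = 0
    for node in results:
      reply = results[node]
      if not isinstance(reply, (tuple, list)) or len(reply) < 3:
        continue                      # unreachable node, doesn't vote
      if reply[2] == myself:
        positive += 1
      else:
        negative += 1
    return positive > negative

  # Two peers vote for node1, one for node5, one is down: 2 > 1, proceed.
  votes = {"node2": (None, None, "node1"),
           "node3": (None, None, "node1"),
           "node4": (None, None, "node5"),
           "node5": None}
  assert _CountVotes(votes, "node1")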
def main():
"""Main function"""
utils.debug = options.debug
utils.no_fork = True
- CheckMaster(options.debug)
+ ssconf.CheckMaster(options.debug)
+
+ # we believe we are the master, let's ask the other nodes...
+ if not CheckAgreement():
+ return
- master = IOServer(constants.MASTER_SOCKET, ClientRqHandler, GanetiContext())
+ master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)
# become a daemon
if options.fork:
utils.Daemonize(logfile=constants.LOG_MASTERDAEMON,
noclose_fds=[master.fileno()])
- logger.SetupDaemon(constants.LOG_MASTERDAEMON, debug=options.debug,
- stderr_logging=not options.fork)
+ utils.WritePidFile(constants.MASTERD_PID)
+
+ logger.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug,
+ stderr_logging=not options.fork)
logging.info("ganeti master daemon startup")
+ # activate ip
+ master_node = ssconf.SimpleStore().GetMasterNode()
+ if not rpc.call_node_start_master(master_node, False):
+ logging.error("Can't activate master IP address")
+
master.setup_queue()
try:
master.serve_forever()
finally:
master.server_cleanup()
+ utils.RemovePidFile(constants.MASTERD_PID)
if __name__ == "__main__":