#!/usr/bin/python -u # # Copyright (C) 2006, 2007 Google Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. """Master daemon program. Some classes deviates from the standard style guide since the inheritance from parent classes requires it. """ import os import errno import sys import SocketServer import time import collections import Queue import random import signal import simplejson import logging from cStringIO import StringIO from optparse import OptionParser from ganeti import config from ganeti import constants from ganeti import mcpu from ganeti import opcodes from ganeti import jqueue from ganeti import locking from ganeti import luxi from ganeti import utils from ganeti import errors from ganeti import ssconf from ganeti import workerpool from ganeti import rpc from ganeti import bootstrap CLIENT_REQUEST_WORKERS = 16 EXIT_NOTMASTER = constants.EXIT_NOTMASTER EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR class ClientRequestWorker(workerpool.BaseWorker): def RunTask(self, server, request, client_address): """Process the request. This is copied from the code in ThreadingMixIn. """ try: server.finish_request(request, client_address) server.close_request(request) except: server.handle_error(request, client_address) server.close_request(request) class IOServer(SocketServer.UnixStreamServer): """IO thread class. This class takes care of initializing the other threads, setting signal handlers (which are processed only in this thread), and doing cleanup at shutdown. """ def __init__(self, address, rqhandler): """IOServer constructor @param address: the address to bind this IOServer to @param rqhandler: RequestHandler type object """ SocketServer.UnixStreamServer.__init__(self, address, rqhandler) # We'll only start threads once we've forked. self.context = None self.request_workers = None def setup_queue(self): self.context = GanetiContext() self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS, ClientRequestWorker) def process_request(self, request, client_address): """Add task to workerpool to process request. """ self.request_workers.AddTask(self, request, client_address) def serve_forever(self): """Handle one request at a time until told to quit.""" sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM]) try: while not sighandler.called: self.handle_request() finally: sighandler.Reset() def server_cleanup(self): """Cleanup the server. This involves shutting down the processor threads and the master socket. """ try: self.server_close() finally: if self.request_workers: self.request_workers.TerminateWorkers() if self.context: self.context.jobqueue.Shutdown() class ClientRqHandler(SocketServer.BaseRequestHandler): """Client handler""" EOM = '\3' READ_SIZE = 4096 def setup(self): self._buffer = "" self._msgs = collections.deque() self._ops = ClientOps(self.server) def handle(self): while True: msg = self.read_message() if msg is None: logging.info("client closed connection") break request = simplejson.loads(msg) logging.debug("request: %s", request) if not isinstance(request, dict): logging.error("wrong request received: %s", msg) break method = request.get(luxi.KEY_METHOD, None) args = request.get(luxi.KEY_ARGS, None) if method is None or args is None: logging.error("no method or args in request") break success = False try: result = self._ops.handle_request(method, args) success = True except errors.GenericError, err: success = False result = (err.__class__.__name__, err.args) except: logging.error("Unexpected exception", exc_info=True) err = sys.exc_info() result = "Caught exception: %s" % str(err[1]) response = { luxi.KEY_SUCCESS: success, luxi.KEY_RESULT: result, } logging.debug("response: %s", response) self.send_message(simplejson.dumps(response)) def read_message(self): while not self._msgs: data = self.request.recv(self.READ_SIZE) if not data: return None new_msgs = (self._buffer + data).split(self.EOM) self._buffer = new_msgs.pop() self._msgs.extend(new_msgs) return self._msgs.popleft() def send_message(self, msg): #print "sending", msg self.request.sendall(msg + self.EOM) class ClientOps: """Class holding high-level client operations.""" def __init__(self, server): self.server = server def handle_request(self, method, args): queue = self.server.context.jobqueue # TODO: Parameter validation if method == luxi.REQ_SUBMIT_JOB: ops = [opcodes.OpCode.LoadOpCode(state) for state in args] return queue.SubmitJob(ops) elif method == luxi.REQ_CANCEL_JOB: job_id = args return queue.CancelJob(job_id) elif method == luxi.REQ_ARCHIVE_JOB: job_id = args return queue.ArchiveJob(job_id) elif method == luxi.REQ_AUTOARCHIVE_JOBS: (age, timeout) = args return queue.AutoArchiveJobs(age, timeout) elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE: (job_id, fields, prev_job_info, prev_log_serial, timeout) = args return queue.WaitForJobChanges(job_id, fields, prev_job_info, prev_log_serial, timeout) elif method == luxi.REQ_QUERY_JOBS: (job_ids, fields) = args return queue.QueryJobs(job_ids, fields) elif method == luxi.REQ_QUERY_INSTANCES: (names, fields) = args op = opcodes.OpQueryInstances(names=names, output_fields=fields) return self._Query(op) elif method == luxi.REQ_QUERY_NODES: (names, fields) = args op = opcodes.OpQueryNodes(names=names, output_fields=fields) return self._Query(op) elif method == luxi.REQ_QUERY_EXPORTS: nodes = args op = opcodes.OpQueryExports(nodes=nodes) return self._Query(op) elif method == luxi.REQ_QUERY_CONFIG_VALUES: fields = args op = opcodes.OpQueryConfigValues(output_fields=fields) return self._Query(op) elif method == luxi.REQ_QUEUE_SET_DRAIN_FLAG: drain_flag = args return queue.SetDrainFlag(drain_flag) else: raise ValueError("Invalid operation") def _DummyLog(self, *args): pass def _Query(self, op): """Runs the specified opcode and returns the result. """ proc = mcpu.Processor(self.server.context) # TODO: Where should log messages go? return proc.ExecOpCode(op, self._DummyLog, None) class GanetiContext(object): """Context common to all ganeti threads. This class creates and holds common objects shared by all threads. """ _instance = None def __init__(self): """Constructs a new GanetiContext object. There should be only a GanetiContext object at any time, so this function raises an error if this is not the case. """ assert self.__class__._instance is None, "double GanetiContext instance" # Create global configuration object self.cfg = config.ConfigWriter() # Locking manager self.glm = locking.GanetiLockManager( self.cfg.GetNodeList(), self.cfg.GetInstanceList()) # Job queue self.jobqueue = jqueue.JobQueue(self) # setting this also locks the class against attribute modifications self.__class__._instance = self def __setattr__(self, name, value): """Setting GanetiContext attributes is forbidden after initialization. """ assert self.__class__._instance is None, "Attempt to modify Ganeti Context" object.__setattr__(self, name, value) def AddNode(self, node): """Adds a node to the configuration and lock manager. """ # Add it to the configuration self.cfg.AddNode(node) # If preseeding fails it'll not be added self.jobqueue.AddNode(node) # Add the new node to the Ganeti Lock Manager self.glm.add(locking.LEVEL_NODE, node.name) def ReaddNode(self, node): """Updates a node that's already in the configuration """ # Synchronize the queue again self.jobqueue.AddNode(node) def RemoveNode(self, name): """Removes a node from the configuration and lock manager. """ # Remove node from configuration self.cfg.RemoveNode(name) # Notify job queue self.jobqueue.RemoveNode(name) # Remove the node from the Ganeti Lock Manager self.glm.remove(locking.LEVEL_NODE, name) def ParseOptions(): """Parse the command line options. @return: (options, args) as from OptionParser.parse_args() """ parser = OptionParser(description="Ganeti master daemon", usage="%prog [-f] [-d]", version="%%prog (ganeti) %s" % constants.RELEASE_VERSION) parser.add_option("-f", "--foreground", dest="fork", help="Don't detach from the current terminal", default=True, action="store_false") parser.add_option("-d", "--debug", dest="debug", help="Enable some debug messages", default=False, action="store_true") options, args = parser.parse_args() return options, args def CheckAgreement(): """Check the agreement on who is the master. The function uses a very simple algorithm: we must get more positive than negative answers. Since in most of the cases we are the master, we'll use our own config file for getting the node list. In the future we could collect the current node list from our (possibly obsolete) known nodes. In order to account for cold-start of all nodes, we retry for up to a minute until we get a real answer as the top-voted one. If the nodes are more out-of-sync, for now manual startup of the master should be attempted. Note that for a even number of nodes cluster, we need at least half of the nodes (beside ourselves) to vote for us. This creates a problem on two-node clusters, since in this case we require the other node to be up too to confirm our status. """ myself = utils.HostInfo().name #temp instantiation of a config writer, used only to get the node list cfg = config.ConfigWriter() node_list = cfg.GetNodeList() del cfg retries = 6 while retries > 0: votes = bootstrap.GatherMasterVotes(node_list) if not votes: # empty node list, this is a one node cluster return True if votes[0][0] is None: retries -= 1 time.sleep(10) continue break if retries == 0: logging.critical("Cluster inconsistent, most of the nodes didn't answer" " after multiple retries. Aborting startup") return False # here a real node is at the top of the list all_votes = sum(item[1] for item in votes) top_node, top_votes = votes[0] result = False if top_node != myself: logging.critical("It seems we are not the master (top-voted node" " is %s with %d out of %d votes)", top_node, top_votes, all_votes) elif top_votes < all_votes - top_votes: logging.critical("It seems we are not the master (%d votes for," " %d votes against)", top_votes, all_votes - top_votes) else: result = True return result def main(): """Main function""" options, args = ParseOptions() utils.debug = options.debug utils.no_fork = True if options.fork: utils.CloseFDs() rpc.Init() try: ssconf.CheckMaster(options.debug) # we believe we are the master, let's ask the other nodes... if not CheckAgreement(): return dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE), (constants.SOCKET_DIR, constants.SOCKET_DIR_MODE), ] for dir, mode in dirs: try: os.mkdir(dir, mode) except EnvironmentError, err: if err.errno != errno.EEXIST: raise errors.GenericError("Cannot create needed directory" " '%s': %s" % (constants.SOCKET_DIR, err)) if not os.path.isdir(dir): raise errors.GenericError("%s is not a directory" % dir) # This is safe to do as the pid file guarantees against # concurrent execution. utils.RemoveFile(constants.MASTER_SOCKET) master = IOServer(constants.MASTER_SOCKET, ClientRqHandler) finally: rpc.Shutdown() # become a daemon if options.fork: utils.Daemonize(logfile=constants.LOG_MASTERDAEMON) utils.WritePidFile(constants.MASTERD_PID) try: utils.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug, stderr_logging=not options.fork) logging.info("Ganeti master daemon startup") rpc.Init() try: # activate ip master_node = ssconf.SimpleConfigReader().GetMasterNode() if not rpc.RpcRunner.call_node_start_master(master_node, False): logging.error("Can't activate master IP address") master.setup_queue() try: master.serve_forever() finally: master.server_cleanup() finally: rpc.Shutdown() finally: utils.RemovePidFile(constants.MASTERD_PID) utils.RemoveFile(constants.MASTER_SOCKET) if __name__ == "__main__": main()