X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/3620598182387c981a38fbfaf1f92d2940dec62b..5dc626fd898ea59855e93d778834ab64144b2fdf:/daemons/ganeti-masterd diff --git a/daemons/ganeti-masterd b/daemons/ganeti-masterd index a1dc48d..ac0af6d 100755 --- a/daemons/ganeti-masterd +++ b/daemons/ganeti-masterd @@ -27,6 +27,8 @@ inheritance from parent classes requires it. """ +import os +import errno import sys import SocketServer import time @@ -50,9 +52,9 @@ from ganeti import luxi from ganeti import utils from ganeti import errors from ganeti import ssconf -from ganeti import logger from ganeti import workerpool from ganeti import rpc +from ganeti import bootstrap CLIENT_REQUEST_WORKERS = 16 @@ -87,9 +89,8 @@ class IOServer(SocketServer.UnixStreamServer): def __init__(self, address, rqhandler): """IOServer constructor - Args: - address: the address to bind this IOServer to - rqhandler: RequestHandler type object + @param address: the address to bind this IOServer to + @param rqhandler: RequestHandler type object """ SocketServer.UnixStreamServer.__init__(self, address, rqhandler) @@ -167,6 +168,9 @@ class ClientRqHandler(SocketServer.BaseRequestHandler): try: result = self._ops.handle_request(method, args) success = True + except errors.GenericError, err: + success = False + result = (err.__class__.__name__, err.args) except: logging.error("Unexpected exception", exc_info=True) err = sys.exc_info() @@ -216,6 +220,10 @@ class ClientOps: job_id = args return queue.ArchiveJob(job_id) + elif method == luxi.REQ_AUTOARCHIVE_JOBS: + (age, timeout) = args + return queue.AutoArchiveJobs(age, timeout) + elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE: (job_id, fields, prev_job_info, prev_log_serial, timeout) = args return queue.WaitForJobChanges(job_id, fields, prev_job_info, @@ -240,6 +248,15 @@ class ClientOps: op = opcodes.OpQueryExports(nodes=nodes) return self._Query(op) + elif method == luxi.REQ_QUERY_CONFIG_VALUES: + fields = args + op = opcodes.OpQueryConfigValues(output_fields=fields) + return self._Query(op) + + elif method == luxi.REQ_QUEUE_SET_DRAIN_FLAG: + drain_flag = args + return queue.SetDrainFlag(drain_flag) + else: raise ValueError("Invalid operation") @@ -252,7 +269,7 @@ class ClientOps: """ proc = mcpu.Processor(self.server.context) # TODO: Where should log messages go? - return proc.ExecOpCode(op, self._DummyLog) + return proc.ExecOpCode(op, self._DummyLog, None) class GanetiContext(object): @@ -301,7 +318,7 @@ class GanetiContext(object): self.cfg.AddNode(node) # If preseeding fails it'll not be added - self.jobqueue.AddNode(node.name) + self.jobqueue.AddNode(node) # Add the new node to the Ganeti Lock Manager self.glm.add(locking.LEVEL_NODE, node.name) @@ -311,7 +328,7 @@ class GanetiContext(object): """ # Synchronize the queue again - self.jobqueue.AddNode(node.name) + self.jobqueue.AddNode(node) def RemoveNode(self, name): """Removes a node from the configuration and lock manager. @@ -330,8 +347,7 @@ class GanetiContext(object): def ParseOptions(): """Parse the command line options. - Returns: - (options, args) as from OptionParser.parse_args() + @return: (options, args) as from OptionParser.parse_args() """ parser = OptionParser(description="Ganeti master daemon", @@ -358,53 +374,52 @@ def CheckAgreement(): future we could collect the current node list from our (possibly obsolete) known nodes. + In order to account for cold-start of all nodes, we retry for up to + a minute until we get a real answer as the top-voted one. If the + nodes are more out-of-sync, for now manual startup of the master + should be attempted. + + Note that for a even number of nodes cluster, we need at least half + of the nodes (beside ourselves) to vote for us. This creates a + problem on two-node clusters, since in this case we require the + other node to be up too to confirm our status. + """ myself = utils.HostInfo().name #temp instantiation of a config writer, used only to get the node list cfg = config.ConfigWriter() node_list = cfg.GetNodeList() del cfg - try: - node_list.remove(myself) - except KeyError: - pass - if not node_list: - # either single node cluster, or a misconfiguration, but I won't - # break any other node, so I can proceed - return True - results = rpc.call_master_info(node_list) - if not isinstance(results, dict): - # this should not happen (unless internal error in rpc) - logging.critical("Can't complete rpc call, aborting master startup") - return False - positive = negative = 0 - other_masters = {} - for node in results: - if not isinstance(results[node], (tuple, list)) or len(results[node]) < 3: - logging.warning("Can't contact node %s", node) + retries = 6 + while retries > 0: + votes = bootstrap.GatherMasterVotes(node_list) + if not votes: + # empty node list, this is a one node cluster + return True + if votes[0][0] is None: + retries -= 1 + time.sleep(10) continue - master_node = results[node][2] - if master_node == myself: - positive += 1 - else: - negative += 1 - if not master_node in other_masters: - other_masters[master_node] = 0 - other_masters[master_node] += 1 - if positive <= negative: - # bad! - logging.critical("It seems we are not the master (%d votes for," - " %d votes against)", positive, negative) - if len(other_masters) > 1: - logging.critical("The other nodes do not agree on a single master") - elif other_masters: - # TODO: resync my files from the master - logging.critical("It seems the real master is %s", - other_masters.keys()[0]) - else: - logging.critical("Can't contact any node for data, aborting startup") + break + if retries == 0: + logging.critical("Cluster inconsistent, most of the nodes didn't answer" + " after multiple retries. Aborting startup") return False - return True + # here a real node is at the top of the list + all_votes = sum(item[1] for item in votes) + top_node, top_votes = votes[0] + result = False + if top_node != myself: + logging.critical("It seems we are not the master (top-voted node" + " is %s with %d out of %d votes)", top_node, top_votes, + all_votes) + elif top_votes < all_votes - top_votes: + logging.critical("It seems we are not the master (%d votes for," + " %d votes against)", top_votes, all_votes - top_votes) + else: + result = True + + return result def main(): @@ -414,37 +429,66 @@ def main(): utils.debug = options.debug utils.no_fork = True - ssconf.CheckMaster(options.debug) + if options.fork: + utils.CloseFDs() + + rpc.Init() + try: + ssconf.CheckMaster(options.debug) - # we believe we are the master, let's ask the other nodes... - if not CheckAgreement(): - return + # we believe we are the master, let's ask the other nodes... + if not CheckAgreement(): + return - master = IOServer(constants.MASTER_SOCKET, ClientRqHandler) + dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE), + (constants.SOCKET_DIR, constants.SOCKET_DIR_MODE), + ] + for dir, mode in dirs: + try: + os.mkdir(dir, mode) + except EnvironmentError, err: + if err.errno != errno.EEXIST: + raise errors.GenericError("Cannot create needed directory" + " '%s': %s" % (constants.SOCKET_DIR, err)) + if not os.path.isdir(dir): + raise errors.GenericError("%s is not a directory" % dir) + + # This is safe to do as the pid file guarantees against + # concurrent execution. + utils.RemoveFile(constants.MASTER_SOCKET) + + master = IOServer(constants.MASTER_SOCKET, ClientRqHandler) + finally: + rpc.Shutdown() # become a daemon if options.fork: - utils.Daemonize(logfile=constants.LOG_MASTERDAEMON, - noclose_fds=[master.fileno()]) + utils.Daemonize(logfile=constants.LOG_MASTERDAEMON) utils.WritePidFile(constants.MASTERD_PID) + try: + utils.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug, + stderr_logging=not options.fork) - logger.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug, - stderr_logging=not options.fork) - - logging.info("ganeti master daemon startup") + logging.info("Ganeti master daemon startup") - # activate ip - master_node = ssconf.SimpleStore().GetMasterNode() - if not rpc.call_node_start_master(master_node, False): - logging.error("Can't activate master IP address") + rpc.Init() + try: + # activate ip + master_node = ssconf.SimpleConfigReader().GetMasterNode() + if not rpc.RpcRunner.call_node_start_master(master_node, False): + logging.error("Can't activate master IP address") - master.setup_queue() - try: - master.serve_forever() + master.setup_queue() + try: + master.serve_forever() + finally: + master.server_cleanup() + finally: + rpc.Shutdown() finally: - master.server_cleanup() utils.RemovePidFile(constants.MASTERD_PID) + utils.RemoveFile(constants.MASTER_SOCKET) if __name__ == "__main__":