-#!/usr/bin/python -u
+#!/usr/bin/python
#
# Copyright (C) 2006, 2007 Google Inc.
"""
+# pylint: disable-msg=C0103
+# C0103: Invalid name ganeti-masterd
import sys
import SocketServer
import time
import collections
-import Queue
-import random
import signal
-import simplejson
import logging
-from cStringIO import StringIO
from optparse import OptionParser
from ganeti import config
from ganeti import constants
+from ganeti import daemon
from ganeti import mcpu
from ganeti import opcodes
from ganeti import jqueue
from ganeti import utils
from ganeti import errors
from ganeti import ssconf
-from ganeti import logger
from ganeti import workerpool
from ganeti import rpc
from ganeti import bootstrap
class ClientRequestWorker(workerpool.BaseWorker):
+ # pylint: disable-msg=W0221
def RunTask(self, server, request, client_address):
"""Process the request.
try:
server.finish_request(request, client_address)
server.close_request(request)
- except:
+ except: # pylint: disable-msg=W0702
server.handle_error(request, client_address)
server.close_request(request)
def __init__(self, address, rqhandler):
"""IOServer constructor
- Args:
- address: the address to bind this IOServer to
- rqhandler: RequestHandler type object
+ @param address: the address to bind this IOServer to
+ @param rqhandler: RequestHandler type object
"""
SocketServer.UnixStreamServer.__init__(self, address, rqhandler)
def setup_queue(self):
self.context = GanetiContext()
- self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS,
+ self.request_workers = workerpool.WorkerPool("ClientReq",
+ CLIENT_REQUEST_WORKERS,
ClientRequestWorker)
def process_request(self, request, client_address):
"""Add task to workerpool to process request.
"""
+ (pid, uid, gid) = utils.GetSocketCredentials(request)
+ logging.info("Accepted connection from pid=%s, uid=%s, gid=%s",
+ pid, uid, gid)
+
self.request_workers.AddTask(self, request, client_address)
- def serve_forever(self):
+ def handle_error(self, request, client_address):
+ logging.exception("Error while handling request")
+
+ @utils.SignalHandled([signal.SIGINT, signal.SIGTERM])
+ def serve_forever(self, signal_handlers=None): # pylint: disable-msg=W0221
"""Handle one request at a time until told to quit."""
- sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM])
- try:
- while not sighandler.called:
- self.handle_request()
- finally:
- sighandler.Reset()
+ assert isinstance(signal_handlers, dict) and \
+ len(signal_handlers) > 0, \
+ "Broken SignalHandled decorator"
+ # Since we use SignalHandled only once, the resulting dict will map all
+ # signals to the same handler. We'll just use the first one.
+ sighandler = signal_handlers.values()[0]
+ while not sighandler.called:
+ self.handle_request()
def server_cleanup(self):
"""Cleanup the server.
READ_SIZE = 4096
def setup(self):
+ # pylint: disable-msg=W0201
+ # setup() is the API for initialising this class
self._buffer = ""
self._msgs = collections.deque()
self._ops = ClientOps(self.server)
while True:
msg = self.read_message()
if msg is None:
- logging.info("client closed connection")
+ logging.debug("client closed connection")
break
- request = simplejson.loads(msg)
- logging.debug("request: %s", request)
- if not isinstance(request, dict):
- logging.error("wrong request received: %s", msg)
- break
-
- method = request.get(luxi.KEY_METHOD, None)
- args = request.get(luxi.KEY_ARGS, None)
- if method is None or args is None:
- logging.error("no method or args in request")
- break
+ (method, args) = luxi.ParseRequest(msg)
success = False
try:
result = self._ops.handle_request(method, args)
success = True
except errors.GenericError, err:
- success = False
- result = (err.__class__.__name__, err.args)
+ result = errors.EncodeException(err)
except:
logging.error("Unexpected exception", exc_info=True)
- err = sys.exc_info()
- result = "Caught exception: %s" % str(err[1])
+ result = "Caught exception: %s" % str(sys.exc_info()[1])
- response = {
- luxi.KEY_SUCCESS: success,
- luxi.KEY_RESULT: result,
- }
- logging.debug("response: %s", response)
- self.send_message(simplejson.dumps(response))
+ self.send_message(luxi.FormatResponse(success, result))
def read_message(self):
while not self._msgs:
return self._msgs.popleft()
def send_message(self, msg):
- #print "sending", msg
+ # TODO: sendall is not guaranteed to send everything
self.request.sendall(msg + self.EOM)
def __init__(self, server):
self.server = server
- def handle_request(self, method, args):
+ def handle_request(self, method, args): # pylint: disable-msg=R0911
queue = self.server.context.jobqueue
# TODO: Parameter validation
+ # TODO: Rewrite to not exit in each 'if/elif' branch
+
if method == luxi.REQ_SUBMIT_JOB:
+ logging.info("Received new job")
ops = [opcodes.OpCode.LoadOpCode(state) for state in args]
return queue.SubmitJob(ops)
+ if method == luxi.REQ_SUBMIT_MANY_JOBS:
+ logging.info("Received multiple jobs")
+ jobs = []
+ for ops in args:
+ jobs.append([opcodes.OpCode.LoadOpCode(state) for state in ops])
+ return queue.SubmitManyJobs(jobs)
+
elif method == luxi.REQ_CANCEL_JOB:
job_id = args
+ logging.info("Received job cancel request for %s", job_id)
return queue.CancelJob(job_id)
elif method == luxi.REQ_ARCHIVE_JOB:
job_id = args
+ logging.info("Received job archive request for %s", job_id)
return queue.ArchiveJob(job_id)
elif method == luxi.REQ_AUTOARCHIVE_JOBS:
- age = args
- return queue.AutoArchiveJobs(age)
+ (age, timeout) = args
+ logging.info("Received job autoarchive request for age %s, timeout %s",
+ age, timeout)
+ return queue.AutoArchiveJobs(age, timeout)
elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE:
(job_id, fields, prev_job_info, prev_log_serial, timeout) = args
+ logging.info("Received job poll request for %s", job_id)
return queue.WaitForJobChanges(job_id, fields, prev_job_info,
prev_log_serial, timeout)
elif method == luxi.REQ_QUERY_JOBS:
(job_ids, fields) = args
+ if isinstance(job_ids, (tuple, list)) and job_ids:
+ msg = utils.CommaJoin(job_ids)
+ else:
+ msg = str(job_ids)
+ logging.info("Received job query request for %s", msg)
return queue.QueryJobs(job_ids, fields)
elif method == luxi.REQ_QUERY_INSTANCES:
- (names, fields) = args
- op = opcodes.OpQueryInstances(names=names, output_fields=fields)
+ (names, fields, use_locking) = args
+ logging.info("Received instance query request for %s", names)
+ if use_locking:
+ raise errors.OpPrereqError("Sync queries are not allowed",
+ errors.ECODE_INVAL)
+ op = opcodes.OpQueryInstances(names=names, output_fields=fields,
+ use_locking=use_locking)
return self._Query(op)
elif method == luxi.REQ_QUERY_NODES:
- (names, fields) = args
- op = opcodes.OpQueryNodes(names=names, output_fields=fields)
+ (names, fields, use_locking) = args
+ logging.info("Received node query request for %s", names)
+ if use_locking:
+ raise errors.OpPrereqError("Sync queries are not allowed",
+ errors.ECODE_INVAL)
+ op = opcodes.OpQueryNodes(names=names, output_fields=fields,
+ use_locking=use_locking)
return self._Query(op)
elif method == luxi.REQ_QUERY_EXPORTS:
- nodes = args
- op = opcodes.OpQueryExports(nodes=nodes)
+ nodes, use_locking = args
+ if use_locking:
+ raise errors.OpPrereqError("Sync queries are not allowed",
+ errors.ECODE_INVAL)
+ logging.info("Received exports query request")
+ op = opcodes.OpQueryExports(nodes=nodes, use_locking=use_locking)
return self._Query(op)
elif method == luxi.REQ_QUERY_CONFIG_VALUES:
fields = args
+ logging.info("Received config values query request for %s", fields)
op = opcodes.OpQueryConfigValues(output_fields=fields)
return self._Query(op)
+ elif method == luxi.REQ_QUERY_CLUSTER_INFO:
+ logging.info("Received cluster info query request")
+ op = opcodes.OpQueryClusterInfo()
+ return self._Query(op)
+
+ elif method == luxi.REQ_QUERY_TAGS:
+ kind, name = args
+ logging.info("Received tags query request")
+ op = opcodes.OpGetTags(kind=kind, name=name)
+ return self._Query(op)
+
elif method == luxi.REQ_QUEUE_SET_DRAIN_FLAG:
drain_flag = args
+ logging.info("Received queue drain flag change request to %s",
+ drain_flag)
return queue.SetDrainFlag(drain_flag)
- else:
- raise ValueError("Invalid operation")
+ elif method == luxi.REQ_SET_WATCHER_PAUSE:
+ (until, ) = args
+
+ if until is None:
+ logging.info("Received request to no longer pause the watcher")
+ else:
+ if not isinstance(until, (int, float)):
+ raise TypeError("Duration must be an integer or float")
+
+ if until < time.time():
+ raise errors.GenericError("Unable to set pause end time in the past")
+
+ logging.info("Received request to pause the watcher until %s", until)
- def _DummyLog(self, *args):
- pass
+ return _SetWatcherPause(until)
+
+ else:
+ logging.info("Received invalid request '%s'", method)
+ raise ValueError("Invalid operation '%s'" % method)
def _Query(self, op):
"""Runs the specified opcode and returns the result.
"""
- proc = mcpu.Processor(self.server.context)
- # TODO: Where should log messages go?
- return proc.ExecOpCode(op, self._DummyLog, None)
+ # Queries don't have a job id
+ proc = mcpu.Processor(self.server.context, None)
+ return proc.ExecOpCode(op, None)
class GanetiContext(object):
This class creates and holds common objects shared by all threads.
"""
+ # pylint: disable-msg=W0212
+ # we do want to ensure a singleton here
_instance = None
def __init__(self):
assert self.__class__._instance is None, "Attempt to modify Ganeti Context"
object.__setattr__(self, name, value)
- def AddNode(self, node):
+ def AddNode(self, node, ec_id):
"""Adds a node to the configuration and lock manager.
"""
# Add it to the configuration
- self.cfg.AddNode(node)
+ self.cfg.AddNode(node, ec_id)
# If preseeding fails it'll not be added
- self.jobqueue.AddNode(node.name)
+ self.jobqueue.AddNode(node)
# Add the new node to the Ganeti Lock Manager
self.glm.add(locking.LEVEL_NODE, node.name)
"""
# Synchronize the queue again
- self.jobqueue.AddNode(node.name)
+ self.jobqueue.AddNode(node)
def RemoveNode(self, name):
"""Removes a node from the configuration and lock manager.
self.glm.remove(locking.LEVEL_NODE, name)
-def ParseOptions():
- """Parse the command line options.
+def _SetWatcherPause(until):
+ """Creates or removes the watcher pause file.
- Returns:
- (options, args) as from OptionParser.parse_args()
+ @type until: None or int
+ @param until: Unix timestamp saying until when the watcher shouldn't run
"""
- parser = OptionParser(description="Ganeti master daemon",
- usage="%prog [-f] [-d]",
- version="%%prog (ganeti) %s" %
- constants.RELEASE_VERSION)
+ if until is None:
+ utils.RemoveFile(constants.WATCHER_PAUSEFILE)
+ else:
+ utils.WriteFile(constants.WATCHER_PAUSEFILE,
+ data="%d\n" % (until, ))
- parser.add_option("-f", "--foreground", dest="fork",
- help="Don't detach from the current terminal",
- default=True, action="store_false")
- parser.add_option("-d", "--debug", dest="debug",
- help="Enable some debug messages",
- default=False, action="store_true")
- options, args = parser.parse_args()
- return options, args
+ return until
def CheckAgreement():
continue
break
if retries == 0:
- logging.critical("Cluster inconsistent, most of the nodes didn't answer"
- " after multiple retries. Aborting startup")
- return False
+ logging.critical("Cluster inconsistent, most of the nodes didn't answer"
+ " after multiple retries. Aborting startup")
+ logging.critical("Use the --no-voting option if you understand what"
+ " effects it has on the cluster state")
+ return False
# here a real node is at the top of the list
all_votes = sum(item[1] for item in votes)
top_node, top_votes = votes[0]
+
result = False
if top_node != myself:
logging.critical("It seems we are not the master (top-voted node"
- " is %s)", top_node)
+ " is %s with %d out of %d votes)", top_node, top_votes,
+ all_votes)
elif top_votes < all_votes - top_votes:
logging.critical("It seems we are not the master (%d votes for,"
" %d votes against)", top_votes, all_votes - top_votes)
return result
-def main():
- """Main function"""
+def CheckAgreementWithRpc():
+ rpc.Init()
+ try:
+ return CheckAgreement()
+ finally:
+ rpc.Shutdown()
- options, args = ParseOptions()
- utils.debug = options.debug
- utils.no_fork = True
+
+def CheckMasterd(options, args):
+ """Initial checks whether to run or exit with a failure.
+
+ """
+ if args: # masterd doesn't take any arguments
+ print >> sys.stderr, ("Usage: %s [-f] [-d]" % sys.argv[0])
+ sys.exit(constants.EXIT_FAILURE)
ssconf.CheckMaster(options.debug)
- # we believe we are the master, let's ask the other nodes...
- if not CheckAgreement():
- return
+ # If CheckMaster didn't fail we believe we are the master, but we have to
+ # confirm with the other nodes.
+ if options.no_voting:
+ if options.yes_do_it:
+ return
- master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)
+ sys.stdout.write("The 'no voting' option has been selected.\n")
+ sys.stdout.write("This is dangerous, please confirm by"
+ " typing uppercase 'yes': ")
+ sys.stdout.flush()
+
+ confirmation = sys.stdin.readline().strip()
+ if confirmation != "YES":
+ print >> sys.stderr, "Aborting."
+ sys.exit(constants.EXIT_FAILURE)
- # become a daemon
- if options.fork:
- utils.Daemonize(logfile=constants.LOG_MASTERDAEMON,
- noclose_fds=[master.fileno()])
+ return
- utils.WritePidFile(constants.MASTERD_PID)
+ # CheckAgreement uses RPC and threads, hence it needs to be run in a separate
+ # process before we call utils.Daemonize in the current process.
+ if not utils.RunInSeparateProcess(CheckAgreementWithRpc):
+ sys.exit(constants.EXIT_FAILURE)
- logger.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug,
- stderr_logging=not options.fork)
- logging.info("ganeti master daemon startup")
+def ExecMasterd(options, args): # pylint: disable-msg=W0613
+ """Main master daemon function, executed with the PID file held.
- # activate ip
- master_node = ssconf.SimpleConfigReader().GetMasterNode()
- if not rpc.RpcRunner.call_node_start_master(master_node, False):
- logging.error("Can't activate master IP address")
+ """
+ # This is safe to do as the pid file guarantees against
+ # concurrent execution.
+ utils.RemoveFile(constants.MASTER_SOCKET)
- master.setup_queue()
+ master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)
try:
- master.serve_forever()
+ rpc.Init()
+ try:
+ # activate ip
+ master_node = ssconf.SimpleStore().GetMasterNode()
+ result = rpc.RpcRunner.call_node_start_master(master_node, False, False)
+ msg = result.fail_msg
+ if msg:
+ logging.error("Can't activate master IP address: %s", msg)
+
+ master.setup_queue()
+ try:
+ master.serve_forever()
+ finally:
+ master.server_cleanup()
+ finally:
+ rpc.Shutdown()
finally:
- master.server_cleanup()
- utils.RemovePidFile(constants.MASTERD_PID)
+ utils.RemoveFile(constants.MASTER_SOCKET)
+
+
+def main():
+ """Main function"""
+ parser = OptionParser(description="Ganeti master daemon",
+ usage="%prog [-f] [-d]",
+ version="%%prog (ganeti) %s" %
+ constants.RELEASE_VERSION)
+ parser.add_option("--no-voting", dest="no_voting",
+ help="Do not check that the nodes agree on this node"
+ " being the master and start the daemon unconditionally",
+ default=False, action="store_true")
+ parser.add_option("--yes-do-it", dest="yes_do_it",
+ help="Override interactive check for --no-voting",
+ default=False, action="store_true")
+ dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE),
+ (constants.SOCKET_DIR, constants.SOCKET_DIR_MODE),
+ ]
+ daemon.GenericMain(constants.MASTERD, parser, dirs,
+ CheckMasterd, ExecMasterd,
+ multithreaded=True)
if __name__ == "__main__":