X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/7b195d9b587a0c8a2622f9e067875736d9af2657..aad81f98ca9fca86aeb5292ae87dca300fc6b83a:/daemons/ganeti-watcher diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 78de6c9..074fc3b 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -1,7 +1,7 @@ #!/usr/bin/python # -# Copyright (C) 2006, 2007 Google Inc. +# Copyright (C) 2006, 2007, 2008 Google Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -29,17 +29,20 @@ by a node reboot. Run from cron or similar. import os import sys -import re import time import fcntl import errno -import simplejson +import logging from optparse import OptionParser from ganeti import utils from ganeti import constants +from ganeti import serializer from ganeti import ssconf from ganeti import errors +from ganeti import opcodes +from ganeti import logger +from ganeti import cli MAXTRIES = 5 @@ -52,11 +55,11 @@ KEY_RESTART_WHEN = "restart_when" KEY_BOOT_ID = "bootid" -class Error(Exception): - """Generic custom error class.""" +# Global client object +client = None -class NotMasterError(Error): +class NotMasterError(errors.GenericError): """Exception raised when this host is not the master.""" @@ -83,11 +86,12 @@ def DoCmd(cmd): res = utils.RunCmd(cmd) if res.failed: - raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % - (repr(cmd), - Indent(res.fail_reason), - Indent(res.stdout), - Indent(res.stderr))) + msg = ("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % + (repr(cmd), + Indent(res.fail_reason), + Indent(res.stdout), + Indent(res.stderr))) + raise errors.CommandError(msg) return res @@ -99,52 +103,58 @@ class WatcherState(object): def __init__(self): """Open, lock, read and parse the file. - Raises StandardError on lock contention. + Raises exception on lock contention. """ # The two-step dance below is necessary to allow both opening existing # file read/write and creating if not existing. Vanilla open will truncate # an existing file -or- allow creating if not existing. - f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT) - f = os.fdopen(f, 'w+') + fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT) + self.statefile = os.fdopen(fd, 'w+') - try: - fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) - except IOError, x: - if x.errno == errno.EAGAIN: - raise StandardError("State file already locked") - raise - - self.statefile = f + utils.LockFile(self.statefile.fileno()) try: - self.data = simplejson.load(self.statefile) + self._data = serializer.Load(self.statefile.read()) except Exception, msg: # Ignore errors while loading the file and treat it as empty - self.data = {} - sys.stderr.write("Empty or invalid state file. " - "Using defaults. Error message: %s\n" % msg) + self._data = {} + logging.warning(("Empty or invalid state file. Using defaults." + " Error message: %s"), msg) + + if "instance" not in self._data: + self._data["instance"] = {} + if "node" not in self._data: + self._data["node"] = {} - if "instance" not in self.data: - self.data["instance"] = {} - if "node" not in self.data: - self.data["node"] = {} + self._orig_data = serializer.Dump(self._data) - def __del__(self): - """Called on destruction. + def Save(self): + """Save state to file, then unlock and close it. """ - if self.statefile: - self._Close() + assert self.statefile + + serialized_form = serializer.Dump(self._data) + if self._orig_data == serialized_form: + logging.debug("Data didn't change, just touching status file") + os.utime(constants.WATCHER_STATEFILE, None) + return + + # We need to make sure the file is locked before renaming it, otherwise + # starting ganeti-watcher again at the same time will create a conflict. + fd = utils.WriteFile(constants.WATCHER_STATEFILE, + data=serialized_form, + prewrite=utils.LockFile, close=False) + self.statefile = os.fdopen(fd, 'w+') - def _Close(self): + def Close(self): """Unlock configuration file and close it. """ assert self.statefile - fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN) - + # Files are automatically unlocked when closing them self.statefile.close() self.statefile = None @@ -152,7 +162,7 @@ class WatcherState(object): """Returns the last boot ID of a node or None. """ - ndata = self.data["node"] + ndata = self._data["node"] if name in ndata and KEY_BOOT_ID in ndata[name]: return ndata[name][KEY_BOOT_ID] @@ -164,7 +174,7 @@ class WatcherState(object): """ assert bootid - ndata = self.data["node"] + ndata = self._data["node"] if name not in ndata: ndata[name] = {} @@ -178,7 +188,7 @@ class WatcherState(object): instance - the instance to look up. """ - idata = self.data["instance"] + idata = self._data["instance"] if instance.name in idata: return idata[instance.name][KEY_RESTART_COUNT] @@ -192,7 +202,7 @@ class WatcherState(object): instance - the instance being restarted """ - idata = self.data["instance"] + idata = self._data["instance"] if instance.name not in idata: inst = idata[instance.name] = {} @@ -211,24 +221,11 @@ class WatcherState(object): This method removes the record for a named instance. """ - idata = self.data["instance"] + idata = self._data["instance"] if instance.name in idata: del idata[instance.name] - def Save(self): - """Save state to file, then unlock and close it. - - """ - assert self.statefile - - self.statefile.seek(0) - self.statefile.truncate() - - simplejson.dump(self.data, self.statefile) - - self._Close() - class Instance(object): """Abstraction for a Virtual Machine instance. @@ -246,48 +243,40 @@ class Instance(object): """Encapsulates the start of an instance. """ - DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name]) + op = opcodes.OpStartupInstance(instance_name=self.name, + force=False, + extra_args=None) + cli.SubmitOpCode(op, cl=client) def ActivateDisks(self): """Encapsulates the activation of all disks of an instance. """ - DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name]) - - -def _RunListCmd(cmd): - """Runs a command and parses its output into lists. - - """ - for line in DoCmd(cmd).stdout.splitlines(): - yield line.split(':') + op = opcodes.OpActivateInstanceDisks(instance_name=self.name) + cli.SubmitOpCode(op, cl=client) def GetInstanceList(with_secondaries=None): """Get a list of instances on this cluster. """ - cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers', - '--separator=:'] - - fields = 'name,oper_state,admin_state' + fields = ["name", "oper_state", "admin_state"] if with_secondaries is not None: - fields += ',snodes' + fields.append("snodes") - cmd.append('-o') - cmd.append(fields) + result = client.QueryInstances([], fields) instances = [] - for fields in _RunListCmd(cmd): + for fields in result: if with_secondaries is not None: (name, status, autostart, snodes) = fields - if snodes == "-": + if not snodes: continue for node in with_secondaries: - if node in snodes.split(','): + if node in snodes: break else: continue @@ -295,7 +284,7 @@ def GetInstanceList(with_secondaries=None): else: (name, status, autostart) = fields - instances.append(Instance(name, status, autostart != "no")) + instances.append(Instance(name, status, autostart)) return instances @@ -304,28 +293,8 @@ def GetNodeBootIDs(): """Get a dict mapping nodes to boot IDs. """ - cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers', - '--separator=:', '-o', 'name,bootid'] - - ids = {} - for fields in _RunListCmd(cmd): - (name, bootid) = fields - ids[name] = bootid - - return ids - - -class Message(object): - """Encapsulation of a notice or error message. - - """ - def __init__(self, level, msg): - self.level = level - self.msg = msg - self.when = time.time() - - def __str__(self): - return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg) + result = client.QueryNodes([], ["name", "bootid"]) + return dict([(name, bootid) for name, bootid in result]) class Watcher(object): @@ -343,22 +312,25 @@ class Watcher(object): raise NotMasterError("This is not the master node") self.instances = GetInstanceList() self.bootids = GetNodeBootIDs() - self.messages = [] + self.started_instances = set() def Run(self): notepad = WatcherState() - self.CheckInstances(notepad) - self.CheckDisks(notepad) - notepad.Save() + try: + self.CheckInstances(notepad) + self.CheckDisks(notepad) + self.VerifyDisks() + finally: + notepad.Save() def CheckDisks(self, notepad): """Check all nodes for restarted ones. """ check_nodes = [] - for name, id in self.bootids.iteritems(): + for name, new_id in self.bootids.iteritems(): old = notepad.GetNodeBootID(name) - if old != id: + if old != new_id: # Node's boot ID has changed, proably through a reboot. check_nodes.append(name) @@ -366,13 +338,19 @@ class Watcher(object): # Activate disks for all instances with any of the checked nodes as a # secondary node. for instance in GetInstanceList(with_secondaries=check_nodes): + if not instance.autostart: + logging.info(("Skipping disk activation for non-autostart" + " instance %s"), instance.name) + continue + if instance.name in self.started_instances: + # we already tried to start the instance, which should have + # activated its drives (if they can be at all) + continue try: - self.messages.append(Message(NOTICE, - "Activating disks for %s." % - instance.name)) + logging.info("Activating disks for instance %s", instance.name) instance.ActivateDisks() - except Error, x: - self.messages.append(Message(ERROR, str(x))) + except Exception, err: + logging.error(str(err), exc_info=True) # Keep changed boot IDs for name in check_nodes: @@ -397,17 +375,16 @@ class Watcher(object): last = " (Attempt #%d)" % (n + 1) else: notepad.RecordRestartAttempt(instance) - self.messages.append(Message(ERROR, "Could not restart %s for %d" - " times, giving up..." % - (instance.name, MAXTRIES))) + logging.error("Could not restart %s after %d attempts, giving up", + instance.name, MAXTRIES) continue try: - self.messages.append(Message(NOTICE, - "Restarting %s%s." % - (instance.name, last))) + logging.info("Restarting %s%s", + instance.name, last) instance.Restart() - except Error, x: - self.messages.append(Message(ERROR, str(x))) + self.started_instances.add(instance.name) + except Exception, err: + logging.error(str(err), exc_info=True) notepad.RecordRestartAttempt(instance) elif instance.state in HELPLESS_STATES: @@ -416,19 +393,16 @@ class Watcher(object): else: if notepad.NumberOfRestartAttempts(instance): notepad.RemoveInstance(instance) - msg = Message(NOTICE, - "Restart of %s succeeded." % instance.name) - self.messages.append(msg) + logging.info("Restart of %s succeeded", instance.name) - def WriteReport(self, logfile): - """Log all messages to file. - - Args: - logfile: file object open for writing (the log file) + def VerifyDisks(self): + """Run gnt-cluster verify-disks. """ - for msg in self.messages: - print >> logfile, str(msg) + # TODO: What should we do here? + result = DoCmd(['gnt-cluster', 'verify-disks']) + if result.output: + logging.info(result.output) def ParseOptions(): @@ -444,7 +418,7 @@ def ParseOptions(): constants.RELEASE_VERSION) parser.add_option("-d", "--debug", dest="debug", - help="Don't redirect messages to the log file", + help="Write all messages to stderr", default=False, action="store_true") options, args = parser.parse_args() return options, args @@ -454,28 +428,33 @@ def main(): """Main function. """ + global client + options, args = ParseOptions() - if not options.debug: - sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a') + logger.SetupLogging(constants.LOG_WATCHER, debug=options.debug) try: + client = cli.GetClient() + try: watcher = Watcher() except errors.ConfigurationError: # Just exit if there's no configuration sys.exit(constants.EXIT_SUCCESS) + watcher.Run() - watcher.WriteReport(sys.stdout) + except SystemExit: + raise except NotMasterError: - if options.debug: - sys.stderr.write("Not master, exiting.\n") + logging.debug("Not master, exiting") sys.exit(constants.EXIT_NOTMASTER) except errors.ResolverError, err: - sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0]) + logging.error("Cannot resolve hostname '%s', exiting.", err.args[0]) sys.exit(constants.EXIT_NODESETUP_ERROR) - except Error, err: - print err + except Exception, err: + logging.error(str(err), exc_info=True) + sys.exit(constants.EXIT_FAILURE) if __name__ == '__main__':