#!/usr/bin/python # # Copyright (C) 2006, 2007 Google Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. """Tool to restart erronously downed virtual machines. This program and set of classes implement a watchdog to restart virtual machines in a Ganeti cluster that have crashed or been killed by a node reboot. Run from cron or similar. """ import os import sys import re import time import fcntl import errno import simplejson from optparse import OptionParser from ganeti import utils from ganeti import constants from ganeti import ssconf from ganeti import errors MAXTRIES = 5 BAD_STATES = ['stopped'] HELPLESS_STATES = ['(node down)'] NOTICE = 'NOTICE' ERROR = 'ERROR' KEY_RESTART_COUNT = "restart_count" KEY_RESTART_WHEN = "restart_when" KEY_BOOT_ID = "bootid" class Error(Exception): """Generic custom error class.""" class NotMasterError(Error): """Exception raised when this host is not the master.""" def Indent(s, prefix='| '): """Indent a piece of text with a given prefix before each line. Args: s: The string to indent prefix: The string to prepend each line. """ return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines())) def DoCmd(cmd): """Run a shell command. Args: cmd: the command to run. Raises CommandError with verbose commentary on error. """ res = utils.RunCmd(cmd) if res.failed: raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % (repr(cmd), Indent(res.fail_reason), Indent(res.stdout), Indent(res.stderr))) return res class WatcherState(object): """Interface to a state file recording restart attempts. """ def __init__(self): """Open, lock, read and parse the file. Raises StandardError on lock contention. """ # The two-step dance below is necessary to allow both opening existing # file read/write and creating if not existing. Vanilla open will truncate # an existing file -or- allow creating if not existing. f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT) f = os.fdopen(f, 'w+') try: fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) except IOError, x: if x.errno == errno.EAGAIN: raise StandardError("State file already locked") raise self.statefile = f try: self.data = simplejson.load(self.statefile) except Exception, msg: # Ignore errors while loading the file and treat it as empty self.data = {} sys.stderr.write("Empty or invalid state file." " Using defaults. Error message: %s\n" % msg) if "instance" not in self.data: self.data["instance"] = {} if "node" not in self.data: self.data["node"] = {} def __del__(self): """Called on destruction. """ if self.statefile: self._Close() def _Close(self): """Unlock configuration file and close it. """ assert self.statefile fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN) self.statefile.close() self.statefile = None def GetNodeBootID(self, name): """Returns the last boot ID of a node or None. """ ndata = self.data["node"] if name in ndata and KEY_BOOT_ID in ndata[name]: return ndata[name][KEY_BOOT_ID] return None def SetNodeBootID(self, name, bootid): """Sets the boot ID of a node. """ assert bootid ndata = self.data["node"] if name not in ndata: ndata[name] = {} ndata[name][KEY_BOOT_ID] = bootid def NumberOfRestartAttempts(self, instance): """Returns number of previous restart attempts. Args: instance - the instance to look up. """ idata = self.data["instance"] if instance.name in idata: return idata[instance.name][KEY_RESTART_COUNT] return 0 def RecordRestartAttempt(self, instance): """Record a restart attempt. Args: instance - the instance being restarted """ idata = self.data["instance"] if instance.name not in idata: inst = idata[instance.name] = {} else: inst = idata[instance.name] inst[KEY_RESTART_WHEN] = time.time() inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1 def RemoveInstance(self, instance): """Update state to reflect that a machine is running, i.e. remove record. Args: instance - the instance to remove from books This method removes the record for a named instance. """ idata = self.data["instance"] if instance.name in idata: del idata[instance.name] def Save(self): """Save state to file, then unlock and close it. """ assert self.statefile self.statefile.seek(0) self.statefile.truncate() simplejson.dump(self.data, self.statefile) self._Close() class Instance(object): """Abstraction for a Virtual Machine instance. Methods: Restart(): issue a command to restart the represented machine. """ def __init__(self, name, state, autostart): self.name = name self.state = state self.autostart = autostart def Restart(self): """Encapsulates the start of an instance. """ DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name]) def ActivateDisks(self): """Encapsulates the activation of all disks of an instance. """ DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name]) def _RunListCmd(cmd): """Runs a command and parses its output into lists. """ for line in DoCmd(cmd).stdout.splitlines(): yield line.split(':') def GetInstanceList(with_secondaries=None): """Get a list of instances on this cluster. """ cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers', '--separator=:'] fields = 'name,oper_state,admin_state' if with_secondaries is not None: fields += ',snodes' cmd.append('-o') cmd.append(fields) instances = [] for fields in _RunListCmd(cmd): if with_secondaries is not None: (name, status, autostart, snodes) = fields if snodes == "-": continue for node in with_secondaries: if node in snodes.split(','): break else: continue else: (name, status, autostart) = fields instances.append(Instance(name, status, autostart != "no")) return instances def GetNodeBootIDs(): """Get a dict mapping nodes to boot IDs. """ cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers', '--separator=:', '-o', 'name,bootid'] ids = {} for fields in _RunListCmd(cmd): (name, bootid) = fields ids[name] = bootid return ids class Message(object): """Encapsulation of a notice or error message. """ def __init__(self, level, msg): self.level = level self.msg = msg self.when = time.time() def __str__(self): return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg) class Watcher(object): """Encapsulate the logic for restarting erronously halted virtual machines. The calling program should periodically instantiate me and call Run(). This will traverse the list of instances, and make up to MAXTRIES attempts to restart machines that are down. """ def __init__(self): sstore = ssconf.SimpleStore() master = sstore.GetMasterNode() if master != utils.HostInfo().name: raise NotMasterError("This is not the master node") self.instances = GetInstanceList() self.bootids = GetNodeBootIDs() self.messages = [] def Run(self): notepad = WatcherState() self.CheckInstances(notepad) self.CheckDisks(notepad) self.VerifyDisks() notepad.Save() def CheckDisks(self, notepad): """Check all nodes for restarted ones. """ check_nodes = [] for name, id in self.bootids.iteritems(): old = notepad.GetNodeBootID(name) if old != id: # Node's boot ID has changed, proably through a reboot. check_nodes.append(name) if check_nodes: # Activate disks for all instances with any of the checked nodes as a # secondary node. for instance in GetInstanceList(with_secondaries=check_nodes): try: self.messages.append(Message(NOTICE, ("Activating disks for %s." % instance.name))) instance.ActivateDisks() except Error, x: self.messages.append(Message(ERROR, str(x))) # Keep changed boot IDs for name in check_nodes: notepad.SetNodeBootID(name, self.bootids[name]) def CheckInstances(self, notepad): """Make a pass over the list of instances, restarting downed ones. """ for instance in self.instances: # Don't care about manually stopped instances if not instance.autostart: continue if instance.state in BAD_STATES: n = notepad.NumberOfRestartAttempts(instance) if n > MAXTRIES: # stay quiet. continue elif n < MAXTRIES: last = " (Attempt #%d)" % (n + 1) else: notepad.RecordRestartAttempt(instance) self.messages.append(Message(ERROR, "Could not restart %s for %d" " times, giving up..." % (instance.name, MAXTRIES))) continue try: self.messages.append(Message(NOTICE, ("Restarting %s%s." % (instance.name, last)))) instance.Restart() except Error, x: self.messages.append(Message(ERROR, str(x))) notepad.RecordRestartAttempt(instance) elif instance.state in HELPLESS_STATES: if notepad.NumberOfRestartAttempts(instance): notepad.RemoveInstance(instance) else: if notepad.NumberOfRestartAttempts(instance): notepad.RemoveInstance(instance) msg = Message(NOTICE, "Restart of %s succeeded." % instance.name) self.messages.append(msg) def VerifyDisks(self): """Run gnt-cluster verify-disks. """ result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15']) if result.output: self.messages.append(Message(NOTICE, result.output)) def WriteReport(self, logfile): """Log all messages to file. Args: logfile: file object open for writing (the log file) """ for msg in self.messages: print >> logfile, str(msg) def ParseOptions(): """Parse the command line options. Returns: (options, args) as from OptionParser.parse_args() """ parser = OptionParser(description="Ganeti cluster watcher", usage="%prog [-d]", version="%%prog (ganeti) %s" % constants.RELEASE_VERSION) parser.add_option("-d", "--debug", dest="debug", help="Don't redirect messages to the log file", default=False, action="store_true") options, args = parser.parse_args() return options, args def main(): """Main function. """ options, args = ParseOptions() if not options.debug: sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a') try: try: watcher = Watcher() except errors.ConfigurationError: # Just exit if there's no configuration sys.exit(constants.EXIT_SUCCESS) watcher.Run() watcher.WriteReport(sys.stdout) except NotMasterError: if options.debug: sys.stderr.write("Not master, exiting.\n") sys.exit(constants.EXIT_NOTMASTER) except errors.ResolverError, err: sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0]) sys.exit(constants.EXIT_NODESETUP_ERROR) except Error, err: print err if __name__ == '__main__': main()