X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/38242904d0b8dd84613ea8d73d0eae012afb6073..130440e7d0f26fb256b801c1474dccfe6f3cb76c:/daemons/ganeti-watcher diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 7525f7b..8b2b183 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -24,28 +24,32 @@ This program and set of classes implement a watchdog to restart virtual machines in a Ganeti cluster that have crashed or been killed by a node reboot. Run from cron or similar. -""" - -LOGFILE = '/var/log/ganeti/watcher.log' -MAXTRIES = 5 -BAD_STATES = ['stopped'] -HELPLESS_STATES = ['(node down)'] -NOTICE = 'NOTICE' -ERROR = 'ERROR' +""" import os import sys +import re import time import fcntl import errno -import socket +import simplejson from optparse import OptionParser - from ganeti import utils from ganeti import constants from ganeti import ssconf +from ganeti import errors + + +MAXTRIES = 5 +BAD_STATES = ['stopped'] +HELPLESS_STATES = ['(node down)'] +NOTICE = 'NOTICE' +ERROR = 'ERROR' +KEY_RESTART_COUNT = "restart_count" +KEY_RESTART_WHEN = "restart_when" +KEY_BOOT_ID = "bootid" class Error(Exception): @@ -88,25 +92,16 @@ def DoCmd(cmd): return res -class RestarterState(object): +class WatcherState(object): """Interface to a state file recording restart attempts. - Methods: - Open(): open, lock, read and parse the file. - Raises StandardError on lock contention. - - NumberOfAttempts(name): returns the number of times in succession - a restart has been attempted of the named instance. - - RecordAttempt(name, when): records one restart attempt of name at - time in when. - - Remove(name): remove record given by name, if exists. - - Save(name): saves all records to file, releases lock and closes file. - """ def __init__(self): + """Open, lock, read and parse the file. + + Raises StandardError on lock contention. 
+ + """ # The two-step dance below is necessary to allow both opening existing # file read/write and creating if not existing. Vanilla open will truncate # an existing file -or- allow creating if not existing. @@ -117,48 +112,97 @@ class RestarterState(object): fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) except IOError, x: if x.errno == errno.EAGAIN: - raise StandardError('State file already locked') + raise StandardError("State file already locked") raise self.statefile = f - self.inst_map = {} - for line in f: - name, when, count = line.rstrip().split(':') + try: + self.data = simplejson.load(self.statefile) + except Exception, msg: + # Ignore errors while loading the file and treat it as empty + self.data = {} + sys.stderr.write("Empty or invalid state file." + " Using defaults. Error message: %s\n" % msg) + + if "instance" not in self.data: + self.data["instance"] = {} + if "node" not in self.data: + self.data["node"] = {} + + def __del__(self): + """Called on destruction. + + """ + if self.statefile: + self._Close() + + def _Close(self): + """Unlock configuration file and close it. + + """ + assert self.statefile + + fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN) + + self.statefile.close() + self.statefile = None + + def GetNodeBootID(self, name): + """Returns the last boot ID of a node or None. + + """ + ndata = self.data["node"] + + if name in ndata and KEY_BOOT_ID in ndata[name]: + return ndata[name][KEY_BOOT_ID] + return None + + def SetNodeBootID(self, name, bootid): + """Sets the boot ID of a node. + + """ + assert bootid + + ndata = self.data["node"] - when = int(when) - count = int(count) + if name not in ndata: + ndata[name] = {} - self.inst_map[name] = (when, count) + ndata[name][KEY_BOOT_ID] = bootid - def NumberOfAttempts(self, instance): + def NumberOfRestartAttempts(self, instance): """Returns number of previous restart attempts. Args: instance - the instance to look up. 
""" - assert self.statefile + idata = self.data["instance"] - if instance.name in self.inst_map: - return self.inst_map[instance.name][1] + if instance.name in idata: + return idata[instance.name][KEY_RESTART_COUNT] return 0 - def RecordAttempt(self, instance): + def RecordRestartAttempt(self, instance): """Record a restart attempt. Args: instance - the instance being restarted """ - assert self.statefile + idata = self.data["instance"] - when = time.time() + if instance.name not in idata: + inst = idata[instance.name] = {} + else: + inst = idata[instance.name] - self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance)) + inst[KEY_RESTART_WHEN] = time.time() + inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1 - def Remove(self, instance): + def RemoveInstance(self, instance): """Update state to reflect that a machine is running, i.e. remove record. Args: @@ -167,13 +211,13 @@ class RestarterState(object): This method removes the record for a named instance. """ - assert self.statefile + idata = self.data["instance"] - if instance.name in self.inst_map: - del self.inst_map[instance.name] + if instance.name in idata: + del idata[instance.name] def Save(self): - """Save records to file, then unlock and close file. + """Save state to file, then unlock and close it. """ assert self.statefile @@ -181,13 +225,9 @@ class RestarterState(object): self.statefile.seek(0) self.statefile.truncate() - for name in self.inst_map: - print >> self.statefile, "%s:%d:%d" % ((name,) + self.inst_map[name]) - - fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN) + simplejson.dump(self.data, self.statefile) - self.statefile.close() - self.statefile = None + self._Close() class Instance(object): @@ -195,41 +235,84 @@ class Instance(object): Methods: Restart(): issue a command to restart the represented machine. 
+ """ - def __init__(self, name, state): + def __init__(self, name, state, autostart): self.name = name self.state = state + self.autostart = autostart def Restart(self): + """Encapsulates the start of an instance. + + """ DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name]) + def ActivateDisks(self): + """Encapsulates the activation of all disks of an instance. -class InstanceList(object): - """The set of Virtual Machine instances on a cluster. + """ + DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name]) + + +def _RunListCmd(cmd): + """Runs a command and parses its output into lists. """ - cmd = ['gnt-instance', 'list', '--lock-retries=15', - '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:'] + for line in DoCmd(cmd).stdout.splitlines(): + yield line.split(':') - def __init__(self): - res = DoCmd(self.cmd) - lines = res.stdout.splitlines() +def GetInstanceList(with_secondaries=None): + """Get a list of instances on this cluster. + + """ + cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers', + '--separator=:'] + + fields = 'name,oper_state,admin_state' - self.instances = [] - for line in lines: - fields = [fld.strip() for fld in line.split(':')] + if with_secondaries is not None: + fields += ',snodes' - if len(fields) != 3: + cmd.append('-o') + cmd.append(fields) + + instances = [] + for fields in _RunListCmd(cmd): + if with_secondaries is not None: + (name, status, autostart, snodes) = fields + + if snodes == "-": continue - if fields[1] == "no": #no autostart, we don't care about this instance + + for node in with_secondaries: + if node in snodes.split(','): + break + else: continue - name, status = fields[0], fields[2] - self.instances.append(Instance(name, status)) + else: + (name, status, autostart) = fields + + instances.append(Instance(name, status, autostart != "no")) - def __iter__(self): - return self.instances.__iter__() + return instances + + +def GetNodeBootIDs(): + """Get a dict 
mapping nodes to boot IDs. + + """ + cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers', + '--separator=:', '-o', 'name,bootid'] + + ids = {} + for fields in _RunListCmd(cmd): + (name, bootid) = fields + ids[name] = bootid + + return ids class Message(object): @@ -245,7 +328,7 @@ class Message(object): return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg) -class Restarter(object): +class Watcher(object): """Encapsulate the logic for restarting erronously halted virtual machines. The calling program should periodically instantiate me and call Run(). @@ -256,20 +339,56 @@ class Restarter(object): def __init__(self): sstore = ssconf.SimpleStore() master = sstore.GetMasterNode() - if master != socket.gethostname(): - raise NotMasterError, ("This is not the master node") - self.instances = InstanceList() + if master != utils.HostInfo().name: + raise NotMasterError("This is not the master node") + self.instances = GetInstanceList() + self.bootids = GetNodeBootIDs() self.messages = [] def Run(self): - """Make a pass over the list of instances, restarting downed ones. + notepad = WatcherState() + self.CheckInstances(notepad) + self.CheckDisks(notepad) + self.VerifyDisks() + notepad.Save() + + def CheckDisks(self, notepad): + """Check all nodes for restarted ones. """ - notepad = RestarterState() + check_nodes = [] + for name, id in self.bootids.iteritems(): + old = notepad.GetNodeBootID(name) + if old != id: + # Node's boot ID has changed, probably through a reboot. + check_nodes.append(name) + + if check_nodes: + # Activate disks for all instances with any of the checked nodes as a + # secondary node. + for instance in GetInstanceList(with_secondaries=check_nodes): + try: + self.messages.append(Message(NOTICE, ("Activating disks for %s." 
% + instance.name))) + instance.ActivateDisks() + except Error, x: + self.messages.append(Message(ERROR, str(x))) + # Keep changed boot IDs + for name in check_nodes: + notepad.SetNodeBootID(name, self.bootids[name]) + + def CheckInstances(self, notepad): + """Make a pass over the list of instances, restarting downed ones. + + """ for instance in self.instances: + # Don't care about manually stopped instances + if not instance.autostart: + continue + if instance.state in BAD_STATES: - n = notepad.NumberOfAttempts(instance) + n = notepad.NumberOfRestartAttempts(instance) if n > MAXTRIES: # stay quiet. @@ -277,31 +396,35 @@ class Restarter(object): elif n < MAXTRIES: last = " (Attempt #%d)" % (n + 1) else: - notepad.RecordAttempt(instance) + notepad.RecordRestartAttempt(instance) self.messages.append(Message(ERROR, "Could not restart %s for %d" " times, giving up..." % (instance.name, MAXTRIES))) continue try: - self.messages.append(Message(NOTICE, - "Restarting %s%s." % - (instance.name, last))) + self.messages.append(Message(NOTICE, ("Restarting %s%s." % + (instance.name, last)))) instance.Restart() except Error, x: self.messages.append(Message(ERROR, str(x))) - notepad.RecordAttempt(instance) + notepad.RecordRestartAttempt(instance) elif instance.state in HELPLESS_STATES: - if notepad.NumberOfAttempts(instance): - notepad.Remove(instance) + if notepad.NumberOfRestartAttempts(instance): + notepad.RemoveInstance(instance) else: - if notepad.NumberOfAttempts(instance): - notepad.Remove(instance) - msg = Message(NOTICE, - "Restart of %s succeeded." % instance.name) + if notepad.NumberOfRestartAttempts(instance): + notepad.RemoveInstance(instance) + msg = Message(NOTICE, "Restart of %s succeeded." % instance.name) self.messages.append(msg) - notepad.Save() + def VerifyDisks(self): + """Run gnt-cluster verify-disks. 
+ + """ + result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15']) + if result.output: + self.messages.append(Message(NOTICE, result.output)) def WriteReport(self, logfile): """Log all messages to file. @@ -340,18 +463,26 @@ def main(): options, args = ParseOptions() if not options.debug: - sys.stderr = sys.stdout = open(LOGFILE, 'a') + sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a') try: - restarter = Restarter() - restarter.Run() - restarter.WriteReport(sys.stdout) + try: + watcher = Watcher() + except errors.ConfigurationError: + # Just exit if there's no configuration + sys.exit(constants.EXIT_SUCCESS) + watcher.Run() + watcher.WriteReport(sys.stdout) except NotMasterError: if options.debug: sys.stderr.write("Not master, exiting.\n") sys.exit(constants.EXIT_NOTMASTER) + except errors.ResolverError, err: + sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0]) + sys.exit(constants.EXIT_NODESETUP_ERROR) except Error, err: print err + if __name__ == '__main__': main()