X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/5a3103e9ef4564d8e8cd9f9d60aac799ccf3c08f..0d0c1b4a16ae3edaebe231b67e657d148c8c3c8b:/daemons/ganeti-watcher diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index c912592..8b2b183 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -47,6 +47,9 @@ BAD_STATES = ['stopped'] HELPLESS_STATES = ['(node down)'] NOTICE = 'NOTICE' ERROR = 'ERROR' +KEY_RESTART_COUNT = "restart_count" +KEY_RESTART_WHEN = "restart_when" +KEY_BOOT_ID = "bootid" class Error(Exception): @@ -119,8 +122,8 @@ class WatcherState(object): except Exception, msg: # Ignore errors while loading the file and treat it as empty self.data = {} - sys.stderr.write("Empty or invalid state file. " - "Using defaults. Error message: %s\n" % msg) + sys.stderr.write("Empty or invalid state file." + " Using defaults. Error message: %s\n" % msg) if "instance" not in self.data: self.data["instance"] = {} @@ -151,8 +154,8 @@ class WatcherState(object): """ ndata = self.data["node"] - if name in ndata and "bootid" in ndata[name]: - return ndata[name]["bootid"] + if name in ndata and KEY_BOOT_ID in ndata[name]: + return ndata[name][KEY_BOOT_ID] return None def SetNodeBootID(self, name, bootid): @@ -166,7 +169,7 @@ class WatcherState(object): if name not in ndata: ndata[name] = {} - ndata[name]["bootid"] = bootid + ndata[name][KEY_BOOT_ID] = bootid def NumberOfRestartAttempts(self, instance): """Returns number of previous restart attempts. @@ -178,7 +181,7 @@ class WatcherState(object): idata = self.data["instance"] if instance.name in idata: - return idata[instance.name]["restart_count"] + return idata[instance.name][KEY_RESTART_COUNT] return 0 @@ -196,8 +199,8 @@ class WatcherState(object): else: inst = idata[instance.name] - inst["restart_when"] = time.time() - inst["restart_count"] = idata.get("restart_count", 0) + 1 + inst[KEY_RESTART_WHEN] = time.time() + inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1 def RemoveInstance(self, instance): """Update state to reflect that a machine is running, i.e. remove record. @@ -346,6 +349,7 @@ class Watcher(object): notepad = WatcherState() self.CheckInstances(notepad) self.CheckDisks(notepad) + self.VerifyDisks() notepad.Save() def CheckDisks(self, notepad): @@ -364,9 +368,8 @@ class Watcher(object): # secondary node. for instance in GetInstanceList(with_secondaries=check_nodes): try: - self.messages.append(Message(NOTICE, - "Activating disks for %s." % - instance.name)) + self.messages.append(Message(NOTICE, ("Activating disks for %s." % + instance.name))) instance.ActivateDisks() except Error, x: self.messages.append(Message(ERROR, str(x))) @@ -399,9 +402,8 @@ class Watcher(object): (instance.name, MAXTRIES))) continue try: - self.messages.append(Message(NOTICE, - "Restarting %s%s." % - (instance.name, last))) + self.messages.append(Message(NOTICE, ("Restarting %s%s." % + (instance.name, last)))) instance.Restart() except Error, x: self.messages.append(Message(ERROR, str(x))) @@ -413,10 +415,17 @@ class Watcher(object): else: if notepad.NumberOfRestartAttempts(instance): notepad.RemoveInstance(instance) - msg = Message(NOTICE, - "Restart of %s succeeded." % instance.name) + msg = Message(NOTICE, "Restart of %s succeeded." % instance.name) self.messages.append(msg) + def VerifyDisks(self): + """Run gnt-cluster verify-disks. + + """ + result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15']) + if result.output: + self.messages.append(Message(NOTICE, result.output)) + def WriteReport(self, logfile): """Log all messages to file. @@ -457,7 +466,11 @@ def main(): sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a') try: - watcher = Watcher() + try: + watcher = Watcher() + except errors.ConfigurationError: + # Just exit if there's no configuration + sys.exit(constants.EXIT_SUCCESS) watcher.Run() watcher.WriteReport(sys.stdout) except NotMasterError: