From: Iustin Pop Date: Mon, 9 Mar 2009 15:12:24 +0000 (+0000) Subject: watcher: fix startup sequence locking the master X-Git-Tag: v2.0.0rc2~8 X-Git-Url: https://code.grnet.gr/git/ganeti-local/commitdiff_plain/cc962d581164937f90391dc2fcc7a965e82e6aa4 watcher: fix startup sequence locking the master Currently, the watcher startup sequence does: - open a luxi client - get the instance list - get the node boot ids - open and lock the status file, and: - archive jobs - restart the down instances - check disks This, of course, can lead to problems when a node is (genuinely or not) locked for longer than (watcher interval * maximum query clients). At that point, the master is completely unresponsive until the node is unlocked and all the watchers exit with an error due to the state file being locked by the first instance. This patch reworks the startup sequence to first open/lock the status file, and only then open a luxi client. This should prevent the above case. Reviewed-by: ultrotter --- diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 12bafce..2c5948b 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -274,7 +274,8 @@ class Watcher(object): to restart machines that are down. """ - def __init__(self, opts): + def __init__(self, opts, notepad): + self.notepad = notepad master = client.QueryConfigValues(["master_node"])[0] if master != utils.HostInfo().name: raise NotMasterError("This is not the master node") @@ -284,14 +285,14 @@ class Watcher(object): self.opts = opts def Run(self): - notepad = WatcherState() - try: - self.ArchiveJobs(self.opts.job_age) - self.CheckInstances(notepad) - self.CheckDisks(notepad) - self.VerifyDisks() - finally: - notepad.Save() + """Watcher run sequence. + + """ + notepad = self.notepad + self.ArchiveJobs(self.opts.job_age) + self.CheckInstances(notepad) + self.CheckDisks(notepad) + self.VerifyDisks() def ArchiveJobs(self, age): """Archive old jobs. 
@@ -435,15 +436,19 @@ def main(): stderr_logging=options.debug) try: - client = cli.GetClient() - + notepad = WatcherState() try: - watcher = Watcher(options) - except errors.ConfigurationError: - # Just exit if there's no configuration - sys.exit(constants.EXIT_SUCCESS) + client = cli.GetClient() + + try: + watcher = Watcher(options, notepad) + except errors.ConfigurationError: + # Just exit if there's no configuration + sys.exit(constants.EXIT_SUCCESS) - watcher.Run() + watcher.Run() + finally: + notepad.Save() except SystemExit: raise except NotMasterError: