4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
"""Tool to restart erroneously downed virtual machines.
24 This program and set of classes implement a watchdog to restart
25 virtual machines in a Ganeti cluster that have crashed or been killed
26 by a node reboot. Run from cron or similar.
37 from optparse import OptionParser
39 from ganeti import utils
40 from ganeti import constants
41 from ganeti import ssconf
42 from ganeti import errors
# Instance states for which the watcher attempts a restart.
BAD_STATES = ["stopped"]
# Instance states for which no restart is attempted; any restart record kept
# for such an instance is simply dropped.
HELPLESS_STATES = ["(node down)"]

# Keys of the per-instance and per-node records kept in the state data.
KEY_RESTART_COUNT = 'restart_count'
KEY_RESTART_WHEN = 'restart_when'
KEY_BOOT_ID = 'bootid'
class Error(Exception):
    """Base class for the errors this program raises itself."""
class NotMasterError(Error):
    """Raised when the program runs on a host that is not the master."""
def Indent(s, prefix='| '):
    """Put a prefix in front of every line of a piece of text.

    Args:
      s: The text to indent.
      prefix: The string placed at the start of each line.

    Returns:
      The prefixed text, terminated by exactly one newline.  An empty
      input yields the prefix followed by a newline.

    """
    line_break = '\n' + prefix
    prefixed_rest = line_break.join(s.splitlines())
    return prefix + prefixed_rest + '\n'
def DoCmd(cmd):
    """Run a command and insist that it succeeds.

    Args:
      cmd: the command to run, passed unchanged to utils.RunCmd.

    Returns:
      The result object returned by utils.RunCmd.

    Raises:
      Error: if the command failed.  The message contains the command,
        the failure reason and the captured stdout and stderr.

    """
    res = utils.RunCmd(cmd)

    # NOTE(review): assumes the RunCmd result exposes a boolean "failed"
    # attribute next to fail_reason/stdout/stderr -- confirm against
    # ganeti.utils.
    if res.failed:
        # Indent() terminates each section with a newline, which is why the
        # format string has no "\n" between the stdout text and "stderr:".
        raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                    (repr(cmd),
                     Indent(res.fail_reason),
                     Indent(res.stdout),
                     Indent(res.stderr)))

    return res
class WatcherState(object):
    """Interface to a state file recording restart attempts.

    The state is a JSON object with two sections: "instance" (restart
    bookkeeping, keyed by instance name) and "node" (last recorded boot ID,
    keyed by node name).  The file is locked exclusively when it is opened
    and unlocked again when it is closed.

    """
    def __init__(self):
        """Open, lock, read and parse the file.

        Raises StandardError on lock contention.

        """
        # Assigned first so that __del__ stays harmless if anything below
        # raises before the file has been opened and locked.
        self.statefile = None

        # The two-step dance below is necessary to allow both opening
        # existing file read/write and creating if not existing.  Vanilla
        # open will truncate an existing file -or- allow creating if not
        # existing.
        fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
        f = os.fdopen(fd, 'w+')

        try:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            # sys.exc_info() is used instead of a version-specific
            # "except ..., name" / "except ... as name" clause.
            err = sys.exc_info()[1]
            # Do not leak the descriptor when the lock cannot be taken.
            f.close()
            if err.errno == errno.EAGAIN:
                raise StandardError("State file already locked")
            raise

        self.statefile = f

        try:
            loaded = simplejson.load(self.statefile)
            if not isinstance(loaded, dict):
                # Valid JSON of the wrong shape (list, null, number, ...) is
                # just as unusable as unparsable content.
                raise ValueError("top-level JSON value is not an object")
        except Exception:
            # Ignore errors while loading the file and treat it as empty
            loaded = {}
            sys.stderr.write("Empty or invalid state file."
                             " Using defaults. Error message: %s\n" %
                             (sys.exc_info()[1],))

        self.data = self._NormalizeData(loaded)

    @staticmethod
    def _NormalizeData(data):
        """Return state data that is guaranteed to have both sections.

        Args:
          data: the value loaded from the state file.

        Returns:
          A dict whose "instance" and "node" entries are dicts.  A
          non-dict value is replaced by an empty state; a section that is
          missing or not a dict is reset to an empty dict.

        """
        if not isinstance(data, dict):
            data = {}
        for section in ("instance", "node"):
            if not isinstance(data.get(section), dict):
                data[section] = {}
        return data

    def __del__(self):
        """Called on destruction.

        Unlocks and closes the state file if it is still open.

        """
        # getattr: the attribute does not exist on an instance whose
        # __init__ never ran.
        if getattr(self, "statefile", None):
            self._Close()

    def _Close(self):
        """Unlock configuration file and close it.

        """
        assert self.statefile

        statefile, self.statefile = self.statefile, None
        try:
            fcntl.flock(statefile.fileno(), fcntl.LOCK_UN)
        finally:
            # Close even if unlocking failed.
            statefile.close()

    def GetNodeBootID(self, name):
        """Returns the last boot ID of a node or None.

        Args:
          name - the node name to look up.

        """
        return self.data["node"].get(name, {}).get(KEY_BOOT_ID)

    def SetNodeBootID(self, name, bootid):
        """Sets the boot ID of a node.

        Args:
          name - the node name
          bootid - the boot ID to remember for the node

        """
        # NOTE(review): guard against recording an empty boot ID -- confirm
        # this matches the intended contract of the callers.
        assert bootid

        self.data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

    def NumberOfRestartAttempts(self, instance):
        """Returns number of previous restart attempts.

        Args:
          instance - the instance to look up.

        Returns 0 when nothing is recorded for the instance, including a
        record that carries no counter.

        """
        record = self.data["instance"].get(instance.name)
        if record is None:
            return 0
        return record.get(KEY_RESTART_COUNT, 0)

    def RecordRestartAttempt(self, instance):
        """Record a restart attempt.

        Args:
          instance - the instance being restarted

        Stores the current time and increments the attempt counter.

        """
        record = self.data["instance"].setdefault(instance.name, {})

        record[KEY_RESTART_WHEN] = time.time()
        record[KEY_RESTART_COUNT] = record.get(KEY_RESTART_COUNT, 0) + 1

    def RemoveInstance(self, instance):
        """Update state to reflect that a machine is running, i.e. remove record.

        Args:
          instance - the instance to remove from books

        This method removes the record for a named instance; it does
        nothing if there is no such record.

        """
        self.data["instance"].pop(instance.name, None)

    def Save(self):
        """Save state to file, then unlock and close it.

        """
        assert self.statefile

        # Rewrite the file from the start, discarding the previous content.
        self.statefile.seek(0)
        self.statefile.truncate()

        simplejson.dump(self.data, self.statefile)

        self._Close()
class Instance(object):
    """Abstraction for a Virtual Machine instance.

    Attributes:
      name: the instance name
      state: the state stored for the instance at construction time
      autostart: whether the watcher should care about this instance

    Methods:
      Restart(): issue a command to restart the represented machine.
      ActivateDisks(): issue a command to activate the machine's disks.

    """
    def __init__(self, name, state, autostart):
        self.name, self.state, self.autostart = name, state, autostart

    def Restart(self):
        """Encapsulates the start of an instance.

        """
        command = ['gnt-instance', 'startup', '--lock-retries=15']
        command.append(self.name)
        DoCmd(command)

    def ActivateDisks(self):
        """Encapsulates the activation of all disks of an instance.

        """
        command = ['gnt-instance', 'activate-disks', '--lock-retries=15']
        command.append(self.name)
        DoCmd(command)
def _RunListCmd(cmd):
    """Runs a command and parses its output into lists.

    This is a generator: the command is only run once iteration starts.
    Each line of the command's stdout is yielded as the list of its
    colon-separated fields.

    """
    output = DoCmd(cmd).stdout
    for row in output.splitlines():
        yield row.split(':')
def GetInstanceList(with_secondaries=None):
    """Get a list of instances on this cluster.

    Args:
      with_secondaries: None to list every instance, otherwise a collection
        of node names; in that case only instances with at least one of
        these nodes among their secondary nodes are returned.

    Returns:
      A list of Instance objects.

    """
    filter_by_secondary = with_secondaries is not None

    columns = 'name,oper_state,admin_state'
    if filter_by_secondary:
        # NOTE(review): "snodes" is assumed to be the output field holding
        # the comma-separated secondary nodes -- confirm against
        # gnt-instance list.
        columns += ',snodes'

    cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
           '--separator=:', '-o', columns]

    instances = []
    for row in _RunListCmd(cmd):
        if filter_by_secondary:
            (name, status, autostart, snodes) = row
            # Skip instances that have none of the wanted nodes as secondary.
            if not set(snodes.split(',')).intersection(with_secondaries):
                continue
        else:
            (name, status, autostart) = row

        # The third column is the admin state; anything but "no" counts as
        # "should be running".
        instances.append(Instance(name, status, autostart != "no"))

    return instances
def GetNodeBootIDs():
    """Get a dict mapping nodes to boot IDs.

    Returns:
      A dict with node names as keys and the boot ID printed for each node
      by "gnt-node list" as values.

    """
    cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
           '--separator=:', '-o', 'name,bootid']

    # Unpacking into exactly two names keeps the failure on malformed rows.
    return dict([(name, bootid) for (name, bootid) in _RunListCmd(cmd)])
class Message(object):
    """Encapsulation of a notice or error message.

    Attributes:
      level: the severity label
      msg: the message text
      when: creation time, as returned by time.time()

    """
    def __init__(self, level, msg):
        self.level = level
        self.msg = msg
        self.when = time.time()

    def __str__(self):
        """Render as a "<level> <timestamp>" line followed by the indented text.

        """
        header = ' '.join([self.level, time.ctime(self.when)])
        return '\n'.join([header, Indent(self.msg)])
class Watcher(object):
    """Encapsulate the logic for restarting erroneously halted virtual machines.

    The calling program should periodically instantiate me and call Run().
    This will traverse the list of instances, and make up to MAXTRIES attempts
    to restart machines that are down.

    """
    def __init__(self):
        """Check that this host is the master and collect cluster data.

        Raises NotMasterError when the local host name differs from the
        master node name.

        """
        master = ssconf.SimpleStore().GetMasterNode()
        if master != utils.HostInfo().name:
            raise NotMasterError("This is not the master node")
        self.instances = GetInstanceList()
        self.bootids = GetNodeBootIDs()
        self.messages = []

    def Run(self):
        """Do one watcher pass and save the resulting state.

        """
        notepad = WatcherState()
        self.CheckInstances(notepad)
        self.CheckDisks(notepad)
        self.VerifyDisks()
        notepad.Save()

    def CheckDisks(self, notepad):
        """Check all nodes for restarted ones.

        Disks are activated for every instance that has a node with a
        changed boot ID as secondary; the new boot IDs are then recorded.

        """
        # A boot ID differing from the recorded one means the node has
        # probably been rebooted.
        check_nodes = [name for (name, bootid) in self.bootids.items()
                       if notepad.GetNodeBootID(name) != bootid]

        if not check_nodes:
            return

        # Activate disks for all instances with any of the checked nodes as
        # a secondary node.
        for instance in GetInstanceList(with_secondaries=check_nodes):
            try:
                self.messages.append(Message(NOTICE,
                                             "Activating disks for %s." %
                                             instance.name))
                instance.ActivateDisks()
            except Error:
                # sys.exc_info() avoids version-specific except syntax.
                self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

        # Keep changed boot IDs
        for name in check_nodes:
            notepad.SetNodeBootID(name, self.bootids[name])

    def CheckInstances(self, notepad):
        """Make a pass over the list of instances, restarting downed ones.

        """
        for instance in self.instances:
            # Don't care about manually stopped instances
            if not instance.autostart:
                continue

            if instance.state in BAD_STATES:
                self._RestartInstance(instance, notepad)
            elif instance.state in HELPLESS_STATES:
                # Nothing can be done right now; just forget earlier attempts.
                if notepad.NumberOfRestartAttempts(instance):
                    notepad.RemoveInstance(instance)
            elif notepad.NumberOfRestartAttempts(instance):
                # Any other state after recorded attempts means a restart
                # worked.
                notepad.RemoveInstance(instance)
                msg = Message(NOTICE,
                              "Restart of %s succeeded." % instance.name)
                self.messages.append(msg)

    def _RestartInstance(self, instance, notepad):
        """Try to restart one downed instance, honouring the MAXTRIES limit.

        """
        attempts = notepad.NumberOfRestartAttempts(instance)

        if attempts > MAXTRIES:
            # Giving up has been reported already: stay quiet.
            return

        if attempts == MAXTRIES:
            # Record once more so that later runs take the quiet path above.
            notepad.RecordRestartAttempt(instance)
            self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                         " times, giving up..." %
                                         (instance.name, MAXTRIES)))
            return

        last = " (Attempt #%d)" % (attempts + 1)
        try:
            self.messages.append(Message(NOTICE, ("Restarting %s%s." %
                                                  (instance.name, last))))
            instance.Restart()
        except Error:
            self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

        # Failed attempts count as attempts, too.
        notepad.RecordRestartAttempt(instance)

    def VerifyDisks(self):
        """Run gnt-cluster verify-disks.

        Any output of the command is kept as a NOTICE message.

        """
        result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
        if result.output:
            self.messages.append(Message(NOTICE, result.output))

    def WriteReport(self, logfile):
        """Log all messages to file.

        Args:
          logfile: file object open for writing (the log file)

        """
        for msg in self.messages:
            logfile.write("%s\n" % str(msg))
def ParseOptions():
    """Parse the command line options.

    Returns:
      (options, args) as from OptionParser.parse_args()

    """
    version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION

    # NOTE(review): usage string assumed to advertise the single -d switch;
    # confirm the wording.
    parser = OptionParser(description="Ganeti cluster watcher",
                          usage="%prog [-d]",
                          version=version_text)

    parser.add_option("-d", "--debug", dest="debug",
                      help="Don't redirect messages to the log file",
                      default=False, action="store_true")

    return parser.parse_args()
def main():
    """Main function.

    Unless --debug is given, stdout and stderr are redirected to the
    watcher log file before one watcher pass is run and its report
    written.

    """
    options, _ = ParseOptions()

    if not options.debug:
        sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')

    try:
        try:
            watcher = Watcher()
        except errors.ConfigurationError:
            # Just exit if there's no configuration
            sys.exit(constants.EXIT_SUCCESS)

        watcher.Run()
        watcher.WriteReport(sys.stdout)
    except NotMasterError:
        # NOTE(review): the message is assumed to be wanted in debug mode
        # only, keeping it out of the log file -- confirm.
        if options.debug:
            sys.stderr.write("Not master, exiting.\n")
        sys.exit(constants.EXIT_NOTMASTER)
    except errors.ResolverError:
        # sys.exc_info() avoids version-specific except syntax.
        err = sys.exc_info()[1]
        sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" %
                         err.args[0])
        sys.exit(constants.EXIT_NODESETUP_ERROR)
    except Error:
        # NOTE(review): last-resort handler for the program's own errors;
        # it reports the error and falls through to a normal exit -- confirm
        # that this is the intended exit status.
        sys.stdout.write("%s\n" % (sys.exc_info()[1],))


if __name__ == '__main__':
    main()