4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Tool to restart erroneously downed virtual machines.
24 This program and set of classes implement a watchdog to restart
25 virtual machines in a Ganeti cluster that have crashed or been killed
26 by a node reboot. Run from cron or similar.
38 from optparse import OptionParser
40 from ganeti import utils
41 from ganeti import constants
42 from ganeti import ssconf
43 from ganeti import errors
47 BAD_STATES = ['stopped']
48 HELPLESS_STATES = ['(node down)']
51 KEY_RESTART_COUNT = "restart_count"
52 KEY_RESTART_WHEN = "restart_when"
53 KEY_BOOT_ID = "bootid"
class Error(Exception):
    """Base class for the exceptions defined by this script."""
class NotMasterError(Error):
    """Signals that the script is running on a host other than the master."""
def Indent(s, prefix='| '):
    """Prefix every line of a piece of text.

    Args:
      s: The text to indent; it is split with str.splitlines().
      prefix: The string put in front of each line.

    Returns:
      The prefixed lines joined by newlines, always terminated by a single
      newline. Empty input yields just the prefix followed by a newline.

    """
    # Joining on "\n" + prefix places the prefix before every line except
    # the first one, which gets its prefix from the format string below.
    separator = '\n' + prefix
    indented = separator.join(s.splitlines())
    return "%s%s\n" % (prefix, indented)
76 """Run a shell command.
79 cmd: the command to run.
81 Raises Error with verbose commentary if the command fails.
84 res = utils.RunCmd(cmd)
87 raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
89 Indent(res.fail_reason),
96 class WatcherState(object):
97 """Interface to a state file recording restart attempts.
101 """Open, lock, read and parse the file.
103 Raises StandardError on lock contention.
106 # The two-step dance below is necessary to allow both opening existing
107 # file read/write and creating if not existing. Vanilla open will truncate
108 # an existing file -or- allow creating if not existing.
109 f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
110 f = os.fdopen(f, 'w+')
113 fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
115 if x.errno == errno.EAGAIN:
116 raise StandardError("State file already locked")
122 self.data = simplejson.load(self.statefile)
123 except Exception, msg:
124 # Ignore errors while loading the file and treat it as empty
126 logging.warning(("Empty or invalid state file. Using defaults."
127 " Error message: %s"), msg)
129 if "instance" not in self.data:
130 self.data["instance"] = {}
131 if "node" not in self.data:
132 self.data["node"] = {}
135 """Called on destruction.
142 """Unlock configuration file and close it.
145 assert self.statefile
147 fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
149 self.statefile.close()
150 self.statefile = None
def GetNodeBootID(self, name):
    """Look up the boot ID stored for a node.

    Args:
      name: the key of the node in the "node" section of the state data.

    Returns:
      The recorded boot ID, or None if the node is unknown or its record
      holds no boot ID.

    """
    nodes = self.data["node"]

    if name not in nodes:
        return None

    record = nodes[name]
    if KEY_BOOT_ID not in record:
        return None

    return record[KEY_BOOT_ID]
162 def SetNodeBootID(self, name, bootid):
163 """Sets the boot ID of a node.
168 ndata = self.data["node"]
170 if name not in ndata:
173 ndata[name][KEY_BOOT_ID] = bootid
def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up; only its "name" attribute is read.

    Returns:
      The restart count stored for the instance, or 0 when the instance has
      no record (or its record carries no count).

    """
    idata = self.data["instance"]

    if instance.name not in idata:
        # Callers do arithmetic on the result, so "no record" must be a
        # number rather than None.
        return 0

    # Same default as RecordRestartAttempt uses when it bumps the counter.
    return idata[instance.name].get(KEY_RESTART_COUNT, 0)
def RecordRestartAttempt(self, instance):
    """Note one more restart attempt for an instance.

    Args:
      instance - the instance being restarted; only its "name" is read.

    The instance's record is created on first use. The time of the attempt
    is stored and the attempt counter is incremented by one.

    """
    # setdefault creates the empty record on first use and returns the
    # stored dict either way, so the updates below land in the state data.
    record = self.data["instance"].setdefault(instance.name, {})

    previous_attempts = record.get(KEY_RESTART_COUNT, 0)
    record[KEY_RESTART_WHEN] = time.time()
    record[KEY_RESTART_COUNT] = previous_attempts + 1
def RemoveInstance(self, instance):
    """Drop the bookkeeping record of an instance.

    Args:
      instance - the instance to remove from books; only its "name" is read.

    Removing an instance that has no record is a no-op.

    """
    # pop() with a default never raises for a missing key.
    self.data["instance"].pop(instance.name, None)
221 """Save state to file, then unlock and close it.
224 assert self.statefile
226 self.statefile.seek(0)
227 self.statefile.truncate()
229 simplejson.dump(self.data, self.statefile)
class Instance(object):
    """Abstraction for a Virtual Machine instance.

    Attributes:
      name: the instance name, passed to the gnt-instance commands.
      state: the state value supplied by the caller.
      autostart: flag supplied by the caller.

    Methods:
      Restart(): issue a command to restart the represented machine.
      ActivateDisks(): issue a command to activate the machine's disks.

    """
    def __init__(self, name, state, autostart):
        """Store the instance name, state and autostart flag."""
        self.name = name
        self.state = state
        self.autostart = autostart

    def Restart(self):
        """Encapsulates the start of an instance.

        Runs "gnt-instance startup" for this instance through DoCmd.

        """
        DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])

    def ActivateDisks(self):
        """Encapsulates the activation of all disks of an instance.

        Runs "gnt-instance activate-disks" for this instance through DoCmd.

        """
        DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15',
               self.name])
def _RunListCmd(cmd):
    """Run a command and yield its output as split fields.

    Args:
      cmd: the command to run, handed unchanged to DoCmd.

    Yields:
      One list per line of the command's stdout, produced by splitting the
      line on ':'.

    """
    output = DoCmd(cmd).stdout
    for row in output.splitlines():
        yield row.split(':')
267 def GetInstanceList(with_secondaries=None):
268 """Get a list of instances on this cluster.
271 cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
274 fields = 'name,oper_state,admin_state'
276 if with_secondaries is not None:
283 for fields in _RunListCmd(cmd):
284 if with_secondaries is not None:
285 (name, status, autostart, snodes) = fields
290 for node in with_secondaries:
291 if node in snodes.split(','):
297 (name, status, autostart) = fields
299 instances.append(Instance(name, status, autostart != "no"))
def GetNodeBootIDs():
    """Get a dict mapping nodes to boot IDs.

    Runs "gnt-node list" with ':' as separator and only the name and bootid
    columns.

    Returns:
      A dict with one entry per output line: node name -> boot ID string.

    Raises:
      ValueError: if an output line does not split into exactly two fields.

    """
    cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
           '--separator=:', '-o', 'name,bootid']

    result = {}
    for fields in _RunListCmd(cmd):
        (name, bootid) = fields
        result[name] = bootid

    return result
319 class Watcher(object):
320 """Encapsulate the logic for restarting erroneously halted virtual machines.
322 The calling program should periodically instantiate me and call Run().
323 This will traverse the list of instances, and make up to MAXTRIES attempts
324 to restart machines that are down.
328 sstore = ssconf.SimpleStore()
329 master = sstore.GetMasterNode()
330 if master != utils.HostInfo().name:
331 raise NotMasterError("This is not the master node")
332 self.instances = GetInstanceList()
333 self.bootids = GetNodeBootIDs()
334 self.started_instances = set()
337 notepad = WatcherState()
338 self.CheckInstances(notepad)
339 self.CheckDisks(notepad)
343 def CheckDisks(self, notepad):
344 """Check all nodes for restarted ones.
348 for name, id in self.bootids.iteritems():
349 old = notepad.GetNodeBootID(name)
351 # Node's boot ID has changed, probably through a reboot.
352 check_nodes.append(name)
355 # Activate disks for all instances with any of the checked nodes as a
357 for instance in GetInstanceList(with_secondaries=check_nodes):
358 if not instance.autostart:
359 logging.info(("Skipping disk activation for non-autostart"
360 " instance %s"), instance.name)
362 if instance.name in self.started_instances:
363 # we already tried to start the instance, which should have
364 # activated its drives (if they can be at all)
367 logging.info("Activating disks for instance %s", instance.name)
368 instance.ActivateDisks()
370 logging.error(str(err), exc_info=True)
372 # Keep changed boot IDs
373 for name in check_nodes:
374 notepad.SetNodeBootID(name, self.bootids[name])
376 def CheckInstances(self, notepad):
377 """Make a pass over the list of instances, restarting downed ones.
380 for instance in self.instances:
381 # Don't care about manually stopped instances
382 if not instance.autostart:
385 if instance.state in BAD_STATES:
386 n = notepad.NumberOfRestartAttempts(instance)
392 last = " (Attempt #%d)" % (n + 1)
394 notepad.RecordRestartAttempt(instance)
395 logging.error("Could not restart %s after %d attempts, giving up",
396 instance.name, MAXTRIES)
399 logging.info("Restarting %s%s",
402 self.started_instances.add(instance.name)
404 logging.error(str(err), exc_info=True)
406 notepad.RecordRestartAttempt(instance)
407 elif instance.state in HELPLESS_STATES:
408 if notepad.NumberOfRestartAttempts(instance):
409 notepad.RemoveInstance(instance)
411 if notepad.NumberOfRestartAttempts(instance):
412 notepad.RemoveInstance(instance)
413 logging.info("Restart of %s succeeded", instance.name)
415 def VerifyDisks(self):
416 """Run gnt-cluster verify-disks.
419 result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
421 logging.info(result.output)
425 """Parse the command line options.
428 (options, args) as from OptionParser.parse_args()
431 parser = OptionParser(description="Ganeti cluster watcher",
433 version="%%prog (ganeti) %s" %
434 constants.RELEASE_VERSION)
436 parser.add_option("-d", "--debug", dest="debug",
437 help="Write all messages to stderr",
438 default=False, action="store_true")
439 options, args = parser.parse_args()
def SetupLogging(debug):
    """Configures the logging module.

    Args:
      debug: when true, every message is also written to stderr; otherwise
        only CRITICAL messages reach stderr.

    Two handlers are attached to the root logger: a file handler writing to
    constants.LOG_WATCHER at INFO level and above, and a stderr handler
    whose level depends on "debug".

    """
    formatter = logging.Formatter("%(asctime)s: %(message)s")

    logfile_handler = logging.FileHandler(constants.LOG_WATCHER)
    logfile_handler.setFormatter(formatter)
    logfile_handler.setLevel(logging.INFO)

    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(formatter)
    if debug:
        stderr_handler.setLevel(logging.NOTSET)
    else:
        stderr_handler.setLevel(logging.CRITICAL)

    # The root logger lets everything through; each handler filters by its
    # own level.
    root_logger = logging.getLogger("")
    root_logger.setLevel(logging.NOTSET)
    root_logger.addHandler(logfile_handler)
    root_logger.addHandler(stderr_handler)
470 options, args = ParseOptions()
472 SetupLogging(options.debug)
477 except errors.ConfigurationError:
478 # Just exit if there's no configuration
479 sys.exit(constants.EXIT_SUCCESS)
481 except NotMasterError:
482 logging.debug("Not master, exiting")
483 sys.exit(constants.EXIT_NOTMASTER)
484 except errors.ResolverError, err:
485 logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
486 sys.exit(constants.EXIT_NODESETUP_ERROR)
487 except Exception, err:
488 logging.error(str(err), exc_info=True)
489 sys.exit(constants.EXIT_FAILURE)
492 if __name__ == '__main__':