# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot. Run from cron or similar.

"""
import errno
import fcntl
import logging
import os
import sys
import time
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
from ganeti import serializer
from ganeti import ssconf
from ganeti import errors
# Number of restart attempts made for an instance before giving up.
# NOTE(review): MAXTRIES is referenced by the watcher logic but its
# definition is not visible in this file; 5 is an assumed value - confirm.
MAXTRIES = 5

# Operational states for which the watcher attempts a restart.
BAD_STATES = ['stopped']
# Operational states for which no restart is attempted; any recorded
# restart attempts for such an instance are simply dropped.
HELPLESS_STATES = ['(node down)']

# Keys used inside the per-instance and per-node records of the state file.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
class Error(Exception):
    """Generic custom error class.

    Base class for the errors raised by this program.

    """
class NotMasterError(Error):
    """Exception raised when this host is not the master."""
def Indent(s, prefix='| '):
    """Indent a piece of text with a given prefix before each line.

    Args:
      s: The string to indent
      prefix: The string to prepend each line.

    Returns:
      The indented text, always terminated by exactly one newline. An
      empty input yields just the prefix followed by a newline.

    """
    # splitlines() returns [] for an empty string; the prefix is still
    # emitted once in that case.
    indented = [prefix + line for line in s.splitlines()] or [prefix]
    return "\n".join(indented) + "\n"
def DoCmd(cmd):
    """Run a shell command.

    Args:
      cmd: the command to run.

    Returns:
      The result object returned by utils.RunCmd.

    Raises:
      Error: with verbose commentary (failure reason, stdout and stderr)
        if the command failed.

    """
    res = utils.RunCmd(cmd)

    # NOTE(review): the failure check and the command/stdout/stderr
    # arguments were reconstructed from the message format and from how
    # callers use the return value - confirm against utils.RunCmd.
    if res.failed:
        raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                    (repr(cmd),
                     Indent(res.fail_reason),
                     Indent(res.stdout),
                     Indent(res.stderr)))

    return res
class WatcherState(object):
    """Interface to a state file recording restart attempts.

    The parsed state is kept in self.data, a dict with two sub-dicts:
    "instance" maps instance names to records holding the restart count
    and the time of the last restart attempt, and "node" maps node names
    to records holding the last seen boot ID.

    """
    def __init__(self):
        """Open, lock, read and parse the file.

        Raises StandardError on lock contention.

        """
        # The two-step dance below is necessary to allow both opening existing
        # file read/write and creating if not existing. Vanilla open will
        # truncate an existing file -or- allow creating if not existing.
        fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
        f = os.fdopen(fd, 'w+')

        try:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError as x:
            # Do not leak the descriptor when the lock cannot be taken.
            f.close()
            if x.errno == errno.EAGAIN:
                raise StandardError("State file already locked")
            raise

        self.statefile = f

        try:
            self.data = serializer.Load(self.statefile.read())
        except Exception as msg:
            # Ignore errors while loading the file and treat it as empty
            self.data = {}
            logging.warning(("Empty or invalid state file. Using defaults."
                             " Error message: %s"), msg)

        self.data.setdefault("instance", {})
        self.data.setdefault("node", {})

    def __del__(self):
        """Called on destruction.

        Unlocks and closes the state file if that has not happened yet.

        """
        # NOTE(review): body reconstructed; getattr() guards against an
        # object whose __init__ failed before the file was stored.
        if getattr(self, "statefile", None):
            self._Close()

    def _Close(self):
        """Unlock configuration file and close it.

        """
        assert self.statefile

        fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

        self.statefile.close()
        self.statefile = None

    def GetNodeBootID(self, name):
        """Returns the last boot ID of a node or None.

        Args:
          name: the node name to look up.

        """
        return self.data["node"].get(name, {}).get(KEY_BOOT_ID)

    def SetNodeBootID(self, name, bootid):
        """Sets the boot ID of a node.

        Args:
          name: the node name.
          bootid: the boot ID to record for that node.

        """
        ndata = self.data["node"]
        ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

    def NumberOfRestartAttempts(self, instance):
        """Returns number of previous restart attempts.

        Args:
          instance - the instance to look up.

        Returns:
          The recorded restart count, or 0 if the instance has no record.

        """
        idata = self.data["instance"]

        if instance.name in idata:
            return idata[instance.name][KEY_RESTART_COUNT]

        return 0

    def RecordRestartAttempt(self, instance):
        """Record a restart attempt.

        Stores the current time and increments the restart counter,
        creating the record if needed.

        Args:
          instance - the instance being restarted

        """
        inst = self.data["instance"].setdefault(instance.name, {})

        inst[KEY_RESTART_WHEN] = time.time()
        inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

    def RemoveInstance(self, instance):
        """Update state to reflect that a machine is running, i.e. remove record.

        Args:
          instance - the instance to remove from books

        This method removes the record for a named instance. It is a
        no-op if no record exists.

        """
        self.data["instance"].pop(instance.name, None)

    def Save(self):
        """Save state to file, then unlock and close it.

        """
        assert self.statefile

        # Overwrite the previous contents in place.
        self.statefile.seek(0)
        self.statefile.truncate()

        self.statefile.write(serializer.Dump(self.data))

        self._Close()
class Instance(object):
    """Abstraction for a Virtual Machine instance.

    Attributes:
      name: the instance name, passed to the gnt-instance commands.
      state: the instance state as given by the caller.
      autostart: whether the instance is a candidate for automatic
        restart and disk activation.

    Methods:
      Restart(): issue a command to restart the represented machine.
      ActivateDisks(): issue a command to activate the instance's disks.

    """
    def __init__(self, name, state, autostart):
        self.name = name
        self.state = state
        self.autostart = autostart

    def Restart(self):
        """Encapsulates the start of an instance.

        Runs "gnt-instance startup" for this instance via DoCmd.

        """
        DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])

    def ActivateDisks(self):
        """Encapsulates the activation of all disks of an instance.

        Runs "gnt-instance activate-disks" for this instance via DoCmd.

        """
        DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15',
               self.name])
def _RunListCmd(cmd):
    """Runs a command and parses its output into lists.

    Args:
      cmd: the command to run, passed unchanged to DoCmd.

    Yields:
      One list per line of the command's standard output, holding that
      line's ':'-separated fields.

    """
    for line in DoCmd(cmd).stdout.splitlines():
        yield line.split(':')
def GetInstanceList(with_secondaries=None):
    """Get a list of instances on this cluster.

    Args:
      with_secondaries: optional list of node names; when given, only
        instances having at least one of these nodes among their
        secondary nodes are returned.

    Returns:
      A list of Instance objects.

    """
    # NOTE(review): the separator/output arguments and the filtering
    # control flow were reconstructed from the visible fragments - confirm.
    cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
           '--separator=:']

    fields = 'name,oper_state,admin_state'

    if with_secondaries is not None:
        fields += ',snodes'

    cmd.append('-o')
    cmd.append(fields)

    instances = []
    for row in _RunListCmd(cmd):
        if with_secondaries is not None:
            (name, status, autostart, snodes) = row

            # Keep only instances with one of the requested secondaries.
            secondaries = snodes.split(',')
            if not any(node in secondaries for node in with_secondaries):
                continue
        else:
            (name, status, autostart) = row

        # Any admin state other than "no" marks the instance as autostart.
        instances.append(Instance(name, status, autostart != "no"))

    return instances
def GetNodeBootIDs():
    """Get a dict mapping nodes to boot IDs.

    Returns:
      A dict keyed by node name whose values are the boot IDs reported
      by "gnt-node list".

    """
    cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
           '--separator=:', '-o', 'name,bootid']

    ids = {}
    for fields in _RunListCmd(cmd):
        (name, bootid) = fields
        ids[name] = bootid

    return ids
class Watcher(object):
    """Encapsulate the logic for restarting erroneously halted virtual machines.

    The calling program should periodically instantiate me and call Run().
    This will traverse the list of instances, and make up to MAXTRIES attempts
    to restart machines that are down.

    """
    def __init__(self):
        """Collect the instance list and node boot IDs.

        Raises NotMasterError if this host is not the master node.

        """
        sstore = ssconf.SimpleStore()
        master = sstore.GetMasterNode()
        if master != utils.HostInfo().name:
            raise NotMasterError("This is not the master node")
        self.instances = GetInstanceList()
        self.bootids = GetNodeBootIDs()
        # Names of instances for which a restart was issued in this run.
        self.started_instances = set()

    def Run(self):
        """Run all checks and save the watcher state afterwards.

        """
        # NOTE(review): the try/finally, the VerifyDisks() call and the
        # final Save() were reconstructed - confirm.
        notepad = WatcherState()
        try:
            self.CheckInstances(notepad)
            self.CheckDisks(notepad)
            self.VerifyDisks()
        finally:
            notepad.Save()

    def CheckDisks(self, notepad):
        """Check all nodes for restarted ones.

        Args:
          notepad: the WatcherState holding the last known boot IDs.

        """
        check_nodes = []
        for name, bootid in self.bootids.items():
            old = notepad.GetNodeBootID(name)
            if old != bootid:
                # Node's boot ID has changed, probably through a reboot.
                check_nodes.append(name)

        if check_nodes:
            # Activate disks for all instances with any of the checked nodes
            # as a secondary node.
            for instance in GetInstanceList(with_secondaries=check_nodes):
                if not instance.autostart:
                    logging.info(("Skipping disk activation for non-autostart"
                                  " instance %s"), instance.name)
                    continue
                if instance.name in self.started_instances:
                    # we already tried to start the instance, which should
                    # have activated its drives (if they can be at all)
                    continue
                try:
                    logging.info("Activating disks for instance %s",
                                 instance.name)
                    instance.ActivateDisks()
                except Error as err:
                    logging.error(str(err), exc_info=True)

            # Keep changed boot IDs
            for name in check_nodes:
                notepad.SetNodeBootID(name, self.bootids[name])

    def CheckInstances(self, notepad):
        """Make a pass over the list of instances, restarting downed ones.

        Args:
          notepad: the WatcherState recording restart attempts.

        """
        for instance in self.instances:
            # Don't care about manually stopped instances
            if not instance.autostart:
                continue

            if instance.state in BAD_STATES:
                n = notepad.NumberOfRestartAttempts(instance)

                # NOTE(review): the comparisons against MAXTRIES were
                # reconstructed from the visible branch bodies - confirm.
                if n > MAXTRIES:
                    # Already gave up on this instance; stay quiet.
                    continue
                elif n < MAXTRIES:
                    last = " (Attempt #%d)" % (n + 1)
                else:
                    # Recording one more attempt pushes the count past
                    # MAXTRIES, so this error is logged only once.
                    notepad.RecordRestartAttempt(instance)
                    logging.error("Could not restart %s after %d attempts,"
                                  " giving up", instance.name, MAXTRIES)
                    continue
                try:
                    logging.info("Restarting %s%s",
                                 instance.name, last)
                    instance.Restart()
                    self.started_instances.add(instance.name)
                except Error as err:
                    logging.error(str(err), exc_info=True)

                notepad.RecordRestartAttempt(instance)
            elif instance.state in HELPLESS_STATES:
                if notepad.NumberOfRestartAttempts(instance):
                    notepad.RemoveInstance(instance)
            else:
                if notepad.NumberOfRestartAttempts(instance):
                    notepad.RemoveInstance(instance)
                    logging.info("Restart of %s succeeded", instance.name)

    def VerifyDisks(self):
        """Run gnt-cluster verify-disks.

        """
        result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
        # NOTE(review): logging only non-empty output is an assumption.
        if result.output:
            logging.info(result.output)
def ParseOptions():
    """Parse the command line options.

    Returns:
      (options, args) as from OptionParser.parse_args()

    """
    # NOTE(review): the usage string is an assumption based on the single
    # supported option - confirm.
    parser = OptionParser(description="Ganeti cluster watcher",
                          usage="%prog [-d]",
                          version="%%prog (ganeti) %s" %
                          constants.RELEASE_VERSION)

    parser.add_option("-d", "--debug", dest="debug",
                      help="Write all messages to stderr",
                      default=False, action="store_true")
    options, args = parser.parse_args()
    return options, args
def SetupLogging(debug):
    """Configures the logging module.

    Messages of level INFO and above always go to the watcher log file.

    Args:
      debug: if true, all messages are also written to stderr; otherwise
        only CRITICAL messages reach stderr.

    """
    formatter = logging.Formatter("%(asctime)s: %(message)s")

    logfile_handler = logging.FileHandler(constants.LOG_WATCHER)
    logfile_handler.setFormatter(formatter)
    logfile_handler.setLevel(logging.INFO)

    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(formatter)
    if debug:
        stderr_handler.setLevel(logging.NOTSET)
    else:
        stderr_handler.setLevel(logging.CRITICAL)

    # The root logger lets everything through; filtering is per handler.
    root_logger = logging.getLogger("")
    root_logger.setLevel(logging.NOTSET)
    root_logger.addHandler(logfile_handler)
    root_logger.addHandler(stderr_handler)
def main():
    """Main function.

    Parses the options, configures logging and runs the watcher once,
    exiting with a specific exit code on each known failure.

    """
    options, _ = ParseOptions()

    SetupLogging(options.debug)

    # NOTE(review): the try nesting and the Watcher calls were
    # reconstructed from the visible exception handlers - confirm.
    try:
        try:
            watcher = Watcher()
        except errors.ConfigurationError:
            # Just exit if there's no configuration
            sys.exit(constants.EXIT_SUCCESS)

        watcher.Run()
    except NotMasterError:
        logging.debug("Not master, exiting")
        sys.exit(constants.EXIT_NOTMASTER)
    except errors.ResolverError as err:
        logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
        sys.exit(constants.EXIT_NODESETUP_ERROR)
    except Exception as err:
        logging.error(str(err), exc_info=True)
        sys.exit(constants.EXIT_FAILURE)


if __name__ == '__main__':
    main()