4 # Copyright (C) 2006, 2007, 2008 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot. Run from cron or similar.

"""
import logging
import os
import sys
import time
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
from ganeti import serializer
from ganeti import ssconf
from ganeti import errors
from ganeti import logger
# Maximum number of restart attempts before the watcher gives up on an
# instance.
# NOTE(review): referenced by Watcher.CheckInstances but not defined in the
# visible source; 5 is assumed -- confirm.
MAXTRIES = 5

# Operational states for which a restart is attempted.
BAD_STATES = ['stopped']
# Operational states for which nothing can be done from here; any restart
# record for such an instance is simply dropped.
HELPLESS_STATES = ['(node down)']

# Keys used inside the watcher state file.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
class NotMasterError(errors.GenericError):
  """Raised when the watcher runs on a node that is not the cluster master."""
def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  Args:
    s: The string to indent
    prefix: The string to prepend each line.

  Returns:
    The text with the prefix in front of every line, terminated by exactly
    one newline. An empty string yields a single line holding only the
    prefix.

  """
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
def DoCmd(cmd):
  """Run a shell command.

  Args:
    cmd: the command to run.

  Returns:
    The result object of utils.RunCmd, for callers that need the output.

  Raises CommandError with verbose commentary on error.

  """
  res = utils.RunCmd(cmd)

  # NOTE(review): the failure test was missing from the visible source;
  # "failed" is assumed to be the flag set by utils.RunCmd -- confirm.
  if res.failed:
    msg = ("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
           (repr(cmd),
            Indent(res.fail_reason),
            Indent(res.stdout),
            Indent(res.stderr)))
    raise errors.CommandError(msg)

  return res
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two sections: "instance", holding the
  restart bookkeeping per instance name, and "node", holding the last boot
  ID recorded per node name.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing. Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      data = serializer.Load(self.statefile.read())
      # Well-formed data that is not a dictionary is as unusable as a corrupt
      # file, so send it down the same fallback path.
      if not isinstance(data, dict):
        raise ValueError("state file does not contain a dictionary")
    except Exception as msg:
      # Ignore errors while loading the file and treat it as empty
      data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    data.setdefault("instance", {})
    data.setdefault("node", {})
    self._data = data

    # Snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file.

    If nothing changed since loading, only the file's timestamps are
    updated. The state file is left open afterwards; use Close() to
    release it.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name: the node name to look up.

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]

    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name: the node name.
      bootid: the boot ID to record; must not be empty.

    """
    assert bootid

    ndata = self._data["node"]
    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # A record without a counter counts as zero attempts, the same way
      # RecordRestartAttempt treats it.
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Args:
      instance - the instance being restarted

    """
    idata = self._data["instance"]
    inst = idata.setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, as passed to the gnt-instance commands.
    state: the operational state string of the instance.
    autostart: whether the watcher may start this instance on its own.

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the instance's disks.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    Raises whatever DoCmd raises when the command fails.

    """
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    Raises whatever DoCmd raises when the command fails.

    """
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
def _RunListCmd(cmd):
  """Runs a command and parses its output into lists.

  Args:
    cmd: the command to run; its output is expected to use ':' as the
      field separator.

  Yields:
    One list of fields per line of the command's standard output.

  """
  for line in DoCmd(cmd).stdout.splitlines():
    yield line.split(':')
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional collection of node names. When given, only
      instances having at least one of these nodes among their secondary
      nodes are returned.

  Returns:
    A list of Instance objects.

  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:']

  fields = 'name,oper_state,admin_state'
  if with_secondaries is not None:
    fields += ',snodes'

  cmd.extend(['-o', fields])

  instances = []
  for row in _RunListCmd(cmd):
    if with_secondaries is not None:
      (name, status, autostart, snodes) = row

      # Split once per instance rather than once per candidate node
      secondaries = snodes.split(',')
      if not any(node in secondaries for node in with_secondaries):
        continue
    else:
      (name, status, autostart) = row

    # Any admin state other than "no" marks the instance as autostart
    instances.append(Instance(name, status, autostart != "no"))

  return instances
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  Returns:
    A dictionary with node names as keys and the boot ID strings reported
    by "gnt-node list" as values.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  ids = {}
  for fields in _RunListCmd(cmd):
    (name, bootid) = fields
    ids[name] = bootid

  return ids
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Verify this is the master node and collect the cluster status.

    Raises NotMasterError if the local host is not the master node.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of the instances for which a restart was issued during this run
    self.started_instances = set()

  def Run(self):
    """Run all checks once and save the watcher state.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      # Keep what was recorded so far even if one of the checks failed
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks of
    the autostart instances using it as a secondary node are activated, and
    the new boot ID is recorded.

    Args:
      notepad: the WatcherState to read from and update.

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      old = notepad.GetNodeBootID(name)
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=check_nodes):
      if not instance.autostart:
        logging.info(("Skipping disk activation for non-autostart"
                      " instance %s"), instance.name)
        continue
      if instance.name in self.started_instances:
        # we already tried to start the instance, which should have
        # activated its drives (if they can be at all)
        continue
      try:
        logging.info("Activating disks for instance %s", instance.name)
        instance.ActivateDisks()
      except Exception as err:
        # One failing instance must not prevent handling the others
        logging.error(str(err), exc_info=True)

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad: the WatcherState to read from and update.

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # Giving up was already reported on an earlier run; stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Bump the counter past MAXTRIES so this is reported only once
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                       instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception as err:
          logging.error(str(err), exc_info=True)

        # The attempt counts whether or not the restart command succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    """
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
    if result.output:
      logging.info(result.output)
def ParseOptions():
  """Parse the command line options.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  # NOTE(review): one keyword argument between description and version was
  # missing from the visible source; the usage string is assumed -- confirm.
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Write all messages to stderr",
                    default=False, action="store_true")
  options, args = parser.parse_args()
  return options, args
def main():
  """Main function.

  Sets up logging, runs the watcher once and converts the possible failures
  into process exit codes.

  """
  options, _ = ParseOptions()

  logger.SetupDaemon(constants.LOG_WATCHER, debug=options.debug)

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
  except SystemExit:
    # Let the sys.exit() above through; on old interpreters SystemExit is
    # an Exception subclass and would be caught by the generic handler.
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError as err:
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception as err:
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)


if __name__ == '__main__':
  main()