# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

"""

import logging
import os
import sys
import time
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
from ganeti import serializer
from ganeti import ssconf
from ganeti import errors
from ganeti import opcodes
from ganeti import logger
from ganeti import cli

49 BAD_STATES = ['stopped']
50 HELPLESS_STATES = ['(node down)']
53 KEY_RESTART_COUNT = "restart_count"
54 KEY_RESTART_WHEN = "restart_when"
55 KEY_BOOT_ID = "bootid"
58 # Global client object
62 class NotMasterError(errors.GenericError):
63 """Exception raised when this host is not the master."""
66 def Indent(s, prefix='| '):
67 """Indent a piece of text with a given prefix before each line.
70 s: The string to indent
71 prefix: The string to prepend each line.
74 return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
77 class WatcherState(object):
78 """Interface to a state file recording restart attempts.
82 """Open, lock, read and parse the file.
84 Raises exception on lock contention.
87 # The two-step dance below is necessary to allow both opening existing
88 # file read/write and creating if not existing. Vanilla open will truncate
89 # an existing file -or- allow creating if not existing.
90 fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
91 self.statefile = os.fdopen(fd, 'w+')
93 utils.LockFile(self.statefile.fileno())
96 self._data = serializer.Load(self.statefile.read())
97 except Exception, msg:
98 # Ignore errors while loading the file and treat it as empty
100 logging.warning(("Empty or invalid state file. Using defaults."
101 " Error message: %s"), msg)
103 if "instance" not in self._data:
104 self._data["instance"] = {}
105 if "node" not in self._data:
106 self._data["node"] = {}
108 self._orig_data = serializer.Dump(self._data)
111 """Save state to file, then unlock and close it.
114 assert self.statefile
116 serialized_form = serializer.Dump(self._data)
117 if self._orig_data == serialized_form:
118 logging.debug("Data didn't change, just touching status file")
119 os.utime(constants.WATCHER_STATEFILE, None)
122 # We need to make sure the file is locked before renaming it, otherwise
123 # starting ganeti-watcher again at the same time will create a conflict.
124 fd = utils.WriteFile(constants.WATCHER_STATEFILE,
125 data=serialized_form,
126 prewrite=utils.LockFile, close=False)
127 self.statefile = os.fdopen(fd, 'w+')
130 """Unlock configuration file and close it.
133 assert self.statefile
135 # Files are automatically unlocked when closing them
136 self.statefile.close()
137 self.statefile = None
139 def GetNodeBootID(self, name):
140 """Returns the last boot ID of a node or None.
143 ndata = self._data["node"]
145 if name in ndata and KEY_BOOT_ID in ndata[name]:
146 return ndata[name][KEY_BOOT_ID]
149 def SetNodeBootID(self, name, bootid):
150 """Sets the boot ID of a node.
155 ndata = self._data["node"]
157 if name not in ndata:
160 ndata[name][KEY_BOOT_ID] = bootid
162 def NumberOfRestartAttempts(self, instance):
163 """Returns number of previous restart attempts.
166 instance - the instance to look up.
169 idata = self._data["instance"]
171 if instance.name in idata:
172 return idata[instance.name][KEY_RESTART_COUNT]
176 def RecordRestartAttempt(self, instance):
177 """Record a restart attempt.
180 instance - the instance being restarted
183 idata = self._data["instance"]
185 if instance.name not in idata:
186 inst = idata[instance.name] = {}
188 inst = idata[instance.name]
190 inst[KEY_RESTART_WHEN] = time.time()
191 inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
193 def RemoveInstance(self, instance):
194 """Update state to reflect that a machine is running, i.e. remove record.
197 instance - the instance to remove from books
199 This method removes the record for a named instance.
202 idata = self._data["instance"]
204 if instance.name in idata:
205 del idata[instance.name]
208 class Instance(object):
209 """Abstraction for a Virtual Machine instance.
212 Restart(): issue a command to restart the represented machine.
215 def __init__(self, name, state, autostart):
218 self.autostart = autostart
221 """Encapsulates the start of an instance.
224 op = opcodes.OpStartupInstance(instance_name=self.name,
227 cli.SubmitOpCode(op, cl=client)
229 def ActivateDisks(self):
230 """Encapsulates the activation of all disks of an instance.
233 op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
234 cli.SubmitOpCode(op, cl=client)
237 def GetInstanceList(with_secondaries=None):
238 """Get a list of instances on this cluster.
241 fields = ["name", "oper_state", "admin_state"]
243 if with_secondaries is not None:
244 fields.append("snodes")
246 result = client.QueryInstances([], fields)
249 for fields in result:
250 if with_secondaries is not None:
251 (name, status, autostart, snodes) = fields
256 for node in with_secondaries:
263 (name, status, autostart) = fields
265 instances.append(Instance(name, status, autostart))
270 def GetNodeBootIDs():
271 """Get a dict mapping nodes to boot IDs.
274 result = client.QueryNodes([], ["name", "bootid"])
275 return dict([(name, bootid) for name, bootid in result])
278 class Watcher(object):
279 """Encapsulate the logic for restarting erronously halted virtual machines.
281 The calling program should periodically instantiate me and call Run().
282 This will traverse the list of instances, and make up to MAXTRIES attempts
283 to restart machines that are down.
287 sstore = ssconf.SimpleStore()
288 master = sstore.GetMasterNode()
289 if master != utils.HostInfo().name:
290 raise NotMasterError("This is not the master node")
291 self.instances = GetInstanceList()
292 self.bootids = GetNodeBootIDs()
293 self.started_instances = set()
296 notepad = WatcherState()
298 self.CheckInstances(notepad)
299 self.CheckDisks(notepad)
304 def CheckDisks(self, notepad):
305 """Check all nodes for restarted ones.
309 for name, new_id in self.bootids.iteritems():
310 old = notepad.GetNodeBootID(name)
312 # Node's boot ID has changed, proably through a reboot.
313 check_nodes.append(name)
316 # Activate disks for all instances with any of the checked nodes as a
318 for instance in GetInstanceList(with_secondaries=check_nodes):
319 if not instance.autostart:
320 logging.info(("Skipping disk activation for non-autostart"
321 " instance %s"), instance.name)
323 if instance.name in self.started_instances:
324 # we already tried to start the instance, which should have
325 # activated its drives (if they can be at all)
328 logging.info("Activating disks for instance %s", instance.name)
329 instance.ActivateDisks()
330 except Exception, err:
331 logging.error(str(err), exc_info=True)
333 # Keep changed boot IDs
334 for name in check_nodes:
335 notepad.SetNodeBootID(name, self.bootids[name])
337 def CheckInstances(self, notepad):
338 """Make a pass over the list of instances, restarting downed ones.
341 for instance in self.instances:
342 # Don't care about manually stopped instances
343 if not instance.autostart:
346 if instance.state in BAD_STATES:
347 n = notepad.NumberOfRestartAttempts(instance)
353 last = " (Attempt #%d)" % (n + 1)
355 notepad.RecordRestartAttempt(instance)
356 logging.error("Could not restart %s after %d attempts, giving up",
357 instance.name, MAXTRIES)
360 logging.info("Restarting %s%s",
363 self.started_instances.add(instance.name)
364 except Exception, err:
365 logging.error(str(err), exc_info=True)
367 notepad.RecordRestartAttempt(instance)
368 elif instance.state in HELPLESS_STATES:
369 if notepad.NumberOfRestartAttempts(instance):
370 notepad.RemoveInstance(instance)
372 if notepad.NumberOfRestartAttempts(instance):
373 notepad.RemoveInstance(instance)
374 logging.info("Restart of %s succeeded", instance.name)
376 def VerifyDisks(self):
377 """Run gnt-cluster verify-disks.
380 op = opcodes.OpVerifyDisks()
381 result = cli.SubmitOpCode(op, cl=client)
382 if not isinstance(result, (tuple, list)):
383 logging.error("Can't get a valid result from verify-disks")
385 offline_disk_instances = result[2]
386 if not offline_disk_instances:
389 logging.debug("Will activate disks for instances %s",
390 ", ".join(offline_disk_instances))
391 # we submit only one job, and wait for it. not optimal, but spams
393 job = [opcodes.OpActivateInstanceDisks(instance_name=name)
394 for name in offline_disk_instances]
395 job_id = cli.SendJob(job, cl=client)
397 cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
401 """Parse the command line options.
404 (options, args) as from OptionParser.parse_args()
407 parser = OptionParser(description="Ganeti cluster watcher",
409 version="%%prog (ganeti) %s" %
410 constants.RELEASE_VERSION)
412 parser.add_option("-d", "--debug", dest="debug",
413 help="Write all messages to stderr",
414 default=False, action="store_true")
415 options, args = parser.parse_args()
425 options, args = ParseOptions()
427 logger.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
428 stderr_logging=options.debug)
431 client = cli.GetClient()
435 except errors.ConfigurationError:
436 # Just exit if there's no configuration
437 sys.exit(constants.EXIT_SUCCESS)
442 except NotMasterError:
443 logging.debug("Not master, exiting")
444 sys.exit(constants.EXIT_NOTMASTER)
445 except errors.ResolverError, err:
446 logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
447 sys.exit(constants.EXIT_NODESETUP_ERROR)
448 except Exception, err:
449 logging.error(str(err), exc_info=True)
450 sys.exit(constants.EXIT_FAILURE)
453 if __name__ == '__main__':