4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot. Run from cron or similar.
"""
# File that stdout/stderr are redirected to unless --debug is given.
LOGFILE = '/var/log/ganeti/watcher.log'
# Operational states for which the watcher attempts a restart.
BAD_STATES = ['stopped']
# Operational states for which no restart is attempted; a pending restart
# record for such an instance is simply dropped.
HELPLESS_STATES = ['(node down)']
# NOTE(review): MAXTRIES, NOTICE and ERROR are referenced further down but
# their definitions are not visible in the reviewed extract -- confirm they
# are defined next to these constants.
import errno
import fcntl
import os
import socket
import sys
import time
from optparse import OptionParser

from ganeti import constants
from ganeti import ssconf
from ganeti import utils
class Error(Exception):
  """Base class for the errors raised by this script."""
class NotMasterError(Error):
  """Signals that the current host is not the cluster master node."""
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate the result with a newline.

  Args:
    s: The string to indent
    prefix: The string to prepend to each line.

  Returns:
    The indented text. The prefix is emitted at least once, so an empty
    input yields the prefix followed by a newline.

  """
  separator = '\n' + prefix
  indented_body = separator.join(s.splitlines())
  return "%s%s\n" % (prefix, indented_body)
def DoCmd(cmd):
  """Run a command and return its result, raising on failure.

  Args:
    cmd: the command to run; it is passed unchanged to utils.RunCmd.

  Returns:
    The result object produced by utils.RunCmd.

  Raises:
    Error: if the command failed. The message contains the command, the
      failure reason, stdout and stderr, the last three indented.

  """
  # NOTE(review): the "def" line, the failure test, the first/last tuple
  # members and the return statement were missing from the reviewed extract.
  # They are reconstructed from the four %s slots of the format string and
  # from the callers, which read res.stdout -- confirm against the
  # repository copy.
  res = utils.RunCmd(cmd)

  if res.failed:
    raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                (repr(cmd),
                 Indent(res.fail_reason),
                 Indent(res.stdout),
                 Indent(res.stderr)))

  return res
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  Creating an instance opens constants.WATCHER_STATEFILE (creating it when
  it does not exist), takes an exclusive, non-blocking lock on it and parses
  its "name:when:count" records into self.inst_map.
  Raises StandardError on lock contention.

  Methods:
    NumberOfAttempts(instance): returns the number of times in succession
                                a restart has been attempted of the instance.

    RecordAttempt(instance): records one more restart attempt of the
                             instance, stamped with the current time.

    Remove(instance): remove the record of the instance, if it exists.

    Save(): saves all records to file, releases lock and closes file.

  """
  def __init__(self):
    # NOTE(review): the method header and the statements that set
    # self.statefile / self.inst_map and convert the parsed fields were
    # missing from the reviewed extract; they are reconstructed from how the
    # other methods use these attributes -- confirm against the repository.

    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing. Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError as err:
      # Do not leak the open file when the lock cannot be taken.
      f.close()
      if err.errno == errno.EAGAIN:
        # StandardError is this class's documented contention signal.
        raise StandardError('State file already locked')
      raise

    self.statefile = f
    self.inst_map = {}

    for line in f:
      name, when, count = line.rstrip().split(':')
      # Save() writes both fields with %d, so they are read back as integers.
      self.inst_map[name] = (int(when), int(count))

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 when the instance has no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    Args:
      instance - the instance being restarted

    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    when no record exists.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      del self.inst_map[instance.name]

  def Save(self):
    """Save records to file, then unlock and close file.

    After this call self.statefile is None and the object can no longer
    be used.

    """
    assert self.statefile

    self.statefile.seek(0)
    self.statefile.truncate()

    for name in self.inst_map:
      self.statefile.write("%s:%d:%d\n" % ((name,) + self.inst_map[name]))

    # Flush while the lock is still held; otherwise the buffered records
    # would only reach the file at close(), after the lock is released.
    self.statefile.flush()
    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, as passed to the constructor.
    state: the instance state, as passed to the constructor.

  Methods:
    Restart(): issue a command to restart the represented machine.

  """
  def __init__(self, name, state):
    # NOTE(review): the two assignments and the Restart header were missing
    # from the reviewed extract; they are reconstructed from the uses of
    # instance.name / instance.state elsewhere in the file.
    self.name = name
    self.state = state

  def Restart(self):
    """Start the instance with 'gnt-instance startup'.

    Failures surface as whatever DoCmd raises.

    """
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
class InstanceList(object):
  """The set of Virtual Machine instances on a cluster.

  The list is built once, at construction time, from the output of
  'gnt-instance list'; iterating the object yields Instance objects.

  """
  # One line per instance, formatted "name:admin_state:oper_state".
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    # NOTE(review): the method header, the loop header, the field-count guard
    # and the "continue" statements were missing from the reviewed extract
    # and are reconstructed -- confirm against the repository copy.
    res = DoCmd(self.cmd)

    lines = res.stdout.splitlines()

    self.instances = []
    for line in lines:
      fields = [fld.strip() for fld in line.split(':')]

      if len(fields) != 3:
        # Not a "name:admin_state:oper_state" line; ignore it.
        continue
      if fields[1] == "no": #no autostart, we don't care about this instance
        continue
      name, status = fields[0], fields[2]

      self.instances.append(Instance(name, status))

  def __iter__(self):
    """Iterate over the collected Instance objects."""
    return iter(self.instances)
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label given to the constructor.
    msg: the message text.
    when: creation time, as returned by time.time().

  """
  def __init__(self, level, msg):
    # NOTE(review): the level/msg assignments and the __str__ header were
    # missing from the reviewed extract; reconstructed from their use in
    # __str__.
    self.level = level
    self.msg = msg
    self.when = time.time()

  def __str__(self):
    """Return the level and timestamp on one line, then the indented text."""
    return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg)
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that this host is the master and list the instances.

    Raises:
      NotMasterError: if the master node name differs from this host's name.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != socket.gethostname():
      raise NotMasterError("This is not the master node")
    self.instances = InstanceList()
    # Messages collected by Run() and written out by WriteReport().
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.

    Restart attempts are tracked in a RestarterState, which is saved (and
    thereby unlocked) even if the pass is interrupted by an exception.

    """
    # NOTE(review): several control-flow lines of this method (the MAXTRIES
    # comparisons, "continue", "try:"/"except", the Restart() call and the
    # "Restarting" format string) were missing from the reviewed extract and
    # are reconstructed from the surviving statements -- confirm against the
    # repository copy.
    notepad = RestarterState()

    try:
      for instance in self.instances:
        if instance.state in BAD_STATES:
          n = notepad.NumberOfAttempts(instance)

          if n > MAXTRIES:
            # We already reported giving up on this instance; stay quiet.
            continue

          if n == MAXTRIES:
            # Bump the counter past MAXTRIES so this is reported only once.
            notepad.RecordAttempt(instance)
            self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                         " times, giving up..." %
                                         (instance.name, MAXTRIES)))
            continue

          last = " (Attempt #%d)" % (n + 1)
          try:
            self.messages.append(Message(NOTICE,
                                         "Restarting %s%s." %
                                         (instance.name, last)))
            instance.Restart()
          except Error as x:
            self.messages.append(Message(ERROR, str(x)))

          # The attempt counts whether or not the restart command succeeded.
          notepad.RecordAttempt(instance)
        elif instance.state in HELPLESS_STATES:
          if notepad.NumberOfAttempts(instance):
            notepad.Remove(instance)
        else:
          if notepad.NumberOfAttempts(instance):
            notepad.Remove(instance)
            msg = Message(NOTICE,
                          "Restart of %s succeeded." % instance.name)
            self.messages.append(msg)
    finally:
      notepad.Save()

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
def ParseOptions():
  """Parse the command line options.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  # NOTE(review): the "def" line, one keyword argument between description=
  # and version=, and the return statement were missing from the reviewed
  # extract. The usage string below is an assumption -- confirm against the
  # repository copy.
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Don't redirect messages to the log file",
                    default=False, action="store_true")
  options, args = parser.parse_args()
  return options, args
def main():
  """Run one watcher pass and write its report.

  Unless --debug was given, stdout and stderr are redirected (appending) to
  LOGFILE first. On a node that is not the master the process exits with
  constants.EXIT_NOTMASTER.

  """
  # NOTE(review): the "def" line, "try:", the Run() call and the condition
  # guarding the "Not master" message were missing from the reviewed extract
  # and are reconstructed. The extract also skips a few lines after the
  # sys.exit() call; any further handler there is not visible -- confirm
  # against the repository copy.
  options, _args = ParseOptions()

  if not options.debug:
    sys.stderr = sys.stdout = open(LOGFILE, 'a')

  try:
    restarter = Restarter()
    restarter.Run()
    restarter.WriteReport(sys.stdout)
  except NotMasterError:
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
# Script entry point.
# NOTE(review): the body of this guard was missing from the reviewed extract;
# calling main() is assumed -- confirm against the repository copy.
if __name__ == '__main__':
  main()