# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot. Run from cron or similar.
"""
# File that receives the watcher's report unless --debug is given.
LOGFILE = '/var/log/ganeti/watcher.log'

# Operational states for which a restart is attempted.
BAD_STATES = ['stopped']
# Operational states in which nothing can be done for the instance.
HELPLESS_STATES = ['(node down)']

# NOTE(review): MAXTRIES, NOTICE and ERROR are referenced by Restarter but
# their definitions are not visible in this extract. The values below are
# assumptions -- confirm them against the authoritative copy of the file.
MAXTRIES = 5
NOTICE = 'NOTICE'
ERROR = 'ERROR'
import errno
import fcntl
import os
import sys
import time
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
class Error(Exception):
    """Generic custom error class."""
def Indent(s, prefix='| '):
    """Indent a piece of text with a given prefix before each line.

    Args:
      s: The string to indent
      prefix: The string to prepend each line.

    Returns:
      The text with prefix in front of every line, followed by a newline.
      An empty s yields the prefix followed by a newline.
    """
    return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
def DoCmd(cmd):
    """Run a shell command.

    Args:
      cmd: the command to run.

    Returns:
      The result object returned by utils.RunCmd.

    Raises:
      Error: with verbose commentary (failure reason, stdout, stderr) when
        the command failed.
    """
    res = utils.RunCmd(cmd)

    # NOTE(review): the failure test and part of the argument tuple were lost
    # in this extract; `res.failed`, `repr(cmd)` and `res.stderr` are
    # reconstructed from the message format -- confirm against utils.RunCmd.
    if res.failed:
        raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                    (repr(cmd),
                     Indent(res.fail_reason),
                     Indent(res.stdout),
                     Indent(res.stderr)))

    return res
class RestarterState(object):
    """Interface to a state file recording restart attempts.

    Creating an object opens, locks, reads and parses the state file;
    StandardError is raised on lock contention.

    Methods:
      NumberOfAttempts(instance): returns the number of times in succession
        a restart has been attempted of the given instance.

      RecordAttempt(instance): records one restart attempt of the instance
        at the current time.

      Remove(instance): remove the record of the instance, if it exists.

      Save(): saves all records to file, releases lock and closes file.
    """

    def __init__(self):
        """Open and lock the state file, then load its records.

        Raises:
          StandardError: if the state file is already locked.
        """
        # The two-step dance below is necessary to allow both opening existing
        # file read/write and creating if not existing. Vanilla open will
        # truncate an existing file -or- allow creating if not existing.
        f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
        f = os.fdopen(f, 'w+')

        try:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError as x:
            # Without the lock the file is of no use to us; do not leak it.
            f.close()
            if x.errno == errno.EAGAIN:
                raise StandardError('State file already locked')
            raise

        self.statefile = f
        # Maps instance name -> (time of last attempt, number of attempts).
        self.inst_map = {}

        for line in f:
            name, when, count = line.rstrip().split(':')
            # Save() writes both fields with %d, so they read back as ints.
            self.inst_map[name] = (int(when), int(count))

    def NumberOfAttempts(self, instance):
        """Returns number of previous restart attempts.

        Args:
          instance - the instance to look up.
        """
        assert self.statefile

        record = self.inst_map.get(instance.name)
        if record is None:
            return 0
        return record[1]

    def RecordAttempt(self, instance):
        """Record a restart attempt.

        Args:
          instance - the instance being restarted
        """
        assert self.statefile

        when = time.time()

        self.inst_map[instance.name] = (when,
                                        1 + self.NumberOfAttempts(instance))

    def Remove(self, instance):
        """Update state to reflect that a machine is running.

        Args:
          instance - the instance to remove from books

        This method removes the record for a named instance; it does nothing
        if there is no such record.
        """
        assert self.statefile

        self.inst_map.pop(instance.name, None)

    def Save(self):
        """Save records to file, then unlock and close file."""
        assert self.statefile

        self.statefile.seek(0)
        self.statefile.truncate()

        for name, (when, count) in self.inst_map.items():
            self.statefile.write("%s:%d:%d\n" % (name, when, count))

        # Push the records to disk while the lock is still held; otherwise
        # another process could lock and read a truncated file.
        self.statefile.flush()

        fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

        self.statefile.close()
        self.statefile = None
class Instance(object):
    """Abstraction for a Virtual Machine instance.

    Attributes:
      name: the instance name, as passed to the constructor.
      state: the instance state, as passed to the constructor.

    Methods:
      Restart(): issue a command to restart the represented machine.
    """

    def __init__(self, name, state):
        self.name = name
        self.state = state

    def Restart(self):
        """Run 'gnt-instance startup' for this instance through DoCmd."""
        DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
class InstanceList(object):
    """The set of Virtual Machine instances on a cluster.

    The list is read once, at construction time, from the output of
    'gnt-instance list'. Iterating yields Instance objects.
    """
    cmd = ['gnt-instance', 'list', '--lock-retries=15',
           '-o', 'name,admin_state,oper_state', '--no-headers',
           '--separator=:']

    def __init__(self):
        res = DoCmd(self.cmd)

        self.instances = []
        for line in res.stdout.splitlines():
            fields = [fld.strip() for fld in line.split(':')]

            # Expect exactly name, admin_state and oper_state; skip anything
            # else rather than failing on the field accesses below.
            if len(fields) != 3:
                continue
            if fields[1] == "no":
                # no autostart, we don't care about this instance
                continue
            name, status = fields[0], fields[2]

            self.instances.append(Instance(name, status))

    def __iter__(self):
        return iter(self.instances)
class Message(object):
    """Encapsulation of a notice or error message.

    Attributes:
      level: the level string given to the constructor.
      msg: the message text given to the constructor.
      when: the time.time() value at which the message was created.
    """

    def __init__(self, level, msg):
        self.level = level
        self.msg = msg
        self.when = time.time()

    def __str__(self):
        # Header line with level and creation time, then the indented text.
        return (self.level + ' ' + time.ctime(self.when) + '\n' +
                Indent(self.msg))
class Restarter(object):
    """Encapsulate the logic for restarting erroneously halted virtual machines.

    The calling program should periodically instantiate me and call Run().
    This will traverse the list of instances, and make up to MAXTRIES attempts
    to restart machines that are down.
    """

    def __init__(self):
        self.instances = InstanceList()
        self.messages = []

    def Run(self):
        """Make a pass over the list of instances, restarting downed ones."""
        notepad = RestarterState()

        try:
            for instance in self.instances:
                self._ProcessInstance(notepad, instance)
        finally:
            # Save even when something unexpected is raised, so the attempts
            # recorded so far are not lost.
            notepad.Save()

    def _ProcessInstance(self, notepad, instance):
        """Restart one instance if needed and update its restart record."""
        if instance.state in BAD_STATES:
            n = notepad.NumberOfAttempts(instance)

            # NOTE(review): the conditions of this if/elif/else were lost in
            # this extract; they are reconstructed from the class docstring
            # ("up to MAXTRIES attempts") and the surviving branch bodies.
            if n > MAXTRIES:
                # We already reported giving up on this instance; stay quiet.
                return
            elif n < MAXTRIES:
                last = " (Attempt #%d)" % (n + 1)
            else:
                # Recording once more pushes the count past MAXTRIES, so the
                # give-up message is reported only once.
                notepad.RecordAttempt(instance)
                self.messages.append(Message(ERROR,
                                             "Could not restart %s for %d"
                                             " times, giving up..." %
                                             (instance.name, MAXTRIES)))
                return

            try:
                # NOTE(review): the format string of this notice was lost in
                # this extract; "Restarting %s%s." is an assumption.
                self.messages.append(Message(NOTICE,
                                             "Restarting %s%s." %
                                             (instance.name, last)))
                instance.Restart()
            except Error as x:
                self.messages.append(Message(ERROR, str(x)))

            notepad.RecordAttempt(instance)
        elif instance.state in HELPLESS_STATES:
            if notepad.NumberOfAttempts(instance):
                notepad.Remove(instance)
        else:
            if notepad.NumberOfAttempts(instance):
                notepad.Remove(instance)
                msg = Message(NOTICE,
                              "Restart of %s succeeded." % instance.name)
                self.messages.append(msg)

    def WriteReport(self, logfile):
        """Log all messages to file.

        Args:
          logfile: file object open for writing (the log file)
        """
        for msg in self.messages:
            logfile.write(str(msg) + '\n')
def ParseOptions():
    """Parse the command line options.

    Returns:
      (options, args) as from OptionParser.parse_args()
    """
    # NOTE(review): one argument of this OptionParser(...) call, between
    # description and version, appears to be missing from this extract
    # (possibly usage=) -- restore it from the authoritative copy.
    parser = OptionParser(description="Ganeti cluster watcher",
                          version="%%prog (ganeti) %s" %
                          constants.RELEASE_VERSION)

    parser.add_option("-d", "--debug", dest="debug",
                      help="Don't redirect messages to the log file",
                      default=False, action="store_true")
    options, args = parser.parse_args()
    return options, args
def main():
    """Run one watcher pass and write its report."""
    options, _ = ParseOptions()

    if not options.debug:
        # Everything printed from here on goes to the log file.
        sys.stderr = sys.stdout = open(LOGFILE, 'a')

    # NOTE(review): the lines around these calls were lost in this extract;
    # the Run() call follows the Restarter docstring, and the Error handler
    # is an assumption -- confirm against the authoritative copy.
    try:
        restarter = Restarter()
        restarter.Run()
        restarter.WriteReport(sys.stdout)
    except Error as err:
        sys.stdout.write("%s\n" % err)


if __name__ == '__main__':
    main()