Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 38242904

History | View | Annotate | Download (9.1 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 a8083063 Iustin Pop
# Copyright (C) 2006, 2007 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
"""
28 a8083063 Iustin Pop
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
# File that receives stdout/stderr unless the debug option is given.
LOGFILE = '/var/log/ganeti/watcher.log'
# Number of restart attempts per instance before the watcher gives up.
MAXTRIES = 5
# Instance states for which a restart is attempted.
BAD_STATES = ['stopped']
# Instance states for which no restart is attempted; any record is dropped.
HELPLESS_STATES = ['(node down)']
# Level labels used for the messages of the report.
NOTICE = 'NOTICE'
ERROR = 'ERROR'
36 a8083063 Iustin Pop
37 a8083063 Iustin Pop
import os
38 a8083063 Iustin Pop
import sys
39 a8083063 Iustin Pop
import time
40 a8083063 Iustin Pop
import fcntl
41 a8083063 Iustin Pop
import errno
42 38242904 Iustin Pop
import socket
43 a8083063 Iustin Pop
from optparse import OptionParser
44 a8083063 Iustin Pop
45 a8083063 Iustin Pop
46 a8083063 Iustin Pop
from ganeti import utils
47 a8083063 Iustin Pop
from ganeti import constants
48 38242904 Iustin Pop
from ganeti import ssconf
49 a8083063 Iustin Pop
50 a8083063 Iustin Pop
51 a8083063 Iustin Pop
class Error(Exception):
  """Base class for the errors raised by the watcher itself."""
53 38242904 Iustin Pop
54 38242904 Iustin Pop
55 38242904 Iustin Pop
class NotMasterError(Error):
  """Raised when the watcher runs on a host that is not the master node."""
57 a8083063 Iustin Pop
58 a8083063 Iustin Pop
59 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a text.

  Args:
    s: the text to indent; may span several lines
    prefix: the string placed before each line

  Returns:
    The prefixed text, terminated by a newline.  An empty text yields
    the prefix followed by a newline.

  """
  line_break = '\n' + prefix
  body = line_break.join(s.splitlines())
  return prefix + body + "\n"
68 a8083063 Iustin Pop
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist that it succeeds.

  Args:
    cmd: the command to run, passed unchanged to utils.RunCmd

  Returns:
    The result object of utils.RunCmd when the command did not fail.

  Raises:
    Error: when the result is flagged as failed; the message holds the
      command, the failure reason and the indented stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
89 a8083063 Iustin Pop
90 a8083063 Iustin Pop
91 a8083063 Iustin Pop
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  The constructor opens (creating it if needed), locks, reads and parses
  constants.WATCHER_STATEFILE.  It raises StandardError on lock contention.

  Each line of the file has the form "name:when:count".

  Methods:
    NumberOfAttempts(instance): returns the number of restart attempts
                                recorded for the instance.

    RecordAttempt(instance): records one more restart attempt of the
                             instance, stamped with the current time.

    Remove(instance): removes the record of the instance, if it exists.

    Save(): saves all records to the file, releases the lock and closes
            the file.

  """
  def __init__(self):
    """Open, lock and load the state file.

    Raises:
      StandardError: if the state file is already locked.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() fetches the exception without version-specific syntax.
      err = sys.exc_info()[1]
      # Do not leak the open file when the lock cannot be taken.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError('State file already locked')
      raise

    self.statefile = f
    self.inst_map = {}

    for line in f:
      try:
        # Split from the right so only the two numeric fields are cut off.
        name, when, count = line.rstrip().rsplit(':', 2)
        record = (int(when), int(count))
      except ValueError:
        # Blank or damaged line: skip it instead of aborting, since the
        # file is rewritten from scratch by Save() anyway.
        continue

      self.inst_map[name] = record

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 if the instance has no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    The record gets the current time and the previous count plus one.

    Args:
      instance - the instance being restarted

    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    if there is no such record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      del self.inst_map[instance.name]

  def Save(self):
    """Save records to file, then unlock and close file.

    The file is unlocked and closed even if writing fails; afterwards
    self.statefile is None and the object can no longer be used.

    """
    assert self.statefile

    try:
      self.statefile.seek(0)
      self.statefile.truncate()

      for name in self.inst_map:
        self.statefile.write("%s:%d:%d\n" % ((name,) + self.inst_map[name]))

      # Flush while the lock is still held; otherwise the buffered records
      # would only reach the file after the unlock, and another process
      # could lock and read a truncated file in between.
      self.statefile.flush()
    finally:
      try:
        fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
      finally:
        self.statefile.close()
        self.statefile = None
191 a8083063 Iustin Pop
192 a8083063 Iustin Pop
193 a8083063 Iustin Pop
class Instance(object):
  """A virtual machine instance as handled by the watcher.

  Attributes:
    name: the instance name given to the constructor
    state: the instance state given to the constructor

  Methods:
    Restart(): run 'gnt-instance startup' for the represented machine.
  """
  def __init__(self, name, state):
    self.state = state
    self.name = name

  def Restart(self):
    """Start the instance via DoCmd, which raises if the command fails."""
    startup_cmd = ['gnt-instance', 'startup', '--lock-retries=15']
    startup_cmd.append(self.name)
    DoCmd(startup_cmd)
205 a8083063 Iustin Pop
206 a8083063 Iustin Pop
207 a8083063 Iustin Pop
class InstanceList(object):
  """The virtual machine instances of a cluster that the watcher cares about.

  The list is built once, in the constructor, from the output of the
  'gnt-instance list' command stored in the cmd attribute.  Iterating
  over the object yields Instance objects.

  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    """Run the list command and keep one Instance per relevant row."""
    output = DoCmd(self.cmd).stdout

    self.instances = []
    for row in output.splitlines():
      columns = [col.strip() for col in row.split(':')]

      # Skip rows that do not have exactly three fields, and instances
      # whose admin_state is "no" (no autostart, we don't care about them).
      if len(columns) != 3 or columns[1] == "no":
        continue

      self.instances.append(Instance(columns[0], columns[2]))

  def __iter__(self):
    """Iterate over the collected Instance objects."""
    return iter(self.instances)
233 a8083063 Iustin Pop
234 a8083063 Iustin Pop
235 a8083063 Iustin Pop
class Message(object):
  """A notice or error message, stamped with its creation time.

  Attributes:
    level: the level label given to the constructor
    msg: the message text given to the constructor
    when: the time.time() value taken when the message was created

  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.msg = msg
    self.level = level

  def __str__(self):
    """Return the level and creation date, then the indented text."""
    stamp = time.ctime(self.when)
    headline = ' '.join((self.level, stamp))
    return headline + '\n' + Indent(self.msg)
246 a8083063 Iustin Pop
247 a8083063 Iustin Pop
248 a8083063 Iustin Pop
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that this host is the master node and list the instances.

    Raises:
      NotMasterError: if the master node read from the simple store differs
        from the name returned by socket.gethostname().

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != socket.gethostname():
      raise NotMasterError("This is not the master node")
    self.instances = InstanceList()
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.

    Messages about the actions taken are collected in self.messages.  The
    restart state is saved even if the pass is aborted by an exception, so
    that attempts already made in this pass still count towards MAXTRIES.

    """
    notepad = RestarterState()

    try:
      for instance in self.instances:
        if instance.state in BAD_STATES:
          n = notepad.NumberOfAttempts(instance)

          if n > MAXTRIES:
            # Giving up was already reported in an earlier pass; stay quiet.
            continue

          if n == MAXTRIES:
            # Push the count past MAXTRIES so that the give-up message is
            # reported only once.
            notepad.RecordAttempt(instance)
            self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                         " times, giving up..." %
                                         (instance.name, MAXTRIES)))
            continue

          self.messages.append(Message(NOTICE,
                                       "Restarting %s (Attempt #%d)." %
                                       (instance.name, n + 1)))
          try:
            instance.Restart()
          except Error:
            # A failed restart is reported; it is counted as an attempt below.
            self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

          notepad.RecordAttempt(instance)
        elif instance.state in HELPLESS_STATES:
          # Nothing can be done in this state; forget earlier attempts.
          if notepad.NumberOfAttempts(instance):
            notepad.Remove(instance)
        else:
          # Any other state with recorded attempts is reported as a
          # successful restart.
          if notepad.NumberOfAttempts(instance):
            notepad.Remove(instance)
            msg = Message(NOTICE,
                          "Restart of %s succeeded." % instance.name)
            self.messages.append(msg)
    finally:
      notepad.Save()

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
315 a8083063 Iustin Pop
316 a8083063 Iustin Pop
317 a8083063 Iustin Pop
def ParseOptions():
  """Build the option parser and parse the command line.

  The only option is -d/--debug, a flag stored as options.debug
  (False by default).

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug",
                    action="store_true", dest="debug", default=False,
                    help="Don't redirect messages to the log file")
  return parser.parse_args()
334 a8083063 Iustin Pop
335 a8083063 Iustin Pop
336 a8083063 Iustin Pop
def main():
  """Entry point: parse the options, run one restarter pass and report.

  Unless the debug option is given, stdout and stderr are redirected to
  LOGFILE (opened for appending).  When this host is not the master the
  program exits with constants.EXIT_NOTMASTER; any other Error is printed.

  """
  options, _ = ParseOptions()

  if not options.debug:
    log = open(LOGFILE, 'a')
    sys.stderr = log
    sys.stdout = log

  try:
    restarter = Restarter()
    restarter.Run()
    restarter.WriteReport(sys.stdout)
  except NotMasterError:
    # Only mention it in debug mode, so the log file is not filled with
    # this message.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except Error:
    sys.stdout.write(str(sys.exc_info()[1]) + "\n")
355 a8083063 Iustin Pop
356 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()