Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 7c18ef8e

History | View | Annotate | Download (9.2 kB)

1 a8083063 Iustin Pop
#!/usr/bin/python
2 a8083063 Iustin Pop
#
3 a8083063 Iustin Pop
4 a8083063 Iustin Pop
# Copyright (C) 2006, 2007 Google Inc.
5 a8083063 Iustin Pop
#
6 a8083063 Iustin Pop
# This program is free software; you can redistribute it and/or modify
7 a8083063 Iustin Pop
# it under the terms of the GNU General Public License as published by
8 a8083063 Iustin Pop
# the Free Software Foundation; either version 2 of the License, or
9 a8083063 Iustin Pop
# (at your option) any later version.
10 a8083063 Iustin Pop
#
11 a8083063 Iustin Pop
# This program is distributed in the hope that it will be useful, but
12 a8083063 Iustin Pop
# WITHOUT ANY WARRANTY; without even the implied warranty of
13 a8083063 Iustin Pop
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 a8083063 Iustin Pop
# General Public License for more details.
15 a8083063 Iustin Pop
#
16 a8083063 Iustin Pop
# You should have received a copy of the GNU General Public License
17 a8083063 Iustin Pop
# along with this program; if not, write to the Free Software
18 a8083063 Iustin Pop
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 a8083063 Iustin Pop
# 02110-1301, USA.
20 a8083063 Iustin Pop
21 a8083063 Iustin Pop
22 a8083063 Iustin Pop
"""Tool to restart erroneously downed virtual machines.
23 a8083063 Iustin Pop
24 a8083063 Iustin Pop
This program and set of classes implement a watchdog to restart
25 a8083063 Iustin Pop
virtual machines in a Ganeti cluster that have crashed or been killed
26 a8083063 Iustin Pop
by a node reboot.  Run from cron or similar.
27 a8083063 Iustin Pop
"""
28 a8083063 Iustin Pop
29 a8083063 Iustin Pop
30 a8083063 Iustin Pop
# File that receives the watcher's output unless it runs in debug mode.
LOGFILE = "/var/log/ganeti/watcher.log"
# Number of restart attempts made for an instance before giving up on it.
MAXTRIES = 5
# Operational states that trigger a restart attempt.
BAD_STATES = ["stopped"]
# Operational states for which only the restart record is cleared.
HELPLESS_STATES = ["(node down)"]
# Severity labels used when building Message objects.
NOTICE = "NOTICE"
ERROR = "ERROR"
36 a8083063 Iustin Pop
37 a8083063 Iustin Pop
import os
38 a8083063 Iustin Pop
import sys
39 a8083063 Iustin Pop
import time
40 a8083063 Iustin Pop
import fcntl
41 a8083063 Iustin Pop
import errno
42 38242904 Iustin Pop
import socket
43 a8083063 Iustin Pop
from optparse import OptionParser
44 a8083063 Iustin Pop
45 a8083063 Iustin Pop
46 a8083063 Iustin Pop
from ganeti import utils
47 a8083063 Iustin Pop
from ganeti import constants
48 38242904 Iustin Pop
from ganeti import ssconf
49 a8083063 Iustin Pop
50 a8083063 Iustin Pop
51 a8083063 Iustin Pop
class Error(Exception):
  """Base class for the errors raised by the watcher itself."""
  pass
53 38242904 Iustin Pop
54 38242904 Iustin Pop
55 38242904 Iustin Pop
class NotMasterError(Error):
  """Signals that the watcher was started on a node other than the master."""
  pass
57 a8083063 Iustin Pop
58 a8083063 Iustin Pop
59 a8083063 Iustin Pop
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate the result with a newline.

  Args:
    s: The text to indent
    prefix: The string put in front of each line

  Returns:
    The indented text, always ending in a newline; an empty text yields
    the prefix followed by a newline.

  """
  line_break = '\n' + prefix
  indented = line_break.join(s.splitlines())
  return prefix + indented + '\n'
68 a8083063 Iustin Pop
69 a8083063 Iustin Pop
70 a8083063 Iustin Pop
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist that it succeeds.

  Args:
    cmd: the command to run.

  Returns:
    The result object from utils.RunCmd when the command did not fail.

  Raises Error with the failure reason, stdout and stderr of the command
  when it failed.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
89 a8083063 Iustin Pop
90 a8083063 Iustin Pop
91 a8083063 Iustin Pop
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  The state file (constants.WATCHER_STATEFILE) holds one record per line,
  formatted as "name:when:count".  It is locked exclusively when the object
  is created and stays locked until Save() is called.

  Methods:
    __init__(): open, lock, read and parse the file.
                Raises StandardError on lock contention.

    NumberOfAttempts(instance): returns the number of times in succession
                                a restart of the instance has been attempted.

    RecordAttempt(instance): records one restart attempt of the instance
                             at the current time.

    Remove(instance): remove the record of the instance, if it exists.

    Save(): saves all records to file, releases lock and closes file.

  """
  def __init__(self):
    """Open, lock and parse the state file.

    Raises:
      StandardError: if another process already holds the lock
      IOError: if locking fails for any other reason

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() is used instead of binding the exception in the
      # except clause so that this parses on every Python version.
      err = sys.exc_info()[1]
      # Do not leak the descriptor when the lock cannot be obtained.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError("State file already locked")
      raise

    self.statefile = f
    self.inst_map = {}

    for line in f:
      line = line.rstrip()
      if not line:
        # Tolerate blank lines instead of failing on the unpacking below.
        continue
      name, when, count = line.split(':')
      self.inst_map[name] = (int(when), int(count))

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 if the instance has no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    The attempt is stamped with the current time and the attempt count of
    the instance is increased by one.

    Args:
      instance - the instance being restarted

    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; an instance
    without a record is ignored.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      del self.inst_map[instance.name]

  def Save(self):
    """Save records to file, then unlock and close file.

    The file is closed (which also drops the lock) even if writing the
    records fails; the object can not be used afterwards.

    """
    assert self.statefile

    f = self.statefile
    try:
      f.seek(0)
      f.truncate()

      for name in self.inst_map:
        f.write("%s:%d:%d\n" % ((name,) + self.inst_map[name]))

      # The records must reach the file before the lock is released,
      # otherwise another process could lock and read a truncated file
      # while the data still sits in our buffer.
      f.flush()
      fcntl.flock(f.fileno(), fcntl.LOCK_UN)
    finally:
      f.close()
      self.statefile = None
191 a8083063 Iustin Pop
192 a8083063 Iustin Pop
193 a8083063 Iustin Pop
class Instance(object):
  """A single virtual machine instance, known by name and state.

  Methods:
    Restart(): issue a command to start the represented machine.

  """
  def __init__(self, name, state):
    """Remember the name and the state of the instance.

    Args:
      name: the instance name
      state: the state string of the instance

    """
    self.name, self.state = name, state

  def Restart(self):
    """Start the instance.

    The start is requested by running the gnt-instance command line tool
    through DoCmd rather than by using the Ganeti modules.

    """
    startup_cmd = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(startup_cmd)
212 a8083063 Iustin Pop
213 a8083063 Iustin Pop
214 a8083063 Iustin Pop
class InstanceList(object):
  """Iterable collection of the instances reported by gnt-instance list.

  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    """Run the list command and build Instance objects from its output.

    Lines that do not split into exactly three fields are skipped, and so
    are instances whose admin state is "no" (no autostart requested).

    """
    output = DoCmd(self.cmd).stdout

    self.instances = []
    for row in output.splitlines():
      columns = [col.strip() for col in row.split(':')]
      if len(columns) != 3:
        continue

      name, admin_state, oper_state = columns
      if admin_state == "no":
        # no autostart, we don't care about this instance
        continue

      self.instances.append(Instance(name, oper_state))

  def __iter__(self):
    """Iterate over the collected Instance objects."""
    return iter(self.instances)
240 a8083063 Iustin Pop
241 a8083063 Iustin Pop
242 a8083063 Iustin Pop
class Message(object):
  """A notice or error text together with its level and creation time.

  """
  def __init__(self, level, msg):
    """Store the level and text, stamping the message with the current time.

    Args:
      level: the level label of the message
      msg: the message text

    """
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    """Format as a "level date" header line followed by the indented text."""
    stamp = time.ctime(self.when)
    header = self.level + ' ' + stamp + '\n'
    return header + Indent(self.msg)
253 a8083063 Iustin Pop
254 a8083063 Iustin Pop
255 a8083063 Iustin Pop
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that this host is the master and collect the instance list.

    Raises:
      NotMasterError: if the master node name differs from the local
        host name

    """
    master_name = ssconf.SimpleStore().GetMasterNode()
    if socket.gethostname() != master_name:
      raise NotMasterError("This is not the master node")
    self.instances = InstanceList()
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.

    Restart attempts are tracked in a RestarterState, which is saved once
    all instances have been examined.

    """
    state = RestarterState()

    for inst in self.instances:
      if inst.state not in BAD_STATES:
        # Nothing to restart: forget earlier attempts, and report success
        # unless the instance is in a state we can do nothing about.
        if state.NumberOfAttempts(inst):
          state.Remove(inst)
          if inst.state not in HELPLESS_STATES:
            self.messages.append(
              Message(NOTICE, "Restart of %s succeeded." % inst.name))
        continue

      tries = state.NumberOfAttempts(inst)

      if tries > MAXTRIES:
        # stay quiet.
        continue

      if tries == MAXTRIES:
        # Recording one more attempt makes later runs take the quiet path.
        state.RecordAttempt(inst)
        text = ("Could not restart %s for %d times, giving up..." %
                (inst.name, MAXTRIES))
        self.messages.append(Message(ERROR, text))
        continue

      attempt_note = " (Attempt #%d)" % (tries + 1)
      try:
        self.messages.append(Message(NOTICE, "Restarting %s%s." %
                                     (inst.name, attempt_note)))
        inst.Restart()
      except Error:
        # sys.exc_info() keeps this parseable on every Python version.
        self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

      state.RecordAttempt(inst)

    state.Save()

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for entry in self.messages:
      logfile.write(str(entry) + "\n")
322 a8083063 Iustin Pop
323 a8083063 Iustin Pop
324 a8083063 Iustin Pop
def ParseOptions():
  """Build the option parser and parse the command line.

  The only option defined is -d/--debug, a flag that defaults to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Don't redirect messages to the log file")
  return parser.parse_args()
341 a8083063 Iustin Pop
342 a8083063 Iustin Pop
343 a8083063 Iustin Pop
def main():
  """Run one watcher pass and write its report.

  Unless the debug option is given, stdout and stderr are redirected to
  LOGFILE (opened for appending).  On a non-master node the program exits
  with constants.EXIT_NOTMASTER; any other watcher Error is printed to
  stdout.

  """
  options, _ = ParseOptions()
  debug = options.debug

  if not debug:
    logstream = open(LOGFILE, 'a')
    sys.stdout = logstream
    sys.stderr = logstream

  try:
    watcher = Restarter()
    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except NotMasterError:
    if debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except Error:
    # sys.exc_info() keeps this parseable on every Python version.
    sys.stdout.write("%s\n" % (sys.exc_info()[1],))
362 a8083063 Iustin Pop
363 a8083063 Iustin Pop
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()