Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 098c0958

History | View | Annotate | Download (9.1 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erroneously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27
"""
28

    
29

    
30
# File that receives all output unless the watcher runs with --debug.
LOGFILE = "/var/log/ganeti/watcher.log"
# Number of restart attempts made for an instance before giving up.
MAXTRIES = 5
# Reported instance states that trigger a restart attempt.
BAD_STATES = ["stopped"]
# Reported instance states for which no restart is attempted.
HELPLESS_STATES = ["(node down)"]
# Severity labels used for the messages written to the report.
NOTICE = "NOTICE"
ERROR = "ERROR"
36

    
37
import os
38
import sys
39
import time
40
import fcntl
41
import errno
42
import socket
43
from optparse import OptionParser
44

    
45

    
46
from ganeti import utils
47
from ganeti import constants
48
from ganeti import ssconf
49

    
50

    
51
class Error(Exception):
  """Base class for the exceptions defined in this module."""
  pass
53

    
54

    
55
class NotMasterError(Error):
  """Raised when the local host is not the cluster master node."""
  pass
57

    
58

    
59
def Indent(s, prefix='| '):
  """Return s with prefix put in front of every line.

  Args:
    s: The text to indent; it is split with splitlines().
    prefix: The string placed at the start of each line.

  Returns:
    The prefixed text, always terminated by exactly one newline.  An empty
    s yields the prefix followed by a newline.

  """
  line_break = '\n' + prefix
  body = line_break.join(s.splitlines())
  return prefix + body + "\n"
68

    
69

    
70
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist that it succeeds.

  Args:
    cmd: the command to run; passed unchanged to utils.RunCmd.

  Returns:
    The result object from utils.RunCmd when it is not flagged as failed.

  Raises:
    Error: when the result is flagged as failed.  The message contains the
      command, the failure reason and the indented stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
89

    
90

    
91
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  Creating an instance opens the state file, takes an exclusive,
  non-blocking lock on it and loads the records it contains.  The lock is
  held until Save() is called.

  Each record is one "name:when:count" line: the instance name, the time of
  the latest attempt in whole seconds, and the number of attempts so far.

  Methods:
    RestarterState(): open, lock, read and parse the file.
                      Raises StandardError on lock contention.

    NumberOfAttempts(instance): returns the number of times in succession
                                a restart has been attempted of the
                                given instance.

    RecordAttempt(instance): records one restart attempt of instance at
                             the current time.

    Remove(instance): remove the record of instance, if it exists.

    Save(): saves all records to file, releases lock and closes file.

  """
  def __init__(self):
    """Open and lock the state file, then load its records.

    Raises:
      StandardError: if the lock is already held elsewhere (EAGAIN).
      IOError: if locking fails for any other reason.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() is used instead of the "except X, e" form, which
      # newer interpreters reject.
      err = sys.exc_info()[1]
      # Do not leak the open file when the lock cannot be taken.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError('State file already locked')
      raise

    self.statefile = f
    self.inst_map = {}
    self._LoadRecords(f)

  def _LoadRecords(self, lines):
    """Fill inst_map from an iterable of "name:when:count" lines.

    Blank or malformed lines are skipped, so that a damaged state file only
    loses the affected counters instead of aborting every later run.

    Args:
      lines: iterable of text lines (e.g. the open state file)

    """
    for line in lines:
      fields = line.rstrip().split(':')
      if len(fields) != 3:
        continue

      name, when, count = fields
      try:
        self.inst_map[name] = (int(when), int(count))
      except ValueError:
        continue

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 when there is no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    The record's time is set to now and its count is incremented by one.

    Args:
      instance - the instance being restarted

    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    when no such record exists.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      del self.inst_map[instance.name]

  def Save(self):
    """Save records to file, then unlock and close file.

    After this call the object can no longer be used.

    """
    assert self.statefile

    self.statefile.seek(0)
    self.statefile.truncate()

    for name in self.inst_map:
      when, count = self.inst_map[name]
      self.statefile.write("%s:%d:%d\n" % (name, when, count))

    # The records must reach the file before the lock is given up;
    # otherwise the next lock holder could read a stale or empty file.
    self.statefile.flush()

    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None
191

    
192

    
193
class Instance(object):
  """A single virtual machine instance and its reported state.

  Attributes:
    name: the instance name
    state: the state string supplied at construction

  Methods:
    Restart(): issue a command to restart the represented machine.

  """
  def __init__(self, name, state):
    self.state = state
    self.name = name

  def Restart(self):
    """Run 'gnt-instance startup' for this instance through DoCmd."""
    startup_cmd = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(startup_cmd)
206

    
207

    
208
class InstanceList(object):
  """The set of Virtual Machine instances on a cluster.

  The list is built once, at construction, from the output of the
  'gnt-instance list' command held in cmd.  Iterating over the object
  yields Instance objects.

  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    """Run the list command and keep the instances worth watching."""
    output = DoCmd(self.cmd).stdout

    self.instances = []
    for row in output.splitlines():
      parts = [part.strip() for part in row.split(':')]

      # Anything that is not exactly name:admin_state:oper_state is ignored.
      if len(parts) != 3:
        continue

      name, admin_state, oper_state = parts
      # no autostart, we don't care about this instance
      if admin_state == "no":
        continue

      self.instances.append(Instance(name, oper_state))

  def __iter__(self):
    """Iterate over the collected Instance objects."""
    return iter(self.instances)
234

    
235

    
236
class Message(object):
  """A notice or error message stamped with its creation time.

  Attributes:
    level: the severity label given at construction
    msg: the message text
    when: time.time() at construction

  """
  def __init__(self, level, msg):
    self.level = level
    self.msg = msg
    self.when = time.time()

  def __str__(self):
    """Return the level and timestamp line followed by the indented text."""
    header = self.level + ' ' + time.ctime(self.when)
    return header + '\n' + Indent(self.msg)
247

    
248

    
249
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  Attributes:
    instances: the InstanceList fetched at construction
    messages: Message objects collected by Run(), in order

  """
  def __init__(self):
    """Check that this host is the master and fetch the instance list.

    Raises:
      NotMasterError: if the master node name from the simple store differs
        from socket.gethostname().

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != socket.gethostname():
      raise NotMasterError("This is not the master node")
    self.instances = InstanceList()
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.

    The restart records are saved (and the state file unlocked) even when
    an unexpected exception interrupts the pass, so that attempts already
    made are not forgotten.

    """
    notepad = RestarterState()

    try:
      for instance in self.instances:
        self._CheckInstance(notepad, instance)
    finally:
      notepad.Save()

  def _CheckInstance(self, notepad, instance):
    """Apply the restart policy to one instance, updating notepad.

    Args:
      notepad: the RestarterState holding the restart records
      instance: the Instance to examine

    """
    tries = notepad.NumberOfAttempts(instance)

    if instance.state not in BAD_STATES:
      # The instance is not down: forget earlier attempts.  Success is only
      # reported when the state is not one of the helpless ones.
      if tries:
        notepad.Remove(instance)
        if instance.state not in HELPLESS_STATES:
          self.messages.append(Message(NOTICE,
                                       "Restart of %s succeeded." %
                                       instance.name))
      return

    if tries > MAXTRIES:
      # Giving up has already been reported; stay quiet.
      return

    if tries == MAXTRIES:
      # Recording once more pushes the count past MAXTRIES, so that this
      # message is emitted only once.
      notepad.RecordAttempt(instance)
      self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                   " times, giving up..." %
                                   (instance.name, MAXTRIES)))
      return

    self.messages.append(Message(NOTICE,
                                 "Restarting %s (Attempt #%d)." %
                                 (instance.name, tries + 1)))
    try:
      instance.Restart()
    except Error:
      # sys.exc_info() is used instead of the "except X, e" form, which
      # newer interpreters reject.
      self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    # The attempt counts whether or not the restart command succeeded.
    notepad.RecordAttempt(instance)

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
316

    
317

    
318
def ParseOptions():
  """Build the option parser and parse the process command line.

  The only option is -d/--debug, a flag that defaults to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(usage="%prog [-d]",
                        description="Ganeti cluster watcher",
                        version=version_text)

  parser.add_option("-d", "--debug", action="store_true", default=False,
                    dest="debug",
                    help="Don't redirect messages to the log file")
  return parser.parse_args()
335

    
336

    
337
def main():
  """Main function.

  Unless -d/--debug is given, stdout and stderr are first redirected to
  LOGFILE (opened for appending).  A Restarter is then created and run, and
  its report is written to stdout.

  When this host is not the master, the process exits with
  constants.EXIT_NOTMASTER; in debug mode a note is written to stderr
  first.  Any other Error is printed to stdout.

  """
  options = ParseOptions()[0]

  if not options.debug:
    sys.stderr = sys.stdout = open(LOGFILE, 'a')

  try:
    restarter = Restarter()
    restarter.Run()
    restarter.WriteReport(sys.stdout)
  except NotMasterError:
    # Must come before the Error handler: NotMasterError derives from Error.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except Error:
    # sys.exc_info() is used instead of the "except X, e" form, which
    # newer interpreters reject.
    sys.stdout.write(str(sys.exc_info()[1]) + "\n")
356

    
357
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()