Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 28cc354f

History | View | Annotate | Download (8.7 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erroneously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27
"""
28

    
29

    
30
# File that main() appends all output to, unless run with --debug.
LOGFILE = "/var/log/ganeti/watcher.log"
# Number of restart attempts Restarter.Run() makes before giving up.
MAXTRIES = 5
# Instance states for which a restart is attempted.
BAD_STATES = ["stopped"]
# Instance states for which no restart is attempted; any attempt record
# for such an instance is dropped.
HELPLESS_STATES = ["(node down)"]
# Severity labels used for Message objects.
NOTICE = "NOTICE"
ERROR = "ERROR"
36

    
37
import os
38
import sys
39
import time
40
import fcntl
41
import errno
42
from optparse import OptionParser
43

    
44

    
45
from ganeti import utils
46
from ganeti import constants
47

    
48

    
49
class Error(Exception):
  """Exception type used by this script for its own failures.

  DoCmd() raises it when a command fails; Restarter.Run() and main()
  catch it.
  """
52

    
53

    
54
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a piece of text.

  Args:
    s: The string to indent
    prefix: The string to prepend each line.

  Returns:
    The prefixed text.  Every line, including the last, is terminated by
    a newline; an empty input yields one line holding only the prefix.
  """
  # An empty string splits into no lines at all, but the result must still
  # contain one (empty) prefixed line.
  rows = s.splitlines() or ['']
  return ''.join([prefix + row + '\n' for row in rows])
62

    
63

    
64
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist that it succeeds.

  Args:
    cmd: the command to run, passed unchanged to utils.RunCmd().

  Returns:
    The result object produced by utils.RunCmd().

  Raises:
    Error: if the result is flagged as failed.  The message contains the
      command, the failure reason and the indented stdout and stderr.
  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
82

    
83

    
84
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  Creating an instance opens the state file (creating it if needed), takes
  an exclusive non-blocking lock on it, and reads and parses its content.
  StandardError is raised on lock contention.

  Each line of the file has the form "name:when:count".

  Methods:
    NumberOfAttempts(instance): returns the number of times in succession
                                a restart has been attempted of the
                                instance.

    RecordAttempt(instance): records one restart attempt of the instance,
                             stamped with the current time.

    Remove(instance): remove the record of the instance, if it exists.

    Save(): saves all records to file, releases lock and closes file.
  """
  def __init__(self):
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() is used to fetch the exception so that this clause
      # does not depend on version-specific "except ..., x" / "as x" syntax.
      err = sys.exc_info()[1]
      # Do not leave the file open when we could not get the lock.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError('State file already locked')
      raise

    self.statefile = f
    # Maps instance name -> (time of last attempt, number of attempts).
    self.inst_map = {}

    for line in f:
      line = line.rstrip()
      if not line:
        # Tolerate blank lines instead of failing on the unpack below.
        continue

      name, when, count = line.split(':')
      self.inst_map[name] = (int(when), int(count))

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 if the instance has no record.
    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    The record is stamped with the current time and its attempt count is
    one more than the count recorded so far.

    Args:
      instance - the instance being restarted
    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    if there is no such record.
    """
    assert self.statefile

    if instance.name in self.inst_map:
      del self.inst_map[instance.name]

  def Save(self):
    """Save records to file, then unlock and close file.

    The object cannot be used any more afterwards.
    """
    assert self.statefile

    self.statefile.seek(0)
    self.statefile.truncate()

    for name in self.inst_map:
      # %d truncates the float timestamp stored by RecordAttempt().
      self.statefile.write("%s:%d:%d\n" % ((name,) + self.inst_map[name]))

    # The records written above may still sit in the file object's buffer.
    # Push them out while the lock is still held, so that another process
    # cannot lock and read the file before the data has reached it.
    self.statefile.flush()
    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None
179

    
180

    
181
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Holds the instance name and the state it was reported in.

  Methods:
    Restart(): issue a command to restart the represented machine.
  """
  def __init__(self, name, state):
    self.name, self.state = name, state

  def Restart(self):
    """Run "gnt-instance startup" for this instance through DoCmd()."""
    startup_cmd = ['gnt-instance', 'startup', '--lock-retries=15']
    startup_cmd.append(self.name)
    DoCmd(startup_cmd)
193

    
194

    
195
class InstanceList(object):
  """The set of Virtual Machine instances on a cluster.

  The list is obtained once, at construction time, by running the command
  in the "cmd" class attribute; iterating yields Instance objects.
  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    output = DoCmd(self.cmd).stdout

    self.instances = []
    for row in output.splitlines():
      columns = [col.strip() for col in row.split(':')]

      # Anything that is not exactly name:admin_state:oper_state is ignored.
      if len(columns) != 3:
        continue

      name, admin_state, oper_state = columns
      if admin_state == "no": #no autostart, we don't care about this instance
        continue

      self.instances.append(Instance(name, oper_state))

  def __iter__(self):
    return iter(self.instances)
220

    
221

    
222
class Message(object):
  """A notice or error message, stamped with its creation time.
  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    # First line: level and timestamp; the indented message text follows.
    header = self.level + ' ' + time.ctime(self.when)
    return '\n'.join([header, Indent(self.msg)])
232

    
233

    
234
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  The instance list is queried when the object is created.  Messages
  produced by Run() are collected in self.messages and can be written out
  with WriteReport().
  """
  def __init__(self):
    self.instances = InstanceList()
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.
    """
    notepad = RestarterState()

    try:
      for instance in self.instances:
        self._CheckInstance(notepad, instance)
    finally:
      # Save even if something unexpected escapes the loop: attempts already
      # recorded in this pass must not be lost, and Save() is also what
      # unlocks and closes the state file.
      notepad.Save()

  def _CheckInstance(self, notepad, instance):
    """Restart one instance if needed and update its attempt record.

    Args:
      notepad: the RestarterState holding the attempt records
      instance: the Instance to examine
    """
    attempts = notepad.NumberOfAttempts(instance)

    if instance.state in BAD_STATES:
      if attempts > MAXTRIES:
        # We gave up on this one in an earlier run; stay quiet.
        return

      if attempts == MAXTRIES:
        # Recording one more attempt pushes the count past MAXTRIES, so
        # this error is reported only once.
        notepad.RecordAttempt(instance)
        self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                     " times, giving up..." %
                                     (instance.name, MAXTRIES)))
        return

      self.messages.append(Message(NOTICE,
                                   "Restarting %s%s." %
                                   (instance.name,
                                    " (Attempt #%d)" % (attempts + 1))))
      try:
        instance.Restart()
      except Error:
        # sys.exc_info() is used to fetch the exception so that this clause
        # does not depend on version-specific "except ..., x" / "as x" syntax.
        self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

      # The attempt counts whether or not the restart command succeeded.
      notepad.RecordAttempt(instance)
    elif attempts:
      # The instance is no longer in a state we try to fix: forget it.
      notepad.Remove(instance)
      if instance.state not in HELPLESS_STATES:
        self.messages.append(Message(NOTICE,
                                     "Restart of %s succeeded." %
                                     instance.name))

  def WriteReport(self, logfile):
    """
    Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)
    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
295

    
296

    
297
def ParseOptions():
  """Parse the command line options.

  The only option defined is -d/--debug, a flag that defaults to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Don't redirect messages to the log file")
  return parser.parse_args()
314

    
315

    
316
def main():
  """Main function.

  Unless --debug was given, stdout and stderr are redirected to LOGFILE
  (opened for appending).  Then one Restarter pass is made and its report
  is written to stdout.  An Error raised on the way is printed instead of
  being propagated.
  """
  options, args = ParseOptions()

  if not options.debug:
    log = open(LOGFILE, 'a')
    sys.stderr = log
    sys.stdout = log

  try:
    watcher = Restarter()
    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except Error:
    # Same output as printing the exception: its text plus a newline.
    sys.stdout.write("%s\n" % (sys.exc_info()[1],))


if __name__ == '__main__':
  main()