Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 89e1fc26

History | View | Annotate | Download (9.4 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erroneously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27
"""
28

    
29

    
30
# File that stdout and stderr are appended to unless --debug is given.
LOGFILE = "/var/log/ganeti/watcher.log"
# Number of restart attempts made for an instance before giving up on it.
MAXTRIES = 5
# Instance states for which a restart is attempted.
BAD_STATES = ["stopped"]
# Instance states for which no restart is attempted; any restart record
# kept for such an instance is dropped without a message.
HELPLESS_STATES = ["(node down)"]
# Severity labels put in front of the messages written to the report.
NOTICE = "NOTICE"
ERROR = "ERROR"
36

    
37
import os
38
import sys
39
import time
40
import fcntl
41
import errno
42
from optparse import OptionParser
43

    
44

    
45
from ganeti import utils
46
from ganeti import constants
47
from ganeti import ssconf
48
from ganeti import errors
49

    
50

    
51
class Error(Exception):
  """Base class for the errors raised by this program itself."""
53

    
54

    
55
class NotMasterError(Error):
  """Raised when the watcher runs on a node that is not the master."""
57

    
58

    
59
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a piece of text.

  Args:
    s: the text to indent; it is split on line boundaries
    prefix: the string placed at the start of each line

  Returns:
    The prefixed lines joined by newlines and terminated by a newline.
    An empty text yields the bare prefix followed by a newline.

  """
  separator = '\n' + prefix
  body = separator.join(s.splitlines())
  return prefix + body + "\n"
68

    
69

    
70
def DoCmd(cmd):
  """Run a command and insist that it succeeds.

  Args:
    cmd: the command to run, handed unchanged to utils.RunCmd

  Returns:
    The result object produced by utils.RunCmd.

  Raises:
    Error: if the result is flagged as failed; the message carries the
      command, the failure reason and the captured stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
89

    
90

    
91
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  Creating an instance opens the file named by constants.WATCHER_STATEFILE
  (creating it if needed), takes an exclusive, non-blocking lock on it and
  loads its records.  Each record is one line of the form "name:when:count".

  Methods:
    NumberOfAttempts(instance): returns the number of restart attempts
                                currently recorded for the instance.

    RecordAttempt(instance): records one more restart attempt for the
                             instance, stamped with the current time.

    Remove(instance): removes the record of the instance, if it exists.

    Save(): writes all records to the file, releases the lock and closes
            the file.  The object must not be used afterwards.

  """
  def __init__(self):
    """Open, lock and read the state file.

    Raises:
      StandardError: if the file is already locked by someone else.
      IOError: if locking fails for any other reason.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # The exception is fetched through sys.exc_info() so that the handler
      # does not depend on one particular "except" binding syntax.
      err = sys.exc_info()[1]
      try:
        if err.errno == errno.EAGAIN:
          raise StandardError("State file already locked")
        raise
      finally:
        # Do not leave the file open behind a failed lock attempt.
        f.close()

    self.statefile = f
    self.inst_map = {}
    self._Load()

  def _Load(self):
    """Fill inst_map from the "name:when:count" lines of the state file.

    Lines that do not have exactly three fields, or whose time or count is
    not an integer, are skipped, so that a damaged file cannot keep the
    watcher from running.

    """
    for line in self.statefile:
      fields = line.rstrip().split(':')
      if len(fields) != 3:
        continue

      name, when, count = fields
      try:
        self.inst_map[name] = (int(when), int(count))
      except ValueError:
        continue

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 if the instance has no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    The record of the instance gets the current time and a count one higher
    than the one recorded so far.

    Args:
      instance - the instance being restarted

    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    Nothing happens if the instance has no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      del self.inst_map[instance.name]

  def Save(self):
    """Save records to file, then unlock and close file.

    The file is rewritten from scratch with one "name:when:count" line per
    record.

    """
    assert self.statefile

    self.statefile.seek(0)
    self.statefile.truncate()

    for name in self.inst_map:
      self.statefile.write("%s:%d:%d\n" % ((name,) + self.inst_map[name]))

    # The records must reach the file while the lock is still held;
    # otherwise whoever locks the file next could read it before the
    # buffered data has been written out.
    self.statefile.flush()

    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None
191

    
192

    
193
class Instance(object):
  """A single virtual machine instance.

  Attributes:
    name: the instance name
    state: the state string given to the constructor

  Methods:
    Restart(): issue a command to start the represented machine.

  """
  def __init__(self, name, state):
    self.name, self.state = name, state

  def Restart(self):
    """Start the instance through the gnt-instance command line tool.

    The command line interface is used here rather than the Ganeti
    modules.  DoCmd raises Error if the command fails.

    """
    command = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(command)
212

    
213

    
214
class InstanceList(object):
  """The virtual machine instances of the cluster that are worth watching.

  The list is built from the output of "gnt-instance list".  Iterating
  over the object yields one Instance per listed instance, carrying the
  name and oper_state fields.

  """
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    """Run the list command and parse its colon-separated output."""
    output = DoCmd(self.cmd).stdout

    self.instances = []
    for row in output.splitlines():
      columns = [col.strip() for col in row.split(':')]

      # Anything that is not exactly name:admin_state:oper_state is ignored.
      if len(columns) != 3:
        continue

      name, admin_state, oper_state = columns
      # no autostart, we don't care about this instance
      if admin_state == "no":
        continue

      self.instances.append(Instance(name, oper_state))

  def __iter__(self):
    return iter(self.instances)
240

    
241

    
242
class Message(object):
  """A notice or error message, stamped with the time it was created.

  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    """Return the level and creation time, then the indented message text."""
    parts = [self.level, ' ', time.ctime(self.when), '\n', Indent(self.msg)]
    return ''.join(parts)
253

    
254

    
255
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  Run() walks over the list of instances and makes up to MAXTRIES attempts
  to restart machines that are down.  The messages produced along the way
  are collected and can be written out with WriteReport().

  """
  def __init__(self):
    """Check that this node is the master and fetch the instance list.

    Raises:
      NotMasterError: if the master node name from the simple store differs
        from the name of this host.

    """
    master = ssconf.SimpleStore().GetMasterNode()
    local_name = utils.HostInfo().name
    if master != local_name:
      raise NotMasterError("This is not the master node")
    self.instances = InstanceList()
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.

    Attempts are tracked in the restart state file, which is saved at the
    end of the pass.

    """
    notepad = RestarterState()

    for instance in self.instances:
      if instance.state in BAD_STATES:
        tries = notepad.NumberOfAttempts(instance)

        if tries > MAXTRIES:
          # Giving up has been reported before; stay quiet.
          continue

        if tries == MAXTRIES:
          # Bump the count past MAXTRIES so this is reported only once.
          notepad.RecordAttempt(instance)
          self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                       " times, giving up..." %
                                       (instance.name, MAXTRIES)))
          continue

        attempt = " (Attempt #%d)" % (tries + 1)
        try:
          self.messages.append(Message(NOTICE,
                                       "Restarting %s%s." %
                                       (instance.name, attempt)))
          instance.Restart()
        except Error:
          # A failed restart is reported and still counted as an attempt.
          self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

        notepad.RecordAttempt(instance)
      elif notepad.NumberOfAttempts(instance):
        # The instance is no longer in a restartable state: forget the
        # earlier attempts, and report success unless nothing can be said
        # about its state.
        notepad.Remove(instance)
        if instance.state not in HELPLESS_STATES:
          self.messages.append(Message(NOTICE,
                                       "Restart of %s succeeded." %
                                       instance.name))

    notepad.Save()

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write("%s\n" % str(msg))
322

    
323

    
324
def ParseOptions():
  """Parse the command line options.

  The only option is -d/--debug, a flag that defaults to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug",
                    dest="debug",
                    action="store_true",
                    default=False,
                    help="Don't redirect messages to the log file")
  return parser.parse_args()
341

    
342

    
343
def main():
  """Main function.

  Unless --debug is given, stdout and stderr are redirected to LOGFILE
  (opened for appending).  Then one restart pass is run and its report is
  written to stdout.

  Exits with constants.EXIT_NOTMASTER when not run on the master node and
  with constants.EXIT_NODESETUP_ERROR when a hostname cannot be resolved.
  Any other Error is printed and the function returns normally.

  """
  options, _ = ParseOptions()

  if not options.debug:
    logfile = open(LOGFILE, 'a')
    sys.stderr = logfile
    sys.stdout = logfile

  try:
    restarter = Restarter()
    restarter.Run()
    restarter.WriteReport(sys.stdout)
  except NotMasterError:
    # Not being the master is only worth mentioning in debug mode.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    err = sys.exc_info()[1]
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error:
    sys.stdout.write("%s\n" % (sys.exc_info()[1],))
365

    
366
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()