Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 38242904

History | View | Annotate | Download (9.1 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.
"""
28

    
29

    
30
# File that receives stdout/stderr when the watcher runs without --debug.
LOGFILE = '/var/log/ganeti/watcher.log'
# Number of restart attempts made for an instance before giving up on it.
MAXTRIES = 5
# Operational states for which a restart is attempted.
BAD_STATES = ['stopped']
# Operational states in which no restart is attempted; any restart record
# kept for the instance is dropped without a message.
HELPLESS_STATES = ['(node down)']
# Severity labels put at the start of every report entry.
NOTICE = 'NOTICE'
ERROR = 'ERROR'
36

    
37
import os
38
import sys
39
import time
40
import fcntl
41
import errno
42
import socket
43
from optparse import OptionParser
44

    
45

    
46
from ganeti import utils
47
from ganeti import constants
48
from ganeti import ssconf
49

    
50

    
51
class Error(Exception):
  """Generic custom error class; base for the errors this script raises."""
53

    
54

    
55
class NotMasterError(Error):
  """Exception raised when this host is not the master."""
57

    
58

    
59
def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  Args:
    s: The string to indent
    prefix: The string to prepend each line.

  Returns:
    The text with prefix in front of every line, terminated by exactly
    one newline.  An empty string yields the prefix followed by a newline.

  """
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
68

    
69

    
70
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist that it succeeds.

  Args:
    cmd: the command to run, handed unchanged to utils.RunCmd.

  Returns:
    The result object returned by utils.RunCmd.

  Raises:
    Error: if the result is flagged as failed.  The message quotes the
      command together with the indented failure reason, stdout and stderr.

  """
  res = utils.RunCmd(cmd)

  if res.failed:
    raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                (repr(cmd),
                 Indent(res.fail_reason),
                 Indent(res.stdout),
                 Indent(res.stderr)))

  return res
89

    
90

    
91
class RestarterState(object):
  """Interface to a state file recording restart attempts.

  The file named by constants.WATCHER_STATEFILE holds one
  "name:timestamp:count" record per line.  The constructor opens the file,
  takes an exclusive lock on it and loads the records; the lock is kept
  until Save() is called.

  Methods:
    NumberOfAttempts(instance): returns the number of times in succession
                                a restart has been attempted of the instance.

    RecordAttempt(instance): records one more restart attempt of the
                             instance, stamped with the current time.

    Remove(instance): remove the record of the instance, if it exists.

    Save(): saves all records to file, releases lock and closes file.

  """
  def __init__(self):
    """Open, lock, read and parse the state file.

    Raises:
      StandardError: if the lock is already held (flock fails with EAGAIN).
      IOError: if locking fails for any other reason.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError as x:
      # Without the lock the file is of no use; do not leak it.
      f.close()
      if x.errno == errno.EAGAIN:
        raise StandardError('State file already locked')
      raise

    self.statefile = f
    self.inst_map = {}

    for line in f:
      record = line.rstrip()
      if not record:
        continue
      try:
        name, when, count = record.split(':')
        self.inst_map[name] = (int(when), int(count))
      except ValueError:
        # A damaged record must not stop every future run; the file is
        # rewritten from inst_map on Save(), which drops the bad line.
        sys.stderr.write("Ignoring malformed state file line: %r\n" % line)

  def NumberOfAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns:
      The recorded attempt count, or 0 if the instance has no record.

    """
    assert self.statefile

    if instance.name in self.inst_map:
      return self.inst_map[instance.name][1]

    return 0

  def RecordAttempt(self, instance):
    """Record a restart attempt.

    Args:
      instance - the instance being restarted

    The record's timestamp is set to the current time and its attempt
    count is raised by one.

    """
    assert self.statefile

    when = time.time()

    self.inst_map[instance.name] = (when, 1 + self.NumberOfAttempts(instance))

  def Remove(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    if there is no such record.

    """
    assert self.statefile

    self.inst_map.pop(instance.name, None)

  def Save(self):
    """Save records to file, then unlock and close file.

    After this call the object no longer holds the file and must not be
    used again.

    """
    assert self.statefile

    try:
      self.statefile.seek(0)
      self.statefile.truncate()

      for name in self.inst_map:
        when, count = self.inst_map[name]
        self.statefile.write("%s:%d:%d\n" % (name, when, count))

      # Push the records out while the lock is still held; otherwise another
      # process could lock and read the truncated file before close() flushes.
      self.statefile.flush()

      fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
    finally:
      self.statefile.close()
      self.statefile = None
191

    
192

    
193
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, as passed to gnt-instance.
    state: the state string the instance was created with.

  Methods:
    Restart(): issue a command to restart the represented machine.
  """
  def __init__(self, name, state):
    """Remember the name and state of the instance."""
    self.name = name
    self.state = state

  def Restart(self):
    """Start the instance via 'gnt-instance startup'.

    Raises whatever DoCmd raises when the command fails.
    """
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
205

    
206

    
207
class InstanceList(object):
  """The set of Virtual Machine instances on a cluster.

  Iterating over the object yields one Instance per listed instance
  whose admin_state field is not "no".

  """
  # Listing command; prints "name:admin_state:oper_state", one line per
  # instance, without a header line.
  cmd = ['gnt-instance', 'list', '--lock-retries=15',
         '-o', 'name,admin_state,oper_state', '--no-headers', '--separator=:']

  def __init__(self):
    """Run the listing command and parse its output.

    Raises whatever DoCmd raises when the command fails.
    """
    res = DoCmd(self.cmd)

    self.instances = []
    for line in res.stdout.splitlines():
      fields = [fld.strip() for fld in line.split(':')]

      # Anything that is not exactly three fields is not an instance line.
      if len(fields) != 3:
        continue
      name, admin_state, oper_state = fields
      if admin_state == "no": #no autostart, we don't care about this instance
        continue

      self.instances.append(Instance(name, oper_state))

  def __iter__(self):
    """Iterate over the collected Instance objects."""
    return iter(self.instances)
233

    
234

    
235
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: severity label placed at the start of the rendered message.
    msg: the message text.
    when: creation time, as returned by time.time().

  """
  def __init__(self, level, msg):
    """Store level and text and stamp the message with the current time."""
    self.level = level
    self.msg = msg
    self.when = time.time()

  def __str__(self):
    """Render as "<level> <ctime>" followed by the indented message text."""
    return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg)
246

    
247

    
248
class Restarter(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Check that we run on the master node and list the instances.

    Raises:
      NotMasterError: if the master node name from the simple store differs
        from socket.gethostname().

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != socket.gethostname():
      raise NotMasterError("This is not the master node")
    self.instances = InstanceList()
    self.messages = []

  def Run(self):
    """Make a pass over the list of instances, restarting downed ones.

    The restart bookkeeping is saved (and the state file lock released)
    even when the pass is cut short by an unexpected exception.

    """
    notepad = RestarterState()

    try:
      for instance in self.instances:
        if instance.state in BAD_STATES:
          self._HandleDownInstance(notepad, instance)
        elif instance.state in HELPLESS_STATES:
          # Nothing can be done for it now; forget earlier attempts quietly.
          if notepad.NumberOfAttempts(instance):
            notepad.Remove(instance)
        else:
          # Any other state with restart attempts on record is reported
          # as a successful restart.
          if notepad.NumberOfAttempts(instance):
            notepad.Remove(instance)
            msg = Message(NOTICE,
                          "Restart of %s succeeded." % instance.name)
            self.messages.append(msg)
    finally:
      notepad.Save()

  def _HandleDownInstance(self, notepad, instance):
    """Try to restart one down instance unless MAXTRIES is used up.

    Args:
      notepad: the RestarterState holding the attempt counters
      instance: the instance found in one of the BAD_STATES

    """
    n = notepad.NumberOfAttempts(instance)

    if n > MAXTRIES:
      # The give-up error has been reported already; stay quiet.
      return

    if n == MAXTRIES:
      # Push the counter past MAXTRIES so this error is reported only once.
      notepad.RecordAttempt(instance)
      self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                   " times, giving up..." %
                                   (instance.name, MAXTRIES)))
      return

    attempt = " (Attempt #%d)" % (n + 1)
    self.messages.append(Message(NOTICE,
                                 "Restarting %s%s." %
                                 (instance.name, attempt)))
    try:
      instance.Restart()
    except Error as x:
      self.messages.append(Message(ERROR, str(x)))

    # Failed or not, the attempt counts.
    notepad.RecordAttempt(instance)

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
315

    
316

    
317
def ParseOptions():
  """Parse the command line options.

  The only option defined is -d/--debug, which keeps messages on the
  terminal instead of redirecting them to the log file.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Don't redirect messages to the log file",
                    default=False, action="store_true")
  options, args = parser.parse_args()
  return options, args
334

    
335

    
336
def main():
  """Main function.

  Unless --debug is given, stdout and stderr are redirected to LOGFILE
  (opened for appending).  One Restarter pass is then run and its report
  written to stdout.  On a non-master node the program exits with
  constants.EXIT_NOTMASTER; any other Error is written to stdout.

  """
  options, args = ParseOptions()

  if not options.debug:
    sys.stderr = sys.stdout = open(LOGFILE, 'a')

  try:
    restarter = Restarter()
    restarter.Run()
    restarter.WriteReport(sys.stdout)
  except NotMasterError:
    # Expected on every non-master node; only mention it when debugging.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except Error as err:
    sys.stdout.write(str(err) + "\n")
355

    
356
# Run the watcher only when executed as a script, not when imported.
if __name__ == '__main__':
  main()