Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 7b195d9b

History | View | Annotate | Download (12 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erroneously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import simplejson
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import ssconf
42
from ganeti import errors
43

    
44

    
45
# Number of restart attempts made for an instance before giving up on it.
MAXTRIES = 5
# Operational states for which a restart is attempted.
BAD_STATES = ["stopped"]
# Operational states in which no restart is attempted.
HELPLESS_STATES = ["(node down)"]
# Severity labels attached to report messages.
NOTICE = "NOTICE"
ERROR = "ERROR"
# Keys used inside the persisted watcher state.
KEY_RESTART_COUNT = 'restart_count'
KEY_RESTART_WHEN = 'restart_when'
KEY_BOOT_ID = 'bootid'
53

    
54

    
55
class Error(Exception):
  """Base class for the errors raised by this program.

  """
  pass
57

    
58

    
59
class NotMasterError(Error):
  """Raised when the watcher is started on a host that is not the master.

  """
  pass
61

    
62

    
63
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate each with a newline.

  Args:
    s: The string to indent
    prefix: The string to put in front of each line.

  Returns:
    The indented text.  An empty input still yields one line consisting of
    the prefix alone.

  """
  # splitlines() returns [] for an empty string; keep one (empty) line so
  # that the result is never shorter than "<prefix>\n".
  lines = s.splitlines() or ['']
  return ''.join([prefix + line + '\n' for line in lines])
72

    
73

    
74
def DoCmd(cmd):
  """Run a command and return its result, raising on failure.

  Args:
    cmd: the command to run; passed unchanged to utils.RunCmd.

  Returns:
    The result object returned by utils.RunCmd.

  Raises:
    Error: if the result is flagged as failed.  The message contains the
      command, the failure reason and the indented stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
93

    
94

    
95
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is kept as a JSON object with two sections: "instance" (restart
  bookkeeping, keyed by instance name) and "node" (last recorded boot ID,
  keyed by node name).  The file is locked exclusively in the constructor
  and stays locked until Save() is called or the object is destroyed.

  """
  # Class-level default: keeps __del__ safe when __init__ raised before the
  # file could be opened and locked.
  statefile = None

  def __init__(self):
    """Open, lock, read and parse the file.

    An empty, unreadable or malformed state file is not an error: a note is
    written to stderr and an empty state is used instead.

    Raises StandardError on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError as err:
      contended = (err.errno == errno.EAGAIN)
      # Do not leave the file open when the lock could not be taken.
      f.close()
      if contended:
        raise StandardError("State file already locked")
      raise

    self.statefile = f

    try:
      data = simplejson.load(f)
      if not isinstance(data, dict):
        # Valid JSON, but not the object layout the accessors rely on.
        raise ValueError("top-level value is not a JSON object")
    except Exception as msg:
      # Ignore errors while loading the file and treat it as empty
      data = {}
      sys.stderr.write("Empty or invalid state file. "
                       "Using defaults. Error message: %s\n" % msg)

    data.setdefault("instance", {})
    data.setdefault("node", {})
    self.data = data

  def __del__(self):
    """Called on destruction; releases the file if it is still held.

    """
    if self.statefile:
      self._Close()

  def _Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up.

    """
    ndata = self.data["node"]

    if name not in ndata:
      return None
    return ndata[name].get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to record; must not be empty

    """
    assert bootid

    self.data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 for an unknown instance and for a record without a counter.

    """
    idata = self.data["instance"]

    if instance.name not in idata:
      return 0
    return idata[instance.name].get(KEY_RESTART_COUNT, 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter.

    Args:
      instance - the instance being restarted

    """
    inst = self.data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; an instance
    without a record is silently ignored.

    """
    self.data["instance"].pop(instance.name, None)

  def Save(self):
    """Save state to file, then unlock and close it.

    """
    assert self.statefile

    # Overwrite the previous contents in place; the lock is still held.
    self.statefile.seek(0)
    self.statefile.truncate()

    simplejson.dump(self.data, self.statefile)

    self._Close()
231

    
232

    
233
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, as passed to the gnt-instance commands
    state: the state string given at construction
    autostart: the autostart flag given at construction

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Start the instance via "gnt-instance startup".

    """
    cmd = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(cmd)

  def ActivateDisks(self):
    """Activate all disks of the instance via "gnt-instance activate-disks".

    """
    cmd = ['gnt-instance', 'activate-disks', '--lock-retries=15', self.name]
    DoCmd(cmd)
256

    
257

    
258
def _RunListCmd(cmd):
  """Run a command and yield each output line split into fields.

  This is a generator: the command is only run once iteration starts.
  Each line of the command's stdout is split on ':' and yielded as a list.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
264

    
265

    
266
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional collection of node names.  When given, only
      instances having at least one of these nodes among their secondary
      nodes are returned; instances without secondaries are skipped.

  Returns:
    A list of Instance objects; autostart is set unless the admin_state
    column reads "no".

  """
  want_secondaries = with_secondaries is not None

  columns = ['name', 'oper_state', 'admin_state']
  if want_secondaries:
    columns.append('snodes')

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', ','.join(columns)]

  result = []
  for row in _RunListCmd(cmd):
    if want_secondaries:
      (name, status, autostart, snodes) = row

      # "-" is what the list command prints for "no secondary nodes"
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      if not [node for node in with_secondaries if node in secondaries]:
        continue
    else:
      (name, status, autostart) = row

    result.append(Instance(name, status, autostart != "no"))

  return result
301

    
302

    
303
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  Runs "gnt-node list" restricted to the name and bootid columns and turns
  every output row into one dictionary entry.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  return dict([(name, bootid) for (name, bootid) in _RunListCmd(cmd)])
316

    
317

    
318
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label given at construction
    msg: the message text
    when: creation time, as returned by time.time()

  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    """Format as a "<level> <date>" header followed by the indented text.

    """
    header = ' '.join([self.level, time.ctime(self.when)])
    return header + '\n' + Indent(self.msg)
329

    
330

    
331
class Watcher(object):
  """Encapsulate the logic for restarting wrongly halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Verify that we run on the master and load instance and node lists.

    Raises NotMasterError if this host is not the master node.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.messages = []

  def Run(self):
    """Do one watcher pass and save the updated state.

    """
    notepad = WatcherState()
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks of
    all instances using that node as a secondary are activated and the new
    boot ID is recorded.

    Args:
      notepad: the WatcherState holding the recorded boot IDs

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if not new_id:
        # No boot ID reported for this node.  There is nothing to compare,
        # and recording an empty ID would trip the assertion in
        # SetNodeBootID and abort the run before the state is saved.
        continue

      if notepad.GetNodeBootID(name) != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=check_nodes):
      self.messages.append(Message(NOTICE,
                                   "Activating disks for %s." %
                                   instance.name))
      try:
        instance.ActivateDisks()
      except Error as err:
        self.messages.append(Message(ERROR, str(err)))

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad: the WatcherState holding the restart bookkeeping

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        self._HandleDownInstance(instance, notepad)
      elif notepad.NumberOfRestartAttempts(instance):
        # The instance is no longer in a restartable state: forget the
        # earlier attempts.  Success is only reported when the instance is
        # not in one of the helpless states.
        notepad.RemoveInstance(instance)
        if instance.state not in HELPLESS_STATES:
          self.messages.append(Message(NOTICE,
                                       "Restart of %s succeeded." %
                                       instance.name))

  def _HandleDownInstance(self, instance, notepad):
    """Try to restart one down instance, honouring the MAXTRIES limit.

    """
    attempts = notepad.NumberOfRestartAttempts(instance)

    if attempts > MAXTRIES:
      # Giving up was already reported; stay quiet.
      return

    if attempts == MAXTRIES:
      # Bump the counter past MAXTRIES so that this is reported only once.
      notepad.RecordRestartAttempt(instance)
      self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                   " times, giving up..." %
                                   (instance.name, MAXTRIES)))
      return

    last = " (Attempt #%d)" % (attempts + 1)
    self.messages.append(Message(NOTICE,
                                 "Restarting %s%s." %
                                 (instance.name, last)))
    try:
      instance.Restart()
    except Error as err:
      self.messages.append(Message(ERROR, str(err)))

    # Count the attempt whether or not the restart command succeeded.
    notepad.RecordRestartAttempt(instance)

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write("%s\n" % (msg,))
432

    
433

    
434
def ParseOptions():
  """Parse the command line options.

  The only option defined here is -d/--debug.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Don't redirect messages to the log file")

  return parser.parse_args()
451

    
452

    
453
def main():
  """Main function.

  Parses the options, redirects stdout and stderr to the watcher log file
  unless in debug mode, then runs one watcher pass and writes its report.

  """
  options, _ = ParseOptions()

  if not options.debug:
    logfile = open(constants.LOG_WATCHER, 'a')
    sys.stderr = logfile
    sys.stdout = logfile

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except NotMasterError:
    # Only mention it in debug mode, to keep the log file quiet.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError as err:
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error as err:
    sys.stdout.write("%s\n" % (err,))
    
480

    
481
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()