Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ eee1fa2d

History | View | Annotate | Download (12.7 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erronously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import simplejson
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import ssconf
42
from ganeti import errors
43

    
44

    
45
# Number of restart attempts made for a downed instance before giving up.
MAXTRIES = 5

# Instance states that trigger a restart attempt.
BAD_STATES = ['stopped']

# Instance states in which the watcher only drops stale restart records.
HELPLESS_STATES = ['(node down)']

# Severity labels used for report messages.
NOTICE = 'NOTICE'
ERROR = 'ERROR'

# Keys used inside the per-instance and per-node state records.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
53

    
54

    
55
class Error(Exception):
  """Base class for the errors raised by the watcher itself."""
57

    
58

    
59
class NotMasterError(Error):
  """Raised when the watcher runs on a host that is not the master node."""
61

    
62

    
63
def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a text.

  Args:
    s: The text to indent
    prefix: The string placed before each line

  Returns:
    The prefixed lines joined by newlines, always followed by one final
    newline.  An empty text yields just the prefix and a newline.

  """
  separator = '\n' + prefix
  body = separator.join(s.splitlines())
  return "%s%s\n" % (prefix, body)
72

    
73

    
74
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist on its success.

  Args:
    cmd: the command to run, passed unchanged to utils.RunCmd.

  Returns:
    The result object returned by utils.RunCmd.

  Raises:
    Error: if the result is flagged as failed.  The message contains the
      command, the failure reason and the indented stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
93

    
94

    
95
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a JSON dictionary with two sections: "instance" maps
  instance names to their restart bookkeeping and "node" maps node names
  to their last recorded boot ID.  The file is opened and exclusively
  locked by the constructor and stays locked until Save() is called or
  the object is destroyed.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    An empty, unparsable or non-dictionary state file is treated as empty
    state; a note about it is written to sys.stderr.

    Raises StandardError on lock contention.

    """
    # Defined before anything can fail, so that __del__ always finds it.
    self.statefile = None

    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() is used instead of binding the exception in the
      # except clause so the code parses on old and new interpreters alike.
      contended = (sys.exc_info()[1].errno == errno.EAGAIN)
      # Do not leak the file object of a file we could not lock.
      f.close()
      if contended:
        raise StandardError("State file already locked")
      raise

    self.statefile = f

    try:
      data = simplejson.load(self.statefile)
      if not isinstance(data, dict):
        raise ValueError("state is not a dictionary")
    except Exception:
      # Ignore errors while loading the file and treat it as empty
      data = {}
      sys.stderr.write("Empty or invalid state file."
                       " Using defaults. Error message: %s\n" %
                       sys.exc_info()[1])

    self.data = data
    self.data.setdefault("instance", {})
    self.data.setdefault("node", {})

  def __del__(self):
    """Called on destruction; releases the file if it is still open.

    """
    # getattr: the attribute is missing if __init__ never got to run.
    if getattr(self, "statefile", None):
      self._Close()

  def _Close(self):
    """Flush, unlock and close the state file.

    """
    assert self.statefile

    statefile = self.statefile
    self.statefile = None
    try:
      # Buffered data must reach the file while the lock is still held;
      # otherwise another process could lock and read the file before
      # close() has written our state.
      statefile.flush()
      fcntl.flock(statefile.fileno(), fcntl.LOCK_UN)
    finally:
      statefile.close()

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    """
    return self.data["node"].get(name, {}).get(KEY_BOOT_ID)

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to record; must not be empty

    """
    assert bootid

    self.data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 if there is no record for the instance or the record holds
    no restart count.

    """
    record = self.data["instance"].get(instance.name, {})
    return record.get(KEY_RESTART_COUNT, 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart count of the
    instance, creating its record if necessary.

    Args:
      instance - the instance being restarted

    """
    record = self.data["instance"].setdefault(instance.name, {})

    record[KEY_RESTART_WHEN] = time.time()
    record[KEY_RESTART_COUNT] = record.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; an instance
    without a record is silently ignored.

    """
    self.data["instance"].pop(instance.name, None)

  def Save(self):
    """Save state to file, then unlock and close it.

    The object cannot be saved again afterwards.

    """
    assert self.statefile

    # Replace the previous content instead of appending to it.
    self.statefile.seek(0)
    self.statefile.truncate()

    simplejson.dump(self.data, self.statefile)

    self._Close()
231

    
232

    
233
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name, handed to the gnt-instance commands
    state: the state string of the instance
    autostart: flag telling whether the watcher should care for it

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Start the instance via gnt-instance startup.

    Errors raised by DoCmd are passed on to the caller.

    """
    command = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(command)

  def ActivateDisks(self):
    """Activate all disks of the instance via gnt-instance activate-disks.

    Errors raised by DoCmd are passed on to the caller.

    """
    command = ['gnt-instance', 'activate-disks', '--lock-retries=15',
               self.name]
    DoCmd(command)
256

    
257

    
258
def _RunListCmd(cmd):
  """Run a command and yield each line of its output split into fields.

  This is a generator: the command is only run once iteration starts.
  Every line of the command's stdout is split at ':' into a list.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
264

    
265

    
266
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional collection of node names.  If given, only
      instances having at least one of these nodes among their secondary
      nodes are returned.

  Returns:
    A list of Instance objects; their autostart flag is False for
    instances whose admin_state column reads "no".

  """
  filter_by_node = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filter_by_node:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  instances = []
  for row in _RunListCmd(cmd):
    if filter_by_node:
      (name, status, autostart, snodes) = row

      # "-" in the snodes column is skipped as "no secondary nodes".
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      matched = False
      for node in with_secondaries:
        if node in secondaries:
          matched = True
          break
      if not matched:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart != "no"))

  return instances
301

    
302

    
303
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  The data is taken from the name and bootid columns of gnt-node list.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  return dict([(name, bootid) for (name, bootid) in _RunListCmd(cmd)])
316

    
317

    
318
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label of the message
    msg: the message text
    when: creation time as returned by time.time()

  """
  def __init__(self, level, msg):
    self.level, self.msg = level, msg
    self.when = time.time()

  def __str__(self):
    """Format as a "<level> <time>" header line plus the indented text.

    """
    header = self.level + ' ' + time.ctime(self.when)
    return header + '\n' + Indent(self.msg)
329

    
330

    
331
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  Everything worth reporting is collected in self.messages and can be
  written out with WriteReport().

  """
  def __init__(self):
    """Check that we run on the master and fetch instance and node data.

    Raises NotMasterError if this host is not the master node.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.messages = []
    self.started_instances = set()

  def Run(self):
    """Run all checks and save the state file afterwards.

    The state is saved even if one of the checks raises, so that restart
    attempts recorded up to that point are not forgotten (otherwise a
    permanently failing check would defeat the MAXTRIES limit).

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of the autostart instances using it as a secondary node are activated
    and the new boot ID is recorded.

    Args:
      notepad: the WatcherState holding the recorded boot IDs

    """
    check_nodes = []
    for name, bootid in self.bootids.items():
      if not bootid:
        # An empty boot ID cannot be recorded (SetNodeBootID insists on a
        # non-empty value) and tells nothing about a reboot; skip the node.
        continue
      if notepad.GetNodeBootID(name) != bootid:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=check_nodes):
      if not instance.autostart:
        self.messages.append(Message(NOTICE,
                                     ("Skipping disk activation for"
                                      " non-autostart instance '%s'." %
                                      instance.name)))
        continue
      if instance.name in self.started_instances:
        # we already tried to start the instance, which should have
        # activated its drives (if they can be at all)
        continue
      self.messages.append(Message(NOTICE, ("Activating disks for %s." %
                                            instance.name)))
      try:
        instance.ActivateDisks()
      except Error:
        # sys.exc_info() keeps the handler parseable on old and new
        # interpreters alike.
        self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad: the WatcherState used for the restart bookkeeping

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue

        if n == MAXTRIES:
          # Recording once more pushes the count past MAXTRIES, so this
          # message is only reported a single time.
          notepad.RecordRestartAttempt(instance)
          self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                       " times, giving up..." %
                                       (instance.name, MAXTRIES)))
          continue

        last = " (Attempt #%d)" % (n + 1)
        self.messages.append(Message(NOTICE, ("Restarting %s%s." %
                                              (instance.name, last))))
        try:
          instance.Restart()
          self.started_instances.add(instance.name)
        except Error:
          self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

        # The attempt counts whether or not the command succeeded.
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        # Nothing can be done here; just forget earlier attempts.
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          msg = Message(NOTICE, "Restart of %s succeeded." % instance.name)
          self.messages.append(msg)

  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    Any output of the command is added to the report as a notice.

    """
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
    if result.output:
      self.messages.append(Message(NOTICE, result.output))

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write("%s\n" % str(msg))
450

    
451

    
452
def ParseOptions():
  """Parse the command line options.

  The only option is -d/--debug, which keeps messages from being
  redirected to the log file.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)
  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Don't redirect messages to the log file")
  return parser.parse_args()
469

    
470

    
471
def main():
  """Main function.

  Unless the debug option is given, stdout and stderr are redirected to
  the watcher log file.  Exits successfully if there is no cluster
  configuration, with constants.EXIT_NOTMASTER when not run on the master
  node and with constants.EXIT_NODESETUP_ERROR if the hostname cannot be
  resolved.  Other watcher errors are printed.

  """
  options, _ = ParseOptions()

  if not options.debug:
    sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
    try:
      watcher.Run()
    finally:
      # Report what has been done even if the run was cut short by an
      # error; the collected messages would be lost otherwise.
      watcher.WriteReport(sys.stdout)
  except NotMasterError:
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    # sys.exc_info() keeps the handlers parseable on old and new
    # interpreters alike.
    err = sys.exc_info()[1]
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error:
    sys.stdout.write("%s\n" % sys.exc_info()[1])
497

    
498

    
499
# Run the watcher only when executed as a script, not when imported.
if __name__ == '__main__':
  main()