Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ f4bc1f2c

History | View | Annotate | Download (11.9 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erroneously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import simplejson
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import ssconf
42
from ganeti import errors
43

    
44

    
45
# Number of restart attempts after which the watcher gives up on an instance.
MAXTRIES = 5

# Operational states for which a restart is attempted.
BAD_STATES = ["stopped"]

# Operational states in which no restart is attempted; the watcher only
# drops any restart bookkeeping it holds for the instance.
HELPLESS_STATES = ["(node down)"]

# Severity labels attached to report messages.
NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys used inside the per-instance and per-node records of the state file.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
53

    
54

    
55
class Error(Exception):
  """Base class for the exceptions defined by this script.

  Raised directly when an external command fails.

  """
  pass
57

    
58

    
59
class NotMasterError(Error):
  """Signals that the watcher was started on a node that is not the master.

  """
  pass
61

    
62

    
63
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate the result with a newline.

  Args:
    s: The text to indent; it is split on line boundaries.
    prefix: The string put in front of each line.

  Returns:
    The indented text.  An empty input yields just the prefix followed
    by a newline.

  """
  line_break = '\n' + prefix
  indented = line_break.join(s.splitlines())
  return "%s%s\n" % (prefix, indented)
72

    
73

    
74
def DoCmd(cmd):
  """Run an external command through utils.RunCmd.

  Args:
    cmd: the command to run.

  Returns:
    The result object returned by utils.RunCmd when the command succeeded.

  Raises:
    Error: if the command failed; the message contains the command, the
      failure reason and the indented stdout/stderr of the command.

  """
  result = utils.RunCmd(cmd)

  # Success is the common case, get it out of the way first.
  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
93

    
94

    
95
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state file is opened and exclusively locked (flock) by the
  constructor and stays locked until Save() is called or the object is
  destroyed.

  The parsed state lives in self.data, a dict with two sub-dicts:
    "instance": per-instance restart bookkeeping, keyed by instance name
    "node": per-node records (last seen boot ID), keyed by node name

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    A missing, empty or unparsable state file is treated as empty state; a
    note about it is written to stderr.

    Raises StandardError on lock contention.

    """
    # Set the attribute before anything can fail: __del__ also runs for
    # objects whose constructor raised.
    self.statefile = None

    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      err = sys.exc_info()[1]
      # Don't hold on to a file we could not lock.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError("State file already locked")
      raise

    self.statefile = f

    try:
      data = simplejson.load(self.statefile)
      if not isinstance(data, dict):
        # Valid JSON, but not something we can keep our records in.
        raise ValueError("state file does not contain a JSON object")
    except Exception:
      msg = sys.exc_info()[1]
      # Ignore errors while loading the file and treat it as empty
      data = {}
      sys.stderr.write("Empty or invalid state file."
                       " Using defaults. Error message: %s\n" % msg)

    self.data = data
    self.data.setdefault("instance", {})
    self.data.setdefault("node", {})

  def __del__(self):
    """Called on destruction; releases the lock if it is still held.

    """
    # getattr: the attribute may be missing on a half-constructed object.
    if getattr(self, "statefile", None):
      self._Close()

  def _Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    f = self.statefile
    self.statefile = None

    try:
      # Flush while the lock is still held, otherwise another process could
      # lock the file and read it before the buffered data hits the file.
      f.flush()
      fcntl.flock(f.fileno(), fcntl.LOCK_UN)
    finally:
      f.close()

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the node name to look up.

    """
    ndata = self.data["node"]

    if name in ndata:
      return ndata[name].get(KEY_BOOT_ID)
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the node name
      bootid - the boot ID to remember; must not be empty

    """
    assert bootid

    self.data["node"].setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 when there is no record, or the record holds no count.

    """
    idata = self.data["instance"]

    if instance.name in idata:
      # A record without a count is read as "no attempts", the same way
      # RecordRestartAttempt treats it.
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the attempt counter.

    Args:
      instance - the instance being restarted

    """
    inst = self.data["instance"].setdefault(instance.name, {})

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; it does nothing
    if there is no such record.

    """
    self.data["instance"].pop(instance.name, None)

  def Save(self):
    """Save state to file, then unlock and close it.

    The object must not be used for saving again afterwards.

    """
    assert self.statefile

    # Overwrite the previous contents from the start of the file.
    self.statefile.seek(0)
    self.statefile.truncate()

    simplejson.dump(self.data, self.statefile)

    self._Close()
231

    
232

    
233
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the instance name
    state: the state string of the instance
    autostart: false for instances the watcher must leave alone

  Methods:
    Restart(): issue a command to start the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    (self.name, self.state, self.autostart) = (name, state, autostart)

  def _RunInstanceCmd(self, action):
    """Runs "gnt-instance <action>" for this instance.

    """
    DoCmd(['gnt-instance', action, '--lock-retries=15', self.name])

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    self._RunInstanceCmd('startup')

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    self._RunInstanceCmd('activate-disks')
256

    
257

    
258
def _RunListCmd(cmd):
  """Runs a command and yields each output line split on ':'.

  This is a generator: the command is only run once iteration starts.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
264

    
265

    
266
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: None to return all instances, otherwise a list of
      node names; only instances having at least one of these nodes among
      their secondary nodes are returned then.

  Returns:
    A list of Instance objects.

  """
  filter_by_node = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filter_by_node:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  result = []
  for row in _RunListCmd(cmd):
    if filter_by_node:
      (name, status, autostart, snodes) = row

      # "-" means the instance has no secondary nodes at all
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      wanted = False
      for node in with_secondaries:
        if node in secondaries:
          wanted = True
          break
      if not wanted:
        continue
    else:
      (name, status, autostart) = row

    result.append(Instance(name, status, autostart != "no"))

  return result
301

    
302

    
303
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  The data comes from the "name" and "bootid" columns of "gnt-node list".

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  # Unpacking into exactly two names keeps rejecting malformed lines.
  return dict([(name, bootid) for (name, bootid) in _RunListCmd(cmd)])
316

    
317

    
318
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label of the message
    msg: the message text
    when: creation time, as returned by time.time()

  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    """Formats as a "<level> <time>" header line plus the indented text.

    """
    header = self.level + ' ' + time.ctime(self.when)
    return header + '\n' + Indent(self.msg)
329

    
330

    
331
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  Attributes:
    instances: list of Instance objects, fetched at construction time
    bootids: dict mapping node names to boot IDs, fetched at construction
    messages: list of Message objects collected while running

  """
  def __init__(self):
    """Checks that we run on the master and fetches the cluster data.

    Raises NotMasterError if this host is not the master node.

    """
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.messages = []

  def Run(self):
    """Runs all checks and saves the watcher state.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
    finally:
      # Always persist the bookkeeping: restart attempts that were already
      # made must count against MAXTRIES even if a later check fails.
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose boot ID differs from the recorded one, the disks
    of all instances using that node as a secondary are activated and the
    new boot ID is recorded.

    """
    check_nodes = []
    for name, new_id in self.bootids.items():
      if not new_id:
        # No boot ID reported for this node; there is nothing to compare
        # and SetNodeBootID() refuses empty values.
        continue
      if notepad.GetNodeBootID(name) != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if not check_nodes:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=check_nodes):
      try:
        self.messages.append(Message(NOTICE, ("Activating disks for %s." %
                                              instance.name)))
        instance.ActivateDisks()
      except Error:
        self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    # Keep changed boot IDs
    for name in check_nodes:
      notepad.SetNodeBootID(name, self.bootids[name])

  def _RestartInstance(self, instance, notepad):
    """Tries to restart one downed instance, honouring the MAXTRIES limit.

    """
    n = notepad.NumberOfRestartAttempts(instance)

    if n > MAXTRIES:
      # We already gave up on this one, stay quiet.
      return

    if n == MAXTRIES:
      # Recording one more attempt pushes the counter past MAXTRIES, so the
      # "giving up" message is only emitted once.
      notepad.RecordRestartAttempt(instance)
      self.messages.append(Message(ERROR, "Could not restart %s for %d"
                                   " times, giving up..." %
                                   (instance.name, MAXTRIES)))
      return

    last = " (Attempt #%d)" % (n + 1)
    try:
      self.messages.append(Message(NOTICE, ("Restarting %s%s." %
                                            (instance.name, last))))
      instance.Restart()
    except Error:
      self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    # The attempt counts whether or not the command succeeded.
    notepad.RecordRestartAttempt(instance)

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state in BAD_STATES:
        self._RestartInstance(instance, notepad)
      elif notepad.NumberOfRestartAttempts(instance):
        # Not in a restartable state anymore: forget the bookkeeping, and
        # report success unless there was simply nothing we could do.
        notepad.RemoveInstance(instance)
        if instance.state not in HELPLESS_STATES:
          msg = Message(NOTICE, "Restart of %s succeeded." % instance.name)
          self.messages.append(msg)

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write("%s\n" % str(msg))
429

    
430

    
431
def ParseOptions():
  """Parse the command line options.

  The only option defined is -d/--debug, which keeps messages on the
  terminal instead of redirecting them to the log file.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION

  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)
  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Don't redirect messages to the log file")

  return parser.parse_args()
448

    
449

    
450
def main():
  """Main function.

  Unless --debug is given, stdout and stderr are redirected to the watcher
  log file.  The watcher is then created and run, and its report written.

  Exit paths:
    - no cluster configuration: exits with EXIT_SUCCESS
    - not the master node: exits with EXIT_NOTMASTER
    - own hostname not resolvable: exits with EXIT_NODESETUP_ERROR
    - any other Error: the message is printed and the function returns

  """
  options, _ = ParseOptions()

  if not options.debug:
    logfile = open(constants.LOG_WATCHER, 'a')
    sys.stdout = logfile
    sys.stderr = logfile

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except NotMasterError:
    # Only worth a message when somebody is watching.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    hostname = sys.exc_info()[1].args[0]
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % hostname)
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error:
    sys.stdout.write("%s\n" % (sys.exc_info()[1],))
476

    
477

    
478
# Run the watcher only when executed as a script, not when imported.
if __name__ == "__main__":
  main()