Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 781b2b2b

History | View | Annotate | Download (11.9 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erroneously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import simplejson
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import ssconf
42
from ganeti import errors
43

    
44

    
45
# Upper bound on the number of restart attempts made for one instance.
MAXTRIES = 5
# Operational states for which the watcher tries to restart an instance.
BAD_STATES = ["stopped"]
# Operational states for which no restart is attempted; the restart
# bookkeeping of such an instance is simply dropped.
HELPLESS_STATES = ["(node down)"]
# Severity labels used for the messages collected during a run.
NOTICE = "NOTICE"
ERROR = "ERROR"
50

    
51

    
52
class Error(Exception):
  """Base class for the errors raised by this program itself.

  """
54

    
55

    
56
class NotMasterError(Error):
  """Raised when the watcher is started on a node that is not the master.

  """
58

    
59

    
60
def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate the result with a newline.

  Args:
    s: The string to indent
    prefix: The string to put in front of each line

  Returns:
    The indented text.  An empty input yields the bare prefix followed by a
    newline.

  """
  lines = s.splitlines()
  separator = '\n' + prefix
  return prefix + separator.join(lines) + '\n'
69

    
70

    
71
def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist on its success.

  Args:
    cmd: the command to run.

  Returns:
    The result object handed back by utils.RunCmd.

  Raises Error with verbose commentary (failure reason, stdout and stderr
  of the command) when the command failed.

  """
  result = utils.RunCmd(cmd)

  if not result.failed:
    return result

  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" % details)
90

    
91

    
92
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a JSON document with two top-level keys: "instance", mapping
  instance names to their restart bookkeeping ("restart_count" and
  "restart_when"), and "node", mapping node names to a dict holding the
  last recorded "bootid".  The file is locked exclusively by the
  constructor and unlocked again by Save() or on destruction.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    An empty, unreadable or malformed state file is not fatal: a note is
    written to stderr and an empty state is used instead.

    Raises StandardError on lock contention.

    """
    # Assigned first so that __del__ always finds the attribute, even when
    # opening or locking the file fails below.
    self.statefile = None

    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    f = os.fdopen(fd, 'w+')

    try:
      fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
      # sys.exc_info() is used rather than binding the exception in the
      # except clause because it reads the same on every Python version.
      err = sys.exc_info()[1]
      # Do not keep a file open that we failed to lock.
      f.close()
      if err.errno == errno.EAGAIN:
        raise StandardError("State file already locked")
      raise

    self.statefile = f

    try:
      data = simplejson.load(f)
      if not isinstance(data, dict):
        # Valid JSON, but not the mapping the accessors below rely on.
        raise ValueError("top-level value is not an object")
    except Exception:
      # Ignore errors while loading the file and treat it as empty
      data = {}
      sys.stderr.write("Empty or invalid state file. "
                       "Using defaults. Error message: %s\n" %
                       (sys.exc_info()[1], ))

    data.setdefault("instance", {})
    data.setdefault("node", {})
    self.data = data

  def __del__(self):
    """Called on destruction; releases the file if it is still held.

    """
    # getattr() guards against objects whose __init__ never ran.
    if getattr(self, "statefile", None):
      self._Close()

  def _Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)

    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    Args:
      name - the name of the node to look up

    """
    return self.data["node"].get(name, {}).get("bootid")

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    Args:
      name - the name of the node
      bootid - the boot ID to remember, must not be empty

    """
    assert bootid

    self.data["node"].setdefault(name, {})["bootid"] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    Returns 0 for instances without a record.

    """
    record = self.data["instance"].get(instance.name, {})
    return record.get("restart_count", 0)

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Args:
      instance - the instance being restarted

    The record of the instance is created if needed; its counter is
    incremented and the time of the attempt is stored.

    """
    record = self.data["instance"].setdefault(instance.name, {})

    record["restart_when"] = time.time()
    # The counter lives in the per-instance record.  Reading it from the
    # dict of all instances would always yield the default and pin the
    # count at one, so the retry limit would never be reached.
    record["restart_count"] = record.get("restart_count", 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance; instances without
    a record are silently ignored.

    """
    self.data["instance"].pop(instance.name, None)

  def Save(self):
    """Save state to file, then unlock and close it.

    The object must not be used for further saving afterwards.

    """
    assert self.statefile

    # Replace the previous contents instead of appending to them.
    self.statefile.seek(0)
    self.statefile.truncate()

    simplejson.dump(self.data, self.statefile)

    self._Close()
228

    
229

    
230
class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Attributes:
    name: the name of the instance
    state: the state reported for the instance
    autostart: whether the instance should be restarted when found down

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the machine's disks.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def _RunInstanceCmd(self, action):
    """Runs a gnt-instance sub-command on this instance.

    """
    DoCmd(['gnt-instance', action, '--lock-retries=15', self.name])

  def Restart(self):
    """Starts the instance via "gnt-instance startup".

    """
    self._RunInstanceCmd('startup')

  def ActivateDisks(self):
    """Activates all disks of the instance via "gnt-instance activate-disks".

    """
    self._RunInstanceCmd('activate-disks')
253

    
254

    
255
def _RunListCmd(cmd):
  """Runs a command and yields one list of fields per line of its output.

  The fields of a line are obtained by splitting it at every colon.  This
  is a generator, so the command is only run once iteration starts.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
261

    
262

    
263
def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  Args:
    with_secondaries: optional collection of node names; when given, only
      instances having at least one of these nodes among their secondary
      nodes are returned

  Returns:
    A list of Instance objects.  An instance is flagged for automatic
    restart unless its admin state is reported as "no".

  """
  filter_by_secondary = with_secondaries is not None

  columns = ['name', 'oper_state', 'admin_state']
  if filter_by_secondary:
    columns.append('snodes')

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', ','.join(columns)]

  result = []
  for row in _RunListCmd(cmd):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # NOTE(review): "-" presumably stands for "no secondary nodes";
      # such instances are never selected.
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      wanted = [node for node in with_secondaries if node in secondaries]
      if not wanted:
        continue
    else:
      (name, status, autostart) = row

    result.append(Instance(name, status, autostart != "no"))

  return result
298

    
299

    
300
def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  The data is taken from the output of "gnt-node list", restricted to the
  name and bootid fields.

  """
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', 'name,bootid']

  bootids = {}
  for name, bootid in _RunListCmd(cmd):
    bootids[name] = bootid

  return bootids
313

    
314

    
315
class Message(object):
  """Encapsulation of a notice or error message.

  Attributes:
    level: the severity label of the message
    msg: the text of the message
    when: the creation time of the message, as returned by time.time()

  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    """Formats a header line (level and timestamp) plus the indented text.

    """
    header = self.level + ' ' + time.ctime(self.when)
    return header + '\n' + Indent(self.msg)
326

    
327

    
328
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Collects the instance list and the node boot IDs.

    Raises NotMasterError when not run on the master node.

    """
    master = ssconf.SimpleStore().GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.messages = []

  def Run(self):
    """Performs the instance and disk checks and saves the updated state.

    """
    notepad = WatcherState()
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    Args:
      notepad: the WatcherState remembering the boot IDs seen last time

    """
    # A boot ID differing from the recorded one means the node has probably
    # been rebooted since the last run.
    rebooted = [node for node in self.bootids
                if notepad.GetNodeBootID(node) != self.bootids[node]]

    if not rebooted:
      return

    # Activate disks for all instances with any of the checked nodes as a
    # secondary node.
    for instance in GetInstanceList(with_secondaries=rebooted):
      try:
        notice = "Activating disks for %s." % instance.name
        self.messages.append(Message(NOTICE, notice))
        instance.ActivateDisks()
      except Error:
        self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

    # Keep changed boot IDs
    for node in rebooted:
      notepad.SetNodeBootID(node, self.bootids[node])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    Args:
      notepad: the WatcherState holding the restart bookkeeping

    """
    for instance in self.instances:
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

      if instance.state not in BAD_STATES:
        # Nothing to restart; forget earlier attempts and, unless the
        # instance is in a helpless state, report the recovery.
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          if instance.state not in HELPLESS_STATES:
            self.messages.append(Message(NOTICE,
                                         "Restart of %s succeeded." %
                                         instance.name))
        continue

      attempts = notepad.NumberOfRestartAttempts(instance)

      if attempts > MAXTRIES:
        # Already gave up on this one earlier, stay quiet.
        continue

      if attempts < MAXTRIES:
        attempt_note = " (Attempt #%d)" % (attempts + 1)
        try:
          self.messages.append(Message(NOTICE,
                                       "Restarting %s%s." %
                                       (instance.name, attempt_note)))
          instance.Restart()
        except Error:
          self.messages.append(Message(ERROR, str(sys.exc_info()[1])))

        notepad.RecordRestartAttempt(instance)
      else:
        # Limit reached: push the counter past MAXTRIES so the following
        # runs stay quiet, and report the failure once.
        notepad.RecordRestartAttempt(instance)
        self.messages.append(Message(ERROR,
                                     "Could not restart %s for %d times,"
                                     " giving up..." %
                                     (instance.name, MAXTRIES)))

  def WriteReport(self, logfile):
    """Log all messages to file.

    Args:
      logfile: file object open for writing (the log file)

    """
    for msg in self.messages:
      logfile.write(str(msg) + "\n")
429

    
430

    
431
def ParseOptions():
  """Parse the command line options.

  The only option known is -d/--debug, which keeps the messages on the
  standard streams instead of sending them to the log file.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug",
                    action="store_true", dest="debug", default=False,
                    help="Don't redirect messages to the log file")

  return parser.parse_args()
448

    
449

    
450
def main():
  """Main function.

  Runs one watcher pass and writes its report.  Unless debugging was
  requested, stdout and stderr are redirected to the watcher log file
  first.  The process exits early when there is no cluster configuration,
  when this node is not the master or when the hostname cannot be
  resolved; errors of the watcher itself are printed.

  """
  options, _unused_args = ParseOptions()

  if not options.debug:
    logfile = open(constants.LOG_WATCHER, 'a')
    sys.stderr = logfile
    sys.stdout = logfile

  try:
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
    watcher.WriteReport(sys.stdout)
  except NotMasterError:
    # Only worth mentioning when somebody is watching.
    if options.debug:
      sys.stderr.write("Not master, exiting.\n")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    hostname = sys.exc_info()[1].args[0]
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % hostname)
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Error:
    sys.stdout.write("%s\n" % (sys.exc_info()[1], ))
476

    
477

    
478
# Only run the watcher when executed as a program, not when imported.
if __name__ == "__main__":
  main()