Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 7bca53e4

History | View | Annotate | Download (12.9 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erronously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import logging
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import serializer
42
from ganeti import ssconf
43
from ganeti import errors
44

    
45

    
46
MAXTRIES = 5
47
BAD_STATES = ['stopped']
48
HELPLESS_STATES = ['(node down)']
49
NOTICE = 'NOTICE'
50
ERROR = 'ERROR'
51
KEY_RESTART_COUNT = "restart_count"
52
KEY_RESTART_WHEN = "restart_when"
53
KEY_BOOT_ID = "bootid"
54

    
55

    
56
class NotMasterError(errors.GenericError):
57
  """Exception raised when this host is not the master."""
58

    
59

    
60
def Indent(s, prefix='| '):
61
  """Indent a piece of text with a given prefix before each line.
62

    
63
  Args:
64
    s: The string to indent
65
    prefix: The string to prepend each line.
66

    
67
  """
68
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
69

    
70

    
71
def DoCmd(cmd):
72
  """Run a shell command.
73

    
74
  Args:
75
    cmd: the command to run.
76

    
77
  Raises CommandError with verbose commentary on error.
78

    
79
  """
80
  res = utils.RunCmd(cmd)
81

    
82
  if res.failed:
83
    msg = ("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
84
           (repr(cmd),
85
            Indent(res.fail_reason),
86
            Indent(res.stdout),
87
            Indent(res.stderr)))
88
    raise errors.CommandError(msg)
89

    
90
  return res
91

    
92

    
93
def LockFile(fd):
94
  """Locks a file using POSIX locks.
95

    
96
  """
97
  try:
98
    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
99
  except IOError, err:
100
    if err.errno == errno.EAGAIN:
101
      raise errors.LockError("File already locked")
102
    raise
103

    
104

    
105
class WatcherState(object):
106
  """Interface to a state file recording restart attempts.
107

    
108
  """
109
  def __init__(self):
110
    """Open, lock, read and parse the file.
111

    
112
    Raises exception on lock contention.
113

    
114
    """
115
    # The two-step dance below is necessary to allow both opening existing
116
    # file read/write and creating if not existing.  Vanilla open will truncate
117
    # an existing file -or- allow creating if not existing.
118
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
119
    self.statefile = os.fdopen(fd, 'w+')
120

    
121
    LockFile(self.statefile.fileno())
122

    
123
    try:
124
      self._data = serializer.Load(self.statefile.read())
125
    except Exception, msg:
126
      # Ignore errors while loading the file and treat it as empty
127
      self._data = {}
128
      logging.warning(("Empty or invalid state file. Using defaults."
129
                       " Error message: %s"), msg)
130

    
131
    if "instance" not in self._data:
132
      self._data["instance"] = {}
133
    if "node" not in self._data:
134
      self._data["node"] = {}
135

    
136
    self._orig_data = self._data.copy()
137

    
138
  def Save(self):
139
    """Save state to file, then unlock and close it.
140

    
141
    """
142
    assert self.statefile
143

    
144
    if self._orig_data == self._data:
145
      logging.debug("Data didn't change, just touching status file")
146
      os.utime(constants.WATCHER_STATEFILE, None)
147
      return
148

    
149
    # We need to make sure the file is locked before renaming it, otherwise
150
    # starting ganeti-watcher again at the same time will create a conflict.
151
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
152
                         data=serializer.Dump(self._data),
153
                         prewrite=LockFile, close=False)
154
    self.statefile = os.fdopen(fd, 'w+')
155

    
156
  def Close(self):
157
    """Unlock configuration file and close it.
158

    
159
    """
160
    assert self.statefile
161

    
162
    # Files are automatically unlocked when closing them
163
    self.statefile.close()
164
    self.statefile = None
165

    
166
  def GetNodeBootID(self, name):
167
    """Returns the last boot ID of a node or None.
168

    
169
    """
170
    ndata = self._data["node"]
171

    
172
    if name in ndata and KEY_BOOT_ID in ndata[name]:
173
      return ndata[name][KEY_BOOT_ID]
174
    return None
175

    
176
  def SetNodeBootID(self, name, bootid):
177
    """Sets the boot ID of a node.
178

    
179
    """
180
    assert bootid
181

    
182
    ndata = self._data["node"]
183

    
184
    if name not in ndata:
185
      ndata[name] = {}
186

    
187
    ndata[name][KEY_BOOT_ID] = bootid
188

    
189
  def NumberOfRestartAttempts(self, instance):
190
    """Returns number of previous restart attempts.
191

    
192
    Args:
193
      instance - the instance to look up.
194

    
195
    """
196
    idata = self._data["instance"]
197

    
198
    if instance.name in idata:
199
      return idata[instance.name][KEY_RESTART_COUNT]
200

    
201
    return 0
202

    
203
  def RecordRestartAttempt(self, instance):
204
    """Record a restart attempt.
205

    
206
    Args:
207
      instance - the instance being restarted
208

    
209
    """
210
    idata = self._data["instance"]
211

    
212
    if instance.name not in idata:
213
      inst = idata[instance.name] = {}
214
    else:
215
      inst = idata[instance.name]
216

    
217
    inst[KEY_RESTART_WHEN] = time.time()
218
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
219

    
220
  def RemoveInstance(self, instance):
221
    """Update state to reflect that a machine is running, i.e. remove record.
222

    
223
    Args:
224
      instance - the instance to remove from books
225

    
226
    This method removes the record for a named instance.
227

    
228
    """
229
    idata = self._data["instance"]
230

    
231
    if instance.name in idata:
232
      del idata[instance.name]
233

    
234

    
235
class Instance(object):
236
  """Abstraction for a Virtual Machine instance.
237

    
238
  Methods:
239
    Restart(): issue a command to restart the represented machine.
240

    
241
  """
242
  def __init__(self, name, state, autostart):
243
    self.name = name
244
    self.state = state
245
    self.autostart = autostart
246

    
247
  def Restart(self):
248
    """Encapsulates the start of an instance.
249

    
250
    """
251
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
252

    
253
  def ActivateDisks(self):
254
    """Encapsulates the activation of all disks of an instance.
255

    
256
    """
257
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
258

    
259

    
260
def _RunListCmd(cmd):
261
  """Runs a command and parses its output into lists.
262

    
263
  """
264
  for line in DoCmd(cmd).stdout.splitlines():
265
    yield line.split(':')
266

    
267

    
268
def GetInstanceList(with_secondaries=None):
269
  """Get a list of instances on this cluster.
270

    
271
  """
272
  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
273
         '--separator=:']
274

    
275
  fields = 'name,oper_state,admin_state'
276

    
277
  if with_secondaries is not None:
278
    fields += ',snodes'
279

    
280
  cmd.append('-o')
281
  cmd.append(fields)
282

    
283
  instances = []
284
  for fields in _RunListCmd(cmd):
285
    if with_secondaries is not None:
286
      (name, status, autostart, snodes) = fields
287

    
288
      if snodes == "-":
289
        continue
290

    
291
      for node in with_secondaries:
292
        if node in snodes.split(','):
293
          break
294
      else:
295
        continue
296

    
297
    else:
298
      (name, status, autostart) = fields
299

    
300
    instances.append(Instance(name, status, autostart != "no"))
301

    
302
  return instances
303

    
304

    
305
def GetNodeBootIDs():
306
  """Get a dict mapping nodes to boot IDs.
307

    
308
  """
309
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
310
         '--separator=:', '-o', 'name,bootid']
311

    
312
  ids = {}
313
  for fields in _RunListCmd(cmd):
314
    (name, bootid) = fields
315
    ids[name] = bootid
316

    
317
  return ids
318

    
319

    
320
class Watcher(object):
321
  """Encapsulate the logic for restarting erronously halted virtual machines.
322

    
323
  The calling program should periodically instantiate me and call Run().
324
  This will traverse the list of instances, and make up to MAXTRIES attempts
325
  to restart machines that are down.
326

    
327
  """
328
  def __init__(self):
329
    sstore = ssconf.SimpleStore()
330
    master = sstore.GetMasterNode()
331
    if master != utils.HostInfo().name:
332
      raise NotMasterError("This is not the master node")
333
    self.instances = GetInstanceList()
334
    self.bootids = GetNodeBootIDs()
335
    self.started_instances = set()
336

    
337
  def Run(self):
338
    notepad = WatcherState()
339
    try:
340
      self.CheckInstances(notepad)
341
      self.CheckDisks(notepad)
342
      self.VerifyDisks()
343
    finally:
344
      notepad.Save()
345

    
346
  def CheckDisks(self, notepad):
347
    """Check all nodes for restarted ones.
348

    
349
    """
350
    check_nodes = []
351
    for name, id in self.bootids.iteritems():
352
      old = notepad.GetNodeBootID(name)
353
      if old != id:
354
        # Node's boot ID has changed, proably through a reboot.
355
        check_nodes.append(name)
356

    
357
    if check_nodes:
358
      # Activate disks for all instances with any of the checked nodes as a
359
      # secondary node.
360
      for instance in GetInstanceList(with_secondaries=check_nodes):
361
        if not instance.autostart:
362
          logging.info(("Skipping disk activation for non-autostart"
363
                        " instance %s"), instance.name)
364
          continue
365
        if instance.name in self.started_instances:
366
          # we already tried to start the instance, which should have
367
          # activated its drives (if they can be at all)
368
          continue
369
        try:
370
          logging.info("Activating disks for instance %s", instance.name)
371
          instance.ActivateDisks()
372
        except Error, err:
373
          logging.error(str(err), exc_info=True)
374

    
375
      # Keep changed boot IDs
376
      for name in check_nodes:
377
        notepad.SetNodeBootID(name, self.bootids[name])
378

    
379
  def CheckInstances(self, notepad):
380
    """Make a pass over the list of instances, restarting downed ones.
381

    
382
    """
383
    for instance in self.instances:
384
      # Don't care about manually stopped instances
385
      if not instance.autostart:
386
        continue
387

    
388
      if instance.state in BAD_STATES:
389
        n = notepad.NumberOfRestartAttempts(instance)
390

    
391
        if n > MAXTRIES:
392
          # stay quiet.
393
          continue
394
        elif n < MAXTRIES:
395
          last = " (Attempt #%d)" % (n + 1)
396
        else:
397
          notepad.RecordRestartAttempt(instance)
398
          logging.error("Could not restart %s after %d attempts, giving up",
399
                        instance.name, MAXTRIES)
400
          continue
401
        try:
402
          logging.info("Restarting %s%s",
403
                        instance.name, last)
404
          instance.Restart()
405
          self.started_instances.add(instance.name)
406
        except Error, err:
407
          logging.error(str(err), exc_info=True)
408

    
409
        notepad.RecordRestartAttempt(instance)
410
      elif instance.state in HELPLESS_STATES:
411
        if notepad.NumberOfRestartAttempts(instance):
412
          notepad.RemoveInstance(instance)
413
      else:
414
        if notepad.NumberOfRestartAttempts(instance):
415
          notepad.RemoveInstance(instance)
416
          logging.info("Restart of %s succeeded", instance.name)
417

    
418
  def VerifyDisks(self):
419
    """Run gnt-cluster verify-disks.
420

    
421
    """
422
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
423
    if result.output:
424
      logging.info(result.output)
425

    
426

    
427
def ParseOptions():
428
  """Parse the command line options.
429

    
430
  Returns:
431
    (options, args) as from OptionParser.parse_args()
432

    
433
  """
434
  parser = OptionParser(description="Ganeti cluster watcher",
435
                        usage="%prog [-d]",
436
                        version="%%prog (ganeti) %s" %
437
                        constants.RELEASE_VERSION)
438

    
439
  parser.add_option("-d", "--debug", dest="debug",
440
                    help="Write all messages to stderr",
441
                    default=False, action="store_true")
442
  options, args = parser.parse_args()
443
  return options, args
444

    
445

    
446
def SetupLogging(debug):
447
  """Configures the logging module.
448

    
449
  """
450
  formatter = logging.Formatter("%(asctime)s: %(message)s")
451

    
452
  logfile_handler = logging.FileHandler(constants.LOG_WATCHER)
453
  logfile_handler.setFormatter(formatter)
454
  logfile_handler.setLevel(logging.INFO)
455

    
456
  stderr_handler = logging.StreamHandler()
457
  stderr_handler.setFormatter(formatter)
458
  if debug:
459
    stderr_handler.setLevel(logging.NOTSET)
460
  else:
461
    stderr_handler.setLevel(logging.CRITICAL)
462

    
463
  root_logger = logging.getLogger("")
464
  root_logger.setLevel(logging.NOTSET)
465
  root_logger.addHandler(logfile_handler)
466
  root_logger.addHandler(stderr_handler)
467

    
468

    
469
def main():
470
  """Main function.
471

    
472
  """
473
  options, args = ParseOptions()
474

    
475
  SetupLogging(options.debug)
476

    
477
  try:
478
    try:
479
      watcher = Watcher()
480
    except errors.ConfigurationError:
481
      # Just exit if there's no configuration
482
      sys.exit(constants.EXIT_SUCCESS)
483
    watcher.Run()
484
  except SystemExit:
485
    raise
486
  except NotMasterError:
487
    logging.debug("Not master, exiting")
488
    sys.exit(constants.EXIT_NOTMASTER)
489
  except errors.ResolverError, err:
490
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
491
    sys.exit(constants.EXIT_NODESETUP_ERROR)
492
  except Exception, err:
493
    logging.error(str(err), exc_info=True)
494
    sys.exit(constants.EXIT_FAILURE)
495

    
496

    
497
if __name__ == '__main__':
498
  main()