Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 3b316acb

History | View | Annotate | Download (12.3 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erronously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import logging
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import serializer
42
from ganeti import ssconf
43
from ganeti import errors
44
from ganeti import logger
45

    
46

    
47
MAXTRIES = 5
48
BAD_STATES = ['stopped']
49
HELPLESS_STATES = ['(node down)']
50
NOTICE = 'NOTICE'
51
ERROR = 'ERROR'
52
KEY_RESTART_COUNT = "restart_count"
53
KEY_RESTART_WHEN = "restart_when"
54
KEY_BOOT_ID = "bootid"
55

    
56

    
57
class NotMasterError(errors.GenericError):
58
  """Exception raised when this host is not the master."""
59

    
60

    
61
def Indent(s, prefix='| '):
62
  """Indent a piece of text with a given prefix before each line.
63

    
64
  Args:
65
    s: The string to indent
66
    prefix: The string to prepend each line.
67

    
68
  """
69
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
70

    
71

    
72
def DoCmd(cmd):
73
  """Run a shell command.
74

    
75
  Args:
76
    cmd: the command to run.
77

    
78
  Raises CommandError with verbose commentary on error.
79

    
80
  """
81
  res = utils.RunCmd(cmd)
82

    
83
  if res.failed:
84
    msg = ("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
85
           (repr(cmd),
86
            Indent(res.fail_reason),
87
            Indent(res.stdout),
88
            Indent(res.stderr)))
89
    raise errors.CommandError(msg)
90

    
91
  return res
92

    
93

    
94
def LockFile(fd):
95
  """Locks a file using POSIX locks.
96

    
97
  """
98
  try:
99
    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
100
  except IOError, err:
101
    if err.errno == errno.EAGAIN:
102
      raise errors.LockError("File already locked")
103
    raise
104

    
105

    
106
class WatcherState(object):
107
  """Interface to a state file recording restart attempts.
108

    
109
  """
110
  def __init__(self):
111
    """Open, lock, read and parse the file.
112

    
113
    Raises exception on lock contention.
114

    
115
    """
116
    # The two-step dance below is necessary to allow both opening existing
117
    # file read/write and creating if not existing.  Vanilla open will truncate
118
    # an existing file -or- allow creating if not existing.
119
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
120
    self.statefile = os.fdopen(fd, 'w+')
121

    
122
    LockFile(self.statefile.fileno())
123

    
124
    try:
125
      self._data = serializer.Load(self.statefile.read())
126
    except Exception, msg:
127
      # Ignore errors while loading the file and treat it as empty
128
      self._data = {}
129
      logging.warning(("Empty or invalid state file. Using defaults."
130
                       " Error message: %s"), msg)
131

    
132
    if "instance" not in self._data:
133
      self._data["instance"] = {}
134
    if "node" not in self._data:
135
      self._data["node"] = {}
136

    
137
    self._orig_data = self._data.copy()
138

    
139
  def Save(self):
140
    """Save state to file, then unlock and close it.
141

    
142
    """
143
    assert self.statefile
144

    
145
    if self._orig_data == self._data:
146
      logging.debug("Data didn't change, just touching status file")
147
      os.utime(constants.WATCHER_STATEFILE, None)
148
      return
149

    
150
    # We need to make sure the file is locked before renaming it, otherwise
151
    # starting ganeti-watcher again at the same time will create a conflict.
152
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
153
                         data=serializer.Dump(self._data),
154
                         prewrite=LockFile, close=False)
155
    self.statefile = os.fdopen(fd, 'w+')
156

    
157
  def Close(self):
158
    """Unlock configuration file and close it.
159

    
160
    """
161
    assert self.statefile
162

    
163
    # Files are automatically unlocked when closing them
164
    self.statefile.close()
165
    self.statefile = None
166

    
167
  def GetNodeBootID(self, name):
168
    """Returns the last boot ID of a node or None.
169

    
170
    """
171
    ndata = self._data["node"]
172

    
173
    if name in ndata and KEY_BOOT_ID in ndata[name]:
174
      return ndata[name][KEY_BOOT_ID]
175
    return None
176

    
177
  def SetNodeBootID(self, name, bootid):
178
    """Sets the boot ID of a node.
179

    
180
    """
181
    assert bootid
182

    
183
    ndata = self._data["node"]
184

    
185
    if name not in ndata:
186
      ndata[name] = {}
187

    
188
    ndata[name][KEY_BOOT_ID] = bootid
189

    
190
  def NumberOfRestartAttempts(self, instance):
191
    """Returns number of previous restart attempts.
192

    
193
    Args:
194
      instance - the instance to look up.
195

    
196
    """
197
    idata = self._data["instance"]
198

    
199
    if instance.name in idata:
200
      return idata[instance.name][KEY_RESTART_COUNT]
201

    
202
    return 0
203

    
204
  def RecordRestartAttempt(self, instance):
205
    """Record a restart attempt.
206

    
207
    Args:
208
      instance - the instance being restarted
209

    
210
    """
211
    idata = self._data["instance"]
212

    
213
    if instance.name not in idata:
214
      inst = idata[instance.name] = {}
215
    else:
216
      inst = idata[instance.name]
217

    
218
    inst[KEY_RESTART_WHEN] = time.time()
219
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
220

    
221
  def RemoveInstance(self, instance):
222
    """Update state to reflect that a machine is running, i.e. remove record.
223

    
224
    Args:
225
      instance - the instance to remove from books
226

    
227
    This method removes the record for a named instance.
228

    
229
    """
230
    idata = self._data["instance"]
231

    
232
    if instance.name in idata:
233
      del idata[instance.name]
234

    
235

    
236
class Instance(object):
237
  """Abstraction for a Virtual Machine instance.
238

    
239
  Methods:
240
    Restart(): issue a command to restart the represented machine.
241

    
242
  """
243
  def __init__(self, name, state, autostart):
244
    self.name = name
245
    self.state = state
246
    self.autostart = autostart
247

    
248
  def Restart(self):
249
    """Encapsulates the start of an instance.
250

    
251
    """
252
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
253

    
254
  def ActivateDisks(self):
255
    """Encapsulates the activation of all disks of an instance.
256

    
257
    """
258
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
259

    
260

    
261
def _RunListCmd(cmd):
262
  """Runs a command and parses its output into lists.
263

    
264
  """
265
  for line in DoCmd(cmd).stdout.splitlines():
266
    yield line.split(':')
267

    
268

    
269
def GetInstanceList(with_secondaries=None):
270
  """Get a list of instances on this cluster.
271

    
272
  """
273
  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
274
         '--separator=:']
275

    
276
  fields = 'name,oper_state,admin_state'
277

    
278
  if with_secondaries is not None:
279
    fields += ',snodes'
280

    
281
  cmd.append('-o')
282
  cmd.append(fields)
283

    
284
  instances = []
285
  for fields in _RunListCmd(cmd):
286
    if with_secondaries is not None:
287
      (name, status, autostart, snodes) = fields
288

    
289
      if snodes == "-":
290
        continue
291

    
292
      for node in with_secondaries:
293
        if node in snodes.split(','):
294
          break
295
      else:
296
        continue
297

    
298
    else:
299
      (name, status, autostart) = fields
300

    
301
    instances.append(Instance(name, status, autostart != "no"))
302

    
303
  return instances
304

    
305

    
306
def GetNodeBootIDs():
307
  """Get a dict mapping nodes to boot IDs.
308

    
309
  """
310
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
311
         '--separator=:', '-o', 'name,bootid']
312

    
313
  ids = {}
314
  for fields in _RunListCmd(cmd):
315
    (name, bootid) = fields
316
    ids[name] = bootid
317

    
318
  return ids
319

    
320

    
321
class Watcher(object):
322
  """Encapsulate the logic for restarting erronously halted virtual machines.
323

    
324
  The calling program should periodically instantiate me and call Run().
325
  This will traverse the list of instances, and make up to MAXTRIES attempts
326
  to restart machines that are down.
327

    
328
  """
329
  def __init__(self):
330
    sstore = ssconf.SimpleStore()
331
    master = sstore.GetMasterNode()
332
    if master != utils.HostInfo().name:
333
      raise NotMasterError("This is not the master node")
334
    self.instances = GetInstanceList()
335
    self.bootids = GetNodeBootIDs()
336
    self.started_instances = set()
337

    
338
  def Run(self):
339
    notepad = WatcherState()
340
    try:
341
      self.CheckInstances(notepad)
342
      self.CheckDisks(notepad)
343
      self.VerifyDisks()
344
    finally:
345
      notepad.Save()
346

    
347
  def CheckDisks(self, notepad):
348
    """Check all nodes for restarted ones.
349

    
350
    """
351
    check_nodes = []
352
    for name, id in self.bootids.iteritems():
353
      old = notepad.GetNodeBootID(name)
354
      if old != id:
355
        # Node's boot ID has changed, proably through a reboot.
356
        check_nodes.append(name)
357

    
358
    if check_nodes:
359
      # Activate disks for all instances with any of the checked nodes as a
360
      # secondary node.
361
      for instance in GetInstanceList(with_secondaries=check_nodes):
362
        if not instance.autostart:
363
          logging.info(("Skipping disk activation for non-autostart"
364
                        " instance %s"), instance.name)
365
          continue
366
        if instance.name in self.started_instances:
367
          # we already tried to start the instance, which should have
368
          # activated its drives (if they can be at all)
369
          continue
370
        try:
371
          logging.info("Activating disks for instance %s", instance.name)
372
          instance.ActivateDisks()
373
        except Error, err:
374
          logging.error(str(err), exc_info=True)
375

    
376
      # Keep changed boot IDs
377
      for name in check_nodes:
378
        notepad.SetNodeBootID(name, self.bootids[name])
379

    
380
  def CheckInstances(self, notepad):
381
    """Make a pass over the list of instances, restarting downed ones.
382

    
383
    """
384
    for instance in self.instances:
385
      # Don't care about manually stopped instances
386
      if not instance.autostart:
387
        continue
388

    
389
      if instance.state in BAD_STATES:
390
        n = notepad.NumberOfRestartAttempts(instance)
391

    
392
        if n > MAXTRIES:
393
          # stay quiet.
394
          continue
395
        elif n < MAXTRIES:
396
          last = " (Attempt #%d)" % (n + 1)
397
        else:
398
          notepad.RecordRestartAttempt(instance)
399
          logging.error("Could not restart %s after %d attempts, giving up",
400
                        instance.name, MAXTRIES)
401
          continue
402
        try:
403
          logging.info("Restarting %s%s",
404
                        instance.name, last)
405
          instance.Restart()
406
          self.started_instances.add(instance.name)
407
        except Error, err:
408
          logging.error(str(err), exc_info=True)
409

    
410
        notepad.RecordRestartAttempt(instance)
411
      elif instance.state in HELPLESS_STATES:
412
        if notepad.NumberOfRestartAttempts(instance):
413
          notepad.RemoveInstance(instance)
414
      else:
415
        if notepad.NumberOfRestartAttempts(instance):
416
          notepad.RemoveInstance(instance)
417
          logging.info("Restart of %s succeeded", instance.name)
418

    
419
  def VerifyDisks(self):
420
    """Run gnt-cluster verify-disks.
421

    
422
    """
423
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
424
    if result.output:
425
      logging.info(result.output)
426

    
427

    
428
def ParseOptions():
429
  """Parse the command line options.
430

    
431
  Returns:
432
    (options, args) as from OptionParser.parse_args()
433

    
434
  """
435
  parser = OptionParser(description="Ganeti cluster watcher",
436
                        usage="%prog [-d]",
437
                        version="%%prog (ganeti) %s" %
438
                        constants.RELEASE_VERSION)
439

    
440
  parser.add_option("-d", "--debug", dest="debug",
441
                    help="Write all messages to stderr",
442
                    default=False, action="store_true")
443
  options, args = parser.parse_args()
444
  return options, args
445

    
446

    
447
def main():
448
  """Main function.
449

    
450
  """
451
  options, args = ParseOptions()
452

    
453
  logger.SetupDaemon(constants.LOG_WATCHER, debug=options.debug)
454

    
455
  try:
456
    try:
457
      watcher = Watcher()
458
    except errors.ConfigurationError:
459
      # Just exit if there's no configuration
460
      sys.exit(constants.EXIT_SUCCESS)
461
    watcher.Run()
462
  except SystemExit:
463
    raise
464
  except NotMasterError:
465
    logging.debug("Not master, exiting")
466
    sys.exit(constants.EXIT_NOTMASTER)
467
  except errors.ResolverError, err:
468
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
469
    sys.exit(constants.EXIT_NODESETUP_ERROR)
470
  except Exception, err:
471
    logging.error(str(err), exc_info=True)
472
    sys.exit(constants.EXIT_FAILURE)
473

    
474

    
475
if __name__ == '__main__':
476
  main()