Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 78f3bd30

History | View | Annotate | Download (12.5 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erronously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import logging
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import serializer
42
from ganeti import ssconf
43
from ganeti import errors
44

    
45

    
46
MAXTRIES = 5
47
BAD_STATES = ['stopped']
48
HELPLESS_STATES = ['(node down)']
49
NOTICE = 'NOTICE'
50
ERROR = 'ERROR'
51
KEY_RESTART_COUNT = "restart_count"
52
KEY_RESTART_WHEN = "restart_when"
53
KEY_BOOT_ID = "bootid"
54

    
55

    
56
class Error(Exception):
57
  """Generic custom error class."""
58

    
59

    
60
class NotMasterError(Error):
61
  """Exception raised when this host is not the master."""
62

    
63

    
64
def Indent(s, prefix='| '):
65
  """Indent a piece of text with a given prefix before each line.
66

    
67
  Args:
68
    s: The string to indent
69
    prefix: The string to prepend each line.
70

    
71
  """
72
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
73

    
74

    
75
def DoCmd(cmd):
76
  """Run a shell command.
77

    
78
  Args:
79
    cmd: the command to run.
80

    
81
  Raises CommandError with verbose commentary on error.
82

    
83
  """
84
  res = utils.RunCmd(cmd)
85

    
86
  if res.failed:
87
    raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
88
                (repr(cmd),
89
                 Indent(res.fail_reason),
90
                 Indent(res.stdout),
91
                 Indent(res.stderr)))
92

    
93
  return res
94

    
95

    
96
class WatcherState(object):
97
  """Interface to a state file recording restart attempts.
98

    
99
  """
100
  def __init__(self):
101
    """Open, lock, read and parse the file.
102

    
103
    Raises StandardError on lock contention.
104

    
105
    """
106
    # The two-step dance below is necessary to allow both opening existing
107
    # file read/write and creating if not existing.  Vanilla open will truncate
108
    # an existing file -or- allow creating if not existing.
109
    f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
110
    f = os.fdopen(f, 'w+')
111

    
112
    try:
113
      fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
114
    except IOError, x:
115
      if x.errno == errno.EAGAIN:
116
        raise StandardError("State file already locked")
117
      raise
118

    
119
    self.statefile = f
120

    
121
    try:
122
      self.data = serializer.Load(self.statefile.read())
123
    except Exception, msg:
124
      # Ignore errors while loading the file and treat it as empty
125
      self.data = {}
126
      logging.warning(("Empty or invalid state file. Using defaults."
127
                       " Error message: %s"), msg)
128

    
129
    if "instance" not in self.data:
130
      self.data["instance"] = {}
131
    if "node" not in self.data:
132
      self.data["node"] = {}
133

    
134
  def __del__(self):
135
    """Called on destruction.
136

    
137
    """
138
    if self.statefile:
139
      self._Close()
140

    
141
  def _Close(self):
142
    """Unlock configuration file and close it.
143

    
144
    """
145
    assert self.statefile
146

    
147
    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
148

    
149
    self.statefile.close()
150
    self.statefile = None
151

    
152
  def GetNodeBootID(self, name):
153
    """Returns the last boot ID of a node or None.
154

    
155
    """
156
    ndata = self.data["node"]
157

    
158
    if name in ndata and KEY_BOOT_ID in ndata[name]:
159
      return ndata[name][KEY_BOOT_ID]
160
    return None
161

    
162
  def SetNodeBootID(self, name, bootid):
163
    """Sets the boot ID of a node.
164

    
165
    """
166
    assert bootid
167

    
168
    ndata = self.data["node"]
169

    
170
    if name not in ndata:
171
      ndata[name] = {}
172

    
173
    ndata[name][KEY_BOOT_ID] = bootid
174

    
175
  def NumberOfRestartAttempts(self, instance):
176
    """Returns number of previous restart attempts.
177

    
178
    Args:
179
      instance - the instance to look up.
180

    
181
    """
182
    idata = self.data["instance"]
183

    
184
    if instance.name in idata:
185
      return idata[instance.name][KEY_RESTART_COUNT]
186

    
187
    return 0
188

    
189
  def RecordRestartAttempt(self, instance):
190
    """Record a restart attempt.
191

    
192
    Args:
193
      instance - the instance being restarted
194

    
195
    """
196
    idata = self.data["instance"]
197

    
198
    if instance.name not in idata:
199
      inst = idata[instance.name] = {}
200
    else:
201
      inst = idata[instance.name]
202

    
203
    inst[KEY_RESTART_WHEN] = time.time()
204
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
205

    
206
  def RemoveInstance(self, instance):
207
    """Update state to reflect that a machine is running, i.e. remove record.
208

    
209
    Args:
210
      instance - the instance to remove from books
211

    
212
    This method removes the record for a named instance.
213

    
214
    """
215
    idata = self.data["instance"]
216

    
217
    if instance.name in idata:
218
      del idata[instance.name]
219

    
220
  def Save(self):
221
    """Save state to file, then unlock and close it.
222

    
223
    """
224
    assert self.statefile
225

    
226
    self.statefile.seek(0)
227
    self.statefile.truncate()
228

    
229
    self.statefile.write(serializer.Dump(self.data))
230

    
231
    self._Close()
232

    
233

    
234
class Instance(object):
235
  """Abstraction for a Virtual Machine instance.
236

    
237
  Methods:
238
    Restart(): issue a command to restart the represented machine.
239

    
240
  """
241
  def __init__(self, name, state, autostart):
242
    self.name = name
243
    self.state = state
244
    self.autostart = autostart
245

    
246
  def Restart(self):
247
    """Encapsulates the start of an instance.
248

    
249
    """
250
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
251

    
252
  def ActivateDisks(self):
253
    """Encapsulates the activation of all disks of an instance.
254

    
255
    """
256
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
257

    
258

    
259
def _RunListCmd(cmd):
260
  """Runs a command and parses its output into lists.
261

    
262
  """
263
  for line in DoCmd(cmd).stdout.splitlines():
264
    yield line.split(':')
265

    
266

    
267
def GetInstanceList(with_secondaries=None):
268
  """Get a list of instances on this cluster.
269

    
270
  """
271
  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
272
         '--separator=:']
273

    
274
  fields = 'name,oper_state,admin_state'
275

    
276
  if with_secondaries is not None:
277
    fields += ',snodes'
278

    
279
  cmd.append('-o')
280
  cmd.append(fields)
281

    
282
  instances = []
283
  for fields in _RunListCmd(cmd):
284
    if with_secondaries is not None:
285
      (name, status, autostart, snodes) = fields
286

    
287
      if snodes == "-":
288
        continue
289

    
290
      for node in with_secondaries:
291
        if node in snodes.split(','):
292
          break
293
      else:
294
        continue
295

    
296
    else:
297
      (name, status, autostart) = fields
298

    
299
    instances.append(Instance(name, status, autostart != "no"))
300

    
301
  return instances
302

    
303

    
304
def GetNodeBootIDs():
305
  """Get a dict mapping nodes to boot IDs.
306

    
307
  """
308
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
309
         '--separator=:', '-o', 'name,bootid']
310

    
311
  ids = {}
312
  for fields in _RunListCmd(cmd):
313
    (name, bootid) = fields
314
    ids[name] = bootid
315

    
316
  return ids
317

    
318

    
319
class Watcher(object):
320
  """Encapsulate the logic for restarting erronously halted virtual machines.
321

    
322
  The calling program should periodically instantiate me and call Run().
323
  This will traverse the list of instances, and make up to MAXTRIES attempts
324
  to restart machines that are down.
325

    
326
  """
327
  def __init__(self):
328
    sstore = ssconf.SimpleStore()
329
    master = sstore.GetMasterNode()
330
    if master != utils.HostInfo().name:
331
      raise NotMasterError("This is not the master node")
332
    self.instances = GetInstanceList()
333
    self.bootids = GetNodeBootIDs()
334
    self.started_instances = set()
335

    
336
  def Run(self):
337
    notepad = WatcherState()
338
    try:
339
      self.CheckInstances(notepad)
340
      self.CheckDisks(notepad)
341
      self.VerifyDisks()
342
    finally:
343
      notepad.Save()
344

    
345
  def CheckDisks(self, notepad):
346
    """Check all nodes for restarted ones.
347

    
348
    """
349
    check_nodes = []
350
    for name, id in self.bootids.iteritems():
351
      old = notepad.GetNodeBootID(name)
352
      if old != id:
353
        # Node's boot ID has changed, proably through a reboot.
354
        check_nodes.append(name)
355

    
356
    if check_nodes:
357
      # Activate disks for all instances with any of the checked nodes as a
358
      # secondary node.
359
      for instance in GetInstanceList(with_secondaries=check_nodes):
360
        if not instance.autostart:
361
          logging.info(("Skipping disk activation for non-autostart"
362
                        " instance %s"), instance.name)
363
          continue
364
        if instance.name in self.started_instances:
365
          # we already tried to start the instance, which should have
366
          # activated its drives (if they can be at all)
367
          continue
368
        try:
369
          logging.info("Activating disks for instance %s", instance.name)
370
          instance.ActivateDisks()
371
        except Error, err:
372
          logging.error(str(err), exc_info=True)
373

    
374
      # Keep changed boot IDs
375
      for name in check_nodes:
376
        notepad.SetNodeBootID(name, self.bootids[name])
377

    
378
  def CheckInstances(self, notepad):
379
    """Make a pass over the list of instances, restarting downed ones.
380

    
381
    """
382
    for instance in self.instances:
383
      # Don't care about manually stopped instances
384
      if not instance.autostart:
385
        continue
386

    
387
      if instance.state in BAD_STATES:
388
        n = notepad.NumberOfRestartAttempts(instance)
389

    
390
        if n > MAXTRIES:
391
          # stay quiet.
392
          continue
393
        elif n < MAXTRIES:
394
          last = " (Attempt #%d)" % (n + 1)
395
        else:
396
          notepad.RecordRestartAttempt(instance)
397
          logging.error("Could not restart %s after %d attempts, giving up",
398
                        instance.name, MAXTRIES)
399
          continue
400
        try:
401
          logging.info("Restarting %s%s",
402
                        instance.name, last)
403
          instance.Restart()
404
          self.started_instances.add(instance.name)
405
        except Error, err:
406
          logging.error(str(err), exc_info=True)
407

    
408
        notepad.RecordRestartAttempt(instance)
409
      elif instance.state in HELPLESS_STATES:
410
        if notepad.NumberOfRestartAttempts(instance):
411
          notepad.RemoveInstance(instance)
412
      else:
413
        if notepad.NumberOfRestartAttempts(instance):
414
          notepad.RemoveInstance(instance)
415
          logging.info("Restart of %s succeeded", instance.name)
416

    
417
  def VerifyDisks(self):
418
    """Run gnt-cluster verify-disks.
419

    
420
    """
421
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
422
    if result.output:
423
      logging.info(result.output)
424

    
425

    
426
def ParseOptions():
427
  """Parse the command line options.
428

    
429
  Returns:
430
    (options, args) as from OptionParser.parse_args()
431

    
432
  """
433
  parser = OptionParser(description="Ganeti cluster watcher",
434
                        usage="%prog [-d]",
435
                        version="%%prog (ganeti) %s" %
436
                        constants.RELEASE_VERSION)
437

    
438
  parser.add_option("-d", "--debug", dest="debug",
439
                    help="Write all messages to stderr",
440
                    default=False, action="store_true")
441
  options, args = parser.parse_args()
442
  return options, args
443

    
444

    
445
def SetupLogging(debug):
446
  """Configures the logging module.
447

    
448
  """
449
  formatter = logging.Formatter("%(asctime)s: %(message)s")
450

    
451
  logfile_handler = logging.FileHandler(constants.LOG_WATCHER)
452
  logfile_handler.setFormatter(formatter)
453
  logfile_handler.setLevel(logging.INFO)
454

    
455
  stderr_handler = logging.StreamHandler()
456
  stderr_handler.setFormatter(formatter)
457
  if debug:
458
    stderr_handler.setLevel(logging.NOTSET)
459
  else:
460
    stderr_handler.setLevel(logging.CRITICAL)
461

    
462
  root_logger = logging.getLogger("")
463
  root_logger.setLevel(logging.NOTSET)
464
  root_logger.addHandler(logfile_handler)
465
  root_logger.addHandler(stderr_handler)
466

    
467

    
468
def main():
469
  """Main function.
470

    
471
  """
472
  options, args = ParseOptions()
473

    
474
  SetupLogging(options.debug)
475

    
476
  try:
477
    try:
478
      watcher = Watcher()
479
    except errors.ConfigurationError:
480
      # Just exit if there's no configuration
481
      sys.exit(constants.EXIT_SUCCESS)
482
    watcher.Run()
483
  except NotMasterError:
484
    logging.debug("Not master, exiting")
485
    sys.exit(constants.EXIT_NOTMASTER)
486
  except errors.ResolverError, err:
487
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
488
    sys.exit(constants.EXIT_NODESETUP_ERROR)
489
  except Exception, err:
490
    logging.error(str(err), exc_info=True)
491
    sys.exit(constants.EXIT_FAILURE)
492

    
493

    
494
if __name__ == '__main__':
495
  main()