Statistics
| Branch: | Tag: | Revision:

root / daemons / ganeti-watcher @ 67fe61c4

History | View | Annotate | Download (12.4 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Tool to restart erronously downed virtual machines.
23

    
24
This program and set of classes implement a watchdog to restart
25
virtual machines in a Ganeti cluster that have crashed or been killed
26
by a node reboot.  Run from cron or similar.
27

    
28
"""
29

    
30
import os
31
import sys
32
import re
33
import time
34
import fcntl
35
import errno
36
import logging
37
from optparse import OptionParser
38

    
39
from ganeti import utils
40
from ganeti import constants
41
from ganeti import serializer
42
from ganeti import ssconf
43
from ganeti import errors
44

    
45

    
46
MAXTRIES = 5
47
BAD_STATES = ['stopped']
48
HELPLESS_STATES = ['(node down)']
49
NOTICE = 'NOTICE'
50
ERROR = 'ERROR'
51
KEY_RESTART_COUNT = "restart_count"
52
KEY_RESTART_WHEN = "restart_when"
53
KEY_BOOT_ID = "bootid"
54

    
55

    
56
class Error(Exception):
57
  """Generic custom error class."""
58

    
59

    
60
class NotMasterError(Error):
61
  """Exception raised when this host is not the master."""
62

    
63

    
64
def Indent(s, prefix='| '):
65
  """Indent a piece of text with a given prefix before each line.
66

    
67
  Args:
68
    s: The string to indent
69
    prefix: The string to prepend each line.
70

    
71
  """
72
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))
73

    
74

    
75
def DoCmd(cmd):
76
  """Run a shell command.
77

    
78
  Args:
79
    cmd: the command to run.
80

    
81
  Raises CommandError with verbose commentary on error.
82

    
83
  """
84
  res = utils.RunCmd(cmd)
85

    
86
  if res.failed:
87
    raise Error("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
88
                (repr(cmd),
89
                 Indent(res.fail_reason),
90
                 Indent(res.stdout),
91
                 Indent(res.stderr)))
92

    
93
  return res
94

    
95

    
96
class WatcherState(object):
97
  """Interface to a state file recording restart attempts.
98

    
99
  """
100
  def __init__(self):
101
    """Open, lock, read and parse the file.
102

    
103
    Raises StandardError on lock contention.
104

    
105
    """
106
    # The two-step dance below is necessary to allow both opening existing
107
    # file read/write and creating if not existing.  Vanilla open will truncate
108
    # an existing file -or- allow creating if not existing.
109
    f = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
110
    f = os.fdopen(f, 'w+')
111

    
112
    try:
113
      fcntl.flock(f.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
114
    except IOError, x:
115
      if x.errno == errno.EAGAIN:
116
        raise StandardError("State file already locked")
117
      raise
118

    
119
    self.statefile = f
120

    
121
    try:
122
      self.data = serializer.Load(self.statefile.read())
123
    except Exception, msg:
124
      # Ignore errors while loading the file and treat it as empty
125
      self.data = {}
126
      logging.warning(("Empty or invalid state file. Using defaults."
127
                       " Error message: %s"), msg)
128

    
129
    if "instance" not in self.data:
130
      self.data["instance"] = {}
131
    if "node" not in self.data:
132
      self.data["node"] = {}
133

    
134
  def __del__(self):
135
    """Called on destruction.
136

    
137
    """
138
    if self.statefile:
139
      self._Close()
140

    
141
  def _Close(self):
142
    """Unlock configuration file and close it.
143

    
144
    """
145
    assert self.statefile
146

    
147
    fcntl.flock(self.statefile.fileno(), fcntl.LOCK_UN)
148

    
149
    self.statefile.close()
150
    self.statefile = None
151

    
152
  def GetNodeBootID(self, name):
153
    """Returns the last boot ID of a node or None.
154

    
155
    """
156
    ndata = self.data["node"]
157

    
158
    if name in ndata and KEY_BOOT_ID in ndata[name]:
159
      return ndata[name][KEY_BOOT_ID]
160
    return None
161

    
162
  def SetNodeBootID(self, name, bootid):
163
    """Sets the boot ID of a node.
164

    
165
    """
166
    assert bootid
167

    
168
    ndata = self.data["node"]
169

    
170
    if name not in ndata:
171
      ndata[name] = {}
172

    
173
    ndata[name][KEY_BOOT_ID] = bootid
174

    
175
  def NumberOfRestartAttempts(self, instance):
176
    """Returns number of previous restart attempts.
177

    
178
    Args:
179
      instance - the instance to look up.
180

    
181
    """
182
    idata = self.data["instance"]
183

    
184
    if instance.name in idata:
185
      return idata[instance.name][KEY_RESTART_COUNT]
186

    
187
    return 0
188

    
189
  def RecordRestartAttempt(self, instance):
190
    """Record a restart attempt.
191

    
192
    Args:
193
      instance - the instance being restarted
194

    
195
    """
196
    idata = self.data["instance"]
197

    
198
    if instance.name not in idata:
199
      inst = idata[instance.name] = {}
200
    else:
201
      inst = idata[instance.name]
202

    
203
    inst[KEY_RESTART_WHEN] = time.time()
204
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
205

    
206
  def RemoveInstance(self, instance):
207
    """Update state to reflect that a machine is running, i.e. remove record.
208

    
209
    Args:
210
      instance - the instance to remove from books
211

    
212
    This method removes the record for a named instance.
213

    
214
    """
215
    idata = self.data["instance"]
216

    
217
    if instance.name in idata:
218
      del idata[instance.name]
219

    
220
  def Save(self):
221
    """Save state to file, then unlock and close it.
222

    
223
    """
224
    assert self.statefile
225

    
226
    self.statefile.seek(0)
227
    self.statefile.truncate()
228

    
229
    self.statefile.write(serializer.Dump(self.data))
230

    
231
    self._Close()
232

    
233

    
234
class Instance(object):
235
  """Abstraction for a Virtual Machine instance.
236

    
237
  Methods:
238
    Restart(): issue a command to restart the represented machine.
239

    
240
  """
241
  def __init__(self, name, state, autostart):
242
    self.name = name
243
    self.state = state
244
    self.autostart = autostart
245

    
246
  def Restart(self):
247
    """Encapsulates the start of an instance.
248

    
249
    """
250
    DoCmd(['gnt-instance', 'startup', '--lock-retries=15', self.name])
251

    
252
  def ActivateDisks(self):
253
    """Encapsulates the activation of all disks of an instance.
254

    
255
    """
256
    DoCmd(['gnt-instance', 'activate-disks', '--lock-retries=15', self.name])
257

    
258

    
259
def _RunListCmd(cmd):
260
  """Runs a command and parses its output into lists.
261

    
262
  """
263
  for line in DoCmd(cmd).stdout.splitlines():
264
    yield line.split(':')
265

    
266

    
267
def GetInstanceList(with_secondaries=None):
268
  """Get a list of instances on this cluster.
269

    
270
  """
271
  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
272
         '--separator=:']
273

    
274
  fields = 'name,oper_state,admin_state'
275

    
276
  if with_secondaries is not None:
277
    fields += ',snodes'
278

    
279
  cmd.append('-o')
280
  cmd.append(fields)
281

    
282
  instances = []
283
  for fields in _RunListCmd(cmd):
284
    if with_secondaries is not None:
285
      (name, status, autostart, snodes) = fields
286

    
287
      if snodes == "-":
288
        continue
289

    
290
      for node in with_secondaries:
291
        if node in snodes.split(','):
292
          break
293
      else:
294
        continue
295

    
296
    else:
297
      (name, status, autostart) = fields
298

    
299
    instances.append(Instance(name, status, autostart != "no"))
300

    
301
  return instances
302

    
303

    
304
def GetNodeBootIDs():
305
  """Get a dict mapping nodes to boot IDs.
306

    
307
  """
308
  cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
309
         '--separator=:', '-o', 'name,bootid']
310

    
311
  ids = {}
312
  for fields in _RunListCmd(cmd):
313
    (name, bootid) = fields
314
    ids[name] = bootid
315

    
316
  return ids
317

    
318

    
319
class Watcher(object):
320
  """Encapsulate the logic for restarting erronously halted virtual machines.
321

    
322
  The calling program should periodically instantiate me and call Run().
323
  This will traverse the list of instances, and make up to MAXTRIES attempts
324
  to restart machines that are down.
325

    
326
  """
327
  def __init__(self):
328
    sstore = ssconf.SimpleStore()
329
    master = sstore.GetMasterNode()
330
    if master != utils.HostInfo().name:
331
      raise NotMasterError("This is not the master node")
332
    self.instances = GetInstanceList()
333
    self.bootids = GetNodeBootIDs()
334
    self.started_instances = set()
335

    
336
  def Run(self):
337
    notepad = WatcherState()
338
    self.CheckInstances(notepad)
339
    self.CheckDisks(notepad)
340
    self.VerifyDisks()
341
    notepad.Save()
342

    
343
  def CheckDisks(self, notepad):
344
    """Check all nodes for restarted ones.
345

    
346
    """
347
    check_nodes = []
348
    for name, id in self.bootids.iteritems():
349
      old = notepad.GetNodeBootID(name)
350
      if old != id:
351
        # Node's boot ID has changed, proably through a reboot.
352
        check_nodes.append(name)
353

    
354
    if check_nodes:
355
      # Activate disks for all instances with any of the checked nodes as a
356
      # secondary node.
357
      for instance in GetInstanceList(with_secondaries=check_nodes):
358
        if not instance.autostart:
359
          logging.info(("Skipping disk activation for non-autostart"
360
                        " instance %s"), instance.name)
361
          continue
362
        if instance.name in self.started_instances:
363
          # we already tried to start the instance, which should have
364
          # activated its drives (if they can be at all)
365
          continue
366
        try:
367
          logging.info("Activating disks for instance %s", instance.name)
368
          instance.ActivateDisks()
369
        except Error, err:
370
          logging.error(str(err), exc_info=True)
371

    
372
      # Keep changed boot IDs
373
      for name in check_nodes:
374
        notepad.SetNodeBootID(name, self.bootids[name])
375

    
376
  def CheckInstances(self, notepad):
377
    """Make a pass over the list of instances, restarting downed ones.
378

    
379
    """
380
    for instance in self.instances:
381
      # Don't care about manually stopped instances
382
      if not instance.autostart:
383
        continue
384

    
385
      if instance.state in BAD_STATES:
386
        n = notepad.NumberOfRestartAttempts(instance)
387

    
388
        if n > MAXTRIES:
389
          # stay quiet.
390
          continue
391
        elif n < MAXTRIES:
392
          last = " (Attempt #%d)" % (n + 1)
393
        else:
394
          notepad.RecordRestartAttempt(instance)
395
          logging.error("Could not restart %s after %d attempts, giving up",
396
                        instance.name, MAXTRIES)
397
          continue
398
        try:
399
          logging.info("Restarting %s%s",
400
                        instance.name, last)
401
          instance.Restart()
402
          self.started_instances.add(instance.name)
403
        except Error, err:
404
          logging.error(str(err), exc_info=True)
405

    
406
        notepad.RecordRestartAttempt(instance)
407
      elif instance.state in HELPLESS_STATES:
408
        if notepad.NumberOfRestartAttempts(instance):
409
          notepad.RemoveInstance(instance)
410
      else:
411
        if notepad.NumberOfRestartAttempts(instance):
412
          notepad.RemoveInstance(instance)
413
          logging.info("Restart of %s succeeded", instance.name)
414

    
415
  def VerifyDisks(self):
416
    """Run gnt-cluster verify-disks.
417

    
418
    """
419
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
420
    if result.output:
421
      logging.info(result.output)
422

    
423

    
424
def ParseOptions():
425
  """Parse the command line options.
426

    
427
  Returns:
428
    (options, args) as from OptionParser.parse_args()
429

    
430
  """
431
  parser = OptionParser(description="Ganeti cluster watcher",
432
                        usage="%prog [-d]",
433
                        version="%%prog (ganeti) %s" %
434
                        constants.RELEASE_VERSION)
435

    
436
  parser.add_option("-d", "--debug", dest="debug",
437
                    help="Write all messages to stderr",
438
                    default=False, action="store_true")
439
  options, args = parser.parse_args()
440
  return options, args
441

    
442

    
443
def SetupLogging(debug):
444
  """Configures the logging module.
445

    
446
  """
447
  formatter = logging.Formatter("%(asctime)s: %(message)s")
448

    
449
  logfile_handler = logging.FileHandler(constants.LOG_WATCHER)
450
  logfile_handler.setFormatter(formatter)
451
  logfile_handler.setLevel(logging.INFO)
452

    
453
  stderr_handler = logging.StreamHandler()
454
  stderr_handler.setFormatter(formatter)
455
  if debug:
456
    stderr_handler.setLevel(logging.NOTSET)
457
  else:
458
    stderr_handler.setLevel(logging.CRITICAL)
459

    
460
  root_logger = logging.getLogger("")
461
  root_logger.setLevel(logging.NOTSET)
462
  root_logger.addHandler(logfile_handler)
463
  root_logger.addHandler(stderr_handler)
464

    
465

    
466
def main():
467
  """Main function.
468

    
469
  """
470
  options, args = ParseOptions()
471

    
472
  SetupLogging(options.debug)
473

    
474
  try:
475
    try:
476
      watcher = Watcher()
477
    except errors.ConfigurationError:
478
      # Just exit if there's no configuration
479
      sys.exit(constants.EXIT_SUCCESS)
480
    watcher.Run()
481
  except NotMasterError:
482
    logging.debug("Not master, exiting")
483
    sys.exit(constants.EXIT_NOTMASTER)
484
  except errors.ResolverError, err:
485
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
486
    sys.exit(constants.EXIT_NODESETUP_ERROR)
487
  except Exception, err:
488
    logging.error(str(err), exc_info=True)
489
    sys.exit(constants.EXIT_FAILURE)
490

    
491

    
492
if __name__ == '__main__':
493
  main()