Revision 438b45d4

b/daemons/ganeti-watcher
34 34
import fcntl
35 35
import errno
36 36
import simplejson
37
import logging
37 38
from optparse import OptionParser
38 39

  
39 40
from ganeti import utils
......
122 123
    except Exception, msg:
123 124
      # Ignore errors while loading the file and treat it as empty
124 125
      self.data = {}
125
      sys.stderr.write("Empty or invalid state file."
126
                       " Using defaults. Error message: %s\n" % msg)
126
      logging.warning(("Empty or invalid state file. Using defaults."
127
                       " Error message: %s"), msg)
127 128

  
128 129
    if "instance" not in self.data:
129 130
      self.data["instance"] = {}
......
315 316
  return ids
316 317

  
317 318

  
318
class Message(object):
319
  """Encapsulation of a notice or error message.
320

  
321
  """
322
  def __init__(self, level, msg):
323
    self.level = level
324
    self.msg = msg
325
    self.when = time.time()
326

  
327
  def __str__(self):
328
    return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg)
329

  
330

  
331 319
class Watcher(object):
332 320
  """Encapsulate the logic for restarting erronously halted virtual machines.
333 321

  
......
343 331
      raise NotMasterError("This is not the master node")
344 332
    self.instances = GetInstanceList()
345 333
    self.bootids = GetNodeBootIDs()
346
    self.messages = []
347 334
    self.started_instances = set()
348 335

  
349 336
  def Run(self):
......
369 356
      # secondary node.
370 357
      for instance in GetInstanceList(with_secondaries=check_nodes):
371 358
        if not instance.autostart:
372
          self.messages.append(Message(NOTICE,
373
                                       ("Skipping disk activation for"
374
                                        " non-autostart instance '%s'." %
375
                                        instance.name)))
359
          logging.info(("Skipping disk activation for non-autostart"
360
                        " instance %s"), instance.name)
376 361
          continue
377 362
        if instance.name in self.started_instances:
378 363
          # we already tried to start the instance, which should have
379 364
          # activated its drives (if they can be at all)
380 365
          continue
381 366
        try:
382
          self.messages.append(Message(NOTICE, ("Activating disks for %s." %
383
                                                instance.name)))
367
          logging.info("Activating disks for instance %s", instance.name)
384 368
          instance.ActivateDisks()
385
        except Error, x:
386
          self.messages.append(Message(ERROR, str(x)))
369
        except Error, err:
370
          logging.error(str(err), exc_info=True)
387 371

  
388 372
      # Keep changed boot IDs
389 373
      for name in check_nodes:
......
408 392
          last = " (Attempt #%d)" % (n + 1)
409 393
        else:
410 394
          notepad.RecordRestartAttempt(instance)
411
          self.messages.append(Message(ERROR, "Could not restart %s for %d"
412
                                       " times, giving up..." %
413
                                       (instance.name, MAXTRIES)))
395
          logging.error("Could not restart %s after %d attempts, giving up",
396
                        instance.name, MAXTRIES)
414 397
          continue
415 398
        try:
416
          self.messages.append(Message(NOTICE, ("Restarting %s%s." %
417
                                                (instance.name, last))))
399
          logging.info("Restarting %s%s",
400
                        instance.name, last)
418 401
          instance.Restart()
419 402
          self.started_instances.add(instance.name)
420
        except Error, x:
421
          self.messages.append(Message(ERROR, str(x)))
403
        except Error, err:
404
          logging.error(str(err), exc_info=True)
422 405

  
423 406
        notepad.RecordRestartAttempt(instance)
424 407
      elif instance.state in HELPLESS_STATES:
......
427 410
      else:
428 411
        if notepad.NumberOfRestartAttempts(instance):
429 412
          notepad.RemoveInstance(instance)
430
          msg = Message(NOTICE, "Restart of %s succeeded." % instance.name)
431
          self.messages.append(msg)
413
          logging.info("Restart of %s succeeded", instance.name)
432 414

  
433 415
  def VerifyDisks(self):
434 416
    """Run gnt-cluster verify-disks.
......
436 418
    """
437 419
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
438 420
    if result.output:
439
      self.messages.append(Message(NOTICE, result.output))
440

  
441
  def WriteReport(self, logfile):
442
    """Log all messages to file.
443

  
444
    Args:
445
      logfile: file object open for writing (the log file)
446

  
447
    """
448
    for msg in self.messages:
449
      print >> logfile, str(msg)
421
      logging.info(result.output)
450 422

  
451 423

  
452 424
def ParseOptions():
......
462 434
                        constants.RELEASE_VERSION)
463 435

  
464 436
  parser.add_option("-d", "--debug", dest="debug",
465
                    help="Don't redirect messages to the log file",
437
                    help="Write all messages to stderr",
466 438
                    default=False, action="store_true")
467 439
  options, args = parser.parse_args()
468 440
  return options, args
469 441

  
470 442

  
443
def SetupLogging(debug):
444
  """Configures the logging module.
445

  
446
  """
447
  formatter = logging.Formatter("%(asctime)s: %(message)s")
448

  
449
  logfile_handler = logging.FileHandler(constants.LOG_WATCHER)
450
  logfile_handler.setFormatter(formatter)
451
  logfile_handler.setLevel(logging.INFO)
452

  
453
  stderr_handler = logging.StreamHandler()
454
  stderr_handler.setFormatter(formatter)
455
  if debug:
456
    stderr_handler.setLevel(logging.NOTSET)
457
  else:
458
    stderr_handler.setLevel(logging.CRITICAL)
459

  
460
  root_logger = logging.getLogger("")
461
  root_logger.setLevel(logging.NOTSET)
462
  root_logger.addHandler(logfile_handler)
463
  root_logger.addHandler(stderr_handler)
464

  
465

  
471 466
def main():
472 467
  """Main function.
473 468

  
474 469
  """
475 470
  options, args = ParseOptions()
476 471

  
477
  if not options.debug:
478
    sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')
472
  SetupLogging(options.debug)
479 473

  
480 474
  try:
481 475
    try:
......
484 478
      # Just exit if there's no configuration
485 479
      sys.exit(constants.EXIT_SUCCESS)
486 480
    watcher.Run()
487
    watcher.WriteReport(sys.stdout)
488 481
  except NotMasterError:
489
    if options.debug:
490
      sys.stderr.write("Not master, exiting.\n")
482
    logging.debug("Not master, exiting")
491 483
    sys.exit(constants.EXIT_NOTMASTER)
492 484
  except errors.ResolverError, err:
493
    sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
485
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
494 486
    sys.exit(constants.EXIT_NODESETUP_ERROR)
495
  except Error, err:
496
    print err
487
  except Exception, err:
488
    logging.error(str(err), exc_info=True)
489
    sys.exit(constants.EXIT_FAILURE)
497 490

  
498 491

  
499 492
if __name__ == '__main__':
b/lib/constants.py
142 142

  
143 143
# common exit codes
144 144
EXIT_SUCCESS = 0
145
EXIT_FAILURE = 1
145 146
EXIT_NOTMASTER = 11
146 147
EXIT_NODESETUP_ERROR = 12
147 148
EXIT_CONFIRMATION = 13 # need user confirmation

Also available in: Unified diff