Revision 438b45d4 daemons/ganeti-watcher
b/daemons/ganeti-watcher | ||
---|---|---|
34 | 34 |
import fcntl |
35 | 35 |
import errno |
36 | 36 |
import simplejson |
37 |
import logging |
|
37 | 38 |
from optparse import OptionParser |
38 | 39 |
|
39 | 40 |
from ganeti import utils |
... | ... | |
122 | 123 |
except Exception, msg: |
123 | 124 |
# Ignore errors while loading the file and treat it as empty |
124 | 125 |
self.data = {} |
125 |
sys.stderr.write("Empty or invalid state file."
|
|
126 |
" Using defaults. Error message: %s\n" % msg)
|
|
126 |
logging.warning(("Empty or invalid state file. Using defaults."
|
|
127 |
" Error message: %s"), msg)
|
|
127 | 128 |
|
128 | 129 |
if "instance" not in self.data: |
129 | 130 |
self.data["instance"] = {} |
... | ... | |
315 | 316 |
return ids |
316 | 317 |
|
317 | 318 |
|
318 |
class Message(object): |
|
319 |
"""Encapsulation of a notice or error message. |
|
320 |
|
|
321 |
""" |
|
322 |
def __init__(self, level, msg): |
|
323 |
self.level = level |
|
324 |
self.msg = msg |
|
325 |
self.when = time.time() |
|
326 |
|
|
327 |
def __str__(self): |
|
328 |
return self.level + ' ' + time.ctime(self.when) + '\n' + Indent(self.msg) |
|
329 |
|
|
330 |
|
|
331 | 319 |
class Watcher(object): |
332 | 320 |
"""Encapsulate the logic for restarting erroneously halted virtual machines. |
333 | 321 |
|
... | ... | |
343 | 331 |
raise NotMasterError("This is not the master node") |
344 | 332 |
self.instances = GetInstanceList() |
345 | 333 |
self.bootids = GetNodeBootIDs() |
346 |
self.messages = [] |
|
347 | 334 |
self.started_instances = set() |
348 | 335 |
|
349 | 336 |
def Run(self): |
... | ... | |
369 | 356 |
# secondary node. |
370 | 357 |
for instance in GetInstanceList(with_secondaries=check_nodes): |
371 | 358 |
if not instance.autostart: |
372 |
self.messages.append(Message(NOTICE, |
|
373 |
("Skipping disk activation for" |
|
374 |
" non-autostart instance '%s'." % |
|
375 |
instance.name))) |
|
359 |
logging.info(("Skipping disk activation for non-autostart" |
|
360 |
" instance %s"), instance.name) |
|
376 | 361 |
continue |
377 | 362 |
if instance.name in self.started_instances: |
378 | 363 |
# we already tried to start the instance, which should have |
379 | 364 |
# activated its drives (if they can be at all) |
380 | 365 |
continue |
381 | 366 |
try: |
382 |
self.messages.append(Message(NOTICE, ("Activating disks for %s." % |
|
383 |
instance.name))) |
|
367 |
logging.info("Activating disks for instance %s", instance.name) |
|
384 | 368 |
instance.ActivateDisks() |
385 |
except Error, x:
|
|
386 |
self.messages.append(Message(ERROR, str(x)))
|
|
369 |
except Error, err:
|
|
370 |
logging.error(str(err), exc_info=True)
|
|
387 | 371 |
|
388 | 372 |
# Keep changed boot IDs |
389 | 373 |
for name in check_nodes: |
... | ... | |
408 | 392 |
last = " (Attempt #%d)" % (n + 1) |
409 | 393 |
else: |
410 | 394 |
notepad.RecordRestartAttempt(instance) |
411 |
self.messages.append(Message(ERROR, "Could not restart %s for %d" |
|
412 |
" times, giving up..." % |
|
413 |
(instance.name, MAXTRIES))) |
|
395 |
logging.error("Could not restart %s after %d attempts, giving up", |
|
396 |
instance.name, MAXTRIES) |
|
414 | 397 |
continue |
415 | 398 |
try: |
416 |
self.messages.append(Message(NOTICE, ("Restarting %s%s." %
|
|
417 |
(instance.name, last))))
|
|
399 |
logging.info("Restarting %s%s",
|
|
400 |
instance.name, last)
|
|
418 | 401 |
instance.Restart() |
419 | 402 |
self.started_instances.add(instance.name) |
420 |
except Error, x:
|
|
421 |
self.messages.append(Message(ERROR, str(x)))
|
|
403 |
except Error, err:
|
|
404 |
logging.error(str(err), exc_info=True)
|
|
422 | 405 |
|
423 | 406 |
notepad.RecordRestartAttempt(instance) |
424 | 407 |
elif instance.state in HELPLESS_STATES: |
... | ... | |
427 | 410 |
else: |
428 | 411 |
if notepad.NumberOfRestartAttempts(instance): |
429 | 412 |
notepad.RemoveInstance(instance) |
430 |
msg = Message(NOTICE, "Restart of %s succeeded." % instance.name) |
|
431 |
self.messages.append(msg) |
|
413 |
logging.info("Restart of %s succeeded", instance.name) |
|
432 | 414 |
|
433 | 415 |
def VerifyDisks(self): |
434 | 416 |
"""Run gnt-cluster verify-disks. |
... | ... | |
436 | 418 |
""" |
437 | 419 |
result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15']) |
438 | 420 |
if result.output: |
439 |
self.messages.append(Message(NOTICE, result.output)) |
|
440 |
|
|
441 |
def WriteReport(self, logfile): |
|
442 |
"""Log all messages to file. |
|
443 |
|
|
444 |
Args: |
|
445 |
logfile: file object open for writing (the log file) |
|
446 |
|
|
447 |
""" |
|
448 |
for msg in self.messages: |
|
449 |
print >> logfile, str(msg) |
|
421 |
logging.info(result.output) |
|
450 | 422 |
|
451 | 423 |
|
452 | 424 |
def ParseOptions(): |
... | ... | |
462 | 434 |
constants.RELEASE_VERSION) |
463 | 435 |
|
464 | 436 |
parser.add_option("-d", "--debug", dest="debug", |
465 |
help="Don't redirect messages to the log file",
|
|
437 |
help="Write all messages to stderr",
|
|
466 | 438 |
default=False, action="store_true") |
467 | 439 |
options, args = parser.parse_args() |
468 | 440 |
return options, args |
469 | 441 |
|
470 | 442 |
|
443 |
def SetupLogging(debug): |
|
444 |
"""Configures the logging module. |
|
445 |
|
|
446 |
""" |
|
447 |
formatter = logging.Formatter("%(asctime)s: %(message)s") |
|
448 |
|
|
449 |
logfile_handler = logging.FileHandler(constants.LOG_WATCHER) |
|
450 |
logfile_handler.setFormatter(formatter) |
|
451 |
logfile_handler.setLevel(logging.INFO) |
|
452 |
|
|
453 |
stderr_handler = logging.StreamHandler() |
|
454 |
stderr_handler.setFormatter(formatter) |
|
455 |
if debug: |
|
456 |
stderr_handler.setLevel(logging.NOTSET) |
|
457 |
else: |
|
458 |
stderr_handler.setLevel(logging.CRITICAL) |
|
459 |
|
|
460 |
root_logger = logging.getLogger("") |
|
461 |
root_logger.setLevel(logging.NOTSET) |
|
462 |
root_logger.addHandler(logfile_handler) |
|
463 |
root_logger.addHandler(stderr_handler) |
|
464 |
|
|
465 |
|
|
471 | 466 |
def main(): |
472 | 467 |
"""Main function. |
473 | 468 |
|
474 | 469 |
""" |
475 | 470 |
options, args = ParseOptions() |
476 | 471 |
|
477 |
if not options.debug: |
|
478 |
sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a') |
|
472 |
SetupLogging(options.debug) |
|
479 | 473 |
|
480 | 474 |
try: |
481 | 475 |
try: |
... | ... | |
484 | 478 |
# Just exit if there's no configuration |
485 | 479 |
sys.exit(constants.EXIT_SUCCESS) |
486 | 480 |
watcher.Run() |
487 |
watcher.WriteReport(sys.stdout) |
|
488 | 481 |
except NotMasterError: |
489 |
if options.debug: |
|
490 |
sys.stderr.write("Not master, exiting.\n") |
|
482 |
logging.debug("Not master, exiting") |
|
491 | 483 |
sys.exit(constants.EXIT_NOTMASTER) |
492 | 484 |
except errors.ResolverError, err: |
493 |
sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
|
|
485 |
logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
|
|
494 | 486 |
sys.exit(constants.EXIT_NODESETUP_ERROR) |
495 |
except Error, err: |
|
496 |
print err |
|
487 |
except Exception, err: |
|
488 |
logging.error(str(err), exc_info=True) |
|
489 |
sys.exit(constants.EXIT_FAILURE) |
|
497 | 490 |
|
498 | 491 |
|
499 | 492 |
if __name__ == '__main__': |
Also available in: Unified diff