34 |
34 |
import fcntl
|
35 |
35 |
import errno
|
36 |
36 |
import simplejson
|
|
37 |
import logging
|
37 |
38 |
from optparse import OptionParser
|
38 |
39 |
|
39 |
40 |
from ganeti import utils
|
... | ... | |
122 |
123 |
except Exception, msg:
|
123 |
124 |
# Ignore errors while loading the file and treat it as empty
|
124 |
125 |
self.data = {}
|
125 |
|
sys.stderr.write("Empty or invalid state file."
|
126 |
|
" Using defaults. Error message: %s\n" % msg)
|
|
126 |
logging.warning(("Empty or invalid state file. Using defaults."
|
|
127 |
" Error message: %s"), msg)
|
127 |
128 |
|
128 |
129 |
if "instance" not in self.data:
|
129 |
130 |
self.data["instance"] = {}
|
... | ... | |
315 |
316 |
return ids
|
316 |
317 |
|
317 |
318 |
|
318 |
|
class Message(object):
  """A single notice or error entry, stamped with its creation time.

  Attributes:
    level: the severity label handed to the constructor
    msg: the message text handed to the constructor
    when: value of time.time() at the moment the object was created

  """
  def __init__(self, level, msg):
    self.when = time.time()
    self.level = level
    self.msg = msg

  def __str__(self):
    # First line: "<level> <human readable timestamp>"; the message body
    # follows on the next line after being passed through Indent().
    header = " ".join((self.level, time.ctime(self.when)))
    return "\n".join((header, Indent(self.msg)))
|
329 |
|
|
330 |
|
|
331 |
319 |
class Watcher(object):
|
332 |
320 |
"""Encapsulate the logic for restarting erronously halted virtual machines.
|
333 |
321 |
|
... | ... | |
343 |
331 |
raise NotMasterError("This is not the master node")
|
344 |
332 |
self.instances = GetInstanceList()
|
345 |
333 |
self.bootids = GetNodeBootIDs()
|
346 |
|
self.messages = []
|
347 |
334 |
self.started_instances = set()
|
348 |
335 |
|
349 |
336 |
def Run(self):
|
... | ... | |
369 |
356 |
# secondary node.
|
370 |
357 |
for instance in GetInstanceList(with_secondaries=check_nodes):
|
371 |
358 |
if not instance.autostart:
|
372 |
|
self.messages.append(Message(NOTICE,
|
373 |
|
("Skipping disk activation for"
|
374 |
|
" non-autostart instance '%s'." %
|
375 |
|
instance.name)))
|
|
359 |
logging.info(("Skipping disk activation for non-autostart"
|
|
360 |
" instance %s"), instance.name)
|
376 |
361 |
continue
|
377 |
362 |
if instance.name in self.started_instances:
|
378 |
363 |
# we already tried to start the instance, which should have
|
379 |
364 |
# activated its drives (if they can be at all)
|
380 |
365 |
continue
|
381 |
366 |
try:
|
382 |
|
self.messages.append(Message(NOTICE, ("Activating disks for %s." %
|
383 |
|
instance.name)))
|
|
367 |
logging.info("Activating disks for instance %s", instance.name)
|
384 |
368 |
instance.ActivateDisks()
|
385 |
|
except Error, x:
|
386 |
|
self.messages.append(Message(ERROR, str(x)))
|
|
369 |
except Error, err:
|
|
370 |
logging.error(str(err), exc_info=True)
|
387 |
371 |
|
388 |
372 |
# Keep changed boot IDs
|
389 |
373 |
for name in check_nodes:
|
... | ... | |
408 |
392 |
last = " (Attempt #%d)" % (n + 1)
|
409 |
393 |
else:
|
410 |
394 |
notepad.RecordRestartAttempt(instance)
|
411 |
|
self.messages.append(Message(ERROR, "Could not restart %s for %d"
|
412 |
|
" times, giving up..." %
|
413 |
|
(instance.name, MAXTRIES)))
|
|
395 |
logging.error("Could not restart %s after %d attempts, giving up",
|
|
396 |
instance.name, MAXTRIES)
|
414 |
397 |
continue
|
415 |
398 |
try:
|
416 |
|
self.messages.append(Message(NOTICE, ("Restarting %s%s." %
|
417 |
|
(instance.name, last))))
|
|
399 |
logging.info("Restarting %s%s",
|
|
400 |
instance.name, last)
|
418 |
401 |
instance.Restart()
|
419 |
402 |
self.started_instances.add(instance.name)
|
420 |
|
except Error, x:
|
421 |
|
self.messages.append(Message(ERROR, str(x)))
|
|
403 |
except Error, err:
|
|
404 |
logging.error(str(err), exc_info=True)
|
422 |
405 |
|
423 |
406 |
notepad.RecordRestartAttempt(instance)
|
424 |
407 |
elif instance.state in HELPLESS_STATES:
|
... | ... | |
427 |
410 |
else:
|
428 |
411 |
if notepad.NumberOfRestartAttempts(instance):
|
429 |
412 |
notepad.RemoveInstance(instance)
|
430 |
|
msg = Message(NOTICE, "Restart of %s succeeded." % instance.name)
|
431 |
|
self.messages.append(msg)
|
|
413 |
logging.info("Restart of %s succeeded", instance.name)
|
432 |
414 |
|
433 |
415 |
def VerifyDisks(self):
|
434 |
416 |
"""Run gnt-cluster verify-disks.
|
... | ... | |
436 |
418 |
"""
|
437 |
419 |
result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
|
438 |
420 |
if result.output:
|
439 |
|
self.messages.append(Message(NOTICE, result.output))
|
440 |
|
|
441 |
|
def WriteReport(self, logfile):
  """Log all messages to file.

  Every entry of self.messages is converted with str() and written on a
  line of its own, in list order.

  Args:
    logfile: file object open for writing (the log file)

  """
  # Use write() instead of the "print >> logfile" chevron statement: the
  # chevron form exists only in Python 2 and silently redirects the output
  # to sys.stdout when it is handed None instead of a file object.
  for msg in self.messages:
    logfile.write(str(msg) + "\n")
|
|
421 |
logging.info(result.output)
|
450 |
422 |
|
451 |
423 |
|
452 |
424 |
def ParseOptions():
|
... | ... | |
462 |
434 |
constants.RELEASE_VERSION)
|
463 |
435 |
|
464 |
436 |
parser.add_option("-d", "--debug", dest="debug",
|
465 |
|
help="Don't redirect messages to the log file",
|
|
437 |
help="Write all messages to stderr",
|
466 |
438 |
default=False, action="store_true")
|
467 |
439 |
options, args = parser.parse_args()
|
468 |
440 |
return options, args
|
469 |
441 |
|
470 |
442 |
|
|
443 |
def SetupLogging(debug):
  """Configures the logging module.

  Two handlers sharing one "<time>: <message>" format are attached to the
  root logger:
    - a file handler on constants.LOG_WATCHER accepting INFO and above
    - a stream handler (logging's default stream) which accepts every
      record when debug is set and only CRITICAL records otherwise

  Args:
    debug: boolean, whether all messages should also reach the stream handler

  """
  fmt = logging.Formatter("%(asctime)s: %(message)s")

  to_file = logging.FileHandler(constants.LOG_WATCHER)
  to_file.setLevel(logging.INFO)
  to_file.setFormatter(fmt)

  if debug:
    stream_level = logging.NOTSET
  else:
    stream_level = logging.CRITICAL
  to_stream = logging.StreamHandler()
  to_stream.setLevel(stream_level)
  to_stream.setFormatter(fmt)

  # The root logger itself does not filter; each handler applies its own
  # threshold.
  root = logging.getLogger("")
  root.setLevel(logging.NOTSET)
  for handler in (to_file, to_stream):
    root.addHandler(handler)
|
|
464 |
|
|
465 |
|
471 |
466 |
def main():
|
472 |
467 |
"""Main function.
|
473 |
468 |
|
474 |
469 |
"""
|
475 |
470 |
options, args = ParseOptions()
|
476 |
471 |
|
477 |
|
if not options.debug:
|
478 |
|
sys.stderr = sys.stdout = open(constants.LOG_WATCHER, 'a')
|
|
472 |
SetupLogging(options.debug)
|
479 |
473 |
|
480 |
474 |
try:
|
481 |
475 |
try:
|
... | ... | |
484 |
478 |
# Just exit if there's no configuration
|
485 |
479 |
sys.exit(constants.EXIT_SUCCESS)
|
486 |
480 |
watcher.Run()
|
487 |
|
watcher.WriteReport(sys.stdout)
|
488 |
481 |
except NotMasterError:
|
489 |
|
if options.debug:
|
490 |
|
sys.stderr.write("Not master, exiting.\n")
|
|
482 |
logging.debug("Not master, exiting")
|
491 |
483 |
sys.exit(constants.EXIT_NOTMASTER)
|
492 |
484 |
except errors.ResolverError, err:
|
493 |
|
sys.stderr.write("Cannot resolve hostname '%s', exiting.\n" % err.args[0])
|
|
485 |
logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
|
494 |
486 |
sys.exit(constants.EXIT_NODESETUP_ERROR)
|
495 |
|
except Error, err:
|
496 |
|
print err
|
|
487 |
except Exception, err:
|
|
488 |
logging.error(str(err), exc_info=True)
|
|
489 |
sys.exit(constants.EXIT_FAILURE)
|
497 |
490 |
|
498 |
491 |
|
499 |
492 |
if __name__ == '__main__':
|