Revision f5116c87 daemons/ganeti-watcher
b/daemons/ganeti-watcher | ||
---|---|---|
55 | 55 |
|
56 | 56 |
|
57 | 57 |
MAXTRIES = 5 |
58 |
# Delete any record that is older than 8 hours; this value is based on |
|
59 |
# the fact that the current retry counter is 5, and watcher runs every |
|
60 |
# 5 minutes, so it takes around half an hour to exceed the retry |
|
61 |
# counter, so 8 hours (16*1/2h) seems like a reasonable reset time |
|
62 |
RETRY_EXPIRATION = 8 * 3600 |
|
58 | 63 |
BAD_STATES = ['ERROR_down'] |
59 | 64 |
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline'] |
60 | 65 |
NOTICE = 'NOTICE' |
... | ... | |
326 | 331 |
|
327 | 332 |
return 0 |
328 | 333 |
|
334 |
def MaintainInstanceList(self, instances):
    """Prune stale entries from the recorded instance data.

    The mapping stored under C{self._data["instance"]} is modified in
    place in two passes: first every record whose key is not listed in
    C{instances} is removed, then every remaining record whose
    C{KEY_RESTART_WHEN} value is smaller than the current time minus
    C{RETRY_EXPIRATION} is removed as well.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    records = self._data["instance"]

    # Pass 1: forget records that belong to instances which are no longer
    # reported as existing.
    for name in set(records).difference(instances):
        logging.debug("Forgetting obsolete instance %s", name)
        del records[name]

    # Pass 2: expire old records. The names are collected first so that
    # the mapping is not modified while it is being iterated over.
    cutoff = time.time() - RETRY_EXPIRATION
    expired = []
    for name in records:
        if records[name][KEY_RESTART_WHEN] < cutoff:
            expired.append(name)

    for name in expired:
        logging.debug("Expiring record for instance %s", name)
        del records[name]
|
355 |
|
|
329 | 356 |
def RecordRestartAttempt(self, instance): |
330 | 357 |
"""Record a restart attempt. |
331 | 358 |
|
... | ... | |
513 | 540 |
"""Make a pass over the list of instances, restarting downed ones. |
514 | 541 |
|
515 | 542 |
""" |
543 |
notepad.MaintainInstanceList(self.instances.keys()) |
|
544 |
|
|
516 | 545 |
for instance in self.instances.values(): |
517 | 546 |
if instance.state in BAD_STATES: |
518 | 547 |
n = notepad.NumberOfRestartAttempts(instance) |
519 | 548 |
|
520 | 549 |
if n > MAXTRIES: |
521 |
# stay quiet. |
|
550 |
logging.warning("Not restarting instance %s, retries exhausted", |
|
551 |
instance.name) |
|
522 | 552 |
continue |
523 | 553 |
elif n < MAXTRIES: |
524 | 554 |
last = " (Attempt #%d)" % (n + 1) |
Also available in: Unified diff