Revision f5116c87

b/daemons/ganeti-watcher
55 55

  
56 56

  
57 57
MAXTRIES = 5
58
# Delete any record that is older than 8 hours; this value is based on
59
# the fact that the current retry limit is 5 and the watcher runs every
60
# 5 minutes, so it takes around half an hour to exceed the retry
61
# limit; 8 hours (16 * 1/2h) therefore seems like a reasonable reset time
62
RETRY_EXPIRATION = 8 * 3600
58 63
BAD_STATES = ['ERROR_down']
59 64
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
60 65
NOTICE = 'NOTICE'
......
326 331

  
327 332
    return 0
328 333

  
334
  def MaintainInstanceList(self, instances):
    """Prune the recorded per-instance data.

    Two passes are made over the "instance" section of the stored data:
    records for instances that are not in C{instances} are dropped, and
    then records whose C{KEY_RESTART_WHEN} value lies more than
    C{RETRY_EXPIRATION} seconds in the past are dropped as well.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    records = self._data["instance"]

    # Pass 1: forget instances that no longer exist. The names are
    # collected into a separate set first, so deleting from the
    # dictionary while looping is safe.
    for name in set(records).difference(instances):
      logging.debug("Forgetting obsolete instance %s", name)
      del records[name]

    # Pass 2: drop records that are too old. Again the names are
    # gathered before any deletion takes place.
    cutoff = time.time() - RETRY_EXPIRATION
    stale = []
    for name in records:
      if records[name][KEY_RESTART_WHEN] < cutoff:
        stale.append(name)
    for name in stale:
      logging.debug("Expiring record for instance %s", name)
      del records[name]
355

  
329 356
  def RecordRestartAttempt(self, instance):
330 357
    """Record a restart attempt.
331 358

  
......
513 540
    """Make a pass over the list of instances, restarting downed ones.
514 541

  
515 542
    """
543
    notepad.MaintainInstanceList(self.instances.keys())
544

  
516 545
    for instance in self.instances.values():
517 546
      if instance.state in BAD_STATES:
518 547
        n = notepad.NumberOfRestartAttempts(instance)
519 548

  
520 549
        if n > MAXTRIES:
521
          # stay quiet.
550
          logging.warning("Not restarting instance %s, retries exhausted",
551
                          instance.name)
522 552
          continue
523 553
        elif n < MAXTRIES:
524 554
          last = " (Attempt #%d)" % (n + 1)

Also available in: Unified diff