Revision d7cdb55d daemons/ganeti-masterd

b/daemons/ganeti-masterd
53 53
from ganeti import logger
54 54
from ganeti import workerpool
55 55
from ganeti import rpc
56
from ganeti import bootstrap
56 57

  
57 58

  
58 59
CLIENT_REQUEST_WORKERS = 16
......
374 375
  future we could collect the current node list from our (possibly
375 376
  obsolete) known nodes.
376 377

  
378
  In order to account for cold-start of all nodes, we retry for up to
379
  a minute until we get a real answer as the top-voted one. If the
380
  nodes are more out-of-sync, for now manual startup of the master
381
  should be attempted.
382

  
383
  Note that for a cluster with an even number of nodes, we need at least half
384
  of the nodes (besides ourselves) to vote for us. This creates a
385
  problem on two-node clusters, since in this case we require the
386
  other node to be up too to confirm our status.
387

  
377 388
  """
378 389
  myself = utils.HostInfo().name
379 390
  # temporary instantiation of a config writer, used only to get the node list
380 391
  cfg = config.ConfigWriter()
381 392
  node_list = cfg.GetNodeList()
382 393
  del cfg
383
  try:
384
    node_list.remove(myself)
385
  except KeyError:
386
    pass
387
  if not node_list:
388
    # either single node cluster, or a misconfiguration, but I won't
389
    # break any other node, so I can proceed
390
    return True
391
  results = rpc.RpcRunner.call_master_info(node_list)
392
  if not isinstance(results, dict):
393
    # this should not happen (unless internal error in rpc)
394
    logging.critical("Can't complete rpc call, aborting master startup")
395
    return False
396
  positive = negative = 0
397
  other_masters = {}
398
  for node in results:
399
    if not isinstance(results[node], (tuple, list)) or len(results[node]) < 3:
400
      logging.warning("Can't contact node %s", node)
394
  retries = 6
395
  while retries > 0:
396
    votes = bootstrap.GatherMasterVotes(node_list)
397
    if not votes:
398
      # empty node list, this is a one node cluster
399
      return True
400
    if votes[0][0] is None:
401
      retries -= 1
402
      time.sleep(10)
401 403
      continue
402
    master_node = results[node][2]
403
    if master_node == myself:
404
      positive += 1
405
    else:
406
      negative += 1
407
      if not master_node in other_masters:
408
        other_masters[master_node] = 0
409
      other_masters[master_node] += 1
410
  if positive <= negative:
411
    # bad!
404
    break
405
  if retries == 0:
406
      logging.critical("Cluster inconsistent, most of the nodes didn't answer"
407
                       " after multiple retries. Aborting startup")
408
      return False
409
  # here a real node is at the top of the list
410
  all_votes = sum(item[1] for item in votes)
411
  top_node, top_votes = votes[0]
412
  result = False
413
  if top_node != myself:
414
    logging.critical("It seems we are not the master (top-voted node"
415
                     " is %s)", top_node)
416
  elif top_votes < all_votes - top_votes:
412 417
    logging.critical("It seems we are not the master (%d votes for,"
413
                     " %d votes against)", positive, negative)
414
    if len(other_masters) > 1:
415
      logging.critical("The other nodes do not agree on a single master")
416
    elif other_masters:
417
      # TODO: resync my files from the master
418
      logging.critical("It seems the real master is %s",
419
                       other_masters.keys()[0])
420
    else:
421
      logging.critical("Can't contact any node for data, aborting startup")
422
    return False
423
  return True
418
                     " %d votes against)", top_votes, all_votes - top_votes)
419
  else:
420
    result = True
421

  
422
  return result
424 423

  
425 424

  
426 425
def main():

Also available in: Unified diff