Statistics
| Branch: | Tag: | Revision:

root / lib / bootstrap.py @ 0e3baaf3

History | View | Annotate | Download (23.2 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Functions to bootstrap a new cluster.
23

24
"""
25

    
26
import os
27
import os.path
28
import re
29
import logging
30
import tempfile
31
import time
32

    
33
from ganeti import rpc
34
from ganeti import ssh
35
from ganeti import utils
36
from ganeti import errors
37
from ganeti import config
38
from ganeti import constants
39
from ganeti import objects
40
from ganeti import ssconf
41
from ganeti import serializer
42
from ganeti import hypervisor
43

    
44

    
45
def _InitSSHSetup():
  """Configure cluster-wide SSH access for the Ganeti user.

  Generates a fresh DSA keypair for the configured run-as user, backs
  up and removes any pre-existing key files, and authorizes the new
  public key for password-less logins between cluster nodes.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  # Preserve any existing key material before wiping it out
  for key_path in (priv_key, pub_key):
    if os.path.exists(key_path):
      utils.CreateBackup(key_path)
    utils.RemoveFile(key_path)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  # Allow the freshly generated key to log in as the Ganeti user
  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
67

    
68

    
69
def GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  The RSA private key and the certificate are written together into a
  single output file.  The file is replaced atomically: everything is
  written to a temporary file in the same directory first and then
  renamed into place.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  @raise errors.OpExecError: if the openssl invocation fails

  """
  # The temporary file lives in the target directory so the final
  # os.rename stays on the same filesystem (and is therefore atomic)
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
  try:
    try:
      # Set permissions before writing key
      os.chmod(tmp_file_name, 0600)

      # 1024-bit RSA key without passphrase (-nodes), self-signed
      # (-x509); both key and certificate go to the temporary file
      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                             "-days", str(validity), "-nodes", "-x509",
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
                             "-batch"])
      if result.failed:
        raise errors.OpExecError("Could not generate SSL certificate, command"
                                 " %s had exitcode %s and error message %s" %
                                 (result.cmd, result.exit_code, result.output))

      # Make read-only
      os.chmod(tmp_file_name, 0400)

      os.rename(tmp_file_name, file_name)
    finally:
      # No-op if the rename above already moved the file into place
      utils.RemoveFile(tmp_file_name)
  finally:
    os.close(fd)
101

    
102

    
103
def GenerateHmacKey(file_name):
104
  """Writes a new HMAC key.
105

106
  @type file_name: str
107
  @param file_name: Path to output file
108

109
  """
110
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
111
                  backup=True)
112

    
113

    
114
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_confd_hmac_key,
                          rapi_cert_pem=None,
                          nodecert_file=constants.NODED_CERT_FILE,
                          rapicert_file=constants.RAPI_CERT_FILE,
                          hmackey_file=constants.CONFD_HMAC_KEY):
  """Regenerate the cluster's certificates, keys and secrets as needed.

  Each artifact is (re)created when explicitly requested or when it is
  missing on disk; existing files are backed up before being replaced.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path

  """
  # noded SSL certificate
  have_node_cert = os.path.exists(nodecert_file)
  if new_cluster_cert or not have_node_cert:
    if have_node_cert:
      # keep a copy of the certificate we are about to replace
      utils.CreateBackup(nodecert_file)
    logging.debug("Generating new cluster certificate at %s", nodecert_file)
    GenerateSelfSignedSslCert(nodecert_file)

  # confd HMAC key (GenerateHmacKey does its own backup)
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  # RAPI certificate: an explicitly supplied PEM wins over generation
  have_rapi_cert = os.path.exists(rapicert_file)

  if rapi_cert_pem:
    # Assume rapi_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)
  elif new_rapi_cert or not have_rapi_cert:
    if have_rapi_cert:
      utils.CreateBackup(rapicert_file)
    logging.debug("Generating new RAPI certificate at %s", rapicert_file)
    GenerateSelfSignedSslCert(rapicert_file)
165

    
166

    
167
def _InitGanetiServerSetup(master_name):
  """Set up the node daemon for the initial (master) node.

  Regenerates the cluster SSL certificate, starts ganeti-noded via the
  daemon utility, and blocks until the daemon answers RPC queries.

  """
  # A fresh cluster certificate only; RAPI cert and HMAC key are kept
  GenerateClusterCrypto(True, False, False)

  start_cmd = [constants.DAEMON_UTIL, "start", constants.NODED]
  result = utils.RunCmd(start_cmd)
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)
184

    
185

    
186
def _WaitForNodeDaemon(node_name):
  """Block until the node daemon on the given node answers RPC.

  Polls the daemon's version RPC, retrying for up to ten seconds.

  """
  def _Ping():
    # any RPC failure means the daemon is not (yet) reachable
    reply = rpc.RpcRunner.call_version([node_name])[node_name]
    if reply.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_Ping, 1.0, 10.0)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " 10 seconds" % node_name)
200

    
201

    
202
def _InitFileStorage(file_storage_dir):
203
  """Initialize if needed the file storage.
204

205
  @param file_storage_dir: the user-supplied value
206
  @return: either empty string (if file storage was disabled at build
207
      time) or the normalized path to the storage directory
208

209
  """
210
  if not constants.ENABLE_FILE_STORAGE:
211
    return ""
212

    
213
  file_storage_dir = os.path.normpath(file_storage_dir)
214

    
215
  if not os.path.isabs(file_storage_dir):
216
    raise errors.OpPrereqError("The file storage directory you passed is"
217
                               " not an absolute path.", errors.ECODE_INVAL)
218

    
219
  if not os.path.exists(file_storage_dir):
220
    try:
221
      os.makedirs(file_storage_dir, 0750)
222
    except OSError, err:
223
      raise errors.OpPrereqError("Cannot create file storage directory"
224
                                 " '%s': %s" % (file_storage_dir, err),
225
                                 errors.ECODE_ENVIRON)
226

    
227
  if not os.path.isdir(file_storage_dir):
228
    raise errors.OpPrereqError("The file storage directory '%s' is not"
229
                               " a directory." % file_storage_dir,
230
                               errors.ECODE_ENVIRON)
231
  return file_storage_dir
232

    
233

    
234
def InitCluster(cluster_name, mac_prefix,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None,
                nicparams=None, hvparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False):
  """Initialise the cluster.

  Runs all prerequisite checks (hostname/IP sanity, cluster IP not in
  use, volume group size, mac prefix, master netdev), sets up the node
  daemon certificates and optionally SSH keys and /etc/hosts, writes
  the initial configuration and starts the master role on this node.

  @type cluster_name: str
  @param cluster_name: name of the new cluster
  @type mac_prefix: str
  @param mac_prefix: MAC prefix for instances, of the form "aa:bb:cc"
  @type master_netdev: str
  @param master_netdev: network device for the master IP
  @param file_storage_dir: user-supplied file storage directory
  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size
  @param secondary_ip: optional secondary IP of this node; defaults to
      the primary IP
  @param vg_name: LVM volume group name, or None to skip the LVM check
  @param beparams: cluster-level default backend parameters
  @param nicparams: cluster-level default NIC parameters
  @param hvparams: mapping of hypervisor name to its parameters
  @param enabled_hypervisors: non-empty list of enabled hypervisors
  @type modify_etc_hosts: bool
  @param modify_etc_hosts: whether to add this host to /etc/hosts
  @type modify_ssh_setup: bool
  @param modify_ssh_setup: whether to (re)generate the SSH keys
  @type maintain_node_health: bool
  @param maintain_node_health: whether to enable node health maintenance
  @raise errors.OpPrereqError: if any prerequisite check fails

  """
  # Refuse to re-initialise an existing cluster
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  # At least one known hypervisor must be enabled
  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  hostname = utils.GetHostInfo()

  # Our own name must resolve to a usable, locally-owned address
  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the private"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host. Aborting." %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = utils.GetHostInfo(utils.HostInfo.NormalizeName(cluster_name))

  # The cluster IP must not be in use yet
  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.",
                               errors.ECODE_NOTUNIQUE)

  # A given secondary IP must be valid and belong to this host;
  # otherwise fall back to the primary IP
  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given",
                                 errors.ECODE_INVAL)
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip, errors.ECODE_ENVIRON)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus,
                                 errors.ECODE_INVAL)

  file_storage_dir = _InitFileStorage(file_storage_dir)

  # Three colon-separated lowercase hex-like pairs, e.g. "aa:00:01"
  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  # The master network device must exist on this node
  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()), errors.ECODE_INVAL)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  # Normalize/validate the parameter dictionaries
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
  objects.NIC.CheckParameterSyntax(nicparams)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  # set up ssh config and /etc/hosts; the second field of the host key
  # line is the base64 key material
  sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  if modify_ssh_setup:
    _InitSSHSetup()

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    ctime=now,
    mtime=now,
    uuid=utils.NewUUID(),
    maintain_node_health=maintain_node_health,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter()
  ssh.WriteKnownHostsFile(cfg, constants.SSH_KNOWN_HOSTS_FILE)
  # re-save the config to distribute/regenerate derived files
  cfg.Update(cfg.GetClusterInfo(), logging.error)

  # start the master ip
  # TODO: Review rpc call from bootstrap
  # TODO: Warn on failed start master
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
383

    
384

    
385
def InitConfig(version, cluster_config, master_node_config,
386
               cfg_file=constants.CLUSTER_CONF_FILE):
387
  """Create the initial cluster configuration.
388

389
  It will contain the current node, which will also be the master
390
  node, and no instances.
391

392
  @type version: int
393
  @param version: configuration version
394
  @type cluster_config: L{objects.Cluster}
395
  @param cluster_config: cluster configuration
396
  @type master_node_config: L{objects.Node}
397
  @param master_node_config: master node configuration
398
  @type cfg_file: string
399
  @param cfg_file: configuration file path
400

401
  """
402
  nodes = {
403
    master_node_config.name: master_node_config,
404
    }
405

    
406
  now = time.time()
407
  config_data = objects.ConfigData(version=version,
408
                                   cluster=cluster_config,
409
                                   nodes=nodes,
410
                                   instances={},
411
                                   serial_no=1,
412
                                   ctime=now, mtime=now)
413
  utils.WriteFile(cfg_file,
414
                  data=serializer.Dump(config_data.ToDict()),
415
                  mode=0600)
416

    
417

    
418
def FinalizeClusterDestroy(master):
  """Perform the final steps of a cluster destroy operation.

  Stops the master role and asks the (former) master node to leave the
  cluster, completing the destroy begun in cmdlib.LUDestroyOpcode.
  Failures are logged rather than raised: this is best-effort cleanup.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup

  # first demote the node from the master role...
  stop_result = rpc.RpcRunner.call_node_stop_master(master, True)
  stop_msg = stop_result.fail_msg
  if stop_msg:
    logging.warning("Could not disable the master role: %s", stop_msg)

  # ...then stop the daemons and clean up (SSH keys only if we set
  # them up at init time)
  leave_result = rpc.RpcRunner.call_node_leave_cluster(master,
                                                       modify_ssh_setup)
  leave_msg = leave_result.fail_msg
  if leave_msg:
    logging.warning("Could not shutdown the node daemon and cleanup"
                    " the node: %s", leave_msg)
436

    
437

    
438
def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @type cluster_name: string
  @param cluster_name: the cluster name
  @type node: string
  @param node: the name of the new node
  @type ssh_key_check: bool
  @param ssh_key_check: whether to do a strict key check
  @raise errors.OpExecError: if a certificate or key looks invalid, or
      if the remote command fails

  """
  sshrunner = ssh.SshRunner(cluster_name)

  # Local secrets to be copied verbatim to the new node
  noded_cert = utils.ReadFile(constants.NODED_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)
  confd_hmac_key = utils.ReadFile(constants.CONFD_HMAC_KEY)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below. HMAC keys are hexadecimal strings,
  # so the same restrictions apply.
  for content in (noded_cert, rapi_cert, confd_hmac_key):
    if re.search('^!EOF\.', content, re.MULTILINE):
      raise errors.OpExecError("invalid SSL certificate or HMAC key")

  # Each here-document below relies on the payload ending in a newline
  # so that the '!EOF.' terminator sits on its own line
  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"
  if not confd_hmac_key.endswith("\n"):
    confd_hmac_key += "\n"

  # set up inter-node password and certificate and restarts the node daemon
  # and then connect with ssh to set password and start ganeti-noded
  # note that all the below variables are sanitized at this point,
  # either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s %s && "
               "%s start %s" %
               (constants.NODED_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.CONFD_HMAC_KEY, confd_hmac_key,
                constants.NODED_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.CONFD_HMAC_KEY,
                constants.DAEMON_UTIL, constants.NODED))

  # Run as root on the remote node; host-key checking is controlled by
  # the caller via ssh_key_check
  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))

  _WaitForNodeDaemon(node)
502

    
503

    
504
def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)
  @rtype: int
  @return: 0 on success, 1 if the new master role could not be started

  """
  sstore = ssconf.SimpleStore()

  # "new master" is always the node this command runs on
  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This commands must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  # Only master candidates may take over the master role
  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    # vote_list is sorted by vote count, so entry 0 is the consensus;
    # a None winner means most nodes did not answer at all
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  # Best-effort: a dead old master must not block the failover
  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                 " %s, please disable manually: %s", old_master, msg)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info, logging.error)

  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  return rcode
584

    
585

    
586
def GetMaster():
  """Return the name of the current master node.

  Kept here (rather than having callers import ssconf directly)
  because gnt-cluster needs it and bootstrap already wraps ssconf for
  its other functions.

  """
  master_name, _ = ssconf.GetMasterAndMyself(ssconf.SimpleStore())
  return master_name
600

    
601

    
602
def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  since we use the same source for configuration information for both
  backend and boostrap, we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]

  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    if msg:
      # unreachable/failing nodes are counted under the None key
      logging.warning("Error contacting node %s: %s", node, msg)
      votes[None] = votes.get(None, 0) + 1
      continue
    if not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      votes[None] = votes.get(None, 0) + 1
      continue
    # the third payload field is the node's view of the master name
    master_node = data[2]
    votes[master_node] = votes.get(master_node, 0) + 1

  # sort first on number of votes then on name, since we want None
  # sorted later if we have the half of the nodes not responding, and
  # half voting all for the same master
  return sorted(votes.items(), key=lambda x: (x[1], x[0]), reverse=True)