#
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import tempfile
import time

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor

def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root and adds the public key to
  root's authorized keys file, allowing the master node to ssh to all
  cluster members (including itself).

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


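# For reference, ssh.GetUserFiles(constants.GANETI_RUNAS) returns the
# (private key, public key, authorized keys) paths for the given user;
# for root these would typically be (illustrative values, not taken from
# this module):
#
#   ("/root/.ssh/id_dsa", "/root/.ssh/id_dsa.pub",
#    "/root/.ssh/authorized_keys")

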
def GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  """
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
  try:
    try:
      # Set permissions before writing key
      os.chmod(tmp_file_name, 0600)

      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                             "-days", str(validity), "-nodes", "-x509",
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
                             "-batch"])
      if result.failed:
        raise errors.OpExecError("Could not generate SSL certificate, command"
                                 " %s had exitcode %s and error message %s" %
                                 (result.cmd, result.exit_code, result.output))

      # Make read-only
      os.chmod(tmp_file_name, 0400)

      os.rename(tmp_file_name, file_name)
    finally:
      utils.RemoveFile(tmp_file_name)
  finally:
    os.close(fd)


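# Example usage (sketch; the path is an assumption, not a module default):
#
#   GenerateSelfSignedSslCert("/var/lib/ganeti/server.pem", validity=365)
#
# Note that the private key and the certificate end up concatenated in
# the same file, since both -keyout and -out point at the same path.

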
def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400)


def _InitGanetiServerSetup(master_name):
  """Set up the necessary configuration for the initial node daemon.

  This generates the cluster SSL certificate, the RAPI certificate and
  the HMAC key (if missing), then starts the node daemon and waits for
  it to become responsive.

  """
  GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # Don't overwrite existing file
  if not os.path.exists(constants.RAPI_CERT_FILE):
    GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)

  if not os.path.exists(constants.HMAC_CLUSTER_KEY):
    GenerateHmacKey(constants.HMAC_CLUSTER_KEY)

  result = utils.RunCmd([constants.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  # Wait for node daemon to become responsive
  def _CheckNodeDaemon():
    result = rpc.RpcRunner.call_version([master_name])[master_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, 10.0)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon didn't answer queries within"
                             " 10 seconds")


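# A minimal sketch of the polling pattern used above, with hypothetical
# names: utils.Retry calls the function repeatedly until it stops
# raising RetryAgain or the timeout elapses.
#
#   def _Ping():
#     if not daemon_is_up():       # hypothetical check
#       raise utils.RetryAgain()
#
#   utils.Retry(_Ping, 1.0, 10.0)  # delay=1.0s, timeout=10s

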
def InitCluster(cluster_name, mac_prefix,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None,
                nicparams=None, hvparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  hostname = utils.GetHostInfo()

  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the loopback"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host. Aborting." %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = utils.GetHostInfo(cluster_name)

  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.",
                               errors.ECODE_NOTUNIQUE)

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given",
                                 errors.ECODE_INVAL)
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip, errors.ECODE_ENVIRON)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus,
                                 errors.ECODE_INVAL)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.", errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)

  # MAC prefixes are three hexadecimal octets
  if not re.match("^[0-9a-f]{2}:[0-9a-f]{2}:[0-9a-f]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()), errors.ECODE_INVAL)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
  objects.NIC.CheckParameterSyntax(nicparams)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # set up the inter-node certificates and HMAC key
  _InitGanetiServerSetup(hostname.name)

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  if modify_ssh_setup:
    _InitSSHSetup()

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    ctime=now,
    mtime=now,
    uuid=utils.NewUUID(),
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter()
  ssh.WriteKnownHostsFile(cfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)

  # start the master ip
  # TODO: Review rpc call from bootstrap
  # TODO: Warn on failed start master
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)


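# A sketch of an InitCluster call; all values are illustrative
# assumptions (the constants names are assumed to exist in this tree's
# constants module, and the defaults may need adjusting):
#
#   InitCluster("cluster.example.com", "aa:00:00",
#               master_netdev="eth0",
#               file_storage_dir="/srv/ganeti/file-storage",
#               candidate_pool_size=10,
#               vg_name="xenvg",
#               beparams=constants.BEC_DEFAULTS.copy(),
#               nicparams=constants.NICC_DEFAULTS.copy(),
#               hvparams={constants.HT_XEN_PVM:
#                         constants.HVC_DEFAULTS[constants.HT_XEN_PVM]},
#               enabled_hypervisors=[constants.HT_XEN_PVM])

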
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  nodes = {
    master_node_config.name: master_node_config,
    }

  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)


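# The resulting file holds a serialized dictionary mirroring the
# ConfigData constructor arguments above; a sketch of its top-level
# shape (node name illustrative):
#
#   {
#     "version": ...,
#     "cluster": {...},
#     "nodes": {"node1.example.com": {...}},
#     "instances": {},
#     "serial_no": 1,
#     "ctime": ..., "mtime": ...,
#   }

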
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)
  result = rpc.RpcRunner.call_node_leave_cluster(master, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  sshrunner = ssh.SshRunner(cluster_name)

  noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)
  hmac_key = utils.ReadFile(constants.HMAC_CLUSTER_KEY)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below. HMAC keys are hexadecimal strings,
  # so the same restrictions apply.
  for content in (noded_cert, rapi_cert, hmac_key):
    if re.search(r'^!EOF\.', content, re.MULTILINE):
      raise errors.OpExecError("invalid SSL certificate or HMAC key")

  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"
  if not hmac_key.endswith("\n"):
    hmac_key += "\n"

  # connect with ssh to write the inter-node certificates and HMAC key
  # and then start ganeti-noded; note that all the below variables are
  # sanitized at this point, either by being constants or by the checks
  # above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s %s && "
               "%s start %s" %
               (constants.SSL_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.HMAC_CLUSTER_KEY, hmac_key,
                constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.HMAC_CLUSTER_KEY,
                constants.DAEMON_UTIL, constants.NODED))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))


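# For a single file, the generated remote command looks roughly like
# this (paths illustrative; the real ones come from constants):
#
#   umask 077 &&
#   cat > '/var/lib/ganeti/server.pem' << '!EOF.' &&
#   <certificate contents>!EOF.
#   chmod 0400 /var/lib/ganeti/server.pem &&
#   daemon-util start ganeti-noded
#
# The quoted '!EOF.' delimiter prevents the shell from expanding
# anything inside the here-document, which is why the contents were
# checked for that marker above.

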
def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master while this node becomes the
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info, logging.error)

  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  return rcode


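# Typical usage (sketch): run on the node that should become the master.
#
#   rcode = MasterFailover()                # with voting among the nodes
#   rcode = MasterFailover(no_voting=True)  # skip voting; dangerous
#
# A non-zero return code means the master role could not be started on
# the new master and manual intervention is needed.

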
def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since, bugs aside, we use the same
  source of configuration information for both backend and bootstrap
  and would therefore always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after removing ourselves)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = votes.items()
  # sort first on number of votes, then on name, so that None (failed
  # nodes) sorts last even when half of the nodes did not respond and
  # the other half all voted for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
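

# Worked example (hypothetical values): querying three nodes where two
# report "node1" as master and one does not respond yields
#
#   votes = {"node1": 2, None: 1}
#   vote_list = [("node1", 2), (None, 1)]
#
# so vote_list[0][0] is the agreed-upon master, and a leading None means
# most of the queried nodes were unreachable.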