#
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import tempfile
import time

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root, adds the public key to the
  permitted hosts and adds the host key to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


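# Illustrative sketch (not part of the original module): the same
# generate-and-authorize flow as _InitSSHSetup, using only the standard
# library for readers without the ganeti helpers at hand; both paths
# below are hypothetical.
def _ExampleStandaloneKeygen(priv_key="/tmp/example_id_dsa",
                             auth_keys="/tmp/example_authorized_keys"):
  import subprocess
  # ssh-keygen refuses to overwrite an existing key file
  for name in (priv_key, priv_key + ".pub"):
    if os.path.exists(name):
      os.unlink(name)
  subprocess.check_call(["ssh-keygen", "-t", "dsa", "-f", priv_key,
                         "-q", "-N", ""])
  # append the new public key to the authorized_keys file
  afile = open(auth_keys, "a")
  try:
    afile.write(open(priv_key + ".pub").read())
  finally:
    afile.close()

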
def GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  """
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
  try:
    try:
      # Set permissions before writing key
      os.chmod(tmp_file_name, 0600)

      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                             "-days", str(validity), "-nodes", "-x509",
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
                             "-batch"])
      if result.failed:
        raise errors.OpExecError("Could not generate SSL certificate, command"
                                 " %s had exitcode %s and error message %s" %
                                 (result.cmd, result.exit_code, result.output))

      # Make read-only
      os.chmod(tmp_file_name, 0400)

      os.rename(tmp_file_name, file_name)
    finally:
      utils.RemoveFile(tmp_file_name)
  finally:
    os.close(fd)


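# Illustrative sketch (not part of the original module): one way to
# inspect a certificate produced by GenerateSelfSignedSslCert; the path
# is hypothetical, and `openssl x509` is standard openssl usage.
def _ExampleShowCert(cert_file="/tmp/example.pem"):
  result = utils.RunCmd(["openssl", "x509", "-in", cert_file,
                         "-noout", "-subject", "-enddate"])
  if result.failed:
    raise errors.OpExecError("Cannot read certificate: %s" % result.output)
  return result.output

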
def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data=utils.GenerateSecret(), mode=0400)


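# Illustrative sketch (not part of the original module): how such a key
# would typically be consumed, signing a message with the stdlib hmac
# module; the arguments are hypothetical.
def _ExampleSignWithHmacKey(key_file, message):
  import hmac
  import hashlib
  key = utils.ReadFile(key_file).strip()
  return hmac.new(key, message, hashlib.sha1).hexdigest()

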
def _InitGanetiServerSetup(master_name):
  """Set up the necessary configuration for the initial node daemon.

  This generates the cluster SSL certificate, the RAPI certificate and
  the HMAC key (the latter two only if missing), and then restarts the
  node daemon.

  """
  GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # Don't overwrite existing file
  if not os.path.exists(constants.RAPI_CERT_FILE):
    GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)

  if not os.path.exists(constants.HMAC_CLUSTER_KEY):
    GenerateHmacKey(constants.HMAC_CLUSTER_KEY)

  result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])

  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  # Wait for node daemon to become responsive
  end_time = time.time() + 10.0
  while True:
    result = rpc.RpcRunner.call_version([master_name])[master_name]
    if not result.fail_msg:
      break

    if time.time() > end_time:
      raise errors.OpExecError("Node daemon didn't answer queries within"
                               " 10 seconds")

    time.sleep(1)


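# Illustrative sketch (not part of the original module): the
# poll-until-deadline loop above, factored into a generic helper;
# `check_fn`, `timeout` and `interval` are assumed names.
def _ExampleWaitFor(check_fn, timeout=10.0, interval=1.0):
  end_time = time.time() + timeout
  while True:
    if check_fn():
      return True
    if time.time() > end_time:
      return False
    time.sleep(interval)

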
def InitCluster(cluster_name, mac_prefix,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None,
                nicparams=None, hvparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised")

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member")
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs)

  hostname = utils.HostInfo()

  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the private"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS))

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host."
                               " Aborting." % hostname.ip)

  clustername = utils.HostInfo(cluster_name)

  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.")

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given")
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.")

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" %
                                 (file_storage_dir, err))

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()))

  if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
          os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
    raise errors.OpPrereqError("Init.d script '%s' missing or not"
                               " executable." % constants.NODE_INITD_SCRIPT)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
  objects.NIC.CheckParameterSyntax(nicparams)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  if modify_ssh_setup:
    _InitSSHSetup()

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    ctime=now,
    mtime=now,
    uuid=utils.NewUUID(),
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter()
  ssh.WriteKnownHostsFile(cfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)

  # start the master ip
  # TODO: Review rpc call from bootstrap
  # TODO: Warn on failed start master
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)


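# Illustrative sketch (not part of the original module): a minimal
# InitCluster invocation; every value below is hypothetical, and the
# (empty) beparams/nicparams/hvparams dicts would in practice have to
# satisfy the parameter-type and syntax checks performed above.
def _ExampleInitCluster():
  InitCluster(cluster_name="cluster.example.com",
              mac_prefix="aa:00:00",
              master_netdev="eth0",
              file_storage_dir="/srv/ganeti/file-storage",
              candidate_pool_size=10,
              vg_name="xenvg",
              beparams={},
              nicparams={},
              hvparams={},
              enabled_hypervisors=list(constants.HYPER_TYPES))

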
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  nodes = {
    master_node_config.name: master_node_config,
    }

  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)


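# Illustrative sketch (not part of the original module): the rough shape
# of the data InitConfig serializes to disk, as a plain dict with
# placeholder strings standing in for the real serialized objects.
_EXAMPLE_CONFIG_SKELETON = {
  "version": "<constants.CONFIG_VERSION>",
  "cluster": "<objects.Cluster, via ToDict()>",
  "nodes": {"<master node name>": "<objects.Node, via ToDict()>"},
  "instances": {},
  "serial_no": 1,
  "ctime": "<creation timestamp>",
  "mtime": "<modification timestamp>",
}

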
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)
  result = rpc.RpcRunner.call_node_leave_cluster(master, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


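# Illustrative sketch (not part of the original module): the repeated
# "check fail_msg and log a warning" pattern above, factored into a
# small helper; `action` is an assumed argument name.
def _ExampleWarnOnFailure(result, action):
  msg = result.fail_msg
  if msg:
    logging.warning("Could not %s: %s", action, msg)
  return not msg

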
def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  sshrunner = ssh.SshRunner(cluster_name)

  noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)
  hmac_key = utils.ReadFile(constants.HMAC_CLUSTER_KEY)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below. HMAC keys are hexadecimal strings,
  # so the same restrictions apply.
  for content in (noded_cert, rapi_cert, hmac_key):
    if re.search('^!EOF\.', content, re.MULTILINE):
      raise errors.OpExecError("invalid SSL certificate or HMAC key")

  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"
  if not hmac_key.endswith("\n"):
    hmac_key += "\n"

  # set up the inter-node password and certificate, restart the node
  # daemon, and then connect with ssh to set the password and start
  # ganeti-noded; note that all the below variables are sanitized at
  # this point, either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s %s && "
               "%s restart" %
               (constants.SSL_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.HMAC_CLUSTER_KEY, hmac_key,
                constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.HMAC_CLUSTER_KEY,
                constants.NODE_INITD_SCRIPT))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))


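# Illustrative sketch (not part of the original module): building one
# quoted here-document of the kind assembled above; the delimiter check
# mirrors the '!EOF.' validation earlier in SetupNodeDaemon.
def _ExampleHereDoc(path, content, delimiter="!EOF."):
  if re.search("^" + re.escape(delimiter), content, re.MULTILINE):
    raise errors.OpExecError("content collides with here-doc delimiter")
  if not content.endswith("\n"):
    content += "\n"
  return "cat > '%s' << '%s'\n%s%s\n" % (path, delimiter, content, delimiter)

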
def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  the new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.")
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info, logging.error)

  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  return rcode


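# Illustrative sketch (not part of the original module): how a caller
# might run a failover and surface the outcome; purely hypothetical.
def _ExampleFailover():
  if MasterFailover(no_voting=False) != 0:
    logging.error("master failover completed with errors")

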
def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since, bugs aside, we use the same
  source of configuration information for both backend and bootstrap
  and will therefore always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = votes.items()
  # sort first on number of votes then on name, since we want None
  # sorted later if we have half of the nodes not responding, and
  # half voting all for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
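

# Illustrative sketch (not part of the original module): the
# sort-by-votes-then-name ordering used above, on hypothetical data.
def _ExampleVoteOrdering():
  votes = {"node1.example.com": 3, "node2.example.com": 3, None: 1}
  vote_list = votes.items()
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)
  # yields [('node2.example.com', 3), ('node1.example.com', 3), (None, 1)]
  return vote_list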