#
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import tempfile
import time

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root, adds the public key to the
  permitted hosts and adds the host key to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
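
# Usage sketch: _InitSSHSetup() takes no arguments and is called during
# cluster init (see InitCluster below). Assuming ssh.GetUserFiles resolves
# the Ganeti run-as user to its ~/.ssh files, the net effect is a fresh DSA
# keypair whose public half is appended to that user's authorized_keys.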


def GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  """
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
  try:
    try:
      # Set permissions before writing key
      os.chmod(tmp_file_name, 0600)

      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                             "-days", str(validity), "-nodes", "-x509",
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
                             "-batch"])
      if result.failed:
        raise errors.OpExecError("Could not generate SSL certificate, command"
                                 " %s had exitcode %s and error message %s" %
                                 (result.cmd, result.exit_code, result.output))

      # Make read-only
      os.chmod(tmp_file_name, 0400)

      os.rename(tmp_file_name, file_name)
    finally:
      utils.RemoveFile(tmp_file_name)
  finally:
    os.close(fd)
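
# Illustrative call (hypothetical path): GenerateSelfSignedSslCert("/tmp/x.pem",
# validity=30) leaves behind a single mode-0400 PEM file holding both the RSA
# key and the certificate, because -keyout and -out point at the same
# temporary file, which is then renamed into place.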


def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data=utils.GenerateSecret(), mode=0400)
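
# As the here-document note in SetupNodeDaemon below points out, HMAC keys
# are hexadecimal strings; utils.GenerateSecret() is therefore expected to
# produce a short hex token, stored here readable only by its owner.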


def _InitGanetiServerSetup():
  """Set up the necessary configuration for the initial node daemon.

  This generates the node's self-signed SSL certificate, plus the RAPI
  certificate and the cluster HMAC key where missing, and then restarts
  the node daemon so that it picks them up.

  """
  GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # Don't overwrite existing file
  if not os.path.exists(constants.RAPI_CERT_FILE):
    GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)

  if not os.path.exists(constants.HMAC_CLUSTER_KEY):
    GenerateHmacKey(constants.HMAC_CLUSTER_KEY)

  result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])

  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))


def InitCluster(cluster_name, mac_prefix,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None,
                nicparams=None, hvparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised")

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member")
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs)

  hostname = utils.HostInfo()

  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the private"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS))

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host."
                               " Aborting." % hostname.ip)

  clustername = utils.HostInfo(cluster_name)

  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.")

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given")
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.")

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" %
                                 (file_storage_dir, err))

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()))

  if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
          os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
    raise errors.OpPrereqError("Init.d script '%s' missing or not"
                               " executable." % constants.NODE_INITD_SCRIPT)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
  objects.NIC.CheckParameterSyntax(nicparams)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)
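
  # For example (illustrative values only), hvparams could be
  #   {"kvm": {"kernel_path": "/boot/vmlinuz-kvmU"}, "xen-pvm": {}}
  # i.e. one parameter dict per enabled hypervisor, each validated above.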

  # set up the inter-node password and certificate
  _InitGanetiServerSetup()

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  if modify_ssh_setup:
    _InitSSHSetup()

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    ctime=now,
    mtime=now,
    uuid=utils.NewUUID(),
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter()
  ssh.WriteKnownHostsFile(cfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo())

  # start the master ip
  # TODO: Review rpc call from bootstrap
  # TODO: Warn on failed start master
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
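
# A minimal invocation sketch (every value is illustrative, not a default):
#   InitCluster("cluster.example.com", "aa:00:00", "eth0",
#               "/srv/ganeti/file-storage", 10, vg_name="xenvg",
#               beparams={...}, nicparams={...}, hvparams={"kvm": {}},
#               enabled_hypervisors=["kvm"])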


def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  nodes = {
    master_node_config.name: master_node_config,
    }

  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)
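
# The serialized result is the dict form of ConfigData, i.e. a document with
# top-level entries for version, cluster, nodes, instances and serial_no
# (serializer.Dump is assumed here to emit JSON, its usual format in Ganeti).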


def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)
  result = rpc.RpcRunner.call_node_leave_cluster(master, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  sshrunner = ssh.SshRunner(cluster_name)

  noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)
  hmac_key = utils.ReadFile(constants.HMAC_CLUSTER_KEY)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below. HMAC keys are hexadecimal strings,
  # so the same restrictions apply.
  for content in (noded_cert, rapi_cert, hmac_key):
    if re.search(r'^!EOF\.', content, re.MULTILINE):
      raise errors.OpExecError("invalid SSL certificate or HMAC key")

  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"
  if not hmac_key.endswith("\n"):
    hmac_key += "\n"

  # copy the inter-node certificates and the HMAC key over ssh and restart
  # the node daemon so that ganeti-noded comes up with them; note that all
  # the variables below are sanitized at this point, either by being
  # constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s %s && "
               "%s restart" %
               (constants.SSL_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.HMAC_CLUSTER_KEY, hmac_key,
                constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.HMAC_CLUSTER_KEY,
                constants.NODE_INITD_SCRIPT))
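
  # For illustration, the rendered command is a shell snippet of this shape
  # (actual paths come from the constants module):
  #   umask 077 &&
  #   cat > '<ssl-cert-path>' << '!EOF.' &&
  #   <certificate data>!EOF.
  #   ... (repeated for the RAPI certificate and the HMAC key) ...
  #   chmod 0400 <the three files> && <node-initd-script> restart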

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))


def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the node on which the
  command is run to become the new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.")
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info)

  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  return rcode


def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster; instead of importing ssconf directly there, it's better
  to abstract it here in bootstrap, which already uses ssconf in other
  functions.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since, as we use the same
  configuration source for both backend and bootstrap, we will (bugs
  aside) always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = votes.items()
  # sort first on number of votes then on name, since we want None
  # sorted later if half of the nodes are not responding and the other
  # half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
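
# Illustrative outcome: queried from the failover candidate of a healthy
# five-node cluster, the four peers all report the same master, so the
# function returns [("master.example.com", 4)]; unreachable nodes would
# contribute a (None, n) entry instead.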