#
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import tempfile
import time

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root and adds the public key to
  root's authorized keys file.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


def GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  """
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
  try:
    try:
      # Set permissions before writing key
      os.chmod(tmp_file_name, 0600)

      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                             "-days", str(validity), "-nodes", "-x509",
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
                             "-batch"])
      if result.failed:
        raise errors.OpExecError("Could not generate SSL certificate, command"
                                 " %s had exitcode %s and error message %s" %
                                 (result.cmd, result.exit_code, result.output))

      # Make read-only
      os.chmod(tmp_file_name, 0400)

      os.rename(tmp_file_name, file_name)
    finally:
      utils.RemoveFile(tmp_file_name)
  finally:
    os.close(fd)
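
# Illustrative use of the above (a sketch; the path and validity below are
# assumptions, the real callers pass constants.SSL_CERT_FILE and friends and
# rely on the five-year default):
#
#   GenerateSelfSignedSslCert("/var/lib/ganeti/server.pem", validity=365)
#
# openssl writes the RSA key ("-keyout") and the certificate ("-out") into
# the same PEM file, which ends up owner-read-only (0400).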

def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data=utils.GenerateSecret(), mode=0400)
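
# Illustrative use (a sketch; the path is an assumption, the real caller
# passes constants.HMAC_CLUSTER_KEY):
#
#   GenerateHmacKey("/var/lib/ganeti/hmac.key")
#
# utils.GenerateSecret() yields a hexadecimal string (see the note in
# SetupNodeDaemon below), written with mode 0400.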

def _InitGanetiServerSetup():
  """Set up the necessary configuration for the initial node daemon.

  This generates the cluster SSL certificate, the RAPI certificate and
  the cluster HMAC key (the latter two only if they do not exist yet),
  and then restarts the node daemon.

  """
  GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # Don't overwrite existing file
  if not os.path.exists(constants.RAPI_CERT_FILE):
    GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)

  if not os.path.exists(constants.HMAC_CLUSTER_KEY):
    GenerateHmacKey(constants.HMAC_CLUSTER_KEY)

  result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])

  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))


def InitCluster(cluster_name, mac_prefix,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None,
                nicparams=None, hvparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised")

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member")
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs)

  hostname = utils.HostInfo()

  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the private"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS))

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host."
                               " Aborting." % hostname.ip)

  clustername = utils.HostInfo(cluster_name)

  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.")

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given")
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.")

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" %
                                 (file_storage_dir, err))

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()))

  if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
          os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
    raise errors.OpPrereqError("Init.d script '%s' missing or not"
                               " executable." % constants.NODE_INITD_SCRIPT)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
  objects.NIC.CheckParameterSyntax(nicparams)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # set up the inter-node password and certificate
  _InitGanetiServerSetup()

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  _InitSSHSetup()

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    ctime=now,
    mtime=now,
    uuid=utils.NewUUID(),
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter()
  ssh.WriteKnownHostsFile(cfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo())

  # start the master ip
  # TODO: Review rpc call from bootstrap
  # TODO: Warn on failed start master
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
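
# Illustrative invocation (a sketch only; every literal below is an
# assumption, and the beparams/nicparams/hvparams contents must satisfy the
# ForceDictType/CheckParameterSyntax checks above):
#
#   InitCluster("cluster.example.com", "aa:00:00", "eth0",
#               "/srv/ganeti/file-storage", 10,
#               vg_name="xenvg",
#               beparams=constants.BEC_DEFAULTS,
#               nicparams=constants.NICC_DEFAULTS,
#               hvparams={"xen-pvm": {}},
#               enabled_hypervisors=["xen-pvm"])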

def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  nodes = {
    master_node_config.name: master_node_config,
    }

  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)
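
# The file written above holds the serialized dict of an objects.ConfigData
# instance; schematically (values are illustrative, not real output):
#
#   {"version": ..., "serial_no": 1, "instances": {},
#    "nodes": {"node1.example.com": {...}}, "cluster": {...},
#    "ctime": ..., "mtime": ...}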

def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)
  result = rpc.RpcRunner.call_node_leave_cluster(master)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  sshrunner = ssh.SshRunner(cluster_name)

  noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)
  hmac_key = utils.ReadFile(constants.HMAC_CLUSTER_KEY)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below. HMAC keys are hexadecimal strings,
  # so the same restrictions apply.
  for content in (noded_cert, rapi_cert, hmac_key):
    if re.search(r'^!EOF\.', content, re.MULTILINE):
      raise errors.OpExecError("invalid SSL certificate or HMAC key")

  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"
  if not hmac_key.endswith("\n"):
    hmac_key += "\n"

  # copy the certificates and the HMAC key to the node over ssh and
  # restart the node daemon there; note that all the below variables
  # are sanitized at this point, either by being constants or by the
  # checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s %s && "
               "%s restart" %
               (constants.SSL_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.HMAC_CLUSTER_KEY, hmac_key,
                constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.HMAC_CLUSTER_KEY,
                constants.NODE_INITD_SCRIPT))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))
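
# The remote command built above expands to a shell snippet of this shape
# (paths depend on the constants; certificate data abbreviated):
#
#   umask 077 && cat > '/path/to/server.pem' << '!EOF.' &&
#   <noded certificate>!EOF.
#   ... (same for the RAPI certificate and the HMAC key) ...
#   chmod 0400 <the three files> && <node init.d script> restart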

def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and this node to become the
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.")
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info)

  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  return rcode
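
# Typical use (a sketch; this is what a master-failover style command would
# do on the node that should take over):
#
#   rcode = MasterFailover()               # requires the other nodes' votes
#   rcode = MasterFailover(no_voting=True) # skips voting; dangerous
#
# A non-zero return code means the master role could not be started on the
# new master.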

def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly, it's better
  to abstract it in bootstrap, where we use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since, bugs aside, we use the same
  source of configuration information for both backend and bootstrap,
  so we would always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after having removed ourselves)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = votes.items()
  # sort first on number of votes, then on name, since we want None
  # sorted last if half of the nodes are not responding and the other
  # half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
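
# For example (illustrative values), querying four other nodes of which two
# name node1 as master, one names node2, and one is unreachable:
#
#   GatherMasterVotes(node_list)
#   => [("node1.example.com", 2), ("node2.example.com", 1), (None, 1)]
#
# The first entry is taken as the agreed master; a leading None means most
# of the queried nodes could not be contacted.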