lib/bootstrap.py @ bec0522b
#
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import tempfile

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import hypervisor


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a dsa keypair for root, adds the pub key to the
  permitted hosts and adds the hostkey to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  f = open(pub_key, 'r')
  try:
    utils.AddAuthorizedKey(auth_keys, f.read(8192))
  finally:
    f.close()
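
# For reference, the key generation above amounts roughly to the following
# shell steps for root (a sketch; the actual paths come from
# ssh.GetUserFiles and may differ):
#
#   ssh-keygen -t dsa -f /root/.ssh/id_dsa -q -N ""
#   cat /root/.ssh/id_dsa.pub >> /root/.ssh/authorized_keys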


def _GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  """
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
  try:
    try:
      # Set permissions before writing key
      os.chmod(tmp_file_name, 0600)

      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                             "-days", str(validity), "-nodes", "-x509",
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
                             "-batch"])
      if result.failed:
        raise errors.OpExecError("Could not generate SSL certificate, command"
                                 " %s had exitcode %s and error message %s" %
                                 (result.cmd, result.exit_code, result.output))

      # Make read-only
      os.chmod(tmp_file_name, 0400)

      os.rename(tmp_file_name, file_name)
    finally:
      utils.RemoveFile(tmp_file_name)
  finally:
    os.close(fd)
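
# A minimal usage sketch (hypothetical path); the resulting file contains
# both the key and the certificate and can be inspected with the stock
# openssl tool:
#
#   _GenerateSelfSignedSslCert("/var/lib/ganeti/server.pem")
#   # then: openssl x509 -in /var/lib/ganeti/server.pem -noout -text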


def _InitGanetiServerSetup():
  """Set up the necessary configuration for the initial node daemon.

  This generates the node and RAPI SSL certificates and restarts the
  node daemon so that it picks them up.

  """
  _GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # Don't overwrite existing file
  if not os.path.exists(constants.RAPI_CERT_FILE):
    _GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)

  result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])

  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))


def InitCluster(cluster_name, mac_prefix, def_bridge,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None, hvparams=None,
                enabled_hypervisors=None, default_hypervisor=None,
                modify_etc_hosts=True):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised")

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member")
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs)

  hostname = utils.HostInfo()

  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the loopback"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS))

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host."
                               " Aborting." % hostname.ip)

  clustername = utils.HostInfo(cluster_name)

  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.")

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given")
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.")

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" %
                                 (file_storage_dir, err))

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir)

  # a MAC prefix consists of three hex bytes, e.g. "aa:00:00"
  if not re.match("^[0-9a-f]{2}:[0-9a-f]{2}:[0-9a-f]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()))

  if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
          os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
    raise errors.OpPrereqError("Init.d script '%s' missing or not"
                               " executable." % constants.NODE_INITD_SCRIPT)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)
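
  # For illustration (hypothetical hypervisor name and parameter), the
  # expected shape of the mapping checked above is e.g.:
  #   hvparams = {"xen-pvm": {"kernel_path": "/boot/vmlinuz-2.6-xenU"}}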

  # set up the node daemon certificates
  _InitGanetiServerSetup()

  # set up ssh config and /etc/hosts
  f = open(constants.SSH_HOST_RSA_PUB, 'r')
  try:
    sshline = f.read()
  finally:
    f.close()
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  _InitSSHSetup()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    default_bridge=def_bridge,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    default_hypervisor=default_hypervisor,
    beparams={constants.BEGR_DEFAULT: beparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )

  sscfg = InitConfig(constants.CONFIG_VERSION,
                     cluster_config, master_node_config)
  ssh.WriteKnownHostsFile(sscfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg = config.ConfigWriter()
  cfg.Update(cfg.GetClusterInfo())

  # start the master ip
  # TODO: Review rpc call from bootstrap
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
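
# A minimal usage sketch (hypothetical values; in practice this is driven
# by "gnt-cluster init" rather than called directly):
#
#   InitCluster("cluster.example.com", "aa:00:00", "xen-br0", "eth0",
#               "/srv/ganeti/file-storage", candidate_pool_size=10,
#               vg_name="xenvg", beparams={}, hvparams={"xen-pvm": {}},
#               enabled_hypervisors=["xen-pvm"],
#               default_hypervisor="xen-pvm")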


def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  @rtype: L{ssconf.SimpleConfigWriter}
  @return: initialized config instance

  """
  nodes = {
    master_node_config.name: master_node_config,
    }

  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1)
  cfg = ssconf.SimpleConfigWriter.FromDict(config_data.ToDict(), cfg_file)
  cfg.Save()

  return cfg
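
# The configuration serialized above is a dict roughly of this shape
# (hypothetical excerpt):
#
#   {"version": constants.CONFIG_VERSION,
#    "cluster": {...},
#    "nodes": {"node1.example.com": {...}},
#    "instances": {},
#    "serial_no": 1}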


def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  if result.failed or not result.data:
    logging.warning("Could not disable the master role")
  result = rpc.RpcRunner.call_node_leave_cluster(master)
  if result.failed or not result.data:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node")


def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  sshrunner = ssh.SshRunner(cluster_name)

  noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below
  if (re.search(r'^!EOF\.', noded_cert, re.MULTILINE) or
      re.search(r'^!EOF\.', rapi_cert, re.MULTILINE)):
    raise errors.OpExecError("invalid PEM encoding in the SSL certificate")

  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"

  # connect with ssh to copy the certificates to the node and restart
  # ganeti-noded there; note that all the below variables are sanitized
  # at this point, either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s && "
               "%s restart" %
               (constants.SSL_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.NODE_INITD_SCRIPT))
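
  # For reference, the generated command expands to roughly the following
  # shell script (paths are illustrative):
  #
  #   umask 077 &&
  #   cat > '/var/lib/ganeti/server.pem' << '!EOF.' &&
  #   <noded certificate PEM>!EOF.
  #   cat > '/var/lib/ganeti/rapi.pem' << '!EOF.' &&
  #   <rapi certificate PEM>!EOF.
  #   chmod 0400 /var/lib/ganeti/server.pem /var/lib/ganeti/rapi.pem &&
  #   /etc/init.d/ganeti restart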

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))

    
390

    
391
def MasterFailover(no_voting=False):
392
  """Failover the master node.
393

394
  This checks that we are not already the master, and will cause the
395
  current master to cease being master, and the non-master to become
396
  new master.
397

398
  @type no_voting: boolean
399
  @param no_voting: force the operation without remote nodes agreement
400
                      (dangerous)
401

402
  """
403
  sstore = ssconf.SimpleStore()
404

    
405
  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
406
  node_list = sstore.GetNodeList()
407
  mc_list = sstore.GetMasterCandidates()
408

    
409
  if old_master == new_master:
410
    raise errors.OpPrereqError("This commands must be run on the node"
411
                               " where you want the new master to be."
412
                               " %s is already the master" %
413
                               old_master)
414

    
415
  if new_master not in mc_list:
416
    mc_no_master = [name for name in mc_list if name != old_master]
417
    raise errors.OpPrereqError("This node is not among the nodes marked"
418
                               " as master candidates. Only these nodes"
419
                               " can become masters. Current list of"
420
                               " master candidates is:\n"
421
                               "%s" % ('\n'.join(mc_no_master)))
422

    
423
  if not no_voting:
424
    vote_list = GatherMasterVotes(node_list)
425

    
426
    if vote_list:
427
      voted_master = vote_list[0][0]
428
      if voted_master is None:
429
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
430
                                   " not respond.")
431
      elif voted_master != old_master:
432
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
433
                                   " the master is %s but the other nodes"
434
                                   " voted %s. Please resync the configuration"
435
                                   " of this node." %
436
                                   (old_master, voted_master))
437
  # end checks
438

    
439
  rcode = 0
440

    
441
  logging.info("Setting master to %s, old master: %s", new_master, old_master)
442

    
443
  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
444
  if result.failed or not result.data:
445
    logging.error("Could not disable the master role on the old master"
446
                 " %s, please disable manually", old_master)
447

    
448
  # Here we have a phase where no master should be running
449

    
450
  # instantiate a real config writer, as we now know we have the
451
  # configuration data
452
  cfg = config.ConfigWriter()
453

    
454
  cluster_info = cfg.GetClusterInfo()
455
  cluster_info.master_node = new_master
456
  # this will also regenerate the ssconf files, since we updated the
457
  # cluster info
458
  cfg.Update(cluster_info)
459

    
460
  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
461
  if result.failed or not result.data:
462
    logging.error("Could not start the master role on the new master"
463
                  " %s, please check", new_master)
464
    rcode = 1
465

    
466
  return rcode
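
# Note: this is normally invoked via "gnt-cluster masterfailover" on the
# node that should take over; it returns 0 on success and 1 if the master
# role could not be started on the new master.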


def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly there, it's
  better to abstract it in bootstrap, where we already use ssconf in
  other functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside) we
  use the same source of configuration information for both backend
  and bootstrap, so we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.data
    if nres.failed or not isinstance(data, (tuple, list)) or len(data) < 3:
      # here the rpc layer should have already logged errors
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = votes.items()
  # sort first on number of votes then on name, since we want None
  # sorted later if half of the nodes do not respond and the other
  # half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
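
# Illustrative example (hypothetical node names): if node2 and node3 both
# report node1.example.com as their master while node4 fails to respond,
# GatherMasterVotes(["node2", "node3", "node4"]) would return
# [("node1.example.com", 2), (None, 1)].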