Statistics
| Branch: | Tag: | Revision:

root / lib / bootstrap.py @ f4a2f532

History | View | Annotate | Download (18.7 kB)

1
#
2
#
3

    
4
# Copyright (C) 2006, 2007, 2008 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Functions to bootstrap a new cluster.
23

24
"""
25

    
26
import os
27
import os.path
28
import re
29
import logging
30
import tempfile
31

    
32
from ganeti import rpc
33
from ganeti import ssh
34
from ganeti import utils
35
from ganeti import errors
36
from ganeti import config
37
from ganeti import constants
38
from ganeti import objects
39
from ganeti import ssconf
40
from ganeti import hypervisor
41

    
42

    
43
def _InitSSHSetup():
  """Configure the root SSH setup used inside the cluster.

  Generates a fresh DSA keypair for the cluster-run user, backing up
  and removing any pre-existing keys first, and appends the new public
  key to the authorized keys file.

  """
  (priv_key, pub_key, auth_keys) = ssh.GetUserFiles(constants.GANETI_RUNAS)

  # Preserve a backup of any existing key material, then remove it so
  # ssh-keygen can write fresh files without prompting
  for key_path in (priv_key, pub_key):
    if os.path.exists(key_path):
      utils.CreateBackup(key_path)
    utils.RemoveFile(key_path)

  keygen_cmd = ["ssh-keygen", "-t", "dsa",
                "-f", priv_key,
                "-q", "-N", ""]
  result = utils.RunCmd(keygen_cmd)
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  # Authorize the freshly generated public key for cluster logins
  pub_fd = open(pub_key, 'r')
  try:
    utils.AddAuthorizedKey(auth_keys, pub_fd.read(8192))
  finally:
    pub_fd.close()
69

    
70

    
71
def _GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
72
  """Generates a self-signed SSL certificate.
73

74
  @type file_name: str
75
  @param file_name: Path to output file
76
  @type validity: int
77
  @param validity: Validity for certificate in days
78

79
  """
80
  (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
81
  try:
82
    try:
83
      # Set permissions before writing key
84
      os.chmod(tmp_file_name, 0600)
85

    
86
      result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
87
                             "-days", str(validity), "-nodes", "-x509",
88
                             "-keyout", tmp_file_name, "-out", tmp_file_name,
89
                             "-batch"])
90
      if result.failed:
91
        raise errors.OpExecError("Could not generate SSL certificate, command"
92
                                 " %s had exitcode %s and error message %s" %
93
                                 (result.cmd, result.exit_code, result.output))
94

    
95
      # Make read-only
96
      os.chmod(tmp_file_name, 0400)
97

    
98
      os.rename(tmp_file_name, file_name)
99
    finally:
100
      utils.RemoveFile(tmp_file_name)
101
  finally:
102
    os.close(fd)
103

    
104

    
105
def _InitGanetiServerSetup():
  """Setup the necessary configuration for the initial node daemon.

  Regenerates the node SSL certificate, creates the RAPI certificate if
  one does not yet exist, and restarts the node daemon so it picks up
  the new material.

  """
  # The node certificate is always regenerated from scratch
  _GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # The RAPI certificate, by contrast, is kept if already present
  if not os.path.exists(constants.RAPI_CERT_FILE):
    _GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)

  restart_result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])
  if restart_result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (restart_result.cmd, restart_result.exit_code,
                              restart_result.output))
124

    
125

    
126
def InitCluster(cluster_name, mac_prefix,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None,
                nicparams=None, hvparams=None, enabled_hypervisors=None,
                default_hypervisor=None, modify_etc_hosts=True):
  """Initialise the cluster.

  Runs all prerequisite checks (name/IP resolution, cluster IP not yet
  active, volume group size, file storage directory, MAC prefix, master
  network device, node init script), then sets up certificates and SSH
  keys, writes the initial cluster configuration and starts the master
  IP on this node.

  @type cluster_name: str
  @param cluster_name: the desired cluster name (must resolve via DNS)
  @type mac_prefix: str
  @param mac_prefix: MAC address prefix in "xx:xx:xx" form
  @type master_netdev: str
  @param master_netdev: network device on which to bring up the master IP
  @type file_storage_dir: str
  @param file_storage_dir: absolute path for file-based instance storage
  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size
  @type secondary_ip: str or None
  @param secondary_ip: secondary IP of this node; defaults to the primary
  @type vg_name: str or None
  @param vg_name: LVM volume group name, or None to disable LVM storage
  @type modify_etc_hosts: bool
  @param modify_etc_hosts: whether to add this host to the hosts file
  @raise errors.OpPrereqError: if any prerequisite check fails
  @raise errors.OpExecError: if certificate/SSH setup fails

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised")

  hostname = utils.HostInfo()

  # A loopback address means DNS is misconfigured for this host
  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the private"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS))

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host."
                               " Aborting." % hostname.ip)

  clustername = utils.HostInfo(cluster_name)

  # The cluster IP must be free: if something answers on the node daemon
  # port, another cluster (or leftover daemon) is using it
  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.")

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary ip given")
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip)
  else:
    # Without an explicit secondary IP the primary is used for both roles
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.")

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" %
                                 (file_storage_dir, err))

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)

  # Validate the master network device by asking the kernel about it
  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()))

  if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
          os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
    raise errors.OpPrereqError("Init.d script '%s' missing or not"
                               " executable." % constants.NODE_INITD_SCRIPT)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  # Validate/coerce parameter dictionaries before persisting them
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
  objects.NIC.CheckParameterSyntax(nicparams)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # set up the inter-node password and certificate
  _InitGanetiServerSetup()

  # set up ssh config and /etc/hosts
  f = open(constants.SSH_HOST_RSA_PUB, 'r')
  try:
    sshline = f.read()
  finally:
    f.close()
  # second whitespace-separated field of the host key line is the key body
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name)

  _InitSSHSetup()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    default_hypervisor=default_hypervisor,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    )
  # the initiating node becomes the first (and only) master candidate
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    )

  sscfg = InitConfig(constants.CONFIG_VERSION,
                     cluster_config, master_node_config)
  ssh.WriteKnownHostsFile(sscfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg = config.ConfigWriter()
  # re-saving through the real config writer also regenerates the
  # ssconf files from the configuration written above
  cfg.Update(cfg.GetClusterInfo())

  # start the master ip
  # TODO: Review rpc call from bootstrap
  # TODO: Warn on failed start master
  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
279

    
280

    
281
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  @rtype: L{ssconf.SimpleConfigWriter}
  @return: initialized config instance

  """
  # the only node at this point is the master-to-be itself
  initial_nodes = {master_node_config.name: master_node_config}

  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=initial_nodes,
                                   instances={},
                                   serial_no=1)

  writer = ssconf.SimpleConfigWriter.FromDict(config_data.ToDict(), cfg_file)
  writer.Save()
  return writer
314

    
315

    
316
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  @param master: the name of the master node

  """
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  msg = result.RemoteFailMsg()
  if msg:
    # use lazy %-args (consistent with the warning below) instead of
    # eagerly interpolating the message string
    logging.warning("Could not disable the master role: %s", msg)
  result = rpc.RpcRunner.call_node_leave_cluster(master)
  msg = result.RemoteFailMsg()
  if msg:
    logging.warning("Could not shutdown the node daemon and cleanup"
                    " the node: %s", msg)
332

    
333

    
334
def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check
  @raise errors.OpExecError: if a certificate is not valid PEM or the
      remote command fails

  """
  sshrunner = ssh.SshRunner(cluster_name)

  noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
  rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)

  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below
  if (re.search('^!EOF\.', noded_cert, re.MULTILINE) or
      re.search('^!EOF\.', rapi_cert, re.MULTILINE)):
    raise errors.OpExecError("invalid PEM encoding in the SSL certificate")

  # each here-document below expects its terminator on its own line, so
  # make sure both certificates end with a newline
  if not noded_cert.endswith("\n"):
    noded_cert += "\n"
  if not rapi_cert.endswith("\n"):
    rapi_cert += "\n"

  # set up inter-node password and certificate and restarts the node daemon
  # and then connect with ssh to set password and start ganeti-noded
  # note that all the below variables are sanitized at this point,
  # either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n"
               "chmod 0400 %s %s && "
               "%s restart" %
               (constants.SSL_CERT_FILE, noded_cert,
                constants.RAPI_CERT_FILE, rapi_cert,
                constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
                constants.NODE_INITD_SCRIPT))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))
388

    
389

    
390
def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)
  @rtype: int
  @return: 0 on success, 1 if the new master role could not be started
  @raise errors.OpPrereqError: if this node is already the master, is
      not a master candidate, or the vote disagrees with our view

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This commands must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  # only master candidates hold a full configuration copy, so only they
  # are eligible to take over the master role
  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      # list is sorted by vote count, so the top entry is the consensus
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.")
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  # best-effort: a failed stop is logged but does not abort the failover
  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  msg = result.RemoteFailMsg()
  if msg:
    logging.error("Could not disable the master role on the old master"
                 " %s, please disable manually: %s", old_master, msg)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info)

  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
  msg = result.RemoteFailMsg()
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  return rcode
468

    
469

    
470
def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing directly ssconf, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  simple_store = ssconf.SimpleStore()
  (master_name, _) = ssconf.GetMasterAndMyself(simple_store)
  return master_name
484

    
485

    
486
def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  since we use the same source for configuration information for both
  backend and boostrap, we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.RemoteFailMsg()
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      # all failures are tallied under the None key
      votes[None] = votes.get(None, 0) + 1
      continue
    # data[2] is the master name as reported by the queried node
    master_node = data[2]
    votes[master_node] = votes.get(master_node, 0) + 1

  # sort first on number of votes then on name, since we want None
  # sorted later if we have the half of the nodes not responding, and
  # half voting all for the same master; no need to copy items()
  # through a list comprehension before sorting
  vote_list = sorted(votes.items(), key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list