lib/bootstrap.py @ 9cdea43f

#
#

# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2012 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import time
import tempfile

from ganeti.cmdlib import cluster
import ganeti.rpc.node as rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor
from ganeti.storage import drbd
from ganeti.storage import filestorage
from ganeti import netutils
from ganeti import luxi
from ganeti import jstore
from ganeti import pathutils
from ganeti import runtime


# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"

#: Number of seconds within which a daemon must become responsive
_DAEMON_READY_TIMEOUT = 10.0


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root, adds the public key to the
  permitted hosts and adds the host key to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
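

# Note (illustrative, not part of the original module): the ssh-keygen call
# above is roughly equivalent to running, as root:
#
#   ssh-keygen -t dsa -f ~/.ssh/id_dsa -q -N ""
#
# except that the exact key path is whatever ssh.GetUserFiles() returns.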


def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
                  backup=True)
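

# Example (illustrative only, path invented): generate a key and read it
# back; the file is written with mode 0400 and any pre-existing file is
# backed up first.
#
#   GenerateHmacKey("/tmp/example.hmac")
#   key = utils.ReadFile("/tmp/example.hmac").strip()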


# pylint: disable=R0913
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
                           certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path
  @type cds_file: string
  @param cds_file: optional override of the cluster domain secret file path

  """
  # pylint: disable=R0913
  # noded SSL certificate
  utils.GenerateNewSslCert(
    new_cluster_cert, nodecert_file, 1,
    "Generating new cluster certificate at %s" % nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  if rapi_cert_pem:
    # Assume rapi_cert_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  else:
    utils.GenerateNewSslCert(
      new_rapi_cert, rapicert_file, 1,
      "Generating new RAPI certificate at %s" % rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file, 1)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.io.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)


def _InitGanetiServerSetup(master_name):
  """Set up the necessary configuration for the initial node daemon.

  This generates the cluster's initial secrets (certificates and keys)
  and starts the node daemon.

  @type master_name: str
  @param master_name: Name of the master node

  """
  # Generate cluster secrets
  GenerateClusterCrypto(True, False, False, False, False)

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)


def _WaitForNodeDaemon(node_name):
  """Wait for the node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Pylint bug <http://www.logilab.org/ticket/35642>
    # pylint: disable=E1101
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))


def _WaitForMasterDaemon():
  """Wait for the master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)


def _WaitForSshDaemon(hostname, port, family):
  """Wait for the SSH daemon to become responsive.

  """
  hostip = netutils.GetHostname(name=hostname, family=family).ip

  def _CheckSshDaemon():
    if netutils.TcpPing(hostip, port, timeout=1.0, live_port_needed=True):
      logging.debug("SSH daemon on %s:%s (IP address %s) has become"
                    " responsive", hostname, port, hostip)
    else:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
                             " become responsive within %s seconds" %
                             (hostname, port, hostip, _DAEMON_READY_TIMEOUT))


def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check,
                    port, data):
  """Runs a command to configure something on a remote machine.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @type port: int
  @param port: The SSH port of the remote machine or None for the default
  @param data: JSON-serializable input data for the script (passed to stdin)

  """
  cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    cmd.append("--debug")

  if verbose:
    cmd.append("--verbose")

  if port is None:
    port = netutils.GetDaemonPort(constants.SSH)

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  srun = ssh.SshRunner(cluster_name,
                       ipv6=(family == netutils.IP6Address.family))
  scmd = srun.BuildCmd(node, constants.SSH_LOGIN_USER,
                       utils.ShellQuoteArgs(cmd),
                       batch=False, ask_key=ask_key, quiet=False,
                       strict_host_check=strict_host_check,
                       use_cluster_key=use_cluster_key,
                       port=port)

  tempfh = tempfile.TemporaryFile()
  try:
    tempfh.write(serializer.DumpJson(data))
    tempfh.seek(0)

    result = utils.RunCmd(scmd, interactive=True, input_fd=tempfh)
  finally:
    tempfh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

  _WaitForSshDaemon(node, port, family)
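

# Illustrative invocation (values invented): run a setup script on a remote
# node, feed it a JSON document on stdin, then wait for the node's SSH
# daemon to come back. See SetupNodeDaemon below for the real in-tree caller.
#
#   RunNodeSetupCmd("cluster.example.com", "node1.example.com",
#                   pathutils.NODE_DAEMON_SETUP, False, True,
#                   True, False, True, None, {"some": "payload"})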


def _InitFileStorageDir(file_storage_dir):
  """Initialize the file storage directory, if needed.

  @param file_storage_dir: the user-supplied value
  @return: either empty string (if file storage was disabled at build
      time) or the normalized path to the storage directory

  """
  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
                               " path" % file_storage_dir, errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)

  return file_storage_dir
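

# Behaviour sketch (illustrative paths): an absolute path is normalized and
# returned, a relative one is rejected before anything is created.
#
#   _InitFileStorageDir("/srv/ganeti/file-storage/")
#   # -> "/srv/ganeti/file-storage" (created with mode 0750 if missing)
#   _InitFileStorageDir("relative/path")  # raises errors.OpPrereqError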


def _PrepareFileBasedStorage(
    enabled_disk_templates, file_storage_dir,
    default_dir, file_disk_template,
    init_fn=_InitFileStorageDir, acceptance_fn=None):
  """Checks if a file-based storage type is enabled and inits the dir.

  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of enabled disk templates
  @type file_storage_dir: string
  @param file_storage_dir: the file storage directory
  @type default_dir: string
  @param default_dir: default file storage directory when C{file_storage_dir}
      is 'None'
  @type file_disk_template: string
  @param file_disk_template: a disk template whose storage type is 'ST_FILE' or
      'ST_SHARED_FILE'
  @rtype: string
  @return: the name of the actual file storage directory

  """
  assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
            constants.ST_FILE, constants.ST_SHARED_FILE
         ))

  if file_storage_dir is None:
    file_storage_dir = default_dir
  if not acceptance_fn:
    acceptance_fn = \
        lambda path: filestorage.CheckFileStoragePathAcceptance(
            path, exact_match_ok=True)

  cluster.CheckFileStoragePathVsEnabledDiskTemplates(
      logging.warning, file_storage_dir, enabled_disk_templates)

  file_storage_enabled = file_disk_template in enabled_disk_templates
  if file_storage_enabled:
    try:
      acceptance_fn(file_storage_dir)
    except errors.FileStoragePathError as e:
      raise errors.OpPrereqError(str(e))
    result_file_storage_dir = init_fn(file_storage_dir)
  else:
    result_file_storage_dir = file_storage_dir
  return result_file_storage_dir


def _PrepareFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_FILE_STORAGE_DIR, constants.DT_FILE,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareSharedFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if shared file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_SHARED_FILE_STORAGE_DIR, constants.DT_SHARED_FILE,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareGlusterStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if Gluster storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_GLUSTER_STORAGE_DIR, constants.DT_GLUSTER,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _InitCheckEnabledDiskTemplates(enabled_disk_templates):
  """Checks the sanity of the enabled disk templates.

  """
  if not enabled_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_disk_templates = \
    set(enabled_disk_templates) - constants.DISK_TEMPLATES
  if invalid_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list contains invalid"
                               " entries: %s" % invalid_disk_templates,
                               errors.ECODE_INVAL)


def _RestrictIpolicyToEnabledDiskTemplates(ipolicy, enabled_disk_templates):
  """Restricts the ipolicy's disk templates to the enabled ones.

  This function removes from the ipolicy's list of allowed disk templates
  those that are not enabled by the cluster.

  @type ipolicy: dict
  @param ipolicy: the instance policy
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: the list of cluster-wide enabled disk
    templates

  """
  assert constants.IPOLICY_DTS in ipolicy
  allowed_disk_templates = ipolicy[constants.IPOLICY_DTS]
  restricted_disk_templates = list(set(allowed_disk_templates)
                                   .intersection(set(enabled_disk_templates)))
  ipolicy[constants.IPOLICY_DTS] = restricted_disk_templates
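

# Worked example (template names invented for illustration): if the ipolicy
# allows ["drbd", "plain", "file"] but the cluster only enables
# ["plain", "diskless"], the intersection leaves just ["plain"]:
#
#   ipolicy = {constants.IPOLICY_DTS: ["drbd", "plain", "file"]}
#   _RestrictIpolicyToEnabledDiskTemplates(ipolicy, ["plain", "diskless"])
#   assert ipolicy[constants.IPOLICY_DTS] == ["plain"]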


def _InitCheckDrbdHelper(drbd_helper, drbd_enabled):
  """Checks the DRBD usermode helper.

  @type drbd_helper: string
  @param drbd_helper: name of the DRBD usermode helper that the system should
    use

  """
  if not drbd_enabled:
    return

  if drbd_helper is not None:
    try:
      curr_helper = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (disable drbd with --enabled-disk-templates"
                                 " if you are not using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)


def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, gluster_storage_dir,
                candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, default_iallocator_params=None,
                primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None, enabled_disk_templates=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of disk_templates to be used in this
    cluster

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  _InitCheckEnabledDiskTemplates(enabled_disk_templates)

  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version, errors.ECODE_INVAL)

  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version),
                               errors.ECODE_INVAL)

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, pathutils.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s" %
                                  (master_netmask, primary_ip_version),
                                 errors.ECODE_INVAL)
  else:
    master_netmask = ipcls.iplen

  if vg_name:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s" % vgstatus, errors.ECODE_INVAL)

  drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
  _InitCheckDrbdHelper(drbd_helper, drbd_enabled)

  logging.debug("Stopping daemons (if any are running)")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
  if result.failed:
    raise errors.OpExecError("Could not stop daemons, command %s"
                             " had exitcode %s and error '%s'" %
                             (result.cmd, result.exit_code, result.output))

  file_storage_dir = _PrepareFileStorage(enabled_disk_templates,
                                         file_storage_dir)
  shared_file_storage_dir = _PrepareSharedFileStorage(enabled_disk_templates,
                                                      shared_file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  if nicparams.get('mode', None) != constants.NIC_MODE_OVS:
    # Do not do this check if mode=openvswitch, since the openvswitch is not
    # created yet
    result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
    if result.failed:
      raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                                 (master_netdev,
                                  result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)
  _RestrictIpolicyToEnabledDiskTemplates(full_ipolicy, enabled_disk_templates)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure functional or write a wrapper
  # around it
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # diskparams is a mapping of disk-template->diskparams dict
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default METAVG value is equal to the VG name set at init time,
      # if provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verifying diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  rsa_sshkey = ""
  dsa_sshkey = ""
  if os.path.isfile(pathutils.SSH_HOST_RSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
    rsa_sshkey = sshline.split(" ")[1]
  if os.path.isfile(pathutils.SSH_HOST_DSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_DSA_PUB)
    dsa_sshkey = sshline.split(" ")[1]
  if not rsa_sshkey and not dsa_sshkey:
    raise errors.OpPrereqError("Failed to find SSH public keys",
                               errors.ECODE_ENVIRON)

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  else:
    # default to htools
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  # check if we have all the users we need
  try:
    runtime.GetEnts()
  except errors.ConfigurationError, err:
    raise errors.OpPrereqError("Required system user/group missing: %s" %
                               err, errors.ECODE_ENVIRON)

  candidate_certs = {}

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=rsa_sshkey,
    dsahostkeypub=dsa_sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    gluster_storage_dir=gluster_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    default_iallocator_params=default_iallocator_params,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    enabled_disk_templates=enabled_disk_templates,
    candidate_certs=candidate_certs,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()


def InitConfig(version, cluster_config, master_node_config,
               cfg_file=pathutils.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  uuid_generator = config.TemporaryReservationManager()
  cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                _INITCONF_ECID)
  master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                    _INITCONF_ECID)
  cluster_config.master_node = master_node_config.uuid
  nodes = {
    master_node_config.uuid: master_node_config,
    }
  default_nodegroup = objects.NodeGroup(
    uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
    name=constants.INITIAL_NODE_GROUP_NAME,
    members=[master_node_config.uuid],
    diskparams={},
    )
  nodegroups = {
    default_nodegroup.uuid: default_nodegroup,
    }
  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodegroups=nodegroups,
                                   nodes=nodes,
                                   instances={},
                                   networks={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)
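

# Shape of the written file (illustrative, field values abridged): the
# initial configuration holds exactly one node and one node group, linked by
# freshly generated UUIDs:
#
#   {
#     "version": ...,
#     "cluster": {..., "master_node": "<master-uuid>"},
#     "nodes": {"<master-uuid>": {...}},
#     "nodegroups": {"<group-uuid>": {"members": ["<master-uuid>"], ...}},
#     "instances": {},
#     "networks": {},
#     "serial_no": 1,
#   }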


def FinalizeClusterDestroy(master_uuid):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  runner = rpc.BootstrapRunner()

  master_name = cfg.GetNodeName(master_uuid)

  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = master_uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_name, master_params,
                                                 ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(master_name)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)

  result = runner.call_node_leave_cluster(master_name, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param opts: the parsed command line options
  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_port: the SSH port of the new node

  """
  data = {
    constants.NDS_CLUSTER_NAME: cluster_name,
    constants.NDS_NODE_DAEMON_CERTIFICATE:
      utils.ReadFile(pathutils.NODED_CERT_FILE),
    constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
    constants.NDS_START_NODE_DAEMON: True,
    }

  RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                  opts.debug, opts.verbose,
                  True, opts.ssh_key_check, opts.ssh_key_check,
                  ssh_port, data)

  _WaitForNodeDaemon(node)


def MasterFailover(no_voting=False):
  """Fail over the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and this node to become the
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_names = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ("\n".join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    vote_list = GatherMasterVotes(node_names)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  try:
    # instantiate a real config writer, as we now know we have the
    # configuration data
    cfg = config.ConfigWriter(accept_foreign=True)

    old_master_node = cfg.GetNodeInfoByName(old_master)
    if old_master_node is None:
      raise errors.OpPrereqError("Could not find old master node '%s' in"
                                 " cluster configuration." % old_master,
                                 errors.ECODE_NOENT)

    cluster_info = cfg.GetClusterInfo()
    new_master_node = cfg.GetNodeInfoByName(new_master)
    if new_master_node is None:
      raise errors.OpPrereqError("Could not find new master node '%s' in"
                                 " cluster configuration." % new_master,
                                 errors.ECODE_NOENT)

    cluster_info.master_node = new_master_node.uuid
    # this will also regenerate the ssconf files, since we updated the
    # cluster info
    cfg.Update(cluster_info, logging.error)
  except errors.ConfigurationError, err:
    logging.error("Error while trying to set the new master: %s",
                  str(err))
    return 1

  # if cfg.Update worked, then it means the old master daemon won't be
  # able to write its own config file anymore (we rely on locking in both
  # backend.UploadFile() and ConfigWriter._Write()); hence the next
  # step is to kill the old master

  logging.info("Stopping the master daemon on node %s", old_master)

  runner = rpc.BootstrapRunner()
  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = old_master_node.uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(old_master,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(old_master)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  logging.info("Checking master IP non-reachability...")

  master_ip = sstore.GetMasterIP()
  total_timeout = 30

  # Here we have a phase where no master should be running
  def _check_ip():
    if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
      raise utils.RetryAgain()

  try:
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout)
  except utils.RetryTimeout:
    logging.warning("The master IP is still reachable after %s seconds,"
                    " continuing but activating the master on the current"
                    " node will probably fail", total_timeout)

  if jstore.CheckDrainFlag():
    logging.info("Undraining job queue")
    jstore.SetDrainFlag(False)

  logging.info("Starting the master daemons on the new master")

  result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
                                                                no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  logging.info("Master failed over from %s to %s", old_master, new_master)
  return rcode


def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_names):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside),
  as we use the same source of configuration information for both
  backend and bootstrap, we'll always vote for ourselves.

  @type node_names: list
  @param node_names: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = netutils.Hostname.GetSysName()
  try:
    node_names.remove(myself)
  except ValueError:
    pass
  if not node_names:
    # no nodes left (possibly after removing myself)
    return []
  results = rpc.BootstrapRunner().call_master_node_name(node_names)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_names))]
  votes = {}
  for node_name in results:
    nres = results[node_name]
    msg = nres.fail_msg

    if msg:
      logging.warning("Error contacting node %s: %s", node_name, msg)
      node = None
    else:
      node = nres.payload

    if node not in votes:
      votes[node] = 1
    else:
      votes[node] += 1

  vote_list = votes.items()
  # sort first on number of votes then on name, since we want None
  # sorted later if half of the nodes do not respond and the other
  # half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
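

# Interpreting the result (illustrative): the list is sorted by descending
# vote count, so the presumed master is the first entry; a key of None
# stands for nodes that could not be contacted.
#
#   vote_list = GatherMasterVotes(["node1", "node2", "node3"])
#   # e.g. [("master.example.com", 2), (None, 1)]: two nodes agree on the
#   # master, one did not answer.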