#
#

# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2012 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import time
import tempfile

from ganeti.cmdlib import cluster
import ganeti.rpc.node as rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor
from ganeti.storage import drbd
from ganeti.storage import filestorage
from ganeti import netutils
from ganeti import luxi
from ganeti import jstore
from ganeti import pathutils


# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"

#: Timeout (in seconds) within which a daemon must become responsive
_DAEMON_READY_TIMEOUT = 10.0


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a dsa keypair for root and adds the public key to
  root's list of authorized keys, permitting password-less logins.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
                  backup=True)


# pylint: disable=R0913
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
                           certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path

  """
  # noded SSL certificate
  utils.GenerateNewSslCert(
    new_cluster_cert, nodecert_file,
    "Generating new cluster certificate at %s" % nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  if rapi_cert_pem:
    # Assume rapi_cert_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  else:
    utils.GenerateNewSslCert(
      new_rapi_cert, rapicert_file,
      "Generating new RAPI certificate at %s" % rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)


def _InitGanetiServerSetup(master_name):
  """Set up the necessary configuration for the initial node daemon.

  This generates the cluster's initial certificates and secrets and
  starts the node daemon on the master node.

  @type master_name: str
  @param master_name: Name of the master node

  """
  # Generate cluster secrets
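  # Only a new node daemon certificate is forced here; the positional
  # flags are new_cluster_cert, new_rapi_cert, new_spice_cert,
  # new_confd_hmac_key and new_cds. The other secrets are still created
  # by GenerateClusterCrypto if their files are missing.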
  GenerateClusterCrypto(True, False, False, False, False)

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)


def _WaitForNodeDaemon(node_name):
  """Wait for node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Pylint bug <http://www.logilab.org/ticket/35642>
    # pylint: disable=E1101
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
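    # poll the daemon's version RPC once per second until it answers or
    # _DAEMON_READY_TIMEOUT expires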
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))


def _WaitForMasterDaemon():
  """Wait for master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)


def _WaitForSshDaemon(hostname, port, family):
  """Wait for SSH daemon to become responsive.

  """
  hostip = netutils.GetHostname(name=hostname, family=family).ip

  def _CheckSshDaemon():
    if netutils.TcpPing(hostip, port, timeout=1.0, live_port_needed=True):
      logging.debug("SSH daemon on %s:%s (IP address %s) has become"
                    " responsive", hostname, port, hostip)
    else:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
                             " become responsive within %s seconds" %
                             (hostname, port, hostip, _DAEMON_READY_TIMEOUT))


def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check,
                    port, data):
  """Runs a command to configure something on a remote machine.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @type port: int
  @param port: The SSH port of the remote machine or None for the default
  @param data: JSON-serializable input data for the script (passed to stdin)

  """
  cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    cmd.append("--debug")

  if verbose:
    cmd.append("--verbose")

  if port is None:
    port = netutils.GetDaemonPort(constants.SSH)

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  srun = ssh.SshRunner(cluster_name,
                       ipv6=(family == netutils.IP6Address.family))
  scmd = srun.BuildCmd(node, constants.SSH_LOGIN_USER,
                       utils.ShellQuoteArgs(cmd),
                       batch=False, ask_key=ask_key, quiet=False,
                       strict_host_check=strict_host_check,
                       use_cluster_key=use_cluster_key,
                       port=port)

  tempfh = tempfile.TemporaryFile()
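  # Stage the JSON payload in a temporary file so that RunCmd can pass the
  # remote script a real stdin file descriptor via input_fd below.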
  try:
    tempfh.write(serializer.DumpJson(data))
    tempfh.seek(0)

    result = utils.RunCmd(scmd, interactive=True, input_fd=tempfh)
  finally:
    tempfh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

  _WaitForSshDaemon(node, port, family)


def _InitFileStorageDir(file_storage_dir):
  """Initialize the file storage directory, if needed.

  @param file_storage_dir: the user-supplied value
  @return: either empty string (if file storage was disabled at build
      time) or the normalized path to the storage directory

  """
  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
                               " path" % file_storage_dir, errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)

  return file_storage_dir


def _PrepareFileBasedStorage(
    enabled_disk_templates, file_storage_dir,
    default_dir, file_disk_template,
    init_fn=_InitFileStorageDir, acceptance_fn=None):
  """Checks if a file-based storage type is enabled and initializes the dir.

  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of enabled disk templates
  @type file_storage_dir: string
  @param file_storage_dir: the file storage directory
  @type default_dir: string
  @param default_dir: default file storage directory when C{file_storage_dir}
      is 'None'
  @type file_disk_template: string
  @param file_disk_template: a disk template whose storage type is 'ST_FILE' or
      'ST_SHARED_FILE'
  @rtype: string
  @return: the name of the actual file storage directory

  """
  assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
            constants.ST_FILE, constants.ST_SHARED_FILE
         ))

  if file_storage_dir is None:
    file_storage_dir = default_dir
  if not acceptance_fn:
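    # default acceptance check: the path must be an allowed file storage
    # path (exact_match_ok=True also accepts the whitelisted path itself,
    # not only paths below it)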
    acceptance_fn = \
        lambda path: filestorage.CheckFileStoragePathAcceptance(
            path, exact_match_ok=True)

  cluster.CheckFileStoragePathVsEnabledDiskTemplates(
      logging.warning, file_storage_dir, enabled_disk_templates)

  file_storage_enabled = file_disk_template in enabled_disk_templates
  if file_storage_enabled:
    try:
      acceptance_fn(file_storage_dir)
    except errors.FileStoragePathError as e:
      raise errors.OpPrereqError(str(e))
    result_file_storage_dir = init_fn(file_storage_dir)
  else:
    result_file_storage_dir = file_storage_dir
  return result_file_storage_dir


def _PrepareFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_FILE_STORAGE_DIR, constants.DT_FILE,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareSharedFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if shared file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_SHARED_FILE_STORAGE_DIR, constants.DT_SHARED_FILE,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareGlusterStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if gluster storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_GLUSTER_STORAGE_DIR, constants.DT_GLUSTER,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _InitCheckEnabledDiskTemplates(enabled_disk_templates):
  """Checks the sanity of the enabled disk templates.

  """
  if not enabled_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_disk_templates = \
    set(enabled_disk_templates) - constants.DISK_TEMPLATES
  if invalid_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list contains invalid"
                               " entries: %s" % invalid_disk_templates,
                               errors.ECODE_INVAL)


def _RestrictIpolicyToEnabledDiskTemplates(ipolicy, enabled_disk_templates):
  """Restricts the ipolicy's disk templates to the enabled ones.

  This function removes from the ipolicy's list of allowed disk
  templates those that are not enabled by the cluster.

  @type ipolicy: dict
  @param ipolicy: the instance policy
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: the list of cluster-wide enabled disk
    templates

  """
  assert constants.IPOLICY_DTS in ipolicy
  allowed_disk_templates = ipolicy[constants.IPOLICY_DTS]
  restricted_disk_templates = list(set(allowed_disk_templates)
                                   .intersection(set(enabled_disk_templates)))
  ipolicy[constants.IPOLICY_DTS] = restricted_disk_templates


def _InitCheckDrbdHelper(drbd_helper, drbd_enabled):
  """Checks the DRBD usermode helper.

  @type drbd_helper: string
  @param drbd_helper: name of the DRBD usermode helper that the system should
    use

  """
  if not drbd_enabled:
    return

  if drbd_helper is not None:
    try:
      curr_helper = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (disable drbd with --enabled-disk-templates"
                                 " if you are not using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)


def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, gluster_storage_dir,
                candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, default_iallocator_params=None,
                primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None, enabled_disk_templates=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of disk_templates to be used in this
    cluster

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  _InitCheckEnabledDiskTemplates(enabled_disk_templates)

  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version, errors.ECODE_INVAL)

  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version),
                               errors.ECODE_INVAL)

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, pathutils.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s" %
                                 (master_netmask, primary_ip_version),
                                 errors.ECODE_INVAL)
  else:
    master_netmask = ipcls.iplen

  if vg_name:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s" % vgstatus, errors.ECODE_INVAL)

  drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
  _InitCheckDrbdHelper(drbd_helper, drbd_enabled)

  logging.debug("Stopping daemons (if any are running)")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
  if result.failed:
    raise errors.OpExecError("Could not stop daemons, command %s"
                             " had exitcode %s and error '%s'" %
                             (result.cmd, result.exit_code, result.output))

  file_storage_dir = _PrepareFileStorage(enabled_disk_templates,
                                         file_storage_dir)
  shared_file_storage_dir = _PrepareSharedFileStorage(enabled_disk_templates,
                                                      shared_file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
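    # expected format: three colon-separated two-character groups,
    # e.g. "aa:00:00"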
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  if not nicparams.get('mode', None) == constants.NIC_MODE_OVS:
    # Do not do this check if mode=openvswitch, since the openvswitch is not
    # created yet
    result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
    if result.failed:
      raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                                 (master_netdev,
                                  result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)
  _RestrictIpolicyToEnabledDiskTemplates(full_ipolicy, enabled_disk_templates)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure functional or write a wrapper
  # around it
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # diskparams is a mapping of disk-template->diskparams dict
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default METAVG value is equal to the VG name set at init time,
      # if provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verifying diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  rsa_sshkey = ""
  dsa_sshkey = ""
  if os.path.isfile(pathutils.SSH_HOST_RSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
    rsa_sshkey = sshline.split(" ")[1]
  if os.path.isfile(pathutils.SSH_HOST_DSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_DSA_PUB)
    dsa_sshkey = sshline.split(" ")[1]
  if not rsa_sshkey and not dsa_sshkey:
    raise errors.OpPrereqError("Failed to find SSH public keys",
                               errors.ECODE_ENVIRON)

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  elif constants.HTOOLS:
    # htools was enabled at build-time, we default to it
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  candidate_certs = {}

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=rsa_sshkey,
    dsahostkeypub=dsa_sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    gluster_storage_dir=gluster_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    default_iallocator_params=default_iallocator_params,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    enabled_disk_templates=enabled_disk_templates,
    candidate_certs=candidate_certs,
    osparams={},
    osparams_private_cluster={}
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()


def InitConfig(version, cluster_config, master_node_config,
               cfg_file=pathutils.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  uuid_generator = config.TemporaryReservationManager()
  cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                _INITCONF_ECID)
  master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                    _INITCONF_ECID)
  cluster_config.master_node = master_node_config.uuid
  nodes = {
    master_node_config.uuid: master_node_config,
    }
  default_nodegroup = objects.NodeGroup(
    uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
    name=constants.INITIAL_NODE_GROUP_NAME,
    members=[master_node_config.uuid],
    diskparams={},
    )
  nodegroups = {
    default_nodegroup.uuid: default_nodegroup,
    }
  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodegroups=nodegroups,
                                   nodes=nodes,
                                   instances={},
                                   networks={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
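  # serializer.Dump emits the configuration in Ganeti's serialized form
  # (JSON); mode 0600 keeps the file readable and writable by its owner only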
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)


def FinalizeClusterDestroy(master_uuid):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  runner = rpc.BootstrapRunner()

  master_name = cfg.GetNodeName(master_uuid)

  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = master_uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_name, master_params,
                                                 ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(master_name)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)

  result = runner.call_node_leave_cluster(master_name, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_port: the SSH port of the new node

  """
  data = {
    constants.NDS_CLUSTER_NAME: cluster_name,
    constants.NDS_NODE_DAEMON_CERTIFICATE:
      utils.ReadFile(pathutils.NODED_CERT_FILE),
    constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
    constants.NDS_START_NODE_DAEMON: True,
    }

  RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                  opts.debug, opts.verbose,
                  True, opts.ssh_key_check, opts.ssh_key_check,
                  ssh_port, data)

  _WaitForNodeDaemon(node)


def MasterFailover(no_voting=False):
  """Fail over the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master while this node becomes the
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes' agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_names = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ("\n".join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    vote_list = GatherMasterVotes(node_names)
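    # vote_list is sorted by vote count, so vote_list[0][0] is the name
    # most nodes believe to be the master; None stands for nodes that
    # could not be contacted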

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  try:
    # instantiate a real config writer, as we now know we have the
    # configuration data
    cfg = config.ConfigWriter(accept_foreign=True)

    old_master_node = cfg.GetNodeInfoByName(old_master)
    if old_master_node is None:
      raise errors.OpPrereqError("Could not find old master node '%s' in"
                                 " cluster configuration." % old_master,
                                 errors.ECODE_NOENT)

    cluster_info = cfg.GetClusterInfo()
    new_master_node = cfg.GetNodeInfoByName(new_master)
    if new_master_node is None:
      raise errors.OpPrereqError("Could not find new master node '%s' in"
                                 " cluster configuration." % new_master,
                                 errors.ECODE_NOENT)

    cluster_info.master_node = new_master_node.uuid
    # this will also regenerate the ssconf files, since we updated the
    # cluster info
    cfg.Update(cluster_info, logging.error)
  except errors.ConfigurationError, err:
    logging.error("Error while trying to set the new master: %s",
                  str(err))
    return 1

  # if cfg.Update worked, then it means the old master daemon won't now
  # be able to write its own config file (we rely on locking in both
  # backend.UploadFile() and ConfigWriter._Write()); hence the next
  # step is to kill the old master

  logging.info("Stopping the master daemon on node %s", old_master)

  runner = rpc.BootstrapRunner()
  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = old_master_node.uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(old_master,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(old_master)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  logging.info("Checking master IP non-reachability...")

  master_ip = sstore.GetMasterIP()
  total_timeout = 30

  # Here we have a phase where no master should be running
  def _check_ip():
    if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
      raise utils.RetryAgain()

  try:
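    # the (1, 1.5, 5) tuple is an incremental delay spec for utils.Retry
    # (start, factor, maximum); polling stops after total_timeout seconds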
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout)
  except utils.RetryTimeout:
    logging.warning("The master IP is still reachable after %s seconds,"
                    " continuing but activating the master on the current"
                    " node will probably fail", total_timeout)

  if jstore.CheckDrainFlag():
    logging.info("Undraining job queue")
    jstore.SetDrainFlag(False)

  logging.info("Starting the master daemons on the new master")

  result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
                                                                no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  logging.info("Master failed over from %s to %s", old_master, new_master)
  return rcode


def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_names):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside),
  as we use the same source of configuration information for both
  backend and bootstrap, we'll always vote for ourselves.

  @type node_names: list
  @param node_names: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = netutils.Hostname.GetSysName()
  try:
    node_names.remove(myself)
  except ValueError:
    pass
  if not node_names:
    # no nodes left (possibly after removing myself)
    return []
  results = rpc.BootstrapRunner().call_master_node_name(node_names)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_names))]
  votes = {}
  for node_name in results:
    nres = results[node_name]
    msg = nres.fail_msg

    if msg:
      logging.warning("Error contacting node %s: %s", node_name, msg)
      node = None
    else:
      node = nres.payload

    if node not in votes:
      votes[node] = 1
    else:
      votes[node] += 1

  vote_list = list(votes.items())
  # sort first on number of votes then on name, since we want None
  # sorted after real node names if half of the nodes are not
  # responding and the other half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)
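  # illustrative example: three nodes voting for "node1" plus two failed
  # RPCs yields [("node1", 3), (None, 2)]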

  return vote_list