root / lib / bootstrap.py @ cb8028f3

#
#

# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2012 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import time
import tempfile

from ganeti.cmdlib import cluster
import ganeti.rpc.node as rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor
from ganeti.storage import drbd
from ganeti.storage import filestorage
from ganeti import netutils
from ganeti import luxi
from ganeti import jstore
from ganeti import pathutils


# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"

#: After how many seconds the daemon must be responsive
_DAEMON_READY_TIMEOUT = 10.0


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root, adds the pub key to the
  permitted hosts and adds the hostkey to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
                  backup=True)


# pylint: disable=R0913
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
                           certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path
  @type cds_file: string
  @param cds_file: optional override of the cluster domain secret file path

  """
  # pylint: disable=R0913
  # noded SSL certificate
  utils.GenerateNewSslCert(
    new_cluster_cert, nodecert_file,
    "Generating new cluster certificate at %s" % nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  if rapi_cert_pem:
    # Assume rapi_cert_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  else:
    utils.GenerateNewSslCert(
      new_rapi_cert, rapicert_file,
      "Generating new RAPI certificate at %s" % rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.io.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)
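
# Usage sketch (illustrative, not authoritative): the five leading booleans
# select which secrets to (re)generate.  For instance, renewing only the
# RAPI certificate while keeping the node certificate, SPICE certificate,
# confd HMAC key and cluster domain secret would be:
#
#   GenerateClusterCrypto(False, True, False, False, False)
#
# _InitGanetiServerSetup below uses the same calling convention to create
# only the node (noded) certificate at cluster initialisation.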


def _InitGanetiServerSetup(master_name):
  """Set up the necessary configuration for the initial node daemon.

  This creates the nodepass file containing the shared password for
  the cluster, generates the SSL certificate and starts the node daemon.

  @type master_name: str
  @param master_name: Name of the master node

  """
  # Generate cluster secrets
  GenerateClusterCrypto(True, False, False, False, False)

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)


def _WaitForNodeDaemon(node_name):
  """Wait for node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Pylint bug <http://www.logilab.org/ticket/35642>
    # pylint: disable=E1101
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))


def _WaitForMasterDaemon():
  """Wait for master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)


def _WaitForSshDaemon(hostname, port, family):
  """Wait for SSH daemon to become responsive.

  """
  hostip = netutils.GetHostname(name=hostname, family=family).ip

  def _CheckSshDaemon():
    if netutils.TcpPing(hostip, port, timeout=1.0, live_port_needed=True):
      logging.debug("SSH daemon on %s:%s (IP address %s) has become"
                    " responsive", hostname, port, hostip)
    else:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
                             " become responsive within %s seconds" %
                             (hostname, port, hostip, _DAEMON_READY_TIMEOUT))
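
# All three _WaitFor*Daemon helpers above follow the same utils.Retry idiom:
# the check function raises utils.RetryAgain to ask for another attempt, and
# utils.Retry turns exhaustion of _DAEMON_READY_TIMEOUT into
# utils.RetryTimeout.  A minimal sketch of the pattern, with a hypothetical
# readiness predicate _SomethingIsReady:
#
#   def _CheckSomething():
#     if not _SomethingIsReady():   # hypothetical predicate
#       raise utils.RetryAgain()    # retry after the 1.0 second delay
#
#   try:
#     utils.Retry(_CheckSomething, 1.0, _DAEMON_READY_TIMEOUT)
#   except utils.RetryTimeout:
#     raise errors.OpExecError("not ready after %s seconds" %
#                              _DAEMON_READY_TIMEOUT)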


def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check,
                    port, data):
  """Runs a command to configure something on a remote machine.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @type port: int
  @param port: The SSH port of the remote machine or None for the default
  @param data: JSON-serializable input data for script (passed to stdin)

  """
  cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    cmd.append("--debug")

  if verbose:
    cmd.append("--verbose")

  if port is None:
    port = netutils.GetDaemonPort(constants.SSH)

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  srun = ssh.SshRunner(cluster_name,
                       ipv6=(family == netutils.IP6Address.family))
  scmd = srun.BuildCmd(node, constants.SSH_LOGIN_USER,
                       utils.ShellQuoteArgs(cmd),
                       batch=False, ask_key=ask_key, quiet=False,
                       strict_host_check=strict_host_check,
                       use_cluster_key=use_cluster_key,
                       port=port)

  tempfh = tempfile.TemporaryFile()
  try:
    tempfh.write(serializer.DumpJson(data))
    tempfh.seek(0)

    result = utils.RunCmd(scmd, interactive=True, input_fd=tempfh)
  finally:
    tempfh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

  _WaitForSshDaemon(node, port, family)


def _InitFileStorageDir(file_storage_dir):
  """Initialize the file storage directory, if needed.

  @param file_storage_dir: the user-supplied value
  @return: either empty string (if file storage was disabled at build
      time) or the normalized path to the storage directory

  """
  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
                               " path" % file_storage_dir, errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)

  return file_storage_dir


def _PrepareFileBasedStorage(
    enabled_disk_templates, file_storage_dir,
    default_dir, file_disk_template,
    init_fn=_InitFileStorageDir, acceptance_fn=None):
  """Checks if a file-based storage type is enabled and inits the dir.

  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of enabled disk templates
  @type file_storage_dir: string
  @param file_storage_dir: the file storage directory
  @type default_dir: string
  @param default_dir: default file storage directory when C{file_storage_dir}
      is 'None'
  @type file_disk_template: string
  @param file_disk_template: a disk template whose storage type is 'ST_FILE' or
      'ST_SHARED_FILE'
  @rtype: string
  @return: the name of the actual file storage directory

  """
  assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
            constants.ST_FILE, constants.ST_SHARED_FILE
         ))

  if file_storage_dir is None:
    file_storage_dir = default_dir
  if not acceptance_fn:
    acceptance_fn = \
        lambda path: filestorage.CheckFileStoragePathAcceptance(
            path, exact_match_ok=True)

  cluster.CheckFileStoragePathVsEnabledDiskTemplates(
      logging.warning, file_storage_dir, enabled_disk_templates)

  file_storage_enabled = file_disk_template in enabled_disk_templates
  if file_storage_enabled:
    try:
      acceptance_fn(file_storage_dir)
    except errors.FileStoragePathError as e:
      raise errors.OpPrereqError(str(e))
    result_file_storage_dir = init_fn(file_storage_dir)
  else:
    result_file_storage_dir = file_storage_dir
  return result_file_storage_dir


def _PrepareFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_FILE_STORAGE_DIR, constants.DT_FILE,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareSharedFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if shared file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_SHARED_FILE_STORAGE_DIR, constants.DT_SHARED_FILE,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareGlusterStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if gluster storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_GLUSTER_STORAGE_DIR, constants.DT_GLUSTER,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _InitCheckEnabledDiskTemplates(enabled_disk_templates):
  """Checks the sanity of the enabled disk templates.

  """
  if not enabled_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_disk_templates = \
    set(enabled_disk_templates) - constants.DISK_TEMPLATES
  if invalid_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list contains invalid"
                               " entries: %s" % invalid_disk_templates,
                               errors.ECODE_INVAL)


def _RestrictIpolicyToEnabledDiskTemplates(ipolicy, enabled_disk_templates):
  """Restricts the ipolicy's disk templates to the enabled ones.

  This function removes from the ipolicy's list of allowed disk
  templates those that are not enabled by the cluster.

  @type ipolicy: dict
  @param ipolicy: the instance policy
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: the list of cluster-wide enabled disk
    templates

  """
  assert constants.IPOLICY_DTS in ipolicy
  allowed_disk_templates = ipolicy[constants.IPOLICY_DTS]
  restricted_disk_templates = list(set(allowed_disk_templates)
                                   .intersection(set(enabled_disk_templates)))
  ipolicy[constants.IPOLICY_DTS] = restricted_disk_templates
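
# For illustration (made-up template names), the restriction is a plain set
# intersection, so the ordering of the result is not preserved:
#
#   ipolicy = {constants.IPOLICY_DTS: ["drbd", "plain", "file"]}
#   _RestrictIpolicyToEnabledDiskTemplates(ipolicy, ["plain", "file"])
#   assert set(ipolicy[constants.IPOLICY_DTS]) == set(["plain", "file"])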


def _InitCheckDrbdHelper(drbd_helper, drbd_enabled):
  """Checks the DRBD usermode helper.

  @type drbd_helper: string
  @param drbd_helper: name of the DRBD usermode helper that the system should
    use

  """
  if not drbd_enabled:
    return

  if drbd_helper is not None:
    try:
      curr_helper = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (disable drbd with --enabled-disk-templates"
                                 " if you are not using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)


def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, gluster_storage_dir,
                candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, default_iallocator_params=None,
                primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None, enabled_disk_templates=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of disk_templates to be used in this
    cluster

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  _InitCheckEnabledDiskTemplates(enabled_disk_templates)

  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version, errors.ECODE_INVAL)

  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version),
                               errors.ECODE_INVAL)

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, pathutils.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s" %
                                 (master_netmask, primary_ip_version),
                                 errors.ECODE_INVAL)
  else:
    master_netmask = ipcls.iplen

  if vg_name:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s" % vgstatus, errors.ECODE_INVAL)

  drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
  _InitCheckDrbdHelper(drbd_helper, drbd_enabled)

  logging.debug("Stopping daemons (if any are running)")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
  if result.failed:
    raise errors.OpExecError("Could not stop daemons, command %s"
                             " had exitcode %s and error '%s'" %
                             (result.cmd, result.exit_code, result.output))

  file_storage_dir = _PrepareFileStorage(enabled_disk_templates,
                                         file_storage_dir)
  shared_file_storage_dir = _PrepareSharedFileStorage(enabled_disk_templates,
                                                      shared_file_storage_dir)

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  if nicparams.get('mode', None) != constants.NIC_MODE_OVS:
    # Do not do this check if mode=openvswitch, since the openvswitch is not
    # created yet
    result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
    if result.failed:
      raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                                 (master_netdev,
                                  result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)
  _RestrictIpolicyToEnabledDiskTemplates(full_ipolicy, enabled_disk_templates)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure functional or write a wrapper
  # around it
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # diskparams is a mapping of disk-template->diskparams dict
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default METAVG value is equal to the VG name set at init time,
      # if provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verifying diskparams options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  rsa_sshkey = ""
  dsa_sshkey = ""
  if os.path.isfile(pathutils.SSH_HOST_RSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
    rsa_sshkey = sshline.split(" ")[1]
  if os.path.isfile(pathutils.SSH_HOST_DSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_DSA_PUB)
    dsa_sshkey = sshline.split(" ")[1]
  if not rsa_sshkey and not dsa_sshkey:
    raise errors.OpPrereqError("Failed to find SSH public keys",
                               errors.ECODE_ENVIRON)

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  elif constants.HTOOLS:
    # htools was enabled at build-time, we default to it
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  candidate_certs = {}

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=rsa_sshkey,
    dsahostkeypub=dsa_sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    gluster_storage_dir=gluster_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    default_iallocator_params=default_iallocator_params,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    enabled_disk_templates=enabled_disk_templates,
    candidate_certs=candidate_certs,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()


def InitConfig(version, cluster_config, master_node_config,
               cfg_file=pathutils.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  uuid_generator = config.TemporaryReservationManager()
  cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                _INITCONF_ECID)
  master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                    _INITCONF_ECID)
  cluster_config.master_node = master_node_config.uuid
  nodes = {
    master_node_config.uuid: master_node_config,
    }
  default_nodegroup = objects.NodeGroup(
    uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
    name=constants.INITIAL_NODE_GROUP_NAME,
    members=[master_node_config.uuid],
    diskparams={},
    )
  nodegroups = {
    default_nodegroup.uuid: default_nodegroup,
    }
  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodegroups=nodegroups,
                                   nodes=nodes,
                                   instances={},
                                   networks={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)
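
# Usage sketch: InitCluster above builds the objects.Cluster and objects.Node
# for the master and persists them with a single call,
#
#   InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
#
# which writes an objects.ConfigData containing exactly one node (the
# master), one default node group and no instances to
# pathutils.CLUSTER_CONF_FILE.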


def FinalizeClusterDestroy(master_uuid):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  runner = rpc.BootstrapRunner()

  master_name = cfg.GetNodeName(master_uuid)

  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = master_uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_name, master_params,
                                                 ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(master_name)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)

  result = runner.call_node_leave_cluster(master_name, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_port: the SSH port of the new node

  """
  data = {
    constants.NDS_CLUSTER_NAME: cluster_name,
    constants.NDS_NODE_DAEMON_CERTIFICATE:
      utils.ReadFile(pathutils.NODED_CERT_FILE),
    constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
    constants.NDS_START_NODE_DAEMON: True,
    }

  RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                  opts.debug, opts.verbose,
                  True, opts.ssh_key_check, opts.ssh_key_check,
                  ssh_port, data)

  _WaitForNodeDaemon(node)


def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_names = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ("\n".join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    vote_list = GatherMasterVotes(node_names)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  try:
    # instantiate a real config writer, as we now know we have the
    # configuration data
    cfg = config.ConfigWriter(accept_foreign=True)

    old_master_node = cfg.GetNodeInfoByName(old_master)
    if old_master_node is None:
      raise errors.OpPrereqError("Could not find old master node '%s' in"
                                 " cluster configuration." % old_master,
                                 errors.ECODE_NOENT)

    cluster_info = cfg.GetClusterInfo()
    new_master_node = cfg.GetNodeInfoByName(new_master)
    if new_master_node is None:
      raise errors.OpPrereqError("Could not find new master node '%s' in"
                                 " cluster configuration." % new_master,
                                 errors.ECODE_NOENT)

    cluster_info.master_node = new_master_node.uuid
    # this will also regenerate the ssconf files, since we updated the
    # cluster info
    cfg.Update(cluster_info, logging.error)
  except errors.ConfigurationError, err:
    logging.error("Error while trying to set the new master: %s",
                  str(err))
    return 1

  # if cfg.Update worked, then it means the old master daemon won't be
  # able now to write its own config file (we rely on locking in both
  # backend.UploadFile() and ConfigWriter._Write(); hence the next
  # step is to kill the old master

  logging.info("Stopping the master daemon on node %s", old_master)

  runner = rpc.BootstrapRunner()
  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = old_master_node.uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(old_master,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(old_master)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  logging.info("Checking master IP non-reachability...")

  master_ip = sstore.GetMasterIP()
  total_timeout = 30

  # Here we have a phase where no master should be running
  def _check_ip():
    if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
      raise utils.RetryAgain()

  try:
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout)
  except utils.RetryTimeout:
    logging.warning("The master IP is still reachable after %s seconds,"
                    " continuing but activating the master on the current"
                    " node will probably fail", total_timeout)

  if jstore.CheckDrainFlag():
    logging.info("Undraining job queue")
    jstore.SetDrainFlag(False)

  logging.info("Starting the master daemons on the new master")

  result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
                                                                no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  logging.info("Master failed over from %s to %s", old_master, new_master)
  return rcode


def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing directly ssconf, it's better
  to abstract it in bootstrap, where we do use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_names):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  since we use the same source for configuration information for both
  backend and bootstrap, we'll always vote for ourselves.

  @type node_names: list
  @param node_names: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = netutils.Hostname.GetSysName()
  try:
    node_names.remove(myself)
  except ValueError:
    pass
  if not node_names:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.BootstrapRunner().call_master_node_name(node_names)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_names))]
  votes = {}
  for node_name in results:
    nres = results[node_name]
    msg = nres.fail_msg

    if msg:
      logging.warning("Error contacting node %s: %s", node_name, msg)
      node = None
    else:
      node = nres.payload

    if node not in votes:
      votes[node] = 1
    else:
      votes[node] += 1

  vote_list = list(votes.items())
  # sort first on number of votes then on name, since we want None
  # sorted later if we have the half of the nodes not responding, and
  # half voting all for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
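
# For illustration: with the (votes, name) sort key and reverse=True, higher
# vote counts come first and, on a tie, real node names sort before None
# (Python 2 orders None before any string, so reversing puts it last):
#
#   votes = {"node1.example.com": 2, None: 2}
#   vote_list = list(votes.items())
#   vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)
#   # -> [("node1.example.com", 2), (None, 2)]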