#
#

# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2012 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import time

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor
from ganeti import bdev
from ganeti import netutils
from ganeti import backend
from ganeti import luxi
from ganeti import jstore


# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"

#: After how many seconds a daemon must be responsive
_DAEMON_READY_TIMEOUT = 10.0


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a DSA keypair for root, adds the public key to the
  permitted hosts and adds the host key to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
                  backup=True)


def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=constants.NODED_CERT_FILE,
                          rapicert_file=constants.RAPI_CERT_FILE,
                          spicecert_file=constants.SPICE_CERT_FILE,
                          spicecacert_file=constants.SPICE_CACERT_FILE,
                          hmackey_file=constants.CONFD_HMAC_KEY,
                          cds_file=constants.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
                           certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path
  @type cds_file: string
  @param cds_file: optional override of the cluster domain secret file path

  """
  # noded SSL certificate
  cluster_cert_exists = os.path.exists(nodecert_file)
  if new_cluster_cert or not cluster_cert_exists:
    if cluster_cert_exists:
      utils.CreateBackup(nodecert_file)

    logging.debug("Generating new cluster certificate at %s", nodecert_file)
    utils.GenerateSelfSignedSslCert(nodecert_file)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  # RAPI
  rapi_cert_exists = os.path.exists(rapicert_file)

  if rapi_cert_pem:
    # Assume rapi_cert_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  elif new_rapi_cert or not rapi_cert_exists:
    if rapi_cert_exists:
      utils.CreateBackup(rapicert_file)

    logging.debug("Generating new RAPI certificate at %s", rapicert_file)
    utils.GenerateSelfSignedSslCert(rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.io.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)


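# A minimal usage sketch (illustrative only, not part of the original
# module): renew just the RAPI certificate while leaving every other
# secret untouched. The five booleans map to the new_* parameters above;
# all file paths keep their module defaults.
#
#   GenerateClusterCrypto(False,  # new_cluster_cert
#                         True,   # new_rapi_cert
#                         False,  # new_spice_cert
#                         False,  # new_confd_hmac_key
#                         False)  # new_cds

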
def _InitGanetiServerSetup(master_name):
  """Set up the necessary configuration for the initial node daemon.

  This generates the cluster's SSL certificates and secrets, starts the
  node daemon and waits until it is responsive.

  @type master_name: str
  @param master_name: Name of the master node

  """
  # Generate cluster secrets
  GenerateClusterCrypto(True, False, False, False, False)

  result = utils.RunCmd([constants.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)


def _WaitForNodeDaemon(node_name):
  """Wait for node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Pylint bug <http://www.logilab.org/ticket/35642>
    # pylint: disable=E1101
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))


def _WaitForMasterDaemon():
  """Wait for master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)


def _InitFileStorage(file_storage_dir):
  """Initialize the file storage, if needed.

  @param file_storage_dir: the user-supplied value
  @return: either empty string (if file storage was disabled at build
      time) or the normalized path to the storage directory

  """
  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
                               " path" % file_storage_dir, errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)
  return file_storage_dir


def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  if not enabled_hypervisors:
    raise errors.OpPrereqError("Enabled hypervisors list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
  if invalid_hvs:
    raise errors.OpPrereqError("Enabled hypervisors contains invalid"
                               " entries: %s" % invalid_hvs,
                               errors.ECODE_INVAL)

  try:
    ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
  except errors.ProgrammerError:
    raise errors.OpPrereqError("Invalid primary ip version: %d." %
                               primary_ip_version)

  hostname = netutils.GetHostname(family=ipcls.family)
  if not ipcls.IsValid(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
                               " address." % (hostname.ip, primary_ip_version))

  if ipcls.IsLoopback(hostname.ip):
    raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
                               " address. Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS),
                               errors.ECODE_ENVIRON)

  if not ipcls.Own(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this ip address does not"
                               " belong to this host" %
                               hostname.ip, errors.ECODE_ENVIRON)

  clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)

  if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
    raise errors.OpPrereqError("Cluster IP already active",
                               errors.ECODE_NOTUNIQUE)

  if not secondary_ip:
    if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
                                 " IPv4 address must be given as secondary",
                                 errors.ECODE_INVAL)
    secondary_ip = hostname.ip

  if not netutils.IP4Address.IsValid(secondary_ip):
    raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
                               " IPv4 address." % secondary_ip,
                               errors.ECODE_INVAL)

  if not netutils.IP4Address.Own(secondary_ip):
    raise errors.OpPrereqError("You gave %s as secondary IP,"
                               " but it does not belong to this host." %
                               secondary_ip, errors.ECODE_ENVIRON)

  if master_netmask is not None:
    if not ipcls.ValidateNetmask(master_netmask):
      raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s" %
                                 (master_netmask, primary_ip_version))
  else:
    master_netmask = ipcls.iplen

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus,
                                 errors.ECODE_INVAL)

  if drbd_helper is not None:
    try:
      curr_helper = bdev.BaseDRBD.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (specify --no-drbd-storage if you are not"
                                 " using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)

  if constants.ENABLE_FILE_STORAGE:
    file_storage_dir = _InitFileStorage(file_storage_dir)
  else:
    file_storage_dir = ""

  if constants.ENABLE_SHARED_FILE_STORAGE:
    shared_file_storage_dir = _InitFileStorage(shared_file_storage_dir)
  else:
    shared_file_storage_dir = ""

  if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
                               errors.ECODE_INVAL)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()), errors.ECODE_INVAL)

  dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure functional or write a wrapper
  # around it
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # diskparams is a mapping of disk-template->diskparams dict
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)))
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verifying diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
  sshkey = sshline.split(" ")[1]

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  elif constants.HTOOLS:
    # htools was enabled at build-time, we default to it
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  now = time.time()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  backend.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([constants.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()


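# A minimal usage sketch (illustrative only, not part of the original
# module): roughly the call that "gnt-cluster init" ends up making. The
# concrete values (netdev, VG name, storage paths) are assumptions for
# the example; the CLI layer normally fills these in from its defaults.
#
#   InitCluster("cluster.example.com", "aa:00:00",
#               master_netmask=None,     # defaults to the full IP length
#               master_netdev="eth0",
#               file_storage_dir="/srv/ganeti/file-storage",
#               shared_file_storage_dir="/srv/ganeti/shared-file-storage",
#               candidate_pool_size=10,
#               vg_name="xenvg",
#               beparams=dict(constants.BEC_DEFAULTS),
#               nicparams=dict(constants.NICC_DEFAULTS),
#               hvparams={constants.HT_XEN_PVM:
#                           dict(constants.HVC_DEFAULTS[constants.HT_XEN_PVM])},
#               diskparams=dict(constants.DISK_DT_DEFAULTS),
#               enabled_hypervisors=[constants.HT_XEN_PVM],
#               primary_ip_version=constants.IP4_VERSION)

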
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  uuid_generator = config.TemporaryReservationManager()
  cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                _INITCONF_ECID)
  master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                    _INITCONF_ECID)
  nodes = {
    master_node_config.name: master_node_config,
    }
  default_nodegroup = objects.NodeGroup(
    uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
    name=constants.INITIAL_NODE_GROUP_NAME,
    members=[master_node_config.name],
    diskparams={},
    )
  nodegroups = {
    default_nodegroup.uuid: default_nodegroup,
    }
  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodegroups=nodegroups,
                                   nodes=nodes,
                                   instances={},
                                   networks={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)


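# Illustrative sketch (not part of the original module): the file written
# by InitConfig is plain serialized ConfigData, so reading it back is
# simply
#
#   data = serializer.Load(utils.ReadFile(constants.CLUSTER_CONF_FILE))
#   config_data = objects.ConfigData.FromDict(data)

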
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  cfg = config.ConfigWriter()
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  runner = rpc.BootstrapRunner()

  master_params = cfg.GetMasterNetworkParameters()
  master_params.name = master
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_params.name,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(master)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)

  result = runner.call_node_leave_cluster(master, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node: %s", msg)


def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  sshrunner = ssh.SshRunner(cluster_name,
                            ipv6=(family == netutils.IP6Address.family))

  bind_address = constants.IP4_ADDRESS_ANY
  if family == netutils.IP6Address.family:
    bind_address = constants.IP6_ADDRESS_ANY

  # copy the cluster certificates and keys to the new node, then connect
  # with ssh to restart ganeti-noded there; note that all the below
  # variables are sanitized at this point, either by being constants or
  # by the checks above
  sshrunner.CopyFileToNode(node, constants.NODED_CERT_FILE)
  sshrunner.CopyFileToNode(node, constants.RAPI_CERT_FILE)
  sshrunner.CopyFileToNode(node, constants.SPICE_CERT_FILE)
  sshrunner.CopyFileToNode(node, constants.SPICE_CACERT_FILE)
  sshrunner.CopyFileToNode(node, constants.CONFD_HMAC_KEY)
  mycommand = ("%s stop-all; %s start %s -b %s" %
               (constants.DAEMON_UTIL, constants.DAEMON_UTIL, constants.NODED,
                utils.ShellQuote(bind_address)))

  result = sshrunner.Run(node, "root", mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=True,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))

  _WaitForNodeDaemon(node)


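# Illustrative sketch (not part of the original module): per the
# docstring above, "gnt-node add" calls this helper before submitting
# the node-add opcode, roughly as
#
#   bootstrap.SetupNodeDaemon(cluster_name, new_node_name,
#                             ssh_key_check=True)

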
def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master node to
  become the new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ("\n".join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    vote_list = GatherMasterVotes(node_list)

    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master is None:
        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                   " not respond.", errors.ECODE_ENVIRON)
      elif voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  try:
    # instantiate a real config writer, as we now know we have the
    # configuration data
    cfg = config.ConfigWriter(accept_foreign=True)

    cluster_info = cfg.GetClusterInfo()
    cluster_info.master_node = new_master
    # this will also regenerate the ssconf files, since we updated the
    # cluster info
    cfg.Update(cluster_info, logging.error)
  except errors.ConfigurationError, err:
    logging.error("Error while trying to set the new master: %s",
                  str(err))
    return 1

  # if cfg.Update worked, then it means the old master daemon won't be
  # able now to write its own config file (we rely on locking in both
  # backend.UploadFile() and ConfigWriter._Write(); hence the next
  # step is to kill the old master

  logging.info("Stopping the master daemon on node %s", old_master)

  runner = rpc.BootstrapRunner()
  master_params = cfg.GetMasterNetworkParameters()
  master_params.name = old_master
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_params.name,
                                                 master_params, ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(old_master)
  msg = result.fail_msg
  if msg:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually: %s", old_master, msg)

  logging.info("Checking master IP non-reachability...")

  master_ip = sstore.GetMasterIP()
  total_timeout = 30

  # Here we have a phase where no master should be running
  def _check_ip():
    if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
      raise utils.RetryAgain()

  try:
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout)
  except utils.RetryTimeout:
    logging.warning("The master IP is still reachable after %s seconds,"
                    " continuing but activating the master on the current"
                    " node will probably fail", total_timeout)

  if jstore.CheckDrainFlag():
    logging.info("Undraining job queue")
    jstore.SetDrainFlag(False)

  logging.info("Starting the master daemons on the new master")

  result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
                                                                no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  logging.info("Master failed over from %s to %s", old_master, new_master)
  return rcode


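# Illustrative usage sketch (not part of the original module): this is
# meant to be run on the node that should become the new master, which
# is what "gnt-cluster master-failover" does:
#
#   rcode = MasterFailover(no_voting=False)

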
def GetMaster():
  """Returns the current master node.

  This is a separate function in bootstrap since it's needed by
  gnt-cluster, and instead of importing ssconf directly, it's better
  to abstract it in bootstrap, where we use ssconf in other
  functions too.

  """
  sstore = ssconf.SimpleStore()

  old_master, _ = ssconf.GetMasterAndMyself(sstore)

  return old_master


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside),
  because backend and bootstrap use the same source of configuration
  information, we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = netutils.Hostname.GetSysName()
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after removing myself)
    return []
  results = rpc.BootstrapRunner().call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.payload
    msg = nres.fail_msg
    fail = False
    if msg:
      logging.warning("Error contacting node %s: %s", node, msg)
      fail = True
    # for now we accept lengths 3, 4 and 5 (data[3] is the primary ip version
    # and data[4] is the master netmask)
    elif not isinstance(data, (tuple, list)) or len(data) < 3:
      logging.warning("Invalid data received from node %s: %s", node, data)
      fail = True
    if fail:
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = list(votes.items())
  # sort first on number of votes then on name, since we want None
  # sorted last if half of the nodes do not respond and the other
  # half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list


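# Illustrative sketch (not part of the original module): with three
# remote nodes, two of which vote for node1 and one of which cannot be
# reached, the result would look like
#
#   [("node1.example.com", 2), (None, 1)]
#
# MasterFailover() above only inspects vote_list[0][0], i.e. the
# most-voted master (or None when most nodes failed to answer).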