
lib/bootstrap.py @ 40a97d80


#
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf


def _InitSSHSetup():
  """Set up the SSH configuration for the cluster.

  This generates a dsa keypair for root and adds the public key to
  the authorized keys file.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  f = open(pub_key, 'r')
  try:
    utils.AddAuthorizedKey(auth_keys, f.read(8192))
  finally:
    f.close()


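# Illustrative note on _InitSSHSetup, not executed anywhere: the paths
# returned by ssh.GetUserFiles(constants.GANETI_RUNAS) typically resolve to
# /root/.ssh/id_dsa, /root/.ssh/id_dsa.pub and /root/.ssh/authorized_keys
# (the exact locations depend on the configured user), so its net effect is
# roughly equivalent to:
#   ssh-keygen -t dsa -f /root/.ssh/id_dsa -q -N ""
#   cat /root/.ssh/id_dsa.pub >> /root/.ssh/authorized_keys
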
def _GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
  """Generates a self-signed SSL certificate.

  @type file_name: str
  @param file_name: Path to output file
  @type validity: int
  @param validity: Validity for certificate in days

  """
  result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
                         "-days", str(validity), "-nodes", "-x509",
                         "-keyout", file_name, "-out", file_name, "-batch"])
  if result.failed:
    raise errors.OpExecError("Could not generate SSL certificate, command"
                             " %s had exitcode %s and error message %s" %
                             (result.cmd, result.exit_code, result.output))

  os.chmod(file_name, 0400)


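# Illustrative check, not part of the module: a file written by
# _GenerateSelfSignedSslCert holds both the key and the certificate, and
# can be inspected from a shell with the standard openssl tool, e.g.:
#   openssl x509 -in <file_name> -noout -subject -dates
# which should show a validity window of roughly five years by default.
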
def _InitGanetiServerSetup():
  """Set up the necessary configuration for the initial node daemon.

  This generates the node's SSL certificate and then restarts the
  node daemon.

  """
  _GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])

  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))


def InitCluster(cluster_name, mac_prefix, def_bridge,
                master_netdev, file_storage_dir, candidate_pool_size,
                secondary_ip=None, vg_name=None, beparams=None, hvparams=None,
                enabled_hypervisors=None, default_hypervisor=None):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised")

  hostname = utils.HostInfo()

  if hostname.ip.startswith("127."):
    raise errors.OpPrereqError("This host's IP resolves to the loopback"
                               " range (%s). Please fix DNS or %s." %
                               (hostname.ip, constants.ETC_HOSTS))

  if not utils.OwnIpAddress(hostname.ip):
    raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                               " to %s,\nbut this IP address does not"
                               " belong to this host."
                               " Aborting." % hostname.ip)

  clustername = utils.HostInfo(cluster_name)

  if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
                   timeout=5):
    raise errors.OpPrereqError("Cluster IP already active. Aborting.")

  if secondary_ip:
    if not utils.IsValidIP(secondary_ip):
      raise errors.OpPrereqError("Invalid secondary IP given")
    if (secondary_ip != hostname.ip and
        not utils.OwnIpAddress(secondary_ip)):
      raise errors.OpPrereqError("You gave %s as secondary IP,"
                                 " but it does not belong to this host." %
                                 secondary_ip)
  else:
    secondary_ip = hostname.ip

  if vg_name is not None:
    # Check if volume group is valid
    vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
                                          constants.MIN_VG_SIZE)
    if vgstatus:
      raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
                                 " you are not using lvm" % vgstatus)

  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory you passed is"
                               " not an absolute path.")

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" %
                                 (file_storage_dir, err))

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir)

  # the prefix must be three lowercase hexadecimal octets
  if not re.match("^[0-9a-f]{2}:[0-9a-f]{2}:[0-9a-f]{2}$", mac_prefix):
    raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)

  result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
  if result.failed:
    raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                               (master_netdev,
                                result.output.strip()))

  if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
          os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
    raise errors.OpPrereqError("Init.d script '%s' missing or not"
                               " executable." % constants.NODE_INITD_SCRIPT)

  utils.CheckBEParams(beparams)

  # set up the node daemon certificate
  _InitGanetiServerSetup()

  # set up ssh config and /etc/hosts
  f = open(constants.SSH_HOST_RSA_PUB, 'r')
  try:
    sshline = f.read()
  finally:
    f.close()
  sshkey = sshline.split(" ")[1]

  utils.AddHostToEtcHosts(hostname.name)
  _InitSSHSetup()

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    default_bridge=def_bridge,
    tcpudp_port_pool=set(),
    master_node=hostname.name,
    master_ip=clustername.ip,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    default_hypervisor=default_hypervisor,
    beparams={constants.BEGR_DEFAULT: beparams},
    hvparams=hvparams,
    candidate_pool_size=candidate_pool_size,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False,
                                    )

  sscfg = InitConfig(constants.CONFIG_VERSION,
                     cluster_config, master_node_config)
  ssh.WriteKnownHostsFile(sscfg, constants.SSH_KNOWN_HOSTS_FILE)
  cfg = config.ConfigWriter()
  cfg.Update(cfg.GetClusterInfo())

  # start the master ip
  # TODO: Review rpc call from bootstrap
  rpc.RpcRunner.call_node_start_master(hostname.name, True)


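# A minimal invocation sketch for InitCluster (illustrative only; every
# value below is a placeholder, and the hypervisor name must match one
# defined in constants -- "xen-pvm" is assumed here):
#   InitCluster("cluster.example.com", "aa:00:00", "xen-br0",
#               "eth0", "/srv/ganeti/file-storage", 10,
#               vg_name="xenvg", beparams={}, hvparams={},
#               enabled_hypervisors=["xen-pvm"],
#               default_hypervisor="xen-pvm")
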
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  @rtype: L{ssconf.SimpleConfigWriter}
  @return: initialized config instance

  """
  nodes = {
    master_node_config.name: master_node_config,
    }

  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1)
  cfg = ssconf.SimpleConfigWriter.FromDict(config_data.ToDict(), cfg_file)
  cfg.Save()

  return cfg


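# For orientation (illustrative, derived from the ConfigData fields used in
# InitConfig above): the serialized configuration it writes has roughly this
# top-level shape:
#   {
#     "version": <int>,
#     "cluster": { ... cluster_config.ToDict() ... },
#     "nodes": {"<master name>": { ... master_node_config.ToDict() ... }},
#     "instances": {},
#     "serial_no": 1,
#   }
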
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy.

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  if result.failed or not result.data:
    logging.warning("Could not disable the master role")
  result = rpc.RpcRunner.call_node_leave_cluster(master)
  if result.failed or not result.data:
    logging.warning("Could not shut down the node daemon and clean up"
                    " the node")


def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  """
  sshrunner = ssh.SshRunner(cluster_name)
  gntpem = utils.ReadFile(constants.SSL_CERT_FILE)
  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below
  if re.search(r'^!EOF\.', gntpem, re.MULTILINE):
    raise errors.OpExecError("invalid PEM encoding in the SSL certificate")
  if not gntpem.endswith("\n"):
    raise errors.OpExecError("PEM must end with newline")

  # copy the cluster's SSL certificate to the node via ssh and restart
  # the node daemon there, so that the master can reach ganeti-noded;
  # note that all the below variables are sanitized at this point,
  # either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n%s restart" %
               (constants.SSL_CERT_FILE, gntpem,
                constants.NODE_INITD_SCRIPT))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))


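# For illustration only: with placeholder paths, the remote command built in
# SetupNodeDaemon expands to a shell fragment of this shape (the '!EOF.'
# marker is safe because the checks above reject PEM data containing it):
#   umask 077 && cat > '/path/to/server.pem' << '!EOF.' &&
#   <PEM data: private key and certificate>
#   !EOF.
#   /path/to/ganeti-initd restart
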
def MasterFailover():
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and this node to become the
  new master.

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  vote_list = GatherMasterVotes(node_list)

  if vote_list:
    voted_master = vote_list[0][0]
    if voted_master is None:
      raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
                                 " not respond.")
    elif voted_master != old_master:
      raise errors.OpPrereqError("This node has a wrong configuration: it"
                                 " believes the master is %s, but the other"
                                 " nodes voted for %s. Please resync the"
                                 " configuration of this node." %
                                 (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  if result.failed or not result.data:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually", old_master)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info)

  result = rpc.RpcRunner.call_node_start_master(new_master, True)
  if result.failed or not result.data:
    logging.error("Could not start the master role on the new master"
                  " %s, please check", new_master)
    rcode = 1

  return rcode


def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since, bugs aside, we use the same
  source of configuration information for both backend and bootstrap
  and will therefore always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node will be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (possibly after removing ourselves)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.data
    if nres.failed or not isinstance(data, (tuple, list)) or len(data) < 3:
      # here the rpc layer should have already logged errors
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = votes.items()
  # sort first on number of votes, then on name, so that None (errors)
  # sorts after real names when half of the nodes do not respond and
  # the other half all vote for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list
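
# Worked example (illustrative): if three of four queried nodes answer with
# "node1.example.com" as their master and one query fails, the tally becomes
# {"node1.example.com": 3, None: 1} and GatherMasterVotes returns
#   [("node1.example.com", 3), (None, 1)]
# so MasterFailover only has to inspect vote_list[0] to find the consensus.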