4 # Copyright (C) 2006, 2007, 2008 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Functions to bootstrap a new cluster.
32 from ganeti import rpc
33 from ganeti import ssh
34 from ganeti import utils
35 from ganeti import errors
36 from ganeti import config
37 from ganeti import constants
38 from ganeti import objects
39 from ganeti import ssconf
40 from ganeti import hypervisor
# NOTE(review): the enclosing "def" line (presumably _InitSSHSetup) and
# several statements are not visible in this excerpt; comments below
# describe only the visible code.
44 """Setup the SSH configuration for the cluster.
46 This generates a dsa keypair for root, adds the pub key to the
47 permitted hosts and adds the hostkey to its own known hosts.
# Locate root's SSH key/authorized_keys paths for the Ganeti run-as user.
50 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)
# Back up and remove any pre-existing keypair so ssh-keygen starts clean.
52 for name in priv_key, pub_key:
53 if os.path.exists(name):
54 utils.CreateBackup(name)
55 utils.RemoveFile(name)
# Generate a fresh DSA keypair (further ssh-keygen arguments and the
# failure check are elided in this excerpt).
57 result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
61 raise errors.OpExecError("Could not generate ssh keypair, error %s" %
# Append the new public key to authorized_keys; 8192 bytes is assumed to
# be enough for one public-key line.
64 f = open(pub_key, 'r')
66 utils.AddAuthorizedKey(auth_keys, f.read(8192))
71 def _GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
72 """Generates a self-signed SSL certificate.
75 @param file_name: Path to output file
77 @param validity: Validity for certificate in days
# Work in a temp file in the same directory so the final os.rename is an
# atomic replace on the same filesystem.
80 (fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
83 # Set permissions before writing key
84 os.chmod(tmp_file_name, 0600)
# Key and certificate are written to the same file (-keyout/-out both
# point at tmp_file_name); remaining openssl arguments and the failure
# check are elided in this excerpt.
86 result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
87 "-days", str(validity), "-nodes", "-x509",
88 "-keyout", tmp_file_name, "-out", tmp_file_name,
91 raise errors.OpExecError("Could not generate SSL certificate, command"
92 " %s had exitcode %s and error message %s" %
93 (result.cmd, result.exit_code, result.output))
# Tighten to read-only before moving into place.
96 os.chmod(tmp_file_name, 0400)
98 os.rename(tmp_file_name, file_name)
# NOTE(review): this RemoveFile is presumably the cleanup arm of an
# elided try/finally — confirm against the full source.
100 utils.RemoveFile(tmp_file_name)
105 def _InitGanetiServerSetup():
106 """Setup the necessary configuration for the initial node daemon.
108 This creates the nodepass file containing the shared password for
109 the cluster and also generates the SSL certificate.
# Always regenerate the main node-daemon certificate.
112 _GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)
114 # Don't overwrite existing file
115 if not os.path.exists(constants.RAPI_CERT_FILE):
116 _GenerateSelfSignedSslCert(constants.RAPI_CERT_FILE)
# Restart the node daemon so it picks up the new certificates; the
# failure check guarding this raise is elided in this excerpt.
118 result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])
121 raise errors.OpExecError("Could not start the node daemon, command %s"
122 " had exitcode %s and error %s" %
123 (result.cmd, result.exit_code, result.output))
126 def InitCluster(cluster_name, mac_prefix, def_bridge,
127 master_netdev, file_storage_dir, candidate_pool_size,
128 secondary_ip=None, vg_name=None, beparams=None, hvparams=None,
129 enabled_hypervisors=None, default_hypervisor=None):
130 """Initialise the cluster.
132 @type candidate_pool_size: int
133 @param candidate_pool_size: master candidate pool size
136 # TODO: complete the docstring
# --- Prerequisite checks (each raises OpPrereqError on failure) ---
137 if config.ConfigWriter.IsCluster():
138 raise errors.OpPrereqError("Cluster is already initialised")
140 if not enabled_hypervisors:
141 raise errors.OpPrereqError("Enabled hypervisors list must contain at"
# Reject any hypervisor name not known to this Ganeti version.
143 invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
145 raise errors.OpPrereqError("Enabled hypervisors contains invalid"
146 " entries: %s" % invalid_hvs)
# The local host must resolve to a non-loopback address it actually owns.
148 hostname = utils.HostInfo()
150 if hostname.ip.startswith("127."):
151 raise errors.OpPrereqError("This host's IP resolves to the private"
152 " range (%s). Please fix DNS or %s." %
153 (hostname.ip, constants.ETC_HOSTS))
155 if not utils.OwnIpAddress(hostname.ip):
156 raise errors.OpPrereqError("Inconsistency: this host's name resolves"
157 " to %s,\nbut this ip address does not"
158 " belong to this host."
159 " Aborting." % hostname.ip)
# The cluster IP must not already be in use by a running node daemon.
161 clustername = utils.HostInfo(cluster_name)
163 if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
165 raise errors.OpPrereqError("Cluster IP already active. Aborting.")
# Validate the optional secondary (replication) IP; the surrounding
# "if secondary_ip:" guard is elided in this excerpt.
168 if not utils.IsValidIP(secondary_ip):
169 raise errors.OpPrereqError("Invalid secondary ip given")
170 if (secondary_ip != hostname.ip and
171 not utils.OwnIpAddress(secondary_ip)):
172 raise errors.OpPrereqError("You gave %s as secondary IP,"
173 " but it does not belong to this host." %
# Default the secondary IP to the primary when none was given.
176 secondary_ip = hostname.ip
178 if vg_name is not None:
179 # Check if volume group is valid
180 vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
181 constants.MIN_VG_SIZE)
183 raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
184 " you are not using lvm" % vgstatus)
# File storage directory must be an absolute path and creatable.
186 file_storage_dir = os.path.normpath(file_storage_dir)
188 if not os.path.isabs(file_storage_dir):
189 raise errors.OpPrereqError("The file storage directory you passed is"
190 " not an absolute path.")
192 if not os.path.exists(file_storage_dir):
194 os.makedirs(file_storage_dir, 0750)
196 raise errors.OpPrereqError("Cannot create file storage directory"
198 (file_storage_dir, err))
200 if not os.path.isdir(file_storage_dir):
201 raise errors.OpPrereqError("The file storage directory '%s' is not"
202 " a directory." % file_storage_dir)
# MAC prefix must look like the first three octets of a MAC address.
204 if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
205 raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)
# The master network device must exist on this host.
207 result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
209 raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
211 result.output.strip()))
213 if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
214 os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
215 raise errors.OpPrereqError("Init.d script '%s' missing or not"
216 " executable." % constants.NODE_INITD_SCRIPT)
# Make sure Ganeti's runtime directories exist with the right modes.
218 dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE)]
219 utils.EnsureDirs(dirs)
# Validate backend and per-hypervisor parameter types/syntax.
221 utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
222 # hvparams is a mapping of hypervisor->hvparams dict
223 for hv_name, hv_params in hvparams.iteritems():
224 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
225 hv_class = hypervisor.GetHypervisor(hv_name)
226 hv_class.CheckParameterSyntax(hv_params)
228 # set up the inter-node password and certificate
229 _InitGanetiServerSetup()
231 # set up ssh config and /etc/hosts
# Read this host's SSH public host key; it is stored in the cluster
# config so other nodes can verify us.
232 f = open(constants.SSH_HOST_RSA_PUB, 'r')
237 sshkey = sshline.split(" ")[1]
239 utils.AddHostToEtcHosts(hostname.name)
242 # init of cluster config file
243 cluster_config = objects.Cluster(
245 rsahostkeypub=sshkey,
246 highest_used_port=(constants.FIRST_DRBD_PORT - 1),
247 mac_prefix=mac_prefix,
248 volume_group_name=vg_name,
249 default_bridge=def_bridge,
250 tcpudp_port_pool=set(),
251 master_node=hostname.name,
252 master_ip=clustername.ip,
253 master_netdev=master_netdev,
254 cluster_name=clustername.name,
255 file_storage_dir=file_storage_dir,
256 enabled_hypervisors=enabled_hypervisors,
257 default_hypervisor=default_hypervisor,
258 beparams={constants.BEGR_DEFAULT: beparams},
260 candidate_pool_size=candidate_pool_size,
# The initial node is the master, and a master candidate by definition.
262 master_node_config = objects.Node(name=hostname.name,
263 primary_ip=hostname.ip,
264 secondary_ip=secondary_ip,
266 master_candidate=True,
267 offline=False, drained=False,
# Write the initial config and known_hosts, then re-save through a full
# ConfigWriter to also generate the ssconf files.
270 sscfg = InitConfig(constants.CONFIG_VERSION,
271 cluster_config, master_node_config)
272 ssh.WriteKnownHostsFile(sscfg, constants.SSH_KNOWN_HOSTS_FILE)
273 cfg = config.ConfigWriter()
274 cfg.Update(cfg.GetClusterInfo())
276 # start the master ip
277 # TODO: Review rpc call from bootstrap
278 rpc.RpcRunner.call_node_start_master(hostname.name, True)
281 def InitConfig(version, cluster_config, master_node_config,
282 cfg_file=constants.CLUSTER_CONF_FILE):
283 """Create the initial cluster configuration.
285 It will contain the current node, which will also be the master
286 node, and no instances.
289 @param version: configuration version
290 @type cluster_config: L{objects.Cluster}
291 @param cluster_config: cluster configuration
292 @type master_node_config: L{objects.Node}
293 @param master_node_config: master node configuration
294 @type cfg_file: string
295 @param cfg_file: configuration file path
297 @rtype: L{ssconf.SimpleConfigWriter}
298 @return: initialized config instance
# Nodes map is keyed by node name; the master is the only entry (the
# surrounding dict literal is elided in this excerpt).
302 master_node_config.name: master_node_config,
# Assemble the top-level ConfigData and serialize it through the simple
# (bootstrap-time) config writer; the final "return cfg" / save call is
# elided in this excerpt.
305 config_data = objects.ConfigData(version=version,
306 cluster=cluster_config,
310 cfg = ssconf.SimpleConfigWriter.FromDict(config_data.ToDict(), cfg_file)
316 def FinalizeClusterDestroy(master):
317 """Execute the last steps of cluster destroy
319 This function shuts down all the daemons, completing the destroy
320 begun in cmdlib.LUDestroyOpcode.
# Best-effort teardown: failures are logged, not raised, since the
# cluster is being destroyed anyway.
# First turn off the master role (master IP, master daemon) on the node.
323 result = rpc.RpcRunner.call_node_stop_master(master, True)
324 if result.failed or not result.data:
325 logging.warning("Could not disable the master role")
# Then make the node leave the cluster entirely (stop node daemon,
# remove cluster files).
326 result = rpc.RpcRunner.call_node_leave_cluster(master)
327 if result.failed or not result.data:
328 logging.warning("Could not shutdown the node daemon and cleanup the node")
331 def SetupNodeDaemon(cluster_name, node, ssh_key_check):
332 """Add a node to the cluster.
334 This function must be called before the actual opcode, and will ssh
335 to the remote node, copy the needed files, and start ganeti-noded,
336 allowing the master to do the rest via normal rpc calls.
338 @param cluster_name: the cluster name
339 @param node: the name of the new node
340 @param ssh_key_check: whether to do a strict key check
343 sshrunner = ssh.SshRunner(cluster_name)
# Read the cluster certificates that must be copied to the new node.
345 noded_cert = utils.ReadFile(constants.SSL_CERT_FILE)
346 rapi_cert = utils.ReadFile(constants.RAPI_CERT_FILE)
348 # in the base64 pem encoding, neither '!' nor '.' are valid chars,
349 # so we use this to detect an invalid certificate; as long as the
350 # cert doesn't contain this, the here-document will be correctly
351 # parsed by the shell sequence below
352 if (re.search('^!EOF\.', noded_cert, re.MULTILINE) or
353 re.search('^!EOF\.', rapi_cert, re.MULTILINE)):
354 raise errors.OpExecError("invalid PEM encoding in the SSL certificate")
# Ensure each cert ends with a newline so the here-document terminator
# lands on its own line (the appending statements are elided here).
356 if not noded_cert.endswith("\n"):
358 if not rapi_cert.endswith("\n"):
361 # set up inter-node password and certificate and restarts the node daemon
362 # and then connect with ssh to set password and start ganeti-noded
363 # note that all the below variables are sanitized at this point,
364 # either by being constants or by the checks above
# Shell pipeline: write both certs via here-documents, restrict their
# permissions, then restart the node daemon (parts of the format string
# are elided in this excerpt).
365 mycommand = ("umask 077 && "
366 "cat > '%s' << '!EOF.' && \n"
368 "cat > '%s' << '!EOF.' && \n"
370 "chmod 0400 %s %s && "
372 (constants.SSL_CERT_FILE, noded_cert,
373 constants.RAPI_CERT_FILE, rapi_cert,
374 constants.SSL_CERT_FILE, constants.RAPI_CERT_FILE,
375 constants.NODE_INITD_SCRIPT))
# Run over plain ssh (not the cluster key yet — the node isn't a member).
377 result = sshrunner.Run(node, 'root', mycommand, batch=False,
378 ask_key=ssh_key_check,
379 use_cluster_key=False,
380 strict_host_check=ssh_key_check)
# The failure check guarding this raise is elided in this excerpt.
382 raise errors.OpExecError("Remote command on node %s, error: %s,"
384 (node, result.fail_reason, result.output))
387 def MasterFailover(no_voting=False):
388 """Failover the master node.
390 This checks that we are not already the master, and will cause the
391 current master to cease being master, and the non-master to become
394 @type no_voting: boolean
395 @param no_voting: force the operation without remote nodes agreement
# Read cluster membership from the local ssconf files (no config lock
# is available at this point).
399 sstore = ssconf.SimpleStore()
401 old_master, new_master = ssconf.GetMasterAndMyself(sstore)
402 node_list = sstore.GetNodeList()
403 mc_list = sstore.GetMasterCandidates()
# This command must be run on the node that is to BECOME master.
405 if old_master == new_master:
406 raise errors.OpPrereqError("This commands must be run on the node"
407 " where you want the new master to be."
408 " %s is already the master" %
# Only master candidates may take over the master role.
411 if new_master not in mc_list:
412 mc_no_master = [name for name in mc_list if name != old_master]
413 raise errors.OpPrereqError("This node is not among the nodes marked"
414 " as master candidates. Only these nodes"
415 " can become masters. Current list of"
416 " master candidates is:\n"
417 "%s" % ('\n'.join(mc_no_master)))
# Unless no_voting was requested, ask all nodes who they believe the
# master is, and refuse to proceed on disagreement (the surrounding
# "if not no_voting:" guard is elided in this excerpt).
420 vote_list = GatherMasterVotes(node_list)
423 voted_master = vote_list[0][0]
424 if voted_master is None:
425 raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
427 elif voted_master != old_master:
428 raise errors.OpPrereqError("I have a wrong configuration, I believe"
429 " the master is %s but the other nodes"
430 " voted %s. Please resync the configuration"
432 (old_master, voted_master))
437 logging.info("Setting master to %s, old master: %s", new_master, old_master)
# Demote the old master; failure here is logged but not fatal, since
# the old master may simply be dead (which is why we're failing over).
439 result = rpc.RpcRunner.call_node_stop_master(old_master, True)
440 if result.failed or not result.data:
441 logging.error("Could not disable the master role on the old master"
442 " %s, please disable manually", old_master)
444 # Here we have a phase where no master should be running
446 # instantiate a real config writer, as we now know we have the
448 cfg = config.ConfigWriter()
# Point the cluster config at the new master and persist it.
450 cluster_info = cfg.GetClusterInfo()
451 cluster_info.master_node = new_master
452 # this will also regenerate the ssconf files, since we updated the
454 cfg.Update(cluster_info)
456 # 2.0.X: Don't start the master if no_voting is true
457 result = rpc.RpcRunner.call_node_start_master(new_master, not no_voting)
458 if result.failed or not result.data:
459 logging.error("Could not start the master role on the new master"
460 " %s, please check", new_master)
# NOTE(review): the enclosing "def" line (presumably GetMaster) and the
# return statement are not visible in this excerpt.
467 """Returns the current master node.
469 This is a separate function in bootstrap since it's needed by
470 gnt-cluster, and instead of importing directly ssconf, it's better
471 to abstract it in bootstrap, where we do use ssconf in other
# Read the master name from the local ssconf files; only the master
# element of the (master, myself) pair is used here.
475 sstore = ssconf.SimpleStore()
477 old_master, _ = ssconf.GetMasterAndMyself(sstore)
482 def GatherMasterVotes(node_list):
483 """Check the agreement on who is the master.
485 This function will return a list of (node, number of votes), ordered
486 by the number of votes. Errors will be denoted by the key 'None'.
488 Note that the sum of votes is the number of nodes this machine
489 knows, whereas the number of entries in the list could be different
490 (if some nodes vote for another master).
492 We remove ourselves from the list since we know that (bugs aside)
493 since we use the same source for configuration information for both
494 backend and boostrap, we'll always vote for ourselves.
496 @type node_list: list
497 @param node_list: the list of nodes to query for master info; the current
498 node will be removed if it is in the list
500 @return: list of (node, votes)
# Our own vote is a foregone conclusion, so drop ourselves from the
# query list (the guard around this remove is elided in this excerpt).
503 myself = utils.HostInfo().name
505 node_list.remove(myself)
509 # no nodes left (eventually after removing myself)
# Broadcast the master_info RPC to every remaining node.
511 results = rpc.RpcRunner.call_master_info(node_list)
512 if not isinstance(results, dict):
513 # this should not happen (unless internal error in rpc)
514 logging.critical("Can't complete rpc call, aborting master startup")
515 return [(None, len(node_list))]
# Tally the votes; nodes that failed or returned malformed data are
# counted under the None key (loop header and votes-dict initialization
# are elided in this excerpt).
520 if nres.failed or not isinstance(data, (tuple, list)) or len(data) < 3:
521 # here the rpc layer should have already logged errors
522 if None not in votes:
# data[2] is the master name as reported by the queried node.
526 master_node = data[2]
527 if master_node not in votes:
528 votes[master_node] = 0
529 votes[master_node] += 1
531 vote_list = [v for v in votes.items()]
532 # sort first on number of votes then on name, since we want None
533 # sorted later if we have the half of the nodes not responding, and
534 # half voting all for the same master
535 vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)