4 # Copyright (C) 2006, 2007, 2008 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 """Functions to bootstrap a new cluster.
import os
import re
import logging

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
def _InitSSHSetup():
  """Setup the SSH configuration for the cluster.

  This generates a dsa keypair for root, adds the pub key to the
  permitted hosts and adds the hostkey to its own known hosts.

  @raise errors.OpExecError: if the ssh-keygen command fails

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  # remove any stale keypair, keeping a backup of existing files
  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  # authorize the freshly generated public key for root logins
  f = open(pub_key, 'r')
  try:
    utils.AddAuthorizedKey(auth_keys, f.read(8192))
  finally:
    f.close()
70 def _GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
71 """Generates a self-signed SSL certificate.
74 @param file_name: Path to output file
76 @param validity: Validity for certificate in days
79 result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
80 "-days", str(validity), "-nodes", "-x509",
81 "-keyout", file_name, "-out", file_name, "-batch"])
83 raise errors.OpExecError("Could not generate SSL certificate, command"
84 " %s had exitcode %s and error message %s" %
85 (result.cmd, result.exit_code, result.output))
87 os.chmod(file_name, 0400)
def _InitGanetiServerSetup():
  """Setup the necessary configuration for the initial node daemon.

  This creates the nodepass file containing the shared password for
  the cluster and also generates the SSL certificate.

  @raise errors.OpExecError: if the node daemon cannot be restarted

  """
  _GenerateSelfSignedSslCert(constants.SSL_CERT_FILE)

  # restart the node daemon so it picks up the new certificate
  result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])

  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))
107 def InitCluster(cluster_name, mac_prefix, def_bridge,
108 master_netdev, file_storage_dir, candidate_pool_size,
109 secondary_ip=None, vg_name=None, beparams=None, hvparams=None,
110 enabled_hypervisors=None, default_hypervisor=None):
111 """Initialise the cluster.
113 @type candidate_pool_size: int
114 @param candidate_pool_size: master candidate pool size
117 # TODO: complete the docstring
118 if config.ConfigWriter.IsCluster():
119 raise errors.OpPrereqError("Cluster is already initialised")
121 hostname = utils.HostInfo()
123 if hostname.ip.startswith("127."):
124 raise errors.OpPrereqError("This host's IP resolves to the private"
125 " range (%s). Please fix DNS or %s." %
126 (hostname.ip, constants.ETC_HOSTS))
128 if not utils.OwnIpAddress(hostname.ip):
129 raise errors.OpPrereqError("Inconsistency: this host's name resolves"
130 " to %s,\nbut this ip address does not"
131 " belong to this host."
132 " Aborting." % hostname.ip)
134 clustername = utils.HostInfo(cluster_name)
136 if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
138 raise errors.OpPrereqError("Cluster IP already active. Aborting.")
141 if not utils.IsValidIP(secondary_ip):
142 raise errors.OpPrereqError("Invalid secondary ip given")
143 if (secondary_ip != hostname.ip and
144 not utils.OwnIpAddress(secondary_ip)):
145 raise errors.OpPrereqError("You gave %s as secondary IP,"
146 " but it does not belong to this host." %
149 secondary_ip = hostname.ip
151 if vg_name is not None:
152 # Check if volume group is valid
153 vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
154 constants.MIN_VG_SIZE)
156 raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
157 " you are not using lvm" % vgstatus)
159 file_storage_dir = os.path.normpath(file_storage_dir)
161 if not os.path.isabs(file_storage_dir):
162 raise errors.OpPrereqError("The file storage directory you passed is"
163 " not an absolute path.")
165 if not os.path.exists(file_storage_dir):
167 os.makedirs(file_storage_dir, 0750)
169 raise errors.OpPrereqError("Cannot create file storage directory"
171 (file_storage_dir, err))
173 if not os.path.isdir(file_storage_dir):
174 raise errors.OpPrereqError("The file storage directory '%s' is not"
175 " a directory." % file_storage_dir)
177 if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
178 raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)
180 result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
182 raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
184 result.output.strip()))
186 if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
187 os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
188 raise errors.OpPrereqError("Init.d script '%s' missing or not"
189 " executable." % constants.NODE_INITD_SCRIPT)
191 utils.CheckBEParams(beparams)
193 # set up the inter-node password and certificate
194 _InitGanetiServerSetup()
196 # set up ssh config and /etc/hosts
197 f = open(constants.SSH_HOST_RSA_PUB, 'r')
202 sshkey = sshline.split(" ")[1]
204 utils.AddHostToEtcHosts(hostname.name)
207 # init of cluster config file
208 cluster_config = objects.Cluster(
210 rsahostkeypub=sshkey,
211 highest_used_port=(constants.FIRST_DRBD_PORT - 1),
212 mac_prefix=mac_prefix,
213 volume_group_name=vg_name,
214 default_bridge=def_bridge,
215 tcpudp_port_pool=set(),
216 master_node=hostname.name,
217 master_ip=clustername.ip,
218 master_netdev=master_netdev,
219 cluster_name=clustername.name,
220 file_storage_dir=file_storage_dir,
221 enabled_hypervisors=enabled_hypervisors,
222 default_hypervisor=default_hypervisor,
223 beparams={constants.BEGR_DEFAULT: beparams},
225 candidate_pool_size=candidate_pool_size,
227 master_node_config = objects.Node(name=hostname.name,
228 primary_ip=hostname.ip,
229 secondary_ip=secondary_ip,
231 master_candidate=True,
235 sscfg = InitConfig(constants.CONFIG_VERSION,
236 cluster_config, master_node_config)
237 ssh.WriteKnownHostsFile(sscfg, constants.SSH_KNOWN_HOSTS_FILE)
238 cfg = config.ConfigWriter()
239 cfg.Update(cfg.GetClusterInfo())
241 # start the master ip
242 # TODO: Review rpc call from bootstrap
243 rpc.RpcRunner.call_node_start_master(hostname.name, True)
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  @rtype: L{ssconf.SimpleConfigWriter}
  @returns: initialized config instance

  """
  # the initial node list holds only the master
  nodes = {
    master_node_config.name: master_node_config,
    }

  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1)
  cfg = ssconf.SimpleConfigWriter.FromDict(config_data.ToDict(), cfg_file)
  cfg.Save()

  return cfg
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  @param master: the name of the master node

  """
  # best-effort shutdown: failures are logged but do not abort the destroy
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  if result.failed or not result.data:
    logging.warning("Could not disable the master role")
  result = rpc.RpcRunner.call_node_leave_cluster(master)
  if result.failed or not result.data:
    logging.warning("Could not shutdown the node daemon and cleanup the node")
def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  @raise errors.OpExecError: if the certificate is malformed or the
      remote command fails

  """
  sshrunner = ssh.SshRunner(cluster_name)
  gntpem = utils.ReadFile(constants.SSL_CERT_FILE)
  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below
  if re.search('^!EOF\.', gntpem, re.MULTILINE):
    raise errors.OpExecError("invalid PEM encoding in the SSL certificate")
  if not gntpem.endswith("\n"):
    raise errors.OpExecError("PEM must end with newline")

  # set up inter-node password and certificate and restarts the node daemon
  # and then connect with ssh to set password and start ganeti-noded
  # note that all the below variables are sanitized at this point,
  # either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n%s restart" %
               (constants.SSL_CERT_FILE, gntpem,
                constants.NODE_INITD_SCRIPT))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))
def MasterFailover():
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @rtype: int
  @return: 0 on success, 1 if starting the new master role failed

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This commands must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  # make sure the other nodes agree on who the master currently is
  vote_list = GatherMasterVotes(node_list)

  if vote_list:
    voted_master = vote_list[0][0]
    if voted_master is None:
      raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not"
                                 " respond.")
    elif voted_master != old_master:
      raise errors.OpPrereqError("I have wrong configuration, I believe the"
                                 " master is %s but the other nodes voted for"
                                 " %s. Please resync the configuration of"
                                 " this node." % (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  if result.failed or not result.data:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually", old_master)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info)

  result = rpc.RpcRunner.call_node_start_master(new_master, True)
  if result.failed or not result.data:
    logging.error("Could not start the master role on the new master"
                  " %s, please check", new_master)
    rcode = 1

  return rcode
def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  since we use the same source for configuration information for both
  backend and boostrap, we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node wil be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.data
    if nres.failed or not isinstance(data, (tuple, list)) or len(data) < 3:
      # here the rpc layer should have already logged errors
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    # the third element of the master_info reply is the master node name
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = [v for v in votes.items()]
  # sort first on number of votes then on name, since we want None
  # sorted later if we have the half of the nodes not responding, and
  # half voting all for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list