4 # Copyright (C) 2006, 2007, 2008 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
"""Functions to bootstrap a new cluster.

"""
import os
import re
import logging

from ganeti import rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
def _InitSSHSetup(node=None):
  """Setup the SSH configuration for the cluster.

  This generates a dsa keypair for root, adds the pub key to the
  permitted hosts and adds the hostkey to its own known hosts.

  @type node: string
  @param node: the name of this host (kept for call-site compatibility;
      not used by the key generation itself)

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)

  # back up and remove any stale keypair so ssh-keygen can't prompt
  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  # authorize our own public key for passwordless intra-cluster ssh
  f = open(pub_key, 'r')
  try:
    utils.AddAuthorizedKey(auth_keys, f.read(8192))
  finally:
    f.close()
70 def _InitGanetiServerSetup():
71 """Setup the necessary configuration for the initial node daemon.
73 This creates the nodepass file containing the shared password for
74 the cluster and also generates the SSL certificate.
77 result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
78 "-days", str(365*5), "-nodes", "-x509",
79 "-keyout", constants.SSL_CERT_FILE,
80 "-out", constants.SSL_CERT_FILE, "-batch"])
82 raise errors.OpExecError("could not generate server ssl cert, command"
83 " %s had exitcode %s and error message %s" %
84 (result.cmd, result.exit_code, result.output))
86 os.chmod(constants.SSL_CERT_FILE, 0400)
88 result = utils.RunCmd([constants.NODE_INITD_SCRIPT, "restart"])
91 raise errors.OpExecError("Could not start the node daemon, command %s"
92 " had exitcode %s and error %s" %
93 (result.cmd, result.exit_code, result.output))
96 def InitCluster(cluster_name, mac_prefix, def_bridge,
97 master_netdev, file_storage_dir, candidate_pool_size,
98 secondary_ip=None, vg_name=None, beparams=None, hvparams=None,
99 enabled_hypervisors=None, default_hypervisor=None):
100 """Initialise the cluster.
102 @type candidate_pool_size: int
103 @param candidate_pool_size: master candidate pool size
106 # TODO: complete the docstring
107 if config.ConfigWriter.IsCluster():
108 raise errors.OpPrereqError("Cluster is already initialised")
110 hostname = utils.HostInfo()
112 if hostname.ip.startswith("127."):
113 raise errors.OpPrereqError("This host's IP resolves to the private"
114 " range (%s). Please fix DNS or %s." %
115 (hostname.ip, constants.ETC_HOSTS))
117 if not utils.OwnIpAddress(hostname.ip):
118 raise errors.OpPrereqError("Inconsistency: this host's name resolves"
119 " to %s,\nbut this ip address does not"
120 " belong to this host."
121 " Aborting." % hostname.ip)
123 clustername = utils.HostInfo(cluster_name)
125 if utils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT,
127 raise errors.OpPrereqError("Cluster IP already active. Aborting.")
130 if not utils.IsValidIP(secondary_ip):
131 raise errors.OpPrereqError("Invalid secondary ip given")
132 if (secondary_ip != hostname.ip and
133 not utils.OwnIpAddress(secondary_ip)):
134 raise errors.OpPrereqError("You gave %s as secondary IP,"
135 " but it does not belong to this host." %
138 secondary_ip = hostname.ip
140 if vg_name is not None:
141 # Check if volume group is valid
142 vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
143 constants.MIN_VG_SIZE)
145 raise errors.OpPrereqError("Error: %s\nspecify --no-lvm-storage if"
146 " you are not using lvm" % vgstatus)
148 file_storage_dir = os.path.normpath(file_storage_dir)
150 if not os.path.isabs(file_storage_dir):
151 raise errors.OpPrereqError("The file storage directory you passed is"
152 " not an absolute path.")
154 if not os.path.exists(file_storage_dir):
156 os.makedirs(file_storage_dir, 0750)
158 raise errors.OpPrereqError("Cannot create file storage directory"
160 (file_storage_dir, err))
162 if not os.path.isdir(file_storage_dir):
163 raise errors.OpPrereqError("The file storage directory '%s' is not"
164 " a directory." % file_storage_dir)
166 if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
167 raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix)
169 result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
171 raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
173 result.output.strip()))
175 if not (os.path.isfile(constants.NODE_INITD_SCRIPT) and
176 os.access(constants.NODE_INITD_SCRIPT, os.X_OK)):
177 raise errors.OpPrereqError("Init.d script '%s' missing or not"
178 " executable." % constants.NODE_INITD_SCRIPT)
180 utils.CheckBEParams(beparams)
182 # set up the inter-node password and certificate
183 _InitGanetiServerSetup()
185 # set up ssh config and /etc/hosts
186 f = open(constants.SSH_HOST_RSA_PUB, 'r')
191 sshkey = sshline.split(" ")[1]
193 utils.AddHostToEtcHosts(hostname.name)
196 # init of cluster config file
197 cluster_config = objects.Cluster(
199 rsahostkeypub=sshkey,
200 highest_used_port=(constants.FIRST_DRBD_PORT - 1),
201 mac_prefix=mac_prefix,
202 volume_group_name=vg_name,
203 default_bridge=def_bridge,
204 tcpudp_port_pool=set(),
205 master_node=hostname.name,
206 master_ip=clustername.ip,
207 master_netdev=master_netdev,
208 cluster_name=clustername.name,
209 file_storage_dir=file_storage_dir,
210 enabled_hypervisors=enabled_hypervisors,
211 default_hypervisor=default_hypervisor,
212 beparams={constants.BEGR_DEFAULT: beparams},
214 candidate_pool_size=candidate_pool_size,
216 master_node_config = objects.Node(name=hostname.name,
217 primary_ip=hostname.ip,
218 secondary_ip=secondary_ip,
220 master_candidate=True,
224 sscfg = InitConfig(constants.CONFIG_VERSION,
225 cluster_config, master_node_config)
226 ssh.WriteKnownHostsFile(sscfg, constants.SSH_KNOWN_HOSTS_FILE)
227 cfg = config.ConfigWriter()
228 cfg.Update(cfg.GetClusterInfo())
230 # start the master ip
231 # TODO: Review rpc call from bootstrap
232 rpc.RpcRunner.call_node_start_master(hostname.name, True)
def InitConfig(version, cluster_config, master_node_config,
               cfg_file=constants.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  @rtype: L{ssconf.SimpleConfigWriter}
  @returns: initialized config instance

  """
  # the master is the only node at cluster creation time
  nodes = {
    master_node_config.name: master_node_config,
    }

  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodes=nodes,
                                   instances={},
                                   serial_no=1)
  cfg = ssconf.SimpleConfigWriter.FromDict(config_data.ToDict(), cfg_file)
  cfg.Save()

  return cfg
def FinalizeClusterDestroy(master):
  """Execute the last steps of cluster destroy

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  @param master: the name of the master node

  """
  # best-effort: failures are logged, not raised, since the cluster is
  # being torn down anyway
  result = rpc.RpcRunner.call_node_stop_master(master, True)
  if result.failed or not result.data:
    logging.warning("Could not disable the master role")
  result = rpc.RpcRunner.call_node_leave_cluster(master)
  if result.failed or not result.data:
    logging.warning("Could not shutdown the node daemon and cleanup the node")
def SetupNodeDaemon(cluster_name, node, ssh_key_check):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_key_check: whether to do a strict key check

  @raise errors.OpExecError: if the certificate is malformed or the
      remote command fails

  """
  sshrunner = ssh.SshRunner(cluster_name)
  gntpem = utils.ReadFile(constants.SSL_CERT_FILE)
  # in the base64 pem encoding, neither '!' nor '.' are valid chars,
  # so we use this to detect an invalid certificate; as long as the
  # cert doesn't contain this, the here-document will be correctly
  # parsed by the shell sequence below
  if re.search('^!EOF\.', gntpem, re.MULTILINE):
    raise errors.OpExecError("invalid PEM encoding in the SSL certificate")
  if not gntpem.endswith("\n"):
    raise errors.OpExecError("PEM must end with newline")

  # set up inter-node password and certificate and restarts the node daemon
  # and then connect with ssh to set password and start ganeti-noded
  # note that all the below variables are sanitized at this point,
  # either by being constants or by the checks above
  mycommand = ("umask 077 && "
               "cat > '%s' << '!EOF.' && \n"
               "%s!EOF.\n%s restart" %
               (constants.SSL_CERT_FILE, gntpem,
                constants.NODE_INITD_SCRIPT))

  result = sshrunner.Run(node, 'root', mycommand, batch=False,
                         ask_key=ssh_key_check,
                         use_cluster_key=False,
                         strict_host_check=ssh_key_check)
  if result.failed:
    raise errors.OpExecError("Remote command on node %s, error: %s,"
                             " output: %s" %
                             (node, result.fail_reason, result.output))
def MasterFailover():
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @rtype: int
  @return: 0 on success, 1 if starting the master role on the new
      master failed

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_list = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This commands must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ('\n'.join(mc_no_master)))

  vote_list = GatherMasterVotes(node_list)

  if vote_list:
    voted_master = vote_list[0][0]
    if voted_master is None:
      raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not"
                                 " respond.")
    elif voted_master != old_master:
      raise errors.OpPrereqError("I have wrong configuration, I believe the"
                                 " master is %s but the other nodes voted for"
                                 " %s. Please resync the configuration of"
                                 " this node." % (old_master, voted_master))
  # end checks

  rcode = 0

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  result = rpc.RpcRunner.call_node_stop_master(old_master, True)
  if result.failed or not result.data:
    logging.error("Could not disable the master role on the old master"
                  " %s, please disable manually", old_master)

  # Here we have a phase where no master should be running

  # instantiate a real config writer, as we now know we have the
  # configuration data
  cfg = config.ConfigWriter()

  cluster_info = cfg.GetClusterInfo()
  cluster_info.master_node = new_master
  # this will also regenerate the ssconf files, since we updated the
  # cluster info
  cfg.Update(cluster_info)

  result = rpc.RpcRunner.call_node_start_master(new_master, True)
  if result.failed or not result.data:
    logging.error("Could not start the master role on the new master"
                  " %s, please check", new_master)
    rcode = 1

  return rcode
def GatherMasterVotes(node_list):
  """Check the agreement on who is the master.

  This function will return a list of (node, number of votes), ordered
  by the number of votes. Errors will be denoted by the key 'None'.

  Note that the sum of votes is the number of nodes this machine
  knows, whereas the number of entries in the list could be different
  (if some nodes vote for another master).

  We remove ourselves from the list since we know that (bugs aside)
  since we use the same source for configuration information for both
  backend and boostrap, we'll always vote for ourselves.

  @type node_list: list
  @param node_list: the list of nodes to query for master info; the current
      node wil be removed if it is in the list
  @rtype: list
  @return: list of (node, votes)

  """
  myself = utils.HostInfo().name
  try:
    node_list.remove(myself)
  except ValueError:
    pass
  if not node_list:
    # no nodes left (eventually after removing myself)
    return []
  results = rpc.RpcRunner.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return [(None, len(node_list))]
  votes = {}
  for node in results:
    nres = results[node]
    data = nres.data
    if nres.failed or not isinstance(data, (tuple, list)) or len(data) < 3:
      # here the rpc layer should have already logged errors
      if None not in votes:
        votes[None] = 0
      votes[None] += 1
      continue
    # the third field of master_info is the current master
    master_node = data[2]
    if master_node not in votes:
      votes[master_node] = 0
    votes[master_node] += 1

  vote_list = list(votes.items())
  # sort first on number of votes then on name, since we want None
  # sorted later if we have the half of the nodes not responding, and
  # half voting all for the same master
  vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

  return vote_list