From 8f215968385b63c61520a721ee622cb7b6a490a9 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann
Date: Thu, 29 Oct 2009 18:30:56 +0100
Subject: [PATCH 1/1] Make cluster initialization more reliable

There was a race condition between starting the node daemon and sending
requests to write the ssconf files. With this patch, the initialization
waits up to ten seconds for the node daemon to become responsive.

Signed-off-by: Michael Hanselmann
Reviewed-by: Guido Trotter
---
 lib/bootstrap.py | 17 +++++++++++++++--
 lib/rpc.py       |  5 +++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/lib/bootstrap.py b/lib/bootstrap.py
index 365eb7f..4edb1b3 100644
--- a/lib/bootstrap.py
+++ b/lib/bootstrap.py
@@ -110,7 +110,7 @@ def GenerateHmacKey(file_name):
   utils.WriteFile(file_name, data=utils.GenerateSecret(), mode=0400)
 
 
-def _InitGanetiServerSetup():
+def _InitGanetiServerSetup(master_name):
   """Setup the necessary configuration for the initial node daemon.
 
   This creates the nodepass file containing the shared password for
@@ -133,6 +133,19 @@ def _InitGanetiServerSetup():
                              " had exitcode %s and error %s" %
                              (result.cmd, result.exit_code, result.output))
 
+  # Wait for node daemon to become responsive
+  end_time = time.time() + 10.0
+  while True:
+    result = rpc.RpcRunner.call_version([master_name])[master_name]
+    if not result.fail_msg:
+      break
+
+    if time.time() > end_time:
+      raise errors.OpExecError("Node daemon didn't answer queries within"
+                               " 10 seconds")
+
+    time.sleep(1)
+
 
 def InitCluster(cluster_name, mac_prefix,
                 master_netdev, file_storage_dir, candidate_pool_size,
@@ -241,7 +254,7 @@ def InitCluster(cluster_name, mac_prefix,
     hv_class.CheckParameterSyntax(hv_params)
 
   # set up the inter-node password and certificate
-  _InitGanetiServerSetup()
+  _InitGanetiServerSetup(hostname.name)
 
   # set up ssh config and /etc/hosts
   sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
diff --git a/lib/rpc.py b/lib/rpc.py
index 780f601..3c12e2a 100644
--- a/lib/rpc.py
+++ b/lib/rpc.py
@@ -730,13 +730,14 @@ class RpcRunner(object):
     # TODO: should this method query down nodes?
     return cls._StaticMultiNodeCall(node_list, "master_info", [])
 
-  def call_version(self, node_list):
+  @classmethod
+  def call_version(cls, node_list):
     """Query node version.
 
     This is a multi-node call.
 
     """
-    return self._MultiNodeCall(node_list, "version", [])
+    return cls._StaticMultiNodeCall(node_list, "version", [])
 
   def call_blockdev_create(self, node, bdev, size, owner, on_primary, info):
     """Request creation of a given block device.
-- 
1.7.10.4