Make cluster initialization more reliable
author Michael Hanselmann <hansmi@google.com>
Thu, 29 Oct 2009 17:30:56 +0000 (18:30 +0100)
committer Michael Hanselmann <hansmi@google.com>
Fri, 30 Oct 2009 10:46:48 +0000 (11:46 +0100)
There was a race condition between starting the node daemon
and sending requests to write the ssconf files. With this
patch, the initialization waits up to ten seconds for the
node daemon to become responsive.

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>

lib/bootstrap.py
lib/rpc.py

index 365eb7f..4edb1b3 100644 (file)
@@ -110,7 +110,7 @@ def GenerateHmacKey(file_name):
   utils.WriteFile(file_name, data=utils.GenerateSecret(), mode=0400)
 
 
-def _InitGanetiServerSetup():
+def _InitGanetiServerSetup(master_name):
   """Setup the necessary configuration for the initial node daemon.
 
   This creates the nodepass file containing the shared password for
@@ -133,6 +133,19 @@ def _InitGanetiServerSetup():
                              " had exitcode %s and error %s" %
                              (result.cmd, result.exit_code, result.output))
 
+  # Wait for node daemon to become responsive
+  end_time = time.time() + 10.0
+  while True:
+    result = rpc.RpcRunner.call_version([master_name])[master_name]
+    if not result.fail_msg:
+      break
+
+    if time.time() > end_time:
+      raise errors.OpExecError("Node daemon didn't answer queries within"
+                               " 10 seconds")
+
+    time.sleep(1)
+
 
 def InitCluster(cluster_name, mac_prefix,
                 master_netdev, file_storage_dir, candidate_pool_size,
@@ -241,7 +254,7 @@ def InitCluster(cluster_name, mac_prefix,
     hv_class.CheckParameterSyntax(hv_params)
 
   # set up the inter-node password and certificate
-  _InitGanetiServerSetup()
+  _InitGanetiServerSetup(hostname.name)
 
   # set up ssh config and /etc/hosts
   sshline = utils.ReadFile(constants.SSH_HOST_RSA_PUB)
index 780f601..3c12e2a 100644 (file)
@@ -730,13 +730,14 @@ class RpcRunner(object):
     # TODO: should this method query down nodes?
     return cls._StaticMultiNodeCall(node_list, "master_info", [])
 
-  def call_version(self, node_list):
+  @classmethod
+  def call_version(cls, node_list):
     """Query node version.
 
     This is a multi-node call.
 
     """
-    return self._MultiNodeCall(node_list, "version", [])
+    return cls._StaticMultiNodeCall(node_list, "version", [])
 
   def call_blockdev_create(self, node, bdev, size, owner, on_primary, info):
     """Request creation of a given block device.