watcher: handle offline nodes better
[ganeti-local] / lib / cmdlib.py
index f59f5b3..6e2f41c 100644 (file)
@@ -1622,6 +1622,24 @@ class LURemoveNode(LogicalUnit):
 
     self.rpc.call_node_leave_cluster(node.name)
 
+    # Promote nodes to master candidate as needed
+    cp_size = self.cfg.GetClusterInfo().candidate_pool_size
+    node_info = self.cfg.GetAllNodesInfo().values()
+    num_candidates = len([n for n in node_info
+                          if n.master_candidate])
+    num_nodes = len(node_info)
+    random.shuffle(node_info)
+    for node in node_info:
+      if num_candidates >= cp_size or num_candidates >= num_nodes:
+        break
+      if node.master_candidate:
+        continue
+      node.master_candidate = True
+      self.LogInfo("Promoting node %s to master candidate", node.name)
+      self.cfg.Update(node)
+      self.context.ReaddNode(node)
+      num_candidates += 1
+
 
 class LUQueryNodes(NoHooksLU):
   """Logical unit for querying nodes.
@@ -1643,6 +1661,7 @@ class LUQueryNodes(NoHooksLU):
     "serial_no",
     "master_candidate",
     "master",
+    "offline",
     )
 
   def ExpandNames(self):
@@ -1762,6 +1781,8 @@ class LUQueryNodes(NoHooksLU):
           val = node.master_candidate
         elif field == "master":
           val = node.name == master_node
+        elif field == "offline":
+          val = node.offline
         elif self._FIELDS_DYNAMIC.Matches(field):
           val = live_data[node.name].get(field, None)
         else:
@@ -1950,9 +1971,17 @@ class LUAddNode(LogicalUnit):
         raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                    " based ping to noded port")
 
+    cp_size = self.cfg.GetClusterInfo().candidate_pool_size
+    node_info = self.cfg.GetAllNodesInfo().values()
+    num_candidates = len([n for n in node_info
+                          if n.master_candidate])
+    master_candidate = num_candidates < cp_size
+
     self.new_node = objects.Node(name=node,
                                  primary_ip=primary_ip,
-                                 secondary_ip=secondary_ip)
+                                 secondary_ip=secondary_ip,
+                                 master_candidate=master_candidate,
+                                 offline=False)
 
   def Exec(self, feedback_fn):
     """Adds the new node to the cluster.
@@ -2017,7 +2046,7 @@ class LUAddNode(LogicalUnit):
     result = self.rpc.call_node_verify(node_verify_list, node_verify_param,
                                        self.cfg.GetClusterName())
     for verifier in node_verify_list:
-      if result.failed or not result[verifier].data:
+      if result[verifier].failed or not result[verifier].data:
         raise errors.OpExecError("Cannot communicate with %s's node daemon"
                                  " for remote verification" % verifier)
       if result[verifier].data['nodelist']:
@@ -2978,11 +3007,15 @@ class LUQueryInstances(NoHooksLU):
     hv_list = list(set([inst.hypervisor for inst in instance_list]))
 
     bad_nodes = []
+    off_nodes = []
     if self.do_locking:
       live_data = {}
       node_data = self.rpc.call_all_instances_info(nodes, hv_list)
       for name in nodes:
         result = node_data[name]
+        if result.offline:
+          # offline nodes will be in both lists (off_nodes and bad_nodes)
+          off_nodes.append(name)
         if result.failed:
           bad_nodes.append(name)
         else:
@@ -3019,7 +3052,9 @@ class LUQueryInstances(NoHooksLU):
           else:
             val = bool(live_data.get(instance.name))
         elif field == "status":
-          if instance.primary_node in bad_nodes:
+          if instance.primary_node in off_nodes:
+            val = "ERROR_nodeoffline"
+          elif instance.primary_node in bad_nodes:
             val = "ERROR_nodedown"
           else:
             running = bool(live_data.get(instance.name))
@@ -5342,9 +5377,15 @@ class LUQueryExports(NoHooksLU):
         that node.
 
     """
-    result = self.rpc.call_export_list(self.nodes)
-    result.Raise()
-    return result.data
+    rpcresult = self.rpc.call_export_list(self.nodes)
+    result = {}
+    for node in rpcresult:
+      if rpcresult[node].failed:
+        result[node] = False
+      else:
+        result[node] = rpcresult[node].data
+
+    return result
 
 
 class LUExportInstance(LogicalUnit):
@@ -5883,6 +5924,7 @@ class IAllocator(object):
         "primary_ip": ninfo.primary_ip,
         "secondary_ip": ninfo.secondary_ip,
         "total_cpus": remote_info['cpu_total'],
+        "offline": ninfo.offline,
         }
       node_results[nname] = pnr
     data["nodes"] = node_results