Modify utils.TcpPing to make source address optional
lib/cmdlib.py
index 6318d45..2e26cfb 100644
@@ -510,13 +510,13 @@ class LUInitCluster(LogicalUnit):
 
     if hostname.ip.startswith("127."):
       raise errors.OpPrereqError("This host's IP resolves to the private"
-                                 " range (%s). Please fix DNS or /etc/hosts." %
-                                 (hostname.ip,))
+                                 " range (%s). Please fix DNS or %s." %
+                                 (hostname.ip, constants.ETC_HOSTS))
 
     self.clustername = clustername = utils.HostInfo(self.op.cluster_name)
 
-    if not utils.TcpPing(constants.LOCALHOST_IP_ADDRESS, hostname.ip,
-                         constants.DEFAULT_NODED_PORT):
+    if not utils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT,
+                         source=constants.LOCALHOST_IP_ADDRESS):
       raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                                  " to %s,\nbut this ip address does not"
                                  " belong to this host."
@@ -527,8 +527,8 @@ class LUInitCluster(LogicalUnit):
       raise errors.OpPrereqError("Invalid secondary ip given")
     if (secondary_ip and
         secondary_ip != hostname.ip and
-        (not utils.TcpPing(constants.LOCALHOST_IP_ADDRESS, secondary_ip,
-                           constants.DEFAULT_NODED_PORT))):
+        (not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
+                           source=constants.LOCALHOST_IP_ADDRESS))):
       raise errors.OpPrereqError("You gave %s as secondary IP,"
                                  " but it does not belong to this host." %
                                  secondary_ip)
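
For reference, the new calling convention used above implies a utils.TcpPing signature roughly like the sketch below. This is not the actual ganeti implementation; the timeout parameter and its default are assumptions:

```python
import socket

def TcpPing(target, port, timeout=10, source=None):
  """Probe a TCP port, optionally binding to a source address first.

  Sketch only: the target now comes first and the source address is an
  optional keyword argument, matching the updated call sites above.
  """
  sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  try:
    if source is not None:
      sock.bind((source, 0))  # bind only when an explicit source was given
    sock.settimeout(timeout)
    sock.connect((target, port))
    return True
  except socket.error:
    return False
  finally:
    sock.close()
```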
@@ -632,6 +632,8 @@ class LUDestroyCluster(NoHooksLU):
 
     """
     master = self.sstore.GetMasterNode()
+    if not rpc.call_node_stop_master(master):
+      raise errors.OpExecError("Could not disable the master role")
     priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
     utils.CreateBackup(priv_key)
     utils.CreateBackup(pub_key)
@@ -947,7 +949,7 @@ class LUVerifyDisks(NoHooksLU):
         inst = nv_dict.pop((node, lv_name), None)
         if (not lv_online and inst is not None
             and inst.name not in res_instances):
-            res_instances.append(inst.name)
+          res_instances.append(inst.name)
 
     # any leftover items in nv_dict are missing LVs, let's arrange the
     # data better
@@ -972,7 +974,7 @@ class LURenameCluster(LogicalUnit):
 
     """
     env = {
-      "OP_TARGET": self.op.sstore.GetClusterName(),
+      "OP_TARGET": self.sstore.GetClusterName(),
       "NEW_NAME": self.op.name,
       }
     mn = self.sstore.GetMasterNode()
@@ -1475,16 +1477,13 @@ class LUAddNode(LogicalUnit):
                                    " new node doesn't have one")
 
     # checks reachability
-    if not utils.TcpPing(utils.HostInfo().name,
-                         primary_ip,
-                         constants.DEFAULT_NODED_PORT):
+    if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
       raise errors.OpPrereqError("Node not reachable by ping")
 
     if not newbie_singlehomed:
       # check reachability from my secondary ip to newbie's secondary ip
-      if not utils.TcpPing(myself.secondary_ip,
-                           secondary_ip,
-                           constants.DEFAULT_NODED_PORT):
+      if not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
+                           source=myself.secondary_ip):
         raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                    " based ping to noded port")
 
@@ -1608,7 +1607,7 @@ class LUAddNode(LogicalUnit):
       dist_nodes.remove(myself.name)
 
     logger.Debug("Copying hosts and known_hosts to all nodes")
-    for fname in ("/etc/hosts", constants.SSH_KNOWN_HOSTS_FILE):
+    for fname in (constants.ETC_HOSTS, constants.SSH_KNOWN_HOSTS_FILE):
       result = rpc.call_upload_file(dist_nodes, fname)
       for to_node in dist_nodes:
         if not result[to_node]:
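
Given the literal it replaces, the new constant presumably carries the same value (assumed definition in constants.py):

```python
# constants.py -- value inferred from the "/etc/hosts" literal it replaces
ETC_HOSTS = "/etc/hosts"
```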
@@ -1860,23 +1859,41 @@ def _AssembleInstanceDisks(instance, cfg, ignore_secondaries=False):
   """
   device_info = []
   disks_ok = True
+  iname = instance.name
+  # With the two-pass mechanism we try to reduce the window of
+  # opportunity for the race condition of switching DRBD to primary
+  # before handshaking occurred, but we do not eliminate it
+
+  # The proper fix would be to wait (with some limits) until the
+  # connection has been made and drbd transitions from WFConnection
+  # into any other network-connected state (Connected, SyncTarget,
+  # SyncSource, etc.)
+
+  # 1st pass, assemble on all nodes in secondary mode
   for inst_disk in instance.disks:
-    master_result = None
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
       cfg.SetDiskID(node_disk, node)
-      is_primary = node == instance.primary_node
-      result = rpc.call_blockdev_assemble(node, node_disk,
-                                          instance.name, is_primary)
+      result = rpc.call_blockdev_assemble(node, node_disk, iname, False)
       if not result:
         logger.Error("could not prepare block device %s on node %s"
-                     " (is_primary=%s)" %
-                     (inst_disk.iv_name, node, is_primary))
-        if is_primary or not ignore_secondaries:
+                     " (is_primary=False, pass=1)" % (inst_disk.iv_name, node))
+        if not ignore_secondaries:
           disks_ok = False
-      if is_primary:
-        master_result = result
-    device_info.append((instance.primary_node, inst_disk.iv_name,
-                        master_result))
+
+  # FIXME: race condition on drbd migration to primary
+
+  # 2nd pass, do only the primary node
+  for inst_disk in instance.disks:
+    for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
+      if node != instance.primary_node:
+        continue
+      cfg.SetDiskID(node_disk, node)
+      result = rpc.call_blockdev_assemble(node, node_disk, iname, True)
+      if not result:
+        logger.Error("could not prepare block device %s on node %s"
+                     " (is_primary=True, pass=2)" % (inst_disk.iv_name, node))
+        disks_ok = False
+    device_info.append((instance.primary_node, inst_disk.iv_name, result))
 
   # leave the disks configured for the primary node
   # this is a workaround that would be fixed better by
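
The "proper fix" described in the comment above could look roughly like the following. Both _WaitForDrbdConnect and get_cstate are hypothetical names, with get_cstate assumed to be a callable returning the device's current DRBD connection state string:

```python
import time

def _WaitForDrbdConnect(get_cstate, timeout=60.0, step=1.0):
  """Poll until the device leaves WFConnection or the timeout expires.

  Returns True once any network-connected state is reported
  (Connected, SyncSource, SyncTarget, ...), False on timeout.
  """
  deadline = time.time() + timeout
  while time.time() < deadline:
    if get_cstate() != "WFConnection":
      return True
    time.sleep(step)
  return False  # still handshaking; the caller decides what to do
```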
@@ -2297,6 +2314,11 @@ class LURenameInstance(LogicalUnit):
     name_info = utils.HostInfo(self.op.new_name)
 
     self.op.new_name = new_name = name_info.name
+    instance_list = self.cfg.GetInstanceList()
+    if new_name in instance_list:
+      raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
+                                 new_name)
+
     if not getattr(self.op, "ignore_ip", False):
       command = ["fping", "-q", name_info.ip]
       result = utils.RunCmd(command)
@@ -2400,7 +2422,7 @@ class LUQueryInstances(NoHooksLU):
     This checks that the fields required are valid output fields.
 
     """
-    self.dynamic_fields = frozenset(["oper_state", "oper_ram"])
+    self.dynamic_fields = frozenset(["oper_state", "oper_ram", "status"])
     _CheckOutputFields(static=["name", "os", "pnode", "snodes",
                                "admin_state", "admin_ram",
                                "disk_template", "ip", "mac", "bridge",
@@ -2457,6 +2479,21 @@ class LUQueryInstances(NoHooksLU):
             val = None
           else:
             val = bool(live_data.get(instance.name))
+        elif field == "status":
+          if instance.primary_node in bad_nodes:
+            val = "ERROR_nodedown"
+          else:
+            running = bool(live_data.get(instance.name))
+            if running:
+              if instance.status != "down":
+                val = "running"
+              else:
+                val = "ERROR_up"
+            else:
+              if instance.status != "down":
+                val = "ERROR_down"
+              else:
+                val = "ADMIN_down"
         elif field == "admin_ram":
           val = instance.memory
         elif field == "oper_ram":
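
The new "status" branch condenses to a small decision table; the helper below is an illustration only, not part of the patch:

```python
def _InstanceStatus(node_down, running, admin_up):
  """Restatement of the "status" field logic added above."""
  if node_down:
    return "ERROR_nodedown"
  if running:
    if admin_up:
      return "running"    # running and configured to run
    return "ERROR_up"     # running but administratively down
  if admin_up:
    return "ERROR_down"   # configured to run but not running
  return "ADMIN_down"     # stopped on purpose

assert _InstanceStatus(False, True, True) == "running"
assert _InstanceStatus(False, False, False) == "ADMIN_down"
```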
@@ -2708,9 +2745,9 @@ def _GenerateDiskTemplate(cfg, template_name,
   #TODO: compute space requirements
 
   vgname = cfg.GetVGName()
-  if template_name == "diskless":
+  if template_name == constants.DT_DISKLESS:
     disks = []
-  elif template_name == "plain":
+  elif template_name == constants.DT_PLAIN:
     if len(secondary_nodes) != 0:
       raise errors.ProgrammerError("Wrong template configuration")
 
@@ -2722,7 +2759,7 @@ def _GenerateDiskTemplate(cfg, template_name,
                            logical_id=(vgname, names[1]),
                            iv_name = "sdb")
     disks = [sda_dev, sdb_dev]
-  elif template_name == "local_raid1":
+  elif template_name == constants.DT_LOCAL_RAID1:
     if len(secondary_nodes) != 0:
       raise errors.ProgrammerError("Wrong template configuration")
 
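
The disk template constants presumably carry the same values as the string literals they replace (assumed definitions in constants.py):

```python
# constants.py -- values inferred from the literals they replace
DT_DISKLESS = "diskless"
DT_PLAIN = "plain"
DT_LOCAL_RAID1 = "local_raid1"
```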
@@ -2878,7 +2915,7 @@ class LUCreateInstance(LogicalUnit):
       os_type=self.op.os_type,
       memory=self.op.mem_size,
       vcpus=self.op.vcpus,
-      nics=[(self.inst_ip, self.op.bridge)],
+      nics=[(self.inst_ip, self.op.bridge, self.op.mac)],
     ))
 
     nl = ([self.sstore.GetMasterNode(), self.op.pnode] +
@@ -3034,8 +3071,7 @@ class LUCreateInstance(LogicalUnit):
                                  " adding an instance in start mode")
 
     if self.op.ip_check:
-      if utils.TcpPing(utils.HostInfo().name, hostname1.ip,
-                       constants.DEFAULT_NODED_PORT):
+      if utils.TcpPing(hostname1.ip, constants.DEFAULT_NODED_PORT):
         raise errors.OpPrereqError("IP %s of instance %s already in use" %
                                    (hostname1.ip, instance_name))
 
@@ -3060,8 +3096,8 @@ class LUCreateInstance(LogicalUnit):
     # boot order verification
     if self.op.hvm_boot_order is not None:
       if len(self.op.hvm_boot_order.strip("acdn")) != 0:
-             raise errors.OpPrereqError("invalid boot order specified,"
-                                        " must be one or more of [acdn]")
+        raise errors.OpPrereqError("invalid boot order specified,"
+                                   " must be one or more of [acdn]")
 
     if self.op.start:
       self.instance_status = 'up'
@@ -3723,7 +3759,7 @@ class LUReplaceDisks(LogicalUnit):
       # ok, we created the new LVs, so now we know we have the needed
       # storage; as such, we proceed on the target node to rename
       # old_lv to _old, and new_lv to old_lv; note that we rename LVs
-      # using the assumption than logical_id == physical_id (which in
+      # using the assumption that logical_id == physical_id (which in
       # turn is the unique_id on that node)
 
       # FIXME(iustin): use a better name for the replaced LVs
@@ -3995,7 +4031,7 @@ class LUQueryInstanceData(NoHooksLU):
         instance = self.cfg.GetInstanceInfo(self.cfg.ExpandInstanceName(name))
         if instance is None:
           raise errors.OpPrereqError("No such instance name '%s'" % name)
-      self.wanted_instances.append(instance)
+        self.wanted_instances.append(instance)
     else:
       self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
                                in self.cfg.GetInstanceList()]
@@ -4098,7 +4134,7 @@ class LUSetInstanceParms(LogicalUnit):
       args['memory'] = self.mem
     if self.vcpus:
       args['vcpus'] = self.vcpus
-    if self.do_ip or self.do_bridge:
+    if self.do_ip or self.do_bridge or self.mac:
       if self.do_ip:
         ip = self.ip
       else:
@@ -4107,7 +4143,11 @@ class LUSetInstanceParms(LogicalUnit):
         bridge = self.bridge
       else:
         bridge = self.instance.nics[0].bridge
-      args['nics'] = [(ip, bridge)]
+      if self.mac:
+        mac = self.mac
+      else:
+        mac = self.instance.nics[0].mac
+      args['nics'] = [(ip, bridge, mac)]
     env = _BuildInstanceHookEnvByObject(self.instance, override=args)
     nl = [self.sstore.GetMasterNode(),
           self.instance.primary_node] + list(self.instance.secondary_nodes)
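
Consumers of the hook environment now receive three-element NIC tuples; an unpacking loop along these lines would expose the MAC as well (the INSTANCE_NIC* key names are assumed, not taken from this patch):

```python
env = {}
nics = [("192.0.2.10", "xen-br0", "aa:00:00:11:22:33")]  # example data
for idx, (ip, bridge, mac) in enumerate(nics):
  env["INSTANCE_NIC%d_IP" % idx] = ip or ""
  env["INSTANCE_NIC%d_BRIDGE" % idx] = bridge
  env["INSTANCE_NIC%d_MAC" % idx] = mac
```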
@@ -4524,3 +4564,39 @@ class LUDelTags(TagsLU):
       raise errors.OpRetryError("There has been a modification to the"
                                 " config file and the operation has been"
                                 " aborted. Please retry.")
+
+
+class LUTestDelay(NoHooksLU):
+  """Sleep for a specified amount of time.
+
+  This LU sleeps on the master and/or nodes for a specified amount of
+  time.
+
+  """
+  _OP_REQP = ["duration", "on_master", "on_nodes"]
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks that we have a good list of nodes and/or the duration
+    is valid.
+
+    """
+
+    if self.op.on_nodes:
+      self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
+
+  def Exec(self, feedback_fn):
+    """Do the actual sleep.
+
+    """
+    if self.op.on_master:
+      if not utils.TestDelay(self.op.duration):
+        raise errors.OpExecError("Error during master delay test")
+    if self.op.on_nodes:
+      result = rpc.call_test_delay(self.op.on_nodes, self.op.duration)
+      if not result:
+        raise errors.OpExecError("Complete failure from rpc call")
+      for node, node_result in result.items():
+        if not node_result:
+          raise errors.OpExecError("Failure during rpc call to node %s,"
+                                   " result: %s" % (node, node_result))
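
Assuming the new LU is wired up to a matching opcode (the OpTestDelay name and its fields are inferred here from _OP_REQP, not shown in this patch), a delay test could be built like this:

```python
# Hypothetical opcode construction: a 5-second sleep on the master and
# on one node, exercising both code paths in Exec() above.
op = opcodes.OpTestDelay(duration=5.0, on_master=True,
                         on_nodes=["node2.example.com"])
```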