Modify utils.TcpPing to make source address optional
lib/cmdlib.py
index 6318d45..2e26cfb 100644
@@ -510,13 +510,13 @@ class LUInitCluster(LogicalUnit):
 
     if hostname.ip.startswith("127."):
       raise errors.OpPrereqError("This host's IP resolves to the private"
-                                 " range (%s). Please fix DNS or /etc/hosts." %
-                                 (hostname.ip,))
+                                 " range (%s). Please fix DNS or %s." %
+                                 (hostname.ip, constants.ETC_HOSTS))
 
     self.clustername = clustername = utils.HostInfo(self.op.cluster_name)
 
-    if not utils.TcpPing(constants.LOCALHOST_IP_ADDRESS, hostname.ip,
-                         constants.DEFAULT_NODED_PORT):
+    if not utils.TcpPing(hostname.ip, constants.DEFAULT_NODED_PORT,
+                         source=constants.LOCALHOST_IP_ADDRESS):
       raise errors.OpPrereqError("Inconsistency: this host's name resolves"
                                  " to %s,\nbut this ip address does not"
                                  " belong to this host."
@@ -527,8 +527,8 @@ class LUInitCluster(LogicalUnit):
       raise errors.OpPrereqError("Invalid secondary ip given")
     if (secondary_ip and
         secondary_ip != hostname.ip and
-        (not utils.TcpPing(constants.LOCALHOST_IP_ADDRESS, secondary_ip,
-                           constants.DEFAULT_NODED_PORT))):
+        (not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
+                           source=constants.LOCALHOST_IP_ADDRESS))):
       raise errors.OpPrereqError("You gave %s as secondary IP,"
                                  " but it does not belong to this host." %
                                  secondary_ip)
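
For reference, the new calling convention used above implies a utils.TcpPing signature roughly like the sketch below. This is not the actual ganeti implementation; the timeout parameter and its default are assumptions:

```python
import socket

def TcpPing(target, port, timeout=10, source=None):
  """Probe a TCP port, optionally binding to a source address first.

  Sketch only: the target now comes first and the source address is an
  optional keyword argument, matching the updated call sites above.
  """
  sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  try:
    if source is not None:
      sock.bind((source, 0))  # bind only when an explicit source was given
    sock.settimeout(timeout)
    sock.connect((target, port))
    return True
  except socket.error:
    return False
  finally:
    sock.close()
```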
@@ -632,6 +632,8 @@ class LUDestroyCluster(NoHooksLU):
 
     """
     master = self.sstore.GetMasterNode()
+    if not rpc.call_node_stop_master(master):
+      raise errors.OpExecError("Could not disable the master role")
     priv_key, pub_key, _ = ssh.GetUserFiles(constants.GANETI_RUNAS)
     utils.CreateBackup(priv_key)
     utils.CreateBackup(pub_key)
@@ -947,7 +949,7 @@ class LUVerifyDisks(NoHooksLU):
         inst = nv_dict.pop((node, lv_name), None)
         if (not lv_online and inst is not None
             and inst.name not in res_instances):
-            res_instances.append(inst.name)
+          res_instances.append(inst.name)
 
     # any leftover items in nv_dict are missing LVs, let's arrange the
     # data better
@@ -972,7 +974,7 @@ class LURenameCluster(LogicalUnit):
 
     """
     env = {
-      "OP_TARGET": self.op.sstore.GetClusterName(),
+      "OP_TARGET": self.sstore.GetClusterName(),
       "NEW_NAME": self.op.name,
       }
     mn = self.sstore.GetMasterNode()
@@ -1475,16 +1477,13 @@ class LUAddNode(LogicalUnit):
                                    " new node doesn't have one")
 
     # checks reachability
-    if not utils.TcpPing(utils.HostInfo().name,
-                         primary_ip,
-                         constants.DEFAULT_NODED_PORT):
+    if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
       raise errors.OpPrereqError("Node not reachable by ping")
 
     if not newbie_singlehomed:
       # check reachability from my secondary ip to newbie's secondary ip
-      if not utils.TcpPing(myself.secondary_ip,
-                           secondary_ip,
-                           constants.DEFAULT_NODED_PORT):
+      if not utils.TcpPing(secondary_ip, constants.DEFAULT_NODED_PORT,
+                           source=myself.secondary_ip):
         raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                    " based ping to noded port")
 
@@ -1608,7 +1607,7 @@ class LUAddNode(LogicalUnit):
       dist_nodes.remove(myself.name)
 
     logger.Debug("Copying hosts and known_hosts to all nodes")
-    for fname in ("/etc/hosts", constants.SSH_KNOWN_HOSTS_FILE):
+    for fname in (constants.ETC_HOSTS, constants.SSH_KNOWN_HOSTS_FILE):
       result = rpc.call_upload_file(dist_nodes, fname)
       for to_node in dist_nodes:
         if not result[to_node]:
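
Given the literal it replaces, the new constant presumably carries the same value (assumed definition in constants.py):

```python
# constants.py -- value inferred from the "/etc/hosts" literal it replaces
ETC_HOSTS = "/etc/hosts"
```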
@@ -1860,23 +1859,41 @@ def _AssembleInstanceDisks(instance, cfg, ignore_secondaries=False):
   """
   device_info = []
   disks_ok = True
+  iname = instance.name
+  # With the two-pass mechanism we try to reduce the window of
+  # opportunity for the race condition of switching DRBD to primary
+  # before handshaking occurred, but we do not eliminate it
+
+  # The proper fix would be to wait (with some limits) until the
+  # connection has been made and drbd transitions from WFConnection
+  # into any other network-connected state (Connected, SyncTarget,
+  # SyncSource, etc.)
+
+  # 1st pass, assemble on all nodes in secondary mode
   for inst_disk in instance.disks:
-    master_result = None
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
       cfg.SetDiskID(node_disk, node)
-      is_primary = node == instance.primary_node
-      result = rpc.call_blockdev_assemble(node, node_disk,
-                                          instance.name, is_primary)
+      result = rpc.call_blockdev_assemble(node, node_disk, iname, False)
       if not result:
         logger.Error("could not prepare block device %s on node %s"
-                     " (is_primary=%s)" %
-                     (inst_disk.iv_name, node, is_primary))
-        if is_primary or not ignore_secondaries:
+                     " (is_primary=False, pass=1)" % (inst_disk.iv_name, node))
+        if not ignore_secondaries:
           disks_ok = False
-      if is_primary:
-        master_result = result
-    device_info.append((instance.primary_node, inst_disk.iv_name,
-                        master_result))
+
+  # FIXME: race condition on drbd migration to primary
+
+  # 2nd pass, do only the primary node
+  for inst_disk in instance.disks:
+    for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
+      if node != instance.primary_node:
+        continue
+      cfg.SetDiskID(node_disk, node)
+      result = rpc.call_blockdev_assemble(node, node_disk, iname, True)
+      if not result:
+        logger.Error("could not prepare block device %s on node %s"
+                     " (is_primary=True, pass=2)" % (inst_disk.iv_name, node))
+        disks_ok = False
+    device_info.append((instance.primary_node, inst_disk.iv_name, result))
 
   # leave the disks configured for the primary node
   # this is a workaround that would be fixed better by
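
The "proper fix" described in the comment above could look roughly like the following. Both _WaitForDrbdConnect and get_cstate are hypothetical names, with get_cstate assumed to be a callable returning the device's current DRBD connection state string:

```python
import time

def _WaitForDrbdConnect(get_cstate, timeout=60.0, step=1.0):
  """Poll until the device leaves WFConnection or the timeout expires.

  Returns True once any network-connected state is reported
  (Connected, SyncSource, SyncTarget, ...), False on timeout.
  """
  deadline = time.time() + timeout
  while time.time() < deadline:
    if get_cstate() != "WFConnection":
      return True
    time.sleep(step)
  return False  # still handshaking; the caller decides what to do
```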
@@ -2297,6 +2314,11 @@ class LURenameInstance(LogicalUnit):
     name_info = utils.HostInfo(self.op.new_name)
 
     self.op.new_name = new_name = name_info.name
+    instance_list = self.cfg.GetInstanceList()
+    if new_name in instance_list:
+      raise errors.OpPrereqError("Instance '%s' is already in the cluster" %
+                                 new_name)
+
     if not getattr(self.op, "ignore_ip", False):
       command = ["fping", "-q", name_info.ip]
       result = utils.RunCmd(command)
@@ -2400,7 +2422,7 @@ class LUQueryInstances(NoHooksLU):
     This checks that the fields required are valid output fields.
 
     """
-    self.dynamic_fields = frozenset(["oper_state", "oper_ram"])
+    self.dynamic_fields = frozenset(["oper_state", "oper_ram", "status"])
     _CheckOutputFields(static=["name", "os", "pnode", "snodes",
                                "admin_state", "admin_ram",
                                "disk_template", "ip", "mac", "bridge",
@@ -2457,6 +2479,21 @@ class LUQueryInstances(NoHooksLU):
             val = None
           else:
             val = bool(live_data.get(instance.name))
+        elif field == "status":
+          if instance.primary_node in bad_nodes:
+            val = "ERROR_nodedown"
+          else:
+            running = bool(live_data.get(instance.name))
+            if running:
+              if instance.status != "down":
+                val = "running"
+              else:
+                val = "ERROR_up"
+            else:
+              if instance.status != "down":
+                val = "ERROR_down"
+              else:
+                val = "ADMIN_down"
         elif field == "admin_ram":
           val = instance.memory
         elif field == "oper_ram":
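
The new "status" branch condenses to a small decision table; the helper below is an illustration only, not part of the patch:

```python
def _InstanceStatus(node_down, running, admin_up):
  """Restatement of the "status" field logic added above."""
  if node_down:
    return "ERROR_nodedown"
  if running:
    if admin_up:
      return "running"    # running and configured to run
    return "ERROR_up"     # running but administratively down
  if admin_up:
    return "ERROR_down"   # configured to run but not running
  return "ADMIN_down"     # stopped on purpose

assert _InstanceStatus(False, True, True) == "running"
assert _InstanceStatus(False, False, False) == "ADMIN_down"
```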
@@ -2708,9 +2745,9 @@ def _GenerateDiskTemplate(cfg, template_name,
   #TODO: compute space requirements
 
   vgname = cfg.GetVGName()
-  if template_name == "diskless":
+  if template_name == constants.DT_DISKLESS:
     disks = []
-  elif template_name == "plain":
+  elif template_name == constants.DT_PLAIN:
     if len(secondary_nodes) != 0:
       raise errors.ProgrammerError("Wrong template configuration")
 
@@ -2722,7 +2759,7 @@ def _GenerateDiskTemplate(cfg, template_name,
                            logical_id=(vgname, names[1]),
                            iv_name = "sdb")
     disks = [sda_dev, sdb_dev]
-  elif template_name == "local_raid1":
+  elif template_name == constants.DT_LOCAL_RAID1:
     if len(secondary_nodes) != 0:
       raise errors.ProgrammerError("Wrong template configuration")
 
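
The disk template constants presumably carry the same values as the string literals they replace (assumed definitions in constants.py):

```python
# constants.py -- values inferred from the literals they replace
DT_DISKLESS = "diskless"
DT_PLAIN = "plain"
DT_LOCAL_RAID1 = "local_raid1"
```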
@@ -2878,7 +2915,7 @@ class LUCreateInstance(LogicalUnit):
       os_type=self.op.os_type,
       memory=self.op.mem_size,
       vcpus=self.op.vcpus,
-      nics=[(self.inst_ip, self.op.bridge)],
+      nics=[(self.inst_ip, self.op.bridge, self.op.mac)],
     ))
 
     nl = ([self.sstore.GetMasterNode(), self.op.pnode] +
@@ -3034,8 +3071,7 @@ class LUCreateInstance(LogicalUnit):
                                  " adding an instance in start mode")
 
     if self.op.ip_check:
-      if utils.TcpPing(utils.HostInfo().name, hostname1.ip,
-                       constants.DEFAULT_NODED_PORT):
+      if utils.TcpPing(hostname1.ip, constants.DEFAULT_NODED_PORT):
         raise errors.OpPrereqError("IP %s of instance %s already in use" %
                                    (hostname1.ip, instance_name))
 
@@ -3060,8 +3096,8 @@ class LUCreateInstance(LogicalUnit):
     # boot order verification
     if self.op.hvm_boot_order is not None:
       if len(self.op.hvm_boot_order.strip("acdn")) != 0:
-             raise errors.OpPrereqError("invalid boot order specified,"
-                                        " must be one or more of [acdn]")
+        raise errors.OpPrereqError("invalid boot order specified,"
+                                   " must be one or more of [acdn]")
 
     if self.op.start:
       self.instance_status = 'up'
@@ -3723,7 +3759,7 @@ class LUReplaceDisks(LogicalUnit):
       # ok, we created the new LVs, so now we know we have the needed
       # storage; as such, we proceed on the target node to rename
       # old_lv to _old, and new_lv to old_lv; note that we rename LVs
-      # using the assumption than logical_id == physical_id (which in
+      # using the assumption that logical_id == physical_id (which in
       # turn is the unique_id on that node)
 
       # FIXME(iustin): use a better name for the replaced LVs
@@ -3995,7 +4031,7 @@ class LUQueryInstanceData(NoHooksLU):
         instance = self.cfg.GetInstanceInfo(self.cfg.ExpandInstanceName(name))
         if instance is None:
           raise errors.OpPrereqError("No such instance name '%s'" % name)
-      self.wanted_instances.append(instance)
+        self.wanted_instances.append(instance)
     else:
       self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
                                in self.cfg.GetInstanceList()]
@@ -4098,7 +4134,7 @@ class LUSetInstanceParms(LogicalUnit):
       args['memory'] = self.mem
     if self.vcpus:
       args['vcpus'] = self.vcpus
-    if self.do_ip or self.do_bridge:
+    if self.do_ip or self.do_bridge or self.mac:
       if self.do_ip:
         ip = self.ip
       else:
@@ -4107,7 +4143,11 @@ class LUSetInstanceParms(LogicalUnit):
         bridge = self.bridge
       else:
         bridge = self.instance.nics[0].bridge
-      args['nics'] = [(ip, bridge)]
+      if self.mac:
+        mac = self.mac
+      else:
+        mac = self.instance.nics[0].mac
+      args['nics'] = [(ip, bridge, mac)]
     env = _BuildInstanceHookEnvByObject(self.instance, override=args)
     nl = [self.sstore.GetMasterNode(),
           self.instance.primary_node] + list(self.instance.secondary_nodes)
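
Consumers of the hook environment now receive three-element NIC tuples; an unpacking loop along these lines would expose the MAC as well (the INSTANCE_NIC* key names are assumed, not taken from this patch):

```python
env = {}
nics = [("192.0.2.10", "xen-br0", "aa:00:00:11:22:33")]  # example data
for idx, (ip, bridge, mac) in enumerate(nics):
  env["INSTANCE_NIC%d_IP" % idx] = ip or ""
  env["INSTANCE_NIC%d_BRIDGE" % idx] = bridge
  env["INSTANCE_NIC%d_MAC" % idx] = mac
```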
@@ -4524,3 +4564,39 @@ class LUDelTags(TagsLU):
       raise errors.OpRetryError("There has been a modification to the"
                                 " config file and the operation has been"
                                 " aborted. Please retry.")
+
+
+class LUTestDelay(NoHooksLU):
+  """Sleep for a specified amount of time.
+
+  This LU sleeps on the master and/or nodes for a specified amount of
+  time.
+
+  """
+  _OP_REQP = ["duration", "on_master", "on_nodes"]
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks that we have a good list of nodes and/or the duration
+    is valid.
+
+    """
+
+    if self.op.on_nodes:
+      self.op.on_nodes = _GetWantedNodes(self, self.op.on_nodes)
+
+  def Exec(self, feedback_fn):
+    """Do the actual sleep.
+
+    """
+    if self.op.on_master:
+      if not utils.TestDelay(self.op.duration):
+        raise errors.OpExecError("Error during master delay test")
+    if self.op.on_nodes:
+      result = rpc.call_test_delay(self.op.on_nodes, self.op.duration)
+      if not result:
+        raise errors.OpExecError("Complete failure from rpc call")
+      for node, node_result in result.items():
+        if not node_result:
+          raise errors.OpExecError("Failure during rpc call to node %s,"
+                                   " result: %s" % (node, node_result))
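
Assuming the new LU is wired up to a matching opcode (the OpTestDelay name and its fields are inferred here from _OP_REQP, not shown in this patch), a delay test could be built like this:

```python
# Hypothetical opcode construction: a 5-second sleep on the master and
# on one node, exercising both code paths in Exec() above.
op = opcodes.OpTestDelay(duration=5.0, on_master=True,
                         on_nodes=["node2.example.com"])
```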