from ganeti import utils
+# Strategies for the --groups option: merge identically-named node groups
+# or rename the remote ones to avoid the conflict
+_GROUPS_MERGE = "merge"
+_GROUPS_RENAME = "rename"
+# Prefix for the fake ec_id values used while adding the other cluster's
+# nodes and instances to the merged configuration
+_CLUSTERMERGE_ECID = "clustermerge-ecid"
+# Strategies for the --restart option: which instances to start again once
+# the merge is done (_RESTART_UP is not implemented yet)
+_RESTART_ALL = "all"
+_RESTART_UP = "up"
+_RESTART_NONE = "none"
+_RESTART_CHOICES = (_RESTART_ALL, _RESTART_UP, _RESTART_NONE)
+
+
PAUSE_PERIOD_OPT = cli.cli_option("-p", "--watcher-pause-period", default=1800,
action="store", type="int",
dest="pause_period",
help=("Amount of time in seconds watcher"
" should be suspended from running"))
-_CLUSTERMERGE_ECID = "clustermerge-ecid"
+# --groups: what to do when a node group name exists in both clusters
+GROUPS_OPT = cli.cli_option("--groups", default=None, metavar="STRATEGY",
+                            choices=(_GROUPS_MERGE, _GROUPS_RENAME),
+                            dest="groups",
+                            help=("How to handle groups that have the"
+                                  " same name (One of: %s/%s)" %
+                                  (_GROUPS_MERGE, _GROUPS_RENAME)))
+RESTART_OPT = cli.cli_option("--restart", default=_RESTART_ALL,
+ metavar="STRATEGY",
+ choices=_RESTART_CHOICES,
+ dest="restart",
+ help=("How to handle restarting instances"
+ " same name (One of: %s/%s/%s)" %
+ _RESTART_CHOICES))
def Flatten(unflattened_list):
@param cluster: The name of the cluster
@param key_path: Path to the ssh private key used for authentication
- @param config_path: Path to the merging cluster config
- @param nodes: List of nodes in the merging cluster
+ @param nodes: List of online nodes in the merging cluster
@param instances: List of instances running on merging cluster
+ @param config_path: Path to the merging cluster config
"""
self.cluster = cluster
self.key_path = key_path
- self.config_path = config_path
- self.instances = instances
self.nodes = nodes
+ self.instances = instances
+ self.config_path = config_path
class Merger(object):
"""Handling the merge.
"""
- def __init__(self, clusters, pause_period):
+ def __init__(self, clusters, pause_period, groups, restart):
"""Initialize object with sane defaults and infos required.
@param clusters: The list of clusters to merge in
@param pause_period: The time watcher shall be disabled for
+ @param groups: How to handle group conflicts
+ @param restart: How to handle instance restart
"""
self.merger_data = []
self.work_dir = tempfile.mkdtemp(suffix="cluster-merger")
(self.cluster_name, ) = cli.GetClient().QueryConfigValues(["cluster_name"])
self.ssh_runner = ssh.SshRunner(self.cluster_name)
+ self.groups = groups
+ self.restart = restart
+ if self.restart == _RESTART_UP:
+ raise NotImplementedError
+
def Setup(self):
"""Sets up our end so we can do the merger.
key_path = utils.PathJoin(self.work_dir, cluster)
utils.WriteFile(key_path, mode=0600, data=result.stdout)
- result = self._RunCmd(cluster, "gnt-node list -o name --no-header",
- private_key=key_path)
+ result = self._RunCmd(cluster, "gnt-node list -o name,offline"
+ " --no-header --separator=,", private_key=key_path)
if result.failed:
raise errors.RemoteError("Unable to retrieve list of nodes from %s."
" Fail reason: %s; output: %s" %
(cluster, result.fail_reason, result.output))
- nodes = result.stdout.splitlines()
+ nodes_statuses = [line.split(',') for line in result.stdout.splitlines()]
+ nodes = [node_status[0] for node_status in nodes_statuses
+ if node_status[1] == "N"]
result = self._RunCmd(cluster, "gnt-instance list -o name --no-header",
private_key=key_path)
for data in self.merger_data:
other_config = config.ConfigWriter(data.config_path, accept_foreign=True)
+ self._MergeClusterConfigs(my_config, other_config)
self._MergeNodeGroups(my_config, other_config)
for node in other_config.GetNodeList():
node_info = other_config.GetNodeInfo(node)
- node_info.master_candidate = False
- my_config.AddNode(node_info, str(fake_ec_id))
+ my_config.AddNode(node_info, _CLUSTERMERGE_ECID + str(fake_ec_id))
fake_ec_id += 1
for instance in other_config.GetInstanceList():
physical_id[1] = physical_id[3] = port
dsk.physical_id = tuple(physical_id)
- my_config.AddInstance(instance_info, str(fake_ec_id))
+ my_config.AddInstance(instance_info,
+ _CLUSTERMERGE_ECID + str(fake_ec_id))
fake_ec_id += 1
# R0201: Method could be a function
+  def _MergeClusterConfigs(self, my_config, other_config):
+    """Checks that all relevant cluster parameters are compatible
+
+    Logs an error for every hard incompatibility found (generic cluster
+    parameters, enabled hypervisors, hypervisor and OS parameters) and a
+    warning for soft differences where this cluster's value will simply
+    take precedence.  Mergeable values (reserved_lvs, prealloc_wipe_disks,
+    osparams) are folded into this cluster's configuration.
+
+    @param my_config: Configuration object of this (merge target) cluster
+    @param other_config: Configuration object of the cluster being merged in
+    @raise errors.ConfigurationError: If any incompatible parameter was
+        detected, after all checks have run
+
+    """
+    # pylint: disable-msg=R0201
+    my_cluster = my_config.GetClusterInfo()
+    other_cluster = other_config.GetClusterInfo()
+    err_count = 0
+
+    #
+    # Generic checks
+    #
+    check_params = (
+      "beparams",
+      "default_iallocator",
+      "drbd_usermode_helper",
+      "file_storage_dir",
+      "hidden_os",
+      "maintain_node_health",
+      "master_netdev",
+      "ndparams",
+      "nicparams",
+      "primary_ip_family",
+      "tags",
+      "uid_pool",
+      "volume_group_name",
+      )
+    for param_name in check_params:
+      my_param = getattr(my_cluster, param_name)
+      other_param = getattr(other_cluster, param_name)
+      if my_param != other_param:
+        logging.error("The value (%s) of the cluster parameter %s on %s"
+                      " differs to this cluster's value (%s)",
+                      other_param, param_name, other_cluster.cluster_name,
+                      my_param)
+        err_count += 1
+
+    #
+    # Custom checks
+    #
+
+    # Check default hypervisor
+    my_defhyp = my_cluster.enabled_hypervisors[0]
+    other_defhyp = other_cluster.enabled_hypervisors[0]
+    if my_defhyp != other_defhyp:
+      logging.warning("The default hypervisor (%s) differs on %s, new"
+                      " instances will be created with this cluster's"
+                      " default hypervisor (%s)", other_defhyp,
+                      other_cluster.cluster_name, my_defhyp)
+
+    if (set(my_cluster.enabled_hypervisors) !=
+        set(other_cluster.enabled_hypervisors)):
+      logging.error("The set of enabled hypervisors (%s) on %s differs to"
+                    " this cluster's set (%s)",
+                    other_cluster.enabled_hypervisors,
+                    other_cluster.cluster_name, my_cluster.enabled_hypervisors)
+      err_count += 1
+
+    # Check hypervisor params for hypervisors we care about
+    # TODO: we probably don't care about all params for a given hypervisor
+    # NOTE(review): assumes other_cluster.hvparams contains every locally
+    # enabled hypervisor and every parameter checked below — a missing key
+    # would raise KeyError here instead of being counted as an
+    # incompatibility; confirm hvparams is always fully populated.
+    for hyp in my_cluster.enabled_hypervisors:
+      for param in my_cluster.hvparams[hyp]:
+        my_value = my_cluster.hvparams[hyp][param]
+        other_value = other_cluster.hvparams[hyp][param]
+        if my_value != other_value:
+          logging.error("The value (%s) of the %s parameter of the %s"
+                        " hypervisor on %s differs to this cluster's parameter"
+                        " (%s)",
+                        other_value, param, hyp, other_cluster.cluster_name,
+                        my_value)
+          err_count += 1
+
+    # Check os hypervisor params for hypervisors we care about
+    for os_name in set(my_cluster.os_hvp.keys() + other_cluster.os_hvp.keys()):
+      for hyp in my_cluster.enabled_hypervisors:
+        my_os_hvp = self._GetOsHypervisor(my_cluster, os_name, hyp)
+        other_os_hvp = self._GetOsHypervisor(other_cluster, os_name, hyp)
+        if my_os_hvp != other_os_hvp:
+          logging.error("The OS parameters (%s) for the %s OS for the %s"
+                        " hypervisor on %s differs to this cluster's parameters"
+                        " (%s)",
+                        other_os_hvp, os_name, hyp, other_cluster.cluster_name,
+                        my_os_hvp)
+          err_count += 1
+
+    #
+    # Warnings
+    #
+    if my_cluster.modify_etc_hosts != other_cluster.modify_etc_hosts:
+      logging.warning("The modify_etc_hosts value (%s) differs on %s,"
+                      " this cluster's value (%s) will take precedence",
+                      other_cluster.modify_etc_hosts,
+                      other_cluster.cluster_name,
+                      my_cluster.modify_etc_hosts)
+
+    if my_cluster.modify_ssh_setup != other_cluster.modify_ssh_setup:
+      logging.warning("The modify_ssh_setup value (%s) differs on %s,"
+                      " this cluster's value (%s) will take precedence",
+                      other_cluster.modify_ssh_setup,
+                      other_cluster.cluster_name,
+                      my_cluster.modify_ssh_setup)
+
+    #
+    # Actual merging
+    #
+    my_cluster.reserved_lvs = list(set(my_cluster.reserved_lvs +
+                                       other_cluster.reserved_lvs))
+
+    if my_cluster.prealloc_wipe_disks != other_cluster.prealloc_wipe_disks:
+      logging.warning("The prealloc_wipe_disks value (%s) on %s differs to this"
+                      " cluster's value (%s). The least permissive value (%s)"
+                      " will be used", other_cluster.prealloc_wipe_disks,
+                      other_cluster.cluster_name,
+                      my_cluster.prealloc_wipe_disks, True)
+      # Wiping is the safer behaviour, so it wins when the clusters disagree
+      my_cluster.prealloc_wipe_disks = True
+
+    for os_, osparams in other_cluster.osparams.items():
+      if os_ not in my_cluster.osparams:
+        my_cluster.osparams[os_] = osparams
+      elif my_cluster.osparams[os_] != osparams:
+        logging.error("The OS parameters (%s) for the %s OS on %s differs to"
+                      " this cluster's parameters (%s)",
+                      osparams, os_, other_cluster.cluster_name,
+                      my_cluster.osparams[os_])
+        err_count += 1
+
+    # Raise only after running all checks, so the admin sees every problem
+    # in one pass instead of fixing them one re-run at a time
+    if err_count:
+      raise errors.ConfigurationError("Cluster config for %s has incompatible"
+                                      " values, please fix and re-run" %
+                                      other_cluster.cluster_name)
+
+  # R0201: Method could be a function
+  def _GetOsHypervisor(self, cluster, os_name, hyp): # pylint: disable-msg=R0201
+    """Returns the per-OS hypervisor parameters for one hypervisor.
+
+    @param cluster: Cluster configuration object to look in
+    @param os_name: Name of the OS
+    @param hyp: Name of the hypervisor
+    @return: The C{os_hvp} entry for the (OS, hypervisor) pair, or None if
+        either the OS or the hypervisor has no entry
+
+    """
+    if os_name in cluster.os_hvp:
+      return cluster.os_hvp[os_name].get(hyp, None)
+    else:
+      return None
+
+ # R0201: Method could be a function
def _MergeNodeGroups(self, my_config, other_config):
"""Adds foreign node groups
ConfigWriter.AddNodeGroup takes care of making sure there are no conflicts.
"""
# pylint: disable-msg=R0201
- for (_, grp) in other_config.GetAllNodeGroupsInfo():
+ logging.info("Node group conflict strategy: %s", self.groups)
+
+ my_grps = my_config.GetAllNodeGroupsInfo().values()
+ other_grps = other_config.GetAllNodeGroupsInfo().values()
+
+ # Check for node group naming conflicts:
+ conflicts = []
+ for other_grp in other_grps:
+ for my_grp in my_grps:
+ if other_grp.name == my_grp.name:
+ conflicts.append(other_grp)
+
+ if conflicts:
+ conflict_names = utils.CommaJoin([g.name for g in conflicts])
+ logging.info("Node groups in both local and remote cluster: %s",
+ conflict_names)
+
+ # User hasn't specified how to handle conflicts
+ if not self.groups:
+ raise errors.CommandError("The following node group(s) are in both"
+ " clusters, and no merge strategy has been"
+ " supplied (see the --groups option): %s" %
+ conflict_names)
+
+ # User wants to rename conflicts
+ elif self.groups == _GROUPS_RENAME:
+ for grp in conflicts:
+ new_name = "%s-%s" % (grp.name, other_config.GetClusterName())
+ logging.info("Renaming remote node group from %s to %s"
+ " to resolve conflict", grp.name, new_name)
+ grp.name = new_name
+
+ # User wants to merge conflicting groups
+ elif self.groups == _GROUPS_MERGE:
+ for other_grp in conflicts:
+ logging.info("Merging local and remote '%s' groups", other_grp.name)
+ for node_name in other_grp.members[:]:
+ node = other_config.GetNodeInfo(node_name)
+ # Access to a protected member of a client class
+ # pylint: disable-msg=W0212
+ other_config._UnlockedRemoveNodeFromGroup(node)
+
+ # Access to a protected member of a client class
+ # pylint: disable-msg=W0212
+ my_grp_uuid = my_config._UnlockedLookupNodeGroup(other_grp.name)
+
+ # Access to a protected member of a client class
+ # pylint: disable-msg=W0212
+ my_config._UnlockedAddNodeToGroup(node, my_grp_uuid)
+ node.group = my_grp_uuid
+ # Remove from list of groups to add
+ other_grps.remove(other_grp)
+
+ for grp in other_grps:
#TODO: handle node group conflicts
my_config.AddNodeGroup(grp, _CLUSTERMERGE_ECID)
for data in self.merger_data:
for node in data.nodes:
result = utils.RunCmd(["gnt-node", "add", "--readd",
- "--no-ssh-key-check", node])
+ "--no-ssh-key-check", "--force-join", node])
if result.failed:
raise errors.CommandError("Couldn't readd node %s. Fail reason: %s;"
" output: %s" % (node, result.fail_reason,
logging.info("Merging config")
self._FetchRemoteConfig()
- def _OfflineClusterMerge(_):
- """Closure run when master daemons stopped
+ logging.info("Stopping master daemon")
+ self._KillMasterDaemon()
- """
- rbsteps.append("Restore %s from another master candidate" %
- constants.CLUSTER_CONF_FILE)
- self._MergeConfig()
- self._StartMasterDaemon(no_vote=True)
+ rbsteps.append("Restore %s from another master candidate"
+ " and restart master daemon" %
+ constants.CLUSTER_CONF_FILE)
+ self._MergeConfig()
+ self._StartMasterDaemon(no_vote=True)
- # Point of no return, delete rbsteps
- del rbsteps[:]
+ # Point of no return, delete rbsteps
+ del rbsteps[:]
- logging.warning("We are at the point of no return. Merge can not easily"
- " be undone after this point.")
- logging.info("Readd nodes and redistribute config")
- self._ReaddMergedNodesAndRedist()
- self._KillMasterDaemon()
+ logging.warning("We are at the point of no return. Merge can not easily"
+ " be undone after this point.")
+ logging.info("Readd nodes")
+ self._ReaddMergedNodesAndRedist()
- cli.RunWhileClusterStopped(logging.info, _OfflineClusterMerge)
+ logging.info("Merge done, restart master daemon normally")
+ self._KillMasterDaemon()
+ self._StartMasterDaemon()
- logging.info("Starting instances again")
- self._StartupAllInstances()
+ if self.restart == _RESTART_ALL:
+ logging.info("Starting instances again")
+ self._StartupAllInstances()
+ else:
+ logging.info("Not starting instances again")
logging.info("Post cluster verification")
self._VerifyCluster()
except errors.GenericError, e:
elif options.verbose:
stderr_handler.setLevel(logging.INFO)
else:
- stderr_handler.setLevel(logging.ERROR)
+ stderr_handler.setLevel(logging.WARNING)
root_logger = logging.getLogger("")
root_logger.setLevel(logging.NOTSET)
"""
program = os.path.basename(sys.argv[0])
- parser = optparse.OptionParser(usage=("%prog [--debug|--verbose]"
- " [--watcher-pause-period SECONDS]"
- " <cluster> <cluster...>"),
- prog=program)
+  parser = optparse.OptionParser(usage="%prog [options...] <cluster...>",
+ prog=program)
parser.add_option(cli.DEBUG_OPT)
parser.add_option(cli.VERBOSE_OPT)
parser.add_option(PAUSE_PERIOD_OPT)
+ parser.add_option(GROUPS_OPT)
+ parser.add_option(RESTART_OPT)
(options, args) = parser.parse_args()
if not args:
parser.error("No clusters specified")
- cluster_merger = Merger(utils.UniqueSequence(args), options.pause_period)
+ cluster_merger = Merger(utils.UniqueSequence(args), options.pause_period,
+ options.groups, options.restart)
try:
try:
cluster_merger.Setup()