from ganeti import utils
+# Strategies for the --groups option: merge identically-named node groups
+# or rename the remote ones to avoid the conflict
+_GROUPS_MERGE = "merge"
+_GROUPS_RENAME = "rename"
+# Prefix for the fake ec_id values used while adding the other cluster's
+# nodes and instances to the merged configuration
+_CLUSTERMERGE_ECID = "clustermerge-ecid"
+# Strategies for the --restart option: which instances to start again once
+# the merge is done (_RESTART_UP is not implemented yet)
+_RESTART_ALL = "all"
+_RESTART_UP = "up"
+_RESTART_NONE = "none"
+_RESTART_CHOICES = (_RESTART_ALL, _RESTART_UP, _RESTART_NONE)
+
+
PAUSE_PERIOD_OPT = cli.cli_option("-p", "--watcher-pause-period", default=1800,
action="store", type="int",
dest="pause_period",
help=("Amount of time in seconds watcher"
" should be suspended from running"))
-_CLUSTERMERGE_ECID = "clustermerge-ecid"
+# --groups: what to do when a node group name exists in both clusters
+GROUPS_OPT = cli.cli_option("--groups", default=None, metavar="STRATEGY",
+                            choices=(_GROUPS_MERGE, _GROUPS_RENAME),
+                            dest="groups",
+                            help=("How to handle groups that have the"
+                                  " same name (One of: %s/%s)" %
+                                  (_GROUPS_MERGE, _GROUPS_RENAME)))
+RESTART_OPT = cli.cli_option("--restart", default=_RESTART_ALL,
+ metavar="STRATEGY",
+ choices=_RESTART_CHOICES,
+ dest="restart",
+ help=("How to handle restarting instances"
+ " same name (One of: %s/%s/%s)" %
+ _RESTART_CHOICES))
def Flatten(unflattened_list):
@param cluster: The name of the cluster
@param key_path: Path to the ssh private key used for authentication
- @param config_path: Path to the merging cluster config
- @param nodes: List of nodes in the merging cluster
+ @param nodes: List of online nodes in the merging cluster
@param instances: List of instances running on merging cluster
+ @param config_path: Path to the merging cluster config
"""
self.cluster = cluster
self.key_path = key_path
- self.config_path = config_path
- self.instances = instances
self.nodes = nodes
+ self.instances = instances
+ self.config_path = config_path
class Merger(object):
"""Handling the merge.
"""
- def __init__(self, clusters, pause_period):
+ def __init__(self, clusters, pause_period, groups, restart):
"""Initialize object with sane defaults and infos required.
@param clusters: The list of clusters to merge in
@param pause_period: The time watcher shall be disabled for
+ @param groups: How to handle group conflicts
+ @param restart: How to handle instance restart
"""
self.merger_data = []
self.work_dir = tempfile.mkdtemp(suffix="cluster-merger")
(self.cluster_name, ) = cli.GetClient().QueryConfigValues(["cluster_name"])
self.ssh_runner = ssh.SshRunner(self.cluster_name)
+ self.groups = groups
+ self.restart = restart
+ if self.restart == _RESTART_UP:
+ raise NotImplementedError
+
def Setup(self):
"""Sets up our end so we can do the merger.
key_path = utils.PathJoin(self.work_dir, cluster)
utils.WriteFile(key_path, mode=0600, data=result.stdout)
- result = self._RunCmd(cluster, "gnt-node list -o name --no-header",
- private_key=key_path)
+ result = self._RunCmd(cluster, "gnt-node list -o name,offline"
+ " --no-header --separator=,", private_key=key_path)
if result.failed:
raise errors.RemoteError("Unable to retrieve list of nodes from %s."
" Fail reason: %s; output: %s" %
(cluster, result.fail_reason, result.output))
- nodes = result.stdout.splitlines()
+ nodes_statuses = [line.split(',') for line in result.stdout.splitlines()]
+ nodes = [node_status[0] for node_status in nodes_statuses
+ if node_status[1] == "N"]
result = self._RunCmd(cluster, "gnt-instance list -o name --no-header",
private_key=key_path)
for data in self.merger_data:
other_config = config.ConfigWriter(data.config_path, accept_foreign=True)
+ self._MergeClusterConfigs(my_config, other_config)
self._MergeNodeGroups(my_config, other_config)
for node in other_config.GetNodeList():
node_info = other_config.GetNodeInfo(node)
- node_info.master_candidate = False
- my_config.AddNode(node_info, str(fake_ec_id))
+ my_config.AddNode(node_info, _CLUSTERMERGE_ECID + str(fake_ec_id))
fake_ec_id += 1
for instance in other_config.GetInstanceList():
physical_id[1] = physical_id[3] = port
dsk.physical_id = tuple(physical_id)
- my_config.AddInstance(instance_info, str(fake_ec_id))
+ my_config.AddInstance(instance_info,
+ _CLUSTERMERGE_ECID + str(fake_ec_id))
fake_ec_id += 1
# R0201: Method could be a function
+  def _MergeClusterConfigs(self, my_config, other_config):
+    """Checks that all relevant cluster parameters are compatible
+
+    Logs an error for every hard incompatibility found (generic cluster
+    parameters, enabled hypervisors, hypervisor and OS parameters) and a
+    warning for soft differences where this cluster's value will simply
+    take precedence.  Mergeable values (reserved_lvs, prealloc_wipe_disks,
+    osparams) are folded into this cluster's configuration.
+
+    @param my_config: Configuration object of this (merge target) cluster
+    @param other_config: Configuration object of the cluster being merged in
+    @raise errors.ConfigurationError: If any incompatible parameter was
+        detected, after all checks have run
+
+    """
+    # pylint: disable-msg=R0201
+    my_cluster = my_config.GetClusterInfo()
+    other_cluster = other_config.GetClusterInfo()
+    err_count = 0
+
+    #
+    # Generic checks
+    #
+    check_params = (
+      "beparams",
+      "default_iallocator",
+      "drbd_usermode_helper",
+      "file_storage_dir",
+      "hidden_os",
+      "maintain_node_health",
+      "master_netdev",
+      "ndparams",
+      "nicparams",
+      "primary_ip_family",
+      "tags",
+      "uid_pool",
+      "volume_group_name",
+      )
+    for param_name in check_params:
+      my_param = getattr(my_cluster, param_name)
+      other_param = getattr(other_cluster, param_name)
+      if my_param != other_param:
+        logging.error("The value (%s) of the cluster parameter %s on %s"
+                      " differs to this cluster's value (%s)",
+                      other_param, param_name, other_cluster.cluster_name,
+                      my_param)
+        err_count += 1
+
+    #
+    # Custom checks
+    #
+
+    # Check default hypervisor
+    my_defhyp = my_cluster.enabled_hypervisors[0]
+    other_defhyp = other_cluster.enabled_hypervisors[0]
+    if my_defhyp != other_defhyp:
+      logging.warning("The default hypervisor (%s) differs on %s, new"
+                      " instances will be created with this cluster's"
+                      " default hypervisor (%s)", other_defhyp,
+                      other_cluster.cluster_name, my_defhyp)
+
+    if (set(my_cluster.enabled_hypervisors) !=
+        set(other_cluster.enabled_hypervisors)):
+      logging.error("The set of enabled hypervisors (%s) on %s differs to"
+                    " this cluster's set (%s)",
+                    other_cluster.enabled_hypervisors,
+                    other_cluster.cluster_name, my_cluster.enabled_hypervisors)
+      err_count += 1
+
+    # Check hypervisor params for hypervisors we care about
+    # TODO: we probably don't care about all params for a given hypervisor
+    # NOTE(review): assumes other_cluster.hvparams contains every locally
+    # enabled hypervisor and every parameter checked below — a missing key
+    # would raise KeyError here instead of being counted as an
+    # incompatibility; confirm hvparams is always fully populated.
+    for hyp in my_cluster.enabled_hypervisors:
+      for param in my_cluster.hvparams[hyp]:
+        my_value = my_cluster.hvparams[hyp][param]
+        other_value = other_cluster.hvparams[hyp][param]
+        if my_value != other_value:
+          logging.error("The value (%s) of the %s parameter of the %s"
+                        " hypervisor on %s differs to this cluster's parameter"
+                        " (%s)",
+                        other_value, param, hyp, other_cluster.cluster_name,
+                        my_value)
+          err_count += 1
+
+    # Check os hypervisor params for hypervisors we care about
+    for os_name in set(my_cluster.os_hvp.keys() + other_cluster.os_hvp.keys()):
+      for hyp in my_cluster.enabled_hypervisors:
+        my_os_hvp = self._GetOsHypervisor(my_cluster, os_name, hyp)
+        other_os_hvp = self._GetOsHypervisor(other_cluster, os_name, hyp)
+        if my_os_hvp != other_os_hvp:
+          logging.error("The OS parameters (%s) for the %s OS for the %s"
+                        " hypervisor on %s differs to this cluster's parameters"
+                        " (%s)",
+                        other_os_hvp, os_name, hyp, other_cluster.cluster_name,
+                        my_os_hvp)
+          err_count += 1
+
+    #
+    # Warnings
+    #
+    if my_cluster.modify_etc_hosts != other_cluster.modify_etc_hosts:
+      logging.warning("The modify_etc_hosts value (%s) differs on %s,"
+                      " this cluster's value (%s) will take precedence",
+                      other_cluster.modify_etc_hosts,
+                      other_cluster.cluster_name,
+                      my_cluster.modify_etc_hosts)
+
+    if my_cluster.modify_ssh_setup != other_cluster.modify_ssh_setup:
+      logging.warning("The modify_ssh_setup value (%s) differs on %s,"
+                      " this cluster's value (%s) will take precedence",
+                      other_cluster.modify_ssh_setup,
+                      other_cluster.cluster_name,
+                      my_cluster.modify_ssh_setup)
+
+    #
+    # Actual merging
+    #
+    my_cluster.reserved_lvs = list(set(my_cluster.reserved_lvs +
+                                       other_cluster.reserved_lvs))
+
+    if my_cluster.prealloc_wipe_disks != other_cluster.prealloc_wipe_disks:
+      logging.warning("The prealloc_wipe_disks value (%s) on %s differs to this"
+                      " cluster's value (%s). The least permissive value (%s)"
+                      " will be used", other_cluster.prealloc_wipe_disks,
+                      other_cluster.cluster_name,
+                      my_cluster.prealloc_wipe_disks, True)
+      # Wiping is the safer behaviour, so it wins when the clusters disagree
+      my_cluster.prealloc_wipe_disks = True
+
+    for os_, osparams in other_cluster.osparams.items():
+      if os_ not in my_cluster.osparams:
+        my_cluster.osparams[os_] = osparams
+      elif my_cluster.osparams[os_] != osparams:
+        logging.error("The OS parameters (%s) for the %s OS on %s differs to"
+                      " this cluster's parameters (%s)",
+                      osparams, os_, other_cluster.cluster_name,
+                      my_cluster.osparams[os_])
+        err_count += 1
+
+    # Raise only after running all checks, so the admin sees every problem
+    # in one pass instead of fixing them one re-run at a time
+    if err_count:
+      raise errors.ConfigurationError("Cluster config for %s has incompatible"
+                                      " values, please fix and re-run" %
+                                      other_cluster.cluster_name)
+
+  # R0201: Method could be a function
+  def _GetOsHypervisor(self, cluster, os_name, hyp): # pylint: disable-msg=R0201
+    """Returns the per-OS hypervisor parameters for one hypervisor.
+
+    @param cluster: Cluster configuration object to look in
+    @param os_name: Name of the OS
+    @param hyp: Name of the hypervisor
+    @return: The C{os_hvp} entry for the (OS, hypervisor) pair, or None if
+        either the OS or the hypervisor has no entry
+
+    """
+    if os_name in cluster.os_hvp:
+      return cluster.os_hvp[os_name].get(hyp, None)
+    else:
+      return None
+
+ # R0201: Method could be a function
def _MergeNodeGroups(self, my_config, other_config):
"""Adds foreign node groups
ConfigWriter.AddNodeGroup takes care of making sure there are no conflicts.
"""
# pylint: disable-msg=R0201
- for (_, grp) in other_config.GetAllNodeGroupsInfo():
+ logging.info("Node group conflict strategy: %s", self.groups)
+
+ my_grps = my_config.GetAllNodeGroupsInfo().values()
+ other_grps = other_config.GetAllNodeGroupsInfo().values()
+
+ # Check for node group naming conflicts:
+ conflicts = []
+ for other_grp in other_grps:
+ for my_grp in my_grps:
+ if other_grp.name == my_grp.name:
+ conflicts.append(other_grp)
+
+ if conflicts:
+ conflict_names = utils.CommaJoin([g.name for g in conflicts])
+ logging.info("Node groups in both local and remote cluster: %s",
+ conflict_names)
+
+ # User hasn't specified how to handle conflicts
+ if not self.groups:
+ raise errors.CommandError("The following node group(s) are in both"
+ " clusters, and no merge strategy has been"
+ " supplied (see the --groups option): %s" %
+ conflict_names)
+
+ # User wants to rename conflicts
+ elif self.groups == _GROUPS_RENAME:
+ for grp in conflicts:
+ new_name = "%s-%s" % (grp.name, other_config.GetClusterName())
+ logging.info("Renaming remote node group from %s to %s"
+ " to resolve conflict", grp.name, new_name)
+ grp.name = new_name
+
+ # User wants to merge conflicting groups
+ elif self.groups == _GROUPS_MERGE:
+ for other_grp in conflicts:
+ logging.info("Merging local and remote '%s' groups", other_grp.name)
+ for node_name in other_grp.members[:]:
+ node = other_config.GetNodeInfo(node_name)
+ # Access to a protected member of a client class
+ # pylint: disable-msg=W0212
+ other_config._UnlockedRemoveNodeFromGroup(node)
+
+ # Access to a protected member of a client class
+ # pylint: disable-msg=W0212
+ my_grp_uuid = my_config._UnlockedLookupNodeGroup(other_grp.name)
+
+ # Access to a protected member of a client class
+ # pylint: disable-msg=W0212
+ my_config._UnlockedAddNodeToGroup(node, my_grp_uuid)
+ node.group = my_grp_uuid
+ # Remove from list of groups to add
+ other_grps.remove(other_grp)
+
+ for grp in other_grps:
#TODO: handle node group conflicts
my_config.AddNodeGroup(grp, _CLUSTERMERGE_ECID)
for data in self.merger_data:
for node in data.nodes:
result = utils.RunCmd(["gnt-node", "add", "--readd",
- "--no-ssh-key-check", node])
+ "--no-ssh-key-check", "--force-join", node])
if result.failed:
raise errors.CommandError("Couldn't readd node %s. Fail reason: %s;"
" output: %s" % (node, result.fail_reason,
logging.info("Merging config")
self._FetchRemoteConfig()
- def _OfflineClusterMerge(_):
- """Closure run when master daemons stopped
+ logging.info("Stopping master daemon")
+ self._KillMasterDaemon()
- """
- rbsteps.append("Restore %s from another master candidate" %
- constants.CLUSTER_CONF_FILE)
- self._MergeConfig()
- self._StartMasterDaemon(no_vote=True)
+ rbsteps.append("Restore %s from another master candidate"
+ " and restart master daemon" %
+ constants.CLUSTER_CONF_FILE)
+ self._MergeConfig()
+ self._StartMasterDaemon(no_vote=True)
- # Point of no return, delete rbsteps
- del rbsteps[:]
+ # Point of no return, delete rbsteps
+ del rbsteps[:]
- logging.warning("We are at the point of no return. Merge can not easily"
- " be undone after this point.")
- logging.info("Readd nodes and redistribute config")
- self._ReaddMergedNodesAndRedist()
- self._KillMasterDaemon()
+ logging.warning("We are at the point of no return. Merge can not easily"
+ " be undone after this point.")
+ logging.info("Readd nodes")
+ self._ReaddMergedNodesAndRedist()
- cli.RunWhileClusterStopped(logging.info, _OfflineClusterMerge)
+ logging.info("Merge done, restart master daemon normally")
+ self._KillMasterDaemon()
+ self._StartMasterDaemon()
- logging.info("Starting instances again")
- self._StartupAllInstances()
+ if self.restart == _RESTART_ALL:
+ logging.info("Starting instances again")
+ self._StartupAllInstances()
+ else:
+ logging.info("Not starting instances again")
logging.info("Post cluster verification")
self._VerifyCluster()
except errors.GenericError, e:
elif options.verbose:
stderr_handler.setLevel(logging.INFO)
else:
- stderr_handler.setLevel(logging.ERROR)
+ stderr_handler.setLevel(logging.WARNING)
root_logger = logging.getLogger("")
root_logger.setLevel(logging.NOTSET)
"""
program = os.path.basename(sys.argv[0])
- parser = optparse.OptionParser(usage=("%prog [--debug|--verbose]"
- " [--watcher-pause-period SECONDS]"
- " <cluster> <cluster...>"),
- prog=program)
+  parser = optparse.OptionParser(usage="%prog [options...] <cluster...>",
+ prog=program)
parser.add_option(cli.DEBUG_OPT)
parser.add_option(cli.VERBOSE_OPT)
parser.add_option(PAUSE_PERIOD_OPT)
+ parser.add_option(GROUPS_OPT)
+ parser.add_option(RESTART_OPT)
(options, args) = parser.parse_args()
if not args:
parser.error("No clusters specified")
- cluster_merger = Merger(utils.UniqueSequence(args), options.pause_period)
+ cluster_merger = Merger(utils.UniqueSequence(args), options.pause_period,
+ options.groups, options.restart)
try:
try:
cluster_merger.Setup()