for prinode, instances in nodeinfo['sinst-by-pnode'].iteritems():
needed_mem = 0
for instance in instances:
- needed_mem += instance_cfg[instance].memory
+ if instance_cfg[instance].auto_balance:
+ needed_mem += instance_cfg[instance].memory
if nodeinfo['mfree'] < needed_mem:
feedback_fn(" - ERROR: not enough memory on node %s to accomodate"
" failovers should node %s fail" % (node, prinode))
nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
instancelist = utils.NiceSort(self.cfg.GetInstanceList())
i_non_redundant = [] # Non redundant instances
+ i_non_a_balanced = [] # Non auto-balanced instances
node_volume = {}
node_instance = {}
node_info = {}
feedback_fn(" - WARNING: multiple secondaries for instance %s"
% instance)
+ if not inst_config.auto_balance:
+ i_non_a_balanced.append(instance)
+
for snode in inst_config.secondary_nodes:
if snode in node_info:
node_info[snode]['sinst'].append(instance)
feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
% len(i_non_redundant))
+ if i_non_a_balanced:
+ feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
+ % len(i_non_a_balanced))
+
return int(bad)
def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
raise errors.OpPrereqError("Neither the name nor the IP address of the"
" cluster has changed")
if new_ip != old_ip:
- result = utils.RunCmd(["fping", "-q", new_ip])
- if not result.failed:
+ if utils.TcpPing(new_ip, constants.DEFAULT_NODED_PORT):
raise errors.OpPrereqError("The given cluster IP address (%s) is"
" reachable on the network. Aborting." %
new_ip)
"dtotal", "dfree",
"mtotal", "mnode", "mfree",
"bootid",
- "ctotal",
+ "ctotal", "cnodes", "csockets",
])
_CheckOutputFields(static=["name", "pinst_cnt", "sinst_cnt",
for name in nodenames:
nodeinfo = node_data.get(name, None)
if nodeinfo:
+ fn = utils.TryConvert
live_data[name] = {
- "mtotal": utils.TryConvert(int, nodeinfo['memory_total']),
- "mnode": utils.TryConvert(int, nodeinfo['memory_dom0']),
- "mfree": utils.TryConvert(int, nodeinfo['memory_free']),
- "dtotal": utils.TryConvert(int, nodeinfo['vg_size']),
- "dfree": utils.TryConvert(int, nodeinfo['vg_free']),
- "ctotal": utils.TryConvert(int, nodeinfo['cpu_total']),
- "bootid": nodeinfo['bootid'],
+ "mtotal": fn(int, nodeinfo.get('memory_total', None)),
+ "mnode": fn(int, nodeinfo.get('memory_dom0', None)),
+ "mfree": fn(int, nodeinfo.get('memory_free', None)),
+ "dtotal": fn(int, nodeinfo.get('vg_size', None)),
+ "dfree": fn(int, nodeinfo.get('vg_free', None)),
+ "ctotal": fn(int, nodeinfo.get('cpu_total', None)),
+ "bootid": nodeinfo.get('bootid', None),
+ "cnodes": fn(int, nodeinfo.get('cpu_nodes', None)),
+ "csockets": fn(int, nodeinfo.get('cpu_sockets', None)),
}
else:
live_data[name] = {}
"""
nodeinfo = rpc.call_node_info([node], cfg.GetVGName())
- if not nodeinfo or not isinstance(nodeinfo, dict):
+ if not (nodeinfo and isinstance(nodeinfo, dict) and
+ node in nodeinfo and isinstance(nodeinfo[node], dict)):
raise errors.OpPrereqError("Could not contact node %s for resource"
" information" % (node,))
new_name)
if not getattr(self.op, "ignore_ip", False):
- command = ["fping", "-q", name_info.ip]
- result = utils.RunCmd(command)
- if not result.failed:
+ if utils.TcpPing(name_info.ip, constants.DEFAULT_NODED_PORT):
raise errors.OpPrereqError("IP %s of instance %s already in use" %
(name_info.ip, new_name))
try:
if not rpc.call_instance_run_rename(inst.primary_node, inst, old_name,
"sda", "sdb"):
- msg = ("Could run OS rename script for instance %s on node %s (but the"
- " instance has been renamed in Ganeti)" %
+ msg = ("Could not run OS rename script for instance %s on node %s"
+ " (but the instance has been renamed in Ganeti)" %
(inst.name, inst.primary_node))
logger.Error(msg)
finally:
_CheckOutputFields(static=["name", "os", "pnode", "snodes",
"admin_state", "admin_ram",
"disk_template", "ip", "mac", "bridge",
- "sda_size", "sdb_size", "vcpus", "tags"],
+ "sda_size", "sdb_size", "vcpus", "tags",
+ "auto_balance",
+ "network_port", "kernel_path", "initrd_path",
+ "hvm_boot_order", "hvm_acpi", "hvm_pae",
+ "hvm_cdrom_image_path", "hvm_nic_type",
+ "hvm_disk_type", "vnc_bind_address"],
dynamic=self.dynamic_fields,
selected=self.op.output_fields)
val = instance.vcpus
elif field == "tags":
val = list(instance.GetTags())
+ elif field == "auto_balance":
+ val = instance.auto_balance
+ elif field in ("network_port", "kernel_path", "initrd_path",
+ "hvm_boot_order", "hvm_acpi", "hvm_pae",
+ "hvm_cdrom_image_path", "hvm_nic_type",
+ "hvm_disk_type", "vnc_bind_address"):
+ val = getattr(instance, field, None)
+ if val is None:
+ if field in ("hvm_nic_type", "hvm_disk_type",
+ "kernel_path", "initrd_path"):
+ val = "default"
+ else:
+ val = "-"
else:
raise errors.ParameterError(field)
iout.append(val)
"""
HPATH = "instance-migrate"
HTYPE = constants.HTYPE_INSTANCE
- _OP_REQP = ["instance_name", "live"]
+ _OP_REQP = ["instance_name", "live", "cleanup"]
def BuildHooksEnv(self):
"""Build hooks env.
target_node = secondary_nodes[0]
# check memory requirements on the secondary node
- _CheckNodeFreeMemory(self.cfg, target_node, "failing over instance %s" %
+ _CheckNodeFreeMemory(self.cfg, target_node, "migrating instance %s" %
instance.name, instance.memory)
# check bridge existance
" exist on destination node '%s'" %
(brlist, target_node))
- migratable = rpc.call_instance_migratable(instance.primary_node, instance)
- if not migratable:
- raise errors.OpPrereqError("Can't contact node '%s'" %
- instance.primary_node)
- if not migratable[0]:
- raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
- migratable[1])
+ if not self.op.cleanup:
+ migratable = rpc.call_instance_migratable(instance.primary_node,
+ instance)
+ if not migratable:
+ raise errors.OpPrereqError("Can't contact node '%s'" %
+ instance.primary_node)
+ if not migratable[0]:
+ raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
+ migratable[1])
self.instance = instance
raise errors.OpExecError("Cannot identify disks node %s,"
" error %s" % (node, result[node][1]))
+ def _ExecCleanup(self):
+ """Try to cleanup after a failed migration.
+
+ The cleanup performs the following steps, in order:
+ - check that the instance is running only on one node
+ (and update the config if needed)
+ - change disks on its secondary node to secondary
+ - wait until disks are fully synchronized
+ - disconnect from the network
+ - change disks into single-master mode
+ - wait again until disks are fully synchronized
+
+ """
+ instance = self.instance
+ target_node = self.target_node
+ source_node = self.source_node
+
+ # check running on only one node
+ self.feedback_fn("* checking where the instance actually runs"
+ " (if this hangs, the hypervisor might be in"
+ " a bad state)")
+ ins_l = rpc.call_instance_list(self.all_nodes)
+ for node in self.all_nodes:
+ if not type(ins_l[node]) is list:
+ raise errors.OpExecError("Can't contact node '%s'" % node)
+
+ runningon_source = instance.name in ins_l[source_node]
+ runningon_target = instance.name in ins_l[target_node]
+
+ if runningon_source and runningon_target:
+ raise errors.OpExecError("Instance seems to be running on two nodes,"
+ " or the hypervisor is confused. You will have"
+ " to ensure manually that it runs only on one"
+ " and restart this operation.")
+
+ if not (runningon_source or runningon_target):
+ raise errors.OpExecError("Instance does not seem to be running at all."
+ " In this case, it's safer to repair by"
+ " running 'gnt-instance stop' to ensure disk"
+ " shutdown, and then restarting it.")
+
+ if runningon_target:
+ # the migration has actually succeeded, we need to update the config
+ self.feedback_fn("* instance running on secondary node (%s),"
+ " updating config" % target_node)
+ instance.primary_node = target_node
+ self.cfg.Update(instance)
+ demoted_node = source_node
+ else:
+ self.feedback_fn("* instance confirmed to be running on its"
+ " primary node (%s)" % source_node)
+ demoted_node = target_node
+
+ self._IdentifyDisks()
+
+ self._EnsureSecondary(demoted_node)
+ self._WaitUntilSync()
+ self._GoStandalone()
+ self._GoReconnect(False)
+ self._WaitUntilSync()
+
+ self.feedback_fn("* done")
+
def _ExecMigration(self):
"""Migrate an instance.
self._WaitUntilSync()
self.feedback_fn("* migrating instance to %s" % target_node)
+ time.sleep(10)
result = rpc.call_instance_migrate(source_node, instance,
self.nodes_ip[target_node],
self.op.live)
if not result or not result[0]:
logger.Error("Instance migration failed, trying to revert disk status")
-
try:
self._EnsureSecondary(target_node)
self._GoStandalone()
raise errors.OpExecError("Could not migrate instance %s: %s" %
(instance.name, result[1]))
+ time.sleep(10)
instance.primary_node = target_node
# distribute new instance config to the other nodes
self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
}
- return self._ExecMigration()
+ if self.op.cleanup:
+ return self._ExecCleanup()
+ else:
+ return self._ExecMigration()
def _CreateBlockDevOnPrimary(cfg, node, instance, device, info):
HTYPE = constants.HTYPE_INSTANCE
_OP_REQP = ["instance_name", "mem_size", "disk_size",
"disk_template", "swap_size", "mode", "start", "vcpus",
- "wait_for_sync", "ip_check", "mac"]
+ "wait_for_sync", "ip_check", "mac", "auto_balance"]
def _RunAllocator(self):
"""Run the allocator based on input opcode.
if len(ial.nodes) != ial.required_nodes:
raise errors.OpPrereqError("iallocator '%s' returned invalid number"
" of nodes (%s), required %s" %
- (len(ial.nodes), ial.required_nodes))
+ (self.op.iallocator, len(ial.nodes),
+ ial.required_nodes))
self.op.pnode = ial.nodes[0]
logger.ToStdout("Selected nodes for the instance: %s" %
(", ".join(ial.nodes),))
# set optional parameters to none if they don't exist
for attr in ["kernel_path", "initrd_path", "hvm_boot_order", "pnode",
"iallocator", "hvm_acpi", "hvm_pae", "hvm_cdrom_image_path",
- "vnc_bind_address"]:
+ "hvm_nic_type", "hvm_disk_type", "vnc_bind_address"]:
if not hasattr(self.op, attr):
setattr(self.op, attr, None)
" like a valid IP address" %
self.op.vnc_bind_address)
+ # Xen HVM device type checks
+ if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
+ if self.op.hvm_nic_type not in constants.HT_HVM_VALID_NIC_TYPES:
+ raise errors.OpPrereqError("Invalid NIC type %s specified for Xen HVM"
+ " hypervisor" % self.op.hvm_nic_type)
+ if self.op.hvm_disk_type not in constants.HT_HVM_VALID_DISK_TYPES:
+ raise errors.OpPrereqError("Invalid disk type %s specified for Xen HVM"
+ " hypervisor" % self.op.hvm_disk_type)
+
if self.op.start:
self.instance_status = 'up'
else:
hvm_pae=self.op.hvm_pae,
hvm_cdrom_image_path=self.op.hvm_cdrom_image_path,
vnc_bind_address=self.op.vnc_bind_address,
+ hvm_nic_type=self.op.hvm_nic_type,
+ hvm_disk_type=self.op.hvm_disk_type,
+ auto_balance=bool(self.op.auto_balance),
)
feedback_fn("* creating instance disks...")
if self.op.remote_node is not None:
raise errors.OpPrereqError("Give either the iallocator or the new"
" secondary, not both")
- self.op.remote_node = self._RunAllocator()
+ self._RunAllocator()
remote_node = self.op.remote_node
if remote_node is not None:
"""Query runtime instance data.
"""
- _OP_REQP = ["instances"]
+ _OP_REQP = ["instances", "static"]
def CheckPrereq(self):
"""Check prerequisites.
"""Compute block device status.
"""
- self.cfg.SetDiskID(dev, instance.primary_node)
- dev_pstatus = rpc.call_blockdev_find(instance.primary_node, dev)
+ static = self.op.static
+ if not static:
+ self.cfg.SetDiskID(dev, instance.primary_node)
+ dev_pstatus = rpc.call_blockdev_find(instance.primary_node, dev)
+ else:
+ dev_pstatus = None
+
if dev.dev_type in constants.LDS_DRBD:
# we change the snode then (otherwise we use the one passed in)
if dev.logical_id[0] == instance.primary_node:
else:
snode = dev.logical_id[0]
- if snode:
+ if snode and not static:
self.cfg.SetDiskID(dev, snode)
dev_sstatus = rpc.call_blockdev_find(snode, dev)
else:
"""Gather and return data"""
result = {}
for instance in self.wanted_instances:
- remote_info = rpc.call_instance_info(instance.primary_node,
- instance.name)
- if remote_info and "state" in remote_info:
- remote_state = "up"
+ if not self.op.static:
+ remote_info = rpc.call_instance_info(instance.primary_node,
+ instance.name)
+ if remote_info and "state" in remote_info:
+ remote_state = "up"
+ else:
+ remote_state = "down"
else:
- remote_state = "down"
+ remote_state = None
if instance.status == "down":
config_state = "down"
else:
"nics": [(nic.mac, nic.ip, nic.bridge) for nic in instance.nics],
"disks": disks,
"vcpus": instance.vcpus,
+ "auto_balance": instance.auto_balance,
}
htkind = self.sstore.GetHypervisorType()
idict["hvm_acpi"] = instance.hvm_acpi
idict["hvm_pae"] = instance.hvm_pae
idict["hvm_cdrom_image_path"] = instance.hvm_cdrom_image_path
+ idict["hvm_nic_type"] = instance.hvm_nic_type
+ idict["hvm_disk_type"] = instance.hvm_disk_type
if htkind in constants.HTS_REQ_PORT:
- idict["vnc_bind_address"] = instance.vnc_bind_address
+ if instance.vnc_bind_address is None:
+ vnc_bind_address = constants.VNC_DEFAULT_BIND_ADDRESS
+ else:
+ vnc_bind_address = instance.vnc_bind_address
+ if instance.network_port is None:
+ vnc_console_port = None
+ elif vnc_bind_address == constants.BIND_ADDRESS_GLOBAL:
+ vnc_console_port = "%s:%s" % (instance.primary_node,
+ instance.network_port)
+ elif vnc_bind_address == constants.LOCALHOST_IP_ADDRESS:
+ vnc_console_port = "%s:%s on node %s" % (vnc_bind_address,
+ instance.network_port,
+ instance.primary_node)
+ else:
+ vnc_console_port = "%s:%s" % (instance.vnc_bind_address,
+ instance.network_port)
+ idict["vnc_console_port"] = vnc_console_port
+ idict["vnc_bind_address"] = vnc_bind_address
idict["network_port"] = instance.network_port
result[instance.name] = idict
self.hvm_boot_order = getattr(self.op, "hvm_boot_order", None)
self.hvm_acpi = getattr(self.op, "hvm_acpi", None)
self.hvm_pae = getattr(self.op, "hvm_pae", None)
+ self.hvm_nic_type = getattr(self.op, "hvm_nic_type", None)
+ self.hvm_disk_type = getattr(self.op, "hvm_disk_type", None)
self.hvm_cdrom_image_path = getattr(self.op, "hvm_cdrom_image_path", None)
self.vnc_bind_address = getattr(self.op, "vnc_bind_address", None)
- all_parms = [self.mem, self.vcpus, self.ip, self.bridge, self.mac,
- self.kernel_path, self.initrd_path, self.hvm_boot_order,
- self.hvm_acpi, self.hvm_pae, self.hvm_cdrom_image_path,
- self.vnc_bind_address]
+ self.force = getattr(self.op, "force", None)
+ self.auto_balance = getattr(self.op, "auto_balance", None)
+ all_parms = [
+ self.mem, self.vcpus, self.ip, self.bridge, self.mac,
+ self.kernel_path, self.initrd_path, self.hvm_boot_order,
+ self.hvm_acpi, self.hvm_pae, self.hvm_cdrom_image_path,
+ self.vnc_bind_address, self.hvm_nic_type, self.hvm_disk_type,
+ self.auto_balance,
+ ]
if all_parms.count(None) == len(all_parms):
raise errors.OpPrereqError("No changes submitted")
if self.mem is not None:
" like a valid IP address" %
self.op.vnc_bind_address)
+ # Xen HVM device type checks
+ if self.sstore.GetHypervisorType() == constants.HT_XEN_HVM31:
+ if self.op.hvm_nic_type is not None:
+ if self.op.hvm_nic_type not in constants.HT_HVM_VALID_NIC_TYPES:
+ raise errors.OpPrereqError("Invalid NIC type %s specified for Xen"
+ " HVM hypervisor" % self.op.hvm_nic_type)
+ if self.op.hvm_disk_type is not None:
+ if self.op.hvm_disk_type not in constants.HT_HVM_VALID_DISK_TYPES:
+ raise errors.OpPrereqError("Invalid disk type %s specified for Xen"
+ " HVM hypervisor" % self.op.hvm_disk_type)
+
+ # auto balance setting
+ if self.auto_balance is not None:
+ # convert the value to a proper bool, if it is not one already
+ self.auto_balance = bool(self.auto_balance)
+
instance = self.cfg.GetInstanceInfo(
self.cfg.ExpandInstanceName(self.op.instance_name))
if instance is None:
self.op.instance_name)
self.op.instance_name = instance.name
self.instance = instance
+ self.warn = []
+ if self.mem is not None and not self.force:
+ pnode = self.instance.primary_node
+ nodelist = [pnode]
+ if instance.auto_balance:
+ nodelist.extend(instance.secondary_nodes)
+ instance_info = rpc.call_instance_info(pnode, instance.name)
+ nodeinfo = rpc.call_node_info(nodelist, self.cfg.GetVGName())
+
+ if pnode not in nodeinfo or not isinstance(nodeinfo[pnode], dict):
+ # Assume the primary node is unreachable and go ahead
+ self.warn.append("Can't get info from primary node %s" % pnode)
+ else:
+ if instance_info:
+ current_mem = instance_info['memory']
+ else:
+ # Assume instance not running
+ # (there is a slight race condition here, but it's not very probable,
+ # and we have no other way to check)
+ current_mem = 0
+ miss_mem = self.mem - current_mem - nodeinfo[pnode]['memory_free']
+ if miss_mem > 0:
+ raise errors.OpPrereqError("This change will prevent the instance"
+ " from starting, due to %d MB of memory"
+ " missing on its primary node" % miss_mem)
+
+ if instance.auto_balance:
+ for node in instance.secondary_nodes:
+ if node not in nodeinfo or not isinstance(nodeinfo[node], dict):
+ self.warn.append("Can't get info from secondary node %s" % node)
+ elif self.mem > nodeinfo[node]['memory_free']:
+ self.warn.append("Not enough memory to failover instance to"
+ " secondary node %s" % node)
return
def Exec(self, feedback_fn):
All parameters take effect only at the next restart of the instance.
"""
+ # Process the warnings from CheckPrereq here, as we don't have a
+ # feedback_fn there.
+ for warn in self.warn:
+ feedback_fn("WARNING: %s" % warn)
+
result = []
instance = self.instance
if self.mem:
if self.hvm_pae is not None:
instance.hvm_pae = self.hvm_pae
result.append(("hvm_pae", self.hvm_pae))
+ if self.hvm_nic_type is not None:
+ instance.hvm_nic_type = self.hvm_nic_type
+ result.append(("hvm_nic_type", self.hvm_nic_type))
+ if self.hvm_disk_type is not None:
+ instance.hvm_disk_type = self.hvm_disk_type
+ result.append(("hvm_disk_type", self.hvm_disk_type))
if self.hvm_cdrom_image_path:
if self.hvm_cdrom_image_path == constants.VALUE_NONE:
instance.hvm_cdrom_image_path = None
if self.vnc_bind_address:
instance.vnc_bind_address = self.vnc_bind_address
result.append(("vnc_bind_address", self.vnc_bind_address))
+ if self.auto_balance is not None:
+ instance.auto_balance = self.auto_balance
+ result.append(("auto_balance", self.auto_balance))
self.cfg.AddInstance(instance)