@param secondary_nodes: list of secondary nodes as strings
@type os_type: string
@param os_type: the name of the instance's OS
- @type status: string
- @param status: the desired status of the instances
+ @type status: boolean
+ @param status: the should_run status of the instance
@type memory: string
@param memory: the memory size of the instance
@type vcpus: string
@return: the hook environment for this instance
"""
+ if status:
+ str_status = "up"
+ else:
+ str_status = "down"
env = {
"OP_TARGET": name,
"INSTANCE_NAME": name,
"INSTANCE_PRIMARY": primary_node,
"INSTANCE_SECONDARIES": " ".join(secondary_nodes),
"INSTANCE_OS_TYPE": os_type,
- "INSTANCE_STATUS": status,
+ "INSTANCE_STATUS": str_status,
"INSTANCE_MEMORY": memory,
"INSTANCE_VCPUS": vcpus,
}
'primary_node': instance.primary_node,
'secondary_nodes': instance.secondary_nodes,
'os_type': instance.os,
- 'status': instance.os,
+ 'status': instance.admin_up,
'memory': bep[constants.BE_MEMORY],
'vcpus': bep[constants.BE_VCPUS],
'nics': [(nic.ip, nic.bridge, nic.mac) for nic in instance.nics],
self.share_locks = dict(((i, 1) for i in locking.LEVELS))
def _VerifyNode(self, nodeinfo, file_list, local_cksum,
- node_result, feedback_fn, master_files):
+ node_result, feedback_fn, master_files,
+ drbd_map):
"""Run multiple tests against a node.
Test list:
@param node_result: the results from the node
@param feedback_fn: function used to accumulate results
@param master_files: list of files that only masters should have
+ @param drbd_map: the used drbd minors for this node, in the
+ form of minor: (instance, must_exist), which correspond to instance
+ names and whether each instance is marked as administratively up
"""
node = nodeinfo.name
if hv_result is not None:
feedback_fn(" - ERROR: hypervisor %s verify failure: '%s'" %
(hv_name, hv_result))
+
+ # check used drbd list
+ used_minors = node_result.get(constants.NV_DRBDLIST, [])
+ for minor, (iname, must_exist) in drbd_map.items():
+ if minor not in used_minors and must_exist:
+ feedback_fn(" - ERROR: drbd minor %d of instance %s is not active" %
+ (minor, iname))
+ bad = True
+ for minor in used_minors:
+ if minor not in drbd_map:
+ feedback_fn(" - ERROR: unallocated drbd minor %d is in use" % minor)
+ bad = True
+
return bad
def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
(volume, node))
bad = True
- if not instanceconfig.status == 'down':
+ if instanceconfig.admin_up:
if ((node_current not in node_instance or
not instance in node_instance[node_current]) and
node_current not in n_offline):
nodelist = utils.NiceSort(self.cfg.GetNodeList())
nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
instancelist = utils.NiceSort(self.cfg.GetInstanceList())
+ instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
+ for iname in instancelist)
i_non_redundant = [] # Non redundant instances
i_non_a_balanced = [] # Non auto-balanced instances
n_offline = [] # List of offline nodes
constants.NV_VGLIST: None,
constants.NV_VERSION: None,
constants.NV_HVINFO: self.cfg.GetHypervisorType(),
+ constants.NV_DRBDLIST: None,
}
all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
self.cfg.GetClusterName())
cluster = self.cfg.GetClusterInfo()
master_node = self.cfg.GetMasterNode()
+ all_drbd_map = self.cfg.ComputeDRBDMap()
+
for node_i in nodeinfo:
node = node_i.name
nresult = all_nvinfo[node].data
bad = True
continue
+ node_drbd = {}
+ for minor, instance in all_drbd_map[node].items():
+ instance = instanceinfo[instance]
+ node_drbd[minor] = (instance.name, instance.admin_up)
result = self._VerifyNode(node_i, file_names, local_checksums,
- nresult, feedback_fn, master_files)
+ nresult, feedback_fn, master_files,
+ node_drbd)
bad = bad or result
lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
for instance in instancelist:
feedback_fn("* Verifying instance %s" % instance)
- inst_config = self.cfg.GetInstanceInfo(instance)
+ inst_config = instanceinfo[instance]
result = self._VerifyInstance(instance, inst_config, node_volume,
node_instance, feedback_fn, n_offline)
bad = bad or result
nv_dict = {}
for inst in instances:
inst_lvs = {}
- if (inst.status != "up" or
+ if (not inst.admin_up or
inst.disk_template not in constants.DTS_NET_MIRROR):
continue
inst.MapLVsByNode(inst_lvs)
continue
rstats = rstats.data
retries = 0
- for i in range(len(rstats)):
- mstat = rstats[i]
+ for i, mstat in enumerate(rstats):
if mstat is None:
lu.LogWarning("Can't compute data for node %s/%s",
node, instance.disks[i].iv_name)
for instance_name in instance_list:
instance = self.cfg.GetInstanceInfo(instance_name)
- if node.name == instance.primary_node:
- raise errors.OpPrereqError("Instance %s still running on the node,"
- " please remove first." % instance_name)
- if node.name in instance.secondary_nodes:
- raise errors.OpPrereqError("Instance %s has node as a secondary,"
+ if node.name in instance.all_nodes:
+ raise errors.OpPrereqError("Instance %s is still running on the node,"
" please remove first." % instance_name)
self.op.node_name = node.name
self.node = node
"FORCE": self.op.force,
}
env.update(_BuildInstanceHookEnvByObject(self, self.instance))
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
_StartInstanceDisks(self, instance, force)
result = self.rpc.call_instance_start(node_current, instance, extra_args)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance")
+ raise errors.OpExecError("Could not start instance: %s" % msg)
class LURebootInstance(LogicalUnit):
"IGNORE_SECONDARIES": self.op.ignore_secondaries,
}
env.update(_BuildInstanceHookEnvByObject(self, self.instance))
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
_ShutdownInstanceDisks(self, instance)
_StartInstanceDisks(self, instance, ignore_secondaries)
result = self.rpc.call_instance_start(node_current, instance, extra_args)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance for full reboot")
+ raise errors.OpExecError("Could not start instance for"
+ " full reboot: %s" % msg)
self.cfg.MarkInstanceUp(instance.name)
"""
env = _BuildInstanceHookEnvByObject(self, self.instance)
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
"""
env = _BuildInstanceHookEnvByObject(self, self.instance)
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
if instance.disk_template == constants.DT_DISKLESS:
raise errors.OpPrereqError("Instance '%s' has no disks" %
self.op.instance_name)
- if instance.status != "down":
+ if instance.admin_up:
raise errors.OpPrereqError("Instance '%s' is marked to be up" %
self.op.instance_name)
remote_info = self.rpc.call_instance_info(instance.primary_node,
try:
feedback_fn("Running the instance OS create scripts...")
result = self.rpc.call_instance_os_add(inst.primary_node, inst)
- result.Raise()
- if not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
raise errors.OpExecError("Could not install OS for instance %s"
- " on node %s" %
- (inst.name, inst.primary_node))
+ " on node %s: %s" %
+ (inst.name, inst.primary_node, msg))
finally:
_ShutdownInstanceDisks(self, inst)
"""
env = _BuildInstanceHookEnvByObject(self, self.instance)
env["INSTANCE_NEW_NAME"] = self.op.new_name
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
self.op.instance_name)
_CheckNodeOnline(self, instance.primary_node)
- if instance.status != "down":
+ if instance.admin_up:
raise errors.OpPrereqError("Instance '%s' is marked to be up" %
self.op.instance_name)
remote_info = self.rpc.call_instance_info(instance.primary_node,
try:
result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
old_name)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
msg = ("Could not run OS rename script for instance %s on node %s"
- " (but the instance has been renamed in Ganeti)" %
- (inst.name, inst.primary_node))
+ " (but the instance has been renamed in Ganeti): %s" %
+ (inst.name, inst.primary_node, msg))
self.proc.LogWarning(msg)
finally:
_ShutdownInstanceDisks(self, inst)
elif field == "snodes":
val = list(instance.secondary_nodes)
elif field == "admin_state":
- val = (instance.status != "down")
+ val = instance.admin_up
elif field == "oper_state":
if instance.primary_node in bad_nodes:
val = None
else:
running = bool(live_data.get(instance.name))
if running:
- if instance.status != "down":
+ if instance.admin_up:
val = "running"
else:
val = "ERROR_up"
else:
- if instance.status != "down":
+ if instance.admin_up:
val = "ERROR_down"
else:
val = "ADMIN_down"
for dev in instance.disks:
# for drbd, these are drbd over lvm
if not _CheckDiskConsistency(self, dev, target_node, False):
- if instance.status == "up" and not self.op.ignore_consistency:
+ if instance.admin_up and not self.op.ignore_consistency:
raise errors.OpExecError("Disk %s is degraded on target node,"
" aborting failover." % dev.iv_name)
self.cfg.Update(instance)
# Only start the instance if it's marked as up
- if instance.status == "up":
+ if instance.admin_up:
feedback_fn("* activating the instance's disks on target node")
logging.info("Starting instance %s on node %s",
instance.name, target_node)
feedback_fn("* starting the instance on the target node")
result = self.rpc.call_instance_start(target_node, instance, None)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance %s on node %s." %
- (instance.name, target_node))
+ raise errors.OpExecError("Could not start instance %s on node %s: %s" %
+ (instance.name, target_node, msg))
class LUMigrateInstance(LogicalUnit):
self.feedback_fn("* done")
+ def _RevertDiskStatus(self):
+ """Try to revert the disk status after a failed migration.
+
+ """
+ target_node = self.target_node
+ try:
+ self._EnsureSecondary(target_node)
+ self._GoStandalone()
+ self._GoReconnect(False)
+ self._WaitUntilSync()
+ except errors.OpExecError, err:
+ self.LogWarning("Migration failed and I can't reconnect the"
+ " drives: error '%s'\n"
+ "Please look and recover the instance status" %
+ str(err))
+
+ def _AbortMigration(self):
+ """Call the hypervisor code to abort a started migration.
+
+ """
+ instance = self.instance
+ target_node = self.target_node
+ migration_info = self.migration_info
+
+ abort_result = self.rpc.call_finalize_migration(target_node,
+ instance,
+ migration_info,
+ False)
+ abort_msg = abort_result.RemoteFailMsg()
+ if abort_msg:
+ logging.error("Aborting migration failed on target node %s: %s" %
+ (target_node, abort_msg))
+ # Don't raise an exception here, as we still have to try to revert the
+ # disk status, even if this step failed.
+
def _ExecMigration(self):
"""Migrate an instance.
" synchronized on target node,"
" aborting migrate." % dev.iv_name)
+ # First get the migration information from the remote node
+ result = self.rpc.call_migration_info(source_node, instance)
+ msg = result.RemoteFailMsg()
+ if msg:
+ log_err = ("Failed fetching source migration information from %s: %s" %
+ (source_node, msg))
+ logging.error(log_err)
+ raise errors.OpExecError(log_err)
+
+ self.migration_info = migration_info = result.data[1]
+
+ # Then switch the disks to master/master mode
self._EnsureSecondary(target_node)
self._GoStandalone()
self._GoReconnect(True)
self._WaitUntilSync()
+ self.feedback_fn("* preparing %s to accept the instance" % target_node)
+ result = self.rpc.call_accept_instance(target_node,
+ instance,
+ migration_info,
+ self.nodes_ip[target_node])
+
+ msg = result.RemoteFailMsg()
+ if msg:
+ logging.error("Instance pre-migration failed, trying to revert"
+ " disk status: %s", msg)
+ self._AbortMigration()
+ self._RevertDiskStatus()
+ raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
+ (instance.name, msg))
+
self.feedback_fn("* migrating instance to %s" % target_node)
time.sleep(10)
result = self.rpc.call_instance_migrate(source_node, instance,
if msg:
logging.error("Instance migration failed, trying to revert"
" disk status: %s", msg)
- try:
- self._EnsureSecondary(target_node)
- self._GoStandalone()
- self._GoReconnect(False)
- self._WaitUntilSync()
- except errors.OpExecError, err:
- self.LogWarning("Migration failed and I can't reconnect the"
- " drives: error '%s'\n"
- "Please look and recover the instance status" %
- str(err))
-
+ self._AbortMigration()
+ self._RevertDiskStatus()
raise errors.OpExecError("Could not migrate instance %s: %s" %
(instance.name, msg))
time.sleep(10)
# distribute new instance config to the other nodes
self.cfg.Update(instance)
+ result = self.rpc.call_finalize_migration(target_node,
+ instance,
+ migration_info,
+ True)
+ msg = result.RemoteFailMsg()
+ if msg:
+ logging.error("Instance migration succeeded, but finalization failed:"
+ " %s" % msg)
+ raise errors.OpExecError("Could not finalize instance migration: %s" %
+ msg)
+
self._EnsureSecondary(source_node)
self._WaitUntilSync()
self._GoStandalone()
if not force_create:
return
+ _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
+
+
+def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
+ """Create a single block device on a given node.
+
+ This will not recurse over children of the device, so they must be
+ created in advance.
+
+ @param lu: the lu on whose behalf we execute
+ @param node: the node on which to create the device
+ @type instance: L{objects.Instance}
+ @param instance: the instance which owns the device
+ @type device: L{objects.Disk}
+ @param device: the device to create
+ @param info: the extra 'metadata' we should attach to the device
+ (this will be represented as a LVM tag)
+ @type force_open: boolean
+ @param force_open: this parameter will be passed to the
+ L{backend.CreateBlockDevice} function where it specifies
+ whether we run on primary or not, and it affects both
+ the child assembly and the device's own Open() execution
+
+ """
lu.cfg.SetDiskID(device, node)
- new_id = lu.rpc.call_blockdev_create(node, device, device.size,
+ result = lu.rpc.call_blockdev_create(node, device, device.size,
instance.name, force_open, info)
- if new_id.failed or not new_id.data:
+ msg = result.RemoteFailMsg()
+ if msg:
raise errors.OpExecError("Can't create block device %s on"
- " node %s" % (device, node))
+ " node %s for instance %s: %s" %
+ (device, node, instance.name, msg))
if device.physical_id is None:
- device.physical_id = new_id
+ device.physical_id = result.data[1]
def _GenerateUniqueNames(lu, exts):
self.be_full[constants.BE_MEMORY],
self.op.hypervisor)
- if self.op.start:
- self.instance_status = 'up'
- else:
- self.instance_status = 'down'
+ self.instance_status = self.op.start
def Exec(self, feedback_fn):
"""Create and add the instance to the cluster.
primary_node=pnode_name,
nics=self.nics, disks=disks,
disk_template=self.op.disk_template,
- status=self.instance_status,
+ admin_up=self.instance_status,
network_port=network_port,
beparams=self.op.beparams,
hvparams=self.op.hvparams,
# Declare that we don't want to remove the instance lock anymore, as we've
# added the instance to the config
del self.remove_locks[locking.LEVEL_INSTANCE]
- # Remove the temp. assignements for the instance's drbds
- self.cfg.ReleaseDRBDMinors(instance)
# Unlock all the nodes
if self.op.mode == constants.INSTANCE_IMPORT:
nodes_keep = [self.op.src_node]
if self.op.mode == constants.INSTANCE_CREATE:
feedback_fn("* running the instance OS create scripts...")
result = self.rpc.call_instance_os_add(pnode_name, iobj)
- result.Raise()
- if not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
raise errors.OpExecError("Could not add os for instance %s"
- " on node %s" %
- (instance, pnode_name))
+ " on node %s: %s" %
+ (instance, pnode_name, msg))
elif self.op.mode == constants.INSTANCE_IMPORT:
feedback_fn("* running the instance OS import scripts...")
logging.info("Starting instance %s on node %s", instance, pnode_name)
feedback_fn("* starting instance...")
result = self.rpc.call_instance_start(pnode_name, iobj, None)
- result.Raise()
- if not result.data:
- raise errors.OpExecError("Could not start instance")
+ msg = result.RemoteFailMsg()
+ if msg:
+ raise errors.OpExecError("Could not start instance: %s" % msg)
class LUConnectConsole(NoHooksLU):
logical_id=new_alone_id,
children=dev.children)
try:
- _CreateBlockDev(self, new_node, instance, new_drbd, False,
- _GetInstanceInfoText(instance), False)
- except error.BlockDeviceError:
+ _CreateSingleBlockDev(self, new_node, instance, new_drbd,
+ _GetInstanceInfoText(instance), False)
+ except errors.BlockDeviceError:
self.cfg.ReleaseDRBDMinors(instance.name)
raise
dev.logical_id = new_logical_id
cfg.SetDiskID(dev, pri_node)
cfg.Update(instance)
- # we can remove now the temp minors as now the new values are
- # written to the config file (and therefore stable)
- self.cfg.ReleaseDRBDMinors(instance.name)
# and now perform the drbd attach
info("attaching primary drbds to new secondary (standalone => connected)")
instance = self.instance
# Activate the instance disks if we're replacing them on a down instance
- if instance.status == "down":
+ if not instance.admin_up:
_StartInstanceDisks(self, instance, True)
if self.op.mode == constants.REPLACE_DISK_CHG:
ret = fn(feedback_fn)
# Deactivate the instance disks if we're replacing them on a down instance
- if instance.status == "down":
+ if not instance.admin_up:
_SafeShutdownInstanceDisks(self, instance)
return ret
instance = self.cfg.GetInstanceInfo(self.op.instance_name)
assert instance is not None, \
"Cannot retrieve locked instance %s" % self.op.instance_name
- _CheckNodeOnline(self, instance.primary_node)
- for node in instance.secondary_nodes:
+ nodenames = list(instance.all_nodes)
+ for node in nodenames:
_CheckNodeOnline(self, node)
self.disk = instance.FindDisk(self.op.disk)
- nodenames = [instance.primary_node] + list(instance.secondary_nodes)
nodeinfo = self.rpc.call_node_info(nodenames, self.cfg.GetVGName(),
instance.hypervisor)
for node in nodenames:
"""
instance = self.instance
disk = self.disk
- for node in (instance.secondary_nodes + (instance.primary_node,)):
+ for node in instance.all_nodes:
self.cfg.SetDiskID(disk, node)
result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
result.Raise()
remote_state = "down"
else:
remote_state = None
- if instance.status == "down":
- config_state = "down"
- else:
+ if instance.admin_up:
config_state = "up"
+ else:
+ config_state = "down"
disks = [self._ComputeDiskStatus(instance, None, device)
for device in instance.disks]
args['vcpus'] = self.be_new[constants.BE_VCPUS]
# FIXME: readd disk/nic changes
env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
- nl = [self.cfg.GetMasterNode(),
- self.instance.primary_node] + list(self.instance.secondary_nodes)
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
assert self.instance is not None, \
"Cannot retrieve locked instance %s" % self.op.instance_name
- pnode = self.instance.primary_node
- nodelist = [pnode]
- nodelist.extend(instance.secondary_nodes)
+ pnode = instance.primary_node
+ nodelist = list(instance.all_nodes)
# hvparams processing
if self.op.hvparams:
disk_idx_base = len(instance.disks)
new_disk = _GenerateDiskTemplate(self,
instance.disk_template,
- instance, instance.primary_node,
+ instance.name, instance.primary_node,
instance.secondary_nodes,
[disk_dict],
file_path,
try:
_CreateBlockDev(self, node, instance, new_disk,
f_create, info, f_create)
- except error.OpExecError, err:
+ except errors.OpExecError, err:
self.LogWarning("Failed to create volume %s (%s) on"
" node %s: %s",
new_disk.iv_name, new_disk, node, err)
snap_disks.append(new_dev)
finally:
- if self.op.shutdown and instance.status == "up":
+ if self.op.shutdown and instance.admin_up:
result = self.rpc.call_instance_start(src_node, instance, None)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance")
+ raise errors.OpExecError("Could not start instance: %s" % msg)
# TODO: check for size
i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
remote_info['memory_free'] -= max(0, i_mem_diff)
- if iinfo.status == "up":
+ if iinfo.admin_up:
i_p_up_mem += beinfo[constants.BE_MEMORY]
# compute memory used by instances
for n in iinfo.nics]
pir = {
"tags": list(iinfo.GetTags()),
- "should_run": iinfo.status == "up",
+ "should_run": iinfo.admin_up,
"vcpus": beinfo[constants.BE_VCPUS],
"memory": beinfo[constants.BE_MEMORY],
"os": iinfo.os,
- "nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
+ "nodes": list(iinfo.all_nodes),
"nics": nic_data,
"disks": [{"size": dsk.size, "mode": "w"} for dsk in iinfo.disks],
"disk_template": iinfo.disk_template,
"""
data = self.in_data
- if len(self.disks) != 2:
- raise errors.OpExecError("Only two-disk configurations supported")
disk_space = _ComputeDiskSize(self.disk_template, self.disks)
" 'nics' parameter")
if not isinstance(self.op.disks, list):
raise errors.OpPrereqError("Invalid parameter 'disks'")
- if len(self.op.disks) != 2:
- raise errors.OpPrereqError("Only two-disk configurations supported")
for row in self.op.disks:
if (not isinstance(row, dict) or
"size" not in row or
row["mode"] not in ['r', 'w']):
raise errors.OpPrereqError("Invalid contents of the"
" 'disks' parameter")
- if self.op.hypervisor is None:
+ if not hasattr(self.op, "hypervisor") or self.op.hypervisor is None:
self.op.hypervisor = self.cfg.GetHypervisorType()
elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
if not hasattr(self.op, "name"):