wanted.append(instance)
else:
- wanted = lu.cfg.GetInstanceList()
- return utils.NiceSort(wanted)
+ wanted = utils.NiceSort(lu.cfg.GetInstanceList())
+ return wanted
def _CheckOutputFields(static, dynamic, selected):
@param secondary_nodes: list of secondary nodes as strings
@type os_type: string
@param os_type: the name of the instance's OS
- @type status: string
- @param status: the desired status of the instances
+ @type status: boolean
+ @param status: the should_run status of the instance
@type memory: string
@param memory: the memory size of the instance
@type vcpus: string
@return: the hook environment for this instance
"""
+ if status:
+ str_status = "up"
+ else:
+ str_status = "down"
env = {
"OP_TARGET": name,
"INSTANCE_NAME": name,
"INSTANCE_PRIMARY": primary_node,
"INSTANCE_SECONDARIES": " ".join(secondary_nodes),
"INSTANCE_OS_TYPE": os_type,
- "INSTANCE_STATUS": status,
+ "INSTANCE_STATUS": str_status,
"INSTANCE_MEMORY": memory,
"INSTANCE_VCPUS": vcpus,
}
'primary_node': instance.primary_node,
'secondary_nodes': instance.secondary_nodes,
'os_type': instance.os,
- 'status': instance.os,
+ 'status': instance.admin_up,
'memory': bep[constants.BE_MEMORY],
'vcpus': bep[constants.BE_VCPUS],
'nics': [(nic.ip, nic.bridge, nic.mac) for nic in instance.nics],
self.share_locks = dict(((i, 1) for i in locking.LEVELS))
def _VerifyNode(self, nodeinfo, file_list, local_cksum,
- node_result, feedback_fn, master_files):
+ node_result, feedback_fn, master_files,
+ drbd_map):
"""Run multiple tests against a node.
Test list:
@param node_result: the results from the node
@param feedback_fn: function used to accumulate results
@param master_files: list of files that only masters should have
+ @param drbd_map: the used drbd minors for this node, in
+ form of minor: (instance, must_exist) which correspond to instances
+ and their running status
"""
node = nodeinfo.name
# compares ganeti version
local_version = constants.PROTOCOL_VERSION
remote_version = node_result.get('version', None)
- if not remote_version:
+ if not (remote_version and isinstance(remote_version, (list, tuple)) and
+ len(remote_version) == 2):
feedback_fn(" - ERROR: connection to %s failed" % (node))
return True
- if local_version != remote_version:
- feedback_fn(" - ERROR: sw version mismatch: master %s, node(%s) %s" %
- (local_version, node, remote_version))
+ if local_version != remote_version[0]:
+ feedback_fn(" - ERROR: incompatible protocol versions: master %s,"
+ " node %s %s" % (local_version, node, remote_version[0]))
return True
- # checks vg existance and size > 20G
+ # node seems compatible, we can actually try to look into its results
bad = False
+
+ # full package version
+ if constants.RELEASE_VERSION != remote_version[1]:
+ feedback_fn(" - WARNING: software version mismatch: master %s,"
+ " node %s %s" %
+ (constants.RELEASE_VERSION, node, remote_version[1]))
+
+ # checks vg existence and size > 20G
+
vglist = node_result.get(constants.NV_VGLIST, None)
if not vglist:
feedback_fn(" - ERROR: unable to check volume groups on node %s." %
if hv_result is not None:
feedback_fn(" - ERROR: hypervisor %s verify failure: '%s'" %
(hv_name, hv_result))
+
+ # check used drbd list
+ used_minors = node_result.get(constants.NV_DRBDLIST, [])
+ for minor, (iname, must_exist) in drbd_map.items():
+ if minor not in used_minors and must_exist:
+ feedback_fn(" - ERROR: drbd minor %d of instance %s is not active" %
+ (minor, iname))
+ bad = True
+ for minor in used_minors:
+ if minor not in drbd_map:
+ feedback_fn(" - ERROR: unallocated drbd minor %d is in use" % minor)
+ bad = True
+
return bad
def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
(volume, node))
bad = True
- if not instanceconfig.status == 'down':
+ if instanceconfig.admin_up:
if ((node_current not in node_instance or
not instance in node_instance[node_current]) and
node_current not in n_offline):
nodelist = utils.NiceSort(self.cfg.GetNodeList())
nodeinfo = [self.cfg.GetNodeInfo(nname) for nname in nodelist]
instancelist = utils.NiceSort(self.cfg.GetInstanceList())
+ instanceinfo = dict((iname, self.cfg.GetInstanceInfo(iname))
+ for iname in instancelist)
i_non_redundant = [] # Non redundant instances
i_non_a_balanced = [] # Non auto-balanced instances
n_offline = [] # List of offline nodes
constants.NV_VGLIST: None,
constants.NV_VERSION: None,
constants.NV_HVINFO: self.cfg.GetHypervisorType(),
+ constants.NV_DRBDLIST: None,
}
all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
self.cfg.GetClusterName())
cluster = self.cfg.GetClusterInfo()
master_node = self.cfg.GetMasterNode()
+ all_drbd_map = self.cfg.ComputeDRBDMap()
+
for node_i in nodeinfo:
node = node_i.name
nresult = all_nvinfo[node].data
bad = True
continue
+ node_drbd = {}
+ for minor, instance in all_drbd_map[node].items():
+ instance = instanceinfo[instance]
+ node_drbd[minor] = (instance.name, instance.admin_up)
result = self._VerifyNode(node_i, file_names, local_checksums,
- nresult, feedback_fn, master_files)
+ nresult, feedback_fn, master_files,
+ node_drbd)
bad = bad or result
lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
for instance in instancelist:
feedback_fn("* Verifying instance %s" % instance)
- inst_config = self.cfg.GetInstanceInfo(instance)
+ inst_config = instanceinfo[instance]
result = self._VerifyInstance(instance, inst_config, node_volume,
node_instance, feedback_fn, n_offline)
bad = bad or result
nv_dict = {}
for inst in instances:
inst_lvs = {}
- if (inst.status != "up" or
+ if (not inst.admin_up or
inst.disk_template not in constants.DTS_NET_MIRROR):
continue
inst.MapLVsByNode(inst_lvs)
continue
rstats = rstats.data
retries = 0
- for i in range(len(rstats)):
- mstat = rstats[i]
+ for i, mstat in enumerate(rstats):
if mstat is None:
lu.LogWarning("Can't compute data for node %s/%s",
node, instance.disks[i].iv_name)
for instance_name in instance_list:
instance = self.cfg.GetInstanceInfo(instance_name)
- if node.name == instance.primary_node:
- raise errors.OpPrereqError("Instance %s still running on the node,"
- " please remove first." % instance_name)
- if node.name in instance.secondary_nodes:
- raise errors.OpPrereqError("Instance %s has node as a secondary,"
+ if node.name in instance.all_nodes:
+ raise errors.OpPrereqError("Instance %s is still running on the node,"
" please remove first." % instance_name)
self.op.node_name = node.name
self.node = node
logging.error("Copy of file %s to node %s failed", fname, to_node)
to_copy = []
- if constants.HT_XEN_HVM in self.cfg.GetClusterInfo().enabled_hypervisors:
+ enabled_hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
+ if constants.HTS_USE_VNC.intersection(enabled_hypervisors):
to_copy.append(constants.VNC_PASSWORD_FILE)
+
for fname in to_copy:
result = self.rpc.call_upload_file([node], fname)
if result[node].failed or not result[node]:
"FORCE": self.op.force,
}
env.update(_BuildInstanceHookEnvByObject(self, self.instance))
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
_StartInstanceDisks(self, instance, force)
result = self.rpc.call_instance_start(node_current, instance, extra_args)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance")
+ raise errors.OpExecError("Could not start instance: %s" % msg)
class LURebootInstance(LogicalUnit):
"IGNORE_SECONDARIES": self.op.ignore_secondaries,
}
env.update(_BuildInstanceHookEnvByObject(self, self.instance))
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
_ShutdownInstanceDisks(self, instance)
_StartInstanceDisks(self, instance, ignore_secondaries)
result = self.rpc.call_instance_start(node_current, instance, extra_args)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance for full reboot")
+ raise errors.OpExecError("Could not start instance for"
+ " full reboot: %s" % msg)
self.cfg.MarkInstanceUp(instance.name)
"""
env = _BuildInstanceHookEnvByObject(self, self.instance)
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
"""
env = _BuildInstanceHookEnvByObject(self, self.instance)
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
if instance.disk_template == constants.DT_DISKLESS:
raise errors.OpPrereqError("Instance '%s' has no disks" %
self.op.instance_name)
- if instance.status != "down":
+ if instance.admin_up:
raise errors.OpPrereqError("Instance '%s' is marked to be up" %
self.op.instance_name)
remote_info = self.rpc.call_instance_info(instance.primary_node,
try:
feedback_fn("Running the instance OS create scripts...")
result = self.rpc.call_instance_os_add(inst.primary_node, inst)
- result.Raise()
- if not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
raise errors.OpExecError("Could not install OS for instance %s"
- " on node %s" %
- (inst.name, inst.primary_node))
+ " on node %s: %s" %
+ (inst.name, inst.primary_node, msg))
finally:
_ShutdownInstanceDisks(self, inst)
"""
env = _BuildInstanceHookEnvByObject(self, self.instance)
env["INSTANCE_NEW_NAME"] = self.op.new_name
- nl = ([self.cfg.GetMasterNode(), self.instance.primary_node] +
- list(self.instance.secondary_nodes))
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
self.op.instance_name)
_CheckNodeOnline(self, instance.primary_node)
- if instance.status != "down":
+ if instance.admin_up:
raise errors.OpPrereqError("Instance '%s' is marked to be up" %
self.op.instance_name)
remote_info = self.rpc.call_instance_info(instance.primary_node,
try:
result = self.rpc.call_instance_run_rename(inst.primary_node, inst,
old_name)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
msg = ("Could not run OS rename script for instance %s on node %s"
- " (but the instance has been renamed in Ganeti)" %
- (inst.name, inst.primary_node))
+ " (but the instance has been renamed in Ganeti): %s" %
+ (inst.name, inst.primary_node, msg))
self.proc.LogWarning(msg)
finally:
_ShutdownInstanceDisks(self, inst)
"""
all_info = self.cfg.GetAllInstancesInfo()
- if self.do_locking:
- instance_names = self.acquired_locks[locking.LEVEL_INSTANCE]
- elif self.wanted != locking.ALL_SET:
- instance_names = self.wanted
- missing = set(instance_names).difference(all_info.keys())
- if missing:
- raise errors.OpExecError(
- "Some instances were removed before retrieving their data: %s"
- % missing)
+ if self.wanted == locking.ALL_SET:
+ # caller didn't specify instance names, so ordering is not important
+ if self.do_locking:
+ instance_names = self.acquired_locks[locking.LEVEL_INSTANCE]
+ else:
+ instance_names = all_info.keys()
+ instance_names = utils.NiceSort(instance_names)
else:
- instance_names = all_info.keys()
+ # caller did specify names, so we must keep the ordering
+ if self.do_locking:
+ tgt_set = self.acquired_locks[locking.LEVEL_INSTANCE]
+ else:
+ tgt_set = all_info.keys()
+ missing = set(self.wanted).difference(tgt_set)
+ if missing:
+ raise errors.OpExecError("Some instances were removed before"
+ " retrieving their data: %s" % missing)
+ instance_names = self.wanted
- instance_names = utils.NiceSort(instance_names)
instance_list = [all_info[iname] for iname in instance_names]
# begin data gathering
elif field == "snodes":
val = list(instance.secondary_nodes)
elif field == "admin_state":
- val = (instance.status != "down")
+ val = instance.admin_up
elif field == "oper_state":
if instance.primary_node in bad_nodes:
val = None
else:
running = bool(live_data.get(instance.name))
if running:
- if instance.status != "down":
+ if instance.admin_up:
val = "running"
else:
val = "ERROR_up"
else:
- if instance.status != "down":
+ if instance.admin_up:
val = "ERROR_down"
else:
val = "ADMIN_down"
for dev in instance.disks:
# for drbd, these are drbd over lvm
if not _CheckDiskConsistency(self, dev, target_node, False):
- if instance.status == "up" and not self.op.ignore_consistency:
+ if instance.admin_up and not self.op.ignore_consistency:
raise errors.OpExecError("Disk %s is degraded on target node,"
" aborting failover." % dev.iv_name)
self.cfg.Update(instance)
# Only start the instance if it's marked as up
- if instance.status == "up":
+ if instance.admin_up:
feedback_fn("* activating the instance's disks on target node")
logging.info("Starting instance %s on node %s",
instance.name, target_node)
feedback_fn("* starting the instance on the target node")
result = self.rpc.call_instance_start(target_node, instance, None)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance %s on node %s." %
- (instance.name, target_node))
+ raise errors.OpExecError("Could not start instance %s on node %s: %s" %
+ (instance.name, target_node, msg))
class LUMigrateInstance(LogicalUnit):
self.feedback_fn("* done")
+ def _RevertDiskStatus(self):
+ """Try to revert the disk status after a failed migration.
+
+ """
+ target_node = self.target_node
+ try:
+ self._EnsureSecondary(target_node)
+ self._GoStandalone()
+ self._GoReconnect(False)
+ self._WaitUntilSync()
+ except errors.OpExecError, err:
+ self.LogWarning("Migration failed and I can't reconnect the"
+ " drives: error '%s'\n"
+ "Please look and recover the instance status" %
+ str(err))
+
+ def _AbortMigration(self):
+ """Call the hypervisor code to abort a started migration.
+
+ """
+ instance = self.instance
+ target_node = self.target_node
+ migration_info = self.migration_info
+
+ abort_result = self.rpc.call_finalize_migration(target_node,
+ instance,
+ migration_info,
+ False)
+ abort_msg = abort_result.RemoteFailMsg()
+ if abort_msg:
+ logging.error("Aborting migration failed on target node %s: %s" %
+ (target_node, abort_msg))
+ # Don't raise an exception here, as we still have to try to revert the
+ # disk status, even if this step failed.
+
def _ExecMigration(self):
"""Migrate an instance.
" synchronized on target node,"
" aborting migrate." % dev.iv_name)
+ # First get the migration information from the remote node
+ result = self.rpc.call_migration_info(source_node, instance)
+ msg = result.RemoteFailMsg()
+ if msg:
+ log_err = ("Failed fetching source migration information from %s: %s" %
+ (source_node, msg))
+ logging.error(log_err)
+ raise errors.OpExecError(log_err)
+
+ self.migration_info = migration_info = result.data[1]
+
+ # Then switch the disks to master/master mode
self._EnsureSecondary(target_node)
self._GoStandalone()
self._GoReconnect(True)
self._WaitUntilSync()
+ self.feedback_fn("* preparing %s to accept the instance" % target_node)
+ result = self.rpc.call_accept_instance(target_node,
+ instance,
+ migration_info,
+ self.nodes_ip[target_node])
+
+ msg = result.RemoteFailMsg()
+ if msg:
+ logging.error("Instance pre-migration failed, trying to revert"
+ " disk status: %s", msg)
+ self._AbortMigration()
+ self._RevertDiskStatus()
+ raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
+ (instance.name, msg))
+
self.feedback_fn("* migrating instance to %s" % target_node)
time.sleep(10)
result = self.rpc.call_instance_migrate(source_node, instance,
if msg:
logging.error("Instance migration failed, trying to revert"
" disk status: %s", msg)
- try:
- self._EnsureSecondary(target_node)
- self._GoStandalone()
- self._GoReconnect(False)
- self._WaitUntilSync()
- except errors.OpExecError, err:
- self.LogWarning("Migration failed and I can't reconnect the"
- " drives: error '%s'\n"
- "Please look and recover the instance status" %
- str(err))
-
+ self._AbortMigration()
+ self._RevertDiskStatus()
raise errors.OpExecError("Could not migrate instance %s: %s" %
(instance.name, msg))
time.sleep(10)
# distribute new instance config to the other nodes
self.cfg.Update(instance)
+ result = self.rpc.call_finalize_migration(target_node,
+ instance,
+ migration_info,
+ True)
+ msg = result.RemoteFailMsg()
+ if msg:
+ logging.error("Instance migration succeeded, but finalization failed:"
+ " %s" % msg)
+ raise errors.OpExecError("Could not finalize instance migration: %s" %
+ msg)
+
self._EnsureSecondary(source_node)
self._WaitUntilSync()
self._GoStandalone()
return self._ExecMigration()
-def _CreateBlockDevOnPrimary(lu, node, instance, device, info):
- """Create a tree of block devices on the primary node.
+def _CreateBlockDev(lu, node, instance, device, force_create,
+ info, force_open):
+ """Create a tree of block devices on a given node.
+
+ If this device type has to be created on secondaries, create it and
+ all its children.
+
+ If not, just recurse to children keeping the same 'force_create' value.
- This always creates all devices.
+ @param lu: the lu on whose behalf we execute
+ @param node: the node on which to create the device
+ @type instance: L{objects.Instance}
+ @param instance: the instance which owns the device
+ @type device: L{objects.Disk}
+ @param device: the device to create
+ @type force_create: boolean
+ @param force_create: whether to force creation of this device; this
+ will be changed to True whenever we find a device for which
+ CreateOnSecondary() is true
+ @param info: the extra 'metadata' we should attach to the device
+ (this will be represented as a LVM tag)
+ @type force_open: boolean
+ @param force_open: this parameter will be passed to the
+ L{backend.CreateBlockDevice} function where it specifies
+ whether we run on primary or not, and it affects both
+ the child assembly and the device own Open() execution
"""
+ if device.CreateOnSecondary():
+ force_create = True
+
if device.children:
for child in device.children:
- if not _CreateBlockDevOnPrimary(lu, node, instance, child, info):
- return False
+ _CreateBlockDev(lu, node, instance, child, force_create,
+ info, force_open)
- lu.cfg.SetDiskID(device, node)
- new_id = lu.rpc.call_blockdev_create(node, device, device.size,
- instance.name, True, info)
- if new_id.failed or not new_id.data:
- return False
- if device.physical_id is None:
- device.physical_id = new_id
- return True
+ if not force_create:
+ return
+ _CreateSingleBlockDev(lu, node, instance, device, info, force_open)
-def _CreateBlockDevOnSecondary(lu, node, instance, device, force, info):
- """Create a tree of block devices on a secondary node.
- If this device type has to be created on secondaries, create it and
- all its children.
+def _CreateSingleBlockDev(lu, node, instance, device, info, force_open):
+ """Create a single block device on a given node.
- If not, just recurse to children keeping the same 'force' value.
+ This will not recurse over children of the device, so they must be
+ created in advance.
- """
- if device.CreateOnSecondary():
- force = True
- if device.children:
- for child in device.children:
- if not _CreateBlockDevOnSecondary(lu, node, instance,
- child, force, info):
- return False
+ @param lu: the lu on whose behalf we execute
+ @param node: the node on which to create the device
+ @type instance: L{objects.Instance}
+ @param instance: the instance which owns the device
+ @type device: L{objects.Disk}
+ @param device: the device to create
+ @param info: the extra 'metadata' we should attach to the device
+ (this will be represented as a LVM tag)
+ @type force_open: boolean
+ @param force_open: this parameter will be passed to the
+ L{backend.CreateBlockDevice} function where it specifies
+ whether we run on primary or not, and it affects both
+ the child assembly and the device own Open() execution
- if not force:
- return True
+ """
lu.cfg.SetDiskID(device, node)
- new_id = lu.rpc.call_blockdev_create(node, device, device.size,
- instance.name, False, info)
- if new_id.failed or not new_id.data:
- return False
+ result = lu.rpc.call_blockdev_create(node, device, device.size,
+ instance.name, force_open, info)
+ msg = result.RemoteFailMsg()
+ if msg:
+ raise errors.OpExecError("Can't create block device %s on"
+ " node %s for instance %s: %s" %
+ (device, node, instance.name, msg))
if device.physical_id is None:
- device.physical_id = new_id
- return True
+ device.physical_id = result.data[1]
def _GenerateUniqueNames(lu, exts):
disk_index = idx + base_index
disk_dev = objects.Disk(dev_type=constants.LD_LV, size=disk["size"],
logical_id=(vgname, names[idx]),
- iv_name="disk/%d" % disk_index)
+ iv_name="disk/%d" % disk_index,
+ mode=disk["mode"])
disks.append(disk_dev)
elif template_name == constants.DT_DRBD8:
if len(secondary_nodes) != 1:
minors = lu.cfg.AllocateDRBDMinor(
[primary_node, remote_node] * len(disk_info), instance_name)
- names = _GenerateUniqueNames(lu,
- [".disk%d_%s" % (i, s)
- for i in range(disk_count)
- for s in ("data", "meta")
- ])
+ names = []
+ for lv_prefix in _GenerateUniqueNames(lu, [".disk%d" % i
+ for i in range(disk_count)]):
+ names.append(lv_prefix + "_data")
+ names.append(lv_prefix + "_meta")
for idx, disk in enumerate(disk_info):
disk_index = idx + base_index
disk_dev = _GenerateDRBD8Branch(lu, primary_node, remote_node,
disk["size"], names[idx*2:idx*2+2],
"disk/%d" % disk_index,
minors[idx*2], minors[idx*2+1])
+ disk_dev.mode = disk["mode"]
disks.append(disk_dev)
elif template_name == constants.DT_FILE:
if len(secondary_nodes) != 0:
iv_name="disk/%d" % disk_index,
logical_id=(file_driver,
"%s/disk%d" % (file_storage_dir,
- idx)))
+ idx)),
+ mode=disk["mode"])
disks.append(disk_dev)
else:
raise errors.ProgrammerError("Invalid disk template '%s'" % template_name)
"""
info = _GetInstanceInfoText(instance)
+ pnode = instance.primary_node
if instance.disk_template == constants.DT_FILE:
file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
- result = lu.rpc.call_file_storage_dir_create(instance.primary_node,
- file_storage_dir)
+ result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
if result.failed or not result.data:
- logging.error("Could not connect to node '%s'", instance.primary_node)
- return False
+ raise errors.OpExecError("Could not connect to node '%s'" % pnode)
if not result.data[0]:
- logging.error("Failed to create directory '%s'", file_storage_dir)
- return False
+ raise errors.OpExecError("Failed to create directory '%s'" %
+ file_storage_dir)
# Note: this needs to be kept in sync with adding of disks in
# LUSetInstanceParams
logging.info("Creating volume %s for instance %s",
device.iv_name, instance.name)
#HARDCODE
- for secondary_node in instance.secondary_nodes:
- if not _CreateBlockDevOnSecondary(lu, secondary_node, instance,
- device, False, info):
- logging.error("Failed to create volume %s (%s) on secondary node %s!",
- device.iv_name, device, secondary_node)
- return False
- #HARDCODE
- if not _CreateBlockDevOnPrimary(lu, instance.primary_node,
- instance, device, info):
- logging.error("Failed to create volume %s on primary!", device.iv_name)
- return False
-
- return True
+ for node in instance.all_nodes:
+ f_create = node == pnode
+ _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
def _RemoveDisks(lu, instance):
hvparams)
for node in nodenames:
info = hvinfo[node]
+ if info.offline:
+ continue
info.Raise()
if not info.data or not isinstance(info.data, (tuple, list)):
raise errors.OpPrereqError("Cannot get current information"
raise errors.OpPrereqError("Invalid MAC address specified: %s" %
mac)
# bridge verification
- bridge = nic.get("bridge", self.cfg.GetDefBridge())
+ bridge = nic.get("bridge", None)
+ if bridge is None:
+ bridge = self.cfg.GetDefBridge()
self.nics.append(objects.NIC(mac=mac, ip=nic_ip, bridge=bridge))
# disk checks/pre-build
self.be_full[constants.BE_MEMORY],
self.op.hypervisor)
- if self.op.start:
- self.instance_status = 'up'
- else:
- self.instance_status = 'down'
+ self.instance_status = self.op.start
def Exec(self, feedback_fn):
"""Create and add the instance to the cluster.
primary_node=pnode_name,
nics=self.nics, disks=disks,
disk_template=self.op.disk_template,
- status=self.instance_status,
+ admin_up=self.instance_status,
network_port=network_port,
beparams=self.op.beparams,
hvparams=self.op.hvparams,
)
feedback_fn("* creating instance disks...")
- if not _CreateDisks(self, iobj):
- _RemoveDisks(self, iobj)
- self.cfg.ReleaseDRBDMinors(instance)
- raise errors.OpExecError("Device creation failed, reverting...")
+ try:
+ _CreateDisks(self, iobj)
+ except errors.OpExecError:
+ self.LogWarning("Device creation failed, reverting...")
+ try:
+ _RemoveDisks(self, iobj)
+ finally:
+ self.cfg.ReleaseDRBDMinors(instance)
+ raise
feedback_fn("adding instance %s to cluster config" % instance)
# Declare that we don't want to remove the instance lock anymore, as we've
# added the instance to the config
del self.remove_locks[locking.LEVEL_INSTANCE]
- # Remove the temp. assignements for the instance's drbds
- self.cfg.ReleaseDRBDMinors(instance)
# Unlock all the nodes
if self.op.mode == constants.INSTANCE_IMPORT:
nodes_keep = [self.op.src_node]
if self.op.mode == constants.INSTANCE_CREATE:
feedback_fn("* running the instance OS create scripts...")
result = self.rpc.call_instance_os_add(pnode_name, iobj)
- result.Raise()
- if not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
raise errors.OpExecError("Could not add os for instance %s"
- " on node %s" %
- (instance, pnode_name))
+ " on node %s: %s" %
+ (instance, pnode_name, msg))
elif self.op.mode == constants.INSTANCE_IMPORT:
feedback_fn("* running the instance OS import scripts...")
logging.info("Starting instance %s on node %s", instance, pnode_name)
feedback_fn("* starting instance...")
result = self.rpc.call_instance_start(pnode_name, iobj, None)
- result.Raise()
- if not result.data:
- raise errors.OpExecError("Could not start instance")
+ msg = result.RemoteFailMsg()
+ if msg:
+ raise errors.OpExecError("Could not start instance: %s" % msg)
class LUConnectConsole(NoHooksLU):
logging.debug("Connecting to console of %s on %s", instance.name, node)
hyper = hypervisor.GetHypervisor(instance.hypervisor)
- console_cmd = hyper.GetShellCommandForConsole(instance)
+ cluster = self.cfg.GetClusterInfo()
+ # beparams and hvparams are passed separately, to avoid editing the
+ # instance and then saving the defaults in the instance itself.
+ hvparams = cluster.FillHV(instance)
+ beparams = cluster.FillBE(instance)
+ console_cmd = hyper.GetShellCommandForConsole(instance, hvparams, beparams)
# build ssh cmdline
return self.ssh.BuildCmd(node, "root", console_cmd, batch=True, tty=True)
raise errors.OpPrereqError("Node '%s' not known" %
self.op.remote_node)
self.op.remote_node = remote_node
+ # Warning: do not remove the locking of the new secondary here
+ # unless DRBD8.AddChildren is changed to work in parallel;
+ # currently it doesn't since parallel invocations of
+ # FindUnusedMinor will conflict
self.needed_locks[locking.LEVEL_NODE] = [remote_node]
self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
else:
iv_names[dev.iv_name] = (dev, old_lvs, new_lvs)
info("creating new local storage on %s for %s" %
(tgt_node, dev.iv_name))
- # since we *always* want to create this LV, we use the
- # _Create...OnPrimary (which forces the creation), even if we
- # are talking about the secondary node
+ # we pass force_create=True to force the LVM creation
for new_lv in new_lvs:
- if not _CreateBlockDevOnPrimary(self, tgt_node, instance, new_lv,
- _GetInstanceInfoText(instance)):
- raise errors.OpExecError("Failed to create new LV named '%s' on"
- " node '%s'" %
- (new_lv.logical_id[1], tgt_node))
+ _CreateBlockDev(self, tgt_node, instance, new_lv, True,
+ _GetInstanceInfoText(instance), False)
# Step: for each lv, detach+rename*2+attach
self.proc.LogStep(4, steps_total, "change drbd configuration")
for idx, dev in enumerate(instance.disks):
info("adding new local storage on %s for disk/%d" %
(new_node, idx))
- # since we *always* want to create this LV, we use the
- # _Create...OnPrimary (which forces the creation), even if we
- # are talking about the secondary node
+ # we pass force_create=True to force LVM creation
for new_lv in dev.children:
- if not _CreateBlockDevOnPrimary(self, new_node, instance, new_lv,
- _GetInstanceInfoText(instance)):
- raise errors.OpExecError("Failed to create new LV named '%s' on"
- " node '%s'" %
- (new_lv.logical_id[1], new_node))
+ _CreateBlockDev(self, new_node, instance, new_lv, True,
+ _GetInstanceInfoText(instance), False)
# Step 4: dbrd minors and drbd setups changes
# after this, we must manually remove the drbd minors on both the
new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
logical_id=new_alone_id,
children=dev.children)
- if not _CreateBlockDevOnSecondary(self, new_node, instance,
- new_drbd, False,
- _GetInstanceInfoText(instance)):
+ try:
+ _CreateSingleBlockDev(self, new_node, instance, new_drbd,
+ _GetInstanceInfoText(instance), False)
+ except errors.OpExecError:
+ # _CreateSingleBlockDev reports a failed creation by raising OpExecError;
+ # release the temporarily reserved minors before propagating the error
self.cfg.ReleaseDRBDMinors(instance.name)
- raise errors.OpExecError("Failed to create new DRBD on"
- " node '%s'" % new_node)
+ raise
for idx, dev in enumerate(instance.disks):
# we have new devices, shutdown the drbd on the old secondary
dev.logical_id = new_logical_id
cfg.SetDiskID(dev, pri_node)
cfg.Update(instance)
- # we can remove now the temp minors as now the new values are
- # written to the config file (and therefore stable)
- self.cfg.ReleaseDRBDMinors(instance.name)
# and now perform the drbd attach
info("attaching primary drbds to new secondary (standalone => connected)")
instance = self.instance
# Activate the instance disks if we're replacing them on a down instance
- if instance.status == "down":
+ if not instance.admin_up:
_StartInstanceDisks(self, instance, True)
if self.op.mode == constants.REPLACE_DISK_CHG:
ret = fn(feedback_fn)
# Deactivate the instance disks if we're replacing them on a down instance
- if instance.status == "down":
+ if not instance.admin_up:
_SafeShutdownInstanceDisks(self, instance)
return ret
instance = self.cfg.GetInstanceInfo(self.op.instance_name)
assert instance is not None, \
"Cannot retrieve locked instance %s" % self.op.instance_name
- _CheckNodeOnline(self, instance.primary_node)
- for node in instance.secondary_nodes:
+ nodenames = list(instance.all_nodes)
+ for node in nodenames:
_CheckNodeOnline(self, node)
self.disk = instance.FindDisk(self.op.disk)
- nodenames = [instance.primary_node] + list(instance.secondary_nodes)
nodeinfo = self.rpc.call_node_info(nodenames, self.cfg.GetVGName(),
instance.hypervisor)
for node in nodenames:
"""
instance = self.instance
disk = self.disk
- for node in (instance.secondary_nodes + (instance.primary_node,)):
+ for node in instance.all_nodes:
self.cfg.SetDiskID(disk, node)
result = self.rpc.call_blockdev_grow(node, disk, self.op.amount)
result.Raise()
remote_state = "down"
else:
remote_state = None
- if instance.status == "down":
- config_state = "down"
- else:
+ if instance.admin_up:
config_state = "up"
+ else:
+ config_state = "down"
disks = [self._ComputeDiskStatus(instance, None, device)
for device in instance.disks]
raise errors.OpPrereqError("Invalid disk index")
if disk_op == constants.DDM_ADD:
mode = disk_dict.setdefault('mode', constants.DISK_RDWR)
- if mode not in (constants.DISK_RDONLY, constants.DISK_RDWR):
+ if mode not in constants.DISK_ACCESS_SET:
raise errors.OpPrereqError("Invalid disk access mode '%s'" % mode)
size = disk_dict.get('size', None)
if size is None:
args['vcpus'] = self.be_new[constants.BE_VCPUS]
# FIXME: readd disk/nic changes
env = _BuildInstanceHookEnvByObject(self, self.instance, override=args)
- nl = [self.cfg.GetMasterNode(),
- self.instance.primary_node] + list(self.instance.secondary_nodes)
+ nl = [self.cfg.GetMasterNode()] + list(self.instance.all_nodes)
return env, nl, nl
def CheckPrereq(self):
instance = self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
assert self.instance is not None, \
"Cannot retrieve locked instance %s" % self.op.instance_name
- pnode = self.instance.primary_node
- nodelist = [pnode]
- nodelist.extend(instance.secondary_nodes)
+ pnode = instance.primary_node
+ nodelist = list(instance.all_nodes)
# hvparams processing
if self.op.hvparams:
disk_idx_base = len(instance.disks)
new_disk = _GenerateDiskTemplate(self,
instance.disk_template,
- instance, instance.primary_node,
+ instance.name, instance.primary_node,
instance.secondary_nodes,
[disk_dict],
file_path,
file_driver,
disk_idx_base)[0]
- new_disk.mode = disk_dict['mode']
instance.disks.append(new_disk)
info = _GetInstanceInfoText(instance)
new_disk.iv_name, instance.name)
# Note: this needs to be kept in sync with _CreateDisks
#HARDCODE
- for secondary_node in instance.secondary_nodes:
- if not _CreateBlockDevOnSecondary(self, secondary_node, instance,
- new_disk, False, info):
+ for node in instance.all_nodes:
+ f_create = node == instance.primary_node
+ try:
+ _CreateBlockDev(self, node, instance, new_disk,
+ f_create, info, f_create)
+ except errors.OpExecError, err:
self.LogWarning("Failed to create volume %s (%s) on"
- " secondary node %s!",
- new_disk.iv_name, new_disk, secondary_node)
- #HARDCODE
- if not _CreateBlockDevOnPrimary(self, instance.primary_node,
- instance, new_disk, info):
- self.LogWarning("Failed to create volume %s on primary!",
- new_disk.iv_name)
+ " node %s: %s",
+ new_disk.iv_name, new_disk, node, err)
result.append(("disk/%d" % disk_idx_base, "add:size=%s,mode=%s" %
(new_disk.size, new_disk.mode)))
else:
snap_disks.append(new_dev)
finally:
- if self.op.shutdown and instance.status == "up":
+ if self.op.shutdown and instance.admin_up:
result = self.rpc.call_instance_start(src_node, instance, None)
- if result.failed or not result.data:
+ msg = result.RemoteFailMsg()
+ if msg:
_ShutdownInstanceDisks(self, instance)
- raise errors.OpExecError("Could not start instance")
+ raise errors.OpExecError("Could not start instance: %s" % msg)
# TODO: check for size
"version": 1,
"cluster_name": cfg.GetClusterName(),
"cluster_tags": list(cluster_info.GetTags()),
- "enable_hypervisors": list(cluster_info.enabled_hypervisors),
+ "enabled_hypervisors": list(cluster_info.enabled_hypervisors),
# we don't have job IDs
}
iinfo = cfg.GetAllInstancesInfo().values()
hypervisor_name)
node_iinfo = self.lu.rpc.call_all_instances_info(node_list,
cluster_info.enabled_hypervisors)
- for nname in node_list:
+ for nname, nresult in node_data.items():
+ # first fill in static (config-based) values
ninfo = cfg.GetNodeInfo(nname)
- node_data[nname].Raise()
- if not isinstance(node_data[nname].data, dict):
- raise errors.OpExecError("Can't get data for node %s" % nname)
- remote_info = node_data[nname].data
- for attr in ['memory_total', 'memory_free', 'memory_dom0',
- 'vg_size', 'vg_free', 'cpu_total']:
- if attr not in remote_info:
- raise errors.OpExecError("Node '%s' didn't return attribute '%s'" %
- (nname, attr))
- try:
- remote_info[attr] = int(remote_info[attr])
- except ValueError, err:
- raise errors.OpExecError("Node '%s' returned invalid value for '%s':"
- " %s" % (nname, attr, str(err)))
- # compute memory used by primary instances
- i_p_mem = i_p_up_mem = 0
- for iinfo, beinfo in i_list:
- if iinfo.primary_node == nname:
- i_p_mem += beinfo[constants.BE_MEMORY]
- if iinfo.name not in node_iinfo[nname]:
- i_used_mem = 0
- else:
- i_used_mem = int(node_iinfo[nname][iinfo.name]['memory'])
- i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
- remote_info['memory_free'] -= max(0, i_mem_diff)
-
- if iinfo.status == "up":
- i_p_up_mem += beinfo[constants.BE_MEMORY]
-
- # compute memory used by instances
pnr = {
"tags": list(ninfo.GetTags()),
- "total_memory": remote_info['memory_total'],
- "reserved_memory": remote_info['memory_dom0'],
- "free_memory": remote_info['memory_free'],
- "i_pri_memory": i_p_mem,
- "i_pri_up_memory": i_p_up_mem,
- "total_disk": remote_info['vg_size'],
- "free_disk": remote_info['vg_free'],
"primary_ip": ninfo.primary_ip,
"secondary_ip": ninfo.secondary_ip,
- "total_cpus": remote_info['cpu_total'],
"offline": ninfo.offline,
+ "master_candidate": ninfo.master_candidate,
}
+
+ if not ninfo.offline:
+ nresult.Raise()
+ if not isinstance(nresult.data, dict):
+ raise errors.OpExecError("Can't get data for node %s" % nname)
+ remote_info = nresult.data
+ for attr in ['memory_total', 'memory_free', 'memory_dom0',
+ 'vg_size', 'vg_free', 'cpu_total']:
+ if attr not in remote_info:
+ raise errors.OpExecError("Node '%s' didn't return attribute"
+ " '%s'" % (nname, attr))
+ try:
+ remote_info[attr] = int(remote_info[attr])
+ except ValueError, err:
+ raise errors.OpExecError("Node '%s' returned invalid value"
+ " for '%s': %s" % (nname, attr, err))
+ # compute memory used by primary instances
+ i_p_mem = i_p_up_mem = 0
+ for iinfo, beinfo in i_list:
+ if iinfo.primary_node == nname:
+ i_p_mem += beinfo[constants.BE_MEMORY]
+ if iinfo.name not in node_iinfo[nname].data:
+ i_used_mem = 0
+ else:
+ i_used_mem = int(node_iinfo[nname].data[iinfo.name]['memory'])
+ i_mem_diff = beinfo[constants.BE_MEMORY] - i_used_mem
+ remote_info['memory_free'] -= max(0, i_mem_diff)
+
+ if iinfo.admin_up:
+ i_p_up_mem += beinfo[constants.BE_MEMORY]
+
+ # then assemble the dynamic values (only filled in for online nodes)
+ pnr_dyn = {
+ "total_memory": remote_info['memory_total'],
+ "reserved_memory": remote_info['memory_dom0'],
+ "free_memory": remote_info['memory_free'],
+ "total_disk": remote_info['vg_size'],
+ "free_disk": remote_info['vg_free'],
+ "total_cpus": remote_info['cpu_total'],
+ "i_pri_memory": i_p_mem,
+ "i_pri_up_memory": i_p_up_mem,
+ }
+ pnr.update(pnr_dyn)
+
node_results[nname] = pnr
data["nodes"] = node_results
for n in iinfo.nics]
pir = {
"tags": list(iinfo.GetTags()),
- "should_run": iinfo.status == "up",
+ "admin_up": iinfo.admin_up,
"vcpus": beinfo[constants.BE_VCPUS],
"memory": beinfo[constants.BE_MEMORY],
"os": iinfo.os,
"nodes": [iinfo.primary_node] + list(iinfo.secondary_nodes),
"nics": nic_data,
- "disks": [{"size": dsk.size, "mode": "w"} for dsk in iinfo.disks],
+ "disks": [{"size": dsk.size, "mode": dsk.mode} for dsk in iinfo.disks],
"disk_template": iinfo.disk_template,
"hypervisor": iinfo.hypervisor,
}
"""
data = self.in_data
- if len(self.disks) != 2:
- raise errors.OpExecError("Only two-disk configurations supported")
disk_space = _ComputeDiskSize(self.disk_template, self.disks)
" 'nics' parameter")
if not isinstance(self.op.disks, list):
raise errors.OpPrereqError("Invalid parameter 'disks'")
- if len(self.op.disks) != 2:
- raise errors.OpPrereqError("Only two-disk configurations supported")
for row in self.op.disks:
if (not isinstance(row, dict) or
"size" not in row or
row["mode"] not in ['r', 'w']):
raise errors.OpPrereqError("Invalid contents of the"
" 'disks' parameter")
- if self.op.hypervisor is None:
+ if not hasattr(self.op, "hypervisor") or self.op.hypervisor is None:
self.op.hypervisor = self.cfg.GetHypervisorType()
elif self.op.mode == constants.IALLOCATOR_MODE_RELOC:
if not hasattr(self.op, "name"):