priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
mkdir=True)
except errors.OpExecError, err:
- logging.exception("Error while processing user ssh files")
- return False
+ msg = "Error while processing user ssh files"
+ logging.exception(msg)
+ return (False, "%s: %s" % (msg, err))
for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
utils.WriteFile(name, data=content, mode=0600)
utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"])
- return True
+ return (True, "Node added successfully")
def LeaveCluster():
result[constants.NV_VGLIST] = ListVolumeGroups()
if constants.NV_VERSION in what:
- result[constants.NV_VERSION] = constants.PROTOCOL_VERSION
+ result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
+ constants.RELEASE_VERSION)
if constants.NV_HVINFO in what:
hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO])
result[constants.NV_HVINFO] = hyper.GetNodeInfo()
+ if constants.NV_DRBDLIST in what:
+ try:
+ used_minors = bdev.DRBD8.GetUsedDevs().keys()
+ except errors.BlockDeviceError:
+ logging.warning("Can't get used minors list", exc_info=True)
+ used_minors = []
+ result[constants.NV_DRBDLIST] = used_minors
+
return result
return output
def GetInstanceMigratable(instance):
  """Gives whether an instance can be migrated.

  @type instance: L{objects.Instance}
  @param instance: object representing the instance to be checked.

  @rtype: tuple
  @return: tuple of (result, description) where:
      - result: whether the instance can be migrated or not
      - description: a description of the issue, if relevant

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  # The instance must be running on this node, otherwise there is
  # nothing to migrate
  if instance.name not in hyper.ListInstances():
    return (False, 'not running')

  # Migration relies on the per-disk symlinks created at instance
  # startup; if any link is missing the instance was presumably last
  # started by a version that did not create them (see the message
  # below) and needs a restart first
  for idx in range(len(instance.disks)):
    link_name = _GetBlockDevSymlinkPath(instance.name, idx)
    if not os.path.islink(link_name):
      return (False, 'not restarted since ganeti 1.2.5')

  return (True, '')
+
+
def GetAllInstancesInfo(hypervisor_list):
"""Gather data about all instances.
logging.error("os create command '%s' returned error: %s, logfile: %s,"
" output: %s", result.cmd, result.fail_reason, logfile,
result.output)
- return False
+ lines = [utils.SafeEncode(val)
+ for val in utils.TailFile(logfile, lines=20)]
+ return (False, "OS create script failed (%s), last lines in the"
+ " log file:\n%s" % (result.fail_reason, "\n".join(lines)))
- return True
+ return (True, "Successfully installed")
def RunRenameInstance(instance, old_name):
if result.failed:
logging.error("os create command '%s' returned error: %s output: %s",
result.cmd, result.fail_reason, result.output)
- return False
+ lines = [utils.SafeEncode(val)
+ for val in utils.TailFile(logfile, lines=20)]
+ return (False, "OS rename script failed (%s), last lines in the"
+ " log file:\n%s" % (result.fail_reason, "\n".join(lines)))
- return True
+ return (True, "Rename successful")
def _GetVGInfo(vg_name):
return retdic
-def _GatherBlockDevs(instance):
def _GetBlockDevSymlinkPath(instance_name, idx):
  """Build the path of the symlink for an instance's disk.

  @type instance_name: str
  @param instance_name: the name of the instance owning the disk
  @type idx: int
  @param idx: the index of the disk within the instance
  @rtype: str
  @return: path under C{constants.DISK_LINKS_DIR}, named
      "<instance_name>:<idx>"

  """
  return os.path.join(constants.DISK_LINKS_DIR,
                      "%s:%d" % (instance_name, idx))
+
+
def _SymlinkBlockDev(instance_name, device_path, idx):
  """Set up symlinks to an instance's block device.

  This is an auxiliary function run when an instance is started (on the
  primary node) or when an instance is migrated (on the target node).

  @type instance_name: str
  @param instance_name: the name of the target instance
  @type device_path: str
  @param device_path: path of the physical block device, on the node
  @type idx: int
  @param idx: the disk index
  @return: absolute path to the disk's symlink

  """
  link_name = _GetBlockDevSymlinkPath(instance_name, idx)
  try:
    os.symlink(device_path, link_name)
  except OSError, err:
    if err.errno == errno.EEXIST:
      # a link already exists: keep it if it already points at the
      # right device, otherwise replace it
      if (not os.path.islink(link_name) or
          os.readlink(link_name) != device_path):
        os.remove(link_name)
        os.symlink(device_path, link_name)
    else:
      # any other OS error is unexpected, propagate it
      raise

  return link_name
+
+
def _RemoveBlockDevLinks(instance_name, disks):
  """Remove the block device symlinks belonging to the given instance.

  @type instance_name: str
  @param instance_name: the name of the instance whose links are removed
  @type disks: list of L{objects.Disk}
  @param disks: the disks whose symlinks should be removed (only their
      count/order is used to rebuild the link names)

  """
  for idx, disk in enumerate(disks):
    link_name = _GetBlockDevSymlinkPath(instance_name, idx)
    if os.path.islink(link_name):
      try:
        os.remove(link_name)
      except OSError:
        # best-effort cleanup: log the failure but keep removing the
        # remaining links
        logging.exception("Can't remove symlink '%s'", link_name)
+
+
+def _GatherAndLinkBlockDevs(instance):
"""Set up an instance's block device(s).
This is run on the primary node at instance startup. The block
@type instance: L{objects.Instance}
@param instance: the instance whose disks we should assemble
- @rtype: list of L{bdev.BlockDev}
- @return: list of the block devices
+ @rtype: list
+ @return: list of (disk_object, device_path)
"""
block_devices = []
- for disk in instance.disks:
+ for idx, disk in enumerate(instance.disks):
device = _RecursiveFindBD(disk)
if device is None:
raise errors.BlockDeviceError("Block device '%s' is not set up." %
str(disk))
device.Open()
- block_devices.append((disk, device))
+ try:
+ link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
+ except OSError, e:
+ raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
+ e.strerror)
+
+ block_devices.append((disk, link_name))
+
return block_devices
running_instances = GetInstanceList([instance.hypervisor])
if instance.name in running_instances:
- return True
-
- block_devices = _GatherBlockDevs(instance)
- hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ return (True, "Already running")
try:
+ block_devices = _GatherAndLinkBlockDevs(instance)
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
hyper.StartInstance(instance, block_devices, extra_args)
+ except errors.BlockDeviceError, err:
+ logging.exception("Failed to start instance")
+ return (False, "Block device error: %s" % str(err))
except errors.HypervisorError, err:
logging.exception("Failed to start instance")
- return False
+ _RemoveBlockDevLinks(instance.name, instance.disks)
+ return (False, "Hypervisor error: %s" % str(err))
- return True
+ return (True, "Instance started successfully")
def ShutdownInstance(instance):
try:
hyper.StopInstance(instance)
except errors.HypervisorError, err:
- logging.error("Failed to stop instance")
+ logging.error("Failed to stop instance: %s" % err)
return False
# test every 10secs for 2min
- shutdown_ok = False
time.sleep(1)
for dummy in range(11):
time.sleep(10)
else:
# the shutdown did not succeed
- logging.error("shutdown of '%s' unsuccessful, using destroy", instance)
+ logging.error("Shutdown of '%s' unsuccessful, using destroy",
+ instance.name)
try:
hyper.StopInstance(instance, force=True)
except errors.HypervisorError, err:
- logging.exception("Failed to stop instance")
+ logging.exception("Failed to stop instance: %s" % err)
return False
time.sleep(1)
if instance.name in GetInstanceList([hv_name]):
- logging.error("could not shutdown instance '%s' even by destroy",
+ logging.error("Could not shutdown instance '%s' even by destroy",
instance.name)
return False
+ _RemoveBlockDevLinks(instance.name, instance.disks)
+
return True
return True
def MigrationInfo(instance):
  """Gather information about an instance to be migrated.

  @type instance: L{objects.Instance}
  @param instance: the instance definition

  @rtype: tuple
  @return: (True, info) on success, where info is the
      hypervisor-specific migration data; (False, error message)
      on failure

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    info = hyper.MigrationInfo(instance)
  except errors.HypervisorError, err:
    msg = "Failed to fetch migration information"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  return (True, info)
+
+
+def AcceptInstance(instance, info, target):
+ """Prepare the node to accept an instance.
+
+ @type instance: L{objects.Instance}
+ @param instance: the instance definition
+ @type info: string/data (opaque)
+ @param info: migration information, from the source node
+ @type target: string
+ @param target: target host (usually ip), on this node
+
+ """
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
+ try:
+ hyper.AcceptInstance(instance, info, target)
+ except errors.HypervisorError, err:
+ msg = "Failed to accept instance"
+ logging.exception(msg)
+ return (False, '%s: %s' % (msg, err))
+ return (True, "Accept successfull")
+
+
def FinalizeMigration(instance, info, success):
  """Finalize any preparation to accept an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type success: boolean
  @param success: whether the migration was a success or a failure

  @rtype: tuple
  @return: (True, message) on success, (False, error message) on failure

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.FinalizeMigration(instance, info, success)
  except errors.HypervisorError, err:
    msg = "Failed to finalize migration"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  return (True, "Migration Finalized")
+
+
def MigrateInstance(instance, target, live):
"""Migrates an instance to another node.
- msg is a string with details in case of failure
"""
- hyper = hypervisor.GetHypervisor(instance.hypervisor_name)
+ hyper = hypervisor.GetHypervisor(instance.hypervisor)
try:
hyper.MigrateInstance(instance.name, target, live)
except errors.HypervisorError, err:
- msg = "Failed to migrate instance: %s" % str(err)
- logging.error(msg)
- return (False, msg)
+ msg = "Failed to migrate instance"
+ logging.exception(msg)
+ return (False, "%s: %s" % (msg, err))
return (True, "Migration successfull")
# be assembled
crdev.Open()
clist.append(crdev)
+
try:
- device = bdev.FindDevice(disk.dev_type, disk.physical_id, clist)
- if device is not None:
- logging.info("removing existing device %s", disk)
- device.Remove()
- except errors.BlockDeviceError, err:
- pass
+ device = bdev.Create(disk.dev_type, disk.physical_id, clist, size)
+ except errors.GenericError, err:
+ return False, "Can't create block device: %s" % str(err)
- device = bdev.Create(disk.dev_type, disk.physical_id,
- clist, size)
- if device is None:
- raise ValueError("Can't create child device for %s, %s" %
- (disk, size))
if on_primary or disk.AssembleOnSecondary():
if not device.Assemble():
- errorstring = "Can't assemble device after creation"
+ errorstring = "Can't assemble device after creation, very unusual event"
logging.error(errorstring)
- raise errors.BlockDeviceError("%s, very unusual event - check the node"
- " daemon logs" % errorstring)
+ return False, errorstring
device.SetSyncSpeed(constants.SYNC_SPEED)
if on_primary or disk.OpenOnSecondary():
device.Open(force=True)
device.SetInfo(info)
physical_id = device.unique_id
- return physical_id
+ return True, physical_id
def RemoveBlockDevice(disk):
"""
try:
- # since we are removing the device, allow a partial match
- # this allows removal of broken mirrors
- rdev = _RecursiveFindBD(disk, allow_partial=True)
+ rdev = _RecursiveFindBD(disk)
except errors.BlockDeviceError, err:
# probably can't attach
logging.info("Can't attach to device %s in remove", disk)
children.append(cdev)
if as_primary or disk.AssembleOnSecondary():
- r_dev = bdev.AttachOrAssemble(disk.dev_type, disk.physical_id, children)
+ r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children)
r_dev.SetSyncSpeed(constants.SYNC_SPEED)
result = r_dev
if as_primary or disk.OpenOnSecondary():
@return: the success of the operation
"""
- parent_bdev = _RecursiveFindBD(parent_cdev, allow_partial=True)
+ parent_bdev = _RecursiveFindBD(parent_cdev)
if parent_bdev is None:
logging.error("Can't find parent device")
return False
return stats
-def _RecursiveFindBD(disk, allow_partial=False):
+def _RecursiveFindBD(disk):
"""Check if a device is activated.
If so, return information about the real device.
@type disk: L{objects.Disk}
@param disk: the disk object we need to find
- @type allow_partial: boolean
- @param allow_partial: if true, don't abort the find if a
- child of the device can't be found; this is intended
- to be used when repairing mirrors
@return: None if the device can't be found,
otherwise the device instance
result = {}
result['OS_API_VERSION'] = '%d' % constants.OS_API_VERSION
result['INSTANCE_NAME'] = instance.name
+ result['INSTANCE_OS'] = instance.os
result['HYPERVISOR'] = instance.hypervisor
result['DISK_COUNT'] = '%d' % len(instance.disks)
result['NIC_COUNT'] = '%d' % len(instance.nics)
real_disk.Open()
result['DISK_%d_PATH' % idx] = real_disk.dev_path
# FIXME: When disks will have read-only mode, populate this
- result['DISK_%d_ACCESS' % idx] = 'W'
+ result['DISK_%d_ACCESS' % idx] = disk.mode
if constants.HV_DISK_TYPE in instance.hvparams:
result['DISK_%d_FRONTEND_TYPE' % idx] = \
instance.hvparams[constants.HV_DISK_TYPE]
instance.beparams[constants.BE_VCPUS])
config.set(constants.INISECT_INS, 'disk_template', instance.disk_template)
- nic_count = 0
+ nic_total = 0
for nic_count, nic in enumerate(instance.nics):
+ nic_total += 1
config.set(constants.INISECT_INS, 'nic%d_mac' %
nic_count, '%s' % nic.mac)
config.set(constants.INISECT_INS, 'nic%d_ip' % nic_count, '%s' % nic.ip)
config.set(constants.INISECT_INS, 'nic%d_bridge' % nic_count,
'%s' % nic.bridge)
# TODO: redundant: on load can read nics until it doesn't exist
- config.set(constants.INISECT_INS, 'nic_count' , '%d' % nic_count)
+ config.set(constants.INISECT_INS, 'nic_count' , '%d' % nic_total)
disk_total = 0
for disk_count, disk in enumerate(snap_disks):
if not (_IsJobQueueFile(old) and _IsJobQueueFile(new)):
return False
- os.rename(old, new)
+ utils.RenameFile(old, new, mkdir=True)
return True
return True
-def CloseBlockDevices(disks):
+def CloseBlockDevices(instance_name, disks):
"""Closes the given block devices.
This means they will be switched to secondary mode (in case of
DRBD).
+ @param instance_name: if the argument is not empty, the symlinks
+ of this instance will be removed
@type disks: list of L{objects.Disk}
@param disks: the list of disks to be closed
@rtype: tuple (success, message)
if msg:
return (False, "Can't make devices secondary: %s" % ",".join(msg))
else:
+ if instance_name:
+ _RemoveBlockDevLinks(instance_name, disks)
return (True, "All devices secondary")
return (True, "Done")
def _FindDisks(nodes_ip, disks):
  """Sets the physical ID on disks and returns the block devices.

  @param nodes_ip: mapping used by L{objects.Disk.SetPhysicalID} to
      resolve node names to IPs
  @type disks: list of L{objects.Disk}
  @param disks: the disks to locate on this node
  @rtype: tuple
  @return: (True, list of block devices) on success,
      (False, error message) if any disk cannot be found

  """
  # set the correct physical ID so that _RecursiveFindBD can match the
  # devices as configured for this node
  my_name = utils.HostInfo().name
  for cf in disks:
    cf.SetPhysicalID(my_name, nodes_ip)

  bdevs = []

  for cf in disks:
    rd = _RecursiveFindBD(cf)
    if rd is None:
      return (False, "Can't find device %s" % cf)
    bdevs.append(rd)
  return (True, bdevs)
+
+
def DrbdDisconnectNet(nodes_ip, disks):
  """Disconnects the network on a list of drbd devices.

  @param nodes_ip: node name to IP mapping, passed to L{_FindDisks}
  @type disks: list of L{objects.Disk}
  @param disks: the drbd disks to put into standalone mode
  @rtype: tuple
  @return: (True, message) on success, (False, error message) otherwise

  """
  status, bdevs = _FindDisks(nodes_ip, disks)
  if not status:
    # bdevs holds the error message in this case
    return status, bdevs

  # disconnect disks
  for rd in bdevs:
    try:
      rd.DisconnectNet()
    except errors.BlockDeviceError, err:
      logging.exception("Failed to go into standalone mode")
      return (False, "Can't change network configuration: %s" % str(err))
  return (True, "All disks are now disconnected")
+
+
def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
  """Attaches the network on a list of drbd devices.

  @param nodes_ip: node name to IP mapping, passed to L{_FindDisks}
  @type disks: list of L{objects.Disk}
  @param disks: the drbd disks to reconnect
  @type instance_name: str
  @param instance_name: the instance owning the disks; used to create
      the block device symlinks in multimaster mode
  @type multimaster: boolean
  @param multimaster: if True, configure the disks as multi-master and
      open them in primary mode (used during live migration)
  @rtype: tuple
  @return: (True, message) on success, (False, error message) otherwise

  """
  status, bdevs = _FindDisks(nodes_ip, disks)
  if not status:
    # bdevs holds the error message in this case
    return status, bdevs

  if multimaster:
    # the target node of a migration will start the instance from these
    # symlinks, so they must exist before the disks go primary
    for idx, rd in enumerate(bdevs):
      try:
        _SymlinkBlockDev(instance_name, rd.dev_path, idx)
      except EnvironmentError, err:
        return (False, "Can't create symlink: %s" % str(err))
  # reconnect disks, switch to new master configuration and if
  # needed primary mode
  for rd in bdevs:
    try:
      rd.AttachNet(multimaster)
    except errors.BlockDeviceError, err:
      return (False, "Can't change network configuration: %s" % str(err))
  # wait until the disks are connected; we need to retry the re-attach
  # if the device becomes standalone, as this might happen if the one
  # node disconnects and reconnects in a different mode before the
  # other node reconnects; in this case, one or both of the nodes will
  # decide it has wrong configuration and switch to standalone
  RECONNECT_TIMEOUT = 2 * 60
  sleep_time = 0.100 # start with 100 milliseconds
  timeout_limit = time.time() + RECONNECT_TIMEOUT
  while time.time() < timeout_limit:
    all_connected = True
    for rd in bdevs:
      stats = rd.GetProcStatus()
      if not (stats.is_connected or stats.is_in_resync):
        all_connected = False
      if stats.is_standalone:
        # peer had different config info and this node became
        # standalone, even though this should not happen with the
        # new staged way of changing disk configs
        try:
          rd.ReAttachNet(multimaster)
        except errors.BlockDeviceError, err:
          return (False, "Can't change network configuration: %s" % str(err))
    if all_connected:
      break
    # exponential backoff, capped at 5 seconds per iteration
    time.sleep(sleep_time)
    sleep_time = min(5, sleep_time * 1.5)
  if not all_connected:
    return (False, "Timeout in disk reconnecting")
  if multimaster:
    # change to primary mode
    for rd in bdevs:
      rd.Open()
  if multimaster:
    msg = "multi-master and primary"
  else:
    msg = "single-master"
  return (True, "Disks are now configured as %s" % msg)
+
+
def DrbdWaitSync(nodes_ip, disks):
  """Wait until DRBDs have synchronized.

  Note: despite the name this function does a single poll of the disks'
  status; the caller is expected to repeat the call until done.

  @param nodes_ip: node name to IP mapping, passed to L{_FindDisks}
  @type disks: list of L{objects.Disk}
  @param disks: the drbd disks to check
  @rtype: tuple
  @return: (False, error) if any disk is neither connected nor in
      resync; otherwise (True, (alldone, min_resync)) where alldone
      tells whether all disks finished syncing and min_resync is the
      lowest sync percentage seen (100 if none reported)

  """
  status, bdevs = _FindDisks(nodes_ip, disks)
  if not status:
    # bdevs holds the error message in this case
    return status, bdevs

  min_resync = 100
  alldone = True
  failure = False
  for rd in bdevs:
    stats = rd.GetProcStatus()
    if not (stats.is_connected or stats.is_in_resync):
      # a disconnected disk will never make progress, report failure
      failure = True
      break
    alldone = alldone and (not stats.is_in_resync)
    if stats.sync_percent is not None:
      min_resync = min(min_resync, stats.sync_percent)
  return (not failure, (alldone, min_resync))
+
+
class HooksRunner(object):
"""Hook runner.
#logging.exception("Error while closing fd %s", fd)
pass
- return result == 0, output
+ return result == 0, utils.SafeEncode(output.strip())
def RunHooks(self, hpath, phase, env):
"""Run the scripts in the hooks directory.