X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/cacfd1fdd4a9fade7526d5da9de72ec147f27161..9205a895ac0b5d60b881458a44bb157b88eac21d:/lib/backend.py diff --git a/lib/backend.py b/lib/backend.py index e56628e..ae07ae1 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -437,9 +437,9 @@ def VerifyNode(what, cluster_name): if constants.NV_DRBDLIST in what: try: used_minors = bdev.DRBD8.GetUsedDevs().keys() - except errors.BlockDeviceError: + except errors.BlockDeviceError, err: logging.warning("Can't get used minors list", exc_info=True) - used_minors = [] + used_minors = str(err) result[constants.NV_DRBDLIST] = used_minors return result @@ -575,7 +575,6 @@ def GetInstanceList(hypervisor_list): results.extend(names) except errors.HypervisorError, err: logging.exception("Error enumerating instances for hypevisor %s", hname) - # FIXME: should we somehow not propagate this to the master? raise return results @@ -661,19 +660,26 @@ def GetAllInstancesInfo(hypervisor_list): 'state': state, 'time': times, } - if name in output and output[name] != value: - raise errors.HypervisorError("Instance %s running duplicate" - " with different parameters" % name) + if name in output: + # we only check static parameters, like memory and vcpus, + # and not state and time which can change between the + # invocations of the different hypervisors + for key in 'memory', 'vcpus': + if value[key] != output[name][key]: + raise errors.HypervisorError("Instance %s is running twice" + " with different parameters" % name) output[name] = value return output -def InstanceOsAdd(instance): +def InstanceOsAdd(instance, reinstall): """Add an OS to an instance. @type instance: L{objects.Instance} @param instance: Instance whose OS is to be installed + @type reinstall: boolean + @param reinstall: whether this is an instance reinstall @rtype: boolean @return: the success of the operation @@ -689,6 +695,8 @@ def InstanceOsAdd(instance): (os_name, os_dir, os_err)) create_env = OSEnvironment(instance) + if reinstall: + create_env['INSTANCE_REINSTALL'] = "1" logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os, instance.name, int(time.time())) @@ -857,7 +865,7 @@ def _GatherAndLinkBlockDevs(instance): return block_devices -def StartInstance(instance, extra_args): +def StartInstance(instance): """Start an instance. @type instance: L{objects.Instance} @@ -874,7 +882,7 @@ def StartInstance(instance, extra_args): try: block_devices = _GatherAndLinkBlockDevs(instance) hyper = hypervisor.GetHypervisor(instance.hypervisor) - hyper.StartInstance(instance, block_devices, extra_args) + hyper.StartInstance(instance, block_devices) except errors.BlockDeviceError, err: logging.exception("Failed to start instance") return (False, "Block device error: %s" % str(err)) @@ -886,7 +894,7 @@ def StartInstance(instance, extra_args): return (True, "Instance started successfully") -def ShutdownInstance(instance): +def InstanceShutdown(instance): """Shut an instance down. @note: this functions uses polling with a hardcoded timeout. @@ -901,14 +909,15 @@ def ShutdownInstance(instance): running_instances = GetInstanceList([hv_name]) if instance.name not in running_instances: - return True + return (True, "Instance already stopped") hyper = hypervisor.GetHypervisor(hv_name) try: hyper.StopInstance(instance) except errors.HypervisorError, err: - logging.error("Failed to stop instance: %s" % err) - return False + msg = "Failed to stop instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) # test every 10secs for 2min @@ -925,21 +934,23 @@ def ShutdownInstance(instance): try: hyper.StopInstance(instance, force=True) except errors.HypervisorError, err: - logging.exception("Failed to stop instance: %s" % err) - return False + msg = "Failed to force stop instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) time.sleep(1) if instance.name in GetInstanceList([hv_name]): - logging.error("Could not shutdown instance '%s' even by destroy", - instance.name) - return False + msg = ("Could not shutdown instance %s even by destroy" % + instance.name) + logging.error(msg) + return (False, msg) _RemoveBlockDevLinks(instance.name, instance.disks) - return True + return (True, "Instance has been shutdown successfully") -def RebootInstance(instance, reboot_type, extra_args): +def InstanceReboot(instance, reboot_type): """Reboot an instance. @type instance: L{objects.Instance} @@ -961,27 +972,32 @@ def RebootInstance(instance, reboot_type, extra_args): running_instances = GetInstanceList([instance.hypervisor]) if instance.name not in running_instances: - logging.error("Cannot reboot instance that is not running") - return False + msg = "Cannot reboot instance %s that is not running" % instance.name + logging.error(msg) + return (False, msg) hyper = hypervisor.GetHypervisor(instance.hypervisor) if reboot_type == constants.INSTANCE_REBOOT_SOFT: try: hyper.RebootInstance(instance) except errors.HypervisorError, err: - logging.exception("Failed to soft reboot instance") - return False + msg = "Failed to soft reboot instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) elif reboot_type == constants.INSTANCE_REBOOT_HARD: try: - ShutdownInstance(instance) - StartInstance(instance, extra_args) + stop_result = InstanceShutdown(instance) + if not stop_result[0]: + return stop_result + return StartInstance(instance) except errors.HypervisorError, err: - logging.exception("Failed to hard reboot instance") - return False + msg = "Failed to hard reboot instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) else: - raise errors.ParameterError("reboot_type invalid") + return (False, "Invalid reboot_type received: %s" % (reboot_type,)) - return True + return (True, "Reboot successful") def MigrationInfo(instance): @@ -1094,26 +1110,45 @@ def BlockdevCreate(disk, size, owner, on_primary, info): clist = [] if disk.children: for child in disk.children: - crdev = _RecursiveAssembleBD(child, owner, on_primary) + try: + crdev = _RecursiveAssembleBD(child, owner, on_primary) + except errors.BlockDeviceError, err: + errmsg = "Can't assemble device %s: %s" % (child, err) + logging.error(errmsg) + return False, errmsg if on_primary or disk.AssembleOnSecondary(): # we need the children open in case the device itself has to # be assembled - crdev.Open() + try: + crdev.Open() + except errors.BlockDeviceError, err: + errmsg = "Can't make child '%s' read-write: %s" % (child, err) + logging.error(errmsg) + return False, errmsg clist.append(crdev) try: device = bdev.Create(disk.dev_type, disk.physical_id, clist, size) - except errors.GenericError, err: + except errors.BlockDeviceError, err: return False, "Can't create block device: %s" % str(err) if on_primary or disk.AssembleOnSecondary(): - if not device.Assemble(): - errorstring = "Can't assemble device after creation, very unusual event" - logging.error(errorstring) - return False, errorstring + try: + device.Assemble() + except errors.BlockDeviceError, err: + errmsg = ("Can't assemble device after creation, very" + " unusual event: %s" % str(err)) + logging.error(errmsg) + return False, errmsg device.SetSyncSpeed(constants.SYNC_SPEED) if on_primary or disk.OpenOnSecondary(): - device.Open(force=True) + try: + device.Open(force=True) + except errors.BlockDeviceError, err: + errmsg = ("Can't make device r/w after creation, very" + " unusual event: %s" % str(err)) + logging.error(errmsg) + return False, errmsg DevCacheManager.UpdateCache(device.dev_path, owner, on_primary, disk.iv_name) @@ -1134,6 +1169,8 @@ def BlockdevRemove(disk): @return: the success of the operation """ + msgs = [] + result = True try: rdev = _RecursiveFindBD(disk) except errors.BlockDeviceError, err: @@ -1142,15 +1179,22 @@ def BlockdevRemove(disk): rdev = None if rdev is not None: r_path = rdev.dev_path - result = rdev.Remove() + try: + rdev.Remove() + except errors.BlockDeviceError, err: + msgs.append(str(err)) + result = False if result: DevCacheManager.RemoveCache(r_path) - else: - result = True + if disk.children: for child in disk.children: - result = result and BlockdevRemove(child) - return result + c_status, c_msg = BlockdevRemove(child) + result = result and c_status + if c_msg: # not an empty message + msgs.append(c_msg) + + return (result, "; ".join(msgs)) def _RecursiveAssembleBD(disk, owner, as_primary): @@ -1189,7 +1233,8 @@ def _RecursiveAssembleBD(disk, owner, as_primary): if children.count(None) >= mcn: raise cdev = None - logging.error("Error in child activation: %s", str(err)) + logging.error("Error in child activation (but continuing): %s", + str(err)) children.append(cdev) if as_primary or disk.AssembleOnSecondary(): @@ -1216,17 +1261,15 @@ def BlockdevAssemble(disk, owner, as_primary): C{True} for secondary nodes """ - status = False + status = True result = "no error information" try: result = _RecursiveAssembleBD(disk, owner, as_primary) if isinstance(result, bdev.BlockDev): result = result.dev_path - status = True - if result == True: - status = True except errors.BlockDeviceError, err: result = "Error while assembling disk: %s" % str(err) + status = False return (status, result) @@ -1249,24 +1292,24 @@ def BlockdevShutdown(disk): """ msgs = [] + result = True r_dev = _RecursiveFindBD(disk) if r_dev is not None: r_path = r_dev.dev_path try: - result = r_dev.Shutdown() + r_dev.Shutdown() + DevCacheManager.RemoveCache(r_path) except errors.BlockDeviceError, err: msgs.append(str(err)) result = False - if result: - DevCacheManager.RemoveCache(r_path) - else: - result = True + if disk.children: for child in disk.children: c_status, c_msg = BlockdevShutdown(child) result = result and c_status if c_msg: # not an empty message msgs.append(c_msg) + return (result, "; ".join(msgs)) @@ -1283,15 +1326,16 @@ def BlockdevAddchildren(parent_cdev, new_cdevs): """ parent_bdev = _RecursiveFindBD(parent_cdev) if parent_bdev is None: - logging.error("Can't find parent device") - return False + msg = "Can't find parent device '%s' in add children" % str(parent_cdev) + logging.error("BlockdevAddchildren: %s", msg) + return (False, msg) new_bdevs = [_RecursiveFindBD(disk) for disk in new_cdevs] if new_bdevs.count(None) > 0: - logging.error("Can't find new device(s) to add: %s:%s", - new_bdevs, new_cdevs) - return False + msg = "Can't find new device(s) to add: %s:%s" % (new_bdevs, new_cdevs) + logging.error(msg) + return (False, msg) parent_bdev.AddChildren(new_bdevs) - return True + return (True, None) def BlockdevRemovechildren(parent_cdev, new_cdevs): @@ -1307,23 +1351,24 @@ def BlockdevRemovechildren(parent_cdev, new_cdevs): """ parent_bdev = _RecursiveFindBD(parent_cdev) if parent_bdev is None: - logging.error("Can't find parent in remove children: %s", parent_cdev) - return False + msg = "Can't find parent device '%s' in remove children" % str(parent_cdev) + logging.error(msg) + return (False, msg) devs = [] for disk in new_cdevs: rpath = disk.StaticDevPath() if rpath is None: bd = _RecursiveFindBD(disk) if bd is None: - logging.error("Can't find dynamic device %s while removing children", - disk) - return False + msg = "Can't find device %s while removing children" % (disk,) + logging.error(msg) + return (False, msg) else: devs.append(bd.dev_path) else: devs.append(rpath) parent_bdev.RemoveChildren(devs) - return True + return (True, None) def BlockdevGetmirrorstatus(disks): @@ -1416,27 +1461,34 @@ def UploadFile(file_name, data, mode, uid, gid, atime, mtime): """ if not os.path.isabs(file_name): - logging.error("Filename passed to UploadFile is not absolute: '%s'", - file_name) - return False + err = "Filename passed to UploadFile is not absolute: '%s'" % file_name + logging.error(err) + return (False, err) - allowed_files = [ + allowed_files = set([ constants.CLUSTER_CONF_FILE, constants.ETC_HOSTS, constants.SSH_KNOWN_HOSTS_FILE, constants.VNC_PASSWORD_FILE, - ] + constants.RAPI_CERT_FILE, + constants.RAPI_USERS_FILE, + ]) + + for hv_name in constants.HYPER_TYPES: + hv_class = hypervisor.GetHypervisor(hv_name) + allowed_files.update(hv_class.GetAncillaryFiles()) if file_name not in allowed_files: - logging.error("Filename passed to UploadFile not in allowed" - " upload targets: '%s'", file_name) - return False + err = "Filename passed to UploadFile not in allowed upload targets: '%s'" \ + % file_name + logging.error(err) + return (False, err) raw_data = _Decompress(data) utils.WriteFile(file_name, data=raw_data, mode=mode, uid=uid, gid=gid, atime=atime, mtime=mtime) - return True + return (True, "success") def WriteSsconfFiles(values): @@ -1633,7 +1685,6 @@ def OSEnvironment(instance, debug=0): str(disk)) real_disk.Open() result['DISK_%d_PATH' % idx] = real_disk.dev_path - # FIXME: When disks will have read-only mode, populate this result['DISK_%d_ACCESS' % idx] = disk.mode if constants.HV_DISK_TYPE in instance.hvparams: result['DISK_%d_FRONTEND_TYPE' % idx] = \ @@ -1964,10 +2015,12 @@ def BlockdevRename(devlist): @return: True if all renames succeeded, False otherwise """ + msgs = [] result = True for disk, unique_id in devlist: dev = _RecursiveFindBD(disk) if dev is None: + msgs.append("Can't find device %s in rename" % str(disk)) result = False continue try: @@ -1982,9 +2035,11 @@ def BlockdevRename(devlist): # cache? for now, we only lose lvm data when we rename, which # is less critical than DRBD or MD except errors.BlockDeviceError, err: + msgs.append("Can't rename device '%s' to '%s': %s" % + (dev, unique_id, err)) logging.exception("Can't rename device '%s' to '%s'", dev, unique_id) result = False - return result + return (result, "; ".join(msgs)) def _TransformFileStorageDir(file_storage_dir): @@ -2395,6 +2450,25 @@ def DrbdWaitSync(nodes_ip, disks): return (not failure, (alldone, min_resync)) +def PowercycleNode(hypervisor_type): + """Hard-powercycle the node. + + Because we need to return first, and schedule the powercycle in the + background, we won't be able to report failures nicely. + + """ + hyper = hypervisor.GetHypervisor(hypervisor_type) + try: + pid = os.fork() + except OSError, err: + # if we can't fork, we'll pretend that we're in the child process + pid = 0 + if pid > 0: + return (True, "Reboot scheduled in 5 seconds") + time.sleep(5) + hyper.PowercycleNode() + + class HooksRunner(object): """Hook runner.