X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/3c9c571da2a639cbd5cb01a18db447a085c345a2..4fe80ef2ed1cda3a6357274eccafe5c1f21a5283:/lib/backend.py diff --git a/lib/backend.py b/lib/backend.py index 419b9a7..44e45f6 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -19,7 +19,12 @@ # 02110-1301, USA. -"""Functions used by the node daemon""" +"""Functions used by the node daemon + +@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in + the L{UploadFile} function + +""" import os @@ -115,6 +120,23 @@ def _CleanDirectory(path, exclude=None): utils.RemoveFile(full_name) +def _BuildUploadFileList(): + """Build the list of allowed upload files. + + This is abstracted so that it's built only once at module import time. + + """ + return frozenset([ + constants.CLUSTER_CONF_FILE, + constants.ETC_HOSTS, + constants.SSH_KNOWN_HOSTS_FILE, + constants.VNC_PASSWORD_FILE, + ]) + + +_ALLOWED_UPLOAD_FILES = _BuildUploadFileList() + + def JobQueuePurge(): """Removes job queue files and archived jobs. @@ -141,13 +163,13 @@ def GetMasterInfo(): master_netdev = cfg.GetMasterNetdev() master_ip = cfg.GetMasterIP() master_node = cfg.GetMasterNode() - except errors.ConfigurationError, err: + except errors.ConfigurationError: logging.exception("Cluster configuration incomplete") return (None, None, None) return (master_netdev, master_ip, master_node) -def StartMaster(start_daemons): +def StartMaster(start_daemons, no_voting): """Activate local node as master node. The function will always try activate the IP address of the master @@ -157,6 +179,9 @@ def StartMaster(start_daemons): @type start_daemons: boolean @param start_daemons: whther to also start the master daemons (ganeti-masterd and ganeti-rapi) + @type no_voting: boolean + @param no_voting: whether to start ganeti-masterd without a node vote + (if start_daemons is True), but still non-interactively @rtype: None """ @@ -186,8 +211,17 @@ def StartMaster(start_daemons): # and now start the master and rapi daemons if start_daemons: - for daemon in 'ganeti-masterd', 'ganeti-rapi': - result = utils.RunCmd([daemon]) + daemons_params = { + 'ganeti-masterd': [], + 'ganeti-rapi': [], + } + if no_voting: + daemons_params['ganeti-masterd'].append('--no-voting') + daemons_params['ganeti-masterd'].append('--yes-do-it') + for daemon in daemons_params: + cmd = [daemon] + cmd.extend(daemons_params[daemon]) + result = utils.RunCmd(cmd) if result.failed: logging.error("Can't start daemon %s: %s", daemon, result.output) ok = False @@ -260,8 +294,9 @@ def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub): priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS, mkdir=True) except errors.OpExecError, err: - logging.exception("Error while processing user ssh files") - return False + msg = "Error while processing user ssh files" + logging.exception(msg) + return (False, "%s: %s" % (msg, err)) for name, content in [(priv_key, sshkey), (pub_key, sshpub)]: utils.WriteFile(name, data=content, mode=0600) @@ -270,7 +305,7 @@ def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub): utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"]) - return True + return (True, "Node added successfully") def LeaveCluster(): @@ -307,7 +342,7 @@ def LeaveCluster(): def GetNodeInfo(vgname, hypervisor_type): - """Gives back a hash with different informations about the node. + """Gives back a hash with different information about the node. @type vgname: C{string} @param vgname: the name of the volume group to ask for disk space information @@ -426,12 +461,21 @@ def VerifyNode(what, cluster_name): result[constants.NV_VGLIST] = ListVolumeGroups() if constants.NV_VERSION in what: - result[constants.NV_VERSION] = constants.PROTOCOL_VERSION + result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION, + constants.RELEASE_VERSION) if constants.NV_HVINFO in what: hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO]) result[constants.NV_HVINFO] = hyper.GetNodeInfo() + if constants.NV_DRBDLIST in what: + try: + used_minors = bdev.DRBD8.GetUsedDevs().keys() + except errors.BlockDeviceError, err: + logging.warning("Can't get used minors list", exc_info=True) + used_minors = str(err) + result[constants.NV_DRBDLIST] = used_minors + return result @@ -563,16 +607,15 @@ def GetInstanceList(hypervisor_list): try: names = hypervisor.GetHypervisor(hname).ListInstances() results.extend(names) - except errors.HypervisorError, err: + except errors.HypervisorError: logging.exception("Error enumerating instances for hypevisor %s", hname) - # FIXME: should we somehow not propagate this to the master? raise return results def GetInstanceInfo(instance, hname): - """Gives back the informations about an instance as a dictionary. + """Gives back the information about an instance as a dictionary. @type instance: string @param instance: the instance name @@ -597,6 +640,30 @@ def GetInstanceInfo(instance, hname): return output +def GetInstanceMigratable(instance): + """Gives whether an instance can be migrated. + + @type instance: L{objects.Instance} + @param instance: object representing the instance to be checked. + + @rtype: tuple + @return: tuple of (result, description) where: + - result: whether the instance can be migrated or not + - description: a description of the issue, if relevant + + """ + hyper = hypervisor.GetHypervisor(instance.hypervisor) + if instance.name not in hyper.ListInstances(): + return (False, 'not running') + + for idx in range(len(instance.disks)): + link_name = _GetBlockDevSymlinkPath(instance.name, idx) + if not os.path.islink(link_name): + return (False, 'not restarted since ganeti 1.2.5') + + return (True, '') + + def GetAllInstancesInfo(hypervisor_list): """Gather data about all instances. @@ -627,15 +694,20 @@ def GetAllInstancesInfo(hypervisor_list): 'state': state, 'time': times, } - if name in output and output[name] != value: - raise errors.HypervisorError("Instance %s running duplicate" - " with different parameters" % name) + if name in output: + # we only check static parameters, like memory and vcpus, + # and not state and time which can change between the + # invocations of the different hypervisors + for key in 'memory', 'vcpus': + if value[key] != output[name][key]: + raise errors.HypervisorError("Instance %s is running twice" + " with different parameters" % name) output[name] = value return output -def AddOSToInstance(instance): +def InstanceOsAdd(instance): """Add an OS to an instance. @type instance: L{objects.Instance} @@ -644,7 +716,15 @@ def AddOSToInstance(instance): @return: the success of the operation """ - inst_os = OSFromDisk(instance.os) + try: + inst_os = OSFromDisk(instance.os) + except errors.InvalidOS, err: + os_name, os_dir, os_err = err.args + if os_dir is None: + return (False, "Can't find OS '%s': %s" % (os_name, os_err)) + else: + return (False, "Error parsing OS '%s' in directory %s: %s" % + (os_name, os_dir, os_err)) create_env = OSEnvironment(instance) @@ -657,9 +737,12 @@ def AddOSToInstance(instance): logging.error("os create command '%s' returned error: %s, logfile: %s," " output: %s", result.cmd, result.fail_reason, logfile, result.output) - return False + lines = [utils.SafeEncode(val) + for val in utils.TailFile(logfile, lines=20)] + return (False, "OS create script failed (%s), last lines in the" + " log file:\n%s" % (result.fail_reason, "\n".join(lines))) - return True + return (True, "Successfully installed") def RunRenameInstance(instance, old_name): @@ -688,13 +771,16 @@ def RunRenameInstance(instance, old_name): if result.failed: logging.error("os create command '%s' returned error: %s output: %s", result.cmd, result.fail_reason, result.output) - return False + lines = [utils.SafeEncode(val) + for val in utils.TailFile(logfile, lines=20)] + return (False, "OS rename script failed (%s), last lines in the" + " log file:\n%s" % (result.fail_reason, "\n".join(lines))) - return True + return (True, "Rename successful") def _GetVGInfo(vg_name): - """Get informations about the volume group. + """Get information about the volume group. @type vg_name: str @param vg_name: the volume group which we query @@ -733,27 +819,29 @@ def _GetVGInfo(vg_name): return retdic -def _SymlinkBlockDev(instance_name, device_path, device_name): +def _GetBlockDevSymlinkPath(instance_name, idx): + return os.path.join(constants.DISK_LINKS_DIR, + "%s:%d" % (instance_name, idx)) + + +def _SymlinkBlockDev(instance_name, device_path, idx): """Set up symlinks to a instance's block device. This is an auxiliary function run when an instance is start (on the primary node) or when an instance is migrated (on the target node). - Args: - instance_name: the name of the target instance - device_path: path of the physical block device, on the node - device_name: 'virtual' name of the device - Returns: - absolute path to the disk's symlink + @param instance_name: the name of the target instance + @param device_path: path of the physical block device, on the node + @param idx: the disk index + @return: absolute path to the disk's symlink """ - link_basename = "%s-%s" % (instance_name, device_name) - link_name = os.path.join(constants.DISK_LINKS_DIR, link_basename) + link_name = _GetBlockDevSymlinkPath(instance_name, idx) try: os.symlink(device_path, link_name) - except OSError, e: - if e.errno == errno.EEXIST: + except OSError, err: + if err.errno == errno.EEXIST: if (not os.path.islink(link_name) or os.readlink(link_name) != device_path): os.remove(link_name) @@ -764,16 +852,17 @@ def _SymlinkBlockDev(instance_name, device_path, device_name): return link_name -def _RemoveBlockDevLinks(instance_name): +def _RemoveBlockDevLinks(instance_name, disks): """Remove the block device symlinks belonging to the given instance. """ - for i in os.listdir(constants.DISK_LINKS_DIR): - if os.path.islink(i) and i.startswith('%s-' % instance_name): + for idx, disk in enumerate(disks): + link_name = _GetBlockDevSymlinkPath(instance_name, idx) + if os.path.islink(link_name): try: - os.remove(link) - except OSError, e: - pass # Ignore errors when removing the symlinks + os.remove(link_name) + except OSError: + logging.exception("Can't remove symlink '%s'", link_name) def _GatherAndLinkBlockDevs(instance): @@ -796,8 +885,7 @@ def _GatherAndLinkBlockDevs(instance): str(disk)) device.Open() try: - link_name = _SymlinkBlockDev(instance.name, device.dev_path, - "disk%d" % idx) + link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx) except OSError, e: raise errors.BlockDeviceError("Cannot create block device symlink: %s" % e.strerror) @@ -807,7 +895,7 @@ def _GatherAndLinkBlockDevs(instance): return block_devices -def StartInstance(instance, extra_args): +def StartInstance(instance): """Start an instance. @type instance: L{objects.Instance} @@ -819,24 +907,24 @@ def StartInstance(instance, extra_args): running_instances = GetInstanceList([instance.hypervisor]) if instance.name in running_instances: - return True + return (True, "Already running") try: block_devices = _GatherAndLinkBlockDevs(instance) hyper = hypervisor.GetHypervisor(instance.hypervisor) - hyper.StartInstance(instance, block_devices, extra_args) + hyper.StartInstance(instance, block_devices) except errors.BlockDeviceError, err: logging.exception("Failed to start instance") - return False + return (False, "Block device error: %s" % str(err)) except errors.HypervisorError, err: logging.exception("Failed to start instance") - _RemoveBlockDevLinks(instance.name) - return False + _RemoveBlockDevLinks(instance.name, instance.disks) + return (False, "Hypervisor error: %s" % str(err)) - return True + return (True, "Instance started successfully") -def ShutdownInstance(instance): +def InstanceShutdown(instance): """Shut an instance down. @note: this functions uses polling with a hardcoded timeout. @@ -851,44 +939,48 @@ def ShutdownInstance(instance): running_instances = GetInstanceList([hv_name]) if instance.name not in running_instances: - return True + return (True, "Instance already stopped") hyper = hypervisor.GetHypervisor(hv_name) try: hyper.StopInstance(instance) except errors.HypervisorError, err: - logging.error("Failed to stop instance") - return False + msg = "Failed to stop instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) # test every 10secs for 2min time.sleep(1) - for dummy in range(11): + for _ in range(11): if instance.name not in GetInstanceList([hv_name]): break time.sleep(10) else: # the shutdown did not succeed - logging.error("shutdown of '%s' unsuccessful, using destroy", instance) + logging.error("Shutdown of '%s' unsuccessful, using destroy", + instance.name) try: hyper.StopInstance(instance, force=True) except errors.HypervisorError, err: - logging.exception("Failed to stop instance") - return False + msg = "Failed to force stop instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) time.sleep(1) if instance.name in GetInstanceList([hv_name]): - logging.error("could not shutdown instance '%s' even by destroy", - instance.name) - return False + msg = ("Could not shutdown instance %s even by destroy" % + instance.name) + logging.error(msg) + return (False, msg) - _RemoveBlockDevLinks(instance.name) + _RemoveBlockDevLinks(instance.name, instance.disks) - return True + return (True, "Instance has been shutdown successfully") -def RebootInstance(instance, reboot_type, extra_args): +def InstanceReboot(instance, reboot_type): """Reboot an instance. @type instance: L{objects.Instance} @@ -900,9 +992,10 @@ def RebootInstance(instance, reboot_type, extra_args): instance OS, do not recreate the VM - L{constants.INSTANCE_REBOOT_HARD}: tear down and restart the VM (at the hypervisor level) - - the other reboot type (L{constants.INSTANCE_REBOOT_HARD}) - is not accepted here, since that mode is handled - differently + - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is + not accepted here, since that mode is handled differently, in + cmdlib, and translates into full stop and start of the + instance (instead of a call_instance_reboot RPC) @rtype: boolean @return: the success of the operation @@ -910,27 +1003,91 @@ def RebootInstance(instance, reboot_type, extra_args): running_instances = GetInstanceList([instance.hypervisor]) if instance.name not in running_instances: - logging.error("Cannot reboot instance that is not running") - return False + msg = "Cannot reboot instance %s that is not running" % instance.name + logging.error(msg) + return (False, msg) hyper = hypervisor.GetHypervisor(instance.hypervisor) if reboot_type == constants.INSTANCE_REBOOT_SOFT: try: hyper.RebootInstance(instance) except errors.HypervisorError, err: - logging.exception("Failed to soft reboot instance") - return False + msg = "Failed to soft reboot instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) elif reboot_type == constants.INSTANCE_REBOOT_HARD: try: - ShutdownInstance(instance) - StartInstance(instance, extra_args) + stop_result = InstanceShutdown(instance) + if not stop_result[0]: + return stop_result + return StartInstance(instance) except errors.HypervisorError, err: - logging.exception("Failed to hard reboot instance") - return False + msg = "Failed to hard reboot instance %s: %s" % (instance.name, err) + logging.error(msg) + return (False, msg) else: - raise errors.ParameterError("reboot_type invalid") + return (False, "Invalid reboot_type received: %s" % (reboot_type,)) - return True + return (True, "Reboot successful") + + +def MigrationInfo(instance): + """Gather information about an instance to be migrated. + + @type instance: L{objects.Instance} + @param instance: the instance definition + + """ + hyper = hypervisor.GetHypervisor(instance.hypervisor) + try: + info = hyper.MigrationInfo(instance) + except errors.HypervisorError, err: + msg = "Failed to fetch migration information" + logging.exception(msg) + return (False, '%s: %s' % (msg, err)) + return (True, info) + + +def AcceptInstance(instance, info, target): + """Prepare the node to accept an instance. + + @type instance: L{objects.Instance} + @param instance: the instance definition + @type info: string/data (opaque) + @param info: migration information, from the source node + @type target: string + @param target: target host (usually ip), on this node + + """ + hyper = hypervisor.GetHypervisor(instance.hypervisor) + try: + hyper.AcceptInstance(instance, info, target) + except errors.HypervisorError, err: + msg = "Failed to accept instance" + logging.exception(msg) + return (False, '%s: %s' % (msg, err)) + return (True, "Accept successful") + + +def FinalizeMigration(instance, info, success): + """Finalize any preparation to accept an instance. + + @type instance: L{objects.Instance} + @param instance: the instance definition + @type info: string/data (opaque) + @param info: migration information, from the source node + @type success: boolean + @param success: whether the migration was a success or a failure + + """ + hyper = hypervisor.GetHypervisor(instance.hypervisor) + try: + hyper.FinalizeMigration(instance, info, success) + except errors.HypervisorError, err: + msg = "Failed to finalize migration" + logging.exception(msg) + return (False, '%s: %s' % (msg, err)) + return (True, "Migration Finalized") def MigrateInstance(instance, target, live): @@ -949,18 +1106,18 @@ def MigrateInstance(instance, target, live): - msg is a string with details in case of failure """ - hyper = hypervisor.GetHypervisor(instance.hypervisor_name) + hyper = hypervisor.GetHypervisor(instance.hypervisor) try: hyper.MigrateInstance(instance.name, target, live) except errors.HypervisorError, err: - msg = "Failed to migrate instance: %s" % str(err) - logging.error(msg) - return (False, msg) - return (True, "Migration successfull") + msg = "Failed to migrate instance" + logging.exception(msg) + return (False, "%s: %s" % (msg, err)) + return (True, "Migration successful") -def CreateBlockDevice(disk, size, owner, on_primary, info): +def BlockdevCreate(disk, size, owner, on_primary, info): """Creates a block device for an instance. @type disk: L{objects.Disk} @@ -984,44 +1141,55 @@ def CreateBlockDevice(disk, size, owner, on_primary, info): clist = [] if disk.children: for child in disk.children: - crdev = _RecursiveAssembleBD(child, owner, on_primary) + try: + crdev = _RecursiveAssembleBD(child, owner, on_primary) + except errors.BlockDeviceError, err: + errmsg = "Can't assemble device %s: %s" % (child, err) + logging.error(errmsg) + return False, errmsg if on_primary or disk.AssembleOnSecondary(): # we need the children open in case the device itself has to # be assembled - crdev.Open() + try: + crdev.Open() + except errors.BlockDeviceError, err: + errmsg = "Can't make child '%s' read-write: %s" % (child, err) + logging.error(errmsg) + return False, errmsg clist.append(crdev) + try: - device = bdev.FindDevice(disk.dev_type, disk.physical_id, clist) - if device is not None: - logging.info("removing existing device %s", disk) - device.Remove() + device = bdev.Create(disk.dev_type, disk.physical_id, clist, disk.size) except errors.BlockDeviceError, err: - pass + return False, "Can't create block device: %s" % str(err) - device = bdev.Create(disk.dev_type, disk.physical_id, - clist, size) - if device is None: - raise ValueError("Can't create child device for %s, %s" % - (disk, size)) if on_primary or disk.AssembleOnSecondary(): - if not device.Assemble(): - errorstring = "Can't assemble device after creation" - logging.error(errorstring) - raise errors.BlockDeviceError("%s, very unusual event - check the node" - " daemon logs" % errorstring) + try: + device.Assemble() + except errors.BlockDeviceError, err: + errmsg = ("Can't assemble device after creation, very" + " unusual event: %s" % str(err)) + logging.error(errmsg) + return False, errmsg device.SetSyncSpeed(constants.SYNC_SPEED) if on_primary or disk.OpenOnSecondary(): - device.Open(force=True) + try: + device.Open(force=True) + except errors.BlockDeviceError, err: + errmsg = ("Can't make device r/w after creation, very" + " unusual event: %s" % str(err)) + logging.error(errmsg) + return False, errmsg DevCacheManager.UpdateCache(device.dev_path, owner, on_primary, disk.iv_name) device.SetInfo(info) physical_id = device.unique_id - return physical_id + return True, physical_id -def RemoveBlockDevice(disk): +def BlockdevRemove(disk): """Remove a block device. @note: This is intended to be called recursively. @@ -1032,6 +1200,8 @@ def RemoveBlockDevice(disk): @return: the success of the operation """ + msgs = [] + result = True try: rdev = _RecursiveFindBD(disk) except errors.BlockDeviceError, err: @@ -1040,15 +1210,22 @@ def RemoveBlockDevice(disk): rdev = None if rdev is not None: r_path = rdev.dev_path - result = rdev.Remove() + try: + rdev.Remove() + except errors.BlockDeviceError, err: + msgs.append(str(err)) + result = False if result: DevCacheManager.RemoveCache(r_path) - else: - result = True + if disk.children: for child in disk.children: - result = result and RemoveBlockDevice(child) - return result + c_status, c_msg = BlockdevRemove(child) + result = result and c_status + if c_msg: # not an empty message + msgs.append(c_msg) + + return (result, "; ".join(msgs)) def _RecursiveAssembleBD(disk, owner, as_primary): @@ -1087,11 +1264,12 @@ def _RecursiveAssembleBD(disk, owner, as_primary): if children.count(None) >= mcn: raise cdev = None - logging.debug("Error in child activation: %s", str(err)) + logging.error("Error in child activation (but continuing): %s", + str(err)) children.append(cdev) if as_primary or disk.AssembleOnSecondary(): - r_dev = bdev.AttachOrAssemble(disk.dev_type, disk.physical_id, children) + r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children, disk.size) r_dev.SetSyncSpeed(constants.SYNC_SPEED) result = r_dev if as_primary or disk.OpenOnSecondary(): @@ -1104,7 +1282,7 @@ def _RecursiveAssembleBD(disk, owner, as_primary): return result -def AssembleBlockDevice(disk, owner, as_primary): +def BlockdevAssemble(disk, owner, as_primary): """Activate a block device for an instance. This is a wrapper over _RecursiveAssembleBD. @@ -1114,16 +1292,22 @@ def AssembleBlockDevice(disk, owner, as_primary): C{True} for secondary nodes """ - result = _RecursiveAssembleBD(disk, owner, as_primary) - if isinstance(result, bdev.BlockDev): - result = result.dev_path - return result + status = True + result = "no error information" + try: + result = _RecursiveAssembleBD(disk, owner, as_primary) + if isinstance(result, bdev.BlockDev): + result = result.dev_path + except errors.BlockDeviceError, err: + result = "Error while assembling disk: %s" % str(err) + status = False + return (status, result) -def ShutdownBlockDevice(disk): +def BlockdevShutdown(disk): """Shut down a block device. - First, if the device is assembled (Attach() is successfull), then + First, if the device is assembled (Attach() is successful), then the device is shutdown. Then the children of the device are shutdown. @@ -1138,21 +1322,29 @@ def ShutdownBlockDevice(disk): @return: the success of the operation """ + msgs = [] + result = True r_dev = _RecursiveFindBD(disk) if r_dev is not None: r_path = r_dev.dev_path - result = r_dev.Shutdown() - if result: + try: + r_dev.Shutdown() DevCacheManager.RemoveCache(r_path) - else: - result = True + except errors.BlockDeviceError, err: + msgs.append(str(err)) + result = False + if disk.children: for child in disk.children: - result = result and ShutdownBlockDevice(child) - return result + c_status, c_msg = BlockdevShutdown(child) + result = result and c_status + if c_msg: # not an empty message + msgs.append(c_msg) + + return (result, "; ".join(msgs)) -def MirrorAddChildren(parent_cdev, new_cdevs): +def BlockdevAddchildren(parent_cdev, new_cdevs): """Extend a mirrored block device. @type parent_cdev: L{objects.Disk} @@ -1176,7 +1368,7 @@ def MirrorAddChildren(parent_cdev, new_cdevs): return True -def MirrorRemoveChildren(parent_cdev, new_cdevs): +def BlockdevRemovechildren(parent_cdev, new_cdevs): """Shrink a mirrored block device. @type parent_cdev: L{objects.Disk} @@ -1208,7 +1400,7 @@ def MirrorRemoveChildren(parent_cdev, new_cdevs): return True -def GetMirrorStatus(disks): +def BlockdevGetmirrorstatus(disks): """Get the mirroring status of a list of devices. @type disks: list of L{objects.Disk} @@ -1233,7 +1425,7 @@ def GetMirrorStatus(disks): def _RecursiveFindBD(disk): """Check if a device is activated. - If so, return informations about the real device. + If so, return information about the real device. @type disk: L{objects.Disk} @param disk: the disk object we need to find @@ -1247,13 +1439,13 @@ def _RecursiveFindBD(disk): for chdisk in disk.children: children.append(_RecursiveFindBD(chdisk)) - return bdev.FindDevice(disk.dev_type, disk.physical_id, children) + return bdev.FindDevice(disk.dev_type, disk.physical_id, children, disk.size) -def FindBlockDevice(disk): +def BlockdevFind(disk): """Check if a device is activated. - If it is, return informations about the real device. + If it is, return information about the real device. @type disk: L{objects.Disk} @param disk: the disk to find @@ -1263,10 +1455,39 @@ def FindBlockDevice(disk): estimated_time, is_degraded) """ - rbd = _RecursiveFindBD(disk) + try: + rbd = _RecursiveFindBD(disk) + except errors.BlockDeviceError, err: + return (False, str(err)) if rbd is None: - return rbd - return (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus() + return (True, None) + return (True, (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus()) + + +def BlockdevGetsize(disks): + """Computes the size of the given disks. + + If a disk is not found, returns None instead. + + @type disks: list of L{objects.Disk} + @param disks: the list of disk to compute the size for + @rtype: list + @return: list with elements None if the disk cannot be found, + otherwise the size + + """ + result = [] + for cf in disks: + try: + rbd = _RecursiveFindBD(cf) + except errors.BlockDeviceError, err: + result.append(None) + continue + if rbd is None: + result.append(None) + else: + result.append(rbd.GetActualSize()) + return result def UploadFile(file_name, data, mode, uid, gid, atime, mtime): @@ -1299,14 +1520,7 @@ def UploadFile(file_name, data, mode, uid, gid, atime, mtime): file_name) return False - allowed_files = [ - constants.CLUSTER_CONF_FILE, - constants.ETC_HOSTS, - constants.SSH_KNOWN_HOSTS_FILE, - constants.VNC_PASSWORD_FILE, - ] - - if file_name not in allowed_files: + if file_name not in _ALLOWED_UPLOAD_FILES: logging.error("Filename passed to UploadFile not in allowed" " upload targets: '%s'", file_name) return False @@ -1500,6 +1714,7 @@ def OSEnvironment(instance, debug=0): result = {} result['OS_API_VERSION'] = '%d' % constants.OS_API_VERSION result['INSTANCE_NAME'] = instance.name + result['INSTANCE_OS'] = instance.os result['HYPERVISOR'] = instance.hypervisor result['DISK_COUNT'] = '%d' % len(instance.disks) result['NIC_COUNT'] = '%d' % len(instance.nics) @@ -1511,8 +1726,7 @@ def OSEnvironment(instance, debug=0): str(disk)) real_disk.Open() result['DISK_%d_PATH' % idx] = real_disk.dev_path - # FIXME: When disks will have read-only mode, populate this - result['DISK_%d_ACCESS' % idx] = 'W' + result['DISK_%d_ACCESS' % idx] = disk.mode if constants.HV_DISK_TYPE in instance.hvparams: result['DISK_%d_FRONTEND_TYPE' % idx] = \ instance.hvparams[constants.HV_DISK_TYPE] @@ -1530,9 +1744,13 @@ def OSEnvironment(instance, debug=0): result['NIC_%d_FRONTEND_TYPE' % idx] = \ instance.hvparams[constants.HV_NIC_TYPE] + for source, kind in [(instance.beparams, "BE"), (instance.hvparams, "HV")]: + for key, value in source.items(): + result["INSTANCE_%s_%s" % (kind, key)] = str(value) + return result -def GrowBlockDevice(disk, amount): +def BlockdevGrow(disk, amount): """Grow a stack of block devices. This function is called recursively, with the childrens being the @@ -1558,7 +1776,7 @@ def GrowBlockDevice(disk, amount): return True, None -def SnapshotBlockDevice(disk): +def BlockdevSnapshot(disk): """Create a snapshot copy of a block device. This function is called recursively, and the snapshot is actually created @@ -1573,13 +1791,13 @@ def SnapshotBlockDevice(disk): if disk.children: if len(disk.children) == 1: # only one child, let's recurse on it - return SnapshotBlockDevice(disk.children[0]) + return BlockdevSnapshot(disk.children[0]) else: # more than one child, choose one that matches for child in disk.children: if child.size == disk.size: # return implies breaking the loop - return SnapshotBlockDevice(child) + return BlockdevSnapshot(child) elif disk.dev_type == constants.LD_LV: r_dev = _RecursiveFindBD(disk) if r_dev is not None: @@ -1635,8 +1853,8 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx): # the target command is built out of three individual commands, # which are joined by pipes; we check each individual command for # valid parameters - expcmd = utils.BuildShellCmd("cd %s; %s 2>%s", inst_os.path, - export_script, logfile) + expcmd = utils.BuildShellCmd("set -e; set -o pipefail; cd %s; %s 2>%s", + inst_os.path, export_script, logfile) comprcmd = "gzip" @@ -1649,7 +1867,7 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx): # all commands have been checked, so we're safe to combine them command = '|'.join([expcmd, comprcmd, utils.ShellQuoteArgs(remotecmd)]) - result = utils.RunCmd(command, env=export_env) + result = utils.RunCmd(["bash", "-c", command], env=export_env) if result.failed: logging.error("os snapshot export command '%s' returned error: %s" @@ -1829,7 +2047,7 @@ def RemoveExport(export): return True -def RenameBlockDevices(devlist): +def BlockdevRename(devlist): """Rename a list of block devices. @type devlist: list of tuples @@ -1859,7 +2077,7 @@ def RenameBlockDevices(devlist): # but we don't have the owner here - maybe parse from existing # cache? for now, we only lose lvm data when we rename, which # is less critical than DRBD or MD - except errors.BlockDeviceError, err: + except errors.BlockDeviceError: logging.exception("Can't rename device '%s' to '%s'", dev, unique_id) result = False return result @@ -1929,7 +2147,7 @@ def RemoveFileStorageDir(file_storage_dir): @param file_storage_dir: the directory we should cleanup @rtype: tuple (success,) @return: tuple of one element, C{success}, denoting - whether the operation was successfull + whether the operation was successful """ file_storage_dir = _TransformFileStorageDir(file_storage_dir) @@ -1944,7 +2162,7 @@ def RemoveFileStorageDir(file_storage_dir): # deletes dir only if empty, otherwise we want to return False try: os.rmdir(file_storage_dir) - except OSError, err: + except OSError: logging.exception("Cannot remove file storage directory '%s'", file_storage_dir) result = False, @@ -1973,7 +2191,7 @@ def RenameFileStorageDir(old_file_storage_dir, new_file_storage_dir): if os.path.isdir(old_file_storage_dir): try: os.rename(old_file_storage_dir, new_file_storage_dir) - except OSError, err: + except OSError: logging.exception("Cannot rename '%s' to '%s'", old_file_storage_dir, new_file_storage_dir) result = False, @@ -2071,12 +2289,14 @@ def JobQueueSetDrainFlag(drain_flag): return True -def CloseBlockDevices(disks): +def BlockdevClose(instance_name, disks): """Closes the given block devices. This means they will be switched to secondary mode (in case of DRBD). + @param instance_name: if the argument is not empty, the symlinks + of this instance will be removed @type disks: list of L{objects.Disk} @param disks: the list of disks to be closed @rtype: tuple (success, message) @@ -2102,6 +2322,8 @@ def CloseBlockDevices(disks): if msg: return (False, "Can't make devices secondary: %s" % ",".join(msg)) else: + if instance_name: + _RemoveBlockDevLinks(instance_name, disks) return (True, "All devices secondary") @@ -2139,7 +2361,8 @@ def DemoteFromMC(): if utils.IsProcessAlive(utils.ReadPidFile(pid_file)): return (False, "The master daemon is running, will not demote") try: - utils.CreateBackup(constants.CLUSTER_CONF_FILE) + if os.path.isfile(constants.CLUSTER_CONF_FILE): + utils.CreateBackup(constants.CLUSTER_CONF_FILE) except EnvironmentError, err: if err.errno != errno.ENOENT: return (False, "Error while backing up cluster file: %s" % str(err)) @@ -2147,6 +2370,128 @@ def DemoteFromMC(): return (True, "Done") +def _FindDisks(nodes_ip, disks): + """Sets the physical ID on disks and returns the block devices. + + """ + # set the correct physical ID + my_name = utils.HostInfo().name + for cf in disks: + cf.SetPhysicalID(my_name, nodes_ip) + + bdevs = [] + + for cf in disks: + rd = _RecursiveFindBD(cf) + if rd is None: + return (False, "Can't find device %s" % cf) + bdevs.append(rd) + return (True, bdevs) + + +def DrbdDisconnectNet(nodes_ip, disks): + """Disconnects the network on a list of drbd devices. + + """ + status, bdevs = _FindDisks(nodes_ip, disks) + if not status: + return status, bdevs + + # disconnect disks + for rd in bdevs: + try: + rd.DisconnectNet() + except errors.BlockDeviceError, err: + logging.exception("Failed to go into standalone mode") + return (False, "Can't change network configuration: %s" % str(err)) + return (True, "All disks are now disconnected") + + +def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster): + """Attaches the network on a list of drbd devices. + + """ + status, bdevs = _FindDisks(nodes_ip, disks) + if not status: + return status, bdevs + + if multimaster: + for idx, rd in enumerate(bdevs): + try: + _SymlinkBlockDev(instance_name, rd.dev_path, idx) + except EnvironmentError, err: + return (False, "Can't create symlink: %s" % str(err)) + # reconnect disks, switch to new master configuration and if + # needed primary mode + for rd in bdevs: + try: + rd.AttachNet(multimaster) + except errors.BlockDeviceError, err: + return (False, "Can't change network configuration: %s" % str(err)) + # wait until the disks are connected; we need to retry the re-attach + # if the device becomes standalone, as this might happen if the one + # node disconnects and reconnects in a different mode before the + # other node reconnects; in this case, one or both of the nodes will + # decide it has wrong configuration and switch to standalone + RECONNECT_TIMEOUT = 2 * 60 + sleep_time = 0.100 # start with 100 miliseconds + timeout_limit = time.time() + RECONNECT_TIMEOUT + while time.time() < timeout_limit: + all_connected = True + for rd in bdevs: + stats = rd.GetProcStatus() + if not (stats.is_connected or stats.is_in_resync): + all_connected = False + if stats.is_standalone: + # peer had different config info and this node became + # standalone, even though this should not happen with the + # new staged way of changing disk configs + try: + rd.AttachNet(multimaster) + except errors.BlockDeviceError, err: + return (False, "Can't change network configuration: %s" % str(err)) + if all_connected: + break + time.sleep(sleep_time) + sleep_time = min(5, sleep_time * 1.5) + if not all_connected: + return (False, "Timeout in disk reconnecting") + if multimaster: + # change to primary mode + for rd in bdevs: + try: + rd.Open() + except errors.BlockDeviceError, err: + return (False, "Can't change to primary mode: %s" % str(err)) + if multimaster: + msg = "multi-master and primary" + else: + msg = "single-master" + return (True, "Disks are now configured as %s" % msg) + + +def DrbdWaitSync(nodes_ip, disks): + """Wait until DRBDs have synchronized. + + """ + status, bdevs = _FindDisks(nodes_ip, disks) + if not status: + return status, bdevs + + min_resync = 100 + alldone = True + failure = False + for rd in bdevs: + stats = rd.GetProcStatus() + if not (stats.is_connected or stats.is_in_resync): + failure = True + break + alldone = alldone and (not stats.is_in_resync) + if stats.sync_percent is not None: + min_resync = min(min_resync, stats.sync_percent) + return (not failure, (alldone, min_resync)) + + class HooksRunner(object): """Hook runner. @@ -2154,8 +2499,6 @@ class HooksRunner(object): on the master side. """ - RE_MASK = re.compile("^[a-zA-Z0-9_-]+$") - def __init__(self, hooks_base_dir=None): """Constructor for hooks runner. @@ -2216,7 +2559,7 @@ class HooksRunner(object): #logging.exception("Error while closing fd %s", fd) pass - return result == 0, output + return result == 0, utils.SafeEncode(output.strip()) def RunHooks(self, hpath, phase, env): """Run the scripts in the hooks directory. @@ -2252,7 +2595,7 @@ class HooksRunner(object): dir_name = "%s/%s" % (self._BASE_DIR, subdir) try: dir_contents = utils.ListVisibleFiles(dir_name) - except OSError, err: + except OSError: # FIXME: must log output in case of failures return rr @@ -2262,7 +2605,7 @@ class HooksRunner(object): for relname in dir_contents: fname = os.path.join(dir_name, relname) if not (os.path.isfile(fname) and os.access(fname, os.X_OK) and - self.RE_MASK.match(relname) is not None): + constants.EXT_PLUGIN_MASK.match(relname) is not None): rrval = constants.HKR_SKIP output = "" else: @@ -2375,7 +2718,7 @@ class DevCacheManager(object): fdata = "%s %s %s\n" % (str(owner), state, iv_name) try: utils.WriteFile(fpath, data=fdata) - except EnvironmentError, err: + except EnvironmentError: logging.exception("Can't update bdev cache for %s", dev_path) @classmethod @@ -2397,5 +2740,5 @@ class DevCacheManager(object): fpath = cls._ConvertPath(dev_path) try: utils.RemoveFile(fpath) - except EnvironmentError, err: + except EnvironmentError: logging.exception("Can't update bdev cache for %s", dev_path)