X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/59726e1575b2533a5d280456031e2f4ac8982638..22001b7894bd406e2180bd6aafc0ca987c40cb6b:/lib/backend.py diff --git a/lib/backend.py b/lib/backend.py index 4b97e9b..2c5e5ae 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1,7 +1,7 @@ # # -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 Google Inc. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Google Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -60,11 +60,11 @@ from ganeti import ssconf from ganeti import serializer from ganeti import netutils from ganeti import runtime -from ganeti import mcpu from ganeti import compat from ganeti import pathutils from ganeti import vcluster from ganeti import ht +from ganeti import hooksmaster _BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id" @@ -88,15 +88,15 @@ _LVSLINE_REGEX = re.compile("^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$") _MASTER_START = "start" _MASTER_STOP = "stop" -#: Maximum file permissions for remote command directory and executables +#: Maximum file permissions for restricted command directory and executables _RCMD_MAX_MODE = (stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) -#: Delay before returning an error for remote commands +#: Delay before returning an error for restricted commands _RCMD_INVALID_DELAY = 10 -#: How long to wait to acquire lock for remote commands (shorter than +#: How long to wait to acquire lock for restricted commands (shorter than #: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many #: command requests arrive _RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8 @@ -110,6 +110,34 @@ class RPCFail(Exception): """ +def _GetInstReasonFilename(instance_name): + """Path of the file containing the reason of the instance status change. + + @type instance_name: string + @param instance_name: The name of the instance + @rtype: string + @return: The path of the file + + """ + return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name) + + +def _StoreInstReasonTrail(instance_name, trail): + """Serialize a reason trail related to an instance change of state to file. + + The exact location of the file depends on the name of the instance and on + the configuration of the Ganeti cluster defined at deploy time. + + @type instance_name: string + @param instance_name: The name of the instance + @rtype: None + + """ + json = serializer.DumpJson(trail) + filename = _GetInstReasonFilename(instance_name) + utils.WriteFile(filename, data=json) + + def _Fail(msg, *args, **kwargs): """Log an error and the raise an RPCFail exception. @@ -297,10 +325,10 @@ def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn): cfg = _GetConfig() hr = HooksRunner() - hm = mcpu.HooksMaster(hook_opcode, hooks_path, nodes, hr.RunLocalHooks, - None, env_fn, logging.warning, cfg.GetClusterName(), - cfg.GetMasterNode()) - + hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes, + hr.RunLocalHooks, None, env_fn, + logging.warning, cfg.GetClusterName(), + cfg.GetMasterNode()) hm.RunPhase(constants.HOOKS_PHASE_PRE) result = fn(*args, **kwargs) hm.RunPhase(constants.HOOKS_PHASE_POST) @@ -358,8 +386,8 @@ def _RunMasterSetupScript(master_params, action, use_external_mip_script): result = utils.RunCmd([setup_script, action], env=env, reset_env=True) if result.failed: - _Fail("Failed to %s the master IP. Script return value: %s" % - (action, result.exit_code), log=True) + _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" % + (action, result.exit_code, result.output), log=True) @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup", @@ -541,12 +569,12 @@ def LeaveCluster(modify_ssh_setup): raise errors.QuitGanetiException(True, "Shutdown scheduled") -def _GetVgInfo(name): +def _GetVgInfo(name, excl_stor): """Retrieves information about a LVM volume group. """ # TODO: GetVGInfo supports returning information for multiple VGs at once - vginfo = bdev.LogicalVolume.GetVGInfo([name]) + vginfo = bdev.LogicalVolume.GetVGInfo([name], excl_stor) if vginfo: vg_free = int(round(vginfo[0][0], 0)) vg_size = int(round(vginfo[0][1], 0)) @@ -589,25 +617,44 @@ def _GetNamedNodeInfo(names, fn): return map(fn, names) -def GetNodeInfo(vg_names, hv_names): +def GetNodeInfo(vg_names, hv_names, excl_stor): """Gives back a hash with different information about the node. @type vg_names: list of string @param vg_names: Names of the volume groups to ask for disk space information @type hv_names: list of string @param hv_names: Names of the hypervisors to ask for node information + @type excl_stor: boolean + @param excl_stor: Whether exclusive_storage is active @rtype: tuple; (string, None/dict, None/dict) @return: Tuple containing boot ID, volume group information and hypervisor information """ bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n") - vg_info = _GetNamedNodeInfo(vg_names, _GetVgInfo) + vg_info = _GetNamedNodeInfo(vg_names, (lambda vg: _GetVgInfo(vg, excl_stor))) hv_info = _GetNamedNodeInfo(hv_names, _GetHvInfo) return (bootid, vg_info, hv_info) +def _CheckExclusivePvs(pvi_list): + """Check that PVs are not shared among LVs + + @type pvi_list: list of L{objects.LvmPvInfo} objects + @param pvi_list: information about the PVs + + @rtype: list of tuples (string, list of strings) + @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...]) + + """ + res = [] + for pvi in pvi_list: + if len(pvi.lv_list) > 1: + res.append((pvi.name, pvi.lv_list)) + return res + + def VerifyNode(what, cluster_name): """Verify the status of the local node. @@ -762,8 +809,15 @@ def VerifyNode(what, cluster_name): result[constants.NV_VGLIST] = utils.ListVolumeGroups() if constants.NV_PVLIST in what and vm_capable: + check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST], - filter_allocatable=False) + filter_allocatable=False, + include_lvs=check_exclusive_pvs) + if check_exclusive_pvs: + result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val) + for pvi in val: + # Avoid sending useless data on the wire + pvi.lv_list = [] result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val) if constants.NV_VERSION in what: @@ -1191,9 +1245,16 @@ def RunRenameInstance(instance, old_name, debug): " log file:\n%s", result.fail_reason, "\n".join(lines), log=False) -def _GetBlockDevSymlinkPath(instance_name, idx): - return utils.PathJoin(pathutils.DISK_LINKS_DIR, "%s%s%d" % - (instance_name, constants.DISK_SEPARATOR, idx)) +def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None): + """Returns symlink path for block device. + + """ + if _dir is None: + _dir = pathutils.DISK_LINKS_DIR + + return utils.PathJoin(_dir, + ("%s%s%s" % + (instance_name, constants.DISK_SEPARATOR, idx))) def _SymlinkBlockDev(instance_name, device_path, idx): @@ -1267,13 +1328,17 @@ def _GatherAndLinkBlockDevs(instance): return block_devices -def StartInstance(instance, startup_paused): +def StartInstance(instance, startup_paused, reason, store_reason=True): """Start an instance. @type instance: L{objects.Instance} @param instance: the instance object @type startup_paused: bool @param instance: pause instance at startup? + @type reason: list of reasons + @param reason: the reason trail for this startup + @type store_reason: boolean + @param store_reason: whether to store the shutdown reason trail on file @rtype: None """ @@ -1287,6 +1352,8 @@ def StartInstance(instance, startup_paused): block_devices = _GatherAndLinkBlockDevs(instance) hyper = hypervisor.GetHypervisor(instance.hypervisor) hyper.StartInstance(instance, block_devices, startup_paused) + if store_reason: + _StoreInstReasonTrail(instance.name, reason) except errors.BlockDeviceError, err: _Fail("Block device error: %s", err, exc=True) except errors.HypervisorError, err: @@ -1294,7 +1361,7 @@ def StartInstance(instance, startup_paused): _Fail("Hypervisor error: %s", err, exc=True) -def InstanceShutdown(instance, timeout): +def InstanceShutdown(instance, timeout, reason, store_reason=True): """Shut an instance down. @note: this functions uses polling with a hardcoded timeout. @@ -1303,6 +1370,10 @@ def InstanceShutdown(instance, timeout): @param instance: the instance object @type timeout: integer @param timeout: maximum timeout for soft shutdown + @type reason: list of reasons + @param reason: the reason trail for this shutdown + @type store_reason: boolean + @param store_reason: whether to store the shutdown reason trail on file @rtype: None """ @@ -1324,6 +1395,8 @@ def InstanceShutdown(instance, timeout): try: hyper.StopInstance(instance, retry=self.tried_once) + if store_reason: + _StoreInstReasonTrail(instance.name, reason) except errors.HypervisorError, err: if iname not in hyper.ListInstances(): # if the instance is no longer existing, consider this a @@ -1363,7 +1436,7 @@ def InstanceShutdown(instance, timeout): _RemoveBlockDevLinks(iname, instance.disks) -def InstanceReboot(instance, reboot_type, shutdown_timeout): +def InstanceReboot(instance, reboot_type, shutdown_timeout, reason): """Reboot an instance. @type instance: L{objects.Instance} @@ -1381,6 +1454,8 @@ def InstanceReboot(instance, reboot_type, shutdown_timeout): instance (instead of a call_instance_reboot RPC) @type shutdown_timeout: integer @param shutdown_timeout: maximum timeout for soft shutdown + @type reason: list of reasons + @param reason: the reason trail for this reboot @rtype: None """ @@ -1397,8 +1472,10 @@ def InstanceReboot(instance, reboot_type, shutdown_timeout): _Fail("Failed to soft reboot instance %s: %s", instance.name, err) elif reboot_type == constants.INSTANCE_REBOOT_HARD: try: - InstanceShutdown(instance, shutdown_timeout) - return StartInstance(instance, False) + InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False) + result = StartInstance(instance, False, reason, store_reason=False) + _StoreInstReasonTrail(instance.name, reason) + return result except errors.HypervisorError, err: _Fail("Failed to hard reboot instance %s: %s", instance.name, err) else: @@ -1549,7 +1626,55 @@ def GetMigrationStatus(instance): _Fail("Failed to get migration status: %s", err, exc=True) -def BlockdevCreate(disk, size, owner, on_primary, info): +def HotplugDevice(instance, action, dev_type, device, extra, seq): + """Hotplug a device + + Hotplug is currently supported only for KVM Hypervisor. + @type instance: L{objects.Instance} + @param instance: the instance to which we hotplug a device + @type action: string + @param action: the hotplug action to perform + @type dev_type: string + @param dev_type: the device type to hotplug + @type device: either L{objects.NIC} or L{objects.Disk} + @param device: the device object to hotplug + @type extra: string + @param extra: extra info used by hotplug code (e.g. disk link) + @type seq: int + @param seq: the index of the device from master perspective + @raise RPCFail: in case instance does not have KVM hypervisor + + """ + hyper = hypervisor.GetHypervisor(instance.hypervisor) + try: + hyper.VerifyHotplugSupport(instance, action, dev_type) + except errors.HotplugError, err: + _Fail("Hotplug is not supported: %s", err) + + if action == constants.HOTPLUG_ACTION_ADD: + fn = hyper.HotAddDevice + elif action == constants.HOTPLUG_ACTION_REMOVE: + fn = hyper.HotDelDevice + elif action == constants.HOTPLUG_ACTION_MODIFY: + fn = hyper.HotModDevice + else: + assert action in constants.HOTPLUG_ALL_ACTIONS + + return fn(instance, dev_type, device, extra, seq) + + +def HotplugSupported(instance): + """Checks if hotplug is generally supported. + + """ + hyper = hypervisor.GetHypervisor(instance.hypervisor) + try: + hyper.HotplugSupported(instance) + except errors.HotplugError, err: + _Fail("Hotplug is not supported: %s", err) + + +def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor): """Creates a block device for an instance. @type disk: L{objects.Disk} @@ -1564,6 +1689,8 @@ def BlockdevCreate(disk, size, owner, on_primary, info): @type info: string @param info: string that will be sent to the physical device creation, used for example to set (LVM) tags on LVs + @type excl_stor: boolean + @param excl_stor: Whether exclusive_storage is active @return: the new unique_id of the device (this can sometime be computed only after creation), or None. On secondary nodes, @@ -1590,7 +1717,7 @@ def BlockdevCreate(disk, size, owner, on_primary, info): clist.append(crdev) try: - device = bdev.Create(disk, clist) + device = bdev.Create(disk, clist, excl_stor) except errors.BlockDeviceError, err: _Fail("Can't create block device: %s", err) @@ -1722,10 +1849,18 @@ def BlockdevRemove(disk): rdev = None if rdev is not None: r_path = rdev.dev_path - try: - rdev.Remove() - except errors.BlockDeviceError, err: - msgs.append(str(err)) + + def _TryRemove(): + try: + rdev.Remove() + return [] + except errors.BlockDeviceError, err: + return [str(err)] + + msgs.extend(utils.SimpleRetry([], _TryRemove, + constants.DISK_REMOVE_RETRY_INTERVAL, + constants.DISK_REMOVE_RETRY_TIMEOUT)) + if not msgs: DevCacheManager.RemoveCache(r_path) @@ -1799,23 +1934,28 @@ def BlockdevAssemble(disk, owner, as_primary, idx): This is a wrapper over _RecursiveAssembleBD. @rtype: str or boolean - @return: a C{/dev/...} path for primary nodes, and - C{True} for secondary nodes + @return: a tuple with the C{/dev/...} path and the created symlink + for primary nodes, and (C{True}, C{True}) for secondary nodes """ try: result = _RecursiveAssembleBD(disk, owner, as_primary) if isinstance(result, bdev.BlockDev): # pylint: disable=E1103 - result = result.dev_path + dev_path = result.dev_path + link_name = None if as_primary: - _SymlinkBlockDev(owner, result, idx) + link_name = _SymlinkBlockDev(owner, dev_path, idx) + elif result: + return result, result + else: + _Fail("Unexpected result from _RecursiveAssembleBD") except errors.BlockDeviceError, err: _Fail("Error while assembling disk: %s", err, exc=True) except OSError, err: _Fail("Error while symlinking disk: %s", err, exc=True) - return result + return dev_path, link_name def BlockdevShutdown(disk): @@ -2444,6 +2584,9 @@ def OSEnvironment(instance, inst_os, debug=0): real_disk = _OpenRealBD(disk) result["DISK_%d_PATH" % idx] = real_disk.dev_path result["DISK_%d_ACCESS" % idx] = disk.mode + result["DISK_%d_UUID" % idx] = disk.uuid + if disk.name: + result["DISK_%d_NAME" % idx] = disk.name if constants.HV_DISK_TYPE in instance.hvparams: result["DISK_%d_FRONTEND_TYPE" % idx] = \ instance.hvparams[constants.HV_DISK_TYPE] @@ -2456,6 +2599,9 @@ def OSEnvironment(instance, inst_os, debug=0): # NICs for idx, nic in enumerate(instance.nics): result["NIC_%d_MAC" % idx] = nic.mac + result["NIC_%d_UUID" % idx] = nic.uuid + if nic.name: + result["NIC_%d_NAME" % idx] = nic.name if nic.ip: result["NIC_%d_IP" % idx] = nic.ip result["NIC_%d_MODE" % idx] = nic.nicparams[constants.NIC_MODE] @@ -2463,8 +2609,9 @@ def OSEnvironment(instance, inst_os, debug=0): result["NIC_%d_BRIDGE" % idx] = nic.nicparams[constants.NIC_LINK] if nic.nicparams[constants.NIC_LINK]: result["NIC_%d_LINK" % idx] = nic.nicparams[constants.NIC_LINK] - if nic.network: - result["NIC_%d_NETWORK" % idx] = nic.network + if nic.netinfo: + nobj = objects.Network.FromDict(nic.netinfo) + result.update(nobj.HooksDict("NIC_%d_" % idx)) if constants.HV_NIC_TYPE in instance.hvparams: result["NIC_%d_FRONTEND_TYPE" % idx] = \ instance.hvparams[constants.HV_NIC_TYPE] @@ -2477,6 +2624,51 @@ def OSEnvironment(instance, inst_os, debug=0): return result +def DiagnoseExtStorage(top_dirs=None): + """Compute the validity for all ExtStorage Providers. + + @type top_dirs: list + @param top_dirs: the list of directories in which to + search (if not given defaults to + L{pathutils.ES_SEARCH_PATH}) + @rtype: list of L{objects.ExtStorage} + @return: a list of tuples (name, path, status, diagnose, parameters) + for all (potential) ExtStorage Providers under all + search paths, where: + - name is the (potential) ExtStorage Provider + - path is the full path to the ExtStorage Provider + - status True/False is the validity of the ExtStorage Provider + - diagnose is the error message for an invalid ExtStorage Provider, + otherwise empty + - parameters is a list of (name, help) parameters, if any + + """ + if top_dirs is None: + top_dirs = pathutils.ES_SEARCH_PATH + + result = [] + for dir_name in top_dirs: + if os.path.isdir(dir_name): + try: + f_names = utils.ListVisibleFiles(dir_name) + except EnvironmentError, err: + logging.exception("Can't list the ExtStorage directory %s: %s", + dir_name, err) + break + for name in f_names: + es_path = utils.PathJoin(dir_name, name) + status, es_inst = bdev.ExtStorageFromDisk(name, base_dir=dir_name) + if status: + diagnose = "" + parameters = es_inst.supported_parameters + else: + diagnose = es_inst + parameters = [] + result.append((name, es_path, status, diagnose, parameters)) + + return result + + def BlockdevGrow(disk, amount, dryrun, backingstore): """Grow a stack of block devices. @@ -2881,7 +3073,7 @@ def JobQueueUpdate(file_name, content): # Write and replace the file atomically utils.WriteFile(file_name, data=_Decompress(content), uid=getents.masterd_uid, - gid=getents.masterd_gid) + gid=getents.daemons_gid, mode=constants.JOB_QUEUE_FILES_PERMS) def JobQueueRename(old, new): @@ -2905,8 +3097,8 @@ def JobQueueRename(old, new): getents = runtime.GetEnts() - utils.RenameFile(old, new, mkdir=True, mkdir_mode=0700, - dir_uid=getents.masterd_uid, dir_gid=getents.masterd_gid) + utils.RenameFile(old, new, mkdir=True, mkdir_mode=0750, + dir_uid=getents.masterd_uid, dir_gid=getents.daemons_gid) def BlockdevClose(instance_name, disks): @@ -3491,8 +3683,20 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster): for rd in bdevs: stats = rd.GetProcStatus() - all_connected = (all_connected and - (stats.is_connected or stats.is_in_resync)) + if multimaster: + # In the multimaster case we have to wait explicitly until + # the resource is Connected and UpToDate/UpToDate, because + # we promote *both nodes* to primary directly afterwards. + # Being in resync is not enough, since there is a race during which we + # may promote a node with an Outdated disk to primary, effectively + # tearing down the connection. + all_connected = (all_connected and + stats.is_connected and + stats.is_disk_uptodate and + stats.peer_disk_uptodate) + else: + all_connected = (all_connected and + (stats.is_connected or stats.is_in_resync)) if stats.is_standalone: # peer had different config info and this node became @@ -3586,7 +3790,7 @@ def PowercycleNode(hypervisor_type): def _VerifyRestrictedCmdName(cmd): - """Verifies a remote command name. + """Verifies a restricted command name. @type cmd: string @param cmd: Command name @@ -3608,7 +3812,7 @@ def _VerifyRestrictedCmdName(cmd): def _CommonRestrictedCmdCheck(path, owner): - """Common checks for remote command file system directories and files. + """Common checks for restricted command file system directories and files. @type path: string @param path: Path to check @@ -3638,7 +3842,7 @@ def _CommonRestrictedCmdCheck(path, owner): def _VerifyRestrictedCmdDirectory(path, _owner=None): - """Verifies remote command directory. + """Verifies restricted command directory. @type path: string @param path: Path to check @@ -3659,10 +3863,10 @@ def _VerifyRestrictedCmdDirectory(path, _owner=None): def _VerifyRestrictedCmd(path, cmd, _owner=None): - """Verifies a whole remote command and returns its executable filename. + """Verifies a whole restricted command and returns its executable filename. @type path: string - @param path: Directory containing remote commands + @param path: Directory containing restricted commands @type cmd: string @param cmd: Command name @rtype: tuple; (boolean, string) @@ -3688,10 +3892,10 @@ def _PrepareRestrictedCmd(path, cmd, _verify_dir=_VerifyRestrictedCmdDirectory, _verify_name=_VerifyRestrictedCmdName, _verify_cmd=_VerifyRestrictedCmd): - """Performs a number of tests on a remote command. + """Performs a number of tests on a restricted command. @type path: string - @param path: Directory containing remote commands + @param path: Directory containing restricted commands @type cmd: string @param cmd: Command name @return: Same as L{_VerifyRestrictedCmd} @@ -3718,7 +3922,7 @@ def RunRestrictedCmd(cmd, _prepare_fn=_PrepareRestrictedCmd, _runcmd_fn=utils.RunCmd, _enabled=constants.ENABLE_RESTRICTED_COMMANDS): - """Executes a remote command after performing strict tests. + """Executes a restricted command after performing strict tests. @type cmd: string @param cmd: Command name @@ -3727,10 +3931,10 @@ def RunRestrictedCmd(cmd, @raise RPCFail: In case of an error """ - logging.info("Preparing to run remote command '%s'", cmd) + logging.info("Preparing to run restricted command '%s'", cmd) if not _enabled: - _Fail("Remote commands disabled at configure time") + _Fail("Restricted commands disabled at configure time") lock = None try: @@ -3758,7 +3962,7 @@ def RunRestrictedCmd(cmd, # Do not include original error message in returned error _Fail("Executing command '%s' failed" % cmd) elif cmdresult.failed or cmdresult.fail_reason: - _Fail("Remote command '%s' failed: %s; output: %s", + _Fail("Restricted command '%s' failed: %s; output: %s", cmd, cmdresult.fail_reason, cmdresult.output) else: return cmdresult.output