X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/afdc3985644be8d2b91d9f0e37dfda391a30d036..b6b45e0d6251cd870658be36004ce4116f4a63c6:/lib/backend.py diff --git a/lib/backend.py b/lib/backend.py index 886ba27..75457e3 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -19,7 +19,12 @@ # 02110-1301, USA. -"""Functions used by the node daemon""" +"""Functions used by the node daemon + +@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in + the L{UploadFile} function + +""" import os @@ -46,6 +51,9 @@ from ganeti import objects from ganeti import ssconf +_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id" + + class RPCFail(Exception): """Class denoting RPC failure. @@ -53,6 +61,7 @@ class RPCFail(Exception): """ + def _Fail(msg, *args, **kwargs): """Log an error and the raise an RPCFail exception. @@ -145,6 +154,32 @@ def _CleanDirectory(path, exclude=None): utils.RemoveFile(full_name) +def _BuildUploadFileList(): + """Build the list of allowed upload files. + + This is abstracted so that it's built only once at module import time. + + """ + allowed_files = set([ + constants.CLUSTER_CONF_FILE, + constants.ETC_HOSTS, + constants.SSH_KNOWN_HOSTS_FILE, + constants.VNC_PASSWORD_FILE, + constants.RAPI_CERT_FILE, + constants.RAPI_USERS_FILE, + constants.HMAC_CLUSTER_KEY, + ]) + + for hv_name in constants.HYPER_TYPES: + hv_class = hypervisor.GetHypervisorClass(hv_name) + allowed_files.update(hv_class.GetAncillaryFiles()) + + return frozenset(allowed_files) + + +_ALLOWED_UPLOAD_FILES = _BuildUploadFileList() + + def JobQueuePurge(): """Removes job queue files and archived jobs. @@ -154,7 +189,6 @@ def JobQueuePurge(): """ _CleanDirectory(constants.QUEUE_DIR, exclude=[constants.JOB_QUEUE_LOCK_FILE]) _CleanDirectory(constants.JOB_QUEUE_ARCHIVE_DIR) - return True, None def GetMasterInfo(): @@ -164,7 +198,7 @@ def GetMasterInfo(): for consumption here or from the node daemon. @rtype: tuple - @return: True, (master_netdev, master_ip, master_name) in case of success + @return: master_netdev, master_ip, master_name @raise RPCFail: in case of errors """ @@ -174,11 +208,11 @@ def GetMasterInfo(): master_ip = cfg.GetMasterIP() master_node = cfg.GetMasterNode() except errors.ConfigurationError, err: - _Fail("Cluster configuration incomplete", exc=True) - return True, (master_netdev, master_ip, master_node) + _Fail("Cluster configuration incomplete: %s", err, exc=True) + return (master_netdev, master_ip, master_node) -def StartMaster(start_daemons): +def StartMaster(start_daemons, no_voting): """Activate local node as master node. The function will always try activate the IP address of the master @@ -186,15 +220,18 @@ def StartMaster(start_daemons): based on the start_daemons parameter. @type start_daemons: boolean - @param start_daemons: whther to also start the master + @param start_daemons: whether to also start the master daemons (ganeti-masterd and ganeti-rapi) + @type no_voting: boolean + @param no_voting: whether to start ganeti-masterd without a node vote + (if start_daemons is True), but still non-interactively @rtype: None """ # GetMasterInfo will raise an exception if not able to return data - master_netdev, master_ip, _ = GetMasterInfo()[1] + master_netdev, master_ip, _ = GetMasterInfo() - payload = [] + err_msgs = [] if utils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT): if utils.OwnIpAddress(master_ip): # we already have the ip: @@ -202,7 +239,7 @@ def StartMaster(start_daemons): else: msg = "Someone else has the master ip, not activating" logging.error(msg) - payload.append(msg) + err_msgs.append(msg) else: result = utils.RunCmd(["ip", "address", "add", "%s/32" % master_ip, "dev", master_netdev, "label", @@ -210,7 +247,7 @@ def StartMaster(start_daemons): if result.failed: msg = "Can't activate master IP: %s" % result.output logging.error(msg) - payload.append(msg) + err_msgs.append(msg) result = utils.RunCmd(["arping", "-q", "-U", "-c 3", "-I", master_netdev, "-s", master_ip, master_ip]) @@ -218,17 +255,24 @@ def StartMaster(start_daemons): # and now start the master and rapi daemons if start_daemons: - for daemon in 'ganeti-masterd', 'ganeti-rapi': - result = utils.RunCmd([daemon]) + daemons_params = { + 'ganeti-masterd': [], + 'ganeti-rapi': [], + } + if no_voting: + daemons_params['ganeti-masterd'].append('--no-voting') + daemons_params['ganeti-masterd'].append('--yes-do-it') + for daemon in daemons_params: + cmd = [daemon] + cmd.extend(daemons_params[daemon]) + result = utils.RunCmd(cmd) if result.failed: msg = "Can't start daemon %s: %s" % (daemon, result.output) logging.error(msg) - payload.append(msg) + err_msgs.append(msg) - if payload: - _Fail("; ".join(payload)) - - return True, None + if err_msgs: + _Fail("; ".join(err_msgs)) def StopMaster(stop_daemons): @@ -248,7 +292,7 @@ def StopMaster(stop_daemons): # need to decide in which case we fail the RPC for this # GetMasterInfo will raise an exception if not able to return data - master_netdev, master_ip, _ = GetMasterInfo()[1] + master_netdev, master_ip, _ = GetMasterInfo() result = utils.RunCmd(["ip", "address", "del", "%s/32" % master_ip, "dev", master_netdev]) @@ -258,11 +302,9 @@ def StopMaster(stop_daemons): if stop_daemons: # stop/kill the rapi and the master daemon - for daemon in constants.RAPI_PID, constants.MASTERD_PID: + for daemon in constants.RAPI, constants.MASTERD: utils.KillProcess(utils.ReadPidFile(utils.DaemonPidFileName(daemon))) - return True, None - def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub): """Joins this node to the cluster. @@ -308,8 +350,6 @@ def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub): utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"]) - return (True, "Node added successfully") - def LeaveCluster(): """Cleans up and remove the current node. @@ -328,23 +368,31 @@ def LeaveCluster(): try: priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS) - f = open(pub_key, 'r') - try: - utils.RemoveAuthorizedKey(auth_keys, f.read(8192)) - finally: - f.close() + utils.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key)) utils.RemoveFile(priv_key) utils.RemoveFile(pub_key) except errors.OpExecError: logging.exception("Error while processing ssh files") + try: + utils.RemoveFile(constants.HMAC_CLUSTER_KEY) + utils.RemoveFile(constants.RAPI_CERT_FILE) + utils.RemoveFile(constants.SSL_CERT_FILE) + except: + logging.exception("Error while removing cluster secrets") + + confd_pid = utils.ReadPidFile(utils.DaemonPidFileName(constants.CONFD)) + + if confd_pid: + utils.KillProcess(confd_pid, timeout=2) + # Raise a custom exception (handled in ganeti-noded) raise errors.QuitGanetiException(True, 'Shutdown scheduled') def GetNodeInfo(vgname, hypervisor_type): - """Gives back a hash with different informations about the node. + """Gives back a hash with different information about the node. @type vgname: C{string} @param vgname: the name of the volume group to ask for disk space information @@ -370,13 +418,9 @@ def GetNodeInfo(vgname, hypervisor_type): if hyp_info is not None: outputarray.update(hyp_info) - f = open("/proc/sys/kernel/random/boot_id", 'r') - try: - outputarray["bootid"] = f.read(128).rstrip("\n") - finally: - f.close() + outputarray["bootid"] = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n") - return True, outputarray + return outputarray def VerifyNode(what, cluster_name): @@ -440,7 +484,7 @@ def VerifyNode(what, cluster_name): tmp[my_name] = ("Can't find my own primary/secondary IP" " in the node list") else: - port = utils.GetNodeDaemonPort() + port = utils.GetDaemonPort(constants.NODED) for name, pip, sip in what[constants.NV_NODENETTEST]: fail = [] if not utils.TcpPing(pip, port, source=my_pip): @@ -478,7 +522,7 @@ def VerifyNode(what, cluster_name): used_minors = str(err) result[constants.NV_DRBDLIST] = used_minors - return True, result + return result def GetVolumeList(vg_name): @@ -515,6 +559,11 @@ def GetVolumeList(vg_name): name, size, attr = match.groups() inactive = attr[4] == '-' online = attr[5] == 'o' + virtual = attr[0] == 'v' + if virtual: + # we don't want to report such volumes as existing, since they + # don't really hold data + continue lvs[name] = (size, inactive, online) return lvs @@ -528,7 +577,7 @@ def ListVolumeGroups(): size of the volume """ - return True, utils.ListVolumeGroups() + return utils.ListVolumeGroups() def NodeVolumes(): @@ -571,9 +620,8 @@ def NodeVolumes(): 'vg': line[3].strip(), } - return True, [map_line(line.split('|')) - for line in result.stdout.splitlines() - if line.count('|') >= 3] + return [map_line(line.split('|')) for line in result.stdout.splitlines() + if line.count('|') >= 3] def BridgesExist(bridges_list): @@ -591,8 +639,6 @@ def BridgesExist(bridges_list): if missing: _Fail("Missing bridges %s", ", ".join(missing)) - return True, None - def GetInstanceList(hypervisor_list): """Provides a list of instances. @@ -619,7 +665,7 @@ def GetInstanceList(hypervisor_list): def GetInstanceInfo(instance, hname): - """Gives back the informations about an instance as a dictionary. + """Gives back the information about an instance as a dictionary. @type instance: string @param instance: the instance name @@ -641,7 +687,7 @@ def GetInstanceInfo(instance, hname): output['state'] = iinfo[4] output['time'] = iinfo[5] - return True, output + return output def GetInstanceMigratable(instance): @@ -666,8 +712,6 @@ def GetInstanceMigratable(instance): if not os.path.islink(link_name): _Fail("Instance %s was not restarted since ganeti 1.2.5", iname) - return True, None - def GetAllInstancesInfo(hypervisor_list): """Gather data about all instances. @@ -692,7 +736,7 @@ def GetAllInstancesInfo(hypervisor_list): for hname in hypervisor_list: iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo() if iinfo: - for name, inst_id, memory, vcpus, state, times in iinfo: + for name, _, memory, vcpus, state, times in iinfo: value = { 'memory': memory, 'vcpus': vcpus, @@ -709,7 +753,7 @@ def GetAllInstancesInfo(hypervisor_list): " with different parameters", name) output[name] = value - return True, output + return output def InstanceOsAdd(instance, reinstall): @@ -719,14 +763,12 @@ def InstanceOsAdd(instance, reinstall): @param instance: Instance whose OS is to be installed @type reinstall: boolean @param reinstall: whether this is an instance reinstall - @rtype: boolean - @return: the success of the operation + @rtype: None """ inst_os = OSFromDisk(instance.os) - - create_env = OSEnvironment(instance) + create_env = OSEnvironment(instance, inst_os) if reinstall: create_env['INSTANCE_REINSTALL'] = "1" @@ -744,8 +786,6 @@ def InstanceOsAdd(instance, reinstall): _Fail("OS create script failed (%s), last lines in the" " log file:\n%s", result.fail_reason, "\n".join(lines), log=False) - return (True, "Successfully installed") - def RunRenameInstance(instance, old_name): """Run the OS rename script for an instance. @@ -760,7 +800,7 @@ def RunRenameInstance(instance, old_name): """ inst_os = OSFromDisk(instance.os) - rename_env = OSEnvironment(instance) + rename_env = OSEnvironment(instance, inst_os) rename_env['OLD_INSTANCE_NAME'] = old_name logfile = "%s/rename-%s-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os, @@ -778,11 +818,9 @@ def RunRenameInstance(instance, old_name): _Fail("OS rename script failed (%s), last lines in the" " log file:\n%s", result.fail_reason, "\n".join(lines), log=False) - return (True, "Rename successful") - def _GetVGInfo(vg_name): - """Get informations about the volume group. + """Get information about the volume group. @type vg_name: str @param vg_name: the volume group which we query @@ -814,7 +852,7 @@ def _GetVGInfo(vg_name): "pv_count": int(valarr[2]), } except ValueError, err: - logging.exception("Fail to parse vgs output") + logging.exception("Fail to parse vgs output: %s", err) else: logging.error("vgs output has the wrong number of fields (expected" " three): %s", str(valarr)) @@ -858,7 +896,7 @@ def _RemoveBlockDevLinks(instance_name, disks): """Remove the block device symlinks belonging to the given instance. """ - for idx, disk in enumerate(disks): + for idx, _ in enumerate(disks): link_name = _GetBlockDevSymlinkPath(instance_name, idx) if os.path.islink(link_name): try: @@ -902,14 +940,14 @@ def StartInstance(instance): @type instance: L{objects.Instance} @param instance: the instance object - @rtype: boolean - @return: whether the startup was successful or not + @rtype: None """ running_instances = GetInstanceList([instance.hypervisor]) if instance.name in running_instances: - return (True, "Already running") + logging.info("Instance %s already running, not starting", instance.name) + return try: block_devices = _GatherAndLinkBlockDevs(instance) @@ -921,8 +959,6 @@ def StartInstance(instance): _RemoveBlockDevLinks(instance.name, instance.disks) _Fail("Hypervisor error: %s", err, exc=True) - return (True, "Instance started successfully") - def InstanceShutdown(instance): """Shut an instance down. @@ -931,46 +967,44 @@ def InstanceShutdown(instance): @type instance: L{objects.Instance} @param instance: the instance object - @rtype: boolean - @return: whether the startup was successful or not + @rtype: None """ hv_name = instance.hypervisor running_instances = GetInstanceList([hv_name]) + iname = instance.name - if instance.name not in running_instances: - return (True, "Instance already stopped") + if iname not in running_instances: + logging.info("Instance %s not running, doing nothing", iname) + return hyper = hypervisor.GetHypervisor(hv_name) try: hyper.StopInstance(instance) except errors.HypervisorError, err: - _Fail("Failed to stop instance %s: %s", instance.name, err) + _Fail("Failed to stop instance %s: %s", iname, err) # test every 10secs for 2min time.sleep(1) - for dummy in range(11): + for _ in range(11): if instance.name not in GetInstanceList([hv_name]): break time.sleep(10) else: # the shutdown did not succeed - logging.error("Shutdown of '%s' unsuccessful, using destroy", - instance.name) + logging.error("Shutdown of '%s' unsuccessful, using destroy", iname) try: hyper.StopInstance(instance, force=True) except errors.HypervisorError, err: - _Fail("Failed to force stop instance %s: %s", instance.name, err) + _Fail("Failed to force stop instance %s: %s", iname, err) time.sleep(1) if instance.name in GetInstanceList([hv_name]): - _Fail("Could not shutdown instance %s even by destroy", instance.name) - - _RemoveBlockDevLinks(instance.name, instance.disks) + _Fail("Could not shutdown instance %s even by destroy", iname) - return (True, "Instance has been shutdown successfully") + _RemoveBlockDevLinks(iname, instance.disks) def InstanceReboot(instance, reboot_type): @@ -985,11 +1019,11 @@ def InstanceReboot(instance, reboot_type): instance OS, do not recreate the VM - L{constants.INSTANCE_REBOOT_HARD}: tear down and restart the VM (at the hypervisor level) - - the other reboot type (L{constants.INSTANCE_REBOOT_HARD}) - is not accepted here, since that mode is handled - differently - @rtype: boolean - @return: the success of the operation + - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is + not accepted here, since that mode is handled differently, in + cmdlib, and translates into full stop and start of the + instance (instead of a call_instance_reboot RPC) + @rtype: None """ running_instances = GetInstanceList([instance.hypervisor]) @@ -1005,17 +1039,13 @@ def InstanceReboot(instance, reboot_type): _Fail("Failed to soft reboot instance %s: %s", instance.name, err) elif reboot_type == constants.INSTANCE_REBOOT_HARD: try: - stop_result = InstanceShutdown(instance) - if not stop_result[0]: - return stop_result + InstanceShutdown(instance) return StartInstance(instance) except errors.HypervisorError, err: _Fail("Failed to hard reboot instance %s: %s", instance.name, err) else: _Fail("Invalid reboot_type received: %s", reboot_type) - return (True, "Reboot successful") - def MigrationInfo(instance): """Gather information about an instance to be migrated. @@ -1029,7 +1059,7 @@ def MigrationInfo(instance): info = hyper.MigrationInfo(instance) except errors.HypervisorError, err: _Fail("Failed to fetch migration information: %s", err, exc=True) - return (True, info) + return info def AcceptInstance(instance, info, target): @@ -1048,7 +1078,6 @@ def AcceptInstance(instance, info, target): hyper.AcceptInstance(instance, info, target) except errors.HypervisorError, err: _Fail("Failed to accept instance: %s", err, exc=True) - return (True, "Accept successfull") def FinalizeMigration(instance, info, success): @@ -1067,7 +1096,6 @@ def FinalizeMigration(instance, info, success): hyper.FinalizeMigration(instance, info, success) except errors.HypervisorError, err: _Fail("Failed to finalize migration: %s", err, exc=True) - return (True, "Migration Finalized") def MigrateInstance(instance, target, live): @@ -1092,7 +1120,6 @@ def MigrateInstance(instance, target, live): hyper.MigrateInstance(instance.name, target, live) except errors.HypervisorError, err: _Fail("Failed to migrate instance: %s", err, exc=True) - return (True, "Migration successfull") def BlockdevCreate(disk, size, owner, on_primary, info): @@ -1133,7 +1160,7 @@ def BlockdevCreate(disk, size, owner, on_primary, info): clist.append(crdev) try: - device = bdev.Create(disk.dev_type, disk.physical_id, clist, size) + device = bdev.Create(disk.dev_type, disk.physical_id, clist, disk.size) except errors.BlockDeviceError, err: _Fail("Can't create block device: %s", err) @@ -1153,8 +1180,7 @@ def BlockdevCreate(disk, size, owner, on_primary, info): device.SetInfo(info) - physical_id = device.unique_id - return True, physical_id + return device.unique_id def BlockdevRemove(disk): @@ -1169,7 +1195,6 @@ def BlockdevRemove(disk): """ msgs = [] - result = True try: rdev = _RecursiveFindBD(disk) except errors.BlockDeviceError, err: @@ -1182,22 +1207,19 @@ def BlockdevRemove(disk): rdev.Remove() except errors.BlockDeviceError, err: msgs.append(str(err)) - result = False - if result: + if not msgs: DevCacheManager.RemoveCache(r_path) if disk.children: for child in disk.children: - c_status, c_msg = BlockdevRemove(child) - result = result and c_status - if c_msg: # not an empty message - msgs.append(c_msg) + try: + BlockdevRemove(child) + except RPCFail, err: + msgs.append(str(err)) - if not result: + if msgs: _Fail("; ".join(msgs)) - return True, None - def _RecursiveAssembleBD(disk, owner, as_primary): """Activate a block device for an instance. @@ -1240,7 +1262,7 @@ def _RecursiveAssembleBD(disk, owner, as_primary): children.append(cdev) if as_primary or disk.AssembleOnSecondary(): - r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children) + r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children, disk.size) r_dev.SetSyncSpeed(constants.SYNC_SPEED) result = r_dev if as_primary or disk.OpenOnSecondary(): @@ -1270,13 +1292,13 @@ def BlockdevAssemble(disk, owner, as_primary): except errors.BlockDeviceError, err: _Fail("Error while assembling disk: %s", err, exc=True) - return True, result + return result def BlockdevShutdown(disk): """Shut down a block device. - First, if the device is assembled (Attach() is successfull), then + First, if the device is assembled (Attach() is successful), then the device is shutdown. Then the children of the device are shutdown. @@ -1287,12 +1309,10 @@ def BlockdevShutdown(disk): @type disk: L{objects.Disk} @param disk: the description of the disk we should shutdown - @rtype: boolean - @return: the success of the operation + @rtype: None """ msgs = [] - result = True r_dev = _RecursiveFindBD(disk) if r_dev is not None: r_path = r_dev.dev_path @@ -1301,18 +1321,16 @@ def BlockdevShutdown(disk): DevCacheManager.RemoveCache(r_path) except errors.BlockDeviceError, err: msgs.append(str(err)) - result = False if disk.children: for child in disk.children: - c_status, c_msg = BlockdevShutdown(child) - result = result and c_status - if c_msg: # not an empty message - msgs.append(c_msg) + try: + BlockdevShutdown(child) + except RPCFail, err: + msgs.append(str(err)) - if not result: + if msgs: _Fail("; ".join(msgs)) - return (True, None) def BlockdevAddchildren(parent_cdev, new_cdevs): @@ -1322,8 +1340,7 @@ def BlockdevAddchildren(parent_cdev, new_cdevs): @param parent_cdev: the disk to which we should add children @type new_cdevs: list of L{objects.Disk} @param new_cdevs: the list of children which we should add - @rtype: boolean - @return: the success of the operation + @rtype: None """ parent_bdev = _RecursiveFindBD(parent_cdev) @@ -1333,7 +1350,6 @@ def BlockdevAddchildren(parent_cdev, new_cdevs): if new_bdevs.count(None) > 0: _Fail("Can't find new device(s) to add: %s:%s", new_bdevs, new_cdevs) parent_bdev.AddChildren(new_bdevs) - return (True, None) def BlockdevRemovechildren(parent_cdev, new_cdevs): @@ -1343,8 +1359,7 @@ def BlockdevRemovechildren(parent_cdev, new_cdevs): @param parent_cdev: the disk from which we should remove children @type new_cdevs: list of L{objects.Disk} @param new_cdevs: the list of children which we should remove - @rtype: boolean - @return: the success of the operation + @rtype: None """ parent_bdev = _RecursiveFindBD(parent_cdev) @@ -1362,7 +1377,6 @@ def BlockdevRemovechildren(parent_cdev, new_cdevs): else: devs.append(rpath) parent_bdev.RemoveChildren(devs) - return (True, None) def BlockdevGetmirrorstatus(disks): @@ -1383,14 +1397,16 @@ def BlockdevGetmirrorstatus(disks): rbd = _RecursiveFindBD(dsk) if rbd is None: _Fail("Can't find device %s", dsk) + stats.append(rbd.CombinedSyncStatus()) - return True, stats + + return stats def _RecursiveFindBD(disk): """Check if a device is activated. - If so, return informations about the real device. + If so, return information about the real device. @type disk: L{objects.Disk} @param disk: the disk object we need to find @@ -1404,29 +1420,104 @@ def _RecursiveFindBD(disk): for chdisk in disk.children: children.append(_RecursiveFindBD(chdisk)) - return bdev.FindDevice(disk.dev_type, disk.physical_id, children) + return bdev.FindDevice(disk.dev_type, disk.physical_id, children, disk.size) def BlockdevFind(disk): """Check if a device is activated. - If it is, return informations about the real device. + If it is, return information about the real device. @type disk: L{objects.Disk} @param disk: the disk to find - @rtype: None or tuple - @return: None if the disk cannot be found, otherwise a - tuple (device_path, major, minor, sync_percent, - estimated_time, is_degraded) + @rtype: None or objects.BlockDevStatus + @return: None if the disk cannot be found, otherwise a the current + information """ try: rbd = _RecursiveFindBD(disk) except errors.BlockDeviceError, err: _Fail("Failed to find device: %s", err, exc=True) + if rbd is None: - return (True, None) - return (True, (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus()) + return None + + return rbd.GetSyncStatus() + + +def BlockdevGetsize(disks): + """Computes the size of the given disks. + + If a disk is not found, returns None instead. + + @type disks: list of L{objects.Disk} + @param disks: the list of disk to compute the size for + @rtype: list + @return: list with elements None if the disk cannot be found, + otherwise the size + + """ + result = [] + for cf in disks: + try: + rbd = _RecursiveFindBD(cf) + except errors.BlockDeviceError, err: + result.append(None) + continue + if rbd is None: + result.append(None) + else: + result.append(rbd.GetActualSize()) + return result + + +def BlockdevExport(disk, dest_node, dest_path, cluster_name): + """Export a block device to a remote node. + + @type disk: L{objects.Disk} + @param disk: the description of the disk to export + @type dest_node: str + @param dest_node: the destination node to export to + @type dest_path: str + @param dest_path: the destination path on the target node + @type cluster_name: str + @param cluster_name: the cluster name, needed for SSH hostalias + @rtype: None + + """ + real_disk = _RecursiveFindBD(disk) + if real_disk is None: + _Fail("Block device '%s' is not set up", disk) + + real_disk.Open() + + # the block size on the read dd is 1MiB to match our units + expcmd = utils.BuildShellCmd("set -e; set -o pipefail; " + "dd if=%s bs=1048576 count=%s", + real_disk.dev_path, str(disk.size)) + + # we set here a smaller block size as, due to ssh buffering, more + # than 64-128k will mostly ignored; we use nocreat to fail if the + # device is not already there or we pass a wrong path; we use + # notrunc to no attempt truncate on an LV device; we use oflag=dsync + # to not buffer too much memory; this means that at best, we flush + # every 64k, which will not be very fast + destcmd = utils.BuildShellCmd("dd of=%s conv=nocreat,notrunc bs=65536" + " oflag=dsync", dest_path) + + remotecmd = _GetSshRunner(cluster_name).BuildCmd(dest_node, + constants.GANETI_RUNAS, + destcmd) + + # all commands have been checked, so we're safe to combine them + command = '|'.join([expcmd, utils.ShellQuoteArgs(remotecmd)]) + + result = utils.RunCmd(["bash", "-c", command]) + + if result.failed: + _Fail("Disk copy command '%s' returned error: %s" + " output: %s", command, result.fail_reason, result.output) def UploadFile(file_name, data, mode, uid, gid, atime, mtime): @@ -1449,28 +1540,13 @@ def UploadFile(file_name, data, mode, uid, gid, atime, mtime): @param atime: the atime to set on the file (can be None) @type mtime: float @param mtime: the mtime to set on the file (can be None) - @rtype: boolean - @return: the success of the operation; errors are logged - in the node daemon log + @rtype: None """ if not os.path.isabs(file_name): _Fail("Filename passed to UploadFile is not absolute: '%s'", file_name) - allowed_files = set([ - constants.CLUSTER_CONF_FILE, - constants.ETC_HOSTS, - constants.SSH_KNOWN_HOSTS_FILE, - constants.VNC_PASSWORD_FILE, - constants.RAPI_CERT_FILE, - constants.RAPI_USERS_FILE, - ]) - - for hv_name in constants.HYPER_TYPES: - hv_class = hypervisor.GetHypervisor(hv_name) - allowed_files.update(hv_class.GetAncillaryFiles()) - - if file_name not in allowed_files: + if file_name not in _ALLOWED_UPLOAD_FILES: _Fail("Filename passed to UploadFile not in allowed upload targets: '%s'", file_name) @@ -1478,7 +1554,6 @@ def UploadFile(file_name, data, mode, uid, gid, atime, mtime): utils.WriteFile(file_name, data=raw_data, mode=mode, uid=uid, gid=gid, atime=atime, mtime=mtime) - return (True, "success") def WriteSsconfFiles(values): @@ -1488,7 +1563,6 @@ def WriteSsconfFiles(values): """ ssconf.SimpleStore().WriteFiles(values) - return True, None def _ErrnoOrStr(err): @@ -1509,7 +1583,7 @@ def _ErrnoOrStr(err): return detail -def _OSOndiskVersion(name, os_dir): +def _OSOndiskAPIVersion(name, os_dir): """Compute and return the API version of a given OS. This function will try to read the API version of the OS given by @@ -1524,31 +1598,26 @@ def _OSOndiskVersion(name, os_dir): data holding either the vaid versions or an error message """ - api_file = os.path.sep.join([os_dir, "ganeti_api_version"]) + api_file = os.path.sep.join([os_dir, constants.OS_API_FILE]) try: st = os.stat(api_file) except EnvironmentError, err: - return False, ("Required file 'ganeti_api_version' file not" - " found under path %s: %s" % (os_dir, _ErrnoOrStr(err))) + return False, ("Required file '%s' not found under path %s: %s" % + (constants.OS_API_FILE, os_dir, _ErrnoOrStr(err))) if not stat.S_ISREG(stat.S_IFMT(st.st_mode)): - return False, ("File 'ganeti_api_version' file at %s is not" - " a regular file" % os_dir) + return False, ("File '%s' in %s is not a regular file" % + (constants.OS_API_FILE, os_dir)) try: - f = open(api_file) - try: - api_versions = f.readlines() - finally: - f.close() + api_versions = utils.ReadFile(api_file).splitlines() except EnvironmentError, err: return False, ("Error while reading the API version file at %s: %s" % (api_file, _ErrnoOrStr(err))) - api_versions = [version.strip() for version in api_versions] try: - api_versions = [int(version) for version in api_versions] + api_versions = [int(version.strip()) for version in api_versions] except (TypeError, ValueError), err: return False, ("API version(s) can't be converted to integer: %s" % str(err)) @@ -1581,7 +1650,7 @@ def DiagnoseOS(top_dirs=None): try: f_names = utils.ListVisibleFiles(dir_name) except EnvironmentError, err: - logging.exception("Can't list the OS directory %s", dir_name) + logging.exception("Can't list the OS directory %s: %s", dir_name, err) break for name in f_names: os_path = os.path.sep.join([dir_name, name]) @@ -1592,7 +1661,7 @@ def DiagnoseOS(top_dirs=None): diagnose = os_inst result.append((name, os_path, status, diagnose)) - return True, result + return result def _TryOSFromDisk(name, base_dir=None): @@ -1616,14 +1685,14 @@ def _TryOSFromDisk(name, base_dir=None): else: os_dir = os.path.sep.join([base_dir, name]) - status, api_versions = _OSOndiskVersion(name, os_dir) + status, api_versions = _OSOndiskAPIVersion(name, os_dir) if not status: # push the error up return status, api_versions - if constants.OS_API_VERSION not in api_versions: + if not constants.OS_API_VERSIONS.intersection(api_versions): return False, ("API version mismatch for path '%s': found %s, want %s." % - (os_dir, api_versions, constants.OS_API_VERSION)) + (os_dir, api_versions, constants.OS_API_VERSIONS)) # OS Scripts dictionary, we will populate it with the actual script names os_scripts = dict.fromkeys(constants.OS_SCRIPTS) @@ -1680,11 +1749,13 @@ def OSFromDisk(name, base_dir=None): return payload -def OSEnvironment(instance, debug=0): +def OSEnvironment(instance, os, debug=0): """Calculate the environment for an os script. @type instance: L{objects.Instance} @param instance: target instance for the os script run + @type os: L{objects.OS} + @param os: operating system for which the environment is being built @type debug: integer @param debug: debug level (0 or 1, for OS Api 10) @rtype: dict @@ -1694,7 +1765,8 @@ def OSEnvironment(instance, debug=0): """ result = {} - result['OS_API_VERSION'] = '%d' % constants.OS_API_VERSION + api_version = max(constants.OS_API_VERSIONS.intersection(os.api_versions)) + result['OS_API_VERSION'] = '%d' % api_version result['INSTANCE_NAME'] = instance.name result['INSTANCE_OS'] = instance.os result['HYPERVISOR'] = instance.hypervisor @@ -1730,6 +1802,10 @@ def OSEnvironment(instance, debug=0): result['NIC_%d_FRONTEND_TYPE' % idx] = \ instance.hvparams[constants.HV_NIC_TYPE] + for source, kind in [(instance.beparams, "BE"), (instance.hvparams, "HV")]: + for key, value in source.items(): + result["INSTANCE_%s_%s" % (kind, key)] = str(value) + return result def BlockdevGrow(disk, amount): @@ -1755,8 +1831,6 @@ def BlockdevGrow(disk, amount): except errors.BlockDeviceError, err: _Fail("Failed to grow block device: %s", err, exc=True) - return True, None - def BlockdevSnapshot(disk): """Create a snapshot copy of a block device. @@ -1784,7 +1858,7 @@ def BlockdevSnapshot(disk): r_dev = _RecursiveFindBD(disk) if r_dev is not None: # let's stay on the safe side and ask for the full size, for now - return True, r_dev.Snapshot(disk.size) + return r_dev.Snapshot(disk.size) else: _Fail("Cannot find block device %s", disk) else: @@ -1806,13 +1880,12 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx): @type idx: int @param idx: the index of the disk in the instance's disk list, used to export to the OS scripts environment - @rtype: boolean - @return: the success of the operation + @rtype: None """ - export_env = OSEnvironment(instance) - inst_os = OSFromDisk(instance.os) + export_env = OSEnvironment(instance, inst_os) + export_script = inst_os.export_script logfile = "%s/exp-%s-%s-%s.log" % (constants.LOG_OS_DIR, inst_os.name, @@ -1834,8 +1907,8 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx): # the target command is built out of three individual commands, # which are joined by pipes; we check each individual command for # valid parameters - expcmd = utils.BuildShellCmd("cd %s; %s 2>%s", inst_os.path, - export_script, logfile) + expcmd = utils.BuildShellCmd("set -e; set -o pipefail; cd %s; %s 2>%s", + inst_os.path, export_script, logfile) comprcmd = "gzip" @@ -1848,14 +1921,12 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx): # all commands have been checked, so we're safe to combine them command = '|'.join([expcmd, comprcmd, utils.ShellQuoteArgs(remotecmd)]) - result = utils.RunCmd(command, env=export_env) + result = utils.RunCmd(["bash", "-c", command], env=export_env) if result.failed: _Fail("OS snapshot export command '%s' returned error: %s" " output: %s", command, result.fail_reason, result.output) - return (True, None) - def FinalizeExport(instance, snap_disks): """Write out the export configuration information. @@ -1867,8 +1938,7 @@ def FinalizeExport(instance, snap_disks): @param snap_disks: list of snapshot block devices, which will be used to get the actual name of the dump file - @rtype: boolean - @return: the success of the operation + @rtype: None """ destdir = os.path.join(constants.EXPORT_DIR, instance.name + ".new") @@ -1920,8 +1990,6 @@ def FinalizeExport(instance, snap_disks): shutil.rmtree(finaldestdir, True) shutil.move(destdir, finaldestdir) - return True, None - def ExportInfo(dest): """Get export configuration information. @@ -1943,7 +2011,7 @@ def ExportInfo(dest): not config.has_section(constants.INISECT_INS)): _Fail("Export info file doesn't have the required fields") - return True, config.Dumps() + return config.Dumps() def ImportOSIntoInstance(instance, src_node, src_images, cluster_name): @@ -1959,8 +2027,8 @@ def ImportOSIntoInstance(instance, src_node, src_images, cluster_name): @return: each boolean represent the success of importing the n-th disk """ - import_env = OSEnvironment(instance) inst_os = OSFromDisk(instance.os) + import_env = OSEnvironment(instance, inst_os) import_script = inst_os.import_script logfile = "%s/import-%s-%s-%s.log" % (constants.LOG_OS_DIR, instance.os, @@ -1992,7 +2060,6 @@ def ImportOSIntoInstance(instance, src_node, src_images, cluster_name): if final_result: _Fail("; ".join(final_result), log=False) - return True, None def ListExports(): @@ -2003,7 +2070,7 @@ def ListExports(): """ if os.path.isdir(constants.EXPORT_DIR): - return True, utils.ListVisibleFiles(constants.EXPORT_DIR) + return utils.ListVisibleFiles(constants.EXPORT_DIR) else: _Fail("No exports directory") @@ -2013,8 +2080,7 @@ def RemoveExport(export): @type export: str @param export: the name of the export to remove - @rtype: boolean - @return: the success of the operation + @rtype: None """ target = os.path.join(constants.EXPORT_DIR, export) @@ -2024,8 +2090,6 @@ def RemoveExport(export): except EnvironmentError, err: _Fail("Error while removing the export: %s", err, exc=True) - return True, None - def BlockdevRename(devlist): """Rename a list of block devices. @@ -2066,7 +2130,6 @@ def BlockdevRename(devlist): result = False if not result: _Fail("; ".join(msgs)) - return True, None def _TransformFileStorageDir(file_storage_dir): @@ -2114,7 +2177,6 @@ def CreateFileStorageDir(file_storage_dir): except OSError, err: _Fail("Cannot create file storage directory '%s': %s", file_storage_dir, err, exc=True) - return True, None def RemoveFileStorageDir(file_storage_dir): @@ -2126,7 +2188,7 @@ def RemoveFileStorageDir(file_storage_dir): @param file_storage_dir: the directory we should cleanup @rtype: tuple (success,) @return: tuple of one element, C{success}, denoting - whether the operation was successfull + whether the operation was successful """ file_storage_dir = _TransformFileStorageDir(file_storage_dir) @@ -2141,8 +2203,6 @@ def RemoveFileStorageDir(file_storage_dir): _Fail("Cannot remove file storage directory '%s': %s", file_storage_dir, err) - return True, None - def RenameFileStorageDir(old_file_storage_dir, new_file_storage_dir): """Rename the file storage directory. @@ -2172,7 +2232,6 @@ def RenameFileStorageDir(old_file_storage_dir, new_file_storage_dir): if os.path.exists(old_file_storage_dir): _Fail("Cannot rename '%s' to '%s': both locations exist", old_file_storage_dir, new_file_storage_dir) - return True, None def _EnsureJobQueueFile(file_name): @@ -2211,8 +2270,6 @@ def JobQueueUpdate(file_name, content): # Write and replace the file atomically utils.WriteFile(file_name, data=_Decompress(content)) - return True, None - def JobQueueRename(old, new): """Renames a job queue file. @@ -2232,8 +2289,6 @@ def JobQueueRename(old, new): utils.RenameFile(old, new, mkdir=True) - return True, None - def JobQueueSetDrainFlag(drain_flag): """Set the drain flag for the queue. @@ -2252,8 +2307,6 @@ def JobQueueSetDrainFlag(drain_flag): else: utils.RemoveFile(constants.JOB_QUEUE_DRAIN_FILE) - return True, None - def BlockdevClose(instance_name, disks): """Closes the given block devices. @@ -2290,7 +2343,6 @@ def BlockdevClose(instance_name, disks): else: if instance_name: _RemoveBlockDevLinks(instance_name, disks) - return (True, "All devices secondary") def ValidateHVParams(hvname, hvparams): @@ -2300,17 +2352,12 @@ def ValidateHVParams(hvname, hvparams): @param hvname: the hypervisor name @type hvparams: dict @param hvparams: the hypervisor parameters to be validated - @rtype: tuple (success, message) - @return: a tuple of success and message, where success - indicates the succes of the operation, and message - which will contain the error details in case we - failed + @rtype: None """ try: hv_type = hypervisor.GetHypervisor(hvname) hv_type.ValidateParameters(hvparams) - return (True, "Validation passed") except errors.HypervisorError, err: _Fail(str(err), log=False) @@ -2323,16 +2370,16 @@ def DemoteFromMC(): master, myself = ssconf.GetMasterAndMyself() if master == myself: _Fail("ssconf status shows I'm the master node, will not demote") - pid_file = utils.DaemonPidFileName(constants.MASTERD_PID) + pid_file = utils.DaemonPidFileName(constants.MASTERD) if utils.IsProcessAlive(utils.ReadPidFile(pid_file)): _Fail("The master daemon is running, will not demote") try: - utils.CreateBackup(constants.CLUSTER_CONF_FILE) + if os.path.isfile(constants.CLUSTER_CONF_FILE): + utils.CreateBackup(constants.CLUSTER_CONF_FILE) except EnvironmentError, err: if err.errno != errno.ENOENT: _Fail("Error while backing up cluster file: %s", err, exc=True) utils.RemoveFile(constants.CLUSTER_CONF_FILE) - return (True, "Done") def _FindDisks(nodes_ip, disks): @@ -2367,7 +2414,6 @@ def DrbdDisconnectNet(nodes_ip, disks): except errors.BlockDeviceError, err: _Fail("Can't change network configuration to standalone mode: %s", err, exc=True) - return (True, "All disks are now disconnected") def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster): @@ -2408,7 +2454,7 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster): # standalone, even though this should not happen with the # new staged way of changing disk configs try: - rd.ReAttachNet(multimaster) + rd.AttachNet(multimaster) except errors.BlockDeviceError, err: _Fail("Can't change network configuration: %s", err) if all_connected: @@ -2424,11 +2470,6 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster): rd.Open() except errors.BlockDeviceError, err: _Fail("Can't change to primary mode: %s", err) - if multimaster: - msg = "multi-master and primary" - else: - msg = "single-master" - return (True, "Disks are now configured as %s" % msg) def DrbdWaitSync(nodes_ip, disks): @@ -2439,7 +2480,6 @@ def DrbdWaitSync(nodes_ip, disks): min_resync = 100 alldone = True - failure = False for rd in bdevs: stats = rd.GetProcStatus() if not (stats.is_connected or stats.is_in_resync): @@ -2448,7 +2488,7 @@ def DrbdWaitSync(nodes_ip, disks): if stats.sync_percent is not None: min_resync = min(min_resync, stats.sync_percent) - return (True, (alldone, min_resync)) + return (alldone, min_resync) def PowercycleNode(hypervisor_type): @@ -2461,11 +2501,11 @@ def PowercycleNode(hypervisor_type): hyper = hypervisor.GetHypervisor(hypervisor_type) try: pid = os.fork() - except OSError, err: + except OSError: # if we can't fork, we'll pretend that we're in the child process pid = 0 if pid > 0: - return (True, "Reboot scheduled in 5 seconds") + return "Reboot scheduled in 5 seconds" time.sleep(5) hyper.PowercycleNode() @@ -2576,9 +2616,9 @@ class HooksRunner(object): dir_name = "%s/%s" % (self._BASE_DIR, subdir) try: dir_contents = utils.ListVisibleFiles(dir_name) - except OSError, err: + except OSError: # FIXME: must log output in case of failures - return True, rr + return rr # we use the standard python sort order, # so 00name is the recommended naming scheme @@ -2597,7 +2637,7 @@ class HooksRunner(object): rrval = constants.HKR_SUCCESS rr.append(("%s/%s" % (subdir, relname), rrval, output)) - return True, rr + return rr class IAllocatorRunner(object): @@ -2637,7 +2677,7 @@ class IAllocatorRunner(object): finally: os.unlink(fin_name) - return True, result.stdout + return result.stdout class DevCacheManager(object): @@ -2698,7 +2738,7 @@ class DevCacheManager(object): try: utils.WriteFile(fpath, data=fdata) except EnvironmentError, err: - logging.exception("Can't update bdev cache for %s", dev_path) + logging.exception("Can't update bdev cache for %s: %s", dev_path, err) @classmethod def RemoveCache(cls, dev_path): @@ -2720,4 +2760,4 @@ class DevCacheManager(object): try: utils.RemoveFile(fpath) except EnvironmentError, err: - logging.exception("Can't update bdev cache for %s", dev_path) + logging.exception("Can't update bdev cache for %s: %s", dev_path, err)