X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/6ddc95ece8689c62e13dc37d6c964bdc53cb5849..a66bd91b7074b62884839ebff79bce9e237b39f1:/lib/rpc.py?ds=sidebyside diff --git a/lib/rpc.py b/lib/rpc.py index 9fd5459..8101538 100644 --- a/lib/rpc.py +++ b/lib/rpc.py @@ -31,14 +31,121 @@ # R0904: Too many public methods import os -import socket -import httplib import logging +import zlib +import base64 from ganeti import utils from ganeti import objects from ganeti import http from ganeti import serializer +from ganeti import constants +from ganeti import errors + +import ganeti.http.client + + +# Module level variable +_http_manager = None + + +def Init(): + """Initializes the module-global HTTP client manager. + + Must be called before using any RPC function. + + """ + global _http_manager + + assert not _http_manager, "RPC module initialized more than once" + + _http_manager = http.client.HttpClientManager() + + +def Shutdown(): + """Stops the module-global HTTP client manager. + + Must be called before quitting the program. + + """ + global _http_manager + + if _http_manager: + _http_manager.Shutdown() + _http_manager = None + + +class RpcResult(object): + """RPC Result class. + + This class holds an RPC result. It is needed since in multi-node + calls we can't raise an exception just because one one out of many + failed, and therefore we use this class to encapsulate the result. + + @ivar data: the data payload, for successful results, or None + @ivar call: the name of the RPC call + @ivar node: the name of the node to which we made the call + @ivar offline: whether the operation failed because the node was + offline, as opposed to actual failure; offline=True will always + imply failed=True, in order to allow simpler checking if + the user doesn't care about the exact failure mode + @ivar fail_msg: the error message if the call failed + + """ + def __init__(self, data=None, failed=False, offline=False, + call=None, node=None): + self.offline = offline + self.call = call + self.node = node + if offline: + self.fail_msg = "Node is marked offline" + self.data = self.payload = None + elif failed: + self.fail_msg = self._EnsureErr(data) + self.data = self.payload = None + else: + self.data = data + if not isinstance(self.data, (tuple, list)): + self.fail_msg = ("RPC layer error: invalid result type (%s)" % + type(self.data)) + elif len(data) != 2: + self.fail_msg = ("RPC layer error: invalid result length (%d), " + "expected 2" % len(self.data)) + elif not self.data[0]: + self.fail_msg = self._EnsureErr(self.data[1]) + else: + # finally success + self.fail_msg = None + self.payload = data[1] + + @staticmethod + def _EnsureErr(val): + """Helper to ensure we return a 'True' value for error.""" + if val: + return val + else: + return "No error information" + + def Raise(self, msg, prereq=False): + """If the result has failed, raise an OpExecError. + + This is used so that LU code doesn't have to check for each + result, but instead can call this function. + + """ + if not self.fail_msg: + return + + if not msg: # one could pass None for default message + msg = ("Call '%s' to node '%s' has failed: %s" % + (self.call, self.node, self.fail_msg)) + else: + msg = "%s: %s" % (msg, self.fail_msg) + if prereq: + ec = errors.OpPrereqError + else: + ec = errors.OpExecError + raise ec(msg) class Client: @@ -48,20 +155,21 @@ class Client: list of nodes, will contact (in parallel) all nodes, and return a dict of results (key: node name, value: result). - One current bug is that generic failure is still signalled by + One current bug is that generic failure is still signaled by 'False' result, which is not good. This overloading of values can cause bugs. """ - def __init__(self, procedure, args): + def __init__(self, procedure, body, port): self.procedure = procedure - self.args = args - self.body = serializer.DumpJson(args, indent=False) - - self.port = utils.GetNodeDaemonPort() - self.nodepw = utils.GetNodeDaemonPassword() + self.body = body + self.port = port self.nc = {} + self._ssl_params = \ + http.HttpSslParams(ssl_key_path=constants.SSL_CERT_FILE, + ssl_cert_path=constants.SSL_CERT_FILE) + def ConnectList(self, node_list, address_list=None): """Add a list of nodes to the target nodes. @@ -92,38 +200,42 @@ class Client: if address is None: address = name - self.nc[name] = http.HttpClientRequest(address, self.port, http.HTTP_PUT, - "/%s" % self.procedure, - post_data=self.body) + self.nc[name] = \ + http.client.HttpClientRequest(address, self.port, http.HTTP_PUT, + "/%s" % self.procedure, + post_data=self.body, + ssl_params=self._ssl_params, + ssl_verify_peer=True) def GetResults(self): """Call nodes and return results. @rtype: list - @returns: List of RPC results + @return: List of RPC results """ - # TODO: Shared and reused manager - mgr = http.HttpClientManager() - try: - mgr.ExecRequests(self.nc.values()) - finally: - mgr.Shutdown() + assert _http_manager, "RPC module not initialized" + + _http_manager.ExecRequests(self.nc.values()) results = {} for name, req in self.nc.iteritems(): - if req.success and req.resp_status == http.HTTP_OK: - results[name] = serializer.LoadJson(req.resp_body) + if req.success and req.resp_status_code == http.HTTP_OK: + results[name] = RpcResult(data=serializer.LoadJson(req.resp_body), + node=name, call=self.procedure) continue + # TODO: Better error reporting if req.error: msg = req.error else: msg = req.resp_body - logging.error("RPC error from node %s: %s", name, msg) - results[name] = False + logging.error("RPC error in %s from node %s: %s", + self.procedure, name, msg) + results[name] = RpcResult(data=msg, failed=True, node=name, + call=self.procedure) return results @@ -140,8 +252,9 @@ class RpcRunner(object): """ self._cfg = cfg + self.port = utils.GetDaemonPort(constants.NODED) - def _InstDict(self, instance): + def _InstDict(self, instance, hvp=None, bep=None): """Convert the given instance to a dict. This is done via the instance's ToDict() method and additionally @@ -149,6 +262,10 @@ class RpcRunner(object): @type instance: L{objects.Instance} @param instance: an Instance object + @type hvp: dict or None + @param hvp: a dictionary with overridden hypervisor parameters + @type bep: dict or None + @param bep: a dictionary with overridden backend parameters @rtype: dict @return: the instance dict, with the hvparams filled with the cluster defaults @@ -157,84 +274,142 @@ class RpcRunner(object): idict = instance.ToDict() cluster = self._cfg.GetClusterInfo() idict["hvparams"] = cluster.FillHV(instance) + if hvp is not None: + idict["hvparams"].update(hvp) idict["beparams"] = cluster.FillBE(instance) + if bep is not None: + idict["beparams"].update(bep) + for nic in idict["nics"]: + nic['nicparams'] = objects.FillDict( + cluster.nicparams[constants.PP_DEFAULT], + nic['nicparams']) return idict - def _ConnectList(self, client, node_list): + def _ConnectList(self, client, node_list, call): """Helper for computing node addresses. - @type client: L{Client} + @type client: L{ganeti.rpc.Client} @param client: a C{Client} instance @type node_list: list @param node_list: the node list we should connect + @type call: string + @param call: the name of the remote procedure call, for filling in + correctly any eventual offline nodes' results """ all_nodes = self._cfg.GetAllNodesInfo() + name_list = [] addr_list = [] + skip_dict = {} for node in node_list: if node in all_nodes: + if all_nodes[node].offline: + skip_dict[node] = RpcResult(node=node, offline=True, call=call) + continue val = all_nodes[node].primary_ip else: val = None addr_list.append(val) - client.ConnectList(node_list, address_list=addr_list) + name_list.append(node) + if name_list: + client.ConnectList(name_list, address_list=addr_list) + return skip_dict - def _ConnectNode(self, client, node): + def _ConnectNode(self, client, node, call): """Helper for computing one node's address. - @type client: L{Client} + @type client: L{ganeti.rpc.Client} @param client: a C{Client} instance @type node: str @param node: the node we should connect + @type call: string + @param call: the name of the remote procedure call, for filling in + correctly any eventual offline nodes' results """ node_info = self._cfg.GetNodeInfo(node) if node_info is not None: + if node_info.offline: + return RpcResult(node=node, offline=True, call=call) addr = node_info.primary_ip else: addr = None client.ConnectNode(node, address=addr) - def _MultiNodeCall(self, node_list, procedure, args, - address_list=None): - c = Client(procedure, args) - if address_list is None: - self._ConnectList(c, node_list) - else: - c.ConnectList(node_list, address_list=address_list) - return c.GetResults() + def _MultiNodeCall(self, node_list, procedure, args): + """Helper for making a multi-node call + + """ + body = serializer.DumpJson(args, indent=False) + c = Client(procedure, body, self.port) + skip_dict = self._ConnectList(c, node_list, procedure) + skip_dict.update(c.GetResults()) + return skip_dict @classmethod def _StaticMultiNodeCall(cls, node_list, procedure, args, address_list=None): - c = Client(procedure, args) + """Helper for making a multi-node static call + + """ + body = serializer.DumpJson(args, indent=False) + c = Client(procedure, body, utils.GetDaemonPort(constants.NODED)) c.ConnectList(node_list, address_list=address_list) return c.GetResults() def _SingleNodeCall(self, node, procedure, args): - """ + """Helper for making a single-node call """ - c = Client(procedure, args) - self._ConnectNode(c, node) - return c.GetResults().get(node, False) + body = serializer.DumpJson(args, indent=False) + c = Client(procedure, body, self.port) + result = self._ConnectNode(c, node, procedure) + if result is None: + # we did connect, node is not offline + result = c.GetResults()[node] + return result @classmethod def _StaticSingleNodeCall(cls, node, procedure, args): + """Helper for making a single-node static call + """ + body = serializer.DumpJson(args, indent=False) + c = Client(procedure, body, utils.GetDaemonPort(constants.NODED)) + c.ConnectNode(node) + return c.GetResults()[node] + + @staticmethod + def _Compress(data): + """Compresses a string for transport over RPC. + + Small amounts of data are not compressed. + + @type data: str + @param data: Data + @rtype: tuple + @return: Encoded data to send """ - c = Client(procedure, args) - c.ConnectNode(c, node) - return c.GetResults().get(node, False) + # Small amounts of data are not compressed + if len(data) < 512: + return (constants.RPC_ENCODING_NONE, data) - def call_volume_list(self, node_list, vg_name): + # Compress with zlib and encode in base64 + return (constants.RPC_ENCODING_ZLIB_BASE64, + base64.b64encode(zlib.compress(data, 3))) + + # + # Begin RPC calls + # + + def call_lv_list(self, node_list, vg_name): """Gets the logical volumes present in a given volume group. This is a multi-node call. """ - return self._MultiNodeCall(node_list, "volume_list", [vg_name]) + return self._MultiNodeCall(node_list, "lv_list", [vg_name]) def call_vg_list(self, node_list): """Gets the volume group list. @@ -244,6 +419,33 @@ class RpcRunner(object): """ return self._MultiNodeCall(node_list, "vg_list", []) + def call_storage_list(self, node_list, su_name, su_args, name, fields): + """Get list of storage units. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "storage_list", + [su_name, su_args, name, fields]) + + def call_storage_modify(self, node, su_name, su_args, name, changes): + """Modify a storage unit. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "storage_modify", + [su_name, su_args, name, changes]) + + def call_storage_execute(self, node, su_name, su_args, name, op): + """Executes an operation on a storage unit. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "storage_execute", + [su_name, su_args, name, op]) + def call_bridges_exist(self, node, bridges_list): """Checks if a node has all the bridges given. @@ -256,14 +458,14 @@ class RpcRunner(object): """ return self._SingleNodeCall(node, "bridges_exist", [bridges_list]) - def call_instance_start(self, node, instance, extra_args): + def call_instance_start(self, node, instance, hvp, bep): """Starts an instance. This is a single-node call. """ - return self._SingleNodeCall(node, "instance_start", - [self._InstDict(instance), extra_args]) + idict = self._InstDict(instance, hvp=hvp, bep=bep) + return self._SingleNodeCall(node, "instance_start", [idict]) def call_instance_shutdown(self, node, instance): """Stops an instance. @@ -274,6 +476,59 @@ class RpcRunner(object): return self._SingleNodeCall(node, "instance_shutdown", [self._InstDict(instance)]) + def call_migration_info(self, node, instance): + """Gather the information necessary to prepare an instance migration. + + This is a single-node call. + + @type node: string + @param node: the node on which the instance is currently running + @type instance: C{objects.Instance} + @param instance: the instance definition + + """ + return self._SingleNodeCall(node, "migration_info", + [self._InstDict(instance)]) + + def call_accept_instance(self, node, instance, info, target): + """Prepare a node to accept an instance. + + This is a single-node call. + + @type node: string + @param node: the target node for the migration + @type instance: C{objects.Instance} + @param instance: the instance definition + @type info: opaque/hypervisor specific (string/data) + @param info: result for the call_migration_info call + @type target: string + @param target: target hostname (usually ip address) (on the node itself) + + """ + return self._SingleNodeCall(node, "accept_instance", + [self._InstDict(instance), info, target]) + + def call_finalize_migration(self, node, instance, info, success): + """Finalize any target-node migration specific operation. + + This is called both in case of a successful migration and in case of error + (in which case it should abort the migration). + + This is a single-node call. + + @type node: string + @param node: the target node for the migration + @type instance: C{objects.Instance} + @param instance: the instance definition + @type info: opaque/hypervisor specific (string/data) + @param info: result for the call_migration_info call + @type success: boolean + @param success: whether the migration was a success or a failure + + """ + return self._SingleNodeCall(node, "finalize_migration", + [self._InstDict(instance), info, success]) + def call_instance_migrate(self, node, instance, target, live): """Migrate an instance. @@ -293,24 +548,23 @@ class RpcRunner(object): return self._SingleNodeCall(node, "instance_migrate", [self._InstDict(instance), target, live]) - def call_instance_reboot(self, node, instance, reboot_type, extra_args): + def call_instance_reboot(self, node, instance, reboot_type): """Reboots an instance. This is a single-node call. """ return self._SingleNodeCall(node, "instance_reboot", - [self._InstDict(instance), reboot_type, - extra_args]) + [self._InstDict(instance), reboot_type]) - def call_instance_os_add(self, node, inst): + def call_instance_os_add(self, node, inst, reinstall): """Installs an OS on the given instance. This is a single-node call. """ return self._SingleNodeCall(node, "instance_os_add", - [self._InstDict(inst)]) + [self._InstDict(inst), reinstall]) def call_instance_run_rename(self, node, inst, old_name): """Run the OS rename script for an instance. @@ -336,6 +590,20 @@ class RpcRunner(object): """ return self._SingleNodeCall(node, "instance_info", [instance, hname]) + def call_instance_migratable(self, node, instance): + """Checks whether the given instance can be migrated. + + This is a single-node call. + + @param node: the node to query + @type instance: L{objects.Instance} + @param instance: the instance to check + + + """ + return self._SingleNodeCall(node, "instance_migratable", + [self._InstDict(instance)]) + def call_all_instances_info(self, node_list, hypervisor_list): """Returns information about all instances on the given nodes. @@ -392,31 +660,16 @@ class RpcRunner(object): @type node_list: list @param node_list: the list of nodes to query - @type vgname: C{string} - @param vgname: the name of the volume group to ask for disk space + @type vg_name: C{string} + @param vg_name: the name of the volume group to ask for disk space information @type hypervisor_type: C{str} @param hypervisor_type: the name of the hypervisor to ask for memory information """ - retux = self._MultiNodeCall(node_list, "node_info", - [vg_name, hypervisor_type]) - - for node_name in retux: - ret = retux.get(node_name, False) - if type(ret) != dict: - logging.error("could not connect to node %s", node_name) - ret = {} - - utils.CheckDict(ret, { - 'memory_total' : '-', - 'memory_dom0' : '-', - 'memory_free' : '-', - 'vg_size' : 'node_unreachable', - 'vg_free' : '-', - }, "call_node_info") - return retux + return self._MultiNodeCall(node_list, "node_info", + [vg_name, hypervisor_type]) def call_node_add(self, node, dsa, dsapub, rsa, rsapub, ssh, sshpub): """Add a node to the cluster. @@ -437,14 +690,14 @@ class RpcRunner(object): [checkdict, cluster_name]) @classmethod - def call_node_start_master(cls, node, start_daemons): + def call_node_start_master(cls, node, start_daemons, no_voting): """Tells a node to activate itself as a master. This is a single-node call. """ return cls._StaticSingleNodeCall(node, "node_start_master", - [start_daemons]) + [start_daemons, no_voting]) @classmethod def call_node_stop_master(cls, node, stop_daemons): @@ -542,8 +795,12 @@ class RpcRunner(object): This is a single-node call. """ - return self._SingleNodeCall(node, "blockdev_getmirrorstatus", - [dsk.ToDict() for dsk in disks]) + result = self._SingleNodeCall(node, "blockdev_getmirrorstatus", + [dsk.ToDict() for dsk in disks]) + if not result.fail_msg: + result.payload = [objects.BlockDevStatus.FromDict(i) + for i in result.payload] + return result def call_blockdev_find(self, node, disk): """Request identification of a given block device. @@ -551,16 +808,57 @@ class RpcRunner(object): This is a single-node call. """ - return self._SingleNodeCall(node, "blockdev_find", [disk.ToDict()]) + result = self._SingleNodeCall(node, "blockdev_find", [disk.ToDict()]) + if not result.fail_msg and result.payload is not None: + result.payload = objects.BlockDevStatus.FromDict(result.payload) + return result - def call_blockdev_close(self, node, disks): + def call_blockdev_close(self, node, instance_name, disks): """Closes the given block devices. This is a single-node call. """ - return self._SingleNodeCall(node, "blockdev_close", - [cf.ToDict() for cf in disks]) + params = [instance_name, [cf.ToDict() for cf in disks]] + return self._SingleNodeCall(node, "blockdev_close", params) + + def call_blockdev_getsizes(self, node, disks): + """Returns the size of the given disks. + + This is a single-node call. + + """ + params = [[cf.ToDict() for cf in disks]] + return self._SingleNodeCall(node, "blockdev_getsize", params) + + def call_drbd_disconnect_net(self, node_list, nodes_ip, disks): + """Disconnects the network of the given drbd devices. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "drbd_disconnect_net", + [nodes_ip, [cf.ToDict() for cf in disks]]) + + def call_drbd_attach_net(self, node_list, nodes_ip, + disks, instance_name, multimaster): + """Disconnects the given drbd devices. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "drbd_attach_net", + [nodes_ip, [cf.ToDict() for cf in disks], + instance_name, multimaster]) + + def call_drbd_wait_sync(self, node_list, nodes_ip, disks): + """Waits for the synchronization of drbd devices is complete. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "drbd_wait_sync", + [nodes_ip, [cf.ToDict() for cf in disks]]) @classmethod def call_upload_file(cls, node_list, file_name, address_list=None): @@ -580,7 +878,8 @@ class RpcRunner(object): to optimize the RPC speed """ - data = utils.ReadFile(file_name) + file_contents = utils.ReadFile(file_name) + data = cls._Compress(file_contents) st = os.stat(file_name) params = [file_name, data, st.st_mode, st.st_uid, st.st_gid, st.st_atime, st.st_mtime] @@ -588,13 +887,13 @@ class RpcRunner(object): address_list=address_list) @classmethod - def call_write_ssconf_files(cls, node_list): + def call_write_ssconf_files(cls, node_list, values): """Write ssconf files. This is a multi-node call. """ - return cls._StaticMultiNodeCall(node_list, "write_ssconf_files", []) + return cls._StaticMultiNodeCall(node_list, "write_ssconf_files", [values]) def call_os_diagnose(self, node_list): """Request a diagnose of OS definitions. @@ -602,16 +901,7 @@ class RpcRunner(object): This is a multi-node call. """ - result = self._MultiNodeCall(node_list, "os_diagnose", []) - - new_result = {} - for node_name in result: - if result[node_name]: - nr = [objects.OS.FromDict(oss) for oss in result[node_name]] - else: - nr = [] - new_result[node_name] = nr - return new_result + return self._MultiNodeCall(node_list, "os_diagnose", []) def call_os_get(self, node, name): """Returns an OS definition. @@ -620,10 +910,9 @@ class RpcRunner(object): """ result = self._SingleNodeCall(node, "os_get", [name]) - if isinstance(result, dict): - return objects.OS.FromDict(result) - else: - return result + if not result.fail_msg and isinstance(result.data, dict): + result.data = objects.OS.FromDict(result.data) + return result def call_hooks_runner(self, node_list, hpath, phase, env): """Call the hooks runner. @@ -659,6 +948,17 @@ class RpcRunner(object): return self._SingleNodeCall(node, "blockdev_grow", [cf_bdev.ToDict(), amount]) + def call_blockdev_export(self, node, cf_bdev, + dest_node, dest_path, cluster_name): + """Export a given disk to another node. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "blockdev_export", + [cf_bdev.ToDict(), dest_node, dest_path, + cluster_name]) + def call_blockdev_snapshot(self, node, cf_bdev): """Request a snapshot of the given block device. @@ -688,7 +988,10 @@ class RpcRunner(object): """ flat_disks = [] for disk in snap_disks: - flat_disks.append(disk.ToDict()) + if isinstance(disk, bool): + flat_disks.append(disk) + else: + flat_disks.append(disk.ToDict()) return self._SingleNodeCall(node, "finalize_export", [self._InstDict(instance), flat_disks]) @@ -699,10 +1002,7 @@ class RpcRunner(object): This is a single-node call. """ - result = self._SingleNodeCall(node, "export_info", [path]) - if not result: - return result - return objects.SerializableConfigParser.Loads(str(result)) + return self._SingleNodeCall(node, "export_info", [path]) def call_instance_os_import(self, node, inst, src_node, src_images, cluster_name): @@ -751,6 +1051,24 @@ class RpcRunner(object): """ return self._MultiNodeCall(node_list, "node_volumes", []) + def call_node_demote_from_mc(self, node): + """Demote a node from the master candidate role. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "node_demote_from_mc", []) + + + def call_node_powercycle(self, node, hypervisor): + """Tries to powercycle a node. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "node_powercycle", [hypervisor]) + + def call_test_delay(self, node_list, duration): """Sleep for a fixed time on given node(s). @@ -795,7 +1113,7 @@ class RpcRunner(object): """ return cls._StaticMultiNodeCall(node_list, "jobqueue_update", - [file_name, content], + [file_name, cls._Compress(content)], address_list=address_list) @classmethod @@ -808,13 +1126,13 @@ class RpcRunner(object): return cls._StaticSingleNodeCall(node, "jobqueue_purge", []) @classmethod - def call_jobqueue_rename(cls, node_list, address_list, old, new): + def call_jobqueue_rename(cls, node_list, address_list, rename): """Rename a job queue file. This is a multi-node call. """ - return cls._StaticMultiNodeCall(node_list, "jobqueue_rename", [old, new], + return cls._StaticMultiNodeCall(node_list, "jobqueue_rename", rename, address_list=address_list) @classmethod @@ -846,6 +1164,6 @@ class RpcRunner(object): """ cluster = self._cfg.GetClusterInfo() - hv_full = cluster.FillDict(cluster.hvparams.get(hvname, {}), hvparams) + hv_full = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams) return self._MultiNodeCall(node_list, "hypervisor_validate_params", [hvname, hv_full])