X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/56e7640c5cd4826ebb7efa0eb320e8c72ed13b88..4a34c5cf5664c10a1c06e8865067b429ab0b9c71:/lib/rpc.py diff --git a/lib/rpc.py b/lib/rpc.py index 10b48b1..e28db0e 100644 --- a/lib/rpc.py +++ b/lib/rpc.py @@ -31,7 +31,6 @@ # R0904: Too many public methods import os -import socket import logging import zlib import base64 @@ -83,9 +82,9 @@ class RpcResult(object): calls we can't raise an exception just because one one out of many failed, and therefore we use this class to encapsulate the result. - @ivar data: the data payload, for successfull results, or None + @ivar data: the data payload, for successful results, or None @type failed: boolean - @ivar failed: whether the operation failed at RPC level (not + @ivar failed: whether the operation failed at transport level (not application level on the remote node) @ivar call: the name of the RPC call @ivar node: the name of the node to which we made the call @@ -93,6 +92,7 @@ class RpcResult(object): offline, as opposed to actual failure; offline=True will always imply failed=True, in order to allow simpler checking if the user doesn't care about the exact failure mode + @ivar fail_msg: the error message if the call failed """ def __init__(self, data=None, failed=False, offline=False, @@ -103,25 +103,62 @@ class RpcResult(object): self.node = node if offline: self.failed = True - self.error = "Node is marked offline" - self.data = None + self.fail_msg = "Node is marked offline" + self.data = self.payload = None elif failed: - self.error = data - self.data = None + self.fail_msg = self._EnsureErr(data) + self.data = self.payload = None else: self.data = data - self.error = None + if not isinstance(self.data, (tuple, list)): + self.fail_msg = ("RPC layer error: invalid result type (%s)" % + type(self.data)) + elif len(data) != 2: + self.fail_msg = ("RPC layer error: invalid result length (%d), " + "expected 2" % len(self.data)) + elif not self.data[0]: + self.fail_msg = self._EnsureErr(self.data[1]) + else: + # finally success + self.fail_msg = None + self.payload = data[1] - def Raise(self): + @staticmethod + def _EnsureErr(val): + """Helper to ensure we return a 'True' value for error.""" + if val: + return val + else: + return "No error information" + + def Raise(self, msg, prereq=False): """If the result has failed, raise an OpExecError. This is used so that LU code doesn't have to check for each result, but instead can call this function. """ - if self.failed: - raise errors.OpExecError("Call '%s' to node '%s' has failed: %s" % - (self.call, self.node, self.error)) + if not self.fail_msg: + return + + if not msg: # one could pass None for default message + msg = ("Call '%s' to node '%s' has failed: %s" % + (self.call, self.node, self.fail_msg)) + else: + msg = "%s: %s" % (msg, self.fail_msg) + if prereq: + ec = errors.OpPrereqError + else: + ec = errors.OpExecError + raise ec(msg) + + def RemoteFailMsg(self): + """Check if the remote procedure failed. + + @return: the fail_msg attribute + + """ + return self.fail_msg class Client: @@ -131,7 +168,7 @@ class Client: list of nodes, will contact (in parallel) all nodes, and return a dict of results (key: node name, value: result). - One current bug is that generic failure is still signalled by + One current bug is that generic failure is still signaled by 'False' result, which is not good. This overloading of values can cause bugs. @@ -187,10 +224,10 @@ class Client: """Call nodes and return results. @rtype: list - @returns: List of RPC results + @return: List of RPC results """ - assert _http_manager, "RPC module not intialized" + assert _http_manager, "RPC module not initialized" _http_manager.ExecRequests(self.nc.values()) @@ -208,7 +245,8 @@ class Client: else: msg = req.resp_body - logging.error("RPC error from node %s: %s", name, msg) + logging.error("RPC error in %s from node %s: %s", + self.procedure, name, msg) results[name] = RpcResult(data=msg, failed=True, node=name, call=self.procedure) @@ -229,7 +267,7 @@ class RpcRunner(object): self._cfg = cfg self.port = utils.GetNodeDaemonPort() - def _InstDict(self, instance): + def _InstDict(self, instance, hvp=None, bep=None): """Convert the given instance to a dict. This is done via the instance's ToDict() method and additionally @@ -237,6 +275,10 @@ class RpcRunner(object): @type instance: L{objects.Instance} @param instance: an Instance object + @type hvp: dict or None + @param hvp: a dictionary with overridden hypervisor parameters + @type bep: dict or None + @param bep: a dictionary with overridden backend parameters @rtype: dict @return: the instance dict, with the hvparams filled with the cluster defaults @@ -245,16 +287,27 @@ class RpcRunner(object): idict = instance.ToDict() cluster = self._cfg.GetClusterInfo() idict["hvparams"] = cluster.FillHV(instance) + if hvp is not None: + idict["hvparams"].update(hvp) idict["beparams"] = cluster.FillBE(instance) + if bep is not None: + idict["beparams"].update(bep) + for nic in idict["nics"]: + nic['nicparams'] = objects.FillDict( + cluster.nicparams[constants.PP_DEFAULT], + nic['nicparams']) return idict - def _ConnectList(self, client, node_list): + def _ConnectList(self, client, node_list, call): """Helper for computing node addresses. - @type client: L{Client} + @type client: L{ganeti.rpc.Client} @param client: a C{Client} instance @type node_list: list @param node_list: the node list we should connect + @type call: string + @param call: the name of the remote procedure call, for filling in + correctly any eventual offline nodes' results """ all_nodes = self._cfg.GetAllNodesInfo() @@ -264,7 +317,7 @@ class RpcRunner(object): for node in node_list: if node in all_nodes: if all_nodes[node].offline: - skip_dict[node] = RpcResult(node=node, offline=True) + skip_dict[node] = RpcResult(node=node, offline=True, call=call) continue val = all_nodes[node].primary_ip else: @@ -275,19 +328,22 @@ class RpcRunner(object): client.ConnectList(name_list, address_list=addr_list) return skip_dict - def _ConnectNode(self, client, node): + def _ConnectNode(self, client, node, call): """Helper for computing one node's address. - @type client: L{Client} + @type client: L{ganeti.rpc.Client} @param client: a C{Client} instance @type node: str @param node: the node we should connect + @type call: string + @param call: the name of the remote procedure call, for filling in + correctly any eventual offline nodes' results """ node_info = self._cfg.GetNodeInfo(node) if node_info is not None: if node_info.offline: - return RpcResult(node=node, offline=True) + return RpcResult(node=node, offline=True, call=call) addr = node_info.primary_ip else: addr = None @@ -299,7 +355,7 @@ class RpcRunner(object): """ body = serializer.DumpJson(args, indent=False) c = Client(procedure, body, self.port) - skip_dict = self._ConnectList(c, node_list) + skip_dict = self._ConnectList(c, node_list, procedure) skip_dict.update(c.GetResults()) return skip_dict @@ -320,7 +376,7 @@ class RpcRunner(object): """ body = serializer.DumpJson(args, indent=False) c = Client(procedure, body, self.port) - result = self._ConnectNode(c, node) + result = self._ConnectNode(c, node, procedure) if result is None: # we did connect, node is not offline result = c.GetResults()[node] @@ -360,13 +416,13 @@ class RpcRunner(object): # Begin RPC calls # - def call_volume_list(self, node_list, vg_name): + def call_lv_list(self, node_list, vg_name): """Gets the logical volumes present in a given volume group. This is a multi-node call. """ - return self._MultiNodeCall(node_list, "volume_list", [vg_name]) + return self._MultiNodeCall(node_list, "lv_list", [vg_name]) def call_vg_list(self, node_list): """Gets the volume group list. @@ -388,14 +444,14 @@ class RpcRunner(object): """ return self._SingleNodeCall(node, "bridges_exist", [bridges_list]) - def call_instance_start(self, node, instance, extra_args): + def call_instance_start(self, node, instance, hvp, bep): """Starts an instance. This is a single-node call. """ - return self._SingleNodeCall(node, "instance_start", - [self._InstDict(instance), extra_args]) + idict = self._InstDict(instance, hvp=hvp, bep=bep) + return self._SingleNodeCall(node, "instance_start", [idict]) def call_instance_shutdown(self, node, instance): """Stops an instance. @@ -406,6 +462,59 @@ class RpcRunner(object): return self._SingleNodeCall(node, "instance_shutdown", [self._InstDict(instance)]) + def call_migration_info(self, node, instance): + """Gather the information necessary to prepare an instance migration. + + This is a single-node call. + + @type node: string + @param node: the node on which the instance is currently running + @type instance: C{objects.Instance} + @param instance: the instance definition + + """ + return self._SingleNodeCall(node, "migration_info", + [self._InstDict(instance)]) + + def call_accept_instance(self, node, instance, info, target): + """Prepare a node to accept an instance. + + This is a single-node call. + + @type node: string + @param node: the target node for the migration + @type instance: C{objects.Instance} + @param instance: the instance definition + @type info: opaque/hypervisor specific (string/data) + @param info: result for the call_migration_info call + @type target: string + @param target: target hostname (usually ip address) (on the node itself) + + """ + return self._SingleNodeCall(node, "accept_instance", + [self._InstDict(instance), info, target]) + + def call_finalize_migration(self, node, instance, info, success): + """Finalize any target-node migration specific operation. + + This is called both in case of a successful migration and in case of error + (in which case it should abort the migration). + + This is a single-node call. + + @type node: string + @param node: the target node for the migration + @type instance: C{objects.Instance} + @param instance: the instance definition + @type info: opaque/hypervisor specific (string/data) + @param info: result for the call_migration_info call + @type success: boolean + @param success: whether the migration was a success or a failure + + """ + return self._SingleNodeCall(node, "finalize_migration", + [self._InstDict(instance), info, success]) + def call_instance_migrate(self, node, instance, target, live): """Migrate an instance. @@ -425,24 +534,23 @@ class RpcRunner(object): return self._SingleNodeCall(node, "instance_migrate", [self._InstDict(instance), target, live]) - def call_instance_reboot(self, node, instance, reboot_type, extra_args): + def call_instance_reboot(self, node, instance, reboot_type): """Reboots an instance. This is a single-node call. """ return self._SingleNodeCall(node, "instance_reboot", - [self._InstDict(instance), reboot_type, - extra_args]) + [self._InstDict(instance), reboot_type]) - def call_instance_os_add(self, node, inst): + def call_instance_os_add(self, node, inst, reinstall): """Installs an OS on the given instance. This is a single-node call. """ return self._SingleNodeCall(node, "instance_os_add", - [self._InstDict(inst)]) + [self._InstDict(inst), reinstall]) def call_instance_run_rename(self, node, inst, old_name): """Run the OS rename script for an instance. @@ -546,21 +654,8 @@ class RpcRunner(object): memory information """ - retux = self._MultiNodeCall(node_list, "node_info", - [vg_name, hypervisor_type]) - - for result in retux.itervalues(): - if result.failed or not isinstance(result.data, dict): - result.data = {} - - utils.CheckDict(result.data, { - 'memory_total' : '-', - 'memory_dom0' : '-', - 'memory_free' : '-', - 'vg_size' : 'node_unreachable', - 'vg_free' : '-', - }, "call_node_info") - return retux + return self._MultiNodeCall(node_list, "node_info", + [vg_name, hypervisor_type]) def call_node_add(self, node, dsa, dsapub, rsa, rsapub, ssh, sshpub): """Add a node to the cluster. @@ -581,14 +676,14 @@ class RpcRunner(object): [checkdict, cluster_name]) @classmethod - def call_node_start_master(cls, node, start_daemons): + def call_node_start_master(cls, node, start_daemons, no_voting): """Tells a node to activate itself as a master. This is a single-node call. """ return cls._StaticSingleNodeCall(node, "node_start_master", - [start_daemons]) + [start_daemons, no_voting]) @classmethod def call_node_stop_master(cls, node, stop_daemons): @@ -706,6 +801,35 @@ class RpcRunner(object): params = [instance_name, [cf.ToDict() for cf in disks]] return self._SingleNodeCall(node, "blockdev_close", params) + def call_drbd_disconnect_net(self, node_list, nodes_ip, disks): + """Disconnects the network of the given drbd devices. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "drbd_disconnect_net", + [nodes_ip, [cf.ToDict() for cf in disks]]) + + def call_drbd_attach_net(self, node_list, nodes_ip, + disks, instance_name, multimaster): + """Disconnects the given drbd devices. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "drbd_attach_net", + [nodes_ip, [cf.ToDict() for cf in disks], + instance_name, multimaster]) + + def call_drbd_wait_sync(self, node_list, nodes_ip, disks): + """Waits for the synchronization of drbd devices is complete. + + This is a multi-node call. + + """ + return self._MultiNodeCall(node_list, "drbd_wait_sync", + [nodes_ip, [cf.ToDict() for cf in disks]]) + @classmethod def call_upload_file(cls, node_list, file_name, address_list=None): """Upload a file. @@ -747,13 +871,7 @@ class RpcRunner(object): This is a multi-node call. """ - result = self._MultiNodeCall(node_list, "os_diagnose", []) - - for node_result in result.values(): - if not node_result.failed and node_result.data: - node_result.data = [objects.OS.FromDict(oss) - for oss in node_result.data] - return result + return self._MultiNodeCall(node_list, "os_diagnose", []) def call_os_get(self, node, name): """Returns an OS definition. @@ -829,7 +947,10 @@ class RpcRunner(object): """ flat_disks = [] for disk in snap_disks: - flat_disks.append(disk.ToDict()) + if isinstance(disk, bool): + flat_disks.append(disk) + else: + flat_disks.append(disk.ToDict()) return self._SingleNodeCall(node, "finalize_export", [self._InstDict(instance), flat_disks]) @@ -840,10 +961,7 @@ class RpcRunner(object): This is a single-node call. """ - result = self._SingleNodeCall(node, "export_info", [path]) - if not result.failed and result.data: - result.data = objects.SerializableConfigParser.Loads(str(result.data)) - return result + return self._SingleNodeCall(node, "export_info", [path]) def call_instance_os_import(self, node, inst, src_node, src_images, cluster_name): @@ -900,6 +1018,16 @@ class RpcRunner(object): """ return self._SingleNodeCall(node, "node_demote_from_mc", []) + + def call_node_powercycle(self, node, hypervisor): + """Tries to powercycle a node. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "node_powercycle", [hypervisor]) + + def call_test_delay(self, node_list, duration): """Sleep for a fixed time on given node(s). @@ -995,6 +1123,6 @@ class RpcRunner(object): """ cluster = self._cfg.GetClusterInfo() - hv_full = cluster.FillDict(cluster.hvparams.get(hvname, {}), hvparams) + hv_full = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams) return self._MultiNodeCall(node_list, "hypervisor_validate_params", [hvname, hv_full])