X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/fad50141719e1f8e0525265e62ceb09e0ce5efb4..7b5c4a693b48f52db43f835f6740201c5e23a251:/lib/luxi.py diff --git a/lib/luxi.py b/lib/luxi.py index f99a435..669c3dd 100644 --- a/lib/luxi.py +++ b/lib/luxi.py @@ -21,57 +21,69 @@ """Module for the unix socket protocol -This module implements the local unix socket protocl. You only need +This module implements the local unix socket protocol. You only need this module and the opcodes module in the client program in order to communicate with the master. -The module is also be used by the master daemon. +The module is also used by the master daemon. """ import socket import collections -import simplejson import time import errno +import logging -from ganeti import opcodes from ganeti import serializer from ganeti import constants - - -KEY_REQUEST = 'request' -KEY_DATA = 'data' -REQ_SUBMIT = 'submit' -REQ_ABORT = 'abort' -REQ_QUERY = 'query' +from ganeti import errors +from ganeti import utils + + +KEY_METHOD = "method" +KEY_ARGS = "args" +KEY_SUCCESS = "success" +KEY_RESULT = "result" + +REQ_SUBMIT_JOB = "SubmitJob" +REQ_SUBMIT_MANY_JOBS = "SubmitManyJobs" +REQ_WAIT_FOR_JOB_CHANGE = "WaitForJobChange" +REQ_CANCEL_JOB = "CancelJob" +REQ_ARCHIVE_JOB = "ArchiveJob" +REQ_AUTOARCHIVE_JOBS = "AutoArchiveJobs" +REQ_QUERY_JOBS = "QueryJobs" +REQ_QUERY_INSTANCES = "QueryInstances" +REQ_QUERY_NODES = "QueryNodes" +REQ_QUERY_EXPORTS = "QueryExports" +REQ_QUERY_CONFIG_VALUES = "QueryConfigValues" +REQ_QUERY_CLUSTER_INFO = "QueryClusterInfo" +REQ_QUERY_TAGS = "QueryTags" +REQ_QUERY_LOCKS = "QueryLocks" +REQ_QUEUE_SET_DRAIN_FLAG = "SetDrainFlag" +REQ_SET_WATCHER_PAUSE = "SetWatcherPause" DEF_CTMO = 10 DEF_RWTO = 60 +# WaitForJobChange timeout +WFJC_TIMEOUT = (DEF_RWTO - 1) / 2 + -class ProtocolError(Exception): - """Denotes an error in the server communication""" +class ProtocolError(errors.GenericError): + """Denotes an error in the LUXI protocol.""" class ConnectionClosedError(ProtocolError): - """Connection closed error""" + """Connection closed error.""" class TimeoutError(ProtocolError): - """Operation timeout error""" - - -class EncodingError(ProtocolError): - """Encoding failure on the sending side""" - - -class DecodingError(ProtocolError): - """Decoding failure on the receiving side""" + """Operation timeout error.""" class RequestError(ProtocolError): - """Error on request + """Error on request. This signifies an error in the request format or request handling, but not (e.g.) an error in starting up an instance. @@ -82,8 +94,9 @@ class RequestError(ProtocolError): """ + class NoMasterError(ProtocolError): - """The master cannot be reached + """The master cannot be reached. This means that the master daemon is not running or the socket has been removed. @@ -91,22 +104,12 @@ class NoMasterError(ProtocolError): """ -def SerializeJob(job): - """Convert a job description to a string format. +class PermissionError(ProtocolError): + """Permission denied while connecting to the master socket. - """ - return simplejson.dumps(job.__getstate__()) + This means the user doesn't have the proper rights. - -def UnserializeJob(data): - """Load a job from a string format""" - try: - new_data = simplejson.loads(data) - except Exception, err: - raise DecodingError("Error while unserializing: %s" % str(err)) - job = opcodes.Job() - job.__setstate__(new_data) - return job + """ class Transport: @@ -121,15 +124,12 @@ class Transport: """ - def __init__(self, address, timeouts=None, eom=None): + def __init__(self, address, timeouts=None): """Constructor for the Client class. Arguments: - address: a valid address the the used transport class - timeout: a list of timeouts, to be used on connect and read/write - - eom: an identifier to be used as end-of-message which the - upper-layer will guarantee that this identifier will not appear - in any message There are two timeouts used since we might want to wait for a long time for a response, but the connect timeout should be lower. @@ -152,22 +152,16 @@ class Transport: self._buffer = "" self._msgs = collections.deque() - if eom is None: - self.eom = '\3' - else: - self.eom = eom - try: self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - self.socket.settimeout(self._ctimeout) + + # Try to connect try: - self.socket.connect(address) - except socket.timeout, err: - raise TimeoutError("Connect timed out: %s" % str(err)) - except socket.error, err: - if err.args[0] == errno.ENOENT: - raise NoMasterError((address,)) - raise + utils.Retry(self._Connect, 1.0, self._ctimeout, + args=(self.socket, address, self._ctimeout)) + except utils.RetryTimeout: + raise TimeoutError("Connect timed out") + self.socket.settimeout(self._rwtimeout) except (socket.error, NoMasterError): if self.socket is not None: @@ -175,6 +169,24 @@ class Transport: self.socket = None raise + @staticmethod + def _Connect(sock, address, timeout): + sock.settimeout(timeout) + try: + sock.connect(address) + except socket.timeout, err: + raise TimeoutError("Connect timed out: %s" % str(err)) + except socket.error, err: + error_code = err.args[0] + if error_code in (errno.ENOENT, errno.ECONNREFUSED): + raise NoMasterError(address) + elif error_code in (errno.EPERM, errno.EACCES): + raise PermissionError(address) + elif error_code == errno.EAGAIN: + # Server's socket backlog is full at the moment + raise utils.RetryAgain() + raise + def _CheckSocket(self): """Make sure we are connected. @@ -188,16 +200,18 @@ class Transport: This just sends a message and doesn't wait for the response. """ - if self.eom in msg: - raise EncodingError("Message terminator found in payload") + if constants.LUXI_EOM in msg: + raise ProtocolError("Message terminator found in payload") + self._CheckSocket() try: - self.socket.sendall(msg + self.eom) + # TODO: sendall is not guaranteed to send everything + self.socket.sendall(msg + constants.LUXI_EOM) except socket.timeout, err: raise TimeoutError("Sending timeout: %s" % str(err)) def Recv(self): - """Try to receive a messae from the socket. + """Try to receive a message from the socket. In case we already have messages queued, we just return from the queue. Otherwise, we try to read data with a _rwtimeout network @@ -210,13 +224,19 @@ class Transport: while not self._msgs: if time.time() > etime: raise TimeoutError("Extended receive timeout") - try: - data = self.socket.recv(4096) - except socket.timeout, err: - raise TimeoutError("Receive timeout: %s" % str(err)) + while True: + try: + data = self.socket.recv(4096) + except socket.error, err: + if err.args and err.args[0] == errno.EAGAIN: + continue + raise + except socket.timeout, err: + raise TimeoutError("Receive timeout: %s" % str(err)) + break if not data: raise ConnectionClosedError("Connection closed while reading") - new_msgs = (self._buffer + data).split(self.eom) + new_msgs = (self._buffer + data).split(constants.LUXI_EOM) self._buffer = new_msgs.pop() self._msgs.extend(new_msgs) return self._msgs.popleft() @@ -237,6 +257,99 @@ class Transport: self.socket = None +def ParseRequest(msg): + """Parses a LUXI request message. + + """ + try: + request = serializer.LoadJson(msg) + except ValueError, err: + raise ProtocolError("Invalid LUXI request (parsing error): %s" % err) + + logging.debug("LUXI request: %s", request) + + if not isinstance(request, dict): + logging.error("LUXI request not a dict: %r", msg) + raise ProtocolError("Invalid LUXI request (not a dict)") + + method = request.get(KEY_METHOD, None) # pylint: disable-msg=E1103 + args = request.get(KEY_ARGS, None) # pylint: disable-msg=E1103 + + if method is None or args is None: + logging.error("LUXI request missing method or arguments: %r", msg) + raise ProtocolError(("Invalid LUXI request (no method or arguments" + " in request): %r") % msg) + + return (method, args) + + +def ParseResponse(msg): + """Parses a LUXI response message. + + """ + # Parse the result + try: + data = serializer.LoadJson(msg) + except Exception, err: + raise ProtocolError("Error while deserializing response: %s" % str(err)) + + # Validate response + if not (isinstance(data, dict) and + KEY_SUCCESS in data and + KEY_RESULT in data): + raise ProtocolError("Invalid response from server: %r" % data) + + return (data[KEY_SUCCESS], data[KEY_RESULT]) + + +def FormatResponse(success, result): + """Formats a LUXI response message. + + """ + response = { + KEY_SUCCESS: success, + KEY_RESULT: result, + } + + logging.debug("LUXI response: %s", response) + + return serializer.DumpJson(response) + + +def FormatRequest(method, args): + """Formats a LUXI request message. + + """ + # Build request + request = { + KEY_METHOD: method, + KEY_ARGS: args, + } + + # Serialize the request + return serializer.DumpJson(request, indent=False) + + +def CallLuxiMethod(transport_cb, method, args): + """Send a LUXI request via a transport and return the response. + + """ + assert callable(transport_cb) + + request_msg = FormatRequest(method, args) + + # Send request and wait for response + response_msg = transport_cb(request_msg) + + (success, result) = ParseResponse(response_msg) + + if success: + return result + + errors.MaybeRaise(result) + raise RequestError(result) + + class Client(object): """High-level client implementation. @@ -259,41 +372,125 @@ class Client(object): """ if address is None: address = constants.MASTER_SOCKET - self.transport = transport(address, timeouts=timeouts) + self.address = address + self.timeouts = timeouts + self.transport_class = transport + self.transport = None + self._InitTransport() - def SendRequest(self, request, data): - """Send a generic request and return the response. + def _InitTransport(self): + """(Re)initialize the transport if needed. """ - msg = {KEY_REQUEST: request, KEY_DATA: data} - result = self.transport.Call(serializer.DumpJson(msg, indent=False)) + if self.transport is None: + self.transport = self.transport_class(self.address, + timeouts=self.timeouts) + + def _CloseTransport(self): + """Close the transport, ignoring errors. + + """ + if self.transport is None: + return + try: + old_transp = self.transport + self.transport = None + old_transp.Close() + except Exception: # pylint: disable-msg=W0703 + pass + + def _SendMethodCall(self, data): + # Send request and wait for response try: - data = serializer.LoadJson(result) - except Exception, err: - raise ProtocolError("Error while deserializing response: %s" % str(err)) - if (not isinstance(data, dict) or - 'success' not in data or - 'result' not in data): - raise DecodingError("Invalid response from server: %s" % str(data)) - return data - - def SubmitJob(self, job): - """Submit a job""" - result = self.SendRequest(REQ_SUBMIT, SerializeJob(job)) - if not result['success']: - raise RequestError(result['result']) - return result['result'] - - def Query(self, data): - """Make a query""" - result = self.SendRequest(REQ_QUERY, data) - if not result['success']: - raise RequestError(result[result]) - result = result['result'] - if data["object"] == "jobs": - # custom job processing of query values - for row in result: - for idx, field in enumerate(data["fields"]): - if field == "op_list": - row[idx] = [opcodes.OpCode.LoadOpCode(i) for i in row[idx]] + self._InitTransport() + return self.transport.Call(data) + except Exception: + self._CloseTransport() + raise + + def CallMethod(self, method, args): + """Send a generic request and return the response. + + """ + return CallLuxiMethod(self._SendMethodCall, method, args) + + def SetQueueDrainFlag(self, drain_flag): + return self.CallMethod(REQ_QUEUE_SET_DRAIN_FLAG, drain_flag) + + def SetWatcherPause(self, until): + return self.CallMethod(REQ_SET_WATCHER_PAUSE, [until]) + + def SubmitJob(self, ops): + ops_state = map(lambda op: op.__getstate__(), ops) + return self.CallMethod(REQ_SUBMIT_JOB, ops_state) + + def SubmitManyJobs(self, jobs): + jobs_state = [] + for ops in jobs: + jobs_state.append([op.__getstate__() for op in ops]) + return self.CallMethod(REQ_SUBMIT_MANY_JOBS, jobs_state) + + def CancelJob(self, job_id): + return self.CallMethod(REQ_CANCEL_JOB, job_id) + + def ArchiveJob(self, job_id): + return self.CallMethod(REQ_ARCHIVE_JOB, job_id) + + def AutoArchiveJobs(self, age): + timeout = (DEF_RWTO - 1) / 2 + return self.CallMethod(REQ_AUTOARCHIVE_JOBS, (age, timeout)) + + def WaitForJobChangeOnce(self, job_id, fields, + prev_job_info, prev_log_serial, + timeout=WFJC_TIMEOUT): + """Waits for changes on a job. + + @param job_id: Job ID + @type fields: list + @param fields: List of field names to be observed + @type prev_job_info: None or list + @param prev_job_info: Previously received job information + @type prev_log_serial: None or int/long + @param prev_log_serial: Highest log serial number previously received + @type timeout: int/float + @param timeout: Timeout in seconds (values larger than L{WFJC_TIMEOUT} will + be capped to that value) + + """ + assert timeout >= 0, "Timeout can not be negative" + return self.CallMethod(REQ_WAIT_FOR_JOB_CHANGE, + (job_id, fields, prev_job_info, + prev_log_serial, + min(WFJC_TIMEOUT, timeout))) + + def WaitForJobChange(self, job_id, fields, prev_job_info, prev_log_serial): + while True: + result = self.WaitForJobChangeOnce(job_id, fields, + prev_job_info, prev_log_serial) + if result != constants.JOB_NOTCHANGED: + break return result + + def QueryJobs(self, job_ids, fields): + return self.CallMethod(REQ_QUERY_JOBS, (job_ids, fields)) + + def QueryInstances(self, names, fields, use_locking): + return self.CallMethod(REQ_QUERY_INSTANCES, (names, fields, use_locking)) + + def QueryNodes(self, names, fields, use_locking): + return self.CallMethod(REQ_QUERY_NODES, (names, fields, use_locking)) + + def QueryExports(self, nodes, use_locking): + return self.CallMethod(REQ_QUERY_EXPORTS, (nodes, use_locking)) + + def QueryClusterInfo(self): + return self.CallMethod(REQ_QUERY_CLUSTER_INFO, ()) + + def QueryConfigValues(self, fields): + return self.CallMethod(REQ_QUERY_CONFIG_VALUES, fields) + + def QueryTags(self, kind, name): + return self.CallMethod(REQ_QUERY_TAGS, (kind, name)) + + def QueryLocks(self, fields, sync): + return self.CallMethod(REQ_QUERY_LOCKS, (fields, sync))