+ """
+ assert self.queue, "Queue attribute is missing"
+ assert self.opcode, "Opcode attribute is missing"
+
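+ # All opcode status changes are done under the queue lock so that
+ # clients polling the job see a consistent state; RunTask set the
+ # status to WAITLOCK before calling us through proc.ExecOpCode.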
+ self.queue.acquire()
+ try:
+ assert self.opcode.status in (constants.OP_STATUS_WAITLOCK,
+ constants.OP_STATUS_CANCELING)
+
+ # Cancel here if we were asked to
+ if self.opcode.status == constants.OP_STATUS_CANCELING:
+ raise CancelJob()
+
+ self.opcode.status = constants.OP_STATUS_RUNNING
+ finally:
+ self.queue.release()
+
+ def RunTask(self, job):
+ """Job executor.
+
+ This function processes a job. It is closely tied to the _QueuedJob and
+ _QueuedOpCode classes.
+
+ @type job: L{_QueuedJob}
+ @param job: the job to be processed
+
+ """
+ logging.info("Worker %s processing job %s",
+ self.worker_id, job.id)
+ proc = mcpu.Processor(self.pool.queue.context)
+ self.queue = queue = job.queue
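+ # Each opcode moves from QUEUED through WAITLOCK and RUNNING to either
+ # SUCCESS or ERROR; a CancelJob exception aborts the whole job and is
+ # handled by the outer except clause below.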
+ try:
+ try:
+ count = len(job.ops)
+ for idx, op in enumerate(job.ops):
+ op_summary = op.input.Summary()
+ try:
+ logging.info("Op %s/%s: Starting opcode %s", idx + 1, count,
+ op_summary)
+
+ queue.acquire()
+ try:
+ if op.status == constants.OP_STATUS_CANCELED:
+ raise CancelJob()
+ assert op.status == constants.OP_STATUS_QUEUED
+ job.run_op_index = idx
+ op.status = constants.OP_STATUS_WAITLOCK
+ op.result = None
+ op.start_timestamp = TimeStampNow()
+ if idx == 0: # first opcode
+ job.start_timestamp = op.start_timestamp
+ queue.UpdateJobUnlocked(job)
+
+ input_opcode = op.input
+ finally:
+ queue.release()
+
+ def _Log(*args):
+ """Append a log entry.
+
+ """
+ assert 1 <= len(args) <= 2
+
+ if len(args) == 1:
+ log_type = constants.ELOG_MESSAGE
+ log_msg = args[0]
+ else:
+ log_type, log_msg = args
+
+ # The time is split to make serialization easier and not lose
+ # precision.
+ timestamp = utils.SplitTime(time.time())
+
+ queue.acquire()
+ try:
+ job.log_serial += 1
+ op.log.append((job.log_serial, timestamp, log_type, log_msg))
+
+ job.change.notifyAll()
+ finally:
+ queue.release()
+
+ # Make sure not to hold lock while _Log is called
+ self.opcode = op
+ result = proc.ExecOpCode(input_opcode, _Log, self._NotifyStart)
+
+ queue.acquire()
+ try:
+ op.status = constants.OP_STATUS_SUCCESS
+ op.result = result
+ op.end_timestamp = TimeStampNow()
+ queue.UpdateJobUnlocked(job)
+ finally:
+ queue.release()
+
+ logging.info("Op %s/%s: Successfully finished opcode %s",
+ idx + 1, count, op_summary)
+ except CancelJob:
+ # Will be handled further up
+ raise
+ except Exception, err:
+ queue.acquire()
+ try:
+ try:
+ op.status = constants.OP_STATUS_ERROR
+ op.result = str(err)
+ op.end_timestamp = TimeStampNow()
+ logging.info("Op %s/%s: Error in opcode %s: %s",
+ idx + 1, count, op_summary, err)
+ finally:
+ queue.UpdateJobUnlocked(job)
+ finally:
+ queue.release()
+ raise
+
+ except CancelJob:
+ queue.acquire()
+ try:
+ queue.CancelJobUnlocked(job)
+ finally:
+ queue.release()
+ except errors.GenericError:
+ logging.exception("Ganeti exception")
+ except:
+ logging.exception("Unhandled exception")
+ finally:
+ queue.acquire()
+ try:
+ try:
+ job.run_op_index = -1
+ job.end_timestamp = TimeStampNow()
+ queue.UpdateJobUnlocked(job)
+ finally:
+ job_id = job.id
+ status = job.CalcStatus()
+ finally:
+ queue.release()
+ logging.info("Worker %s finished job %s, status = %s",
+ self.worker_id, job_id, status)
+
+
+class _JobQueueWorkerPool(workerpool.WorkerPool):
+ """Simple class implementing a job-processing workerpool.
+
+ """
+ def __init__(self, queue):
+ super(_JobQueueWorkerPool, self).__init__(JOBQUEUE_THREADS,
+ _JobQueueWorker)
+ self.queue = queue
+
+
+class JobQueue(object):
+ """Quue used to manaage the jobs.
+
+ @cvar _RE_JOB_FILE: regex matching the valid job file names
+
+ """
+ _RE_JOB_FILE = re.compile(r"^job-(%s)$" % constants.JOB_ID_TEMPLATE)
+
+ def _RequireOpenQueue(fn):
+ """Decorator for "public" functions.
+
+ This decorator should be used for all 'public' functions, that is,
+ functions usually called from other classes.
+
+ @warning: Use this decorator only after utils.LockedMethod!
+
+ Example::
+ @utils.LockedMethod
+ @_RequireOpenQueue
+ def Example(self):
+ pass
+
+ """
+ def wrapper(self, *args, **kwargs):
+ assert self._queue_lock is not None, "Queue should be open"
+ return fn(self, *args, **kwargs)
+ return wrapper
+
+ def __init__(self, context):
+ """Constructor for JobQueue.
+
+ The constructor will initialize the job queue object and then
+ start loading the current jobs from disk, either for starting them
+ (if they were queued) or for aborting them (if they were already
+ running).
+
+ @type context: GanetiContext
+ @param context: the context object for access to the configuration
+ data and other ganeti objects
+
+ """
+ self.context = context
+ self._memcache = weakref.WeakValueDictionary()
+ self._my_hostname = utils.HostInfo().name
+
+ # Locking
+ self._lock = threading.Lock()
+ self.acquire = self._lock.acquire
+ self.release = self._lock.release
+
+ # Initialize
+ self._queue_lock = jstore.InitAndVerifyQueue(must_lock=True)
+
+ # Read serial file
+ self._last_serial = jstore.ReadSerial()
+ assert self._last_serial is not None, ("Serial file was modified between"
+ " check in jstore and here")
+
+ # Get initial list of nodes
+ self._nodes = dict((n.name, n.primary_ip)
+ for n in self.context.cfg.GetAllNodesInfo().values()
+ if n.master_candidate)
+
+ # Remove master node
+ try:
+ del self._nodes[self._my_hostname]
+ except KeyError:
+ pass
+
+ # TODO: Check consistency across nodes
+
+ # Setup worker pool
+ self._wpool = _JobQueueWorkerPool(self)
+ try:
+ # We need to lock here because WorkerPool.AddTask() may start a job while
+ # we're still doing our work.
+ self.acquire()
+ try:
+ logging.info("Inspecting job queue")
+
+ all_job_ids = self._GetJobIDsUnlocked()
+ jobs_count = len(all_job_ids)
+ lastinfo = time.time()
+ for idx, job_id in enumerate(all_job_ids):
+ # Give an update every 1000 jobs or 10 seconds
+ if (idx % 1000 == 0 or time.time() >= (lastinfo + 10.0) or
+ idx == (jobs_count - 1)):
+ logging.info("Job queue inspection: %d/%d (%0.1f %%)",
+ idx + 1, jobs_count, 100.0 * (idx + 1) / jobs_count)
+ lastinfo = time.time()
+
+ job = self._LoadJobUnlocked(job_id)
+
+ # a failure in loading the job can cause 'None' to be returned
+ if job is None:
+ continue
+
+ status = job.CalcStatus()
+
+ if status in (constants.JOB_STATUS_QUEUED, ):
+ self._wpool.AddTask(job)
+
+ elif status in (constants.JOB_STATUS_RUNNING,
+ constants.JOB_STATUS_WAITLOCK,
+ constants.JOB_STATUS_CANCELING):
+ logging.warning("Unfinished job %s found: %s", job.id, job)
+ try:
+ for op in job.ops:
+ op.status = constants.OP_STATUS_ERROR
+ op.result = "Unclean master daemon shutdown"
+ finally:
+ self.UpdateJobUnlocked(job)
+
+ logging.info("Job queue inspection finished")
+ finally:
+ self.release()
+ except:
+ self._wpool.TerminateWorkers()
+ raise
+
+ @utils.LockedMethod
+ @_RequireOpenQueue
+ def AddNode(self, node):
+ """Register a new node with the queue.
+
+ @type node: L{objects.Node}
+ @param node: the node object to be added
+
+ """
+ node_name = node.name
+ assert node_name != self._my_hostname
+
+ # Clean queue directory on added node
+ rpc.RpcRunner.call_jobqueue_purge(node_name)
+
+ if not node.master_candidate:
+ # remove if existing, ignoring errors
+ self._nodes.pop(node_name, None)
+ # and skip the replication of the job ids
+ return
+
+ # Upload the whole queue excluding archived jobs
+ files = [self._GetJobPath(job_id) for job_id in self._GetJobIDsUnlocked()]
+
+ # Upload current serial file
+ files.append(constants.JOB_QUEUE_SERIAL_FILE)
+
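+ # Replication is best-effort: a failed upload is only logged below
+ # and does not abort adding the node.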
+ for file_name in files:
+ # Read file content
+ fd = open(file_name, "r")
+ try:
+ content = fd.read()
+ finally:
+ fd.close()
+
+ result = rpc.RpcRunner.call_jobqueue_update([node_name],
+ [node.primary_ip],
+ file_name, content)
+ if not result[node_name]:
+ logging.error("Failed to upload %s to %s", file_name, node_name)
+
+ self._nodes[node_name] = node.primary_ip
+
+ @utils.LockedMethod
+ @_RequireOpenQueue
+ def RemoveNode(self, node_name):
+ """Callback called when removing nodes from the cluster.
+
+ @type node_name: str
+ @param node_name: the name of the node to remove
+
+ """
+ try:
+ # The queue is removed by the "leave node" RPC call.
+ del self._nodes[node_name]
+ except KeyError:
+ pass
+
+ def _CheckRpcResult(self, result, nodes, failmsg):
+ """Verifies the status of an RPC call.
+
+ Since we aim to keep consistency should this node (the current
+ master) fail, we log errors if our RPC calls fail, and especially
+ the case when more than half of the nodes fail.
+
+ @param result: the data as returned from the rpc call
+ @type nodes: list
+ @param nodes: the list of nodes we made the call to
+ @type failmsg: str
+ @param failmsg: the identifier to be used for logging
+
+ """
+ failed = []
+ success = []
+
+ for node in nodes:
+ if result[node]:
+ success.append(node)
+ else:
+ failed.append(node)
+
+ if failed:
+ logging.error("%s failed on %s", failmsg, ", ".join(failed))
+
+ # +1 for the master node
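+ # Illustrative: with five target nodes of which four failed, we have
+ # (1 + 1) < 4, so the error below is logged.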
+ if (len(success) + 1) < len(failed):
+ # TODO: Handle failing nodes
+ logging.error("More than half of the nodes failed")
+
+ def _GetNodeIp(self):
+ """Helper for returning the node name/ip list.
+
+ @rtype: (list, list)
+ @return: a tuple of two lists, the first one with the node
+ names and the second one with the node addresses
+
+ """
+ name_list = self._nodes.keys()
+ addr_list = [self._nodes[name] for name in name_list]
+ return name_list, addr_list
+
+ def _WriteAndReplicateFileUnlocked(self, file_name, data):
+ """Writes a file locally and then replicates it to all nodes.
+
+ This function will replace the contents of a file on the local
+ node and then replicate it to all the other nodes we have.
+
+ @type file_name: str
+ @param file_name: the path of the file to be replicated
+ @type data: str
+ @param data: the new contents of the file
+
+ """
+ utils.WriteFile(file_name, data=data)
+
+ names, addrs = self._GetNodeIp()
+ result = rpc.RpcRunner.call_jobqueue_update(names, addrs, file_name, data)
+ self._CheckRpcResult(result, self._nodes,
+ "Updating %s" % file_name)
+
+ def _RenameFilesUnlocked(self, rename):
+ """Renames a file locally and then replicate the change.
+
+ This function will rename a file in the local queue directory
+ and then replicate this rename to all the other nodes we have.
+
+ @type rename: list of (old, new)
+ @param rename: List containing tuples mapping old to new names
+
+ """
+ # Rename them locally
+ for old, new in rename:
+ utils.RenameFile(old, new, mkdir=True)
+
+ # ... and on all nodes
+ names, addrs = self._GetNodeIp()
+ result = rpc.RpcRunner.call_jobqueue_rename(names, addrs, rename)
+ self._CheckRpcResult(result, self._nodes, "Renaming files (%r)" % rename)
+
+ def _FormatJobID(self, job_id):
+ """Convert a job ID to string format.
+
+ Currently this just does C{str(job_id)} after performing some
+ checks, but if we want to change the job id format, this method
+ will abstract that change away from the callers.
+
+ @type job_id: int or long
+ @param job_id: the numeric job id
+ @rtype: str
+ @return: the formatted job id
+
+ """
+ if not isinstance(job_id, (int, long)):
+ raise errors.ProgrammerError("Job ID '%s' not numeric" % job_id)
+ if job_id < 0:
+ raise errors.ProgrammerError("Job ID %s is negative" % job_id)
+
+ return str(job_id)
+
+ @classmethod
+ def _GetArchiveDirectory(cls, job_id):
+ """Returns the archive directory for a job.
+
+ @type job_id: str
+ @param job_id: Job identifier
+ @rtype: str
+ @return: Directory name
+
+ """
+ return str(int(job_id) / JOBS_PER_ARCHIVE_DIRECTORY)
+
+ def _NewSerialUnlocked(self):
+ """Generates a new job identifier.
+
+ Job identifiers are unique during the lifetime of a cluster.
+
+ @rtype: str
+ @return: a string representing the job identifier.
+
+ """
+ # New number
+ serial = self._last_serial + 1
+
+ # Write to file
+ self._WriteAndReplicateFileUnlocked(constants.JOB_QUEUE_SERIAL_FILE,
+ "%s\n" % serial)
+
+ # Keep it only if we were able to write the file
+ self._last_serial = serial
+
+ return self._FormatJobID(serial)
+
+ @staticmethod
+ def _GetJobPath(job_id):
+ """Returns the job file for a given job id.
+
+ @type job_id: str
+ @param job_id: the job identifier
+ @rtype: str
+ @return: the path to the job file
+
+ """
+ return os.path.join(constants.QUEUE_DIR, "job-%s" % job_id)
+
+ @classmethod
+ def _GetArchivedJobPath(cls, job_id):
+ """Returns the archived job file for a give job id.
+
+ @type job_id: str
+ @param job_id: the job identifier
+ @rtype: str
+ @return: the path to the archived job file
+
+ """
+ path = "%s/job-%s" % (cls._GetArchiveDirectory(job_id), job_id)
+ return os.path.join(constants.JOB_QUEUE_ARCHIVE_DIR, path)
+
+ @classmethod
+ def _ExtractJobID(cls, name):
+ """Extract the job id from a filename.
+
+ @type name: str
+ @param name: the job filename
+ @rtype: job id or None
+ @return: the job id corresponding to the given filename,
+ or None if the filename does not represent a valid
+ job file
+
+ """
+ m = cls._RE_JOB_FILE.match(name)
+ if m:
+ return m.group(1)
+ else:
+ return None
+
+ def _GetJobIDsUnlocked(self, archived=False):
+ """Return all known job IDs.
+
+ If the parameter archived is True, archived job IDs will be
+ included. Currently this argument is unused.
+
+ The method only looks at disk because it's a requirement that all
+ jobs are present on disk (so in the _memcache we don't have any
+ extra IDs).
+
+ @rtype: list
+ @return: the list of job IDs
+
+ """
+ jlist = [self._ExtractJobID(name) for name in self._ListJobFiles()]
+ jlist = utils.NiceSort(jlist)
+ return jlist
+
+ def _ListJobFiles(self):
+ """Returns the list of current job files.
+
+ @rtype: list
+ @return: the list of job file names