+ self.acquire = self._lock.acquire
+ self.release = self._lock.release
+
+ # Initialize
+ self._queue_lock = jstore.InitAndVerifyQueue(must_lock=True)
+
+ # Read serial file
+ self._last_serial = jstore.ReadSerial()
+ assert self._last_serial is not None, ("Serial file was modified between"
+ " check in jstore and here")
+
+ # Get initial list of nodes
+ self._nodes = dict((n.name, n.primary_ip)
+ for n in self.context.cfg.GetAllNodesInfo().values()
+ if n.master_candidate)
+
+ # Remove master node
+ try:
+ del self._nodes[self._my_hostname]
+ except KeyError:
+ pass
+
+ # TODO: Check consistency across nodes
+
+ # Setup worker pool
+ self._wpool = _JobQueueWorkerPool(self)
+ try:
+ # We need to lock here because WorkerPool.AddTask() may start a job while
+ # we're still doing our work.
+ self.acquire()
+ try:
+ logging.info("Inspecting job queue")
+
+ all_job_ids = self._GetJobIDsUnlocked()
+ jobs_count = len(all_job_ids)
+ lastinfo = time.time()
+ for idx, job_id in enumerate(all_job_ids):
+ # Give an update every 1000 jobs or 10 seconds
+ if (idx % 1000 == 0 or time.time() >= (lastinfo + 10.0) or
+ idx == (jobs_count - 1)):
+ logging.info("Job queue inspection: %d/%d (%0.1f %%)",
+ idx, jobs_count - 1, 100.0 * (idx + 1) / jobs_count)
+ lastinfo = time.time()
+
+ job = self._LoadJobUnlocked(job_id)
+
+ # a failure in loading the job can cause 'None' to be returned
+ if job is None:
+ continue
+
+ status = job.CalcStatus()
+
+ if status in (constants.JOB_STATUS_QUEUED, ):
+ self._wpool.AddTask(job)
+
+ elif status in (constants.JOB_STATUS_RUNNING,
+ constants.JOB_STATUS_WAITLOCK,
+ constants.JOB_STATUS_CANCELING):
+ logging.warning("Unfinished job %s found: %s", job.id, job)
+ try:
+ for op in job.ops:
+ op.status = constants.OP_STATUS_ERROR
+ op.result = "Unclean master daemon shutdown"
+ finally:
+ self.UpdateJobUnlocked(job)
+
+ logging.info("Job queue inspection finished")
+ finally:
+ self.release()
+ except:
+ self._wpool.TerminateWorkers()
+ raise
+
+ @utils.LockedMethod
+ @_RequireOpenQueue
+ def AddNode(self, node):
+ """Register a new node with the queue.
+
+ @type node: L{objects.Node}
+ @param node: the node object to be added
+
+ """
+ node_name = node.name
+ assert node_name != self._my_hostname
+
+ # Clean queue directory on added node
+ rpc.RpcRunner.call_jobqueue_purge(node_name)
+
+ if not node.master_candidate:
+ # remove if existing, ignoring errors
+ self._nodes.pop(node_name, None)
+ # and skip the replication of the job ids
+ return
+
+ # Upload the whole queue excluding archived jobs
+ files = [self._GetJobPath(job_id) for job_id in self._GetJobIDsUnlocked()]
+
+ # Upload current serial file
+ files.append(constants.JOB_QUEUE_SERIAL_FILE)
+
+ for file_name in files:
+ # Read file content
+ fd = open(file_name, "r")
+ try:
+ content = fd.read()
+ finally:
+ fd.close()
+
+ result = rpc.RpcRunner.call_jobqueue_update([node_name],
+ [node.primary_ip],
+ file_name, content)
+ if not result[node_name]:
+ logging.error("Failed to upload %s to %s", file_name, node_name)
+
+ self._nodes[node_name] = node.primary_ip
+
+ @utils.LockedMethod
+ @_RequireOpenQueue
+ def RemoveNode(self, node_name):
+ """Callback called when removing nodes from the cluster.
+
+ @type node_name: str
+ @param node_name: the name of the node to remove
+
+ """
+ try:
+ # The queue is removed by the "leave node" RPC call.
+ del self._nodes[node_name]
+ except KeyError:
+ pass
+
+ def _CheckRpcResult(self, result, nodes, failmsg):
+ """Verifies the status of an RPC call.
+
+ Since we aim to keep consistency should this node (the current
+ master) fail, we will log errors if our rpc fail, and especially
+ log the case when more than half of the nodes failes.
+
+ @param result: the data as returned from the rpc call
+ @type nodes: list
+ @param nodes: the list of nodes we made the call to
+ @type failmsg: str
+ @param failmsg: the identifier to be used for logging
+
+ """
+ failed = []
+ success = []
+
+ for node in nodes:
+ if result[node]:
+ success.append(node)
+ else:
+ failed.append(node)
+
+ if failed:
+ logging.error("%s failed on %s", failmsg, ", ".join(failed))
+
+ # +1 for the master node
+ if (len(success) + 1) < len(failed):
+ # TODO: Handle failing nodes
+ logging.error("More than half of the nodes failed")
+
+ def _GetNodeIp(self):
+ """Helper for returning the node name/ip list.
+
+ @rtype: (list, list)
+ @return: a tuple of two lists, the first one with the node
+ names and the second one with the node addresses
+
+ """
+ name_list = self._nodes.keys()
+ addr_list = [self._nodes[name] for name in name_list]
+ return name_list, addr_list
+
+ def _WriteAndReplicateFileUnlocked(self, file_name, data):
+ """Writes a file locally and then replicates it to all nodes.
+
+ This function will replace the contents of a file on the local
+ node and then replicate it to all the other nodes we have.
+
+ @type file_name: str
+ @param file_name: the path of the file to be replicated
+ @type data: str
+ @param data: the new contents of the file
+
+ """
+ utils.WriteFile(file_name, data=data)
+
+ names, addrs = self._GetNodeIp()
+ result = rpc.RpcRunner.call_jobqueue_update(names, addrs, file_name, data)
+ self._CheckRpcResult(result, self._nodes,
+ "Updating %s" % file_name)
+
+ def _RenameFilesUnlocked(self, rename):
+ """Renames a file locally and then replicate the change.