lib/jqueue.py @ ce594241
#
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Module implementing the job queue handling."""

import os
import logging
import threading
import errno
import re
import time

from ganeti import constants
from ganeti import serializer
from ganeti import workerpool
from ganeti import opcodes
from ganeti import errors
from ganeti import mcpu
from ganeti import utils
from ganeti import rpc


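# Number of worker threads used by the job queue's worker pool.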
JOBQUEUE_THREADS = 5


class _QueuedOpCode(object):
  """Encapsulates an opcode object.

  Access is synchronized by the '_lock' attribute.

  The 'log' attribute holds the execution log and consists of tuples
  of the form (timestamp, level, message).

  """
  def __init__(self, op):
    self.__Setup(op, constants.OP_STATUS_QUEUED, None, [])

  def __Setup(self, input_, status, result, log):
    self._lock = threading.Lock()
    self.input = input_
    self.status = status
    self.result = result
    self.log = log

  @classmethod
  def Restore(cls, state):
    obj = object.__new__(cls)
    obj.__Setup(opcodes.OpCode.LoadOpCode(state["input"]),
                state["status"], state["result"], state["log"])
    return obj

  @utils.LockedMethod
  def Serialize(self):
    return {
      "input": self.input.__getstate__(),
      "status": self.status,
      "result": self.result,
      "log": self.log,
      }

  @utils.LockedMethod
  def GetInput(self):
    """Returns the original opcode.

    """
    return self.input

  @utils.LockedMethod
  def SetStatus(self, status, result):
    """Update the opcode status and result.

    """
    self.status = status
    self.result = result

  @utils.LockedMethod
  def GetStatus(self):
    """Get the opcode status.

    """
    return self.status

  @utils.LockedMethod
  def GetResult(self):
    """Get the opcode result.

    """
    return self.result

  @utils.LockedMethod
  def Log(self, *args):
    """Append a log entry.

    """
    assert len(args) < 2

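    # A single argument is logged as a plain ELOG_MESSAGE entry; two
    # arguments are interpreted as (log_type, message).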
    if len(args) == 1:
      log_type = constants.ELOG_MESSAGE
      log_msg = args[0]
    else:
      log_type, log_msg = args
    self.log.append((time.time(), log_type, log_msg))

  @utils.LockedMethod
  def RetrieveLog(self, start_at=0):
    """Retrieve (a part of) the execution log.

    """
    return self.log[start_at:]


class _QueuedJob(object):
  """In-memory job representation.

  This is what we use to track the user-submitted jobs.

  """
  def __init__(self, storage, job_id, ops):
    if not ops:
      # TODO
      raise Exception("No opcodes")

    self.__Setup(storage, job_id, [_QueuedOpCode(op) for op in ops], -1)

  def __Setup(self, storage, job_id, ops, run_op_index):
    self._lock = threading.Lock()
    self.storage = storage
    self.id = job_id
    self._ops = ops
    self.run_op_index = run_op_index

  @classmethod
  def Restore(cls, storage, state):
    obj = object.__new__(cls)
    op_list = [_QueuedOpCode.Restore(op_state) for op_state in state["ops"]]
    obj.__Setup(storage, state["id"], op_list, state["run_op_index"])
    return obj

  def Serialize(self):
    return {
      "id": self.id,
      "ops": [op.Serialize() for op in self._ops],
      "run_op_index": self.run_op_index,
      }

  def SetUnclean(self, msg):
    try:
      for op in self._ops:
        op.SetStatus(constants.OP_STATUS_ERROR, msg)
    finally:
      self.storage.UpdateJob(self)

  def GetStatus(self):
    status = constants.JOB_STATUS_QUEUED

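    # Derive the overall job status from the status of each opcode.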
    all_success = True
    for op in self._ops:
      op_status = op.GetStatus()
      if op_status == constants.OP_STATUS_SUCCESS:
        continue

      all_success = False

      if op_status == constants.OP_STATUS_QUEUED:
        pass
      elif op_status == constants.OP_STATUS_RUNNING:
        status = constants.JOB_STATUS_RUNNING
      elif op_status == constants.OP_STATUS_ERROR:
        status = constants.JOB_STATUS_ERROR
        # The whole job fails if one opcode failed
        break

    if all_success:
      status = constants.JOB_STATUS_SUCCESS

    return status

  @utils.LockedMethod
  def GetRunOpIndex(self):
    return self.run_op_index

  def Run(self, proc):
    """Job executor.

    This function processes this job in the context of the given
    processor instance.

    Args:
    - proc: Ganeti Processor to run the job with

    """
    try:
      count = len(self._ops)
      for idx, op in enumerate(self._ops):
        try:
          logging.debug("Op %s/%s: Starting %s", idx + 1, count, op)

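          # Remember which opcode is currently being executed so that
          # job queries can report progress.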
          self._lock.acquire()
          try:
            self.run_op_index = idx
          finally:
            self._lock.release()

          op.SetStatus(constants.OP_STATUS_RUNNING, None)
          self.storage.UpdateJob(self)

          result = proc.ExecOpCode(op.input, op.Log)

          op.SetStatus(constants.OP_STATUS_SUCCESS, result)
          self.storage.UpdateJob(self)
          logging.debug("Op %s/%s: Successfully finished %s",
                        idx + 1, count, op)
        except Exception, err:
          try:
            op.SetStatus(constants.OP_STATUS_ERROR, str(err))
            logging.debug("Op %s/%s: Error in %s", idx + 1, count, op)
          finally:
            self.storage.UpdateJob(self)
          raise

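    # Errors that escape the per-opcode handling are logged and
    # swallowed here so the worker thread itself does not die.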
    except errors.GenericError, err:
      logging.error("ganeti exception %s", err, exc_info=True)
    except Exception, err:
      logging.error("unhandled exception %s", err, exc_info=True)
    except:
      logging.error("unhandled unknown exception", exc_info=True)


class _JobQueueWorker(workerpool.BaseWorker):
  def RunTask(self, job):
    logging.debug("Worker %s processing job %s",
                  self.worker_id, job.id)
    # TODO: feedback function
    proc = mcpu.Processor(self.pool.context)
    try:
      job.Run(proc)
    finally:
      logging.debug("Worker %s finished job %s, status = %s",
                    self.worker_id, job.id, job.GetStatus())


class _JobQueueWorkerPool(workerpool.WorkerPool):
  def __init__(self, context):
    super(_JobQueueWorkerPool, self).__init__(JOBQUEUE_THREADS,
                                              _JobQueueWorker)
    self.context = context


class JobStorageBase(object):
  def __init__(self, id_prefix):
    self.id_prefix = id_prefix

    if id_prefix:
      prefix_pattern = re.escape("%s-" % id_prefix)
    else:
      prefix_pattern = ""

    # Apart from the prefix, all job IDs are numeric
    self._re_job_id = re.compile(r"^%s\d+$" % prefix_pattern)

  def OwnsJobId(self, job_id):
    return self._re_job_id.match(job_id)

  def FormatJobID(self, job_id):
    if not isinstance(job_id, (int, long)):
      raise errors.ProgrammerError("Job ID '%s' not numeric" % job_id)
    if job_id < 0:
      raise errors.ProgrammerError("Job ID %s is negative" % job_id)

    if self.id_prefix:
      prefix = "%s-" % self.id_prefix
    else:
      prefix = ""

    return "%s%010d" % (prefix, job_id)


class DiskJobStorage(JobStorageBase):
  _RE_JOB_FILE = re.compile(r"^job-(%s)$" % constants.JOB_ID_TEMPLATE)

  def __init__(self, id_prefix):
    JobStorageBase.__init__(self, id_prefix)

    self._lock = threading.Lock()
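    # Cache of loaded jobs, indexed by job ID.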
    self._memcache = {}
    self._my_hostname = utils.HostInfo().name

    # Make sure our directory exists
    try:
      os.mkdir(constants.QUEUE_DIR, 0700)
    except OSError, err:
      if err.errno not in (errno.EEXIST, ):
        raise

    # Get queue lock
    self.lock_fd = open(constants.JOB_QUEUE_LOCK_FILE, "w")
    try:
      utils.LockFile(self.lock_fd)
    except:
      self.lock_fd.close()
      raise

    # Read version
    try:
      version_fd = open(constants.JOB_QUEUE_VERSION_FILE, "r")
    except IOError, err:
      if err.errno not in (errno.ENOENT, ):
        raise

      # Setup a new queue
      self._InitQueueUnlocked()

      # Try to open again
      version_fd = open(constants.JOB_QUEUE_VERSION_FILE, "r")

    try:
      # Try to read version
      version = int(version_fd.read(128))

      # Verify version
      if version != constants.JOB_QUEUE_VERSION:
        raise errors.JobQueueError("Found version %s, expected %s",
                                   version, constants.JOB_QUEUE_VERSION)
    finally:
      version_fd.close()

    self._last_serial = self._ReadSerial()
    if self._last_serial is None:
      raise errors.ConfigurationError("Can't read/parse the job queue serial"
                                      " file")

  @staticmethod
  def _ReadSerial():
    """Try to read the job serial file.

    @rtype: None or int
    @return: If the serial can be read, then it is returned. Otherwise None
             is returned.

    """
    try:
      serial_fd = open(constants.JOB_QUEUE_SERIAL_FILE, "r")
      try:
        # Read last serial
        serial = int(serial_fd.read(1024).strip())
      finally:
        serial_fd.close()
    except (ValueError, EnvironmentError):
      serial = None

    return serial

  def Close(self):
    assert self.lock_fd, "Queue should be open"

    self.lock_fd.close()
    self.lock_fd = None

  def _InitQueueUnlocked(self):
    assert self.lock_fd, "Queue should be open"

    utils.WriteFile(constants.JOB_QUEUE_VERSION_FILE,
                    data="%s\n" % constants.JOB_QUEUE_VERSION)
    if self._ReadSerial() is None:
      utils.WriteFile(constants.JOB_QUEUE_SERIAL_FILE,
                      data="%s\n" % 0)

  def _NewSerialUnlocked(self, nodes):
    """Generates a new job identifier.

    Job identifiers are unique during the lifetime of a cluster.

    Returns: A string representing the job identifier.

    """
    assert self.lock_fd, "Queue should be open"

    # New number
    serial = self._last_serial + 1

    # Write to file
    utils.WriteFile(constants.JOB_QUEUE_SERIAL_FILE,
                    data="%s\n" % serial)

    # Keep it only if we were able to write the file
    self._last_serial = serial

    # Distribute the serial to the other nodes
    try:
      nodes.remove(self._my_hostname)
    except ValueError:
      pass

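    # Upload failures are only logged; the new serial is used anyway.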
    result = rpc.call_upload_file(nodes, constants.JOB_QUEUE_SERIAL_FILE)
    for node in nodes:
      if not result[node]:
        logging.error("copy of job queue file to node %s failed", node)

    return self.FormatJobID(serial)

  def _GetJobPath(self, job_id):
    return os.path.join(constants.QUEUE_DIR, "job-%s" % job_id)

  def _GetJobIDsUnlocked(self, archived=False):
    """Return all known job IDs.

    If the parameter archived is True, archived job IDs will be
    included. Currently this argument is unused.

    The method only looks at disk because it's a requirement that all
    jobs are present on disk (so in the _memcache we don't have any
    extra IDs).

    """
    jfiles = self._ListJobFiles()
    jlist = [m.group(1) for m in
             [self._RE_JOB_FILE.match(name) for name in jfiles]]
    jlist.sort()
    return jlist

  def _ListJobFiles(self):
    assert self.lock_fd, "Queue should be open"

    return [name for name in utils.ListVisibleFiles(constants.QUEUE_DIR)
            if self._RE_JOB_FILE.match(name)]

  def _LoadJobUnlocked(self, job_id):
    assert self.lock_fd, "Queue should be open"

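    # Serve the job from the in-memory cache whenever possible.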
    if job_id in self._memcache:
      logging.debug("Found job %s in memcache", job_id)
      return self._memcache[job_id]

    filepath = self._GetJobPath(job_id)
    logging.debug("Loading job from %s", filepath)
    try:
      fd = open(filepath, "r")
    except IOError, err:
      if err.errno in (errno.ENOENT, ):
        return None
      raise
    try:
      data = serializer.LoadJson(fd.read())
    finally:
      fd.close()

    job = _QueuedJob.Restore(self, data)
    self._memcache[job_id] = job
    logging.debug("Added job %s to the cache", job_id)
    return job

  def _GetJobsUnlocked(self, job_ids):
    if not job_ids:
      job_ids = self._GetJobIDsUnlocked()

    return [self._LoadJobUnlocked(job_id) for job_id in job_ids]

  @utils.LockedMethod
  def GetJobs(self, job_ids):
    return self._GetJobsUnlocked(job_ids)

  @utils.LockedMethod
  def AddJob(self, ops, nodes):
    """Create and store on disk a new job.

    @type ops: list
    @param ops: The list of OpCodes that will become the new job.
    @type nodes: list
    @param nodes: The list of nodes to which the new job serial will be
                  distributed.

    """
    assert self.lock_fd, "Queue should be open"

    # Get job identifier
    job_id = self._NewSerialUnlocked(nodes)
    job = _QueuedJob(self, job_id, ops)

    # Write to disk
    self._UpdateJobUnlocked(job)

    logging.debug("Added new job %s to the cache", job_id)
    self._memcache[job_id] = job

    return job

  def _UpdateJobUnlocked(self, job):
    assert self.lock_fd, "Queue should be open"

    filename = self._GetJobPath(job.id)
    logging.debug("Writing job %s to %s", job.id, filename)
    utils.WriteFile(filename,
                    data=serializer.DumpJson(job.Serialize(), indent=False))
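    # Use the write as an opportunity to evict finished jobs from the cache.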
    self._CleanCacheUnlocked([job.id])

  def _CleanCacheUnlocked(self, exclude):
    """Clean the memory cache.

    The exclude argument contains job IDs that should not be
    cleaned.

    """
    assert isinstance(exclude, list)
    for job in self._memcache.values():
      if job.id in exclude:
        continue
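      # Only jobs that are no longer queued or running are evicted.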
      if job.GetStatus() not in (constants.JOB_STATUS_QUEUED,
                                 constants.JOB_STATUS_RUNNING):
        logging.debug("Cleaning job %s from the cache", job.id)
        try:
          del self._memcache[job.id]
        except KeyError:
          pass

  @utils.LockedMethod
  def UpdateJob(self, job):
    return self._UpdateJobUnlocked(job)

  def ArchiveJob(self, job_id):
    raise NotImplementedError()


class JobQueue:
  """The job queue.

  """
  def __init__(self, context):
    self._lock = threading.Lock()
    self._jobs = DiskJobStorage("")
    self._wpool = _JobQueueWorkerPool(context)

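    # Re-examine all jobs found on disk: queued jobs are handed back to
    # the worker pool, while jobs still marked as running were
    # interrupted by an unclean master shutdown and are flagged as such.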
    for job in self._jobs.GetJobs(None):
      status = job.GetStatus()
      if status in (constants.JOB_STATUS_QUEUED, ):
        self._wpool.AddTask(job)

      elif status in (constants.JOB_STATUS_RUNNING, ):
        logging.warning("Unfinished job %s found: %s", job.id, job)
        job.SetUnclean("Unclean master daemon shutdown")

  @utils.LockedMethod
  def SubmitJob(self, ops, nodes):
    """Add a new job to the queue.

    This enters the job into our job queue and also puts it on the new
    queue, in order for it to be picked up by the queue processors.

    @type ops: list
    @param ops: the sequence of opcodes that will become the new job
    @type nodes: list
    @param nodes: the list of nodes to which the queue should be
                  distributed

    """
    job = self._jobs.AddJob(ops, nodes)

    # Add to worker pool
    self._wpool.AddTask(job)

    return job.id

  def ArchiveJob(self, job_id):
    raise NotImplementedError()

  def CancelJob(self, job_id):
    raise NotImplementedError()

  def _GetJobInfo(self, job, fields):
    row = []
    for fname in fields:
      if fname == "id":
        row.append(job.id)
      elif fname == "status":
        row.append(job.GetStatus())
      elif fname == "ops":
        row.append([op.GetInput().__getstate__() for op in job._ops])
      elif fname == "opresult":
        row.append([op.GetResult() for op in job._ops])
      elif fname == "opstatus":
        row.append([op.GetStatus() for op in job._ops])
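      # "ticker" returns the last log entry of the currently running
      # opcode, or None if no opcode has started or logged anything yet.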
      elif fname == "ticker":
        ji = job.GetRunOpIndex()
        if ji < 0:
          lmsg = None
        else:
          lmsg = job._ops[ji].RetrieveLog(-1)
          # message might be empty here
          if lmsg:
            lmsg = lmsg[0]
          else:
            lmsg = None
        row.append(lmsg)
      else:
        raise errors.OpExecError("Invalid job query field '%s'" % fname)
    return row

  def QueryJobs(self, job_ids, fields):
    """Returns a list of jobs in queue.

    Args:
    - job_ids: Sequence of job identifiers or None for all
    - fields: Names of fields to return

    """
    self._lock.acquire()
    try:
      jobs = []

      for job in self._jobs.GetJobs(job_ids):
        if job is None:
          jobs.append(None)
        else:
          jobs.append(self._GetJobInfo(job, fields))

      return jobs
    finally:
      self._lock.release()

  @utils.LockedMethod
  def Shutdown(self):
    """Stops the job queue.

    """
    self._wpool.TerminateWorkers()
    self._jobs.Close()