Revision 73ff3118 tools/burnin

b/tools/burnin
41 41

  
42 42
USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
43 43

  
44
MAX_RETRIES = 3
44 45

  
45 46
class InstanceDown(Exception):
46 47
  """The checked instance was not up"""
47 48

  
48 49

  
50
class BurninFailure(Exception):
51
  """Failure detected during burnin.

  Raised after polling the submitted jobs when fewer results than jobs
  were collected, i.e. at least one job failed.

  """
52

  
53

  
49 54
def Usage():
50 55
  """Shows program usage information and exits the program."""
51 56

  
......
106 111
    self.to_rem = []
107 112
    self.queued_ops = []
108 113
    self.opts = None
114
    self.queue_retry = False
115
    self.disk_count = self.disk_growth = self.disk_size = None
116
    self.hvp = self.bep = None
109 117
    self.ParseOptions()
110 118
    self.cl = cli.GetClient()
111 119
    self.GetState()
......
125 133
    if self.opts.verbose:
126 134
      Log(msg, indent=3)
127 135

  
128
  def ExecOp(self, *ops):
136
  def MaybeRetry(self, retry_count, msg, fn, *args):
137
    """Possibly retry a given function execution.
138

  
139
    @type retry_count: int
140
    @param retry_count: retry counter:
141
        - 0: non-retryable action
142
        - 1: last retry for a retryable action
143
        - MAX_RETRIES: original try for a retryable action
144
    @type msg: str
145
    @param msg: the kind of the operation
146
    @type fn: callable
147
    @param fn: the function to be called
148

  
149
    """
150
    try:
151
      val = fn(*args)
152
      if retry_count > 0 and retry_count < MAX_RETRIES:
153
        Log("Idempotent %s succeeded after %d retries" %
154
            (msg, MAX_RETRIES - retry_count))
155
      return val
156
    except Exception, err:
157
      if retry_count == 0:
158
        Log("Non-idempotent %s failed, aborting" % (msg, ))
159
        raise
160
      elif retry_count == 1:
161
        Log("Idempotent %s repeated failure, aborting" % (msg, ))
162
        raise
163
      else:
164
        Log("Idempotent %s failed, retry #%d/%d: %s" %
165
            (msg, MAX_RETRIES - retry_count + 1, MAX_RETRIES, err))
166
        self.MaybeRetry(retry_count - 1, msg, fn, *args)
167

  
168
  def _ExecOp(self, *ops):
129 169
    """Execute one or more opcodes and manage the exec buffer.
130 170

  
131 171
    @result: if only opcode has been passed, we return its result;
......
139 179
    else:
140 180
      return results
141 181

  
182
  def ExecOp(self, retry, *ops):
    """Execute one or more opcodes, optionally retrying on failure.

    This is a thin wrapper which runs L{_ExecOp} through L{MaybeRetry}.

    @param retry: if true, the execution starts with the full retry budget
        (MAX_RETRIES); otherwise it is treated as non-retryable
    @param ops: the opcodes, passed unchanged to L{_ExecOp}
    @return: whatever L{MaybeRetry} returns for the L{_ExecOp} call

    """
    # Start from "non-retryable" and upgrade only when asked to
    retry_budget = 0
    if retry:
      retry_budget = MAX_RETRIES
    return self.MaybeRetry(retry_budget, "opcode", self._ExecOp, *ops)
194

  
142 195
  def ExecOrQueue(self, name, *ops):
    """Run the given opcodes now, or queue them for a later commit.

    In parallel mode the opcodes are only recorded in the queue, paired
    with C{name}, and nothing is returned. Otherwise they are executed
    right away via L{ExecOp}, using the retry setting of the current batch.

    @param name: label stored next to the opcodes when they are queued
    @param ops: the opcodes to execute or queue
    @return: the L{ExecOp} result in non-parallel mode, None otherwise

    """
    if not self.opts.parallel:
      return self.ExecOp(self.queue_retry, *ops)
    self.queued_ops.append((ops, name))
201

  
202
  def StartBatch(self, retry):
    """Begin a new batch of jobs.

    Any previously queued opcodes are discarded (a fresh list is installed)
    and the retry flag for the new batch is recorded.

    @param retry: whether this is a retryable batch

    """
    self.queue_retry = retry
    self.queued_ops = []
148 210

  
149 211
  def CommitQueue(self):
    """Execute all queued opcodes; only meaningful for parallel burnin.

    Outside of parallel mode this does nothing and returns None. In
    parallel mode the queue is run as one job set through L{MaybeRetry}
    (retryable only if the current batch was started as such) and the
    queue is emptied afterwards, whether or not the execution succeeded.

    @return: the job set results in parallel mode, None otherwise

    """
    if self.opts.parallel:
      retry_budget = 0
      if self.queue_retry:
        retry_budget = MAX_RETRIES

      try:
        return self.MaybeRetry(retry_budget, "jobset", self.ExecJobSet,
                               self.queued_ops)
      finally:
        # The queue must never leak into the next batch, even on failure
        self.queued_ops = []
    return None
......
171 239
    results = []
172 240
    for jid, (_, iname) in zip(job_ids, jobs):
173 241
      Log("waiting for job %s for %s" % (jid, iname), indent=2)
174
      results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))
175

  
242
      try:
243
        results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))
244
      except Exception, err:
245
        Log("Job for %s failed: %s" % (iname, err))
246
    if len(results) != len(jobs):
247
      raise BurninFailure()
176 248
    return results
177 249

  
178 250
  def ParseOptions(self):
......
325 397
    try:
326 398
      op = opcodes.OpQueryNodes(output_fields=["name", "offline", "drained"],
327 399
                                names=names, use_locking=True)
328
      result = self.ExecOp(op)
400
      result = self.ExecOp(True, op)
329 401
    except errors.GenericError, err:
330 402
      err_code, msg = cli.FormatError(err)
331 403
      Err(msg, exit_code=err_code)
332 404
    self.nodes = [data[0] for data in result if not (data[1] or data[2])]
333 405

  
334
    result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"],
335
                                              names=[]))
406
    op_diagos = opcodes.OpDiagnoseOS(output_fields=["name", "valid"], names=[])
407
    result = self.ExecOp(True, op_diagos)
336 408

  
337 409
    if not result:
338 410
      Err("Can't get the OS list")
......
347 419
    """Create the given instances.
348 420

  
349 421
    """
422
    self.StartBatch(False)
350 423
    self.to_rem = []
351 424
    mytor = izip(cycle(self.nodes),
352 425
                 islice(cycle(self.nodes), 1, None),
......
396 469
  def BurnGrowDisks(self):
397 470
    """Grow both the os and the swap disks by the requested amount, if any."""
398 471
    Log("Growing disks")
472
    self.StartBatch(False)
399 473
    for instance in self.instances:
400 474
      Log("instance %s" % instance, indent=1)
401 475
      for idx, growth in enumerate(self.disk_growth):
......
409 483
  def BurnReplaceDisks1D8(self):
410 484
    """Replace disks on primary and secondary for drbd8."""
411 485
    Log("Replacing disks on the same nodes")
486
    self.StartBatch(True)
412 487
    for instance in self.instances:
413 488
      Log("instance %s" % instance, indent=1)
414 489
      ops = []
......
424 499
  def BurnReplaceDisks2(self):
425 500
    """Replace secondary node."""
426 501
    Log("Changing the secondary node")
502
    self.StartBatch(True)
427 503
    mode = constants.REPLACE_DISK_CHG
428 504

  
429 505
    mytor = izip(islice(cycle(self.nodes), 2, None),
......
447 523
  def BurnFailover(self):
448 524
    """Failover the instances."""
449 525
    Log("Failing over instances")
526
    self.StartBatch(False)
450 527
    for instance in self.instances:
451 528
      Log("instance %s" % instance, indent=1)
452 529
      op = opcodes.OpFailoverInstance(instance_name=instance,
......
460 537
  def BurnMigrate(self):
461 538
    """Migrate the instances."""
462 539
    Log("Migrating instances")
540
    self.StartBatch(False)
463 541
    for instance in self.instances:
464 542
      Log("instance %s" % instance, indent=1)
465 543
      op1 = opcodes.OpMigrateInstance(instance_name=instance, live=True,
......
476 554

  
477 555
    """
478 556
    Log("Exporting and re-importing instances")
557
    self.StartBatch(False)
479 558
    mytor = izip(cycle(self.nodes),
480 559
                 islice(cycle(self.nodes), 1, None),
481 560
                 islice(cycle(self.nodes), 2, None),
......
486 565
      # read the full name of the instance
487 566
      nam_op = opcodes.OpQueryInstances(output_fields=["name"],
488 567
                                        names=[instance], use_locking=True)
489
      full_name = self.ExecOp(nam_op)[0][0]
568
      full_name = self.ExecOp(False, nam_op)[0][0]
490 569

  
491 570
      if self.opts.iallocator:
492 571
        pnode = snode = None
......
555 634
  def BurnStopStart(self):
556 635
    """Stop/start the instances."""
557 636
    Log("Stopping and starting instances")
637
    self.StartBatch(True)
558 638
    for instance in self.instances:
559 639
      Log("instance %s" % instance, indent=1)
560 640
      op1 = self.StopInstanceOp(instance)
......
568 648

  
569 649
  def BurnRemove(self):
570 650
    """Remove the instances."""
651
    self.StartBatch(False)
571 652
    Log("Removing instances")
572 653
    for instance in self.to_rem:
573 654
      Log("instance %s" % instance, indent=1)
......
594 675
      op_rename2 = self.RenameInstanceOp(rename, instance)
595 676
      op_start1 = self.StartInstanceOp(rename)
596 677
      op_start2 = self.StartInstanceOp(instance)
597
      self.ExecOp(op_stop1, op_rename1, op_start1)
678
      self.ExecOp(False, op_stop1, op_rename1, op_start1)
598 679
      self._CheckInstanceAlive(rename)
599
      self.ExecOp(op_stop2, op_rename2, op_start2)
680
      self.ExecOp(False, op_stop2, op_rename2, op_start2)
600 681
      self._CheckInstanceAlive(instance)
601 682

  
602 683
  def BurnReinstall(self):
603 684
    """Reinstall the instances."""
604 685
    Log("Reinstalling instances")
686
    self.StartBatch(True)
605 687
    for instance in self.instances:
606 688
      Log("instance %s" % instance, indent=1)
607 689
      op1 = self.StopInstanceOp(instance)
......
621 703
  def BurnReboot(self):
622 704
    """Reboot the instances."""
623 705
    Log("Rebooting instances")
706
    self.StartBatch(True)
624 707
    for instance in self.instances:
625 708
      Log("instance %s" % instance, indent=1)
626 709
      ops = []
......
640 723
  def BurnActivateDisks(self):
641 724
    """Activate and deactivate disks of the instances."""
642 725
    Log("Activating/deactivating disks")
726
    self.StartBatch(True)
643 727
    for instance in self.instances:
644 728
      Log("instance %s" % instance, indent=1)
645 729
      op_start = self.StartInstanceOp(instance)
......
657 741
  def BurnAddRemoveDisks(self):
658 742
    """Add and remove an extra disk for the instances."""
659 743
    Log("Adding and removing disks")
744
    self.StartBatch(False)
660 745
    for instance in self.instances:
661 746
      Log("instance %s" % instance, indent=1)
662 747
      op_add = opcodes.OpSetInstanceParams(\
......
676 761
  def BurnAddRemoveNICs(self):
677 762
    """Add and remove an extra NIC for the instances."""
678 763
    Log("Adding and removing NICs")
764
    self.StartBatch(False)
679 765
    for instance in self.instances:
680 766
      Log("instance %s" % instance, indent=1)
681 767
      op_add = opcodes.OpSetInstanceParams(\

Also available in: Unified diff