Revision 5178f1bc tools/burnin

b/tools/burnin
27 27
import sys
28 28
import optparse
29 29
import time
30
import socket
31
import urllib2
32
import errno
30 33
from itertools import izip, islice, cycle
31 34
from cStringIO import StringIO
32 35

  
......
41 44
USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
42 45

  
43 46

  
47
class InstanceDown(Exception):
48
  """The checked instance was not up.

  Raised by the http liveness check; the exception arguments are the
  instance name followed by a description of what went wrong.

  """
49

  
50

  
44 51
def Usage():
45 52
  """Shows program usage information and exits the program."""
46 53

  
......
190 197
                      dest="parallel",
191 198
                      help="Enable parallelization of some operations in"
192 199
                      " order to speed burnin or to test granular locking")
200
    parser.add_option("--net-timeout", default=15, type="int",
201
                      dest="net_timeout",
202
                      help="The instance check network timeout in seconds"
203
                      " (defaults to 15 seconds)")
204
    parser.add_option("-C", "--http-check", default=False, action="store_true",
205
                      dest="http_check",
206
                      help="Enable checking of instance status via http,"
207
                      " looking for /hostname.txt that should contain the"
208
                      " name of the instance")
209

  
193 210

  
194 211
    options, args = parser.parse_args()
195 212
    if len(args) < 1 or options.os is None:
......
205 222

  
206 223
    if options.disk_template == constants.DT_DISKLESS:
207 224
      disk_size = disk_growth = []
208
      opts.do_addremove_disks = False
225
      options.do_addremove_disks = False
209 226
    else:
210 227
      disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
211 228
      disk_growth = [utils.ParseUnit(v)
......
234 251
      }
235 252
    self.hvp = {}
236 253

  
254
    socket.setdefaulttimeout(options.net_timeout)
255

  
237 256
  def GetState(self):
238 257
    """Read the cluster state from the config."""
239 258
    if self.opts.nodes:
......
314 333
    if self.opts.parallel:
315 334
      self.ExecJobSet(jobset)
316 335

  
336
    for instance in self.instances:
337
      self._CheckInstanceAlive(instance)
338

  
317 339
  def GrowDisks(self):
318 340
    """Grow both the os and the swap disks by the requested amount, if any."""
319 341
    for instance in self.instances:
......
360 382

  
361 383
      Log("- Failover instance %s" % (instance))
362 384
      self.ExecOp(op)
385
    for instance in self.instances:
386
      self._CheckInstanceAlive(instance)
363 387

  
364 388
  def ImportExport(self):
365 389
    """Export the instance, delete it, and import it back.
......
429 453

  
430 454
      self.to_rem.append(instance)
431 455

  
456
    for instance in self.instances:
457
      self._CheckInstanceAlive(instance)
458

  
432 459
  def StopInstance(self, instance):
433 460
    """Stop given instance."""
434 461
    op = opcodes.OpShutdownInstance(instance_name=instance)
......
454 481
      self.StopInstance(instance)
455 482
      self.StartInstance(instance)
456 483

  
484
    for instance in self.instances:
485
      self._CheckInstanceAlive(instance)
486

  
457 487
  def Remove(self):
458 488
    """Remove the instances."""
459 489
    for instance in self.to_rem:
......
469 499
      self.StopInstance(instance)
470 500
      self.RenameInstance(instance, rename)
471 501
      self.StartInstance(rename)
502
      self._CheckInstanceAlive(rename)
472 503
      self.StopInstance(rename)
473 504
      self.RenameInstance(rename, instance)
474 505
      self.StartInstance(instance)
475 506

  
507
    for instance in self.instances:
508
      self._CheckInstanceAlive(instance)
509

  
476 510
  def Reinstall(self):
477 511
    """Reinstall the instances."""
478 512
    for instance in self.instances:
......
485 519
      Log("- Reinstall instance %s specifying the OS" % (instance,))
486 520
      self.ExecOp(op)
487 521
      self.StartInstance(instance)
522
    for instance in self.instances:
523
      self._CheckInstanceAlive(instance)
488 524

  
489 525
  def Reboot(self):
490 526
    """Reboot the instances."""
......
495 531
                                      ignore_secondaries=False)
496 532
        Log("- Reboot instance %s with type '%s'" % (instance, reboot_type))
497 533
        self.ExecOp(op)
534
        self._CheckInstanceAlive(instance)
498 535

  
499 536
  def ActivateDisks(self):
500 537
    """Activate and deactivate disks of the instances."""
......
509 546
      Log("- Deactivate disks of offline instance %s" % (instance,))
510 547
      self.ExecOp(op_deact)
511 548
      self.StartInstance(instance)
549
    for instance in self.instances:
550
      self._CheckInstanceAlive(instance)
512 551

  
513 552
  def AddRemoveDisks(self):
514 553
    """Add and remove an extra disk for the instances."""
......
524 563
      Log("- Removing the last disk of instance %s" % (instance,))
525 564
      self.ExecOp(op_rem)
526 565
      self.StartInstance(instance)
566
    for instance in self.instances:
567
      self._CheckInstanceAlive(instance)
527 568

  
528 569
  def AddRemoveNICs(self):
529 570
    """Add and remove an extra NIC for the instances."""
......
537 578
      Log("- Removing the last NIC of instance %s" % (instance,))
538 579
      self.ExecOp(op_rem)
539 580

  
581
  def _CheckInstanceAlive(self, instance):
582
    """Check if an instance is alive by doing http checks.
583

  
584
    This will try to retrieve the url on the instance /hostname.txt
585
    and check that it contains the hostname of the instance. In case
586
    we get ECONNREFUSED, we retry up to the net timeout seconds, for
587
    any other error we abort.
588

  
589
    """
590
    if not self.opts.http_check:
591
      return
592
    try:
593
      for retries in range(self.opts.net_timeout):
594
        try:
595
          url = urllib2.urlopen("http://%s/hostname.txt" % instance)
596
        except urllib2.URLError, err:
597
          if err.args[0][0] == errno.ECONNREFUSED:
598
            time.sleep(1)
599
            continue
600
          raise
601
    except urllib2.URLError, err:
602
      raise InstanceDown(instance, str(err))
603
    hostname = url.read().strip()
604
    if hostname != instance:
605
      raise InstanceDown(instance, ("Hostname mismatch, expected %s, got %s" %
606
                                    (instance, hostname)))
607

  
540 608
  def BurninCluster(self):
541 609
    """Test a cluster intensively.
542 610

  

Also available in: Unified diff