Statistics
| Branch: | Tag: | Revision:

root / testing / ganeti.qa.py @ 6f11f250

History | View | Annotate | Download (17.4 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Script for doing Q&A on Ganeti"""
23

    
24
import os
25
import re
26
import sys
27
import yaml
28
import time
29

    
30
from datetime import datetime
31
from optparse import OptionParser
32

    
33
# I want more flexibility for testing over SSH, therefore I'm not using
34
# Ganeti's ssh module.
35
import subprocess
36

    
37
from ganeti import utils
38
from ganeti import constants
39

    
40
# {{{ Global variables
41
# Test configuration as loaded from the YAML file by the main program
cfg = None
# Command line options as parsed by the main program
options = None
43
# }}}
44

    
45
# {{{ Errors
46
class Error(Exception):
  """Generic failure raised while running the Q&A tests.

  The more specific errors of this script derive from this class.

  """
51

    
52

    
53
class OutOfNodesError(Error):
  """Raised when no usable node is left to hand out.

  """
58

    
59

    
60
class OutOfInstancesError(Error):
  """Raised when all configured instances are already in use.

  """
65
# }}}
66

    
67
# {{{ Utilities
68
def TestEnabled(test):
  """Tells whether a test is switched on in the configuration.

  Looks the name up in the 'tests' mapping of the global configuration. A
  missing mapping or a missing entry yields False.

  Args:
    test: Name of the test

  """
  enabled_tests = cfg.get('tests', {})
  return enabled_tests.get(test, False)
71

    
72

    
73
def RunTest(callable, *args):
  """Prints a banner describing a test and then executes it.

  The banner consists of an empty line, a timestamp line, the first line of
  the test's docstring (or the test's repr when it has no docstring) and a
  separator line.

  Args:
    callable: Test function to run
    args: Positional arguments handed to the test function

  Returns:
    Whatever the test function returns

  """
  doc = callable.__doc__
  if doc:
    desc = doc.splitlines()[0].strip()
  else:
    desc = '%r' % callable

  timestamp = str(datetime.now())
  padding = '-' * (55 - len(timestamp))

  # Single-argument print calls produce the same output as the print
  # statement
  print('')
  print('--- %s %s' % (timestamp, padding))
  print(desc)
  print('-' * 60)

  return callable(*args)
90

    
91

    
92
def AssertEqual(first, second, msg=None):
  """Verifies that two values compare equal.

  Args:
    first: First value
    second: Second value
    msg: Optional error message replacing the generated one

  Raises:
    Error: When the values don't compare equal

  """
  if first == second:
    return

  if not msg:
    msg = '%r == %r' % (first, second)
  raise Error(msg)
98

    
99

    
100
def GetSSHCommand(node, cmd, strict=True):
  """Assembles the argument list for running a command on a node via SSH.

  The connection is made as root in batch mode. With the --dry-run option
  the remote command is prefixed with "exit 0; ", and with --verbose the
  quoted command line is printed.

  Args:
    node: Host to connect to
    cmd: Shell command to be run on the node
    strict: Whether to enable strict host key checking

  Returns:
    List of arguments usable with subprocess.Popen

  """
  if strict:
    host_key_checking = 'yes'
  else:
    host_key_checking = 'no'

  ssh_args = ['ssh', '-oEscapeChar=none', '-oBatchMode=yes', '-l', 'root',
              '-oStrictHostKeyChecking=%s' % host_key_checking,
              node]

  if options.dry_run:
    # The remote shell exits successfully before reaching the real command
    ssh_args.append('exit 0; ' + cmd)
  else:
    ssh_args.append('' + cmd)

  if options.verbose:
    print('SSH: %s' % utils.ShellQuoteArgs(ssh_args))

  return ssh_args
124

    
125

    
126
def StartSSH(node, cmd, strict=True):
  """Launches a command on a node via SSH without waiting for it.

  Args:
    node: Host to connect to
    cmd: Shell command to be run on the node
    strict: Whether to enable strict host key checking

  Returns:
    The subprocess.Popen object of the started ssh process

  """
  ssh_args = GetSSHCommand(node, cmd, strict=strict)
  process = subprocess.Popen(ssh_args, shell=False)
  return process
132

    
133

    
134
def UploadFile(node, file):
  """Uploads a file to a node and returns the filename.

  The local file is fed through SSH into a temporary file created on the
  node. The temporary file is created with mode 0700 when the local file is
  executable by its owner, otherwise with mode 0600.

  Caller needs to remove the file when it's not needed anymore.

  Args:
    node: Host to upload the file to
    file: Path of the local file

  Returns:
    Name of the temporary file on the node

  Raises:
    Error: When the remote command exits with a non-zero status

  """
  import stat

  if os.stat(file).st_mode & stat.S_IXUSR:
    mode = '0700'
  else:
    mode = '0600'

  cmd = ('tmp=$(tempfile --mode %s --prefix gnt) && '
         '[[ -f "${tmp}" ]] && '
         'cat > "${tmp}" && '
         'echo "${tmp}"') % mode

  f = open(file, 'r')
  try:
    p = subprocess.Popen(GetSSHCommand(node, cmd), shell=False, stdin=f,
                         stdout=subprocess.PIPE)
    # communicate() drains stdout while waiting for the process; calling
    # wait() before reading can block once the pipe buffer is full.
    (stdout, _) = p.communicate()
  finally:
    f.close()

  AssertEqual(p.returncode, 0)

  return stdout.strip()
160
# }}}
161

    
162
# {{{ Config helpers
163
def GetMasterNode():
  """Returns the master node, i.e. the first node in the configuration.

  """
  nodes = cfg['nodes']
  return nodes[0]
165

    
166

    
167
def AcquireInstance():
  """Reserves the first configured instance which isn't in use.

  The returned instance is flagged as used until ReleaseInstance is called
  for it.

  Raises:
    OutOfInstancesError: When every instance is in use

  """
  for candidate in cfg['instances']:
    if candidate.get('_used', False):
      continue
    candidate['_used'] = True
    return candidate

  raise OutOfInstancesError("No instances left")
182

    
183

    
184
def ReleaseInstance(inst):
  """Marks an instance as no longer in use.

  Args:
    inst: Instance as returned by AcquireInstance

  """
  inst['_used'] = False
186

    
187

    
188
def AcquireNode(exclude=None):
  """Returns the least used node.

  Only the master node and nodes flagged as added to the cluster are
  considered. Among equally used nodes the one with the smallest primary
  name wins. The use counter of the returned node is incremented.

  Args:
    exclude: Node which must not be returned, or None

  Raises:
    OutOfNodesError: When no suitable node is available

  """
  master = GetMasterNode()

  # Filter out unwanted nodes
  candidates = []
  for item in cfg['nodes']:
    if exclude is not None and item == exclude:
      continue
    if item.get('_added', False) or item == master:
      candidates.append(item)

  if not candidates:
    raise OutOfNodesError("No nodes left")

  # Order by number of uses first and by primary name second
  candidates.sort(key=lambda item: (item.get('_count', 0), item['primary']))

  node = candidates[0]
  node['_count'] = node.get('_count', 0) + 1
  return node
220

    
221

    
222
def ReleaseNode(node):
  """Decrements the use counter of a node.

  Args:
    node: Node as returned by AcquireNode

  """
  uses = node.get('_count', 0)
  node['_count'] = uses - 1
224
# }}}
225

    
226
# {{{ Environment tests
227
def TestConfig():
  """Test configuration for sanity.

  Raises:
    Error: When the configuration lists no node or no instance

  """
  requirements = (('nodes', "Need at least one node"),
                  ('instances', "Need at least one instance"))
  for (key, message) in requirements:
    if len(cfg[key]) < 1:
      raise Error(message)
  # TODO: Add more checks
236

    
237

    
238
def TestSshConnection():
  """Test SSH connection.

  Runs "exit" on every configured node and expects a zero exit status.

  """
  for node in cfg['nodes']:
    ssh = StartSSH(node['primary'], 'exit')
    AssertEqual(ssh.wait(), 0)
244

    
245

    
246
def TestGanetiCommands():
  """Test availability of Ganeti commands.

  Calls each Ganeti program with --version on every configured node and
  expects the whole chain to exit with status zero.

  """
  # NOTE: the first docstring line is printed by RunTest as the test header
  programs = ('gnt-cluster',
              'gnt-os',
              'gnt-node',
              'gnt-instance',
              'gnt-backup',
              'ganeti-noded',
              'ganeti-watcher')

  cmd = ' && '.join([utils.ShellQuoteArgs([program, '--version'])
                     for program in programs])

  for node in cfg['nodes']:
    AssertEqual(StartSSH(node['primary'], cmd).wait(), 0)
262

    
263

    
264
def TestIcmpPing():
  """ICMP ping each node.

  Every node pings the primary and, where configured, the secondary address
  of all nodes, itself included.

  """
  # The addresses, and hence the command, are the same for every node, so
  # they are computed only once.
  addresses = []
  for target in cfg['nodes']:
    addresses.append(target['primary'])
    # Same truthiness check as used for 'secondary' by the cluster init and
    # node add tests; skips a key which is present but empty.
    if target.get('secondary', None):
      addresses.append(target['secondary'])

  cmd = ' && '.join([utils.ShellQuoteArgs(['ping', '-w', '3', '-c', '1', ip])
                     for ip in addresses])

  for node in cfg['nodes']:
    AssertEqual(StartSSH(node['primary'], cmd).wait(), 0)
279
# }}}
280

    
281
# {{{ Cluster tests
282
def TestClusterInit():
  """gnt-cluster init

  Runs the command on the master node. The secondary IP and the bridge are
  only passed when they are set in the configuration.

  """
  master = GetMasterNode()

  cmd = ['gnt-cluster', 'init']

  secondary = master.get('secondary', None)
  if secondary:
    cmd.append('--secondary-ip=%s' % secondary)

  bridge = cfg.get('bridge', None)
  if bridge:
    cmd.append('--bridge=%s' % bridge)

  cmd.append(cfg['name'])

  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)
295

    
296

    
297
def TestClusterVerify():
  """gnt-cluster verify

  Runs the command on the master node and expects a zero exit status.

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-cluster', 'verify'])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
302

    
303

    
304
def TestClusterInfo():
  """gnt-cluster info

  Runs the command on the master node and expects a zero exit status.

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-cluster', 'info'])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
309

    
310

    
311
def TestClusterBurnin():
  """Burnin

  Uploads the burnin tool to the master node and runs it there against up
  to 'burnin-instances' (default: 1) instances from the configuration. The
  uploaded script is removed and the instances are released afterwards.

  Raises:
    Error: When not even one instance is available

  """
  master = GetMasterNode()

  # Get as many instances as we need
  instances = []
  try:
    for _ in range(0, cfg.get('options', {}).get('burnin-instances', 1)):
      instances.append(AcquireInstance())
  except OutOfInstancesError:
    print("Not enough instances, continuing anyway.")

  if len(instances) < 1:
    raise Error("Burnin needs at least one instance")

  # Run burnin
  try:
    script = UploadFile(master['primary'], '../tools/burnin')
    try:
      burnin_cmd = [script, '--os=%s' % cfg['os']]
      burnin_cmd.extend([inst['name'] for inst in instances])
      ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(burnin_cmd))
      AssertEqual(ssh.wait(), 0)
    finally:
      # Remove the uploaded script no matter how burnin ended
      cleanup_cmd = ['rm', '-f', script]
      ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cleanup_cmd))
      AssertEqual(ssh.wait(), 0)
  finally:
    for inst in instances:
      ReleaseInstance(inst)
341

    
342

    
343
def TestClusterMasterFailover():
  """gnt-cluster masterfailover

  Runs the command first on another node and then on the original master
  node.

  """
  master = GetMasterNode()
  remote_cmd_args = ['gnt-cluster', 'masterfailover']

  failovermaster = AcquireNode(exclude=master)
  try:
    # Same command on both nodes: the acquired node first, the original
    # master second
    for target in (failovermaster, master):
      ssh = StartSSH(target['primary'], utils.ShellQuoteArgs(remote_cmd_args))
      AssertEqual(ssh.wait(), 0)
  finally:
    ReleaseNode(failovermaster)
358

    
359

    
360
def TestClusterDestroy():
  """gnt-cluster destroy

  Runs the command with --yes-do-it on the master node.

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-cluster', 'destroy', '--yes-do-it'])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
365
# }}}
366

    
367
# {{{ Node tests
368
def _NodeAdd(node):
  """Adds a node to the cluster using "gnt-node add" on the master node.

  On success the node is flagged as added.

  Args:
    node: Node to be added

  Raises:
    Error: When the node is already flagged as added

  """
  if node.get('_added', False):
    raise Error("Node %s already in cluster" % node['primary'])

  cmd = ['gnt-node', 'add']
  secondary = node.get('secondary', None)
  if secondary:
    cmd.append('--secondary-ip=%s' % secondary)
  cmd.append(node['primary'])

  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)

  node['_added'] = True
380

    
381

    
382
def TestNodeAddAll():
  """Adding all nodes to cluster.

  The master node is skipped.

  """
  master = GetMasterNode()
  for node in cfg['nodes']:
    if node == master:
      continue
    _NodeAdd(node)
388

    
389

    
390
def _NodeRemove(node):
  """Removes a node using "gnt-node remove" on the master node.

  On success the node is no longer flagged as added.

  Args:
    node: Node to be removed

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-node', 'remove', node['primary']])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
  node['_added'] = False
395

    
396

    
397
def TestNodeRemoveAll():
  """Removing all nodes from cluster.

  The master node is skipped.

  """
  master = GetMasterNode()
  for node in cfg['nodes']:
    if node == master:
      continue
    _NodeRemove(node)
403
# }}}
404

    
405
# {{{ Instance tests
406
def _DiskTest(node, instance, args):
  """Creates an instance using "gnt-instance add" on the master node.

  OS type, OS size, swap size and memory are taken from the configuration.

  Args:
    node: Primary node of the new instance
    instance: Instance to be created
    args: Additional command line arguments, may be empty or None

  Returns:
    The instance passed in

  """
  cmd = ['gnt-instance', 'add']
  cmd.append('--os-type=%s' % cfg['os'])
  cmd.append('--os-size=%s' % cfg['os-size'])
  cmd.append('--swap-size=%s' % cfg['swap-size'])
  cmd.append('--memory=%s' % cfg['mem'])
  cmd.append('--node=%s' % node['primary'])
  if args:
    cmd.extend(args)
  cmd.append(instance['name'])

  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)
  return instance
420

    
421

    
422
def TestInstanceAddWithPlainDisk(node):
  """gnt-instance add -t plain

  Args:
    node: Primary node of the new instance

  Returns:
    The instance acquired for the test

  """
  instance = AcquireInstance()
  extra_args = ['--disk-template=plain']
  return _DiskTest(node, instance, extra_args)
425

    
426

    
427
def TestInstanceAddWithLocalMirrorDisk(node):
  """gnt-instance add -t local_raid1

  Args:
    node: Primary node of the new instance

  Returns:
    The instance acquired for the test

  """
  instance = AcquireInstance()
  extra_args = ['--disk-template=local_raid1']
  return _DiskTest(node, instance, extra_args)
430

    
431

    
432
def TestInstanceAddWithRemoteRaidDisk(node, node2):
  """gnt-instance add -t remote_raid1

  Args:
    node: Primary node of the new instance
    node2: Secondary node of the new instance

  Returns:
    The instance acquired for the test

  """
  instance = AcquireInstance()
  extra_args = ['--disk-template=remote_raid1',
                '--secondary-node=%s' % node2['primary']]
  return _DiskTest(node, instance, extra_args)
437

    
438

    
439
def TestInstanceRemove(instance):
  """gnt-instance remove

  Removes the instance via the master node and marks it as unused again.

  Args:
    instance: Instance to be removed

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-instance', 'remove', '-f',
                                     instance['name']])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)

  ReleaseInstance(instance)
446

    
447

    
448
def TestInstanceStartup(instance):
  """gnt-instance startup

  Args:
    instance: Instance to be started

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-instance', 'startup',
                                     instance['name']])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
453

    
454

    
455
def TestInstanceShutdown(instance):
  """gnt-instance shutdown

  Args:
    instance: Instance to be shut down

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-instance', 'shutdown',
                                     instance['name']])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
460

    
461

    
462
def TestInstanceFailover(instance):
  """gnt-instance failover

  Args:
    instance: Instance to be failed over

  """
  master = GetMasterNode()
  remote_cmd = utils.ShellQuoteArgs(['gnt-instance', 'failover', '--force',
                                     instance['name']])
  AssertEqual(StartSSH(master['primary'], remote_cmd).wait(), 0)
467
# }}}
468

    
469
# {{{ Daemon tests
470
def _ResolveInstanceName(instance):
  """Gets the full Xen name of an instance.

  Runs "gnt-instance info" on the master node and extracts the value of the
  "Instance name:" line using sed.

  Args:
    instance: Instance whose name should be resolved

  Returns:
    Output of the remote pipeline with surrounding whitespace removed

  Raises:
    Error: When the remote pipeline exits with a non-zero status

  """
  master = GetMasterNode()

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', instance['name']])
  sed_cmd = utils.ShellQuoteArgs(['sed', '-n', '-e', 's/^Instance name: *//p'])

  cmd = '%s | %s' % (info_cmd, sed_cmd)
  p = subprocess.Popen(GetSSHCommand(master['primary'], cmd), shell=False,
                       stdout=subprocess.PIPE)
  # communicate() drains stdout while waiting for the process; calling
  # wait() before reading can block once the pipe buffer is full.
  (stdout, _) = p.communicate()
  AssertEqual(p.returncode, 0)

  return stdout.strip()
485

    
486

    
487
def _InstanceRunning(node, name):
  """Checks whether an instance is running.

  The check is the exit status of "xm list" for the instance, run on the
  given node with its output discarded.

  Args:
    node: Node the instance runs on
    name: Full name of Xen instance

  Returns:
    True when "xm list" exits with status zero, False otherwise

  """
  list_cmd = utils.ShellQuoteArgs(['xm', 'list', name])
  exit_status = StartSSH(node['primary'], list_cmd + ' >/dev/null').wait()
  return exit_status == 0
497

    
498

    
499
def _XmShutdownInstance(node, name):
  """Shuts down instance using "xm" and waits for completion.

  Args:
    node: Node the instance runs on
    name: Full name of Xen instance

  Raises:
    Error: When "xm shutdown" fails or the instance is still running after
      about a minute

  """
  cmd = ['xm', 'shutdown', name]
  # Run on the node hosting the instance, which is the node polled by
  # _InstanceRunning below, and not on the master node.
  AssertEqual(StartSSH(node['primary'],
                       utils.ShellQuoteArgs(cmd)).wait(), 0)

  # Wait up to a minute
  end = time.time() + 60
  while time.time() <= end:
    if not _InstanceRunning(node, name):
      break
    time.sleep(5)
  else:
    # The loop ran out of time without seeing the instance stopped
    raise Error("xm shutdown failed")
518

    
519

    
520
def _ResetWatcherDaemon(node):
  """Removes the watcher daemon's state file.

  Args:
    node: Node to be reset

  """
  remote_cmd = utils.ShellQuoteArgs(['rm', '-f', constants.WATCHER_STATEFILE])
  AssertEqual(StartSSH(node['primary'], remote_cmd).wait(), 0)
529

    
530

    
531
def TestInstanceAutomaticRestart(node, instance):
  """Test automatic restart of instance by ganeti-watcher.

  Note: takes up to 6 minutes to complete.

  Args:
    node: Node the instance runs on
    instance: Instance to be tested

  Raises:
    Error: When the instance isn't running again within 330 seconds

  """
  master = GetMasterNode()
  inst_name = _ResolveInstanceName(instance)

  _ResetWatcherDaemon(node)
  _XmShutdownInstance(node, inst_name)

  # Poll every 15 seconds, for a bit more than five minutes, until the
  # instance is running again
  deadline = time.time() + 330
  while time.time() <= deadline:
    if _InstanceRunning(node, inst_name):
      break
    time.sleep(15)
  else:
    raise Error("Daemon didn't restart instance in time")

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', inst_name])
  AssertEqual(StartSSH(master['primary'], info_cmd).wait(), 0)
556

    
557

    
558
def TestInstanceConsecutiveFailures(node, instance):
  """Test five consecutive instance failures.

  Note: takes at least 35 minutes to complete.

  Args:
    node: Node the instance runs on
    instance: Instance to be tested

  Raises:
    Error: When the instance is found running during the final check

  """
  master = GetMasterNode()
  inst_name = _ResolveInstanceName(instance)

  _ResetWatcherDaemon(node)
  _XmShutdownInstance(node, inst_name)

  # For 35 minutes, shut the instance down again whenever it's found running
  shutdown_until = time.time() + (35 * 60)
  while time.time() <= shutdown_until:
    if _InstanceRunning(node, inst_name):
      _XmShutdownInstance(node, inst_name)
    time.sleep(30)

  # Afterwards the instance has to stay down for another 330 seconds
  quiet_until = time.time() + 330
  while time.time() <= quiet_until:
    if _InstanceRunning(node, inst_name):
      raise Error("Instance started when it shouldn't")
    time.sleep(30)

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', inst_name])
  AssertEqual(StartSSH(master['primary'], info_cmd).wait(), 0)
587
# }}}
588

    
589
# {{{ Main program
590
if __name__ == '__main__':
  # {{{ Option parsing
  parser = OptionParser(usage="%prog [options] <configfile>")
  # NOTE(review): --cleanup is accepted but not evaluated in this block
  parser.add_option('--cleanup', dest='cleanup',
      action="store_true",
      help="Clean up cluster after testing?")
  parser.add_option('--dry-run', dest='dry_run',
      action="store_true",
      help="Show what would be done")
  parser.add_option('--verbose', dest='verbose',
      action="store_true",
      help="Verbose output")
  parser.add_option('--yes-do-it', dest='yes_do_it',
      action="store_true",
      help="Really execute the tests")
  (options, args) = parser.parse_args()
  # }}}

  if len(args) != 1:
    # Usage errors are reported through optparse (usage text on stderr, exit
    # status 2) instead of raising SyntaxError, which ended in a traceback.
    parser.error("Exactly one configuration file is expected")
  config_file = args[0]

  # Safety net: nothing is executed without explicit confirmation
  if not options.yes_do_it:
    print("Executing this script irreversibly destroys any Ganeti\n"
          "configuration on all nodes involved. If you really want\n"
          "to start testing, supply the --yes-do-it option.")
    sys.exit(1)

  f = open(config_file, 'r')
  try:
    # NOTE(review): yaml.load can instantiate arbitrary objects; only use
    # configuration files from a trusted source.
    cfg = yaml.load(f.read())
  finally:
    f.close()

  RunTest(TestConfig)

  if TestEnabled('env'):
    RunTest(TestSshConnection)
    RunTest(TestIcmpPing)
    RunTest(TestGanetiCommands)

  # Cluster initialization and adding the nodes always run; the tests in
  # between depend on the configuration.
  RunTest(TestClusterInit)

  if TestEnabled('cluster-verify'):
    RunTest(TestClusterVerify)
    RunTest(TestClusterInfo)

  RunTest(TestNodeAddAll)

  if TestEnabled('cluster-burnin'):
    RunTest(TestClusterBurnin)

  if TestEnabled('cluster-master-failover'):
    RunTest(TestClusterMasterFailover)

  node = AcquireNode()
  try:
    if TestEnabled('instance-add-plain-disk'):
      instance = RunTest(TestInstanceAddWithPlainDisk, node)
      RunTest(TestInstanceShutdown, instance)
      RunTest(TestInstanceStartup, instance)

      # The watcher tests are only run on the plain disk instance
      if TestEnabled('instance-automatic-restart'):
        RunTest(TestInstanceAutomaticRestart, node, instance)

      if TestEnabled('instance-consecutive-failures'):
        RunTest(TestInstanceConsecutiveFailures, node, instance)

      RunTest(TestInstanceRemove, instance)
      del instance

    if TestEnabled('instance-add-local-mirror-disk'):
      instance = RunTest(TestInstanceAddWithLocalMirrorDisk, node)
      RunTest(TestInstanceShutdown, instance)
      RunTest(TestInstanceStartup, instance)
      RunTest(TestInstanceRemove, instance)
      del instance

    if TestEnabled('instance-add-remote-raid-disk'):
      # A second node is acquired to serve as the secondary node
      node2 = AcquireNode(exclude=node)
      try:
        instance = RunTest(TestInstanceAddWithRemoteRaidDisk, node, node2)
        RunTest(TestInstanceShutdown, instance)
        RunTest(TestInstanceStartup, instance)

        if TestEnabled('instance-failover'):
          RunTest(TestInstanceFailover, instance)

        RunTest(TestInstanceRemove, instance)
        del instance
      finally:
        ReleaseNode(node2)

  finally:
    ReleaseNode(node)

  RunTest(TestNodeRemoveAll)

  if TestEnabled('cluster-destroy'):
    RunTest(TestClusterDestroy)
691
# }}}
692

    
693
# vim: foldmethod=marker :