Statistics
| Branch: | Tag: | Revision:

root / testing / ganeti.qa.py @ a8083063

History | View | Annotate | Download (17.3 kB)

1
#!/usr/bin/python
2
#
3

    
4
# Copyright (C) 2006, 2007 Google Inc.
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
10
#
11
# This program is distributed in the hope that it will be useful, but
12
# WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19
# 02110-1301, USA.
20

    
21

    
22
"""Script for doing Q&A on Ganeti"""
23

    
24
import os
25
import re
26
import sys
27
import yaml
28
import time
29

    
30
from datetime import datetime
31
from optparse import OptionParser
32

    
33
# I want more flexibility for testing over SSH, therefore I'm not using
34
# Ganeti's ssh module.
35
import subprocess
36

    
37
from ganeti import utils
38
from ganeti import constants
39

    
40
# {{{ Global variables
41
# Parsed configuration and parsed command line options; both are assigned
# by the main program before the first test runs.
cfg = options = None
43
# }}}
44

    
45
# {{{ Errors
46
class Error(Exception):
  """Base class for failures raised by this Q&A script.

  """
51

    
52

    
53
class OutOfNodesError(Error):
  """Raised when no usable node is left to hand out.

  """
58

    
59

    
60
class OutOfInstancesError(Error):
  """Raised when every configured instance is already in use.

  """
65
# }}}
66

    
67
# {{{ Utilities
68
def TestEnabled(test):
  """Looks up whether a test is switched on in the configuration.

  Returns the value stored for the test in the 'tests' section of the
  configuration, or False if the section or the entry is missing.

  """
  enabled_tests = cfg.get('tests', {})
  return enabled_tests.get(test, False)
71

    
72

    
73
def RunTest(callable, *args):
  """Prints a banner for a test and then executes it.

  The banner contains the current time and the first line of the test's
  docstring (or the repr of the test if it has no docstring).

  Args:
    callable: The test function
    args: Positional arguments passed on to the test

  Returns:
    Whatever the test returns

  """
  doc = callable.__doc__
  if not doc:
    title = '%r' % callable
  else:
    title = doc.splitlines()[0].strip()

  timestamp = str(datetime.now())
  banner = '--- %s %s' % (timestamp, '-' * (55 - len(timestamp)))

  # Empty line, banner, title and separator, one per line
  print('\n'.join(['', banner, title, '-' * 60]))

  return callable(*args)
90

    
91

    
92
def AssertEqual(first, second, msg=None):
  """Fails with Error unless both values compare equal.

  Args:
    first: First value
    second: Second value
    msg: Optional error message; by default both values are shown

  """
  if first == second:
    return
  raise Error(msg or '%r == %r' % (first, second))
98

    
99

    
100
def GetSSHCommand(node, cmd, strict=True):
  """Assembles the argument list for running a command on a node via SSH.

  Args:
    node: Host to connect to (as user root)
    cmd: Shell command line to run on the node
    strict: Whether to ask for strict host key checking

  Returns:
    List of arguments for the ssh client

  """
  host_key_checking = 'no'
  if strict:
    host_key_checking = 'yes'

  # In dry-run mode the remote shell exits before reaching the command
  prefix = ''
  if options.dry_run:
    prefix = 'exit 0; '

  args = [
    'ssh', '-oEscapeChar=none', '-oBatchMode=yes', '-l', 'root',
    '-oStrictHostKeyChecking=%s' % host_key_checking,
    node,
    prefix + cmd,
    ]

  if options.verbose:
    print('SSH: %s' % utils.ShellQuoteArgs(args))

  return args
124

    
125

    
126
def StartSSH(node, cmd, strict=True):
  """Launches a command on a node via SSH without waiting for it.

  Args:
    node: Host to connect to
    cmd: Shell command line to run on the node
    strict: Whether to ask for strict host key checking

  Returns:
    The subprocess.Popen object of the ssh client

  """
  return subprocess.Popen(GetSSHCommand(node, cmd, strict=strict),
                          shell=False)
132

    
133

    
134
def UploadFile(node, file):
  """Uploads a file to a node and returns the filename.

  The file is streamed over SSH into a temporary file created with the
  "tempfile" command on the node. The remote file is created with mode
  0700 if the local file is executable by its owner, 0600 otherwise.

  Caller needs to remove the file when it's not needed anymore.

  Args:
    node: Host to upload to
    file: Path of the local file

  Returns:
    Name of the temporary file on the node, as printed by the remote
    command

  Raises:
    Error: if the remote command exits with a non-zero status

  """
  import stat

  if os.stat(file).st_mode & stat.S_IXUSR:
    mode = '0700'
  else:
    mode = '0600'

  cmd = ('tmp=$(tempfile --mode %s --prefix gnt) && '
         '[[ -f "${tmp}" ]] && '
         'cat > "${tmp}" && '
         'echo "${tmp}"') % mode

  # Binary mode so the contents reach ssh untouched
  f = open(file, 'rb')
  try:
    p = subprocess.Popen(GetSSHCommand(node, cmd), shell=False, stdin=f,
                         stdout=subprocess.PIPE)
    # communicate() drains stdout while waiting for ssh to exit; calling
    # wait() before reading from the pipe can block forever once the pipe
    # buffer is full
    (stdout, _) = p.communicate()
    AssertEqual(p.returncode, 0)

    return stdout.strip()
  finally:
    f.close()
160
# }}}
161

    
162
# {{{ Config helpers
163
def GetMasterNode():
  """Returns the master node, which is the first configured node.

  """
  nodes = cfg['nodes']
  return nodes[0]
165

    
166

    
167
def AcquireInstance():
  """Marks the first unused instance as used and returns it.

  Raises:
    OutOfInstancesError: if every configured instance is in use

  """
  # Instances without the '_used' marker count as unused
  unused = [inst for inst in cfg['instances']
            if not inst.get('_used', False)]

  if len(unused) == 0:
    raise OutOfInstancesError("No instances left")

  chosen = unused[0]
  chosen['_used'] = True
  return chosen
182

    
183

    
184
def ReleaseInstance(inst):
  """Marks an instance as no longer in use.

  """
  inst['_used'] = False
186

    
187

    
188
def AcquireNode(exclude=None):
  """Picks the least used node and increments its use counter.

  Only the master and nodes marked as added to the cluster are
  considered. Nodes with the same use counter are ordered by their
  primary name.

  Args:
    exclude: Node which must not be returned (optional)

  Returns:
    The chosen node

  Raises:
    OutOfNodesError: if no node qualifies

  """
  master = GetMasterNode()

  candidates = []
  for item in cfg['nodes']:
    if exclude is None or item != exclude:
      if item.get('_added', False) or item == master:
        candidates.append(item)

  if len(candidates) == 0:
    raise OutOfNodesError("No nodes left")

  # Order by number of uses first, primary name second
  candidates.sort(key=lambda item: (item.get('_count', 0), item['primary']))

  node = candidates[0]
  node['_count'] = node.get('_count', 0) + 1
  return node
220

    
221

    
222
def ReleaseNode(node):
  """Decrements the use counter of a node.

  A node without a counter is treated as having a count of zero.

  """
  uses = node.get('_count', 0)
  node['_count'] = uses - 1
224
# }}}
225

    
226
# {{{ Environment tests
227
def TestConfig():
  """Test configuration for sanity.

  Raises Error unless the configuration lists at least one node and at
  least one instance.

  """
  # TODO: Add more checks
  required = (
    ('nodes', "Need at least one node"),
    ('instances', "Need at least one instance"),
    )

  for (section, message) in required:
    if len(cfg[section]) < 1:
      raise Error(message)
236

    
237

    
238
def TestSshConnection():
  """Test SSH connection.

  Runs "exit" on the primary address of every configured node and
  expects a zero exit status.

  """
  for node in cfg['nodes']:
    ssh = StartSSH(node['primary'], 'exit')
    AssertEqual(ssh.wait(), 0)
244

    
245

    
246
def TestGanetiCommands():
  """Test availability of Ganeti commands.

  Every command is run with --version on every configured node. The
  commands are chained with "&&", so a single failing command fails the
  check for that node.

  """
  # The first docstring line above is printed by RunTest as the test
  # header, hence the spelling matters.
  cmds = ( ['gnt-cluster', '--version'],
           ['gnt-os', '--version'],
           ['gnt-node', '--version'],
           ['gnt-instance', '--version'],
           ['gnt-backup', '--version'],
           ['ganeti-noded', '--version'],
           ['ganeti-watcher', '--version'] )

  cmd = ' && '.join(map(utils.ShellQuoteArgs, cmds))

  for node in cfg['nodes']:
    AssertEqual(StartSSH(node['primary'], cmd).wait(), 0)
262

    
263

    
264
def TestIcmpPing():
  """ICMP ping each node.

  From every node, the primary and (if configured) secondary addresses
  of all nodes are pinged with "ping -w 3 -c 1".

  """
  targets = []
  for node in cfg['nodes']:
    targets.append(node['primary'])
    if 'secondary' in node:
      targets.append(node['secondary'])

  pings = [utils.ShellQuoteArgs(['ping', '-w', '3', '-c', '1', ip])
           for ip in targets]
  cmd = ' && '.join(pings)

  for node in cfg['nodes']:
    AssertEqual(StartSSH(node['primary'], cmd).wait(), 0)
279
# }}}
280

    
281
# {{{ Cluster tests
282
def TestClusterInit():
  """gnt-cluster init

  Runs the command on the master with the configured cluster name and,
  if the master has one, its secondary IP address.

  """
  master = GetMasterNode()
  secondary = master.get('secondary', None)

  cmd = ['gnt-cluster', 'init']
  if secondary:
    cmd.append('--secondary-ip=%s' % secondary)
  cmd.append(cfg['name'])

  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)
293

    
294

    
295
def TestClusterVerify():
  """gnt-cluster verify

  Runs the command on the master and expects a zero exit status.

  """
  master = GetMasterNode()
  verify_cmd = utils.ShellQuoteArgs(['gnt-cluster', 'verify'])
  AssertEqual(StartSSH(master['primary'], verify_cmd).wait(), 0)
300

    
301

    
302
def TestClusterInfo():
  """gnt-cluster info

  Runs the command on the master and expects a zero exit status.

  """
  master = GetMasterNode()
  info_cmd = utils.ShellQuoteArgs(['gnt-cluster', 'info'])
  AssertEqual(StartSSH(master['primary'], info_cmd).wait(), 0)
307

    
308

    
309
def TestClusterBurnin():
  """Burnin

  Uploads ../tools/burnin to the master and runs it with the configured
  OS against up to 'burnin-instances' instances (an entry of the
  'options' section of the configuration, default 1). The uploaded
  script is removed and the instances are released afterwards.

  """
  master = GetMasterNode()

  # Reserve as many instances as requested, or as many as there are
  instances = []
  try:
    wanted = cfg.get('options', {}).get('burnin-instances', 1)
    for _ in range(wanted):
      instances.append(AcquireInstance())
  except OutOfInstancesError:
    print("Not enough instances, continuing anyway.")

  if len(instances) < 1:
    raise Error("Burnin needs at least one instance")

  try:
    script = UploadFile(master['primary'], '../tools/burnin')
    try:
      # Run the uploaded script
      burnin_cmd = [script, '--os=%s' % cfg['os']]
      burnin_cmd.extend([inst['name'] for inst in instances])
      burnin = StartSSH(master['primary'], utils.ShellQuoteArgs(burnin_cmd))
      AssertEqual(burnin.wait(), 0)
    finally:
      # Remove the uploaded script again
      rm_cmd = ['rm', '-f', script]
      cleanup = StartSSH(master['primary'], utils.ShellQuoteArgs(rm_cmd))
      AssertEqual(cleanup.wait(), 0)
  finally:
    for inst in instances:
      ReleaseInstance(inst)
339

    
340

    
341
def TestClusterMasterFailover():
  """gnt-cluster masterfailover

  Runs the command first on another node and then on the original
  master.

  """
  master = GetMasterNode()

  failovermaster = AcquireNode(exclude=master)
  try:
    for target in (failovermaster, master):
      cmd = utils.ShellQuoteArgs(['gnt-cluster', 'masterfailover'])
      AssertEqual(StartSSH(target['primary'], cmd).wait(), 0)
  finally:
    ReleaseNode(failovermaster)
356

    
357

    
358
def TestClusterDestroy():
  """gnt-cluster destroy

  Runs the command with --yes-do-it on the master and expects a zero
  exit status.

  """
  master = GetMasterNode()
  destroy_cmd = utils.ShellQuoteArgs(['gnt-cluster', 'destroy', '--yes-do-it'])
  AssertEqual(StartSSH(master['primary'], destroy_cmd).wait(), 0)
363
# }}}
364

    
365
# {{{ Node tests
366
def _NodeAdd(node):
  """Adds a node to the cluster by running "gnt-node add" on the master.

  The node is marked as added once the command succeeded.

  Raises:
    Error: if the node is already marked as added

  """
  if node.get('_added', False):
    raise Error("Node %s already in cluster" % node['primary'])

  cmd = ['gnt-node', 'add']
  secondary = node.get('secondary', None)
  if secondary:
    cmd.append('--secondary-ip=%s' % secondary)
  cmd.append(node['primary'])

  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)

  node['_added'] = True
378

    
379

    
380
def TestNodeAddAll():
  """Adding all nodes to cluster.

  Every configured node except the master is added.

  """
  master = GetMasterNode()
  for node in cfg['nodes']:
    if node == master:
      continue
    _NodeAdd(node)
386

    
387

    
388
def _NodeRemove(node):
  """Removes a node by running "gnt-node remove" on the master.

  The node's added marker is cleared once the command succeeded.

  """
  master = GetMasterNode()
  remove_cmd = utils.ShellQuoteArgs(['gnt-node', 'remove', node['primary']])
  AssertEqual(StartSSH(master['primary'], remove_cmd).wait(), 0)
  node['_added'] = False
393

    
394

    
395
def TestNodeRemoveAll():
  """Removing all nodes from cluster.

  Every configured node except the master is removed.

  """
  master = GetMasterNode()
  for node in cfg['nodes']:
    if node == master:
      continue
    _NodeRemove(node)
401
# }}}
402

    
403
# {{{ Instance tests
404
def _DiskTest(node, instance, args):
  """Creates an instance by running "gnt-instance add" on the master.

  OS, disk size, swap size and memory are taken from the configuration.

  Args:
    node: Primary node for the instance
    instance: Instance to create
    args: Extra arguments for "gnt-instance add" (may be empty)

  Returns:
    The instance passed in

  """
  cmd = ['gnt-instance', 'add']
  cmd.append('--os-type=%s' % cfg['os'])
  cmd.append('--os-size=%s' % cfg['os-size'])
  cmd.append('--swap-size=%s' % cfg['swap-size'])
  cmd.append('--memory=%s' % cfg['mem'])
  cmd.append('--node=%s' % node['primary'])
  if args:
    cmd.extend(args)
  cmd.append(instance['name'])

  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)
  return instance
418

    
419

    
420
def TestInstanceAddWithPlainDisk(node):
  """gnt-instance add -t plain

  Returns the newly acquired and created instance.

  """
  instance = AcquireInstance()
  extra_args = ['--disk-template=plain']
  return _DiskTest(node, instance, extra_args)
423

    
424

    
425
def TestInstanceAddWithLocalMirrorDisk(node):
  """gnt-instance add -t local_raid1

  Returns the newly acquired and created instance.

  """
  instance = AcquireInstance()
  extra_args = ['--disk-template=local_raid1']
  return _DiskTest(node, instance, extra_args)
428

    
429

    
430
def TestInstanceAddWithRemoteRaidDisk(node, node2):
  """gnt-instance add -t remote_raid1

  Args:
    node: Primary node for the instance
    node2: Secondary node for the instance

  Returns the newly acquired and created instance.

  """
  instance = AcquireInstance()
  extra_args = [
    '--disk-template=remote_raid1',
    '--secondary-node=%s' % node2['primary'],
    ]
  return _DiskTest(node, instance, extra_args)
435

    
436

    
437
def TestInstanceRemove(instance):
  """gnt-instance remove

  Runs the command with -f on the master and releases the instance once
  it succeeded.

  """
  master = GetMasterNode()
  remove_cmd = utils.ShellQuoteArgs(['gnt-instance', 'remove', '-f',
                                     instance['name']])
  AssertEqual(StartSSH(master['primary'], remove_cmd).wait(), 0)

  ReleaseInstance(instance)
444

    
445

    
446
def TestInstanceStartup(instance):
  """gnt-instance startup

  Runs the command on the master and expects a zero exit status.

  """
  master = GetMasterNode()
  startup_cmd = utils.ShellQuoteArgs(['gnt-instance', 'startup',
                                      instance['name']])
  AssertEqual(StartSSH(master['primary'], startup_cmd).wait(), 0)
451

    
452

    
453
def TestInstanceShutdown(instance):
  """gnt-instance shutdown

  Runs the command on the master and expects a zero exit status.

  """
  master = GetMasterNode()
  shutdown_cmd = utils.ShellQuoteArgs(['gnt-instance', 'shutdown',
                                       instance['name']])
  AssertEqual(StartSSH(master['primary'], shutdown_cmd).wait(), 0)
458

    
459

    
460
def TestInstanceFailover(instance):
  """gnt-instance failover

  Runs the command with --force on the master and expects a zero exit
  status.

  """
  master = GetMasterNode()
  failover_cmd = utils.ShellQuoteArgs(['gnt-instance', 'failover', '--force',
                                       instance['name']])
  AssertEqual(StartSSH(master['primary'], failover_cmd).wait(), 0)
465
# }}}
466

    
467
# {{{ Daemon tests
468
def _ResolveInstanceName(instance):
  """Gets the full Xen name of an instance.

  The name is extracted with sed from the "Instance name:" line printed
  by "gnt-instance info" on the master.

  Args:
    instance: Instance to look up

  Returns:
    The extracted name with surrounding whitespace removed

  Raises:
    Error: if the remote command exits with a non-zero status

  """
  master = GetMasterNode()

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', instance['name']])
  sed_cmd = utils.ShellQuoteArgs(['sed', '-n', '-e', 's/^Instance name: *//p'])

  # NOTE(review): the exit status of a shell pipeline is normally the one
  # of its last command, so a failing "gnt-instance info" presumably goes
  # unnoticed here -- confirm
  cmd = '%s | %s' % (info_cmd, sed_cmd)
  p = subprocess.Popen(GetSSHCommand(master['primary'], cmd), shell=False,
                       stdout=subprocess.PIPE)
  # communicate() reads the output while waiting for ssh to exit; calling
  # wait() before reading from the pipe can block forever once the pipe
  # buffer is full
  (stdout, _) = p.communicate()
  AssertEqual(p.returncode, 0)

  return stdout.strip()
483

    
484

    
485
def _InstanceRunning(node, name):
  """Checks whether an instance is running.

  The check is the exit status of "xm list <name>" on the node.

  Args:
    node: Node the instance runs on
    name: Full name of Xen instance

  Returns:
    True if the command exited with status zero, False otherwise

  """
  list_cmd = utils.ShellQuoteArgs(['xm', 'list', name])
  ssh = StartSSH(node['primary'], list_cmd + ' >/dev/null')
  return ssh.wait() == 0
495

    
496

    
497
def _XmShutdownInstance(node, name):
  """Shuts down instance using "xm" and waits for completion.

  Args:
    node: Node the instance runs on
    name: Full name of Xen instance

  Raises:
    Error: if "xm shutdown" fails or the instance is still reported as
      running after about a minute

  """
  cmd = ['xm', 'shutdown', name]
  # Run the command on the node hosting the instance, i.e. the same node
  # that is polled below; this is not necessarily the master
  AssertEqual(StartSSH(node['primary'],
                       utils.ShellQuoteArgs(cmd)).wait(), 0)

  # Wait up to a minute
  end = time.time() + 60
  while time.time() <= end:
    if not _InstanceRunning(node, name):
      break
    time.sleep(5)
  else:
    # Only reached when the loop ran out of time without a break
    raise Error("xm shutdown failed")
516

    
517

    
518
def _ResetWatcherDaemon(node):
  """Removes the watcher daemon's state file.

  Args:
    node: Node to be reset

  """
  rm_cmd = utils.ShellQuoteArgs(['rm', '-f', constants.WATCHER_STATEFILE])
  ssh = StartSSH(node['primary'], rm_cmd)
  AssertEqual(ssh.wait(), 0)
527

    
528

    
529
def TestInstanceAutomaticRestart(node, instance):
  """Test automatic restart of instance by ganeti-watcher.

  Note: takes up to 6 minutes to complete.

  The watcher state is reset and the instance shut down with "xm"; the
  instance must then be reported as running again within 330 seconds.

  Args:
    node: Node the instance runs on
    instance: Instance to test with

  """
  master = GetMasterNode()
  inst_name = _ResolveInstanceName(instance)

  _ResetWatcherDaemon(node)
  _XmShutdownInstance(node, inst_name)

  # A bit more than five minutes for the instance to come back
  deadline = time.time() + 330

  restarted = False
  while time.time() <= deadline:
    if _InstanceRunning(node, inst_name):
      restarted = True
      break
    time.sleep(15)

  if not restarted:
    raise Error("Daemon didn't restart instance in time")

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', inst_name])
  AssertEqual(StartSSH(master['primary'], info_cmd).wait(), 0)
554

    
555

    
556
def TestInstanceConsecutiveFailures(node, instance):
  """Test five consecutive instance failures.

  Note: takes at least 35 minutes to complete.

  For 35 minutes the instance is shut down again whenever it is found
  running; afterwards it must not be reported as running for another
  330 seconds.

  Args:
    node: Node the instance runs on
    instance: Instance to test with

  """
  master = GetMasterNode()
  inst_name = _ResolveInstanceName(instance)

  _ResetWatcherDaemon(node)
  _XmShutdownInstance(node, inst_name)

  # Keep shutting the instance down for 35 minutes
  shutdown_deadline = time.time() + (35 * 60)
  while time.time() <= shutdown_deadline:
    running = _InstanceRunning(node, inst_name)
    if running:
      _XmShutdownInstance(node, inst_name)
    time.sleep(30)

  # From now on the instance must stay down
  quiet_deadline = time.time() + 330
  while time.time() <= quiet_deadline:
    if _InstanceRunning(node, inst_name):
      raise Error("Instance started when it shouldn't")
    time.sleep(30)

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', inst_name])
  AssertEqual(StartSSH(master['primary'], info_cmd).wait(), 0)
585
# }}}
586

    
587
# {{{ Main program
588
if __name__ == '__main__':
  # Entry point: parses the command line, loads the YAML configuration
  # and runs the enabled tests in a fixed order. Cluster initialization
  # and adding/removing all nodes always run; the other tests depend on
  # the 'tests' section of the configuration.

  # {{{ Option parsing
  parser = OptionParser(usage="%prog [options] <configfile>")
  # NOTE(review): --cleanup is accepted but not evaluated anywhere in this
  # script
  parser.add_option('--cleanup', dest='cleanup',
      action="store_true",
      help="Clean up cluster after testing?")
  parser.add_option('--dry-run', dest='dry_run',
      action="store_true",
      help="Show what would be done")
  parser.add_option('--verbose', dest='verbose',
      action="store_true",
      help="Verbose output")
  parser.add_option('--yes-do-it', dest='yes_do_it',
      action="store_true",
      help="Really execute the tests")
  (options, args) = parser.parse_args()
  # }}}

  if len(args) != 1:
    # Report usage problems through optparse (usage message and exit
    # status 2) instead of a SyntaxError traceback
    parser.error("Exactly one configuration file is expected")
  config_file = args[0]

  if not options.yes_do_it:
    print ("Executing this script irreversibly destroys any Ganeti\n"
           "configuration on all nodes involved. If you really want\n"
           "to start testing, supply the --yes-do-it option.")
    sys.exit(1)

  f = open(config_file, 'r')
  try:
    # NOTE(review): yaml.load can construct arbitrary objects; only use
    # configuration files from a trusted source
    cfg = yaml.load(f.read())
  finally:
    f.close()

  RunTest(TestConfig)

  if TestEnabled('env'):
    RunTest(TestSshConnection)
    RunTest(TestIcmpPing)
    RunTest(TestGanetiCommands)

  RunTest(TestClusterInit)

  if TestEnabled('cluster-verify'):
    RunTest(TestClusterVerify)
    RunTest(TestClusterInfo)

  RunTest(TestNodeAddAll)

  if TestEnabled('cluster-burnin'):
    RunTest(TestClusterBurnin)

  if TestEnabled('cluster-master-failover'):
    RunTest(TestClusterMasterFailover)

  node = AcquireNode()
  try:
    if TestEnabled('instance-add-plain-disk'):
      instance = RunTest(TestInstanceAddWithPlainDisk, node)
      RunTest(TestInstanceShutdown, instance)
      RunTest(TestInstanceStartup, instance)

      # The watcher tests are only run against the plain disk instance
      if TestEnabled('instance-automatic-restart'):
        RunTest(TestInstanceAutomaticRestart, node, instance)

      if TestEnabled('instance-consecutive-failures'):
        RunTest(TestInstanceConsecutiveFailures, node, instance)

      RunTest(TestInstanceRemove, instance)
      del instance

    if TestEnabled('instance-add-local-mirror-disk'):
      instance = RunTest(TestInstanceAddWithLocalMirrorDisk, node)
      RunTest(TestInstanceShutdown, instance)
      RunTest(TestInstanceStartup, instance)
      RunTest(TestInstanceRemove, instance)
      del instance

    if TestEnabled('instance-add-remote-raid-disk'):
      # A second node is needed as secondary for the instance
      node2 = AcquireNode(exclude=node)
      try:
        instance = RunTest(TestInstanceAddWithRemoteRaidDisk, node, node2)
        RunTest(TestInstanceShutdown, instance)
        RunTest(TestInstanceStartup, instance)

        if TestEnabled('instance-failover'):
          RunTest(TestInstanceFailover, instance)

        RunTest(TestInstanceRemove, instance)
        del instance
      finally:
        ReleaseNode(node2)

  finally:
    ReleaseNode(node)

  RunTest(TestNodeRemoveAll)

  if TestEnabled('cluster-destroy'):
    RunTest(TestClusterDestroy)
689
# }}}
690

    
691
# vim: foldmethod=marker :