4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
22 """Script for doing Q&A on Ganeti"""
30 from datetime import datetime
31 from optparse import OptionParser
33 # I want more flexibility for testing over SSH, therefore I'm not using
34 # Ganeti's ssh module.
37 from ganeti import utils
38 from ganeti import constants
40 # {{{ Global variables
class Error(Exception):
  """An error occurred during Q&A testing.

  Base class of the errors raised by this script; the more specific
  OutOfNodesError and OutOfInstancesError derive from it.

  """
class OutOfNodesError(Error):
  """No usable node is left.

  Raised by AcquireNode() when no node passes its filters.

  """
class OutOfInstancesError(Error):
  """No unused instance is left.

  Raised by AcquireInstance() when every configured instance is in use.

  """
def TestEnabled(test):
  """Returns True if the given test is enabled.

  Args:
    test: Name of the test, looked up in the "tests" mapping of the
      configuration

  Returns:
    True if the configuration gives the test a true value. A test missing
    from the mapping, or a missing or empty "tests" mapping, counts as
    disabled.

  """
  # "tests:" without any content is loaded as None, not as an empty mapping
  tests = cfg.get('tests') or {}
  return bool(tests.get(test, False))
def RunTest(callable, *args):
  """Runs a test after printing a header.

  The header shows the current time and a one-line description: the first
  line of the test's docstring, or the repr() of the test if it has no
  docstring.

  Args:
    callable: Test function to run
    args: Positional arguments handed on to the test function

  Returns:
    Whatever the test function returns.

  """
  doc = callable.__doc__
  if doc and doc.strip():
    desc = doc.strip().splitlines()[0].strip()
  else:
    desc = repr(callable)

  now = str(datetime.now())

  # The first rule is padded so that it is 60 characters wide, the same
  # width as the closing rule.
  print('')
  print('--- %s %s' % (now, '-' * (55 - len(now))))
  print(desc)
  print('-' * 60)

  return callable(*args)
def AssertEqual(first, second, msg=None):
  """Raises an error when values aren't equal.

  Args:
    first: First value to compare
    second: Second value to compare
    msg: Optional error message; by default both values are shown

  Raises:
    Error: If the two values do not compare equal.

  """
  if not first == second:
    raise Error(msg or '%r == %r' % (first, second))
def GetSSHCommand(node, cmd, strict=True):
  """Builds SSH command to be executed.

  Args:
    node: Host name or address to connect to
    cmd: Shell command to run on the node
    strict: Whether strict host key checking is enabled

  Returns:
    The argument list for running "ssh", suitable for subprocess.Popen.

  """
  args = ['ssh', '-oEscapeChar=none', '-oBatchMode=yes', '-l', 'root']

  if strict:
    tmp = 'yes'
  else:
    tmp = 'no'
  args.append('-oStrictHostKeyChecking=%s' % tmp)
  args.append(node)

  # In dry-run mode the remote shell exits successfully before it reaches
  # the real command, so the connection is exercised but nothing is run.
  if options.dry_run:
    prefix = 'exit 0; '
  else:
    prefix = ''

  args.append(prefix + cmd)

  if options.verbose:
    print('SSH: %s' % utils.ShellQuoteArgs(args))

  return args
def StartSSH(node, cmd, strict=True):
  """Starts an SSH process running a command on a node.

  Args:
    node: Host name or address to connect to
    cmd: Shell command to run on the node
    strict: Handed on to GetSSHCommand()

  Returns:
    The subprocess.Popen object of the started process.

  """
  return subprocess.Popen(GetSSHCommand(node, cmd, strict=strict),
                          shell=False)
def UploadFile(node, file):
  """Uploads a file to a node and returns the filename.

  Caller needs to remove the file when it's not needed anymore.

  Args:
    node: Host name or address of the target node
    file: Path of the local file to upload

  Returns:
    Name of the temporary file created on the node, as printed by the
    remote command.

  """
  import stat

  # Keep the remote copy executable if the local file is executable by
  # its owner.
  if os.stat(file).st_mode & stat.S_IXUSR:
    mode = '0700'
  else:
    mode = '0600'

  # The local file is connected to the remote shell's stdin; "cat" copies
  # it into the freshly created temporary file.
  cmd = ('tmp=$(tempfile --mode %s --prefix gnt) && '
         '[[ -f "${tmp}" ]] && '
         'cat > "${tmp}" && '
         'echo "${tmp}"') % mode

  f = open(file, 'rb')
  try:
    p = subprocess.Popen(GetSSHCommand(node, cmd), shell=False, stdin=f,
                         stdout=subprocess.PIPE)
    # communicate() reads stdout while waiting; wait() followed by read()
    # can block forever once the pipe buffer is full.
    (stdout, _) = p.communicate()
    AssertEqual(p.returncode, 0)
  finally:
    f.close()

  return stdout.strip()
def GetMasterNode():
  """Returns the master node.

  The master is the first entry of the "nodes" list in the configuration.

  """
  return cfg['nodes'][0]
def AcquireInstance():
  """Returns an instance which isn't in use.

  The first configured instance without a true "_used" flag is marked as
  used and returned. Hand it back with ReleaseInstance().

  Raises:
    OutOfInstancesError: If every configured instance is in use.

  """
  for inst in cfg['instances']:
    if not inst.get('_used', False):
      inst['_used'] = True
      return inst

  raise OutOfInstancesError("No instances left")
def ReleaseInstance(inst):
  """Marks an instance as no longer in use.

  Args:
    inst: Instance dictionary whose "_used" flag is cleared

  """
  inst.update({'_used': False})
def AcquireNode(exclude=None):
  """Returns the least used node.

  Only the master node and nodes flagged as "_added" are considered. The
  use counter of the returned node is incremented; hand the node back with
  ReleaseNode().

  Args:
    exclude: Node that must not be returned, if any

  Returns:
    The node dictionary with the lowest use counter.

  Raises:
    OutOfNodesError: If no node is left after filtering.

  """
  master = GetMasterNode()

  # Filter out unwanted nodes
  def _Usable(node):
    if exclude is not None and node == exclude:
      return False
    return node.get('_added', False) or node == master

  nodes = [node for node in cfg['nodes'] if _Usable(node)]

  if not nodes:
    raise OutOfNodesError("No nodes left")

  # Get node with least number of uses; ties are broken by primary name
  nodes.sort(key=lambda node: (node.get('_count', 0), node['primary']))

  node = nodes[0]
  node['_count'] = node.get('_count', 0) + 1
  return node
def ReleaseNode(node):
  """Decrements the use counter of a node.

  Args:
    node: Node dictionary; a missing "_count" is treated as 0

  """
  uses = node.get('_count', 0)
  node['_count'] = uses - 1
226 # {{{ Environment tests
228 """Test configuration for sanity.
231 if len(cfg['nodes']) < 1:
232 raise Error, ("Need at least one node")
233 if len(cfg['instances']) < 1:
234 raise Error, ("Need at least one instance")
235 # TODO: Add more checks
def TestSshConnection():
  """Test SSH connection.

  Runs "exit" on the primary address of every configured node and requires
  exit status 0.

  """
  for node in cfg['nodes']:
    ssh = StartSSH(node['primary'], 'exit')
    AssertEqual(ssh.wait(), 0)
def TestGanetiCommands():
  """Test availability of Ganeti commands.

  Every listed command is run with "--version" on each node; the commands
  are chained with "&&" so one failure fails the whole check for that node.

  """
  cmds = (['gnt-cluster', '--version'],
          ['gnt-os', '--version'],
          ['gnt-node', '--version'],
          ['gnt-instance', '--version'],
          ['gnt-backup', '--version'],
          ['ganeti-noded', '--version'],
          ['ganeti-watcher', '--version'])

  cmd = ' && '.join([utils.ShellQuoteArgs(i) for i in cmds])

  for node in cfg['nodes']:
    AssertEqual(StartSSH(node['primary'], cmd).wait(), 0)
def TestIcmpPing():
  """ICMP ping each node.

  From every node, the primary address and, where configured, the
  secondary address of every node is pinged once.

  """
  # The list of addresses is the same for every source node, so the ping
  # command is built only once.
  check = []
  for i in cfg['nodes']:
    check.append(i['primary'])
    if 'secondary' in i:
      check.append(i['secondary'])

  ping = lambda ip: utils.ShellQuoteArgs(['ping', '-w', '3', '-c', '1', ip])
  cmd = ' && '.join([ping(ip) for ip in check])

  for node in cfg['nodes']:
    AssertEqual(StartSSH(node['primary'], cmd).wait(), 0)
def TestClusterInit():
  """gnt-cluster init

  Runs on the master node. The master's secondary IP and the configured
  bridge are passed along when they are set.

  """
  master = GetMasterNode()

  cmd = ['gnt-cluster', 'init']
  secondary = master.get('secondary', None)
  if secondary:
    cmd.append('--secondary-ip=%s' % master['secondary'])
  bridge = cfg.get('bridge', None)
  if bridge:
    cmd.append('--bridge=%s' % cfg['bridge'])
  cmd.append(cfg['name'])

  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(cmd))
  AssertEqual(ssh.wait(), 0)
def TestClusterVerify():
  """gnt-cluster verify

  Runs on the master node and requires exit status 0.

  """
  master = GetMasterNode()
  ssh = StartSSH(master['primary'],
                 utils.ShellQuoteArgs(['gnt-cluster', 'verify']))
  AssertEqual(ssh.wait(), 0)
def TestClusterInfo():
  """gnt-cluster info

  Runs on the master node and requires exit status 0.

  """
  master = GetMasterNode()
  ssh = StartSSH(master['primary'],
                 utils.ShellQuoteArgs(['gnt-cluster', 'info']))
  AssertEqual(ssh.wait(), 0)
def TestClusterBurnin():
  """Burnin

  Uploads the burnin tool to the master node and runs it for as many
  unused instances as the "burnin-instances" option asks for (default 1).
  The uploaded script is removed and the instances are released again even
  if burnin fails.

  Raises:
    Error: If not even one instance is available.

  """
  master = GetMasterNode()
  wanted = cfg.get('options', {}).get('burnin-instances', 1)

  instances = []
  try:
    # Get as many instances as we need
    try:
      for _ in range(wanted):
        instances.append(AcquireInstance())
    except OutOfInstancesError:
      print("Not enough instances, continuing anyway.")

    if len(instances) < 1:
      raise Error("Burnin needs at least one instance")

    # Run burnin
    script = UploadFile(master['primary'], '../tools/burnin')
    try:
      cmd = [script, '--os=%s' % cfg['os']]
      cmd += [inst['name'] for inst in instances]
      AssertEqual(StartSSH(master['primary'],
                           utils.ShellQuoteArgs(cmd)).wait(), 0)
    finally:
      cmd = ['rm', '-f', script]
      AssertEqual(StartSSH(master['primary'],
                           utils.ShellQuoteArgs(cmd)).wait(), 0)
  finally:
    for inst in instances:
      ReleaseInstance(inst)
def TestClusterMasterFailover():
  """gnt-cluster masterfailover

  Runs the failover command on another node and then on the original
  master again. The borrowed node is released even if a command fails.

  """
  master = GetMasterNode()

  failovermaster = AcquireNode(exclude=master)
  try:
    cmd = ['gnt-cluster', 'masterfailover']
    AssertEqual(StartSSH(failovermaster['primary'],
                         utils.ShellQuoteArgs(cmd)).wait(), 0)

    # Same command on the original master node
    cmd = ['gnt-cluster', 'masterfailover']
    AssertEqual(StartSSH(master['primary'],
                         utils.ShellQuoteArgs(cmd)).wait(), 0)
  finally:
    ReleaseNode(failovermaster)
def TestClusterDestroy():
  """gnt-cluster destroy

  Runs on the master node with "--yes-do-it" and requires exit status 0.

  """
  destroy = utils.ShellQuoteArgs(['gnt-cluster', 'destroy', '--yes-do-it'])
  ssh = StartSSH(GetMasterNode()['primary'], destroy)
  AssertEqual(ssh.wait(), 0)
369 if node.get('_added', False):
370 raise Error, ("Node %s already in cluster" % node['primary'])
372 cmd = ['gnt-node', 'add']
373 if node.get('secondary', None):
374 cmd.append('--secondary-ip=%s' % node['secondary'])
375 cmd.append(node['primary'])
376 AssertEqual(StartSSH(GetMasterNode()['primary'],
377 utils.ShellQuoteArgs(cmd)).wait(), 0)
379 node['_added'] = True
382 def TestNodeAddAll():
383 """Adding all nodes to cluster."""
384 master = GetMasterNode()
385 for node in cfg['nodes']:
def _NodeRemove(node):
  """Removes a node from the cluster and clears its "_added" flag.

  Args:
    node: Node dictionary; its primary name is passed to "gnt-node remove"

  """
  remove = ['gnt-node', 'remove', node['primary']]
  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(remove))
  AssertEqual(ssh.wait(), 0)
  node['_added'] = False
def TestNodeRemoveAll():
  """Removing all nodes from cluster.

  Every configured node except the master is removed via _NodeRemove().

  """
  master = GetMasterNode()
  for node in cfg['nodes']:
    # Skip the master node itself
    if node != master:
      _NodeRemove(node)
def _DiskTest(node, instance, args):
  """Creates an instance on a node using "gnt-instance add".

  OS type, sizes and memory come from the configuration; the command is
  run on the master node.

  Args:
    node: Node dictionary; its primary name is passed as "--node"
    instance: Instance dictionary; its "name" is the instance to create
    args: Extra command line options, e.g. the disk template

  Returns:
    The instance that was passed in, so callers can hand it to the
    follow-up tests.

  """
  cmd = ['gnt-instance', 'add',
         '--os-type=%s' % cfg['os'],
         '--os-size=%s' % cfg['os-size'],
         '--swap-size=%s' % cfg['swap-size'],
         '--memory=%s' % cfg['mem'],
         '--node=%s' % node['primary']]
  # Options go before the instance name
  if args:
    cmd += args
  cmd.append(instance['name'])

  AssertEqual(StartSSH(GetMasterNode()['primary'],
                       utils.ShellQuoteArgs(cmd)).wait(), 0)

  return instance
def TestInstanceAddWithPlainDisk(node):
  """gnt-instance add -t plain

  Args:
    node: Node to create the instance on

  Returns:
    The result of _DiskTest() for a newly acquired instance.

  """
  instance = AcquireInstance()
  return _DiskTest(node, instance, ['--disk-template=plain'])
def TestInstanceAddWithLocalMirrorDisk(node):
  """gnt-instance add -t local_raid1

  Args:
    node: Node to create the instance on

  Returns:
    The result of _DiskTest() for a newly acquired instance.

  """
  instance = AcquireInstance()
  return _DiskTest(node, instance, ['--disk-template=local_raid1'])
def TestInstanceAddWithRemoteRaidDisk(node, node2):
  """gnt-instance add -t remote_raid1

  Args:
    node: Node to create the instance on
    node2: Node passed as "--secondary-node"

  Returns:
    The result of _DiskTest() for a newly acquired instance.

  """
  instance = AcquireInstance()
  extra = ['--disk-template=remote_raid1',
           '--secondary-node=%s' % node2['primary']]
  return _DiskTest(node, instance, extra)
def TestInstanceRemove(instance):
  """gnt-instance remove

  Removes the instance with "-f" on the master node and then marks the
  instance as unused again.

  Args:
    instance: Instance dictionary; its "name" is removed

  """
  remove = ['gnt-instance', 'remove', '-f', instance['name']]
  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(remove))
  AssertEqual(ssh.wait(), 0)

  ReleaseInstance(instance)
def TestInstanceStartup(instance):
  """gnt-instance startup

  Args:
    instance: Instance dictionary; its "name" is started

  """
  startup = ['gnt-instance', 'startup', instance['name']]
  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(startup))
  AssertEqual(ssh.wait(), 0)
def TestInstanceShutdown(instance):
  """gnt-instance shutdown

  Args:
    instance: Instance dictionary; its "name" is shut down

  """
  shutdown = ['gnt-instance', 'shutdown', instance['name']]
  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(shutdown))
  AssertEqual(ssh.wait(), 0)
def TestInstanceFailover(instance):
  """gnt-instance failover

  Args:
    instance: Instance dictionary; its "name" is failed over with "--force"

  """
  failover = ['gnt-instance', 'failover', '--force', instance['name']]
  master = GetMasterNode()
  ssh = StartSSH(master['primary'], utils.ShellQuoteArgs(failover))
  AssertEqual(ssh.wait(), 0)
def _ResolveInstanceName(instance):
  """Gets the full Xen name of an instance.

  The name is cut out of the "Instance name:" line printed by
  "gnt-instance info" on the master node.

  Args:
    instance: Instance dictionary; its "name" is looked up

  Returns:
    The text following "Instance name:", without surrounding whitespace.

  """
  master = GetMasterNode()

  info_cmd = utils.ShellQuoteArgs(['gnt-instance', 'info', instance['name']])
  sed_cmd = utils.ShellQuoteArgs(['sed', '-n', '-e', 's/^Instance name: *//p'])

  cmd = '%s | %s' % (info_cmd, sed_cmd)
  p = subprocess.Popen(GetSSHCommand(master['primary'], cmd), shell=False,
                       stdout=subprocess.PIPE)
  # communicate() reads stdout while waiting; wait() followed by read()
  # can block forever once the pipe buffer is full.
  (stdout, _) = p.communicate()
  AssertEqual(p.returncode, 0)

  return stdout.strip()
def _InstanceRunning(node, name):
  """Checks whether an instance is running.

  Args:
    node: Node the instance runs on
    name: Full name of Xen instance

  Returns:
    True if "xm list" for the instance exits with status 0 on the node,
    False otherwise.

  """
  cmd = utils.ShellQuoteArgs(['xm', 'list', name]) + ' >/dev/null'
  ret = StartSSH(node['primary'], cmd).wait()
  return ret == 0
def _XmShutdownInstance(node, name):
  """Shuts down instance using "xm" and waits for completion.

  Args:
    node: Node the instance runs on
    name: Full name of Xen instance

  Raises:
    Error: If the instance is still running after a minute.

  """
  # "xm" has to be run on the node hosting the instance, which is not
  # necessarily the master node.
  cmd = ['xm', 'shutdown', name]
  AssertEqual(StartSSH(node['primary'],
                       utils.ShellQuoteArgs(cmd)).wait(), 0)

  # Wait up to a minute
  end = time.time() + 60
  while time.time() <= end:
    if not _InstanceRunning(node, name):
      break
    time.sleep(5)
  else:
    raise Error("xm shutdown failed")
def _ResetWatcherDaemon(node):
  """Removes the watcher daemon's state file.

  Args:
    node: Node to be reset

  """
  remove = utils.ShellQuoteArgs(['rm', '-f', constants.WATCHER_STATEFILE])
  ssh = StartSSH(node['primary'], remove)
  AssertEqual(ssh.wait(), 0)
def TestInstanceAutomaticRestart(node, instance):
  """Test automatic restart of instance by ganeti-watcher.

  Note: takes up to 6 minutes to complete.

  Args:
    node: Node the instance runs on
    instance: Instance dictionary of the instance to shut down

  Raises:
    Error: If the instance isn't running again within the time limit.

  """
  master = GetMasterNode()
  inst_name = _ResolveInstanceName(instance)

  _ResetWatcherDaemon(node)
  _XmShutdownInstance(node, inst_name)

  # Give it a bit more than five minutes to start again
  restart_at = time.time() + 330

  # Wait until it's running again
  while time.time() <= restart_at:
    if _InstanceRunning(node, inst_name):
      break
    time.sleep(15)
  else:
    raise Error("Daemon didn't restart instance in time")

  cmd = ['gnt-instance', 'info', inst_name]
  AssertEqual(StartSSH(master['primary'],
                       utils.ShellQuoteArgs(cmd)).wait(), 0)
def TestInstanceConsecutiveFailures(node, instance):
  """Test five consecutive instance failures.

  Note: takes at least 35 minutes to complete.

  The instance is shut down again whenever it is found running during the
  first phase; afterwards it must stay down for the length of the check
  period.

  Args:
    node: Node the instance runs on
    instance: Instance dictionary of the instance to shut down

  Raises:
    Error: If the instance is started again during the check period.

  """
  master = GetMasterNode()
  inst_name = _ResolveInstanceName(instance)

  _ResetWatcherDaemon(node)
  _XmShutdownInstance(node, inst_name)

  # Do shutdowns for 35 minutes
  finished_at = time.time() + (35 * 60)

  while time.time() <= finished_at:
    if _InstanceRunning(node, inst_name):
      _XmShutdownInstance(node, inst_name)
    time.sleep(10)

  # Check for some time whether the instance doesn't start again
  check_until = time.time() + 330
  while time.time() <= check_until:
    if _InstanceRunning(node, inst_name):
      raise Error("Instance started when it shouldn't")
    time.sleep(10)

  cmd = ['gnt-instance', 'info', inst_name]
  AssertEqual(StartSSH(master['primary'],
                       utils.ShellQuoteArgs(cmd)).wait(), 0)
590 if __name__ == '__main__':
592 parser = OptionParser(usage="%prog [options] <configfile>")
593 parser.add_option('--cleanup', dest='cleanup',
595 help="Clean up cluster after testing?")
596 parser.add_option('--dry-run', dest='dry_run',
598 help="Show what would be done")
599 parser.add_option('--verbose', dest='verbose',
601 help="Verbose output")
602 parser.add_option('--yes-do-it', dest='yes_do_it',
604 help="Really execute the tests")
605 (options, args) = parser.parse_args()
609 config_file = args[0]
611 raise SyntaxError, ("Exactly one configuration file is expected")
613 if not options.yes_do_it:
614 print ("Executing this script irreversibly destroys any Ganeti\n"
615 "configuration on all nodes involved. If you really want\n"
616 "to start testing, supply the --yes-do-it option.")
619 f = open(config_file, 'r')
621 cfg = yaml.load(f.read())
627 if TestEnabled('env'):
628 RunTest(TestSshConnection)
629 RunTest(TestIcmpPing)
630 RunTest(TestGanetiCommands)
632 RunTest(TestClusterInit)
634 if TestEnabled('cluster-verify'):
635 RunTest(TestClusterVerify)
636 RunTest(TestClusterInfo)
638 RunTest(TestNodeAddAll)
640 if TestEnabled('cluster-burnin'):
641 RunTest(TestClusterBurnin)
643 if TestEnabled('cluster-master-failover'):
644 RunTest(TestClusterMasterFailover)
648 if TestEnabled('instance-add-plain-disk'):
649 instance = RunTest(TestInstanceAddWithPlainDisk, node)
650 RunTest(TestInstanceShutdown, instance)
651 RunTest(TestInstanceStartup, instance)
653 if TestEnabled('instance-automatic-restart'):
654 RunTest(TestInstanceAutomaticRestart, node, instance)
656 if TestEnabled('instance-consecutive-failures'):
657 RunTest(TestInstanceConsecutiveFailures, node, instance)
659 RunTest(TestInstanceRemove, instance)
662 if TestEnabled('instance-add-local-mirror-disk'):
663 instance = RunTest(TestInstanceAddWithLocalMirrorDisk, node)
664 RunTest(TestInstanceShutdown, instance)
665 RunTest(TestInstanceStartup, instance)
666 RunTest(TestInstanceRemove, instance)
669 if TestEnabled('instance-add-remote-raid-disk'):
670 node2 = AcquireNode(exclude=node)
672 instance = RunTest(TestInstanceAddWithRemoteRaidDisk, node, node2)
673 RunTest(TestInstanceShutdown, instance)
674 RunTest(TestInstanceStartup, instance)
676 if TestEnabled('instance-failover'):
677 RunTest(TestInstanceFailover, instance)
679 RunTest(TestInstanceRemove, instance)
687 RunTest(TestNodeRemoveAll)
689 if TestEnabled('cluster-destroy'):
690 RunTest(TestClusterDestroy)
693 # vim: foldmethod=marker :