Update the monitoring agent design document
[ganeti-local] / qa / ganeti-qa.py
1 #!/usr/bin/python -u
2 #
3
4 # Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012 Google Inc.
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 # General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 # 02110-1301, USA.
20
21
22 """Script for doing QA on Ganeti.
23
24 """
25
26 # pylint: disable=C0103
27 # due to invalid name
28
29 import sys
30 import datetime
31 import optparse
32
33 import qa_cluster
34 import qa_config
35 import qa_daemon
36 import qa_env
37 import qa_error
38 import qa_group
39 import qa_instance
40 import qa_node
41 import qa_os
42 import qa_job
43 import qa_rapi
44 import qa_tags
45 import qa_utils
46
47 from ganeti import utils
48 from ganeti import rapi # pylint: disable=W0611
49 from ganeti import constants
50
51 import ganeti.rapi.client # pylint: disable=W0611
52 from ganeti.rapi.client import UsesRapiClient
53
54
def _FormatHeader(line, end=72):
  """Pad a header line with dashes up to a given column.

  The text is prefixed with "---- " and followed by a single space; dashes
  are then appended until the line is C{end} characters long. Text already
  reaching that column gets no dashes and trailing whitespace is removed.

  @param line: text to embed in the header
  @param end: column up to which the header is filled
  @return: the formatted header line

  """
  header = "".join(["---- ", line, " "])
  # A non-positive repeat count yields an empty string, hence headers which
  # are long enough already are left unpadded
  padding = "-" * (end - len(header))
  return (header + padding).rstrip()
63
64
def _DescriptionOf(fn):
  """Computes the description of an item.

  The description is the first non-blank line of the item's docstring. Items
  without a usable docstring are described by their C{repr}. Trailing dots
  are removed from the result.

  @param fn: item (usually a test function) to describe
  @return: one-line description

  """
  desc = None

  if fn.__doc__:
    # Docstrings may start with an empty line (text beginning on the line
    # after the opening quotes), hence look for the first line with content
    # instead of blindly using the very first one
    for docline in fn.__doc__.splitlines():
      docline = docline.strip()
      if docline:
        desc = docline
        break

  if desc is None:
    desc = "%r" % fn

  return desc.rstrip(".")
75
76
def RunTest(fn, *args, **kwargs):
  """Executes a test, framed by a start header and a timing header.

  @param fn: test function to run
  @param args: positional arguments passed to C{fn}
  @param kwargs: keyword arguments passed to C{fn}
  @return: whatever C{fn} returns

  """
  started = datetime.datetime.now()
  title = _DescriptionOf(fn)

  # Bare print statement, emits an empty line in front of the start header
  print
  print(_FormatHeader("%s start %s" % (started, title)))

  try:
    return fn(*args, **kwargs)
  finally:
    # The closing header is printed even if the test raised an exception
    finished = datetime.datetime.now()
    elapsed = finished - started
    print(_FormatHeader("%s time=%s %s" % (finished, elapsed, title)))
96
97
def RunTestIf(testnames, fn, *args, **kwargs):
  """Runs a test only if it is enabled in the configuration.

  For disabled tests a header telling that the test was skipped is printed
  instead. The return value of the test is not passed on.

  @param testnames: either a single test name in the configuration
      file, or a list of testnames (which will be AND-ed together)
  @param fn: test function to run
  @param args: positional arguments passed to C{fn}
  @param kwargs: keyword arguments passed to C{fn}

  """
  if not qa_config.TestEnabled(testnames):
    now = datetime.datetime.now()
    title = _DescriptionOf(fn)
    message = ("%s skipping %s, test(s) %s disabled" %
               (now, title, testnames))
    print(_FormatHeader(message))
    return

  RunTest(fn, *args, **kwargs)
112
113
def RunEnvTests():
  """Runs the tests checking the QA environment.

  All of them are controlled by the "env" test name.

  """
  env_tests = (
    qa_env.TestSshConnection,
    qa_env.TestIcmpPing,
    qa_env.TestGanetiCommands,
    )

  for env_test in env_tests:
    RunTestIf("env", env_test)
121
122
def SetupCluster(rapi_user, rapi_secret):
  """Initializes the cluster and runs the first listing tests.

  @param rapi_user: Login user for RAPI
  @param rapi_secret: Login secret for RAPI

  """
  RunTestIf("create-cluster", qa_cluster.TestClusterInit,
            rapi_user, rapi_secret)

  # Listing tests on the empty cluster
  for (testname, testfn) in [
    ("node-list", qa_node.TestNodeList),
    ("instance-list", qa_instance.TestInstanceList),
    ("job-list", qa_job.TestJobList),
    ]:
    RunTestIf(testname, testfn)

  RunTestIf("create-cluster", qa_node.TestNodeAddAll)
  cluster_created = qa_config.TestEnabled("create-cluster")
  if not cluster_created:
    # Nodes were not added by this run, consider them to be there already
    qa_node.MarkNodeAddedAll()

  RunTestIf("test-jobqueue", qa_cluster.TestJobqueue)

  # The watcher is enabled unconditionally
  RunTest(qa_daemon.TestResumeWatcher)

  for (testname, testfn) in [
    ("node-list", qa_node.TestNodeList),
    # Listing of fields
    ("node-list", qa_node.TestNodeListFields),
    ("instance-list", qa_instance.TestInstanceListFields),
    ("job-list", qa_job.TestJobListFields),
    ("instance-export", qa_instance.TestBackupListFields),
    ("node-info", qa_node.TestNodeInfo),
    ]:
    RunTestIf(testname, testfn)
156   RunTestIf("node-info", qa_node.TestNodeInfo)
157
158
def RunClusterTests():
  """Runs the gnt-cluster tests, followed by the basic RAPI tests.

  Each entry pairs the configuration test name with the function to run; the
  entries are processed in the order given.

  """
  cluster_tests = (
    ("create-cluster", qa_cluster.TestClusterInitDisk),
    ("cluster-renew-crypto", qa_cluster.TestClusterRenewCrypto),
    ("cluster-verify", qa_cluster.TestClusterVerify),
    ("cluster-reserved-lvs", qa_cluster.TestClusterReservedLvs),
    # TODO: add more cluster modify tests
    ("cluster-modify", qa_cluster.TestClusterModifyEmpty),
    ("cluster-modify", qa_cluster.TestClusterModifyBe),
    ("cluster-modify", qa_cluster.TestClusterModifyDisk),
    ("cluster-rename", qa_cluster.TestClusterRename),
    ("cluster-info", qa_cluster.TestClusterVersion),
    ("cluster-info", qa_cluster.TestClusterInfo),
    ("cluster-info", qa_cluster.TestClusterGetmaster),
    ("cluster-redist-conf", qa_cluster.TestClusterRedistConf),
    ("cluster-copyfile", qa_cluster.TestClusterCopyfile),
    ("cluster-command", qa_cluster.TestClusterCommand),
    ("cluster-burnin", qa_cluster.TestClusterBurnin),
    ("cluster-master-failover", qa_cluster.TestClusterMasterFailover),
    ("cluster-master-failover",
     qa_cluster.TestClusterMasterFailoverWithDrainedQueue),
    ("cluster-oob", qa_cluster.TestClusterOob),
    ("rapi", qa_rapi.TestVersion),
    ("rapi", qa_rapi.TestEmptyCluster),
    ("rapi", qa_rapi.TestRapiQuery),
    )

  for (testname, testfn) in cluster_tests:
    RunTestIf(testname, testfn)
189
190
def RunRepairDiskSizes():
  """Runs the test for repairing disk sizes, if enabled.

  """
  repair_test = qa_cluster.TestClusterRepairDiskSizes
  RunTestIf("cluster-repair-disk-sizes", repair_test)
196
197
def RunOsTests():
  """Runs the gnt-os related tests.

  Some of the tests receive a function for getting the operating systems via
  RAPI; it is only passed if the "rapi" tests are enabled, otherwise they
  get C{None}.

  """
  rapi_getos = None
  if qa_config.TestEnabled("rapi"):
    rapi_getos = qa_rapi.GetOperatingSystems

  # Pairs of test function and the positional arguments it is called with
  os_tests = [
    (qa_os.TestOsList, ()),
    (qa_os.TestOsDiagnose, ()),
    (qa_os.TestOsValid, (rapi_getos, )),
    (qa_os.TestOsInvalid, (rapi_getos, )),
    (qa_os.TestOsPartiallyValid, (rapi_getos, )),
    (qa_os.TestOsModifyValid, ()),
    (qa_os.TestOsModifyInvalid, ()),
    (qa_os.TestOsStatesNonExisting, ()),
    ]

  for (testfn, testargs) in os_tests:
    RunTestIf("os", testfn, *testargs)
226
227
def RunCommonInstanceTests(instance):
  """Runs the instance tests which are common to all disk types.

  @param instance: instance to run the tests on

  """
  with_instance = (instance, )
  without_args = ()

  # Triples of test name(s), test function and positional arguments
  first_tests = [
    ("instance-shutdown", qa_instance.TestInstanceShutdown, with_instance),
    (["instance-shutdown", "instance-console", "rapi"],
     qa_rapi.TestRapiStoppedInstanceConsole, with_instance),
    (["instance-shutdown", "instance-modify"],
     qa_instance.TestInstanceStoppedModify, with_instance),
    ("instance-shutdown", qa_instance.TestInstanceStartup, with_instance),
    # Shutdown/start via RAPI
    (["instance-shutdown", "rapi"],
     qa_rapi.TestRapiInstanceShutdown, with_instance),
    (["instance-shutdown", "rapi"],
     qa_rapi.TestRapiInstanceStartup, with_instance),
    ("instance-list", qa_instance.TestInstanceList, without_args),
    ("instance-info", qa_instance.TestInstanceInfo, with_instance),
    ("instance-modify", qa_instance.TestInstanceModify, with_instance),
    (["instance-modify", "rapi"],
     qa_rapi.TestRapiInstanceModify, with_instance),
    ("instance-console", qa_instance.TestInstanceConsole, with_instance),
    (["instance-console", "rapi"],
     qa_rapi.TestRapiInstanceConsole, with_instance),
    ]

  for (testnames, testfn, testargs) in first_tests:
    RunTestIf(testnames, testfn, *testargs)

  # Tests for which the instance is shut down first
  down_tests = qa_config.Either([
    "instance-reinstall",
    "instance-rename",
    "instance-grow-disk",
    ])

  RunTestIf(down_tests, qa_instance.TestInstanceShutdown, instance)

  # The tests run in 'down' state follow
  RunTestIf("instance-reinstall", qa_instance.TestInstanceReinstall, instance)
  RunTestIf(["instance-reinstall", "rapi"],
            qa_rapi.TestRapiInstanceReinstall, instance)

  if qa_config.TestEnabled("instance-rename"):
    current_name = instance["name"]
    configured_target = qa_config.get("rename", None)

    # The rename to the same name is always done, the one to a different
    # name only if such a name is configured
    new_names = [current_name]
    if configured_target is not None:
      new_names.append(configured_target)

    for new_name in new_names:
      RunTest(qa_instance.TestInstanceRenameAndBack,
              current_name, new_name)
      RunTestIf("rapi", qa_rapi.TestRapiInstanceRenameAndBack,
                current_name, new_name)

  last_tests = [
    (["instance-grow-disk"], qa_instance.TestInstanceGrowDisk, with_instance),
    # Start the instance again after the 'down' tests
    (down_tests, qa_instance.TestInstanceStartup, with_instance),
    ("instance-reboot", qa_instance.TestInstanceReboot, with_instance),
    ("tags", qa_tags.TestInstanceTags, with_instance),
    ("cluster-verify", qa_cluster.TestClusterVerify, without_args),
    ("rapi", qa_rapi.TestInstance, with_instance),
    # Lists instances, too
    ("node-list", qa_node.TestNodeList, without_args),
    # Some jobs have been run, let's test listing them
    ("job-list", qa_job.TestJobList, without_args),
    ]

  for (testnames, testfn, testargs) in last_tests:
    RunTestIf(testnames, testfn, *testargs)
304
305
def RunCommonNodeTests():
  """Runs the node tests not needing a specific node.

  """
  node_tests = (
    ("node-volumes", qa_node.TestNodeVolumes),
    ("node-storage", qa_node.TestNodeStorage),
    ("node-oob", qa_node.TestOutOfBand),
    )

  for (testname, testfn) in node_tests:
    RunTestIf(testname, testfn)
313
314
def RunGroupListTests():
  """Runs the tests listing node groups and their fields.

  """
  for list_test in (qa_group.TestGroupList, qa_group.TestGroupListFields):
    RunTestIf("group-list", list_test)
321
322
def RunGroupRwTests():
  """Runs the tests modifying node groups (add, remove, rename, modify).

  """
  for rw_test in (
    qa_group.TestGroupAddRemoveRename,
    qa_group.TestGroupAddWithOptions,
    qa_group.TestGroupModify,
    ):
    RunTestIf("group-rwops", rw_test)

  RunTestIf(["group-rwops", "rapi"], qa_rapi.TestRapiNodeGroups)

  # The default group is looked up only now, after the tests above were run
  default_group = qa_group.GetDefaultGroup()
  RunTestIf(["group-rwops", "tags"], qa_tags.TestGroupTags, default_group)
333
334
def RunExportImportTests(instance, pnode, snode):
  """Exports the instance and imports it again, also between clusters.

  @param instance: instance to export
  @param pnode: current primary node of the instance
  @param snode: current secondary node of the instance, if any,
      otherwise None

  """
  if qa_config.TestEnabled("instance-export"):
    RunTest(qa_instance.TestInstanceExportNoTarget, instance)

    # The export goes to a node other than the primary one
    export_node = qa_config.AcquireNode(exclude=pnode)
    try:
      export_name = RunTest(qa_instance.TestInstanceExport, instance,
                            export_node)

      RunTest(qa_instance.TestBackupList, export_node)

      if qa_config.TestEnabled("instance-import"):
        imported = qa_config.AcquireInstance()
        try:
          RunTest(qa_instance.TestInstanceImport, imported, pnode,
                  export_node, export_name)
          # Check if starting the instance works
          RunTest(qa_instance.TestInstanceStartup, imported)
          RunTest(qa_instance.TestInstanceRemove, imported)
        finally:
          qa_config.ReleaseInstance(imported)
    finally:
      qa_config.ReleaseNode(export_node)

  if qa_config.TestEnabled(["rapi", "inter-cluster-instance-move"]):
    moved = qa_config.AcquireInstance()
    try:
      # The target node must differ from the nodes used by the instance
      excluded = [pnode]
      if snode is not None:
        excluded.append(snode)

      target_node = qa_config.AcquireNode(exclude=excluded)
      try:
        RunTest(qa_rapi.TestInterClusterInstanceMove, instance, moved,
                pnode, snode, target_node)
      finally:
        qa_config.ReleaseNode(target_node)
    finally:
      qa_config.ReleaseInstance(moved)
380
381
def RunDaemonTests(instance):
  """Runs the ganeti-watcher tests between pausing and resuming the watcher.

  @param instance: instance used by the restart tests

  """
  RunTest(qa_daemon.TestPauseWatcher)

  watcher_tests = (
    ("instance-automatic-restart",
     qa_daemon.TestInstanceAutomaticRestart),
    ("instance-consecutive-failures",
     qa_daemon.TestInstanceConsecutiveFailures),
    )
  for (testname, testfn) in watcher_tests:
    RunTestIf(testname, testfn, instance)

  RunTest(qa_daemon.TestResumeWatcher)
394
395
def RunSingleHomedHardwareFailureTests(instance, pnode):
  """Runs the hardware failure recovery tests for single-homed instances.

  @param instance: instance to run the tests on
  @param pnode: primary node of the instance

  """
  if not qa_config.TestEnabled("instance-recreate-disks"):
    return

  # Disks are recreated using a node different from the primary one
  spare_node = qa_config.AcquireNode(exclude=[pnode])
  try:
    RunTest(qa_instance.TestRecreateDisks,
            instance, pnode, None, [spare_node])
  finally:
    qa_config.ReleaseNode(spare_node)
407
408
def RunHardwareFailureTests(instance, pnode, snode):
  """Test cluster internal hardware failure recovery.

  @param instance: instance to run the tests on
  @param pnode: primary node of the instance
  @param snode: secondary node of the instance

  """
  RunTestIf("instance-failover", qa_instance.TestInstanceFailover, instance)
  RunTestIf(["instance-failover", "rapi"],
            qa_rapi.TestRapiInstanceFailover, instance)

  RunTestIf("instance-migrate", qa_instance.TestInstanceMigrate, instance)
  RunTestIf(["instance-migrate", "rapi"],
            qa_rapi.TestRapiInstanceMigrate, instance)

  if qa_config.TestEnabled("instance-replace-disks"):
    othernode = qa_config.AcquireNode(exclude=[pnode, snode])
    try:
      RunTestIf("rapi", qa_rapi.TestRapiInstanceReplaceDisks, instance)
      RunTest(qa_instance.TestReplaceDisks,
              instance, pnode, snode, othernode)
    finally:
      qa_config.ReleaseNode(othernode)

  if qa_config.TestEnabled("instance-recreate-disks"):
    othernode1 = qa_config.AcquireNode(exclude=[pnode, snode])
    # Whether a second node was acquired and hence must be released again
    othernode2_acquired = False
    try:
      # Acquiring the second node happens within the "try" block to make
      # sure the first node is released if this fails with an unexpected
      # error
      try:
        othernode2 = qa_config.AcquireNode(exclude=[pnode, snode, othernode1])
        othernode2_acquired = True
      except qa_error.OutOfNodesError:
        # Let's reuse one of the nodes if the cluster is not big enough
        othernode2 = pnode

      RunTest(qa_instance.TestRecreateDisks,
              instance, pnode, snode, [othernode1, othernode2])
    finally:
      qa_config.ReleaseNode(othernode1)
      # The reused primary node was not acquired here, don't release it
      if othernode2_acquired:
        qa_config.ReleaseNode(othernode2)

  RunTestIf("node-evacuate", qa_node.TestNodeEvacuate, pnode, snode)

  RunTestIf("node-failover", qa_node.TestNodeFailover, pnode, snode)

  RunTestIf("instance-disk-failure", qa_instance.TestInstanceMasterDiskFailure,
            instance, pnode, snode)
  RunTestIf("instance-disk-failure",
            qa_instance.TestInstanceSecondaryDiskFailure, instance,
            pnode, snode)
454
455
def RunQa():
  """Main QA body.

  Runs the test groups in a fixed order: environment tests, cluster setup,
  cluster, OS, tag, node and group tests, instance tests using plain and
  DRBD disks and finally node removal and cluster destruction. Tests run
  via L{RunTestIf} are skipped unless enabled in the QA configuration.

  """
  # Credentials used for RAPI; the secret is generated anew on each run
  rapi_user = "ganeti-qa"
  rapi_secret = utils.GenerateSecret()

  RunEnvTests()
  SetupCluster(rapi_user, rapi_secret)

  # Load RAPI certificate
  qa_rapi.Setup(rapi_user, rapi_secret)

  RunClusterTests()
  RunOsTests()

  RunTestIf("tags", qa_tags.TestClusterTags)

  RunCommonNodeTests()
  RunGroupListTests()
  RunGroupRwTests()

  # The master shouldn't be readded or put offline; "delay" needs a non-master
  # node to test
  pnode = qa_config.AcquireNode(exclude=qa_config.GetMasterNode())
  try:
    RunTestIf("node-readd", qa_node.TestNodeReadd, pnode)
    RunTestIf("node-modify", qa_node.TestNodeModify, pnode)
    RunTestIf("delay", qa_cluster.TestDelay, pnode)
  finally:
    qa_config.ReleaseNode(pnode)

  # Any node (no exclusion given) serves as primary node for the instance
  # tests below
  pnode = qa_config.AcquireNode()
  try:
    RunTestIf("tags", qa_tags.TestNodeTags, pnode)

    if qa_rapi.Enabled():
      RunTest(qa_rapi.TestNode, pnode)

      if qa_config.TestEnabled("instance-add-plain-disk"):
        # Add and remove an instance via RAPI, once for each value of
        # "use_client"
        for use_client in [True, False]:
          rapi_instance = RunTest(qa_rapi.TestRapiInstanceAdd, pnode,
                                  use_client)
          if qa_config.TestEnabled("instance-plain-rapi-common-tests"):
            RunCommonInstanceTests(rapi_instance)
          RunTest(qa_rapi.TestRapiInstanceRemove, rapi_instance, use_client)
          del rapi_instance

    # Tests on an instance with plain disk; there is no secondary node,
    # hence None is passed to the export/import tests
    if qa_config.TestEnabled("instance-add-plain-disk"):
      instance = RunTest(qa_instance.TestInstanceAddWithPlainDisk, pnode)
      RunCommonInstanceTests(instance)
      RunGroupListTests()
      RunTestIf("cluster-epo", qa_cluster.TestClusterEpo)
      RunExportImportTests(instance, pnode, None)
      RunDaemonTests(instance)
      RunRepairDiskSizes()
      RunSingleHomedHardwareFailureTests(instance, pnode)
      RunTest(qa_instance.TestInstanceRemove, instance)
      del instance

    # Pairs of test name and function creating an instance on a primary and
    # a secondary node
    multinode_tests = [
      ("instance-add-drbd-disk",
       qa_instance.TestInstanceAddWithDrbdDisk),
    ]

    for name, func in multinode_tests:
      if qa_config.TestEnabled(name):
        # The secondary node must differ from the primary one
        snode = qa_config.AcquireNode(exclude=pnode)
        try:
          instance = RunTest(func, pnode, snode)
          RunTestIf("haskell-confd", qa_node.TestNodeListDrbd, pnode)
          RunTestIf("haskell-confd", qa_node.TestNodeListDrbd, snode)
          RunCommonInstanceTests(instance)
          RunGroupListTests()
          RunTestIf("group-rwops", qa_group.TestAssignNodesIncludingSplit,
                    constants.INITIAL_NODE_GROUP_NAME,
                    pnode["primary"], snode["primary"])
          if qa_config.TestEnabled("instance-convert-disk"):
            # The disk conversion test is run on the stopped instance
            RunTest(qa_instance.TestInstanceShutdown, instance)
            RunTest(qa_instance.TestInstanceConvertDisk, instance, snode)
            RunTest(qa_instance.TestInstanceStartup, instance)
          RunExportImportTests(instance, pnode, snode)
          RunHardwareFailureTests(instance, pnode, snode)
          RunRepairDiskSizes()
          RunTest(qa_instance.TestInstanceRemove, instance)
          del instance
        finally:
          qa_config.ReleaseNode(snode)

  finally:
    qa_config.ReleaseNode(pnode)

  # Test removing instance with offline drbd secondary
  if qa_config.TestEnabled("instance-remove-drbd-offline"):
    # Make sure the master is not put offline
    snode = qa_config.AcquireNode(exclude=qa_config.GetMasterNode())
    try:
      pnode = qa_config.AcquireNode(exclude=snode)
      try:
        # NOTE(review): the instance is added by calling the function
        # directly, not via RunTest, hence no header is printed for it
        instance = qa_instance.TestInstanceAddWithDrbdDisk(pnode, snode)
        qa_node.MakeNodeOffline(snode, "yes")
        try:
          RunTest(qa_instance.TestInstanceRemove, instance)
        finally:
          # Bring the secondary node back online even if the removal failed
          qa_node.MakeNodeOffline(snode, "no")
      finally:
        qa_config.ReleaseNode(pnode)
    finally:
      qa_config.ReleaseNode(snode)

  pnode = qa_config.AcquireNode()
  try:
    if qa_config.TestEnabled(["instance-add-plain-disk", "instance-export"]):
      # Export with removal is tested for a running and a stopped instance
      for shutdown in [False, True]:
        instance = RunTest(qa_instance.TestInstanceAddWithPlainDisk, pnode)
        expnode = qa_config.AcquireNode(exclude=pnode)
        try:
          if shutdown:
            # Stop instance before exporting and removing it
            RunTest(qa_instance.TestInstanceShutdown, instance)
          RunTest(qa_instance.TestInstanceExportWithRemove, instance, expnode)
          RunTest(qa_instance.TestBackupList, expnode)
        finally:
          qa_config.ReleaseNode(expnode)
        del expnode
        del instance

  finally:
    qa_config.ReleaseNode(pnode)

  # Both tests tear down what was set up at the beginning
  RunTestIf("create-cluster", qa_node.TestNodeRemoveAll)

  RunTestIf("cluster-destroy", qa_cluster.TestClusterDestroy)
589
590
@UsesRapiClient
def main():
  """Main program.

  Parses the command line, loads the QA configuration given as the only
  argument and runs the QA. As running it is destructive, the script exits
  with status 1 unless "--yes-do-it" was given.

  """
  parser = optparse.OptionParser(usage="%prog [options] <config-file>")
  parser.add_option("--yes-do-it", dest="yes_do_it",
                    action="store_true",
                    help="Really execute the tests")
  # The parsed options are stored on the qa_config module
  (qa_config.options, args) = parser.parse_args()

  if len(args) == 1:
    (config_file, ) = args
  else:
    parser.error("Wrong number of arguments.")

  if not qa_config.options.yes_do_it:
    print ("Executing this script irreversibly destroys any Ganeti\n"
           "configuration on all nodes involved. If you really want\n"
           "to start testing, supply the --yes-do-it option.")
    sys.exit(1)

  qa_config.Load(config_file)

  primary = qa_config.GetMasterNode()["primary"]
  qa_utils.StartMultiplexer(primary)
  # Everything after starting the multiplexer is covered by the "try" block,
  # making sure the multiplexers are closed even if printing the SSH
  # commands below fails
  try:
    print ("SSH command for primary node: %s" %
           utils.ShellQuoteArgs(qa_utils.GetSSHCommand(primary, "")))
    print ("SSH command for other nodes: %s" %
           utils.ShellQuoteArgs(qa_utils.GetSSHCommand("NODE", "")))
    RunQa()
  finally:
    qa_utils.CloseMultiplexers()
623   finally:
624     qa_utils.CloseMultiplexers()
625
# Run the QA only if executed as a script, not when imported as a module
if __name__ == "__main__":
  main()