HTools/Types.hs: more auto-repair types
[ganeti-local] / qa / qa_cluster.py
1 #
2 #
3
4 # Copyright (C) 2007, 2010, 2011, 2012 Google Inc.
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 # General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 # 02110-1301, USA.
20
21
22 """Cluster related QA tests.
23
24 """
25
26 import tempfile
27 import os.path
28
29 from ganeti import constants
30 from ganeti import compat
31 from ganeti import utils
32 from ganeti import pathutils
33
34 import qa_config
35 import qa_utils
36 import qa_error
37
38 from qa_utils import AssertEqual, AssertCommand, GetCommandOutput
39
40
#: Argument list for "gnt-cluster verify"; several tests below pass this
#: list unchanged to AssertCommand, so it must not be modified in place
_CLUSTER_VERIFY = ["gnt-cluster", "verify"]
43
44
def _RemoveFileFromAllNodes(filename):
  """Deletes a file on every node listed in the QA configuration.

  The removal uses C{rm -f}, one invocation per node.

  @param filename: path of the file to delete on each node

  """
  rm_cmd = ["rm", "-f", filename]
  for target in qa_config.get("nodes"):
    AssertCommand(rm_cmd, node=target)
51
52
def _CheckFileOnAllNodes(filename, content):
  """Asserts that a file has the expected content on every node.

  The file is read with C{cat} on each node listed in the QA configuration
  and the command output is compared with C{content} for equality.

  @param filename: path of the file to read on each node
  @param content: text the output of C{cat} must be equal to

  """
  cat_cmd = utils.ShellQuoteArgs(["cat", filename])
  for target in qa_config.get("nodes"):
    actual = GetCommandOutput(target["primary"], cat_cmd)
    AssertEqual(actual, content)
60
61
# Disk parameter specifications with bad keys/values; each entry is passed
# as the "-D" argument by TestClusterInitDisk and TestClusterModifyDisk,
# which both expect the command to fail
_FAIL_PARAMS = ["nonexistent:resync-rate=1",
                "drbd:nonexistent=1",
                "drbd:resync-rate=invalid",
                ]
67
68
def TestClusterInitDisk():
  """gnt-cluster init -D

  Passes every entry of C{_FAIL_PARAMS} as the C{-D} argument, together with
  the configured cluster name, and expects each invocation to fail.

  """
  cluster_name = qa_config.get("name")
  init_prefix = ["gnt-cluster", "init", "-D"]
  for bad_param in _FAIL_PARAMS:
    AssertCommand(init_prefix + [bad_param, cluster_name], fail=True)
74
75
def TestClusterInit(rapi_user, rapi_secret):
  """gnt-cluster init

  Installs a RAPI users file containing a single entry for C{rapi_user},
  runs C{gnt-cluster init} with options taken from the QA configuration and
  afterwards applies the configured hypervisor, backend, OS and per-OS
  hypervisor parameters.

  @param rapi_user: user name written to the RAPI users file
  @param rapi_secret: secret written next to C{rapi_user}

  """
  master = qa_config.GetMasterNode()
  users_file = pathutils.RAPI_USERS_FILE

  # RAPI credentials are put in place before the cluster is initialized
  credentials = tempfile.NamedTemporaryFile()
  try:
    credentials.write("%s %s write\n" % (rapi_user, rapi_secret))
    credentials.flush()

    uploaded = qa_utils.UploadFile(master["primary"], credentials.name)
    try:
      AssertCommand(["mkdir", "-p", os.path.dirname(users_file)])
      AssertCommand(["mv", uploaded, users_file])
    finally:
      # Does nothing once "mv" has succeeded, cleans up if it did not
      AssertCommand(["rm", "-f", uploaded])
  finally:
    credentials.close()

  # Command line for the actual initialization
  init_cmd = ["gnt-cluster", "init"]
  init_cmd.append("--primary-ip-version=%d" %
                  qa_config.get("primary_ip_version", 4))
  init_cmd.append("--enabled-hypervisors=%s" %
                  ",".join(qa_config.GetEnabledHypervisors()))

  # Instance specs are looked up as "ispec_<type>_<min|max|std>", with the
  # dashes of the type replaced by underscores; unset (or zero) values are
  # not passed on
  spec_types = ["mem-size", "disk-size", "disk-count", "cpu-count",
                "nic-count"]
  for spec_type in spec_types:
    for bound in ("min", "max", "std"):
      value = qa_config.get("ispec_%s_%s" %
                            (spec_type.replace("-", "_"), bound), None)
      if value:
        init_cmd.append("--specs-%s=%s=%d" % (spec_type, bound, value))

  secondary_ip = master.get("secondary", None)
  if secondary_ip:
    init_cmd.append("--secondary-ip=%s" % secondary_ip)

  netdev = qa_config.get("master-netdev", None)
  if netdev:
    init_cmd.append("--master-netdev=%s" % netdev)

  default_nicparams = qa_config.get("default-nicparams", None)
  if default_nicparams:
    init_cmd.append("--nic-parameters=%s" %
                    ",".join(utils.FormatKeyValue(default_nicparams)))

  init_cmd.append(qa_config.get("name"))
  AssertCommand(init_cmd)

  # Hypervisor and backend parameters are applied with a single
  # "gnt-cluster modify" call, which is skipped if neither is configured
  modify_args = []
  hv_config = qa_config.get("hypervisor-parameters", {})
  for hv_name, hv_params in hv_config.items():
    modify_args.extend(["-H", "%s:%s" % (hv_name, hv_params)])
  be_params = qa_config.get("backend-parameters", "")
  if be_params:
    modify_args.extend(["-B", be_params])

  if modify_args:
    AssertCommand(["gnt-cluster", "modify"] + modify_args)

  # OS parameters, one "gnt-os modify" call per configured OS
  os_config = qa_config.get("os-parameters", {})
  for os_name, os_params in os_config.items():
    AssertCommand(["gnt-os", "modify", "-O", os_params, os_name])

  # OS hypervisor parameters, one call per (OS, hypervisor) pair
  os_hv_config = qa_config.get("os-hvp", {})
  for os_name, per_hv in os_hv_config.items():
    for hv_name, hv_params in per_hv.items():
      AssertCommand(["gnt-os", "modify", "-H",
                     "%s:%s" % (hv_name, hv_params), os_name])
151
152
def TestClusterRename():
  """gnt-cluster rename

  Renames the cluster to the configured C{rename} target and back to its
  configured name, running the cluster verification after each rename. If
  no C{rename} entry is configured an error message is printed and nothing
  is executed.

  """
  rename_cmd = ["gnt-cluster", "rename", "-f"]

  original_name = qa_config.get("name")
  rename_target = qa_config.get("rename", None)
  if rename_target is None:
    print(qa_utils.FormatError('"rename" entry is missing'))
    return

  # Rename to the temporary name ...
  AssertCommand(rename_cmd + [rename_target])
  AssertCommand(_CLUSTER_VERIFY)
  # ... and back to the original one
  AssertCommand(rename_cmd + [original_name])
  AssertCommand(_CLUSTER_VERIFY)
170
171
def TestClusterOob():
  """out-of-band framework

  Checks how the cluster verification reacts to the C{oob_program} node
  parameter: it is expected to fail while the parameter points to a
  non-existing path or to a file with mode 0400, and to succeed once the
  file has mode 0500. The C{oob_program} setting is cleared at the end.

  """
  oob_path_exists = "/tmp/ganeti-qa-oob-does-exist-%s" % utils.NewUUID()
  set_node_params = ["gnt-cluster", "modify", "--node-parameters"]
  distribute = ["gnt-cluster", "copyfile", oob_path_exists]

  AssertCommand(_CLUSTER_VERIFY)

  # A program that does not exist is expected to break verification
  AssertCommand(set_node_params +
                ["oob_program=/tmp/ganeti-qa-oob-does-not-exist-%s" %
                 utils.NewUUID()])
  AssertCommand(_CLUSTER_VERIFY, fail=True)

  # Create the file with mode 0400 and copy it to the other nodes
  AssertCommand(["touch", oob_path_exists])
  AssertCommand(["chmod", "0400", oob_path_exists])
  AssertCommand(distribute)

  try:
    AssertCommand(set_node_params + ["oob_program=%s" % oob_path_exists])

    # Still expected to fail with mode 0400
    AssertCommand(_CLUSTER_VERIFY, fail=True)

    # With mode 0500 on all nodes verification is expected to pass
    AssertCommand(["chmod", "0500", oob_path_exists])
    AssertCommand(distribute)

    AssertCommand(_CLUSTER_VERIFY)
  finally:
    AssertCommand(["gnt-cluster", "command", "rm", oob_path_exists])

  # Clear the parameter again
  AssertCommand(set_node_params + ["oob_program="])
202
203
def TestClusterEpo():
  """gnt-cluster epo

  Exercises invalid argument combinations of C{gnt-cluster epo}, then powers
  everything off and on again with C{--all}, checking the status reported by
  C{gnt-instance list} after each step.

  """
  master = qa_config.GetMasterNode()
  epo = ["gnt-cluster", "epo"]
  instance_status_cmd = "gnt-instance list --no-headers -o status"

  def _AssertEveryLine(list_cmd, expected):
    """Runs C{list_cmd} on the master; every output line must be C{expected}.

    """
    lines = GetCommandOutput(master["primary"], list_cmd).splitlines()
    AssertEqual(compat.all(line == expected for line in lines), True)

  # Assert that OOB is unavailable for all nodes
  _AssertEveryLine("gnt-node list --verbose --no-headers -o powered",
                   "(unavail)")

  # Conflicting
  AssertCommand(epo + ["--groups", "--all"], fail=True)
  # --all doesn't expect arguments
  AssertCommand(epo + ["--all", "some_arg"], fail=True)

  # Unless --all is given master is not allowed to be in the list
  AssertCommand(epo + ["-f", master["primary"]], fail=True)

  # This shouldn't fail
  AssertCommand(epo + ["-f", "--all"])

  # All instances should have been stopped now; the status is ERROR_down
  # because the instance is stopped but not recorded as such
  _AssertEveryLine(instance_status_cmd, "ERROR_down")

  # Now start everything again
  AssertCommand(epo + ["--on", "-f", "--all"])

  # All instances should have been started now
  _AssertEveryLine(instance_status_cmd, "running")
241
242
def TestClusterVerify():
  """gnt-cluster verify

  Runs the cluster verification followed by C{gnt-cluster verify-disks}.

  """
  for cmd in (_CLUSTER_VERIFY, ["gnt-cluster", "verify-disks"]):
    AssertCommand(cmd)
247
248
def TestJobqueue():
  """gnt-debug test-jobqueue

  Runs the job queue self-test provided by C{gnt-debug}.

  """
  jobqueue_cmd = ["gnt-debug", "test-jobqueue"]
  AssertCommand(jobqueue_cmd)
252
253
def TestDelay(node):
  """gnt-debug delay

  Runs a one second delay three times: with default options, with
  C{--no-master}, and with C{--no-master} plus an explicit node.

  @param node: node whose C{"primary"} entry is passed to the C{-n} option

  """
  delay = ["gnt-debug", "delay"]
  option_sets = [
    [],
    ["--no-master"],
    ["--no-master", "-n", node["primary"]],
    ]
  for extra in option_sets:
    AssertCommand(delay + extra + ["1"])
260
261
def TestClusterReservedLvs():
  """gnt-cluster reserved lvs

  Creates the logical volume C{xenvg/qa-test} and checks that the cluster
  verification is expected to fail while the C{--reserved-lvs} setting is
  empty and to pass while it contains a pattern covering that volume. The
  volume is removed again at the end.

  """
  set_reserved = ["gnt-cluster", "modify", "--reserved-lvs"]

  # Pairs of (command, whether the command is expected to fail)
  steps = [
    (_CLUSTER_VERIFY, False),
    (set_reserved + [""], False),
    # Extra LV while nothing is reserved
    (["lvcreate", "-L1G", "-nqa-test", "xenvg"], False),
    (_CLUSTER_VERIFY, True),
    # Reserved by exact name
    (set_reserved + ["xenvg/qa-test,.*/other-test"], False),
    (_CLUSTER_VERIFY, False),
    # Reserved by pattern
    (set_reserved + [".*/qa-.*"], False),
    (_CLUSTER_VERIFY, False),
    # No longer reserved
    (set_reserved + [""], False),
    (_CLUSTER_VERIFY, True),
    # Cleanup
    (["lvremove", "-f", "xenvg/qa-test"], False),
    (_CLUSTER_VERIFY, False),
    ]
  for cmd, expect_failure in steps:
    AssertCommand(cmd, fail=expect_failure)
280
281
def TestClusterModifyEmpty():
  """gnt-cluster modify

  Calls C{gnt-cluster modify} without any option, which is expected to fail.

  """
  bare_modify = ["gnt-cluster", "modify"]
  AssertCommand(bare_modify, fail=True)
285
286
def TestClusterModifyDisk():
  """gnt-cluster modify -D

  Passes every entry of C{_FAIL_PARAMS} as the C{-D} argument and expects
  each invocation to fail.

  """
  modify_prefix = ["gnt-cluster", "modify", "-D"]
  for bad_param in _FAIL_PARAMS:
    AssertCommand(modify_prefix + [bad_param], fail=True)
291
292
def TestClusterModifyBe():
  """gnt-cluster modify -B

  Sets backend parameters to valid and invalid values. Valid values must
  show up in the output of C{gnt-cluster info}; invalid ones must be
  rejected and leave the previous value in place. Finally the backend
  parameters from the QA configuration, if any, are applied again.

  """
  def _Modify(be_params):
    """Builds the "gnt-cluster modify -B" command for C{be_params}.

    """
    return ["gnt-cluster", "modify", "-B", be_params]

  def _Shell(script):
    """Builds a command running C{script} through "sh -c".

    """
    return ["sh", "-c", script]

  # Pairs of (command, whether the command is expected to fail)
  steps = [
    # max/min mem
    (_Modify("maxmem=256"), False),
    (_Shell("gnt-cluster info|grep '^ *maxmem: 256$'"), False),
    (_Modify("minmem=256"), False),
    (_Shell("gnt-cluster info|grep '^ *minmem: 256$'"), False),
    (_Modify("maxmem=a"), True),
    (_Shell("gnt-cluster info|grep '^ *maxmem: 256$'"), False),
    (_Modify("minmem=a"), True),
    (_Shell("gnt-cluster info|grep '^ *minmem: 256$'"), False),
    (_Modify("maxmem=128,minmem=128"), False),
    (_Shell("gnt-cluster info|grep '^ *maxmem: 128$'"), False),
    (_Shell("gnt-cluster info|grep '^ *minmem: 128$'"), False),
    # vcpus
    (_Modify("vcpus=4"), False),
    (_Shell("gnt-cluster info|grep '^ *vcpus: 4$'"), False),
    (_Modify("vcpus=a"), True),
    (_Modify("vcpus=1"), False),
    (_Shell("gnt-cluster info|grep '^ *vcpus: 1$'"), False),
    # auto_balance
    (_Modify("auto_balance=False"), False),
    (_Shell("gnt-cluster info|grep '^ *auto_balance: False$'"), False),
    (_Modify("auto_balance=1"), True),
    (_Modify("auto_balance=True"), False),
    (_Shell("gnt-cluster info|grep '^ *auto_balance: True$'"), False),
    ]
  for cmd, expect_failure in steps:
    AssertCommand(cmd, fail=expect_failure)

  # redo the original-requested BE parameters, if any
  configured = qa_config.get("backend-parameters", "")
  if configured:
    AssertCommand(_Modify(configured))
327
328
def TestClusterInfo():
  """gnt-cluster info

  Runs C{gnt-cluster info}; only the success of the command is checked.

  """
  info_cmd = ["gnt-cluster", "info"]
  AssertCommand(info_cmd)
332
333
def TestClusterRedistConf():
  """gnt-cluster redist-conf

  Runs C{gnt-cluster redist-conf}; only the success of the command is
  checked.

  """
  redist_cmd = ["gnt-cluster", "redist-conf"]
  AssertCommand(redist_cmd)
337
338
def TestClusterGetmaster():
  """gnt-cluster getmaster

  Runs C{gnt-cluster getmaster}; only the success of the command is checked.

  """
  getmaster_cmd = ["gnt-cluster", "getmaster"]
  AssertCommand(getmaster_cmd)
342
343
def TestClusterVersion():
  """gnt-cluster version

  Runs C{gnt-cluster version}; only the success of the command is checked.

  """
  version_cmd = ["gnt-cluster", "version"]
  AssertCommand(version_cmd)
347
348
def TestClusterRenewCrypto():
  """gnt-cluster renew-crypto

  Checks that conflicting options and an invalid RAPI certificate are
  rejected, then renews the cryptographic material with a custom RAPI
  certificate, with a custom cluster domain secret and with newly generated
  values for everything. The RAPI certificate that was in place before the
  test is restored at the end.

  """
  master = qa_config.GetMasterNode()

  # Conflicting options
  cmd = ["gnt-cluster", "renew-crypto", "--force",
         "--new-cluster-certificate", "--new-confd-hmac-key"]
  conflicting = [
    ["--new-rapi-certificate", "--rapi-certificate=/dev/null"],
    ["--new-cluster-domain-secret", "--cluster-domain-secret=/dev/null"],
    ]
  for i in conflicting:
    AssertCommand(cmd + i, fail=True)

  # Invalid RAPI certificate
  cmd = ["gnt-cluster", "renew-crypto", "--force",
         "--rapi-certificate=/dev/null"]
  AssertCommand(cmd, fail=True)

  rapi_cert_backup = qa_utils.BackupFile(master["primary"],
                                         pathutils.RAPI_CERT_FILE)
  try:
    # Custom RAPI certificate; the local temporary file is closed (and
    # thereby removed) explicitly instead of being left to the garbage
    # collector
    fh = tempfile.NamedTemporaryFile()
    try:
      # Ensure certificate doesn't cause "gnt-cluster verify" to complain
      validity = constants.SSL_CERT_EXPIRATION_WARN * 3

      utils.GenerateSelfSignedSslCert(fh.name, validity=validity)

      tmpcert = qa_utils.UploadFile(master["primary"], fh.name)
      try:
        AssertCommand(["gnt-cluster", "renew-crypto", "--force",
                       "--rapi-certificate=%s" % tmpcert])
      finally:
        AssertCommand(["rm", "-f", tmpcert])
    finally:
      fh.close()

    # Custom cluster domain secret, same handling of the temporary file
    cds_fh = tempfile.NamedTemporaryFile()
    try:
      cds_fh.write(utils.GenerateSecret())
      cds_fh.write("\n")
      cds_fh.flush()

      tmpcds = qa_utils.UploadFile(master["primary"], cds_fh.name)
      try:
        AssertCommand(["gnt-cluster", "renew-crypto", "--force",
                       "--cluster-domain-secret=%s" % tmpcds])
      finally:
        AssertCommand(["rm", "-f", tmpcds])
    finally:
      cds_fh.close()

    # Normal case
    AssertCommand(["gnt-cluster", "renew-crypto", "--force",
                   "--new-cluster-certificate", "--new-confd-hmac-key",
                   "--new-rapi-certificate", "--new-cluster-domain-secret"])

    # Restore RAPI certificate
    AssertCommand(["gnt-cluster", "renew-crypto", "--force",
                   "--rapi-certificate=%s" % rapi_cert_backup])
  finally:
    AssertCommand(["rm", "-f", rapi_cert_backup])
409
410
def TestClusterBurnin():
  """Burnin

  Acquires up to C{burnin-instances} instances (default 1), uploads the
  burnin script to the master node and runs it with options derived from the
  QA configuration. The uploaded script is removed and all acquired
  instances are released afterwards, whether or not burnin succeeded.

  @raise qa_error.Error: if not even one instance could be acquired

  """
  master = qa_config.GetMasterNode()

  options = qa_config.get("options", {})
  disk_template = options.get("burnin-disk-template", "drbd")
  parallel = options.get("burnin-in-parallel", False)
  check_inst = options.get("burnin-check-instances", False)
  do_rename = options.get("burnin-rename", "")
  do_reboot = options.get("burnin-reboot", True)
  reboot_types = options.get("reboot-types", constants.REBOOT_TYPES)

  # Get as many instances as we need
  instances = []
  try:
    try:
      wanted = options.get("burnin-instances", 1)
      for _ in range(wanted):
        instances.append(qa_config.AcquireInstance())
    except qa_error.OutOfInstancesError:
      # Fewer instances than requested are acceptable, none at all is not
      print("Not enough instances, continuing anyway.")

    if not instances:
      raise qa_error.Error("Burnin needs at least one instance")

    script = qa_utils.UploadFile(master["primary"], "../tools/burnin")
    try:
      # Run burnin
      burnin_cmd = [
        script,
        "--os=%s" % qa_config.get("os"),
        "--minmem-size=%s" % qa_config.get(constants.BE_MINMEM),
        "--maxmem-size=%s" % qa_config.get(constants.BE_MAXMEM),
        "--disk-size=%s" % ",".join(qa_config.get("disk")),
        "--disk-growth=%s" % ",".join(qa_config.get("disk-growth")),
        "--disk-template=%s" % disk_template,
        ]
      if parallel:
        burnin_cmd.extend(["--parallel", "--early-release"])
      if check_inst:
        burnin_cmd.append("--http-check")
      if do_rename:
        burnin_cmd.append("--rename=%s" % do_rename)
      if do_reboot:
        burnin_cmd.append("--reboot-types=%s" % ",".join(reboot_types))
      else:
        burnin_cmd.append("--no-reboot")
      burnin_cmd.extend([inst["name"] for inst in instances])
      AssertCommand(burnin_cmd)
    finally:
      AssertCommand(["rm", "-f", script])

  finally:
    for inst in instances:
      qa_config.ReleaseInstance(inst)
465
466
def TestClusterMasterFailover():
  """gnt-cluster master-failover

  Runs C{gnt-cluster master-failover} on a second node and then on the
  original master again. The second node is released afterwards.

  """
  master = qa_config.GetMasterNode()
  failovermaster = qa_config.AcquireNode(exclude=master)

  failover_cmd = ["gnt-cluster", "master-failover"]
  try:
    # First to the other node, then back to the original master node
    for target in (failovermaster, master):
      AssertCommand(failover_cmd, node=target)
  finally:
    qa_config.ReleaseNode(failovermaster)
479
480
def TestClusterMasterFailoverWithDrainedQueue():
  """gnt-cluster master-failover with drained queue

  Creates the job queue drain file on a second node, fails over to that
  node and expects the drain file to be absent afterwards, both when checked
  without an explicit node and on the failover node. The master role is then
  moved back to the original master and the absence of the drain file is
  checked once more.

  """
  drain_check = ["test", "-f", pathutils.JOB_QUEUE_DRAIN_FILE]
  cmd = ["gnt-cluster", "master-failover"]

  master = qa_config.GetMasterNode()
  failovermaster = qa_config.AcquireNode(exclude=master)

  # Everything that can fail once the node has been acquired runs inside the
  # "try" block, so the node is released even if a preparation step fails
  try:
    # Ensure queue is not drained
    for node in [master, failovermaster]:
      AssertCommand(drain_check, node=node, fail=True)

    # Drain queue on failover master
    AssertCommand(["touch", pathutils.JOB_QUEUE_DRAIN_FILE],
                  node=failovermaster)

    AssertCommand(drain_check, node=failovermaster)
    AssertCommand(cmd, node=failovermaster)
    AssertCommand(drain_check, fail=True)
    AssertCommand(drain_check, node=failovermaster, fail=True)

    # Back to original master node
    AssertCommand(cmd, node=master)
  finally:
    qa_config.ReleaseNode(failovermaster)

  AssertCommand(drain_check, fail=True)
  AssertCommand(drain_check, node=failovermaster, fail=True)
509
510
def TestClusterCopyfile():
  """gnt-cluster copyfile

  Uploads a file containing a freshly generated UUID to the master node,
  distributes it with C{gnt-cluster copyfile} and checks its content on all
  nodes. The file is removed from all nodes afterwards.

  """
  master = qa_config.GetMasterNode()

  uniqueid = utils.NewUUID()

  # Create temporary file; it is closed (and thereby removed) explicitly
  # instead of being left to the garbage collector
  f = tempfile.NamedTemporaryFile()
  try:
    f.write(uniqueid)
    f.flush()
    f.seek(0)

    # Upload file to master node
    testname = qa_utils.UploadFile(master["primary"], f.name)
    try:
      # Copy file to all nodes
      AssertCommand(["gnt-cluster", "copyfile", testname])
      _CheckFileOnAllNodes(testname, uniqueid)
    finally:
      _RemoveFileFromAllNodes(testname)
  finally:
    f.close()
531
532
def TestClusterCommand():
  """gnt-cluster command

  Uses C{gnt-cluster command} to write a freshly generated UUID into a file
  below C{/tmp} and checks the content of that file on all nodes. The file
  is removed from all nodes afterwards.

  """
  token = utils.NewUUID()
  remote_path = "/tmp/gnt%s" % utils.NewUUID()

  # The inner command is quoted on its own so that the redirection is part
  # of the single argument handed to "gnt-cluster command"
  echo_cmd = utils.ShellQuoteArgs(["echo", "-n", token])
  remote_script = "%s >%s" % (echo_cmd, remote_path)
  cluster_cmd = utils.ShellQuoteArgs(["gnt-cluster", "command",
                                      remote_script])

  try:
    AssertCommand(cluster_cmd)
    _CheckFileOnAllNodes(remote_path, token)
  finally:
    _RemoveFileFromAllNodes(remote_path)
546
547
def TestClusterDestroy():
  """gnt-cluster destroy

  Runs C{gnt-cluster destroy} with the C{--yes-do-it} option.

  """
  destroy_cmd = ["gnt-cluster", "destroy", "--yes-do-it"]
  AssertCommand(destroy_cmd)
551
552
def TestClusterRepairDiskSizes():
  """gnt-cluster repair-disk-sizes

  Runs C{gnt-cluster repair-disk-sizes}; only the success of the command is
  checked.

  """
  repair_cmd = ["gnt-cluster", "repair-disk-sizes"]
  AssertCommand(repair_cmd)