# Snapshot of commit 018396c8f9928c5e13621a4545a165fc5f152b6f
# [ganeti-local] / tools / burnin
1 #!/usr/bin/python
2 #
3
4 # Copyright (C) 2006, 2007 Google Inc.
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 # General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 # 02110-1301, USA.
20
21
22 """Burnin program
23
24 """
25
26 import os
27 import sys
28 import optparse
29 import time
30 from itertools import izip, islice, cycle
31 from cStringIO import StringIO
32
33 from ganeti import opcodes
34 from ganeti import mcpu
35 from ganeti import constants
36 from ganeti import cli
37 from ganeti import errors
38 from ganeti import utils
39
40
# One-line usage synopsis shown by Usage(); -o and at least one instance
# name are mandatory.
USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
42
43
def Usage():
  """Print the usage synopsis to stderr and exit with status 2."""
  sys.stderr.write("Usage:\n")
  sys.stderr.write("%s\n" % USAGE)
  sys.exit(2)
50
51
def Log(msg):
  """Write its argument to stdout, followed by a newline, and flush.

  Flushing immediately keeps the progress output ordered with any other
  writers sharing the same stream.
  """
  sys.stdout.write("%s\n" % (msg,))
  sys.stdout.flush()
58
59
class Burner(object):
  """Drives a full cluster burnin: create instances, exercise them, clean up.

  Instantiating this class already performs I/O: the constructor parses the
  command line and queries the master daemon (via the client returned by
  cli.GetClient()) for the node and OS lists.
  """

  def __init__(self):
    """Constructor."""
    utils.SetupLogging(constants.LOG_BURNIN, debug=False, stderr_logging=True)
    self._feed_buf = StringIO()  # accumulates opcode feedback for error dumps
    self.nodes = []
    self.instances = []
    self.to_rem = []  # instances that still need removal at cleanup time
    self.opts = None
    self.cl = cli.GetClient()
    # ParseOptions must run before GetState: the latter reads self.opts
    self.ParseOptions()
    self.GetState()

  def ClearFeedbackBuf(self):
    """Clear the feedback buffer."""
    # NOTE(review): relies on cStringIO truncate(0) leaving the buffer ready
    # for new writes; if ever ported to io.StringIO, the write position
    # would also need an explicit seek(0) -- confirm before porting.
    self._feed_buf.truncate(0)

  def GetFeedbackBuf(self):
    """Return the contents of the feedback buffer."""
    return self._feed_buf.getvalue()

  def Feedback(self, msg):
    """Accumulate feedback in our buffer.

    msg is a feedback tuple as passed by the job-polling code: msg[0] is a
    serialized timestamp (decoded via utils.MergeTime) and msg[2] the
    message text.
    """
    self._feed_buf.write("%s %s\n" % (time.ctime(utils.MergeTime(msg[0])),
                                      msg[2]))
    if self.opts.verbose:
      # in verbose mode the raw tuple is printed, not the formatted text
      Log(msg)

  def ExecOp(self, op):
    """Execute an opcode and manage the exec buffer.

    The feedback buffer is cleared first so that, on failure, it holds only
    the output belonging to this opcode.
    """
    self.ClearFeedbackBuf()
    return cli.SubmitOpCode(op, feedback_fn=self.Feedback, cl=self.cl)

  def ExecJobSet(self, jobs):
    """Execute a set of jobs and return once all are done.

    The method will return the list of results, if all jobs are
    successful. Otherwise, OpExecError will be raised from within
    cli.py.
    """
    self.ClearFeedbackBuf()
    # submit everything first so the jobs run concurrently on the master;
    # results are then collected sequentially below
    job_ids = [cli.SendJob(job, cl=self.cl) for job in jobs]
    Log("- Submitted job IDs %s" % ", ".join(job_ids))
    results = []
    for jid in job_ids:
      Log("- Waiting for job %s" % jid)
      results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))

    return results

  def ParseOptions(self):
    """Parses the command line options.

    In case of command line errors, it will show the usage and exit the
    program.

    Side effects: sets self.opts, self.instances, self.disk_size,
    self.disk_growth, self.disk_count, self.bep and self.hvp.

    """

    parser = optparse.OptionParser(usage="\n%s" % USAGE,
                                   version="%%prog (ganeti) %s" %
                                   constants.RELEASE_VERSION,
                                   option_class=cli.CliOption)

    parser.add_option("-o", "--os", dest="os", default=None,
                      help="OS to use during burnin",
                      metavar="<OS>")
    # one size per disk: the number of comma-separated values determines
    # how many disks each instance gets
    parser.add_option("--disk-size", dest="disk_size",
                      help="Disk size (determines disk count)",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--disk-growth", dest="disk_growth", help="Disk growth",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--mem-size", dest="mem_size", help="Memory size",
                      default=128, type="unit", metavar="<size>")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print command execution messages to stdout")
    parser.add_option("--no-replace1", dest="do_replace1",
                      help="Skip disk replacement with the same secondary",
                      action="store_false", default=True)
    parser.add_option("--no-replace2", dest="do_replace2",
                      help="Skip disk replacement with a different secondary",
                      action="store_false", default=True)
    parser.add_option("--no-failover", dest="do_failover",
                      help="Skip instance failovers", action="store_false",
                      default=True)
    parser.add_option("--no-importexport", dest="do_importexport",
                      help="Skip instance export/import", action="store_false",
                      default=True)
    parser.add_option("--no-startstop", dest="do_startstop",
                      help="Skip instance stop/start", action="store_false",
                      default=True)
    parser.add_option("--no-reinstall", dest="do_reinstall",
                      help="Skip instance reinstall", action="store_false",
                      default=True)
    parser.add_option("--no-reboot", dest="do_reboot",
                      help="Skip instance reboot", action="store_false",
                      default=True)
    # default is one NIC with all-default parameters; the option switches
    # to no NICs at all
    parser.add_option("--no-nics", dest="nics",
                      help="No network interfaces", action="store_const",
                      const=[], default=[{}])
    parser.add_option("--rename", dest="rename", default=None,
                      help="Give one unused instance name which is taken"
                           " to start the renaming sequence",
                      metavar="<instance_name>")
    parser.add_option("-t", "--disk-template", dest="disk_template",
                      choices=("diskless", "file", "plain", "drbd"),
                      default="drbd",
                      help="Disk template (diskless, file, plain or drbd)"
                            " [drbd]")
    parser.add_option("-n", "--nodes", dest="nodes", default="",
                      help="Comma separated list of nodes to perform"
                      " the burnin on (defaults to all nodes)")
    # NOTE(review): the help text below is missing its closing parenthesis
    parser.add_option("--iallocator", dest="iallocator",
                      default=None, type="string",
                      help="Perform the allocation using an iallocator"
                      " instead of fixed node spread (node restrictions no"
                      " longer apply, therefore -n/--nodes must not be used")
    parser.add_option("-p", "--parallel", default=False, action="store_true",
                      dest="parallel",
                      help="Enable parallelization of some operations in"
                      " order to speed burnin or to test granular locking")

    options, args = parser.parse_args()
    # positional arguments are the instance names; at least one is required
    if len(args) < 1 or options.os is None:
      Usage()

    supported_disk_templates = (constants.DT_DISKLESS,
                                constants.DT_FILE,
                                constants.DT_PLAIN,
                                constants.DT_DRBD8)
    if options.disk_template not in supported_disk_templates:
      Log("Unknown disk template '%s'" % options.disk_template)
      sys.exit(1)

    if options.disk_template == constants.DT_DISKLESS:
      disk_size = disk_growth = []
    else:
      disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
      disk_growth = [utils.ParseUnit(v)
                     for v in options.disk_growth.split(",")]
      # sizes and growth values must pair up one-to-one per disk
      if len(disk_growth) != len(disk_size):
        Log("Wrong disk sizes/growth combination")
        sys.exit(1)
    if ((disk_size and options.disk_template == constants.DT_DISKLESS) or
        (not disk_size and options.disk_template != constants.DT_DISKLESS)):
      Log("Wrong disk count/disk template combination")
      sys.exit(1)

    self.disk_size = disk_size
    self.disk_growth = disk_growth
    self.disk_count = len(disk_size)

    # -n fixes the node spread, --iallocator delegates it; they conflict
    if options.nodes and options.iallocator:
      Log("Give either the nodes option or the iallocator option, not both")
      sys.exit(1)

    self.opts = options
    self.instances = args
    # backend parameters applied to every created/imported instance
    self.bep = {
      constants.BE_MEMORY: options.mem_size,
      constants.BE_VCPUS: 1,
      }
    self.hvp = {}

  def GetState(self):
    """Read the cluster state from the config.

    Fills self.nodes with the (possibly -n restricted) node list and
    validates that the requested OS exists and is valid; exits the program
    on any failure.
    """
    if self.opts.nodes:
      names = self.opts.nodes.split(",")
    else:
      names = []  # empty list means "all nodes" to OpQueryNodes
    try:
      op = opcodes.OpQueryNodes(output_fields=["name"], names=names)
      result = self.ExecOp(op)
    except errors.GenericError, err:
      err_code, msg = cli.FormatError(err)
      Log(msg)
      sys.exit(err_code)
    self.nodes = [data[0] for data in result]

    result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"],
                                              names=[]))

    if not result:
      Log("Can't get the OS list")
      sys.exit(1)

    # filter non-valid OS-es
    os_set = [val[0] for val in result if val[1]]

    if self.opts.os not in os_set:
      Log("OS '%s' not found" % self.opts.os)
      sys.exit(1)

  def CreateInstances(self):
    """Create the given instances.

    """
    self.to_rem = []
    # rotate primaries over the node list; the secondary is always the node
    # following the primary, so pairs are spread evenly across the cluster
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
                 self.instances)
    jobset = []

    for pnode, snode, instance in mytor:
      if self.opts.iallocator:
        # node placement is delegated entirely to the iallocator
        pnode = snode = None
        Log("- Add instance %s (iallocator: %s)" %
              (instance, self.opts.iallocator))
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        # non-mirrored templates have no secondary node
        snode = None
        Log("- Add instance %s on node %s" % (instance, pnode))
      else:
        Log("- Add instance %s on nodes %s/%s" % (instance, pnode, snode))

      op = opcodes.OpCreateInstance(instance_name=instance,
                                    disks = [ {"size": size}
                                              for size in self.disk_size],
                                    disk_template=self.opts.disk_template,
                                    nics=self.opts.nics,
                                    mode=constants.INSTANCE_CREATE,
                                    os_type=self.opts.os,
                                    pnode=pnode,
                                    snode=snode,
                                    start=True,
                                    ip_check=True,
                                    wait_for_sync=True,
                                    file_driver="loop",
                                    file_storage_dir=None,
                                    iallocator=self.opts.iallocator,
                                    beparams=self.bep,
                                    hvparams=self.hvp,
                                    )

      if self.opts.parallel:
        jobset.append([op])
        # FIXME: here we should not append to to_rem uncoditionally,
        # but only when the job is successful
        self.to_rem.append(instance)
      else:
        self.ExecOp(op)
        self.to_rem.append(instance)
    if self.opts.parallel:
      self.ExecJobSet(jobset)

  def GrowDisks(self):
    """Grow both the os and the swap disks by the requested amount, if any."""
    for instance in self.instances:
      for idx, growth in enumerate(self.disk_growth):
        if growth > 0:
          op = opcodes.OpGrowDisk(instance_name=instance, disk=idx,
                                  amount=growth, wait_for_sync=True)
          Log("- Increase %s's disk/%s by %s MB" % (instance, idx, growth))
          self.ExecOp(op)

  def ReplaceDisks1D8(self):
    """Replace disks on primary and secondary for drbd8.

    Replaces all disks of every instance, first on the secondary and then
    on the primary, keeping the same peer node.
    """
    for instance in self.instances:
      for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
        op = opcodes.OpReplaceDisks(instance_name=instance,
                                    mode=mode,
                                    disks=[i for i in range(self.disk_count)])
        Log("- Replace disks (%s) for instance %s" % (mode, instance))
        self.ExecOp(op)

  def ReplaceDisks2(self):
    """Replace secondary node.

    Moves each instance's secondary to a third node (offset 2 in the node
    rotation, i.e. distinct from the primary/secondary pair chosen at
    creation time), or lets the iallocator pick one if configured.
    """
    mode = constants.REPLACE_DISK_SEC

    mytor = izip(islice(cycle(self.nodes), 2, None),
                 self.instances)
    for tnode, instance in mytor:
      if self.opts.iallocator:
        tnode = None  # target node chosen by the iallocator instead
      op = opcodes.OpReplaceDisks(instance_name=instance,
                                  mode=mode,
                                  remote_node=tnode,
                                  iallocator=self.opts.iallocator,
                                  disks=[i for i in range(self.disk_count)])
      Log("- Replace secondary (%s) for instance %s" % (mode, instance))
      self.ExecOp(op)

  def Failover(self):
    """Failover the instances."""

    for instance in self.instances:
      op = opcodes.OpFailoverInstance(instance_name=instance,
                                      ignore_consistency=False)

      Log("- Failover instance %s" % (instance))
      self.ExecOp(op)

  def ImportExport(self):
    """Export the instance, delete it, and import it back.

    Each instance is exported to a third node (enode), removed, re-imported
    from the export (onto the original node pair or via the iallocator) and
    finally the export itself is cleaned up.

    """

    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
                 islice(cycle(self.nodes), 2, None),
                 self.instances)

    for pnode, snode, enode, instance in mytor:

      if self.opts.iallocator:
        pnode = snode = None
        import_log_msg = ("- Import instance %s from node %s"
                          " (iallocator: %s)" %
                          (instance, enode, self.opts.iallocator))
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        snode = None
        import_log_msg = ("- Import instance %s from node %s to node %s" %
                          (instance, enode, pnode))
      else:
        import_log_msg = ("- Import instance %s from node %s to nodes %s/%s" %
                          (instance, enode, pnode, snode))

      exp_op = opcodes.OpExportInstance(instance_name=instance,
                                           target_node=enode,
                                           shutdown=True)
      rem_op = opcodes.OpRemoveInstance(instance_name=instance,
                                        ignore_failures=True)
      # the export directory is keyed on the fully-qualified instance name,
      # which may differ from the short name given on the command line
      nam_op = opcodes.OpQueryInstances(output_fields=["name"],
                                           names=[instance])
      full_name = self.ExecOp(nam_op)[0][0]
      imp_dir = os.path.join(constants.EXPORT_DIR, full_name)
      imp_op = opcodes.OpCreateInstance(instance_name=instance,
                                        disks = [ {"size": size}
                                                  for size in self.disk_size],
                                        disk_template=self.opts.disk_template,
                                        nics=self.opts.nics,
                                        mode=constants.INSTANCE_IMPORT,
                                        src_node=enode,
                                        src_path=imp_dir,
                                        pnode=pnode,
                                        snode=snode,
                                        start=True,
                                        ip_check=True,
                                        wait_for_sync=True,
                                        file_storage_dir=None,
                                        file_driver="loop",
                                        iallocator=self.opts.iallocator,
                                        beparams=self.bep,
                                        hvparams=self.hvp,
                                        )

      erem_op = opcodes.OpRemoveExport(instance_name=instance)

      Log("- Export instance %s to node %s" % (instance, enode))
      self.ExecOp(exp_op)
      Log("- Remove instance %s" % (instance))
      self.ExecOp(rem_op)
      # the instance does not exist between removal and re-import, so keep
      # the cleanup list accurate in case something fails in between
      self.to_rem.remove(instance)
      Log(import_log_msg)
      self.ExecOp(imp_op)
      Log("- Remove export of instance %s" % (instance))
      self.ExecOp(erem_op)

      self.to_rem.append(instance)

  def StopInstance(self, instance):
    """Stop given instance."""
    op = opcodes.OpShutdownInstance(instance_name=instance)
    Log("- Shutdown instance %s" % instance)
    self.ExecOp(op)

  def StartInstance(self, instance):
    """Start given instance."""
    op = opcodes.OpStartupInstance(instance_name=instance, force=False)
    Log("- Start instance %s" % instance)
    self.ExecOp(op)

  def RenameInstance(self, instance, instance_new):
    """Rename instance."""
    op = opcodes.OpRenameInstance(instance_name=instance,
                                  new_name=instance_new)
    Log("- Rename instance %s to %s" % (instance, instance_new))
    self.ExecOp(op)

  def StopStart(self):
    """Stop/start the instances."""
    for instance in self.instances:
      self.StopInstance(instance)
      self.StartInstance(instance)

  def Remove(self):
    """Remove the instances.

    Only instances recorded in self.to_rem are removed; failures are
    ignored (ignore_failures=True) so cleanup proceeds as far as possible.
    """
    for instance in self.to_rem:
      op = opcodes.OpRemoveInstance(instance_name=instance,
                                    ignore_failures=True)
      Log("- Remove instance %s" % instance)
      self.ExecOp(op)


  def Rename(self):
    """Rename the instances.

    Each instance is renamed to the spare name given via --rename and then
    back, so the original names are restored at the end.
    """
    rename = self.opts.rename
    for instance in self.instances:
      self.StopInstance(instance)
      self.RenameInstance(instance, rename)
      self.StartInstance(rename)
      self.StopInstance(rename)
      self.RenameInstance(rename, instance)
      self.StartInstance(instance)

  def Reinstall(self):
    """Reinstall the instances.

    Exercises both forms of reinstall: without an OS argument (keep the
    current OS) and with an explicit OS.
    """
    for instance in self.instances:
      self.StopInstance(instance)
      op = opcodes.OpReinstallInstance(instance_name=instance)
      Log("- Reinstall instance %s without passing the OS" % (instance,))
      self.ExecOp(op)
      op = opcodes.OpReinstallInstance(instance_name=instance,
                                       os_type=self.opts.os)
      Log("- Reinstall instance %s specifying the OS" % (instance,))
      self.ExecOp(op)
      self.StartInstance(instance)

  def Reboot(self):
    """Reboot the instances with every supported reboot type."""
    for instance in self.instances:
      for reboot_type in constants.REBOOT_TYPES:
        op = opcodes.OpRebootInstance(instance_name=instance,
                                      reboot_type=reboot_type,
                                      ignore_secondaries=False)
        Log("- Reboot instance %s with type '%s'" % (instance, reboot_type))
        self.ExecOp(op)

  def BurninCluster(self):
    """Test a cluster intensively.

    This will create instances and then start/stop/failover them.
    It is safe for existing instances but could impact performance.

    """

    opts = self.opts

    Log("- Testing global parameters")

    # mirrored (drbd) templates need at least two nodes
    if (len(self.nodes) == 1 and
        opts.disk_template not in (constants.DT_DISKLESS, constants.DT_PLAIN,
                                   constants.DT_FILE)):
      Log("When one node is available/selected the disk template must"
          " be 'diskless', 'file' or 'plain'")
      sys.exit(1)

    # has_err stays True unless the whole sequence completes; the finally
    # clause then dumps the accumulated opcode feedback before cleanup
    has_err = True
    try:
      self.CreateInstances()
      if opts.do_replace1 and opts.disk_template in constants.DTS_NET_MIRROR:
        self.ReplaceDisks1D8()
      # replacing the secondary needs a third node to move it to
      if (opts.do_replace2 and len(self.nodes) > 2 and
          opts.disk_template in constants.DTS_NET_MIRROR) :
        self.ReplaceDisks2()

      if opts.disk_template != constants.DT_DISKLESS:
        self.GrowDisks()

      if opts.do_failover and opts.disk_template in constants.DTS_NET_MIRROR:
        self.Failover()

      if (opts.do_importexport and
          opts.disk_template not in (constants.DT_DISKLESS,
                                     constants.DT_FILE)):
        self.ImportExport()

      if opts.do_reinstall:
        self.Reinstall()

      if opts.do_reboot:
        self.Reboot()

      if opts.do_startstop:
        self.StopStart()

      if opts.rename:
        self.Rename()

      has_err = False
    finally:
      if has_err:
        Log("Error detected: opcode buffer follows:\n\n")
        Log(self.GetFeedbackBuf())
        Log("\n\n")
      # always remove the instances we created, even on failure
      self.Remove()

    return 0
550
551
def main():
  """Program entry point: build a Burner and run the full burnin.

  Returns the exit code of the burnin run (0 on success).
  """
  return Burner().BurninCluster()
557
558
if __name__ == "__main__":
  # Propagate main()'s return value as the process exit status instead of
  # silently discarding it (main() returns the burnin result code).
  sys.exit(main())