#!/usr/bin/python
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Burnin program

"""

import os
import sys
import optparse
import time
import socket
import urllib2
import errno
from itertools import izip, islice, cycle
from cStringIO import StringIO

from ganeti import opcodes
from ganeti import mcpu
from ganeti import constants
from ganeti import cli
from ganeti import errors
from ganeti import utils


USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
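# Example invocation (the OS and instance names below are hypothetical):
#   burnin -o debian-etch -p instance1.example.com instance2.example.com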


class InstanceDown(Exception):
  """The checked instance was not up"""


def Usage():
  """Shows program usage information and exits the program."""

  print >> sys.stderr, "Usage:"
  print >> sys.stderr, USAGE
  sys.exit(2)


def Log(msg):
  """Simple function that prints out its argument.

  """
  print msg
  sys.stdout.flush()


class Burner(object):
  """Burner class."""

  def __init__(self):
    """Constructor."""
    utils.SetupLogging(constants.LOG_BURNIN, debug=False, stderr_logging=True)
    self._feed_buf = StringIO()
    self.nodes = []
    self.instances = []
    self.to_rem = []
    self.opts = None
    self.cl = cli.GetClient()
    self.ParseOptions()
    self.GetState()

  def ClearFeedbackBuf(self):
    """Clear the feedback buffer."""
    self._feed_buf.truncate(0)

  def GetFeedbackBuf(self):
    """Return the contents of the buffer."""
    return self._feed_buf.getvalue()

  def Feedback(self, msg):
    """Accumulate feedback in our buffer."""
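    # msg[0] carries the timestamp (merged into seconds by utils.MergeTime
    # so time.ctime can render it); msg[2] is the actual message text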
    self._feed_buf.write("%s %s\n" % (time.ctime(utils.MergeTime(msg[0])),
                                      msg[2]))
    if self.opts.verbose:
      Log(msg)

  def ExecOp(self, op):
    """Execute an opcode and manage the exec buffer."""
    self.ClearFeedbackBuf()
    return cli.SubmitOpCode(op, feedback_fn=self.Feedback, cl=self.cl)

  def ExecJobSet(self, jobs):
    """Execute a set of jobs and return once all are done.

    The method will return the list of results, if all jobs are
    successful. Otherwise, OpExecError will be raised from within
    cli.py.

    """
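    # submit every job first so the master can process them in parallel,
    # then collect the results by polling each job in submission order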
    self.ClearFeedbackBuf()
    job_ids = [cli.SendJob(job, cl=self.cl) for job in jobs]
    Log("- Submitted job IDs %s" % ", ".join(job_ids))
    results = []
    for jid in job_ids:
      Log("- Waiting for job %s" % jid)
      results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))

    return results

  def ParseOptions(self):
    """Parses the command line options.

    In case of command line errors, it will show the usage and exit the
    program.

    """

    parser = optparse.OptionParser(usage="\n%s" % USAGE,
                                   version="%%prog (ganeti) %s" %
                                   constants.RELEASE_VERSION,
                                   option_class=cli.CliOption)

    parser.add_option("-o", "--os", dest="os", default=None,
                      help="OS to use during burnin",
                      metavar="<OS>")
    parser.add_option("--disk-size", dest="disk_size",
                      help="Disk size (determines disk count)",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--disk-growth", dest="disk_growth", help="Disk growth",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--mem-size", dest="mem_size", help="Memory size",
                      default=128, type="unit", metavar="<size>")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print command execution messages to stdout")
    parser.add_option("--no-replace1", dest="do_replace1",
                      help="Skip disk replacement with the same secondary",
                      action="store_false", default=True)
    parser.add_option("--no-replace2", dest="do_replace2",
                      help="Skip disk replacement with a different secondary",
                      action="store_false", default=True)
    parser.add_option("--no-failover", dest="do_failover",
                      help="Skip instance failovers", action="store_false",
                      default=True)
    parser.add_option("--no-importexport", dest="do_importexport",
                      help="Skip instance export/import", action="store_false",
                      default=True)
    parser.add_option("--no-startstop", dest="do_startstop",
                      help="Skip instance stop/start", action="store_false",
                      default=True)
    parser.add_option("--no-reinstall", dest="do_reinstall",
                      help="Skip instance reinstall", action="store_false",
                      default=True)
    parser.add_option("--no-reboot", dest="do_reboot",
                      help="Skip instance reboot", action="store_false",
                      default=True)
    parser.add_option("--no-activate-disks", dest="do_activate_disks",
                      help="Skip disk activation/deactivation",
                      action="store_false", default=True)
    parser.add_option("--no-add-disks", dest="do_addremove_disks",
                      help="Skip disk addition/removal",
                      action="store_false", default=True)
    parser.add_option("--no-add-nics", dest="do_addremove_nics",
                      help="Skip NIC addition/removal",
                      action="store_false", default=True)
    parser.add_option("--no-nics", dest="nics",
                      help="No network interfaces", action="store_const",
                      const=[], default=[{}])
    parser.add_option("--rename", dest="rename", default=None,
                      help="Give one unused instance name which is taken"
                           " to start the renaming sequence",
                      metavar="<instance_name>")
    parser.add_option("-t", "--disk-template", dest="disk_template",
                      choices=("diskless", "file", "plain", "drbd"),
                      default="drbd",
                      help="Disk template (diskless, file, plain or drbd)"
                            " [drbd]")
    parser.add_option("-n", "--nodes", dest="nodes", default="",
                      help="Comma separated list of nodes to perform"
                      " the burnin on (defaults to all nodes)")
    parser.add_option("--iallocator", dest="iallocator",
                      default=None, type="string",
                      help="Perform the allocation using an iallocator"
                      " instead of fixed node spread (node restrictions no"
                      " longer apply, therefore -n/--nodes must not be used)")
    parser.add_option("-p", "--parallel", default=False, action="store_true",
                      dest="parallel",
                      help="Enable parallelization of some operations in"
                      " order to speed burnin or to test granular locking")
    parser.add_option("--net-timeout", default=15, type="int",
                      dest="net_timeout",
                      help="The instance check network timeout in seconds"
                      " (defaults to 15 seconds)")
    parser.add_option("-C", "--http-check", default=False, action="store_true",
                      dest="http_check",
                      help="Enable checking of instance status via http,"
                      " looking for /hostname.txt that should contain the"
                      " name of the instance")


    options, args = parser.parse_args()
    if len(args) < 1 or options.os is None:
      Usage()

    supported_disk_templates = (constants.DT_DISKLESS,
                                constants.DT_FILE,
                                constants.DT_PLAIN,
                                constants.DT_DRBD8)
    if options.disk_template not in supported_disk_templates:
      Log("Unknown disk template '%s'" % options.disk_template)
      sys.exit(1)

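    # the number of disks is implied by the number of comma-separated size
    # values; the growth list must match the size list entry for entry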
    if options.disk_template == constants.DT_DISKLESS:
      disk_size = disk_growth = []
      options.do_addremove_disks = False
    else:
      disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
      disk_growth = [utils.ParseUnit(v)
                     for v in options.disk_growth.split(",")]
      if len(disk_growth) != len(disk_size):
        Log("Wrong disk sizes/growth combination")
        sys.exit(1)
    if ((disk_size and options.disk_template == constants.DT_DISKLESS) or
        (not disk_size and options.disk_template != constants.DT_DISKLESS)):
      Log("Wrong disk count/disk template combination")
      sys.exit(1)

    self.disk_size = disk_size
    self.disk_growth = disk_growth
    self.disk_count = len(disk_size)

    if options.nodes and options.iallocator:
      Log("Give either the nodes option or the iallocator option, not both")
      sys.exit(1)

    self.opts = options
    self.instances = args
    self.bep = {
      constants.BE_MEMORY: options.mem_size,
      constants.BE_VCPUS: 1,
      }
    self.hvp = {}

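    # the default socket timeout below bounds the network operations of the
    # HTTP liveness checks (see _CheckInstanceAlive)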
    socket.setdefaulttimeout(options.net_timeout)

  def GetState(self):
    """Read the cluster state from the config."""
    if self.opts.nodes:
      names = self.opts.nodes.split(",")
    else:
      names = []
    try:
      op = opcodes.OpQueryNodes(output_fields=["name", "offline"], names=names)
      result = self.ExecOp(op)
    except errors.GenericError, err:
      err_code, msg = cli.FormatError(err)
      Log(msg)
      sys.exit(err_code)
    self.nodes = [data[0] for data in result if not data[1]]

    result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"],
                                              names=[]))

    if not result:
      Log("Can't get the OS list")
      sys.exit(1)

    # keep only the valid OSes
    os_set = [val[0] for val in result if val[1]]

    if self.opts.os not in os_set:
      Log("OS '%s' not found" % self.opts.os)
      sys.exit(1)

  def CreateInstances(self):
    """Create the given instances.

    """
    self.to_rem = []
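    # pair every instance with a primary node and, as secondary, the next
    # node in the cycled node list, so instances are spread over all nodes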
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
                 self.instances)
    jobset = []

    for pnode, snode, instance in mytor:
      if self.opts.iallocator:
        pnode = snode = None
        Log("- Add instance %s (iallocator: %s)" %
              (instance, self.opts.iallocator))
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        snode = None
        Log("- Add instance %s on node %s" % (instance, pnode))
      else:
        Log("- Add instance %s on nodes %s/%s" % (instance, pnode, snode))

      op = opcodes.OpCreateInstance(instance_name=instance,
                                    disks=[{"size": size}
                                           for size in self.disk_size],
                                    disk_template=self.opts.disk_template,
                                    nics=self.opts.nics,
                                    mode=constants.INSTANCE_CREATE,
                                    os_type=self.opts.os,
                                    pnode=pnode,
                                    snode=snode,
                                    start=True,
                                    ip_check=True,
                                    wait_for_sync=True,
                                    file_driver="loop",
                                    file_storage_dir=None,
                                    iallocator=self.opts.iallocator,
                                    beparams=self.bep,
                                    hvparams=self.hvp,
                                    )

      if self.opts.parallel:
        jobset.append([op])
        # FIXME: here we should not append to to_rem unconditionally,
        # but only when the job is successful
        self.to_rem.append(instance)
      else:
        self.ExecOp(op)
        self.to_rem.append(instance)
    if self.opts.parallel:
      self.ExecJobSet(jobset)

    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def GrowDisks(self):
    """Grow all the instance disks by the requested amounts, if any."""
    for instance in self.instances:
      for idx, growth in enumerate(self.disk_growth):
        if growth > 0:
          op = opcodes.OpGrowDisk(instance_name=instance, disk=idx,
                                  amount=growth, wait_for_sync=True)
          Log("- Increase %s's disk/%s by %s MB" % (instance, idx, growth))
          self.ExecOp(op)

  def ReplaceDisks1D8(self):
    """Replace disks on primary and secondary for drbd8."""
    for instance in self.instances:
      for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
        op = opcodes.OpReplaceDisks(instance_name=instance,
                                    mode=mode,
                                    disks=[i for i in range(self.disk_count)])
        Log("- Replace disks (%s) for instance %s" % (mode, instance))
        self.ExecOp(op)

  def ReplaceDisks2(self):
    """Replace secondary node."""
    mode = constants.REPLACE_DISK_CHG

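    # walk the node ring with an offset of two, so the new secondary differs
    # from the primary/secondary pair chosen at instance creation time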
    mytor = izip(islice(cycle(self.nodes), 2, None),
                 self.instances)
    for tnode, instance in mytor:
      if self.opts.iallocator:
        tnode = None
      op = opcodes.OpReplaceDisks(instance_name=instance,
                                  mode=mode,
                                  remote_node=tnode,
                                  iallocator=self.opts.iallocator,
                                  disks=[i for i in range(self.disk_count)])
      Log("- Replace secondary (%s) for instance %s" % (mode, instance))
      self.ExecOp(op)

  def Failover(self):
    """Failover the instances."""

    for instance in self.instances:
      op = opcodes.OpFailoverInstance(instance_name=instance,
                                      ignore_consistency=False)

      Log("- Failover instance %s" % (instance))
      self.ExecOp(op)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def ImportExport(self):
    """Export the instance, delete it, and import it back.

    """

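    # pick, for every instance, a primary node, a secondary node and an
    # export node by walking the node ring at offsets 0, 1 and 2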
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
                 islice(cycle(self.nodes), 2, None),
                 self.instances)

    for pnode, snode, enode, instance in mytor:

      if self.opts.iallocator:
        pnode = snode = None
        import_log_msg = ("- Import instance %s from node %s"
                          " (iallocator: %s)" %
                          (instance, enode, self.opts.iallocator))
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        snode = None
        import_log_msg = ("- Import instance %s from node %s to node %s" %
                          (instance, enode, pnode))
      else:
        import_log_msg = ("- Import instance %s from node %s to nodes %s/%s" %
                          (instance, enode, pnode, snode))

      exp_op = opcodes.OpExportInstance(instance_name=instance,
                                        target_node=enode,
                                        shutdown=True)
      rem_op = opcodes.OpRemoveInstance(instance_name=instance,
                                        ignore_failures=True)
      nam_op = opcodes.OpQueryInstances(output_fields=["name"],
                                        names=[instance])
      full_name = self.ExecOp(nam_op)[0][0]
      imp_dir = os.path.join(constants.EXPORT_DIR, full_name)
      imp_op = opcodes.OpCreateInstance(instance_name=instance,
                                        disks=[{"size": size}
                                               for size in self.disk_size],
                                        disk_template=self.opts.disk_template,
                                        nics=self.opts.nics,
                                        mode=constants.INSTANCE_IMPORT,
                                        src_node=enode,
                                        src_path=imp_dir,
                                        pnode=pnode,
                                        snode=snode,
                                        start=True,
                                        ip_check=True,
                                        wait_for_sync=True,
                                        file_storage_dir=None,
                                        file_driver="loop",
                                        iallocator=self.opts.iallocator,
                                        beparams=self.bep,
                                        hvparams=self.hvp,
                                        )

      erem_op = opcodes.OpRemoveExport(instance_name=instance)

      Log("- Export instance %s to node %s" % (instance, enode))
      self.ExecOp(exp_op)
      Log("- Remove instance %s" % (instance))
      self.ExecOp(rem_op)
      self.to_rem.remove(instance)
      Log(import_log_msg)
      self.ExecOp(imp_op)
      Log("- Remove export of instance %s" % (instance))
      self.ExecOp(erem_op)

      self.to_rem.append(instance)

    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def StopInstance(self, instance):
    """Stop given instance."""
    op = opcodes.OpShutdownInstance(instance_name=instance)
    Log("- Shutdown instance %s" % instance)
    self.ExecOp(op)

  def StartInstance(self, instance):
    """Start given instance."""
    op = opcodes.OpStartupInstance(instance_name=instance, force=False)
    Log("- Start instance %s" % instance)
    self.ExecOp(op)

  def RenameInstance(self, instance, instance_new):
    """Rename instance."""
    op = opcodes.OpRenameInstance(instance_name=instance,
                                  new_name=instance_new)
    Log("- Rename instance %s to %s" % (instance, instance_new))
    self.ExecOp(op)

  def StopStart(self):
    """Stop/start the instances."""
    for instance in self.instances:
      self.StopInstance(instance)
      self.StartInstance(instance)

    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def Remove(self):
    """Remove the instances."""
    for instance in self.to_rem:
      op = opcodes.OpRemoveInstance(instance_name=instance,
                                    ignore_failures=True)
      Log("- Remove instance %s" % instance)
      self.ExecOp(op)

  def Rename(self):
    """Rename the instances."""
    rename = self.opts.rename
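    # every instance is renamed to the single spare name given via --rename
    # and then back to its original name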
    for instance in self.instances:
      self.StopInstance(instance)
      self.RenameInstance(instance, rename)
      self.StartInstance(rename)
      self._CheckInstanceAlive(rename)
      self.StopInstance(rename)
      self.RenameInstance(rename, instance)
      self.StartInstance(instance)

    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def Reinstall(self):
    """Reinstall the instances."""
    for instance in self.instances:
      self.StopInstance(instance)
      op = opcodes.OpReinstallInstance(instance_name=instance)
      Log("- Reinstall instance %s without passing the OS" % (instance,))
      self.ExecOp(op)
      op = opcodes.OpReinstallInstance(instance_name=instance,
                                       os_type=self.opts.os)
      Log("- Reinstall instance %s specifying the OS" % (instance,))
      self.ExecOp(op)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def Reboot(self):
    """Reboot the instances."""
    for instance in self.instances:
      for reboot_type in constants.REBOOT_TYPES:
        op = opcodes.OpRebootInstance(instance_name=instance,
                                      reboot_type=reboot_type,
                                      ignore_secondaries=False)
        Log("- Reboot instance %s with type '%s'" % (instance, reboot_type))
        self.ExecOp(op)
        self._CheckInstanceAlive(instance)

  def ActivateDisks(self):
    """Activate and deactivate disks of the instances."""
    for instance in self.instances:
      op_act = opcodes.OpActivateInstanceDisks(instance_name=instance)
      op_deact = opcodes.OpDeactivateInstanceDisks(instance_name=instance)
      Log("- Activate disks of online instance %s" % (instance,))
      self.ExecOp(op_act)
      self.StopInstance(instance)
      Log("- Activate disks of offline instance %s" % (instance,))
      self.ExecOp(op_act)
      Log("- Deactivate disks of offline instance %s" % (instance,))
      self.ExecOp(op_deact)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def AddRemoveDisks(self):
    """Add and remove an extra disk for the instances."""
    for instance in self.instances:
      op_add = opcodes.OpSetInstanceParams(\
        instance_name=instance,
        disks=[(constants.DDM_ADD, {"size": self.disk_size[0]})])
      op_rem = opcodes.OpSetInstanceParams(\
        instance_name=instance, disks=[(constants.DDM_REMOVE, {})])
      Log("- Adding a disk to instance %s" % (instance,))
      self.ExecOp(op_add)
      self.StopInstance(instance)
      Log("- Removing the last disk of instance %s" % (instance,))
      self.ExecOp(op_rem)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)

  def AddRemoveNICs(self):
    """Add and remove an extra NIC for the instances."""
    for instance in self.instances:
      op_add = opcodes.OpSetInstanceParams(\
        instance_name=instance, nics=[(constants.DDM_ADD, {})])
      op_rem = opcodes.OpSetInstanceParams(\
        instance_name=instance, nics=[(constants.DDM_REMOVE, {})])
      Log("- Adding a NIC to instance %s" % (instance,))
      self.ExecOp(op_add)
      Log("- Removing the last NIC of instance %s" % (instance,))
      self.ExecOp(op_rem)

  def _CheckInstanceAlive(self, instance):
    """Check if an instance is alive by doing http checks.

    This will try to retrieve http://<instance>/hostname.txt and check
    that the response contains the instance's hostname. On ECONNREFUSED
    we keep retrying for up to net_timeout seconds; any other error
    aborts the check.

    """
    if not self.opts.http_check:
      return
    try:
      for retries in range(self.opts.net_timeout):
        try:
          url = urllib2.urlopen("http://%s/hostname.txt" % instance)
        except urllib2.URLError, err:
          if err.args[0][0] == errno.ECONNREFUSED:
            time.sleep(1)
            continue
          raise
        else:
          break
      else:
        raise InstanceDown(instance, "HTTP liveness check timed out")
    except urllib2.URLError, err:
      raise InstanceDown(instance, str(err))
    hostname = url.read().strip()
    if hostname != instance:
      raise InstanceDown(instance, ("Hostname mismatch, expected %s, got %s" %
                                    (instance, hostname)))

  def BurninCluster(self):
    """Test a cluster intensively.

    This will create instances and then start/stop/failover them.
    It is safe for existing instances but could impact performance.

    """

    opts = self.opts

    Log("- Testing global parameters")

    if (len(self.nodes) == 1 and
        opts.disk_template not in (constants.DT_DISKLESS, constants.DT_PLAIN,
                                   constants.DT_FILE)):
      Log("When one node is available/selected the disk template must"
          " be 'diskless', 'file' or 'plain'")
      sys.exit(1)

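    # assume failure until the whole sequence below completes; on any
    # exception the accumulated opcode feedback is dumped before cleanup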
    has_err = True
    try:
      self.CreateInstances()
      if opts.do_replace1 and opts.disk_template in constants.DTS_NET_MIRROR:
        self.ReplaceDisks1D8()
      if (opts.do_replace2 and len(self.nodes) > 2 and
          opts.disk_template in constants.DTS_NET_MIRROR):
        self.ReplaceDisks2()

      if opts.disk_template != constants.DT_DISKLESS:
        self.GrowDisks()

      if opts.do_failover and opts.disk_template in constants.DTS_NET_MIRROR:
        self.Failover()

      if (opts.do_importexport and
          opts.disk_template not in (constants.DT_DISKLESS,
                                     constants.DT_FILE)):
        self.ImportExport()

      if opts.do_reinstall:
        self.Reinstall()

      if opts.do_reboot:
        self.Reboot()

      if opts.do_addremove_disks:
        self.AddRemoveDisks()

      if opts.do_addremove_nics:
        self.AddRemoveNICs()

      if opts.do_activate_disks:
        self.ActivateDisks()

      if opts.do_startstop:
        self.StopStart()

      if opts.rename:
        self.Rename()

      has_err = False
    finally:
      if has_err:
        Log("Error detected: opcode buffer follows:\n\n")
        Log(self.GetFeedbackBuf())
        Log("\n\n")
      self.Remove()

    return 0


def main():
  """Main function"""

  burner = Burner()
  return burner.BurninCluster()


if __name__ == "__main__":
  main()