4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 from itertools import izip, islice, cycle
34 from cStringIO import StringIO
36 from ganeti import opcodes
37 from ganeti import mcpu
38 from ganeti import constants
39 from ganeti import cli
40 from ganeti import errors
41 from ganeti import utils
# One-line usage synopsis, printed by Usage() and embedded in the
# OptionParser's usage string.
USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
class InstanceDown(Exception):
  """Raised when an instance that is expected to be running is found down."""
  # NOTE(review): the "def Usage():" header is not visible in this view;
  # this is the function body (Python 2 print-chevron writes to stderr,
  # followed by a sys.exit call that is also not visible here).
  """Shows program usage information and exits the program."""
  print >> sys.stderr, "Usage:"
  print >> sys.stderr, USAGE
# Print a message to stdout, indented two spaces per level; the
# per-level prefix comes from a "headers" mapping whose definition is
# not visible in this view — TODO confirm.
def Log(msg, indent=0):
  """Simple function that prints out its argument.
  sys.stdout.write("%*s%s%s\n" % (2*indent, "",
                                  headers.get(indent, " "), msg))
# Write msg to stderr; the exit_code parameter suggests the function
# also terminates the process (the sys.exit call is not visible here —
# TODO confirm).
def Err(msg, exit_code=1):
  """Simple error logging that prints to stderr.
  sys.stderr.write(msg + "\n")
    # Part of the class constructor (the "def __init__" header is not
    # visible here): set up burnin logging, create the buffer that
    # accumulates opcode feedback, and obtain a cluster client handle.
    utils.SetupLogging(constants.LOG_BURNIN, debug=False, stderr_logging=True)
    self._feed_buf = StringIO()
    self.cl = cli.GetClient()
95 def ClearFeedbackBuf(self):
96 """Clear the feedback buffer."""
97 self._feed_buf.truncate(0)
99 def GetFeedbackBuf(self):
100 """Return the contents of the buffer."""
101 return self._feed_buf.getvalue()
  # Timestamp each feedback message and append it to the buffer; when
  # running verbose the message is presumably also echoed to stdout
  # (the echo statement is not visible in this view — TODO confirm).
  def Feedback(self, msg):
    """Accumulate feedback in our buffer."""
    self._feed_buf.write("%s %s\n" % (time.ctime(utils.MergeTime(msg[0])),
    if self.opts.verbose:
110 def ExecOp(self, op):
111 """Execute an opcode and manage the exec buffer."""
112 self.ClearFeedbackBuf()
113 return cli.SubmitOpCode(op, feedback_fn=self.Feedback, cl=self.cl)
  # Submit all jobs up front, then poll each one to completion, routing
  # its output through the feedback buffer.  NOTE(review): several lines
  # of this method (including the polling loop header and the final
  # return) are not visible in this view.
  def ExecJobSet(self, jobs):
    """Execute a set of jobs and return once all are done.
    The method will return the list of results, if all jobs are
    successful. Otherwise, OpExecError will be raised from within
    self.ClearFeedbackBuf()
    job_ids = [cli.SendJob(job, cl=self.cl) for job in jobs]
    Log("Submitted job IDs %s" % ", ".join(job_ids), indent=1)
      Log("Waiting for job %s" % jid, indent=2)
      results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))
  # Build the optparse option set, validate the parsed values and store
  # the results on self (disk sizes/growth, instance names, backend
  # parameters, network timeout).  NOTE(review): many lines of this
  # method are not visible in this view, so some statements below appear
  # without their surrounding context (e.g. the "else:" branch for
  # non-diskless templates and the dict literal holding BE_* values).
  def ParseOptions(self):
    """Parses the command line options.
    In case of command line errors, it will show the usage and exit the
    parser = optparse.OptionParser(usage="\n%s" % USAGE,
                                   version="%%prog (ganeti) %s" %
                                   constants.RELEASE_VERSION,
                                   option_class=cli.CliOption)
    parser.add_option("-o", "--os", dest="os", default=None,
                      help="OS to use during burnin",
    parser.add_option("--disk-size", dest="disk_size",
                      help="Disk size (determines disk count)",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--disk-growth", dest="disk_growth", help="Disk growth",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--mem-size", dest="mem_size", help="Memory size",
                      default=128, type="unit", metavar="<size>")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print command execution messages to stdout")
    parser.add_option("--no-replace1", dest="do_replace1",
                      help="Skip disk replacement with the same secondary",
                      action="store_false", default=True)
    parser.add_option("--no-replace2", dest="do_replace2",
                      help="Skip disk replacement with a different secondary",
                      action="store_false", default=True)
    parser.add_option("--no-failover", dest="do_failover",
                      help="Skip instance failovers", action="store_false",
    parser.add_option("--no-migrate", dest="do_migrate",
                      help="Skip instance live migration",
                      action="store_false", default=True)
    parser.add_option("--no-importexport", dest="do_importexport",
                      help="Skip instance export/import", action="store_false",
    parser.add_option("--no-startstop", dest="do_startstop",
                      help="Skip instance stop/start", action="store_false",
    parser.add_option("--no-reinstall", dest="do_reinstall",
                      help="Skip instance reinstall", action="store_false",
    parser.add_option("--no-reboot", dest="do_reboot",
                      help="Skip instance reboot", action="store_false",
    parser.add_option("--no-activate-disks", dest="do_activate_disks",
                      help="Skip disk activation/deactivation",
                      action="store_false", default=True)
    parser.add_option("--no-add-disks", dest="do_addremove_disks",
                      help="Skip disk addition/removal",
                      action="store_false", default=True)
    parser.add_option("--no-add-nics", dest="do_addremove_nics",
                      help="Skip NIC addition/removal",
                      action="store_false", default=True)
    parser.add_option("--no-nics", dest="nics",
                      help="No network interfaces", action="store_const",
                      const=[], default=[{}])
    parser.add_option("--rename", dest="rename", default=None,
                      help="Give one unused instance name which is taken"
                      " to start the renaming sequence",
                      metavar="<instance_name>")
    parser.add_option("-t", "--disk-template", dest="disk_template",
                      choices=("diskless", "file", "plain", "drbd"),
                      help="Disk template (diskless, file, plain or drbd)"
    parser.add_option("-n", "--nodes", dest="nodes", default="",
                      help="Comma separated list of nodes to perform"
                      " the burnin on (defaults to all nodes)")
    parser.add_option("--iallocator", dest="iallocator",
                      default=None, type="string",
                      help="Perform the allocation using an iallocator"
                      " instead of fixed node spread (node restrictions no"
                      " longer apply, therefore -n/--nodes must not be used")
    parser.add_option("-p", "--parallel", default=False, action="store_true",
                      help="Enable parallelization of some operations in"
                      " order to speed burnin or to test granular locking")
    parser.add_option("--net-timeout", default=15, type="int",
                      help="The instance check network timeout in seconds"
                      " (defaults to 15 seconds)")
    parser.add_option("-C", "--http-check", default=False, action="store_true",
                      help="Enable checking of instance status via http,"
                      " looking for /hostname.txt that should contain the"
                      " name of the instance")
    parser.add_option("-K", "--keep-instances", default=False,
                      dest="keep_instances",
                      help="Leave instances on the cluster after burnin,"
                      " for investigation in case of errors or simply"
    options, args = parser.parse_args()
    if len(args) < 1 or options.os is None:
    supported_disk_templates = (constants.DT_DISKLESS,
    if options.disk_template not in supported_disk_templates:
      Err("Unknown disk template '%s'" % options.disk_template)
    if options.disk_template == constants.DT_DISKLESS:
      disk_size = disk_growth = []
      options.do_addremove_disks = False
      disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
      disk_growth = [utils.ParseUnit(v)
                     for v in options.disk_growth.split(",")]
      if len(disk_growth) != len(disk_size):
        Err("Wrong disk sizes/growth combination")
    if ((disk_size and options.disk_template == constants.DT_DISKLESS) or
        (not disk_size and options.disk_template != constants.DT_DISKLESS)):
      Err("Wrong disk count/disk template combination")
    self.disk_size = disk_size
    self.disk_growth = disk_growth
    self.disk_count = len(disk_size)
    if options.nodes and options.iallocator:
      Err("Give either the nodes option or the iallocator option, not both")
    self.instances = args
      constants.BE_MEMORY: options.mem_size,
      constants.BE_VCPUS: 1,
    socket.setdefaulttimeout(options.net_timeout)
    # NOTE(review): the method header is not visible here.  This queries
    # the cluster for the (non-offline) node list and the list of valid
    # OSes, aborting via Err() on failure.
    """Read the cluster state from the config."""
      names = self.opts.nodes.split(",")
      op = opcodes.OpQueryNodes(output_fields=["name", "offline"], names=names)
      result = self.ExecOp(op)
    except errors.GenericError, err:
      err_code, msg = cli.FormatError(err)
      Err(msg, exit_code=err_code)
    # keep only nodes that are not marked offline
    self.nodes = [data[0] for data in result if not data[1]]
    result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"],
      Err("Can't get the OS list")
    # filter non-valid OS-es
    os_set = [val[0] for val in result if val[1]]
    if self.opts.os not in os_set:
      Err("OS '%s' not found" % self.opts.os)
  # Create one instance per requested name, spreading primary/secondary
  # nodes round-robin over the node list (or delegating placement to the
  # iallocator).  In parallel mode jobs are batched and submitted
  # together.  NOTE(review): several lines of this method are not
  # visible in this view.
  def CreateInstances(self):
    """Create the given instances.
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
    Log("Creating instances")
    for pnode, snode, instance in mytor:
      Log("instance %s" % instance, indent=1)
      if self.opts.iallocator:
        msg = "with iallocator %s" % self.opts.iallocator
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        msg = "on %s" % pnode
        msg = "on %s, %s" % (pnode, snode)
      op = opcodes.OpCreateInstance(instance_name=instance,
                                    disks = [ {"size": size}
                                              for size in self.disk_size],
                                    disk_template=self.opts.disk_template,
                                    mode=constants.INSTANCE_CREATE,
                                    os_type=self.opts.os,
                                    file_storage_dir=None,
                                    iallocator=self.opts.iallocator,
      if self.opts.parallel:
        # FIXME: here we should not append to to_rem unconditionally,
        # but only when the job is successful
        self.to_rem.append(instance)
        self.to_rem.append(instance)
    if self.opts.parallel:
      self.ExecJobSet(jobset)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    # NOTE(review): the method header is not visible here.  Issue one
    # OpGrowDisk per configured growth amount for every instance.
    """Grow both the os and the swap disks by the requested amount, if any."""
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      for idx, growth in enumerate(self.disk_growth):
        op = opcodes.OpGrowDisk(instance_name=instance, disk=idx,
                                amount=growth, wait_for_sync=True)
        Log("increase disk/%s by %s MB" % (idx, growth), indent=2)
  # Run a replace-disks in-place on every instance, once for the
  # secondary and once for the primary side.  (The mode keyword and the
  # ExecOp call are not visible in this view.)
  def ReplaceDisks1D8(self):
    """Replace disks on primary and secondary for drbd8."""
    Log("Replacing disks on the same nodes")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
        op = opcodes.OpReplaceDisks(instance_name=instance,
                                    disks=[i for i in range(self.disk_count)])
        Log("run %s" % mode, indent=2)
  # Move each instance's secondary to a new node (round-robin two nodes
  # ahead), optionally letting the iallocator pick the target.
  # NOTE(review): some lines of this method are not visible here.
  def ReplaceDisks2(self):
    """Replace secondary node."""
    Log("Changing the secondary node")
    mode = constants.REPLACE_DISK_CHG
    mytor = izip(islice(cycle(self.nodes), 2, None),
    for tnode, instance in mytor:
      Log("instance %s" % instance, indent=1)
      if self.opts.iallocator:
        msg = "with iallocator %s" % self.opts.iallocator
      op = opcodes.OpReplaceDisks(instance_name=instance,
                                  iallocator=self.opts.iallocator,
                                  disks=[i for i in range(self.disk_count)])
      Log("run %s %s" % (mode, msg), indent=2)
    # NOTE(review): the method header is not visible here.  Fail over
    # every instance, then verify each is alive again.
    """Failover the instances."""
    Log("Failing over instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op = opcodes.OpFailoverInstance(instance_name=instance,
                                      ignore_consistency=False)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    # NOTE(review): the method header is not visible here.  Live-migrate
    # each instance, then run a migration-cleanup pass.  (The opcode
    # continuation lines and the ExecOp calls are not visible.)
    """Migrate the instances."""
    Log("Migrating instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op = opcodes.OpMigrateInstance(instance_name=instance, live=True,
      Log("migration", indent=2)
      op = opcodes.OpMigrateInstance(instance_name=instance, live=True,
      Log("migration cleanup", indent=2)
  # Round-trip every instance through export / remove / re-import, then
  # delete the export; the export node is chosen round-robin two nodes
  # ahead of the primary.  to_rem bookkeeping mirrors the remove/import
  # so cleanup stays accurate.  NOTE(review): several lines of this
  # method are not visible in this view.
  def ImportExport(self):
    """Export the instance, delete it, and import it back.
    Log("Exporting and re-importing instances")
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
                 islice(cycle(self.nodes), 2, None),
    for pnode, snode, enode, instance in mytor:
      Log("instance %s" % instance, indent=1)
      if self.opts.iallocator:
        import_log_msg = ("import from %s"
                          " with iallocator %s" %
                          (enode, self.opts.iallocator))
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        import_log_msg = ("import from %s to %s" %
        import_log_msg = ("import from %s to %s, %s" %
                          (enode, pnode, snode))
      exp_op = opcodes.OpExportInstance(instance_name=instance,
      rem_op = opcodes.OpRemoveInstance(instance_name=instance,
                                        ignore_failures=True)
      nam_op = opcodes.OpQueryInstances(output_fields=["name"],
      full_name = self.ExecOp(nam_op)[0][0]
      imp_dir = os.path.join(constants.EXPORT_DIR, full_name)
      imp_op = opcodes.OpCreateInstance(instance_name=instance,
                                        disks = [ {"size": size}
                                                  for size in self.disk_size],
                                        disk_template=self.opts.disk_template,
                                        mode=constants.INSTANCE_IMPORT,
                                        file_storage_dir=None,
                                        iallocator=self.opts.iallocator,
      erem_op = opcodes.OpRemoveExport(instance_name=instance)
      Log("export to node %s" % enode, indent=2)
      Log("remove instance", indent=2)
      self.to_rem.remove(instance)
      Log(import_log_msg, indent=2)
      Log("remove export", indent=2)
      self.to_rem.append(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  # Shut down one instance.  (The ExecOp call executing the opcode is
  # not visible in this view.)
  def StopInstance(self, instance):
    """Stop given instance."""
    op = opcodes.OpShutdownInstance(instance_name=instance)
    Log("shutdown", indent=2)
  # Start one instance without forcing.  (The ExecOp call is not
  # visible in this view.)
  def StartInstance(self, instance):
    """Start given instance."""
    op = opcodes.OpStartupInstance(instance_name=instance, force=False)
    Log("startup", indent=2)
  # Rename one instance to instance_new.  (The ExecOp call is not
  # visible in this view.)
  def RenameInstance(self, instance, instance_new):
    """Rename instance."""
    op = opcodes.OpRenameInstance(instance_name=instance,
                                  new_name=instance_new)
    Log("rename to %s" % instance_new, indent=2)
    # NOTE(review): the method header is not visible here.  Stop and
    # restart every instance, then verify each is alive.
    """Stop/start the instances."""
    Log("Stopping and starting instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      self.StopInstance(instance)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    # NOTE(review): the method header is not visible here.  Remove every
    # instance recorded in to_rem, ignoring individual failures so
    # cleanup proceeds as far as possible.
    """Remove the instances."""
    Log("Removing instances")
    for instance in self.to_rem:
      Log("instance %s" % instance, indent=1)
      op = opcodes.OpRemoveInstance(instance_name=instance,
                                    ignore_failures=True)
    # NOTE(review): the method header is not visible here.  Rename each
    # instance to the spare name given via --rename and back again,
    # checking liveness after the round trip.
    """Rename the instances."""
    Log("Renaming instances")
    rename = self.opts.rename
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      self.StopInstance(instance)
      self.RenameInstance(instance, rename)
      self.StartInstance(rename)
      self._CheckInstanceAlive(rename)
      self.StopInstance(rename)
      self.RenameInstance(rename, instance)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    # NOTE(review): the method header is not visible here.  Reinstall
    # each stopped instance twice: once letting the OS default, once
    # naming the OS explicitly.  (ExecOp calls are not visible.)
    """Reinstall the instances."""
    Log("Reinstalling instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      self.StopInstance(instance)
      op = opcodes.OpReinstallInstance(instance_name=instance)
      Log("reinstall without passing the OS", indent=2)
      op = opcodes.OpReinstallInstance(instance_name=instance,
                                       os_type=self.opts.os)
      Log("reinstall specifying the OS", indent=2)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    # NOTE(review): the method header is not visible here.  Reboot every
    # instance once per supported reboot type, verifying liveness after
    # each reboot.  (The ExecOp call is not visible.)
    """Reboot the instances."""
    Log("Rebooting instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      for reboot_type in constants.REBOOT_TYPES:
        op = opcodes.OpRebootInstance(instance_name=instance,
                                      reboot_type=reboot_type,
                                      ignore_secondaries=False)
        Log("reboot with type '%s'" % reboot_type, indent=2)
        self._CheckInstanceAlive(instance)
  # Exercise disk activation/deactivation both while the instance is
  # running and while it is stopped.  (The ExecOp calls for some steps
  # are not visible in this view.)
  def ActivateDisks(self):
    """Activate and deactivate disks of the instances."""
    Log("Activating/deactivating disks")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op_act = opcodes.OpActivateInstanceDisks(instance_name=instance)
      op_deact = opcodes.OpDeactivateInstanceDisks(instance_name=instance)
      Log("activate disks when online", indent=2)
      self.StopInstance(instance)
      Log("activate disks when offline", indent=2)
      Log("deactivate disks (when offline)", indent=2)
      self.ExecOp(op_deact)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  # Hot-add an extra disk (same size as the first configured disk) to
  # each instance, then remove the last disk while stopped.  (The
  # ExecOp calls are not visible in this view.)
  def AddRemoveDisks(self):
    """Add and remove an extra disk for the instances."""
    Log("Adding and removing disks")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op_add = opcodes.OpSetInstanceParams(\
        instance_name=instance,
        disks=[(constants.DDM_ADD, {"size": self.disk_size[0]})])
      op_rem = opcodes.OpSetInstanceParams(\
        instance_name=instance, disks=[(constants.DDM_REMOVE, {})])
      Log("adding a disk", indent=2)
      self.StopInstance(instance)
      Log("removing last disk", indent=2)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  # Hot-add a NIC to each instance and then remove the last NIC.  (The
  # ExecOp calls and the tail of the method are not visible here.)
  def AddRemoveNICs(self):
    """Add and remove an extra NIC for the instances."""
    Log("Adding and removing NICs")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op_add = opcodes.OpSetInstanceParams(\
        instance_name=instance, nics=[(constants.DDM_ADD, {})])
      op_rem = opcodes.OpSetInstanceParams(\
        instance_name=instance, nics=[(constants.DDM_REMOVE, {})])
      Log("adding a NIC", indent=2)
      Log("removing last NIC", indent=2)
  # Poll http://<instance>/hostname.txt (retrying on ECONNREFUSED up to
  # the configured network timeout) and verify the served name matches;
  # raises InstanceDown on mismatch or on a non-refused URL error.
  # Skipped entirely unless --http-check was given.  NOTE(review): the
  # try blocks and retry plumbing are partly not visible here.
  def _CheckInstanceAlive(self, instance):
    """Check if an instance is alive by doing http checks.
    This will try to retrieve the url on the instance /hostname.txt
    and check that it contains the hostname of the instance. In case
    we get ECONNREFUSED, we retry up to the net timeout seconds, for
    any other error we abort.
    if not self.opts.http_check:
    for retries in range(self.opts.net_timeout):
        url = urllib2.urlopen("http://%s/hostname.txt" % instance)
      except urllib2.URLError, err:
        if err.args[0][0] == errno.ECONNREFUSED:
    except urllib2.URLError, err:
      raise InstanceDown(instance, str(err))
    hostname = url.read().strip()
    if hostname != instance:
      raise InstanceDown(instance, ("Hostname mismatch, expected %s, got %s" %
                                    (instance, hostname)))
  # Top-level driver: runs each sub-test gated by its command-line flag
  # and the disk template's capabilities; on error, dumps the opcode
  # feedback buffer and (unless --keep-instances) removes everything.
  # NOTE(review): many lines of this method are not visible here, so
  # several "if" bodies below appear without their calls.
  def BurninCluster(self):
    """Test a cluster intensively.
    This will create instances and then start/stop/failover them.
    It is safe for existing instances but could impact performance.
    Log("Testing global parameters")
    if (len(self.nodes) == 1 and
        opts.disk_template not in (constants.DT_DISKLESS, constants.DT_PLAIN,
      Err("When one node is available/selected the disk template must"
          " be 'diskless', 'file' or 'plain'")
    self.CreateInstances()
    if opts.do_replace1 and opts.disk_template in constants.DTS_NET_MIRROR:
      self.ReplaceDisks1D8()
    if (opts.do_replace2 and len(self.nodes) > 2 and
        opts.disk_template in constants.DTS_NET_MIRROR) :
    if opts.disk_template != constants.DT_DISKLESS:
    if opts.do_failover and opts.disk_template in constants.DTS_NET_MIRROR:
    if opts.do_migrate and opts.disk_template == constants.DT_DRBD8:
    if (opts.do_importexport and
        opts.disk_template not in (constants.DT_DISKLESS,
    if opts.do_reinstall:
    if opts.do_addremove_disks:
      self.AddRemoveDisks()
    if opts.do_addremove_nics:
    if opts.do_activate_disks:
    if opts.do_startstop:
      Log("Error detected: opcode buffer follows:\n\n")
      Log(self.GetFeedbackBuf())
    if not self.opts.keep_instances:
  # Tail of main() (header not visible in this view): run the burnin
  # and propagate its result as the script's return value.
  return burner.BurninCluster()
# Standard script entry-point guard; the call it protects (presumably
# main()) is not visible in this view.
if __name__ == "__main__":