4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 from itertools import izip, islice, cycle
34 from cStringIO import StringIO
36 from ganeti import opcodes
37 from ganeti import mcpu
38 from ganeti import constants
39 from ganeti import cli
40 from ganeti import errors
41 from ganeti import utils
# One-line usage banner shown by Usage(); -o OS_NAME is mandatory.
USAGE = "\tburnin -o OS_NAME [options...] instance_name ..."
class InstanceDown(Exception):
  """Signals that an instance which was expected to be up was not."""
  """Shows program usage information and exits the program."""
  # Python 2 print-to-file syntax: write the usage banner to stderr.
  # NOTE(review): the "def Usage():" header and the trailing exit call
  # implied by the docstring are in source lines not visible in this chunk.
  print >> sys.stderr, "Usage:"
  print >> sys.stderr, USAGE
def Log(msg, indent=0):
  """Simple function that prints out its argument.

  Output is indented two spaces per *indent* level and prefixed with a
  per-level header marker looked up in "headers".
  """
  # NOTE(review): the "headers" mapping (indent level -> marker string) is
  # defined in source lines not visible in this chunk.
  sys.stdout.write("%*s%s%s\n" % (2*indent, "",
                                  headers.get(indent, " "), msg))
def Err(msg, exit_code=1):
  """Simple error logging that prints to stderr.
  """
  sys.stderr.write(msg + "\n")
  # NOTE(review): exit_code is unused in the visible lines; the call that
  # terminates the program with it is presumably in lines not shown here.
    # Fragment of the burner's constructor (the "def __init__" header is in
    # source lines not visible in this chunk): set up burnin logging, the
    # feedback accumulation buffer and the cluster client connection.
    utils.SetupLogging(constants.LOG_BURNIN, debug=False, stderr_logging=True)
    self._feed_buf = StringIO()
    self.cl = cli.GetClient()
95 def ClearFeedbackBuf(self):
96 """Clear the feedback buffer."""
97 self._feed_buf.truncate(0)
99 def GetFeedbackBuf(self):
100 """Return the contents of the buffer."""
101 return self._feed_buf.getvalue()
  def Feedback(self, msg):
    """Accumulate feedback in our buffer.

    msg[0] is a timestamp rendered via utils.MergeTime/time.ctime; the
    message text itself comes from a later element of msg.
    """
    # NOTE(review): the continuation of this write() call and the body of
    # the verbose branch are in source lines not visible in this chunk.
    self._feed_buf.write("%s %s\n" % (time.ctime(utils.MergeTime(msg[0])),
    if self.opts.verbose:
110 def ExecOp(self, op):
111 """Execute an opcode and manage the exec buffer."""
112 self.ClearFeedbackBuf()
113 return cli.SubmitOpCode(op, feedback_fn=self.Feedback, cl=self.cl)
  def ExecJobSet(self, jobs):
    """Execute a set of jobs and return once all are done.

    The method will return the list of results, if all jobs are
    successful. Otherwise, OpExecError will be raised from within
    the polling below.
    """
    self.ClearFeedbackBuf()
    # Submit all jobs first, then poll each one to completion.
    job_ids = [cli.SendJob(job, cl=self.cl) for job in jobs]
    Log("Submitted job IDs %s" % ", ".join(job_ids), indent=1)
    # NOTE(review): the loop header binding "jid" over job_ids, and the
    # initialisation and return of "results", are in source lines not
    # visible in this chunk.
        Log("Waiting for job %s" % jid, indent=2)
        results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))
  def ParseOptions(self):
    """Parses the command line options.

    In case of command line errors, it will show the usage and exit the
    program.
    """
    # NOTE(review): several continuation lines of the option definitions and
    # validation code below (e.g. defaults for -o/-t/--no-* options, the
    # Usage()/exit branch, the full supported_disk_templates tuple, the
    # "else:" introducing the non-diskless branch, and the dict header for
    # the backend parameters) are in source lines not visible in this chunk.
    parser = optparse.OptionParser(usage="\n%s" % USAGE,
                                   version="%%prog (ganeti) %s" %
                                   constants.RELEASE_VERSION,
                                   option_class=cli.CliOption)
    parser.add_option("-o", "--os", dest="os", default=None,
                      help="OS to use during burnin",
    parser.add_option("--disk-size", dest="disk_size",
                      help="Disk size (determines disk count)",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--disk-growth", dest="disk_growth", help="Disk growth",
                      default="128m", type="string", metavar="<size,size,...>")
    parser.add_option("--mem-size", dest="mem_size", help="Memory size",
                      default=128, type="unit", metavar="<size>")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print command execution messages to stdout")
    # The --no-* flags below invert their "do_*" destinations (store_false).
    parser.add_option("--no-replace1", dest="do_replace1",
                      help="Skip disk replacement with the same secondary",
                      action="store_false", default=True)
    parser.add_option("--no-replace2", dest="do_replace2",
                      help="Skip disk replacement with a different secondary",
                      action="store_false", default=True)
    parser.add_option("--no-failover", dest="do_failover",
                      help="Skip instance failovers", action="store_false",
    parser.add_option("--no-importexport", dest="do_importexport",
                      help="Skip instance export/import", action="store_false",
    parser.add_option("--no-startstop", dest="do_startstop",
                      help="Skip instance stop/start", action="store_false",
    parser.add_option("--no-reinstall", dest="do_reinstall",
                      help="Skip instance reinstall", action="store_false",
    parser.add_option("--no-reboot", dest="do_reboot",
                      help="Skip instance reboot", action="store_false",
    parser.add_option("--no-activate-disks", dest="do_activate_disks",
                      help="Skip disk activation/deactivation",
                      action="store_false", default=True)
    parser.add_option("--no-add-disks", dest="do_addremove_disks",
                      help="Skip disk addition/removal",
                      action="store_false", default=True)
    parser.add_option("--no-add-nics", dest="do_addremove_nics",
                      help="Skip NIC addition/removal",
                      action="store_false", default=True)
    parser.add_option("--no-nics", dest="nics",
                      help="No network interfaces", action="store_const",
                      const=[], default=[{}])
    parser.add_option("--rename", dest="rename", default=None,
                      help="Give one unused instance name which is taken"
                      " to start the renaming sequence",
                      metavar="<instance_name>")
    parser.add_option("-t", "--disk-template", dest="disk_template",
                      choices=("diskless", "file", "plain", "drbd"),
                      help="Disk template (diskless, file, plain or drbd)"
    parser.add_option("-n", "--nodes", dest="nodes", default="",
                      help="Comma separated list of nodes to perform"
                      " the burnin on (defaults to all nodes)")
    parser.add_option("--iallocator", dest="iallocator",
                      default=None, type="string",
                      help="Perform the allocation using an iallocator"
                      " instead of fixed node spread (node restrictions no"
                      " longer apply, therefore -n/--nodes must not be used")
    parser.add_option("-p", "--parallel", default=False, action="store_true",
                      help="Enable parallelization of some operations in"
                      " order to speed burnin or to test granular locking")
    parser.add_option("--net-timeout", default=15, type="int",
                      help="The instance check network timeout in seconds"
                      " (defaults to 15 seconds)")
    parser.add_option("-C", "--http-check", default=False, action="store_true",
                      help="Enable checking of instance status via http,"
                      " looking for /hostname.txt that should contain the"
                      " name of the instance")
    options, args = parser.parse_args()
    # At least one instance name and an OS are mandatory.
    if len(args) < 1 or options.os is None:
    supported_disk_templates = (constants.DT_DISKLESS,
    if options.disk_template not in supported_disk_templates:
      Err("Unknown disk template '%s'" % options.disk_template)
    if options.disk_template == constants.DT_DISKLESS:
      # Diskless instances have no disks to grow or add/remove.
      disk_size = disk_growth = []
      options.do_addremove_disks = False
      disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
      disk_growth = [utils.ParseUnit(v)
                     for v in options.disk_growth.split(",")]
      if len(disk_growth) != len(disk_size):
        Err("Wrong disk sizes/growth combination")
    if ((disk_size and options.disk_template == constants.DT_DISKLESS) or
        (not disk_size and options.disk_template != constants.DT_DISKLESS)):
      Err("Wrong disk count/disk template combination")
    self.disk_size = disk_size
    self.disk_growth = disk_growth
    self.disk_count = len(disk_size)
    # Fixed node spread and iallocator-based placement are exclusive.
    if options.nodes and options.iallocator:
      Err("Give either the nodes option or the iallocator option, not both")
    self.instances = args
      constants.BE_MEMORY: options.mem_size,
      constants.BE_VCPUS: 1,
    # Global socket timeout used by the HTTP instance-liveness checks.
    socket.setdefaulttimeout(options.net_timeout)
    """Read the cluster state from the config."""
    # NOTE(review): the method's "def" header, the if/else that guards the
    # "names" assignment, the "try:" opener and the emptiness check after
    # the OS query are in source lines not visible in this chunk.
      names = self.opts.nodes.split(",")
      op = opcodes.OpQueryNodes(output_fields=["name", "offline"], names=names)
      result = self.ExecOp(op)
    except errors.GenericError, err:
      err_code, msg = cli.FormatError(err)
      Err(msg, exit_code=err_code)
    # Keep only online nodes (the second queried field is the offline flag).
    self.nodes = [data[0] for data in result if not data[1]]
    result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"],
      Err("Can't get the OS list")
    # filter non-valid OS-es
    os_set = [val[0] for val in result if val[1]]
    if self.opts.os not in os_set:
      Err("OS '%s' not found" % self.opts.os)
  def CreateInstances(self):
    """Create the given instances.

    """
    # Round-robin primary/secondary node pairs over the node list; each
    # instance gets consecutive nodes as (pnode, snode).
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
    Log("Creating instances")
    for pnode, snode, instance in mytor:
      Log("instance %s" % instance, indent=1)
      if self.opts.iallocator:
        msg = "with iallocator %s" % self.opts.iallocator
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        msg = "on %s" % pnode
        msg = "on %s, %s" % (pnode, snode)
      # NOTE(review): several keyword arguments of OpCreateInstance, the
      # job-set accumulation for the parallel case and the serial ExecOp
      # call are in source lines not visible in this chunk.
      op = opcodes.OpCreateInstance(instance_name=instance,
                                    disks = [ {"size": size}
                                              for size in self.disk_size],
                                    disk_template=self.opts.disk_template,
                                    mode=constants.INSTANCE_CREATE,
                                    os_type=self.opts.os,
                                    file_storage_dir=None,
                                    iallocator=self.opts.iallocator,
      if self.opts.parallel:
        # FIXME: here we should not append to to_rem unconditionally,
        # but only when the job is successful
        self.to_rem.append(instance)
        self.to_rem.append(instance)
    if self.opts.parallel:
      self.ExecJobSet(jobset)
    # Verify every created instance actually came up.
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    """Grow both the os and the swap disks by the requested amount, if any."""
    # NOTE(review): the method's "def" header, the guard skipping zero
    # growth and the ExecOp call are in source lines not visible here.
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      for idx, growth in enumerate(self.disk_growth):
          op = opcodes.OpGrowDisk(instance_name=instance, disk=idx,
                                  amount=growth, wait_for_sync=True)
          Log("increase disk/%s by %s MB" % (idx, growth), indent=2)
  def ReplaceDisks1D8(self):
    """Replace disks on primary and secondary for drbd8."""
    Log("Replacing disks on the same nodes")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      # Replace on the secondary first, then on the primary.
      for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
        # NOTE(review): the "mode=mode" keyword argument and the ExecOp call
        # are in source lines not visible in this chunk.
        op = opcodes.OpReplaceDisks(instance_name=instance,
                                    disks=[i for i in range(self.disk_count)])
        Log("run %s" % mode, indent=2)
  def ReplaceDisks2(self):
    """Replace secondary node."""
    Log("Changing the secondary node")
    mode = constants.REPLACE_DISK_CHG
    # Target node is two positions ahead in the node cycle.
    mytor = izip(islice(cycle(self.nodes), 2, None),
    for tnode, instance in mytor:
      Log("instance %s" % instance, indent=1)
      if self.opts.iallocator:
        msg = "with iallocator %s" % self.opts.iallocator
      # NOTE(review): the else branch setting "msg" from tnode, further
      # OpReplaceDisks keyword arguments and the ExecOp call are in source
      # lines not visible in this chunk.
      op = opcodes.OpReplaceDisks(instance_name=instance,
                                  iallocator=self.opts.iallocator,
                                  disks=[i for i in range(self.disk_count)])
      Log("run %s %s" % (mode, msg), indent=2)
    """Failover the instances."""
    # NOTE(review): the method's "def" header and the ExecOp call for the
    # failover opcode are in source lines not visible in this chunk.
    Log("Failing over instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op = opcodes.OpFailoverInstance(instance_name=instance,
                                      ignore_consistency=False)
    # Verify instances after the failovers.
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  def ImportExport(self):
    """Export the instance, delete it, and import it back.

    """
    Log("Exporting and re-importing instances")
    # pnode/snode are the import placement, enode the export target node.
    mytor = izip(cycle(self.nodes),
                 islice(cycle(self.nodes), 1, None),
                 islice(cycle(self.nodes), 2, None),
    for pnode, snode, enode, instance in mytor:
      Log("instance %s" % instance, indent=1)
      if self.opts.iallocator:
        import_log_msg = ("import from %s"
                          " with iallocator %s" %
                          (enode, self.opts.iallocator))
      elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
        import_log_msg = ("import from %s to %s" %
        import_log_msg = ("import from %s to %s, %s" %
                          (enode, pnode, snode))
      # NOTE(review): several opcode keyword arguments and the ExecOp calls
      # that actually run exp_op/rem_op/imp_op/erem_op are in source lines
      # not visible in this chunk.
      exp_op = opcodes.OpExportInstance(instance_name=instance,
      rem_op = opcodes.OpRemoveInstance(instance_name=instance,
                                        ignore_failures=True)
      nam_op = opcodes.OpQueryInstances(output_fields=["name"],
      # Resolve the instance's fully-qualified name to find its export dir.
      full_name = self.ExecOp(nam_op)[0][0]
      imp_dir = os.path.join(constants.EXPORT_DIR, full_name)
      imp_op = opcodes.OpCreateInstance(instance_name=instance,
                                        disks = [ {"size": size}
                                                  for size in self.disk_size],
                                        disk_template=self.opts.disk_template,
                                        mode=constants.INSTANCE_IMPORT,
                                        file_storage_dir=None,
                                        iallocator=self.opts.iallocator,
      erem_op = opcodes.OpRemoveExport(instance_name=instance)
      Log("export to node %s" % enode, indent=2)
      Log("remove instance", indent=2)
      # Instance is gone while re-importing, so take it off the removal list
      # and re-add it once the import has been issued.
      self.to_rem.remove(instance)
      Log(import_log_msg, indent=2)
      Log("remove export", indent=2)
      self.to_rem.append(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  def StopInstance(self, instance):
    """Stop given instance."""
    op = opcodes.OpShutdownInstance(instance_name=instance)
    Log("shutdown", indent=2)
    # NOTE(review): the ExecOp call running this opcode is in a source line
    # not visible in this chunk.
  def StartInstance(self, instance):
    """Start given instance."""
    op = opcodes.OpStartupInstance(instance_name=instance, force=False)
    Log("startup", indent=2)
    # NOTE(review): the ExecOp call running this opcode is in a source line
    # not visible in this chunk.
  def RenameInstance(self, instance, instance_new):
    """Rename instance."""
    op = opcodes.OpRenameInstance(instance_name=instance,
                                  new_name=instance_new)
    Log("rename to %s" % instance_new, indent=2)
    # NOTE(review): the ExecOp call running this opcode is in a source line
    # not visible in this chunk.
    """Stop/start the instances."""
    # NOTE(review): the method's "def" header is in a source line not
    # visible in this chunk.
    Log("Stopping and starting instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      self.StopInstance(instance)
      self.StartInstance(instance)
    # Verify everything came back up.
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    """Remove the instances."""
    # NOTE(review): the method's "def" header and the ExecOp call for the
    # removal opcode are in source lines not visible in this chunk.
    Log("Removing instances")
    # Only instances recorded in to_rem are removed.
    for instance in self.to_rem:
      Log("instance %s" % instance, indent=1)
      op = opcodes.OpRemoveInstance(instance_name=instance,
                                    ignore_failures=True)
    """Rename the instances."""
    # NOTE(review): the method's "def" header is in a source line not
    # visible in this chunk.
    Log("Renaming instances")
    # One spare name is cycled through: instance -> rename -> instance.
    rename = self.opts.rename
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      self.StopInstance(instance)
      self.RenameInstance(instance, rename)
      self.StartInstance(rename)
      self._CheckInstanceAlive(rename)
      self.StopInstance(rename)
      self.RenameInstance(rename, instance)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    """Reinstall the instances."""
    # NOTE(review): the method's "def" header and the ExecOp calls that run
    # the two reinstall opcodes are in source lines not visible here.
    Log("Reinstalling instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      self.StopInstance(instance)
      # First reinstall keeping the current OS, then one forcing the OS.
      op = opcodes.OpReinstallInstance(instance_name=instance)
      Log("reinstall without passing the OS", indent=2)
      op = opcodes.OpReinstallInstance(instance_name=instance,
                                       os_type=self.opts.os)
      Log("reinstall specifying the OS", indent=2)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
    """Reboot the instances."""
    # NOTE(review): the method's "def" header and the ExecOp call running
    # the reboot opcode are in source lines not visible in this chunk.
    Log("Rebooting instances")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      # Exercise every supported reboot type on each instance.
      for reboot_type in constants.REBOOT_TYPES:
        op = opcodes.OpRebootInstance(instance_name=instance,
                                      reboot_type=reboot_type,
                                      ignore_secondaries=False)
        Log("reboot with type '%s'" % reboot_type, indent=2)
        self._CheckInstanceAlive(instance)
  def ActivateDisks(self):
    """Activate and deactivate disks of the instances."""
    # NOTE(review): the ExecOp calls running op_act while the instance is
    # online and again while offline are in source lines not visible here.
    Log("Activating/deactivating disks")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op_act = opcodes.OpActivateInstanceDisks(instance_name=instance)
      op_deact = opcodes.OpDeactivateInstanceDisks(instance_name=instance)
      Log("activate disks when online", indent=2)
      self.StopInstance(instance)
      Log("activate disks when offline", indent=2)
      Log("deactivate disks (when offline)", indent=2)
      self.ExecOp(op_deact)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  def AddRemoveDisks(self):
    """Add and remove an extra disk for the instances."""
    # NOTE(review): the ExecOp calls running op_add and op_rem are in source
    # lines not visible in this chunk.
    Log("Adding and removing disks")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      # New disk reuses the size of the first configured disk.
      op_add = opcodes.OpSetInstanceParams(\
        instance_name=instance,
        disks=[(constants.DDM_ADD, {"size": self.disk_size[0]})])
      op_rem = opcodes.OpSetInstanceParams(\
        instance_name=instance, disks=[(constants.DDM_REMOVE, {})])
      Log("adding a disk", indent=2)
      # Disk removal requires the instance to be stopped.
      self.StopInstance(instance)
      Log("removing last disk", indent=2)
      self.StartInstance(instance)
    for instance in self.instances:
      self._CheckInstanceAlive(instance)
  def AddRemoveNICs(self):
    """Add and remove an extra NIC for the instances."""
    # NOTE(review): the ExecOp calls running op_add and op_rem are in source
    # lines not visible in this chunk.
    Log("Adding and removing NICs")
    for instance in self.instances:
      Log("instance %s" % instance, indent=1)
      op_add = opcodes.OpSetInstanceParams(\
        instance_name=instance, nics=[(constants.DDM_ADD, {})])
      op_rem = opcodes.OpSetInstanceParams(\
        instance_name=instance, nics=[(constants.DDM_REMOVE, {})])
      Log("adding a NIC", indent=2)
      Log("removing last NIC", indent=2)
  def _CheckInstanceAlive(self, instance):
    """Check if an instance is alive by doing http checks.

    This will try to retrieve the url on the instance /hostname.txt
    and check that it contains the hostname of the instance. In case
    we get ECONNREFUSED, we retry up to the net timeout seconds, for
    any other error we abort.

    Raises InstanceDown if the check ultimately fails.
    """
    # Checking is optional; without -C/--http-check this is a no-op.
    if not self.opts.http_check:
    # NOTE(review): the early return above, the "try:" opener, the retry
    # sleep/continue logic and the final urlopen attempt after the loop are
    # in source lines not visible in this chunk.
    for retries in range(self.opts.net_timeout):
        url = urllib2.urlopen("http://%s/hostname.txt" % instance)
      except urllib2.URLError, err:
        # Connection refused: instance may still be booting, keep retrying.
        if err.args[0][0] == errno.ECONNREFUSED:
      except urllib2.URLError, err:
        raise InstanceDown(instance, str(err))
    # The served hostname must match the instance name exactly.
    hostname = url.read().strip()
    if hostname != instance:
      raise InstanceDown(instance, ("Hostname mismatch, expected %s, got %s" %
                                    (instance, hostname)))
  def BurninCluster(self):
    """Test a cluster intensively.

    This will create instances and then start/stop/failover them.
    It is safe for existing instances but could impact performance.
    """
    Log("Testing global parameters")
    # NOTE(review): the local alias for self.opts, the full single-node
    # template tuple, the "try:" wrapper around the test sequence, the
    # bodies of several of the feature branches below and the error-path
    # introduction before the buffer dump are in source lines not visible
    # in this chunk.
    if (len(self.nodes) == 1 and
        opts.disk_template not in (constants.DT_DISKLESS, constants.DT_PLAIN,
      Err("When one node is available/selected the disk template must"
          " be 'diskless', 'file' or 'plain'")
      self.CreateInstances()
      # Each feature test is gated on its command-line flag and, where
      # relevant, on a compatible disk template / node count.
      if opts.do_replace1 and opts.disk_template in constants.DTS_NET_MIRROR:
        self.ReplaceDisks1D8()
      if (opts.do_replace2 and len(self.nodes) > 2 and
          opts.disk_template in constants.DTS_NET_MIRROR) :
      if opts.disk_template != constants.DT_DISKLESS:
      if opts.do_failover and opts.disk_template in constants.DTS_NET_MIRROR:
      if (opts.do_importexport and
          opts.disk_template not in (constants.DT_DISKLESS,
      if opts.do_reinstall:
      if opts.do_addremove_disks:
        self.AddRemoveDisks()
      if opts.do_addremove_nics:
      if opts.do_activate_disks:
      if opts.do_startstop:
      # On failure, dump the captured opcode feedback for diagnosis.
      Log("Error detected: opcode buffer follows:\n\n")
      Log(self.GetFeedbackBuf())
  # NOTE(review): fragment of main() — the surrounding definition, the
  # burner construction above this line and the guarded call below the
  # __main__ check are in source lines not visible in this chunk.
  return burner.BurninCluster()

if __name__ == "__main__":