4 # Copyright (C) 2006, 2007 Google Inc.
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 from itertools import izip, islice, cycle
34 from cStringIO import StringIO
36 from ganeti import opcodes
37 from ganeti import mcpu
38 from ganeti import constants
39 from ganeti import cli
40 from ganeti import errors
41 from ganeti import utils
44 USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
class InstanceDown(Exception):
  """Raised when an instance that should be running is found to be down."""
52 """Shows program usage information and exits the program."""
54 print >> sys.stderr, "Usage:"
55 print >> sys.stderr, USAGE
60 """Simple function that prints out its argument.
72 utils.SetupLogging(constants.LOG_BURNIN, debug=False, stderr_logging=True)
73 self._feed_buf = StringIO()
78 self.cl = cli.GetClient()
82 def ClearFeedbackBuf(self):
83 """Clear the feedback buffer."""
84 self._feed_buf.truncate(0)
86 def GetFeedbackBuf(self):
87 """Return the contents of the buffer."""
88 return self._feed_buf.getvalue()
# Collect a single opcode log message into the in-memory feedback buffer so
# it can be dumped later if burnin fails.  msg[0] is presumably a timestamp
# accepted by utils.MergeTime -- TODO confirm against cli.PollJob callers.
90 def Feedback(self, msg):
91 """Accumulate feedback in our buffer."""
# NOTE(review): the write() call below is truncated in this chunk; its
# argument list continues on lines not visible here.
92 self._feed_buf.write("%s %s\n" % (time.ctime(utils.MergeTime(msg[0])),
98 """Execute an opcode and manage the exec buffer."""
99 self.ClearFeedbackBuf()
100 return cli.SubmitOpCode(op, feedback_fn=self.Feedback, cl=self.cl)
102 def ExecJobSet(self, jobs):
103 """Execute a set of jobs and return once all are done.
105 The method will return the list of results, if all jobs are
106 successful. Otherwise, OpExecError will be raised from within
# NOTE(review): this chunk is a sampled listing -- the docstring close, the
# results-list initialisation and the loop header over job_ids fall on lines
# not shown here.
# Submit every job first, then poll them one at a time; PollJob routes each
# job's log messages through self.Feedback into the shared feedback buffer.
110 self.ClearFeedbackBuf()
111 job_ids = [cli.SendJob(job, cl=self.cl) for job in jobs]
112 Log("- Submitted job IDs %s" % ", ".join(job_ids))
115 Log("- Waiting for job %s" % jid)
116 results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback))
120 def ParseOptions(self):
121 """Parses the command line options.
123 In case of command line errors, it will show the usage and exit the
128 parser = optparse.OptionParser(usage="\n%s" % USAGE,
129 version="%%prog (ganeti) %s" %
130 constants.RELEASE_VERSION,
131 option_class=cli.CliOption)
133 parser.add_option("-o", "--os", dest="os", default=None,
134 help="OS to use during burnin",
136 parser.add_option("--disk-size", dest="disk_size",
137 help="Disk size (determines disk count)",
138 default="128m", type="string", metavar="<size,size,...>")
139 parser.add_option("--disk-growth", dest="disk_growth", help="Disk growth",
140 default="128m", type="string", metavar="<size,size,...>")
141 parser.add_option("--mem-size", dest="mem_size", help="Memory size",
142 default=128, type="unit", metavar="<size>")
143 parser.add_option("-v", "--verbose",
144 action="store_true", dest="verbose", default=False,
145 help="print command execution messages to stdout")
146 parser.add_option("--no-replace1", dest="do_replace1",
147 help="Skip disk replacement with the same secondary",
148 action="store_false", default=True)
149 parser.add_option("--no-replace2", dest="do_replace2",
150 help="Skip disk replacement with a different secondary",
151 action="store_false", default=True)
152 parser.add_option("--no-failover", dest="do_failover",
153 help="Skip instance failovers", action="store_false",
155 parser.add_option("--no-importexport", dest="do_importexport",
156 help="Skip instance export/import", action="store_false",
158 parser.add_option("--no-startstop", dest="do_startstop",
159 help="Skip instance stop/start", action="store_false",
161 parser.add_option("--no-reinstall", dest="do_reinstall",
162 help="Skip instance reinstall", action="store_false",
164 parser.add_option("--no-reboot", dest="do_reboot",
165 help="Skip instance reboot", action="store_false",
167 parser.add_option("--no-activate-disks", dest="do_activate_disks",
168 help="Skip disk activation/deactivation",
169 action="store_false", default=True)
170 parser.add_option("--no-add-disks", dest="do_addremove_disks",
171 help="Skip disk addition/removal",
172 action="store_false", default=True)
173 parser.add_option("--no-add-nics", dest="do_addremove_nics",
174 help="Skip NIC addition/removal",
175 action="store_false", default=True)
176 parser.add_option("--no-nics", dest="nics",
177 help="No network interfaces", action="store_const",
178 const=[], default=[{}])
179 parser.add_option("--rename", dest="rename", default=None,
180 help="Give one unused instance name which is taken"
181 " to start the renaming sequence",
182 metavar="<instance_name>")
183 parser.add_option("-t", "--disk-template", dest="disk_template",
184 choices=("diskless", "file", "plain", "drbd"),
186 help="Disk template (diskless, file, plain or drbd)"
188 parser.add_option("-n", "--nodes", dest="nodes", default="",
189 help="Comma separated list of nodes to perform"
190 " the burnin on (defaults to all nodes)")
191 parser.add_option("--iallocator", dest="iallocator",
192 default=None, type="string",
193 help="Perform the allocation using an iallocator"
194 " instead of fixed node spread (node restrictions no"
195 " longer apply, therefore -n/--nodes must not be used")
196 parser.add_option("-p", "--parallel", default=False, action="store_true",
198 help="Enable parallelization of some operations in"
199 " order to speed burnin or to test granular locking")
200 parser.add_option("--net-timeout", default=15, type="int",
202 help="The instance check network timeout in seconds"
203 " (defaults to 15 seconds)")
204 parser.add_option("-C", "--http-check", default=False, action="store_true",
206 help="Enable checking of instance status via http,"
207 " looking for /hostname.txt that should contain the"
208 " name of the instance")
211 options, args = parser.parse_args()
212 if len(args) < 1 or options.os is None:
215 supported_disk_templates = (constants.DT_DISKLESS,
219 if options.disk_template not in supported_disk_templates:
220 Log("Unknown disk template '%s'" % options.disk_template)
223 if options.disk_template == constants.DT_DISKLESS:
224 disk_size = disk_growth = []
225 options.do_addremove_disks = False
227 disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
228 disk_growth = [utils.ParseUnit(v)
229 for v in options.disk_growth.split(",")]
230 if len(disk_growth) != len(disk_size):
231 Log("Wrong disk sizes/growth combination")
233 if ((disk_size and options.disk_template == constants.DT_DISKLESS) or
234 (not disk_size and options.disk_template != constants.DT_DISKLESS)):
235 Log("Wrong disk count/disk template combination")
238 self.disk_size = disk_size
239 self.disk_growth = disk_growth
240 self.disk_count = len(disk_size)
242 if options.nodes and options.iallocator:
243 Log("Give either the nodes option or the iallocator option, not both")
247 self.instances = args
249 constants.BE_MEMORY: options.mem_size,
250 constants.BE_VCPUS: 1,
254 socket.setdefaulttimeout(options.net_timeout)
257 """Read the cluster state from the config."""
259 names = self.opts.nodes.split(",")
263 op = opcodes.OpQueryNodes(output_fields=["name", "offline"], names=names)
264 result = self.ExecOp(op)
265 except errors.GenericError, err:
266 err_code, msg = cli.FormatError(err)
269 self.nodes = [data[0] for data in result if not data[1]]
271 result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"],
275 Log("Can't get the OS list")
278 # filter non-valid OS-es
279 os_set = [val[0] for val in result if val[1]]
281 if self.opts.os not in os_set:
282 Log("OS '%s' not found" % self.opts.os)
285 def CreateInstances(self):
286 """Create the given instances.
290 mytor = izip(cycle(self.nodes),
291 islice(cycle(self.nodes), 1, None),
295 for pnode, snode, instance in mytor:
296 if self.opts.iallocator:
298 Log("- Add instance %s (iallocator: %s)" %
299 (instance, self.opts.iallocator))
300 elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
302 Log("- Add instance %s on node %s" % (instance, pnode))
304 Log("- Add instance %s on nodes %s/%s" % (instance, pnode, snode))
306 op = opcodes.OpCreateInstance(instance_name=instance,
307 disks = [ {"size": size}
308 for size in self.disk_size],
309 disk_template=self.opts.disk_template,
311 mode=constants.INSTANCE_CREATE,
312 os_type=self.opts.os,
319 file_storage_dir=None,
320 iallocator=self.opts.iallocator,
325 if self.opts.parallel:
327 # FIXME: here we should not append to to_rem uncoditionally,
328 # but only when the job is successful
329 self.to_rem.append(instance)
332 self.to_rem.append(instance)
333 if self.opts.parallel:
334 self.ExecJobSet(jobset)
336 for instance in self.instances:
337 self._CheckInstanceAlive(instance)
340 """Grow both the os and the swap disks by the requested amount, if any."""
341 for instance in self.instances:
342 for idx, growth in enumerate(self.disk_growth):
344 op = opcodes.OpGrowDisk(instance_name=instance, disk=idx,
345 amount=growth, wait_for_sync=True)
346 Log("- Increase %s's disk/%s by %s MB" % (instance, idx, growth))
# Exercise disk replacement while keeping the node layout: for every burnin
# instance, replace all of its disks first on the secondary and then on the
# primary node (DRBD8 replace-disks modes).
349 def ReplaceDisks1D8(self):
350 """Replace disks on primary and secondary for drbd8."""
351 for instance in self.instances:
352 for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
353 op = opcodes.OpReplaceDisks(instance_name=instance,
# NOTE(review): additional OpReplaceDisks keyword arguments (presumably
# mode=mode) and the submission of `op` are on lines not shown in this chunk.
355 disks=[i for i in range(self.disk_count)])
356 Log("- Replace disks (%s) for instance %s" % (mode, instance))
359 def ReplaceDisks2(self):
360 """Replace secondary node."""
361 mode = constants.REPLACE_DISK_CHG
363 mytor = izip(islice(cycle(self.nodes), 2, None),
365 for tnode, instance in mytor:
366 if self.opts.iallocator:
368 op = opcodes.OpReplaceDisks(instance_name=instance,
371 iallocator=self.opts.iallocator,
372 disks=[i for i in range(self.disk_count)])
373 Log("- Replace secondary (%s) for instance %s" % (mode, instance))
377 """Failover the instances."""
379 for instance in self.instances:
380 op = opcodes.OpFailoverInstance(instance_name=instance,
381 ignore_consistency=False)
383 Log("- Failover instance %s" % (instance))
385 for instance in self.instances:
386 self._CheckInstanceAlive(instance)
388 def ImportExport(self):
389 """Export the instance, delete it, and import it back.
393 mytor = izip(cycle(self.nodes),
394 islice(cycle(self.nodes), 1, None),
395 islice(cycle(self.nodes), 2, None),
398 for pnode, snode, enode, instance in mytor:
400 if self.opts.iallocator:
402 import_log_msg = ("- Import instance %s from node %s"
403 " (iallocator: %s)" %
404 (instance, enode, self.opts.iallocator))
405 elif self.opts.disk_template not in constants.DTS_NET_MIRROR:
407 import_log_msg = ("- Import instance %s from node %s to node %s" %
408 (instance, enode, pnode))
410 import_log_msg = ("- Import instance %s from node %s to nodes %s/%s" %
411 (instance, enode, pnode, snode))
413 exp_op = opcodes.OpExportInstance(instance_name=instance,
416 rem_op = opcodes.OpRemoveInstance(instance_name=instance,
417 ignore_failures=True)
418 nam_op = opcodes.OpQueryInstances(output_fields=["name"],
420 full_name = self.ExecOp(nam_op)[0][0]
421 imp_dir = os.path.join(constants.EXPORT_DIR, full_name)
422 imp_op = opcodes.OpCreateInstance(instance_name=instance,
423 disks = [ {"size": size}
424 for size in self.disk_size],
425 disk_template=self.opts.disk_template,
427 mode=constants.INSTANCE_IMPORT,
435 file_storage_dir=None,
437 iallocator=self.opts.iallocator,
442 erem_op = opcodes.OpRemoveExport(instance_name=instance)
444 Log("- Export instance %s to node %s" % (instance, enode))
446 Log("- Remove instance %s" % (instance))
448 self.to_rem.remove(instance)
451 Log("- Remove export of instance %s" % (instance))
454 self.to_rem.append(instance)
456 for instance in self.instances:
457 self._CheckInstanceAlive(instance)
# Build and log a shutdown opcode for one instance.
459 def StopInstance(self, instance):
460 """Stop given instance."""
461 op = opcodes.OpShutdownInstance(instance_name=instance)
462 Log("- Shutdown instance %s" % instance)
# NOTE(review): the execution of `op` (presumably self.ExecOp(op)) is on a
# line not shown in this chunk.
# Build and log a startup opcode for one instance; force=False means a
# normal (non-forced) start.
465 def StartInstance(self, instance):
466 """Start given instance."""
467 op = opcodes.OpStartupInstance(instance_name=instance, force=False)
468 Log("- Start instance %s" % instance)
# NOTE(review): the execution of `op` (presumably self.ExecOp(op)) is on a
# line not shown in this chunk.
# Build and log a rename opcode taking `instance` to the name `instance_new`.
471 def RenameInstance(self, instance, instance_new):
472 """Rename instance."""
473 op = opcodes.OpRenameInstance(instance_name=instance,
474 new_name=instance_new)
475 Log("- Rename instance %s to %s" % (instance, instance_new))
# NOTE(review): the execution of `op` (presumably self.ExecOp(op)) is on a
# line not shown in this chunk.
479 """Stop/start the instances."""
480 for instance in self.instances:
481 self.StopInstance(instance)
482 self.StartInstance(instance)
484 for instance in self.instances:
485 self._CheckInstanceAlive(instance)
488 """Remove the instances."""
489 for instance in self.to_rem:
490 op = opcodes.OpRemoveInstance(instance_name=instance,
491 ignore_failures=True)
492 Log("- Remove instance %s" % instance)
496 """Rename the instances."""
497 rename = self.opts.rename
498 for instance in self.instances:
499 self.StopInstance(instance)
500 self.RenameInstance(instance, rename)
501 self.StartInstance(rename)
502 self._CheckInstanceAlive(rename)
503 self.StopInstance(rename)
504 self.RenameInstance(rename, instance)
505 self.StartInstance(instance)
507 for instance in self.instances:
508 self._CheckInstanceAlive(instance)
511 """Reinstall the instances."""
512 for instance in self.instances:
513 self.StopInstance(instance)
514 op = opcodes.OpReinstallInstance(instance_name=instance)
515 Log("- Reinstall instance %s without passing the OS" % (instance,))
517 op = opcodes.OpReinstallInstance(instance_name=instance,
518 os_type=self.opts.os)
519 Log("- Reinstall instance %s specifying the OS" % (instance,))
521 self.StartInstance(instance)
522 for instance in self.instances:
523 self._CheckInstanceAlive(instance)
526 """Reinstall the instances."""
527 for instance in self.instances:
528 for reboot_type in constants.REBOOT_TYPES:
529 op = opcodes.OpRebootInstance(instance_name=instance,
530 reboot_type=reboot_type,
531 ignore_secondaries=False)
532 Log("- Reboot instance %s with type '%s'" % (instance, reboot_type))
534 self._CheckInstanceAlive(instance)
536 def ActivateDisks(self):
537 """Activate and deactivate disks of the instances."""
538 for instance in self.instances:
539 op_act = opcodes.OpActivateInstanceDisks(instance_name=instance)
540 op_deact = opcodes.OpDeactivateInstanceDisks(instance_name=instance)
541 Log("- Activate disks of online instance %s" % (instance,))
543 self.StopInstance(instance)
544 Log("- Activate disks of offline instance %s" % (instance,))
546 Log("- Deactivate disks of offline instance %s" % (instance,))
547 self.ExecOp(op_deact)
548 self.StartInstance(instance)
549 for instance in self.instances:
550 self._CheckInstanceAlive(instance)
552 def AddRemoveDisks(self):
553 """Add and remove an extra disk for the instances."""
554 for instance in self.instances:
555 op_add = opcodes.OpSetInstanceParams(\
556 instance_name=instance,
557 disks=[(constants.DDM_ADD, {"size": self.disk_size[0]})])
558 op_rem = opcodes.OpSetInstanceParams(\
559 instance_name=instance, disks=[(constants.DDM_REMOVE, {})])
560 Log("- Adding a disk to instance %s" % (instance,))
562 self.StopInstance(instance)
563 Log("- Removing the last disk of instance %s" % (instance,))
565 self.StartInstance(instance)
566 for instance in self.instances:
567 self._CheckInstanceAlive(instance)
569 def AddRemoveNICs(self):
570 """Add and remove an extra NIC for the instances."""
571 for instance in self.instances:
572 op_add = opcodes.OpSetInstanceParams(\
573 instance_name=instance, nics=[(constants.DDM_ADD, {})])
574 op_rem = opcodes.OpSetInstanceParams(\
575 instance_name=instance, nics=[(constants.DDM_REMOVE, {})])
576 Log("- Adding a NIC to instance %s" % (instance,))
578 Log("- Removing the last NIC of instance %s" % (instance,))
581 def _CheckInstanceAlive(self, instance):
582 """Check if an instance is alive by doing http checks.
584 This will try to retrieve the url on the instance /hostname.txt
585 and check that it contains the hostname of the instance. In case
586 we get ECONNREFUSED, we retry up to the net timeout seconds, for
587 any other error we abort.
590 if not self.opts.http_check:
593 for retries in range(self.opts.net_timeout):
595 url = urllib2.urlopen("http://%s/hostname.txt" % instance)
596 except urllib2.URLError, err:
597 if err.args[0][0] == errno.ECONNREFUSED:
601 except urllib2.URLError, err:
602 raise InstanceDown(instance, str(err))
603 hostname = url.read().strip()
604 if hostname != instance:
605 raise InstanceDown(instance, ("Hostname mismatch, expected %s, got %s" %
606 (instance, hostname)))
608 def BurninCluster(self):
609 """Test a cluster intensively.
611 This will create instances and then start/stop/failover them.
612 It is safe for existing instances but could impact performance.
618 Log("- Testing global parameters")
620 if (len(self.nodes) == 1 and
621 opts.disk_template not in (constants.DT_DISKLESS, constants.DT_PLAIN,
623 Log("When one node is available/selected the disk template must"
624 " be 'diskless', 'file' or 'plain'")
629 self.CreateInstances()
630 if opts.do_replace1 and opts.disk_template in constants.DTS_NET_MIRROR:
631 self.ReplaceDisks1D8()
632 if (opts.do_replace2 and len(self.nodes) > 2 and
633 opts.disk_template in constants.DTS_NET_MIRROR) :
636 if opts.disk_template != constants.DT_DISKLESS:
639 if opts.do_failover and opts.disk_template in constants.DTS_NET_MIRROR:
642 if (opts.do_importexport and
643 opts.disk_template not in (constants.DT_DISKLESS,
647 if opts.do_reinstall:
653 if opts.do_addremove_disks:
654 self.AddRemoveDisks()
656 if opts.do_addremove_nics:
659 if opts.do_activate_disks:
662 if opts.do_startstop:
671 Log("Error detected: opcode buffer follows:\n\n")
672 Log(self.GetFeedbackBuf())
683 return burner.BurninCluster()
686 if __name__ == "__main__":