Two small improvements to burnin
[ganeti-local] / tools / burnin
1 #!/usr/bin/python
2 #
3
4 # Copyright (C) 2006, 2007 Google Inc.
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 # General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 # 02110-1301, USA.
20
21
22 """Burnin program
23
24 """
25
26 import os
27 import sys
28 import optparse
29 from itertools import izip, islice, cycle
30 from cStringIO import StringIO
31
32 from ganeti import opcodes
33 from ganeti import mcpu
34 from ganeti import constants
35 from ganeti import cli
36 from ganeti import logger
37 from ganeti import errors
38 from ganeti import utils
39
40
USAGE = "\tburnin -o OS_NAME [options...] instance_name ..."


def Usage():
  """Write the usage summary to stderr and exit with status 2."""
  # One write producing the same two lines: the "Usage:" header and the
  # synopsis held in USAGE
  sys.stderr.write("Usage:\n%s\n" % USAGE)
  sys.exit(2)
50
51
def Log(msg):
  """Print msg on a line of its own on stdout and flush immediately.

  The flush makes each message show up right away even when stdout is
  buffered.

  """
  stream = sys.stdout
  print >> stream, msg
  stream.flush()
58
59
class Burner(object):
  """Runs a burnin: creates instances, exercises them and removes them.

  The work is driven by BurninCluster(); every opcode goes through
  ExecOp() so that the feedback of the last opcode can be shown when a
  step fails.

  """

  def __init__(self):
    """Set up logging and the processor, parse options, read the state.

    Construction can terminate the program: ParseOptions() and
    GetState() call sys.exit() (directly or through Usage()) on errors.

    """
    logger.SetupLogging(debug=False, program="ganeti/burnin")
    # feedback messages of the opcode currently being executed
    self._feed_buf = StringIO()
    self.proc = mcpu.Processor(feedback=self.Feedback)
    # names of the nodes to use, filled in by GetState()
    self.nodes = []
    # instance names from the command line, filled in by ParseOptions()
    self.instances = []
    # instances that currently exist and have to be removed at the end
    self.to_rem = []
    # parsed command line options, set by ParseOptions()
    self.opts = None
    self.ParseOptions()
    self.GetState()

  def ClearFeedbackBuf(self):
    """Clear the feedback buffer."""
    # Rewind explicitly instead of relying on truncate(0) to do it: not
    # every StringIO implementation moves the write position when
    # truncating, and a stale position would corrupt the next message.
    self._feed_buf.seek(0)
    self._feed_buf.truncate()

  def GetFeedbackBuf(self):
    """Return the contents of the buffer."""
    return self._feed_buf.getvalue()

  def Feedback(self, msg):
    """Accumulate feedback in our buffer.

    The message is also printed through Log() when the verbose option
    is set.

    """
    self._feed_buf.write(msg)
    self._feed_buf.write("\n")
    # self.opts stays None until ParseOptions() has completed
    if self.opts is not None and self.opts.verbose:
      Log(msg)

  def ExecOp(self, op):
    """Execute an opcode and manage the exec buffer.

    The feedback buffer is emptied first, so afterwards it only holds
    the messages of this opcode. Returns the result of the opcode.

    """
    self.ClearFeedbackBuf()
    return self.proc.ExecOpCode(op)

  def ParseOptions(self):
    """Parses the command line options.

    In case of command line errors, it will show the usage and exit the
    program. On success the options are stored in self.opts and the
    positional arguments (the instance names) in self.instances.

    """

    parser = optparse.OptionParser(usage="\n%s" % USAGE,
                                   version="%%prog (ganeti) %s" %
                                   constants.RELEASE_VERSION,
                                   option_class=cli.CliOption)

    parser.add_option("-o", "--os", dest="os", default=None,
                      help="OS to use during burnin",
                      metavar="<OS>")
    parser.add_option("--os-size", dest="os_size", help="Disk size",
                      default=4 * 1024, type="unit", metavar="<size>")
    parser.add_option("--swap-size", dest="swap_size", help="Swap size",
                      default=4 * 1024, type="unit", metavar="<size>")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print command execution messages to stdout")
    parser.add_option("--no-replace1", dest="do_replace1",
                      help="Skip disk replacement with the same secondary",
                      action="store_false", default=True)
    parser.add_option("--no-replace2", dest="do_replace2",
                      help="Skip disk replacement with a different secondary",
                      action="store_false", default=True)
    parser.add_option("--no-failover", dest="do_failover",
                      help="Skip instance failovers", action="store_false",
                      default=True)
    parser.add_option("--no-importexport", dest="do_importexport",
                      help="Skip instance export/import", action="store_false",
                      default=True)
    parser.add_option("--no-startstop", dest="do_startstop",
                      help="Skip instance stop/start", action="store_false",
                      default=True)
    # the help text lists every accepted choice, not only the network
    # mirrored templates
    parser.add_option("-t", "--disk-template", dest="disk_template",
                      choices=("diskless", "plain", "remote_raid1", "drbd"),
                      default="remote_raid1",
                      help="Disk template (diskless, plain, remote_raid1"
                      " or drbd) [remote_raid1]")
    parser.add_option("-n", "--nodes", dest="nodes", default="",
                      help="Comma separated list of nodes to perform"
                      " the burnin on (defaults to all nodes)")

    options, args = parser.parse_args()
    # at least one instance name and the OS are mandatory
    if len(args) < 1 or options.os is None:
      Usage()

    # second check, against the constants the rest of the code compares
    # the template with
    supported_disk_templates = (constants.DT_DISKLESS, constants.DT_PLAIN,
                                constants.DT_REMOTE_RAID1,
                                constants.DT_DRBD8)
    if options.disk_template not in supported_disk_templates:
      Log("Unknown disk template '%s'" % options.disk_template)
      sys.exit(1)

    self.opts = options
    self.instances = args

  def GetState(self):
    """Read the cluster state from the config.

    Fills self.nodes with the names returned by the node query and
    checks that the requested OS is reported for every node in the OS
    diagnose result. Exits the program if the node query fails, no
    node is returned, the OS list is empty or the OS is not found.

    """
    if self.opts.nodes:
      names = self.opts.nodes.split(",")
    else:
      names = []
    try:
      op = opcodes.OpQueryNodes(output_fields=["name"], names=names)
      result = self.ExecOp(op)
    except errors.GenericError:
      # sys.exc_info() instead of "except X, err" keeps this valid
      # syntax on every Python version
      err = sys.exc_info()[1]
      err_code, msg = cli.FormatError(err)
      Log(msg)
      sys.exit(err_code)
    self.nodes = [data[0] for data in result]

    # Without nodes no instance would be created, yet later steps would
    # still operate on the instance names given by the user.
    if not self.nodes:
      Log("No nodes available for the burnin")
      sys.exit(1)

    result = self.ExecOp(opcodes.OpDiagnoseOS())

    if not result:
      Log("Can't get the OS list")
      sys.exit(1)

    # Per node, drop the entries that evaluate false (presumably the
    # non-valid OS-es) and intersect the names over all nodes.
    os_set = None
    for node_name in result:
      node_oses = set([os_inst.name
                       for os_inst in result[node_name] if os_inst])
      if os_set is None:
        os_set = node_oses
      else:
        os_set &= node_oses

    if self.opts.os not in os_set:
      Log("OS '%s' not found" % self.opts.os)
      sys.exit(1)

  def _NodesFrom(self, offset):
    """Return an endless iterator over self.nodes, skipping offset items.

    The iterator is empty when self.nodes is empty.

    """
    return islice(cycle(self.nodes), offset, None)

  def _BuildCreateOp(self, instance, pnode, snode, mode, **extra):
    """Build an OpCreateInstance with the parameters all burnin uses share.

    Args:
      instance: name of the instance
      pnode: primary node
      snode: secondary node
      mode: creation mode, passed through as the mode parameter
      extra: further, mode-specific opcode parameters

    """
    return opcodes.OpCreateInstance(instance_name=instance,
                                    mem_size=128,
                                    disk_size=self.opts.os_size,
                                    swap_size=self.opts.swap_size,
                                    disk_template=self.opts.disk_template,
                                    mode=mode,
                                    pnode=pnode,
                                    snode=snode,
                                    vcpus=1,
                                    start=True,
                                    ip_check=True,
                                    wait_for_sync=True,
                                    mac="auto",
                                    **extra)

  def CreateInstances(self):
    """Create the given instances.

    Instance number i gets node i as primary and node i+1 as secondary,
    wrapping around the node list. Every created instance is recorded
    in self.to_rem.

    """
    self.to_rem = []
    mytor = izip(self._NodesFrom(0),
                 self._NodesFrom(1),
                 self.instances)
    for pnode, snode, instance in mytor:
      op = self._BuildCreateOp(instance, pnode, snode,
                               constants.INSTANCE_CREATE,
                               os_type=self.opts.os,
                               kernel_path=None,
                               initrd_path=None,
                               hvm_boot_order=None)
      Log("- Add instance %s on nodes %s/%s" % (instance, pnode, snode))
      self.ExecOp(op)
      # only instances whose creation succeeded need to be removed
      self.to_rem.append(instance)

  def ReplaceDisks1R1(self):
    """Replace disks with the same secondary for rr1."""
    # replace all, both disks
    for instance in self.instances:
      op = opcodes.OpReplaceDisks(instance_name=instance,
                                  remote_node=None,
                                  mode=constants.REPLACE_DISK_ALL,
                                  disks=["sda", "sdb"])

      Log("- Replace disks for instance %s" % (instance))
      self.ExecOp(op)

  def ReplaceDisks1D8(self):
    """Replace disks on primary and secondary for drbd8."""
    for instance in self.instances:
      # first the secondary, then the primary
      for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
        op = opcodes.OpReplaceDisks(instance_name=instance,
                                    mode=mode,
                                    disks=["sda", "sdb"])
        Log("- Replace disks (%s) for instance %s" % (mode, instance))
        self.ExecOp(op)

  def ReplaceDisks2(self):
    """Replace secondary node.

    The new node of instance number i is node i+2 (wrapping around),
    i.e. the one following the secondary chosen by CreateInstances().

    """
    if self.opts.disk_template == constants.DT_REMOTE_RAID1:
      mode = constants.REPLACE_DISK_ALL
    else:
      mode = constants.REPLACE_DISK_SEC

    mytor = izip(self._NodesFrom(2),
                 self.instances)
    for tnode, instance in mytor:
      op = opcodes.OpReplaceDisks(instance_name=instance,
                                  mode=mode,
                                  remote_node=tnode,
                                  disks=["sda", "sdb"])
      Log("- Replace secondary (%s) for instance %s" % (mode, instance))
      self.ExecOp(op)

  def Failover(self):
    """Failover the instances."""

    for instance in self.instances:
      op = opcodes.OpFailoverInstance(instance_name=instance,
                                      ignore_consistency=False)

      Log("- Failover instance %s" % (instance))
      self.ExecOp(op)

  def ImportExport(self):
    """Export the instance, delete it, and import it back.

    Instance number i is exported to node i+2 (wrapping around) and
    imported again with node i as primary and node i+1 as secondary.
    self.to_rem is kept in sync with the instances that exist.

    """

    mytor = izip(self._NodesFrom(0),
                 self._NodesFrom(1),
                 self._NodesFrom(2),
                 self.instances)

    for pnode, snode, enode, instance in mytor:
      exp_op = opcodes.OpExportInstance(instance_name=instance,
                                        target_node=enode,
                                        shutdown=True)
      rem_op = opcodes.OpRemoveInstance(instance_name=instance)
      nam_op = opcodes.OpQueryInstances(output_fields=["name"],
                                        names=[instance])
      # the import path is built from the name the cluster reports,
      # which can differ from the name given on the command line
      full_name = self.ExecOp(nam_op)[0][0]
      imp_dir = os.path.join(constants.EXPORT_DIR, full_name)
      imp_op = self._BuildCreateOp(instance, pnode, snode,
                                   constants.INSTANCE_IMPORT,
                                   src_node=enode,
                                   src_path=imp_dir)

      Log("- Export instance %s to node %s" % (instance, enode))
      self.ExecOp(exp_op)
      Log("- Remove instance %s" % (instance))
      self.ExecOp(rem_op)
      # the instance is gone until the import below has succeeded
      self.to_rem.remove(instance)
      Log("- Import instance %s from node %s to node %s" %
          (instance, enode, pnode))
      self.ExecOp(imp_op)
      self.to_rem.append(instance)

  def StopStart(self):
    """Stop/start the instances."""
    for instance in self.instances:
      op = opcodes.OpShutdownInstance(instance_name=instance)
      Log("- Shutdown instance %s" % instance)
      self.ExecOp(op)
      op = opcodes.OpStartupInstance(instance_name=instance, force=False)
      Log("- Start instance %s" % instance)
      self.ExecOp(op)

  def Remove(self):
    """Remove the instances recorded in self.to_rem."""
    for instance in self.to_rem:
      op = opcodes.OpRemoveInstance(instance_name=instance)
      Log("- Remove instance %s" % instance)
      self.ExecOp(op)

  def BurninCluster(self):
    """Test a cluster intensively.

    This will create instances and then start/stop/failover them.
    It is safe for existing instances but could impact performance.

    The instances recorded in self.to_rem are removed at the end even
    if a step fails; in that case the feedback of the failing opcode is
    printed first and the exception propagates to the caller.

    Returns:
      0 if all steps and the final removal completed

    """

    opts = self.opts

    Log("- Testing global parameters")

    if (len(self.nodes) == 1 and
        opts.disk_template not in (constants.DT_DISKLESS, constants.DT_PLAIN)):
      Log("When one node is available/selected the disk template must"
               " be 'plain' or 'diskless'")
      sys.exit(1)

    # stays True unless every step below completes
    has_err = True
    try:
      self.CreateInstances()
      if opts.do_replace1 and opts.disk_template in constants.DTS_NET_MIRROR:
        if opts.disk_template == constants.DT_REMOTE_RAID1:
          self.ReplaceDisks1R1()
        elif opts.disk_template == constants.DT_DRBD8:
          self.ReplaceDisks1D8()
      # moving to another secondary needs a third node
      if (opts.do_replace2 and len(self.nodes) > 2 and
          opts.disk_template in constants.DTS_NET_MIRROR):
        self.ReplaceDisks2()

      if opts.do_failover and opts.disk_template in constants.DTS_NET_MIRROR:
        self.Failover()

      if opts.do_importexport:
        self.ImportExport()

      if opts.do_startstop:
        self.StopStart()

      has_err = False
    finally:
      if has_err:
        Log("Error detected: opcode buffer follows:\n\n")
        Log(self.GetFeedbackBuf())
        Log("\n\n")
      self.Remove()

    return 0
381
382
def main():
  """Run the burnin while holding the 'cmd' lock.

  The Burner is built first (which parses the options and reads the
  cluster state), then the lock is taken. Returns 1 if the lock cannot
  be acquired, otherwise the result of Burner.BurninCluster(). The
  lock is released and cleaned up even if the burnin raises.

  """
  burnin = Burner()

  try:
    utils.Lock('cmd', max_retries=15, debug=True)
  except errors.LockError:
    lock_err = sys.exc_info()[1]
    logger.ToStderr(str(lock_err))
    return 1

  try:
    return burnin.BurninCluster()
  finally:
    utils.Unlock('cmd')
    utils.LockCleanup()
398
399
if __name__ == "__main__":
  # Hand main()'s return value to the shell as exit status; without
  # this the script exits with 0 even when main() reports a failure
  # (e.g. returns 1 because the lock could not be acquired).
  sys.exit(main())