Revision e8ae0c20

b/qa/ganeti-qa.py
228 228
        if qa_config.TestEnabled('node-volumes'):
229 229
          RunTest(qa_node.TestNodeVolumes)
230 230

  
231
        if qa_config.TestEnabled('instance-disk-failure'):
232
          RunTest(qa_instance.TestInstanceMasterDiskFailure,
233
                  instance, node, node2)
234
          RunTest(qa_instance.TestInstanceSecondaryDiskFailure,
235
                  instance, node, node2)
236

  
231 237
        RunTest(qa_instance.TestInstanceRemove, instance)
232 238
        del instance
233 239
      finally:
b/qa/qa-sample.yaml
57 57
  instance-reinstall: True
58 58
  instance-shutdown: True
59 59

  
60
  # Make sure the disk(s) required for Dom0 are not part of the volume group
61
  # used for instances. Otherwise the whole system may stop working until
62
  # restarted.
63
  instance-disk-failure: False
64

  
60 65
  # This test takes up to 6 minutes to complete
61 66
  instance-automatic-restart: False
62 67

  
b/qa/qa_instance.py
20 20

  
21 21
"""
22 22

  
23
import re
24
import time
25

  
23 26
from ganeti import utils
24 27
from ganeti import constants
25 28

  
26 29
import qa_config
27 30
import qa_utils
31
import qa_error
32

  
33
from qa_utils import AssertEqual, AssertNotEqual, StartSSH
28 34

  
29
from qa_utils import AssertEqual, StartSSH
35

  
36
def _GetDiskStatePath(disk):
37
  return "/sys/block/%s/device/state" % disk
30 38

  
31 39

  
32 40
def _GetGenericAddParameters():
......
172 180
  cmd = ['gnt-backup', 'list', '--nodes=%s' % expnode['primary']]
173 181
  AssertEqual(StartSSH(master['primary'],
174 182
                       utils.ShellQuoteArgs(cmd)).wait(), 0)
183

  
184

  
185
def _TestInstanceDiskFailure(instance, node, node2, onmaster):
  """Testing disk failure.

  Sets the physical disks used by the instance to "offline" on one node,
  writes to the instance's devices so the failure is noticed, sets the disks
  back to "running" and then restarts the instance, re-activates its disks
  and runs a cluster verify.

  Parameters:
    instance: instance entry; its 'name' key is passed to gnt-instance
    node: node entry; commands writing to the instance devices are run on
      node['primary']
    node2: the other node entry
    onmaster: if true the disks on node are failed, otherwise those on node2

  Raises qa_error.Error if a physical device name has an unexpected format
  or if no physical disks are found for the node that should be failed.

  """
  master = qa_config.GetMasterNode()
  sq = utils.ShellQuoteArgs

  instance_full = qa_utils.ResolveInstanceName(instance)
  node_full = qa_utils.ResolveNodeName(node)
  node2_full = qa_utils.ResolveNodeName(node2)

  # Select once which node gets its disks failed
  if onmaster:
    (target, target_full, target_desc) = (node, node_full, "master")
  else:
    (target, target_full, target_desc) = (node2, node2_full, "secondary")

  cmd = ['gnt-node', 'volumes', '--separator=|', '--no-headers',
         '--output=node,phys,instance',
         node['primary'], node2['primary']]
  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))

  # Get physical disk names
  re_disk = re.compile(r'^/dev/([a-z]+)\d+$')
  node2disk = {}
  for line in output.splitlines():
    (node_name, phys, inst) = line.split('|')
    if inst == instance_full:
      if node_name not in node2disk:
        node2disk[node_name] = []

      m = re_disk.match(phys)
      if not m:
        # Report the value that failed to match; the loop variable "disk"
        # does not exist at this point
        raise qa_error.Error("Unknown disk name format: %s" % phys)

      name = m.group(1)
      if name not in node2disk[node_name]:
        node2disk[node_name].append(name)

  if target_full not in node2disk:
    raise qa_error.Error("Couldn't find physical disks used on "
                         "%s node" % target_desc)

  # Check whether nodes have ability to stop disks, i.e. whether the state
  # file exists for every disk
  for node_name, disks in node2disk.items():
    cmds = []
    for disk in disks:
      cmds.append(sq(["test", "-f", _GetDiskStatePath(disk)]))
    AssertEqual(StartSSH(node_name, ' && '.join(cmds)).wait(), 0)

  # Get device paths (last colon-separated field of each output line)
  cmd = ['gnt-instance', 'activate-disks', instance['name']]
  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
  devpath = []
  for line in output.splitlines():
    (_, _, tmpdevpath) = line.split(':')
    devpath.append(tmpdevpath)

  # Get drbd device paths
  # NOTE: the result is not used further in this function
  cmd = ['gnt-instance', 'info', instance['name']]
  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
  pattern = (r'\s+-\s+type:\s+drbd,\s+.*$'
             r'\s+primary:\s+(/dev/drbd\d+)\s+')
  drbddevs = re.findall(pattern, output, re.M)

  # Deactivate disks on the selected node
  halted_disks = []
  cmds = []
  for name in node2disk[target_full]:
    halted_disks.append(name)
    cmds.append(sq(["echo", "offline"]) + " >%s" % _GetDiskStatePath(name))
  AssertEqual(StartSSH(target['primary'], '; '.join(cmds)).wait(), 0)
  try:
    # Write something to the disks and give some time to notice the problem
    cmds = []
    for disk in devpath:
      cmds.append(sq(["dd", "count=1", "bs=512", "conv=notrunc",
                      "if=%s" % disk, "of=%s" % disk]))
    for _ in (0, 1, 2):
      AssertEqual(StartSSH(node['primary'], ' && '.join(cmds)).wait(), 0)
      time.sleep(3)

    # For manual checks
    cmd = ['gnt-instance', 'info', instance['name']]
    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

  finally:
    # Activate disks again, even if one of the checks above failed
    cmds = []
    for name in halted_disks:
      cmds.append(sq(["echo", "running"]) + " >%s" % _GetDiskStatePath(name))
    AssertEqual(StartSSH(target['primary'], '; '.join(cmds)).wait(), 0)

  # Restart instance
  cmd = ['gnt-instance', 'shutdown', instance['name']]
  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

  cmd = ['gnt-instance', 'startup', '--force', instance['name']]
  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

  # Make sure disks are up again
  cmd = ['gnt-instance', 'activate-disks', instance['name']]
  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

  cmd = ['gnt-cluster', 'verify']
  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
285

  
286

  
287
def TestInstanceMasterDiskFailure(instance, node, node2):
  """Testing disk failure on master node.

  The test itself is currently disabled; this function only prints an error
  message saying so. The parameters are accepted but not used.

  """
  qa_utils.PrintError("Disk failure on primary node cannot be "
                      "tested due to potential crashes.")
  # The following can cause crashes, thus it's disabled until fixed
  #return _TestInstanceDiskFailure(instance, node, node2, True)
293

  
294

  
295
def TestInstanceSecondaryDiskFailure(instance, node, node2):
  """Testing disk failure on secondary node.

  Runs the generic disk failure test with onmaster set to False and returns
  its result.

  """
  return _TestInstanceDiskFailure(instance, node, node2, False)
b/qa/qa_utils.py
61 61
_SetupColours()
62 62

  
63 63

  
64
def AssertEqual(first, second, msg=None):
64
def AssertEqual(first, second):
65 65
  """Raises an error when values aren't equal.
66 66

  
67 67
  """
68 68
  if not first == second:
69
    raise qa_error.Error(msg or '%r == %r' % (first, second))
69
    raise qa_error.Error('%r == %r' % (first, second))
70

  
71

  
72
def AssertNotEqual(first, second):
  """Raises an error when values are equal.

  Raises qa_error.Error with a message showing both values if the
  comparison "first != second" is not true.

  """
  if not first != second:
    raise qa_error.Error('%r != %r' % (first, second))
70 78

  
71 79

  
72 80
def GetSSHCommand(node, cmd, strict=True):
......
158 166
  """Gets the full name of an instance.
159 167

  
160 168
  """
161
  return _ResolveName(['gnt-instance', 'info', instance['info']],
169
  return _ResolveName(['gnt-instance', 'info', instance['name']],
162 170
                      'Instance name')
163 171

  
164 172

  

Also available in: Unified diff