Revision 02c521e4 lib/cmdlib.py

b/lib/cmdlib.py
1091 1091
  ETYPE_ERROR = "ERROR"
1092 1092
  ETYPE_WARNING = "WARNING"
1093 1093

  
1094
  class NodeImage(object):
1095
    """A class representing the logical and physical status of a node.
1096

  
1097
    @ivar volumes: a structure as returned from
1098
        L{ganeti.utils.GetVolumeList} (runtime)
1099
    @ivar instances: a list of running instances (runtime)
1100
    @ivar pinst: list of configured primary instances (config)
1101
    @ivar sinst: list of configured secondary instances (config)
1102
    @ivar sbp: dictionary of {primary-node: list of instances} of all peers
1103
        of this node (config)
1104
    @ivar mfree: free memory, as reported by hypervisor (runtime)
1105
    @ivar dfree: free disk, as reported by the node (runtime)
1106
    @ivar offline: the offline status (config)
1107
    @type rpc_fail: boolean
1108
    @ivar rpc_fail: whether the RPC verify call failed (overall,
1109
        not whether the individual keys were correct) (runtime)
1110
    @type lvm_fail: boolean
1111
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
1112
    @type hyp_fail: boolean
1113
    @ivar hyp_fail: whether the RPC call didn't return the instance list
1114
    @type ghost: boolean
1115
    @ivar ghost: whether this is a known node or not (config)
1116

  
1117
    """
1118
    def __init__(self, offline=False):
1119
      self.volumes = {}
1120
      self.instances = []
1121
      self.pinst = []
1122
      self.sinst = []
1123
      self.sbp = {}
1124
      self.mfree = 0
1125
      self.dfree = 0
1126
      self.offline = offline
1127
      self.rpc_fail = False
1128
      self.lvm_fail = False
1129
      self.hyp_fail = False
1130
      self.ghost = False
1131

  
1094 1132
  def ExpandNames(self):
1095 1133
    self.needed_locks = {
1096 1134
      locking.LEVEL_NODE: locking.ALL_SET,
......
1135 1173
    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
1136 1174
      self.bad = self.bad or cond
1137 1175

  
1138
  def _VerifyNode(self, nodeinfo, file_list, local_cksum,
1139
                  node_result, master_files, drbd_map, vg_name):
1176
  def _VerifyNode(self, ninfo, nresult):
1140 1177
    """Run multiple tests against a node.
1141 1178

  
1142 1179
    Test list:
......
1146 1183
      - checks config file checksum
1147 1184
      - checks ssh to other nodes
1148 1185

  
1149
    @type nodeinfo: L{objects.Node}
1150
    @param nodeinfo: the node to check
1151
    @param file_list: required list of files
1152
    @param local_cksum: dictionary of local files and their checksums
1153
    @param node_result: the results from the node
1154
    @param master_files: list of files that only masters should have
1155
    @param drbd_map: the used drbd minors for this node, in
1156
        form of minor: (instance, must_exist) which correspond to instances
1157
        and their running status
1158
    @param vg_name: Ganeti Volume Group (result of self.cfg.GetVGName())
1186
    @type ninfo: L{objects.Node}
1187
    @param ninfo: the node to check
1188
    @param nresult: the results from the node
1189
    @rtype: boolean
1190
    @return: whether overall this call was successful (and we can expect
1191
         reasonable values in the response)
1159 1192

  
1160 1193
    """
1161
    node = nodeinfo.name
1194
    node = ninfo.name
1162 1195
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1163 1196

  
1164
    # main result, node_result should be a non-empty dict
1165
    test = not node_result or not isinstance(node_result, dict)
1197
    # main result, nresult should be a non-empty dict
1198
    test = not nresult or not isinstance(nresult, dict)
1166 1199
    _ErrorIf(test, self.ENODERPC, node,
1167 1200
                  "unable to verify node: no data returned")
1168 1201
    if test:
1169
      return
1202
      return False
1170 1203

  
1171 1204
    # compares ganeti version
1172 1205
    local_version = constants.PROTOCOL_VERSION
1173
    remote_version = node_result.get('version', None)
1206
    remote_version = nresult.get("version", None)
1174 1207
    test = not (remote_version and
1175 1208
                isinstance(remote_version, (list, tuple)) and
1176 1209
                len(remote_version) == 2)
1177 1210
    _ErrorIf(test, self.ENODERPC, node,
1178 1211
             "connection to node returned invalid data")
1179 1212
    if test:
1180
      return
1213
      return False
1181 1214

  
1182 1215
    test = local_version != remote_version[0]
1183 1216
    _ErrorIf(test, self.ENODEVERSION, node,
1184 1217
             "incompatible protocol versions: master %s,"
1185 1218
             " node %s", local_version, remote_version[0])
1186 1219
    if test:
1187
      return
1220
      return False
1188 1221

  
1189 1222
    # node seems compatible, we can actually try to look into its results
1190 1223

  
......
1195 1228
                  constants.RELEASE_VERSION, remote_version[1],
1196 1229
                  code=self.ETYPE_WARNING)
1197 1230

  
1198
    # checks vg existence and size > 20G
1199
    if vg_name is not None:
1200
      vglist = node_result.get(constants.NV_VGLIST, None)
1201
      test = not vglist
1202
      _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1203
      if not test:
1204
        vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1205
                                              constants.MIN_VG_SIZE)
1206
        _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1231
    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
1232
    if isinstance(hyp_result, dict):
1233
      for hv_name, hv_result in hyp_result.iteritems():
1234
        test = hv_result is not None
1235
        _ErrorIf(test, self.ENODEHV, node,
1236
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1207 1237

  
1208
    # checks config file checksum
1209 1238

  
1210
    remote_cksum = node_result.get(constants.NV_FILELIST, None)
1211
    test = not isinstance(remote_cksum, dict)
1212
    _ErrorIf(test, self.ENODEFILECHECK, node,
1213
             "node hasn't returned file checksum data")
1239
    test = nresult.get(constants.NV_NODESETUP,
1240
                           ["Missing NODESETUP results"])
1241
    _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1242
             "; ".join(test))
1243

  
1244
    return True
1245

  
1246
  def _VerifyNodeTime(self, ninfo, nresult,
1247
                      nvinfo_starttime, nvinfo_endtime):
1248
    """Check the node time.
1249

  
1250
    @type ninfo: L{objects.Node}
1251
    @param ninfo: the node to check
1252
    @param nresult: the remote results for the node
1253
    @param nvinfo_starttime: the start time of the RPC call
1254
    @param nvinfo_endtime: the end time of the RPC call
1255

  
1256
    """
1257
    node = ninfo.name
1258
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1259

  
1260
    ntime = nresult.get(constants.NV_TIME, None)
1261
    try:
1262
      ntime_merged = utils.MergeTime(ntime)
1263
    except (ValueError, TypeError):
1264
      _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1265
      return
1266

  
1267
    if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1268
      ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1269
    elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1270
      ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1271
    else:
1272
      ntime_diff = None
1273

  
1274
    _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1275
             "Node time diverges by at least %s from master node time",
1276
             ntime_diff)
1277

  
1278
  def _VerifyNodeLVM(self, ninfo, nresult, vg_name):
1279
    """Check the node LVM results.
1280

  
1281
    @type ninfo: L{objects.Node}
1282
    @param ninfo: the node to check
1283
    @param nresult: the remote results for the node
1284
    @param vg_name: the configured VG name
1285

  
1286
    """
1287
    if vg_name is None:
1288
      return
1289

  
1290
    node = ninfo.name
1291
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1292

  
1293
    # checks vg existence and size > 20G
1294
    vglist = nresult.get(constants.NV_VGLIST, None)
1295
    test = not vglist
1296
    _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
1214 1297
    if not test:
1215
      for file_name in file_list:
1216
        node_is_mc = nodeinfo.master_candidate
1217
        must_have = (file_name not in master_files) or node_is_mc
1218
        # missing
1219
        test1 = file_name not in remote_cksum
1220
        # invalid checksum
1221
        test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
1222
        # existing and good
1223
        test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
1224
        _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
1225
                 "file '%s' missing", file_name)
1226
        _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
1227
                 "file '%s' has wrong checksum", file_name)
1228
        # not candidate and this is not a must-have file
1229
        _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
1230
                 "file '%s' should not exist on non master"
1231
                 " candidates (and the file is outdated)", file_name)
1232
        # all good, except non-master/non-must have combination
1233
        _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
1234
                 "file '%s' should not exist"
1235
                 " on non master candidates", file_name)
1236

  
1237
    # checks ssh to any
1238

  
1239
    test = constants.NV_NODELIST not in node_result
1298
      vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
1299
                                            constants.MIN_VG_SIZE)
1300
      _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
1301

  
1302
    # check pv names
1303
    pvlist = nresult.get(constants.NV_PVLIST, None)
1304
    test = pvlist is None
1305
    _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1306
    if not test:
1307
      # check that ':' is not present in PV names, since it's a
1308
      # special character for lvcreate (denotes the range of PEs to
1309
      # use on the PV)
1310
      for _, pvname, owner_vg in pvlist:
1311
        test = ":" in pvname
1312
        _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1313
                 " '%s' of VG '%s'", pvname, owner_vg)
1314

  
1315
  def _VerifyNodeNetwork(self, ninfo, nresult):
1316
    """Check the node network connectivity results.
1317

  
1318
    @type ninfo: L{objects.Node}
1319
    @param ninfo: the node to check
1320
    @param nresult: the remote results for the node
1321

  
1322
    """
1323
    node = ninfo.name
1324
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1325

  
1326
    test = constants.NV_NODELIST not in nresult
1240 1327
    _ErrorIf(test, self.ENODESSH, node,
1241 1328
             "node hasn't returned node ssh connectivity data")
1242 1329
    if not test:
1243
      if node_result[constants.NV_NODELIST]:
1244
        for a_node, a_msg in node_result[constants.NV_NODELIST].items():
1330
      if nresult[constants.NV_NODELIST]:
1331
        for a_node, a_msg in nresult[constants.NV_NODELIST].items():
1245 1332
          _ErrorIf(True, self.ENODESSH, node,
1246 1333
                   "ssh communication with node '%s': %s", a_node, a_msg)
1247 1334

  
1248
    test = constants.NV_NODENETTEST not in node_result
1335
    test = constants.NV_NODENETTEST not in nresult
1249 1336
    _ErrorIf(test, self.ENODENET, node,
1250 1337
             "node hasn't returned node tcp connectivity data")
1251 1338
    if not test:
1252
      if node_result[constants.NV_NODENETTEST]:
1253
        nlist = utils.NiceSort(node_result[constants.NV_NODENETTEST].keys())
1339
      if nresult[constants.NV_NODENETTEST]:
1340
        nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
1254 1341
        for anode in nlist:
1255 1342
          _ErrorIf(True, self.ENODENET, node,
1256 1343
                   "tcp communication with node '%s': %s",
1257
                   anode, node_result[constants.NV_NODENETTEST][anode])
1258

  
1259
    hyp_result = node_result.get(constants.NV_HYPERVISOR, None)
1260
    if isinstance(hyp_result, dict):
1261
      for hv_name, hv_result in hyp_result.iteritems():
1262
        test = hv_result is not None
1263
        _ErrorIf(test, self.ENODEHV, node,
1264
                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
1265

  
1266
    # check used drbd list
1267
    if vg_name is not None:
1268
      used_minors = node_result.get(constants.NV_DRBDLIST, [])
1269
      test = not isinstance(used_minors, (tuple, list))
1270
      _ErrorIf(test, self.ENODEDRBD, node,
1271
               "cannot parse drbd status file: %s", str(used_minors))
1272
      if not test:
1273
        for minor, (iname, must_exist) in drbd_map.items():
1274
          test = minor not in used_minors and must_exist
1275
          _ErrorIf(test, self.ENODEDRBD, node,
1276
                   "drbd minor %d of instance %s is not active",
1277
                   minor, iname)
1278
        for minor in used_minors:
1279
          test = minor not in drbd_map
1280
          _ErrorIf(test, self.ENODEDRBD, node,
1281
                   "unallocated drbd minor %d is in use", minor)
1282
    test = node_result.get(constants.NV_NODESETUP,
1283
                           ["Missing NODESETUP results"])
1284
    _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
1285
             "; ".join(test))
1344
                   anode, nresult[constants.NV_NODENETTEST][anode])
1286 1345

  
1287
    # check pv names
1288
    if vg_name is not None:
1289
      pvlist = node_result.get(constants.NV_PVLIST, None)
1290
      test = pvlist is None
1291
      _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
1292
      if not test:
1293
        # check that ':' is not present in PV names, since it's a
1294
        # special character for lvcreate (denotes the range of PEs to
1295
        # use on the PV)
1296
        for _, pvname, owner_vg in pvlist:
1297
          test = ":" in pvname
1298
          _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
1299
                   " '%s' of VG '%s'", pvname, owner_vg)
1300

  
1301
  def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
1302
                      node_instance, n_offline):
1346
  def _VerifyInstance(self, instance, instanceconfig, node_image):
1303 1347
    """Verify an instance.
1304 1348

  
1305 1349
    This function checks to see if the required block devices are
......
1313 1357
    instanceconfig.MapLVsByNode(node_vol_should)
1314 1358

  
1315 1359
    for node in node_vol_should:
1316
      if node in n_offline:
1317
        # ignore missing volumes on offline nodes
1360
      n_img = node_image[node]
1361
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1362
        # ignore missing volumes on offline or broken nodes
1318 1363
        continue
1319 1364
      for volume in node_vol_should[node]:
1320
        test = node not in node_vol_is or volume not in node_vol_is[node]
1365
        test = volume not in n_img.volumes
1321 1366
        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
1322 1367
                 "volume %s missing on node %s", volume, node)
1323 1368

  
1324 1369
    if instanceconfig.admin_up:
1325
      test = ((node_current not in node_instance or
1326
               not instance in node_instance[node_current]) and
1327
              node_current not in n_offline)
1370
      pri_img = node_image[node_current]
1371
      test = instance not in pri_img.instances and not pri_img.offline
1328 1372
      _ErrorIf(test, self.EINSTANCEDOWN, instance,
1329 1373
               "instance not running on its primary node %s",
1330 1374
               node_current)
1331 1375

  
1332
    for node in node_instance:
1376
    for node, n_img in node_image.items():
1333 1377
      if (not node == node_current):
1334
        test = instance in node_instance[node]
1378
        test = instance in n_img.instances
1335 1379
        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
1336 1380
                 "instance should not run on node %s", node)
1337 1381

  
1338
  def _VerifyOrphanVolumes(self, node_vol_should, node_vol_is):
1382
  def _VerifyOrphanVolumes(self, node_vol_should, node_image):
1339 1383
    """Verify if there are any unknown volumes in the cluster.
1340 1384

  
1341 1385
    The .os, .swap and backup volumes are ignored. All other volumes are
1342 1386
    reported as unknown.
1343 1387

  
1344 1388
    """
1345
    for node in node_vol_is:
1346
      for volume in node_vol_is[node]:
1389
    for node, n_img in node_image.items():
1390
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
1391
        # skip non-healthy nodes
1392
        continue
1393
      for volume in n_img.volumes:
1347 1394
        test = (node not in node_vol_should or
1348 1395
                volume not in node_vol_should[node])
1349 1396
        self._ErrorIf(test, self.ENODEORPHANLV, node,
1350 1397
                      "volume %s is unknown", volume)
1351 1398

  
1352
  def _VerifyOrphanInstances(self, instancelist, node_instance):
1399
  def _VerifyOrphanInstances(self, instancelist, node_image):
1353 1400
    """Verify the list of running instances.
1354 1401

  
1355 1402
    This checks what instances are running but unknown to the cluster.
1356 1403

  
1357 1404
    """
1358
    for node in node_instance:
1359
      for o_inst in node_instance[node]:
1405
    for node, n_img in node_image.items():
1406
      for o_inst in n_img.instances:
1360 1407
        test = o_inst not in instancelist
1361 1408
        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
1362 1409
                      "instance %s on node %s should not exist", o_inst, node)
1363 1410

  
1364
  def _VerifyNPlusOneMemory(self, node_info, instance_cfg):
1411
  def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
1365 1412
    """Verify N+1 Memory Resilience.
1366 1413

  
1367
    Check that if one single node dies we can still start all the instances it
1368
    was primary for.
1414
    Check that if one single node dies we can still start all the
1415
    instances it was primary for.
1369 1416

  
1370 1417
    """
1371
    for node, nodeinfo in node_info.iteritems():
1372
      # This code checks that every node which is now listed as secondary has
1373
      # enough memory to host all instances it is supposed to should a single
1374
      # other node in the cluster fail.
1418
    for node, n_img in node_image.items():
1419
      # This code checks that every node which is now listed as
1420
      # secondary has enough memory to host all instances it is
1421
      # supposed to should a single other node in the cluster fail.
1375 1422
      # FIXME: not ready for failover to an arbitrary node
1376 1423
      # FIXME: does not support file-backed instances
1377
      # WARNING: we currently take into account down instances as well as up
1378
      # ones, considering that even if they're down someone might want to start
1379
      # them even in the event of a node failure.
1380
      for prinode, instances in nodeinfo['sinst-by-pnode'].iteritems():
1424
      # WARNING: we currently take into account down instances as well
1425
      # as up ones, considering that even if they're down someone
1426
      # might want to start them even in the event of a node failure.
1427
      for prinode, instances in n_img.sbp.items():
1381 1428
        needed_mem = 0
1382 1429
        for instance in instances:
1383 1430
          bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
1384 1431
          if bep[constants.BE_AUTO_BALANCE]:
1385 1432
            needed_mem += bep[constants.BE_MEMORY]
1386
        test = nodeinfo['mfree'] < needed_mem
1433
        test = n_img.mfree < needed_mem
1387 1434
        self._ErrorIf(test, self.ENODEN1, node,
1388 1435
                      "not enough memory on to accommodate"
1389 1436
                      " failovers should peer node %s fail", prinode)
1390 1437

  
1438
  def _VerifyNodeFiles(self, ninfo, nresult, file_list, local_cksum,
1439
                       master_files):
1440
    """Verifies and computes the node required file checksums.
1441

  
1442
    @type ninfo: L{objects.Node}
1443
    @param ninfo: the node to check
1444
    @param nresult: the remote results for the node
1445
    @param file_list: required list of files
1446
    @param local_cksum: dictionary of local files and their checksums
1447
    @param master_files: list of files that only masters should have
1448

  
1449
    """
1450
    node = ninfo.name
1451
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1452

  
1453
    remote_cksum = nresult.get(constants.NV_FILELIST, None)
1454
    test = not isinstance(remote_cksum, dict)
1455
    _ErrorIf(test, self.ENODEFILECHECK, node,
1456
             "node hasn't returned file checksum data")
1457
    if test:
1458
      return
1459

  
1460
    for file_name in file_list:
1461
      node_is_mc = ninfo.master_candidate
1462
      must_have = (file_name not in master_files) or node_is_mc
1463
      # missing
1464
      test1 = file_name not in remote_cksum
1465
      # invalid checksum
1466
      test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
1467
      # existing and good
1468
      test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
1469
      _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
1470
               "file '%s' missing", file_name)
1471
      _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
1472
               "file '%s' has wrong checksum", file_name)
1473
      # not candidate and this is not a must-have file
1474
      _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
1475
               "file '%s' should not exist on non master"
1476
               " candidates (and the file is outdated)", file_name)
1477
      # all good, except non-master/non-must have combination
1478
      _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
1479
               "file '%s' should not exist"
1480
               " on non master candidates", file_name)
1481

  
1482
  def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, drbd_map):
1483
    """Verifies the node DRBD status.
1484

  
1485
    @type ninfo: L{objects.Node}
1486
    @param ninfo: the node to check
1487
    @param nresult: the remote results for the node
1488
    @param instanceinfo: the dict of instances
1489
    @param drbd_map: the DRBD map as returned by
1490
        L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1491

  
1492
    """
1493
    node = ninfo.name
1494
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1495

  
1496
    # compute the DRBD minors
1497
    node_drbd = {}
1498
    for minor, instance in drbd_map[node].items():
1499
      test = instance not in instanceinfo
1500
      _ErrorIf(test, self.ECLUSTERCFG, None,
1501
               "ghost instance '%s' in temporary DRBD map", instance)
1502
        # ghost instance should not be running, but otherwise we
1503
        # don't give double warnings (both ghost instance and
1504
        # unallocated minor in use)
1505
      if test:
1506
        node_drbd[minor] = (instance, False)
1507
      else:
1508
        instance = instanceinfo[instance]
1509
        node_drbd[minor] = (instance.name, instance.admin_up)
1510

  
1511
    # and now check them
1512
    used_minors = nresult.get(constants.NV_DRBDLIST, [])
1513
    test = not isinstance(used_minors, (tuple, list))
1514
    _ErrorIf(test, self.ENODEDRBD, node,
1515
             "cannot parse drbd status file: %s", str(used_minors))
1516
    if test:
1517
      # we cannot check drbd status
1518
      return
1519

  
1520
    for minor, (iname, must_exist) in node_drbd.items():
1521
      test = minor not in used_minors and must_exist
1522
      _ErrorIf(test, self.ENODEDRBD, node,
1523
               "drbd minor %d of instance %s is not active", minor, iname)
1524
    for minor in used_minors:
1525
      test = minor not in node_drbd
1526
      _ErrorIf(test, self.ENODEDRBD, node,
1527
               "unallocated drbd minor %d is in use", minor)
1528

  
1529
  def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1530
    """Verifies and updates the node volume data.
1531

  
1532
    This function will update a L{NodeImage}'s internal structures
1533
    with data from the remote call.
1534

  
1535
    @type ninfo: L{objects.Node}
1536
    @param ninfo: the node to check
1537
    @param nresult: the remote results for the node
1538
    @param nimg: the node image object
1539
    @param vg_name: the configured VG name
1540

  
1541
    """
1542
    node = ninfo.name
1543
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1544

  
1545
    nimg.lvm_fail = True
1546
    lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1547
    if vg_name is None:
1548
      pass
1549
    elif isinstance(lvdata, basestring):
1550
      _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
1551
               utils.SafeEncode(lvdata))
1552
    elif not isinstance(lvdata, dict):
1553
      _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
1554
    else:
1555
      nimg.volumes = lvdata
1556
      nimg.lvm_fail = False
1557

  
1558
  def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1559
    """Verifies and updates the node instance list.
1560

  
1561
    If the listing was successful, then updates this node's instance
1562
    list. Otherwise, it marks the RPC call as failed for the instance
1563
    list key.
1564

  
1565
    @type ninfo: L{objects.Node}
1566
    @param ninfo: the node to check
1567
    @param nresult: the remote results for the node
1568
    @param nimg: the node image object
1569

  
1570
    """
1571
    idata = nresult.get(constants.NV_INSTANCELIST, None)
1572
    test = not isinstance(idata, list)
1573
    self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
1574
                  " (instancelist): %s", utils.SafeEncode(str(idata)))
1575
    if test:
1576
      nimg.hyp_fail = True
1577
    else:
1578
      nimg.instances = idata
1579

  
1580
  def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1581
    """Verifies and computes a node information map
1582

  
1583
    @type ninfo: L{objects.Node}
1584
    @param ninfo: the node to check
1585
    @param nresult: the remote results for the node
1586
    @param nimg: the node image object
1587
    @param vg_name: the configured VG name
1588

  
1589
    """
1590
    node = ninfo.name
1591
    _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
1592

  
1593
    # try to read free memory (from the hypervisor)
1594
    hv_info = nresult.get(constants.NV_HVINFO, None)
1595
    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
1596
    _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
1597
    if not test:
1598
      try:
1599
        nimg.mfree = int(hv_info["memory_free"])
1600
      except (ValueError, TypeError):
1601
        _ErrorIf(True, self.ENODERPC, node,
1602
                 "node returned invalid nodeinfo, check hypervisor")
1603

  
1604
    # FIXME: devise a free space model for file based instances as well
1605
    if vg_name is not None:
1606
      test = (constants.NV_VGLIST not in nresult or
1607
              vg_name not in nresult[constants.NV_VGLIST])
1608
      _ErrorIf(test, self.ENODELVM, node,
1609
               "node didn't return data for the volume group '%s'"
1610
               " - it is either missing or broken", vg_name)
1611
      if not test:
1612
        try:
1613
          nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1614
        except (ValueError, TypeError):
1615
          _ErrorIf(True, self.ENODERPC, node,
1616
                   "node returned invalid LVM info, check LVM status")
1617

  
1391 1618
  def CheckPrereq(self):
1392 1619
    """Check prerequisites.
1393 1620

  
......
1442 1669
                        for iname in instancelist)
1443 1670
    i_non_redundant = [] # Non redundant instances
1444 1671
    i_non_a_balanced = [] # Non auto-balanced instances
1445
    n_offline = [] # List of offline nodes
1446
    n_drained = [] # List of nodes being drained
1447
    node_volume = {}
1448
    node_instance = {}
1449
    node_info = {}
1450
    instance_cfg = {}
1672
    n_offline = 0 # Count of offline nodes
1673
    n_drained = 0 # Count of nodes being drained
1674
    node_vol_should = {}
1451 1675

  
1452 1676
    # FIXME: verify OS list
1453 1677
    # do local checksums
......
1481 1705
      node_verify_param[constants.NV_PVLIST] = [vg_name]
1482 1706
      node_verify_param[constants.NV_DRBDLIST] = None
1483 1707

  
1708
    # Build our expected cluster state
1709
    node_image = dict((node.name, self.NodeImage(offline=node.offline))
1710
                      for node in nodeinfo)
1711

  
1712
    for instance in instancelist:
1713
      inst_config = instanceinfo[instance]
1714

  
1715
      for nname in inst_config.all_nodes:
1716
        if nname not in node_image:
1717
          # ghost node
1718
          gnode = self.NodeImage()
1719
          gnode.ghost = True
1720
          node_image[nname] = gnode
1721

  
1722
      inst_config.MapLVsByNode(node_vol_should)
1723

  
1724
      pnode = inst_config.primary_node
1725
      node_image[pnode].pinst.append(instance)
1726

  
1727
      for snode in inst_config.secondary_nodes:
1728
        nimg = node_image[snode]
1729
        nimg.sinst.append(instance)
1730
        if pnode not in nimg.sbp:
1731
          nimg.sbp[pnode] = []
1732
        nimg.sbp[pnode].append(instance)
1733

  
1734
    # At this point, we have the in-memory data structures complete,
1735
    # except for the runtime information, which we'll gather next
1736

  
1484 1737
    # Due to the way our RPC system works, exact response times cannot be
1485 1738
    # guaranteed (e.g. a broken node could run into a timeout). By keeping the
1486 1739
    # time before and after executing the request, we can at least have a time
......
1497 1750
    feedback_fn("* Verifying node status")
1498 1751
    for node_i in nodeinfo:
1499 1752
      node = node_i.name
1753
      nimg = node_image[node]
1500 1754

  
1501 1755
      if node_i.offline:
1502 1756
        if verbose:
1503 1757
          feedback_fn("* Skipping offline node %s" % (node,))
1504
        n_offline.append(node)
1758
        n_offline += 1
1505 1759
        continue
1506 1760

  
1507 1761
      if node == master_node:
......
1510 1764
        ntype = "master candidate"
1511 1765
      elif node_i.drained:
1512 1766
        ntype = "drained"
1513
        n_drained.append(node)
1767
        n_drained += 1
1514 1768
      else:
1515 1769
        ntype = "regular"
1516 1770
      if verbose:
......
1519 1773
      msg = all_nvinfo[node].fail_msg
1520 1774
      _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
1521 1775
      if msg:
1776
        nimg.rpc_fail = True
1522 1777
        continue
1523 1778

  
1524 1779
      nresult = all_nvinfo[node].payload
1525
      node_drbd = {}
1526
      for minor, instance in all_drbd_map[node].items():
1527
        test = instance not in instanceinfo
1528
        _ErrorIf(test, self.ECLUSTERCFG, None,
1529
                 "ghost instance '%s' in temporary DRBD map", instance)
1530
          # ghost instance should not be running, but otherwise we
1531
          # don't give double warnings (both ghost instance and
1532
          # unallocated minor in use)
1533
        if test:
1534
          node_drbd[minor] = (instance, False)
1535
        else:
1536
          instance = instanceinfo[instance]
1537
          node_drbd[minor] = (instance.name, instance.admin_up)
1538

  
1539
      self._VerifyNode(node_i, file_names, local_checksums,
1540
                       nresult, master_files, node_drbd, vg_name)
1541

  
1542
      lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1543
      if vg_name is None:
1544
        node_volume[node] = {}
1545
      elif isinstance(lvdata, basestring):
1546
        _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
1547
                 utils.SafeEncode(lvdata))
1548
        node_volume[node] = {}
1549
      elif not isinstance(lvdata, dict):
1550
        _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
1551
        continue
1552
      else:
1553
        node_volume[node] = lvdata
1554

  
1555
      # node_instance
1556
      idata = nresult.get(constants.NV_INSTANCELIST, None)
1557
      test = not isinstance(idata, list)
1558
      _ErrorIf(test, self.ENODEHV, node,
1559
               "rpc call to node failed (instancelist): %s",
1560
               utils.SafeEncode(str(idata)))
1561
      if test:
1562
        continue
1563

  
1564
      node_instance[node] = idata
1565

  
1566
      # node_info
1567
      nodeinfo = nresult.get(constants.NV_HVINFO, None)
1568
      test = not isinstance(nodeinfo, dict)
1569
      _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
1570
      if test:
1571
        continue
1572

  
1573
      # Node time
1574
      ntime = nresult.get(constants.NV_TIME, None)
1575
      try:
1576
        ntime_merged = utils.MergeTime(ntime)
1577
      except (ValueError, TypeError):
1578
        _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
1579 1780

  
1580
      if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
1581
        ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
1582
      elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
1583
        ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
1584
      else:
1585
        ntime_diff = None
1781
      nimg.call_ok = self._VerifyNode(node_i, nresult)
1782
      self._VerifyNodeNetwork(node_i, nresult)
1783
      self._VerifyNodeLVM(node_i, nresult, vg_name)
1784
      self._VerifyNodeFiles(node_i, nresult, file_names, local_checksums,
1785
                            master_files)
1786
      self._VerifyNodeDrbd(node_i, nresult, instanceinfo, all_drbd_map)
1787
      self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
1586 1788

  
1587
      _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
1588
               "Node time diverges by at least %s from master node time",
1589
               ntime_diff)
1590

  
1591
      if ntime_diff is not None:
1592
        continue
1593

  
1594
      try:
1595
        node_info[node] = {
1596
          "mfree": int(nodeinfo['memory_free']),
1597
          "pinst": [],
1598
          "sinst": [],
1599
          # dictionary holding all instances this node is secondary for,
1600
          # grouped by their primary node. Each key is a cluster node, and each
1601
          # value is a list of instances which have the key as primary and the
1602
          # current node as secondary.  this is handy to calculate N+1 memory
1603
          # availability if you can only failover from a primary to its
1604
          # secondary.
1605
          "sinst-by-pnode": {},
1606
        }
1607
        # FIXME: devise a free space model for file based instances as well
1608
        if vg_name is not None:
1609
          test = (constants.NV_VGLIST not in nresult or
1610
                  vg_name not in nresult[constants.NV_VGLIST])
1611
          _ErrorIf(test, self.ENODELVM, node,
1612
                   "node didn't return data for the volume group '%s'"
1613
                   " - it is either missing or broken", vg_name)
1614
          if test:
1615
            continue
1616
          node_info[node]["dfree"] = int(nresult[constants.NV_VGLIST][vg_name])
1617
      except (ValueError, KeyError):
1618
        _ErrorIf(True, self.ENODERPC, node,
1619
                 "node returned invalid nodeinfo, check lvm/hypervisor")
1620
        continue
1621

  
1622
    node_vol_should = {}
1789
      self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
1790
      self._UpdateNodeInstances(node_i, nresult, nimg)
1791
      self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
1623 1792

  
1624 1793
    feedback_fn("* Verifying instance status")
1625 1794
    for instance in instancelist:
1626 1795
      if verbose:
1627 1796
        feedback_fn("* Verifying instance %s" % instance)
1628 1797
      inst_config = instanceinfo[instance]
1629
      self._VerifyInstance(instance, inst_config, node_volume,
1630
                           node_instance, n_offline)
1798
      self._VerifyInstance(instance, inst_config, node_image)
1631 1799
      inst_nodes_offline = []
1632 1800

  
1633
      inst_config.MapLVsByNode(node_vol_should)
1634

  
1635
      instance_cfg[instance] = inst_config
1636

  
1637 1801
      pnode = inst_config.primary_node
1638
      _ErrorIf(pnode not in node_info and pnode not in n_offline,
1802
      pnode_img = node_image[pnode]
1803
      _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
1639 1804
               self.ENODERPC, pnode, "instance %s, connection to"
1640 1805
               " primary node failed", instance)
1641
      if pnode in node_info:
1642
        node_info[pnode]['pinst'].append(instance)
1643 1806

  
1644
      if pnode in n_offline:
1807
      if pnode_img.offline:
1645 1808
        inst_nodes_offline.append(pnode)
1646 1809

  
1647 1810
      # If the instance is non-redundant we cannot survive losing its primary
......
1649 1812
      # templates with more than one secondary so that situation is not well
1650 1813
      # supported either.
1651 1814
      # FIXME: does not support file-backed instances
1652
      if len(inst_config.secondary_nodes) == 0:
1815
      if not inst_config.secondary_nodes:
1653 1816
        i_non_redundant.append(instance)
1654
      _ErrorIf(len(inst_config.secondary_nodes) > 1,
1655
               self.EINSTANCELAYOUT, instance,
1656
               "instance has multiple secondary nodes", code="WARNING")
1817
      _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
1818
               instance, "instance has multiple secondary nodes: %s",
1819
               utils.CommaJoin(inst_config.secondary_nodes),
1820
               code=self.ETYPE_WARNING)
1657 1821

  
1658 1822
      if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
1659 1823
        i_non_a_balanced.append(instance)
1660 1824

  
1661 1825
      for snode in inst_config.secondary_nodes:
1662
        _ErrorIf(snode not in node_info and snode not in n_offline,
1663
                 self.ENODERPC, snode,
1664
                 "instance %s, connection to secondary node"
1665
                 " failed", instance)
1666

  
1667
        if snode in node_info:
1668
          node_info[snode]['sinst'].append(instance)
1669
          if pnode not in node_info[snode]['sinst-by-pnode']:
1670
            node_info[snode]['sinst-by-pnode'][pnode] = []
1671
          node_info[snode]['sinst-by-pnode'][pnode].append(instance)
1672

  
1673
        if snode in n_offline:
1826
        s_img = node_image[snode]
1827
        _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
1828
                 "instance %s, connection to secondary node failed", instance)
1829

  
1830
        if s_img.offline:
1674 1831
          inst_nodes_offline.append(snode)
1675 1832

  
1676 1833
      # warn that the instance lives on offline nodes
1677 1834
      _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
1678 1835
               "instance lives on offline node(s) %s",
1679 1836
               utils.CommaJoin(inst_nodes_offline))
1837
      # ... or ghost nodes
1838
      for node in inst_config.all_nodes:
1839
        _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
1840
                 "instance lives on ghost node %s", node)
1680 1841

  
1681 1842
    feedback_fn("* Verifying orphan volumes")
1682
    self._VerifyOrphanVolumes(node_vol_should, node_volume)
1843
    self._VerifyOrphanVolumes(node_vol_should, node_image)
1683 1844

  
1684
    feedback_fn("* Verifying remaining instances")
1685
    self._VerifyOrphanInstances(instancelist, node_instance)
1845
    feedback_fn("* Verifying oprhan instances")
1846
    self._VerifyOrphanInstances(instancelist, node_image)
1686 1847

  
1687 1848
    if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
1688 1849
      feedback_fn("* Verifying N+1 Memory redundancy")
1689
      self._VerifyNPlusOneMemory(node_info, instance_cfg)
1850
      self._VerifyNPlusOneMemory(node_image, instanceinfo)
1690 1851

  
1691 1852
    feedback_fn("* Other Notes")
1692 1853
    if i_non_redundant:
......
1698 1859
                  % len(i_non_a_balanced))
1699 1860

  
1700 1861
    if n_offline:
1701
      feedback_fn("  - NOTICE: %d offline node(s) found." % len(n_offline))
1862
      feedback_fn("  - NOTICE: %d offline node(s) found." % n_offline)
1702 1863

  
1703 1864
    if n_drained:
1704
      feedback_fn("  - NOTICE: %d drained node(s) found." % len(n_drained))
1865
      feedback_fn("  - NOTICE: %d drained node(s) found." % n_drained)
1705 1866

  
1706 1867
    return not self.bad
1707 1868

  

Also available in: Unified diff