Convert os api version file name to a constant

[ganeti-local] / lib / cmdlib.py
diff --git a/lib/cmdlib.py b/lib/cmdlib.py

index 864cfe9..c7e1ec1 100644 (file)
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -670,22 +670,33 @@ def _BuildInstanceHookEnvByObject(lu, instance, override=None):
    return _BuildInstanceHookEnv(**args)
  
  
-def _AdjustCandidatePool(lu):
+def _AdjustCandidatePool(lu, exceptions):
    """Adjust the candidate pool after node operations.
  
    """
-  mod_list = lu.cfg.MaintainCandidatePool()
+  mod_list = lu.cfg.MaintainCandidatePool(exceptions)
    if mod_list:
      lu.LogInfo("Promoted nodes to master candidate role: %s",
                 ", ".join(node.name for node in mod_list))
      for name in mod_list:
        lu.context.ReaddNode(name)
-  mc_now, mc_max = lu.cfg.GetMasterCandidateStats()
+  mc_now, mc_max, _ = lu.cfg.GetMasterCandidateStats(exceptions)
    if mc_now > mc_max:
      lu.LogInfo("Note: more nodes are candidates (%d) than desired (%d)" %
                 (mc_now, mc_max))
  
  
+def _DecideSelfPromotion(lu, exceptions=None):
+  """Decide whether I should promote myself as a master candidate.
+
+  """
+  cp_size = lu.cfg.GetClusterInfo().candidate_pool_size
+  mc_now, mc_should, _ = lu.cfg.GetMasterCandidateStats(exceptions)
+  # the new node will increase mc_should by one, so:
+  mc_should = min(mc_should + 1, cp_size)
+  return mc_now < mc_should
+
+
  def _CheckNicsBridgesExist(lu, target_nics, target_node,
                                 profile=constants.PP_DEFAULT):
    """Check that the brigdes needed by a list of nics exist.
@@ -858,7 +869,7 @@ class LUVerifyCluster(LogicalUnit):
    """
    HPATH = "cluster-verify"
    HTYPE = constants.HTYPE_CLUSTER
-  _OP_REQP = ["skip_checks", "verbose", "error_codes"]
+  _OP_REQP = ["skip_checks", "verbose", "error_codes", "debug_simulate_errors"]
    REQ_BGL = False
  
    TCLUSTER = "cluster"
@@ -885,6 +896,10 @@ class LUVerifyCluster(LogicalUnit):
    ENODESSH = (TNODE, "ENODESSH")
    ENODEVERSION = (TNODE, "ENODEVERSION")
  
+  ETYPE_FIELD = "code"
+  ETYPE_ERROR = "ERROR"
+  ETYPE_WARNING = "WARNING"
+
    def ExpandNames(self):
      self.needed_locks = {
        locking.LEVEL_NODE: locking.ALL_SET,
@@ -901,7 +916,7 @@ class LUVerifyCluster(LogicalUnit):
      This must be called only from Exec and functions called from Exec.
  
      """
-    ltype = kwargs.get("code", "ERROR")
+    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
      itype, etxt = ecode
      # first complete the msg
      if args:
@@ -918,6 +933,17 @@ class LUVerifyCluster(LogicalUnit):
      # and finally report it via the feedback_fn
      self._feedback_fn("  - %s" % msg)
  
+  def _ErrorIf(self, cond, *args, **kwargs):
+    """Log an error message if the passed condition is True.
+
+    """
+    cond = bool(cond) or self.op.debug_simulate_errors
+    if cond:
+      self._Error(*args, **kwargs)
+    # mark the operation as failed only for ERROR cases, not for warnings
+    if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
+      self.bad = self.bad or cond
+
    def _VerifyNode(self, nodeinfo, file_list, local_cksum,
                    node_result, master_files, drbd_map, vg_name):
      """Run multiple tests against a node.
@@ -942,138 +968,126 @@ class LUVerifyCluster(LogicalUnit):
  
      """
      node = nodeinfo.name
+    _ErrorIf = self._ErrorIf
  
      # main result, node_result should be a non-empty dict
-    if not node_result or not isinstance(node_result, dict):
-      self._Error(self.ENODERPC, node,
+    test = not node_result or not isinstance(node_result, dict)
+    _ErrorIf(test, self.ENODERPC, node,
                    "unable to verify node: no data returned")
-      return True
+    if test:
+      return
  
      # compares ganeti version
      local_version = constants.PROTOCOL_VERSION
      remote_version = node_result.get('version', None)
-    if not (remote_version and isinstance(remote_version, (list, tuple)) and
-            len(remote_version) == 2):
-      self._Error(self.ENODERPC, node,
-                  "connection to node returned invalid data")
-      return True
-
-    if local_version != remote_version[0]:
-      self._Error(self.ENODEVERSION, node,
-                  "incompatible protocol versions: master %s,"
-                  " node %s", local_version, remote_version[0])
-      return True
+    test = not (remote_version and
+                isinstance(remote_version, (list, tuple)) and
+                len(remote_version) == 2)
+    _ErrorIf(test, self.ENODERPC, node,
+             "connection to node returned invalid data")
+    if test:
+      return
  
-    # node seems compatible, we can actually try to look into its results
+    test = local_version != remote_version[0]
+    _ErrorIf(test, self.ENODEVERSION, node,
+             "incompatible protocol versions: master %s,"
+             " node %s", local_version, remote_version[0])
+    if test:
+      return
  
-    bad = False
+    # node seems compatible, we can actually try to look into its results
  
      # full package version
-    if constants.RELEASE_VERSION != remote_version[1]:
-      self._Error(self.ENODEVERSION, node,
+    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
+                  self.ENODEVERSION, node,
                    "software version mismatch: master %s, node %s",
                    constants.RELEASE_VERSION, remote_version[1],
-                  code="WARNING")
+                  code=self.ETYPE_WARNING)
  
      # checks vg existence and size > 20G
      if vg_name is not None:
        vglist = node_result.get(constants.NV_VGLIST, None)
-      if not vglist:
-        self._Error(self.ENODELVM, node, "unable to check volume groups")
-        bad = True
-      else:
+      test = not vglist
+      _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
+      if not test:
          vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                                constants.MIN_VG_SIZE)
-        if vgstatus:
-          self._Error(self.ENODELVM, self.TNODE, node, vgstatus)
-          bad = True
+        _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
  
      # checks config file checksum
  
      remote_cksum = node_result.get(constants.NV_FILELIST, None)
-    if not isinstance(remote_cksum, dict):
-      bad = True
-      self._Error(self.ENODEFILECHECK, node,
-                  "node hasn't returned file checksum data")
-    else:
+    test = not isinstance(remote_cksum, dict)
+    _ErrorIf(test, self.ENODEFILECHECK, node,
+             "node hasn't returned file checksum data")
+    if not test:
        for file_name in file_list:
          node_is_mc = nodeinfo.master_candidate
-        must_have_file = file_name not in master_files
-        if file_name not in remote_cksum:
-          if node_is_mc or must_have_file:
-            bad = True
-            self._Error(self.ENODEFILECHECK, node,
-                        "file '%s' missing", file_name)
-        elif remote_cksum[file_name] != local_cksum[file_name]:
-          if node_is_mc or must_have_file:
-            bad = True
-            self._Error(self.ENODEFILECHECK, node,
-                        "file '%s' has wrong checksum", file_name)
-          else:
-            # not candidate and this is not a must-have file
-            bad = True
-            self._Error(self.ENODEFILECHECK, node,
-                        "file '%s' should not exist on non master"
-                        " candidates (and the file is outdated)", file_name)
-        else:
-          # all good, except non-master/non-must have combination
-          if not node_is_mc and not must_have_file:
-            self._Error(self.ENODEFILECHECK, node, "file '%s' should not exist"
-                        " on non master candidates", file_name)
+        must_have = (file_name not in master_files) or node_is_mc
+        # missing
+        test1 = file_name not in remote_cksum
+        # invalid checksum
+        test2 = not test1 and remote_cksum[file_name] != local_cksum[file_name]
+        # existing and good
+        test3 = not test1 and remote_cksum[file_name] == local_cksum[file_name]
+        _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
+                 "file '%s' missing", file_name)
+        _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
+                 "file '%s' has wrong checksum", file_name)
+        # not candidate and this is not a must-have file
+        _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
+                 "file '%s' should not exist on non master"
+                 " candidates (and the file is outdated)", file_name)
+        # all good, except non-master/non-must have combination
+        _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
+                 "file '%s' should not exist"
+                 " on non master candidates", file_name)
  
      # checks ssh to any
  
-    if constants.NV_NODELIST not in node_result:
-      bad = True
-      self._Error(self.ENODESSH, node,
-                  "node hasn't returned node ssh connectivity data")
-    else:
+    test = constants.NV_NODELIST not in node_result
+    _ErrorIf(test, self.ENODESSH, node,
+             "node hasn't returned node ssh connectivity data")
+    if not test:
        if node_result[constants.NV_NODELIST]:
-        bad = True
          for a_node, a_msg in node_result[constants.NV_NODELIST].items():
-          self._Error(self.ENODESSH, node,
-                      "ssh communication with node '%s': %s", a_node, a_msg)
+          _ErrorIf(True, self.ENODESSH, node,
+                   "ssh communication with node '%s': %s", a_node, a_msg)
  
-    if constants.NV_NODENETTEST not in node_result:
-      bad = True
-      self._Error(self.ENODENET, node,
-                  "node hasn't returned node tcp connectivity data")
-    else:
+    test = constants.NV_NODENETTEST not in node_result
+    _ErrorIf(test, self.ENODENET, node,
+             "node hasn't returned node tcp connectivity data")
+    if not test:
        if node_result[constants.NV_NODENETTEST]:
-        bad = True
          nlist = utils.NiceSort(node_result[constants.NV_NODENETTEST].keys())
          for anode in nlist:
-          self._Error(self.ENODENET, node,
-                      "tcp communication with node '%s': %s",
-                      anode, node_result[constants.NV_NODENETTEST][anode])
+          _ErrorIf(True, self.ENODENET, node,
+                   "tcp communication with node '%s': %s",
+                   anode, node_result[constants.NV_NODENETTEST][anode])
  
      hyp_result = node_result.get(constants.NV_HYPERVISOR, None)
      if isinstance(hyp_result, dict):
        for hv_name, hv_result in hyp_result.iteritems():
-        if hv_result is not None:
-          self._Error(self.ENODEHV, node,
-                      "hypervisor %s verify failure: '%s'", hv_name, hv_result)
+        test = hv_result is not None
+        _ErrorIf(test, self.ENODEHV, node,
+                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
  
      # check used drbd list
      if vg_name is not None:
        used_minors = node_result.get(constants.NV_DRBDLIST, [])
-      if not isinstance(used_minors, (tuple, list)):
-        self._Error(self.ENODEDRBD, node,
-                    "cannot parse drbd status file: %s", str(used_minors))
-      else:
+      test = not isinstance(used_minors, (tuple, list))
+      _ErrorIf(test, self.ENODEDRBD, node,
+               "cannot parse drbd status file: %s", str(used_minors))
+      if not test:
          for minor, (iname, must_exist) in drbd_map.items():
-          if minor not in used_minors and must_exist:
-            self._Error(self.ENODEDRBD, node,
-                        "drbd minor %d of instance %s is not active",
-                        minor, iname)
-            bad = True
+          test = minor not in used_minors and must_exist
+          _ErrorIf(test, self.ENODEDRBD, node,
+                   "drbd minor %d of instance %s is not active",
+                   minor, iname)
          for minor in used_minors:
-          if minor not in drbd_map:
-            self._Error(self.ENODEDRBD, node,
-                        "unallocated drbd minor %d is in use", minor)
-            bad = True
-
-    return bad
+          test = minor not in drbd_map
+          _ErrorIf(test, self.ENODEDRBD, node,
+                   "unallocated drbd minor %d is in use", minor)
  
    def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
                        node_instance, n_offline):
@@ -1083,8 +1097,7 @@ class LUVerifyCluster(LogicalUnit):
      available on the instance's node.
  
      """
-    bad = False
-
+    _ErrorIf = self._ErrorIf
      node_current = instanceconfig.primary_node
  
      node_vol_should = {}
@@ -1095,28 +1108,23 @@ class LUVerifyCluster(LogicalUnit):
          # ignore missing volumes on offline nodes
          continue
        for volume in node_vol_should[node]:
-        if node not in node_vol_is or volume not in node_vol_is[node]:
-          self._Error(self.EINSTANCEMISSINGDISK, instance,
-                      "volume %s missing on node %s", volume, node)
-          bad = True
+        test = node not in node_vol_is or volume not in node_vol_is[node]
+        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
+                 "volume %s missing on node %s", volume, node)
  
      if instanceconfig.admin_up:
-      if ((node_current not in node_instance or
-          not instance in node_instance[node_current]) and
-          node_current not in n_offline):
-        self._Error(self.EINSTANCEDOWN, instance,
-                    "instance not running on its primary node %s",
-                    node_current)
-        bad = True
+      test = ((node_current not in node_instance or
+               not instance in node_instance[node_current]) and
+              node_current not in n_offline)
+      _ErrorIf(test, self.EINSTANCEDOWN, instance,
+               "instance not running on its primary node %s",
+               node_current)
  
      for node in node_instance:
        if (not node == node_current):
-        if instance in node_instance[node]:
-          self._Error(self.EINSTANCEWRONGNODE, instance,
-                      "instance should not run on node %s", node)
-          bad = True
-
-    return bad
+        test = instance in node_instance[node]
+        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
+                 "instance should not run on node %s", node)
  
    def _VerifyOrphanVolumes(self, node_vol_should, node_vol_is):
      """Verify if there are any unknown volumes in the cluster.
@@ -1125,15 +1133,12 @@ class LUVerifyCluster(LogicalUnit):
      reported as unknown.
  
      """
-    bad = False
-
      for node in node_vol_is:
        for volume in node_vol_is[node]:
-        if node not in node_vol_should or volume not in node_vol_should[node]:
-          self._Error(self.ENODEORPHANLV, node,
+        test = (node not in node_vol_should or
+                volume not in node_vol_should[node])
+        self._ErrorIf(test, self.ENODEORPHANLV, node,
                        "volume %s is unknown", volume)
-          bad = True
-    return bad
  
    def _VerifyOrphanInstances(self, instancelist, node_instance):
      """Verify the list of running instances.
@@ -1141,14 +1146,11 @@ class LUVerifyCluster(LogicalUnit):
      This checks what instances are running but unknown to the cluster.
  
      """
-    bad = False
      for node in node_instance:
        for o_inst in node_instance[node]:
-        if o_inst not in instancelist:
-          self._Error(self.ENODEORPHANINSTANCE, node,
+        test = o_inst not in instancelist
+        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
                        "instance %s on node %s should not exist", o_inst, node)
-          bad = True
-    return bad
  
    def _VerifyNPlusOneMemory(self, node_info, instance_cfg):
      """Verify N+1 Memory Resilience.
@@ -1157,8 +1159,6 @@ class LUVerifyCluster(LogicalUnit):
      was primary for.
  
      """
-    bad = False
-
      for node, nodeinfo in node_info.iteritems():
        # This code checks that every node which is now listed as secondary has
        # enough memory to host all instances it is supposed to should a single
@@ -1174,12 +1174,10 @@ class LUVerifyCluster(LogicalUnit):
            bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
            if bep[constants.BE_AUTO_BALANCE]:
              needed_mem += bep[constants.BE_MEMORY]
-        if nodeinfo['mfree'] < needed_mem:
-          self._Error(self.ENODEN1, node,
+        test = nodeinfo['mfree'] < needed_mem
+        self._ErrorIf(test, self.ENODEN1, node,
                        "not enough memory on to accommodate"
                        " failovers should peer node %s fail", prinode)
-          bad = True
-    return bad
  
    def CheckPrereq(self):
      """Check prerequisites.
@@ -1212,12 +1210,13 @@ class LUVerifyCluster(LogicalUnit):
      """Verify integrity of cluster, performing various test on nodes.
  
      """
-    bad = False
+    self.bad = False
+    _ErrorIf = self._ErrorIf
      verbose = self.op.verbose
      self._feedback_fn = feedback_fn
      feedback_fn("* Verifying global settings")
      for msg in self.cfg.VerifyConfig():
-      self._Error(self.ECLUSTERCFG, None, msg)
+      _ErrorIf(True, self.ECLUSTERCFG, None, msg)
  
      vg_name = self.cfg.GetVGName()
      hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
@@ -1293,57 +1292,55 @@ class LUVerifyCluster(LogicalUnit):
          feedback_fn("* Verifying node %s (%s)" % (node, ntype))
  
        msg = all_nvinfo[node].fail_msg
+      _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
        if msg:
-        self._Error(self.ENODERPC, node, "while contacting node: %s", msg)
-        bad = True
          continue
  
        nresult = all_nvinfo[node].payload
        node_drbd = {}
        for minor, instance in all_drbd_map[node].items():
-        if instance not in instanceinfo:
-          self._Error(self.ECLUSTERCFG, None,
-                      "ghost instance '%s' in temporary DRBD map", instance)
+        test = instance not in instanceinfo
+        _ErrorIf(test, self.ECLUSTERCFG, None,
+                 "ghost instance '%s' in temporary DRBD map", instance)
            # ghost instance should not be running, but otherwise we
            # don't give double warnings (both ghost instance and
            # unallocated minor in use)
+        if test:
            node_drbd[minor] = (instance, False)
          else:
            instance = instanceinfo[instance]
            node_drbd[minor] = (instance.name, instance.admin_up)
-      result = self._VerifyNode(node_i, file_names, local_checksums,
-                                nresult, master_files, node_drbd, vg_name)
-      bad = bad or result
+      self._VerifyNode(node_i, file_names, local_checksums,
+                       nresult, master_files, node_drbd, vg_name)
  
        lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
        if vg_name is None:
          node_volume[node] = {}
        elif isinstance(lvdata, basestring):
-        self._Error(self.ENODELVM, node, "LVM problem on node: %s",
-                    utils.SafeEncode(lvdata))
-        bad = True
+        _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
+                 utils.SafeEncode(lvdata))
          node_volume[node] = {}
        elif not isinstance(lvdata, dict):
-        self._Error(self.ENODELVM, node, "rpc call to node failed (lvlist)")
-        bad = True
+        _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
          continue
        else:
          node_volume[node] = lvdata
  
        # node_instance
        idata = nresult.get(constants.NV_INSTANCELIST, None)
-      if not isinstance(idata, list):
-        self._Error(self.ENODEHV, "rpc call to node failed (instancelist)")
-        bad = True
+      test = not isinstance(idata, list)
+      _ErrorIf(test, self.ENODEHV, node,
+               "rpc call to node failed (instancelist)")
+      if test:
          continue
  
        node_instance[node] = idata
  
        # node_info
        nodeinfo = nresult.get(constants.NV_HVINFO, None)
-      if not isinstance(nodeinfo, dict):
-        self._Error(self.ENODEHV, node, "rpc call to node failed (hvinfo)")
-        bad = True
+      test = not isinstance(nodeinfo, dict)
+      _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
+      if test:
          continue
  
        try:
@@ -1361,18 +1358,17 @@ class LUVerifyCluster(LogicalUnit):
          }
          # FIXME: devise a free space model for file based instances as well
          if vg_name is not None:
-          if (constants.NV_VGLIST not in nresult or
-              vg_name not in nresult[constants.NV_VGLIST]):
-            self._Error(self.ENODELVM, node,
-                        "node didn't return data for the volume group '%s'"
-                        " - it is either missing or broken", vg_name)
-            bad = True
+          test = (constants.NV_VGLIST not in nresult or
+                  vg_name not in nresult[constants.NV_VGLIST])
+          _ErrorIf(test, self.ENODELVM, node,
+                   "node didn't return data for the volume group '%s'"
+                   " - it is either missing or broken", vg_name)
+          if test:
              continue
            node_info[node]["dfree"] = int(nresult[constants.NV_VGLIST][vg_name])
        except (ValueError, KeyError):
-        self._Error(self.ENODERPC, node,
-                    "node returned invalid nodeinfo, check lvm/hypervisor")
-        bad = True
+        _ErrorIf(True, self.ENODERPC, node,
+                 "node returned invalid nodeinfo, check lvm/hypervisor")
          continue
  
      node_vol_should = {}
@@ -1382,9 +1378,8 @@ class LUVerifyCluster(LogicalUnit):
        if verbose:
          feedback_fn("* Verifying instance %s" % instance)
        inst_config = instanceinfo[instance]
-      result =  self._VerifyInstance(instance, inst_config, node_volume,
-                                     node_instance, n_offline)
-      bad = bad or result
+      self._VerifyInstance(instance, inst_config, node_volume,
+                           node_instance, n_offline)
        inst_nodes_offline = []
  
        inst_config.MapLVsByNode(node_vol_should)
@@ -1392,12 +1387,11 @@ class LUVerifyCluster(LogicalUnit):
        instance_cfg[instance] = inst_config
  
        pnode = inst_config.primary_node
+      _ErrorIf(pnode not in node_info and pnode not in n_offline,
+               self.ENODERPC, pnode, "instance %s, connection to"
+               " primary node failed", instance)
        if pnode in node_info:
          node_info[pnode]['pinst'].append(instance)
-      elif pnode not in n_offline:
-        self._Error(self.ENODERPC, pnode, "instance %s, connection to"
-                    " primary node failed", instance)
-        bad = True
  
        if pnode in n_offline:
          inst_nodes_offline.append(pnode)
@@ -1409,46 +1403,42 @@ class LUVerifyCluster(LogicalUnit):
        # FIXME: does not support file-backed instances
        if len(inst_config.secondary_nodes) == 0:
          i_non_redundant.append(instance)
-      elif len(inst_config.secondary_nodes) > 1:
-        self._Error(self.EINSTANCELAYOUT, instance,
-                    "instance has multiple secondary nodes", code="WARNING")
+      _ErrorIf(len(inst_config.secondary_nodes) > 1,
+               self.EINSTANCELAYOUT, instance,
+               "instance has multiple secondary nodes", code="WARNING")
  
        if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
          i_non_a_balanced.append(instance)
  
        for snode in inst_config.secondary_nodes:
+        _ErrorIf(snode not in node_info and snode not in n_offline,
+                 self.ENODERPC, snode,
+                 "instance %s, connection to secondary node"
+                 "failed", instance)
+
          if snode in node_info:
            node_info[snode]['sinst'].append(instance)
            if pnode not in node_info[snode]['sinst-by-pnode']:
              node_info[snode]['sinst-by-pnode'][pnode] = []
            node_info[snode]['sinst-by-pnode'][pnode].append(instance)
-        elif snode not in n_offline:
-          self._Error(self.ENODERPC, snode,
-                      "instance %s, connection to secondary node"
-                      "failed", instance)
-          bad = True
+
          if snode in n_offline:
            inst_nodes_offline.append(snode)
  
-      if inst_nodes_offline:
-        # warn that the instance lives on offline nodes, and set bad=True
-        self._Error(self.EINSTANCEBADNODE, instance,
-                    "instance lives on offline node(s) %s",
-                    ", ".join(inst_nodes_offline))
-        bad = True
+      # report (as an error) that the instance lives on offline nodes
+      _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
+               "instance lives on offline node(s) %s",
+               ", ".join(inst_nodes_offline))
  
      feedback_fn("* Verifying orphan volumes")
-    result = self._VerifyOrphanVolumes(node_vol_should, node_volume)
-    bad = bad or result
+    self._VerifyOrphanVolumes(node_vol_should, node_volume)
  
      feedback_fn("* Verifying remaining instances")
-    result = self._VerifyOrphanInstances(instancelist, node_instance)
-    bad = bad or result
+    self._VerifyOrphanInstances(instancelist, node_instance)
  
      if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
        feedback_fn("* Verifying N+1 Memory redundancy")
-      result = self._VerifyNPlusOneMemory(node_info, instance_cfg)
-      bad = bad or result
+      self._VerifyNPlusOneMemory(node_info, instance_cfg)
  
      feedback_fn("* Other Notes")
      if i_non_redundant:
@@ -1465,7 +1455,7 @@ class LUVerifyCluster(LogicalUnit):
      if n_drained:
        feedback_fn("  - NOTICE: %d drained node(s) found." % len(n_drained))
  
-    return not bad
+    return not self.bad
  
    def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
      """Analyze the post-hooks' result
@@ -1494,18 +1484,19 @@ class LUVerifyCluster(LogicalUnit):
          show_node_header = True
          res = hooks_results[node_name]
          msg = res.fail_msg
-        if msg:
-          if res.offline:
-            # no need to warn or set fail return value
-            continue
-          self._Error(self.ENODEHOOKS, node_name,
+        test = msg and not res.offline
+        self._ErrorIf(test, self.ENODEHOOKS, node_name,
                        "Communication failure in hooks execution: %s", msg)
+        if test:
+          # manually override lu_result here, as _ErrorIf only
+          # updates self.bad
            lu_result = 1
            continue
          for script, hkr, output in res.payload:
-          if hkr == constants.HKR_FAIL:
-            self._Error(self.ENODEHOOKS, node_name,
+          test = hkr == constants.HKR_FAIL
+          self._ErrorIf(test, self.ENODEHOOKS, node_name,
                          "Script %s failed, output:", script)
+          if test:
              output = indent_re.sub('      ', output)
              feedback_fn("%s" % output)
              lu_result = 1
@@ -1614,7 +1605,6 @@ class LURepairDiskSizes(NoHooksLU):
          if full_name is None:
            raise errors.OpPrereqError("Instance '%s' not known" % name)
          self.wanted_names.append(full_name)
-      self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
        self.needed_locks = {
          locking.LEVEL_NODE: [],
          locking.LEVEL_INSTANCE: self.wanted_names,
@@ -1644,6 +1634,29 @@ class LURepairDiskSizes(NoHooksLU):
      self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
                               in self.wanted_names]
  
+  def _EnsureChildSizes(self, disk):
+    """Ensure children of the disk have the needed disk size.
+
+    This is valid mainly for DRBD8 and fixes an issue where the
+    children have smaller disk size.
+
+    @param disk: an L{ganeti.objects.Disk} object
+
+    """
+    if disk.dev_type == constants.LD_DRBD8:
+      assert disk.children, "Empty children for DRBD8?"
+      fchild = disk.children[0]
+      mismatch = fchild.size < disk.size
+      if mismatch:
+        self.LogInfo("Child disk has size %d, parent %d, fixing",
+                     fchild.size, disk.size)
+        fchild.size = disk.size
+
+      # and we recurse on this child only, not on the metadev
+      return self._EnsureChildSizes(fchild) or mismatch
+    else:
+      return False
+
    def Exec(self, feedback_fn):
      """Verify the size of cluster disks.
  
@@ -1660,8 +1673,11 @@ class LURepairDiskSizes(NoHooksLU):
  
      changed = []
      for node, dskl in per_node_disks.items():
-      result = self.rpc.call_blockdev_getsizes(node, [v[2] for v in dskl])
-      if result.RemoteFailMsg():
+      newl = [v[2].Copy() for v in dskl]
+      for dsk in newl:
+        self.cfg.SetDiskID(dsk, node)
+      result = self.rpc.call_blockdev_getsizes(node, newl)
+      if result.fail_msg:
          self.LogWarning("Failure in blockdev_getsizes call to node"
                          " %s, ignoring", node)
          continue
@@ -1686,6 +1702,9 @@ class LURepairDiskSizes(NoHooksLU):
            disk.size = size
            self.cfg.Update(instance)
            changed.append((instance.name, idx, size))
+        if self._EnsureChildSizes(disk):
+          self.cfg.Update(instance)
+          changed.append((instance.name, idx, disk.size))
      return changed
  
  
@@ -1938,7 +1957,7 @@ class LUSetClusterParams(LogicalUnit):
      if self.op.candidate_pool_size is not None:
        self.cluster.candidate_pool_size = self.op.candidate_pool_size
        # we need to update the pool size here, otherwise the save will fail
-      _AdjustCandidatePool(self)
+      _AdjustCandidatePool(self, [])
  
      self.cfg.Update(self.cluster)
  
@@ -2274,6 +2293,8 @@ class LURemoveNode(LogicalUnit):
      logging.info("Stopping the node daemon and removing configs from node %s",
                   node.name)
  
+    # Promote nodes to master candidate as needed
+    _AdjustCandidatePool(self, exceptions=[node.name])
      self.context.RemoveNode(node.name)
  
      # Run post hooks on the node before it's removed
@@ -2289,9 +2310,6 @@ class LURemoveNode(LogicalUnit):
        self.LogWarning("Errors encountered on the remote node while leaving"
                        " the cluster: %s", msg)
  
-    # Promote nodes to master candidate as needed
-    _AdjustCandidatePool(self)
-
  
  class LUQueryNodes(NoHooksLU):
    """Logical unit for querying nodes.
@@ -2299,6 +2317,10 @@ class LUQueryNodes(NoHooksLU):
    """
    _OP_REQP = ["output_fields", "names", "use_locking"]
    REQ_BGL = False
+
+  _SIMPLE_FIELDS = ["name", "serial_no", "ctime", "mtime", "uuid",
+                    "master_candidate", "offline", "drained"]
+
    _FIELDS_DYNAMIC = utils.FieldSet(
      "dtotal", "dfree",
      "mtotal", "mnode", "mfree",
@@ -2306,16 +2328,12 @@ class LUQueryNodes(NoHooksLU):
      "ctotal", "cnodes", "csockets",
      )
  
-  _FIELDS_STATIC = utils.FieldSet(
-    "name", "pinst_cnt", "sinst_cnt",
+  _FIELDS_STATIC = utils.FieldSet(*[
+    "pinst_cnt", "sinst_cnt",
      "pinst_list", "sinst_list",
      "pip", "sip", "tags",
-    "serial_no", "ctime", "mtime",
-    "master_candidate",
      "master",
-    "offline",
-    "drained",
-    "role",
+    "role"] + _SIMPLE_FIELDS
      )
  
    def ExpandNames(self):
@@ -2416,8 +2434,8 @@ class LUQueryNodes(NoHooksLU):
      for node in nodelist:
        node_output = []
        for field in self.op.output_fields:
-        if field == "name":
-          val = node.name
+        if field in self._SIMPLE_FIELDS:
+          val = getattr(node, field)
          elif field == "pinst_list":
            val = list(node_to_primary[node.name])
          elif field == "sinst_list":
@@ -2432,20 +2450,8 @@ class LUQueryNodes(NoHooksLU):
            val = node.secondary_ip
          elif field == "tags":
            val = list(node.GetTags())
-        elif field == "serial_no":
-          val = node.serial_no
-        elif field == "ctime":
-          val = node.ctime
-        elif field == "mtime":
-          val = node.mtime
-        elif field == "master_candidate":
-          val = node.master_candidate
          elif field == "master":
            val = node.name == master_node
-        elif field == "offline":
-          val = node.offline
-        elif field == "drained":
-          val = node.drained
          elif self._FIELDS_DYNAMIC.Matches(field):
            val = live_data[node.name].get(field, None)
          elif field == "role":
@@ -2798,15 +2804,12 @@ class LUAddNode(LogicalUnit):
          raise errors.OpPrereqError("Node secondary ip not reachable by TCP"
                                     " based ping to noded port")
  
-    cp_size = self.cfg.GetClusterInfo().candidate_pool_size
      if self.op.readd:
        exceptions = [node]
      else:
        exceptions = []
-    mc_now, mc_max = self.cfg.GetMasterCandidateStats(exceptions)
-    # the new node will increase mc_max with one, so:
-    mc_max = min(mc_max + 1, cp_size)
-    self.master_candidate = mc_now < mc_max
+
+    self.master_candidate = _DecideSelfPromotion(self, exceptions=exceptions)
  
      if self.op.readd:
        self.new_node = self.cfg.GetNodeInfo(node)
@@ -2893,7 +2896,8 @@ class LUAddNode(LogicalUnit):
        nl_payload = result[verifier].payload[constants.NV_NODELIST]
        if nl_payload:
          for failed in nl_payload:
-          feedback_fn("ssh/hostname verification failed %s -> %s" %
+          feedback_fn("ssh/hostname verification failed"
+                      " (checking from %s): %s" %
                        (verifier, nl_payload[failed]))
          raise errors.OpExecError("ssh/hostname verification failed.")
  
@@ -2905,7 +2909,7 @@ class LUAddNode(LogicalUnit):
        # and make sure the new node will not have old files around
        if not new_node.master_candidate:
          result = self.rpc.call_node_demote_from_mc(new_node.name)
-        msg = result.RemoteFailMsg()
+        msg = result.fail_msg
          if msg:
            self.LogWarning("Node failed to demote itself from master"
                            " candidate status: %s" % msg)
@@ -2973,14 +2977,22 @@ class LUSetNodeParams(LogicalUnit):
          raise errors.OpPrereqError("The master role can be changed"
                                     " only via masterfailover")
  
-    if ((self.op.master_candidate == False or self.op.offline == True or
-         self.op.drained == True) and node.master_candidate):
+    # Flags telling whether we're (de-)offlining or (de-)draining the node
+    offline_or_drain = self.op.offline == True or self.op.drained == True
+    deoffline_or_drain = self.op.offline == False or self.op.drained == False
+
+    if (node.master_candidate and
+        (self.op.master_candidate == False or offline_or_drain)):
        cp_size = self.cfg.GetClusterInfo().candidate_pool_size
-      num_candidates, _ = self.cfg.GetMasterCandidateStats()
-      if num_candidates <= cp_size:
+      mc_now, mc_should, mc_max = self.cfg.GetMasterCandidateStats()
+      if mc_now <= cp_size:
          msg = ("Not enough master candidates (desired"
-               " %d, new value will be %d)" % (cp_size, num_candidates-1))
-        if self.op.force:
+               " %d, new value will be %d)" % (cp_size, mc_now-1))
+        # Only allow forcing the operation if it's an offline/drain operation,
+        # and we could not possibly promote more nodes.
+        # FIXME: this can still lead to issues if in any way another node which
+        # could be promoted appears in the meantime.
+        if self.op.force and offline_or_drain and mc_should == mc_max:
            self.LogWarning(msg)
          else:
            raise errors.OpPrereqError(msg)
@@ -2991,6 +3003,13 @@ class LUSetNodeParams(LogicalUnit):
        raise errors.OpPrereqError("Node '%s' is offline or drained, can't set"
                                   " to master_candidate" % node.name)
  
+    # If the node is being de-offlined or undrained, promote it to MC if needed
+    if (deoffline_or_drain and not offline_or_drain and not
+        self.op.master_candidate == True):
+      self.op.master_candidate = _DecideSelfPromotion(self)
+      if self.op.master_candidate:
+        self.LogInfo("Autopromoting node to master candidate")
+
      return
  
    def Exec(self, feedback_fn):
@@ -3033,7 +3052,7 @@ class LUSetNodeParams(LogicalUnit):
            changed_mc = True
            result.append(("master_candidate", "auto-demotion due to drain"))
            rrc = self.rpc.call_node_demote_from_mc(node.name)
-          msg = rrc.RemoteFailMsg()
+          msg = rrc.fail_msg
            if msg:
              self.LogWarning("Node failed to demote itself: %s" % msg)
          if node.offline:
@@ -3134,6 +3153,7 @@ class LUQueryClusterInfo(NoHooksLU):
        "file_storage_dir": cluster.file_storage_dir,
        "ctime": cluster.ctime,
        "mtime": cluster.mtime,
+      "uuid": cluster.uuid,
        "tags": list(cluster.GetTags()),
        }
  
@@ -4003,6 +4023,8 @@ class LUQueryInstances(NoHooksLU):
    """
    _OP_REQP = ["output_fields", "names", "use_locking"]
    REQ_BGL = False
+  _SIMPLE_FIELDS = ["name", "os", "network_port", "hypervisor",
+                    "serial_no", "ctime", "mtime", "uuid"]
    _FIELDS_STATIC = utils.FieldSet(*["name", "os", "pnode", "snodes",
                                      "admin_state",
                                      "disk_template", "ip", "mac", "bridge",
@@ -4015,9 +4037,8 @@ class LUQueryInstances(NoHooksLU):
                                      r"(nic)\.(bridge)/([0-9]+)",
                                      r"(nic)\.(macs|ips|modes|links|bridges)",
                                      r"(disk|nic)\.(count)",
-                                    "serial_no", "hypervisor", "hvparams",
-                                    "ctime", "mtime",
-                                    ] +
+                                    "hvparams",
+                                    ] + _SIMPLE_FIELDS +
                                    ["hv/%s" % name
                                     for name in constants.HVS_PARAMETERS] +
                                    ["be/%s" % name
@@ -4097,7 +4118,7 @@ class LUQueryInstances(NoHooksLU):
          if result.offline:
            # offline nodes will be in both lists
            off_nodes.append(name)
-        if result.RemoteFailMsg():
+        if result.fail_msg:
            bad_nodes.append(name)
          else:
            if result.payload:
@@ -4120,10 +4141,8 @@ class LUQueryInstances(NoHooksLU):
                                   nic.nicparams) for nic in instance.nics]
        for field in self.op.output_fields:
          st_match = self._FIELDS_STATIC.Matches(field)
-        if field == "name":
-          val = instance.name
-        elif field == "os":
-          val = instance.os
+        if field in self._SIMPLE_FIELDS:
+          val = getattr(instance, field)
          elif field == "pnode":
            val = instance.primary_node
          elif field == "snodes":
@@ -4200,16 +4219,6 @@ class LUQueryInstances(NoHooksLU):
            val = _ComputeDiskSize(instance.disk_template, disk_sizes)
          elif field == "tags":
            val = list(instance.GetTags())
-        elif field == "serial_no":
-          val = instance.serial_no
-        elif field == "ctime":
-          val = instance.ctime
-        elif field == "mtime":
-          val = instance.mtime
-        elif field == "network_port":
-          val = instance.network_port
-        elif field == "hypervisor":
-          val = instance.hypervisor
          elif field == "hvparams":
            val = i_hv
          elif (field.startswith(HVPREFIX) and
@@ -5227,7 +5236,7 @@ def _CreateDisks(lu, instance, to_skip=None, target_node=None):
      result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
  
      result.Raise("Failed to create directory '%s' on"
-                 " node %s: %s" % (file_storage_dir, pnode))
+                 " node %s" % (file_storage_dir, pnode))
  
    # Note: this needs to be kept in sync with adding of disks in
    # LUSetInstanceParams
@@ -5278,10 +5287,10 @@ def _RemoveDisks(lu, instance, target_node=None):
  
    if instance.disk_template == constants.DT_FILE:
      file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
-    if target_node is node:
-      tgt = instance.primary_node
+    if target_node:
+      tgt = target_node
      else:
-      tgt = instance.target_node
+      tgt = instance.primary_node
      result = lu.rpc.call_file_storage_dir_remove(tgt, file_storage_dir)
      if result.fail_msg:
        lu.LogWarning("Could not remove directory '%s' on node %s: %s",
@@ -7050,6 +7059,7 @@ class LUQueryInstanceData(NoHooksLU):
          "serial_no": instance.serial_no,
          "mtime": instance.mtime,
          "ctime": instance.ctime,
+        "uuid": instance.uuid,
          }
  
        result[instance.name] = idict