Add use_external_mip_script cluster parameter
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 4f33e5f..711a79d 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -26,7 +26,7 @@
 # W0201 since most LU attributes are defined in CheckPrereq or similar
 # functions
 
-# C0302: since we have waaaay to many lines in this module
+# C0302: since we have waaaay too many lines in this module
 
 import os
 import os.path
@@ -59,10 +59,15 @@ from ganeti import query
 from ganeti import qlang
 from ganeti import opcodes
 from ganeti import ht
+from ganeti import rpc
 
 import ganeti.masterd.instance # pylint: disable=W0611
 
 
+#: Size of DRBD meta block device
+DRBD_META_SIZE = 128
+
+
 class ResultWithJobs:
   """Data container for LU results with jobs.
 
@@ -108,7 +113,7 @@ class LogicalUnit(object):
   HTYPE = None
   REQ_BGL = True
 
-  def __init__(self, processor, op, context, rpc):
+  def __init__(self, processor, op, context, rpc_runner):
     """Constructor for LogicalUnit.
 
     This needs to be overridden in derived classes in order to check op
@@ -122,7 +127,7 @@ class LogicalUnit(object):
     # readability alias
     self.owned_locks = context.glm.list_owned
     self.context = context
-    self.rpc = rpc
+    self.rpc = rpc_runner
     # Dicts used to declare locking needs to mcpu
     self.needed_locks = None
     self.share_locks = dict.fromkeys(locking.LEVELS, 0)
@@ -344,7 +349,8 @@ class LogicalUnit(object):
                                                 self.op.instance_name)
     self.needed_locks[locking.LEVEL_INSTANCE] = self.op.instance_name
 
-  def _LockInstancesNodes(self, primary_only=False):
+  def _LockInstancesNodes(self, primary_only=False,
+                          level=locking.LEVEL_NODE):
     """Helper function to declare instances' nodes for locking.
 
     This function should be called after locking one or more instances to lock
@@ -365,9 +371,10 @@ class LogicalUnit(object):
 
     @type primary_only: boolean
     @param primary_only: only lock primary nodes of locked instances
+    @param level: Which lock level to use for locking nodes
 
     """
-    assert locking.LEVEL_NODE in self.recalculate_locks, \
+    assert level in self.recalculate_locks, \
       "_LockInstancesNodes helper function called with no nodes to recalculate"
 
     # TODO: check if we're really been called with the instance locks held
@@ -382,12 +389,14 @@ class LogicalUnit(object):
       if not primary_only:
         wanted_nodes.extend(instance.secondary_nodes)
 
-    if self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_REPLACE:
-      self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
-    elif self.recalculate_locks[locking.LEVEL_NODE] == constants.LOCKS_APPEND:
-      self.needed_locks[locking.LEVEL_NODE].extend(wanted_nodes)
+    if self.recalculate_locks[level] == constants.LOCKS_REPLACE:
+      self.needed_locks[level] = wanted_nodes
+    elif self.recalculate_locks[level] == constants.LOCKS_APPEND:
+      self.needed_locks[level].extend(wanted_nodes)
+    else:
+      raise errors.ProgrammerError("Unknown recalculation mode")
 
-    del self.recalculate_locks[locking.LEVEL_NODE]
+    del self.recalculate_locks[level]
 
 
 class NoHooksLU(LogicalUnit): # pylint: disable=W0223
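
The level parameter added to _LockInstancesNodes above makes the REPLACE/APPEND recalculation usable for lock levels other than LEVEL_NODE. A standalone sketch of that recalculation, using string placeholders instead of the real locking constants (LEVEL_NODE_RES here is purely illustrative):

    # Minimal illustration of level-aware lock recalculation (placeholder values).
    LOCKS_REPLACE = "replace"
    LOCKS_APPEND = "append"
    LEVEL_NODE = "node"
    LEVEL_NODE_RES = "node-res"   # hypothetical extra level for the example

    needed_locks = {LEVEL_NODE: ["node1"], LEVEL_NODE_RES: ["node1"]}
    recalculate_locks = {LEVEL_NODE: LOCKS_REPLACE, LEVEL_NODE_RES: LOCKS_APPEND}
    wanted_nodes = ["node2", "node3"]

    def recalc(level):
        """Apply the requested recalculation mode for one lock level."""
        if recalculate_locks[level] == LOCKS_REPLACE:
            needed_locks[level] = wanted_nodes
        elif recalculate_locks[level] == LOCKS_APPEND:
            needed_locks[level].extend(wanted_nodes)
        else:
            raise ValueError("Unknown recalculation mode")
        del recalculate_locks[level]

    recalc(LEVEL_NODE)      # replaces the list: ['node2', 'node3']
    recalc(LEVEL_NODE_RES)  # appends to it: ['node1', 'node2', 'node3']
    print(needed_locks)
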
@@ -468,13 +477,13 @@ class _QueryBase:
   #: Attribute holding field definitions
   FIELDS = None
 
-  def __init__(self, filter_, fields, use_locking):
+  def __init__(self, qfilter, fields, use_locking):
     """Initializes this class.
 
     """
     self.use_locking = use_locking
 
-    self.query = query.Query(self.FIELDS, fields, filter_=filter_,
+    self.query = query.Query(self.FIELDS, fields, qfilter=qfilter,
                              namefield="name")
     self.requested_data = self.query.RequestedData()
     self.names = self.query.RequestedNames()
@@ -753,7 +762,7 @@ def _RunPostHook(lu, node_name):
   """Runs the post-hook for an opcode on a single node.
 
   """
-  hm = lu.proc.hmclass(lu.rpc.call_hooks_runner, lu)
+  hm = lu.proc.BuildHooksManager(lu)
   try:
     hm.RunPhase(constants.HOOKS_PHASE_POST, nodes=[node_name])
   except:
@@ -1204,13 +1213,13 @@ def _GetStorageTypeArgs(cfg, storage_type):
   return []
 
 
-def _FindFaultyInstanceDisks(cfg, rpc, instance, node_name, prereq):
+def _FindFaultyInstanceDisks(cfg, rpc_runner, instance, node_name, prereq):
   faulty = []
 
   for dev in instance.disks:
     cfg.SetDiskID(dev, node_name)
 
-  result = rpc.call_blockdev_getmirrorstatus(node_name, instance.disks)
+  result = rpc_runner.call_blockdev_getmirrorstatus(node_name, instance.disks)
   result.Raise("Failed to get disk status from node %s" % node_name,
                prereq=prereq, ecode=errors.ECODE_ENVIRON)
 
@@ -1350,15 +1359,16 @@ class LUClusterDestroy(LogicalUnit):
     """Destroys the cluster.
 
     """
-    master = self.cfg.GetMasterNode()
+    master_params = self.cfg.GetMasterNetworkParameters()
 
     # Run post hooks on master node before it's removed
-    _RunPostHook(self, master)
+    _RunPostHook(self, master_params.name)
 
-    result = self.rpc.call_node_stop_master(master, False)
+    result = self.rpc.call_node_deactivate_master_ip(master_params.name,
+                                                     master_params)
     result.Raise("Could not disable the master role")
 
-    return master
+    return master_params.name
 
 
 def _VerifyCertificate(filename):
@@ -1433,39 +1443,6 @@ class _VerifyErrors(object):
   self.op and self._feedback_fn to be available.)
 
   """
-  TCLUSTER = "cluster"
-  TNODE = "node"
-  TINSTANCE = "instance"
-
-  ECLUSTERCFG = (TCLUSTER, "ECLUSTERCFG")
-  ECLUSTERCERT = (TCLUSTER, "ECLUSTERCERT")
-  ECLUSTERFILECHECK = (TCLUSTER, "ECLUSTERFILECHECK")
-  ECLUSTERDANGLINGNODES = (TNODE, "ECLUSTERDANGLINGNODES")
-  ECLUSTERDANGLINGINST = (TNODE, "ECLUSTERDANGLINGINST")
-  EINSTANCEBADNODE = (TINSTANCE, "EINSTANCEBADNODE")
-  EINSTANCEDOWN = (TINSTANCE, "EINSTANCEDOWN")
-  EINSTANCELAYOUT = (TINSTANCE, "EINSTANCELAYOUT")
-  EINSTANCEMISSINGDISK = (TINSTANCE, "EINSTANCEMISSINGDISK")
-  EINSTANCEFAULTYDISK = (TINSTANCE, "EINSTANCEFAULTYDISK")
-  EINSTANCEWRONGNODE = (TINSTANCE, "EINSTANCEWRONGNODE")
-  EINSTANCESPLITGROUPS = (TINSTANCE, "EINSTANCESPLITGROUPS")
-  ENODEDRBD = (TNODE, "ENODEDRBD")
-  ENODEDRBDHELPER = (TNODE, "ENODEDRBDHELPER")
-  ENODEFILECHECK = (TNODE, "ENODEFILECHECK")
-  ENODEHOOKS = (TNODE, "ENODEHOOKS")
-  ENODEHV = (TNODE, "ENODEHV")
-  ENODELVM = (TNODE, "ENODELVM")
-  ENODEN1 = (TNODE, "ENODEN1")
-  ENODENET = (TNODE, "ENODENET")
-  ENODEOS = (TNODE, "ENODEOS")
-  ENODEORPHANINSTANCE = (TNODE, "ENODEORPHANINSTANCE")
-  ENODEORPHANLV = (TNODE, "ENODEORPHANLV")
-  ENODERPC = (TNODE, "ENODERPC")
-  ENODESSH = (TNODE, "ENODESSH")
-  ENODEVERSION = (TNODE, "ENODEVERSION")
-  ENODESETUP = (TNODE, "ENODESETUP")
-  ENODETIME = (TNODE, "ENODETIME")
-  ENODEOOBPATH = (TNODE, "ENODEOOBPATH")
 
   ETYPE_FIELD = "code"
   ETYPE_ERROR = "ERROR"
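
The per-class error codes removed above move into constants.py as CV_* values and gain a third element (a human-readable description), which is why _Error below unpacks three fields. A hedged sketch of the new shape (the description text is illustrative, not copied from constants.py):

    # Expected layout of the relocated error codes (description is illustrative).
    CV_TCLUSTER = "cluster"
    CV_TNODE = "node"
    CV_TINSTANCE = "instance"

    CV_ECLUSTERCFG = (CV_TCLUSTER, "ECLUSTERCFG",
                      "Cluster configuration verification failure")

    # cmdlib only needs the first two fields and ignores the description:
    (itype, etxt, _) = CV_ECLUSTERCFG
    assert (itype, etxt) == ("cluster", "ECLUSTERCFG")
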
@@ -1481,7 +1458,7 @@ class _VerifyErrors(object):
 
     """
     ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
-    itype, etxt = ecode
+    itype, etxt, _ = ecode
     # first complete the msg
     if args:
       msg = msg % args
@@ -1497,14 +1474,22 @@ class _VerifyErrors(object):
     # and finally report it via the feedback_fn
     self._feedback_fn("  - %s" % msg) # Mix-in. pylint: disable=E1101
 
-  def _ErrorIf(self, cond, *args, **kwargs):
+  def _ErrorIf(self, cond, ecode, *args, **kwargs):
     """Log an error message if the passed condition is True.
 
     """
     cond = (bool(cond)
             or self.op.debug_simulate_errors) # pylint: disable=E1101
+
+    # If the error code is in the list of ignored errors, demote the error to a
+    # warning
+    (_, etxt, _) = ecode
+    if etxt in self.op.ignore_errors:     # pylint: disable=E1101
+      kwargs[self.ETYPE_FIELD] = self.ETYPE_WARNING
+
     if cond:
-      self._Error(*args, **kwargs)
+      self._Error(ecode, *args, **kwargs)
+
     # do not mark the operation as failed for WARN cases only
     if kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR) == self.ETYPE_ERROR:
       self.bad = self.bad or cond
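
The demotion logic in _ErrorIf is small enough to exercise in isolation. A minimal standalone sketch of the same idea, outside the LU mix-in (all names here are invented for illustration):

    ETYPE_ERROR = "ERROR"
    ETYPE_WARNING = "WARNING"

    def effective_severity(ecode, ignore_errors, requested=ETYPE_ERROR):
        """Demote an error to a warning if its code is explicitly ignored."""
        (_, etxt, _) = ecode
        if etxt in ignore_errors:
            return ETYPE_WARNING
        return requested

    cv_enodetime = ("node", "ENODETIME", "illustrative description")
    print(effective_severity(cv_enodetime, frozenset()))               # ERROR
    print(effective_severity(cv_enodetime, frozenset(["ENODETIME"])))  # WARNING
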
@@ -1529,13 +1514,16 @@ class LUClusterVerify(NoHooksLU):
       groups = self.cfg.GetNodeGroupList()
 
       # Verify global configuration
-      jobs.append([opcodes.OpClusterVerifyConfig()])
+      jobs.append([
+        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors)
+        ])
 
       # Always depend on global verification
       depends_fn = lambda: [(-len(jobs), [])]
 
     jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
-                                              depends=depends_fn())]
+                                            ignore_errors=self.op.ignore_errors,
+                                            depends=depends_fn())]
                 for group in groups)
 
     # Fix up all parameters
@@ -1569,7 +1557,7 @@ class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
         utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
         hv_class.CheckParameterSyntax(hv_params)
       except errors.GenericError, err:
-        self._ErrorIf(True, self.ECLUSTERCFG, None, msg % str(err))
+        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
 
   def ExpandNames(self):
     # Information can be safely retrieved as the BGL is acquired in exclusive
@@ -1590,13 +1578,13 @@ class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
     feedback_fn("* Verifying cluster config")
 
     for msg in self.cfg.VerifyConfig():
-      self._ErrorIf(True, self.ECLUSTERCFG, None, msg)
+      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
 
     feedback_fn("* Verifying cluster certificate files")
 
     for cert_filename in constants.ALL_CERT_FILES:
       (errcode, msg) = _VerifyCertificate(cert_filename)
-      self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
+      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)
 
     feedback_fn("* Verifying hypervisor parameters")
 
@@ -1628,11 +1616,13 @@ class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
                                                 ["no instances"])))
         for node in dangling_nodes]
 
-    self._ErrorIf(bool(dangling_nodes), self.ECLUSTERDANGLINGNODES, None,
+    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
+                  None,
                   "the following nodes (and their instances) belong to a non"
                   " existing group: %s", utils.CommaJoin(pretty_dangling))
 
-    self._ErrorIf(bool(no_node_instances), self.ECLUSTERDANGLINGINST, None,
+    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
+                  None,
                   "the following instances have a non-existing primary-node:"
                   " %s", utils.CommaJoin(no_node_instances))
 
@@ -1805,7 +1795,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     # main result, nresult should be a non-empty dict
     test = not nresult or not isinstance(nresult, dict)
-    _ErrorIf(test, self.ENODERPC, node,
+    _ErrorIf(test, constants.CV_ENODERPC, node,
                   "unable to verify node: no data returned")
     if test:
       return False
@@ -1816,13 +1806,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     test = not (remote_version and
                 isinstance(remote_version, (list, tuple)) and
                 len(remote_version) == 2)
-    _ErrorIf(test, self.ENODERPC, node,
+    _ErrorIf(test, constants.CV_ENODERPC, node,
              "connection to node returned invalid data")
     if test:
       return False
 
     test = local_version != remote_version[0]
-    _ErrorIf(test, self.ENODEVERSION, node,
+    _ErrorIf(test, constants.CV_ENODEVERSION, node,
              "incompatible protocol versions: master %s,"
              " node %s", local_version, remote_version[0])
     if test:
@@ -1832,7 +1822,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     # full package version
     self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
-                  self.ENODEVERSION, node,
+                  constants.CV_ENODEVERSION, node,
                   "software version mismatch: master %s, node %s",
                   constants.RELEASE_VERSION, remote_version[1],
                   code=self.ETYPE_WARNING)
@@ -1841,19 +1831,19 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     if ninfo.vm_capable and isinstance(hyp_result, dict):
       for hv_name, hv_result in hyp_result.iteritems():
         test = hv_result is not None
-        _ErrorIf(test, self.ENODEHV, node,
+        _ErrorIf(test, constants.CV_ENODEHV, node,
                  "hypervisor %s verify failure: '%s'", hv_name, hv_result)
 
     hvp_result = nresult.get(constants.NV_HVPARAMS, None)
     if ninfo.vm_capable and isinstance(hvp_result, list):
       for item, hv_name, hv_result in hvp_result:
-        _ErrorIf(True, self.ENODEHV, node,
+        _ErrorIf(True, constants.CV_ENODEHV, node,
                  "hypervisor %s parameter verify failure (source %s): %s",
                  hv_name, item, hv_result)
 
     test = nresult.get(constants.NV_NODESETUP,
                        ["Missing NODESETUP results"])
-    _ErrorIf(test, self.ENODESETUP, node, "node setup error: %s",
+    _ErrorIf(test, constants.CV_ENODESETUP, node, "node setup error: %s",
              "; ".join(test))
 
     return True
@@ -1876,7 +1866,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     try:
       ntime_merged = utils.MergeTime(ntime)
     except (ValueError, TypeError):
-      _ErrorIf(True, self.ENODETIME, node, "Node returned invalid time")
+      _ErrorIf(True, constants.CV_ENODETIME, node, "Node returned invalid time")
       return
 
     if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
@@ -1886,7 +1876,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     else:
       ntime_diff = None
 
-    _ErrorIf(ntime_diff is not None, self.ENODETIME, node,
+    _ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, node,
              "Node time diverges by at least %s from master node time",
              ntime_diff)
 
@@ -1908,24 +1898,25 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     # checks vg existence and size > 20G
     vglist = nresult.get(constants.NV_VGLIST, None)
     test = not vglist
-    _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
+    _ErrorIf(test, constants.CV_ENODELVM, node, "unable to check volume groups")
     if not test:
       vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                             constants.MIN_VG_SIZE)
-      _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
+      _ErrorIf(vgstatus, constants.CV_ENODELVM, node, vgstatus)
 
     # check pv names
     pvlist = nresult.get(constants.NV_PVLIST, None)
     test = pvlist is None
-    _ErrorIf(test, self.ENODELVM, node, "Can't get PV list from node")
+    _ErrorIf(test, constants.CV_ENODELVM, node, "Can't get PV list from node")
     if not test:
       # check that ':' is not present in PV names, since it's a
       # special character for lvcreate (denotes the range of PEs to
       # use on the PV)
       for _, pvname, owner_vg in pvlist:
         test = ":" in pvname
-        _ErrorIf(test, self.ENODELVM, node, "Invalid character ':' in PV"
-                 " '%s' of VG '%s'", pvname, owner_vg)
+        _ErrorIf(test, constants.CV_ENODELVM, node,
+                 "Invalid character ':' in PV '%s' of VG '%s'",
+                 pvname, owner_vg)
 
   def _VerifyNodeBridges(self, ninfo, nresult, bridges):
     """Check the node bridges.
@@ -1944,11 +1935,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     missing = nresult.get(constants.NV_BRIDGES, None)
     test = not isinstance(missing, list)
-    _ErrorIf(test, self.ENODENET, node,
+    _ErrorIf(test, constants.CV_ENODENET, node,
              "did not return valid bridge information")
     if not test:
-      _ErrorIf(bool(missing), self.ENODENET, node, "missing bridges: %s" %
-               utils.CommaJoin(sorted(missing)))
+      _ErrorIf(bool(missing), constants.CV_ENODENET, node,
+               "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
 
   def _VerifyNodeNetwork(self, ninfo, nresult):
     """Check the node network connectivity results.
@@ -1962,27 +1953,27 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     _ErrorIf = self._ErrorIf # pylint: disable=C0103
 
     test = constants.NV_NODELIST not in nresult
-    _ErrorIf(test, self.ENODESSH, node,
+    _ErrorIf(test, constants.CV_ENODESSH, node,
              "node hasn't returned node ssh connectivity data")
     if not test:
       if nresult[constants.NV_NODELIST]:
         for a_node, a_msg in nresult[constants.NV_NODELIST].items():
-          _ErrorIf(True, self.ENODESSH, node,
+          _ErrorIf(True, constants.CV_ENODESSH, node,
                    "ssh communication with node '%s': %s", a_node, a_msg)
 
     test = constants.NV_NODENETTEST not in nresult
-    _ErrorIf(test, self.ENODENET, node,
+    _ErrorIf(test, constants.CV_ENODENET, node,
              "node hasn't returned node tcp connectivity data")
     if not test:
       if nresult[constants.NV_NODENETTEST]:
         nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
         for anode in nlist:
-          _ErrorIf(True, self.ENODENET, node,
+          _ErrorIf(True, constants.CV_ENODENET, node,
                    "tcp communication with node '%s': %s",
                    anode, nresult[constants.NV_NODENETTEST][anode])
 
     test = constants.NV_MASTERIP not in nresult
-    _ErrorIf(test, self.ENODENET, node,
+    _ErrorIf(test, constants.CV_ENODENET, node,
              "node hasn't returned node master IP reachability data")
     if not test:
       if not nresult[constants.NV_MASTERIP]:
@@ -1990,7 +1981,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
           msg = "the master node cannot reach the master IP (not configured?)"
         else:
           msg = "cannot reach the master IP"
-        _ErrorIf(True, self.ENODENET, node, msg)
+        _ErrorIf(True, constants.CV_ENODENET, node, msg)
 
   def _VerifyInstance(self, instance, instanceconfig, node_image,
                       diskstatus):
@@ -2013,13 +2004,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
         continue
       for volume in node_vol_should[node]:
         test = volume not in n_img.volumes
-        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
+        _ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance,
                  "volume %s missing on node %s", volume, node)
 
     if instanceconfig.admin_up:
       pri_img = node_image[node_current]
       test = instance not in pri_img.instances and not pri_img.offline
-      _ErrorIf(test, self.EINSTANCEDOWN, instance,
+      _ErrorIf(test, constants.CV_EINSTANCEDOWN, instance,
                "instance not running on its primary node %s",
                node_current)
 
@@ -2033,12 +2024,12 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       snode = node_image[nname]
       bad_snode = snode.ghost or snode.offline
       _ErrorIf(instanceconfig.admin_up and not success and not bad_snode,
-               self.EINSTANCEFAULTYDISK, instance,
+               constants.CV_EINSTANCEFAULTYDISK, instance,
                "couldn't retrieve status for disk/%s on %s: %s",
                idx, nname, bdev_status)
       _ErrorIf((instanceconfig.admin_up and success and
                 bdev_status.ldisk_status == constants.LDS_FAULTY),
-               self.EINSTANCEFAULTYDISK, instance,
+               constants.CV_EINSTANCEFAULTYDISK, instance,
                "disk/%s on %s is faulty", idx, nname)
 
   def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
@@ -2059,7 +2050,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
         test = ((node not in node_vol_should or
                 volume not in node_vol_should[node]) and
                 not reserved.Matches(volume))
-        self._ErrorIf(test, self.ENODEORPHANLV, node,
+        self._ErrorIf(test, constants.CV_ENODEORPHANLV, node,
                       "volume %s is unknown", volume)
 
   def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
@@ -2092,14 +2083,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
           if bep[constants.BE_AUTO_BALANCE]:
             needed_mem += bep[constants.BE_MEMORY]
         test = n_img.mfree < needed_mem
-        self._ErrorIf(test, self.ENODEN1, node,
+        self._ErrorIf(test, constants.CV_ENODEN1, node,
                       "not enough memory to accomodate instance failovers"
                       " should node %s fail (%dMiB needed, %dMiB available)",
                       prinode, needed_mem, n_img.mfree)
 
   @classmethod
   def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
-                   (files_all, files_all_opt, files_mc, files_vm)):
+                   (files_all, files_opt, files_mc, files_vm)):
     """Verifies file checksums collected from all nodes.
 
     @param errorif: Callback for reporting errors
@@ -2108,26 +2099,33 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     @param all_nvinfo: RPC results
 
     """
-    node_names = frozenset(node.name for node in nodeinfo if not node.offline)
+    # Define functions determining which nodes to consider for a file
+    files2nodefn = [
+      (files_all, None),
+      (files_mc, lambda node: (node.master_candidate or
+                               node.name == master_node)),
+      (files_vm, lambda node: node.vm_capable),
+      ]
 
-    assert master_node in node_names
-    assert (len(files_all | files_all_opt | files_mc | files_vm) ==
-            sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
-           "Found file listed in more than one file list"
+    # Build mapping from filename to list of nodes which should have the file
+    nodefiles = {}
+    for (files, fn) in files2nodefn:
+      if fn is None:
+        filenodes = nodeinfo
+      else:
+        filenodes = filter(fn, nodeinfo)
+      nodefiles.update((filename,
+                        frozenset(map(operator.attrgetter("name"), filenodes)))
+                       for filename in files)
 
-    # Define functions determining which nodes to consider for a file
-    file2nodefn = dict([(filename, fn)
-      for (files, fn) in [(files_all, None),
-                          (files_all_opt, None),
-                          (files_mc, lambda node: (node.master_candidate or
-                                                   node.name == master_node)),
-                          (files_vm, lambda node: node.vm_capable)]
-      for filename in files])
+    assert set(nodefiles) == (files_all | files_mc | files_vm)
 
-    fileinfo = dict((filename, {}) for filename in file2nodefn.keys())
+    fileinfo = dict((filename, {}) for filename in nodefiles)
+    ignore_nodes = set()
 
     for node in nodeinfo:
       if node.offline:
+        ignore_nodes.add(node.name)
         continue
 
       nresult = all_nvinfo[node.name]
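
The nodefiles map built above replaces the old per-file callback dict: it is computed once and maps each distributed file to the set of node names expected to hold it. A self-contained sketch of that step with fabricated node objects (the paths are only examples):

    import operator
    from collections import namedtuple

    # Stand-in for objects.Node; only the attributes used by the mapping.
    Node = namedtuple("Node", ["name", "master_candidate", "vm_capable"])

    nodeinfo = [Node("node1", True, True), Node("node2", False, True)]
    master_node = "node1"

    files_all = frozenset(["/etc/hosts"])
    files_mc = frozenset(["/var/lib/ganeti/rapi/users"])
    files_vm = frozenset(["/etc/xen/xend-config.sxp"])

    files2nodefn = [
        (files_all, None),
        (files_mc, lambda node: (node.master_candidate or
                                 node.name == master_node)),
        (files_vm, lambda node: node.vm_capable),
        ]

    nodefiles = {}
    for (files, fn) in files2nodefn:
        if fn is None:
            filenodes = nodeinfo
        else:
            filenodes = [node for node in nodeinfo if fn(node)]
        nodefiles.update((filename,
                          frozenset(map(operator.attrgetter("name"), filenodes)))
                         for filename in files)

    # e.g. nodefiles["/etc/hosts"] == frozenset(["node1", "node2"]),
    # while nodefiles["/var/lib/ganeti/rapi/users"] == frozenset(["node1"])
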
@@ -2138,16 +2136,16 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
         node_files = nresult.payload.get(constants.NV_FILELIST, None)
 
       test = not (node_files and isinstance(node_files, dict))
-      errorif(test, cls.ENODEFILECHECK, node.name,
+      errorif(test, constants.CV_ENODEFILECHECK, node.name,
               "Node did not return file checksum data")
       if test:
+        ignore_nodes.add(node.name)
         continue
 
+      # Build per-checksum mapping from filename to nodes having it
       for (filename, checksum) in node_files.items():
-        # Check if the file should be considered for a node
-        fn = file2nodefn[filename]
-        if fn is None or fn(node):
-          fileinfo[filename].setdefault(checksum, set()).add(node.name)
+        assert filename in nodefiles
+        fileinfo[filename].setdefault(checksum, set()).add(node.name)
 
     for (filename, checksums) in fileinfo.items():
       assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
@@ -2155,23 +2153,32 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # Nodes having the file
       with_file = frozenset(node_name
                             for nodes in fileinfo[filename].values()
-                            for node_name in nodes)
+                            for node_name in nodes) - ignore_nodes
+
+      expected_nodes = nodefiles[filename] - ignore_nodes
 
       # Nodes missing file
-      missing_file = node_names - with_file
+      missing_file = expected_nodes - with_file
 
-      if filename in files_all_opt:
+      if filename in files_opt:
         # All or no nodes
-        errorif(missing_file and missing_file != node_names,
-                cls.ECLUSTERFILECHECK, None,
+        errorif(missing_file and missing_file != expected_nodes,
+                constants.CV_ECLUSTERFILECHECK, None,
                 "File %s is optional, but it must exist on all or no"
                 " nodes (not found on %s)",
                 filename, utils.CommaJoin(utils.NiceSort(missing_file)))
       else:
-        errorif(missing_file, cls.ECLUSTERFILECHECK, None,
+        errorif(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                 "File %s is missing from node(s) %s", filename,
                 utils.CommaJoin(utils.NiceSort(missing_file)))
 
+        # Warn if a node has a file it shouldn't
+        unexpected = with_file - expected_nodes
+        errorif(unexpected,
+                constants.CV_ECLUSTERFILECHECK, None,
+                "File %s should not exist on node(s) %s",
+                filename, utils.CommaJoin(utils.NiceSort(unexpected)))
+
       # See if there are multiple versions of the file
       test = len(checksums) > 1
       if test:
@@ -2182,7 +2189,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       else:
         variants = []
 
-      errorif(test, cls.ECLUSTERFILECHECK, None,
+      errorif(test, constants.CV_ECLUSTERFILECHECK, None,
               "File %s found with %s different checksums (%s)",
               filename, len(checksums), "; ".join(variants))
 
@@ -2205,22 +2212,22 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     if drbd_helper:
       helper_result = nresult.get(constants.NV_DRBDHELPER, None)
       test = (helper_result == None)
-      _ErrorIf(test, self.ENODEDRBDHELPER, node,
+      _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
                "no drbd usermode helper returned")
       if helper_result:
         status, payload = helper_result
         test = not status
-        _ErrorIf(test, self.ENODEDRBDHELPER, node,
+        _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
                  "drbd usermode helper check unsuccessful: %s", payload)
         test = status and (payload != drbd_helper)
-        _ErrorIf(test, self.ENODEDRBDHELPER, node,
+        _ErrorIf(test, constants.CV_ENODEDRBDHELPER, node,
                  "wrong drbd usermode helper: %s", payload)
 
     # compute the DRBD minors
     node_drbd = {}
     for minor, instance in drbd_map[node].items():
       test = instance not in instanceinfo
-      _ErrorIf(test, self.ECLUSTERCFG, None,
+      _ErrorIf(test, constants.CV_ECLUSTERCFG, None,
                "ghost instance '%s' in temporary DRBD map", instance)
         # ghost instance should not be running, but otherwise we
         # don't give double warnings (both ghost instance and
@@ -2234,7 +2241,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     # and now check them
     used_minors = nresult.get(constants.NV_DRBDLIST, [])
     test = not isinstance(used_minors, (tuple, list))
-    _ErrorIf(test, self.ENODEDRBD, node,
+    _ErrorIf(test, constants.CV_ENODEDRBD, node,
              "cannot parse drbd status file: %s", str(used_minors))
     if test:
       # we cannot check drbd status
@@ -2242,11 +2249,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     for minor, (iname, must_exist) in node_drbd.items():
       test = minor not in used_minors and must_exist
-      _ErrorIf(test, self.ENODEDRBD, node,
+      _ErrorIf(test, constants.CV_ENODEDRBD, node,
                "drbd minor %d of instance %s is not active", minor, iname)
     for minor in used_minors:
       test = minor not in node_drbd
-      _ErrorIf(test, self.ENODEDRBD, node,
+      _ErrorIf(test, constants.CV_ENODEDRBD, node,
                "unallocated drbd minor %d is in use", minor)
 
   def _UpdateNodeOS(self, ninfo, nresult, nimg):
@@ -2266,7 +2273,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
             not compat.all(isinstance(v, list) and len(v) == 7
                            for v in remote_os))
 
-    _ErrorIf(test, self.ENODEOS, node,
+    _ErrorIf(test, constants.CV_ENODEOS, node,
              "node hasn't returned valid OS data")
 
     nimg.os_fail = test
@@ -2308,14 +2315,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     for os_name, os_data in nimg.oslist.items():
       assert os_data, "Empty OS status for OS %s?!" % os_name
       f_path, f_status, f_diag, f_var, f_param, f_api = os_data[0]
-      _ErrorIf(not f_status, self.ENODEOS, node,
+      _ErrorIf(not f_status, constants.CV_ENODEOS, node,
                "Invalid OS %s (located at %s): %s", os_name, f_path, f_diag)
-      _ErrorIf(len(os_data) > 1, self.ENODEOS, node,
+      _ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, node,
                "OS '%s' has multiple entries (first one shadows the rest): %s",
                os_name, utils.CommaJoin([v[0] for v in os_data]))
       # comparisons with the 'base' image
       test = os_name not in base.oslist
-      _ErrorIf(test, self.ENODEOS, node,
+      _ErrorIf(test, constants.CV_ENODEOS, node,
                "Extra OS %s not present on reference node (%s)",
                os_name, base.name)
       if test:
@@ -2329,14 +2336,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
                          ("variants list", f_var, b_var),
                          ("parameters", beautify_params(f_param),
                           beautify_params(b_param))]:
-        _ErrorIf(a != b, self.ENODEOS, node,
+        _ErrorIf(a != b, constants.CV_ENODEOS, node,
                  "OS %s for %s differs from reference node %s: [%s] vs. [%s]",
                  kind, os_name, base.name,
                  utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
 
     # check any missing OSes
     missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
-    _ErrorIf(missing, self.ENODEOS, node,
+    _ErrorIf(missing, constants.CV_ENODEOS, node,
              "OSes present on reference node %s but missing on this node: %s",
              base.name, utils.CommaJoin(missing))
 
@@ -2354,7 +2361,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     if ((ninfo.master_candidate or ninfo.master_capable) and
         constants.NV_OOB_PATHS in nresult):
       for path_result in nresult[constants.NV_OOB_PATHS]:
-        self._ErrorIf(path_result, self.ENODEOOBPATH, node, path_result)
+        self._ErrorIf(path_result, constants.CV_ENODEOOBPATH, node, path_result)
 
   def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
     """Verifies and updates the node volume data.
@@ -2377,10 +2384,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     if vg_name is None:
       pass
     elif isinstance(lvdata, basestring):
-      _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
+      _ErrorIf(True, constants.CV_ENODELVM, node, "LVM problem on node: %s",
                utils.SafeEncode(lvdata))
     elif not isinstance(lvdata, dict):
-      _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed (lvlist)")
+      _ErrorIf(True, constants.CV_ENODELVM, node,
+               "rpc call to node failed (lvlist)")
     else:
       nimg.volumes = lvdata
       nimg.lvm_fail = False
@@ -2400,8 +2408,9 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     """
     idata = nresult.get(constants.NV_INSTANCELIST, None)
     test = not isinstance(idata, list)
-    self._ErrorIf(test, self.ENODEHV, ninfo.name, "rpc call to node failed"
-                  " (instancelist): %s", utils.SafeEncode(str(idata)))
+    self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
+                  "rpc call to node failed (instancelist): %s",
+                  utils.SafeEncode(str(idata)))
     if test:
       nimg.hyp_fail = True
     else:
@@ -2423,26 +2432,27 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     # try to read free memory (from the hypervisor)
     hv_info = nresult.get(constants.NV_HVINFO, None)
     test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
-    _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
+    _ErrorIf(test, constants.CV_ENODEHV, node,
+             "rpc call to node failed (hvinfo)")
     if not test:
       try:
         nimg.mfree = int(hv_info["memory_free"])
       except (ValueError, TypeError):
-        _ErrorIf(True, self.ENODERPC, node,
+        _ErrorIf(True, constants.CV_ENODERPC, node,
                  "node returned invalid nodeinfo, check hypervisor")
 
     # FIXME: devise a free space model for file based instances as well
     if vg_name is not None:
       test = (constants.NV_VGLIST not in nresult or
               vg_name not in nresult[constants.NV_VGLIST])
-      _ErrorIf(test, self.ENODELVM, node,
+      _ErrorIf(test, constants.CV_ENODELVM, node,
                "node didn't return data for the volume group '%s'"
                " - it is either missing or broken", vg_name)
       if not test:
         try:
           nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
         except (ValueError, TypeError):
-          _ErrorIf(True, self.ENODERPC, node,
+          _ErrorIf(True, constants.CV_ENODERPC, node,
                    "node returned invalid LVM info, check LVM status")
 
   def _CollectDiskInfo(self, nodelist, node_image, instanceinfo):
@@ -2509,7 +2519,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
         data = len(disks) * [(False, "node offline")]
       else:
         msg = nres.fail_msg
-        _ErrorIf(msg, self.ENODERPC, nname,
+        _ErrorIf(msg, constants.CV_ENODERPC, nname,
                  "while getting disk information: %s", msg)
         if msg:
           # No data from this node
@@ -2542,6 +2552,40 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     return instdisk
 
+  @staticmethod
+  def _SshNodeSelector(group_uuid, all_nodes):
+    """Create endless iterators for all potential SSH check hosts.
+
+    """
+    nodes = [node for node in all_nodes
+             if (node.group != group_uuid and
+                 not node.offline)]
+    keyfunc = operator.attrgetter("group")
+
+    return map(itertools.cycle,
+               [sorted(map(operator.attrgetter("name"), names))
+                for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
+                                                  keyfunc)])
+
+  @classmethod
+  def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
+    """Choose which nodes should talk to which other nodes.
+
+    We will make nodes contact all nodes in their group, and one node from
+    every other group.
+
+    @warning: This algorithm has a known issue if one node group is much
+      smaller than others (e.g. just one node). In such a case all other
+      nodes will talk to the single node.
+
+    """
+    online_nodes = sorted(node.name for node in group_nodes if not node.offline)
+    sel = cls._SshNodeSelector(group_uuid, all_nodes)
+
+    return (online_nodes,
+            dict((name, sorted([i.next() for i in sel]))
+                 for name in online_nodes))
+
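
_SelectSshCheckNodes pairs every online node of the current group with one peer from each other group, cycling through candidates so repeated verifications spread the load (and, as the warning notes, a tiny group can still end up contacted by everyone). A standalone sketch of the selector with fabricated nodes:

    import itertools
    import operator
    from collections import namedtuple

    # Stand-in for objects.Node with just the attributes the selector reads.
    Node = namedtuple("Node", ["name", "group", "offline"])

    all_nodes = [
        Node("a1", "groupA", False), Node("a2", "groupA", False),
        Node("b1", "groupB", False), Node("b2", "groupB", False),
        Node("c1", "groupC", False),
        ]

    def ssh_node_selector(group_uuid, all_nodes):
        """One endless, sorted iterator per foreign node group."""
        nodes = [node for node in all_nodes
                 if node.group != group_uuid and not node.offline]
        keyfunc = operator.attrgetter("group")
        return [itertools.cycle(sorted(map(operator.attrgetter("name"), names)))
                for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
                                                  keyfunc)]

    sel = ssh_node_selector("groupA", all_nodes)
    # The first node of groupA checks one node per foreign group:
    print(sorted(next(it) for it in sel))   # ['b1', 'c1']
    # The next caller advances the cycles and gets ['b2', 'c1'], and so on.
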
   def BuildHooksEnv(self):
     """Build hooks env.
 
@@ -2605,25 +2649,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
 
-    # We will make nodes contact all nodes in their group, and one node from
-    # every other group.
-    # TODO: should it be a *random* node, different every time?
-    online_nodes = [node.name for node in node_data_list if not node.offline]
-    other_group_nodes = {}
-
-    for name in sorted(self.all_node_info):
-      node = self.all_node_info[name]
-      if (node.group not in other_group_nodes
-          and node.group != self.group_uuid
-          and not node.offline):
-        other_group_nodes[node.group] = node.name
-
     node_verify_param = {
       constants.NV_FILELIST:
         utils.UniqueSequence(filename
                              for files in filemap
                              for filename in files),
-      constants.NV_NODELIST: online_nodes + other_group_nodes.values(),
+      constants.NV_NODELIST:
+        self._SelectSshCheckNodes(node_data_list, self.group_uuid,
+                                  self.all_node_info.values()),
       constants.NV_HYPERVISOR: hypervisors,
       constants.NV_HVPARAMS:
         _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
@@ -2785,7 +2818,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
         feedback_fn("* Verifying node %s (%s)" % (node, ntype))
 
       msg = all_nvinfo[node].fail_msg
-      _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
+      _ErrorIf(msg, constants.CV_ENODERPC, node, "while contacting node: %s",
+               msg)
       if msg:
         nimg.rpc_fail = True
         continue
@@ -2820,9 +2854,9 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
         for inst in non_primary_inst:
           test = inst in self.all_inst_info
-          _ErrorIf(test, self.EINSTANCEWRONGNODE, inst,
+          _ErrorIf(test, constants.CV_EINSTANCEWRONGNODE, inst,
                    "instance should not run on node %s", node_i.name)
-          _ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name,
+          _ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
                    "node is running unknown instance %s", inst)
 
     for node, result in extra_lv_nvinfo.items():
@@ -2841,11 +2875,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       pnode = inst_config.primary_node
       pnode_img = node_image[pnode]
       _ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
-               self.ENODERPC, pnode, "instance %s, connection to"
+               constants.CV_ENODERPC, pnode, "instance %s, connection to"
                " primary node failed", instance)
 
       _ErrorIf(inst_config.admin_up and pnode_img.offline,
-               self.EINSTANCEBADNODE, instance,
+               constants.CV_EINSTANCEBADNODE, instance,
                "instance is marked as running and lives on offline node %s",
                inst_config.primary_node)
 
@@ -2857,7 +2891,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       if not inst_config.secondary_nodes:
         i_non_redundant.append(instance)
 
-      _ErrorIf(len(inst_config.secondary_nodes) > 1, self.EINSTANCELAYOUT,
+      _ErrorIf(len(inst_config.secondary_nodes) > 1,
+               constants.CV_EINSTANCELAYOUT,
                instance, "instance has multiple secondary nodes: %s",
                utils.CommaJoin(inst_config.secondary_nodes),
                code=self.ETYPE_WARNING)
@@ -2878,7 +2913,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
                                      key=lambda (_, nodes): pnode in nodes,
                                      reverse=True)]
 
-        self._ErrorIf(len(instance_groups) > 1, self.EINSTANCESPLITGROUPS,
+        self._ErrorIf(len(instance_groups) > 1,
+                      constants.CV_EINSTANCESPLITGROUPS,
                       instance, "instance has primary and secondary nodes in"
                       " different groups: %s", utils.CommaJoin(pretty_list),
                       code=self.ETYPE_WARNING)
@@ -2888,21 +2924,22 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
       for snode in inst_config.secondary_nodes:
         s_img = node_image[snode]
-        _ErrorIf(s_img.rpc_fail and not s_img.offline, self.ENODERPC, snode,
-                 "instance %s, connection to secondary node failed", instance)
+        _ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
+                 snode, "instance %s, connection to secondary node failed",
+                 instance)
 
         if s_img.offline:
           inst_nodes_offline.append(snode)
 
       # warn that the instance lives on offline nodes
-      _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
+      _ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE, instance,
                "instance has offline secondary node(s) %s",
                utils.CommaJoin(inst_nodes_offline))
       # ... or ghost/non-vm_capable nodes
       for node in inst_config.all_nodes:
-        _ErrorIf(node_image[node].ghost, self.EINSTANCEBADNODE, instance,
-                 "instance lives on ghost node %s", node)
-        _ErrorIf(not node_image[node].vm_capable, self.EINSTANCEBADNODE,
+        _ErrorIf(node_image[node].ghost, constants.CV_EINSTANCEBADNODE,
+                 instance, "instance lives on ghost node %s", node)
+        _ErrorIf(not node_image[node].vm_capable, constants.CV_EINSTANCEBADNODE,
                  instance, "instance lives on non-vm_capable node %s", node)
 
     feedback_fn("* Verifying orphan volumes")
@@ -2970,22 +3007,20 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
         res = hooks_results[node_name]
         msg = res.fail_msg
         test = msg and not res.offline
-        self._ErrorIf(test, self.ENODEHOOKS, node_name,
+        self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
                       "Communication failure in hooks execution: %s", msg)
         if res.offline or msg:
-          # No need to investigate payload if node is offline or gave an error.
-          # override manually lu_result here as _ErrorIf only
-          # overrides self.bad
-          lu_result = 1
+          # No need to investigate payload if node is offline or gave
+          # an error.
           continue
         for script, hkr, output in res.payload:
           test = hkr == constants.HKR_FAIL
-          self._ErrorIf(test, self.ENODEHOOKS, node_name,
+          self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
                         "Script %s failed, output:", script)
           if test:
             output = self._HOOKS_INDENT_RE.sub("      ", output)
             feedback_fn("%s" % output)
-            lu_result = 0
+            lu_result = False
 
     return lu_result
 
@@ -3124,7 +3159,7 @@ class LUGroupVerifyDisks(NoHooksLU):
       # any leftover items in nv_dict are missing LVs, let's arrange the data
       # better
       for key, inst in nv_dict.iteritems():
-        res_missing.setdefault(inst, []).append(key)
+        res_missing.setdefault(inst, []).append(list(key))
 
     return (res_nodes, list(res_instances), res_missing)
 
@@ -3293,29 +3328,32 @@ class LUClusterRename(LogicalUnit):
 
     """
     clustername = self.op.name
-    ip = self.ip
+    new_ip = self.ip
 
     # shutdown the master IP
-    master = self.cfg.GetMasterNode()
-    result = self.rpc.call_node_stop_master(master, False)
+    master_params = self.cfg.GetMasterNetworkParameters()
+    result = self.rpc.call_node_deactivate_master_ip(master_params.name,
+                                                     master_params)
     result.Raise("Could not disable the master role")
 
     try:
       cluster = self.cfg.GetClusterInfo()
       cluster.cluster_name = clustername
-      cluster.master_ip = ip
+      cluster.master_ip = new_ip
       self.cfg.Update(cluster, feedback_fn)
 
       # update the known hosts file
       ssh.WriteKnownHostsFile(self.cfg, constants.SSH_KNOWN_HOSTS_FILE)
       node_list = self.cfg.GetOnlineNodeList()
       try:
-        node_list.remove(master)
+        node_list.remove(master_params.name)
       except ValueError:
         pass
       _UploadHelper(self, node_list, constants.SSH_KNOWN_HOSTS_FILE)
     finally:
-      result = self.rpc.call_node_start_master(master, False, False)
+      master_params.ip = new_ip
+      result = self.rpc.call_node_activate_master_ip(master_params.name,
+                                                     master_params)
       msg = result.fail_msg
       if msg:
         self.LogWarning("Could not re-enable the master role on"
@@ -3324,6 +3362,27 @@ class LUClusterRename(LogicalUnit):
     return clustername
 
 
+def _ValidateNetmask(cfg, netmask):
+  """Checks if a netmask is valid.
+
+  @type cfg: L{config.ConfigWriter}
+  @param cfg: The cluster configuration
+  @type netmask: int
+  @param netmask: the netmask to be verified
+  @raise errors.OpPrereqError: if the validation fails
+
+  """
+  ip_family = cfg.GetPrimaryIPFamily()
+  try:
+    ipcls = netutils.IPAddress.GetClassFromIpFamily(ip_family)
+  except errors.ProgrammerError:
+    raise errors.OpPrereqError("Invalid primary ip family: %s." %
+                               ip_family)
+  if not ipcls.ValidateNetmask(netmask):
+    raise errors.OpPrereqError("CIDR netmask (%s) not valid" %
+                                (netmask))
+
+
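
_ValidateNetmask picks the IPv4 or IPv6 address class from the cluster's primary IP family and then checks the CIDR prefix length. A rough usage sketch inside cmdlib's namespace, assuming netutils.IP4Address exposes the address family as elsewhere in Ganeti; FakeConfig is a stand-in for config.ConfigWriter with only GetPrimaryIPFamily faked:

    # Illustrative only; FakeConfig is not the real ConfigWriter.
    class FakeConfig(object):
        def GetPrimaryIPFamily(self):
            return netutils.IP4Address.family

    cfg = FakeConfig()
    _ValidateNetmask(cfg, 24)        # acceptable IPv4 prefix length
    try:
        _ValidateNetmask(cfg, 33)    # out of range for IPv4
    except errors.OpPrereqError, err:
        print("rejected: %s" % err)
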
 class LUClusterSetParams(LogicalUnit):
   """Change the parameters of the cluster.
 
@@ -3345,6 +3404,9 @@ class LUClusterSetParams(LogicalUnit):
     if self.op.remove_uids:
       uidpool.CheckUidPool(self.op.remove_uids)
 
+    if self.op.master_netmask is not None:
+      _ValidateNetmask(self.cfg, self.op.master_netmask)
+
   def ExpandNames(self):
     # FIXME: in the future maybe other cluster params won't require checking on
     # all nodes to be modified.
@@ -3645,21 +3707,38 @@ class LUClusterSetParams(LogicalUnit):
       helper_os("blacklisted_os", self.op.blacklisted_os, "blacklisted")
 
     if self.op.master_netdev:
-      master = self.cfg.GetMasterNode()
+      master_params = self.cfg.GetMasterNetworkParameters()
       feedback_fn("Shutting down master ip on the current netdev (%s)" %
                   self.cluster.master_netdev)
-      result = self.rpc.call_node_stop_master(master, False)
+      result = self.rpc.call_node_deactivate_master_ip(master_params.name,
+                                                       master_params)
       result.Raise("Could not disable the master ip")
       feedback_fn("Changing master_netdev from %s to %s" %
-                  (self.cluster.master_netdev, self.op.master_netdev))
+                  (master_params.netdev, self.op.master_netdev))
       self.cluster.master_netdev = self.op.master_netdev
 
+    if self.op.master_netmask:
+      master_params = self.cfg.GetMasterNetworkParameters()
+      feedback_fn("Changing master IP netmask to %s" % self.op.master_netmask)
+      result = self.rpc.call_node_change_master_netmask(master_params.name,
+                                                        master_params.netmask,
+                                                        self.op.master_netmask,
+                                                        master_params.ip,
+                                                        master_params.netdev)
+      if result.fail_msg:
+        msg = "Could not change the master IP netmask: %s" % result.fail_msg
+        feedback_fn(msg)
+
+      self.cluster.master_netmask = self.op.master_netmask
+
     self.cfg.Update(self.cluster, feedback_fn)
 
     if self.op.master_netdev:
+      master_params = self.cfg.GetMasterNetworkParameters()
       feedback_fn("Starting the master ip on the new master netdev (%s)" %
                   self.op.master_netdev)
-      result = self.rpc.call_node_start_master(master, False, False)
+      result = self.rpc.call_node_activate_master_ip(master_params.name,
+                                                     master_params)
       if result.fail_msg:
         self.LogWarning("Could not re-enable the master ip on"
                         " the master, please restart manually: %s",
@@ -3692,17 +3771,25 @@ def _ComputeAncillaryFiles(cluster, redist):
     constants.SSH_KNOWN_HOSTS_FILE,
     constants.CONFD_HMAC_KEY,
     constants.CLUSTER_DOMAIN_SECRET_FILE,
+    constants.SPICE_CERT_FILE,
+    constants.SPICE_CACERT_FILE,
+    constants.RAPI_USERS_FILE,
     ])
 
   if not redist:
     files_all.update(constants.ALL_CERT_FILES)
     files_all.update(ssconf.SimpleStore().GetFileList())
+  else:
+    # we need to ship at least the RAPI certificate
+    files_all.add(constants.RAPI_CERT_FILE)
 
   if cluster.modify_etc_hosts:
     files_all.add(constants.ETC_HOSTS)
 
-  # Files which must either exist on all nodes or on none
-  files_all_opt = set([
+  # Files which are optional, these must:
+  # - be present in one other category as well
+  # - either exist or not exist on all nodes of that category (mc, vm all)
+  files_opt = set([
     constants.RAPI_USERS_FILE,
     ])
 
@@ -3714,14 +3801,23 @@ def _ComputeAncillaryFiles(cluster, redist):
   # Files which should only be on VM-capable nodes
   files_vm = set(filename
     for hv_name in cluster.enabled_hypervisors
-    for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles())
+    for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[0])
+
+  files_opt |= set(filename
+    for hv_name in cluster.enabled_hypervisors
+    for filename in hypervisor.GetHypervisor(hv_name).GetAncillaryFiles()[1])
 
-  # Filenames must be unique
-  assert (len(files_all | files_all_opt | files_mc | files_vm) ==
-          sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
+  # Filenames in each category must be unique
+  all_files_set = files_all | files_mc | files_vm
+  assert (len(all_files_set) ==
+          sum(map(len, [files_all, files_mc, files_vm]))), \
          "Found file listed in more than one file list"
 
-  return (files_all, files_all_opt, files_mc, files_vm)
+  # Optional files must be present in one other category
+  assert all_files_set.issuperset(files_opt), \
+         "Optional file not in a different required list"
+
+  return (files_all, files_opt, files_mc, files_vm)
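
_ComputeAncillaryFiles now reports optional files as a separate category whose members must also appear in one of the required sets; the assertions above encode exactly that. A compact illustration of the invariants with example paths (mirroring the RAPI users file, which the patch lists both in files_all and in files_opt):

    files_all = set(["/etc/hosts", "/var/lib/ganeti/rapi/users"])
    files_mc = set(["/var/lib/ganeti/mc-only.example"])   # illustrative path
    files_vm = set(["/etc/xen/xend-config.sxp"])          # illustrative path
    files_opt = set(["/var/lib/ganeti/rapi/users"])       # optional, also in files_all

    all_files_set = files_all | files_mc | files_vm

    # Each file may appear in only one required category ...
    assert len(all_files_set) == sum(map(len, [files_all, files_mc, files_vm]))
    # ... and every optional file must belong to one of them.
    assert all_files_set.issuperset(files_opt)
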
 
 
 def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
@@ -3755,7 +3851,7 @@ def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
       nodelist.remove(master_info.name)
 
   # Gather file lists
-  (files_all, files_all_opt, files_mc, files_vm) = \
+  (files_all, _, files_mc, files_vm) = \
     _ComputeAncillaryFiles(cluster, True)
 
   # Never re-distribute configuration file from here
@@ -3765,7 +3861,6 @@ def _RedistributeAncillaryFiles(lu, additional_nodes=None, additional_vm=True):
 
   filemap = [
     (online_nodes, files_all),
-    (online_nodes, files_all_opt),
     (vm_nodes, files_vm),
     ]
 
@@ -3797,6 +3892,31 @@ class LUClusterRedistConf(NoHooksLU):
     _RedistributeAncillaryFiles(self)
 
 
+class LUClusterActivateMasterIp(NoHooksLU):
+  """Activate the master IP on the master node.
+
+  """
+  def Exec(self, feedback_fn):
+    """Activate the master IP.
+
+    """
+    master_params = self.cfg.GetMasterNetworkParameters()
+    self.rpc.call_node_activate_master_ip(master_params.name,
+                                          master_params)
+
+
+class LUClusterDeactivateMasterIp(NoHooksLU):
+  """Deactivate the master IP on the master node.
+
+  """
+  def Exec(self, feedback_fn):
+    """Deactivate the master IP.
+
+    """
+    master_params = self.cfg.GetMasterNetworkParameters()
+    self.rpc.call_node_deactivate_master_ip(master_params.name, master_params)
+
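
The two new LUs above are thin wrappers around the master-IP RPCs and are meant to be driven through matching opcodes. A hedged sketch of client-side usage; the opcode names follow Ganeti's LU-to-opcode naming convention but, like the luxi call itself, are assumptions here rather than part of this patch:

    # Illustrative only: deactivate the master IP, do some work, reactivate it.
    from ganeti import luxi, opcodes

    cl = luxi.Client()
    cl.SubmitJob([opcodes.OpClusterDeactivateMasterIp()])
    # ... perform whatever maintenance required the address to be down ...
    cl.SubmitJob([opcodes.OpClusterActivateMasterIp()])
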
+
 def _WaitForSync(lu, instance, disks=None, oneshot=False):
   """Sleep and poll for an instance's disk to sync.
 
@@ -4316,6 +4436,9 @@ class LUNodeRemove(LogicalUnit):
 
     modify_ssh_setup = self.cfg.GetClusterInfo().modify_ssh_setup
 
+    assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
+      "Not owning BGL"
+
     # Promote nodes to master candidate as needed
     _AdjustCandidatePool(self, exceptions=[node.name])
     self.context.RemoveNode(node.name)
@@ -4727,7 +4850,7 @@ class LUQuery(NoHooksLU):
   def CheckArguments(self):
     qcls = _GetQueryImplementation(self.op.what)
 
-    self.impl = qcls(self.op.filter, self.op.fields, self.op.use_locking)
+    self.impl = qcls(self.op.qfilter, self.op.fields, self.op.use_locking)
 
   def ExpandNames(self):
     self.impl.ExpandNames(self)
@@ -4982,6 +5105,9 @@ class LUNodeAdd(LogicalUnit):
     new_node = self.new_node
     node = new_node.name
 
+    assert locking.BGL in self.owned_locks(locking.LEVEL_CLUSTER), \
+      "Not owning BGL"
+
     # We adding a new node so we assume it's powered
     new_node.powered = True
 
@@ -5036,7 +5162,7 @@ class LUNodeAdd(LogicalUnit):
 
     node_verify_list = [self.cfg.GetMasterNode()]
     node_verify_param = {
-      constants.NV_NODELIST: [node],
+      constants.NV_NODELIST: ([node], {}),
       # TODO: do a node-net-test as well?
     }
 
@@ -5120,6 +5246,13 @@ class LUNodeSetParams(LogicalUnit):
     self.lock_all = self.op.auto_promote and self.might_demote
     self.lock_instances = self.op.secondary_ip is not None
 
+  def _InstanceFilter(self, instance):
+    """Filter for getting affected instances.
+
+    """
+    return (instance.disk_template in constants.DTS_INT_MIRROR and
+            self.op.node_name in instance.all_nodes)
+
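
Locking every instance is replaced by a config filter: only instances with an internally mirrored disk template that actually use the node are locked. A standalone sketch of the predicate with fabricated instances (the namedtuple and the disk-template set are illustrative stand-ins):

    from collections import namedtuple

    Instance = namedtuple("Instance", ["name", "disk_template", "all_nodes"])

    DTS_INT_MIRROR = frozenset(["drbd8"])   # illustrative subset of the real set
    node_name = "node3"

    def instance_filter(instance):
        """Mirrors LUNodeSetParams._InstanceFilter for one node."""
        return (instance.disk_template in DTS_INT_MIRROR and
                node_name in instance.all_nodes)

    instances = [
        Instance("web", "drbd8", ("node3", "node4")),
        Instance("db", "plain", ("node3",)),
        Instance("cache", "drbd8", ("node1", "node2")),
        ]
    print([inst.name for inst in instances if instance_filter(inst)])  # ['web']
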
   def ExpandNames(self):
     if self.lock_all:
       self.needed_locks = {locking.LEVEL_NODE: locking.ALL_SET}
@@ -5127,28 +5260,8 @@ class LUNodeSetParams(LogicalUnit):
       self.needed_locks = {locking.LEVEL_NODE: self.op.node_name}
 
     if self.lock_instances:
-      self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
-
-  def DeclareLocks(self, level):
-    # If we have locked all instances, before waiting to lock nodes, release
-    # all the ones living on nodes unrelated to the current operation.
-    if level == locking.LEVEL_NODE and self.lock_instances:
-      self.affected_instances = []
-      if self.needed_locks[locking.LEVEL_NODE] is not locking.ALL_SET:
-        instances_keep = []
-
-        # Build list of instances to release
-        locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
-        for instance_name, instance in self.cfg.GetMultiInstanceInfo(locked_i):
-          if (instance.disk_template in constants.DTS_INT_MIRROR and
-              self.op.node_name in instance.all_nodes):
-            instances_keep.append(instance_name)
-            self.affected_instances.append(instance)
-
-        _ReleaseLocks(self, locking.LEVEL_INSTANCE, keep=instances_keep)
-
-        assert (set(self.owned_locks(locking.LEVEL_INSTANCE)) ==
-                set(instances_keep))
+      self.needed_locks[locking.LEVEL_INSTANCE] = \
+        frozenset(self.cfg.GetInstancesInfoByFilter(self._InstanceFilter))
 
   def BuildHooksEnv(self):
     """Build hooks env.
@@ -5180,6 +5293,25 @@ class LUNodeSetParams(LogicalUnit):
     """
     node = self.node = self.cfg.GetNodeInfo(self.op.node_name)
 
+    if self.lock_instances:
+      affected_instances = \
+        self.cfg.GetInstancesInfoByFilter(self._InstanceFilter)
+
+      # Verify instance locks
+      owned_instances = self.owned_locks(locking.LEVEL_INSTANCE)
+      wanted_instances = frozenset(affected_instances.keys())
+      if wanted_instances - owned_instances:
+        raise errors.OpPrereqError("Instances affected by changing node %s's"
+                                   " secondary IP address have changed since"
+                                   " locks were acquired, wanted '%s', have"
+                                   " '%s'; retry the operation" %
+                                   (self.op.node_name,
+                                    utils.CommaJoin(wanted_instances),
+                                    utils.CommaJoin(owned_instances)),
+                                   errors.ECODE_STATE)
+    else:
+      affected_instances = None
+
     if (self.op.master_candidate is not None or
         self.op.drained is not None or
         self.op.offline is not None):
@@ -5269,7 +5401,9 @@ class LUNodeSetParams(LogicalUnit):
 
     if old_role == self._ROLE_OFFLINE and new_role != old_role:
       # Trying to transition out of offline status
-      result = self.rpc.call_version([node.name])[node.name]
+      # TODO: Use standard RPC runner, but make sure it works when the node is
+      # still marked offline
+      result = rpc.BootstrapRunner().call_version([node.name])[node.name]
       if result.fail_msg:
         raise errors.OpPrereqError("Node %s is being de-offlined but fails"
                                    " to report its version: %s" %
@@ -5288,15 +5422,19 @@ class LUNodeSetParams(LogicalUnit):
         raise errors.OpPrereqError("Cannot change the secondary ip on a single"
                                    " homed cluster", errors.ECODE_INVAL)
 
+      assert not (frozenset(affected_instances) -
+                  self.owned_locks(locking.LEVEL_INSTANCE))
+
       if node.offline:
-        if self.affected_instances:
-          raise errors.OpPrereqError("Cannot change secondary ip: offline"
-                                     " node has instances (%s) configured"
-                                     " to use it" % self.affected_instances)
+        if affected_instances:
+          raise errors.OpPrereqError("Cannot change secondary IP address:"
+                                     " offline node has instances (%s)"
+                                     " configured to use it" %
+                                     utils.CommaJoin(affected_instances.keys()))
       else:
         # On online nodes, check that no instances are running, and that
         # the node has the new ip and we can reach it.
-        for instance in self.affected_instances:
+        for instance in affected_instances.values():
           _CheckInstanceDown(self, instance, "cannot change secondary ip")
 
         _CheckNodeHasSecondaryIP(self, node.name, self.op.secondary_ip, True)
@@ -5448,6 +5586,8 @@ class LUClusterQuery(NoHooksLU):
       "ndparams": cluster.ndparams,
       "candidate_pool_size": cluster.candidate_pool_size,
       "master_netdev": cluster.master_netdev,
+      "master_netmask": cluster.master_netmask,
+      "use_external_mip_script": cluster.use_external_mip_script,
       "volume_group_name": cluster.volume_group_name,
       "drbd_usermode_helper": cluster.drbd_usermode_helper,
       "file_storage_dir": cluster.file_storage_dir,
@@ -5838,6 +5978,40 @@ def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
                                  errors.ECODE_NORES)
 
 
+def _CheckNodesPhysicalCPUs(lu, nodenames, requested, hypervisor_name):
+  """Checks if nodes have enough physical CPUs
+
+  This function checks if all given nodes have the needed number of
+  physical CPUs. In case any node has less CPUs or we cannot get the
+  information from the node, this function raises an OpPrereqError
+  exception.
+
+  @type lu: C{LogicalUnit}
+  @param lu: a logical unit from which we get configuration data
+  @type nodenames: C{list}
+  @param nodenames: the list of node names to check
+  @type requested: C{int}
+  @param requested: the minimum acceptable number of physical CPUs
+  @raise errors.OpPrereqError: if a node doesn't have enough CPUs,
+      or if we cannot check the node
+
+  """
+  nodeinfo = lu.rpc.call_node_info(nodenames, None, hypervisor_name)
+  for node in nodenames:
+    info = nodeinfo[node]
+    info.Raise("Cannot get current information from node %s" % node,
+               prereq=True, ecode=errors.ECODE_ENVIRON)
+    num_cpus = info.payload.get("cpu_total", None)
+    if not isinstance(num_cpus, int):
+      raise errors.OpPrereqError("Can't compute the number of physical CPUs"
+                                 " on node %s, result was '%s'" %
+                                 (node, num_cpus), errors.ECODE_ENVIRON)
+    if requested > num_cpus:
+      raise errors.OpPrereqError("Node %s has %s physical CPUs, but %s are "
+                                 "required" % (node, num_cpus, requested),
+                                 errors.ECODE_NORES)
+
+
 class LUInstanceStartup(LogicalUnit):
   """Starts an instance.
 
@@ -5940,9 +6114,11 @@ class LUInstanceStartup(LogicalUnit):
 
       _StartInstanceDisks(self, instance, force)
 
-      result = self.rpc.call_instance_start(node_current, instance,
-                                            self.op.hvparams, self.op.beparams,
-                                            self.op.startup_paused)
+      result = \
+        self.rpc.call_instance_start(node_current,
+                                     (instance, self.op.hvparams,
+                                      self.op.beparams),
+                                     self.op.startup_paused)
       msg = result.fail_msg
       if msg:
         _ShutdownInstanceDisks(self, instance)
@@ -6032,8 +6208,8 @@ class LUInstanceReboot(LogicalUnit):
         self.LogInfo("Instance %s was already stopped, starting now",
                      instance.name)
       _StartInstanceDisks(self, instance, ignore_secondaries)
-      result = self.rpc.call_instance_start(node_current, instance,
-                                            None, None, False)
+      result = self.rpc.call_instance_start(node_current,
+                                            (instance, None, None), False)
       msg = result.fail_msg
       if msg:
         _ShutdownInstanceDisks(self, instance)
@@ -6194,9 +6370,9 @@ class LUInstanceReinstall(LogicalUnit):
     try:
       feedback_fn("Running the instance OS create scripts...")
       # FIXME: pass debug option from opcode to backend
-      result = self.rpc.call_instance_os_add(inst.primary_node, inst, True,
-                                             self.op.debug_level,
-                                             osparams=self.os_inst)
+      result = self.rpc.call_instance_os_add(inst.primary_node,
+                                             (inst, self.os_inst), True,
+                                             self.op.debug_level)
       result.Raise("Could not install OS for instance %s on node %s" %
                    (inst.name, inst.primary_node))
     finally:
@@ -6897,8 +7073,8 @@ class LUInstanceMove(LogicalUnit):
         _ShutdownInstanceDisks(self, instance)
         raise errors.OpExecError("Can't activate the instance's disks")
 
-      result = self.rpc.call_instance_start(target_node, instance,
-                                            None, None, False)
+      result = self.rpc.call_instance_start(target_node,
+                                            (instance, None, None), False)
       msg = result.fail_msg
       if msg:
         _ShutdownInstanceDisks(self, instance)
@@ -6991,6 +7167,11 @@ class TLMigrateInstance(Tasklet):
   @ivar shutdown_timeout: In case of failover timeout of the shutdown
 
   """
+
+  # Constants
+  _MIGRATION_POLL_INTERVAL = 1      # seconds
+  _MIGRATION_FEEDBACK_INTERVAL = 10 # seconds
+
   def __init__(self, lu, instance_name, cleanup=False,
                failover=False, fallback=False,
                ignore_consistency=False,
@@ -7314,12 +7495,13 @@ class TLMigrateInstance(Tasklet):
     """
     instance = self.instance
     target_node = self.target_node
+    source_node = self.source_node
     migration_info = self.migration_info
 
-    abort_result = self.rpc.call_finalize_migration(target_node,
-                                                    instance,
-                                                    migration_info,
-                                                    False)
+    abort_result = self.rpc.call_instance_finalize_migration_dst(target_node,
+                                                                 instance,
+                                                                 migration_info,
+                                                                 False)
     abort_msg = abort_result.fail_msg
     if abort_msg:
       logging.error("Aborting migration failed on target node %s: %s",
@@ -7327,6 +7509,13 @@ class TLMigrateInstance(Tasklet):
       # Don't raise an exception here, as we still have to try to revert the
       # disk status, even if this step failed.
 
+    abort_result = self.rpc.call_instance_finalize_migration_src(source_node,
+                                                                 instance,
+                                                                 False,
+                                                                 self.live)
+    abort_msg = abort_result.fail_msg
+    if abort_msg:
+      logging.error("Aborting migration failed on source node %s: %s",
+                    source_node, abort_msg)
+
   def _ExecMigration(self):
     """Migrate an instance.
 
@@ -7343,6 +7532,21 @@ class TLMigrateInstance(Tasklet):
     target_node = self.target_node
     source_node = self.source_node
 
+    # Check for hypervisor version mismatch and warn the user.
+    nodeinfo = self.rpc.call_node_info([source_node, target_node],
+                                       None, self.instance.hypervisor)
+    src_info = nodeinfo[source_node]
+    dst_info = nodeinfo[target_node]
+
+    if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
+        (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
+      src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
+      dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
+      if src_version != dst_version:
+        self.feedback_fn("* warning: hypervisor version mismatch between"
+                         " source (%s) and target (%s) node" %
+                         (src_version, dst_version))
+
     self.feedback_fn("* checking disk consistency between source and target")
     for dev in instance.disks:
       if not _CheckDiskConsistency(self.lu, dev, target_node, False):
@@ -7398,18 +7602,59 @@ class TLMigrateInstance(Tasklet):
       raise errors.OpExecError("Could not migrate instance %s: %s" %
                                (instance.name, msg))
 
+    self.feedback_fn("* starting memory transfer")
+    last_feedback = time.time()
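+    # Poll the source node until the hypervisor reports that the memory
+    # transfer is no longer active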
+    while True:
+      result = self.rpc.call_instance_get_migration_status(source_node,
+                                                           instance)
+      msg = result.fail_msg
+      ms = result.payload   # MigrationStatus instance
+      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
+        logging.error("Instance migration failed, trying to revert"
+                      " disk status: %s", msg)
+        self.feedback_fn("Migration failed, aborting")
+        self._AbortMigration()
+        self._RevertDiskStatus()
+        raise errors.OpExecError("Could not migrate instance %s: %s" %
+                                 (instance.name, msg))
+
+      if ms.status != constants.HV_MIGRATION_ACTIVE:
+        self.feedback_fn("* memory transfer complete")
+        break
+
+      if (utils.TimeoutExpired(last_feedback,
+                               self._MIGRATION_FEEDBACK_INTERVAL) and
+          ms.transferred_ram is not None):
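+        # e.g. 512 MB transferred out of 2048 MB total is reported as 25.00 %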
+        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
+        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
+        last_feedback = time.time()
+
+      time.sleep(self._MIGRATION_POLL_INTERVAL)
+
+    result = self.rpc.call_instance_finalize_migration_src(source_node,
+                                                           instance,
+                                                           True,
+                                                           self.live)
+    msg = result.fail_msg
+    if msg:
+      logging.error("Instance migration succeeded, but finalization failed"
+                    " on the source node: %s", msg)
+      raise errors.OpExecError("Could not finalize instance migration: %s" %
+                               msg)
+
     instance.primary_node = target_node
+
     # distribute new instance config to the other nodes
     self.cfg.Update(instance, self.feedback_fn)
 
-    result = self.rpc.call_finalize_migration(target_node,
-                                              instance,
-                                              migration_info,
-                                              True)
+    result = self.rpc.call_instance_finalize_migration_dst(target_node,
+                                                           instance,
+                                                           migration_info,
+                                                           True)
     msg = result.fail_msg
     if msg:
-      logging.error("Instance migration succeeded, but finalization failed:"
-                    " %s", msg)
+      logging.error("Instance migration succeeded, but finalization failed"
+                    " on the target node: %s", msg)
       raise errors.OpExecError("Could not finalize instance migration: %s" %
                                msg)
 
@@ -7492,7 +7737,7 @@ class TLMigrateInstance(Tasklet):
 
       self.feedback_fn("* starting the instance on the target node %s" %
                        target_node)
-      result = self.rpc.call_instance_start(target_node, instance, None, None,
+      result = self.rpc.call_instance_start(target_node, (instance, None, None),
                                             False)
       msg = result.fail_msg
       if msg:
@@ -7624,7 +7869,7 @@ def _GenerateDRBD8Branch(lu, primary, secondary, size, vgnames, names,
   shared_secret = lu.cfg.GenerateDRBDSecret(lu.proc.GetECId())
   dev_data = objects.Disk(dev_type=constants.LD_LV, size=size,
                           logical_id=(vgnames[0], names[0]))
-  dev_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
+  dev_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
                           logical_id=(vgnames[1], names[1]))
   drbd_dev = objects.Disk(dev_type=constants.LD_DRBD8, size=size,
                           logical_id=(primary, secondary, port,
@@ -7944,7 +8189,7 @@ def _ComputeDiskSizePerVG(disk_template, disks):
     constants.DT_DISKLESS: {},
     constants.DT_PLAIN: _compute(disks, 0),
     # 128 MB are added for drbd metadata for each disk
-    constants.DT_DRBD8: _compute(disks, 128),
+    constants.DT_DRBD8: _compute(disks, DRBD_META_SIZE),
     constants.DT_FILE: {},
     constants.DT_SHARED_FILE: {},
   }
@@ -7965,7 +8210,8 @@ def _ComputeDiskSize(disk_template, disks):
     constants.DT_DISKLESS: None,
     constants.DT_PLAIN: sum(d[constants.IDISK_SIZE] for d in disks),
     # 128 MB are added for drbd metadata for each disk
-    constants.DT_DRBD8: sum(d[constants.IDISK_SIZE] + 128 for d in disks),
+    constants.DT_DRBD8:
+      sum(d[constants.IDISK_SIZE] + DRBD_META_SIZE for d in disks),
     constants.DT_FILE: None,
     constants.DT_SHARED_FILE: 0,
     constants.DT_BLOCK: 0,
@@ -8011,9 +8257,11 @@ def _CheckHVParams(lu, nodenames, hvname, hvparams):
 
   """
   nodenames = _FilterVmNodes(lu, nodenames)
-  hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames,
-                                                  hvname,
-                                                  hvparams)
+
+  cluster = lu.cfg.GetClusterInfo()
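+  # Fill the given parameters on top of the cluster defaults for this
+  # hypervisor, so the nodes validate a complete parameter dictionary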
+  hvfull = objects.FillDict(cluster.hvparams.get(hvname, {}), hvparams)
+
+  hvinfo = lu.rpc.call_hypervisor_validate_params(nodenames, hvname, hvfull)
   for node in nodenames:
     info = hvinfo[node]
     if info.offline:
@@ -8039,7 +8287,7 @@ def _CheckOSParams(lu, required, nodenames, osname, osparams):
 
   """
   nodenames = _FilterVmNodes(lu, nodenames)
-  result = lu.rpc.call_os_validate(required, nodenames, osname,
+  result = lu.rpc.call_os_validate(nodenames, required, osname,
                                    [constants.OS_VALIDATE_PARAMETERS],
                                    osparams)
   for node, nres in result.items():
@@ -8403,33 +8651,39 @@ class LUInstanceCreate(LogicalUnit):
       if einfo.has_option(constants.INISECT_INS, "disk_template"):
         self.op.disk_template = einfo.get(constants.INISECT_INS,
                                           "disk_template")
+        if self.op.disk_template not in constants.DISK_TEMPLATES:
+          raise errors.OpPrereqError("Disk template specified in configuration"
+                                     " file is not one of the allowed values:"
+                                     " %s" % " ".join(constants.DISK_TEMPLATES),
+                                     errors.ECODE_INVAL)
       else:
         raise errors.OpPrereqError("No disk template specified and the export"
                                    " is missing the disk_template information",
                                    errors.ECODE_INVAL)
 
     if not self.op.disks:
-      if einfo.has_option(constants.INISECT_INS, "disk_count"):
-        disks = []
-        # TODO: import the disk iv_name too
-        for idx in range(einfo.getint(constants.INISECT_INS, "disk_count")):
+      disks = []
+      # TODO: import the disk iv_name too
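+      # Probe for "disk%d_size" entries instead of relying on a "disk_count"
+      # option in the export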
+      for idx in range(constants.MAX_DISKS):
+        if einfo.has_option(constants.INISECT_INS, "disk%d_size" % idx):
           disk_sz = einfo.getint(constants.INISECT_INS, "disk%d_size" % idx)
           disks.append({constants.IDISK_SIZE: disk_sz})
-        self.op.disks = disks
-      else:
+      self.op.disks = disks
+      if not disks and self.op.disk_template != constants.DT_DISKLESS:
         raise errors.OpPrereqError("No disk info specified and the export"
                                    " is missing the disk information",
                                    errors.ECODE_INVAL)
 
-    if (not self.op.nics and
-        einfo.has_option(constants.INISECT_INS, "nic_count")):
+    if not self.op.nics:
       nics = []
-      for idx in range(einfo.getint(constants.INISECT_INS, "nic_count")):
-        ndict = {}
-        for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
-          v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
-          ndict[name] = v
-        nics.append(ndict)
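+      # As with disks, probe for "nic%d_mac" entries instead of relying on
+      # a "nic_count" option in the export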
+      for idx in range(constants.MAX_NICS):
+        if einfo.has_option(constants.INISECT_INS, "nic%d_mac" % idx):
+          ndict = {}
+          for name in list(constants.NICS_PARAMETERS) + ["ip", "mac"]:
+            v = einfo.get(constants.INISECT_INS, "nic%d_%s" % (idx, name))
+            ndict[name] = v
+          nics.append(ndict)
+        else:
+          break
       self.op.nics = nics
 
     if not self.op.tags and einfo.has_option(constants.INISECT_INS, "tags"):
@@ -8533,7 +8787,8 @@ class LUInstanceCreate(LogicalUnit):
       raise errors.OpPrereqError("Cluster does not support lvm-based"
                                  " instances", errors.ECODE_STATE)
 
-    if self.op.hypervisor is None:
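+    # An unspecified or "auto" hypervisor means: use the cluster default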
+    if (self.op.hypervisor is None or
+        self.op.hypervisor == constants.VALUE_AUTO):
       self.op.hypervisor = self.cfg.GetHypervisorType()
 
     cluster = self.cfg.GetClusterInfo()
@@ -8559,6 +8814,10 @@ class LUInstanceCreate(LogicalUnit):
     _CheckGlobalHvParams(self.op.hvparams)
 
     # fill and remember the beparams dict
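+    # replace "auto" values with the cluster defaults before type checking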
+    default_beparams = cluster.beparams[constants.PP_DEFAULT]
+    for param, value in self.op.beparams.iteritems():
+      if value == constants.VALUE_AUTO:
+        self.op.beparams[param] = default_beparams[param]
     utils.ForceDictType(self.op.beparams, constants.BES_PARAMETER_TYPES)
     self.be_full = cluster.SimpleFillBE(self.op.beparams)
 
@@ -8575,7 +8834,7 @@ class LUInstanceCreate(LogicalUnit):
     for idx, nic in enumerate(self.op.nics):
       nic_mode_req = nic.get(constants.INIC_MODE, None)
       nic_mode = nic_mode_req
-      if nic_mode is None:
+      if nic_mode is None or nic_mode == constants.VALUE_AUTO:
         nic_mode = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_MODE]
 
       # in routed mode, for the first nic, the default ip is 'auto'
@@ -8619,9 +8878,11 @@ class LUInstanceCreate(LogicalUnit):
 
       #  Build nic parameters
       link = nic.get(constants.INIC_LINK, None)
+      if link == constants.VALUE_AUTO:
+        link = cluster.nicparams[constants.PP_DEFAULT][constants.NIC_LINK]
       nicparams = {}
       if nic_mode_req:
-        nicparams[constants.NIC_MODE] = nic_mode_req
+        nicparams[constants.NIC_MODE] = nic_mode
       if link:
         nicparams[constants.NIC_LINK] = link
 
@@ -8658,18 +8919,8 @@ class LUInstanceCreate(LogicalUnit):
       self.disks.append(new_disk)
 
     if self.op.mode == constants.INSTANCE_IMPORT:
-
-      # Check that the new instance doesn't have less disks than the export
-      instance_disks = len(self.disks)
-      export_disks = export_info.getint(constants.INISECT_INS, 'disk_count')
-      if instance_disks < export_disks:
-        raise errors.OpPrereqError("Not enough disks to import."
-                                   " (instance: %d, export: %d)" %
-                                   (instance_disks, export_disks),
-                                   errors.ECODE_INVAL)
-
       disk_images = []
-      for idx in range(export_disks):
+      for idx in range(len(self.disks)):
         option = "disk%d_dump" % idx
         if export_info.has_option(constants.INISECT_INS, option):
           # FIXME: are the old os-es, disk sizes, etc. useful?
@@ -8682,15 +8933,9 @@ class LUInstanceCreate(LogicalUnit):
       self.src_images = disk_images
 
       old_name = export_info.get(constants.INISECT_INS, "name")
-      try:
-        exp_nic_count = export_info.getint(constants.INISECT_INS, "nic_count")
-      except (TypeError, ValueError), err:
-        raise errors.OpPrereqError("Invalid export file, nic_count is not"
-                                   " an integer: %s" % str(err),
-                                   errors.ECODE_STATE)
       if self.op.instance_name == old_name:
         for idx, nic in enumerate(self.nics):
-          if nic.mac == constants.VALUE_AUTO and exp_nic_count >= idx:
+          if nic.mac == constants.VALUE_AUTO:
             nic_mac_ini = "nic%d_mac" % idx
             nic.mac = export_info.get(constants.INISECT_INS, nic_mac_ini)
 
@@ -8968,7 +9213,7 @@ class LUInstanceCreate(LogicalUnit):
           feedback_fn("* running the instance OS create scripts...")
           # FIXME: pass debug option from opcode to backend
           os_add_result = \
-            self.rpc.call_instance_os_add(pnode_name, iobj, False,
+            self.rpc.call_instance_os_add(pnode_name, (iobj, None), False,
                                           self.op.debug_level)
           if pause_sync:
             feedback_fn("* resuming disk sync")
@@ -9048,8 +9293,8 @@ class LUInstanceCreate(LogicalUnit):
       self.cfg.Update(iobj, feedback_fn)
       logging.info("Starting instance %s on node %s", instance, pnode_name)
       feedback_fn("* starting instance...")
-      result = self.rpc.call_instance_start(pnode_name, iobj,
-                                            None, None, False)
+      result = self.rpc.call_instance_start(pnode_name, (iobj, None, None),
+                                            False)
       result.Raise("Could not start instance")
 
     return list(iobj.all_nodes)
@@ -9633,7 +9878,7 @@ class TLReplaceDisks(Tasklet):
       lv_data = objects.Disk(dev_type=constants.LD_LV, size=dev.size,
                              logical_id=(vg_data, names[0]))
       vg_meta = dev.children[1].logical_id[0]
-      lv_meta = objects.Disk(dev_type=constants.LD_LV, size=128,
+      lv_meta = objects.Disk(dev_type=constants.LD_LV, size=DRBD_META_SIZE,
                              logical_id=(vg_meta, names[1]))
 
       new_lvs = [lv_data, lv_meta]
@@ -10786,9 +11031,11 @@ class LUInstanceSetParams(LogicalUnit):
       # local check
       hypervisor.GetHypervisor(hv_type).CheckParameterSyntax(hv_new)
       _CheckHVParams(self, nodelist, instance.hypervisor, hv_new)
-      self.hv_new = hv_new # the new actual values
+      self.hv_proposed = self.hv_new = hv_new # the new actual values
       self.hv_inst = i_hvdict # the new dict (without defaults)
     else:
+      self.hv_proposed = cluster.SimpleFillHV(instance.hypervisor, instance.os,
+                                              instance.hvparams)
       self.hv_new = self.hv_inst = {}
 
     # beparams processing
@@ -10797,12 +11044,40 @@ class LUInstanceSetParams(LogicalUnit):
                                    use_none=True)
       utils.ForceDictType(i_bedict, constants.BES_PARAMETER_TYPES)
       be_new = cluster.SimpleFillBE(i_bedict)
-      self.be_new = be_new # the new actual values
+      self.be_proposed = self.be_new = be_new # the new actual values
       self.be_inst = i_bedict # the new dict (without defaults)
     else:
       self.be_new = self.be_inst = {}
+      self.be_proposed = cluster.SimpleFillBE(instance.beparams)
     be_old = cluster.FillBE(instance)
 
+    # CPU param validation -- checking every time a parameter is
+    # changed to cover all cases where either the CPU mask or vCPUs have
+    # changed
+    if (constants.BE_VCPUS in self.be_proposed and
+        constants.HV_CPU_MASK in self.hv_proposed):
+      cpu_list = \
+        utils.ParseMultiCpuMask(self.hv_proposed[constants.HV_CPU_MASK])
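+      # cpu_list contains one entry per vCPU, or a single entry that is
+      # applied to all vCPUs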
+      # Verify mask is consistent with number of vCPUs. Can skip this
+      # test if only 1 entry in the CPU mask, which means same mask
+      # is applied to all vCPUs.
+      if (len(cpu_list) > 1 and
+          len(cpu_list) != self.be_proposed[constants.BE_VCPUS]):
+        raise errors.OpPrereqError("Number of vCPUs [%d] does not match the"
+                                   " CPU mask [%s]" %
+                                   (self.be_proposed[constants.BE_VCPUS],
+                                    self.hv_proposed[constants.HV_CPU_MASK]),
+                                   errors.ECODE_INVAL)
+
+      # Only perform this test if a new CPU mask is given
+      if constants.HV_CPU_MASK in self.hv_new:
+        # Calculate the largest CPU number requested
+        max_requested_cpu = max(map(max, cpu_list))
+        # Check that all of the instance's nodes have enough physical CPUs to
+        # satisfy the requested CPU mask
+        _CheckNodesPhysicalCPUs(self, instance.all_nodes,
+                                max_requested_cpu + 1, instance.hypervisor)
+
     # osparams processing
     if self.op.osparams:
       i_osdict = _GetUpdatedParams(instance.osparams, self.op.osparams)
@@ -11664,8 +11939,8 @@ class LUBackupExport(LogicalUnit):
             not self.op.remove_instance):
           assert not activate_disks
           feedback_fn("Starting instance %s" % instance.name)
-          result = self.rpc.call_instance_start(src_node, instance,
-                                                None, None, False)
+          result = self.rpc.call_instance_start(src_node,
+                                                (instance, None, None), False)
           msg = result.fail_msg
           if msg:
             feedback_fn("Failed to start instance: %s" % msg)
@@ -12797,9 +13072,9 @@ class IAllocator(object):
   # pylint: disable=R0902
   # lots of instance attributes
 
-  def __init__(self, cfg, rpc, mode, **kwargs):
+  def __init__(self, cfg, rpc_runner, mode, **kwargs):
     self.cfg = cfg
-    self.rpc = rpc
+    self.rpc = rpc_runner
     # init buffer variables
     self.in_text = self.out_text = self.in_data = self.out_data = None
     # init all input fields so that pylint is happy