{-
-Copyright (C) 2009 Google Inc.
+Copyright (C) 2009, 2010 Google Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
module Ganeti.HTools.Cluster
(
-- * Types
- AllocSolution
+ AllocSolution(..)
, Table(..)
, CStats(..)
+ , AllocStats
-- * Generic functions
, totalResources
+ , computeAllocationDelta
-- * First phase functions
, computeBadItems
-- * Second phase functions
- , printSolution
, printSolutionLine
, formatCmds
, involvedNodes
, printInsts
-- * Balacing functions
, checkMove
+ , doNextBalance
, tryBalance
, compCV
+ , compDetailedCV
, printStats
, iMoveToJob
-- * IAllocator functions
, tryAlloc
+ , tryMGAlloc
, tryReloc
+ , tryEvac
, collapseFailures
+ -- * Allocation functions
+ , iterateAlloc
+ , tieredAlloc
+ , tieredSpecMap
+ -- * Node group functions
+ , instanceGroup
+ , findSplitInstances
+ , splitCluster
) where
+import Data.Function (on)
import Data.List
+import Data.Ord (comparing)
import Text.Printf (printf)
-import Data.Function
import Control.Monad
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
+import qualified Ganeti.HTools.Group as Group
import Ganeti.HTools.Types
import Ganeti.HTools.Utils
import qualified Ganeti.OpCodes as OpCodes
-- * Types
-- | Allocation\/relocation solution.
-type AllocSolution = ([FailMode], Int, Maybe (Score, AllocElement))
-
--- | Allocation\/relocation element.
-type AllocElement = (Node.List, Instance.Instance, [Node.Node])
-
+-- | Allocation\/relocation solution; built up incrementally via
+-- 'concatAllocs' starting from 'emptySolution'.
+data AllocSolution = AllocSolution
+ { asFailures :: [FailMode] -- ^ Reasons of the failed allocation
+ -- attempts (one entry per failure)
+ , asAllocs :: Int -- ^ Good allocation count
+ , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
+ -- of the list depends on the
+ -- allocation/relocation mode
+ , asLog :: [String] -- ^ A list of informational messages
+ }
+
+-- | The empty solution we start with when computing allocations;
+-- used as the fold seed in 'tryAlloc'/'tryReloc'.
+emptySolution :: AllocSolution
+emptySolution = AllocSolution { asFailures = [], asAllocs = 0
+ , asSolutions = [], asLog = [] }
-- | The complete state for the balancing solution
data Table = Table Node.List Instance.List Score [Placement]
- deriving (Show)
+ deriving (Show, Read)
data CStats = CStats { csFmem :: Int -- ^ Cluster free mem
, csFdsk :: Int -- ^ Cluster free disk
, csTmem :: Double -- ^ Cluster total mem
, csTdsk :: Double -- ^ Cluster total disk
, csTcpu :: Double -- ^ Cluster total cpus
+ , csVcpu :: Int -- ^ Cluster virtual cpus (if
+ -- node pCpu has been set,
+ -- otherwise -1)
, csXmem :: Int -- ^ Unnacounted for mem
, csNmem :: Int -- ^ Node own memory
, csScore :: Score -- ^ The cluster score
, csNinst :: Int -- ^ The total number of instances
}
+ deriving (Show, Read)
+
+-- | Allocation statistics triple: (resources currently used,
+-- resources that could still be allocated, resources left
+-- unallocable); computed by 'computeAllocationDelta'.
+type AllocStats = (RSpec, RSpec, RSpec)
-- * Utility functions
([Node.Node], [Instance.Instance])
computeBadItems nl il =
let bad_nodes = verifyN1 $ getOnline nl
- bad_instances = map (\idx -> Container.find idx il) .
+ bad_instances = map (`Container.find` il) .
sort . nub $
concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
in
-- | Zero-initializer for the CStats type
emptyCStats :: CStats
-emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-- | Update stats with data from a new node
updateCStats :: CStats -> Node.Node -> CStats
csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
+ csVcpu = x_vcpu,
csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
}
= cs
- Node.xMem node - Node.fMem node
inc_icpu = Node.uCpu node
inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
+ inc_vcpu = Node.hiCpu node
in cs { csFmem = x_fmem + Node.fMem node
, csFdsk = x_fdsk + Node.fDsk node
, csTmem = x_tmem + Node.tMem node
, csTdsk = x_tdsk + Node.tDsk node
, csTcpu = x_tcpu + Node.tCpu node
+ , csVcpu = x_vcpu + inc_vcpu
, csXmem = x_xmem + Node.xMem node
, csNmem = x_nmem + Node.nMem node
, csNinst = x_ninst + length (Node.pList node)
let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
in cs { csScore = compCV nl }
--- | The names of the individual elements in the CV list
-detailedCVNames :: [String]
-detailedCVNames = [ "free_mem_cv"
- , "free_disk_cv"
- , "n1_score"
- , "reserved_mem_cv"
- , "offline_score"
- , "vcpu_ratio_cv"
- , "cpu_load_cv"
- , "mem_load_cv"
- , "disk_load_cv"
- , "net_load_cv"
- ]
+-- | Compute the delta between two cluster state.
+--
+-- This is used when doing allocations, to understand better the
+-- available cluster resources. The return value is a triple of the
+-- current used values, the delta that was still allocated, and what
+-- was left unallocated.
+computeAllocationDelta :: CStats -> CStats -> AllocStats
+computeAllocationDelta cini cfin =
+ let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
+ CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
+ csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
+ -- resources in use before the allocation run
+ rini = RSpec i_icpu i_imem i_idsk
+ -- resources consumed by the run itself (final minus initial)
+ rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
+ -- NOTE(review): csVcpu is -1 when node pCpu was never set, which
+ -- would make un_cpu negative/meaningless -- confirm callers only
+ -- use this when vcpu data is available
+ un_cpu = v_cpu - f_icpu
+ -- what remains unallocable out of the cluster totals
+ runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
+ in (rini, rfin, runa)
+
+-- | The names and weights of the individual elements in the CV list
+detailedCVInfo :: [(Double, String)]
+detailedCVInfo = [ (1, "free_mem_cv")
+ , (1, "free_disk_cv")
+ , (1, "n1_cnt")
+ , (1, "reserved_mem_cv")
+ , (4, "offline_all_cnt")
+ , (16, "offline_pri_cnt")
+ , (1, "vcpu_ratio_cv")
+ , (1, "cpu_load_cv")
+ , (1, "mem_load_cv")
+ , (1, "disk_load_cv")
+ , (1, "net_load_cv")
+ , (2, "pri_tags_score")
+ ]
+
+-- | The weights of the cluster score components, in the same order
+-- as the metrics produced by 'compDetailedCV' (used by 'compCV').
+detailedCVWeights :: [Double]
+detailedCVWeights = map fst detailedCVInfo
-- | Compute the mem and disk covariance.
compDetailedCV :: Node.List -> [Double]
(offline, nodes) = partition Node.offline all_nodes
mem_l = map Node.pMem nodes
dsk_l = map Node.pDsk nodes
- mem_cv = varianceCoeff mem_l
- dsk_cv = varianceCoeff dsk_l
- n1_l = length $ filter Node.failN1 nodes
- n1_score = fromIntegral n1_l /
- fromIntegral (length nodes)::Double
+ -- metric: memory covariance
+ mem_cv = stdDev mem_l
+ -- metric: disk covariance
+ dsk_cv = stdDev dsk_l
+ -- metric: count of instances living on N1 failing nodes
+ n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
+ length (Node.pList n)) .
+ filter Node.failN1 $ nodes :: Double
res_l = map Node.pRem nodes
- res_cv = varianceCoeff res_l
- offline_inst = sum . map (\n -> (length . Node.pList $ n) +
- (length . Node.sList $ n)) $ offline
- online_inst = sum . map (\n -> (length . Node.pList $ n) +
- (length . Node.sList $ n)) $ nodes
- off_score = if offline_inst == 0
- then 0::Double
- else fromIntegral offline_inst /
- fromIntegral (offline_inst + online_inst)::Double
+ -- metric: reserved memory covariance
+ res_cv = stdDev res_l
+ -- offline instances metrics
+ offline_ipri = sum . map (length . Node.pList) $ offline
+ offline_isec = sum . map (length . Node.sList) $ offline
+ -- metric: count of instances on offline nodes
+ off_score = fromIntegral (offline_ipri + offline_isec)::Double
+ -- metric: count of primary instances on offline nodes (this
+ -- helps with evacuation/failover of primary instances on
+ -- 2-node clusters with one node offline)
+ off_pri_score = fromIntegral offline_ipri::Double
cpu_l = map Node.pCpu nodes
- cpu_cv = varianceCoeff cpu_l
+ -- metric: covariance of vcpu/pcpu ratio
+ cpu_cv = stdDev cpu_l
+ -- metrics: covariance of cpu, memory, disk and network load
(c_load, m_load, d_load, n_load) = unzip4 $
map (\n ->
let DynUtil c1 m1 d1 n1 = Node.utilLoad n
DynUtil c2 m2 d2 n2 = Node.utilPool n
in (c1/c2, m1/m2, d1/d2, n1/n2)
) nodes
- in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv
- , varianceCoeff c_load, varianceCoeff m_load
- , varianceCoeff d_load, varianceCoeff n_load]
+ -- metric: conflicting instance count
+ pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
+ pri_tags_score = fromIntegral pri_tags_inst::Double
+ in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
+ , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
+ , pri_tags_score ]
-- | Compute the /total/ variance.
compCV :: Node.List -> Double
-compCV = sum . compDetailedCV
+compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
-- | Compute online nodes from a Node.List
getOnline :: Node.List -> [Node.Node]
old_s = Container.find old_sdx nl
int_p = Node.removePri old_p inst
int_s = Node.removeSec old_s inst
+ force_p = Node.offline old_p
new_nl = do -- Maybe monad
- new_p <- Node.addPri int_s inst
+ new_p <- Node.addPriEx force_p int_s inst
new_s <- Node.addSec int_p inst old_sdx
let new_inst = Instance.setBoth inst old_sdx old_pdx
return (Container.addTwo old_pdx new_s old_sdx new_p nl,
tgt_n = Container.find new_pdx nl
int_p = Node.removePri old_p inst
int_s = Node.removeSec old_s inst
+ force_p = Node.offline old_p
new_nl = do -- Maybe monad
-- check that the current secondary can host the instance
-- during the migration
- tmp_s <- Node.addPri int_s inst
+ tmp_s <- Node.addPriEx force_p int_s inst
let tmp_s' = Node.removePri tmp_s inst
- new_p <- Node.addPri tgt_n inst
- new_s <- Node.addSec tmp_s' inst new_pdx
+ new_p <- Node.addPriEx force_p tgt_n inst
+ new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
let new_inst = Instance.setPri inst new_pdx
return (Container.add new_pdx new_p $
Container.addTwo old_pdx int_p old_sdx new_s nl,
old_s = Container.find old_sdx nl
tgt_n = Container.find new_sdx nl
int_s = Node.removeSec old_s inst
+ force_s = Node.offline old_s
new_inst = Instance.setSec inst new_sdx
- new_nl = Node.addSec tgt_n inst old_pdx >>=
+ new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
\new_s -> return (Container.addTwo new_sdx
new_s old_sdx int_s nl,
new_inst, old_pdx, new_sdx)
tgt_n = Container.find new_pdx nl
int_p = Node.removePri old_p inst
int_s = Node.removeSec old_s inst
+ force_s = Node.offline old_s
new_nl = do -- Maybe monad
new_p <- Node.addPri tgt_n inst
- new_s <- Node.addSec int_p inst new_pdx
+ new_s <- Node.addSecEx force_s int_p inst new_pdx
let new_inst = Instance.setBoth inst new_pdx old_pdx
return (Container.add new_pdx new_p $
Container.addTwo old_pdx new_s old_sdx int_s nl,
tgt_n = Container.find new_sdx nl
int_p = Node.removePri old_p inst
int_s = Node.removeSec old_s inst
+ force_p = Node.offline old_p
new_nl = do -- Maybe monad
- new_p <- Node.addPri int_s inst
- new_s <- Node.addSec tgt_n inst old_sdx
+ new_p <- Node.addPriEx force_p int_s inst
+ new_s <- Node.addSecEx force_p tgt_n inst old_sdx
let new_inst = Instance.setBoth inst old_sdx new_sdx
return (Container.add new_sdx new_s $
Container.addTwo old_sdx new_p old_pdx int_p nl,
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
- -> OpResult AllocElement
+ -> OpResult Node.AllocElement
allocateOnSingle nl inst p =
let new_pdx = Node.idx p
new_inst = Instance.setBoth inst new_pdx Node.noSecondary
- new_nl = Node.addPri p inst >>= \new_p ->
- return (Container.add new_pdx new_p nl, new_inst, [new_p])
- in new_nl
+ in Node.addPri p inst >>= \new_p -> do
+ let new_nl = Container.add new_pdx new_p nl
+ -- score the *updated* node list, consistent with
+ -- allocateOnPair; scoring the pre-allocation list would
+ -- yield the same value for every candidate node, breaking
+ -- solution ranking in concatAllocs
+ new_score = compCV new_nl
+ return (new_nl, new_inst, [new_p], new_score)
-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
- -> OpResult AllocElement
+ -> OpResult Node.AllocElement
allocateOnPair nl inst tgt_p tgt_s =
let new_pdx = Node.idx tgt_p
new_sdx = Node.idx tgt_s
- new_nl = do -- Maybe monad
- new_p <- Node.addPri tgt_p inst
- new_s <- Node.addSec tgt_s inst new_pdx
- let new_inst = Instance.setBoth inst new_pdx new_sdx
- return (Container.addTwo new_pdx new_p new_sdx new_s nl, new_inst,
- [new_p, new_s])
- in new_nl
+ in do
+ new_p <- Node.addPri tgt_p inst
+ new_s <- Node.addSec tgt_s inst new_pdx
+ let new_inst = Instance.setBoth inst new_pdx new_sdx
+ new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
+ return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
best_tbl =
foldl'
(\ step_tbl em ->
- if Instance.sNode em == Node.noSecondary then step_tbl
- else compareTables step_tbl $
- checkInstanceMove nodes_idx disk_moves ini_tbl em)
+ compareTables step_tbl $
+ checkInstanceMove nodes_idx disk_moves ini_tbl em)
ini_tbl victims
Table _ _ _ best_plc = best_tbl
- in
- if length best_plc == length ini_plc then -- no advancement
- ini_tbl
- else
- best_tbl
+ in if length best_plc == length ini_plc
+ then ini_tbl -- no advancement
+ else best_tbl
+
+-- | Check if we are allowed to go deeper in the balancing
+doNextBalance :: Table -- ^ The starting table
+ -> Int -- ^ Maximum number of rounds (negative
+ -- means no limit)
+ -> Score -- ^ Score at which to stop
+ -> Bool -- ^ True if another balancing round
+ -- should be attempted
+doNextBalance ini_tbl max_rounds min_score =
+ let Table _ _ ini_cv ini_plc = ini_tbl
+ ini_plc_len = length ini_plc
+ -- each placement done so far counts as one round
+ in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
-- | Run a balance move
-
tryBalance :: Table -- ^ The starting table
- -> Int -- ^ Remaining length
-> Bool -- ^ Allow disk moves
- -> Score -- ^ Score at which to stop
+ -> Bool -- ^ Only evacuate moves
+ -> Score -- ^ Min gain threshold
+ -> Score -- ^ Min gain
-> Maybe Table -- ^ The resulting table and commands
-tryBalance ini_tbl max_rounds disk_moves min_score =
- let Table ini_nl ini_il ini_cv ini_plc = ini_tbl
- ini_plc_len = length ini_plc
- allowed_next = (max_rounds < 0 || ini_plc_len < max_rounds) &&
- ini_cv > min_score
+tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
+ let Table ini_nl ini_il ini_cv _ = ini_tbl
+ all_inst = Container.elems ini_il
+ all_inst' = if evac_mode
+ then let bad_nodes = map Node.idx . filter Node.offline $
+ Container.elems ini_nl
+ in filter (\e -> Instance.sNode e `elem` bad_nodes ||
+ Instance.pNode e `elem` bad_nodes)
+ all_inst
+ else all_inst
+ reloc_inst = filter Instance.movable all_inst'
+ node_idx = map Node.idx . filter (not . Node.offline) $
+ Container.elems ini_nl
+ fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
+ (Table _ _ fin_cv _) = fin_tbl
in
- if allowed_next
- then let all_inst = Container.elems ini_il
- node_idx = map Node.idx . filter (not . Node.offline) $
- Container.elems ini_nl
- fin_tbl = checkMove node_idx disk_moves ini_tbl all_inst
- (Table _ _ fin_cv _) = fin_tbl
- in
- if fin_cv < ini_cv
- then Just fin_tbl -- this round made success, try deeper
- else Nothing
+ if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
+ then Just fin_tbl -- this round made success, return the new table
else Nothing
-- * Allocation functions
-- | Build failure stats out of a list of failures
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
- map (\k -> (k, length $ filter ((==) k) flst)) [minBound..maxBound]
+ map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
-- | Update current Allocation solution and failure stats with new
-- elements
-concatAllocs :: AllocSolution -> OpResult AllocElement -> AllocSolution
-concatAllocs (flst, cntok, sols) (OpFail reason) = (reason:flst, cntok, sols)
+concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
+concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
-concatAllocs (flst, cntok, osols) (OpGood ns@(nl, _, _)) =
- let nscore = compCV nl
- -- Choose the old or new solution, based on the cluster score
+concatAllocs as (OpGood ns@(_, _, _, nscore)) =
+ let -- Choose the old or new solution, based on the cluster score
+ cntok = asAllocs as
+ osols = asSolutions as
nsols = case osols of
- Nothing -> Just (nscore, ns)
- Just (oscore, _) ->
+ [] -> [ns]
+ (_, _, _, oscore):[] ->
if oscore < nscore
then osols
- else Just (nscore, ns)
+ else [ns]
+ -- FIXME: here we simply concat to lists with more
+ -- than one element; we should instead abort, since
+ -- this is not a valid usage of this function
+ xs -> ns:xs
nsuc = cntok + 1
-- Note: we force evaluation of nsols here in order to keep the
-- memory profile low - we know that we will need nsols for sure
-- in the next cycle, so we force evaluation of nsols, since the
-- foldl' in the caller will only evaluate the tuple, but not the
-- elements of the tuple
- in nsols `seq` nsuc `seq` (flst, nsuc, nsols)
+ in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
+
+-- | Given a solution, generates a reasonable description for it
+describeSolution :: AllocSolution -> String
+describeSolution as =
+ let fcnt = asFailures as
+ sols = asSolutions as
+ -- textual summary of failure reasons; zero counts are elided
+ freasons =
+ intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
+ filter ((> 0) . snd) . collapseFailures $ fcnt
+ in if null sols
+ then "No valid allocation solutions, failure reasons: " ++
+ (if null fcnt
+ then "unknown reasons"
+ else freasons)
+ -- 'head' is safe here: the null case was handled above
+ else let (_, _, nodes, cv) = head sols
+ in printf ("score: %.8f, successes %d, failures %d (%s)" ++
+ " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
+ (intercalate "/" . map Node.name $ nodes)
+
+-- | Annotates a solution with the appropriate string (prepends the
+-- description, so the newest message comes first in 'asLog')
+annotateSolution :: AllocSolution -> AllocSolution
+annotateSolution as = as { asLog = describeSolution as : asLog as }
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
sols = foldl' (\cstate (p, s) ->
concatAllocs cstate $ allocateOnPair nl inst p s
- ) ([], 0, Nothing) ok_pairs
- in return sols
+ ) emptySolution ok_pairs
+
+ in if null ok_pairs -- means we have just one node
+ then fail "Not enough online nodes"
+ else return $ annotateSolution sols
tryAlloc nl _ inst 1 =
let all_nodes = getOnline nl
sols = foldl' (\cstate ->
concatAllocs cstate . allocateOnSingle nl inst
- ) ([], 0, Nothing) all_nodes
- in return sols
+ ) emptySolution all_nodes
+ in if null all_nodes
+ then fail "No online nodes"
+ else return $ annotateSolution sols
tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
\destinations required (" ++ show reqn ++
"), only two supported"
--- | Try to allocate an instance on the cluster.
+-- | Given a group/result, describe it as a nice (list of) messages;
+-- each log line of a good solution is prefixed with the group name
+-- and allocation policy, errors produce a single message.
+solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
+solutionDescription gl (groupId, result) =
+ case result of
+ Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
+ Bad message -> [printf "Group %s: error %s" gname message]
+ where grp = Container.find groupId gl
+ gname = Group.name grp
+ pol = apolToString (Group.allocPolicy grp)
+
+-- | From a list of possibly bad and possibly empty solutions, filter
+-- only the groups with a valid result and an allocation policy other
+-- than 'AllocUnallocable'.
+filterMGResults :: Group.List
+ -> [(Gdx, Result AllocSolution)]
+ -> [(Gdx, AllocSolution)]
+filterMGResults gl =
+ -- the partial pattern (y, Ok x) is safe: failed results have
+ -- already been dropped by the 'isOk' filter just below it
+ filter ((/= AllocUnallocable) . Group.allocPolicy .
+ flip Container.find gl . fst) .
+ filter (not . null . asSolutions . snd) .
+ map (\(y, Ok x) -> (y, x)) .
+ filter (isOk . snd)
+
+-- | Sort multigroup results: primary key is the group's allocation
+-- policy, secondary key the cluster score of its best solution.
+sortMGResults :: Group.List
+ -> [(Gdx, AllocSolution)]
+ -> [(Gdx, AllocSolution)]
+sortMGResults gl sols = sortBy (comparing solScore) sols
+ where extractScore (_, _, _, x) = x
+ solScore (gdx, sol) =
+ (Group.allocPolicy (Container.find gdx gl),
+ extractScore . head . asSolutions $ sol)
+
+-- | Try to allocate an instance on a multi-group cluster.
+tryMGAlloc :: Group.List -- ^ The group list
+ -> Node.List -- ^ The node list
+ -> Instance.List -- ^ The instance list
+ -> Instance.Instance -- ^ The instance to allocate
+ -> Int -- ^ Required number of nodes
+ -> Result AllocSolution -- ^ Possible solution list
+tryMGAlloc mggl mgnl mgil inst cnt =
+ let groups = splitCluster mgnl mgil
+ -- TODO: currently we consider all groups preferred
+ -- each group is tried independently, on its own node/instance
+ -- sub-lists
+ sols = map (\(gid, (nl, il)) ->
+ (gid, tryAlloc nl il inst cnt)) groups::
+ [(Gdx, Result AllocSolution)]
+ all_msgs = concatMap (solutionDescription mggl) sols
+ goodSols = filterMGResults mggl sols
+ -- NOTE(review): ascending sort on (policy, score), so the head
+ -- is the most-preferred-policy, lowest-score group -- depends on
+ -- the Ord instance of the allocation policy type; confirm
+ sortedSols = sortMGResults mggl goodSols
+ in if null sortedSols
+ then Bad $ intercalate ", " all_msgs
+ -- 'head' is safe here: the null case was handled above
+ else let (final_group, final_sol) = head sortedSols
+ final_name = Group.name $ Container.find final_group mggl
+ selmsg = "Selected group: " ++ final_name
+ in Ok $ final_sol { asLog = selmsg:all_msgs }
+
+-- | Try to relocate an instance on the cluster.
tryReloc :: (Monad m) =>
Node.List -- ^ The node list
-> Instance.List -- ^ The instance list
let em = do
(mnl, i, _, _) <-
applyMove nl inst (ReplaceSecondary x)
- return (mnl, i, [Container.find x mnl])
+ return (mnl, i, [Container.find x mnl],
+ compCV mnl)
in concatAllocs cstate em
- ) ([], 0, Nothing) valid_idxes
+ ) emptySolution valid_idxes
in return sols1
tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
\destinations required (" ++ show reqn ++
"), only one supported"
+-- | Try to evacuate a list of nodes.
+--
+-- NOTE(review): only instances having a *secondary* on the evacuated
+-- nodes are gathered (Node.sList); primaries on those nodes are not
+-- relocated here -- confirm callers fail them over beforehand.
+tryEvac :: (Monad m) =>
+ Node.List -- ^ The node list
+ -> Instance.List -- ^ The instance list
+ -> [Ndx] -- ^ Nodes to be evacuated
+ -> m AllocSolution -- ^ Solution list
+tryEvac nl il ex_ndx =
+ let ex_nodes = map (`Container.find` nl) ex_ndx
+ all_insts = nub . concatMap Node.sList $ ex_nodes
+ in do
+ -- fold over the instances, threading the progressively-updated
+ -- node list through each individual relocation
+ (_, sol) <- foldM (\(nl', old_as) idx -> do
+ -- FIXME: hardcoded one node here
+ -- (fm, cs, aes)
+ new_as <- tryReloc nl' il idx 1 ex_ndx
+ case asSolutions new_as of
+ csol@(nl'', _, _, _):_ ->
+ -- an individual relocation succeeded,
+ -- we kind of compose the data from
+ -- the two solutions
+ return (nl'',
+ new_as { asSolutions =
+ csol:asSolutions old_as })
+ -- this relocation failed, so we fail
+ -- the entire evac
+ _ -> fail $ "Can't evacuate instance " ++
+ Instance.name (Container.find idx il) ++
+ ": " ++ describeSolution new_as
+ ) (nl, emptySolution) all_insts
+ return $ annotateSolution sol
+
+-- | Recursively place instances on the cluster until we're out of space.
+--
+-- Returns the collapsed failure counts, the final node and instance
+-- lists, and the instances placed (newest first).
+iterateAlloc :: Node.List
+ -> Instance.List
+ -> Instance.Instance
+ -> Int
+ -> [Instance.Instance]
+ -> Result (FailStats, Node.List, Instance.List,
+ [Instance.Instance])
+iterateAlloc nl il newinst nreq ixes =
+ let depth = length ixes
+ -- synthetic names/indices for the new instances ("new-0", ...)
+ newname = printf "new-%d" depth::String
+ newidx = length (Container.elems il) + depth
+ newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
+ in case tryAlloc nl il newi2 nreq of
+ Bad s -> Bad s
+ Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
+ case sols3 of
+ -- no placement possible any more: recursion ends here
+ [] -> Ok (collapseFailures errs, nl, il, ixes)
+ (xnl, xi, _, _):[] ->
+ iterateAlloc xnl (Container.add newidx xi il)
+ newinst nreq $! (xi:ixes)
+ _ -> Bad "Internal error: multiple solutions for single\
+ \ allocation"
+
+-- | The core of the tiered allocation mode: allocate at the current
+-- spec until full, then shrink the instance spec and retry.
+tieredAlloc :: Node.List
+ -> Instance.List
+ -> Instance.Instance
+ -> Int
+ -> [Instance.Instance]
+ -> Result (FailStats, Node.List, Instance.List,
+ [Instance.Instance])
+tieredAlloc nl il newinst nreq ixes =
+ case iterateAlloc nl il newinst nreq ixes of
+ Bad s -> Bad s
+ Ok (errs, nl', il', ixes') ->
+ -- shrink along the resource with the most failures ('last' of
+ -- the count-sorted list); NOTE(review): assumes errs is
+ -- non-empty, which holds since collapseFailures enumerates
+ -- every failure mode -- confirm
+ case Instance.shrinkByType newinst . fst . last $
+ sortBy (comparing snd) errs of
+ -- can't shrink further: return what we allocated so far
+ Bad _ -> Ok (errs, nl', il', ixes')
+ Ok newinst' ->
+ tieredAlloc nl' il' newinst' nreq ixes'
+
+-- | Compute the tiered spec string description from a list of
+-- allocated instances (given in reverse allocation order).
+tieredSpecMap :: [Instance.Instance]
+ -> [String]
+tieredSpecMap trl_ixes = map fmtGroup spec_counts
+ where -- restore chronological allocation order
+ ordered = reverse trl_ixes
+ -- runs of consecutive instances sharing the same spec;
+ -- groupBy yields non-empty groups, so 'head' is safe
+ runs = groupBy ((==) `on` Instance.specOf) ordered
+ spec_counts = [ (Instance.specOf (head run), length run)
+ | run <- runs ]
+ fmtGroup (spec, cnt) =
+ printf "%d,%d,%d=%d" (rspecMem spec)
+ (rspecDsk spec) (rspecCpu spec) cnt
+
-- * Formatting functions
-- | Given the original and final nodes, computes the relocation description.
pmlen = (2*nmlen + 1)
(i, p, s, mv, c) = plc
inst = Container.find i il
- inam = Instance.name inst
- npri = Container.nameOf nl p
- nsec = Container.nameOf nl s
- opri = Container.nameOf nl $ Instance.pNode inst
- osec = Container.nameOf nl $ Instance.sNode inst
+ inam = Instance.alias inst
+ npri = Node.alias $ Container.find p nl
+ nsec = Node.alias $ Container.find s nl
+ opri = Node.alias $ Container.find (Instance.pNode inst) nl
+ osec = Node.alias $ Container.find (Instance.sNode inst) nl
(moves, cmds) = computeMoves inst inam mv npri nsec
ostr = printf "%s:%s" opri osec::String
nstr = printf "%s:%s" npri nsec::String
(zip [1..] js)) .
zip [1..]
--- | Converts a solution to string format.
-printSolution :: Node.List
- -> Instance.List
- -> [Placement]
- -> ([String], [[String]])
-printSolution nl il sol =
- let
- nmlen = Container.maxNameLen nl
- imlen = Container.maxNameLen il
- in
- unzip $ zipWith (printSolutionLine nl il nmlen imlen) sol [1..]
-
-- | Print the node list.
-printNodes :: Node.List -> String
-printNodes nl =
- let snl = sortBy (compare `on` Node.idx) (Container.elems nl)
- header = ["F", "Name"
- , "t_mem", "n_mem", "i_mem", "x_mem", "f_mem", "r_mem"
- , "t_dsk", "f_dsk", "pcpu", "vcpu", "pri", "sec"
- , "p_fmem", "p_fdsk", "r_cpu"
- , "lCpu", "lMem", "lDsk", "lNet" ]
- isnum = False:False:repeat True
+printNodes :: Node.List -> [String] -> String
+printNodes nl fs =
+ let fields = case fs of
+ [] -> Node.defaultFields
+ "+":rest -> Node.defaultFields ++ rest
+ _ -> fs
+ snl = sortBy (comparing Node.idx) (Container.elems nl)
+ (header, isnum) = unzip $ map Node.showHeader fields
in unlines . map ((:) ' ' . intercalate " ") $
- formatTable (header:map Node.list snl) isnum
+ formatTable (header:map (Node.list fields) snl) isnum
-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
- let sil = sortBy (compare `on` Instance.idx) (Container.elems il)
- helper inst = [ (Instance.name inst)
- , (Container.nameOf nl (Instance.pNode inst))
- , (let sdx = Instance.sNode inst
- in if sdx == Node.noSecondary
- then ""
- else Container.nameOf nl sdx) ]
- header = ["Name", "Pri_node", "Sec_node"]
- isnum = repeat False
+ let sil = sortBy (comparing Instance.idx) (Container.elems il)
+ helper inst = [ if Instance.running inst then "R" else " "
+ , Instance.name inst
+ , Container.nameOf nl (Instance.pNode inst)
+ , let sdx = Instance.sNode inst
+ in if sdx == Node.noSecondary
+ then ""
+ else Container.nameOf nl sdx
+ , printf "%3d" $ Instance.vcpus inst
+ , printf "%5d" $ Instance.mem inst
+ , printf "%5d" $ Instance.dsk inst `div` 1024
+ , printf "%5.3f" lC
+ , printf "%5.3f" lM
+ , printf "%5.3f" lD
+ , printf "%5.3f" lN
+ ]
+ where DynUtil lC lM lD lN = Instance.util inst
+ header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
+ , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
+ isnum = False:False:False:False:repeat True
in unlines . map ((:) ' ' . intercalate " ") $
formatTable (header:map helper sil) isnum
printStats :: Node.List -> String
printStats nl =
let dcvs = compDetailedCV nl
- hd = zip (detailedCVNames ++ repeat "unknown") dcvs
- formatted = map (\(header, val) ->
- printf "%s=%.8f" header val::String) hd
+ (weights, names) = unzip detailedCVInfo
+ hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
+ formatted = map (\(w, header, val) ->
+ printf "%s=%.8f(x%.2f)" header val w::String) hd
in intercalate ", " formatted
-- | Convert a placement into a list of OpCodes (basically a job).
-iMoveToJob :: String -> Node.List -> Instance.List
+iMoveToJob :: Node.List -> Instance.List
-> Idx -> IMove -> [OpCodes.OpCode]
-iMoveToJob csf nl il idx move =
+iMoveToJob nl il idx move =
let inst = Container.find idx il
- iname = Instance.name inst ++ csf
- lookNode n = Just (Container.nameOf nl n ++ csf)
+ iname = Instance.name inst
+ lookNode = Just . Container.nameOf nl
opF = if Instance.running inst
then OpCodes.OpMigrateInstance iname True False
else OpCodes.OpFailoverInstance iname False
ReplaceSecondary ns -> [ opR ns ]
ReplaceAndFailover np -> [ opR np, opF ]
FailoverAndReplace ns -> [ opF, opR ns ]
+
+-- * Node group functions
+
+-- | Computes the group of an instance.
+--
+-- An instance with no secondary belongs to its primary's group; an
+-- instance whose primary and secondary live in different groups is an
+-- error.
+instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
+instanceGroup nl i =
+ let sidx = Instance.sNode i
+ pnode = Container.find (Instance.pNode i) nl
+ snode = if sidx == Node.noSecondary
+ then pnode
+ else Container.find sidx nl
+ pgroup = Node.group pnode
+ sgroup = Node.group snode
+ in if pgroup /= sgroup
+ -- fixed misspelling in the error message ("accross")
+ then fail ("Instance placed across two node groups, primary " ++
+ show pgroup ++ ", secondary " ++ show sgroup)
+ else return pgroup
+
+-- | Compute the list of badly allocated instances (split across node
+-- groups)
+findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
+findSplitInstances nl il =
+ [ inst | inst <- Container.elems il
+ , not . isOk $ instanceGroup nl inst ]
+
+-- | Splits a cluster into the component node groups.
+--
+-- NOTE(review): instances are assigned to a group by their *primary*
+-- node only; split instances end up in their primary's group.
+splitCluster :: Node.List -> Instance.List ->
+ [(Gdx, (Node.List, Instance.List))]
+splitCluster nl il =
+ let ngroups = Node.computeGroups (Container.elems nl)
+ in map (\(guuid, nodes) ->
+ let nidxs = map Node.idx nodes
+ nodes' = zip nidxs nodes
+ instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
+ in (guuid, (Container.fromAssocList nodes', instances))) ngroups