module Ganeti.HTools.Cluster
(
-- * Types
- Placement
- , AllocSolution
+ AllocSolution
, Table(..)
- , Score
- , IMove(..)
, CStats(..)
-- * Generic functions
, totalResources
, printSolution
, printSolutionLine
, formatCmds
+ , involvedNodes
+ , splitJobs
+ -- * Display functions
, printNodes
+ , printInsts
-- * Balacing functions
, checkMove
+ , doNextBalance
+ , tryBalance
, compCV
, printStats
+ , iMoveToJob
-- * IAllocator functions
, tryAlloc
, tryReloc
+ , tryEvac
, collapseFailures
) where
import qualified Ganeti.HTools.Node as Node
import Ganeti.HTools.Types
import Ganeti.HTools.Utils
+import qualified Ganeti.OpCodes as OpCodes
-- * Types
--- | A separate name for the cluster score type.
-type Score = Double
-
--- | The description of an instance placement.
-type Placement = (Idx, Ndx, Ndx, Score)
-
-- | Allocation\/relocation solution.
-type AllocSolution = ([FailMode], Int, Maybe (Score, AllocElement))
-
--- | Allocation\/relocation element.
-type AllocElement = (Node.List, Instance.Instance, [Node.Node])
-
--- | An instance move definition
-data IMove = Failover -- ^ Failover the instance (f)
- | ReplacePrimary Ndx -- ^ Replace primary (f, r:np, f)
- | ReplaceSecondary Ndx -- ^ Replace secondary (r:ns)
- | ReplaceAndFailover Ndx -- ^ Replace secondary, failover (r:np, f)
- | FailoverAndReplace Ndx -- ^ Failover, replace secondary (f, r:ns)
- deriving (Show)
+type AllocSolution = ([FailMode], Int, [(Score, Node.AllocElement)])
-- | The complete state for the balancing solution
data Table = Table Node.List Instance.List Score [Placement]
deriving (Show)
-data CStats = CStats { cs_fmem :: Int -- ^ Cluster free mem
- , cs_fdsk :: Int -- ^ Cluster free disk
- , cs_amem :: Int -- ^ Cluster allocatable mem
- , cs_adsk :: Int -- ^ Cluster allocatable disk
- , cs_acpu :: Int -- ^ Cluster allocatable cpus
- , cs_mmem :: Int -- ^ Max node allocatable mem
- , cs_mdsk :: Int -- ^ Max node allocatable disk
- , cs_mcpu :: Int -- ^ Max node allocatable cpu
- , cs_imem :: Int -- ^ Instance used mem
- , cs_idsk :: Int -- ^ Instance used disk
- , cs_icpu :: Int -- ^ Instance used cpu
- , cs_tmem :: Double -- ^ Cluster total mem
- , cs_tdsk :: Double -- ^ Cluster total disk
- , cs_tcpu :: Double -- ^ Cluster total cpus
- , cs_xmem :: Int -- ^ Unnacounted for mem
- , cs_nmem :: Int -- ^ Node own memory
- , cs_score :: Score -- ^ The cluster score
- , cs_ninst :: Int -- ^ The total number of instances
+data CStats = CStats { csFmem :: Int -- ^ Cluster free mem
+ , csFdsk :: Int -- ^ Cluster free disk
+ , csAmem :: Int -- ^ Cluster allocatable mem
+ , csAdsk :: Int -- ^ Cluster allocatable disk
+ , csAcpu :: Int -- ^ Cluster allocatable cpus
+ , csMmem :: Int -- ^ Max node allocatable mem
+ , csMdsk :: Int -- ^ Max node allocatable disk
+ , csMcpu :: Int -- ^ Max node allocatable cpu
+ , csImem :: Int -- ^ Instance used mem
+ , csIdsk :: Int -- ^ Instance used disk
+ , csIcpu :: Int -- ^ Instance used cpu
+ , csTmem :: Double -- ^ Cluster total mem
+ , csTdsk :: Double -- ^ Cluster total disk
+ , csTcpu :: Double -- ^ Cluster total cpus
+                     , csXmem :: Int    -- ^ Unaccounted for mem
+ , csNmem :: Int -- ^ Node own memory
+ , csScore :: Score -- ^ The cluster score
+ , csNinst :: Int -- ^ The total number of instances
}
-- * Utility functions
let bad_nodes = verifyN1 $ getOnline nl
bad_instances = map (\idx -> Container.find idx il) .
sort . nub $
- concatMap (\ n -> Node.slist n ++ Node.plist n) bad_nodes
+ concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
in
(bad_nodes, bad_instances)
+-- | Zero-initializer for the CStats type
emptyCStats :: CStats
-emptyCStats = CStats { cs_fmem = 0
- , cs_fdsk = 0
- , cs_amem = 0
- , cs_adsk = 0
- , cs_acpu = 0
- , cs_mmem = 0
- , cs_mdsk = 0
- , cs_mcpu = 0
- , cs_imem = 0
- , cs_idsk = 0
- , cs_icpu = 0
- , cs_tmem = 0
- , cs_tdsk = 0
- , cs_tcpu = 0
- , cs_xmem = 0
- , cs_nmem = 0
- , cs_score = 0
- , cs_ninst = 0
- }
+emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+-- | Update stats with data from a new node
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
- let CStats { cs_fmem = x_fmem, cs_fdsk = x_fdsk,
- cs_amem = x_amem, cs_acpu = x_acpu, cs_adsk = x_adsk,
- cs_mmem = x_mmem, cs_mdsk = x_mdsk, cs_mcpu = x_mcpu,
- cs_imem = x_imem, cs_idsk = x_idsk, cs_icpu = x_icpu,
- cs_tmem = x_tmem, cs_tdsk = x_tdsk, cs_tcpu = x_tcpu,
- cs_xmem = x_xmem, cs_nmem = x_nmem, cs_ninst = x_ninst
+ let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
+ csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
+ csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
+ csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
+ csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
+ csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
}
= cs
- inc_amem = Node.f_mem node - Node.r_mem node
+ inc_amem = Node.fMem node - Node.rMem node
inc_amem' = if inc_amem > 0 then inc_amem else 0
inc_adsk = Node.availDisk node
- inc_imem = truncate (Node.t_mem node) - Node.n_mem node
- - Node.x_mem node - Node.f_mem node
- inc_icpu = Node.u_cpu node
- inc_idsk = truncate (Node.t_dsk node) - Node.f_dsk node
-
- in cs { cs_fmem = x_fmem + Node.f_mem node
- , cs_fdsk = x_fdsk + Node.f_dsk node
- , cs_amem = x_amem + inc_amem'
- , cs_adsk = x_adsk + inc_adsk
- , cs_acpu = x_acpu
- , cs_mmem = max x_mmem inc_amem'
- , cs_mdsk = max x_mdsk inc_adsk
- , cs_mcpu = x_mcpu
- , cs_imem = x_imem + inc_imem
- , cs_idsk = x_idsk + inc_idsk
- , cs_icpu = x_icpu + inc_icpu
- , cs_tmem = x_tmem + Node.t_mem node
- , cs_tdsk = x_tdsk + Node.t_dsk node
- , cs_tcpu = x_tcpu + Node.t_cpu node
- , cs_xmem = x_xmem + Node.x_mem node
- , cs_nmem = x_nmem + Node.n_mem node
- , cs_ninst = x_ninst + length (Node.plist node)
+ inc_imem = truncate (Node.tMem node) - Node.nMem node
+ - Node.xMem node - Node.fMem node
+ inc_icpu = Node.uCpu node
+ inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
+
+ in cs { csFmem = x_fmem + Node.fMem node
+ , csFdsk = x_fdsk + Node.fDsk node
+ , csAmem = x_amem + inc_amem'
+ , csAdsk = x_adsk + inc_adsk
+ , csAcpu = x_acpu
+ , csMmem = max x_mmem inc_amem'
+ , csMdsk = max x_mdsk inc_adsk
+ , csMcpu = x_mcpu
+ , csImem = x_imem + inc_imem
+ , csIdsk = x_idsk + inc_idsk
+ , csIcpu = x_icpu + inc_icpu
+ , csTmem = x_tmem + Node.tMem node
+ , csTdsk = x_tdsk + Node.tDsk node
+ , csTcpu = x_tcpu + Node.tCpu node
+ , csXmem = x_xmem + Node.xMem node
+ , csNmem = x_nmem + Node.nMem node
+ , csNinst = x_ninst + length (Node.pList node)
}
-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
- in cs { cs_score = compCV nl }
+ in cs { csScore = compCV nl }
+
+-- | The names of the individual elements in the CV list
+detailedCVNames :: [String]
+detailedCVNames = [ "free_mem_cv"
+ , "free_disk_cv"
+ , "n1_cnt"
+ , "reserved_mem_cv"
+ , "offline_all_cnt"
+ , "offline_pri_cnt"
+ , "vcpu_ratio_cv"
+ , "cpu_load_cv"
+ , "mem_load_cv"
+ , "disk_load_cv"
+ , "net_load_cv"
+ , "pri_tags_score"
+ ]
-- | Compute the mem and disk covariance.
-compDetailedCV :: Node.List -> (Double, Double, Double, Double, Double, Double)
+compDetailedCV :: Node.List -> [Double]
compDetailedCV nl =
let
all_nodes = Container.elems nl
(offline, nodes) = partition Node.offline all_nodes
- mem_l = map Node.p_mem nodes
- dsk_l = map Node.p_dsk nodes
+ mem_l = map Node.pMem nodes
+ dsk_l = map Node.pDsk nodes
+ -- metric: memory covariance
mem_cv = varianceCoeff mem_l
+ -- metric: disk covariance
dsk_cv = varianceCoeff dsk_l
n1_l = length $ filter Node.failN1 nodes
- n1_score = fromIntegral n1_l /
- fromIntegral (length nodes)::Double
- res_l = map Node.p_rem nodes
+ -- metric: count of failN1 nodes
+ n1_score = fromIntegral n1_l::Double
+ res_l = map Node.pRem nodes
+ -- metric: reserved memory covariance
res_cv = varianceCoeff res_l
- offline_inst = sum . map (\n -> (length . Node.plist $ n) +
- (length . Node.slist $ n)) $ offline
- online_inst = sum . map (\n -> (length . Node.plist $ n) +
- (length . Node.slist $ n)) $ nodes
- off_score = if offline_inst == 0
- then 0::Double
- else fromIntegral offline_inst /
- fromIntegral (offline_inst + online_inst)::Double
- cpu_l = map Node.p_cpu nodes
+ -- offline instances metrics
+ offline_ipri = sum . map (length . Node.pList) $ offline
+ offline_isec = sum . map (length . Node.sList) $ offline
+ -- metric: count of instances on offline nodes
+ off_score = fromIntegral (offline_ipri + offline_isec)::Double
+ -- metric: count of primary instances on offline nodes (this
+ -- helps with evacuation/failover of primary instances on
+ -- 2-node clusters with one node offline)
+ off_pri_score = fromIntegral offline_ipri::Double
+ cpu_l = map Node.pCpu nodes
+ -- metric: covariance of vcpu/pcpu ratio
cpu_cv = varianceCoeff cpu_l
- in (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv)
+ -- metrics: covariance of cpu, memory, disk and network load
+ (c_load, m_load, d_load, n_load) = unzip4 $
+ map (\n ->
+ let DynUtil c1 m1 d1 n1 = Node.utilLoad n
+ DynUtil c2 m2 d2 n2 = Node.utilPool n
+ in (c1/c2, m1/m2, d1/d2, n1/n2)
+ ) nodes
+ -- metric: conflicting instance count
+ pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
+ pri_tags_score = fromIntegral pri_tags_inst::Double
+ in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
+ , varianceCoeff c_load, varianceCoeff m_load
+ , varianceCoeff d_load, varianceCoeff n_load
+ , pri_tags_score ]
-- | Compute the /total/ variance.
compCV :: Node.List -> Double
-compCV nl =
- let (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv) =
- compDetailedCV nl
- in mem_cv + dsk_cv + n1_score + res_cv + off_score + cpu_cv
+compCV = sum . compDetailedCV
-- | Compute online nodes from a Node.List
getOnline :: Node.List -> [Node.Node]
-> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
- let old_pdx = Instance.pnode inst
- old_sdx = Instance.snode inst
+ let old_pdx = Instance.pNode inst
+ old_sdx = Instance.sNode inst
old_p = Container.find old_pdx nl
old_s = Container.find old_sdx nl
int_p = Node.removePri old_p inst
-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
- let old_pdx = Instance.pnode inst
- old_sdx = Instance.snode inst
+ let old_pdx = Instance.pNode inst
+ old_sdx = Instance.sNode inst
old_p = Container.find old_pdx nl
old_s = Container.find old_sdx nl
tgt_n = Container.find new_pdx nl
-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
- let old_pdx = Instance.pnode inst
- old_sdx = Instance.snode inst
+ let old_pdx = Instance.pNode inst
+ old_sdx = Instance.sNode inst
old_s = Container.find old_sdx nl
tgt_n = Container.find new_sdx nl
int_s = Node.removeSec old_s inst
-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
- let old_pdx = Instance.pnode inst
- old_sdx = Instance.snode inst
+ let old_pdx = Instance.pNode inst
+ old_sdx = Instance.sNode inst
old_p = Container.find old_pdx nl
old_s = Container.find old_sdx nl
tgt_n = Container.find new_pdx nl
-- Failver and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
- let old_pdx = Instance.pnode inst
- old_sdx = Instance.snode inst
+ let old_pdx = Instance.pNode inst
+ old_sdx = Instance.sNode inst
old_p = Container.find old_pdx nl
old_s = Container.find old_sdx nl
tgt_n = Container.find new_sdx nl
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
- -> OpResult AllocElement
+ -> OpResult Node.AllocElement
allocateOnSingle nl inst p =
let new_pdx = Node.idx p
new_inst = Instance.setBoth inst new_pdx Node.noSecondary
-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
- -> OpResult AllocElement
+ -> OpResult Node.AllocElement
allocateOnPair nl inst tgt_p tgt_s =
let new_pdx = Node.idx tgt_p
new_sdx = Node.idx tgt_s
in
case tmp_resu of
OpFail _ -> cur_tbl
- OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
+ OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
let tgt_idx = Instance.idx target
upd_cvar = compCV upd_nl
upd_il = Container.add tgt_idx new_inst ini_il
- upd_plc = (tgt_idx, pri_idx, sec_idx, upd_cvar):ini_plc
+ upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
in
compareTables cur_tbl upd_tbl
--- | Given the status of the current secondary as a valid new node
--- and the current candidate target node,
--- generate the possible moves for a instance.
-possibleMoves :: Bool -> Ndx -> [IMove]
+-- | Given the status of the current secondary as a valid new node and
+-- the current candidate target node, generate the possible moves for
+-- an instance.
+possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
+ -> Ndx -- ^ Target node candidate
+ -> [IMove] -- ^ List of valid result moves
possibleMoves True tdx =
[ReplaceSecondary tdx,
ReplaceAndFailover tdx,
ReplaceAndFailover tdx]
-- | Compute the best move for a given instance.
-checkInstanceMove :: [Ndx] -- Allowed target node indices
- -> Table -- Original table
- -> Instance.Instance -- Instance to move
- -> Table -- Best new table for this instance
-checkInstanceMove nodes_idx ini_tbl target =
+checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
+ -> Bool -- ^ Whether disk moves are allowed
+ -> Table -- ^ Original table
+ -> Instance.Instance -- ^ Instance to move
+ -> Table -- ^ Best new table for this instance
+checkInstanceMove nodes_idx disk_moves ini_tbl target =
let
- opdx = Instance.pnode target
- osdx = Instance.snode target
+ opdx = Instance.pNode target
+ osdx = Instance.sNode target
nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
use_secondary = elem osdx nodes_idx
aft_failover = if use_secondary -- if allowed to failover
then checkSingleStep ini_tbl target ini_tbl Failover
else ini_tbl
- all_moves = concatMap (possibleMoves use_secondary) nodes
+ all_moves = if disk_moves
+ then concatMap (possibleMoves use_secondary) nodes
+ else []
in
-- iterate over the possible nodes for this instance
foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
-- | Compute the best next move.
checkMove :: [Ndx] -- ^ Allowed target node indices
+ -> Bool -- ^ Whether disk moves are allowed
-> Table -- ^ The current solution
-> [Instance.Instance] -- ^ List of instances still to move
-> Table -- ^ The new solution
-checkMove nodes_idx ini_tbl victims =
+checkMove nodes_idx disk_moves ini_tbl victims =
let Table _ _ _ ini_plc = ini_tbl
-- iterate over all instances, computing the best move
best_tbl =
foldl'
- (\ step_tbl elem ->
- if Instance.snode elem == Node.noSecondary then step_tbl
- else compareTables step_tbl $
- checkInstanceMove nodes_idx ini_tbl elem)
+ (\ step_tbl em ->
+ compareTables step_tbl $
+ checkInstanceMove nodes_idx disk_moves ini_tbl em)
ini_tbl victims
Table _ _ _ best_plc = best_tbl
+ in if length best_plc == length ini_plc
+ then ini_tbl -- no advancement
+ else best_tbl
+
+-- | Check if we are allowed to go deeper in the balancing
+
+doNextBalance :: Table -- ^ The starting table
+ -> Int -- ^ Remaining length
+ -> Score -- ^ Score at which to stop
+              -> Bool      -- ^ Whether to continue with the balancing
+doNextBalance ini_tbl max_rounds min_score =
+ let Table _ _ ini_cv ini_plc = ini_tbl
+ ini_plc_len = length ini_plc
+ in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
+
+-- | Run a balance move
+
+tryBalance :: Table -- ^ The starting table
+ -> Bool -- ^ Allow disk moves
+ -> Bool -- ^ Only evacuate moves
+           -> Maybe Table -- ^ The resulting table, if the balancing succeeded
+tryBalance ini_tbl disk_moves evac_mode =
+ let Table ini_nl ini_il ini_cv _ = ini_tbl
+ all_inst = Container.elems ini_il
+ all_inst' = if evac_mode
+ then let bad_nodes = map Node.idx . filter Node.offline $
+ Container.elems ini_nl
+ in filter (\e -> Instance.sNode e `elem` bad_nodes ||
+ Instance.pNode e `elem` bad_nodes)
+ all_inst
+ else all_inst
+ reloc_inst = filter (\e -> Instance.sNode e /= Node.noSecondary)
+ all_inst'
+ node_idx = map Node.idx . filter (not . Node.offline) $
+ Container.elems ini_nl
+ fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
+ (Table _ _ fin_cv _) = fin_tbl
in
- if length best_plc == length ini_plc then -- no advancement
- ini_tbl
- else
- best_tbl
+ if fin_cv < ini_cv
+ then Just fin_tbl -- this round made success, return the new table
+ else Nothing
-- * Allocation functions
-- | Update current Allocation solution and failure stats with new
-- elements
-concatAllocs :: AllocSolution -> OpResult AllocElement -> AllocSolution
-concatAllocs (flst, succ, sols) (OpFail reason) = (reason:flst, succ, sols)
+concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
+concatAllocs (flst, cntok, sols) (OpFail reason) = (reason:flst, cntok, sols)
-concatAllocs (flst, succ, osols) (OpGood ns@(nl, _, _)) =
+concatAllocs (flst, cntok, osols) (OpGood ns@(nl, _, _)) =
let nscore = compCV nl
-- Choose the old or new solution, based on the cluster score
nsols = case osols of
- Nothing -> Just (nscore, ns)
- Just (oscore, _) ->
+ [] -> [(nscore, ns)]
+ (oscore, _):[] ->
if oscore < nscore
then osols
- else Just (nscore, ns)
- nsuc = succ + 1
+ else [(nscore, ns)]
+ -- FIXME: here we simply concat to lists with more
+ -- than one element; we should instead abort, since
+ -- this is not a valid usage of this function
+ xs -> (nscore, ns):xs
+ nsuc = cntok + 1
-- Note: we force evaluation of nsols here in order to keep the
-- memory profile low - we know that we will need nsols for sure
-- in the next cycle, so we force evaluation of nsols, since the
ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
sols = foldl' (\cstate (p, s) ->
concatAllocs cstate $ allocateOnPair nl inst p s
- ) ([], 0, Nothing) ok_pairs
+ ) ([], 0, []) ok_pairs
in return sols
tryAlloc nl _ inst 1 =
let all_nodes = getOnline nl
- sols = foldl' (\cstate p ->
- concatAllocs cstate $ allocateOnSingle nl inst p
- ) ([], 0, Nothing) all_nodes
+ sols = foldl' (\cstate ->
+ concatAllocs cstate . allocateOnSingle nl inst
+ ) ([], 0, []) all_nodes
in return sols
tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
tryReloc nl il xid 1 ex_idx =
let all_nodes = getOnline nl
inst = Container.find xid il
- ex_idx' = Instance.pnode inst:ex_idx
+ ex_idx' = Instance.pNode inst:ex_idx
valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
valid_idxes = map Node.idx valid_nodes
sols1 = foldl' (\cstate x ->
- let elem = do
+ let em = do
(mnl, i, _, _) <-
applyMove nl inst (ReplaceSecondary x)
return (mnl, i, [Container.find x mnl])
- in concatAllocs cstate elem
- ) ([], 0, Nothing) valid_idxes
+ in concatAllocs cstate em
+ ) ([], 0, []) valid_idxes
in return sols1
tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
\destinations required (" ++ show reqn ++
"), only one supported"
+-- | Try to evacuate the instances located on the given nodes.
+tryEvac :: (Monad m) =>
+ Node.List -- ^ The node list
+ -> Instance.List -- ^ The instance list
+ -> [Ndx] -- ^ Nodes to be evacuated
+ -> m AllocSolution -- ^ Solution list
+tryEvac nl il ex_ndx =
+ let ex_nodes = map (flip Container.find nl) ex_ndx
+ all_insts = nub . concat . map Node.sList $ ex_nodes
+ in do
+ (_, sol) <- foldM (\(nl', (_, _, rsols)) idx -> do
+ -- FIXME: hardcoded one node here
+ (fm, cs, aes) <- tryReloc nl' il idx 1 ex_ndx
+ case aes of
+ csol@(_, (nl'', _, _)):_ ->
+ return (nl'', (fm, cs, csol:rsols))
+ _ -> fail $ "Can't evacuate instance " ++
+ show idx
+ ) (nl, ([], 0, [])) all_insts
+ return sol
+
-- * Formatting functions
-- | Given the original and final nodes, computes the relocation description.
-computeMoves :: String -- ^ The instance name
- -> String -- ^ Original primary
- -> String -- ^ Original secondary
+computeMoves :: Instance.Instance -- ^ The instance to be moved
+ -> String -- ^ The instance name
+ -> IMove -- ^ The move being performed
-> String -- ^ New primary
-> String -- ^ New secondary
-> (String, [String])
-- either @/f/@ for failover or @/r:name/@ for replace
-- secondary, while the command list holds gnt-instance
-- commands (without that prefix), e.g \"@failover instance1@\"
-computeMoves i a b c d
- -- same primary
- | c == a =
- if d == b
- then {- Same sec??! -} ("-", [])
- else {- Change of secondary -}
- (printf "r:%s" d, [rep d])
- -- failover and ...
- | c == b =
- if d == a
- then {- that's all -} ("f", [mig])
- else (printf "f r:%s" d, [mig, rep d])
- -- ... and keep primary as secondary
- | d == a =
- (printf "r:%s f" c, [rep c, mig])
- -- ... keep same secondary
- | d == b =
- (printf "f r:%s f" c, [mig, rep c, mig])
- -- nothing in common -
- | otherwise =
- (printf "r:%s f r:%s" c d, [rep c, mig, rep d])
- where mig = printf "migrate -f %s" i::String
- rep n = printf "replace-disks -n %s %s" n i
+computeMoves i inam mv c d =
+ case mv of
+ Failover -> ("f", [mig])
+ FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
+ ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
+ ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
+ ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
+ where morf = if Instance.running i then "migrate" else "failover"
+ mig = printf "%s -f %s" morf inam::String
+ rep n = printf "replace-disks -n %s %s" n inam
-- | Converts a placement to string format.
printSolutionLine :: Node.List -- ^ The node list
printSolutionLine nl il nmlen imlen plc pos =
let
pmlen = (2*nmlen + 1)
- (i, p, s, c) = plc
+ (i, p, s, mv, c) = plc
inst = Container.find i il
inam = Instance.name inst
npri = Container.nameOf nl p
nsec = Container.nameOf nl s
- opri = Container.nameOf nl $ Instance.pnode inst
- osec = Container.nameOf nl $ Instance.snode inst
- (moves, cmds) = computeMoves inam opri osec npri nsec
+ opri = Container.nameOf nl $ Instance.pNode inst
+ osec = Container.nameOf nl $ Instance.sNode inst
+ (moves, cmds) = computeMoves inst inam mv npri nsec
ostr = printf "%s:%s" opri osec::String
nstr = printf "%s:%s" npri nsec::String
in
pmlen nstr c moves,
cmds)
+-- | Return the instance and involved nodes in an instance move.
+involvedNodes :: Instance.List -> Placement -> [Ndx]
+involvedNodes il plc =
+ let (i, np, ns, _, _) = plc
+ inst = Container.find i il
+ op = Instance.pNode inst
+ os = Instance.sNode inst
+ in nub [np, ns, op, os]
+
+-- | Inner function for splitJobs, that either appends the next job to
+-- the current jobset, or starts a new jobset.
+mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
+mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
+mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
+ | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
+ | otherwise = ([n]:cjs, ndx)
+
+-- | Break a list of moves into independent groups. Note that this
+-- will reverse the order of jobs.
+splitJobs :: [MoveJob] -> [JobSet]
+splitJobs = fst . foldl mergeJobs ([], [])
+
+-- | Given a list of commands, prefix them with @gnt-instance@ and
+-- also beautify the display a little.
+formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
+formatJob jsn jsl (sn, (_, _, _, cmds)) =
+ let out =
+ printf " echo job %d/%d" jsn sn:
+ printf " check":
+ map (" gnt-instance " ++) cmds
+ in if sn == 1
+ then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
+ else out
+
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
-formatCmds :: [[String]] -> String
+formatCmds :: [JobSet] -> String
formatCmds =
unlines .
- concatMap (\(a, b) ->
- printf "echo step %d" (a::Int):
- printf "check":
- map ("gnt-instance " ++) b
- ) .
+ concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
+ (zip [1..] js)) .
zip [1..]
-- | Converts a solution to string format.
unzip $ zipWith (printSolutionLine nl il nmlen imlen) sol [1..]
-- | Print the node list.
-printNodes :: Node.List -> String
-printNodes nl =
- let snl = sortBy (compare `on` Node.idx) (Container.elems nl)
- m_name = maximum . map (length . Node.name) $ snl
- helper = Node.list m_name
- header = printf
- "%2s %-*s %5s %5s %5s %5s %5s %5s %5s %5s %4s %4s \
- \%3s %3s %6s %6s %5s"
- " F" m_name "Name"
- "t_mem" "n_mem" "i_mem" "x_mem" "f_mem" "r_mem"
- "t_dsk" "f_dsk" "pcpu" "vcpu"
- "pri" "sec" "p_fmem" "p_fdsk" "r_cpu"::String
- in unlines (header:map helper snl)
+printNodes :: Node.List -> [String] -> String
+printNodes nl fs =
+ let fields = if null fs
+ then Node.defaultFields
+ else fs
+ snl = sortBy (compare `on` Node.idx) (Container.elems nl)
+ (header, isnum) = unzip $ map Node.showHeader fields
+ in unlines . map ((:) ' ' . intercalate " ") $
+ formatTable (header:map (Node.list fields) snl) isnum
+
+-- | Print the instance list.
+printInsts :: Node.List -> Instance.List -> String
+printInsts nl il =
+ let sil = sortBy (compare `on` Instance.idx) (Container.elems il)
+ helper inst = [ if Instance.running inst then "R" else " "
+ , Instance.name inst
+ , Container.nameOf nl (Instance.pNode inst)
+ , (let sdx = Instance.sNode inst
+ in if sdx == Node.noSecondary
+ then ""
+ else Container.nameOf nl sdx)
+ , printf "%3d" $ Instance.vcpus inst
+ , printf "%5d" $ Instance.mem inst
+ , printf "%5d" $ Instance.dsk inst `div` 1024
+ , printf "%5.3f" lC
+ , printf "%5.3f" lM
+ , printf "%5.3f" lD
+ , printf "%5.3f" lN
+ ]
+ where DynUtil lC lM lD lN = Instance.util inst
+ header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
+ , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
+ isnum = False:False:False:False:repeat True
+ in unlines . map ((:) ' ' . intercalate " ") $
+ formatTable (header:map helper sil) isnum
-- | Shows statistics for a given node list.
printStats :: Node.List -> String
printStats nl =
- let (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv) =
- compDetailedCV nl
- in printf "f_mem=%.8f, r_mem=%.8f, f_dsk=%.8f, n1=%.3f, \
- \uf=%.3f, r_cpu=%.3f"
- mem_cv res_cv dsk_cv n1_score off_score cpu_cv
+ let dcvs = compDetailedCV nl
+ hd = zip (detailedCVNames ++ repeat "unknown") dcvs
+ formatted = map (\(header, val) ->
+ printf "%s=%.8f" header val::String) hd
+ in intercalate ", " formatted
+
+-- | Convert a placement into a list of OpCodes (basically a job).
+iMoveToJob :: String -> Node.List -> Instance.List
+ -> Idx -> IMove -> [OpCodes.OpCode]
+iMoveToJob csf nl il idx move =
+ let inst = Container.find idx il
+ iname = Instance.name inst ++ csf
+ lookNode n = Just (Container.nameOf nl n ++ csf)
+ opF = if Instance.running inst
+ then OpCodes.OpMigrateInstance iname True False
+ else OpCodes.OpFailoverInstance iname False
+ opR n = OpCodes.OpReplaceDisks iname (lookNode n)
+ OpCodes.ReplaceNewSecondary [] Nothing
+ in case move of
+ Failover -> [ opF ]
+ ReplacePrimary np -> [ opF, opR np, opF ]
+ ReplaceSecondary ns -> [ opR ns ]
+ ReplaceAndFailover np -> [ opR np, opF ]
+ FailoverAndReplace ns -> [ opF, opR ns ]