code.grnet.gr Git - ganeti-local/blob - Ganeti/HTools/Cluster.hs

   1 {-| Implementation of cluster-wide logic.
   2
   3 This module holds all pure cluster-logic; I\/O related functionality
   4 goes into the "Main" module for the individual binaries.
   5
   6 -}
   7
   8 {-
   9
  10 Copyright (C) 2009, 2010 Google Inc.
  11
  12 This program is free software; you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation; either version 2 of the License, or
  15 (at your option) any later version.
  16
  17 This program is distributed in the hope that it will be useful, but
  18 WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with this program; if not, write to the Free Software
  24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  25 02110-1301, USA.
  26
  27 -}
  28
  29 module Ganeti.HTools.Cluster
  30     (
  31      -- * Types
  32       AllocSolution(..)
  33     , Table(..)
  34     , CStats(..)
  35     , AllocStats
  36     -- * Generic functions
  37     , totalResources
  38     , computeAllocationDelta
  39     -- * First phase functions
  40     , computeBadItems
  41     -- * Second phase functions
  42     , printSolutionLine
  43     , formatCmds
  44     , involvedNodes
  45     , splitJobs
  46     -- * Display functions
  47     , printNodes
  48     , printInsts
  49     -- * Balacing functions
  50     , checkMove
  51     , doNextBalance
  52     , tryBalance
  53     , compCV
  54     , printStats
  55     , iMoveToJob
  56     -- * IAllocator functions
  57     , tryAlloc
  58     , tryMGAlloc
  59     , tryReloc
  60     , tryEvac
  61     , collapseFailures
  62     -- * Allocation functions
  63     , iterateAlloc
  64     , tieredAlloc
  65     , instanceGroup
  66     , findSplitInstances
  67     , splitCluster
  68     ) where
  69
  70 import Data.List
  71 import Data.Ord (comparing)
  72 import Text.Printf (printf)
  73 import Control.Monad
  74
  75 import qualified Ganeti.HTools.Container as Container
  76 import qualified Ganeti.HTools.Instance as Instance
  77 import qualified Ganeti.HTools.Node as Node
  78 import Ganeti.HTools.Types
  79 import Ganeti.HTools.Utils
  80 import qualified Ganeti.OpCodes as OpCodes
  81
  82 -- * Types
  83
  84 -- | Allocation\/relocation solution.
  85 data AllocSolution = AllocSolution
  86   { asFailures  :: [FailMode]          -- ^ Failure counts
  87   , asAllocs    :: Int                 -- ^ Good allocation count
  88   , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
  89                                        -- of the list depends on the
  90                                        -- allocation/relocation mode
  91   , asLog       :: [String]            -- ^ A list of informational messages
  92   }
  93
  94 -- | The empty solution we start with when computing allocations
  95 emptySolution :: AllocSolution
  96 emptySolution = AllocSolution { asFailures = [], asAllocs = 0
  97                               , asSolutions = [], asLog = [] }
  98
  99 -- | The complete state for the balancing solution
 100 data Table = Table Node.List Instance.List Score [Placement]
 101              deriving (Show)
 102
 103 data CStats = CStats { csFmem :: Int    -- ^ Cluster free mem
 104                      , csFdsk :: Int    -- ^ Cluster free disk
 105                      , csAmem :: Int    -- ^ Cluster allocatable mem
 106                      , csAdsk :: Int    -- ^ Cluster allocatable disk
 107                      , csAcpu :: Int    -- ^ Cluster allocatable cpus
 108                      , csMmem :: Int    -- ^ Max node allocatable mem
 109                      , csMdsk :: Int    -- ^ Max node allocatable disk
 110                      , csMcpu :: Int    -- ^ Max node allocatable cpu
 111                      , csImem :: Int    -- ^ Instance used mem
 112                      , csIdsk :: Int    -- ^ Instance used disk
 113                      , csIcpu :: Int    -- ^ Instance used cpu
 114                      , csTmem :: Double -- ^ Cluster total mem
 115                      , csTdsk :: Double -- ^ Cluster total disk
 116                      , csTcpu :: Double -- ^ Cluster total cpus
 117                      , csVcpu :: Int    -- ^ Cluster virtual cpus (if
 118                                         -- node pCpu has been set,
 119                                         -- otherwise -1)
 120                      , csXmem :: Int    -- ^ Unnacounted for mem
 121                      , csNmem :: Int    -- ^ Node own memory
 122                      , csScore :: Score -- ^ The cluster score
 123                      , csNinst :: Int   -- ^ The total number of instances
 124                      }
 125             deriving (Show)
 126
 127 -- | Currently used, possibly to allocate, unallocable
 128 type AllocStats = (RSpec, RSpec, RSpec)
 129
 130 -- * Utility functions
 131
 132 -- | Verifies the N+1 status and return the affected nodes.
 133 verifyN1 :: [Node.Node] -> [Node.Node]
 134 verifyN1 = filter Node.failN1
 135
 136 {-| Computes the pair of bad nodes and instances.
 137
 138 The bad node list is computed via a simple 'verifyN1' check, and the
 139 bad instance list is the list of primary and secondary instances of
 140 those nodes.
 141
 142 -}
 143 computeBadItems :: Node.List -> Instance.List ->
 144                    ([Node.Node], [Instance.Instance])
 145 computeBadItems nl il =
 146   let bad_nodes = verifyN1 $ getOnline nl
 147       bad_instances = map (`Container.find` il) .
 148                       sort . nub $
 149                       concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
 150   in
 151     (bad_nodes, bad_instances)
 152
 153 -- | Zero-initializer for the CStats type
 154 emptyCStats :: CStats
 155 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 156
 157 -- | Update stats with data from a new node
 158 updateCStats :: CStats -> Node.Node -> CStats
 159 updateCStats cs node =
 160     let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
 161                  csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
 162                  csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
 163                  csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
 164                  csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
 165                  csVcpu = x_vcpu,
 166                  csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
 167                }
 168             = cs
 169         inc_amem = Node.fMem node - Node.rMem node
 170         inc_amem' = if inc_amem > 0 then inc_amem else 0
 171         inc_adsk = Node.availDisk node
 172         inc_imem = truncate (Node.tMem node) - Node.nMem node
 173                    - Node.xMem node - Node.fMem node
 174         inc_icpu = Node.uCpu node
 175         inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
 176         inc_vcpu = Node.hiCpu node
 177
 178     in cs { csFmem = x_fmem + Node.fMem node
 179           , csFdsk = x_fdsk + Node.fDsk node
 180           , csAmem = x_amem + inc_amem'
 181           , csAdsk = x_adsk + inc_adsk
 182           , csAcpu = x_acpu
 183           , csMmem = max x_mmem inc_amem'
 184           , csMdsk = max x_mdsk inc_adsk
 185           , csMcpu = x_mcpu
 186           , csImem = x_imem + inc_imem
 187           , csIdsk = x_idsk + inc_idsk
 188           , csIcpu = x_icpu + inc_icpu
 189           , csTmem = x_tmem + Node.tMem node
 190           , csTdsk = x_tdsk + Node.tDsk node
 191           , csTcpu = x_tcpu + Node.tCpu node
 192           , csVcpu = x_vcpu + inc_vcpu
 193           , csXmem = x_xmem + Node.xMem node
 194           , csNmem = x_nmem + Node.nMem node
 195           , csNinst = x_ninst + length (Node.pList node)
 196           }
 197
 198 -- | Compute the total free disk and memory in the cluster.
 199 totalResources :: Node.List -> CStats
 200 totalResources nl =
 201     let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
 202     in cs { csScore = compCV nl }
 203
 204 -- | Compute the delta between two cluster state.
 205 --
 206 -- This is used when doing allocations, to understand better the
 207 -- available cluster resources. The return value is a triple of the
 208 -- current used values, the delta that was still allocated, and what
 209 -- was left unallocated.
 210 computeAllocationDelta :: CStats -> CStats -> AllocStats
 211 computeAllocationDelta cini cfin =
 212     let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
 213         CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
 214                 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
 215         rini = RSpec i_icpu i_imem i_idsk
 216         rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
 217         un_cpu = v_cpu - f_icpu
 218         runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
 219     in (rini, rfin, runa)
 220
 221 -- | The names and weights of the individual elements in the CV list
 222 detailedCVInfo :: [(Double, String)]
 223 detailedCVInfo = [ (1,  "free_mem_cv")
 224                  , (1,  "free_disk_cv")
 225                  , (1,  "n1_cnt")
 226                  , (1,  "reserved_mem_cv")
 227                  , (4,  "offline_all_cnt")
 228                  , (16, "offline_pri_cnt")
 229                  , (1,  "vcpu_ratio_cv")
 230                  , (1,  "cpu_load_cv")
 231                  , (1,  "mem_load_cv")
 232                  , (1,  "disk_load_cv")
 233                  , (1,  "net_load_cv")
 234                  , (2,  "pri_tags_score")
 235                  ]
 236
 237 detailedCVWeights :: [Double]
 238 detailedCVWeights = map fst detailedCVInfo
 239
 240 -- | Compute the mem and disk covariance.
 241 compDetailedCV :: Node.List -> [Double]
 242 compDetailedCV nl =
 243     let
 244         all_nodes = Container.elems nl
 245         (offline, nodes) = partition Node.offline all_nodes
 246         mem_l = map Node.pMem nodes
 247         dsk_l = map Node.pDsk nodes
 248         -- metric: memory covariance
 249         mem_cv = varianceCoeff mem_l
 250         -- metric: disk covariance
 251         dsk_cv = varianceCoeff dsk_l
 252         -- metric: count of instances living on N1 failing nodes
 253         n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
 254                                                    length (Node.pList n)) .
 255                    filter Node.failN1 $ nodes :: Double
 256         res_l = map Node.pRem nodes
 257         -- metric: reserved memory covariance
 258         res_cv = varianceCoeff res_l
 259         -- offline instances metrics
 260         offline_ipri = sum . map (length . Node.pList) $ offline
 261         offline_isec = sum . map (length . Node.sList) $ offline
 262         -- metric: count of instances on offline nodes
 263         off_score = fromIntegral (offline_ipri + offline_isec)::Double
 264         -- metric: count of primary instances on offline nodes (this
 265         -- helps with evacuation/failover of primary instances on
 266         -- 2-node clusters with one node offline)
 267         off_pri_score = fromIntegral offline_ipri::Double
 268         cpu_l = map Node.pCpu nodes
 269         -- metric: covariance of vcpu/pcpu ratio
 270         cpu_cv = varianceCoeff cpu_l
 271         -- metrics: covariance of cpu, memory, disk and network load
 272         (c_load, m_load, d_load, n_load) = unzip4 $
 273             map (\n ->
 274                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
 275                          DynUtil c2 m2 d2 n2 = Node.utilPool n
 276                      in (c1/c2, m1/m2, d1/d2, n1/n2)
 277                 ) nodes
 278         -- metric: conflicting instance count
 279         pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
 280         pri_tags_score = fromIntegral pri_tags_inst::Double
 281     in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
 282        , varianceCoeff c_load, varianceCoeff m_load
 283        , varianceCoeff d_load, varianceCoeff n_load
 284        , pri_tags_score ]
 285
 286 -- | Compute the /total/ variance.
 287 compCV :: Node.List -> Double
 288 compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
 289
 290 -- | Compute online nodes from a Node.List
 291 getOnline :: Node.List -> [Node.Node]
 292 getOnline = filter (not . Node.offline) . Container.elems
 293
 294 -- * hbal functions
 295
 296 -- | Compute best table. Note that the ordering of the arguments is important.
 297 compareTables :: Table -> Table -> Table
 298 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
 299     if a_cv > b_cv then b else a
 300
 301 -- | Applies an instance move to a given node list and instance.
 302 applyMove :: Node.List -> Instance.Instance
 303           -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
 304 -- Failover (f)
 305 applyMove nl inst Failover =
 306     let old_pdx = Instance.pNode inst
 307         old_sdx = Instance.sNode inst
 308         old_p = Container.find old_pdx nl
 309         old_s = Container.find old_sdx nl
 310         int_p = Node.removePri old_p inst
 311         int_s = Node.removeSec old_s inst
 312         force_p = Node.offline old_p
 313         new_nl = do -- Maybe monad
 314           new_p <- Node.addPriEx force_p int_s inst
 315           new_s <- Node.addSec int_p inst old_sdx
 316           let new_inst = Instance.setBoth inst old_sdx old_pdx
 317           return (Container.addTwo old_pdx new_s old_sdx new_p nl,
 318                   new_inst, old_sdx, old_pdx)
 319     in new_nl
 320
 321 -- Replace the primary (f:, r:np, f)
 322 applyMove nl inst (ReplacePrimary new_pdx) =
 323     let old_pdx = Instance.pNode inst
 324         old_sdx = Instance.sNode inst
 325         old_p = Container.find old_pdx nl
 326         old_s = Container.find old_sdx nl
 327         tgt_n = Container.find new_pdx nl
 328         int_p = Node.removePri old_p inst
 329         int_s = Node.removeSec old_s inst
 330         force_p = Node.offline old_p
 331         new_nl = do -- Maybe monad
 332           -- check that the current secondary can host the instance
 333           -- during the migration
 334           tmp_s <- Node.addPriEx force_p int_s inst
 335           let tmp_s' = Node.removePri tmp_s inst
 336           new_p <- Node.addPriEx force_p tgt_n inst
 337           new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
 338           let new_inst = Instance.setPri inst new_pdx
 339           return (Container.add new_pdx new_p $
 340                   Container.addTwo old_pdx int_p old_sdx new_s nl,
 341                   new_inst, new_pdx, old_sdx)
 342     in new_nl
 343
 344 -- Replace the secondary (r:ns)
 345 applyMove nl inst (ReplaceSecondary new_sdx) =
 346     let old_pdx = Instance.pNode inst
 347         old_sdx = Instance.sNode inst
 348         old_s = Container.find old_sdx nl
 349         tgt_n = Container.find new_sdx nl
 350         int_s = Node.removeSec old_s inst
 351         force_s = Node.offline old_s
 352         new_inst = Instance.setSec inst new_sdx
 353         new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
 354                  \new_s -> return (Container.addTwo new_sdx
 355                                    new_s old_sdx int_s nl,
 356                                    new_inst, old_pdx, new_sdx)
 357     in new_nl
 358
 359 -- Replace the secondary and failover (r:np, f)
 360 applyMove nl inst (ReplaceAndFailover new_pdx) =
 361     let old_pdx = Instance.pNode inst
 362         old_sdx = Instance.sNode inst
 363         old_p = Container.find old_pdx nl
 364         old_s = Container.find old_sdx nl
 365         tgt_n = Container.find new_pdx nl
 366         int_p = Node.removePri old_p inst
 367         int_s = Node.removeSec old_s inst
 368         force_s = Node.offline old_s
 369         new_nl = do -- Maybe monad
 370           new_p <- Node.addPri tgt_n inst
 371           new_s <- Node.addSecEx force_s int_p inst new_pdx
 372           let new_inst = Instance.setBoth inst new_pdx old_pdx
 373           return (Container.add new_pdx new_p $
 374                   Container.addTwo old_pdx new_s old_sdx int_s nl,
 375                   new_inst, new_pdx, old_pdx)
 376     in new_nl
 377
 378 -- Failver and replace the secondary (f, r:ns)
 379 applyMove nl inst (FailoverAndReplace new_sdx) =
 380     let old_pdx = Instance.pNode inst
 381         old_sdx = Instance.sNode inst
 382         old_p = Container.find old_pdx nl
 383         old_s = Container.find old_sdx nl
 384         tgt_n = Container.find new_sdx nl
 385         int_p = Node.removePri old_p inst
 386         int_s = Node.removeSec old_s inst
 387         force_p = Node.offline old_p
 388         new_nl = do -- Maybe monad
 389           new_p <- Node.addPriEx force_p int_s inst
 390           new_s <- Node.addSecEx force_p tgt_n inst old_sdx
 391           let new_inst = Instance.setBoth inst old_sdx new_sdx
 392           return (Container.add new_sdx new_s $
 393                   Container.addTwo old_sdx new_p old_pdx int_p nl,
 394                   new_inst, old_sdx, new_sdx)
 395     in new_nl
 396
 397 -- | Tries to allocate an instance on one given node.
 398 allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
 399                  -> OpResult Node.AllocElement
 400 allocateOnSingle nl inst p =
 401     let new_pdx = Node.idx p
 402         new_inst = Instance.setBoth inst new_pdx Node.noSecondary
 403     in  Node.addPri p inst >>= \new_p -> do
 404       let new_nl = Container.add new_pdx new_p nl
 405           new_score = compCV nl
 406       return (new_nl, new_inst, [new_p], new_score)
 407
 408 -- | Tries to allocate an instance on a given pair of nodes.
 409 allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
 410                -> OpResult Node.AllocElement
 411 allocateOnPair nl inst tgt_p tgt_s =
 412     let new_pdx = Node.idx tgt_p
 413         new_sdx = Node.idx tgt_s
 414     in do
 415       new_p <- Node.addPri tgt_p inst
 416       new_s <- Node.addSec tgt_s inst new_pdx
 417       let new_inst = Instance.setBoth inst new_pdx new_sdx
 418           new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
 419       return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
 420
 421 -- | Tries to perform an instance move and returns the best table
 422 -- between the original one and the new one.
 423 checkSingleStep :: Table -- ^ The original table
 424                 -> Instance.Instance -- ^ The instance to move
 425                 -> Table -- ^ The current best table
 426                 -> IMove -- ^ The move to apply
 427                 -> Table -- ^ The final best table
 428 checkSingleStep ini_tbl target cur_tbl move =
 429     let
 430         Table ini_nl ini_il _ ini_plc = ini_tbl
 431         tmp_resu = applyMove ini_nl target move
 432     in
 433       case tmp_resu of
 434         OpFail _ -> cur_tbl
 435         OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
 436             let tgt_idx = Instance.idx target
 437                 upd_cvar = compCV upd_nl
 438                 upd_il = Container.add tgt_idx new_inst ini_il
 439                 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
 440                 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
 441             in
 442               compareTables cur_tbl upd_tbl
 443
 444 -- | Given the status of the current secondary as a valid new node and
 445 -- the current candidate target node, generate the possible moves for
 446 -- a instance.
 447 possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
 448               -> Ndx       -- ^ Target node candidate
 449               -> [IMove]   -- ^ List of valid result moves
 450 possibleMoves True tdx =
 451     [ReplaceSecondary tdx,
 452      ReplaceAndFailover tdx,
 453      ReplacePrimary tdx,
 454      FailoverAndReplace tdx]
 455
 456 possibleMoves False tdx =
 457     [ReplaceSecondary tdx,
 458      ReplaceAndFailover tdx]
 459
 460 -- | Compute the best move for a given instance.
 461 checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
 462                   -> Bool              -- ^ Whether disk moves are allowed
 463                   -> Table             -- ^ Original table
 464                   -> Instance.Instance -- ^ Instance to move
 465                   -> Table             -- ^ Best new table for this instance
 466 checkInstanceMove nodes_idx disk_moves ini_tbl target =
 467     let
 468         opdx = Instance.pNode target
 469         osdx = Instance.sNode target
 470         nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
 471         use_secondary = elem osdx nodes_idx
 472         aft_failover = if use_secondary -- if allowed to failover
 473                        then checkSingleStep ini_tbl target ini_tbl Failover
 474                        else ini_tbl
 475         all_moves = if disk_moves
 476                     then concatMap (possibleMoves use_secondary) nodes
 477                     else []
 478     in
 479       -- iterate over the possible nodes for this instance
 480       foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
 481
 482 -- | Compute the best next move.
 483 checkMove :: [Ndx]               -- ^ Allowed target node indices
 484           -> Bool                -- ^ Whether disk moves are allowed
 485           -> Table               -- ^ The current solution
 486           -> [Instance.Instance] -- ^ List of instances still to move
 487           -> Table               -- ^ The new solution
 488 checkMove nodes_idx disk_moves ini_tbl victims =
 489     let Table _ _ _ ini_plc = ini_tbl
 490         -- iterate over all instances, computing the best move
 491         best_tbl =
 492             foldl'
 493             (\ step_tbl em ->
 494                  compareTables step_tbl $
 495                  checkInstanceMove nodes_idx disk_moves ini_tbl em)
 496             ini_tbl victims
 497         Table _ _ _ best_plc = best_tbl
 498     in if length best_plc == length ini_plc
 499        then ini_tbl -- no advancement
 500        else best_tbl
 501
 502 -- | Check if we are allowed to go deeper in the balancing
 503 doNextBalance :: Table     -- ^ The starting table
 504               -> Int       -- ^ Remaining length
 505               -> Score     -- ^ Score at which to stop
 506               -> Bool      -- ^ The resulting table and commands
 507 doNextBalance ini_tbl max_rounds min_score =
 508     let Table _ _ ini_cv ini_plc = ini_tbl
 509         ini_plc_len = length ini_plc
 510     in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
 511
 512 -- | Run a balance move
 513 tryBalance :: Table       -- ^ The starting table
 514            -> Bool        -- ^ Allow disk moves
 515            -> Bool        -- ^ Only evacuate moves
 516            -> Score       -- ^ Min gain threshold
 517            -> Score       -- ^ Min gain
 518            -> Maybe Table -- ^ The resulting table and commands
 519 tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
 520     let Table ini_nl ini_il ini_cv _ = ini_tbl
 521         all_inst = Container.elems ini_il
 522         all_inst' = if evac_mode
 523                     then let bad_nodes = map Node.idx . filter Node.offline $
 524                                          Container.elems ini_nl
 525                          in filter (\e -> Instance.sNode e `elem` bad_nodes ||
 526                                           Instance.pNode e `elem` bad_nodes)
 527                             all_inst
 528                     else all_inst
 529         reloc_inst = filter Instance.movable all_inst'
 530         node_idx = map Node.idx . filter (not . Node.offline) $
 531                    Container.elems ini_nl
 532         fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
 533         (Table _ _ fin_cv _) = fin_tbl
 534     in
 535       if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
 536       then Just fin_tbl -- this round made success, return the new table
 537       else Nothing
 538
 539 -- * Allocation functions
 540
 541 -- | Build failure stats out of a list of failures
 542 collapseFailures :: [FailMode] -> FailStats
 543 collapseFailures flst =
 544     map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
 545
 546 -- | Update current Allocation solution and failure stats with new
 547 -- elements
 548 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
 549 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
 550
 551 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
 552     let -- Choose the old or new solution, based on the cluster score
 553         cntok = asAllocs as
 554         osols = asSolutions as
 555         nsols = case osols of
 556                   [] -> [ns]
 557                   (_, _, _, oscore):[] ->
 558                       if oscore < nscore
 559                       then osols
 560                       else [ns]
 561                   -- FIXME: here we simply concat to lists with more
 562                   -- than one element; we should instead abort, since
 563                   -- this is not a valid usage of this function
 564                   xs -> ns:xs
 565         nsuc = cntok + 1
 566     -- Note: we force evaluation of nsols here in order to keep the
 567     -- memory profile low - we know that we will need nsols for sure
 568     -- in the next cycle, so we force evaluation of nsols, since the
 569     -- foldl' in the caller will only evaluate the tuple, but not the
 570     -- elements of the tuple
 571     in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
 572
 573 -- | Given a solution, generates a reasonable description for it
 574 describeSolution :: AllocSolution -> String
 575 describeSolution as =
 576   let fcnt = asFailures as
 577       sols = asSolutions as
 578       freasons =
 579         intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
 580         filter ((> 0) . snd) . collapseFailures $ fcnt
 581   in if null sols
 582      then "No valid allocation solutions, failure reasons: " ++
 583           (if null fcnt
 584            then "unknown reasons"
 585            else freasons)
 586      else let (_, _, nodes, cv) = head sols
 587           in printf ("score: %.8f, successes %d, failures %d (%s)" ++
 588                      " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
 589              (intercalate "/" . map Node.name $ nodes)
 590
 591 -- | Annotates a solution with the appropriate string
 592 annotateSolution :: AllocSolution -> AllocSolution
 593 annotateSolution as = as { asLog = describeSolution as : asLog as }
 594
 595 -- | Try to allocate an instance on the cluster.
 596 tryAlloc :: (Monad m) =>
 597             Node.List         -- ^ The node list
 598          -> Instance.List     -- ^ The instance list
 599          -> Instance.Instance -- ^ The instance to allocate
 600          -> Int               -- ^ Required number of nodes
 601          -> m AllocSolution   -- ^ Possible solution list
 602 tryAlloc nl _ inst 2 =
 603     let all_nodes = getOnline nl
 604         all_pairs = liftM2 (,) all_nodes all_nodes
 605         ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
 606         sols = foldl' (\cstate (p, s) ->
 607                            concatAllocs cstate $ allocateOnPair nl inst p s
 608                       ) emptySolution ok_pairs
 609
 610     in if null ok_pairs -- means we have just one node
 611        then fail "Not enough online nodes"
 612        else return $ annotateSolution sols
 613
 614 tryAlloc nl _ inst 1 =
 615     let all_nodes = getOnline nl
 616         sols = foldl' (\cstate ->
 617                            concatAllocs cstate . allocateOnSingle nl inst
 618                       ) emptySolution all_nodes
 619     in if null all_nodes
 620        then fail "No online nodes"
 621        else return $ annotateSolution sols
 622
 623 tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
 624                              \destinations required (" ++ show reqn ++
 625                                                "), only two supported"
 626
 627 -- | Given a group/result, describe it as a nice (list of) messages
 628 solutionDescription :: (Gdx, Result AllocSolution) -> [String]
 629 solutionDescription (groupId, result) =
 630   case result of
 631     Ok solution -> map (printf "Group %d: %s" groupId) (asLog solution)
 632     Bad message -> [printf "Group %d: error %s" groupId message]
 633
 634 -- | From a list of possibly bad and possibly empty solutions, filter
 635 -- only the groups with a valid result
 636 filterMGResults :: [(Gdx, Result AllocSolution)] ->
 637                    [(Gdx, AllocSolution)]
 638 filterMGResults =
 639   filter (not . null . asSolutions . snd) .
 640   map (\(y, Ok x) -> (y, x)) .
 641   filter (isOk . snd)
 642
 643 -- | Try to allocate an instance on a multi-group cluster.
 644 tryMGAlloc :: Node.List         -- ^ The node list
 645               -> Instance.List     -- ^ The instance list
 646               -> Instance.Instance -- ^ The instance to allocate
 647               -> Int               -- ^ Required number of nodes
 648               -> Result AllocSolution   -- ^ Possible solution list
 649 tryMGAlloc mgnl mgil inst cnt =
 650   let groups = splitCluster mgnl mgil
 651       -- TODO: currently we consider all groups preferred
 652       sols = map (\(gid, (nl, il)) ->
 653                    (gid, tryAlloc nl il inst cnt)) groups::
 654         [(Gdx, Result AllocSolution)]
 655       all_msgs = concatMap solutionDescription sols
 656       goodSols = filterMGResults sols
 657       extractScore = \(_, _, _, x) -> x
 658       solScore = extractScore . head . asSolutions . snd
 659       sortedSols = sortBy (comparing solScore) goodSols
 660   in if null sortedSols
 661      then Bad $ intercalate ", " all_msgs
 662      else let (final_group, final_sol) = head sortedSols
 663               selmsg = "Selected group: " ++ show final_group
 664           in Ok $ final_sol { asLog = selmsg:all_msgs }
 665
 666 -- | Try to relocate an instance on the cluster.
 667 tryReloc :: (Monad m) =>
 668             Node.List       -- ^ The node list
 669          -> Instance.List   -- ^ The instance list
 670          -> Idx             -- ^ The index of the instance to move
 671          -> Int             -- ^ The number of nodes required
 672          -> [Ndx]           -- ^ Nodes which should not be used
 673          -> m AllocSolution -- ^ Solution list
 674 tryReloc nl il xid 1 ex_idx =
 675     let all_nodes = getOnline nl
 676         inst = Container.find xid il
 677         ex_idx' = Instance.pNode inst:ex_idx
 678         valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
 679         valid_idxes = map Node.idx valid_nodes
 680         sols1 = foldl' (\cstate x ->
 681                             let em = do
 682                                   (mnl, i, _, _) <-
 683                                       applyMove nl inst (ReplaceSecondary x)
 684                                   return (mnl, i, [Container.find x mnl],
 685                                           compCV mnl)
 686                             in concatAllocs cstate em
 687                        ) emptySolution valid_idxes
 688     in return sols1
 689
 690 tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
 691                                 \destinations required (" ++ show reqn ++
 692                                                   "), only one supported"
 693
 694 -- | Try to evacuate a list of nodes.
 695 tryEvac :: (Monad m) =>
 696             Node.List       -- ^ The node list
 697          -> Instance.List   -- ^ The instance list
 698          -> [Ndx]           -- ^ Nodes to be evacuated
 699          -> m AllocSolution -- ^ Solution list
 700 tryEvac nl il ex_ndx =
 701     let ex_nodes = map (`Container.find` nl) ex_ndx
 702         all_insts = nub . concatMap Node.sList $ ex_nodes
 703     in do
 704       (_, sol) <- foldM (\(nl', old_as) idx -> do
 705                             -- FIXME: hardcoded one node here
 706                             -- (fm, cs, aes)
 707                             new_as <- tryReloc nl' il idx 1 ex_ndx
 708                             case asSolutions new_as of
 709                               csol@(nl'', _, _, _):_ ->
 710                                 -- an individual relocation succeeded,
 711                                 -- we kind of compose the data from
 712                                 -- the two solutions
 713                                 return (nl'',
 714                                         new_as { asSolutions =
 715                                                     csol:asSolutions old_as })
 716                               -- this relocation failed, so we fail
 717                               -- the entire evac
 718                               _ -> fail $ "Can't evacuate instance " ++
 719                                    Instance.name (Container.find idx il) ++
 720                                    ": " ++ describeSolution new_as
 721                         ) (nl, emptySolution) all_insts
 722       return $ annotateSolution sol
 723
 724 -- | Recursively place instances on the cluster until we're out of space
 725 iterateAlloc :: Node.List
 726              -> Instance.List
 727              -> Instance.Instance
 728              -> Int
 729              -> [Instance.Instance]
 730              -> Result (FailStats, Node.List, Instance.List,
 731                         [Instance.Instance])
 732 iterateAlloc nl il newinst nreq ixes =
 733       let depth = length ixes
 734           newname = printf "new-%d" depth::String
 735           newidx = length (Container.elems il) + depth
 736           newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
 737       in case tryAlloc nl il newi2 nreq of
 738            Bad s -> Bad s
 739            Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
 740                case sols3 of
 741                  [] -> Ok (collapseFailures errs, nl, il, ixes)
 742                  (xnl, xi, _, _):[] ->
 743                      iterateAlloc xnl (Container.add newidx xi il)
 744                                   newinst nreq $! (xi:ixes)
 745                  _ -> Bad "Internal error: multiple solutions for single\
 746                           \ allocation"
 747
 748 tieredAlloc :: Node.List
 749             -> Instance.List
 750             -> Instance.Instance
 751             -> Int
 752             -> [Instance.Instance]
 753             -> Result (FailStats, Node.List, Instance.List,
 754                        [Instance.Instance])
 755 tieredAlloc nl il newinst nreq ixes =
 756     case iterateAlloc nl il newinst nreq ixes of
 757       Bad s -> Bad s
 758       Ok (errs, nl', il', ixes') ->
 759           case Instance.shrinkByType newinst . fst . last $
 760                sortBy (comparing snd) errs of
 761             Bad _ -> Ok (errs, nl', il', ixes')
 762             Ok newinst' ->
 763                 tieredAlloc nl' il' newinst' nreq ixes'
 764
 765 -- * Formatting functions
 766
 767 -- | Given the original and final nodes, computes the relocation description.
 768 computeMoves :: Instance.Instance -- ^ The instance to be moved
 769              -> String -- ^ The instance name
 770              -> IMove  -- ^ The move being performed
 771              -> String -- ^ New primary
 772              -> String -- ^ New secondary
 773              -> (String, [String])
 774                 -- ^ Tuple of moves and commands list; moves is containing
 775                 -- either @/f/@ for failover or @/r:name/@ for replace
 776                 -- secondary, while the command list holds gnt-instance
 777                 -- commands (without that prefix), e.g \"@failover instance1@\"
 778 computeMoves i inam mv c d =
 779     case mv of
 780       Failover -> ("f", [mig])
 781       FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
 782       ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
 783       ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
 784       ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
 785     where morf = if Instance.running i then "migrate" else "failover"
 786           mig = printf "%s -f %s" morf inam::String
 787           rep n = printf "replace-disks -n %s %s" n inam
 788
 789 -- | Converts a placement to string format.
 790 printSolutionLine :: Node.List     -- ^ The node list
 791                   -> Instance.List -- ^ The instance list
 792                   -> Int           -- ^ Maximum node name length
 793                   -> Int           -- ^ Maximum instance name length
 794                   -> Placement     -- ^ The current placement
 795                   -> Int           -- ^ The index of the placement in
 796                                    -- the solution
 797                   -> (String, [String])
 798 printSolutionLine nl il nmlen imlen plc pos =
 799     let
 800         pmlen = (2*nmlen + 1)
 801         (i, p, s, mv, c) = plc
 802         inst = Container.find i il
 803         inam = Instance.alias inst
 804         npri = Node.alias $ Container.find p nl
 805         nsec = Node.alias $ Container.find s nl
 806         opri = Node.alias $ Container.find (Instance.pNode inst) nl
 807         osec = Node.alias $ Container.find (Instance.sNode inst) nl
 808         (moves, cmds) =  computeMoves inst inam mv npri nsec
 809         ostr = printf "%s:%s" opri osec::String
 810         nstr = printf "%s:%s" npri nsec::String
 811     in
 812       (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
 813        pos imlen inam pmlen ostr
 814        pmlen nstr c moves,
 815        cmds)
 816
 817 -- | Return the instance and involved nodes in an instance move.
 818 involvedNodes :: Instance.List -> Placement -> [Ndx]
 819 involvedNodes il plc =
 820     let (i, np, ns, _, _) = plc
 821         inst = Container.find i il
 822         op = Instance.pNode inst
 823         os = Instance.sNode inst
 824     in nub [np, ns, op, os]
 825
 826 -- | Inner function for splitJobs, that either appends the next job to
 827 -- the current jobset, or starts a new jobset.
 828 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
 829 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
 830 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
 831     | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
 832     | otherwise = ([n]:cjs, ndx)
 833
 834 -- | Break a list of moves into independent groups. Note that this
 835 -- will reverse the order of jobs.
 836 splitJobs :: [MoveJob] -> [JobSet]
 837 splitJobs = fst . foldl mergeJobs ([], [])
 838
 839 -- | Given a list of commands, prefix them with @gnt-instance@ and
 840 -- also beautify the display a little.
 841 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
 842 formatJob jsn jsl (sn, (_, _, _, cmds)) =
 843     let out =
 844             printf "  echo job %d/%d" jsn sn:
 845             printf "  check":
 846             map ("  gnt-instance " ++) cmds
 847     in if sn == 1
 848        then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
 849        else out
 850
 851 -- | Given a list of commands, prefix them with @gnt-instance@ and
 852 -- also beautify the display a little.
 853 formatCmds :: [JobSet] -> String
 854 formatCmds =
 855     unlines .
 856     concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
 857                              (zip [1..] js)) .
 858     zip [1..]
 859
 860 -- | Print the node list.
 861 printNodes :: Node.List -> [String] -> String
 862 printNodes nl fs =
 863     let fields = case fs of
 864           [] -> Node.defaultFields
 865           "+":rest -> Node.defaultFields ++ rest
 866           _ -> fs
 867         snl = sortBy (comparing Node.idx) (Container.elems nl)
 868         (header, isnum) = unzip $ map Node.showHeader fields
 869     in unlines . map ((:) ' ' .  intercalate " ") $
 870        formatTable (header:map (Node.list fields) snl) isnum
 871
 872 -- | Print the instance list.
 873 printInsts :: Node.List -> Instance.List -> String
 874 printInsts nl il =
 875     let sil = sortBy (comparing Instance.idx) (Container.elems il)
 876         helper inst = [ if Instance.running inst then "R" else " "
 877                       , Instance.name inst
 878                       , Container.nameOf nl (Instance.pNode inst)
 879                       , let sdx = Instance.sNode inst
 880                         in if sdx == Node.noSecondary
 881                            then  ""
 882                            else Container.nameOf nl sdx
 883                       , printf "%3d" $ Instance.vcpus inst
 884                       , printf "%5d" $ Instance.mem inst
 885                       , printf "%5d" $ Instance.dsk inst `div` 1024
 886                       , printf "%5.3f" lC
 887                       , printf "%5.3f" lM
 888                       , printf "%5.3f" lD
 889                       , printf "%5.3f" lN
 890                       ]
 891             where DynUtil lC lM lD lN = Instance.util inst
 892         header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
 893                  , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
 894         isnum = False:False:False:False:repeat True
 895     in unlines . map ((:) ' ' . intercalate " ") $
 896        formatTable (header:map helper sil) isnum
 897
 898 -- | Shows statistics for a given node list.
 899 printStats :: Node.List -> String
 900 printStats nl =
 901     let dcvs = compDetailedCV nl
 902         (weights, names) = unzip detailedCVInfo
 903         hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
 904         formatted = map (\(w, header, val) ->
 905                              printf "%s=%.8f(x%.2f)" header val w::String) hd
 906     in intercalate ", " formatted
 907
 908 -- | Convert a placement into a list of OpCodes (basically a job).
 909 iMoveToJob :: Node.List -> Instance.List
 910           -> Idx -> IMove -> [OpCodes.OpCode]
 911 iMoveToJob nl il idx move =
 912     let inst = Container.find idx il
 913         iname = Instance.name inst
 914         lookNode  = Just . Container.nameOf nl
 915         opF = if Instance.running inst
 916               then OpCodes.OpMigrateInstance iname True False
 917               else OpCodes.OpFailoverInstance iname False
 918         opR n = OpCodes.OpReplaceDisks iname (lookNode n)
 919                 OpCodes.ReplaceNewSecondary [] Nothing
 920     in case move of
 921          Failover -> [ opF ]
 922          ReplacePrimary np -> [ opF, opR np, opF ]
 923          ReplaceSecondary ns -> [ opR ns ]
 924          ReplaceAndFailover np -> [ opR np, opF ]
 925          FailoverAndReplace ns -> [ opF, opR ns ]
 926
 927 -- | Computes the group of an instance
 928 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
 929 instanceGroup nl i =
 930   let sidx = Instance.sNode i
 931       pnode = Container.find (Instance.pNode i) nl
 932       snode = if sidx == Node.noSecondary
 933               then pnode
 934               else Container.find sidx nl
 935       pgroup = Node.group pnode
 936       sgroup = Node.group snode
 937   in if pgroup /= sgroup
 938      then fail ("Instance placed accross two node groups, primary " ++
 939                 show pgroup ++ ", secondary " ++ show sgroup)
 940      else return pgroup
 941
 942 -- | Compute the list of badly allocated instances (split across node
 943 -- groups)
 944 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
 945 findSplitInstances nl il =
 946   filter (not . isOk . instanceGroup nl) (Container.elems il)
 947
 948 -- | Splits a cluster into the component node groups
 949 splitCluster :: Node.List -> Instance.List ->
 950                 [(Gdx, (Node.List, Instance.List))]
 951 splitCluster nl il =
 952   let ngroups = Node.computeGroups (Container.elems nl)
 953   in map (\(guuid, nodes) ->
 954            let nidxs = map Node.idx nodes
 955                nodes' = zip nidxs nodes
 956                instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
 957            in (guuid, (Container.fromAssocList nodes', instances))) ngroups