code.grnet.gr Git - ganeti-local/blob - Ganeti/HTools/Cluster.hs

   1 {-| Implementation of cluster-wide logic.
   2
   3 This module holds all pure cluster-logic; I\/O related functionality
   4 goes into the "Main" module for the individual binaries.
   5
   6 -}
   7
   8 {-
   9
  10 Copyright (C) 2009, 2010 Google Inc.
  11
  12 This program is free software; you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation; either version 2 of the License, or
  15 (at your option) any later version.
  16
  17 This program is distributed in the hope that it will be useful, but
  18 WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with this program; if not, write to the Free Software
  24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  25 02110-1301, USA.
  26
  27 -}
  28
  29 module Ganeti.HTools.Cluster
  30     (
  31      -- * Types
  32       AllocSolution(..)
  33     , Table(..)
  34     , CStats(..)
  35     , AllocStats
  36     -- * Generic functions
  37     , totalResources
  38     , computeAllocationDelta
  39     -- * First phase functions
  40     , computeBadItems
  41     -- * Second phase functions
  42     , printSolutionLine
  43     , formatCmds
  44     , involvedNodes
  45     , splitJobs
  46     -- * Display functions
  47     , printNodes
  48     , printInsts
  49     -- * Balacing functions
  50     , checkMove
  51     , doNextBalance
  52     , tryBalance
  53     , compCV
  54     , printStats
  55     , iMoveToJob
  56     -- * IAllocator functions
  57     , tryAlloc
  58     , tryReloc
  59     , tryEvac
  60     , collapseFailures
  61     -- * Allocation functions
  62     , iterateAlloc
  63     , tieredAlloc
  64     , instanceGroup
  65     , findSplitInstances
  66     , splitCluster
  67     ) where
  68
  69 import Data.List
  70 import Data.Ord (comparing)
  71 import Text.Printf (printf)
  72 import Control.Monad
  73
  74 import qualified Ganeti.HTools.Container as Container
  75 import qualified Ganeti.HTools.Instance as Instance
  76 import qualified Ganeti.HTools.Node as Node
  77 import Ganeti.HTools.Types
  78 import Ganeti.HTools.Utils
  79 import qualified Ganeti.OpCodes as OpCodes
  80
  81 -- * Types
  82
  83 -- | Allocation\/relocation solution.
  84 data AllocSolution = AllocSolution
  85   { asFailures  :: [FailMode]          -- ^ Failure counts
  86   , asAllocs    :: Int                 -- ^ Good allocation count
  87   , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
  88                                        -- of the list depends on the
  89                                        -- allocation/relocation mode
  90
  91   }
  92
  93 -- | The empty solution we start with when computing allocations
  94 emptySolution :: AllocSolution
  95 emptySolution = AllocSolution { asFailures = [], asAllocs = 0
  96                               , asSolutions = [] }
  97
  98 -- | The complete state for the balancing solution
  99 data Table = Table Node.List Instance.List Score [Placement]
 100              deriving (Show)
 101
 102 data CStats = CStats { csFmem :: Int    -- ^ Cluster free mem
 103                      , csFdsk :: Int    -- ^ Cluster free disk
 104                      , csAmem :: Int    -- ^ Cluster allocatable mem
 105                      , csAdsk :: Int    -- ^ Cluster allocatable disk
 106                      , csAcpu :: Int    -- ^ Cluster allocatable cpus
 107                      , csMmem :: Int    -- ^ Max node allocatable mem
 108                      , csMdsk :: Int    -- ^ Max node allocatable disk
 109                      , csMcpu :: Int    -- ^ Max node allocatable cpu
 110                      , csImem :: Int    -- ^ Instance used mem
 111                      , csIdsk :: Int    -- ^ Instance used disk
 112                      , csIcpu :: Int    -- ^ Instance used cpu
 113                      , csTmem :: Double -- ^ Cluster total mem
 114                      , csTdsk :: Double -- ^ Cluster total disk
 115                      , csTcpu :: Double -- ^ Cluster total cpus
 116                      , csVcpu :: Int    -- ^ Cluster virtual cpus (if
 117                                         -- node pCpu has been set,
 118                                         -- otherwise -1)
 119                      , csXmem :: Int    -- ^ Unnacounted for mem
 120                      , csNmem :: Int    -- ^ Node own memory
 121                      , csScore :: Score -- ^ The cluster score
 122                      , csNinst :: Int   -- ^ The total number of instances
 123                      }
 124             deriving (Show)
 125
 126 -- | Currently used, possibly to allocate, unallocable
 127 type AllocStats = (RSpec, RSpec, RSpec)
 128
 129 -- * Utility functions
 130
 131 -- | Verifies the N+1 status and return the affected nodes.
 132 verifyN1 :: [Node.Node] -> [Node.Node]
 133 verifyN1 = filter Node.failN1
 134
 135 {-| Computes the pair of bad nodes and instances.
 136
 137 The bad node list is computed via a simple 'verifyN1' check, and the
 138 bad instance list is the list of primary and secondary instances of
 139 those nodes.
 140
 141 -}
 142 computeBadItems :: Node.List -> Instance.List ->
 143                    ([Node.Node], [Instance.Instance])
 144 computeBadItems nl il =
 145   let bad_nodes = verifyN1 $ getOnline nl
 146       bad_instances = map (`Container.find` il) .
 147                       sort . nub $
 148                       concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
 149   in
 150     (bad_nodes, bad_instances)
 151
 152 -- | Zero-initializer for the CStats type
 153 emptyCStats :: CStats
 154 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 155
 156 -- | Update stats with data from a new node
 157 updateCStats :: CStats -> Node.Node -> CStats
 158 updateCStats cs node =
 159     let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
 160                  csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
 161                  csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
 162                  csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
 163                  csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
 164                  csVcpu = x_vcpu,
 165                  csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
 166                }
 167             = cs
 168         inc_amem = Node.fMem node - Node.rMem node
 169         inc_amem' = if inc_amem > 0 then inc_amem else 0
 170         inc_adsk = Node.availDisk node
 171         inc_imem = truncate (Node.tMem node) - Node.nMem node
 172                    - Node.xMem node - Node.fMem node
 173         inc_icpu = Node.uCpu node
 174         inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
 175         inc_vcpu = Node.hiCpu node
 176
 177     in cs { csFmem = x_fmem + Node.fMem node
 178           , csFdsk = x_fdsk + Node.fDsk node
 179           , csAmem = x_amem + inc_amem'
 180           , csAdsk = x_adsk + inc_adsk
 181           , csAcpu = x_acpu
 182           , csMmem = max x_mmem inc_amem'
 183           , csMdsk = max x_mdsk inc_adsk
 184           , csMcpu = x_mcpu
 185           , csImem = x_imem + inc_imem
 186           , csIdsk = x_idsk + inc_idsk
 187           , csIcpu = x_icpu + inc_icpu
 188           , csTmem = x_tmem + Node.tMem node
 189           , csTdsk = x_tdsk + Node.tDsk node
 190           , csTcpu = x_tcpu + Node.tCpu node
 191           , csVcpu = x_vcpu + inc_vcpu
 192           , csXmem = x_xmem + Node.xMem node
 193           , csNmem = x_nmem + Node.nMem node
 194           , csNinst = x_ninst + length (Node.pList node)
 195           }
 196
 197 -- | Compute the total free disk and memory in the cluster.
 198 totalResources :: Node.List -> CStats
 199 totalResources nl =
 200     let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
 201     in cs { csScore = compCV nl }
 202
 203 -- | Compute the delta between two cluster state.
 204 --
 205 -- This is used when doing allocations, to understand better the
 206 -- available cluster resources. The return value is a triple of the
 207 -- current used values, the delta that was still allocated, and what
 208 -- was left unallocated.
 209 computeAllocationDelta :: CStats -> CStats -> AllocStats
 210 computeAllocationDelta cini cfin =
 211     let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
 212         CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
 213                 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
 214         rini = RSpec i_icpu i_imem i_idsk
 215         rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
 216         un_cpu = v_cpu - f_icpu
 217         runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
 218     in (rini, rfin, runa)
 219
 220 -- | The names and weights of the individual elements in the CV list
 221 detailedCVInfo :: [(Double, String)]
 222 detailedCVInfo = [ (1,  "free_mem_cv")
 223                  , (1,  "free_disk_cv")
 224                  , (1,  "n1_cnt")
 225                  , (1,  "reserved_mem_cv")
 226                  , (4,  "offline_all_cnt")
 227                  , (16, "offline_pri_cnt")
 228                  , (1,  "vcpu_ratio_cv")
 229                  , (1,  "cpu_load_cv")
 230                  , (1,  "mem_load_cv")
 231                  , (1,  "disk_load_cv")
 232                  , (1,  "net_load_cv")
 233                  , (2,  "pri_tags_score")
 234                  ]
 235
 236 detailedCVWeights :: [Double]
 237 detailedCVWeights = map fst detailedCVInfo
 238
 239 -- | Compute the mem and disk covariance.
 240 compDetailedCV :: Node.List -> [Double]
 241 compDetailedCV nl =
 242     let
 243         all_nodes = Container.elems nl
 244         (offline, nodes) = partition Node.offline all_nodes
 245         mem_l = map Node.pMem nodes
 246         dsk_l = map Node.pDsk nodes
 247         -- metric: memory covariance
 248         mem_cv = varianceCoeff mem_l
 249         -- metric: disk covariance
 250         dsk_cv = varianceCoeff dsk_l
 251         -- metric: count of instances living on N1 failing nodes
 252         n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
 253                                                    length (Node.pList n)) .
 254                    filter Node.failN1 $ nodes :: Double
 255         res_l = map Node.pRem nodes
 256         -- metric: reserved memory covariance
 257         res_cv = varianceCoeff res_l
 258         -- offline instances metrics
 259         offline_ipri = sum . map (length . Node.pList) $ offline
 260         offline_isec = sum . map (length . Node.sList) $ offline
 261         -- metric: count of instances on offline nodes
 262         off_score = fromIntegral (offline_ipri + offline_isec)::Double
 263         -- metric: count of primary instances on offline nodes (this
 264         -- helps with evacuation/failover of primary instances on
 265         -- 2-node clusters with one node offline)
 266         off_pri_score = fromIntegral offline_ipri::Double
 267         cpu_l = map Node.pCpu nodes
 268         -- metric: covariance of vcpu/pcpu ratio
 269         cpu_cv = varianceCoeff cpu_l
 270         -- metrics: covariance of cpu, memory, disk and network load
 271         (c_load, m_load, d_load, n_load) = unzip4 $
 272             map (\n ->
 273                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
 274                          DynUtil c2 m2 d2 n2 = Node.utilPool n
 275                      in (c1/c2, m1/m2, d1/d2, n1/n2)
 276                 ) nodes
 277         -- metric: conflicting instance count
 278         pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
 279         pri_tags_score = fromIntegral pri_tags_inst::Double
 280     in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
 281        , varianceCoeff c_load, varianceCoeff m_load
 282        , varianceCoeff d_load, varianceCoeff n_load
 283        , pri_tags_score ]
 284
 285 -- | Compute the /total/ variance.
 286 compCV :: Node.List -> Double
 287 compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
 288
 289 -- | Compute online nodes from a Node.List
 290 getOnline :: Node.List -> [Node.Node]
 291 getOnline = filter (not . Node.offline) . Container.elems
 292
 293 -- * hbal functions
 294
 295 -- | Compute best table. Note that the ordering of the arguments is important.
 296 compareTables :: Table -> Table -> Table
 297 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
 298     if a_cv > b_cv then b else a
 299
 300 -- | Applies an instance move to a given node list and instance.
 301 applyMove :: Node.List -> Instance.Instance
 302           -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
 303 -- Failover (f)
 304 applyMove nl inst Failover =
 305     let old_pdx = Instance.pNode inst
 306         old_sdx = Instance.sNode inst
 307         old_p = Container.find old_pdx nl
 308         old_s = Container.find old_sdx nl
 309         int_p = Node.removePri old_p inst
 310         int_s = Node.removeSec old_s inst
 311         force_p = Node.offline old_p
 312         new_nl = do -- Maybe monad
 313           new_p <- Node.addPriEx force_p int_s inst
 314           new_s <- Node.addSec int_p inst old_sdx
 315           let new_inst = Instance.setBoth inst old_sdx old_pdx
 316           return (Container.addTwo old_pdx new_s old_sdx new_p nl,
 317                   new_inst, old_sdx, old_pdx)
 318     in new_nl
 319
 320 -- Replace the primary (f:, r:np, f)
 321 applyMove nl inst (ReplacePrimary new_pdx) =
 322     let old_pdx = Instance.pNode inst
 323         old_sdx = Instance.sNode inst
 324         old_p = Container.find old_pdx nl
 325         old_s = Container.find old_sdx nl
 326         tgt_n = Container.find new_pdx nl
 327         int_p = Node.removePri old_p inst
 328         int_s = Node.removeSec old_s inst
 329         force_p = Node.offline old_p
 330         new_nl = do -- Maybe monad
 331           -- check that the current secondary can host the instance
 332           -- during the migration
 333           tmp_s <- Node.addPriEx force_p int_s inst
 334           let tmp_s' = Node.removePri tmp_s inst
 335           new_p <- Node.addPriEx force_p tgt_n inst
 336           new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
 337           let new_inst = Instance.setPri inst new_pdx
 338           return (Container.add new_pdx new_p $
 339                   Container.addTwo old_pdx int_p old_sdx new_s nl,
 340                   new_inst, new_pdx, old_sdx)
 341     in new_nl
 342
 343 -- Replace the secondary (r:ns)
 344 applyMove nl inst (ReplaceSecondary new_sdx) =
 345     let old_pdx = Instance.pNode inst
 346         old_sdx = Instance.sNode inst
 347         old_s = Container.find old_sdx nl
 348         tgt_n = Container.find new_sdx nl
 349         int_s = Node.removeSec old_s inst
 350         force_s = Node.offline old_s
 351         new_inst = Instance.setSec inst new_sdx
 352         new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
 353                  \new_s -> return (Container.addTwo new_sdx
 354                                    new_s old_sdx int_s nl,
 355                                    new_inst, old_pdx, new_sdx)
 356     in new_nl
 357
 358 -- Replace the secondary and failover (r:np, f)
 359 applyMove nl inst (ReplaceAndFailover new_pdx) =
 360     let old_pdx = Instance.pNode inst
 361         old_sdx = Instance.sNode inst
 362         old_p = Container.find old_pdx nl
 363         old_s = Container.find old_sdx nl
 364         tgt_n = Container.find new_pdx nl
 365         int_p = Node.removePri old_p inst
 366         int_s = Node.removeSec old_s inst
 367         force_s = Node.offline old_s
 368         new_nl = do -- Maybe monad
 369           new_p <- Node.addPri tgt_n inst
 370           new_s <- Node.addSecEx force_s int_p inst new_pdx
 371           let new_inst = Instance.setBoth inst new_pdx old_pdx
 372           return (Container.add new_pdx new_p $
 373                   Container.addTwo old_pdx new_s old_sdx int_s nl,
 374                   new_inst, new_pdx, old_pdx)
 375     in new_nl
 376
 377 -- Failver and replace the secondary (f, r:ns)
 378 applyMove nl inst (FailoverAndReplace new_sdx) =
 379     let old_pdx = Instance.pNode inst
 380         old_sdx = Instance.sNode inst
 381         old_p = Container.find old_pdx nl
 382         old_s = Container.find old_sdx nl
 383         tgt_n = Container.find new_sdx nl
 384         int_p = Node.removePri old_p inst
 385         int_s = Node.removeSec old_s inst
 386         force_p = Node.offline old_p
 387         new_nl = do -- Maybe monad
 388           new_p <- Node.addPriEx force_p int_s inst
 389           new_s <- Node.addSecEx force_p tgt_n inst old_sdx
 390           let new_inst = Instance.setBoth inst old_sdx new_sdx
 391           return (Container.add new_sdx new_s $
 392                   Container.addTwo old_sdx new_p old_pdx int_p nl,
 393                   new_inst, old_sdx, new_sdx)
 394     in new_nl
 395
 396 -- | Tries to allocate an instance on one given node.
 397 allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
 398                  -> OpResult Node.AllocElement
 399 allocateOnSingle nl inst p =
 400     let new_pdx = Node.idx p
 401         new_inst = Instance.setBoth inst new_pdx Node.noSecondary
 402     in  Node.addPri p inst >>= \new_p -> do
 403       let new_nl = Container.add new_pdx new_p nl
 404           new_score = compCV nl
 405       return (new_nl, new_inst, [new_p], new_score)
 406
 407 -- | Tries to allocate an instance on a given pair of nodes.
 408 allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
 409                -> OpResult Node.AllocElement
 410 allocateOnPair nl inst tgt_p tgt_s =
 411     let new_pdx = Node.idx tgt_p
 412         new_sdx = Node.idx tgt_s
 413     in do
 414       new_p <- Node.addPri tgt_p inst
 415       new_s <- Node.addSec tgt_s inst new_pdx
 416       let new_inst = Instance.setBoth inst new_pdx new_sdx
 417           new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
 418       return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
 419
 420 -- | Tries to perform an instance move and returns the best table
 421 -- between the original one and the new one.
 422 checkSingleStep :: Table -- ^ The original table
 423                 -> Instance.Instance -- ^ The instance to move
 424                 -> Table -- ^ The current best table
 425                 -> IMove -- ^ The move to apply
 426                 -> Table -- ^ The final best table
 427 checkSingleStep ini_tbl target cur_tbl move =
 428     let
 429         Table ini_nl ini_il _ ini_plc = ini_tbl
 430         tmp_resu = applyMove ini_nl target move
 431     in
 432       case tmp_resu of
 433         OpFail _ -> cur_tbl
 434         OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
 435             let tgt_idx = Instance.idx target
 436                 upd_cvar = compCV upd_nl
 437                 upd_il = Container.add tgt_idx new_inst ini_il
 438                 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
 439                 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
 440             in
 441               compareTables cur_tbl upd_tbl
 442
 443 -- | Given the status of the current secondary as a valid new node and
 444 -- the current candidate target node, generate the possible moves for
 445 -- a instance.
 446 possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
 447               -> Ndx       -- ^ Target node candidate
 448               -> [IMove]   -- ^ List of valid result moves
 449 possibleMoves True tdx =
 450     [ReplaceSecondary tdx,
 451      ReplaceAndFailover tdx,
 452      ReplacePrimary tdx,
 453      FailoverAndReplace tdx]
 454
 455 possibleMoves False tdx =
 456     [ReplaceSecondary tdx,
 457      ReplaceAndFailover tdx]
 458
 459 -- | Compute the best move for a given instance.
 460 checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
 461                   -> Bool              -- ^ Whether disk moves are allowed
 462                   -> Table             -- ^ Original table
 463                   -> Instance.Instance -- ^ Instance to move
 464                   -> Table             -- ^ Best new table for this instance
 465 checkInstanceMove nodes_idx disk_moves ini_tbl target =
 466     let
 467         opdx = Instance.pNode target
 468         osdx = Instance.sNode target
 469         nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
 470         use_secondary = elem osdx nodes_idx
 471         aft_failover = if use_secondary -- if allowed to failover
 472                        then checkSingleStep ini_tbl target ini_tbl Failover
 473                        else ini_tbl
 474         all_moves = if disk_moves
 475                     then concatMap (possibleMoves use_secondary) nodes
 476                     else []
 477     in
 478       -- iterate over the possible nodes for this instance
 479       foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
 480
 481 -- | Compute the best next move.
 482 checkMove :: [Ndx]               -- ^ Allowed target node indices
 483           -> Bool                -- ^ Whether disk moves are allowed
 484           -> Table               -- ^ The current solution
 485           -> [Instance.Instance] -- ^ List of instances still to move
 486           -> Table               -- ^ The new solution
 487 checkMove nodes_idx disk_moves ini_tbl victims =
 488     let Table _ _ _ ini_plc = ini_tbl
 489         -- iterate over all instances, computing the best move
 490         best_tbl =
 491             foldl'
 492             (\ step_tbl em ->
 493                  compareTables step_tbl $
 494                  checkInstanceMove nodes_idx disk_moves ini_tbl em)
 495             ini_tbl victims
 496         Table _ _ _ best_plc = best_tbl
 497     in if length best_plc == length ini_plc
 498        then ini_tbl -- no advancement
 499        else best_tbl
 500
 501 -- | Check if we are allowed to go deeper in the balancing
 502 doNextBalance :: Table     -- ^ The starting table
 503               -> Int       -- ^ Remaining length
 504               -> Score     -- ^ Score at which to stop
 505               -> Bool      -- ^ The resulting table and commands
 506 doNextBalance ini_tbl max_rounds min_score =
 507     let Table _ _ ini_cv ini_plc = ini_tbl
 508         ini_plc_len = length ini_plc
 509     in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
 510
 511 -- | Run a balance move
 512 tryBalance :: Table       -- ^ The starting table
 513            -> Bool        -- ^ Allow disk moves
 514            -> Bool        -- ^ Only evacuate moves
 515            -> Score       -- ^ Min gain threshold
 516            -> Score       -- ^ Min gain
 517            -> Maybe Table -- ^ The resulting table and commands
 518 tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
 519     let Table ini_nl ini_il ini_cv _ = ini_tbl
 520         all_inst = Container.elems ini_il
 521         all_inst' = if evac_mode
 522                     then let bad_nodes = map Node.idx . filter Node.offline $
 523                                          Container.elems ini_nl
 524                          in filter (\e -> Instance.sNode e `elem` bad_nodes ||
 525                                           Instance.pNode e `elem` bad_nodes)
 526                             all_inst
 527                     else all_inst
 528         reloc_inst = filter Instance.movable all_inst'
 529         node_idx = map Node.idx . filter (not . Node.offline) $
 530                    Container.elems ini_nl
 531         fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
 532         (Table _ _ fin_cv _) = fin_tbl
 533     in
 534       if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
 535       then Just fin_tbl -- this round made success, return the new table
 536       else Nothing
 537
 538 -- * Allocation functions
 539
 540 -- | Build failure stats out of a list of failures
 541 collapseFailures :: [FailMode] -> FailStats
 542 collapseFailures flst =
 543     map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
 544
 545 -- | Update current Allocation solution and failure stats with new
 546 -- elements
 547 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
 548 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
 549
 550 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
 551     let -- Choose the old or new solution, based on the cluster score
 552         cntok = asAllocs as
 553         osols = asSolutions as
 554         nsols = case osols of
 555                   [] -> [ns]
 556                   (_, _, _, oscore):[] ->
 557                       if oscore < nscore
 558                       then osols
 559                       else [ns]
 560                   -- FIXME: here we simply concat to lists with more
 561                   -- than one element; we should instead abort, since
 562                   -- this is not a valid usage of this function
 563                   xs -> ns:xs
 564         nsuc = cntok + 1
 565     -- Note: we force evaluation of nsols here in order to keep the
 566     -- memory profile low - we know that we will need nsols for sure
 567     -- in the next cycle, so we force evaluation of nsols, since the
 568     -- foldl' in the caller will only evaluate the tuple, but not the
 569     -- elements of the tuple
 570     in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
 571
 572 -- | Try to allocate an instance on the cluster.
 573 tryAlloc :: (Monad m) =>
 574             Node.List         -- ^ The node list
 575          -> Instance.List     -- ^ The instance list
 576          -> Instance.Instance -- ^ The instance to allocate
 577          -> Int               -- ^ Required number of nodes
 578          -> m AllocSolution   -- ^ Possible solution list
 579 tryAlloc nl _ inst 2 =
 580     let all_nodes = getOnline nl
 581         all_pairs = liftM2 (,) all_nodes all_nodes
 582         ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
 583         sols = foldl' (\cstate (p, s) ->
 584                            concatAllocs cstate $ allocateOnPair nl inst p s
 585                       ) emptySolution ok_pairs
 586     in return sols
 587
 588 tryAlloc nl _ inst 1 =
 589     let all_nodes = getOnline nl
 590         sols = foldl' (\cstate ->
 591                            concatAllocs cstate . allocateOnSingle nl inst
 592                       ) emptySolution all_nodes
 593     in return sols
 594
 595 tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
 596                              \destinations required (" ++ show reqn ++
 597                                                "), only two supported"
 598
 599 -- | Try to allocate an instance on the cluster.
 600 tryReloc :: (Monad m) =>
 601             Node.List       -- ^ The node list
 602          -> Instance.List   -- ^ The instance list
 603          -> Idx             -- ^ The index of the instance to move
 604          -> Int             -- ^ The number of nodes required
 605          -> [Ndx]           -- ^ Nodes which should not be used
 606          -> m AllocSolution -- ^ Solution list
 607 tryReloc nl il xid 1 ex_idx =
 608     let all_nodes = getOnline nl
 609         inst = Container.find xid il
 610         ex_idx' = Instance.pNode inst:ex_idx
 611         valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
 612         valid_idxes = map Node.idx valid_nodes
 613         sols1 = foldl' (\cstate x ->
 614                             let em = do
 615                                   (mnl, i, _, _) <-
 616                                       applyMove nl inst (ReplaceSecondary x)
 617                                   return (mnl, i, [Container.find x mnl],
 618                                           compCV mnl)
 619                             in concatAllocs cstate em
 620                        ) emptySolution valid_idxes
 621     in return sols1
 622
 623 tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
 624                                 \destinations required (" ++ show reqn ++
 625                                                   "), only one supported"
 626
 627 -- | Try to evacuate a list of nodes.
 628 tryEvac :: (Monad m) =>
 629             Node.List       -- ^ The node list
 630          -> Instance.List   -- ^ The instance list
 631          -> [Ndx]           -- ^ Nodes to be evacuated
 632          -> m AllocSolution -- ^ Solution list
 633 tryEvac nl il ex_ndx =
 634     let ex_nodes = map (`Container.find` nl) ex_ndx
 635         all_insts = nub . concatMap Node.sList $ ex_nodes
 636     in do
 637       (_, sol) <- foldM (\(nl', old_as) idx -> do
 638                             -- FIXME: hardcoded one node here
 639                             -- (fm, cs, aes)
 640                             new_as <- tryReloc nl' il idx 1 ex_ndx
 641                             case asSolutions new_as of
 642                               csol@(nl'', _, _, _):_ ->
 643                                 -- an individual relocation succeeded,
 644                                 -- we kind of compose the data from
 645                                 -- the two solutions
 646                                 return (nl'',
 647                                         new_as { asSolutions =
 648                                                     csol:asSolutions old_as })
 649                               -- this relocation failed, so we fail
 650                               -- the entire evac
 651                               _ -> fail $ "Can't evacuate instance " ++
 652                                    Instance.name (Container.find idx il)
 653                         ) (nl, emptySolution) all_insts
 654       return sol
 655
 656 -- | Recursively place instances on the cluster until we're out of space
 657 iterateAlloc :: Node.List
 658              -> Instance.List
 659              -> Instance.Instance
 660              -> Int
 661              -> [Instance.Instance]
 662              -> Result (FailStats, Node.List, Instance.List,
 663                         [Instance.Instance])
 664 iterateAlloc nl il newinst nreq ixes =
 665       let depth = length ixes
 666           newname = printf "new-%d" depth::String
 667           newidx = length (Container.elems il) + depth
 668           newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
 669       in case tryAlloc nl il newi2 nreq of
 670            Bad s -> Bad s
 671            Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
 672                case sols3 of
 673                  [] -> Ok (collapseFailures errs, nl, il, ixes)
 674                  (xnl, xi, _, _):[] ->
 675                      iterateAlloc xnl (Container.add newidx xi il)
 676                                   newinst nreq $! (xi:ixes)
 677                  _ -> Bad "Internal error: multiple solutions for single\
 678                           \ allocation"
 679
 680 tieredAlloc :: Node.List
 681             -> Instance.List
 682             -> Instance.Instance
 683             -> Int
 684             -> [Instance.Instance]
 685             -> Result (FailStats, Node.List, Instance.List,
 686                        [Instance.Instance])
 687 tieredAlloc nl il newinst nreq ixes =
 688     case iterateAlloc nl il newinst nreq ixes of
 689       Bad s -> Bad s
 690       Ok (errs, nl', il', ixes') ->
 691           case Instance.shrinkByType newinst . fst . last $
 692                sortBy (comparing snd) errs of
 693             Bad _ -> Ok (errs, nl', il', ixes')
 694             Ok newinst' ->
 695                 tieredAlloc nl' il' newinst' nreq ixes'
 696
 697 -- * Formatting functions
 698
 699 -- | Given the original and final nodes, computes the relocation description.
 700 computeMoves :: Instance.Instance -- ^ The instance to be moved
 701              -> String -- ^ The instance name
 702              -> IMove  -- ^ The move being performed
 703              -> String -- ^ New primary
 704              -> String -- ^ New secondary
 705              -> (String, [String])
 706                 -- ^ Tuple of moves and commands list; moves is containing
 707                 -- either @/f/@ for failover or @/r:name/@ for replace
 708                 -- secondary, while the command list holds gnt-instance
 709                 -- commands (without that prefix), e.g \"@failover instance1@\"
 710 computeMoves i inam mv c d =
 711     case mv of
 712       Failover -> ("f", [mig])
 713       FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
 714       ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
 715       ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
 716       ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
 717     where morf = if Instance.running i then "migrate" else "failover"
 718           mig = printf "%s -f %s" morf inam::String
 719           rep n = printf "replace-disks -n %s %s" n inam
 720
 721 -- | Converts a placement to string format.
 722 printSolutionLine :: Node.List     -- ^ The node list
 723                   -> Instance.List -- ^ The instance list
 724                   -> Int           -- ^ Maximum node name length
 725                   -> Int           -- ^ Maximum instance name length
 726                   -> Placement     -- ^ The current placement
 727                   -> Int           -- ^ The index of the placement in
 728                                    -- the solution
 729                   -> (String, [String])
 730 printSolutionLine nl il nmlen imlen plc pos =
 731     let
 732         pmlen = (2*nmlen + 1)
 733         (i, p, s, mv, c) = plc
 734         inst = Container.find i il
 735         inam = Instance.alias inst
 736         npri = Node.alias $ Container.find p nl
 737         nsec = Node.alias $ Container.find s nl
 738         opri = Node.alias $ Container.find (Instance.pNode inst) nl
 739         osec = Node.alias $ Container.find (Instance.sNode inst) nl
 740         (moves, cmds) =  computeMoves inst inam mv npri nsec
 741         ostr = printf "%s:%s" opri osec::String
 742         nstr = printf "%s:%s" npri nsec::String
 743     in
 744       (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
 745        pos imlen inam pmlen ostr
 746        pmlen nstr c moves,
 747        cmds)
 748
 749 -- | Return the instance and involved nodes in an instance move.
 750 involvedNodes :: Instance.List -> Placement -> [Ndx]
 751 involvedNodes il plc =
 752     let (i, np, ns, _, _) = plc
 753         inst = Container.find i il
 754         op = Instance.pNode inst
 755         os = Instance.sNode inst
 756     in nub [np, ns, op, os]
 757
 758 -- | Inner function for splitJobs, that either appends the next job to
 759 -- the current jobset, or starts a new jobset.
 760 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
 761 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
 762 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
 763     | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
 764     | otherwise = ([n]:cjs, ndx)
 765
 766 -- | Break a list of moves into independent groups. Note that this
 767 -- will reverse the order of jobs.
 768 splitJobs :: [MoveJob] -> [JobSet]
 769 splitJobs = fst . foldl mergeJobs ([], [])
 770
 771 -- | Given a list of commands, prefix them with @gnt-instance@ and
 772 -- also beautify the display a little.
 773 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
 774 formatJob jsn jsl (sn, (_, _, _, cmds)) =
 775     let out =
 776             printf "  echo job %d/%d" jsn sn:
 777             printf "  check":
 778             map ("  gnt-instance " ++) cmds
 779     in if sn == 1
 780        then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
 781        else out
 782
 783 -- | Given a list of commands, prefix them with @gnt-instance@ and
 784 -- also beautify the display a little.
 785 formatCmds :: [JobSet] -> String
 786 formatCmds =
 787     unlines .
 788     concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
 789                              (zip [1..] js)) .
 790     zip [1..]
 791
 792 -- | Print the node list.
 793 printNodes :: Node.List -> [String] -> String
 794 printNodes nl fs =
 795     let fields = case fs of
 796           [] -> Node.defaultFields
 797           "+":rest -> Node.defaultFields ++ rest
 798           _ -> fs
 799         snl = sortBy (comparing Node.idx) (Container.elems nl)
 800         (header, isnum) = unzip $ map Node.showHeader fields
 801     in unlines . map ((:) ' ' .  intercalate " ") $
 802        formatTable (header:map (Node.list fields) snl) isnum
 803
 804 -- | Print the instance list.
 805 printInsts :: Node.List -> Instance.List -> String
 806 printInsts nl il =
 807     let sil = sortBy (comparing Instance.idx) (Container.elems il)
 808         helper inst = [ if Instance.running inst then "R" else " "
 809                       , Instance.name inst
 810                       , Container.nameOf nl (Instance.pNode inst)
 811                       , let sdx = Instance.sNode inst
 812                         in if sdx == Node.noSecondary
 813                            then  ""
 814                            else Container.nameOf nl sdx
 815                       , printf "%3d" $ Instance.vcpus inst
 816                       , printf "%5d" $ Instance.mem inst
 817                       , printf "%5d" $ Instance.dsk inst `div` 1024
 818                       , printf "%5.3f" lC
 819                       , printf "%5.3f" lM
 820                       , printf "%5.3f" lD
 821                       , printf "%5.3f" lN
 822                       ]
 823             where DynUtil lC lM lD lN = Instance.util inst
 824         header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
 825                  , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
 826         isnum = False:False:False:False:repeat True
 827     in unlines . map ((:) ' ' . intercalate " ") $
 828        formatTable (header:map helper sil) isnum
 829
 830 -- | Shows statistics for a given node list.
 831 printStats :: Node.List -> String
 832 printStats nl =
 833     let dcvs = compDetailedCV nl
 834         (weights, names) = unzip detailedCVInfo
 835         hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
 836         formatted = map (\(w, header, val) ->
 837                              printf "%s=%.8f(x%.2f)" header val w::String) hd
 838     in intercalate ", " formatted
 839
 840 -- | Convert a placement into a list of OpCodes (basically a job).
 841 iMoveToJob :: Node.List -> Instance.List
 842           -> Idx -> IMove -> [OpCodes.OpCode]
 843 iMoveToJob nl il idx move =
 844     let inst = Container.find idx il
 845         iname = Instance.name inst
 846         lookNode  = Just . Container.nameOf nl
 847         opF = if Instance.running inst
 848               then OpCodes.OpMigrateInstance iname True False
 849               else OpCodes.OpFailoverInstance iname False
 850         opR n = OpCodes.OpReplaceDisks iname (lookNode n)
 851                 OpCodes.ReplaceNewSecondary [] Nothing
 852     in case move of
 853          Failover -> [ opF ]
 854          ReplacePrimary np -> [ opF, opR np, opF ]
 855          ReplaceSecondary ns -> [ opR ns ]
 856          ReplaceAndFailover np -> [ opR np, opF ]
 857          FailoverAndReplace ns -> [ opF, opR ns ]
 858
 859 -- | Computes the group of an instance
 860 instanceGroup :: Node.List -> Instance.Instance -> Result GroupID
 861 instanceGroup nl i =
 862   let sidx = Instance.sNode i
 863       pnode = Container.find (Instance.pNode i) nl
 864       snode = if sidx == Node.noSecondary
 865               then pnode
 866               else Container.find sidx nl
 867       puuid = Node.group pnode
 868       suuid = Node.group snode
 869   in if puuid /= suuid
 870      then fail ("Instance placed accross two node groups, primary " ++ puuid ++
 871                 ", secondary " ++ suuid)
 872      else return puuid
 873
 874 -- | Compute the list of badly allocated instances (split across node
 875 -- groups)
 876 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
 877 findSplitInstances nl il =
 878   filter (not . isOk . instanceGroup nl) (Container.elems il)
 879
 880 -- | Splits a cluster into the component node groups
 881 splitCluster :: Node.List -> Instance.List ->
 882                 [(GroupID, (Node.List, Instance.List))]
 883 splitCluster nl il =
 884   let ngroups = Node.computeGroups (Container.elems nl)
 885   in map (\(guuid, nodes) ->
 886            let nidxs = map Node.idx nodes
 887                nodes' = zip nidxs nodes
 888                instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
 889            in (guuid, (Container.fromAssocList nodes', instances))) ngroups