code.grnet.gr Git - ganeti-local/blob - htools/Ganeti/HTools/Cluster.hs

   1 {-| Implementation of cluster-wide logic.
   2
   3 This module holds all pure cluster-logic; I\/O related functionality
   4 goes into the /Main/ module for the individual binaries.
   5
   6 -}
   7
   8 {-
   9
  10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
  11
  12 This program is free software; you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation; either version 2 of the License, or
  15 (at your option) any later version.
  16
  17 This program is distributed in the hope that it will be useful, but
  18 WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with this program; if not, write to the Free Software
  24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  25 02110-1301, USA.
  26
  27 -}
  28
  29 module Ganeti.HTools.Cluster
  30   (
  31     -- * Types
  32     AllocSolution(..)
  33   , EvacSolution(..)
  34   , Table(..)
  35   , CStats(..)
  36   , AllocStats
  37   , AllocResult
  38   , AllocMethod
  39   -- * Generic functions
  40   , totalResources
  41   , computeAllocationDelta
  42   -- * First phase functions
  43   , computeBadItems
  44   -- * Second phase functions
  45   , printSolutionLine
  46   , formatCmds
  47   , involvedNodes
  48   , splitJobs
  49   -- * Display functions
  50   , printNodes
  51   , printInsts
  52   -- * Balacing functions
  53   , checkMove
  54   , doNextBalance
  55   , tryBalance
  56   , compCV
  57   , compCVNodes
  58   , compDetailedCV
  59   , printStats
  60   , iMoveToJob
  61   -- * IAllocator functions
  62   , genAllocNodes
  63   , tryAlloc
  64   , tryMGAlloc
  65   , tryNodeEvac
  66   , tryChangeGroup
  67   , collapseFailures
  68   -- * Allocation functions
  69   , iterateAlloc
  70   , tieredAlloc
  71   -- * Node group functions
  72   , instanceGroup
  73   , findSplitInstances
  74   , splitCluster
  75   ) where
  76
  77 import qualified Data.IntSet as IntSet
  78 import Data.List
  79 import Data.Maybe (fromJust, isNothing)
  80 import Data.Ord (comparing)
  81 import Text.Printf (printf)
  82
  83 import qualified Ganeti.HTools.Container as Container
  84 import qualified Ganeti.HTools.Instance as Instance
  85 import qualified Ganeti.HTools.Node as Node
  86 import qualified Ganeti.HTools.Group as Group
  87 import Ganeti.HTools.Types
  88 import Ganeti.HTools.Utils
  89 import Ganeti.HTools.Compat
  90 import qualified Ganeti.OpCodes as OpCodes
  91
  92 -- * Types
  93
  94 -- | Allocation\/relocation solution.
  95 data AllocSolution = AllocSolution
  96   { asFailures :: [FailMode]              -- ^ Failure counts
  97   , asAllocs   :: Int                     -- ^ Good allocation count
  98   , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  99   , asLog      :: [String]                -- ^ Informational messages
 100   }
 101
 102 -- | Node evacuation/group change iallocator result type. This result
 103 -- type consists of actual opcodes (a restricted subset) that are
 104 -- transmitted back to Ganeti.
 105 data EvacSolution = EvacSolution
 106   { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
 107   , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
 108                                       -- relocated
 109   , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
 110   } deriving (Show)
 111
 112 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
 113 type AllocResult = (FailStats, Node.List, Instance.List,
 114                     [Instance.Instance], [CStats])
 115
 116 -- | A type denoting the valid allocation mode/pairs.
 117 --
 118 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
 119 -- for a two-node allocation, this will be a @Right [('Ndx',
 120 -- ['Ndx'])]@. In the latter case, the list is basically an
 121 -- association list, grouped by primary node and holding the potential
 122 -- secondary nodes in the sub-list.
 123 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
 124
 125 -- | The empty solution we start with when computing allocations.
 126 emptyAllocSolution :: AllocSolution
 127 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
 128                                    , asSolution = Nothing, asLog = [] }
 129
 130 -- | The empty evac solution.
 131 emptyEvacSolution :: EvacSolution
 132 emptyEvacSolution = EvacSolution { esMoved = []
 133                                  , esFailed = []
 134                                  , esOpCodes = []
 135                                  }
 136
 137 -- | The complete state for the balancing solution.
 138 data Table = Table Node.List Instance.List Score [Placement]
 139              deriving (Show, Read)
 140
 141 -- | Cluster statistics data type.
 142 data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
 143                      , csFdsk :: Integer -- ^ Cluster free disk
 144                      , csAmem :: Integer -- ^ Cluster allocatable mem
 145                      , csAdsk :: Integer -- ^ Cluster allocatable disk
 146                      , csAcpu :: Integer -- ^ Cluster allocatable cpus
 147                      , csMmem :: Integer -- ^ Max node allocatable mem
 148                      , csMdsk :: Integer -- ^ Max node allocatable disk
 149                      , csMcpu :: Integer -- ^ Max node allocatable cpu
 150                      , csImem :: Integer -- ^ Instance used mem
 151                      , csIdsk :: Integer -- ^ Instance used disk
 152                      , csIcpu :: Integer -- ^ Instance used cpu
 153                      , csTmem :: Double  -- ^ Cluster total mem
 154                      , csTdsk :: Double  -- ^ Cluster total disk
 155                      , csTcpu :: Double  -- ^ Cluster total cpus
 156                      , csVcpu :: Integer -- ^ Cluster virtual cpus (if
 157                                          -- node pCpu has been set,
 158                                          -- otherwise -1)
 159                      , csXmem :: Integer -- ^ Unnacounted for mem
 160                      , csNmem :: Integer -- ^ Node own memory
 161                      , csScore :: Score  -- ^ The cluster score
 162                      , csNinst :: Int    -- ^ The total number of instances
 163                      }
 164             deriving (Show, Read)
 165
 166 -- | Currently used, possibly to allocate, unallocable.
 167 type AllocStats = (RSpec, RSpec, RSpec)
 168
 169 -- | A simple type for allocation functions.
 170 type AllocMethod =  Node.List           -- ^ Node list
 171                  -> Instance.List       -- ^ Instance list
 172                  -> Maybe Int           -- ^ Optional allocation limit
 173                  -> Instance.Instance   -- ^ Instance spec for allocation
 174                  -> AllocNodes          -- ^ Which nodes we should allocate on
 175                  -> [Instance.Instance] -- ^ Allocated instances
 176                  -> [CStats]            -- ^ Running cluster stats
 177                  -> Result AllocResult  -- ^ Allocation result
 178
 179 -- * Utility functions
 180
 181 -- | Verifies the N+1 status and return the affected nodes.
 182 verifyN1 :: [Node.Node] -> [Node.Node]
 183 verifyN1 = filter Node.failN1
 184
 185 {-| Computes the pair of bad nodes and instances.
 186
 187 The bad node list is computed via a simple 'verifyN1' check, and the
 188 bad instance list is the list of primary and secondary instances of
 189 those nodes.
 190
 191 -}
 192 computeBadItems :: Node.List -> Instance.List ->
 193                    ([Node.Node], [Instance.Instance])
 194 computeBadItems nl il =
 195   let bad_nodes = verifyN1 $ getOnline nl
 196       bad_instances = map (`Container.find` il) .
 197                       sort . nub $
 198                       concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
 199   in
 200     (bad_nodes, bad_instances)
 201
 202 -- | Extracts the node pairs for an instance. This can fail if the
 203 -- instance is single-homed. FIXME: this needs to be improved,
 204 -- together with the general enhancement for handling non-DRBD moves.
 205 instanceNodes :: Node.List -> Instance.Instance ->
 206                  (Ndx, Ndx, Node.Node, Node.Node)
 207 instanceNodes nl inst =
 208   let old_pdx = Instance.pNode inst
 209       old_sdx = Instance.sNode inst
 210       old_p = Container.find old_pdx nl
 211       old_s = Container.find old_sdx nl
 212   in (old_pdx, old_sdx, old_p, old_s)
 213
 214 -- | Zero-initializer for the CStats type.
 215 emptyCStats :: CStats
 216 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 217
 218 -- | Update stats with data from a new node.
 219 updateCStats :: CStats -> Node.Node -> CStats
 220 updateCStats cs node =
 221   let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
 222                csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
 223                csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
 224                csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
 225                csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
 226                csVcpu = x_vcpu,
 227                csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
 228              }
 229         = cs
 230       inc_amem = Node.fMem node - Node.rMem node
 231       inc_amem' = if inc_amem > 0 then inc_amem else 0
 232       inc_adsk = Node.availDisk node
 233       inc_imem = truncate (Node.tMem node) - Node.nMem node
 234                  - Node.xMem node - Node.fMem node
 235       inc_icpu = Node.uCpu node
 236       inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
 237       inc_vcpu = Node.hiCpu node
 238       inc_acpu = Node.availCpu node
 239   in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
 240         , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
 241         , csAmem = x_amem + fromIntegral inc_amem'
 242         , csAdsk = x_adsk + fromIntegral inc_adsk
 243         , csAcpu = x_acpu + fromIntegral inc_acpu
 244         , csMmem = max x_mmem (fromIntegral inc_amem')
 245         , csMdsk = max x_mdsk (fromIntegral inc_adsk)
 246         , csMcpu = max x_mcpu (fromIntegral inc_acpu)
 247         , csImem = x_imem + fromIntegral inc_imem
 248         , csIdsk = x_idsk + fromIntegral inc_idsk
 249         , csIcpu = x_icpu + fromIntegral inc_icpu
 250         , csTmem = x_tmem + Node.tMem node
 251         , csTdsk = x_tdsk + Node.tDsk node
 252         , csTcpu = x_tcpu + Node.tCpu node
 253         , csVcpu = x_vcpu + fromIntegral inc_vcpu
 254         , csXmem = x_xmem + fromIntegral (Node.xMem node)
 255         , csNmem = x_nmem + fromIntegral (Node.nMem node)
 256         , csNinst = x_ninst + length (Node.pList node)
 257         }
 258
 259 -- | Compute the total free disk and memory in the cluster.
 260 totalResources :: Node.List -> CStats
 261 totalResources nl =
 262   let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
 263   in cs { csScore = compCV nl }
 264
 265 -- | Compute the delta between two cluster state.
 266 --
 267 -- This is used when doing allocations, to understand better the
 268 -- available cluster resources. The return value is a triple of the
 269 -- current used values, the delta that was still allocated, and what
 270 -- was left unallocated.
 271 computeAllocationDelta :: CStats -> CStats -> AllocStats
 272 computeAllocationDelta cini cfin =
 273   let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
 274       CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
 275               csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
 276       rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
 277              (fromIntegral i_idsk)
 278       rfin = RSpec (fromIntegral (f_icpu - i_icpu))
 279              (fromIntegral (f_imem - i_imem))
 280              (fromIntegral (f_idsk - i_idsk))
 281       un_cpu = fromIntegral (v_cpu - f_icpu)::Int
 282       runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
 283              (truncate t_dsk - fromIntegral f_idsk)
 284   in (rini, rfin, runa)
 285
 286 -- | The names and weights of the individual elements in the CV list.
 287 detailedCVInfo :: [(Double, String)]
 288 detailedCVInfo = [ (1,  "free_mem_cv")
 289                  , (1,  "free_disk_cv")
 290                  , (1,  "n1_cnt")
 291                  , (1,  "reserved_mem_cv")
 292                  , (4,  "offline_all_cnt")
 293                  , (16, "offline_pri_cnt")
 294                  , (1,  "vcpu_ratio_cv")
 295                  , (1,  "cpu_load_cv")
 296                  , (1,  "mem_load_cv")
 297                  , (1,  "disk_load_cv")
 298                  , (1,  "net_load_cv")
 299                  , (2,  "pri_tags_score")
 300                  ]
 301
 302 -- | Holds the weights used by 'compCVNodes' for each metric.
 303 detailedCVWeights :: [Double]
 304 detailedCVWeights = map fst detailedCVInfo
 305
 306 -- | Compute the mem and disk covariance.
 307 compDetailedCV :: [Node.Node] -> [Double]
 308 compDetailedCV all_nodes =
 309   let (offline, nodes) = partition Node.offline all_nodes
 310       mem_l = map Node.pMem nodes
 311       dsk_l = map Node.pDsk nodes
 312       -- metric: memory covariance
 313       mem_cv = stdDev mem_l
 314       -- metric: disk covariance
 315       dsk_cv = stdDev dsk_l
 316       -- metric: count of instances living on N1 failing nodes
 317       n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
 318                                                  length (Node.pList n)) .
 319                  filter Node.failN1 $ nodes :: Double
 320       res_l = map Node.pRem nodes
 321       -- metric: reserved memory covariance
 322       res_cv = stdDev res_l
 323       -- offline instances metrics
 324       offline_ipri = sum . map (length . Node.pList) $ offline
 325       offline_isec = sum . map (length . Node.sList) $ offline
 326       -- metric: count of instances on offline nodes
 327       off_score = fromIntegral (offline_ipri + offline_isec)::Double
 328       -- metric: count of primary instances on offline nodes (this
 329       -- helps with evacuation/failover of primary instances on
 330       -- 2-node clusters with one node offline)
 331       off_pri_score = fromIntegral offline_ipri::Double
 332       cpu_l = map Node.pCpu nodes
 333       -- metric: covariance of vcpu/pcpu ratio
 334       cpu_cv = stdDev cpu_l
 335       -- metrics: covariance of cpu, memory, disk and network load
 336       (c_load, m_load, d_load, n_load) =
 337         unzip4 $ map (\n ->
 338                       let DynUtil c1 m1 d1 n1 = Node.utilLoad n
 339                           DynUtil c2 m2 d2 n2 = Node.utilPool n
 340                       in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
 341       -- metric: conflicting instance count
 342       pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
 343       pri_tags_score = fromIntegral pri_tags_inst::Double
 344   in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
 345      , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
 346      , pri_tags_score ]
 347
 348 -- | Compute the /total/ variance.
 349 compCVNodes :: [Node.Node] -> Double
 350 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
 351
 352 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
 353 compCV :: Node.List -> Double
 354 compCV = compCVNodes . Container.elems
 355
 356 -- | Compute online nodes from a 'Node.List'.
 357 getOnline :: Node.List -> [Node.Node]
 358 getOnline = filter (not . Node.offline) . Container.elems
 359
 360 -- * Balancing functions
 361
 362 -- | Compute best table. Note that the ordering of the arguments is important.
 363 compareTables :: Table -> Table -> Table
 364 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
 365   if a_cv > b_cv then b else a
 366
 367 -- | Applies an instance move to a given node list and instance.
 368 applyMove :: Node.List -> Instance.Instance
 369           -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
 370 -- Failover (f)
 371 applyMove nl inst Failover =
 372   let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
 373       int_p = Node.removePri old_p inst
 374       int_s = Node.removeSec old_s inst
 375       new_nl = do -- Maybe monad
 376         new_p <- Node.addPriEx (Node.offline old_p) int_s inst
 377         new_s <- Node.addSec int_p inst old_sdx
 378         let new_inst = Instance.setBoth inst old_sdx old_pdx
 379         return (Container.addTwo old_pdx new_s old_sdx new_p nl,
 380                 new_inst, old_sdx, old_pdx)
 381   in new_nl
 382
 383 -- Replace the primary (f:, r:np, f)
 384 applyMove nl inst (ReplacePrimary new_pdx) =
 385   let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
 386       tgt_n = Container.find new_pdx nl
 387       int_p = Node.removePri old_p inst
 388       int_s = Node.removeSec old_s inst
 389       force_p = Node.offline old_p
 390       new_nl = do -- Maybe monad
 391                   -- check that the current secondary can host the instance
 392                   -- during the migration
 393         tmp_s <- Node.addPriEx force_p int_s inst
 394         let tmp_s' = Node.removePri tmp_s inst
 395         new_p <- Node.addPriEx force_p tgt_n inst
 396         new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
 397         let new_inst = Instance.setPri inst new_pdx
 398         return (Container.add new_pdx new_p $
 399                 Container.addTwo old_pdx int_p old_sdx new_s nl,
 400                 new_inst, new_pdx, old_sdx)
 401   in new_nl
 402
 403 -- Replace the secondary (r:ns)
 404 applyMove nl inst (ReplaceSecondary new_sdx) =
 405   let old_pdx = Instance.pNode inst
 406       old_sdx = Instance.sNode inst
 407       old_s = Container.find old_sdx nl
 408       tgt_n = Container.find new_sdx nl
 409       int_s = Node.removeSec old_s inst
 410       force_s = Node.offline old_s
 411       new_inst = Instance.setSec inst new_sdx
 412       new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
 413                \new_s -> return (Container.addTwo new_sdx
 414                                  new_s old_sdx int_s nl,
 415                                  new_inst, old_pdx, new_sdx)
 416   in new_nl
 417
 418 -- Replace the secondary and failover (r:np, f)
 419 applyMove nl inst (ReplaceAndFailover new_pdx) =
 420   let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
 421       tgt_n = Container.find new_pdx nl
 422       int_p = Node.removePri old_p inst
 423       int_s = Node.removeSec old_s inst
 424       force_s = Node.offline old_s
 425       new_nl = do -- Maybe monad
 426         new_p <- Node.addPri tgt_n inst
 427         new_s <- Node.addSecEx force_s int_p inst new_pdx
 428         let new_inst = Instance.setBoth inst new_pdx old_pdx
 429         return (Container.add new_pdx new_p $
 430                 Container.addTwo old_pdx new_s old_sdx int_s nl,
 431                 new_inst, new_pdx, old_pdx)
 432   in new_nl
 433
 434 -- Failver and replace the secondary (f, r:ns)
 435 applyMove nl inst (FailoverAndReplace new_sdx) =
 436   let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
 437       tgt_n = Container.find new_sdx nl
 438       int_p = Node.removePri old_p inst
 439       int_s = Node.removeSec old_s inst
 440       force_p = Node.offline old_p
 441       new_nl = do -- Maybe monad
 442         new_p <- Node.addPriEx force_p int_s inst
 443         new_s <- Node.addSecEx force_p tgt_n inst old_sdx
 444         let new_inst = Instance.setBoth inst old_sdx new_sdx
 445         return (Container.add new_sdx new_s $
 446                 Container.addTwo old_sdx new_p old_pdx int_p nl,
 447                 new_inst, old_sdx, new_sdx)
 448   in new_nl
 449
 450 -- | Tries to allocate an instance on one given node.
 451 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
 452                  -> OpResult Node.AllocElement
 453 allocateOnSingle nl inst new_pdx =
 454   let p = Container.find new_pdx nl
 455       new_inst = Instance.setBoth inst new_pdx Node.noSecondary
 456   in do
 457     Instance.instMatchesPolicy inst (Node.iPolicy p)
 458     new_p <- Node.addPri p inst
 459     let new_nl = Container.add new_pdx new_p nl
 460         new_score = compCV nl
 461     return (new_nl, new_inst, [new_p], new_score)
 462
 463 -- | Tries to allocate an instance on a given pair of nodes.
 464 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
 465                -> OpResult Node.AllocElement
 466 allocateOnPair nl inst new_pdx new_sdx =
 467   let tgt_p = Container.find new_pdx nl
 468       tgt_s = Container.find new_sdx nl
 469   in do
 470     Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
 471     new_p <- Node.addPri tgt_p inst
 472     new_s <- Node.addSec tgt_s inst new_pdx
 473     let new_inst = Instance.setBoth inst new_pdx new_sdx
 474         new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
 475     return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
 476
 477 -- | Tries to perform an instance move and returns the best table
 478 -- between the original one and the new one.
 479 checkSingleStep :: Table -- ^ The original table
 480                 -> Instance.Instance -- ^ The instance to move
 481                 -> Table -- ^ The current best table
 482                 -> IMove -- ^ The move to apply
 483                 -> Table -- ^ The final best table
 484 checkSingleStep ini_tbl target cur_tbl move =
 485   let Table ini_nl ini_il _ ini_plc = ini_tbl
 486       tmp_resu = applyMove ini_nl target move
 487   in case tmp_resu of
 488        OpFail _ -> cur_tbl
 489        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
 490          let tgt_idx = Instance.idx target
 491              upd_cvar = compCV upd_nl
 492              upd_il = Container.add tgt_idx new_inst ini_il
 493              upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
 494              upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
 495          in compareTables cur_tbl upd_tbl
 496
 497 -- | Given the status of the current secondary as a valid new node and
 498 -- the current candidate target node, generate the possible moves for
 499 -- a instance.
 500 possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
 501               -> Bool      -- ^ Whether we can change the primary node
 502               -> Ndx       -- ^ Target node candidate
 503               -> [IMove]   -- ^ List of valid result moves
 504
 505 possibleMoves _ False tdx =
 506   [ReplaceSecondary tdx]
 507
 508 possibleMoves True True tdx =
 509   [ ReplaceSecondary tdx
 510   , ReplaceAndFailover tdx
 511   , ReplacePrimary tdx
 512   , FailoverAndReplace tdx
 513   ]
 514
 515 possibleMoves False True tdx =
 516   [ ReplaceSecondary tdx
 517   , ReplaceAndFailover tdx
 518   ]
 519
 520 -- | Compute the best move for a given instance.
 521 checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
 522                   -> Bool              -- ^ Whether disk moves are allowed
 523                   -> Bool              -- ^ Whether instance moves are allowed
 524                   -> Table             -- ^ Original table
 525                   -> Instance.Instance -- ^ Instance to move
 526                   -> Table             -- ^ Best new table for this instance
 527 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
 528   let opdx = Instance.pNode target
 529       osdx = Instance.sNode target
 530       bad_nodes = [opdx, osdx]
 531       nodes = filter (`notElem` bad_nodes) nodes_idx
 532       use_secondary = elem osdx nodes_idx && inst_moves
 533       aft_failover = if use_secondary -- if allowed to failover
 534                        then checkSingleStep ini_tbl target ini_tbl Failover
 535                        else ini_tbl
 536       all_moves = if disk_moves
 537                     then concatMap
 538                            (possibleMoves use_secondary inst_moves) nodes
 539                     else []
 540     in
 541       -- iterate over the possible nodes for this instance
 542       foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
 543
 544 -- | Compute the best next move.
 545 checkMove :: [Ndx]               -- ^ Allowed target node indices
 546           -> Bool                -- ^ Whether disk moves are allowed
 547           -> Bool                -- ^ Whether instance moves are allowed
 548           -> Table               -- ^ The current solution
 549           -> [Instance.Instance] -- ^ List of instances still to move
 550           -> Table               -- ^ The new solution
 551 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
 552   let Table _ _ _ ini_plc = ini_tbl
 553       -- we're using rwhnf from the Control.Parallel.Strategies
 554       -- package; we don't need to use rnf as that would force too
 555       -- much evaluation in single-threaded cases, and in
 556       -- multi-threaded case the weak head normal form is enough to
 557       -- spark the evaluation
 558       tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
 559                              inst_moves ini_tbl)
 560                victims
 561       -- iterate over all instances, computing the best move
 562       best_tbl = foldl' compareTables ini_tbl tables
 563       Table _ _ _ best_plc = best_tbl
 564   in if length best_plc == length ini_plc
 565        then ini_tbl -- no advancement
 566        else best_tbl
 567
 568 -- | Check if we are allowed to go deeper in the balancing.
 569 doNextBalance :: Table     -- ^ The starting table
 570               -> Int       -- ^ Remaining length
 571               -> Score     -- ^ Score at which to stop
 572               -> Bool      -- ^ The resulting table and commands
 573 doNextBalance ini_tbl max_rounds min_score =
 574   let Table _ _ ini_cv ini_plc = ini_tbl
 575       ini_plc_len = length ini_plc
 576   in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
 577
 578 -- | Run a balance move.
 579 tryBalance :: Table       -- ^ The starting table
 580            -> Bool        -- ^ Allow disk moves
 581            -> Bool        -- ^ Allow instance moves
 582            -> Bool        -- ^ Only evacuate moves
 583            -> Score       -- ^ Min gain threshold
 584            -> Score       -- ^ Min gain
 585            -> Maybe Table -- ^ The resulting table and commands
 586 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
 587     let Table ini_nl ini_il ini_cv _ = ini_tbl
 588         all_inst = Container.elems ini_il
 589         all_inst' = if evac_mode
 590                     then let bad_nodes = map Node.idx . filter Node.offline $
 591                                          Container.elems ini_nl
 592                          in filter (any (`elem` bad_nodes) . Instance.allNodes)
 593                             all_inst
 594                     else all_inst
 595         reloc_inst = filter Instance.movable all_inst'
 596         node_idx = map Node.idx . filter (not . Node.offline) $
 597                    Container.elems ini_nl
 598         fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
 599         (Table _ _ fin_cv _) = fin_tbl
 600     in
 601       if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
 602       then Just fin_tbl -- this round made success, return the new table
 603       else Nothing
 604
 605 -- * Allocation functions
 606
 607 -- | Build failure stats out of a list of failures.
 608 collapseFailures :: [FailMode] -> FailStats
 609 collapseFailures flst =
 610     map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
 611             [minBound..maxBound]
 612
 613 -- | Compares two Maybe AllocElement and chooses the besst score.
 614 bestAllocElement :: Maybe Node.AllocElement
 615                  -> Maybe Node.AllocElement
 616                  -> Maybe Node.AllocElement
 617 bestAllocElement a Nothing = a
 618 bestAllocElement Nothing b = b
 619 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
 620   if ascore < bscore then a else b
 621
 622 -- | Update current Allocation solution and failure stats with new
 623 -- elements.
 624 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
 625 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
 626
 627 concatAllocs as (OpGood ns) =
 628   let -- Choose the old or new solution, based on the cluster score
 629     cntok = asAllocs as
 630     osols = asSolution as
 631     nsols = bestAllocElement osols (Just ns)
 632     nsuc = cntok + 1
 633     -- Note: we force evaluation of nsols here in order to keep the
 634     -- memory profile low - we know that we will need nsols for sure
 635     -- in the next cycle, so we force evaluation of nsols, since the
 636     -- foldl' in the caller will only evaluate the tuple, but not the
 637     -- elements of the tuple
 638   in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
 639
 640 -- | Sums two 'AllocSolution' structures.
 641 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
 642 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
 643           (AllocSolution bFails bAllocs bSols bLog) =
 644   -- note: we add b first, since usually it will be smaller; when
 645   -- fold'ing, a will grow and grow whereas b is the per-group
 646   -- result, hence smaller
 647   let nFails  = bFails ++ aFails
 648       nAllocs = aAllocs + bAllocs
 649       nSols   = bestAllocElement aSols bSols
 650       nLog    = bLog ++ aLog
 651   in AllocSolution nFails nAllocs nSols nLog
 652
 653 -- | Given a solution, generates a reasonable description for it.
 654 describeSolution :: AllocSolution -> String
 655 describeSolution as =
 656   let fcnt = asFailures as
 657       sols = asSolution as
 658       freasons =
 659         intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
 660         filter ((> 0) . snd) . collapseFailures $ fcnt
 661   in case sols of
 662      Nothing -> "No valid allocation solutions, failure reasons: " ++
 663                 (if null fcnt then "unknown reasons" else freasons)
 664      Just (_, _, nodes, cv) ->
 665          printf ("score: %.8f, successes %d, failures %d (%s)" ++
 666                  " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
 667                (intercalate "/" . map Node.name $ nodes)
 668
 669 -- | Annotates a solution with the appropriate string.
 670 annotateSolution :: AllocSolution -> AllocSolution
 671 annotateSolution as = as { asLog = describeSolution as : asLog as }
 672
 673 -- | Reverses an evacuation solution.
 674 --
 675 -- Rationale: we always concat the results to the top of the lists, so
 676 -- for proper jobset execution, we should reverse all lists.
 677 reverseEvacSolution :: EvacSolution -> EvacSolution
 678 reverseEvacSolution (EvacSolution f m o) =
 679   EvacSolution (reverse f) (reverse m) (reverse o)
 680
 681 -- | Generate the valid node allocation singles or pairs for a new instance.
 682 genAllocNodes :: Group.List        -- ^ Group list
 683               -> Node.List         -- ^ The node map
 684               -> Int               -- ^ The number of nodes required
 685               -> Bool              -- ^ Whether to drop or not
 686                                    -- unallocable nodes
 687               -> Result AllocNodes -- ^ The (monadic) result
 688 genAllocNodes gl nl count drop_unalloc =
 689   let filter_fn = if drop_unalloc
 690                     then filter (Group.isAllocable .
 691                                  flip Container.find gl . Node.group)
 692                     else id
 693       all_nodes = filter_fn $ getOnline nl
 694       all_pairs = [(Node.idx p,
 695                     [Node.idx s | s <- all_nodes,
 696                                        Node.idx p /= Node.idx s,
 697                                        Node.group p == Node.group s]) |
 698                    p <- all_nodes]
 699   in case count of
 700        1 -> Ok (Left (map Node.idx all_nodes))
 701        2 -> Ok (Right (filter (not . null . snd) all_pairs))
 702        _ -> Bad "Unsupported number of nodes, only one or two  supported"
 703
 704 -- | Try to allocate an instance on the cluster.
 705 tryAlloc :: (Monad m) =>
 706             Node.List         -- ^ The node list
 707          -> Instance.List     -- ^ The instance list
 708          -> Instance.Instance -- ^ The instance to allocate
 709          -> AllocNodes        -- ^ The allocation targets
 710          -> m AllocSolution   -- ^ Possible solution list
 711 tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
 712 tryAlloc nl _ inst (Right ok_pairs) =
 713   let psols = parMap rwhnf (\(p, ss) ->
 714                               foldl' (\cstate ->
 715                                         concatAllocs cstate .
 716                                         allocateOnPair nl inst p)
 717                               emptyAllocSolution ss) ok_pairs
 718       sols = foldl' sumAllocs emptyAllocSolution psols
 719   in return $ annotateSolution sols
 720
 721 tryAlloc _  _ _    (Left []) = fail "No online nodes"
 722 tryAlloc nl _ inst (Left all_nodes) =
 723   let sols = foldl' (\cstate ->
 724                        concatAllocs cstate . allocateOnSingle nl inst
 725                     ) emptyAllocSolution all_nodes
 726   in return $ annotateSolution sols
 727
 728 -- | Given a group/result, describe it as a nice (list of) messages.
 729 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
 730 solutionDescription gl (groupId, result) =
 731   case result of
 732     Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
 733     Bad message -> [printf "Group %s: error %s" gname message]
 734   where grp = Container.find groupId gl
 735         gname = Group.name grp
 736         pol = allocPolicyToRaw (Group.allocPolicy grp)
 737
 738 -- | From a list of possibly bad and possibly empty solutions, filter
 739 -- only the groups with a valid result. Note that the result will be
 740 -- reversed compared to the original list.
 741 filterMGResults :: Group.List
 742                 -> [(Gdx, Result AllocSolution)]
 743                 -> [(Gdx, AllocSolution)]
 744 filterMGResults gl = foldl' fn []
 745   where unallocable = not . Group.isAllocable . flip Container.find gl
 746         fn accu (gdx, rasol) =
 747           case rasol of
 748             Bad _ -> accu
 749             Ok sol | isNothing (asSolution sol) -> accu
 750                    | unallocable gdx -> accu
 751                    | otherwise -> (gdx, sol):accu
 752
 753 -- | Sort multigroup results based on policy and score.
 754 sortMGResults :: Group.List
 755              -> [(Gdx, AllocSolution)]
 756              -> [(Gdx, AllocSolution)]
 757 sortMGResults gl sols =
 758   let extractScore (_, _, _, x) = x
 759       solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
 760                              (extractScore . fromJust . asSolution) sol)
 761   in sortBy (comparing solScore) sols
 762
 763 -- | Finds the best group for an instance on a multi-group cluster.
 764 --
 765 -- Only solutions in @preferred@ and @last_resort@ groups will be
 766 -- accepted as valid, and additionally if the allowed groups parameter
 767 -- is not null then allocation will only be run for those group
 768 -- indices.
 769 findBestAllocGroup :: Group.List           -- ^ The group list
 770                    -> Node.List            -- ^ The node list
 771                    -> Instance.List        -- ^ The instance list
 772                    -> Maybe [Gdx]          -- ^ The allowed groups
 773                    -> Instance.Instance    -- ^ The instance to allocate
 774                    -> Int                  -- ^ Required number of nodes
 775                    -> Result (Gdx, AllocSolution, [String])
 776 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
 777   let groups = splitCluster mgnl mgil
 778       groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
 779                 allowed_gdxs
 780       sols = map (\(gid, (nl, il)) ->
 781                    (gid, genAllocNodes mggl nl cnt False >>=
 782                        tryAlloc nl il inst))
 783              groups'::[(Gdx, Result AllocSolution)]
 784       all_msgs = concatMap (solutionDescription mggl) sols
 785       goodSols = filterMGResults mggl sols
 786       sortedSols = sortMGResults mggl goodSols
 787   in if null sortedSols
 788        then if null groups'
 789               then Bad $ "no groups for evacuation: allowed groups was" ++
 790                      show allowed_gdxs ++ ", all groups: " ++
 791                      show (map fst groups)
 792               else Bad $ intercalate ", " all_msgs
 793        else let (final_group, final_sol) = head sortedSols
 794             in return (final_group, final_sol, all_msgs)
 795
 796 -- | Try to allocate an instance on a multi-group cluster.
 797 tryMGAlloc :: Group.List           -- ^ The group list
 798            -> Node.List            -- ^ The node list
 799            -> Instance.List        -- ^ The instance list
 800            -> Instance.Instance    -- ^ The instance to allocate
 801            -> Int                  -- ^ Required number of nodes
 802            -> Result AllocSolution -- ^ Possible solution list
 803 tryMGAlloc mggl mgnl mgil inst cnt = do
 804   (best_group, solution, all_msgs) <-
 805       findBestAllocGroup mggl mgnl mgil Nothing inst cnt
 806   let group_name = Group.name $ Container.find best_group mggl
 807       selmsg = "Selected group: " ++ group_name
 808   return $ solution { asLog = selmsg:all_msgs }
 809
 810 -- | Function which fails if the requested mode is change secondary.
 811 --
 812 -- This is useful since except DRBD, no other disk template can
 813 -- execute change secondary; thus, we can just call this function
 814 -- instead of always checking for secondary mode. After the call to
 815 -- this function, whatever mode we have is just a primary change.
 816 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
 817 failOnSecondaryChange ChangeSecondary dt =
 818   fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
 819          "' can't execute change secondary"
 820 failOnSecondaryChange _ _ = return ()
 821
 822 -- | Run evacuation for a single instance.
 823 --
 824 -- /Note:/ this function should correctly execute both intra-group
 825 -- evacuations (in all modes) and inter-group evacuations (in the
 826 -- 'ChangeAll' mode). Of course, this requires that the correct list
 827 -- of target nodes is passed.
 828 nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
 829                  -> Instance.List     -- ^ Instance list (cluster-wide)
 830                  -> EvacMode          -- ^ The evacuation mode
 831                  -> Instance.Instance -- ^ The instance to be evacuated
 832                  -> Gdx               -- ^ The group we're targetting
 833                  -> [Ndx]             -- ^ The list of available nodes
 834                                       -- for allocation
 835                  -> Result (Node.List, Instance.List, [OpCodes.OpCode])
 836 nodeEvacInstance _ _ mode (Instance.Instance
 837                            {Instance.diskTemplate = dt@DTDiskless}) _ _ =
 838                   failOnSecondaryChange mode dt >>
 839                   fail "Diskless relocations not implemented yet"
 840
 841 nodeEvacInstance _ _ _ (Instance.Instance
 842                         {Instance.diskTemplate = DTPlain}) _ _ =
 843                   fail "Instances of type plain cannot be relocated"
 844
 845 nodeEvacInstance _ _ _ (Instance.Instance
 846                         {Instance.diskTemplate = DTFile}) _ _ =
 847                   fail "Instances of type file cannot be relocated"
 848
 849 nodeEvacInstance _ _ mode  (Instance.Instance
 850                             {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
 851                   failOnSecondaryChange mode dt >>
 852                   fail "Shared file relocations not implemented yet"
 853
 854 nodeEvacInstance _ _ mode (Instance.Instance
 855                            {Instance.diskTemplate = dt@DTBlock}) _ _ =
 856                   failOnSecondaryChange mode dt >>
 857                   fail "Block device relocations not implemented yet"
 858
 859 nodeEvacInstance nl il ChangePrimary
 860                  inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
 861                  _ _ =
 862   do
 863     (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
 864     let idx = Instance.idx inst
 865         il' = Container.add idx inst' il
 866         ops = iMoveToJob nl' il' idx Failover
 867     return (nl', il', ops)
 868
 869 nodeEvacInstance nl il ChangeSecondary
 870                  inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
 871                  gdx avail_nodes =
 872   do
 873     (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
 874                             eitherToResult $
 875                             foldl' (evacDrbdSecondaryInner nl inst gdx)
 876                             (Left "no nodes available") avail_nodes
 877     let idx = Instance.idx inst
 878         il' = Container.add idx inst' il
 879         ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
 880     return (nl', il', ops)
 881
 882 -- The algorithm for ChangeAll is as follows:
 883 --
 884 -- * generate all (primary, secondary) node pairs for the target groups
 885 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
 886 --   the final node list state and group score
 887 -- * select the best choice via a foldl that uses the same Either
 888 --   String solution as the ChangeSecondary mode
 889 nodeEvacInstance nl il ChangeAll
 890                  inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
 891                  gdx avail_nodes =
 892   do
 893     let no_nodes = Left "no nodes available"
 894         node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
 895     (nl', il', ops, _) <-
 896         annotateResult "Can't find any good nodes for relocation" $
 897         eitherToResult $
 898         foldl'
 899         (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
 900                           Bad msg ->
 901                               case accu of
 902                                 Right _ -> accu
 903                                 -- we don't need more details (which
 904                                 -- nodes, etc.) as we only selected
 905                                 -- this group if we can allocate on
 906                                 -- it, hence failures will not
 907                                 -- propagate out of this fold loop
 908                                 Left _ -> Left $ "Allocation failed: " ++ msg
 909                           Ok result@(_, _, _, new_cv) ->
 910                               let new_accu = Right result in
 911                               case accu of
 912                                 Left _ -> new_accu
 913                                 Right (_, _, _, old_cv) ->
 914                                     if old_cv < new_cv
 915                                     then accu
 916                                     else new_accu
 917         ) no_nodes node_pairs
 918
 919     return (nl', il', ops)
 920
 921 -- | Inner fold function for changing secondary of a DRBD instance.
 922 --
 923 -- The running solution is either a @Left String@, which means we
 924 -- don't have yet a working solution, or a @Right (...)@, which
 925 -- represents a valid solution; it holds the modified node list, the
 926 -- modified instance (after evacuation), the score of that solution,
 927 -- and the new secondary node index.
 928 evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
 929                        -> Instance.Instance -- ^ Instance being evacuated
 930                        -> Gdx -- ^ The group index of the instance
 931                        -> Either String ( Node.List
 932                                         , Instance.Instance
 933                                         , Score
 934                                         , Ndx)  -- ^ Current best solution
 935                        -> Ndx  -- ^ Node we're evaluating as new secondary
 936                        -> Either String ( Node.List
 937                                         , Instance.Instance
 938                                         , Score
 939                                         , Ndx) -- ^ New best solution
 940 evacDrbdSecondaryInner nl inst gdx accu ndx =
 941   case applyMove nl inst (ReplaceSecondary ndx) of
 942     OpFail fm ->
 943       case accu of
 944         Right _ -> accu
 945         Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
 946                   " failed: " ++ show fm
 947     OpGood (nl', inst', _, _) ->
 948       let nodes = Container.elems nl'
 949           -- The fromJust below is ugly (it can fail nastily), but
 950           -- at this point we should have any internal mismatches,
 951           -- and adding a monad here would be quite involved
 952           grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
 953           new_cv = compCVNodes grpnodes
 954           new_accu = Right (nl', inst', new_cv, ndx)
 955       in case accu of
 956            Left _ -> new_accu
 957            Right (_, _, old_cv, _) ->
 958              if old_cv < new_cv
 959                then accu
 960                else new_accu
 961
 962 -- | Compute result of changing all nodes of a DRBD instance.
 963 --
 964 -- Given the target primary and secondary node (which might be in a
 965 -- different group or not), this function will 'execute' all the
 966 -- required steps and assuming all operations succceed, will return
 967 -- the modified node and instance lists, the opcodes needed for this
 968 -- and the new group score.
 969 evacDrbdAllInner :: Node.List         -- ^ Cluster node list
 970                  -> Instance.List     -- ^ Cluster instance list
 971                  -> Instance.Instance -- ^ The instance to be moved
 972                  -> Gdx               -- ^ The target group index
 973                                       -- (which can differ from the
 974                                       -- current group of the
 975                                       -- instance)
 976                  -> (Ndx, Ndx)        -- ^ Tuple of new
 977                                       -- primary\/secondary nodes
 978                  -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
 979 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
 980   let primary = Container.find (Instance.pNode inst) nl
 981       idx = Instance.idx inst
 982   -- if the primary is offline, then we first failover
 983   (nl1, inst1, ops1) <-
 984     if Node.offline primary
 985       then do
 986         (nl', inst', _, _) <-
 987           annotateResult "Failing over to the secondary" $
 988           opToResult $ applyMove nl inst Failover
 989         return (nl', inst', [Failover])
 990       else return (nl, inst, [])
 991   let (o1, o2, o3) = (ReplaceSecondary t_pdx,
 992                       Failover,
 993                       ReplaceSecondary t_sdx)
 994   -- we now need to execute a replace secondary to the future
 995   -- primary node
 996   (nl2, inst2, _, _) <-
 997     annotateResult "Changing secondary to new primary" $
 998     opToResult $
 999     applyMove nl1 inst1 o1
1000   let ops2 = o1:ops1
1001   -- we now execute another failover, the primary stays fixed now
1002   (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1003                         opToResult $ applyMove nl2 inst2 o2
1004   let ops3 = o2:ops2
1005   -- and finally another replace secondary, to the final secondary
1006   (nl4, inst4, _, _) <-
1007     annotateResult "Changing secondary to final secondary" $
1008     opToResult $
1009     applyMove nl3 inst3 o3
1010   let ops4 = o3:ops3
1011       il' = Container.add idx inst4 il
1012       ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1013   let nodes = Container.elems nl4
1014       -- The fromJust below is ugly (it can fail nastily), but
1015       -- at this point we should have any internal mismatches,
1016       -- and adding a monad here would be quite involved
1017       grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1018       new_cv = compCVNodes grpnodes
1019   return (nl4, il', ops, new_cv)
1020
1021 -- | Computes the nodes in a given group which are available for
1022 -- allocation.
1023 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1024                     -> IntSet.IntSet  -- ^ Nodes that are excluded
1025                     -> Gdx            -- ^ The group for which we
1026                                       -- query the nodes
1027                     -> Result [Ndx]   -- ^ List of available node indices
1028 availableGroupNodes group_nodes excl_ndx gdx = do
1029   local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1030                  Ok (lookup gdx group_nodes)
1031   let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1032   return avail_nodes
1033
1034 -- | Updates the evac solution with the results of an instance
1035 -- evacuation.
1036 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1037                    -> Idx
1038                    -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1039                    -> (Node.List, Instance.List, EvacSolution)
1040 updateEvacSolution (nl, il, es) idx (Bad msg) =
1041   (nl, il, es { esFailed = (idx, msg):esFailed es})
1042 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1043   (nl, il, es { esMoved = new_elem:esMoved es
1044               , esOpCodes = opcodes:esOpCodes es })
1045     where inst = Container.find idx il
1046           new_elem = (idx,
1047                       instancePriGroup nl inst,
1048                       Instance.allNodes inst)
1049
1050 -- | Node-evacuation IAllocator mode main function.
1051 tryNodeEvac :: Group.List    -- ^ The cluster groups
1052             -> Node.List     -- ^ The node list (cluster-wide, not per group)
1053             -> Instance.List -- ^ Instance list (cluster-wide)
1054             -> EvacMode      -- ^ The evacuation mode
1055             -> [Idx]         -- ^ List of instance (indices) to be evacuated
1056             -> Result (Node.List, Instance.List, EvacSolution)
1057 tryNodeEvac _ ini_nl ini_il mode idxs =
1058   let evac_ndx = nodesToEvacuate ini_il mode idxs
1059       offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1060       excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1061       group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1062                                            (Container.elems nl))) $
1063                   splitCluster ini_nl ini_il
1064       (fin_nl, fin_il, esol) =
1065         foldl' (\state@(nl, il, _) inst ->
1066                   let gdx = instancePriGroup nl inst
1067                       pdx = Instance.pNode inst in
1068                   updateEvacSolution state (Instance.idx inst) $
1069                   availableGroupNodes group_ndx
1070                     (IntSet.insert pdx excl_ndx) gdx >>=
1071                       nodeEvacInstance nl il mode inst gdx
1072                )
1073         (ini_nl, ini_il, emptyEvacSolution)
1074         (map (`Container.find` ini_il) idxs)
1075   in return (fin_nl, fin_il, reverseEvacSolution esol)
1076
1077 -- | Change-group IAllocator mode main function.
1078 --
1079 -- This is very similar to 'tryNodeEvac', the only difference is that
1080 -- we don't choose as target group the current instance group, but
1081 -- instead:
1082 --
1083 --   1. at the start of the function, we compute which are the target
1084 --   groups; either no groups were passed in, in which case we choose
1085 --   all groups out of which we don't evacuate instance, or there were
1086 --   some groups passed, in which case we use those
1087 --
1088 --   2. for each instance, we use 'findBestAllocGroup' to choose the
1089 --   best group to hold the instance, and then we do what
1090 --   'tryNodeEvac' does, except for this group instead of the current
1091 --   instance group.
1092 --
1093 -- Note that the correct behaviour of this function relies on the
1094 -- function 'nodeEvacInstance' to be able to do correctly both
1095 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1096 tryChangeGroup :: Group.List    -- ^ The cluster groups
1097                -> Node.List     -- ^ The node list (cluster-wide)
1098                -> Instance.List -- ^ Instance list (cluster-wide)
1099                -> [Gdx]         -- ^ Target groups; if empty, any
1100                                 -- groups not being evacuated
1101                -> [Idx]         -- ^ List of instance (indices) to be evacuated
1102                -> Result (Node.List, Instance.List, EvacSolution)
1103 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1104   let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1105                              flip Container.find ini_il) idxs
1106       target_gdxs = (if null gdxs
1107                        then Container.keys gl
1108                        else gdxs) \\ evac_gdxs
1109       offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1110       excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1111       group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1112                                            (Container.elems nl))) $
1113                   splitCluster ini_nl ini_il
1114       (fin_nl, fin_il, esol) =
1115         foldl' (\state@(nl, il, _) inst ->
1116                   let solution = do
1117                         let ncnt = Instance.requiredNodes $
1118                                    Instance.diskTemplate inst
1119                         (gdx, _, _) <- findBestAllocGroup gl nl il
1120                                        (Just target_gdxs) inst ncnt
1121                         av_nodes <- availableGroupNodes group_ndx
1122                                     excl_ndx gdx
1123                         nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1124                   in updateEvacSolution state (Instance.idx inst) solution
1125                )
1126         (ini_nl, ini_il, emptyEvacSolution)
1127         (map (`Container.find` ini_il) idxs)
1128   in return (fin_nl, fin_il, reverseEvacSolution esol)
1129
1130 -- | Standard-sized allocation method.
1131 --
1132 -- This places instances of the same size on the cluster until we're
1133 -- out of space. The result will be a list of identically-sized
1134 -- instances.
1135 iterateAlloc :: AllocMethod
1136 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1137   let depth = length ixes
1138       newname = printf "new-%d" depth::String
1139       newidx = Container.size il
1140       newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1141       newlimit = fmap (flip (-) 1) limit
1142   in case tryAlloc nl il newi2 allocnodes of
1143        Bad s -> Bad s
1144        Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1145          let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1146          case sols3 of
1147            Nothing -> newsol
1148            Just (xnl, xi, _, _) ->
1149              if limit == Just 0
1150                then newsol
1151                else iterateAlloc xnl (Container.add newidx xi il)
1152                       newlimit newinst allocnodes (xi:ixes)
1153                       (totalResources xnl:cstats)
1154
1155 -- | Tiered allocation method.
1156 --
1157 -- This places instances on the cluster, and decreases the spec until
1158 -- we can allocate again. The result will be a list of decreasing
1159 -- instance specs.
1160 tieredAlloc :: AllocMethod
1161 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1162   case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1163     Bad s -> Bad s
1164     Ok (errs, nl', il', ixes', cstats') ->
1165       let newsol = Ok (errs, nl', il', ixes', cstats')
1166           ixes_cnt = length ixes'
1167           (stop, newlimit) = case limit of
1168                                Nothing -> (False, Nothing)
1169                                Just n -> (n <= ixes_cnt,
1170                                             Just (n - ixes_cnt)) in
1171       if stop then newsol else
1172           case Instance.shrinkByType newinst . fst . last $
1173                sortBy (comparing snd) errs of
1174             Bad _ -> newsol
1175             Ok newinst' -> tieredAlloc nl' il' newlimit
1176                            newinst' allocnodes ixes' cstats'
1177
1178 -- * Formatting functions
1179
1180 -- | Given the original and final nodes, computes the relocation description.
1181 computeMoves :: Instance.Instance -- ^ The instance to be moved
1182              -> String -- ^ The instance name
1183              -> IMove  -- ^ The move being performed
1184              -> String -- ^ New primary
1185              -> String -- ^ New secondary
1186              -> (String, [String])
1187                 -- ^ Tuple of moves and commands list; moves is containing
1188                 -- either @/f/@ for failover or @/r:name/@ for replace
1189                 -- secondary, while the command list holds gnt-instance
1190                 -- commands (without that prefix), e.g \"@failover instance1@\"
1191 computeMoves i inam mv c d =
1192   case mv of
1193     Failover -> ("f", [mig])
1194     FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1195     ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1196     ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1197     ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1198   where morf = if Instance.instanceRunning i then "migrate" else "failover"
1199         mig = printf "%s -f %s" morf inam::String
1200         rep n = printf "replace-disks -n %s %s" n inam
1201
1202 -- | Converts a placement to string format.
1203 printSolutionLine :: Node.List     -- ^ The node list
1204                   -> Instance.List -- ^ The instance list
1205                   -> Int           -- ^ Maximum node name length
1206                   -> Int           -- ^ Maximum instance name length
1207                   -> Placement     -- ^ The current placement
1208                   -> Int           -- ^ The index of the placement in
1209                                    -- the solution
1210                   -> (String, [String])
1211 printSolutionLine nl il nmlen imlen plc pos =
1212   let pmlen = (2*nmlen + 1)
1213       (i, p, s, mv, c) = plc
1214       inst = Container.find i il
1215       inam = Instance.alias inst
1216       npri = Node.alias $ Container.find p nl
1217       nsec = Node.alias $ Container.find s nl
1218       opri = Node.alias $ Container.find (Instance.pNode inst) nl
1219       osec = Node.alias $ Container.find (Instance.sNode inst) nl
1220       (moves, cmds) =  computeMoves inst inam mv npri nsec
1221       ostr = printf "%s:%s" opri osec::String
1222       nstr = printf "%s:%s" npri nsec::String
1223   in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
1224       pos imlen inam pmlen ostr
1225       pmlen nstr c moves,
1226       cmds)
1227
1228 -- | Return the instance and involved nodes in an instance move.
1229 --
1230 -- Note that the output list length can vary, and is not required nor
1231 -- guaranteed to be of any specific length.
1232 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1233                                -- the instance from its index; note
1234                                -- that this /must/ be the original
1235                                -- instance list, so that we can
1236                                -- retrieve the old nodes
1237               -> Placement     -- ^ The placement we're investigating,
1238                                -- containing the new nodes and
1239                                -- instance index
1240               -> [Ndx]         -- ^ Resulting list of node indices
1241 involvedNodes il plc =
1242   let (i, np, ns, _, _) = plc
1243       inst = Container.find i il
1244   in nub $ [np, ns] ++ Instance.allNodes inst
1245
1246 -- | Inner function for splitJobs, that either appends the next job to
1247 -- the current jobset, or starts a new jobset.
1248 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1249 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1250 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1251   | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1252   | otherwise = ([n]:cjs, ndx)
1253
1254 -- | Break a list of moves into independent groups. Note that this
1255 -- will reverse the order of jobs.
1256 splitJobs :: [MoveJob] -> [JobSet]
1257 splitJobs = fst . foldl mergeJobs ([], [])
1258
1259 -- | Given a list of commands, prefix them with @gnt-instance@ and
1260 -- also beautify the display a little.
1261 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1262 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1263   let out =
1264         printf "  echo job %d/%d" jsn sn:
1265         printf "  check":
1266         map ("  gnt-instance " ++) cmds
1267   in if sn == 1
1268        then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1269        else out
1270
1271 -- | Given a list of commands, prefix them with @gnt-instance@ and
1272 -- also beautify the display a little.
1273 formatCmds :: [JobSet] -> String
1274 formatCmds =
1275   unlines .
1276   concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1277                            (zip [1..] js)) .
1278   zip [1..]
1279
1280 -- | Print the node list.
1281 printNodes :: Node.List -> [String] -> String
1282 printNodes nl fs =
1283   let fields = case fs of
1284                  [] -> Node.defaultFields
1285                  "+":rest -> Node.defaultFields ++ rest
1286                  _ -> fs
1287       snl = sortBy (comparing Node.idx) (Container.elems nl)
1288       (header, isnum) = unzip $ map Node.showHeader fields
1289   in unlines . map ((:) ' ' .  unwords) $
1290      formatTable (header:map (Node.list fields) snl) isnum
1291
1292 -- | Print the instance list.
1293 printInsts :: Node.List -> Instance.List -> String
1294 printInsts nl il =
1295   let sil = sortBy (comparing Instance.idx) (Container.elems il)
1296       helper inst = [ if Instance.instanceRunning inst then "R" else " "
1297                     , Instance.name inst
1298                     , Container.nameOf nl (Instance.pNode inst)
1299                     , let sdx = Instance.sNode inst
1300                       in if sdx == Node.noSecondary
1301                            then  ""
1302                            else Container.nameOf nl sdx
1303                     , if Instance.autoBalance inst then "Y" else "N"
1304                     , printf "%3d" $ Instance.vcpus inst
1305                     , printf "%5d" $ Instance.mem inst
1306                     , printf "%5d" $ Instance.dsk inst `div` 1024
1307                     , printf "%5.3f" lC
1308                     , printf "%5.3f" lM
1309                     , printf "%5.3f" lD
1310                     , printf "%5.3f" lN
1311                     ]
1312           where DynUtil lC lM lD lN = Instance.util inst
1313       header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1314                , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1315       isnum = False:False:False:False:False:repeat True
1316   in unlines . map ((:) ' ' . unwords) $
1317      formatTable (header:map helper sil) isnum
1318
1319 -- | Shows statistics for a given node list.
1320 printStats :: Node.List -> String
1321 printStats nl =
1322   let dcvs = compDetailedCV $ Container.elems nl
1323       (weights, names) = unzip detailedCVInfo
1324       hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1325       formatted = map (\(w, header, val) ->
1326                          printf "%s=%.8f(x%.2f)" header val w::String) hd
1327   in intercalate ", " formatted
1328
1329 -- | Convert a placement into a list of OpCodes (basically a job).
1330 iMoveToJob :: Node.List        -- ^ The node list; only used for node
1331                                -- names, so any version is good
1332                                -- (before or after the operation)
1333            -> Instance.List    -- ^ The instance list; also used for
1334                                -- names only
1335            -> Idx              -- ^ The index of the instance being
1336                                -- moved
1337            -> IMove            -- ^ The actual move to be described
1338            -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1339                                -- the given move
1340 iMoveToJob nl il idx move =
1341   let inst = Container.find idx il
1342       iname = Instance.name inst
1343       lookNode  = Just . Container.nameOf nl
1344       opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1345       opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1346               OpCodes.ReplaceNewSecondary [] Nothing
1347   in case move of
1348        Failover -> [ opF ]
1349        ReplacePrimary np -> [ opF, opR np, opF ]
1350        ReplaceSecondary ns -> [ opR ns ]
1351        ReplaceAndFailover np -> [ opR np, opF ]
1352        FailoverAndReplace ns -> [ opF, opR ns ]
1353
1354 -- * Node group functions
1355
1356 -- | Computes the group of an instance.
1357 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1358 instanceGroup nl i =
1359   let sidx = Instance.sNode i
1360       pnode = Container.find (Instance.pNode i) nl
1361       snode = if sidx == Node.noSecondary
1362               then pnode
1363               else Container.find sidx nl
1364       pgroup = Node.group pnode
1365       sgroup = Node.group snode
1366   in if pgroup /= sgroup
1367        then fail ("Instance placed accross two node groups, primary " ++
1368                   show pgroup ++ ", secondary " ++ show sgroup)
1369        else return pgroup
1370
1371 -- | Computes the group of an instance per the primary node.
1372 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1373 instancePriGroup nl i =
1374   let pnode = Container.find (Instance.pNode i) nl
1375   in  Node.group pnode
1376
1377 -- | Compute the list of badly allocated instances (split across node
1378 -- groups).
1379 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1380 findSplitInstances nl =
1381   filter (not . isOk . instanceGroup nl) . Container.elems
1382
1383 -- | Splits a cluster into the component node groups.
1384 splitCluster :: Node.List -> Instance.List ->
1385                 [(Gdx, (Node.List, Instance.List))]
1386 splitCluster nl il =
1387   let ngroups = Node.computeGroups (Container.elems nl)
1388   in map (\(guuid, nodes) ->
1389            let nidxs = map Node.idx nodes
1390                nodes' = zip nidxs nodes
1391                instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1392            in (guuid, (Container.fromList nodes', instances))) ngroups
1393
1394 -- | Compute the list of nodes that are to be evacuated, given a list
1395 -- of instances and an evacuation mode.
1396 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1397                 -> EvacMode      -- ^ The evacuation mode we're using
1398                 -> [Idx]         -- ^ List of instance indices being evacuated
1399                 -> IntSet.IntSet -- ^ Set of node indices
1400 nodesToEvacuate il mode =
1401   IntSet.delete Node.noSecondary .
1402   foldl' (\ns idx ->
1403             let i = Container.find idx il
1404                 pdx = Instance.pNode i
1405                 sdx = Instance.sNode i
1406                 dt = Instance.diskTemplate i
1407                 withSecondary = case dt of
1408                                   DTDrbd8 -> IntSet.insert sdx ns
1409                                   _ -> ns
1410             in case mode of
1411                  ChangePrimary   -> IntSet.insert pdx ns
1412                  ChangeSecondary -> withSecondary
1413                  ChangeAll       -> IntSet.insert pdx withSecondary
1414          ) IntSet.empty