1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
36 -- * Generic functions
38 , computeAllocationDelta
39 -- * First phase functions
41 -- * Second phase functions
46 -- * Display functions
49 -- * Balacing functions
57 -- * IAllocator functions
66 -- * Allocation functions
70 -- * Node group functions
76 import Data.Function (on)
78 import Data.Ord (comparing)
79 import Text.Printf (printf)
81 import Control.Parallel.Strategies
83 import qualified Ganeti.HTools.Container as Container
84 import qualified Ganeti.HTools.Instance as Instance
85 import qualified Ganeti.HTools.Node as Node
86 import qualified Ganeti.HTools.Group as Group
87 import Ganeti.HTools.Types
88 import Ganeti.HTools.Utils
89 import qualified Ganeti.OpCodes as OpCodes
93 -- | Allocation\/relocation solution.
94 data AllocSolution = AllocSolution
95 { asFailures :: [FailMode] -- ^ Failure counts
96 , asAllocs :: Int -- ^ Good allocation count
97 , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
98 -- of the list depends on the
99 -- allocation/relocation mode
100 , asLog :: [String] -- ^ A list of informational messages
103 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
104 type AllocResult = (FailStats, Node.List, Instance.List,
105 [Instance.Instance], [CStats])
108 -- | A type denoting the valid allocation mode/pairs.
110 -- For a one-node allocation, this will be a @Left ['Node.Node']@,
111 -- whereas for a two-node allocation, this will be a @Right
112 -- [('Node.Node', 'Node.Node')]@.
113 type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
115 -- | The empty solution we start with when computing allocations.
116 emptySolution :: AllocSolution
117 emptySolution = AllocSolution { asFailures = [], asAllocs = 0
118 , asSolutions = [], asLog = [] }
120 -- | The complete state for the balancing solution.
121 data Table = Table Node.List Instance.List Score [Placement]
122 deriving (Show, Read)
124 data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
125 , csFdsk :: Integer -- ^ Cluster free disk
126 , csAmem :: Integer -- ^ Cluster allocatable mem
127 , csAdsk :: Integer -- ^ Cluster allocatable disk
128 , csAcpu :: Integer -- ^ Cluster allocatable cpus
129 , csMmem :: Integer -- ^ Max node allocatable mem
130 , csMdsk :: Integer -- ^ Max node allocatable disk
131 , csMcpu :: Integer -- ^ Max node allocatable cpu
132 , csImem :: Integer -- ^ Instance used mem
133 , csIdsk :: Integer -- ^ Instance used disk
134 , csIcpu :: Integer -- ^ Instance used cpu
135 , csTmem :: Double -- ^ Cluster total mem
136 , csTdsk :: Double -- ^ Cluster total disk
137 , csTcpu :: Double -- ^ Cluster total cpus
138 , csVcpu :: Integer -- ^ Cluster virtual cpus (if
139 -- node pCpu has been set,
141 , csXmem :: Integer -- ^ Unnacounted for mem
142 , csNmem :: Integer -- ^ Node own memory
143 , csScore :: Score -- ^ The cluster score
144 , csNinst :: Int -- ^ The total number of instances
146 deriving (Show, Read)
148 -- | Currently used, possibly to allocate, unallocable.
149 type AllocStats = (RSpec, RSpec, RSpec)
151 -- * Utility functions
153 -- | Verifies the N+1 status and return the affected nodes.
154 verifyN1 :: [Node.Node] -> [Node.Node]
155 verifyN1 = filter Node.failN1
157 {-| Computes the pair of bad nodes and instances.
159 The bad node list is computed via a simple 'verifyN1' check, and the
160 bad instance list is the list of primary and secondary instances of
164 computeBadItems :: Node.List -> Instance.List ->
165 ([Node.Node], [Instance.Instance])
166 computeBadItems nl il =
167 let bad_nodes = verifyN1 $ getOnline nl
168 bad_instances = map (`Container.find` il) .
170 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
172 (bad_nodes, bad_instances)
174 -- | Zero-initializer for the CStats type.
175 emptyCStats :: CStats
176 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
178 -- | Update stats with data from a new node.
179 updateCStats :: CStats -> Node.Node -> CStats
180 updateCStats cs node =
181 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
182 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
183 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
184 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
185 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
187 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
190 inc_amem = Node.fMem node - Node.rMem node
191 inc_amem' = if inc_amem > 0 then inc_amem else 0
192 inc_adsk = Node.availDisk node
193 inc_imem = truncate (Node.tMem node) - Node.nMem node
194 - Node.xMem node - Node.fMem node
195 inc_icpu = Node.uCpu node
196 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
197 inc_vcpu = Node.hiCpu node
198 inc_acpu = Node.availCpu node
200 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
201 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
202 , csAmem = x_amem + fromIntegral inc_amem'
203 , csAdsk = x_adsk + fromIntegral inc_adsk
204 , csAcpu = x_acpu + fromIntegral inc_acpu
205 , csMmem = max x_mmem (fromIntegral inc_amem')
206 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
207 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
208 , csImem = x_imem + fromIntegral inc_imem
209 , csIdsk = x_idsk + fromIntegral inc_idsk
210 , csIcpu = x_icpu + fromIntegral inc_icpu
211 , csTmem = x_tmem + Node.tMem node
212 , csTdsk = x_tdsk + Node.tDsk node
213 , csTcpu = x_tcpu + Node.tCpu node
214 , csVcpu = x_vcpu + fromIntegral inc_vcpu
215 , csXmem = x_xmem + fromIntegral (Node.xMem node)
216 , csNmem = x_nmem + fromIntegral (Node.nMem node)
217 , csNinst = x_ninst + length (Node.pList node)
220 -- | Compute the total free disk and memory in the cluster.
221 totalResources :: Node.List -> CStats
223 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
224 in cs { csScore = compCV nl }
226 -- | Compute the delta between two cluster state.
228 -- This is used when doing allocations, to understand better the
229 -- available cluster resources. The return value is a triple of the
230 -- current used values, the delta that was still allocated, and what
231 -- was left unallocated.
232 computeAllocationDelta :: CStats -> CStats -> AllocStats
233 computeAllocationDelta cini cfin =
234 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
235 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
236 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
237 rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
238 (fromIntegral i_idsk)
239 rfin = RSpec (fromIntegral (f_icpu - i_icpu))
240 (fromIntegral (f_imem - i_imem))
241 (fromIntegral (f_idsk - i_idsk))
242 un_cpu = fromIntegral (v_cpu - f_icpu)::Int
243 runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
244 (truncate t_dsk - fromIntegral f_idsk)
245 in (rini, rfin, runa)
247 -- | The names and weights of the individual elements in the CV list.
248 detailedCVInfo :: [(Double, String)]
249 detailedCVInfo = [ (1, "free_mem_cv")
250 , (1, "free_disk_cv")
252 , (1, "reserved_mem_cv")
253 , (4, "offline_all_cnt")
254 , (16, "offline_pri_cnt")
255 , (1, "vcpu_ratio_cv")
258 , (1, "disk_load_cv")
260 , (2, "pri_tags_score")
263 detailedCVWeights :: [Double]
264 detailedCVWeights = map fst detailedCVInfo
266 -- | Compute the mem and disk covariance.
267 compDetailedCV :: Node.List -> [Double]
270 all_nodes = Container.elems nl
271 (offline, nodes) = partition Node.offline all_nodes
272 mem_l = map Node.pMem nodes
273 dsk_l = map Node.pDsk nodes
274 -- metric: memory covariance
275 mem_cv = stdDev mem_l
276 -- metric: disk covariance
277 dsk_cv = stdDev dsk_l
278 -- metric: count of instances living on N1 failing nodes
279 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
280 length (Node.pList n)) .
281 filter Node.failN1 $ nodes :: Double
282 res_l = map Node.pRem nodes
283 -- metric: reserved memory covariance
284 res_cv = stdDev res_l
285 -- offline instances metrics
286 offline_ipri = sum . map (length . Node.pList) $ offline
287 offline_isec = sum . map (length . Node.sList) $ offline
288 -- metric: count of instances on offline nodes
289 off_score = fromIntegral (offline_ipri + offline_isec)::Double
290 -- metric: count of primary instances on offline nodes (this
291 -- helps with evacuation/failover of primary instances on
292 -- 2-node clusters with one node offline)
293 off_pri_score = fromIntegral offline_ipri::Double
294 cpu_l = map Node.pCpu nodes
295 -- metric: covariance of vcpu/pcpu ratio
296 cpu_cv = stdDev cpu_l
297 -- metrics: covariance of cpu, memory, disk and network load
298 (c_load, m_load, d_load, n_load) = unzip4 $
300 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
301 DynUtil c2 m2 d2 n2 = Node.utilPool n
302 in (c1/c2, m1/m2, d1/d2, n1/n2)
304 -- metric: conflicting instance count
305 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
306 pri_tags_score = fromIntegral pri_tags_inst::Double
307 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
308 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
311 -- | Compute the /total/ variance.
312 compCV :: Node.List -> Double
313 compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
315 -- | Compute online nodes from a 'Node.List'.
316 getOnline :: Node.List -> [Node.Node]
317 getOnline = filter (not . Node.offline) . Container.elems
319 -- * Balancing functions
321 -- | Compute best table. Note that the ordering of the arguments is important.
322 compareTables :: Table -> Table -> Table
323 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
324 if a_cv > b_cv then b else a
326 -- | Applies an instance move to a given node list and instance.
327 applyMove :: Node.List -> Instance.Instance
328 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
330 applyMove nl inst Failover =
331 let old_pdx = Instance.pNode inst
332 old_sdx = Instance.sNode inst
333 old_p = Container.find old_pdx nl
334 old_s = Container.find old_sdx nl
335 int_p = Node.removePri old_p inst
336 int_s = Node.removeSec old_s inst
337 force_p = Node.offline old_p
338 new_nl = do -- Maybe monad
339 new_p <- Node.addPriEx force_p int_s inst
340 new_s <- Node.addSec int_p inst old_sdx
341 let new_inst = Instance.setBoth inst old_sdx old_pdx
342 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
343 new_inst, old_sdx, old_pdx)
346 -- Replace the primary (f:, r:np, f)
347 applyMove nl inst (ReplacePrimary new_pdx) =
348 let old_pdx = Instance.pNode inst
349 old_sdx = Instance.sNode inst
350 old_p = Container.find old_pdx nl
351 old_s = Container.find old_sdx nl
352 tgt_n = Container.find new_pdx nl
353 int_p = Node.removePri old_p inst
354 int_s = Node.removeSec old_s inst
355 force_p = Node.offline old_p
356 new_nl = do -- Maybe monad
357 -- check that the current secondary can host the instance
358 -- during the migration
359 tmp_s <- Node.addPriEx force_p int_s inst
360 let tmp_s' = Node.removePri tmp_s inst
361 new_p <- Node.addPriEx force_p tgt_n inst
362 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
363 let new_inst = Instance.setPri inst new_pdx
364 return (Container.add new_pdx new_p $
365 Container.addTwo old_pdx int_p old_sdx new_s nl,
366 new_inst, new_pdx, old_sdx)
369 -- Replace the secondary (r:ns)
370 applyMove nl inst (ReplaceSecondary new_sdx) =
371 let old_pdx = Instance.pNode inst
372 old_sdx = Instance.sNode inst
373 old_s = Container.find old_sdx nl
374 tgt_n = Container.find new_sdx nl
375 int_s = Node.removeSec old_s inst
376 force_s = Node.offline old_s
377 new_inst = Instance.setSec inst new_sdx
378 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
379 \new_s -> return (Container.addTwo new_sdx
380 new_s old_sdx int_s nl,
381 new_inst, old_pdx, new_sdx)
384 -- Replace the secondary and failover (r:np, f)
385 applyMove nl inst (ReplaceAndFailover new_pdx) =
386 let old_pdx = Instance.pNode inst
387 old_sdx = Instance.sNode inst
388 old_p = Container.find old_pdx nl
389 old_s = Container.find old_sdx nl
390 tgt_n = Container.find new_pdx nl
391 int_p = Node.removePri old_p inst
392 int_s = Node.removeSec old_s inst
393 force_s = Node.offline old_s
394 new_nl = do -- Maybe monad
395 new_p <- Node.addPri tgt_n inst
396 new_s <- Node.addSecEx force_s int_p inst new_pdx
397 let new_inst = Instance.setBoth inst new_pdx old_pdx
398 return (Container.add new_pdx new_p $
399 Container.addTwo old_pdx new_s old_sdx int_s nl,
400 new_inst, new_pdx, old_pdx)
403 -- Failver and replace the secondary (f, r:ns)
404 applyMove nl inst (FailoverAndReplace new_sdx) =
405 let old_pdx = Instance.pNode inst
406 old_sdx = Instance.sNode inst
407 old_p = Container.find old_pdx nl
408 old_s = Container.find old_sdx nl
409 tgt_n = Container.find new_sdx nl
410 int_p = Node.removePri old_p inst
411 int_s = Node.removeSec old_s inst
412 force_p = Node.offline old_p
413 new_nl = do -- Maybe monad
414 new_p <- Node.addPriEx force_p int_s inst
415 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
416 let new_inst = Instance.setBoth inst old_sdx new_sdx
417 return (Container.add new_sdx new_s $
418 Container.addTwo old_sdx new_p old_pdx int_p nl,
419 new_inst, old_sdx, new_sdx)
422 -- | Tries to allocate an instance on one given node.
423 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
424 -> OpResult Node.AllocElement
425 allocateOnSingle nl inst new_pdx =
426 let p = Container.find new_pdx nl
427 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
428 in Node.addPri p inst >>= \new_p -> do
429 let new_nl = Container.add new_pdx new_p nl
430 new_score = compCV nl
431 return (new_nl, new_inst, [new_p], new_score)
433 -- | Tries to allocate an instance on a given pair of nodes.
434 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
435 -> OpResult Node.AllocElement
436 allocateOnPair nl inst new_pdx new_sdx =
437 let tgt_p = Container.find new_pdx nl
438 tgt_s = Container.find new_sdx nl
440 new_p <- Node.addPri tgt_p inst
441 new_s <- Node.addSec tgt_s inst new_pdx
442 let new_inst = Instance.setBoth inst new_pdx new_sdx
443 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
444 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
446 -- | Tries to perform an instance move and returns the best table
447 -- between the original one and the new one.
448 checkSingleStep :: Table -- ^ The original table
449 -> Instance.Instance -- ^ The instance to move
450 -> Table -- ^ The current best table
451 -> IMove -- ^ The move to apply
452 -> Table -- ^ The final best table
453 checkSingleStep ini_tbl target cur_tbl move =
455 Table ini_nl ini_il _ ini_plc = ini_tbl
456 tmp_resu = applyMove ini_nl target move
460 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
461 let tgt_idx = Instance.idx target
462 upd_cvar = compCV upd_nl
463 upd_il = Container.add tgt_idx new_inst ini_il
464 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
465 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
467 compareTables cur_tbl upd_tbl
469 -- | Given the status of the current secondary as a valid new node and
470 -- the current candidate target node, generate the possible moves for
472 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
473 -> Bool -- ^ Whether we can change the primary node
474 -> Ndx -- ^ Target node candidate
475 -> [IMove] -- ^ List of valid result moves
477 possibleMoves _ False tdx =
478 [ReplaceSecondary tdx]
480 possibleMoves True True tdx =
481 [ReplaceSecondary tdx,
482 ReplaceAndFailover tdx,
484 FailoverAndReplace tdx]
486 possibleMoves False True tdx =
487 [ReplaceSecondary tdx,
488 ReplaceAndFailover tdx]
490 -- | Compute the best move for a given instance.
491 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
492 -> Bool -- ^ Whether disk moves are allowed
493 -> Bool -- ^ Whether instance moves are allowed
494 -> Table -- ^ Original table
495 -> Instance.Instance -- ^ Instance to move
496 -> Table -- ^ Best new table for this instance
497 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
499 opdx = Instance.pNode target
500 osdx = Instance.sNode target
501 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
502 use_secondary = elem osdx nodes_idx && inst_moves
503 aft_failover = if use_secondary -- if allowed to failover
504 then checkSingleStep ini_tbl target ini_tbl Failover
506 all_moves = if disk_moves
508 (possibleMoves use_secondary inst_moves) nodes
511 -- iterate over the possible nodes for this instance
512 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
514 -- | Compute the best next move.
515 checkMove :: [Ndx] -- ^ Allowed target node indices
516 -> Bool -- ^ Whether disk moves are allowed
517 -> Bool -- ^ Whether instance moves are allowed
518 -> Table -- ^ The current solution
519 -> [Instance.Instance] -- ^ List of instances still to move
520 -> Table -- ^ The new solution
521 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
522 let Table _ _ _ ini_plc = ini_tbl
523 -- we're using rwhnf from the Control.Parallel.Strategies
524 -- package; we don't need to use rnf as that would force too
525 -- much evaluation in single-threaded cases, and in
526 -- multi-threaded case the weak head normal form is enough to
527 -- spark the evaluation
528 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
531 -- iterate over all instances, computing the best move
532 best_tbl = foldl' compareTables ini_tbl tables
533 Table _ _ _ best_plc = best_tbl
534 in if length best_plc == length ini_plc
535 then ini_tbl -- no advancement
538 -- | Check if we are allowed to go deeper in the balancing.
539 doNextBalance :: Table -- ^ The starting table
540 -> Int -- ^ Remaining length
541 -> Score -- ^ Score at which to stop
542 -> Bool -- ^ The resulting table and commands
543 doNextBalance ini_tbl max_rounds min_score =
544 let Table _ _ ini_cv ini_plc = ini_tbl
545 ini_plc_len = length ini_plc
546 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
548 -- | Run a balance move.
549 tryBalance :: Table -- ^ The starting table
550 -> Bool -- ^ Allow disk moves
551 -> Bool -- ^ Allow instance moves
552 -> Bool -- ^ Only evacuate moves
553 -> Score -- ^ Min gain threshold
554 -> Score -- ^ Min gain
555 -> Maybe Table -- ^ The resulting table and commands
556 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
557 let Table ini_nl ini_il ini_cv _ = ini_tbl
558 all_inst = Container.elems ini_il
559 all_inst' = if evac_mode
560 then let bad_nodes = map Node.idx . filter Node.offline $
561 Container.elems ini_nl
562 in filter (\e -> Instance.sNode e `elem` bad_nodes ||
563 Instance.pNode e `elem` bad_nodes)
566 reloc_inst = filter Instance.movable all_inst'
567 node_idx = map Node.idx . filter (not . Node.offline) $
568 Container.elems ini_nl
569 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
570 (Table _ _ fin_cv _) = fin_tbl
572 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
573 then Just fin_tbl -- this round made success, return the new table
576 -- * Allocation functions
578 -- | Build failure stats out of a list of failures.
579 collapseFailures :: [FailMode] -> FailStats
580 collapseFailures flst =
581 map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
583 -- | Update current Allocation solution and failure stats with new
585 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
586 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
588 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
589 let -- Choose the old or new solution, based on the cluster score
591 osols = asSolutions as
592 nsols = case osols of
594 (_, _, _, oscore):[] ->
598 -- FIXME: here we simply concat to lists with more
599 -- than one element; we should instead abort, since
600 -- this is not a valid usage of this function
603 -- Note: we force evaluation of nsols here in order to keep the
604 -- memory profile low - we know that we will need nsols for sure
605 -- in the next cycle, so we force evaluation of nsols, since the
606 -- foldl' in the caller will only evaluate the tuple, but not the
607 -- elements of the tuple
608 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
610 -- | Sums two allocation solutions (e.g. for two separate node groups).
611 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
612 sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
613 AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
615 -- | Given a solution, generates a reasonable description for it.
616 describeSolution :: AllocSolution -> String
617 describeSolution as =
618 let fcnt = asFailures as
619 sols = asSolutions as
621 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
622 filter ((> 0) . snd) . collapseFailures $ fcnt
624 then "No valid allocation solutions, failure reasons: " ++
626 then "unknown reasons"
628 else let (_, _, nodes, cv) = head sols
629 in printf ("score: %.8f, successes %d, failures %d (%s)" ++
630 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
631 (intercalate "/" . map Node.name $ nodes)
633 -- | Annotates a solution with the appropriate string.
634 annotateSolution :: AllocSolution -> AllocSolution
635 annotateSolution as = as { asLog = describeSolution as : asLog as }
637 -- | Generate the valid node allocation singles or pairs for a new instance.
638 genAllocNodes :: Group.List -- ^ Group list
639 -> Node.List -- ^ The node map
640 -> Int -- ^ The number of nodes required
641 -> Bool -- ^ Whether to drop or not
643 -> Result AllocNodes -- ^ The (monadic) result
644 genAllocNodes gl nl count drop_unalloc =
645 let filter_fn = if drop_unalloc
646 then filter (Group.isAllocable .
647 flip Container.find gl . Node.group)
649 all_nodes = filter_fn $ getOnline nl
650 all_pairs = liftM2 (,) all_nodes all_nodes
651 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
652 Node.group x == Node.group y) all_pairs
654 1 -> Ok (Left (map Node.idx all_nodes))
655 2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
656 _ -> Bad "Unsupported number of nodes, only one or two supported"
658 -- | Try to allocate an instance on the cluster.
659 tryAlloc :: (Monad m) =>
660 Node.List -- ^ The node list
661 -> Instance.List -- ^ The instance list
662 -> Instance.Instance -- ^ The instance to allocate
663 -> AllocNodes -- ^ The allocation targets
664 -> m AllocSolution -- ^ Possible solution list
665 tryAlloc nl _ inst (Right ok_pairs) =
666 let sols = foldl' (\cstate (p, s) ->
667 concatAllocs cstate $ allocateOnPair nl inst p s
668 ) emptySolution ok_pairs
670 in if null ok_pairs -- means we have just one node
671 then fail "Not enough online nodes"
672 else return $ annotateSolution sols
674 tryAlloc nl _ inst (Left all_nodes) =
675 let sols = foldl' (\cstate ->
676 concatAllocs cstate . allocateOnSingle nl inst
677 ) emptySolution all_nodes
679 then fail "No online nodes"
680 else return $ annotateSolution sols
682 -- | Given a group/result, describe it as a nice (list of) messages.
683 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
684 solutionDescription gl (groupId, result) =
686 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
687 Bad message -> [printf "Group %s: error %s" gname message]
688 where grp = Container.find groupId gl
689 gname = Group.name grp
690 pol = apolToString (Group.allocPolicy grp)
692 -- | From a list of possibly bad and possibly empty solutions, filter
693 -- only the groups with a valid result. Note that the result will be
694 -- reversed compared to the original list.
695 filterMGResults :: Group.List
696 -> [(Gdx, Result AllocSolution)]
697 -> [(Gdx, AllocSolution)]
698 filterMGResults gl = foldl' fn []
699 where unallocable = not . Group.isAllocable . flip Container.find gl
700 fn accu (gdx, rasol) =
703 Ok sol | null (asSolutions sol) -> accu
704 | unallocable gdx -> accu
705 | otherwise -> (gdx, sol):accu
707 -- | Sort multigroup results based on policy and score.
708 sortMGResults :: Group.List
709 -> [(Gdx, AllocSolution)]
710 -> [(Gdx, AllocSolution)]
711 sortMGResults gl sols =
712 let extractScore (_, _, _, x) = x
713 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
714 (extractScore . head . asSolutions) sol)
715 in sortBy (comparing solScore) sols
717 -- | Try to allocate an instance on a multi-group cluster.
718 tryMGAlloc :: Group.List -- ^ The group list
719 -> Node.List -- ^ The node list
720 -> Instance.List -- ^ The instance list
721 -> Instance.Instance -- ^ The instance to allocate
722 -> Int -- ^ Required number of nodes
723 -> Result AllocSolution -- ^ Possible solution list
724 tryMGAlloc mggl mgnl mgil inst cnt =
725 let groups = splitCluster mgnl mgil
726 sols = map (\(gid, (nl, il)) ->
727 (gid, genAllocNodes mggl nl cnt False >>=
728 tryAlloc nl il inst))
729 groups::[(Gdx, Result AllocSolution)]
730 all_msgs = concatMap (solutionDescription mggl) sols
731 goodSols = filterMGResults mggl sols
732 sortedSols = sortMGResults mggl goodSols
733 in if null sortedSols
734 then Bad $ intercalate ", " all_msgs
735 else let (final_group, final_sol) = head sortedSols
736 final_name = Group.name $ Container.find final_group mggl
737 selmsg = "Selected group: " ++ final_name
738 in Ok $ final_sol { asLog = selmsg:all_msgs }
740 -- | Try to relocate an instance on the cluster.
741 tryReloc :: (Monad m) =>
742 Node.List -- ^ The node list
743 -> Instance.List -- ^ The instance list
744 -> Idx -- ^ The index of the instance to move
745 -> Int -- ^ The number of nodes required
746 -> [Ndx] -- ^ Nodes which should not be used
747 -> m AllocSolution -- ^ Solution list
748 tryReloc nl il xid 1 ex_idx =
749 let all_nodes = getOnline nl
750 inst = Container.find xid il
751 ex_idx' = Instance.pNode inst:ex_idx
752 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
753 valid_idxes = map Node.idx valid_nodes
754 sols1 = foldl' (\cstate x ->
757 applyMove nl inst (ReplaceSecondary x)
758 return (mnl, i, [Container.find x mnl],
760 in concatAllocs cstate em
761 ) emptySolution valid_idxes
764 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
765 \destinations required (" ++ show reqn ++
766 "), only one supported"
768 tryMGReloc :: (Monad m) =>
769 Group.List -- ^ The group list
770 -> Node.List -- ^ The node list
771 -> Instance.List -- ^ The instance list
772 -> Idx -- ^ The index of the instance to move
773 -> Int -- ^ The number of nodes required
774 -> [Ndx] -- ^ Nodes which should not be used
775 -> m AllocSolution -- ^ Solution list
776 tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
777 let groups = splitCluster mgnl mgil
778 -- TODO: we only relocate inside the group for now
779 inst = Container.find xid mgil
780 (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
781 Nothing -> fail $ "Cannot find group for instance " ++
784 tryReloc nl il xid ncount ex_ndx
786 -- | Change an instance's secondary node.
787 evacInstance :: (Monad m) =>
788 [Ndx] -- ^ Excluded nodes
789 -> Instance.List -- ^ The current instance list
790 -> (Node.List, AllocSolution) -- ^ The current state
791 -> Idx -- ^ The instance to evacuate
792 -> m (Node.List, AllocSolution)
793 evacInstance ex_ndx il (nl, old_as) idx = do
794 -- FIXME: hardcoded one node here
796 -- Longer explanation: evacuation is currently hardcoded to DRBD
797 -- instances (which have one secondary); hence, even if the
798 -- IAllocator protocol can request N nodes for an instance, and all
799 -- the message parsing/loading pass this, this implementation only
800 -- supports one; this situation needs to be revisited if we ever
801 -- support more than one secondary, or if we change the storage
803 new_as <- tryReloc nl il idx 1 ex_ndx
804 case asSolutions new_as of
805 -- an individual relocation succeeded, we kind of compose the data
806 -- from the two solutions
807 csol@(nl', _, _, _):_ ->
808 return (nl', new_as { asSolutions = csol:asSolutions old_as })
809 -- this relocation failed, so we fail the entire evac
810 _ -> fail $ "Can't evacuate instance " ++
811 Instance.name (Container.find idx il) ++
812 ": " ++ describeSolution new_as
814 -- | Try to evacuate a list of nodes.
815 tryEvac :: (Monad m) =>
816 Node.List -- ^ The node list
817 -> Instance.List -- ^ The instance list
818 -> [Idx] -- ^ Instances to be evacuated
819 -> [Ndx] -- ^ Restricted nodes (the ones being evacuated)
820 -> m AllocSolution -- ^ Solution list
821 tryEvac nl il idxs ex_ndx = do
822 (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptySolution) idxs
825 -- | Multi-group evacuation of a list of nodes.
826 tryMGEvac :: (Monad m) =>
827 Group.List -- ^ The group list
828 -> Node.List -- ^ The node list
829 -> Instance.List -- ^ The instance list
830 -> [Ndx] -- ^ Nodes to be evacuated
831 -> m AllocSolution -- ^ Solution list
832 tryMGEvac _ nl il ex_ndx =
833 let ex_nodes = map (`Container.find` nl) ex_ndx
834 all_insts = nub . concatMap Node.sList $ ex_nodes
835 all_insts' = associateIdxs all_insts $ splitCluster nl il
837 results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
839 let sol = foldl' sumAllocs emptySolution results
840 return $ annotateSolution sol
842 -- | Recursively place instances on the cluster until we're out of space.
843 iterateAlloc :: Node.List
847 -> [Instance.Instance]
849 -> Result AllocResult
850 iterateAlloc nl il newinst allocnodes ixes cstats =
851 let depth = length ixes
852 newname = printf "new-%d" depth::String
853 newidx = length (Container.elems il) + depth
854 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
855 in case tryAlloc nl il newi2 allocnodes of
857 Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
859 [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
860 (xnl, xi, _, _):[] ->
861 iterateAlloc xnl (Container.add newidx xi il)
862 newinst allocnodes (xi:ixes)
863 (totalResources xnl:cstats)
864 _ -> Bad "Internal error: multiple solutions for single\
867 -- | The core of the tiered allocation mode.
868 tieredAlloc :: Node.List
872 -> [Instance.Instance]
874 -> Result AllocResult
875 tieredAlloc nl il newinst allocnodes ixes cstats =
876 case iterateAlloc nl il newinst allocnodes ixes cstats of
878 Ok (errs, nl', il', ixes', cstats') ->
879 case Instance.shrinkByType newinst . fst . last $
880 sortBy (comparing snd) errs of
881 Bad _ -> Ok (errs, nl', il', ixes', cstats')
883 tieredAlloc nl' il' newinst' allocnodes ixes' cstats'
885 -- | Compute the tiered spec string description from a list of
886 -- allocated instances.
887 tieredSpecMap :: [Instance.Instance]
889 tieredSpecMap trl_ixes =
890 let fin_trl_ixes = reverse trl_ixes
891 ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
892 spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
894 in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
895 (rspecDsk spec) (rspecCpu spec) cnt) spec_map
897 -- * Formatting functions
899 -- | Given the original and final nodes, computes the relocation description.
900 computeMoves :: Instance.Instance -- ^ The instance to be moved
901 -> String -- ^ The instance name
902 -> IMove -- ^ The move being performed
903 -> String -- ^ New primary
904 -> String -- ^ New secondary
905 -> (String, [String])
906 -- ^ Tuple of moves and commands list; moves is containing
907 -- either @/f/@ for failover or @/r:name/@ for replace
908 -- secondary, while the command list holds gnt-instance
909 -- commands (without that prefix), e.g \"@failover instance1@\"
910 computeMoves i inam mv c d =
912 Failover -> ("f", [mig])
913 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
914 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
915 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
916 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
917 where morf = if Instance.running i then "migrate" else "failover"
918 mig = printf "%s -f %s" morf inam::String
919 rep n = printf "replace-disks -n %s %s" n inam
921 -- | Converts a placement to string format.
922 printSolutionLine :: Node.List -- ^ The node list
923 -> Instance.List -- ^ The instance list
924 -> Int -- ^ Maximum node name length
925 -> Int -- ^ Maximum instance name length
926 -> Placement -- ^ The current placement
927 -> Int -- ^ The index of the placement in
929 -> (String, [String])
930 printSolutionLine nl il nmlen imlen plc pos =
932 pmlen = (2*nmlen + 1)
933 (i, p, s, mv, c) = plc
934 inst = Container.find i il
935 inam = Instance.alias inst
936 npri = Node.alias $ Container.find p nl
937 nsec = Node.alias $ Container.find s nl
938 opri = Node.alias $ Container.find (Instance.pNode inst) nl
939 osec = Node.alias $ Container.find (Instance.sNode inst) nl
940 (moves, cmds) = computeMoves inst inam mv npri nsec
941 ostr = printf "%s:%s" opri osec::String
942 nstr = printf "%s:%s" npri nsec::String
944 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
945 pos imlen inam pmlen ostr
949 -- | Return the instance and involved nodes in an instance move.
950 involvedNodes :: Instance.List -> Placement -> [Ndx]
951 involvedNodes il plc =
952 let (i, np, ns, _, _) = plc
953 inst = Container.find i il
954 op = Instance.pNode inst
955 os = Instance.sNode inst
956 in nub [np, ns, op, os]
958 -- | Inner function for splitJobs, that either appends the next job to
959 -- the current jobset, or starts a new jobset.
960 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
961 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
962 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
963 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
964 | otherwise = ([n]:cjs, ndx)
966 -- | Break a list of moves into independent groups. Note that this
967 -- will reverse the order of jobs.
968 splitJobs :: [MoveJob] -> [JobSet]
969 splitJobs = fst . foldl mergeJobs ([], [])
971 -- | Given a list of commands, prefix them with @gnt-instance@ and
972 -- also beautify the display a little.
973 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
974 formatJob jsn jsl (sn, (_, _, _, cmds)) =
976 printf " echo job %d/%d" jsn sn:
978 map (" gnt-instance " ++) cmds
980 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
983 -- | Given a list of commands, prefix them with @gnt-instance@ and
984 -- also beautify the display a little.
985 formatCmds :: [JobSet] -> String
988 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
992 -- | Print the node list.
993 printNodes :: Node.List -> [String] -> String
995 let fields = case fs of
996 [] -> Node.defaultFields
997 "+":rest -> Node.defaultFields ++ rest
999 snl = sortBy (comparing Node.idx) (Container.elems nl)
1000 (header, isnum) = unzip $ map Node.showHeader fields
1001 in unlines . map ((:) ' ' . intercalate " ") $
1002 formatTable (header:map (Node.list fields) snl) isnum
1004 -- | Print the instance list.
1005 printInsts :: Node.List -> Instance.List -> String
1007 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1008 helper inst = [ if Instance.running inst then "R" else " "
1009 , Instance.name inst
1010 , Container.nameOf nl (Instance.pNode inst)
1011 , let sdx = Instance.sNode inst
1012 in if sdx == Node.noSecondary
1014 else Container.nameOf nl sdx
1015 , if Instance.autoBalance inst then "Y" else "N"
1016 , printf "%3d" $ Instance.vcpus inst
1017 , printf "%5d" $ Instance.mem inst
1018 , printf "%5d" $ Instance.dsk inst `div` 1024
1024 where DynUtil lC lM lD lN = Instance.util inst
1025 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1026 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1027 isnum = False:False:False:False:False:repeat True
1028 in unlines . map ((:) ' ' . intercalate " ") $
1029 formatTable (header:map helper sil) isnum
1031 -- | Shows statistics for a given node list.
1032 printStats :: Node.List -> String
1034 let dcvs = compDetailedCV nl
1035 (weights, names) = unzip detailedCVInfo
1036 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1037 formatted = map (\(w, header, val) ->
1038 printf "%s=%.8f(x%.2f)" header val w::String) hd
1039 in intercalate ", " formatted
1041 -- | Convert a placement into a list of OpCodes (basically a job).
1042 iMoveToJob :: Node.List -> Instance.List
1043 -> Idx -> IMove -> [OpCodes.OpCode]
1044 iMoveToJob nl il idx move =
1045 let inst = Container.find idx il
1046 iname = Instance.name inst
1047 lookNode = Just . Container.nameOf nl
1048 opF = OpCodes.OpInstanceMigrate iname True False True
1049 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1050 OpCodes.ReplaceNewSecondary [] Nothing
1053 ReplacePrimary np -> [ opF, opR np, opF ]
1054 ReplaceSecondary ns -> [ opR ns ]
1055 ReplaceAndFailover np -> [ opR np, opF ]
1056 FailoverAndReplace ns -> [ opF, opR ns ]
1058 -- * Node group functions
1060 -- | Computes the group of an instance.
1061 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1062 instanceGroup nl i =
1063 let sidx = Instance.sNode i
1064 pnode = Container.find (Instance.pNode i) nl
1065 snode = if sidx == Node.noSecondary
1067 else Container.find sidx nl
1068 pgroup = Node.group pnode
1069 sgroup = Node.group snode
1070 in if pgroup /= sgroup
1071 then fail ("Instance placed accross two node groups, primary " ++
1072 show pgroup ++ ", secondary " ++ show sgroup)
1075 -- | Computes the group of an instance per the primary node.
1076 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1077 instancePriGroup nl i =
1078 let pnode = Container.find (Instance.pNode i) nl
1081 -- | Compute the list of badly allocated instances (split across node
1083 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1084 findSplitInstances nl =
1085 filter (not . isOk . instanceGroup nl) . Container.elems
1087 -- | Splits a cluster into the component node groups.
1088 splitCluster :: Node.List -> Instance.List ->
1089 [(Gdx, (Node.List, Instance.List))]
1090 splitCluster nl il =
1091 let ngroups = Node.computeGroups (Container.elems nl)
1092 in map (\(guuid, nodes) ->
1093 let nidxs = map Node.idx nodes
1094 nodes' = zip nidxs nodes
1095 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1096 in (guuid, (Container.fromList nodes', instances))) ngroups
1098 -- | Split a global instance index map into per-group, and associate
1099 -- it with the group/node/instance lists.
1100 associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1101 -> [(Gdx, (Node.List, Instance.List))] -- ^ Input groups
1102 -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1103 associateIdxs idxs =
1104 map (\(gdx, (nl, il)) ->
1105 (gdx, (nl, il, filter (`Container.member` il) idxs)))