1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the "Main" module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
36 -- * Generic functions
38 , computeAllocationDelta
39 -- * First phase functions
41 -- * Second phase functions
46 -- * Display functions
-- * Balancing functions
57 -- * IAllocator functions
66 -- * Allocation functions
70 -- * Node group functions
76 import Data.Function (on)
78 import Data.Ord (comparing)
79 import Text.Printf (printf)
81 import Control.Parallel.Strategies
83 import qualified Ganeti.HTools.Container as Container
84 import qualified Ganeti.HTools.Instance as Instance
85 import qualified Ganeti.HTools.Node as Node
86 import qualified Ganeti.HTools.Group as Group
87 import Ganeti.HTools.Types
88 import Ganeti.HTools.Utils
89 import qualified Ganeti.OpCodes as OpCodes
-- | Allocation\/relocation solution.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode] -- ^ Failure counts
  , asAllocs :: Int -- ^ Good allocation count
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
                                       -- of the list depends on the
                                       -- allocation/relocation mode
  , asLog :: [String] -- ^ A list of informational messages
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc':
-- the failure counts, the updated node and instance lists, the
-- instances allocated so far, and the cluster statistics after each
-- allocation step.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

-- | A type denoting the valid allocation mode/pairs.
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx', 'Ndx')]@
-- (primary\/secondary index pairs).
type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
-- | The empty solution we start with when computing allocations.
emptySolution :: AllocSolution
emptySolution =
    AllocSolution { asFailures = []
                  , asAllocs = 0
                  , asSolutions = []
                  , asLog = []
                  }
-- | The complete state for the balancing solution: the node and
-- instance lists, the current cluster score, and the list of
-- placements done so far (newest first, see 'checkSingleStep').
data Table = Table Node.List Instance.List Score [Placement]
            deriving (Show, Read)
-- | Aggregate cluster statistics; built by folding 'updateCStats'
-- over all nodes (see 'totalResources').
data CStats = CStats { csFmem :: Int -- ^ Cluster free mem
                     , csFdsk :: Int -- ^ Cluster free disk
                     , csAmem :: Int -- ^ Cluster allocatable mem
                     , csAdsk :: Int -- ^ Cluster allocatable disk
                     , csAcpu :: Int -- ^ Cluster allocatable cpus
                     , csMmem :: Int -- ^ Max node allocatable mem
                     , csMdsk :: Int -- ^ Max node allocatable disk
                     , csMcpu :: Int -- ^ Max node allocatable cpu
                     , csImem :: Int -- ^ Instance used mem
                     , csIdsk :: Int -- ^ Instance used disk
                     , csIcpu :: Int -- ^ Instance used cpu
                     , csTmem :: Double -- ^ Cluster total mem
                     , csTdsk :: Double -- ^ Cluster total disk
                     , csTcpu :: Double -- ^ Cluster total cpus
                     , csVcpu :: Int -- ^ Cluster virtual cpus (if
                                     -- node pCpu has been set,
                     , csXmem :: Int -- ^ Unaccounted for mem
                     , csNmem :: Int -- ^ Node own memory
                     , csScore :: Score -- ^ The cluster score
                     , csNinst :: Int -- ^ The total number of instances
             deriving (Show, Read)
-- | Resource statistics triple: currently used, delta still
-- allocated, and what was left unallocable -- see
-- 'computeAllocationDelta' for how each component is derived.
type AllocStats = (RSpec, RSpec, RSpec)
150 -- * Utility functions
-- | Verifies the N+1 status and returns the affected (failing) nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [node | node <- nodes, Node.failN1 node]
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.
-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
    let bad_nodes = verifyN1 $ getOnline nl
        -- every primary or secondary instance hosted on a failing
        -- node is considered bad
        bad_instances = map (`Container.find` il) .
                        concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
        (bad_nodes, bad_instances)
-- | Zero-initializer for the CStats type; record syntax makes the
-- per-field initialization explicit.
emptyCStats :: CStats
emptyCStats =
    CStats { csFmem = 0, csFdsk = 0, csAmem = 0, csAdsk = 0
           , csAcpu = 0, csMmem = 0, csMdsk = 0, csMcpu = 0
           , csImem = 0, csIdsk = 0, csIcpu = 0, csTmem = 0
           , csTdsk = 0, csTcpu = 0, csVcpu = 0, csXmem = 0
           , csNmem = 0, csScore = 0, csNinst = 0
           }
-- | Update stats with data from a new node.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
                 -- NOTE(review): the binding for x_vcpu (csVcpu) and
                 -- the '} = cs' closing of this pattern appear to be
                 -- elided in this excerpt
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
        -- allocatable memory: free minus reserved, clamped at zero
        inc_amem = Node.fMem node - Node.rMem node
        inc_amem' = if inc_amem > 0 then inc_amem else 0
        inc_adsk = Node.availDisk node
        -- memory attributed to instances: total minus node-own,
        -- unaccounted and free memory
        inc_imem = truncate (Node.tMem node) - Node.nMem node
                   - Node.xMem node - Node.fMem node
        inc_icpu = Node.uCpu node
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
        inc_vcpu = Node.hiCpu node
        inc_acpu = Node.availCpu node
    in cs { csFmem = x_fmem + Node.fMem node
          , csFdsk = x_fdsk + Node.fDsk node
          , csAmem = x_amem + inc_amem'
          , csAdsk = x_adsk + inc_adsk
          , csAcpu = x_acpu + inc_acpu
          , csMmem = max x_mmem inc_amem'
          , csMdsk = max x_mdsk inc_adsk
          , csMcpu = max x_mcpu inc_acpu
          , csImem = x_imem + inc_imem
          , csIdsk = x_idsk + inc_idsk
          , csIcpu = x_icpu + inc_icpu
          , csTmem = x_tmem + Node.tMem node
          , csTdsk = x_tdsk + Node.tDsk node
          , csTcpu = x_tcpu + Node.tCpu node
          , csVcpu = x_vcpu + inc_vcpu
          , csXmem = x_xmem + Node.xMem node
          , csNmem = x_nmem + Node.nMem node
          , csNinst = x_ninst + length (Node.pList node)
-- | Compute the total free disk and memory in the cluster by folding
-- 'updateCStats' over all nodes, then filling in the cluster score.
totalResources :: Node.List -> CStats
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
    in cs { csScore = compCV nl }
-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
    let -- resources in use before the allocations started
        rini = RSpec (csIcpu cini) (csImem cini) (csIdsk cini)
        -- the delta that the allocations consumed
        rfin = RSpec (csIcpu cfin - csIcpu cini)
                     (csImem cfin - csImem cini)
                     (csIdsk cfin - csIdsk cini)
        -- what remains unallocated relative to the totals
        runa = RSpec (csVcpu cfin - csIcpu cfin)
                     (truncate (csTmem cfin) - csImem cfin)
                     (truncate (csTdsk cfin) - csIdsk cfin)
    in (rini, rfin, runa)
-- | The names and weights of the individual elements in the CV list;
-- the order must match the metric list produced by 'compDetailedCV',
-- since 'compCV' zips the two together position by position.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1, "free_mem_cv")
                 , (1, "free_disk_cv")
                 , (1, "reserved_mem_cv")
                 , (4, "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1, "vcpu_ratio_cv")
                 , (1, "disk_load_cv")
                 , (2, "pri_tags_score")
-- | Just the weights of the individual CV metrics, in order.
detailedCVWeights :: [Double]
detailedCVWeights = [ w | (w, _) <- detailedCVInfo ]
-- | Compute the list of detailed cluster metrics (coefficients of
-- variation and instance counts) used for scoring; see
-- 'detailedCVInfo' for the meaning and weight of each position.
compDetailedCV :: Node.List -> [Double]
        all_nodes = Container.elems nl
        -- offline nodes are scored separately from online ones
        (offline, nodes) = partition Node.offline all_nodes
        mem_l = map Node.pMem nodes
        dsk_l = map Node.pDsk nodes
        -- metric: memory covariance
        mem_cv = stdDev mem_l
        -- metric: disk covariance
        dsk_cv = stdDev dsk_l
        -- metric: count of instances living on N1 failing nodes
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                   length (Node.pList n)) .
                   filter Node.failN1 $ nodes :: Double
        res_l = map Node.pRem nodes
        -- metric: reserved memory covariance
        res_cv = stdDev res_l
        -- offline instances metrics
        offline_ipri = sum . map (length . Node.pList) $ offline
        offline_isec = sum . map (length . Node.sList) $ offline
        -- metric: count of instances on offline nodes
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
        -- metric: count of primary instances on offline nodes (this
        -- helps with evacuation/failover of primary instances on
        -- 2-node clusters with one node offline)
        off_pri_score = fromIntegral offline_ipri::Double
        cpu_l = map Node.pCpu nodes
        -- metric: covariance of vcpu/pcpu ratio
        cpu_cv = stdDev cpu_l
        -- metrics: covariance of cpu, memory, disk and network load
        (c_load, m_load, d_load, n_load) = unzip4 $
            let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                DynUtil c2 m2 d2 n2 = Node.utilPool n
            in (c1/c2, m1/m2, d1/d2, n1/n2)
        -- metric: conflicting instance count
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
        pri_tags_score = fromIntegral pri_tags_inst::Double
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
-- | Compute the /total/ variance: the weighted sum of the detailed
-- per-metric values.
compCV :: Node.List -> Double
compCV nl = sum (zipWith (*) detailedCVWeights (compDetailedCV nl))
-- | Compute the list of online (non-offline) nodes of a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = filter (not . Node.offline) (Container.elems nl)
-- | Pick the better (lower-scored) of two tables. Note that the
-- ordering of the arguments is important: on equal scores the first
-- argument wins.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _)
    | b_cv < a_cv = b
    | otherwise = a
-- | Applies an instance move to a given node list and instance,
-- returning (in the 'OpResult' monad) the updated node list, the
-- updated instance, and the new primary and secondary indices.

-- Failover (f): swap the primary and secondary roles
applyMove nl inst Failover =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        -- force flag passed to the *Ex add variants when the old
        -- primary is offline; presumably this relaxes capacity
        -- checks -- confirm against Node.addPriEx
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSec int_p inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx old_pdx
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                  new_inst, old_sdx, old_pdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          -- check that the current secondary can host the instance
          -- during the migration
          tmp_s <- Node.addPriEx force_p int_s inst
          let tmp_s' = Node.removePri tmp_s inst
          new_p <- Node.addPriEx force_p tgt_n inst
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
          let new_inst = Instance.setPri inst new_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
                  new_inst, new_pdx, old_sdx)

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_inst = Instance.setSec inst new_sdx
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
                 \new_s -> return (Container.addTwo new_sdx
                                   new_s old_sdx int_s nl,
                                   new_inst, old_pdx, new_sdx)

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_nl = do -- Maybe monad
          new_p <- Node.addPri tgt_n inst
          new_s <- Node.addSecEx force_s int_p inst new_pdx
          let new_inst = Instance.setBoth inst new_pdx old_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
                  new_inst, new_pdx, old_pdx)

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx new_sdx
          return (Container.add new_sdx new_s $
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
                  new_inst, old_sdx, new_sdx)
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
    let p = Container.find new_pdx nl
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
    in Node.addPri p inst >>= \new_p -> do
         let new_nl = Container.add new_pdx new_p nl
             -- score the cluster *after* the allocation (same as
             -- 'allocateOnPair' does); scoring the unmodified input
             -- list would yield the same value for every candidate
             -- node, making solution comparison meaningless
             new_score = compCV new_nl
         return (new_nl, new_inst, [new_p], new_score)
-- | Tries to allocate an instance on a given pair of nodes (primary
-- on the first index, secondary on the second).
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx =
    let tgt_p = Container.find new_pdx nl
        tgt_s = Container.find new_sdx nl
        new_p <- Node.addPri tgt_p inst
        new_s <- Node.addSec tgt_s inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx new_sdx
            -- the returned score is computed on the updated node list
            new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
        return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
        Table ini_nl ini_il _ ini_plc = ini_tbl
        tmp_resu = applyMove ini_nl target move
        -- a successful move: rebuild the table and compare its score
        -- against the current best
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
            let tgt_idx = Instance.idx target
                upd_cvar = compCV upd_nl
                upd_il = Container.add tgt_idx new_inst ini_il
                -- placements accumulate newest-first
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
            compareTables cur_tbl upd_tbl
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
              -> Bool -- ^ Whether we can change the primary node
              -> Ndx -- ^ Target node candidate
              -> [IMove] -- ^ List of valid result moves
-- primary cannot change: only the secondary may be replaced
possibleMoves _ False tdx =
    [ReplaceSecondary tdx]

-- both role changes allowed and the secondary is a valid new primary
possibleMoves True True tdx =
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx,
     FailoverAndReplace tdx]

-- primary may change, but the secondary is not a valid new primary
possibleMoves False True tdx =
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx]
-- | Compute the best move for a given instance by folding
-- 'checkSingleStep' over all candidate moves on all allowed nodes.
checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
                  -> Bool -- ^ Whether disk moves are allowed
                  -> Bool -- ^ Whether instance moves are allowed
                  -> Table -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
        opdx = Instance.pNode target
        osdx = Instance.sNode target
        -- the instance's current nodes are excluded as targets
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
        use_secondary = elem osdx nodes_idx && inst_moves
        aft_failover = if use_secondary -- if allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
        all_moves = if disk_moves
                    (possibleMoves use_secondary inst_moves) nodes
    -- iterate over the possible nodes for this instance
    foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
-- | Compute the best next move: evaluate every victim instance (in
-- parallel) and keep the single best resulting table.
checkMove :: [Ndx] -- ^ Allowed target node indices
          -> Bool -- ^ Whether disk moves are allowed
          -> Bool -- ^ Whether instance moves are allowed
          -> Table -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
    let Table _ _ _ ini_plc = ini_tbl
        -- we're using rwhnf from the Control.Parallel.Strategies
        -- package; we don't need to use rnf as that would force too
        -- much evaluation in single-threaded cases, and in
        -- multi-threaded case the weak head normal form is enough to
        -- spark the evaluation
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
        -- iterate over all instances, computing the best move
        best_tbl = foldl' compareTables ini_tbl tables
        Table _ _ _ best_plc = best_tbl
    -- an unchanged placement count means no move improved the score
    in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table -- ^ The starting table
              -> Int -- ^ Remaining rounds (negative means unlimited)
              -> Score -- ^ Score at which to stop
              -> Bool -- ^ Whether balancing should continue
doNextBalance ini_tbl max_rounds min_score =
    let Table _ _ ini_cv ini_plc = ini_tbl
        rounds_left = max_rounds < 0 || length ini_plc < max_rounds
    in rounds_left && ini_cv > min_score
-- | Run a single balancing round, returning the improved table or
-- 'Nothing' if no sufficiently profitable move exists.
tryBalance :: Table -- ^ The starting table
           -> Bool -- ^ Allow disk moves
           -> Bool -- ^ Allow instance moves
           -> Bool -- ^ Only evacuate moves
           -> Score -- ^ Min gain threshold
           -> Score -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
        -- in evacuation mode, restrict the candidates to instances
        -- touching an offline node
        all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx . filter Node.offline $
                                         Container.elems ini_nl
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
                                          Instance.pNode e `elem` bad_nodes)
        reloc_inst = filter Instance.movable all_inst'
        node_idx = map Node.idx . filter (not . Node.offline) $
                   Container.elems ini_nl
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
        (Table _ _ fin_cv _) = fin_tbl
    -- accept the move only when the score improved and the gain is
    -- either above the threshold or at least min_gain
    if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
    then Just fin_tbl -- this round made success, return the new table
571 -- * Allocation functions
-- | Build failure stats out of a list of failures: one count per
-- possible 'FailMode' value.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
    [ (fm, length (filter (== fm) flst)) | fm <- [minBound .. maxBound] ]
-- | Update current allocation solution and failure stats with new
-- elements.
-- A failed allocation only bumps the failure counts.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }

-- A successful allocation: keep the better-scored solution.
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
    let -- Choose the old or new solution, based on the cluster score
        -- NOTE(review): the binding of nsuc (the incremented success
        -- count) appears to be elided in this excerpt
        osols = asSolutions as
        nsols = case osols of
                  (_, _, _, oscore):[] ->
                  -- FIXME: here we simply concat to lists with more
                  -- than one element; we should instead abort, since
                  -- this is not a valid usage of this function
        -- Note: we force evaluation of nsols here in order to keep the
        -- memory profile low - we know that we will need nsols for sure
        -- in the next cycle, so we force evaluation of nsols, since the
        -- foldl' in the caller will only evaluate the tuple, but not the
        -- elements of the tuple
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
-- | Sums two allocation solutions (e.g. for two separate node groups).
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs a b =
    AllocSolution { asFailures = asFailures a ++ asFailures b
                  , asAllocs = asAllocs a + asAllocs b
                  , asSolutions = asSolutions a ++ asSolutions b
                  , asLog = asLog a ++ asLog b
                  }
-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as =
    let fcnt = asFailures as
        sols = asSolutions as
        -- human-readable summary of the non-zero failure counts
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
    then "No valid allocation solutions, failure reasons: " ++
         then "unknown reasons"
    -- at least one solution: report the best (head) one
    else let (_, _, nodes, cv) = head sols
         in printf ("score: %.8f, successes %d, failures %d (%s)" ++
                    " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
            (intercalate "/" . map Node.name $ nodes)
-- | Annotates a solution by prepending its own description (via
-- 'describeSolution') to the informational log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as =
    let descr = describeSolution as
    in as { asLog = descr : asLog as }
-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List -- ^ Group list
              -> Node.List -- ^ The node map
              -> Int -- ^ The number of nodes required
              -> Bool -- ^ Whether to drop or not unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
    let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
        all_nodes = filter_fn $ getOnline nl
        all_pairs = liftM2 (,) all_nodes all_nodes
        -- pairs must be two distinct nodes of the same group
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
                                      Node.group x == Node.group y) all_pairs
        1 -> Ok (Left (map Node.idx all_nodes))
        2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
        _ -> Bad "Unsupported number of nodes, only one or two supported"
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List -- ^ The node list
         -> Instance.List -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes -- ^ The allocation targets
         -> m AllocSolution -- ^ Possible solution list
-- two-node (primary + secondary) allocation over all valid pairs
tryAlloc nl _ inst (Right ok_pairs) =
    let sols = foldl' (\cstate (p, s) ->
                           concatAllocs cstate $ allocateOnPair nl inst p s
                      ) emptySolution ok_pairs
    in if null ok_pairs -- means we have just one node
       then fail "Not enough online nodes"
       else return $ annotateSolution sols

-- single-node allocation over all candidate nodes
tryAlloc nl _ inst (Left all_nodes) =
    let sols = foldl' (\cstate ->
                           concatAllocs cstate . allocateOnSingle nl inst
                      ) emptySolution all_nodes
       then fail "No online nodes"
       else return $ annotateSolution sols
-- | Given a group/result, describe it as a nice (list of) messages,
-- prefixed with the group name and allocation policy.
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
    Bad message -> [printf "Group %s: error %s" gname message]
  where grp = Container.find groupId gl
        gname = Group.name grp
        pol = apolToString (Group.allocPolicy grp)
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
filterMGResults gl = foldl' fn []
    where unallocable = not . Group.isAllocable . flip Container.find gl
          fn accu (gdx, rasol) =
              -- drop failed results, empty solutions and groups
              -- marked unallocable
              Ok sol | null (asSolutions sol) -> accu
                     | unallocable gdx -> accu
                     | otherwise -> (gdx, sol):accu
-- | Sort multigroup results based on policy and score.
sortMGResults :: Group.List
              -> [(Gdx, AllocSolution)]
              -> [(Gdx, AllocSolution)]
sortMGResults gl sols = sortBy (comparing solKey) sols
    where scoreOf (_, _, _, cv) = cv
          -- sort first on the group's allocation policy, then on the
          -- score of the best (head) solution; assumes every solution
          -- list is non-empty, as guaranteed by 'filterMGResults'
          solKey (gdx, sol) =
              (Group.allocPolicy (Container.find gdx gl),
               scoreOf (head (asSolutions sol)))
-- | Try to allocate an instance on a multi-group cluster: run the
-- allocation per group, then pick the best valid result.
tryMGAlloc :: Group.List -- ^ The group list
           -> Node.List -- ^ The node list
           -> Instance.List -- ^ The instance list
           -> Instance.Instance -- ^ The instance to allocate
           -> Int -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt =
    let group_parts = splitCluster mgnl mgil
        -- run the allocation independently over each group's
        -- node/instance sub-lists
        per_group = map (\(gid, (nl, il)) ->
                          (gid, genAllocNodes mggl nl cnt False >>=
                                tryAlloc nl il inst))
                    group_parts::[(Gdx, Result AllocSolution)]
        all_msgs = concatMap (solutionDescription mggl) per_group
        -- keep only valid, allocatable results, ordered best-first
        sorted_sols = sortMGResults mggl (filterMGResults mggl per_group)
    in case sorted_sols of
         [] -> Bad (intercalate ", " all_msgs)
         (final_group, final_sol):_ ->
             let final_name = Group.name (Container.find final_group mggl)
                 selmsg = "Selected group: " ++ final_name
             in Ok (final_sol { asLog = selmsg:all_msgs })
-- | Try to relocate an instance on the cluster by moving its
-- secondary to each valid candidate node and keeping the results.
tryReloc :: (Monad m) =>
            Node.List -- ^ The node list
         -> Instance.List -- ^ The instance list
         -> Idx -- ^ The index of the instance to move
         -> Int -- ^ The number of nodes required
         -> [Ndx] -- ^ Nodes which should not be used
         -> m AllocSolution -- ^ Solution list
-- only single-secondary relocation is implemented
tryReloc nl il xid 1 ex_idx =
    let all_nodes = getOnline nl
        inst = Container.find xid il
        -- the current primary is also excluded as a target
        ex_idx' = Instance.pNode inst:ex_idx
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
        valid_idxes = map Node.idx valid_nodes
        sols1 = foldl' (\cstate x ->
                            applyMove nl inst (ReplaceSecondary x)
                            return (mnl, i, [Container.find x mnl],
                        in concatAllocs cstate em
                       ) emptySolution valid_idxes
-- any other required-node count is unsupported
tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
                               \destinations required (" ++ show reqn ++
                               "), only one supported"
-- | Try to relocate an instance, restricted to its own node group.
tryMGReloc :: (Monad m) =>
              Group.List -- ^ The group list
           -> Node.List -- ^ The node list
           -> Instance.List -- ^ The instance list
           -> Idx -- ^ The index of the instance to move
           -> Int -- ^ The number of nodes required
           -> [Ndx] -- ^ Nodes which should not be used
           -> m AllocSolution -- ^ Solution list
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
  let groups = splitCluster mgnl mgil
      -- TODO: we only relocate inside the group for now
      inst = Container.find xid mgil
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
                Nothing -> fail $ "Cannot find group for instance " ++
  tryReloc nl il xid ncount ex_ndx
-- | Change an instance's secondary node as part of an evacuation,
-- threading the updated node list and accumulated solution.
evacInstance :: (Monad m) =>
                [Ndx] -- ^ Excluded nodes
             -> Instance.List -- ^ The current instance list
             -> (Node.List, AllocSolution) -- ^ The current state
             -> Idx -- ^ The instance to evacuate
             -> m (Node.List, AllocSolution)
evacInstance ex_ndx il (nl, old_as) idx = do
  -- FIXME: hardcoded one node here
  -- Longer explanation: evacuation is currently hardcoded to DRBD
  -- instances (which have one secondary); hence, even if the
  -- IAllocator protocol can request N nodes for an instance, and all
  -- the message parsing/loading pass this, this implementation only
  -- supports one; this situation needs to be revisited if we ever
  -- support more than one secondary, or if we change the storage
  -- model
  new_as <- tryReloc nl il idx 1 ex_ndx
  case asSolutions new_as of
    -- an individual relocation succeeded, we kind of compose the data
    -- from the two solutions
    csol@(nl', _, _, _):_ ->
        return (nl', new_as { asSolutions = csol:asSolutions old_as })
    -- this relocation failed, so we fail the entire evac
    _ -> fail $ "Can't evacuate instance " ++
                Instance.name (Container.find idx il) ++
                ": " ++ describeSolution new_as
-- | Try to evacuate a list of nodes by relocating each affected
-- instance in turn via 'evacInstance'.
tryEvac :: (Monad m) =>
           Node.List -- ^ The node list
        -> Instance.List -- ^ The instance list
        -> [Idx] -- ^ Instances to be evacuated
        -> [Ndx] -- ^ Restricted nodes (the ones being evacuated)
        -> m AllocSolution -- ^ Solution list
tryEvac nl il idxs ex_ndx = do
  (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptySolution) idxs
-- | Multi-group evacuation of a list of nodes: group the affected
-- instances per node group, evacuate each group, and sum the results.
tryMGEvac :: (Monad m) =>
             Group.List -- ^ The group list
          -> Node.List -- ^ The node list
          -> Instance.List -- ^ The instance list
          -> [Ndx] -- ^ Nodes to be evacuated
          -> m AllocSolution -- ^ Solution list
tryMGEvac _ nl il ex_ndx =
  let ex_nodes = map (`Container.find` nl) ex_ndx
      -- all (secondary) instances touching the evacuated nodes
      all_insts = nub . concatMap Node.sList $ ex_nodes
      gni = splitCluster nl il
      -- we run the instance index list through a couple of maps to
      -- get finally to a structure of the type [(group index,
      -- [instance indices])]
      all_insts' = map (\idx ->
                         (instancePriGroup nl (Container.find idx il),
      all_insts'' = groupBy ((==) `on` fst) all_insts'
      all_insts3 = map (\xs -> let (gdxs, idxs) = unzip xs
                               in (head gdxs, idxs)) all_insts''
      -- that done, we now add the per-group nl/il to the tuple
      mapM (\(gdx, idxs) ->
             case lookup gdx gni of
               Nothing -> fail $ "Can't find group index " ++ show gdx
               Just (gnl, gil) -> return (gdx, gnl, gil, idxs))
  results <- mapM (\(_, gnl, gil, idxs) -> tryEvac gnl gil idxs ex_ndx)
  let sol = foldl' sumAllocs emptySolution results
  return $ annotateSolution sol
-- | Recursively place instances on the cluster until we're out of
-- space; each new instance is named @new-N@ by its depth.
iterateAlloc :: Node.List
             -> [Instance.Instance]
             -> Result AllocResult
iterateAlloc nl il newinst allocnodes ixes cstats =
      let depth = length ixes
          newname = printf "new-%d" depth::String
          newidx = length (Container.elems il) + depth
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      in case tryAlloc nl il newi2 allocnodes of
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
               -- no solution: stop and report the accumulated state
               [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
               -- exactly one solution: recurse with the updated lists
               (xnl, xi, _, _):[] ->
                   iterateAlloc xnl (Container.add newidx xi il)
                                newinst allocnodes (xi:ixes)
                                (totalResources xnl:cstats)
               _ -> Bad "Internal error: multiple solutions for single\
-- | The core of the tiered allocation mode: allocate at the current
-- spec until full, then shrink the instance spec and retry.
tieredAlloc :: Node.List
            -> [Instance.Instance]
            -> Result AllocResult
tieredAlloc nl il newinst allocnodes ixes cstats =
    case iterateAlloc nl il newinst allocnodes ixes cstats of
      Ok (errs, nl', il', ixes', cstats') ->
          -- shrink along the resource with the most failures
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            -- cannot shrink further: return what we have
            Bad _ -> Ok (errs, nl', il', ixes', cstats')
            tieredAlloc nl' il' newinst' allocnodes ixes' cstats'
-- | Compute the tiered spec string description from a list of
-- allocated instances: one @mem,disk,cpu=count@ entry per spec group.
tieredSpecMap :: [Instance.Instance]
tieredSpecMap trl_ixes =
    let -- instances were accumulated newest-first, so restore
        -- allocation order before grouping
        fin_trl_ixes = reverse trl_ixes
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
    in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
                            (rspecDsk spec) (rspecCpu spec) cnt) spec_map
908 -- * Formatting functions
910 -- | Given the original and final nodes, computes the relocation description.
911 computeMoves :: Instance.Instance -- ^ The instance to be moved
912 -> String -- ^ The instance name
913 -> IMove -- ^ The move being performed
914 -> String -- ^ New primary
915 -> String -- ^ New secondary
916 -> (String, [String])
917 -- ^ Tuple of moves and commands list; moves is containing
918 -- either @/f/@ for failover or @/r:name/@ for replace
919 -- secondary, while the command list holds gnt-instance
920 -- commands (without that prefix), e.g \"@failover instance1@\"
921 computeMoves i inam mv c d =
923 Failover -> ("f", [mig])
924 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
925 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
926 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
927 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
928 where morf = if Instance.running i then "migrate" else "failover"
929 mig = printf "%s -f %s" morf inam::String
930 rep n = printf "replace-disks -n %s %s" n inam
-- | Converts a placement to string format: a human-readable move line
-- (old placement, new placement, score, abbreviated actions) plus the
-- corresponding gnt-instance command list.
printSolutionLine :: Node.List -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int -- ^ Maximum node name length
                  -> Int -- ^ Maximum instance name length
                  -> Placement -- ^ The current placement
                  -> Int -- ^ The index of the placement in
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
    -- width of an "old:new" node pair column: two names plus the colon
    pmlen = (2*nmlen + 1)
    (i, p, s, mv, c) = plc
    inst = Container.find i il
    inam = Instance.alias inst
    -- new primary/secondary aliases (from the placement) ...
    npri = Node.alias $ Container.find p nl
    nsec = Node.alias $ Container.find s nl
    -- ... versus the original primary/secondary (from the instance)
    opri = Node.alias $ Container.find (Instance.pNode inst) nl
    osec = Node.alias $ Container.find (Instance.sNode inst) nl
    (moves, cmds) = computeMoves inst inam mv npri nsec
    ostr = printf "%s:%s" opri osec::String
    nstr = printf "%s:%s" npri nsec::String
    (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
     pos imlen inam pmlen ostr
-- | Return the nodes involved in an instance move: the placement's new
-- primary and secondary, plus the instance's current primary and
-- secondary, with duplicates removed.
involvedNodes :: Instance.List -> Placement -> [Ndx]
involvedNodes il plc = nub [new_pri, new_sec, old_pri, old_sec]
  where (iidx, new_pri, new_sec, _, _) = plc
        moved = Container.find iidx il
        old_pri = Instance.pNode moved
        old_sec = Instance.sNode moved
-- | Inner function for splitJobs: folds one move-job into the running
-- state, appending it to the current jobset when its nodes are disjoint
-- from the nodes already touched, or opening a fresh jobset otherwise.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) job@(nodes, _, _, _) = ([[job]], nodes)
mergeJobs (jobsets@(current:older), touched) job@(nodes, _, _, _) =
  if null (nodes `intersect` touched)
    then -- independent of everything in the current set: extend it
         ((job:current):older, nodes ++ touched)
    else -- conflicts with the current set: start a new jobset
         ([job]:jobsets, nodes)
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
--
-- Uses the strict 'foldl'' (matching the rest of this module) instead
-- of the lazy 'foldl', which would build up a chain of thunks
-- proportional to the number of moves.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl' mergeJobs ([], [])
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little: each job is echoed as
-- @job N/M@ followed by its commands, and (per the elided guard) the
-- first job of a set is preceded by a jobset header line.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  printf " echo job %d/%d" jsn sn:
  map (" gnt-instance " ++) cmds
  then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
-- NOTE(review): the enumeration of jobsets/jobs is elided here;
-- presumably they are numbered from 1 via zip -- confirm upstream.
formatCmds :: [JobSet] -> String
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
-- | Print the node list as an aligned table. The second argument
-- selects the fields: an empty list means the default field set, and
-- a leading @"+"@ element extends the defaults with the given fields.
printNodes :: Node.List -> [String] -> String
  let fields = case fs of
        [] -> Node.defaultFields
        "+":rest -> Node.defaultFields ++ rest
      -- stable listing order: sort nodes by their index
      snl = sortBy (comparing Node.idx) (Container.elems nl)
      (header, isnum) = unzip $ map Node.showHeader fields
  in unlines . map ((:) ' ' . intercalate " ") $
     formatTable (header:map (Node.list fields) snl) isnum
-- | Print the instance list as an aligned table: running flag, names,
-- primary/secondary placement, auto-balance flag, static resources and
-- dynamic utilisation.
printInsts :: Node.List -> Instance.List -> String
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      -- one table row per instance
      helper inst = [ if Instance.running inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                         else Container.nameOf nl sdx
                    , if Instance.auto_balance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    -- disk is tracked in MiB; display in GiB
                    , printf "%5d" $ Instance.dsk inst `div` 1024
        where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      -- first five columns are textual, all remaining ones numeric
      isnum = False:False:False:False:False:repeat True
  in unlines . map ((:) ' ' . intercalate " ") $
     formatTable (header:map helper sil) isnum
-- | Shows statistics for a given node list: each detailed coefficient
-- of variation rendered as @name=value(xweight)@, comma-separated.
printStats :: Node.List -> String
  let dcvs = compDetailedCV nl
      (weights, names) = unzip detailedCVInfo
      -- pad with neutral weight/placeholder name in case compDetailedCV
      -- returns more metrics than detailedCVInfo describes
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
      formatted = map (\(w, header, val) ->
                       printf "%s=%.8f(x%.2f)" header val w::String) hd
  in intercalate ", " formatted
-- | Convert a placement into a list of OpCodes (basically a job).
-- Each move type is translated into the corresponding sequence of
-- migrate\/failover and replace-disks opcodes.
iMoveToJob :: Node.List -> Instance.List
           -> Idx -> IMove -> [OpCodes.OpCode]
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      lookNode = Just . Container.nameOf nl
      -- migration opcode; NOTE(review): the three boolean arguments are
      -- positional flags of OpInstanceMigrate -- confirm their meaning
      -- against Ganeti.OpCodes before changing them
      opF = OpCodes.OpInstanceMigrate iname True False True
      -- replace-disks opcode moving the secondary to node n
      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
              OpCodes.ReplaceNewSecondary [] Nothing
    ReplacePrimary np -> [ opF, opR np, opF ]
    ReplaceSecondary ns -> [ opR ns ]
    ReplaceAndFailover np -> [ opR np, opF ]
    FailoverAndReplace ns -> [ opF, opR ns ]
1069 -- * Node group functions
-- | Computes the group of an instance: the node group shared by its
-- primary and secondary nodes. Instances without a secondary are
-- judged by the primary alone; an instance whose two nodes live in
-- different groups yields a failure.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      -- no secondary: compare the primary's group against itself
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
     -- fixed typo in the error message: "accross" -> "across"
     then fail ("Instance placed across two node groups, primary " ++
                show pgroup ++ ", secondary " ++ show sgroup)
     else return pgroup
-- | Computes the group of an instance per the primary node: unlike
-- 'instanceGroup' this never fails, as only the primary is consulted.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i =
  let pnode = Container.find (Instance.pNode i) nl
-- | Compute the list of badly allocated instances, i.e. those whose
-- primary and secondary nodes live in different node groups (detected
-- by 'instanceGroup' returning a failure).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  [inst | inst <- Container.elems il, not . isOk $ instanceGroup nl inst]
-- | Splits a cluster into the component node groups: for each group,
-- the sub-list of its nodes and the instances whose primary node
-- belongs to that group.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il = map buildGroup ngroups
  where ngroups = Node.computeGroups (Container.elems nl)
        -- build the (nodes, instances) pair for one group
        buildGroup (guuid, nodes) =
          let node_idxs = map Node.idx nodes
              grp_nl = Container.fromList (zip node_idxs nodes)
              -- instances are assigned to a group by primary node only
              grp_il = Container.filter ((`elem` node_idxs) .
                                         Instance.pNode) il
          in (guuid, (grp_nl, grp_il))