1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the "Main" module for the individual binaries.
8 module Ganeti.HTools.Cluster
17 -- * Generic functions
19 -- * First phase functions
21 -- * Second phase functions
28 -- * Balacing functions
33 -- * IAllocator functions
39 import Data.Maybe (isNothing, fromJust)
40 import Text.Printf (printf)
44 import qualified Ganeti.HTools.Container as Container
45 import qualified Ganeti.HTools.Instance as Instance
46 import qualified Ganeti.HTools.Node as Node
47 import Ganeti.HTools.Types
48 import Ganeti.HTools.Utils
52 -- | A separate name for the cluster score type.
55 -- | The description of an instance placement.
56 type Placement = (Idx, Ndx, Ndx, Score)
58 -- | A cluster solution described as the solution delta and the list
60 data Solution = Solution Int [Placement]
61 deriving (Eq, Ord, Show)
64 data Removal = Removal Node.List [Instance.Instance]
66 -- | An instance move definition
67 data IMove = Failover -- ^ Failover the instance (f)
68 | ReplacePrimary Ndx -- ^ Replace primary (f, r:np, f)
69 | ReplaceSecondary Ndx -- ^ Replace secondary (r:ns)
70 | ReplaceAndFailover Ndx -- ^ Replace secondary, failover (r:np, f)
71 | FailoverAndReplace Ndx -- ^ Failover, replace secondary (f, r:ns)
74 -- | The complete state for the balancing solution
75 data Table = Table Node.List Instance.List Score [Placement]
78 -- * Utility functions
80 -- | Returns the delta of a solution or -1 for Nothing.
81 solutionDelta :: Maybe Solution -> Int
82 solutionDelta sol = case sol of
83 Just (Solution d _) -> d
86 -- | Cap the removal list if needed.
87 capRemovals :: [a] -> Int -> [a]
88 capRemovals removals max_removals =
89 if max_removals > 0 then
90 take max_removals removals
94 -- | Check if the given node list fails the N+1 check.
95 verifyN1Check :: [Node.Node] -> Bool
96 verifyN1Check nl = any Node.failN1 nl
98 -- | Verifies the N+1 status and return the affected nodes.
99 verifyN1 :: [Node.Node] -> [Node.Node]
100 verifyN1 nl = filter Node.failN1 nl
102 {-| Computes the pair of bad nodes and instances.
104 The bad node list is computed via a simple 'verifyN1' check, and the
105 bad instance list is the list of primary and secondary instances of
109 computeBadItems :: Node.List -> Instance.List ->
110 ([Node.Node], [Instance.Instance])
111 computeBadItems nl il =
112 let bad_nodes = verifyN1 $ filter (not . Node.offline) $ Container.elems nl
113 bad_instances = map (\idx -> Container.find idx il) $
114 sort $ nub $ concat $
115 map (\ n -> (Node.slist n) ++ (Node.plist n)) bad_nodes
117 (bad_nodes, bad_instances)
119 -- | Compute the total free disk and memory in the cluster.
120 totalResources :: Container.Container Node.Node -> (Int, Int)
123 (\ (mem, dsk) node -> (mem + (Node.f_mem node),
124 dsk + (Node.f_dsk node)))
125 (0, 0) (Container.elems nl)
127 -- | Compute the mem and disk covariance.
128 compDetailedCV :: Node.List -> (Double, Double, Double, Double, Double)
131 all_nodes = Container.elems nl
132 (offline, nodes) = partition Node.offline all_nodes
133 mem_l = map Node.p_mem nodes
134 dsk_l = map Node.p_dsk nodes
135 mem_cv = varianceCoeff mem_l
136 dsk_cv = varianceCoeff dsk_l
137 n1_l = length $ filter Node.failN1 nodes
138 n1_score = (fromIntegral n1_l) / (fromIntegral $ length nodes)
139 res_l = map Node.p_rem nodes
140 res_cv = varianceCoeff res_l
141 offline_inst = sum . map (\n -> (length . Node.plist $ n) +
142 (length . Node.slist $ n)) $ offline
143 online_inst = sum . map (\n -> (length . Node.plist $ n) +
144 (length . Node.slist $ n)) $ nodes
145 off_score = (fromIntegral offline_inst) /
146 (fromIntegral $ online_inst + offline_inst)
147 in (mem_cv, dsk_cv, n1_score, res_cv, off_score)
149 -- | Compute the /total/ variance.
150 compCV :: Node.List -> Double
152 let (mem_cv, dsk_cv, n1_score, res_cv, off_score) = compDetailedCV nl
153 in mem_cv + dsk_cv + n1_score + res_cv + off_score
157 -- | Add an instance and return the new node and instance maps.
158 addInstance :: Node.List -> Instance.Instance ->
159 Node.Node -> Node.Node -> Maybe Node.List
160 addInstance nl idata pri sec =
161 let pdx = Node.idx pri
164 pnode <- Node.addPri pri idata
165 snode <- Node.addSec sec idata pdx
166 new_nl <- return $ Container.addTwo sdx snode
170 -- | Remove an instance and return the new node and instance maps.
171 removeInstance :: Node.List -> Instance.Instance -> Node.List
172 removeInstance nl idata =
173 let pnode = Instance.pnode idata
174 snode = Instance.snode idata
175 pn = Container.find pnode nl
176 sn = Container.find snode nl
177 new_nl = Container.addTwo
178 pnode (Node.removePri pn idata)
179 snode (Node.removeSec sn idata) nl in
182 -- | Remove an instance and return the new node map.
183 removeInstances :: Node.List -> [Instance.Instance] -> Node.List
184 removeInstances = foldl' removeInstance
187 {-| Compute a new version of a cluster given a solution.
189 This is not used for computing the solutions, but for applying a
190 (known-good) solution to the original cluster for final display.
192 It first removes the relocated instances after which it places them on
196 applySolution :: Node.List -> Instance.List -> [Placement] -> Node.List
197 applySolution nl il sol =
198 let odxes = map (\ (a, b, c, _) -> (Container.find a il,
199 Node.idx (Container.find b nl),
200 Node.idx (Container.find c nl))
202 idxes = (\ (x, _, _) -> x) (unzip3 odxes)
203 nc = removeInstances nl idxes
205 foldl' (\ nz (a, b, c) ->
206 let new_p = Container.find b nz
207 new_s = Container.find c nz in
208 fromJust (addInstance nz a new_p new_s)
212 -- ** First phase functions
214 {-| Given a list 1,2,3..n build a list of pairs [(1, [2..n]), (2,
218 genParts :: [a] -> Int -> [(a, [a])]
223 if length l < count then
226 (x, xs) : (genParts xs count)
228 -- | Generates combinations of count items from the names list.
229 genNames :: Int -> [b] -> [[b]]
230 genNames count1 names1 =
231 let aux_fn count names current =
236 (\ (x, xs) -> aux_fn (count - 1) xs (x:current))
237 (genParts names count)
239 aux_fn count1 names1 []
241 {-| Checks if removal of instances results in N+1 pass.
243 Note: the check removal cannot optimize by scanning only the affected
244 nodes, since the cluster is known to be not healthy; only the check
245 placement can make this shortcut.
248 checkRemoval :: Node.List -> [Instance.Instance] -> Maybe Removal
249 checkRemoval nl victims =
250 let nx = removeInstances nl victims
251 failN1 = verifyN1Check (Container.elems nx)
256 Just $ Removal nx victims
259 -- | Computes the removals list for a given depth.
260 computeRemovals :: Node.List
261 -> [Instance.Instance]
264 computeRemovals nl bad_instances depth =
265 map (checkRemoval nl) $ genNames depth bad_instances
267 -- ** Second phase functions
269 -- | Single-node relocation cost.
270 nodeDelta :: Ndx -> Ndx -> Ndx -> Int
272 if i == p || i == s then
277 -- | Compute best solution.
279 -- This function compares two solutions, choosing the minimum valid
281 compareSolutions :: Maybe Solution -> Maybe Solution -> Maybe Solution
282 compareSolutions a b = case (a, b) of
287 -- | Check if a given delta is worse then an existing solution.
288 tooHighDelta :: Maybe Solution -> Int -> Int -> Bool
289 tooHighDelta sol new_delta max_delta =
290 if new_delta > max_delta && max_delta >=0 then
295 Just (Solution old_delta _) -> old_delta <= new_delta
297 {-| Check if placement of instances still keeps the cluster N+1 compliant.
299 This is the workhorse of the allocation algorithm: given the
300 current node and instance maps, the list of instances to be
301 placed, and the current solution, this will return all possible
302 solution by recursing until all target instances are placed.
305 checkPlacement :: Node.List -- ^ The current node list
306 -> [Instance.Instance] -- ^ List of instances still to place
307 -> [Placement] -- ^ Partial solution until now
308 -> Int -- ^ The delta of the partial solution
309 -> Maybe Solution -- ^ The previous solution
310 -> Int -- ^ Abort if the we go above this delta
311 -> Maybe Solution -- ^ The new solution
312 checkPlacement nl victims current current_delta prev_sol max_delta =
313 let target = head victims
314 opdx = Instance.pnode target
315 osdx = Instance.snode target
317 have_tail = (length vtail) > 0
318 nodes = Container.elems nl
319 iidx = Instance.idx target
324 pri_idx = Node.idx pri
325 upri_delta = current_delta + nodeDelta pri_idx opdx osdx
326 new_pri = Node.addPri pri target
327 fail_delta1 = tooHighDelta accu_p upri_delta max_delta
329 if fail_delta1 || isNothing(new_pri) then accu_p
330 else let pri_nl = Container.add pri_idx (fromJust new_pri) nl in
334 sec_idx = Node.idx sec
335 upd_delta = upri_delta +
336 nodeDelta sec_idx opdx osdx
337 fail_delta2 = tooHighDelta accu upd_delta max_delta
338 new_sec = Node.addSec sec target pri_idx
340 if sec_idx == pri_idx || fail_delta2 ||
341 isNothing new_sec then accu
343 nx = Container.add sec_idx (fromJust new_sec) pri_nl
345 plc = (iidx, pri_idx, sec_idx, upd_cv)
349 checkPlacement nx vtail c2 upd_delta
352 Just (Solution upd_delta c2)
353 in compareSolutions accu result
357 {-| Auxiliary function for solution computation.
359 We write this in an explicit recursive fashion in order to control
360 early-abort in case we have met the min delta. We can't use foldr
361 instead of explicit recursion since we need the accumulator for the
365 advanceSolution :: [Maybe Removal] -- ^ The removal to process
366 -> Int -- ^ Minimum delta parameter
367 -> Int -- ^ Maximum delta parameter
368 -> Maybe Solution -- ^ Current best solution
369 -> Maybe Solution -- ^ New best solution
370 advanceSolution [] _ _ sol = sol
371 advanceSolution (Nothing:xs) m n sol = advanceSolution xs m n sol
372 advanceSolution ((Just (Removal nx removed)):xs) min_d max_d prev_sol =
373 let new_sol = checkPlacement nx removed [] 0 prev_sol max_d
374 new_delta = solutionDelta $! new_sol
376 if new_delta >= 0 && new_delta <= min_d then
379 advanceSolution xs min_d max_d new_sol
381 -- | Computes the placement solution.
382 solutionFromRemovals :: [Maybe Removal] -- ^ The list of (possible) removals
383 -> Int -- ^ Minimum delta parameter
384 -> Int -- ^ Maximum delta parameter
385 -> Maybe Solution -- ^ The best solution found
386 solutionFromRemovals removals min_delta max_delta =
387 advanceSolution removals min_delta max_delta Nothing
389 {-| Computes the solution at the given depth.
391 This is a wrapper over both computeRemovals and
392 solutionFromRemovals. In case we have no solution, we return Nothing.
395 computeSolution :: Node.List -- ^ The original node data
396 -> [Instance.Instance] -- ^ The list of /bad/ instances
397 -> Int -- ^ The /depth/ of removals
398 -> Int -- ^ Maximum number of removals to process
399 -> Int -- ^ Minimum delta parameter
400 -> Int -- ^ Maximum delta parameter
401 -> Maybe Solution -- ^ The best solution found (or Nothing)
402 computeSolution nl bad_instances depth max_removals min_delta max_delta =
404 removals = computeRemovals nl bad_instances depth
405 removals' = capRemovals removals max_removals
407 solutionFromRemovals removals' min_delta max_delta
411 -- | Compute best table. Note that the ordering of the arguments is important.
412 compareTables :: Table -> Table -> Table
413 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
414 if a_cv > b_cv then b else a
416 -- | Applies an instance move to a given node list and instance.
417 applyMove :: Node.List -> Instance.Instance
418 -> IMove -> (Maybe Node.List, Instance.Instance, Ndx, Ndx)
420 applyMove nl inst Failover =
421 let old_pdx = Instance.pnode inst
422 old_sdx = Instance.snode inst
423 old_p = Container.find old_pdx nl
424 old_s = Container.find old_sdx nl
425 int_p = Node.removePri old_p inst
426 int_s = Node.removeSec old_s inst
427 new_nl = do -- Maybe monad
428 new_p <- Node.addPri int_s inst
429 new_s <- Node.addSec int_p inst old_sdx
430 return $ Container.addTwo old_pdx new_s old_sdx new_p nl
431 in (new_nl, Instance.setBoth inst old_sdx old_pdx, old_sdx, old_pdx)
433 -- Replace the primary (f:, r:np, f)
434 applyMove nl inst (ReplacePrimary new_pdx) =
435 let old_pdx = Instance.pnode inst
436 old_sdx = Instance.snode inst
437 old_p = Container.find old_pdx nl
438 old_s = Container.find old_sdx nl
439 tgt_n = Container.find new_pdx nl
440 int_p = Node.removePri old_p inst
441 int_s = Node.removeSec old_s inst
442 new_nl = do -- Maybe monad
443 new_p <- Node.addPri tgt_n inst
444 new_s <- Node.addSec int_s inst new_pdx
445 return $ Container.add new_pdx new_p $
446 Container.addTwo old_pdx int_p old_sdx new_s nl
447 in (new_nl, Instance.setPri inst new_pdx, new_pdx, old_sdx)
449 -- Replace the secondary (r:ns)
450 applyMove nl inst (ReplaceSecondary new_sdx) =
451 let old_pdx = Instance.pnode inst
452 old_sdx = Instance.snode inst
453 old_s = Container.find old_sdx nl
454 tgt_n = Container.find new_sdx nl
455 int_s = Node.removeSec old_s inst
456 new_nl = Node.addSec tgt_n inst old_pdx >>=
457 \new_s -> return $ Container.addTwo new_sdx
458 new_s old_sdx int_s nl
459 in (new_nl, Instance.setSec inst new_sdx, old_pdx, new_sdx)
461 -- Replace the secondary and failover (r:np, f)
462 applyMove nl inst (ReplaceAndFailover new_pdx) =
463 let old_pdx = Instance.pnode inst
464 old_sdx = Instance.snode inst
465 old_p = Container.find old_pdx nl
466 old_s = Container.find old_sdx nl
467 tgt_n = Container.find new_pdx nl
468 int_p = Node.removePri old_p inst
469 int_s = Node.removeSec old_s inst
470 new_nl = do -- Maybe monad
471 new_p <- Node.addPri tgt_n inst
472 new_s <- Node.addSec int_p inst new_pdx
473 return $ Container.add new_pdx new_p $
474 Container.addTwo old_pdx new_s old_sdx int_s nl
475 in (new_nl, Instance.setBoth inst new_pdx old_pdx, new_pdx, old_pdx)
477 -- Failver and replace the secondary (f, r:ns)
478 applyMove nl inst (FailoverAndReplace new_sdx) =
479 let old_pdx = Instance.pnode inst
480 old_sdx = Instance.snode inst
481 old_p = Container.find old_pdx nl
482 old_s = Container.find old_sdx nl
483 tgt_n = Container.find new_sdx nl
484 int_p = Node.removePri old_p inst
485 int_s = Node.removeSec old_s inst
486 new_nl = do -- Maybe monad
487 new_p <- Node.addPri int_s inst
488 new_s <- Node.addSec tgt_n inst old_sdx
489 return $ Container.add new_sdx new_s $
490 Container.addTwo old_sdx new_p old_pdx int_p nl
491 in (new_nl, Instance.setBoth inst old_sdx new_sdx, old_sdx, new_sdx)
493 -- | Tries to allocate an instance on one given node.
494 allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
495 -> (Maybe Node.List, Instance.Instance)
496 allocateOnSingle nl inst p =
497 let new_pdx = Node.idx p
498 new_nl = Node.addPri p inst >>= \new_p ->
499 return $ Container.add new_pdx new_p nl
500 in (new_nl, Instance.setBoth inst new_pdx Node.noSecondary)
502 -- | Tries to allocate an instance on a given pair of nodes.
503 allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
504 -> (Maybe Node.List, Instance.Instance)
505 allocateOnPair nl inst tgt_p tgt_s =
506 let new_pdx = Node.idx tgt_p
507 new_sdx = Node.idx tgt_s
508 new_nl = do -- Maybe monad
509 new_p <- Node.addPri tgt_p inst
510 new_s <- Node.addSec tgt_s inst new_pdx
511 return $ Container.addTwo new_pdx new_p new_sdx new_s nl
512 in (new_nl, Instance.setBoth inst new_pdx new_sdx)
514 -- | Tries to perform an instance move and returns the best table
515 -- between the original one and the new one.
516 checkSingleStep :: Table -- ^ The original table
517 -> Instance.Instance -- ^ The instance to move
518 -> Table -- ^ The current best table
519 -> IMove -- ^ The move to apply
520 -> Table -- ^ The final best table
521 checkSingleStep ini_tbl target cur_tbl move =
523 Table ini_nl ini_il _ ini_plc = ini_tbl
524 (tmp_nl, new_inst, pri_idx, sec_idx) = applyMove ini_nl target move
526 if isNothing tmp_nl then cur_tbl
528 let tgt_idx = Instance.idx target
529 upd_nl = fromJust tmp_nl
530 upd_cvar = compCV upd_nl
531 upd_il = Container.add tgt_idx new_inst ini_il
532 upd_plc = (tgt_idx, pri_idx, sec_idx, upd_cvar):ini_plc
533 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
535 compareTables cur_tbl upd_tbl
537 -- | Given the status of the current secondary as a valid new node
538 -- and the current candidate target node,
539 -- generate the possible moves for a instance.
540 possibleMoves :: Bool -> Ndx -> [IMove]
541 possibleMoves True tdx =
542 [ReplaceSecondary tdx,
543 ReplaceAndFailover tdx,
545 FailoverAndReplace tdx]
547 possibleMoves False tdx =
548 [ReplaceSecondary tdx,
549 ReplaceAndFailover tdx]
551 -- | Compute the best move for a given instance.
552 checkInstanceMove :: [Ndx] -- Allowed target node indices
553 -> Table -- Original table
554 -> Instance.Instance -- Instance to move
555 -> Table -- Best new table for this instance
556 checkInstanceMove nodes_idx ini_tbl target =
558 opdx = Instance.pnode target
559 osdx = Instance.snode target
560 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
561 use_secondary = elem osdx nodes_idx
562 aft_failover = if use_secondary -- if allowed to failover
563 then checkSingleStep ini_tbl target ini_tbl Failover
565 all_moves = concatMap (possibleMoves use_secondary) nodes
567 -- iterate over the possible nodes for this instance
568 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
570 -- | Compute the best next move.
571 checkMove :: [Ndx] -- ^ Allowed target node indices
572 -> Table -- ^ The current solution
573 -> [Instance.Instance] -- ^ List of instances still to move
574 -> Table -- ^ The new solution
575 checkMove nodes_idx ini_tbl victims =
576 let Table _ _ _ ini_plc = ini_tbl
577 -- iterate over all instances, computing the best move
581 if Instance.snode elem == Node.noSecondary then step_tbl
582 else compareTables step_tbl $
583 checkInstanceMove nodes_idx ini_tbl elem)
585 Table _ _ _ best_plc = best_tbl
587 if length best_plc == length ini_plc then -- no advancement
593 -- * Formatting functions
595 -- | Given the original and final nodes, computes the relocation description.
596 computeMoves :: String -- ^ The instance name
597 -> String -- ^ Original primary
598 -> String -- ^ Original secondary
599 -> String -- ^ New primary
600 -> String -- ^ New secondary
601 -> (String, [String])
602 -- ^ Tuple of moves and commands list; moves is containing
603 -- either @/f/@ for failover or @/r:name/@ for replace
604 -- secondary, while the command list holds gnt-instance
605 -- commands (without that prefix), e.g \"@failover instance1@\"
606 computeMoves i a b c d =
607 if c == a then {- Same primary -}
608 if d == b then {- Same sec??! -}
610 else {- Change of secondary -}
612 [printf "replace-disks -n %s %s" d i])
614 if c == b then {- Failover and ... -}
615 if d == a then {- that's all -}
616 ("f", [printf "migrate -f %s" i])
619 [printf "migrate -f %s" i,
620 printf "replace-disks -n %s %s" d i])
622 if d == a then {- ... and keep primary as secondary -}
624 [printf "replace-disks -n %s %s" c i,
625 printf "migrate -f %s" i])
627 if d == b then {- ... keep same secondary -}
628 (printf "f r:%s f" c,
629 [printf "migrate -f %s" i,
630 printf "replace-disks -n %s %s" c i,
631 printf "migrate -f %s" i])
633 else {- Nothing in common -}
634 (printf "r:%s f r:%s" c d,
635 [printf "replace-disks -n %s %s" c i,
636 printf "migrate -f %s" i,
637 printf "replace-disks -n %s %s" d i])
639 -- | Converts a placement to string format.
640 printSolutionLine :: Node.List -- ^ The node list
641 -> Instance.List -- ^ The instance list
642 -> Int -- ^ Maximum node name length
643 -> Int -- ^ Maximum instance name length
644 -> Placement -- ^ The current placement
645 -> Int -- ^ The index of the placement in
647 -> (String, [String])
648 printSolutionLine nl il nmlen imlen plc pos =
650 pmlen = (2*nmlen + 1)
652 inst = Container.find i il
653 inam = Instance.name inst
654 npri = Container.nameOf nl p
655 nsec = Container.nameOf nl s
656 opri = Container.nameOf nl $ Instance.pnode inst
657 osec = Container.nameOf nl $ Instance.snode inst
658 (moves, cmds) = computeMoves inam opri osec npri nsec
659 ostr = (printf "%s:%s" opri osec)::String
660 nstr = (printf "%s:%s" npri nsec)::String
662 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
663 pos imlen inam pmlen ostr
667 -- | Given a list of commands, prefix them with @gnt-instance@ and
668 -- also beautify the display a little.
669 formatCmds :: [[String]] -> String
670 formatCmds cmd_strs =
672 concat $ map (\(a, b) ->
673 (printf "echo step %d" (a::Int)):
675 (map ("gnt-instance " ++) b)) $
678 -- | Converts a solution to string format.
679 printSolution :: Node.List
682 -> ([String], [[String]])
683 printSolution nl il sol =
685 nmlen = Container.maxNameLen nl
686 imlen = Container.maxNameLen il
688 unzip $ map (uncurry $ printSolutionLine nl il nmlen imlen) $
691 -- | Print the node list.
692 printNodes :: Node.List -> String
694 let snl = sortBy (compare `on` Node.idx) (Container.elems nl)
695 m_name = maximum . map (length . Node.name) $ snl
696 helper = Node.list m_name
698 "%2s %-*s %5s %5s %5s %5s %5s %5s %5s %5s %3s %3s %7s %7s"
700 "t_mem" "n_mem" "i_mem" "x_mem" "f_mem" "r_mem"
702 "pri" "sec" "p_fmem" "p_fdsk"
703 in unlines $ (header:map helper snl)
705 -- | Shows statistics for a given node list.
706 printStats :: Node.List -> String
708 let (mem_cv, dsk_cv, n1_score, res_cv, off_score) = compDetailedCV nl
709 in printf "f_mem=%.8f, r_mem=%.8f, f_dsk=%.8f, n1=%.3f, uf=%.3f"
710 mem_cv res_cv dsk_cv n1_score off_score