1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the "Main" module for the individual binaries.
10 Copyright (C) 2009 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
38 -- * Generic functions
40 -- * First phase functions
42 -- * Second phase functions
47 -- * Balacing functions
51 -- * IAllocator functions
58 import Text.Printf (printf)
62 import qualified Ganeti.HTools.Container as Container
63 import qualified Ganeti.HTools.Instance as Instance
64 import qualified Ganeti.HTools.Node as Node
65 import Ganeti.HTools.Types
66 import Ganeti.HTools.Utils
70 -- | A separate name for the cluster score type.
73 -- | The description of an instance placement.
74 type Placement = (Idx, Ndx, Ndx, Score)
76 -- | Allocation\/relocation solution.
77 type AllocSolution = ([FailMode], Int, Maybe (Score, AllocElement))
79 -- | Allocation\/relocation element.
80 type AllocElement = (Node.List, Instance.Instance, [Node.Node])
82 -- | An instance move definition
83 data IMove = Failover -- ^ Failover the instance (f)
84 | ReplacePrimary Ndx -- ^ Replace primary (f, r:np, f)
85 | ReplaceSecondary Ndx -- ^ Replace secondary (r:ns)
86 | ReplaceAndFailover Ndx -- ^ Replace secondary, failover (r:np, f)
87 | FailoverAndReplace Ndx -- ^ Failover, replace secondary (f, r:ns)
90 -- | The complete state for the balancing solution
91 data Table = Table Node.List Instance.List Score [Placement]
94 data CStats = CStats { cs_fmem :: Int -- ^ Cluster free mem
95 , cs_fdsk :: Int -- ^ Cluster free disk
96 , cs_amem :: Int -- ^ Cluster allocatable mem
97 , cs_adsk :: Int -- ^ Cluster allocatable disk
98 , cs_acpu :: Int -- ^ Cluster allocatable cpus
99 , cs_mmem :: Int -- ^ Max node allocatable mem
100 , cs_mdsk :: Int -- ^ Max node allocatable disk
101 , cs_mcpu :: Int -- ^ Max node allocatable cpu
102 , cs_imem :: Int -- ^ Instance used mem
103 , cs_idsk :: Int -- ^ Instance used disk
104 , cs_icpu :: Int -- ^ Instance used cpu
105 , cs_tmem :: Double -- ^ Cluster total mem
106 , cs_tdsk :: Double -- ^ Cluster total disk
107 , cs_tcpu :: Double -- ^ Cluster total cpus
108 , cs_xmem :: Int -- ^ Unnacounted for mem
109 , cs_nmem :: Int -- ^ Node own memory
110 , cs_score :: Score -- ^ The cluster score
111 , cs_ninst :: Int -- ^ The total number of instances
114 -- * Utility functions
116 -- | Verifies the N+1 status and return the affected nodes.
117 verifyN1 :: [Node.Node] -> [Node.Node]
118 verifyN1 = filter Node.failN1
120 {-| Computes the pair of bad nodes and instances.
122 The bad node list is computed via a simple 'verifyN1' check, and the
123 bad instance list is the list of primary and secondary instances of
127 computeBadItems :: Node.List -> Instance.List ->
128 ([Node.Node], [Instance.Instance])
129 computeBadItems nl il =
130 let bad_nodes = verifyN1 $ getOnline nl
131 bad_instances = map (\idx -> Container.find idx il) .
133 concatMap (\ n -> Node.slist n ++ Node.plist n) bad_nodes
135 (bad_nodes, bad_instances)
137 emptyCStats :: CStats
138 emptyCStats = CStats { cs_fmem = 0
158 updateCStats :: CStats -> Node.Node -> CStats
159 updateCStats cs node =
160 let CStats { cs_fmem = x_fmem, cs_fdsk = x_fdsk,
161 cs_amem = x_amem, cs_acpu = x_acpu, cs_adsk = x_adsk,
162 cs_mmem = x_mmem, cs_mdsk = x_mdsk, cs_mcpu = x_mcpu,
163 cs_imem = x_imem, cs_idsk = x_idsk, cs_icpu = x_icpu,
164 cs_tmem = x_tmem, cs_tdsk = x_tdsk, cs_tcpu = x_tcpu,
165 cs_xmem = x_xmem, cs_nmem = x_nmem, cs_ninst = x_ninst
168 inc_amem = Node.f_mem node - Node.r_mem node
169 inc_amem' = if inc_amem > 0 then inc_amem else 0
170 inc_adsk = Node.availDisk node
171 inc_imem = truncate (Node.t_mem node) - Node.n_mem node
172 - Node.x_mem node - Node.f_mem node
173 inc_icpu = Node.u_cpu node
174 inc_idsk = truncate (Node.t_dsk node) - Node.f_dsk node
176 in cs { cs_fmem = x_fmem + Node.f_mem node
177 , cs_fdsk = x_fdsk + Node.f_dsk node
178 , cs_amem = x_amem + inc_amem'
179 , cs_adsk = x_adsk + inc_adsk
181 , cs_mmem = max x_mmem inc_amem'
182 , cs_mdsk = max x_mdsk inc_adsk
184 , cs_imem = x_imem + inc_imem
185 , cs_idsk = x_idsk + inc_idsk
186 , cs_icpu = x_icpu + inc_icpu
187 , cs_tmem = x_tmem + Node.t_mem node
188 , cs_tdsk = x_tdsk + Node.t_dsk node
189 , cs_tcpu = x_tcpu + Node.t_cpu node
190 , cs_xmem = x_xmem + Node.x_mem node
191 , cs_nmem = x_nmem + Node.n_mem node
192 , cs_ninst = x_ninst + length (Node.plist node)
195 -- | Compute the total free disk and memory in the cluster.
196 totalResources :: Node.List -> CStats
198 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
199 in cs { cs_score = compCV nl }
201 -- | Compute the mem and disk covariance.
202 compDetailedCV :: Node.List -> (Double, Double, Double, Double, Double, Double)
205 all_nodes = Container.elems nl
206 (offline, nodes) = partition Node.offline all_nodes
207 mem_l = map Node.p_mem nodes
208 dsk_l = map Node.p_dsk nodes
209 mem_cv = varianceCoeff mem_l
210 dsk_cv = varianceCoeff dsk_l
211 n1_l = length $ filter Node.failN1 nodes
212 n1_score = fromIntegral n1_l /
213 fromIntegral (length nodes)::Double
214 res_l = map Node.p_rem nodes
215 res_cv = varianceCoeff res_l
216 offline_inst = sum . map (\n -> (length . Node.plist $ n) +
217 (length . Node.slist $ n)) $ offline
218 online_inst = sum . map (\n -> (length . Node.plist $ n) +
219 (length . Node.slist $ n)) $ nodes
220 off_score = if offline_inst == 0
222 else fromIntegral offline_inst /
223 fromIntegral (offline_inst + online_inst)::Double
224 cpu_l = map Node.p_cpu nodes
225 cpu_cv = varianceCoeff cpu_l
226 in (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv)
228 -- | Compute the /total/ variance.
229 compCV :: Node.List -> Double
231 let (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv) =
233 in mem_cv + dsk_cv + n1_score + res_cv + off_score + cpu_cv
235 -- | Compute online nodes from a Node.List
236 getOnline :: Node.List -> [Node.Node]
237 getOnline = filter (not . Node.offline) . Container.elems
241 -- | Compute best table. Note that the ordering of the arguments is important.
242 compareTables :: Table -> Table -> Table
243 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
244 if a_cv > b_cv then b else a
246 -- | Applies an instance move to a given node list and instance.
247 applyMove :: Node.List -> Instance.Instance
248 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
250 applyMove nl inst Failover =
251 let old_pdx = Instance.pnode inst
252 old_sdx = Instance.snode inst
253 old_p = Container.find old_pdx nl
254 old_s = Container.find old_sdx nl
255 int_p = Node.removePri old_p inst
256 int_s = Node.removeSec old_s inst
257 new_nl = do -- Maybe monad
258 new_p <- Node.addPri int_s inst
259 new_s <- Node.addSec int_p inst old_sdx
260 let new_inst = Instance.setBoth inst old_sdx old_pdx
261 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
262 new_inst, old_sdx, old_pdx)
265 -- Replace the primary (f:, r:np, f)
266 applyMove nl inst (ReplacePrimary new_pdx) =
267 let old_pdx = Instance.pnode inst
268 old_sdx = Instance.snode inst
269 old_p = Container.find old_pdx nl
270 old_s = Container.find old_sdx nl
271 tgt_n = Container.find new_pdx nl
272 int_p = Node.removePri old_p inst
273 int_s = Node.removeSec old_s inst
274 new_nl = do -- Maybe monad
275 -- check that the current secondary can host the instance
276 -- during the migration
277 tmp_s <- Node.addPri int_s inst
278 let tmp_s' = Node.removePri tmp_s inst
279 new_p <- Node.addPri tgt_n inst
280 new_s <- Node.addSec tmp_s' inst new_pdx
281 let new_inst = Instance.setPri inst new_pdx
282 return (Container.add new_pdx new_p $
283 Container.addTwo old_pdx int_p old_sdx new_s nl,
284 new_inst, new_pdx, old_sdx)
287 -- Replace the secondary (r:ns)
288 applyMove nl inst (ReplaceSecondary new_sdx) =
289 let old_pdx = Instance.pnode inst
290 old_sdx = Instance.snode inst
291 old_s = Container.find old_sdx nl
292 tgt_n = Container.find new_sdx nl
293 int_s = Node.removeSec old_s inst
294 new_inst = Instance.setSec inst new_sdx
295 new_nl = Node.addSec tgt_n inst old_pdx >>=
296 \new_s -> return (Container.addTwo new_sdx
297 new_s old_sdx int_s nl,
298 new_inst, old_pdx, new_sdx)
301 -- Replace the secondary and failover (r:np, f)
302 applyMove nl inst (ReplaceAndFailover new_pdx) =
303 let old_pdx = Instance.pnode inst
304 old_sdx = Instance.snode inst
305 old_p = Container.find old_pdx nl
306 old_s = Container.find old_sdx nl
307 tgt_n = Container.find new_pdx nl
308 int_p = Node.removePri old_p inst
309 int_s = Node.removeSec old_s inst
310 new_nl = do -- Maybe monad
311 new_p <- Node.addPri tgt_n inst
312 new_s <- Node.addSec int_p inst new_pdx
313 let new_inst = Instance.setBoth inst new_pdx old_pdx
314 return (Container.add new_pdx new_p $
315 Container.addTwo old_pdx new_s old_sdx int_s nl,
316 new_inst, new_pdx, old_pdx)
319 -- Failver and replace the secondary (f, r:ns)
320 applyMove nl inst (FailoverAndReplace new_sdx) =
321 let old_pdx = Instance.pnode inst
322 old_sdx = Instance.snode inst
323 old_p = Container.find old_pdx nl
324 old_s = Container.find old_sdx nl
325 tgt_n = Container.find new_sdx nl
326 int_p = Node.removePri old_p inst
327 int_s = Node.removeSec old_s inst
328 new_nl = do -- Maybe monad
329 new_p <- Node.addPri int_s inst
330 new_s <- Node.addSec tgt_n inst old_sdx
331 let new_inst = Instance.setBoth inst old_sdx new_sdx
332 return (Container.add new_sdx new_s $
333 Container.addTwo old_sdx new_p old_pdx int_p nl,
334 new_inst, old_sdx, new_sdx)
337 -- | Tries to allocate an instance on one given node.
338 allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
339 -> OpResult AllocElement
340 allocateOnSingle nl inst p =
341 let new_pdx = Node.idx p
342 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
343 new_nl = Node.addPri p inst >>= \new_p ->
344 return (Container.add new_pdx new_p nl, new_inst, [new_p])
347 -- | Tries to allocate an instance on a given pair of nodes.
348 allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
349 -> OpResult AllocElement
350 allocateOnPair nl inst tgt_p tgt_s =
351 let new_pdx = Node.idx tgt_p
352 new_sdx = Node.idx tgt_s
353 new_nl = do -- Maybe monad
354 new_p <- Node.addPri tgt_p inst
355 new_s <- Node.addSec tgt_s inst new_pdx
356 let new_inst = Instance.setBoth inst new_pdx new_sdx
357 return (Container.addTwo new_pdx new_p new_sdx new_s nl, new_inst,
361 -- | Tries to perform an instance move and returns the best table
362 -- between the original one and the new one.
363 checkSingleStep :: Table -- ^ The original table
364 -> Instance.Instance -- ^ The instance to move
365 -> Table -- ^ The current best table
366 -> IMove -- ^ The move to apply
367 -> Table -- ^ The final best table
368 checkSingleStep ini_tbl target cur_tbl move =
370 Table ini_nl ini_il _ ini_plc = ini_tbl
371 tmp_resu = applyMove ini_nl target move
375 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
376 let tgt_idx = Instance.idx target
377 upd_cvar = compCV upd_nl
378 upd_il = Container.add tgt_idx new_inst ini_il
379 upd_plc = (tgt_idx, pri_idx, sec_idx, upd_cvar):ini_plc
380 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
382 compareTables cur_tbl upd_tbl
384 -- | Given the status of the current secondary as a valid new node
385 -- and the current candidate target node,
386 -- generate the possible moves for a instance.
387 possibleMoves :: Bool -> Ndx -> [IMove]
388 possibleMoves True tdx =
389 [ReplaceSecondary tdx,
390 ReplaceAndFailover tdx,
392 FailoverAndReplace tdx]
394 possibleMoves False tdx =
395 [ReplaceSecondary tdx,
396 ReplaceAndFailover tdx]
398 -- | Compute the best move for a given instance.
399 checkInstanceMove :: [Ndx] -- Allowed target node indices
400 -> Table -- Original table
401 -> Instance.Instance -- Instance to move
402 -> Table -- Best new table for this instance
403 checkInstanceMove nodes_idx ini_tbl target =
405 opdx = Instance.pnode target
406 osdx = Instance.snode target
407 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
408 use_secondary = elem osdx nodes_idx
409 aft_failover = if use_secondary -- if allowed to failover
410 then checkSingleStep ini_tbl target ini_tbl Failover
412 all_moves = concatMap (possibleMoves use_secondary) nodes
414 -- iterate over the possible nodes for this instance
415 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
417 -- | Compute the best next move.
418 checkMove :: [Ndx] -- ^ Allowed target node indices
419 -> Table -- ^ The current solution
420 -> [Instance.Instance] -- ^ List of instances still to move
421 -> Table -- ^ The new solution
422 checkMove nodes_idx ini_tbl victims =
423 let Table _ _ _ ini_plc = ini_tbl
424 -- iterate over all instances, computing the best move
428 if Instance.snode elem == Node.noSecondary then step_tbl
429 else compareTables step_tbl $
430 checkInstanceMove nodes_idx ini_tbl elem)
432 Table _ _ _ best_plc = best_tbl
434 if length best_plc == length ini_plc then -- no advancement
439 -- * Allocation functions
441 -- | Build failure stats out of a list of failures
442 collapseFailures :: [FailMode] -> FailStats
443 collapseFailures flst =
444 map (\k -> (k, length $ filter ((==) k) flst)) [minBound..maxBound]
446 -- | Update current Allocation solution and failure stats with new
448 concatAllocs :: AllocSolution -> OpResult AllocElement -> AllocSolution
449 concatAllocs (flst, succ, sols) (OpFail reason) = (reason:flst, succ, sols)
451 concatAllocs (flst, succ, osols) (OpGood ns@(nl, _, _)) =
452 let nscore = compCV nl
453 -- Choose the old or new solution, based on the cluster score
454 nsols = case osols of
455 Nothing -> Just (nscore, ns)
459 else Just (nscore, ns)
461 -- Note: we force evaluation of nsols here in order to keep the
462 -- memory profile low - we know that we will need nsols for sure
463 -- in the next cycle, so we force evaluation of nsols, since the
464 -- foldl' in the caller will only evaluate the tuple, but not the
465 -- elements of the tuple
466 in nsols `seq` nsuc `seq` (flst, nsuc, nsols)
468 -- | Try to allocate an instance on the cluster.
469 tryAlloc :: (Monad m) =>
470 Node.List -- ^ The node list
471 -> Instance.List -- ^ The instance list
472 -> Instance.Instance -- ^ The instance to allocate
473 -> Int -- ^ Required number of nodes
474 -> m AllocSolution -- ^ Possible solution list
475 tryAlloc nl _ inst 2 =
476 let all_nodes = getOnline nl
477 all_pairs = liftM2 (,) all_nodes all_nodes
478 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
479 sols = foldl' (\cstate (p, s) ->
480 concatAllocs cstate $ allocateOnPair nl inst p s
481 ) ([], 0, Nothing) ok_pairs
484 tryAlloc nl _ inst 1 =
485 let all_nodes = getOnline nl
486 sols = foldl' (\cstate ->
487 concatAllocs cstate . allocateOnSingle nl inst
488 ) ([], 0, Nothing) all_nodes
491 tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
492 \destinations required (" ++ show reqn ++
493 "), only two supported"
495 -- | Try to allocate an instance on the cluster.
496 tryReloc :: (Monad m) =>
497 Node.List -- ^ The node list
498 -> Instance.List -- ^ The instance list
499 -> Idx -- ^ The index of the instance to move
500 -> Int -- ^ The number of nodes required
501 -> [Ndx] -- ^ Nodes which should not be used
502 -> m AllocSolution -- ^ Solution list
503 tryReloc nl il xid 1 ex_idx =
504 let all_nodes = getOnline nl
505 inst = Container.find xid il
506 ex_idx' = Instance.pnode inst:ex_idx
507 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
508 valid_idxes = map Node.idx valid_nodes
509 sols1 = foldl' (\cstate x ->
512 applyMove nl inst (ReplaceSecondary x)
513 return (mnl, i, [Container.find x mnl])
514 in concatAllocs cstate elem
515 ) ([], 0, Nothing) valid_idxes
518 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
519 \destinations required (" ++ show reqn ++
520 "), only one supported"
522 -- * Formatting functions
524 -- | Given the original and final nodes, computes the relocation description.
525 computeMoves :: String -- ^ The instance name
526 -> String -- ^ Original primary
527 -> String -- ^ Original secondary
528 -> String -- ^ New primary
529 -> String -- ^ New secondary
530 -> (String, [String])
531 -- ^ Tuple of moves and commands list; moves is containing
532 -- either @/f/@ for failover or @/r:name/@ for replace
533 -- secondary, while the command list holds gnt-instance
534 -- commands (without that prefix), e.g \"@failover instance1@\"
535 computeMoves i a b c d
539 then {- Same sec??! -} ("-", [])
540 else {- Change of secondary -}
541 (printf "r:%s" d, [rep d])
545 then {- that's all -} ("f", [mig])
546 else (printf "f r:%s" d, [mig, rep d])
547 -- ... and keep primary as secondary
549 (printf "r:%s f" c, [rep c, mig])
550 -- ... keep same secondary
552 (printf "f r:%s f" c, [mig, rep c, mig])
553 -- nothing in common -
555 (printf "r:%s f r:%s" c d, [rep c, mig, rep d])
556 where mig = printf "migrate -f %s" i::String
557 rep n = printf "replace-disks -n %s %s" n i
559 -- | Converts a placement to string format.
560 printSolutionLine :: Node.List -- ^ The node list
561 -> Instance.List -- ^ The instance list
562 -> Int -- ^ Maximum node name length
563 -> Int -- ^ Maximum instance name length
564 -> Placement -- ^ The current placement
565 -> Int -- ^ The index of the placement in
567 -> (String, [String])
568 printSolutionLine nl il nmlen imlen plc pos =
570 pmlen = (2*nmlen + 1)
572 inst = Container.find i il
573 inam = Instance.name inst
574 npri = Container.nameOf nl p
575 nsec = Container.nameOf nl s
576 opri = Container.nameOf nl $ Instance.pnode inst
577 osec = Container.nameOf nl $ Instance.snode inst
578 (moves, cmds) = computeMoves inam opri osec npri nsec
579 ostr = printf "%s:%s" opri osec::String
580 nstr = printf "%s:%s" npri nsec::String
582 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
583 pos imlen inam pmlen ostr
587 -- | Given a list of commands, prefix them with @gnt-instance@ and
588 -- also beautify the display a little.
589 formatCmds :: [[String]] -> String
592 concatMap (\(a, b) ->
593 printf "echo step %d" (a::Int):
595 map ("gnt-instance " ++) b
599 -- | Converts a solution to string format.
600 printSolution :: Node.List
603 -> ([String], [[String]])
604 printSolution nl il sol =
606 nmlen = Container.maxNameLen nl
607 imlen = Container.maxNameLen il
609 unzip $ zipWith (printSolutionLine nl il nmlen imlen) sol [1..]
611 -- | Print the node list.
612 printNodes :: Node.List -> String
614 let snl = sortBy (compare `on` Node.idx) (Container.elems nl)
615 m_name = maximum . map (length . Node.name) $ snl
616 helper = Node.list m_name
618 "%2s %-*s %5s %5s %5s %5s %5s %5s %5s %5s %4s %4s \
619 \%3s %3s %6s %6s %5s"
621 "t_mem" "n_mem" "i_mem" "x_mem" "f_mem" "r_mem"
622 "t_dsk" "f_dsk" "pcpu" "vcpu"
623 "pri" "sec" "p_fmem" "p_fdsk" "r_cpu"::String
624 in unlines (header:map helper snl)
626 -- | Shows statistics for a given node list.
627 printStats :: Node.List -> String
629 let (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv) =
631 in printf "f_mem=%.8f, r_mem=%.8f, f_dsk=%.8f, n1=%.3f, \
632 \uf=%.3f, r_cpu=%.3f"
633 mem_cv res_cv dsk_cv n1_score off_score cpu_cv