1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the "Main" module for the individual binaries.
10 Copyright (C) 2009 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
38 -- * Generic functions
40 -- * First phase functions
42 -- * Second phase functions
47 -- * Balacing functions
52 -- * IAllocator functions
60 import Data.Maybe (isNothing, fromJust)
61 import Text.Printf (printf)
65 import qualified Ganeti.HTools.Container as Container
66 import qualified Ganeti.HTools.Instance as Instance
67 import qualified Ganeti.HTools.Node as Node
68 import Ganeti.HTools.Types
69 import Ganeti.HTools.Utils
73 -- | A separate name for the cluster score type.
76 -- | The description of an instance placement.
77 type Placement = (Idx, Ndx, Ndx, Score)
79 -- | Allocation\/relocation solution.
80 type AllocSolution = [(Maybe Node.List, Instance.Instance, [Node.Node])]
82 -- | An instance move definition
83 data IMove = Failover -- ^ Failover the instance (f)
84 | ReplacePrimary Ndx -- ^ Replace primary (f, r:np, f)
85 | ReplaceSecondary Ndx -- ^ Replace secondary (r:ns)
86 | ReplaceAndFailover Ndx -- ^ Replace secondary, failover (r:np, f)
87 | FailoverAndReplace Ndx -- ^ Failover, replace secondary (f, r:ns)
90 -- | The complete state for the balancing solution
91 data Table = Table Node.List Instance.List Score [Placement]
94 data CStats = CStats { cs_fmem :: Int -- ^ Cluster free mem
95 , cs_fdsk :: Int -- ^ Cluster free disk
96 , cs_amem :: Int -- ^ Cluster allocatable mem
97 , cs_adsk :: Int -- ^ Cluster allocatable disk
98 , cs_acpu :: Int -- ^ Cluster allocatable cpus
99 , cs_mmem :: Int -- ^ Max node allocatable mem
100 , cs_mdsk :: Int -- ^ Max node allocatable disk
101 , cs_mcpu :: Int -- ^ Max node allocatable cpu
104 -- * Utility functions
106 -- | Verifies the N+1 status and return the affected nodes.
107 verifyN1 :: [Node.Node] -> [Node.Node]
108 verifyN1 nl = filter Node.failN1 nl
110 {-| Computes the pair of bad nodes and instances.
112 The bad node list is computed via a simple 'verifyN1' check, and the
113 bad instance list is the list of primary and secondary instances of
117 computeBadItems :: Node.List -> Instance.List ->
118 ([Node.Node], [Instance.Instance])
119 computeBadItems nl il =
120 let bad_nodes = verifyN1 $ getOnline nl
121 bad_instances = map (\idx -> Container.find idx il) $
122 sort $ nub $ concat $
123 map (\ n -> (Node.slist n) ++ (Node.plist n)) bad_nodes
125 (bad_nodes, bad_instances)
127 emptyCStats :: CStats
128 emptyCStats = CStats { cs_fmem = 0
138 updateCStats :: CStats -> Node.Node -> CStats
139 updateCStats cs node =
140 let CStats { cs_fmem = x_fmem, cs_fdsk = x_fdsk,
141 cs_amem = x_amem, cs_acpu = x_acpu, cs_adsk = x_adsk,
142 cs_mmem = x_mmem, cs_mdsk = x_mdsk, cs_mcpu = x_mcpu }
144 inc_amem = (Node.f_mem node) - (Node.r_mem node)
145 inc_amem' = if inc_amem > 0 then inc_amem else 0
146 inc_adsk = Node.availDisk node
147 in CStats { cs_fmem = x_fmem + (Node.f_mem node)
148 , cs_fdsk = x_fdsk + (Node.f_dsk node)
149 , cs_amem = x_amem + inc_amem'
150 , cs_adsk = x_adsk + inc_adsk
152 , cs_mmem = max x_mmem inc_amem'
153 , cs_mdsk = max x_mdsk inc_adsk
157 -- | Compute the total free disk and memory in the cluster.
158 totalResources :: Node.List -> CStats
159 totalResources = foldl' updateCStats emptyCStats . Container.elems
161 -- | Compute the mem and disk covariance.
162 compDetailedCV :: Node.List -> (Double, Double, Double, Double, Double, Double)
165 all_nodes = Container.elems nl
166 (offline, nodes) = partition Node.offline all_nodes
167 mem_l = map Node.p_mem nodes
168 dsk_l = map Node.p_dsk nodes
169 mem_cv = varianceCoeff mem_l
170 dsk_cv = varianceCoeff dsk_l
171 n1_l = length $ filter Node.failN1 nodes
172 n1_score = ((fromIntegral n1_l) /
173 (fromIntegral $ length nodes))::Double
174 res_l = map Node.p_rem nodes
175 res_cv = varianceCoeff res_l
176 offline_inst = sum . map (\n -> (length . Node.plist $ n) +
177 (length . Node.slist $ n)) $ offline
178 online_inst = sum . map (\n -> (length . Node.plist $ n) +
179 (length . Node.slist $ n)) $ nodes
180 off_score = ((fromIntegral offline_inst) /
181 (fromIntegral $ online_inst + offline_inst))::Double
182 cpu_l = map Node.p_cpu nodes
183 cpu_cv = varianceCoeff cpu_l
184 in (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv)
186 -- | Compute the /total/ variance.
187 compCV :: Node.List -> Double
189 let (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv) =
191 in mem_cv + dsk_cv + n1_score + res_cv + off_score + cpu_cv
193 -- | Compute online nodes from a Node.List
194 getOnline :: Node.List -> [Node.Node]
195 getOnline = filter (not . Node.offline) . Container.elems
199 -- | Compute best table. Note that the ordering of the arguments is important.
200 compareTables :: Table -> Table -> Table
201 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
202 if a_cv > b_cv then b else a
204 -- | Applies an instance move to a given node list and instance.
205 applyMove :: Node.List -> Instance.Instance
206 -> IMove -> (Maybe Node.List, Instance.Instance, Ndx, Ndx)
208 applyMove nl inst Failover =
209 let old_pdx = Instance.pnode inst
210 old_sdx = Instance.snode inst
211 old_p = Container.find old_pdx nl
212 old_s = Container.find old_sdx nl
213 int_p = Node.removePri old_p inst
214 int_s = Node.removeSec old_s inst
215 new_nl = do -- Maybe monad
216 new_p <- Node.addPri int_s inst
217 new_s <- Node.addSec int_p inst old_sdx
218 return $ Container.addTwo old_pdx new_s old_sdx new_p nl
219 in (new_nl, Instance.setBoth inst old_sdx old_pdx, old_sdx, old_pdx)
221 -- Replace the primary (f:, r:np, f)
222 applyMove nl inst (ReplacePrimary new_pdx) =
223 let old_pdx = Instance.pnode inst
224 old_sdx = Instance.snode inst
225 old_p = Container.find old_pdx nl
226 old_s = Container.find old_sdx nl
227 tgt_n = Container.find new_pdx nl
228 int_p = Node.removePri old_p inst
229 int_s = Node.removeSec old_s inst
230 new_nl = do -- Maybe monad
231 -- check that the current secondary can host the instance
232 -- during the migration
233 tmp_s <- Node.addPri int_s inst
234 let tmp_s' = Node.removePri tmp_s inst
235 new_p <- Node.addPri tgt_n inst
236 new_s <- Node.addSec tmp_s' inst new_pdx
237 return $ Container.add new_pdx new_p $
238 Container.addTwo old_pdx int_p old_sdx new_s nl
239 in (new_nl, Instance.setPri inst new_pdx, new_pdx, old_sdx)
241 -- Replace the secondary (r:ns)
242 applyMove nl inst (ReplaceSecondary new_sdx) =
243 let old_pdx = Instance.pnode inst
244 old_sdx = Instance.snode inst
245 old_s = Container.find old_sdx nl
246 tgt_n = Container.find new_sdx nl
247 int_s = Node.removeSec old_s inst
248 new_nl = Node.addSec tgt_n inst old_pdx >>=
249 \new_s -> return $ Container.addTwo new_sdx
250 new_s old_sdx int_s nl
251 in (new_nl, Instance.setSec inst new_sdx, old_pdx, new_sdx)
253 -- Replace the secondary and failover (r:np, f)
254 applyMove nl inst (ReplaceAndFailover new_pdx) =
255 let old_pdx = Instance.pnode inst
256 old_sdx = Instance.snode inst
257 old_p = Container.find old_pdx nl
258 old_s = Container.find old_sdx nl
259 tgt_n = Container.find new_pdx nl
260 int_p = Node.removePri old_p inst
261 int_s = Node.removeSec old_s inst
262 new_nl = do -- Maybe monad
263 new_p <- Node.addPri tgt_n inst
264 new_s <- Node.addSec int_p inst new_pdx
265 return $ Container.add new_pdx new_p $
266 Container.addTwo old_pdx new_s old_sdx int_s nl
267 in (new_nl, Instance.setBoth inst new_pdx old_pdx, new_pdx, old_pdx)
269 -- Failver and replace the secondary (f, r:ns)
270 applyMove nl inst (FailoverAndReplace new_sdx) =
271 let old_pdx = Instance.pnode inst
272 old_sdx = Instance.snode inst
273 old_p = Container.find old_pdx nl
274 old_s = Container.find old_sdx nl
275 tgt_n = Container.find new_sdx nl
276 int_p = Node.removePri old_p inst
277 int_s = Node.removeSec old_s inst
278 new_nl = do -- Maybe monad
279 new_p <- Node.addPri int_s inst
280 new_s <- Node.addSec tgt_n inst old_sdx
281 return $ Container.add new_sdx new_s $
282 Container.addTwo old_sdx new_p old_pdx int_p nl
283 in (new_nl, Instance.setBoth inst old_sdx new_sdx, old_sdx, new_sdx)
285 -- | Tries to allocate an instance on one given node.
286 allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
287 -> (Maybe Node.List, Instance.Instance)
288 allocateOnSingle nl inst p =
289 let new_pdx = Node.idx p
290 new_nl = Node.addPri p inst >>= \new_p ->
291 return $ Container.add new_pdx new_p nl
292 in (new_nl, Instance.setBoth inst new_pdx Node.noSecondary)
294 -- | Tries to allocate an instance on a given pair of nodes.
295 allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
296 -> (Maybe Node.List, Instance.Instance)
297 allocateOnPair nl inst tgt_p tgt_s =
298 let new_pdx = Node.idx tgt_p
299 new_sdx = Node.idx tgt_s
300 new_nl = do -- Maybe monad
301 new_p <- Node.addPri tgt_p inst
302 new_s <- Node.addSec tgt_s inst new_pdx
303 return $ Container.addTwo new_pdx new_p new_sdx new_s nl
304 in (new_nl, Instance.setBoth inst new_pdx new_sdx)
306 -- | Tries to perform an instance move and returns the best table
307 -- between the original one and the new one.
308 checkSingleStep :: Table -- ^ The original table
309 -> Instance.Instance -- ^ The instance to move
310 -> Table -- ^ The current best table
311 -> IMove -- ^ The move to apply
312 -> Table -- ^ The final best table
313 checkSingleStep ini_tbl target cur_tbl move =
315 Table ini_nl ini_il _ ini_plc = ini_tbl
316 (tmp_nl, new_inst, pri_idx, sec_idx) = applyMove ini_nl target move
318 if isNothing tmp_nl then cur_tbl
320 let tgt_idx = Instance.idx target
321 upd_nl = fromJust tmp_nl
322 upd_cvar = compCV upd_nl
323 upd_il = Container.add tgt_idx new_inst ini_il
324 upd_plc = (tgt_idx, pri_idx, sec_idx, upd_cvar):ini_plc
325 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
327 compareTables cur_tbl upd_tbl
329 -- | Given the status of the current secondary as a valid new node
330 -- and the current candidate target node,
331 -- generate the possible moves for a instance.
332 possibleMoves :: Bool -> Ndx -> [IMove]
333 possibleMoves True tdx =
334 [ReplaceSecondary tdx,
335 ReplaceAndFailover tdx,
337 FailoverAndReplace tdx]
339 possibleMoves False tdx =
340 [ReplaceSecondary tdx,
341 ReplaceAndFailover tdx]
343 -- | Compute the best move for a given instance.
344 checkInstanceMove :: [Ndx] -- Allowed target node indices
345 -> Table -- Original table
346 -> Instance.Instance -- Instance to move
347 -> Table -- Best new table for this instance
348 checkInstanceMove nodes_idx ini_tbl target =
350 opdx = Instance.pnode target
351 osdx = Instance.snode target
352 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
353 use_secondary = elem osdx nodes_idx
354 aft_failover = if use_secondary -- if allowed to failover
355 then checkSingleStep ini_tbl target ini_tbl Failover
357 all_moves = concatMap (possibleMoves use_secondary) nodes
359 -- iterate over the possible nodes for this instance
360 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
362 -- | Compute the best next move.
363 checkMove :: [Ndx] -- ^ Allowed target node indices
364 -> Table -- ^ The current solution
365 -> [Instance.Instance] -- ^ List of instances still to move
366 -> Table -- ^ The new solution
367 checkMove nodes_idx ini_tbl victims =
368 let Table _ _ _ ini_plc = ini_tbl
369 -- iterate over all instances, computing the best move
373 if Instance.snode elem == Node.noSecondary then step_tbl
374 else compareTables step_tbl $
375 checkInstanceMove nodes_idx ini_tbl elem)
377 Table _ _ _ best_plc = best_tbl
379 if length best_plc == length ini_plc then -- no advancement
384 -- * Alocation functions
386 -- | Try to allocate an instance on the cluster.
387 tryAlloc :: (Monad m) =>
388 Node.List -- ^ The node list
389 -> Instance.List -- ^ The instance list
390 -> Instance.Instance -- ^ The instance to allocate
391 -> Int -- ^ Required number of nodes
392 -> m AllocSolution -- ^ Possible solution list
393 tryAlloc nl _ inst 2 =
394 let all_nodes = getOnline nl
395 all_pairs = liftM2 (,) all_nodes all_nodes
396 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
397 sols = map (\(p, s) -> let (mnl, i) = allocateOnPair nl inst p s
402 tryAlloc nl _ inst 1 =
403 let all_nodes = getOnline nl
404 sols = map (\p -> let (mnl, i) = allocateOnSingle nl inst p
409 tryAlloc _ _ _ reqn = fail $ "Unsupported number of alllocation \
410 \destinations required (" ++ (show reqn) ++
411 "), only two supported"
413 -- | Try to allocate an instance on the cluster.
414 tryReloc :: (Monad m) =>
415 Node.List -- ^ The node list
416 -> Instance.List -- ^ The instance list
417 -> Idx -- ^ The index of the instance to move
418 -> Int -- ^ The numver of nodes required
419 -> [Ndx] -- ^ Nodes which should not be used
420 -> m AllocSolution -- ^ Solution list
421 tryReloc nl il xid 1 ex_idx =
422 let all_nodes = getOnline nl
423 inst = Container.find xid il
424 ex_idx' = (Instance.pnode inst):ex_idx
425 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
426 valid_idxes = map Node.idx valid_nodes
427 sols1 = map (\x -> let (mnl, i, _, _) =
428 applyMove nl inst (ReplaceSecondary x)
429 in (mnl, i, [Container.find x nl])
433 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
434 \destinations required (" ++ (show reqn) ++
435 "), only one supported"
437 -- * Formatting functions
439 -- | Given the original and final nodes, computes the relocation description.
440 computeMoves :: String -- ^ The instance name
441 -> String -- ^ Original primary
442 -> String -- ^ Original secondary
443 -> String -- ^ New primary
444 -> String -- ^ New secondary
445 -> (String, [String])
446 -- ^ Tuple of moves and commands list; moves is containing
447 -- either @/f/@ for failover or @/r:name/@ for replace
448 -- secondary, while the command list holds gnt-instance
449 -- commands (without that prefix), e.g \"@failover instance1@\"
450 computeMoves i a b c d =
451 if c == a then {- Same primary -}
452 if d == b then {- Same sec??! -}
454 else {- Change of secondary -}
456 [printf "replace-disks -n %s %s" d i])
458 if c == b then {- Failover and ... -}
459 if d == a then {- that's all -}
460 ("f", [printf "migrate -f %s" i])
463 [printf "migrate -f %s" i,
464 printf "replace-disks -n %s %s" d i])
466 if d == a then {- ... and keep primary as secondary -}
468 [printf "replace-disks -n %s %s" c i,
469 printf "migrate -f %s" i])
471 if d == b then {- ... keep same secondary -}
472 (printf "f r:%s f" c,
473 [printf "migrate -f %s" i,
474 printf "replace-disks -n %s %s" c i,
475 printf "migrate -f %s" i])
477 else {- Nothing in common -}
478 (printf "r:%s f r:%s" c d,
479 [printf "replace-disks -n %s %s" c i,
480 printf "migrate -f %s" i,
481 printf "replace-disks -n %s %s" d i])
483 -- | Converts a placement to string format.
484 printSolutionLine :: Node.List -- ^ The node list
485 -> Instance.List -- ^ The instance list
486 -> Int -- ^ Maximum node name length
487 -> Int -- ^ Maximum instance name length
488 -> Placement -- ^ The current placement
489 -> Int -- ^ The index of the placement in
491 -> (String, [String])
492 printSolutionLine nl il nmlen imlen plc pos =
494 pmlen = (2*nmlen + 1)
496 inst = Container.find i il
497 inam = Instance.name inst
498 npri = Container.nameOf nl p
499 nsec = Container.nameOf nl s
500 opri = Container.nameOf nl $ Instance.pnode inst
501 osec = Container.nameOf nl $ Instance.snode inst
502 (moves, cmds) = computeMoves inam opri osec npri nsec
503 ostr = (printf "%s:%s" opri osec)::String
504 nstr = (printf "%s:%s" npri nsec)::String
506 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
507 pos imlen inam pmlen ostr
511 -- | Given a list of commands, prefix them with @gnt-instance@ and
512 -- also beautify the display a little.
513 formatCmds :: [[String]] -> String
514 formatCmds cmd_strs =
516 concat $ map (\(a, b) ->
517 (printf "echo step %d" (a::Int)):
519 (map ("gnt-instance " ++) b)) $
522 -- | Converts a solution to string format.
523 printSolution :: Node.List
526 -> ([String], [[String]])
527 printSolution nl il sol =
529 nmlen = Container.maxNameLen nl
530 imlen = Container.maxNameLen il
532 unzip $ map (uncurry $ printSolutionLine nl il nmlen imlen) $
535 -- | Print the node list.
536 printNodes :: Node.List -> String
538 let snl = sortBy (compare `on` Node.idx) (Container.elems nl)
539 m_name = maximum . map (length . Node.name) $ snl
540 helper = Node.list m_name
542 "%2s %-*s %5s %5s %5s %5s %5s %5s %5s %5s %4s %4s \
543 \%3s %3s %6s %6s %5s"
545 "t_mem" "n_mem" "i_mem" "x_mem" "f_mem" "r_mem"
546 "t_dsk" "f_dsk" "pcpu" "vcpu"
547 "pri" "sec" "p_fmem" "p_fdsk" "r_cpu")::String
548 in unlines $ (header:map helper snl)
550 -- | Shows statistics for a given node list.
551 printStats :: Node.List -> String
553 let (mem_cv, dsk_cv, n1_score, res_cv, off_score, cpu_cv) =
555 in printf "f_mem=%.8f, r_mem=%.8f, f_dsk=%.8f, n1=%.3f, \
556 \uf=%.3f, r_cpu=%.3f"
557 mem_cv res_cv dsk_cv n1_score off_score cpu_cv