1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the "Main" module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
36 -- * Generic functions
38 , computeAllocationDelta
39 -- * First phase functions
41 -- * Second phase functions
46 -- * Display functions
49 -- * Balacing functions
57 -- * IAllocator functions
65 -- * Allocation functions
69 -- * Node group functions
75 import Data.Function (on)
77 import Data.Ord (comparing)
78 import Text.Printf (printf)
80 import Control.Parallel.Strategies
82 import qualified Ganeti.HTools.Container as Container
83 import qualified Ganeti.HTools.Instance as Instance
84 import qualified Ganeti.HTools.Node as Node
85 import qualified Ganeti.HTools.Group as Group
86 import Ganeti.HTools.Types
87 import Ganeti.HTools.Utils
88 import qualified Ganeti.OpCodes as OpCodes
92 -- | Allocation\/relocation solution.
93 data AllocSolution = AllocSolution
94 { asFailures :: [FailMode] -- ^ Failure counts
95 , asAllocs :: Int -- ^ Good allocation count
96 , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
97 -- of the list depends on the
98 -- allocation/relocation mode
99 , asLog :: [String] -- ^ A list of informational messages
102 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
103 type AllocResult = (FailStats, Node.List, Instance.List,
104 [Instance.Instance], [CStats])
106 -- | The empty solution we start with when computing allocations
107 emptySolution :: AllocSolution
108 emptySolution = AllocSolution { asFailures = [], asAllocs = 0
109 , asSolutions = [], asLog = [] }
111 -- | The complete state for the balancing solution
112 data Table = Table Node.List Instance.List Score [Placement]
113 deriving (Show, Read)
115 data CStats = CStats { csFmem :: Int -- ^ Cluster free mem
116 , csFdsk :: Int -- ^ Cluster free disk
117 , csAmem :: Int -- ^ Cluster allocatable mem
118 , csAdsk :: Int -- ^ Cluster allocatable disk
119 , csAcpu :: Int -- ^ Cluster allocatable cpus
120 , csMmem :: Int -- ^ Max node allocatable mem
121 , csMdsk :: Int -- ^ Max node allocatable disk
122 , csMcpu :: Int -- ^ Max node allocatable cpu
123 , csImem :: Int -- ^ Instance used mem
124 , csIdsk :: Int -- ^ Instance used disk
125 , csIcpu :: Int -- ^ Instance used cpu
126 , csTmem :: Double -- ^ Cluster total mem
127 , csTdsk :: Double -- ^ Cluster total disk
128 , csTcpu :: Double -- ^ Cluster total cpus
129 , csVcpu :: Int -- ^ Cluster virtual cpus (if
130 -- node pCpu has been set,
132 , csXmem :: Int -- ^ Unnacounted for mem
133 , csNmem :: Int -- ^ Node own memory
134 , csScore :: Score -- ^ The cluster score
135 , csNinst :: Int -- ^ The total number of instances
137 deriving (Show, Read)
139 -- | Currently used, possibly to allocate, unallocable
140 type AllocStats = (RSpec, RSpec, RSpec)
142 -- * Utility functions
144 -- | Verifies the N+1 status and return the affected nodes.
145 verifyN1 :: [Node.Node] -> [Node.Node]
146 verifyN1 = filter Node.failN1
148 {-| Computes the pair of bad nodes and instances.
150 The bad node list is computed via a simple 'verifyN1' check, and the
151 bad instance list is the list of primary and secondary instances of
155 computeBadItems :: Node.List -> Instance.List ->
156 ([Node.Node], [Instance.Instance])
157 computeBadItems nl il =
158 let bad_nodes = verifyN1 $ getOnline nl
159 bad_instances = map (`Container.find` il) .
161 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
163 (bad_nodes, bad_instances)
165 -- | Zero-initializer for the CStats type
166 emptyCStats :: CStats
167 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
169 -- | Update stats with data from a new node
170 updateCStats :: CStats -> Node.Node -> CStats
171 updateCStats cs node =
172 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
173 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
174 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
175 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
176 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
178 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
181 inc_amem = Node.fMem node - Node.rMem node
182 inc_amem' = if inc_amem > 0 then inc_amem else 0
183 inc_adsk = Node.availDisk node
184 inc_imem = truncate (Node.tMem node) - Node.nMem node
185 - Node.xMem node - Node.fMem node
186 inc_icpu = Node.uCpu node
187 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
188 inc_vcpu = Node.hiCpu node
189 inc_acpu = Node.availCpu node
191 in cs { csFmem = x_fmem + Node.fMem node
192 , csFdsk = x_fdsk + Node.fDsk node
193 , csAmem = x_amem + inc_amem'
194 , csAdsk = x_adsk + inc_adsk
195 , csAcpu = x_acpu + inc_acpu
196 , csMmem = max x_mmem inc_amem'
197 , csMdsk = max x_mdsk inc_adsk
198 , csMcpu = max x_mcpu inc_acpu
199 , csImem = x_imem + inc_imem
200 , csIdsk = x_idsk + inc_idsk
201 , csIcpu = x_icpu + inc_icpu
202 , csTmem = x_tmem + Node.tMem node
203 , csTdsk = x_tdsk + Node.tDsk node
204 , csTcpu = x_tcpu + Node.tCpu node
205 , csVcpu = x_vcpu + inc_vcpu
206 , csXmem = x_xmem + Node.xMem node
207 , csNmem = x_nmem + Node.nMem node
208 , csNinst = x_ninst + length (Node.pList node)
211 -- | Compute the total free disk and memory in the cluster.
212 totalResources :: Node.List -> CStats
214 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
215 in cs { csScore = compCV nl }
217 -- | Compute the delta between two cluster state.
219 -- This is used when doing allocations, to understand better the
220 -- available cluster resources. The return value is a triple of the
221 -- current used values, the delta that was still allocated, and what
222 -- was left unallocated.
223 computeAllocationDelta :: CStats -> CStats -> AllocStats
224 computeAllocationDelta cini cfin =
225 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
226 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
227 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
228 rini = RSpec i_icpu i_imem i_idsk
229 rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
230 un_cpu = v_cpu - f_icpu
231 runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
232 in (rini, rfin, runa)
234 -- | The names and weights of the individual elements in the CV list
235 detailedCVInfo :: [(Double, String)]
236 detailedCVInfo = [ (1, "free_mem_cv")
237 , (1, "free_disk_cv")
239 , (1, "reserved_mem_cv")
240 , (4, "offline_all_cnt")
241 , (16, "offline_pri_cnt")
242 , (1, "vcpu_ratio_cv")
245 , (1, "disk_load_cv")
247 , (2, "pri_tags_score")
250 detailedCVWeights :: [Double]
251 detailedCVWeights = map fst detailedCVInfo
253 -- | Compute the mem and disk covariance.
254 compDetailedCV :: Node.List -> [Double]
257 all_nodes = Container.elems nl
258 (offline, nodes) = partition Node.offline all_nodes
259 mem_l = map Node.pMem nodes
260 dsk_l = map Node.pDsk nodes
261 -- metric: memory covariance
262 mem_cv = stdDev mem_l
263 -- metric: disk covariance
264 dsk_cv = stdDev dsk_l
265 -- metric: count of instances living on N1 failing nodes
266 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
267 length (Node.pList n)) .
268 filter Node.failN1 $ nodes :: Double
269 res_l = map Node.pRem nodes
270 -- metric: reserved memory covariance
271 res_cv = stdDev res_l
272 -- offline instances metrics
273 offline_ipri = sum . map (length . Node.pList) $ offline
274 offline_isec = sum . map (length . Node.sList) $ offline
275 -- metric: count of instances on offline nodes
276 off_score = fromIntegral (offline_ipri + offline_isec)::Double
277 -- metric: count of primary instances on offline nodes (this
278 -- helps with evacuation/failover of primary instances on
279 -- 2-node clusters with one node offline)
280 off_pri_score = fromIntegral offline_ipri::Double
281 cpu_l = map Node.pCpu nodes
282 -- metric: covariance of vcpu/pcpu ratio
283 cpu_cv = stdDev cpu_l
284 -- metrics: covariance of cpu, memory, disk and network load
285 (c_load, m_load, d_load, n_load) = unzip4 $
287 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
288 DynUtil c2 m2 d2 n2 = Node.utilPool n
289 in (c1/c2, m1/m2, d1/d2, n1/n2)
291 -- metric: conflicting instance count
292 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
293 pri_tags_score = fromIntegral pri_tags_inst::Double
294 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
295 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
298 -- | Compute the /total/ variance.
299 compCV :: Node.List -> Double
300 compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
302 -- | Compute online nodes from a Node.List
303 getOnline :: Node.List -> [Node.Node]
304 getOnline = filter (not . Node.offline) . Container.elems
308 -- | Compute best table. Note that the ordering of the arguments is important.
309 compareTables :: Table -> Table -> Table
310 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
311 if a_cv > b_cv then b else a
313 -- | Applies an instance move to a given node list and instance.
314 applyMove :: Node.List -> Instance.Instance
315 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
317 applyMove nl inst Failover =
318 let old_pdx = Instance.pNode inst
319 old_sdx = Instance.sNode inst
320 old_p = Container.find old_pdx nl
321 old_s = Container.find old_sdx nl
322 int_p = Node.removePri old_p inst
323 int_s = Node.removeSec old_s inst
324 force_p = Node.offline old_p
325 new_nl = do -- Maybe monad
326 new_p <- Node.addPriEx force_p int_s inst
327 new_s <- Node.addSec int_p inst old_sdx
328 let new_inst = Instance.setBoth inst old_sdx old_pdx
329 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
330 new_inst, old_sdx, old_pdx)
333 -- Replace the primary (f:, r:np, f)
334 applyMove nl inst (ReplacePrimary new_pdx) =
335 let old_pdx = Instance.pNode inst
336 old_sdx = Instance.sNode inst
337 old_p = Container.find old_pdx nl
338 old_s = Container.find old_sdx nl
339 tgt_n = Container.find new_pdx nl
340 int_p = Node.removePri old_p inst
341 int_s = Node.removeSec old_s inst
342 force_p = Node.offline old_p
343 new_nl = do -- Maybe monad
344 -- check that the current secondary can host the instance
345 -- during the migration
346 tmp_s <- Node.addPriEx force_p int_s inst
347 let tmp_s' = Node.removePri tmp_s inst
348 new_p <- Node.addPriEx force_p tgt_n inst
349 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
350 let new_inst = Instance.setPri inst new_pdx
351 return (Container.add new_pdx new_p $
352 Container.addTwo old_pdx int_p old_sdx new_s nl,
353 new_inst, new_pdx, old_sdx)
356 -- Replace the secondary (r:ns)
357 applyMove nl inst (ReplaceSecondary new_sdx) =
358 let old_pdx = Instance.pNode inst
359 old_sdx = Instance.sNode inst
360 old_s = Container.find old_sdx nl
361 tgt_n = Container.find new_sdx nl
362 int_s = Node.removeSec old_s inst
363 force_s = Node.offline old_s
364 new_inst = Instance.setSec inst new_sdx
365 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
366 \new_s -> return (Container.addTwo new_sdx
367 new_s old_sdx int_s nl,
368 new_inst, old_pdx, new_sdx)
371 -- Replace the secondary and failover (r:np, f)
372 applyMove nl inst (ReplaceAndFailover new_pdx) =
373 let old_pdx = Instance.pNode inst
374 old_sdx = Instance.sNode inst
375 old_p = Container.find old_pdx nl
376 old_s = Container.find old_sdx nl
377 tgt_n = Container.find new_pdx nl
378 int_p = Node.removePri old_p inst
379 int_s = Node.removeSec old_s inst
380 force_s = Node.offline old_s
381 new_nl = do -- Maybe monad
382 new_p <- Node.addPri tgt_n inst
383 new_s <- Node.addSecEx force_s int_p inst new_pdx
384 let new_inst = Instance.setBoth inst new_pdx old_pdx
385 return (Container.add new_pdx new_p $
386 Container.addTwo old_pdx new_s old_sdx int_s nl,
387 new_inst, new_pdx, old_pdx)
390 -- Failver and replace the secondary (f, r:ns)
391 applyMove nl inst (FailoverAndReplace new_sdx) =
392 let old_pdx = Instance.pNode inst
393 old_sdx = Instance.sNode inst
394 old_p = Container.find old_pdx nl
395 old_s = Container.find old_sdx nl
396 tgt_n = Container.find new_sdx nl
397 int_p = Node.removePri old_p inst
398 int_s = Node.removeSec old_s inst
399 force_p = Node.offline old_p
400 new_nl = do -- Maybe monad
401 new_p <- Node.addPriEx force_p int_s inst
402 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
403 let new_inst = Instance.setBoth inst old_sdx new_sdx
404 return (Container.add new_sdx new_s $
405 Container.addTwo old_sdx new_p old_pdx int_p nl,
406 new_inst, old_sdx, new_sdx)
409 -- | Tries to allocate an instance on one given node.
410 allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
411 -> OpResult Node.AllocElement
412 allocateOnSingle nl inst p =
413 let new_pdx = Node.idx p
414 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
415 in Node.addPri p inst >>= \new_p -> do
416 let new_nl = Container.add new_pdx new_p nl
417 new_score = compCV nl
418 return (new_nl, new_inst, [new_p], new_score)
420 -- | Tries to allocate an instance on a given pair of nodes.
421 allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
422 -> OpResult Node.AllocElement
423 allocateOnPair nl inst tgt_p tgt_s =
424 let new_pdx = Node.idx tgt_p
425 new_sdx = Node.idx tgt_s
427 new_p <- Node.addPri tgt_p inst
428 new_s <- Node.addSec tgt_s inst new_pdx
429 let new_inst = Instance.setBoth inst new_pdx new_sdx
430 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
431 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
433 -- | Tries to perform an instance move and returns the best table
434 -- between the original one and the new one.
435 checkSingleStep :: Table -- ^ The original table
436 -> Instance.Instance -- ^ The instance to move
437 -> Table -- ^ The current best table
438 -> IMove -- ^ The move to apply
439 -> Table -- ^ The final best table
440 checkSingleStep ini_tbl target cur_tbl move =
442 Table ini_nl ini_il _ ini_plc = ini_tbl
443 tmp_resu = applyMove ini_nl target move
447 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
448 let tgt_idx = Instance.idx target
449 upd_cvar = compCV upd_nl
450 upd_il = Container.add tgt_idx new_inst ini_il
451 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
452 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
454 compareTables cur_tbl upd_tbl
456 -- | Given the status of the current secondary as a valid new node and
457 -- the current candidate target node, generate the possible moves for
459 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
460 -> Ndx -- ^ Target node candidate
461 -> [IMove] -- ^ List of valid result moves
462 possibleMoves True tdx =
463 [ReplaceSecondary tdx,
464 ReplaceAndFailover tdx,
466 FailoverAndReplace tdx]
468 possibleMoves False tdx =
469 [ReplaceSecondary tdx,
470 ReplaceAndFailover tdx]
472 -- | Compute the best move for a given instance.
473 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
474 -> Bool -- ^ Whether disk moves are allowed
475 -> Table -- ^ Original table
476 -> Instance.Instance -- ^ Instance to move
477 -> Table -- ^ Best new table for this instance
478 checkInstanceMove nodes_idx disk_moves ini_tbl target =
480 opdx = Instance.pNode target
481 osdx = Instance.sNode target
482 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
483 use_secondary = elem osdx nodes_idx
484 aft_failover = if use_secondary -- if allowed to failover
485 then checkSingleStep ini_tbl target ini_tbl Failover
487 all_moves = if disk_moves
488 then concatMap (possibleMoves use_secondary) nodes
491 -- iterate over the possible nodes for this instance
492 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
494 -- | Compute the best next move.
495 checkMove :: [Ndx] -- ^ Allowed target node indices
496 -> Bool -- ^ Whether disk moves are allowed
497 -> Table -- ^ The current solution
498 -> [Instance.Instance] -- ^ List of instances still to move
499 -> Table -- ^ The new solution
500 checkMove nodes_idx disk_moves ini_tbl victims =
501 let Table _ _ _ ini_plc = ini_tbl
502 -- we're using rwhnf from the Control.Parallel.Strategies
503 -- package; we don't need to use rnf as that would force too
504 -- much evaluation in single-threaded cases, and in
505 -- multi-threaded case the weak head normal form is enough to
506 -- spark the evaluation
507 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves ini_tbl)
509 -- iterate over all instances, computing the best move
512 (\ step_tbl new_tbl -> compareTables step_tbl new_tbl)
514 Table _ _ _ best_plc = best_tbl
515 in if length best_plc == length ini_plc
516 then ini_tbl -- no advancement
519 -- | Check if we are allowed to go deeper in the balancing
520 doNextBalance :: Table -- ^ The starting table
521 -> Int -- ^ Remaining length
522 -> Score -- ^ Score at which to stop
523 -> Bool -- ^ The resulting table and commands
524 doNextBalance ini_tbl max_rounds min_score =
525 let Table _ _ ini_cv ini_plc = ini_tbl
526 ini_plc_len = length ini_plc
527 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
529 -- | Run a balance move
530 tryBalance :: Table -- ^ The starting table
531 -> Bool -- ^ Allow disk moves
532 -> Bool -- ^ Only evacuate moves
533 -> Score -- ^ Min gain threshold
534 -> Score -- ^ Min gain
535 -> Maybe Table -- ^ The resulting table and commands
536 tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
537 let Table ini_nl ini_il ini_cv _ = ini_tbl
538 all_inst = Container.elems ini_il
539 all_inst' = if evac_mode
540 then let bad_nodes = map Node.idx . filter Node.offline $
541 Container.elems ini_nl
542 in filter (\e -> Instance.sNode e `elem` bad_nodes ||
543 Instance.pNode e `elem` bad_nodes)
546 reloc_inst = filter Instance.movable all_inst'
547 node_idx = map Node.idx . filter (not . Node.offline) $
548 Container.elems ini_nl
549 fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
550 (Table _ _ fin_cv _) = fin_tbl
552 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
553 then Just fin_tbl -- this round made success, return the new table
556 -- * Allocation functions
558 -- | Build failure stats out of a list of failures
559 collapseFailures :: [FailMode] -> FailStats
560 collapseFailures flst =
561 map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
563 -- | Update current Allocation solution and failure stats with new
565 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
566 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
568 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
569 let -- Choose the old or new solution, based on the cluster score
571 osols = asSolutions as
572 nsols = case osols of
574 (_, _, _, oscore):[] ->
578 -- FIXME: here we simply concat to lists with more
579 -- than one element; we should instead abort, since
580 -- this is not a valid usage of this function
583 -- Note: we force evaluation of nsols here in order to keep the
584 -- memory profile low - we know that we will need nsols for sure
585 -- in the next cycle, so we force evaluation of nsols, since the
586 -- foldl' in the caller will only evaluate the tuple, but not the
587 -- elements of the tuple
588 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
590 -- | Sums two allocation solutions (e.g. for two separate node groups).
591 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
592 sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
593 AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
595 -- | Given a solution, generates a reasonable description for it
596 describeSolution :: AllocSolution -> String
597 describeSolution as =
598 let fcnt = asFailures as
599 sols = asSolutions as
601 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
602 filter ((> 0) . snd) . collapseFailures $ fcnt
604 then "No valid allocation solutions, failure reasons: " ++
606 then "unknown reasons"
608 else let (_, _, nodes, cv) = head sols
609 in printf ("score: %.8f, successes %d, failures %d (%s)" ++
610 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
611 (intercalate "/" . map Node.name $ nodes)
613 -- | Annotates a solution with the appropriate string
614 annotateSolution :: AllocSolution -> AllocSolution
615 annotateSolution as = as { asLog = describeSolution as : asLog as }
617 -- | Try to allocate an instance on the cluster.
618 tryAlloc :: (Monad m) =>
619 Node.List -- ^ The node list
620 -> Instance.List -- ^ The instance list
621 -> Instance.Instance -- ^ The instance to allocate
622 -> Int -- ^ Required number of nodes
623 -> m AllocSolution -- ^ Possible solution list
624 tryAlloc nl _ inst 2 =
625 let all_nodes = getOnline nl
626 all_pairs = liftM2 (,) all_nodes all_nodes
627 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
628 sols = foldl' (\cstate (p, s) ->
629 concatAllocs cstate $ allocateOnPair nl inst p s
630 ) emptySolution ok_pairs
632 in if null ok_pairs -- means we have just one node
633 then fail "Not enough online nodes"
634 else return $ annotateSolution sols
636 tryAlloc nl _ inst 1 =
637 let all_nodes = getOnline nl
638 sols = foldl' (\cstate ->
639 concatAllocs cstate . allocateOnSingle nl inst
640 ) emptySolution all_nodes
642 then fail "No online nodes"
643 else return $ annotateSolution sols
645 tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
646 \destinations required (" ++ show reqn ++
647 "), only two supported"
649 -- | Given a group/result, describe it as a nice (list of) messages
650 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
651 solutionDescription gl (groupId, result) =
653 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
654 Bad message -> [printf "Group %s: error %s" gname message]
655 where grp = Container.find groupId gl
656 gname = Group.name grp
657 pol = apolToString (Group.allocPolicy grp)
659 -- | From a list of possibly bad and possibly empty solutions, filter
660 -- only the groups with a valid result
661 filterMGResults :: Group.List
662 -> [(Gdx, Result AllocSolution)]
663 -> [(Gdx, AllocSolution)]
665 filter ((/= AllocUnallocable) . Group.allocPolicy .
666 flip Container.find gl . fst) .
667 filter (not . null . asSolutions . snd) .
668 map (\(y, Ok x) -> (y, x)) .
671 -- | Sort multigroup results based on policy and score
672 sortMGResults :: Group.List
673 -> [(Gdx, AllocSolution)]
674 -> [(Gdx, AllocSolution)]
675 sortMGResults gl sols =
676 let extractScore = \(_, _, _, x) -> x
677 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
678 (extractScore . head . asSolutions) sol)
679 in sortBy (comparing solScore) sols
681 -- | Try to allocate an instance on a multi-group cluster.
682 tryMGAlloc :: Group.List -- ^ The group list
683 -> Node.List -- ^ The node list
684 -> Instance.List -- ^ The instance list
685 -> Instance.Instance -- ^ The instance to allocate
686 -> Int -- ^ Required number of nodes
687 -> Result AllocSolution -- ^ Possible solution list
688 tryMGAlloc mggl mgnl mgil inst cnt =
689 let groups = splitCluster mgnl mgil
690 -- TODO: currently we consider all groups preferred
691 sols = map (\(gid, (nl, il)) ->
692 (gid, tryAlloc nl il inst cnt)) groups::
693 [(Gdx, Result AllocSolution)]
694 all_msgs = concatMap (solutionDescription mggl) sols
695 goodSols = filterMGResults mggl sols
696 sortedSols = sortMGResults mggl goodSols
697 in if null sortedSols
698 then Bad $ intercalate ", " all_msgs
699 else let (final_group, final_sol) = head sortedSols
700 final_name = Group.name $ Container.find final_group mggl
701 selmsg = "Selected group: " ++ final_name
702 in Ok $ final_sol { asLog = selmsg:all_msgs }
704 -- | Try to relocate an instance on the cluster.
705 tryReloc :: (Monad m) =>
706 Node.List -- ^ The node list
707 -> Instance.List -- ^ The instance list
708 -> Idx -- ^ The index of the instance to move
709 -> Int -- ^ The number of nodes required
710 -> [Ndx] -- ^ Nodes which should not be used
711 -> m AllocSolution -- ^ Solution list
712 tryReloc nl il xid 1 ex_idx =
713 let all_nodes = getOnline nl
714 inst = Container.find xid il
715 ex_idx' = Instance.pNode inst:ex_idx
716 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
717 valid_idxes = map Node.idx valid_nodes
718 sols1 = foldl' (\cstate x ->
721 applyMove nl inst (ReplaceSecondary x)
722 return (mnl, i, [Container.find x mnl],
724 in concatAllocs cstate em
725 ) emptySolution valid_idxes
728 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
729 \destinations required (" ++ show reqn ++
730 "), only one supported"
732 tryMGReloc :: (Monad m) =>
733 Group.List -- ^ The group list
734 -> Node.List -- ^ The node list
735 -> Instance.List -- ^ The instance list
736 -> Idx -- ^ The index of the instance to move
737 -> Int -- ^ The number of nodes required
738 -> [Ndx] -- ^ Nodes which should not be used
739 -> m AllocSolution -- ^ Solution list
740 tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
741 let groups = splitCluster mgnl mgil
742 -- TODO: we only relocate inside the group for now
743 inst = Container.find xid mgil
744 (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
745 Nothing -> fail $ "Cannot find group for instance " ++
748 tryReloc nl il xid ncount ex_ndx
750 -- | Change an instance's secondary node
751 evacInstance :: (Monad m) =>
752 [Ndx] -- ^ Excluded nodes
753 -> Instance.List -- ^ The current instance list
754 -> (Node.List, AllocSolution) -- ^ The current state
755 -> Idx -- ^ The instance to evacuate
756 -> m (Node.List, AllocSolution)
757 evacInstance ex_ndx il (nl, old_as) idx = do
758 -- FIXME: hardcoded one node here
760 -- Longer explanation: evacuation is currently hardcoded to DRBD
761 -- instances (which have one secondary); hence, even if the
762 -- IAllocator protocol can request N nodes for an instance, and all
763 -- the message parsing/loading pass this, this implementation only
764 -- supports one; this situation needs to be revisited if we ever
765 -- support more than one secondary, or if we change the storage
767 new_as <- tryReloc nl il idx 1 ex_ndx
768 case asSolutions new_as of
769 -- an individual relocation succeeded, we kind of compose the data
770 -- from the two solutions
771 csol@(nl', _, _, _):_ ->
772 return (nl', new_as { asSolutions = csol:asSolutions old_as })
773 -- this relocation failed, so we fail the entire evac
774 _ -> fail $ "Can't evacuate instance " ++
775 Instance.name (Container.find idx il) ++
776 ": " ++ describeSolution new_as
778 -- | Try to evacuate a list of nodes.
779 tryEvac :: (Monad m) =>
780 Node.List -- ^ The node list
781 -> Instance.List -- ^ The instance list
782 -> [Idx] -- ^ Instances to be evacuated
783 -> [Ndx] -- ^ Restricted nodes (the ones being evacuated)
784 -> m AllocSolution -- ^ Solution list
785 tryEvac nl il idxs ex_ndx = do
786 (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptySolution) idxs
789 -- | Multi-group evacuation of a list of nodes.
790 tryMGEvac :: (Monad m) =>
791 Group.List -- ^ The group list
792 -> Node.List -- ^ The node list
793 -> Instance.List -- ^ The instance list
794 -> [Ndx] -- ^ Nodes to be evacuated
795 -> m AllocSolution -- ^ Solution list
796 tryMGEvac _ nl il ex_ndx =
797 let ex_nodes = map (`Container.find` nl) ex_ndx
798 all_insts = nub . concatMap Node.sList $ ex_nodes
799 gni = splitCluster nl il
800 -- we run the instance index list through a couple of maps to
801 -- get finally to a structure of the type [(group index,
802 -- [instance indices])]
803 all_insts' = map (\idx ->
804 (instancePriGroup nl (Container.find idx il),
806 all_insts'' = groupBy ((==) `on` fst) all_insts'
807 all_insts3 = map (\xs -> let (gdxs, idxs) = unzip xs
808 in (head gdxs, idxs)) all_insts''
810 -- that done, we now add the per-group nl/il to the tuple
812 mapM (\(gdx, idxs) -> do
813 case lookup gdx gni of
814 Nothing -> fail $ "Can't find group index " ++ show gdx
815 Just (gnl, gil) -> return (gdx, gnl, gil, idxs))
817 results <- mapM (\(_, gnl, gil, idxs) -> tryEvac gnl gil idxs ex_ndx)
819 let sol = foldl' (\orig_sol group_sol ->
820 sumAllocs orig_sol group_sol) emptySolution results
821 return $ annotateSolution sol
823 -- | Recursively place instances on the cluster until we're out of space
824 iterateAlloc :: Node.List
828 -> [Instance.Instance]
830 -> Result AllocResult
831 iterateAlloc nl il newinst nreq ixes cstats =
832 let depth = length ixes
833 newname = printf "new-%d" depth::String
834 newidx = length (Container.elems il) + depth
835 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
836 in case tryAlloc nl il newi2 nreq of
838 Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
840 [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
841 (xnl, xi, _, _):[] ->
842 iterateAlloc xnl (Container.add newidx xi il)
843 newinst nreq (xi:ixes)
844 (totalResources xnl:cstats)
845 _ -> Bad "Internal error: multiple solutions for single\
848 -- | The core of the tiered allocation mode
849 tieredAlloc :: Node.List
853 -> [Instance.Instance]
855 -> Result AllocResult
856 tieredAlloc nl il newinst nreq ixes cstats =
857 case iterateAlloc nl il newinst nreq ixes cstats of
859 Ok (errs, nl', il', ixes', cstats') ->
860 case Instance.shrinkByType newinst . fst . last $
861 sortBy (comparing snd) errs of
862 Bad _ -> Ok (errs, nl', il', ixes', cstats')
864 tieredAlloc nl' il' newinst' nreq ixes' cstats'
866 -- | Compute the tiered spec string description from a list of
867 -- allocated instances.
868 tieredSpecMap :: [Instance.Instance]
870 tieredSpecMap trl_ixes =
871 let fin_trl_ixes = reverse trl_ixes
872 ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
873 spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
875 in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
876 (rspecDsk spec) (rspecCpu spec) cnt) spec_map
878 -- * Formatting functions
880 -- | Given the original and final nodes, computes the relocation description.
881 computeMoves :: Instance.Instance -- ^ The instance to be moved
882 -> String -- ^ The instance name
883 -> IMove -- ^ The move being performed
884 -> String -- ^ New primary
885 -> String -- ^ New secondary
886 -> (String, [String])
887 -- ^ Tuple of moves and commands list; moves is containing
888 -- either @/f/@ for failover or @/r:name/@ for replace
889 -- secondary, while the command list holds gnt-instance
890 -- commands (without that prefix), e.g \"@failover instance1@\"
891 computeMoves i inam mv c d =
893 Failover -> ("f", [mig])
894 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
895 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
896 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
897 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
898 where morf = if Instance.running i then "migrate" else "failover"
899 mig = printf "%s -f %s" morf inam::String
900 rep n = printf "replace-disks -n %s %s" n inam
902 -- | Converts a placement to string format.
903 printSolutionLine :: Node.List -- ^ The node list
904 -> Instance.List -- ^ The instance list
905 -> Int -- ^ Maximum node name length
906 -> Int -- ^ Maximum instance name length
907 -> Placement -- ^ The current placement
908 -> Int -- ^ The index of the placement in
910 -> (String, [String])
911 printSolutionLine nl il nmlen imlen plc pos =
913 pmlen = (2*nmlen + 1)
914 (i, p, s, mv, c) = plc
915 inst = Container.find i il
916 inam = Instance.alias inst
917 npri = Node.alias $ Container.find p nl
918 nsec = Node.alias $ Container.find s nl
919 opri = Node.alias $ Container.find (Instance.pNode inst) nl
920 osec = Node.alias $ Container.find (Instance.sNode inst) nl
921 (moves, cmds) = computeMoves inst inam mv npri nsec
922 ostr = printf "%s:%s" opri osec::String
923 nstr = printf "%s:%s" npri nsec::String
925 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
926 pos imlen inam pmlen ostr
930 -- | Return the instance and involved nodes in an instance move.
931 involvedNodes :: Instance.List -> Placement -> [Ndx]
932 involvedNodes il plc =
933 let (i, np, ns, _, _) = plc
934 inst = Container.find i il
935 op = Instance.pNode inst
936 os = Instance.sNode inst
937 in nub [np, ns, op, os]
939 -- | Inner function for splitJobs, that either appends the next job to
940 -- the current jobset, or starts a new jobset.
941 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
942 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
943 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
944 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
945 | otherwise = ([n]:cjs, ndx)
947 -- | Break a list of moves into independent groups. Note that this
948 -- will reverse the order of jobs.
949 splitJobs :: [MoveJob] -> [JobSet]
950 splitJobs = fst . foldl mergeJobs ([], [])
952 -- | Given a list of commands, prefix them with @gnt-instance@ and
953 -- also beautify the display a little.
954 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
955 formatJob jsn jsl (sn, (_, _, _, cmds)) =
957 printf " echo job %d/%d" jsn sn:
959 map (" gnt-instance " ++) cmds
961 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
964 -- | Given a list of commands, prefix them with @gnt-instance@ and
965 -- also beautify the display a little.
966 formatCmds :: [JobSet] -> String
969 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
973 -- | Print the node list.
974 printNodes :: Node.List -> [String] -> String
976 let fields = case fs of
977 [] -> Node.defaultFields
978 "+":rest -> Node.defaultFields ++ rest
980 snl = sortBy (comparing Node.idx) (Container.elems nl)
981 (header, isnum) = unzip $ map Node.showHeader fields
982 in unlines . map ((:) ' ' . intercalate " ") $
983 formatTable (header:map (Node.list fields) snl) isnum
985 -- | Print the instance list.
986 printInsts :: Node.List -> Instance.List -> String
988 let sil = sortBy (comparing Instance.idx) (Container.elems il)
989 helper inst = [ if Instance.running inst then "R" else " "
991 , Container.nameOf nl (Instance.pNode inst)
992 , let sdx = Instance.sNode inst
993 in if sdx == Node.noSecondary
995 else Container.nameOf nl sdx
996 , printf "%3d" $ Instance.vcpus inst
997 , printf "%5d" $ Instance.mem inst
998 , printf "%5d" $ Instance.dsk inst `div` 1024
1004 where DynUtil lC lM lD lN = Instance.util inst
1005 header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
1006 , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1007 isnum = False:False:False:False:repeat True
1008 in unlines . map ((:) ' ' . intercalate " ") $
1009 formatTable (header:map helper sil) isnum
1011 -- | Shows statistics for a given node list.
1012 printStats :: Node.List -> String
1014 let dcvs = compDetailedCV nl
1015 (weights, names) = unzip detailedCVInfo
1016 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1017 formatted = map (\(w, header, val) ->
1018 printf "%s=%.8f(x%.2f)" header val w::String) hd
1019 in intercalate ", " formatted
1021 -- | Convert a placement into a list of OpCodes (basically a job).
1022 iMoveToJob :: Node.List -> Instance.List
1023 -> Idx -> IMove -> [OpCodes.OpCode]
1024 iMoveToJob nl il idx move =
1025 let inst = Container.find idx il
1026 iname = Instance.name inst
1027 lookNode = Just . Container.nameOf nl
1028 opF = if Instance.running inst
1029 then OpCodes.OpMigrateInstance iname True False
1030 else OpCodes.OpFailoverInstance iname False
1031 opR n = OpCodes.OpReplaceDisks iname (lookNode n)
1032 OpCodes.ReplaceNewSecondary [] Nothing
1035 ReplacePrimary np -> [ opF, opR np, opF ]
1036 ReplaceSecondary ns -> [ opR ns ]
1037 ReplaceAndFailover np -> [ opR np, opF ]
1038 FailoverAndReplace ns -> [ opF, opR ns ]
1040 -- * Node group functions
1042 -- | Computes the group of an instance
1043 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1044 instanceGroup nl i =
1045 let sidx = Instance.sNode i
1046 pnode = Container.find (Instance.pNode i) nl
1047 snode = if sidx == Node.noSecondary
1049 else Container.find sidx nl
1050 pgroup = Node.group pnode
1051 sgroup = Node.group snode
1052 in if pgroup /= sgroup
1053 then fail ("Instance placed accross two node groups, primary " ++
1054 show pgroup ++ ", secondary " ++ show sgroup)
1057 -- | Computes the group of an instance per the primary node
1058 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1059 instancePriGroup nl i =
1060 let pnode = Container.find (Instance.pNode i) nl
1063 -- | Compute the list of badly allocated instances (split across node
1065 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1066 findSplitInstances nl il =
1067 filter (not . isOk . instanceGroup nl) (Container.elems il)
1069 -- | Splits a cluster into the component node groups
1070 splitCluster :: Node.List -> Instance.List ->
1071 [(Gdx, (Node.List, Instance.List))]
1072 splitCluster nl il =
1073 let ngroups = Node.computeGroups (Container.elems nl)
1074 in map (\(guuid, nodes) ->
1075 let nidxs = map Node.idx nodes
1076 nodes' = zip nidxs nodes
1077 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1078 in (guuid, (Container.fromList nodes', instances))) ngroups