1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the "Main" module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
36 -- * Generic functions
38 , computeAllocationDelta
39 -- * First phase functions
41 -- * Second phase functions
46 -- * Display functions
49 -- * Balacing functions
57 -- * IAllocator functions
66 -- * Allocation functions
70 -- * Node group functions
76 import Data.Function (on)
78 import Data.Ord (comparing)
79 import Text.Printf (printf)
81 import Control.Parallel.Strategies
83 import qualified Ganeti.HTools.Container as Container
84 import qualified Ganeti.HTools.Instance as Instance
85 import qualified Ganeti.HTools.Node as Node
86 import qualified Ganeti.HTools.Group as Group
87 import Ganeti.HTools.Types
88 import Ganeti.HTools.Utils
89 import qualified Ganeti.OpCodes as OpCodes
93 -- | Allocation\/relocation solution.
94 data AllocSolution = AllocSolution
95 { asFailures :: [FailMode] -- ^ Failure counts
96 , asAllocs :: Int -- ^ Good allocation count
97 , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
98 -- of the list depends on the
99 -- allocation/relocation mode
100 , asLog :: [String] -- ^ A list of informational messages
103 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
104 type AllocResult = (FailStats, Node.List, Instance.List,
105 [Instance.Instance], [CStats])
108 -- | A type denoting the valid allocation mode/pairs.
109 -- For a one-node allocation, this will be a @Left ['Node.Node']@,
110 -- whereas for a two-node allocation, this will be a @Right
111 -- [('Node.Node', 'Node.Node')]@.
112 type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
114 -- | The empty solution we start with when computing allocations
115 emptySolution :: AllocSolution
116 emptySolution = AllocSolution { asFailures = [], asAllocs = 0
117 , asSolutions = [], asLog = [] }
119 -- | The complete state for the balancing solution
120 data Table = Table Node.List Instance.List Score [Placement]
121 deriving (Show, Read)
123 data CStats = CStats { csFmem :: Int -- ^ Cluster free mem
124 , csFdsk :: Int -- ^ Cluster free disk
125 , csAmem :: Int -- ^ Cluster allocatable mem
126 , csAdsk :: Int -- ^ Cluster allocatable disk
127 , csAcpu :: Int -- ^ Cluster allocatable cpus
128 , csMmem :: Int -- ^ Max node allocatable mem
129 , csMdsk :: Int -- ^ Max node allocatable disk
130 , csMcpu :: Int -- ^ Max node allocatable cpu
131 , csImem :: Int -- ^ Instance used mem
132 , csIdsk :: Int -- ^ Instance used disk
133 , csIcpu :: Int -- ^ Instance used cpu
134 , csTmem :: Double -- ^ Cluster total mem
135 , csTdsk :: Double -- ^ Cluster total disk
136 , csTcpu :: Double -- ^ Cluster total cpus
137 , csVcpu :: Int -- ^ Cluster virtual cpus (if
138 -- node pCpu has been set,
140 , csXmem :: Int -- ^ Unnacounted for mem
141 , csNmem :: Int -- ^ Node own memory
142 , csScore :: Score -- ^ The cluster score
143 , csNinst :: Int -- ^ The total number of instances
145 deriving (Show, Read)
147 -- | Currently used, possibly to allocate, unallocable
148 type AllocStats = (RSpec, RSpec, RSpec)
150 -- * Utility functions
152 -- | Verifies the N+1 status and return the affected nodes.
153 verifyN1 :: [Node.Node] -> [Node.Node]
154 verifyN1 = filter Node.failN1
156 {-| Computes the pair of bad nodes and instances.
158 The bad node list is computed via a simple 'verifyN1' check, and the
159 bad instance list is the list of primary and secondary instances of
163 computeBadItems :: Node.List -> Instance.List ->
164 ([Node.Node], [Instance.Instance])
165 computeBadItems nl il =
166 let bad_nodes = verifyN1 $ getOnline nl
167 bad_instances = map (`Container.find` il) .
169 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
171 (bad_nodes, bad_instances)
173 -- | Zero-initializer for the CStats type
174 emptyCStats :: CStats
175 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
177 -- | Update stats with data from a new node
178 updateCStats :: CStats -> Node.Node -> CStats
179 updateCStats cs node =
180 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
181 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
182 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
183 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
184 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
186 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
189 inc_amem = Node.fMem node - Node.rMem node
190 inc_amem' = if inc_amem > 0 then inc_amem else 0
191 inc_adsk = Node.availDisk node
192 inc_imem = truncate (Node.tMem node) - Node.nMem node
193 - Node.xMem node - Node.fMem node
194 inc_icpu = Node.uCpu node
195 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
196 inc_vcpu = Node.hiCpu node
197 inc_acpu = Node.availCpu node
199 in cs { csFmem = x_fmem + Node.fMem node
200 , csFdsk = x_fdsk + Node.fDsk node
201 , csAmem = x_amem + inc_amem'
202 , csAdsk = x_adsk + inc_adsk
203 , csAcpu = x_acpu + inc_acpu
204 , csMmem = max x_mmem inc_amem'
205 , csMdsk = max x_mdsk inc_adsk
206 , csMcpu = max x_mcpu inc_acpu
207 , csImem = x_imem + inc_imem
208 , csIdsk = x_idsk + inc_idsk
209 , csIcpu = x_icpu + inc_icpu
210 , csTmem = x_tmem + Node.tMem node
211 , csTdsk = x_tdsk + Node.tDsk node
212 , csTcpu = x_tcpu + Node.tCpu node
213 , csVcpu = x_vcpu + inc_vcpu
214 , csXmem = x_xmem + Node.xMem node
215 , csNmem = x_nmem + Node.nMem node
216 , csNinst = x_ninst + length (Node.pList node)
219 -- | Compute the total free disk and memory in the cluster.
220 totalResources :: Node.List -> CStats
222 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
223 in cs { csScore = compCV nl }
225 -- | Compute the delta between two cluster state.
227 -- This is used when doing allocations, to understand better the
228 -- available cluster resources. The return value is a triple of the
229 -- current used values, the delta that was still allocated, and what
230 -- was left unallocated.
231 computeAllocationDelta :: CStats -> CStats -> AllocStats
232 computeAllocationDelta cini cfin =
233 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
234 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
235 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
236 rini = RSpec i_icpu i_imem i_idsk
237 rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
238 un_cpu = v_cpu - f_icpu
239 runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
240 in (rini, rfin, runa)
242 -- | The names and weights of the individual elements in the CV list
243 detailedCVInfo :: [(Double, String)]
244 detailedCVInfo = [ (1, "free_mem_cv")
245 , (1, "free_disk_cv")
247 , (1, "reserved_mem_cv")
248 , (4, "offline_all_cnt")
249 , (16, "offline_pri_cnt")
250 , (1, "vcpu_ratio_cv")
253 , (1, "disk_load_cv")
255 , (2, "pri_tags_score")
258 detailedCVWeights :: [Double]
259 detailedCVWeights = map fst detailedCVInfo
261 -- | Compute the mem and disk covariance.
262 compDetailedCV :: Node.List -> [Double]
265 all_nodes = Container.elems nl
266 (offline, nodes) = partition Node.offline all_nodes
267 mem_l = map Node.pMem nodes
268 dsk_l = map Node.pDsk nodes
269 -- metric: memory covariance
270 mem_cv = stdDev mem_l
271 -- metric: disk covariance
272 dsk_cv = stdDev dsk_l
273 -- metric: count of instances living on N1 failing nodes
274 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
275 length (Node.pList n)) .
276 filter Node.failN1 $ nodes :: Double
277 res_l = map Node.pRem nodes
278 -- metric: reserved memory covariance
279 res_cv = stdDev res_l
280 -- offline instances metrics
281 offline_ipri = sum . map (length . Node.pList) $ offline
282 offline_isec = sum . map (length . Node.sList) $ offline
283 -- metric: count of instances on offline nodes
284 off_score = fromIntegral (offline_ipri + offline_isec)::Double
285 -- metric: count of primary instances on offline nodes (this
286 -- helps with evacuation/failover of primary instances on
287 -- 2-node clusters with one node offline)
288 off_pri_score = fromIntegral offline_ipri::Double
289 cpu_l = map Node.pCpu nodes
290 -- metric: covariance of vcpu/pcpu ratio
291 cpu_cv = stdDev cpu_l
292 -- metrics: covariance of cpu, memory, disk and network load
293 (c_load, m_load, d_load, n_load) = unzip4 $
295 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
296 DynUtil c2 m2 d2 n2 = Node.utilPool n
297 in (c1/c2, m1/m2, d1/d2, n1/n2)
299 -- metric: conflicting instance count
300 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
301 pri_tags_score = fromIntegral pri_tags_inst::Double
302 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
303 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
306 -- | Compute the /total/ variance.
307 compCV :: Node.List -> Double
308 compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
310 -- | Compute online nodes from a Node.List
311 getOnline :: Node.List -> [Node.Node]
312 getOnline = filter (not . Node.offline) . Container.elems
316 -- | Compute best table. Note that the ordering of the arguments is important.
317 compareTables :: Table -> Table -> Table
318 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
319 if a_cv > b_cv then b else a
321 -- | Applies an instance move to a given node list and instance.
322 applyMove :: Node.List -> Instance.Instance
323 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
325 applyMove nl inst Failover =
326 let old_pdx = Instance.pNode inst
327 old_sdx = Instance.sNode inst
328 old_p = Container.find old_pdx nl
329 old_s = Container.find old_sdx nl
330 int_p = Node.removePri old_p inst
331 int_s = Node.removeSec old_s inst
332 force_p = Node.offline old_p
333 new_nl = do -- Maybe monad
334 new_p <- Node.addPriEx force_p int_s inst
335 new_s <- Node.addSec int_p inst old_sdx
336 let new_inst = Instance.setBoth inst old_sdx old_pdx
337 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
338 new_inst, old_sdx, old_pdx)
341 -- Replace the primary (f:, r:np, f)
342 applyMove nl inst (ReplacePrimary new_pdx) =
343 let old_pdx = Instance.pNode inst
344 old_sdx = Instance.sNode inst
345 old_p = Container.find old_pdx nl
346 old_s = Container.find old_sdx nl
347 tgt_n = Container.find new_pdx nl
348 int_p = Node.removePri old_p inst
349 int_s = Node.removeSec old_s inst
350 force_p = Node.offline old_p
351 new_nl = do -- Maybe monad
352 -- check that the current secondary can host the instance
353 -- during the migration
354 tmp_s <- Node.addPriEx force_p int_s inst
355 let tmp_s' = Node.removePri tmp_s inst
356 new_p <- Node.addPriEx force_p tgt_n inst
357 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
358 let new_inst = Instance.setPri inst new_pdx
359 return (Container.add new_pdx new_p $
360 Container.addTwo old_pdx int_p old_sdx new_s nl,
361 new_inst, new_pdx, old_sdx)
364 -- Replace the secondary (r:ns)
365 applyMove nl inst (ReplaceSecondary new_sdx) =
366 let old_pdx = Instance.pNode inst
367 old_sdx = Instance.sNode inst
368 old_s = Container.find old_sdx nl
369 tgt_n = Container.find new_sdx nl
370 int_s = Node.removeSec old_s inst
371 force_s = Node.offline old_s
372 new_inst = Instance.setSec inst new_sdx
373 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
374 \new_s -> return (Container.addTwo new_sdx
375 new_s old_sdx int_s nl,
376 new_inst, old_pdx, new_sdx)
379 -- Replace the secondary and failover (r:np, f)
380 applyMove nl inst (ReplaceAndFailover new_pdx) =
381 let old_pdx = Instance.pNode inst
382 old_sdx = Instance.sNode inst
383 old_p = Container.find old_pdx nl
384 old_s = Container.find old_sdx nl
385 tgt_n = Container.find new_pdx nl
386 int_p = Node.removePri old_p inst
387 int_s = Node.removeSec old_s inst
388 force_s = Node.offline old_s
389 new_nl = do -- Maybe monad
390 new_p <- Node.addPri tgt_n inst
391 new_s <- Node.addSecEx force_s int_p inst new_pdx
392 let new_inst = Instance.setBoth inst new_pdx old_pdx
393 return (Container.add new_pdx new_p $
394 Container.addTwo old_pdx new_s old_sdx int_s nl,
395 new_inst, new_pdx, old_pdx)
398 -- Failver and replace the secondary (f, r:ns)
399 applyMove nl inst (FailoverAndReplace new_sdx) =
400 let old_pdx = Instance.pNode inst
401 old_sdx = Instance.sNode inst
402 old_p = Container.find old_pdx nl
403 old_s = Container.find old_sdx nl
404 tgt_n = Container.find new_sdx nl
405 int_p = Node.removePri old_p inst
406 int_s = Node.removeSec old_s inst
407 force_p = Node.offline old_p
408 new_nl = do -- Maybe monad
409 new_p <- Node.addPriEx force_p int_s inst
410 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
411 let new_inst = Instance.setBoth inst old_sdx new_sdx
412 return (Container.add new_sdx new_s $
413 Container.addTwo old_sdx new_p old_pdx int_p nl,
414 new_inst, old_sdx, new_sdx)
417 -- | Tries to allocate an instance on one given node.
418 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
419 -> OpResult Node.AllocElement
420 allocateOnSingle nl inst new_pdx =
421 let p = Container.find new_pdx nl
422 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
423 in Node.addPri p inst >>= \new_p -> do
424 let new_nl = Container.add new_pdx new_p nl
425 new_score = compCV nl
426 return (new_nl, new_inst, [new_p], new_score)
428 -- | Tries to allocate an instance on a given pair of nodes.
429 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
430 -> OpResult Node.AllocElement
431 allocateOnPair nl inst new_pdx new_sdx =
432 let tgt_p = Container.find new_pdx nl
433 tgt_s = Container.find new_sdx nl
435 new_p <- Node.addPri tgt_p inst
436 new_s <- Node.addSec tgt_s inst new_pdx
437 let new_inst = Instance.setBoth inst new_pdx new_sdx
438 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
439 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
441 -- | Tries to perform an instance move and returns the best table
442 -- between the original one and the new one.
443 checkSingleStep :: Table -- ^ The original table
444 -> Instance.Instance -- ^ The instance to move
445 -> Table -- ^ The current best table
446 -> IMove -- ^ The move to apply
447 -> Table -- ^ The final best table
448 checkSingleStep ini_tbl target cur_tbl move =
450 Table ini_nl ini_il _ ini_plc = ini_tbl
451 tmp_resu = applyMove ini_nl target move
455 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
456 let tgt_idx = Instance.idx target
457 upd_cvar = compCV upd_nl
458 upd_il = Container.add tgt_idx new_inst ini_il
459 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
460 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
462 compareTables cur_tbl upd_tbl
464 -- | Given the status of the current secondary as a valid new node and
465 -- the current candidate target node, generate the possible moves for
467 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
468 -> Ndx -- ^ Target node candidate
469 -> [IMove] -- ^ List of valid result moves
470 possibleMoves True tdx =
471 [ReplaceSecondary tdx,
472 ReplaceAndFailover tdx,
474 FailoverAndReplace tdx]
476 possibleMoves False tdx =
477 [ReplaceSecondary tdx,
478 ReplaceAndFailover tdx]
480 -- | Compute the best move for a given instance.
481 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
482 -> Bool -- ^ Whether disk moves are allowed
483 -> Table -- ^ Original table
484 -> Instance.Instance -- ^ Instance to move
485 -> Table -- ^ Best new table for this instance
486 checkInstanceMove nodes_idx disk_moves ini_tbl target =
488 opdx = Instance.pNode target
489 osdx = Instance.sNode target
490 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
491 use_secondary = elem osdx nodes_idx
492 aft_failover = if use_secondary -- if allowed to failover
493 then checkSingleStep ini_tbl target ini_tbl Failover
495 all_moves = if disk_moves
496 then concatMap (possibleMoves use_secondary) nodes
499 -- iterate over the possible nodes for this instance
500 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
502 -- | Compute the best next move.
503 checkMove :: [Ndx] -- ^ Allowed target node indices
504 -> Bool -- ^ Whether disk moves are allowed
505 -> Table -- ^ The current solution
506 -> [Instance.Instance] -- ^ List of instances still to move
507 -> Table -- ^ The new solution
508 checkMove nodes_idx disk_moves ini_tbl victims =
509 let Table _ _ _ ini_plc = ini_tbl
510 -- we're using rwhnf from the Control.Parallel.Strategies
511 -- package; we don't need to use rnf as that would force too
512 -- much evaluation in single-threaded cases, and in
513 -- multi-threaded case the weak head normal form is enough to
514 -- spark the evaluation
515 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves ini_tbl)
517 -- iterate over all instances, computing the best move
518 best_tbl = foldl' compareTables ini_tbl tables
519 Table _ _ _ best_plc = best_tbl
520 in if length best_plc == length ini_plc
521 then ini_tbl -- no advancement
524 -- | Check if we are allowed to go deeper in the balancing
525 doNextBalance :: Table -- ^ The starting table
526 -> Int -- ^ Remaining length
527 -> Score -- ^ Score at which to stop
528 -> Bool -- ^ The resulting table and commands
529 doNextBalance ini_tbl max_rounds min_score =
530 let Table _ _ ini_cv ini_plc = ini_tbl
531 ini_plc_len = length ini_plc
532 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
534 -- | Run a balance move
535 tryBalance :: Table -- ^ The starting table
536 -> Bool -- ^ Allow disk moves
537 -> Bool -- ^ Only evacuate moves
538 -> Score -- ^ Min gain threshold
539 -> Score -- ^ Min gain
540 -> Maybe Table -- ^ The resulting table and commands
541 tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
542 let Table ini_nl ini_il ini_cv _ = ini_tbl
543 all_inst = Container.elems ini_il
544 all_inst' = if evac_mode
545 then let bad_nodes = map Node.idx . filter Node.offline $
546 Container.elems ini_nl
547 in filter (\e -> Instance.sNode e `elem` bad_nodes ||
548 Instance.pNode e `elem` bad_nodes)
551 reloc_inst = filter Instance.movable all_inst'
552 node_idx = map Node.idx . filter (not . Node.offline) $
553 Container.elems ini_nl
554 fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
555 (Table _ _ fin_cv _) = fin_tbl
557 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
558 then Just fin_tbl -- this round made success, return the new table
561 -- * Allocation functions
563 -- | Build failure stats out of a list of failures
564 collapseFailures :: [FailMode] -> FailStats
565 collapseFailures flst =
566 map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
568 -- | Update current Allocation solution and failure stats with new
570 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
571 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
573 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
574 let -- Choose the old or new solution, based on the cluster score
576 osols = asSolutions as
577 nsols = case osols of
579 (_, _, _, oscore):[] ->
583 -- FIXME: here we simply concat to lists with more
584 -- than one element; we should instead abort, since
585 -- this is not a valid usage of this function
588 -- Note: we force evaluation of nsols here in order to keep the
589 -- memory profile low - we know that we will need nsols for sure
590 -- in the next cycle, so we force evaluation of nsols, since the
591 -- foldl' in the caller will only evaluate the tuple, but not the
592 -- elements of the tuple
593 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
595 -- | Sums two allocation solutions (e.g. for two separate node groups).
596 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
597 sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
598 AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
600 -- | Given a solution, generates a reasonable description for it
601 describeSolution :: AllocSolution -> String
602 describeSolution as =
603 let fcnt = asFailures as
604 sols = asSolutions as
606 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
607 filter ((> 0) . snd) . collapseFailures $ fcnt
609 then "No valid allocation solutions, failure reasons: " ++
611 then "unknown reasons"
613 else let (_, _, nodes, cv) = head sols
614 in printf ("score: %.8f, successes %d, failures %d (%s)" ++
615 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
616 (intercalate "/" . map Node.name $ nodes)
618 -- | Annotates a solution with the appropriate string
619 annotateSolution :: AllocSolution -> AllocSolution
620 annotateSolution as = as { asLog = describeSolution as : asLog as }
622 -- | Generate the valid node allocation singles or pairs for a new instance.
623 genAllocNodes :: Group.List -- ^ Group list
624 -> Node.List -- ^ The node map
625 -> Int -- ^ The number of nodes required
626 -> Bool -- ^ Whether to drop or not
628 -> Result AllocNodes -- ^ The (monadic) result
629 genAllocNodes gl nl count drop_unalloc =
630 let filter_fn = if drop_unalloc
631 then filter ((/=) AllocUnallocable . Group.allocPolicy .
632 flip Container.find gl . Node.group)
634 all_nodes = filter_fn $ getOnline nl
635 all_pairs = liftM2 (,) all_nodes all_nodes
636 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
637 Node.group x == Node.group y) all_pairs
639 1 -> Ok (Left (map Node.idx all_nodes))
640 2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
641 _ -> Bad "Unsupported number of nodes, only one or two supported"
643 -- | Try to allocate an instance on the cluster.
644 tryAlloc :: (Monad m) =>
645 Node.List -- ^ The node list
646 -> Instance.List -- ^ The instance list
647 -> Instance.Instance -- ^ The instance to allocate
648 -> AllocNodes -- ^ The allocation targets
649 -> m AllocSolution -- ^ Possible solution list
650 tryAlloc nl _ inst (Right ok_pairs) =
651 let sols = foldl' (\cstate (p, s) ->
652 concatAllocs cstate $ allocateOnPair nl inst p s
653 ) emptySolution ok_pairs
655 in if null ok_pairs -- means we have just one node
656 then fail "Not enough online nodes"
657 else return $ annotateSolution sols
659 tryAlloc nl _ inst (Left all_nodes) =
660 let sols = foldl' (\cstate ->
661 concatAllocs cstate . allocateOnSingle nl inst
662 ) emptySolution all_nodes
664 then fail "No online nodes"
665 else return $ annotateSolution sols
667 -- | Given a group/result, describe it as a nice (list of) messages
668 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
669 solutionDescription gl (groupId, result) =
671 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
672 Bad message -> [printf "Group %s: error %s" gname message]
673 where grp = Container.find groupId gl
674 gname = Group.name grp
675 pol = apolToString (Group.allocPolicy grp)
677 -- | From a list of possibly bad and possibly empty solutions, filter
678 -- only the groups with a valid result
679 filterMGResults :: Group.List
680 -> [(Gdx, Result AllocSolution)]
681 -> [(Gdx, AllocSolution)]
683 filter ((/= AllocUnallocable) . Group.allocPolicy .
684 flip Container.find gl . fst) .
685 filter (not . null . asSolutions . snd) .
686 map (\(y, Ok x) -> (y, x)) .
689 -- | Sort multigroup results based on policy and score
690 sortMGResults :: Group.List
691 -> [(Gdx, AllocSolution)]
692 -> [(Gdx, AllocSolution)]
693 sortMGResults gl sols =
694 let extractScore (_, _, _, x) = x
695 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
696 (extractScore . head . asSolutions) sol)
697 in sortBy (comparing solScore) sols
699 -- | Try to allocate an instance on a multi-group cluster.
700 tryMGAlloc :: Group.List -- ^ The group list
701 -> Node.List -- ^ The node list
702 -> Instance.List -- ^ The instance list
703 -> Instance.Instance -- ^ The instance to allocate
704 -> Int -- ^ Required number of nodes
705 -> Result AllocSolution -- ^ Possible solution list
706 tryMGAlloc mggl mgnl mgil inst cnt =
707 let groups = splitCluster mgnl mgil
708 sols = map (\(gid, (nl, il)) ->
709 (gid, genAllocNodes mggl nl cnt False >>=
710 tryAlloc nl il inst))
711 groups::[(Gdx, Result AllocSolution)]
712 all_msgs = concatMap (solutionDescription mggl) sols
713 goodSols = filterMGResults mggl sols
714 sortedSols = sortMGResults mggl goodSols
715 in if null sortedSols
716 then Bad $ intercalate ", " all_msgs
717 else let (final_group, final_sol) = head sortedSols
718 final_name = Group.name $ Container.find final_group mggl
719 selmsg = "Selected group: " ++ final_name
720 in Ok $ final_sol { asLog = selmsg:all_msgs }
722 -- | Try to relocate an instance on the cluster.
723 tryReloc :: (Monad m) =>
724 Node.List -- ^ The node list
725 -> Instance.List -- ^ The instance list
726 -> Idx -- ^ The index of the instance to move
727 -> Int -- ^ The number of nodes required
728 -> [Ndx] -- ^ Nodes which should not be used
729 -> m AllocSolution -- ^ Solution list
730 tryReloc nl il xid 1 ex_idx =
731 let all_nodes = getOnline nl
732 inst = Container.find xid il
733 ex_idx' = Instance.pNode inst:ex_idx
734 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
735 valid_idxes = map Node.idx valid_nodes
736 sols1 = foldl' (\cstate x ->
739 applyMove nl inst (ReplaceSecondary x)
740 return (mnl, i, [Container.find x mnl],
742 in concatAllocs cstate em
743 ) emptySolution valid_idxes
746 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
747 \destinations required (" ++ show reqn ++
748 "), only one supported"
750 tryMGReloc :: (Monad m) =>
751 Group.List -- ^ The group list
752 -> Node.List -- ^ The node list
753 -> Instance.List -- ^ The instance list
754 -> Idx -- ^ The index of the instance to move
755 -> Int -- ^ The number of nodes required
756 -> [Ndx] -- ^ Nodes which should not be used
757 -> m AllocSolution -- ^ Solution list
758 tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
759 let groups = splitCluster mgnl mgil
760 -- TODO: we only relocate inside the group for now
761 inst = Container.find xid mgil
762 (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
763 Nothing -> fail $ "Cannot find group for instance " ++
766 tryReloc nl il xid ncount ex_ndx
768 -- | Change an instance's secondary node
769 evacInstance :: (Monad m) =>
770 [Ndx] -- ^ Excluded nodes
771 -> Instance.List -- ^ The current instance list
772 -> (Node.List, AllocSolution) -- ^ The current state
773 -> Idx -- ^ The instance to evacuate
774 -> m (Node.List, AllocSolution)
775 evacInstance ex_ndx il (nl, old_as) idx = do
776 -- FIXME: hardcoded one node here
778 -- Longer explanation: evacuation is currently hardcoded to DRBD
779 -- instances (which have one secondary); hence, even if the
780 -- IAllocator protocol can request N nodes for an instance, and all
781 -- the message parsing/loading pass this, this implementation only
782 -- supports one; this situation needs to be revisited if we ever
783 -- support more than one secondary, or if we change the storage
785 new_as <- tryReloc nl il idx 1 ex_ndx
786 case asSolutions new_as of
787 -- an individual relocation succeeded, we kind of compose the data
788 -- from the two solutions
789 csol@(nl', _, _, _):_ ->
790 return (nl', new_as { asSolutions = csol:asSolutions old_as })
791 -- this relocation failed, so we fail the entire evac
792 _ -> fail $ "Can't evacuate instance " ++
793 Instance.name (Container.find idx il) ++
794 ": " ++ describeSolution new_as
796 -- | Try to evacuate a list of nodes.
797 tryEvac :: (Monad m) =>
798 Node.List -- ^ The node list
799 -> Instance.List -- ^ The instance list
800 -> [Idx] -- ^ Instances to be evacuated
801 -> [Ndx] -- ^ Restricted nodes (the ones being evacuated)
802 -> m AllocSolution -- ^ Solution list
803 tryEvac nl il idxs ex_ndx = do
804 (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptySolution) idxs
807 -- | Multi-group evacuation of a list of nodes.
808 tryMGEvac :: (Monad m) =>
809 Group.List -- ^ The group list
810 -> Node.List -- ^ The node list
811 -> Instance.List -- ^ The instance list
812 -> [Ndx] -- ^ Nodes to be evacuated
813 -> m AllocSolution -- ^ Solution list
814 tryMGEvac _ nl il ex_ndx =
815 let ex_nodes = map (`Container.find` nl) ex_ndx
816 all_insts = nub . concatMap Node.sList $ ex_nodes
817 gni = splitCluster nl il
818 -- we run the instance index list through a couple of maps to
819 -- get finally to a structure of the type [(group index,
820 -- [instance indices])]
821 all_insts' = map (\idx ->
822 (instancePriGroup nl (Container.find idx il),
824 all_insts'' = groupBy ((==) `on` fst) all_insts'
825 all_insts3 = map (\xs -> let (gdxs, idxs) = unzip xs
826 in (head gdxs, idxs)) all_insts''
828 -- that done, we now add the per-group nl/il to the tuple
830 mapM (\(gdx, idxs) ->
831 case lookup gdx gni of
832 Nothing -> fail $ "Can't find group index " ++ show gdx
833 Just (gnl, gil) -> return (gdx, gnl, gil, idxs))
835 results <- mapM (\(_, gnl, gil, idxs) -> tryEvac gnl gil idxs ex_ndx)
837 let sol = foldl' sumAllocs emptySolution results
838 return $ annotateSolution sol
840 -- | Recursively place instances on the cluster until we're out of space
841 iterateAlloc :: Node.List
845 -> [Instance.Instance]
847 -> Result AllocResult
848 iterateAlloc nl il newinst allocnodes ixes cstats =
849 let depth = length ixes
850 newname = printf "new-%d" depth::String
851 newidx = length (Container.elems il) + depth
852 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
853 in case tryAlloc nl il newi2 allocnodes of
855 Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
857 [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
858 (xnl, xi, _, _):[] ->
859 iterateAlloc xnl (Container.add newidx xi il)
860 newinst allocnodes (xi:ixes)
861 (totalResources xnl:cstats)
862 _ -> Bad "Internal error: multiple solutions for single\
865 -- | The core of the tiered allocation mode
866 tieredAlloc :: Node.List
870 -> [Instance.Instance]
872 -> Result AllocResult
873 tieredAlloc nl il newinst allocnodes ixes cstats =
874 case iterateAlloc nl il newinst allocnodes ixes cstats of
876 Ok (errs, nl', il', ixes', cstats') ->
877 case Instance.shrinkByType newinst . fst . last $
878 sortBy (comparing snd) errs of
879 Bad _ -> Ok (errs, nl', il', ixes', cstats')
881 tieredAlloc nl' il' newinst' allocnodes ixes' cstats'
883 -- | Compute the tiered spec string description from a list of
884 -- allocated instances.
885 tieredSpecMap :: [Instance.Instance]
887 tieredSpecMap trl_ixes =
888 let fin_trl_ixes = reverse trl_ixes
889 ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
890 spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
892 in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
893 (rspecDsk spec) (rspecCpu spec) cnt) spec_map
895 -- * Formatting functions
897 -- | Given the original and final nodes, computes the relocation description.
898 computeMoves :: Instance.Instance -- ^ The instance to be moved
899 -> String -- ^ The instance name
900 -> IMove -- ^ The move being performed
901 -> String -- ^ New primary
902 -> String -- ^ New secondary
903 -> (String, [String])
904 -- ^ Tuple of moves and commands list; moves is containing
905 -- either @/f/@ for failover or @/r:name/@ for replace
906 -- secondary, while the command list holds gnt-instance
907 -- commands (without that prefix), e.g \"@failover instance1@\"
908 computeMoves i inam mv c d =
910 Failover -> ("f", [mig])
911 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
912 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
913 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
914 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
915 where morf = if Instance.running i then "migrate" else "failover"
916 mig = printf "%s -f %s" morf inam::String
917 rep n = printf "replace-disks -n %s %s" n inam
919 -- | Converts a placement to string format.
920 printSolutionLine :: Node.List -- ^ The node list
921 -> Instance.List -- ^ The instance list
922 -> Int -- ^ Maximum node name length
923 -> Int -- ^ Maximum instance name length
924 -> Placement -- ^ The current placement
925 -> Int -- ^ The index of the placement in
927 -> (String, [String])
928 printSolutionLine nl il nmlen imlen plc pos =
930 pmlen = (2*nmlen + 1)
931 (i, p, s, mv, c) = plc
932 inst = Container.find i il
933 inam = Instance.alias inst
934 npri = Node.alias $ Container.find p nl
935 nsec = Node.alias $ Container.find s nl
936 opri = Node.alias $ Container.find (Instance.pNode inst) nl
937 osec = Node.alias $ Container.find (Instance.sNode inst) nl
938 (moves, cmds) = computeMoves inst inam mv npri nsec
939 ostr = printf "%s:%s" opri osec::String
940 nstr = printf "%s:%s" npri nsec::String
942 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
943 pos imlen inam pmlen ostr
947 -- | Return the instance and involved nodes in an instance move.
948 involvedNodes :: Instance.List -> Placement -> [Ndx]
949 involvedNodes il plc =
950 let (i, np, ns, _, _) = plc
951 inst = Container.find i il
952 op = Instance.pNode inst
953 os = Instance.sNode inst
954 in nub [np, ns, op, os]
956 -- | Inner function for splitJobs, that either appends the next job to
957 -- the current jobset, or starts a new jobset.
958 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
959 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
960 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
961 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
962 | otherwise = ([n]:cjs, ndx)
964 -- | Break a list of moves into independent groups. Note that this
965 -- will reverse the order of jobs.
966 splitJobs :: [MoveJob] -> [JobSet]
967 splitJobs = fst . foldl mergeJobs ([], [])
969 -- | Given a list of commands, prefix them with @gnt-instance@ and
970 -- also beautify the display a little.
971 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
972 formatJob jsn jsl (sn, (_, _, _, cmds)) =
974 printf " echo job %d/%d" jsn sn:
976 map (" gnt-instance " ++) cmds
978 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
981 -- | Given a list of commands, prefix them with @gnt-instance@ and
982 -- also beautify the display a little.
983 formatCmds :: [JobSet] -> String
986 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
990 -- | Print the node list.
991 printNodes :: Node.List -> [String] -> String
993 let fields = case fs of
994 [] -> Node.defaultFields
995 "+":rest -> Node.defaultFields ++ rest
997 snl = sortBy (comparing Node.idx) (Container.elems nl)
998 (header, isnum) = unzip $ map Node.showHeader fields
999 in unlines . map ((:) ' ' . intercalate " ") $
1000 formatTable (header:map (Node.list fields) snl) isnum
1002 -- | Print the instance list.
1003 printInsts :: Node.List -> Instance.List -> String
1005 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1006 helper inst = [ if Instance.running inst then "R" else " "
1007 , Instance.name inst
1008 , Container.nameOf nl (Instance.pNode inst)
1009 , let sdx = Instance.sNode inst
1010 in if sdx == Node.noSecondary
1012 else Container.nameOf nl sdx
1013 , printf "%3d" $ Instance.vcpus inst
1014 , printf "%5d" $ Instance.mem inst
1015 , printf "%5d" $ Instance.dsk inst `div` 1024
1021 where DynUtil lC lM lD lN = Instance.util inst
1022 header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
1023 , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1024 isnum = False:False:False:False:repeat True
1025 in unlines . map ((:) ' ' . intercalate " ") $
1026 formatTable (header:map helper sil) isnum
1028 -- | Shows statistics for a given node list.
1029 printStats :: Node.List -> String
1031 let dcvs = compDetailedCV nl
1032 (weights, names) = unzip detailedCVInfo
1033 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1034 formatted = map (\(w, header, val) ->
1035 printf "%s=%.8f(x%.2f)" header val w::String) hd
1036 in intercalate ", " formatted
1038 -- | Convert a placement into a list of OpCodes (basically a job).
1039 iMoveToJob :: Node.List -> Instance.List
1040 -> Idx -> IMove -> [OpCodes.OpCode]
1041 iMoveToJob nl il idx move =
1042 let inst = Container.find idx il
1043 iname = Instance.name inst
1044 lookNode = Just . Container.nameOf nl
1045 opF = OpCodes.OpMigrateInstance iname True False True
1046 opR n = OpCodes.OpReplaceDisks iname (lookNode n)
1047 OpCodes.ReplaceNewSecondary [] Nothing
1050 ReplacePrimary np -> [ opF, opR np, opF ]
1051 ReplaceSecondary ns -> [ opR ns ]
1052 ReplaceAndFailover np -> [ opR np, opF ]
1053 FailoverAndReplace ns -> [ opF, opR ns ]
1055 -- * Node group functions
1057 -- | Computes the group of an instance
1058 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1059 instanceGroup nl i =
1060 let sidx = Instance.sNode i
1061 pnode = Container.find (Instance.pNode i) nl
1062 snode = if sidx == Node.noSecondary
1064 else Container.find sidx nl
1065 pgroup = Node.group pnode
1066 sgroup = Node.group snode
1067 in if pgroup /= sgroup
1068 then fail ("Instance placed accross two node groups, primary " ++
1069 show pgroup ++ ", secondary " ++ show sgroup)
1072 -- | Computes the group of an instance per the primary node
1073 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1074 instancePriGroup nl i =
1075 let pnode = Container.find (Instance.pNode i) nl
1078 -- | Compute the list of badly allocated instances (split across node
1080 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1081 findSplitInstances nl =
1082 filter (not . isOk . instanceGroup nl) . Container.elems
1084 -- | Splits a cluster into the component node groups
1085 splitCluster :: Node.List -> Instance.List ->
1086 [(Gdx, (Node.List, Instance.List))]
1087 splitCluster nl il =
1088 let ngroups = Node.computeGroups (Container.elems nl)
1089 in map (\(guuid, nodes) ->
1090 let nidxs = map Node.idx nodes
1091 nodes' = zip nidxs nodes
1092 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1093 in (guuid, (Container.fromList nodes', instances))) ngroups