1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
37 -- * Generic functions
39 , computeAllocationDelta
40 -- * First phase functions
42 -- * Second phase functions
47 -- * Display functions
50 -- * Balacing functions
59 -- * IAllocator functions
70 -- * Allocation functions
74 -- * Node group functions
80 import Data.Function (on)
81 import qualified Data.IntSet as IntSet
83 import Data.Maybe (fromJust)
84 import Data.Ord (comparing)
85 import Text.Printf (printf)
87 import Control.Parallel.Strategies
89 import qualified Ganeti.HTools.Container as Container
90 import qualified Ganeti.HTools.Instance as Instance
91 import qualified Ganeti.HTools.Node as Node
92 import qualified Ganeti.HTools.Group as Group
93 import Ganeti.HTools.Types
94 import Ganeti.HTools.Utils
95 import qualified Ganeti.OpCodes as OpCodes
99 -- | Allocation\/relocation solution.
100 data AllocSolution = AllocSolution
101 { asFailures :: [FailMode] -- ^ Failure counts
102 , asAllocs :: Int -- ^ Good allocation count
103 , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
104 -- of the list depends on the
105 -- allocation/relocation mode
106 , asLog :: [String] -- ^ A list of informational messages
109 -- | Node evacuation/group change iallocator result type. This result
110 -- type consists of actual opcodes (a restricted subset) that are
111 -- transmitted back to Ganeti.
112 data EvacSolution = EvacSolution
113 { esMoved :: [String] -- ^ Instance moved successfully
114 , esFailed :: [String] -- ^ Instance which were not
116 , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
119 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
120 type AllocResult = (FailStats, Node.List, Instance.List,
121 [Instance.Instance], [CStats])
123 -- | A type denoting the valid allocation mode/pairs.
125 -- For a one-node allocation, this will be a @Left ['Node.Node']@,
126 -- whereas for a two-node allocation, this will be a @Right
127 -- [('Node.Node', 'Node.Node')]@.
128 type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
130 -- | The empty solution we start with when computing allocations.
131 emptyAllocSolution :: AllocSolution
132 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
133 , asSolutions = [], asLog = [] }
135 -- | The empty evac solution.
136 emptyEvacSolution :: EvacSolution
137 emptyEvacSolution = EvacSolution { esMoved = []
142 -- | The complete state for the balancing solution.
143 data Table = Table Node.List Instance.List Score [Placement]
144 deriving (Show, Read)
146 data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
147 , csFdsk :: Integer -- ^ Cluster free disk
148 , csAmem :: Integer -- ^ Cluster allocatable mem
149 , csAdsk :: Integer -- ^ Cluster allocatable disk
150 , csAcpu :: Integer -- ^ Cluster allocatable cpus
151 , csMmem :: Integer -- ^ Max node allocatable mem
152 , csMdsk :: Integer -- ^ Max node allocatable disk
153 , csMcpu :: Integer -- ^ Max node allocatable cpu
154 , csImem :: Integer -- ^ Instance used mem
155 , csIdsk :: Integer -- ^ Instance used disk
156 , csIcpu :: Integer -- ^ Instance used cpu
157 , csTmem :: Double -- ^ Cluster total mem
158 , csTdsk :: Double -- ^ Cluster total disk
159 , csTcpu :: Double -- ^ Cluster total cpus
160 , csVcpu :: Integer -- ^ Cluster virtual cpus (if
161 -- node pCpu has been set,
163 , csXmem :: Integer -- ^ Unnacounted for mem
164 , csNmem :: Integer -- ^ Node own memory
165 , csScore :: Score -- ^ The cluster score
166 , csNinst :: Int -- ^ The total number of instances
168 deriving (Show, Read)
170 -- | Currently used, possibly to allocate, unallocable.
171 type AllocStats = (RSpec, RSpec, RSpec)
173 -- * Utility functions
175 -- | Verifies the N+1 status and return the affected nodes.
176 verifyN1 :: [Node.Node] -> [Node.Node]
177 verifyN1 = filter Node.failN1
179 {-| Computes the pair of bad nodes and instances.
181 The bad node list is computed via a simple 'verifyN1' check, and the
182 bad instance list is the list of primary and secondary instances of
186 computeBadItems :: Node.List -> Instance.List ->
187 ([Node.Node], [Instance.Instance])
188 computeBadItems nl il =
189 let bad_nodes = verifyN1 $ getOnline nl
190 bad_instances = map (`Container.find` il) .
192 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
194 (bad_nodes, bad_instances)
196 -- | Zero-initializer for the CStats type.
197 emptyCStats :: CStats
198 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
200 -- | Update stats with data from a new node.
201 updateCStats :: CStats -> Node.Node -> CStats
202 updateCStats cs node =
203 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
204 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
205 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
206 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
207 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
209 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
212 inc_amem = Node.fMem node - Node.rMem node
213 inc_amem' = if inc_amem > 0 then inc_amem else 0
214 inc_adsk = Node.availDisk node
215 inc_imem = truncate (Node.tMem node) - Node.nMem node
216 - Node.xMem node - Node.fMem node
217 inc_icpu = Node.uCpu node
218 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
219 inc_vcpu = Node.hiCpu node
220 inc_acpu = Node.availCpu node
222 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
223 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
224 , csAmem = x_amem + fromIntegral inc_amem'
225 , csAdsk = x_adsk + fromIntegral inc_adsk
226 , csAcpu = x_acpu + fromIntegral inc_acpu
227 , csMmem = max x_mmem (fromIntegral inc_amem')
228 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
229 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
230 , csImem = x_imem + fromIntegral inc_imem
231 , csIdsk = x_idsk + fromIntegral inc_idsk
232 , csIcpu = x_icpu + fromIntegral inc_icpu
233 , csTmem = x_tmem + Node.tMem node
234 , csTdsk = x_tdsk + Node.tDsk node
235 , csTcpu = x_tcpu + Node.tCpu node
236 , csVcpu = x_vcpu + fromIntegral inc_vcpu
237 , csXmem = x_xmem + fromIntegral (Node.xMem node)
238 , csNmem = x_nmem + fromIntegral (Node.nMem node)
239 , csNinst = x_ninst + length (Node.pList node)
242 -- | Compute the total free disk and memory in the cluster.
243 totalResources :: Node.List -> CStats
245 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
246 in cs { csScore = compCV nl }
248 -- | Compute the delta between two cluster state.
250 -- This is used when doing allocations, to understand better the
251 -- available cluster resources. The return value is a triple of the
252 -- current used values, the delta that was still allocated, and what
253 -- was left unallocated.
254 computeAllocationDelta :: CStats -> CStats -> AllocStats
255 computeAllocationDelta cini cfin =
256 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
257 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
258 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
259 rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
260 (fromIntegral i_idsk)
261 rfin = RSpec (fromIntegral (f_icpu - i_icpu))
262 (fromIntegral (f_imem - i_imem))
263 (fromIntegral (f_idsk - i_idsk))
264 un_cpu = fromIntegral (v_cpu - f_icpu)::Int
265 runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
266 (truncate t_dsk - fromIntegral f_idsk)
267 in (rini, rfin, runa)
269 -- | The names and weights of the individual elements in the CV list.
270 detailedCVInfo :: [(Double, String)]
271 detailedCVInfo = [ (1, "free_mem_cv")
272 , (1, "free_disk_cv")
274 , (1, "reserved_mem_cv")
275 , (4, "offline_all_cnt")
276 , (16, "offline_pri_cnt")
277 , (1, "vcpu_ratio_cv")
280 , (1, "disk_load_cv")
282 , (2, "pri_tags_score")
285 detailedCVWeights :: [Double]
286 detailedCVWeights = map fst detailedCVInfo
288 -- | Compute the mem and disk covariance.
289 compDetailedCV :: [Node.Node] -> [Double]
290 compDetailedCV all_nodes =
292 (offline, nodes) = partition Node.offline all_nodes
293 mem_l = map Node.pMem nodes
294 dsk_l = map Node.pDsk nodes
295 -- metric: memory covariance
296 mem_cv = stdDev mem_l
297 -- metric: disk covariance
298 dsk_cv = stdDev dsk_l
299 -- metric: count of instances living on N1 failing nodes
300 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
301 length (Node.pList n)) .
302 filter Node.failN1 $ nodes :: Double
303 res_l = map Node.pRem nodes
304 -- metric: reserved memory covariance
305 res_cv = stdDev res_l
306 -- offline instances metrics
307 offline_ipri = sum . map (length . Node.pList) $ offline
308 offline_isec = sum . map (length . Node.sList) $ offline
309 -- metric: count of instances on offline nodes
310 off_score = fromIntegral (offline_ipri + offline_isec)::Double
311 -- metric: count of primary instances on offline nodes (this
312 -- helps with evacuation/failover of primary instances on
313 -- 2-node clusters with one node offline)
314 off_pri_score = fromIntegral offline_ipri::Double
315 cpu_l = map Node.pCpu nodes
316 -- metric: covariance of vcpu/pcpu ratio
317 cpu_cv = stdDev cpu_l
318 -- metrics: covariance of cpu, memory, disk and network load
319 (c_load, m_load, d_load, n_load) = unzip4 $
321 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
322 DynUtil c2 m2 d2 n2 = Node.utilPool n
323 in (c1/c2, m1/m2, d1/d2, n1/n2)
325 -- metric: conflicting instance count
326 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
327 pri_tags_score = fromIntegral pri_tags_inst::Double
328 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
329 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
332 -- | Compute the /total/ variance.
333 compCVNodes :: [Node.Node] -> Double
334 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
336 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
337 compCV :: Node.List -> Double
338 compCV = compCVNodes . Container.elems
341 -- | Compute online nodes from a 'Node.List'.
342 getOnline :: Node.List -> [Node.Node]
343 getOnline = filter (not . Node.offline) . Container.elems
345 -- * Balancing functions
347 -- | Compute best table. Note that the ordering of the arguments is important.
348 compareTables :: Table -> Table -> Table
349 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
350 if a_cv > b_cv then b else a
352 -- | Applies an instance move to a given node list and instance.
353 applyMove :: Node.List -> Instance.Instance
354 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
356 applyMove nl inst Failover =
357 let old_pdx = Instance.pNode inst
358 old_sdx = Instance.sNode inst
359 old_p = Container.find old_pdx nl
360 old_s = Container.find old_sdx nl
361 int_p = Node.removePri old_p inst
362 int_s = Node.removeSec old_s inst
363 force_p = Node.offline old_p
364 new_nl = do -- Maybe monad
365 new_p <- Node.addPriEx force_p int_s inst
366 new_s <- Node.addSec int_p inst old_sdx
367 let new_inst = Instance.setBoth inst old_sdx old_pdx
368 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
369 new_inst, old_sdx, old_pdx)
372 -- Replace the primary (f:, r:np, f)
373 applyMove nl inst (ReplacePrimary new_pdx) =
374 let old_pdx = Instance.pNode inst
375 old_sdx = Instance.sNode inst
376 old_p = Container.find old_pdx nl
377 old_s = Container.find old_sdx nl
378 tgt_n = Container.find new_pdx nl
379 int_p = Node.removePri old_p inst
380 int_s = Node.removeSec old_s inst
381 force_p = Node.offline old_p
382 new_nl = do -- Maybe monad
383 -- check that the current secondary can host the instance
384 -- during the migration
385 tmp_s <- Node.addPriEx force_p int_s inst
386 let tmp_s' = Node.removePri tmp_s inst
387 new_p <- Node.addPriEx force_p tgt_n inst
388 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
389 let new_inst = Instance.setPri inst new_pdx
390 return (Container.add new_pdx new_p $
391 Container.addTwo old_pdx int_p old_sdx new_s nl,
392 new_inst, new_pdx, old_sdx)
395 -- Replace the secondary (r:ns)
396 applyMove nl inst (ReplaceSecondary new_sdx) =
397 let old_pdx = Instance.pNode inst
398 old_sdx = Instance.sNode inst
399 old_s = Container.find old_sdx nl
400 tgt_n = Container.find new_sdx nl
401 int_s = Node.removeSec old_s inst
402 force_s = Node.offline old_s
403 new_inst = Instance.setSec inst new_sdx
404 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
405 \new_s -> return (Container.addTwo new_sdx
406 new_s old_sdx int_s nl,
407 new_inst, old_pdx, new_sdx)
410 -- Replace the secondary and failover (r:np, f)
411 applyMove nl inst (ReplaceAndFailover new_pdx) =
412 let old_pdx = Instance.pNode inst
413 old_sdx = Instance.sNode inst
414 old_p = Container.find old_pdx nl
415 old_s = Container.find old_sdx nl
416 tgt_n = Container.find new_pdx nl
417 int_p = Node.removePri old_p inst
418 int_s = Node.removeSec old_s inst
419 force_s = Node.offline old_s
420 new_nl = do -- Maybe monad
421 new_p <- Node.addPri tgt_n inst
422 new_s <- Node.addSecEx force_s int_p inst new_pdx
423 let new_inst = Instance.setBoth inst new_pdx old_pdx
424 return (Container.add new_pdx new_p $
425 Container.addTwo old_pdx new_s old_sdx int_s nl,
426 new_inst, new_pdx, old_pdx)
429 -- Failver and replace the secondary (f, r:ns)
430 applyMove nl inst (FailoverAndReplace new_sdx) =
431 let old_pdx = Instance.pNode inst
432 old_sdx = Instance.sNode inst
433 old_p = Container.find old_pdx nl
434 old_s = Container.find old_sdx nl
435 tgt_n = Container.find new_sdx nl
436 int_p = Node.removePri old_p inst
437 int_s = Node.removeSec old_s inst
438 force_p = Node.offline old_p
439 new_nl = do -- Maybe monad
440 new_p <- Node.addPriEx force_p int_s inst
441 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
442 let new_inst = Instance.setBoth inst old_sdx new_sdx
443 return (Container.add new_sdx new_s $
444 Container.addTwo old_sdx new_p old_pdx int_p nl,
445 new_inst, old_sdx, new_sdx)
448 -- | Tries to allocate an instance on one given node.
449 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
450 -> OpResult Node.AllocElement
451 allocateOnSingle nl inst new_pdx =
452 let p = Container.find new_pdx nl
453 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
454 in Node.addPri p inst >>= \new_p -> do
455 let new_nl = Container.add new_pdx new_p nl
456 new_score = compCV nl
457 return (new_nl, new_inst, [new_p], new_score)
459 -- | Tries to allocate an instance on a given pair of nodes.
460 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
461 -> OpResult Node.AllocElement
462 allocateOnPair nl inst new_pdx new_sdx =
463 let tgt_p = Container.find new_pdx nl
464 tgt_s = Container.find new_sdx nl
466 new_p <- Node.addPri tgt_p inst
467 new_s <- Node.addSec tgt_s inst new_pdx
468 let new_inst = Instance.setBoth inst new_pdx new_sdx
469 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
470 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
472 -- | Tries to perform an instance move and returns the best table
473 -- between the original one and the new one.
474 checkSingleStep :: Table -- ^ The original table
475 -> Instance.Instance -- ^ The instance to move
476 -> Table -- ^ The current best table
477 -> IMove -- ^ The move to apply
478 -> Table -- ^ The final best table
479 checkSingleStep ini_tbl target cur_tbl move =
481 Table ini_nl ini_il _ ini_plc = ini_tbl
482 tmp_resu = applyMove ini_nl target move
486 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
487 let tgt_idx = Instance.idx target
488 upd_cvar = compCV upd_nl
489 upd_il = Container.add tgt_idx new_inst ini_il
490 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
491 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
493 compareTables cur_tbl upd_tbl
495 -- | Given the status of the current secondary as a valid new node and
496 -- the current candidate target node, generate the possible moves for
498 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
499 -> Bool -- ^ Whether we can change the primary node
500 -> Ndx -- ^ Target node candidate
501 -> [IMove] -- ^ List of valid result moves
503 possibleMoves _ False tdx =
504 [ReplaceSecondary tdx]
506 possibleMoves True True tdx =
507 [ReplaceSecondary tdx,
508 ReplaceAndFailover tdx,
510 FailoverAndReplace tdx]
512 possibleMoves False True tdx =
513 [ReplaceSecondary tdx,
514 ReplaceAndFailover tdx]
516 -- | Compute the best move for a given instance.
517 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
518 -> Bool -- ^ Whether disk moves are allowed
519 -> Bool -- ^ Whether instance moves are allowed
520 -> Table -- ^ Original table
521 -> Instance.Instance -- ^ Instance to move
522 -> Table -- ^ Best new table for this instance
523 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
525 opdx = Instance.pNode target
526 osdx = Instance.sNode target
527 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
528 use_secondary = elem osdx nodes_idx && inst_moves
529 aft_failover = if use_secondary -- if allowed to failover
530 then checkSingleStep ini_tbl target ini_tbl Failover
532 all_moves = if disk_moves
534 (possibleMoves use_secondary inst_moves) nodes
537 -- iterate over the possible nodes for this instance
538 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
540 -- | Compute the best next move.
541 checkMove :: [Ndx] -- ^ Allowed target node indices
542 -> Bool -- ^ Whether disk moves are allowed
543 -> Bool -- ^ Whether instance moves are allowed
544 -> Table -- ^ The current solution
545 -> [Instance.Instance] -- ^ List of instances still to move
546 -> Table -- ^ The new solution
547 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
548 let Table _ _ _ ini_plc = ini_tbl
549 -- we're using rwhnf from the Control.Parallel.Strategies
550 -- package; we don't need to use rnf as that would force too
551 -- much evaluation in single-threaded cases, and in
552 -- multi-threaded case the weak head normal form is enough to
553 -- spark the evaluation
554 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
557 -- iterate over all instances, computing the best move
558 best_tbl = foldl' compareTables ini_tbl tables
559 Table _ _ _ best_plc = best_tbl
560 in if length best_plc == length ini_plc
561 then ini_tbl -- no advancement
564 -- | Check if we are allowed to go deeper in the balancing.
565 doNextBalance :: Table -- ^ The starting table
566 -> Int -- ^ Remaining length
567 -> Score -- ^ Score at which to stop
568 -> Bool -- ^ The resulting table and commands
569 doNextBalance ini_tbl max_rounds min_score =
570 let Table _ _ ini_cv ini_plc = ini_tbl
571 ini_plc_len = length ini_plc
572 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
574 -- | Run a balance move.
575 tryBalance :: Table -- ^ The starting table
576 -> Bool -- ^ Allow disk moves
577 -> Bool -- ^ Allow instance moves
578 -> Bool -- ^ Only evacuate moves
579 -> Score -- ^ Min gain threshold
580 -> Score -- ^ Min gain
581 -> Maybe Table -- ^ The resulting table and commands
582 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
583 let Table ini_nl ini_il ini_cv _ = ini_tbl
584 all_inst = Container.elems ini_il
585 all_inst' = if evac_mode
586 then let bad_nodes = map Node.idx . filter Node.offline $
587 Container.elems ini_nl
588 in filter (\e -> Instance.sNode e `elem` bad_nodes ||
589 Instance.pNode e `elem` bad_nodes)
592 reloc_inst = filter Instance.movable all_inst'
593 node_idx = map Node.idx . filter (not . Node.offline) $
594 Container.elems ini_nl
595 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
596 (Table _ _ fin_cv _) = fin_tbl
598 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
599 then Just fin_tbl -- this round made success, return the new table
602 -- * Allocation functions
604 -- | Build failure stats out of a list of failures.
605 collapseFailures :: [FailMode] -> FailStats
606 collapseFailures flst =
607 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
610 -- | Update current Allocation solution and failure stats with new
612 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
613 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
615 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
616 let -- Choose the old or new solution, based on the cluster score
618 osols = asSolutions as
619 nsols = case osols of
621 (_, _, _, oscore):[] ->
625 -- FIXME: here we simply concat to lists with more
626 -- than one element; we should instead abort, since
627 -- this is not a valid usage of this function
630 -- Note: we force evaluation of nsols here in order to keep the
631 -- memory profile low - we know that we will need nsols for sure
632 -- in the next cycle, so we force evaluation of nsols, since the
633 -- foldl' in the caller will only evaluate the tuple, but not the
634 -- elements of the tuple
635 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
637 -- | Sums two allocation solutions (e.g. for two separate node groups).
638 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
639 sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) =
640 AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl)
642 -- | Given a solution, generates a reasonable description for it.
643 describeSolution :: AllocSolution -> String
644 describeSolution as =
645 let fcnt = asFailures as
646 sols = asSolutions as
648 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
649 filter ((> 0) . snd) . collapseFailures $ fcnt
651 then "No valid allocation solutions, failure reasons: " ++
653 then "unknown reasons"
655 else let (_, _, nodes, cv) = head sols
656 in printf ("score: %.8f, successes %d, failures %d (%s)" ++
657 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
658 (intercalate "/" . map Node.name $ nodes)
660 -- | Annotates a solution with the appropriate string.
661 annotateSolution :: AllocSolution -> AllocSolution
662 annotateSolution as = as { asLog = describeSolution as : asLog as }
664 -- | Reverses an evacuation solution.
666 -- Rationale: we always concat the results to the top of the lists, so
667 -- for proper jobset execution, we should reverse all lists.
668 reverseEvacSolution :: EvacSolution -> EvacSolution
669 reverseEvacSolution (EvacSolution f m o) =
670 EvacSolution (reverse f) (reverse m) (reverse o)
672 -- | Generate the valid node allocation singles or pairs for a new instance.
673 genAllocNodes :: Group.List -- ^ Group list
674 -> Node.List -- ^ The node map
675 -> Int -- ^ The number of nodes required
676 -> Bool -- ^ Whether to drop or not
678 -> Result AllocNodes -- ^ The (monadic) result
679 genAllocNodes gl nl count drop_unalloc =
680 let filter_fn = if drop_unalloc
681 then filter (Group.isAllocable .
682 flip Container.find gl . Node.group)
684 all_nodes = filter_fn $ getOnline nl
685 all_pairs = liftM2 (,) all_nodes all_nodes
686 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
687 Node.group x == Node.group y) all_pairs
689 1 -> Ok (Left (map Node.idx all_nodes))
690 2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
691 _ -> Bad "Unsupported number of nodes, only one or two supported"
693 -- | Try to allocate an instance on the cluster.
694 tryAlloc :: (Monad m) =>
695 Node.List -- ^ The node list
696 -> Instance.List -- ^ The instance list
697 -> Instance.Instance -- ^ The instance to allocate
698 -> AllocNodes -- ^ The allocation targets
699 -> m AllocSolution -- ^ Possible solution list
700 tryAlloc nl _ inst (Right ok_pairs) =
701 let sols = foldl' (\cstate (p, s) ->
702 concatAllocs cstate $ allocateOnPair nl inst p s
703 ) emptyAllocSolution ok_pairs
705 in if null ok_pairs -- means we have just one node
706 then fail "Not enough online nodes"
707 else return $ annotateSolution sols
709 tryAlloc nl _ inst (Left all_nodes) =
710 let sols = foldl' (\cstate ->
711 concatAllocs cstate . allocateOnSingle nl inst
712 ) emptyAllocSolution all_nodes
714 then fail "No online nodes"
715 else return $ annotateSolution sols
717 -- | Given a group/result, describe it as a nice (list of) messages.
718 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
719 solutionDescription gl (groupId, result) =
721 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
722 Bad message -> [printf "Group %s: error %s" gname message]
723 where grp = Container.find groupId gl
724 gname = Group.name grp
725 pol = apolToString (Group.allocPolicy grp)
727 -- | From a list of possibly bad and possibly empty solutions, filter
728 -- only the groups with a valid result. Note that the result will be
729 -- reversed compared to the original list.
730 filterMGResults :: Group.List
731 -> [(Gdx, Result AllocSolution)]
732 -> [(Gdx, AllocSolution)]
733 filterMGResults gl = foldl' fn []
734 where unallocable = not . Group.isAllocable . flip Container.find gl
735 fn accu (gdx, rasol) =
738 Ok sol | null (asSolutions sol) -> accu
739 | unallocable gdx -> accu
740 | otherwise -> (gdx, sol):accu
742 -- | Sort multigroup results based on policy and score.
743 sortMGResults :: Group.List
744 -> [(Gdx, AllocSolution)]
745 -> [(Gdx, AllocSolution)]
746 sortMGResults gl sols =
747 let extractScore (_, _, _, x) = x
748 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
749 (extractScore . head . asSolutions) sol)
750 in sortBy (comparing solScore) sols
752 -- | Finds the best group for an instance on a multi-group cluster.
754 -- Only solutions in @preferred@ and @last_resort@ groups will be
755 -- accepted as valid, and additionally if the allowed groups parameter
756 -- is not null then allocation will only be run for those group
758 findBestAllocGroup :: Group.List -- ^ The group list
759 -> Node.List -- ^ The node list
760 -> Instance.List -- ^ The instance list
761 -> Maybe [Gdx] -- ^ The allowed groups
762 -> Instance.Instance -- ^ The instance to allocate
763 -> Int -- ^ Required number of nodes
764 -> Result (Gdx, AllocSolution, [String])
765 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
766 let groups = splitCluster mgnl mgil
767 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
769 sols = map (\(gid, (nl, il)) ->
770 (gid, genAllocNodes mggl nl cnt False >>=
771 tryAlloc nl il inst))
772 groups'::[(Gdx, Result AllocSolution)]
773 all_msgs = concatMap (solutionDescription mggl) sols
774 goodSols = filterMGResults mggl sols
775 sortedSols = sortMGResults mggl goodSols
776 in if null sortedSols
777 then Bad $ intercalate ", " all_msgs
778 else let (final_group, final_sol) = head sortedSols
779 in return (final_group, final_sol, all_msgs)
781 -- | Try to allocate an instance on a multi-group cluster.
782 tryMGAlloc :: Group.List -- ^ The group list
783 -> Node.List -- ^ The node list
784 -> Instance.List -- ^ The instance list
785 -> Instance.Instance -- ^ The instance to allocate
786 -> Int -- ^ Required number of nodes
787 -> Result AllocSolution -- ^ Possible solution list
788 tryMGAlloc mggl mgnl mgil inst cnt = do
789 (best_group, solution, all_msgs) <-
790 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
791 let group_name = Group.name $ Container.find best_group mggl
792 selmsg = "Selected group: " ++ group_name
793 return $ solution { asLog = selmsg:all_msgs }
795 -- | Try to relocate an instance on the cluster.
796 tryReloc :: (Monad m) =>
797 Node.List -- ^ The node list
798 -> Instance.List -- ^ The instance list
799 -> Idx -- ^ The index of the instance to move
800 -> Int -- ^ The number of nodes required
801 -> [Ndx] -- ^ Nodes which should not be used
802 -> m AllocSolution -- ^ Solution list
803 tryReloc nl il xid 1 ex_idx =
804 let all_nodes = getOnline nl
805 inst = Container.find xid il
806 ex_idx' = Instance.pNode inst:ex_idx
807 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
808 valid_idxes = map Node.idx valid_nodes
809 sols1 = foldl' (\cstate x ->
812 applyMove nl inst (ReplaceSecondary x)
813 return (mnl, i, [Container.find x mnl],
815 in concatAllocs cstate em
816 ) emptyAllocSolution valid_idxes
819 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
820 \destinations required (" ++ show reqn ++
821 "), only one supported"
823 tryMGReloc :: (Monad m) =>
824 Group.List -- ^ The group list
825 -> Node.List -- ^ The node list
826 -> Instance.List -- ^ The instance list
827 -> Idx -- ^ The index of the instance to move
828 -> Int -- ^ The number of nodes required
829 -> [Ndx] -- ^ Nodes which should not be used
830 -> m AllocSolution -- ^ Solution list
831 tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
832 let groups = splitCluster mgnl mgil
833 -- TODO: we only relocate inside the group for now
834 inst = Container.find xid mgil
835 (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
836 Nothing -> fail $ "Cannot find group for instance " ++
839 tryReloc nl il xid ncount ex_ndx
841 -- | Change an instance's secondary node.
842 evacInstance :: (Monad m) =>
843 [Ndx] -- ^ Excluded nodes
844 -> Instance.List -- ^ The current instance list
845 -> (Node.List, AllocSolution) -- ^ The current state
846 -> Idx -- ^ The instance to evacuate
847 -> m (Node.List, AllocSolution)
848 evacInstance ex_ndx il (nl, old_as) idx = do
849 -- FIXME: hardcoded one node here
851 -- Longer explanation: evacuation is currently hardcoded to DRBD
852 -- instances (which have one secondary); hence, even if the
853 -- IAllocator protocol can request N nodes for an instance, and all
854 -- the message parsing/loading pass this, this implementation only
855 -- supports one; this situation needs to be revisited if we ever
856 -- support more than one secondary, or if we change the storage
858 new_as <- tryReloc nl il idx 1 ex_ndx
859 case asSolutions new_as of
860 -- an individual relocation succeeded, we kind of compose the data
861 -- from the two solutions
862 csol@(nl', _, _, _):_ ->
863 return (nl', new_as { asSolutions = csol:asSolutions old_as })
864 -- this relocation failed, so we fail the entire evac
865 _ -> fail $ "Can't evacuate instance " ++
866 Instance.name (Container.find idx il) ++
867 ": " ++ describeSolution new_as
869 -- | Try to evacuate a list of nodes.
870 tryEvac :: (Monad m) =>
871 Node.List -- ^ The node list
872 -> Instance.List -- ^ The instance list
873 -> [Idx] -- ^ Instances to be evacuated
874 -> [Ndx] -- ^ Restricted nodes (the ones being evacuated)
875 -> m AllocSolution -- ^ Solution list
876 tryEvac nl il idxs ex_ndx = do
877 (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
880 -- | Multi-group evacuation of a list of nodes.
881 tryMGEvac :: (Monad m) =>
882 Group.List -- ^ The group list
883 -> Node.List -- ^ The node list
884 -> Instance.List -- ^ The instance list
885 -> [Ndx] -- ^ Nodes to be evacuated
886 -> m AllocSolution -- ^ Solution list
887 tryMGEvac _ nl il ex_ndx =
888 let ex_nodes = map (`Container.find` nl) ex_ndx
889 all_insts = nub . concatMap Node.sList $ ex_nodes
890 all_insts' = associateIdxs all_insts $ splitCluster nl il
892 results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx)
894 let sol = foldl' sumAllocs emptyAllocSolution results
895 return $ annotateSolution sol
897 -- | Function which fails if the requested mode is change secondary.
899 -- This is useful since except DRBD, no other disk template can
900 -- execute change secondary; thus, we can just call this function
901 -- instead of always checking for secondary mode. After the call to
902 -- this function, whatever mode we have is just a primary change.
903 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
904 failOnSecondaryChange ChangeSecondary dt =
905 fail $ "Instances with disk template '" ++ dtToString dt ++
906 "' can't execute change secondary"
907 failOnSecondaryChange _ _ = return ()
909 -- | Run evacuation for a single instance.
911 -- /Note:/ this function should correctly execute both intra-group
912 -- evacuations (in all modes) and inter-group evacuations (in the
913 -- 'ChangeAll' mode). Of course, this requires that the correct list
914 -- of target nodes is passed.
915 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
916 -> Instance.List -- ^ Instance list (cluster-wide)
917 -> EvacMode -- ^ The evacuation mode
918 -> Instance.Instance -- ^ The instance to be evacuated
919 -> [Ndx] -- ^ The list of available nodes
921 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
922 nodeEvacInstance _ _ mode (Instance.Instance
923 {Instance.diskTemplate = dt@DTDiskless}) _ =
924 failOnSecondaryChange mode dt >>
925 fail "Diskless relocations not implemented yet"
927 nodeEvacInstance _ _ _ (Instance.Instance
928 {Instance.diskTemplate = DTPlain}) _ =
929 fail "Instances of type plain cannot be relocated"
931 nodeEvacInstance _ _ _ (Instance.Instance
932 {Instance.diskTemplate = DTFile}) _ =
933 fail "Instances of type file cannot be relocated"
935 nodeEvacInstance _ _ mode (Instance.Instance
936 {Instance.diskTemplate = dt@DTSharedFile}) _ =
937 failOnSecondaryChange mode dt >>
938 fail "Shared file relocations not implemented yet"
940 nodeEvacInstance _ _ mode (Instance.Instance
941 {Instance.diskTemplate = dt@DTBlock}) _ =
942 failOnSecondaryChange mode dt >>
943 fail "Block device relocations not implemented yet"
945 nodeEvacInstance nl il ChangePrimary
946 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ =
948 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
949 let idx = Instance.idx inst
950 il' = Container.add idx inst' il
951 ops = iMoveToJob nl' il' idx Failover
952 return (nl', il', ops)
954 nodeEvacInstance nl il ChangeSecondary
955 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
958 let gdx = instancePriGroup nl inst
959 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
961 foldl' (evacDrbdSecondaryInner nl inst gdx)
962 (Left "no nodes available") avail_nodes
963 let idx = Instance.idx inst
964 il' = Container.add idx inst' il
965 ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
966 return (nl', il', ops)
968 nodeEvacInstance nl il ChangeAll
969 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
972 let primary = Container.find (Instance.pNode inst) nl
973 idx = Instance.idx inst
974 gdx = instancePriGroup nl inst
975 no_nodes = Left "no nodes available"
976 -- if the primary is offline, then we first failover
977 (nl1, inst1, ops1) <-
978 if Node.offline primary
980 (nl', inst', _, _) <-
981 annotateResult "Failing over to the secondary" $
982 opToResult $ applyMove nl inst Failover
983 return (nl', inst', [Failover])
984 else return (nl, inst, [])
985 -- we now need to execute a replace secondary to the future
987 (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $
989 foldl' (evacDrbdSecondaryInner nl1 inst1 gdx)
991 let ops2 = ReplaceSecondary new_pdx:ops1
992 -- since we chose the new primary, we remove it from the list of
994 let avail_nodes_sec = new_pdx `delete` avail_nodes
995 -- we now execute another failover, the primary stays fixed now
996 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
997 opToResult $ applyMove nl2 inst2 Failover
998 let ops3 = Failover:ops2
999 -- and finally another replace secondary, to the final secondary
1000 (nl4, inst4, _, new_sdx) <-
1001 annotateResult "Searching for a new secondary" $
1003 foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec
1004 let ops4 = ReplaceSecondary new_sdx:ops3
1005 il' = Container.add idx inst4 il
1006 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1007 return (nl4, il', ops)
1009 -- | Inner fold function for changing secondary of a DRBD instance.
1011 -- The "running" solution is either a @Left String@, which means we
1012 -- don't have yet a working solution, or a @Right (...)@, which
1013 -- represents a valid solution; it holds the modified node list, the
1014 -- modified instance (after evacuation), the score of that solution,
1015 -- and the new secondary node index.
1016 evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
1017 -> Instance.Instance -- ^ Instance being evacuated
1018 -> Gdx -- ^ The group index of the instance
1019 -> Either String ( Node.List
1022 , Ndx) -- ^ Current best solution
1023 -> Ndx -- ^ Node we're evaluating as new secondary
1024 -> Either String ( Node.List
1027 , Ndx) -- ^ New best solution
1028 evacDrbdSecondaryInner nl inst gdx accu ndx =
1029 case applyMove nl inst (ReplaceSecondary ndx) of
1033 Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
1034 " failed: " ++ show fm
1035 OpGood (nl', inst', _, _) ->
1036 let nodes = Container.elems nl'
1037 -- The fromJust below is ugly (it can fail nastily), but
1038 -- at this point we should have any internal mismatches,
1039 -- and adding a monad here would be quite involved
1040 grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes))
1041 new_cv = compCVNodes grpnodes
1042 new_accu = Right (nl', inst', new_cv, ndx)
1045 Right (_, _, old_cv, _) ->
1050 -- | Computes the nodes in a given group which are available for
1052 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1053 -> IntSet.IntSet -- ^ Nodes that are excluded
1054 -> Gdx -- ^ The group for which we
1056 -> Result [Ndx] -- ^ List of available node indices
1057 availableGroupNodes group_nodes excl_ndx gdx = do
1058 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1059 Ok (lookup gdx group_nodes)
1060 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1063 -- | Updates the evac solution with the results of an instance
1065 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1066 -> Instance.Instance
1067 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1068 -> (Node.List, Instance.List, EvacSolution)
1069 updateEvacSolution (nl, il, es) inst (Bad msg) =
1070 (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es})
1071 updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) =
1072 (nl, il, es { esMoved = Instance.name inst:esMoved es
1073 , esOpCodes = [opcodes]:esOpCodes es })
1075 -- | Node-evacuation IAllocator mode main function.
1076 tryNodeEvac :: Group.List -- ^ The cluster groups
1077 -> Node.List -- ^ The node list (cluster-wide, not per group)
1078 -> Instance.List -- ^ Instance list (cluster-wide)
1079 -> EvacMode -- ^ The evacuation mode
1080 -> [Idx] -- ^ List of instance (indices) to be evacuated
1081 -> Result EvacSolution
1082 tryNodeEvac _ ini_nl ini_il mode idxs =
1083 let evac_ndx = nodesToEvacuate ini_il mode idxs
1084 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1085 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1086 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1087 (Container.elems nl))) $
1088 splitCluster ini_nl ini_il
1090 foldl' (\state@(nl, il, _) inst ->
1091 updateEvacSolution state inst $
1092 availableGroupNodes group_ndx
1093 excl_ndx (instancePriGroup nl inst) >>=
1094 nodeEvacInstance nl il mode inst
1096 (ini_nl, ini_il, emptyEvacSolution)
1097 (map (`Container.find` ini_il) idxs)
1098 in return $ reverseEvacSolution esol
1100 -- | Change-group IAllocator mode main function.
1102 -- This is very similar to 'tryNodeEvac', the only difference is that
1103 -- we don't choose as target group the current instance group, but
1106 -- 1. at the start of the function, we compute which are the target
1107 -- groups; either no groups were passed in, in which case we choose
1108 -- all groups out of which we don't evacuate instance, or there were
1109 -- some groups passed, in which case we use those
1111 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1112 -- best group to hold the instance, and then we do what
1113 -- 'tryNodeEvac' does, except for this group instead of the current
1116 -- Note that the correct behaviour of this function relies on the
1117 -- function 'nodeEvacInstance' to be able to do correctly both
1118 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1119 tryChangeGroup :: Group.List -- ^ The cluster groups
1120 -> Node.List -- ^ The node list (cluster-wide)
1121 -> Instance.List -- ^ Instance list (cluster-wide)
1122 -> [Gdx] -- ^ Target groups; if empty, any
1123 -- groups not being evacuated
1124 -> [Idx] -- ^ List of instance (indices) to be evacuated
1125 -> Result EvacSolution
1126 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1127 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1128 flip Container.find ini_il) idxs
1129 target_gdxs = (if null gdxs
1130 then Container.keys gl
1131 else gdxs) \\ evac_gdxs
1132 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1133 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1134 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1135 (Container.elems nl))) $
1136 splitCluster ini_nl ini_il
1138 foldl' (\state@(nl, il, _) inst ->
1140 let ncnt = Instance.requiredNodes $
1141 Instance.diskTemplate inst
1142 (gdx, _, _) <- findBestAllocGroup gl nl il
1143 (Just target_gdxs) inst ncnt
1144 av_nodes <- availableGroupNodes group_ndx
1146 nodeEvacInstance nl il ChangeAll inst av_nodes
1147 in updateEvacSolution state inst solution
1149 (ini_nl, ini_il, emptyEvacSolution)
1150 (map (`Container.find` ini_il) idxs)
1151 in return $ reverseEvacSolution esol
1154 -- | Recursively place instances on the cluster until we're out of space.
1155 iterateAlloc :: Node.List
1158 -> Instance.Instance
1160 -> [Instance.Instance]
1162 -> Result AllocResult
1163 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1164 let depth = length ixes
1165 newname = printf "new-%d" depth::String
1166 newidx = length (Container.elems il) + depth
1167 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1168 newlimit = fmap (flip (-) 1) limit
1169 in case tryAlloc nl il newi2 allocnodes of
1171 Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1172 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1175 (xnl, xi, _, _):[] ->
1178 else iterateAlloc xnl (Container.add newidx xi il)
1179 newlimit newinst allocnodes (xi:ixes)
1180 (totalResources xnl:cstats)
1181 _ -> Bad "Internal error: multiple solutions for single\
1184 -- | The core of the tiered allocation mode.
1185 tieredAlloc :: Node.List
1188 -> Instance.Instance
1190 -> [Instance.Instance]
1192 -> Result AllocResult
1193 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1194 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1196 Ok (errs, nl', il', ixes', cstats') ->
1197 let newsol = Ok (errs, nl', il', ixes', cstats')
1198 ixes_cnt = length ixes'
1199 (stop, newlimit) = case limit of
1200 Nothing -> (False, Nothing)
1201 Just n -> (n <= ixes_cnt,
1202 Just (n - ixes_cnt)) in
1203 if stop then newsol else
1204 case Instance.shrinkByType newinst . fst . last $
1205 sortBy (comparing snd) errs of
1207 Ok newinst' -> tieredAlloc nl' il' newlimit
1208 newinst' allocnodes ixes' cstats'
1210 -- | Compute the tiered spec string description from a list of
1211 -- allocated instances.
1212 tieredSpecMap :: [Instance.Instance]
1214 tieredSpecMap trl_ixes =
1215 let fin_trl_ixes = reverse trl_ixes
1216 ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
1217 spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
1219 in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
1220 (rspecDsk spec) (rspecCpu spec) cnt) spec_map
1222 -- * Formatting functions
1224 -- | Given the original and final nodes, computes the relocation description.
1225 computeMoves :: Instance.Instance -- ^ The instance to be moved
1226 -> String -- ^ The instance name
1227 -> IMove -- ^ The move being performed
1228 -> String -- ^ New primary
1229 -> String -- ^ New secondary
1230 -> (String, [String])
1231 -- ^ Tuple of moves and commands list; moves is containing
1232 -- either @/f/@ for failover or @/r:name/@ for replace
1233 -- secondary, while the command list holds gnt-instance
1234 -- commands (without that prefix), e.g \"@failover instance1@\"
1235 computeMoves i inam mv c d =
1237 Failover -> ("f", [mig])
1238 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1239 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1240 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1241 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1242 where morf = if Instance.running i then "migrate" else "failover"
1243 mig = printf "%s -f %s" morf inam::String
1244 rep n = printf "replace-disks -n %s %s" n inam
1246 -- | Converts a placement to string format.
1247 printSolutionLine :: Node.List -- ^ The node list
1248 -> Instance.List -- ^ The instance list
1249 -> Int -- ^ Maximum node name length
1250 -> Int -- ^ Maximum instance name length
1251 -> Placement -- ^ The current placement
1252 -> Int -- ^ The index of the placement in
1254 -> (String, [String])
1255 printSolutionLine nl il nmlen imlen plc pos =
1257 pmlen = (2*nmlen + 1)
1258 (i, p, s, mv, c) = plc
1259 inst = Container.find i il
1260 inam = Instance.alias inst
1261 npri = Node.alias $ Container.find p nl
1262 nsec = Node.alias $ Container.find s nl
1263 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1264 osec = Node.alias $ Container.find (Instance.sNode inst) nl
1265 (moves, cmds) = computeMoves inst inam mv npri nsec
1266 ostr = printf "%s:%s" opri osec::String
1267 nstr = printf "%s:%s" npri nsec::String
1269 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
1270 pos imlen inam pmlen ostr
1274 -- | Return the instance and involved nodes in an instance move.
1275 involvedNodes :: Instance.List -> Placement -> [Ndx]
1276 involvedNodes il plc =
1277 let (i, np, ns, _, _) = plc
1278 inst = Container.find i il
1279 op = Instance.pNode inst
1280 os = Instance.sNode inst
1281 in nub [np, ns, op, os]
1283 -- | Inner function for splitJobs, that either appends the next job to
1284 -- the current jobset, or starts a new jobset.
1285 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1286 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1287 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1288 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1289 | otherwise = ([n]:cjs, ndx)
1291 -- | Break a list of moves into independent groups. Note that this
1292 -- will reverse the order of jobs.
1293 splitJobs :: [MoveJob] -> [JobSet]
1294 splitJobs = fst . foldl mergeJobs ([], [])
1296 -- | Given a list of commands, prefix them with @gnt-instance@ and
1297 -- also beautify the display a little.
1298 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1299 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1301 printf " echo job %d/%d" jsn sn:
1303 map (" gnt-instance " ++) cmds
1305 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1308 -- | Given a list of commands, prefix them with @gnt-instance@ and
1309 -- also beautify the display a little.
1310 formatCmds :: [JobSet] -> String
1313 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1317 -- | Print the node list.
1318 printNodes :: Node.List -> [String] -> String
1320 let fields = case fs of
1321 [] -> Node.defaultFields
1322 "+":rest -> Node.defaultFields ++ rest
1324 snl = sortBy (comparing Node.idx) (Container.elems nl)
1325 (header, isnum) = unzip $ map Node.showHeader fields
1326 in unlines . map ((:) ' ' . intercalate " ") $
1327 formatTable (header:map (Node.list fields) snl) isnum
1329 -- | Print the instance list.
1330 printInsts :: Node.List -> Instance.List -> String
1332 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1333 helper inst = [ if Instance.running inst then "R" else " "
1334 , Instance.name inst
1335 , Container.nameOf nl (Instance.pNode inst)
1336 , let sdx = Instance.sNode inst
1337 in if sdx == Node.noSecondary
1339 else Container.nameOf nl sdx
1340 , if Instance.autoBalance inst then "Y" else "N"
1341 , printf "%3d" $ Instance.vcpus inst
1342 , printf "%5d" $ Instance.mem inst
1343 , printf "%5d" $ Instance.dsk inst `div` 1024
1349 where DynUtil lC lM lD lN = Instance.util inst
1350 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1351 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1352 isnum = False:False:False:False:False:repeat True
1353 in unlines . map ((:) ' ' . intercalate " ") $
1354 formatTable (header:map helper sil) isnum
1356 -- | Shows statistics for a given node list.
1357 printStats :: Node.List -> String
1359 let dcvs = compDetailedCV $ Container.elems nl
1360 (weights, names) = unzip detailedCVInfo
1361 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1362 formatted = map (\(w, header, val) ->
1363 printf "%s=%.8f(x%.2f)" header val w::String) hd
1364 in intercalate ", " formatted
1366 -- | Convert a placement into a list of OpCodes (basically a job).
1367 iMoveToJob :: Node.List -> Instance.List
1368 -> Idx -> IMove -> [OpCodes.OpCode]
1369 iMoveToJob nl il idx move =
1370 let inst = Container.find idx il
1371 iname = Instance.name inst
1372 lookNode = Just . Container.nameOf nl
1373 opF = OpCodes.OpInstanceMigrate iname True False True
1374 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1375 OpCodes.ReplaceNewSecondary [] Nothing
1378 ReplacePrimary np -> [ opF, opR np, opF ]
1379 ReplaceSecondary ns -> [ opR ns ]
1380 ReplaceAndFailover np -> [ opR np, opF ]
1381 FailoverAndReplace ns -> [ opF, opR ns ]
1383 -- * Node group functions
1385 -- | Computes the group of an instance.
1386 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1387 instanceGroup nl i =
1388 let sidx = Instance.sNode i
1389 pnode = Container.find (Instance.pNode i) nl
1390 snode = if sidx == Node.noSecondary
1392 else Container.find sidx nl
1393 pgroup = Node.group pnode
1394 sgroup = Node.group snode
1395 in if pgroup /= sgroup
1396 then fail ("Instance placed accross two node groups, primary " ++
1397 show pgroup ++ ", secondary " ++ show sgroup)
1400 -- | Computes the group of an instance per the primary node.
1401 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1402 instancePriGroup nl i =
1403 let pnode = Container.find (Instance.pNode i) nl
1406 -- | Compute the list of badly allocated instances (split across node
1408 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1409 findSplitInstances nl =
1410 filter (not . isOk . instanceGroup nl) . Container.elems
1412 -- | Splits a cluster into the component node groups.
1413 splitCluster :: Node.List -> Instance.List ->
1414 [(Gdx, (Node.List, Instance.List))]
1415 splitCluster nl il =
1416 let ngroups = Node.computeGroups (Container.elems nl)
1417 in map (\(guuid, nodes) ->
1418 let nidxs = map Node.idx nodes
1419 nodes' = zip nidxs nodes
1420 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1421 in (guuid, (Container.fromList nodes', instances))) ngroups
1423 -- | Split a global instance index map into per-group, and associate
1424 -- it with the group/node/instance lists.
1425 associateIdxs :: [Idx] -- ^ Instance indices to be split/associated
1426 -> [(Gdx, (Node.List, Instance.List))] -- ^ Input groups
1427 -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result
1428 associateIdxs idxs =
1429 map (\(gdx, (nl, il)) ->
1430 (gdx, (nl, il, filter (`Container.member` il) idxs)))
1432 -- | Compute the list of nodes that are to be evacuated, given a list
1433 -- of instances and an evacuation mode.
1434 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1435 -> EvacMode -- ^ The evacuation mode we're using
1436 -> [Idx] -- ^ List of instance indices being evacuated
1437 -> IntSet.IntSet -- ^ Set of node indices
1438 nodesToEvacuate il mode =
1439 IntSet.delete Node.noSecondary .
1441 let i = Container.find idx il
1442 pdx = Instance.pNode i
1443 sdx = Instance.sNode i
1444 dt = Instance.diskTemplate i
1445 withSecondary = case dt of
1446 DTDrbd8 -> IntSet.insert sdx ns
1449 ChangePrimary -> IntSet.insert pdx ns
1450 ChangeSecondary -> withSecondary
1451 ChangeAll -> IntSet.insert pdx withSecondary