1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
37 -- * Generic functions
39 , computeAllocationDelta
40 -- * First phase functions
42 -- * Second phase functions
47 -- * Display functions
50 -- * Balacing functions
59 -- * IAllocator functions
68 -- * Allocation functions
71 -- * Node group functions
77 import qualified Data.IntSet as IntSet
79 import Data.Maybe (fromJust)
80 import Data.Ord (comparing)
81 import Text.Printf (printf)
84 import qualified Ganeti.HTools.Container as Container
85 import qualified Ganeti.HTools.Instance as Instance
86 import qualified Ganeti.HTools.Node as Node
87 import qualified Ganeti.HTools.Group as Group
88 import Ganeti.HTools.Types
89 import Ganeti.HTools.Utils
90 import Ganeti.HTools.Compat
91 import qualified Ganeti.OpCodes as OpCodes
95 -- | Allocation\/relocation solution.
96 data AllocSolution = AllocSolution
97 { asFailures :: [FailMode] -- ^ Failure counts
98 , asAllocs :: Int -- ^ Good allocation count
99 , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
100 -- of the list depends on the
101 -- allocation/relocation mode
102 , asLog :: [String] -- ^ A list of informational messages
105 -- | Node evacuation/group change iallocator result type. This result
106 -- type consists of actual opcodes (a restricted subset) that are
107 -- transmitted back to Ganeti.
108 data EvacSolution = EvacSolution
109 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
110 , esFailed :: [(Idx, String)] -- ^ Instances which were not
112 , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
115 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
116 type AllocResult = (FailStats, Node.List, Instance.List,
117 [Instance.Instance], [CStats])
119 -- | A type denoting the valid allocation mode/pairs.
121 -- For a one-node allocation, this will be a @Left ['Node.Node']@,
122 -- whereas for a two-node allocation, this will be a @Right
123 -- [('Node.Node', 'Node.Node')]@.
124 type AllocNodes = Either [Ndx] [(Ndx, Ndx)]
126 -- | The empty solution we start with when computing allocations.
127 emptyAllocSolution :: AllocSolution
128 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129 , asSolutions = [], asLog = [] }
131 -- | The empty evac solution.
132 emptyEvacSolution :: EvacSolution
133 emptyEvacSolution = EvacSolution { esMoved = []
138 -- | The complete state for the balancing solution.
139 data Table = Table Node.List Instance.List Score [Placement]
140 deriving (Show, Read)
142 -- | Cluster statistics data type.
143 data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
144 , csFdsk :: Integer -- ^ Cluster free disk
145 , csAmem :: Integer -- ^ Cluster allocatable mem
146 , csAdsk :: Integer -- ^ Cluster allocatable disk
147 , csAcpu :: Integer -- ^ Cluster allocatable cpus
148 , csMmem :: Integer -- ^ Max node allocatable mem
149 , csMdsk :: Integer -- ^ Max node allocatable disk
150 , csMcpu :: Integer -- ^ Max node allocatable cpu
151 , csImem :: Integer -- ^ Instance used mem
152 , csIdsk :: Integer -- ^ Instance used disk
153 , csIcpu :: Integer -- ^ Instance used cpu
154 , csTmem :: Double -- ^ Cluster total mem
155 , csTdsk :: Double -- ^ Cluster total disk
156 , csTcpu :: Double -- ^ Cluster total cpus
157 , csVcpu :: Integer -- ^ Cluster virtual cpus (if
158 -- node pCpu has been set,
160 , csXmem :: Integer -- ^ Unnacounted for mem
161 , csNmem :: Integer -- ^ Node own memory
162 , csScore :: Score -- ^ The cluster score
163 , csNinst :: Int -- ^ The total number of instances
165 deriving (Show, Read)
167 -- | Currently used, possibly to allocate, unallocable.
168 type AllocStats = (RSpec, RSpec, RSpec)
170 -- * Utility functions
172 -- | Verifies the N+1 status and return the affected nodes.
173 verifyN1 :: [Node.Node] -> [Node.Node]
174 verifyN1 = filter Node.failN1
176 {-| Computes the pair of bad nodes and instances.
178 The bad node list is computed via a simple 'verifyN1' check, and the
179 bad instance list is the list of primary and secondary instances of
183 computeBadItems :: Node.List -> Instance.List ->
184 ([Node.Node], [Instance.Instance])
185 computeBadItems nl il =
186 let bad_nodes = verifyN1 $ getOnline nl
187 bad_instances = map (`Container.find` il) .
189 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
191 (bad_nodes, bad_instances)
193 -- | Zero-initializer for the CStats type.
194 emptyCStats :: CStats
195 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
197 -- | Update stats with data from a new node.
198 updateCStats :: CStats -> Node.Node -> CStats
199 updateCStats cs node =
200 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
201 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
202 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
203 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
204 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
206 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
209 inc_amem = Node.fMem node - Node.rMem node
210 inc_amem' = if inc_amem > 0 then inc_amem else 0
211 inc_adsk = Node.availDisk node
212 inc_imem = truncate (Node.tMem node) - Node.nMem node
213 - Node.xMem node - Node.fMem node
214 inc_icpu = Node.uCpu node
215 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
216 inc_vcpu = Node.hiCpu node
217 inc_acpu = Node.availCpu node
219 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
220 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
221 , csAmem = x_amem + fromIntegral inc_amem'
222 , csAdsk = x_adsk + fromIntegral inc_adsk
223 , csAcpu = x_acpu + fromIntegral inc_acpu
224 , csMmem = max x_mmem (fromIntegral inc_amem')
225 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
226 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
227 , csImem = x_imem + fromIntegral inc_imem
228 , csIdsk = x_idsk + fromIntegral inc_idsk
229 , csIcpu = x_icpu + fromIntegral inc_icpu
230 , csTmem = x_tmem + Node.tMem node
231 , csTdsk = x_tdsk + Node.tDsk node
232 , csTcpu = x_tcpu + Node.tCpu node
233 , csVcpu = x_vcpu + fromIntegral inc_vcpu
234 , csXmem = x_xmem + fromIntegral (Node.xMem node)
235 , csNmem = x_nmem + fromIntegral (Node.nMem node)
236 , csNinst = x_ninst + length (Node.pList node)
239 -- | Compute the total free disk and memory in the cluster.
240 totalResources :: Node.List -> CStats
242 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
243 in cs { csScore = compCV nl }
245 -- | Compute the delta between two cluster state.
247 -- This is used when doing allocations, to understand better the
248 -- available cluster resources. The return value is a triple of the
249 -- current used values, the delta that was still allocated, and what
250 -- was left unallocated.
251 computeAllocationDelta :: CStats -> CStats -> AllocStats
252 computeAllocationDelta cini cfin =
253 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
254 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
255 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
256 rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
257 (fromIntegral i_idsk)
258 rfin = RSpec (fromIntegral (f_icpu - i_icpu))
259 (fromIntegral (f_imem - i_imem))
260 (fromIntegral (f_idsk - i_idsk))
261 un_cpu = fromIntegral (v_cpu - f_icpu)::Int
262 runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
263 (truncate t_dsk - fromIntegral f_idsk)
264 in (rini, rfin, runa)
266 -- | The names and weights of the individual elements in the CV list.
267 detailedCVInfo :: [(Double, String)]
268 detailedCVInfo = [ (1, "free_mem_cv")
269 , (1, "free_disk_cv")
271 , (1, "reserved_mem_cv")
272 , (4, "offline_all_cnt")
273 , (16, "offline_pri_cnt")
274 , (1, "vcpu_ratio_cv")
277 , (1, "disk_load_cv")
279 , (2, "pri_tags_score")
282 -- | Holds the weights used by 'compCVNodes' for each metric.
283 detailedCVWeights :: [Double]
284 detailedCVWeights = map fst detailedCVInfo
286 -- | Compute the mem and disk covariance.
287 compDetailedCV :: [Node.Node] -> [Double]
288 compDetailedCV all_nodes =
290 (offline, nodes) = partition Node.offline all_nodes
291 mem_l = map Node.pMem nodes
292 dsk_l = map Node.pDsk nodes
293 -- metric: memory covariance
294 mem_cv = stdDev mem_l
295 -- metric: disk covariance
296 dsk_cv = stdDev dsk_l
297 -- metric: count of instances living on N1 failing nodes
298 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
299 length (Node.pList n)) .
300 filter Node.failN1 $ nodes :: Double
301 res_l = map Node.pRem nodes
302 -- metric: reserved memory covariance
303 res_cv = stdDev res_l
304 -- offline instances metrics
305 offline_ipri = sum . map (length . Node.pList) $ offline
306 offline_isec = sum . map (length . Node.sList) $ offline
307 -- metric: count of instances on offline nodes
308 off_score = fromIntegral (offline_ipri + offline_isec)::Double
309 -- metric: count of primary instances on offline nodes (this
310 -- helps with evacuation/failover of primary instances on
311 -- 2-node clusters with one node offline)
312 off_pri_score = fromIntegral offline_ipri::Double
313 cpu_l = map Node.pCpu nodes
314 -- metric: covariance of vcpu/pcpu ratio
315 cpu_cv = stdDev cpu_l
316 -- metrics: covariance of cpu, memory, disk and network load
317 (c_load, m_load, d_load, n_load) = unzip4 $
319 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
320 DynUtil c2 m2 d2 n2 = Node.utilPool n
321 in (c1/c2, m1/m2, d1/d2, n1/n2)
323 -- metric: conflicting instance count
324 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
325 pri_tags_score = fromIntegral pri_tags_inst::Double
326 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
327 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
330 -- | Compute the /total/ variance.
331 compCVNodes :: [Node.Node] -> Double
332 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
334 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
335 compCV :: Node.List -> Double
336 compCV = compCVNodes . Container.elems
338 -- | Compute online nodes from a 'Node.List'.
339 getOnline :: Node.List -> [Node.Node]
340 getOnline = filter (not . Node.offline) . Container.elems
342 -- * Balancing functions
344 -- | Compute best table. Note that the ordering of the arguments is important.
345 compareTables :: Table -> Table -> Table
346 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
347 if a_cv > b_cv then b else a
349 -- | Applies an instance move to a given node list and instance.
350 applyMove :: Node.List -> Instance.Instance
351 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
353 applyMove nl inst Failover =
354 let old_pdx = Instance.pNode inst
355 old_sdx = Instance.sNode inst
356 old_p = Container.find old_pdx nl
357 old_s = Container.find old_sdx nl
358 int_p = Node.removePri old_p inst
359 int_s = Node.removeSec old_s inst
360 force_p = Node.offline old_p
361 new_nl = do -- Maybe monad
362 new_p <- Node.addPriEx force_p int_s inst
363 new_s <- Node.addSec int_p inst old_sdx
364 let new_inst = Instance.setBoth inst old_sdx old_pdx
365 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
366 new_inst, old_sdx, old_pdx)
369 -- Replace the primary (f:, r:np, f)
370 applyMove nl inst (ReplacePrimary new_pdx) =
371 let old_pdx = Instance.pNode inst
372 old_sdx = Instance.sNode inst
373 old_p = Container.find old_pdx nl
374 old_s = Container.find old_sdx nl
375 tgt_n = Container.find new_pdx nl
376 int_p = Node.removePri old_p inst
377 int_s = Node.removeSec old_s inst
378 force_p = Node.offline old_p
379 new_nl = do -- Maybe monad
380 -- check that the current secondary can host the instance
381 -- during the migration
382 tmp_s <- Node.addPriEx force_p int_s inst
383 let tmp_s' = Node.removePri tmp_s inst
384 new_p <- Node.addPriEx force_p tgt_n inst
385 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
386 let new_inst = Instance.setPri inst new_pdx
387 return (Container.add new_pdx new_p $
388 Container.addTwo old_pdx int_p old_sdx new_s nl,
389 new_inst, new_pdx, old_sdx)
392 -- Replace the secondary (r:ns)
393 applyMove nl inst (ReplaceSecondary new_sdx) =
394 let old_pdx = Instance.pNode inst
395 old_sdx = Instance.sNode inst
396 old_s = Container.find old_sdx nl
397 tgt_n = Container.find new_sdx nl
398 int_s = Node.removeSec old_s inst
399 force_s = Node.offline old_s
400 new_inst = Instance.setSec inst new_sdx
401 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
402 \new_s -> return (Container.addTwo new_sdx
403 new_s old_sdx int_s nl,
404 new_inst, old_pdx, new_sdx)
407 -- Replace the secondary and failover (r:np, f)
408 applyMove nl inst (ReplaceAndFailover new_pdx) =
409 let old_pdx = Instance.pNode inst
410 old_sdx = Instance.sNode inst
411 old_p = Container.find old_pdx nl
412 old_s = Container.find old_sdx nl
413 tgt_n = Container.find new_pdx nl
414 int_p = Node.removePri old_p inst
415 int_s = Node.removeSec old_s inst
416 force_s = Node.offline old_s
417 new_nl = do -- Maybe monad
418 new_p <- Node.addPri tgt_n inst
419 new_s <- Node.addSecEx force_s int_p inst new_pdx
420 let new_inst = Instance.setBoth inst new_pdx old_pdx
421 return (Container.add new_pdx new_p $
422 Container.addTwo old_pdx new_s old_sdx int_s nl,
423 new_inst, new_pdx, old_pdx)
426 -- Failver and replace the secondary (f, r:ns)
427 applyMove nl inst (FailoverAndReplace new_sdx) =
428 let old_pdx = Instance.pNode inst
429 old_sdx = Instance.sNode inst
430 old_p = Container.find old_pdx nl
431 old_s = Container.find old_sdx nl
432 tgt_n = Container.find new_sdx nl
433 int_p = Node.removePri old_p inst
434 int_s = Node.removeSec old_s inst
435 force_p = Node.offline old_p
436 new_nl = do -- Maybe monad
437 new_p <- Node.addPriEx force_p int_s inst
438 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
439 let new_inst = Instance.setBoth inst old_sdx new_sdx
440 return (Container.add new_sdx new_s $
441 Container.addTwo old_sdx new_p old_pdx int_p nl,
442 new_inst, old_sdx, new_sdx)
445 -- | Tries to allocate an instance on one given node.
446 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
447 -> OpResult Node.AllocElement
448 allocateOnSingle nl inst new_pdx =
449 let p = Container.find new_pdx nl
450 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
451 in Node.addPri p inst >>= \new_p -> do
452 let new_nl = Container.add new_pdx new_p nl
453 new_score = compCV nl
454 return (new_nl, new_inst, [new_p], new_score)
456 -- | Tries to allocate an instance on a given pair of nodes.
457 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
458 -> OpResult Node.AllocElement
459 allocateOnPair nl inst new_pdx new_sdx =
460 let tgt_p = Container.find new_pdx nl
461 tgt_s = Container.find new_sdx nl
463 new_p <- Node.addPri tgt_p inst
464 new_s <- Node.addSec tgt_s inst new_pdx
465 let new_inst = Instance.setBoth inst new_pdx new_sdx
466 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
467 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
469 -- | Tries to perform an instance move and returns the best table
470 -- between the original one and the new one.
471 checkSingleStep :: Table -- ^ The original table
472 -> Instance.Instance -- ^ The instance to move
473 -> Table -- ^ The current best table
474 -> IMove -- ^ The move to apply
475 -> Table -- ^ The final best table
476 checkSingleStep ini_tbl target cur_tbl move =
478 Table ini_nl ini_il _ ini_plc = ini_tbl
479 tmp_resu = applyMove ini_nl target move
483 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
484 let tgt_idx = Instance.idx target
485 upd_cvar = compCV upd_nl
486 upd_il = Container.add tgt_idx new_inst ini_il
487 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
488 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
490 compareTables cur_tbl upd_tbl
492 -- | Given the status of the current secondary as a valid new node and
493 -- the current candidate target node, generate the possible moves for
495 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
496 -> Bool -- ^ Whether we can change the primary node
497 -> Ndx -- ^ Target node candidate
498 -> [IMove] -- ^ List of valid result moves
500 possibleMoves _ False tdx =
501 [ReplaceSecondary tdx]
503 possibleMoves True True tdx =
504 [ReplaceSecondary tdx,
505 ReplaceAndFailover tdx,
507 FailoverAndReplace tdx]
509 possibleMoves False True tdx =
510 [ReplaceSecondary tdx,
511 ReplaceAndFailover tdx]
513 -- | Compute the best move for a given instance.
514 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
515 -> Bool -- ^ Whether disk moves are allowed
516 -> Bool -- ^ Whether instance moves are allowed
517 -> Table -- ^ Original table
518 -> Instance.Instance -- ^ Instance to move
519 -> Table -- ^ Best new table for this instance
520 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
522 opdx = Instance.pNode target
523 osdx = Instance.sNode target
524 nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
525 use_secondary = elem osdx nodes_idx && inst_moves
526 aft_failover = if use_secondary -- if allowed to failover
527 then checkSingleStep ini_tbl target ini_tbl Failover
529 all_moves = if disk_moves
531 (possibleMoves use_secondary inst_moves) nodes
534 -- iterate over the possible nodes for this instance
535 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
537 -- | Compute the best next move.
538 checkMove :: [Ndx] -- ^ Allowed target node indices
539 -> Bool -- ^ Whether disk moves are allowed
540 -> Bool -- ^ Whether instance moves are allowed
541 -> Table -- ^ The current solution
542 -> [Instance.Instance] -- ^ List of instances still to move
543 -> Table -- ^ The new solution
544 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
545 let Table _ _ _ ini_plc = ini_tbl
546 -- we're using rwhnf from the Control.Parallel.Strategies
547 -- package; we don't need to use rnf as that would force too
548 -- much evaluation in single-threaded cases, and in
549 -- multi-threaded case the weak head normal form is enough to
550 -- spark the evaluation
551 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
554 -- iterate over all instances, computing the best move
555 best_tbl = foldl' compareTables ini_tbl tables
556 Table _ _ _ best_plc = best_tbl
557 in if length best_plc == length ini_plc
558 then ini_tbl -- no advancement
561 -- | Check if we are allowed to go deeper in the balancing.
562 doNextBalance :: Table -- ^ The starting table
563 -> Int -- ^ Remaining length
564 -> Score -- ^ Score at which to stop
565 -> Bool -- ^ The resulting table and commands
566 doNextBalance ini_tbl max_rounds min_score =
567 let Table _ _ ini_cv ini_plc = ini_tbl
568 ini_plc_len = length ini_plc
569 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
571 -- | Run a balance move.
572 tryBalance :: Table -- ^ The starting table
573 -> Bool -- ^ Allow disk moves
574 -> Bool -- ^ Allow instance moves
575 -> Bool -- ^ Only evacuate moves
576 -> Score -- ^ Min gain threshold
577 -> Score -- ^ Min gain
578 -> Maybe Table -- ^ The resulting table and commands
579 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
580 let Table ini_nl ini_il ini_cv _ = ini_tbl
581 all_inst = Container.elems ini_il
582 all_inst' = if evac_mode
583 then let bad_nodes = map Node.idx . filter Node.offline $
584 Container.elems ini_nl
585 in filter (any (`elem` bad_nodes) . Instance.allNodes)
588 reloc_inst = filter Instance.movable all_inst'
589 node_idx = map Node.idx . filter (not . Node.offline) $
590 Container.elems ini_nl
591 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
592 (Table _ _ fin_cv _) = fin_tbl
594 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
595 then Just fin_tbl -- this round made success, return the new table
598 -- * Allocation functions
600 -- | Build failure stats out of a list of failures.
601 collapseFailures :: [FailMode] -> FailStats
602 collapseFailures flst =
603 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
606 -- | Update current Allocation solution and failure stats with new
608 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
609 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
611 concatAllocs as (OpGood ns@(_, _, _, nscore)) =
612 let -- Choose the old or new solution, based on the cluster score
614 osols = asSolutions as
615 nsols = case osols of
617 (_, _, _, oscore):[] ->
621 -- FIXME: here we simply concat to lists with more
622 -- than one element; we should instead abort, since
623 -- this is not a valid usage of this function
626 -- Note: we force evaluation of nsols here in order to keep the
627 -- memory profile low - we know that we will need nsols for sure
628 -- in the next cycle, so we force evaluation of nsols, since the
629 -- foldl' in the caller will only evaluate the tuple, but not the
630 -- elements of the tuple
631 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
633 -- | Given a solution, generates a reasonable description for it.
634 describeSolution :: AllocSolution -> String
635 describeSolution as =
636 let fcnt = asFailures as
637 sols = asSolutions as
639 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
640 filter ((> 0) . snd) . collapseFailures $ fcnt
642 then "No valid allocation solutions, failure reasons: " ++
644 then "unknown reasons"
646 else let (_, _, nodes, cv) = head sols
647 in printf ("score: %.8f, successes %d, failures %d (%s)" ++
648 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
649 (intercalate "/" . map Node.name $ nodes)
651 -- | Annotates a solution with the appropriate string.
652 annotateSolution :: AllocSolution -> AllocSolution
653 annotateSolution as = as { asLog = describeSolution as : asLog as }
655 -- | Reverses an evacuation solution.
657 -- Rationale: we always concat the results to the top of the lists, so
658 -- for proper jobset execution, we should reverse all lists.
659 reverseEvacSolution :: EvacSolution -> EvacSolution
660 reverseEvacSolution (EvacSolution f m o) =
661 EvacSolution (reverse f) (reverse m) (reverse o)
663 -- | Generate the valid node allocation singles or pairs for a new instance.
664 genAllocNodes :: Group.List -- ^ Group list
665 -> Node.List -- ^ The node map
666 -> Int -- ^ The number of nodes required
667 -> Bool -- ^ Whether to drop or not
669 -> Result AllocNodes -- ^ The (monadic) result
670 genAllocNodes gl nl count drop_unalloc =
671 let filter_fn = if drop_unalloc
672 then filter (Group.isAllocable .
673 flip Container.find gl . Node.group)
675 all_nodes = filter_fn $ getOnline nl
676 all_pairs = liftM2 (,) all_nodes all_nodes
677 ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y &&
678 Node.group x == Node.group y) all_pairs
680 1 -> Ok (Left (map Node.idx all_nodes))
681 2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs))
682 _ -> Bad "Unsupported number of nodes, only one or two supported"
684 -- | Try to allocate an instance on the cluster.
685 tryAlloc :: (Monad m) =>
686 Node.List -- ^ The node list
687 -> Instance.List -- ^ The instance list
688 -> Instance.Instance -- ^ The instance to allocate
689 -> AllocNodes -- ^ The allocation targets
690 -> m AllocSolution -- ^ Possible solution list
691 tryAlloc nl _ inst (Right ok_pairs) =
692 let sols = foldl' (\cstate (p, s) ->
693 concatAllocs cstate $ allocateOnPair nl inst p s
694 ) emptyAllocSolution ok_pairs
696 in if null ok_pairs -- means we have just one node
697 then fail "Not enough online nodes"
698 else return $ annotateSolution sols
700 tryAlloc nl _ inst (Left all_nodes) =
701 let sols = foldl' (\cstate ->
702 concatAllocs cstate . allocateOnSingle nl inst
703 ) emptyAllocSolution all_nodes
705 then fail "No online nodes"
706 else return $ annotateSolution sols
708 -- | Given a group/result, describe it as a nice (list of) messages.
709 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
710 solutionDescription gl (groupId, result) =
712 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
713 Bad message -> [printf "Group %s: error %s" gname message]
714 where grp = Container.find groupId gl
715 gname = Group.name grp
716 pol = apolToString (Group.allocPolicy grp)
718 -- | From a list of possibly bad and possibly empty solutions, filter
719 -- only the groups with a valid result. Note that the result will be
720 -- reversed compared to the original list.
721 filterMGResults :: Group.List
722 -> [(Gdx, Result AllocSolution)]
723 -> [(Gdx, AllocSolution)]
724 filterMGResults gl = foldl' fn []
725 where unallocable = not . Group.isAllocable . flip Container.find gl
726 fn accu (gdx, rasol) =
729 Ok sol | null (asSolutions sol) -> accu
730 | unallocable gdx -> accu
731 | otherwise -> (gdx, sol):accu
733 -- | Sort multigroup results based on policy and score.
734 sortMGResults :: Group.List
735 -> [(Gdx, AllocSolution)]
736 -> [(Gdx, AllocSolution)]
737 sortMGResults gl sols =
738 let extractScore (_, _, _, x) = x
739 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
740 (extractScore . head . asSolutions) sol)
741 in sortBy (comparing solScore) sols
743 -- | Finds the best group for an instance on a multi-group cluster.
745 -- Only solutions in @preferred@ and @last_resort@ groups will be
746 -- accepted as valid, and additionally if the allowed groups parameter
747 -- is not null then allocation will only be run for those group
749 findBestAllocGroup :: Group.List -- ^ The group list
750 -> Node.List -- ^ The node list
751 -> Instance.List -- ^ The instance list
752 -> Maybe [Gdx] -- ^ The allowed groups
753 -> Instance.Instance -- ^ The instance to allocate
754 -> Int -- ^ Required number of nodes
755 -> Result (Gdx, AllocSolution, [String])
756 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
757 let groups = splitCluster mgnl mgil
758 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
760 sols = map (\(gid, (nl, il)) ->
761 (gid, genAllocNodes mggl nl cnt False >>=
762 tryAlloc nl il inst))
763 groups'::[(Gdx, Result AllocSolution)]
764 all_msgs = concatMap (solutionDescription mggl) sols
765 goodSols = filterMGResults mggl sols
766 sortedSols = sortMGResults mggl goodSols
767 in if null sortedSols
768 then Bad $ intercalate ", " all_msgs
769 else let (final_group, final_sol) = head sortedSols
770 in return (final_group, final_sol, all_msgs)
772 -- | Try to allocate an instance on a multi-group cluster.
773 tryMGAlloc :: Group.List -- ^ The group list
774 -> Node.List -- ^ The node list
775 -> Instance.List -- ^ The instance list
776 -> Instance.Instance -- ^ The instance to allocate
777 -> Int -- ^ Required number of nodes
778 -> Result AllocSolution -- ^ Possible solution list
779 tryMGAlloc mggl mgnl mgil inst cnt = do
780 (best_group, solution, all_msgs) <-
781 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
782 let group_name = Group.name $ Container.find best_group mggl
783 selmsg = "Selected group: " ++ group_name
784 return $ solution { asLog = selmsg:all_msgs }
786 -- | Try to relocate an instance on the cluster.
787 tryReloc :: (Monad m) =>
788 Node.List -- ^ The node list
789 -> Instance.List -- ^ The instance list
790 -> Idx -- ^ The index of the instance to move
791 -> Int -- ^ The number of nodes required
792 -> [Ndx] -- ^ Nodes which should not be used
793 -> m AllocSolution -- ^ Solution list
794 tryReloc nl il xid 1 ex_idx =
795 let all_nodes = getOnline nl
796 inst = Container.find xid il
797 ex_idx' = Instance.pNode inst:ex_idx
798 valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
799 valid_idxes = map Node.idx valid_nodes
800 sols1 = foldl' (\cstate x ->
803 applyMove nl inst (ReplaceSecondary x)
804 return (mnl, i, [Container.find x mnl],
806 in concatAllocs cstate em
807 ) emptyAllocSolution valid_idxes
810 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
811 \destinations required (" ++ show reqn ++
812 "), only one supported"
814 -- | Change an instance's secondary node.
815 evacInstance :: (Monad m) =>
816 [Ndx] -- ^ Excluded nodes
817 -> Instance.List -- ^ The current instance list
818 -> (Node.List, AllocSolution) -- ^ The current state
819 -> Idx -- ^ The instance to evacuate
820 -> m (Node.List, AllocSolution)
821 evacInstance ex_ndx il (nl, old_as) idx = do
822 -- FIXME: hardcoded one node here
824 -- Longer explanation: evacuation is currently hardcoded to DRBD
825 -- instances (which have one secondary); hence, even if the
826 -- IAllocator protocol can request N nodes for an instance, and all
827 -- the message parsing/loading pass this, this implementation only
828 -- supports one; this situation needs to be revisited if we ever
829 -- support more than one secondary, or if we change the storage
831 new_as <- tryReloc nl il idx 1 ex_ndx
832 case asSolutions new_as of
833 -- an individual relocation succeeded, we kind of compose the data
834 -- from the two solutions
835 csol@(nl', _, _, _):_ ->
836 return (nl', new_as { asSolutions = csol:asSolutions old_as })
837 -- this relocation failed, so we fail the entire evac
838 _ -> fail $ "Can't evacuate instance " ++
839 Instance.name (Container.find idx il) ++
840 ": " ++ describeSolution new_as
842 -- | Try to evacuate a list of nodes.
843 tryEvac :: (Monad m) =>
844 Node.List -- ^ The node list
845 -> Instance.List -- ^ The instance list
846 -> [Idx] -- ^ Instances to be evacuated
847 -> [Ndx] -- ^ Restricted nodes (the ones being evacuated)
848 -> m AllocSolution -- ^ Solution list
849 tryEvac nl il idxs ex_ndx = do
850 (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs
853 -- | Function which fails if the requested mode is change secondary.
855 -- This is useful since except DRBD, no other disk template can
856 -- execute change secondary; thus, we can just call this function
857 -- instead of always checking for secondary mode. After the call to
858 -- this function, whatever mode we have is just a primary change.
859 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
860 failOnSecondaryChange ChangeSecondary dt =
861 fail $ "Instances with disk template '" ++ dtToString dt ++
862 "' can't execute change secondary"
863 failOnSecondaryChange _ _ = return ()
865 -- | Run evacuation for a single instance.
867 -- /Note:/ this function should correctly execute both intra-group
868 -- evacuations (in all modes) and inter-group evacuations (in the
869 -- 'ChangeAll' mode). Of course, this requires that the correct list
870 -- of target nodes is passed.
871 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
872 -> Instance.List -- ^ Instance list (cluster-wide)
873 -> EvacMode -- ^ The evacuation mode
874 -> Instance.Instance -- ^ The instance to be evacuated
875 -> Gdx -- ^ The group we're targetting
876 -> [Ndx] -- ^ The list of available nodes
878 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
879 nodeEvacInstance _ _ mode (Instance.Instance
880 {Instance.diskTemplate = dt@DTDiskless}) _ _ =
881 failOnSecondaryChange mode dt >>
882 fail "Diskless relocations not implemented yet"
884 nodeEvacInstance _ _ _ (Instance.Instance
885 {Instance.diskTemplate = DTPlain}) _ _ =
886 fail "Instances of type plain cannot be relocated"
888 nodeEvacInstance _ _ _ (Instance.Instance
889 {Instance.diskTemplate = DTFile}) _ _ =
890 fail "Instances of type file cannot be relocated"
892 nodeEvacInstance _ _ mode (Instance.Instance
893 {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
894 failOnSecondaryChange mode dt >>
895 fail "Shared file relocations not implemented yet"
897 nodeEvacInstance _ _ mode (Instance.Instance
898 {Instance.diskTemplate = dt@DTBlock}) _ _ =
899 failOnSecondaryChange mode dt >>
900 fail "Block device relocations not implemented yet"
902 nodeEvacInstance nl il ChangePrimary
903 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
906 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
907 let idx = Instance.idx inst
908 il' = Container.add idx inst' il
909 ops = iMoveToJob nl' il' idx Failover
910 return (nl', il', ops)
912 nodeEvacInstance nl il ChangeSecondary
913 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
916 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
918 foldl' (evacDrbdSecondaryInner nl inst gdx)
919 (Left "no nodes available") avail_nodes
920 let idx = Instance.idx inst
921 il' = Container.add idx inst' il
922 ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
923 return (nl', il', ops)
925 -- The algorithm for ChangeAll is as follows:
927 -- * generate all (primary, secondary) node pairs for the target groups
928 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
929 -- the final node list state and group score
930 -- * select the best choice via a foldl that uses the same Either
931 -- String solution as the ChangeSecondary mode
932 nodeEvacInstance nl il ChangeAll
933 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
936 let no_nodes = Left "no nodes available"
937 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
938 (nl', il', ops, _) <-
939 annotateResult "Can't find any good nodes for relocation" $
942 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
946 -- we don't need more details (which
947 -- nodes, etc.) as we only selected
948 -- this group if we can allocate on
949 -- it, hence failures will not
950 -- propagate out of this fold loop
951 Left _ -> Left $ "Allocation failed: " ++ msg
952 Ok result@(_, _, _, new_cv) ->
953 let new_accu = Right result in
956 Right (_, _, _, old_cv) ->
960 ) no_nodes node_pairs
962 return (nl', il', ops)
964 -- | Inner fold function for changing secondary of a DRBD instance.
966 -- The running solution is either a @Left String@, which means we
967 -- don't have yet a working solution, or a @Right (...)@, which
968 -- represents a valid solution; it holds the modified node list, the
969 -- modified instance (after evacuation), the score of that solution,
970 -- and the new secondary node index.
971 evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
972 -> Instance.Instance -- ^ Instance being evacuated
973 -> Gdx -- ^ The group index of the instance
974 -> Either String ( Node.List
977 , Ndx) -- ^ Current best solution
978 -> Ndx -- ^ Node we're evaluating as new secondary
979 -> Either String ( Node.List
982 , Ndx) -- ^ New best solution
983 evacDrbdSecondaryInner nl inst gdx accu ndx =
984 case applyMove nl inst (ReplaceSecondary ndx) of
988 Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
989 " failed: " ++ show fm
990 OpGood (nl', inst', _, _) ->
991 let nodes = Container.elems nl'
992 -- The fromJust below is ugly (it can fail nastily), but
993 -- at this point we should have any internal mismatches,
994 -- and adding a monad here would be quite involved
995 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
996 new_cv = compCVNodes grpnodes
997 new_accu = Right (nl', inst', new_cv, ndx)
1000 Right (_, _, old_cv, _) ->
1005 -- | Compute result of changing all nodes of a DRBD instance.
1007 -- Given the target primary and secondary node (which might be in a
1008 -- different group or not), this function will 'execute' all the
1009 -- required steps and assuming all operations succceed, will return
1010 -- the modified node and instance lists, the opcodes needed for this
1011 -- and the new group score.
1012 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1013 -> Instance.List -- ^ Cluster instance list
1014 -> Instance.Instance -- ^ The instance to be moved
1015 -> Gdx -- ^ The target group index
1016 -- (which can differ from the
1017 -- current group of the
1019 -> (Ndx, Ndx) -- ^ Tuple of new
1020 -- primary\/secondary nodes
1021 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1022 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
1024 let primary = Container.find (Instance.pNode inst) nl
1025 idx = Instance.idx inst
1026 -- if the primary is offline, then we first failover
1027 (nl1, inst1, ops1) <-
1028 if Node.offline primary
1030 (nl', inst', _, _) <-
1031 annotateResult "Failing over to the secondary" $
1032 opToResult $ applyMove nl inst Failover
1033 return (nl', inst', [Failover])
1034 else return (nl, inst, [])
1035 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1037 ReplaceSecondary t_sdx)
1038 -- we now need to execute a replace secondary to the future
1040 (nl2, inst2, _, _) <-
1041 annotateResult "Changing secondary to new primary" $
1043 applyMove nl1 inst1 o1
1045 -- we now execute another failover, the primary stays fixed now
1046 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1047 opToResult $ applyMove nl2 inst2 o2
1049 -- and finally another replace secondary, to the final secondary
1050 (nl4, inst4, _, _) <-
1051 annotateResult "Changing secondary to final secondary" $
1053 applyMove nl3 inst3 o3
1055 il' = Container.add idx inst4 il
1056 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1057 let nodes = Container.elems nl4
1058 -- The fromJust below is ugly (it can fail nastily), but
1059 -- at this point we should have any internal mismatches,
1060 -- and adding a monad here would be quite involved
1061 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1062 new_cv = compCVNodes grpnodes
1063 return (nl4, il', ops, new_cv)
1065 -- | Computes the nodes in a given group which are available for
1067 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1068 -> IntSet.IntSet -- ^ Nodes that are excluded
1069 -> Gdx -- ^ The group for which we
1071 -> Result [Ndx] -- ^ List of available node indices
1072 availableGroupNodes group_nodes excl_ndx gdx = do
1073 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1074 Ok (lookup gdx group_nodes)
1075 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1078 -- | Updates the evac solution with the results of an instance
1080 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1082 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1083 -> (Node.List, Instance.List, EvacSolution)
1084 updateEvacSolution (nl, il, es) idx (Bad msg) =
1085 (nl, il, es { esFailed = (idx, msg):esFailed es})
1086 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1087 (nl, il, es { esMoved = new_elem:esMoved es
1088 , esOpCodes = [opcodes]:esOpCodes es })
1089 where inst = Container.find idx il
1091 instancePriGroup nl inst,
1092 Instance.allNodes inst)
1094 -- | Node-evacuation IAllocator mode main function.
1095 tryNodeEvac :: Group.List -- ^ The cluster groups
1096 -> Node.List -- ^ The node list (cluster-wide, not per group)
1097 -> Instance.List -- ^ Instance list (cluster-wide)
1098 -> EvacMode -- ^ The evacuation mode
1099 -> [Idx] -- ^ List of instance (indices) to be evacuated
1100 -> Result (Node.List, Instance.List, EvacSolution)
1101 tryNodeEvac _ ini_nl ini_il mode idxs =
1102 let evac_ndx = nodesToEvacuate ini_il mode idxs
1103 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1104 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1105 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1106 (Container.elems nl))) $
1107 splitCluster ini_nl ini_il
1108 (fin_nl, fin_il, esol) =
1109 foldl' (\state@(nl, il, _) inst ->
1110 let gdx = instancePriGroup nl inst
1111 pdx = Instance.pNode inst in
1112 updateEvacSolution state (Instance.idx inst) $
1113 availableGroupNodes group_ndx
1114 (IntSet.insert pdx excl_ndx) gdx >>=
1115 nodeEvacInstance nl il mode inst gdx
1117 (ini_nl, ini_il, emptyEvacSolution)
1118 (map (`Container.find` ini_il) idxs)
1119 in return (fin_nl, fin_il, reverseEvacSolution esol)
1121 -- | Change-group IAllocator mode main function.
1123 -- This is very similar to 'tryNodeEvac', the only difference is that
1124 -- we don't choose as target group the current instance group, but
1127 -- 1. at the start of the function, we compute which are the target
1128 -- groups; either no groups were passed in, in which case we choose
1129 -- all groups out of which we don't evacuate instance, or there were
1130 -- some groups passed, in which case we use those
1132 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1133 -- best group to hold the instance, and then we do what
1134 -- 'tryNodeEvac' does, except for this group instead of the current
1137 -- Note that the correct behaviour of this function relies on the
1138 -- function 'nodeEvacInstance' to be able to do correctly both
1139 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1140 tryChangeGroup :: Group.List -- ^ The cluster groups
1141 -> Node.List -- ^ The node list (cluster-wide)
1142 -> Instance.List -- ^ Instance list (cluster-wide)
1143 -> [Gdx] -- ^ Target groups; if empty, any
1144 -- groups not being evacuated
1145 -> [Idx] -- ^ List of instance (indices) to be evacuated
1146 -> Result (Node.List, Instance.List, EvacSolution)
1147 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1148 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1149 flip Container.find ini_il) idxs
1150 target_gdxs = (if null gdxs
1151 then Container.keys gl
1152 else gdxs) \\ evac_gdxs
1153 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1154 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1155 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1156 (Container.elems nl))) $
1157 splitCluster ini_nl ini_il
1158 (fin_nl, fin_il, esol) =
1159 foldl' (\state@(nl, il, _) inst ->
1161 let ncnt = Instance.requiredNodes $
1162 Instance.diskTemplate inst
1163 (gdx, _, _) <- findBestAllocGroup gl nl il
1164 (Just target_gdxs) inst ncnt
1165 av_nodes <- availableGroupNodes group_ndx
1167 nodeEvacInstance nl il ChangeAll inst
1169 in updateEvacSolution state
1170 (Instance.idx inst) solution
1172 (ini_nl, ini_il, emptyEvacSolution)
1173 (map (`Container.find` ini_il) idxs)
1174 in return (fin_nl, fin_il, reverseEvacSolution esol)
1176 -- | Recursively place instances on the cluster until we're out of space.
1177 iterateAlloc :: Node.List
1180 -> Instance.Instance
1182 -> [Instance.Instance]
1184 -> Result AllocResult
1185 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1186 let depth = length ixes
1187 newname = printf "new-%d" depth::String
1188 newidx = length (Container.elems il) + depth
1189 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1190 newlimit = fmap (flip (-) 1) limit
1191 in case tryAlloc nl il newi2 allocnodes of
1193 Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
1194 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1197 (xnl, xi, _, _):[] ->
1200 else iterateAlloc xnl (Container.add newidx xi il)
1201 newlimit newinst allocnodes (xi:ixes)
1202 (totalResources xnl:cstats)
1203 _ -> Bad "Internal error: multiple solutions for single\
1206 -- | The core of the tiered allocation mode.
1207 tieredAlloc :: Node.List
1210 -> Instance.Instance
1212 -> [Instance.Instance]
1214 -> Result AllocResult
1215 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1216 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1218 Ok (errs, nl', il', ixes', cstats') ->
1219 let newsol = Ok (errs, nl', il', ixes', cstats')
1220 ixes_cnt = length ixes'
1221 (stop, newlimit) = case limit of
1222 Nothing -> (False, Nothing)
1223 Just n -> (n <= ixes_cnt,
1224 Just (n - ixes_cnt)) in
1225 if stop then newsol else
1226 case Instance.shrinkByType newinst . fst . last $
1227 sortBy (comparing snd) errs of
1229 Ok newinst' -> tieredAlloc nl' il' newlimit
1230 newinst' allocnodes ixes' cstats'
1232 -- * Formatting functions
1234 -- | Given the original and final nodes, computes the relocation description.
1235 computeMoves :: Instance.Instance -- ^ The instance to be moved
1236 -> String -- ^ The instance name
1237 -> IMove -- ^ The move being performed
1238 -> String -- ^ New primary
1239 -> String -- ^ New secondary
1240 -> (String, [String])
1241 -- ^ Tuple of moves and commands list; moves is containing
1242 -- either @/f/@ for failover or @/r:name/@ for replace
1243 -- secondary, while the command list holds gnt-instance
1244 -- commands (without that prefix), e.g \"@failover instance1@\"
1245 computeMoves i inam mv c d =
1247 Failover -> ("f", [mig])
1248 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1249 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1250 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1251 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1252 where morf = if Instance.running i then "migrate" else "failover"
1253 mig = printf "%s -f %s" morf inam::String
1254 rep n = printf "replace-disks -n %s %s" n inam
1256 -- | Converts a placement to string format.
1257 printSolutionLine :: Node.List -- ^ The node list
1258 -> Instance.List -- ^ The instance list
1259 -> Int -- ^ Maximum node name length
1260 -> Int -- ^ Maximum instance name length
1261 -> Placement -- ^ The current placement
1262 -> Int -- ^ The index of the placement in
1264 -> (String, [String])
1265 printSolutionLine nl il nmlen imlen plc pos =
1267 pmlen = (2*nmlen + 1)
1268 (i, p, s, mv, c) = plc
1269 inst = Container.find i il
1270 inam = Instance.alias inst
1271 npri = Node.alias $ Container.find p nl
1272 nsec = Node.alias $ Container.find s nl
1273 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1274 osec = Node.alias $ Container.find (Instance.sNode inst) nl
1275 (moves, cmds) = computeMoves inst inam mv npri nsec
1276 ostr = printf "%s:%s" opri osec::String
1277 nstr = printf "%s:%s" npri nsec::String
1279 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
1280 pos imlen inam pmlen ostr
1284 -- | Return the instance and involved nodes in an instance move.
1286 -- Note that the output list length can vary, and is not required nor
1287 -- guaranteed to be of any specific length.
1288 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1289 -- the instance from its index; note
1290 -- that this /must/ be the original
1291 -- instance list, so that we can
1292 -- retrieve the old nodes
1293 -> Placement -- ^ The placement we're investigating,
1294 -- containing the new nodes and
1296 -> [Ndx] -- ^ Resulting list of node indices
1297 involvedNodes il plc =
1298 let (i, np, ns, _, _) = plc
1299 inst = Container.find i il
1300 in nub $ [np, ns] ++ Instance.allNodes inst
1302 -- | Inner function for splitJobs, that either appends the next job to
1303 -- the current jobset, or starts a new jobset.
1304 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1305 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1306 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1307 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1308 | otherwise = ([n]:cjs, ndx)
1310 -- | Break a list of moves into independent groups. Note that this
1311 -- will reverse the order of jobs.
1312 splitJobs :: [MoveJob] -> [JobSet]
1313 splitJobs = fst . foldl mergeJobs ([], [])
1315 -- | Given a list of commands, prefix them with @gnt-instance@ and
1316 -- also beautify the display a little.
1317 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1318 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1320 printf " echo job %d/%d" jsn sn:
1322 map (" gnt-instance " ++) cmds
1324 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1327 -- | Given a list of commands, prefix them with @gnt-instance@ and
1328 -- also beautify the display a little.
1329 formatCmds :: [JobSet] -> String
1332 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1336 -- | Print the node list.
1337 printNodes :: Node.List -> [String] -> String
1339 let fields = case fs of
1340 [] -> Node.defaultFields
1341 "+":rest -> Node.defaultFields ++ rest
1343 snl = sortBy (comparing Node.idx) (Container.elems nl)
1344 (header, isnum) = unzip $ map Node.showHeader fields
1345 in unlines . map ((:) ' ' . intercalate " ") $
1346 formatTable (header:map (Node.list fields) snl) isnum
1348 -- | Print the instance list.
1349 printInsts :: Node.List -> Instance.List -> String
1351 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1352 helper inst = [ if Instance.running inst then "R" else " "
1353 , Instance.name inst
1354 , Container.nameOf nl (Instance.pNode inst)
1355 , let sdx = Instance.sNode inst
1356 in if sdx == Node.noSecondary
1358 else Container.nameOf nl sdx
1359 , if Instance.autoBalance inst then "Y" else "N"
1360 , printf "%3d" $ Instance.vcpus inst
1361 , printf "%5d" $ Instance.mem inst
1362 , printf "%5d" $ Instance.dsk inst `div` 1024
1368 where DynUtil lC lM lD lN = Instance.util inst
1369 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1370 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1371 isnum = False:False:False:False:False:repeat True
1372 in unlines . map ((:) ' ' . intercalate " ") $
1373 formatTable (header:map helper sil) isnum
1375 -- | Shows statistics for a given node list.
1376 printStats :: Node.List -> String
1378 let dcvs = compDetailedCV $ Container.elems nl
1379 (weights, names) = unzip detailedCVInfo
1380 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1381 formatted = map (\(w, header, val) ->
1382 printf "%s=%.8f(x%.2f)" header val w::String) hd
1383 in intercalate ", " formatted
1385 -- | Convert a placement into a list of OpCodes (basically a job).
1386 iMoveToJob :: Node.List -- ^ The node list; only used for node
1387 -- names, so any version is good
1388 -- (before or after the operation)
1389 -> Instance.List -- ^ The instance list; also used for
1391 -> Idx -- ^ The index of the instance being
1393 -> IMove -- ^ The actual move to be described
1394 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1396 iMoveToJob nl il idx move =
1397 let inst = Container.find idx il
1398 iname = Instance.name inst
1399 lookNode = Just . Container.nameOf nl
1400 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1401 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1402 OpCodes.ReplaceNewSecondary [] Nothing
1405 ReplacePrimary np -> [ opF, opR np, opF ]
1406 ReplaceSecondary ns -> [ opR ns ]
1407 ReplaceAndFailover np -> [ opR np, opF ]
1408 FailoverAndReplace ns -> [ opF, opR ns ]
1410 -- * Node group functions
1412 -- | Computes the group of an instance.
1413 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1414 instanceGroup nl i =
1415 let sidx = Instance.sNode i
1416 pnode = Container.find (Instance.pNode i) nl
1417 snode = if sidx == Node.noSecondary
1419 else Container.find sidx nl
1420 pgroup = Node.group pnode
1421 sgroup = Node.group snode
1422 in if pgroup /= sgroup
1423 then fail ("Instance placed accross two node groups, primary " ++
1424 show pgroup ++ ", secondary " ++ show sgroup)
1427 -- | Computes the group of an instance per the primary node.
1428 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1429 instancePriGroup nl i =
1430 let pnode = Container.find (Instance.pNode i) nl
1433 -- | Compute the list of badly allocated instances (split across node
1435 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1436 findSplitInstances nl =
1437 filter (not . isOk . instanceGroup nl) . Container.elems
1439 -- | Splits a cluster into the component node groups.
1440 splitCluster :: Node.List -> Instance.List ->
1441 [(Gdx, (Node.List, Instance.List))]
1442 splitCluster nl il =
1443 let ngroups = Node.computeGroups (Container.elems nl)
1444 in map (\(guuid, nodes) ->
1445 let nidxs = map Node.idx nodes
1446 nodes' = zip nidxs nodes
1447 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1448 in (guuid, (Container.fromList nodes', instances))) ngroups
1450 -- | Compute the list of nodes that are to be evacuated, given a list
1451 -- of instances and an evacuation mode.
1452 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1453 -> EvacMode -- ^ The evacuation mode we're using
1454 -> [Idx] -- ^ List of instance indices being evacuated
1455 -> IntSet.IntSet -- ^ Set of node indices
1456 nodesToEvacuate il mode =
1457 IntSet.delete Node.noSecondary .
1459 let i = Container.find idx il
1460 pdx = Instance.pNode i
1461 sdx = Instance.sNode i
1462 dt = Instance.diskTemplate i
1463 withSecondary = case dt of
1464 DTDrbd8 -> IntSet.insert sdx ns
1467 ChangePrimary -> IntSet.insert pdx ns
1468 ChangeSecondary -> withSecondary
1469 ChangeAll -> IntSet.insert pdx withSecondary