1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
37 -- * Generic functions
39 , computeAllocationDelta
40 -- * First phase functions
42 -- * Second phase functions
47 -- * Display functions
-- * Balancing functions
59 -- * IAllocator functions
67 -- * Allocation functions
70 -- * Node group functions
76 import qualified Data.IntSet as IntSet
78 import Data.Maybe (fromJust, isNothing)
79 import Data.Ord (comparing)
80 import Text.Printf (printf)
82 import qualified Ganeti.HTools.Container as Container
83 import qualified Ganeti.HTools.Instance as Instance
84 import qualified Ganeti.HTools.Node as Node
85 import qualified Ganeti.HTools.Group as Group
86 import Ganeti.HTools.Types
87 import Ganeti.HTools.Utils
88 import Ganeti.HTools.Compat
89 import qualified Ganeti.OpCodes as OpCodes
93 -- | Allocation\/relocation solution.
94 data AllocSolution = AllocSolution
95 { asFailures :: [FailMode] -- ^ Failure counts
96 , asAllocs :: Int -- ^ Good allocation count
97 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98 , asLog :: [String] -- ^ Informational messages
101 -- | Node evacuation/group change iallocator result type. This result
102 -- type consists of actual opcodes (a restricted subset) that are
103 -- transmitted back to Ganeti.
104 data EvacSolution = EvacSolution
105 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
106 , esFailed :: [(Idx, String)] -- ^ Instances which were not
108 , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
--
-- Components, in order: the accumulated failure statistics, the final
-- node list, the final instance list, the list of instances placed,
-- and a list of cluster statistics (presumably one per allocation
-- round -- confirm against 'iterateAlloc').
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])
-- | A type denoting the valid allocation modes/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
-- | The empty allocation solution, used as the starting accumulator
-- when computing allocations.
emptyAllocSolution :: AllocSolution
emptyAllocSolution =
  AllocSolution { asFailures = []
                , asAllocs = 0
                , asSolution = Nothing
                , asLog = []
                }
129 -- | The empty evac solution.
130 emptyEvacSolution :: EvacSolution
131 emptyEvacSolution = EvacSolution { esMoved = []
-- | The complete state for the balancing solution.
--
-- Holds, in order: the node list, the instance list, the cluster
-- score of this state, and the list of placements performed so far.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show, Read)
-- | Cluster statistics data type.
--
-- All integral fields are summed over the whole cluster except the
-- @csM*@ fields, which track per-node maxima.
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
                     , csFdsk :: Integer -- ^ Cluster free disk
                     , csAmem :: Integer -- ^ Cluster allocatable mem
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
                     , csMmem :: Integer -- ^ Max node allocatable mem
                     , csMdsk :: Integer -- ^ Max node allocatable disk
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
                     , csImem :: Integer -- ^ Instance used mem
                     , csIdsk :: Integer -- ^ Instance used disk
                     , csIcpu :: Integer -- ^ Instance used cpu
                     , csTmem :: Double -- ^ Cluster total mem
                     , csTdsk :: Double -- ^ Cluster total disk
                     , csTcpu :: Double -- ^ Cluster total cpus
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
                                         -- node pCpu has been set,
                     , csXmem :: Integer -- ^ Unaccounted for mem
                     , csNmem :: Integer -- ^ Node own memory
                     , csScore :: Score -- ^ The cluster score
                     , csNinst :: Int -- ^ The total number of instances
              deriving (Show, Read)
-- | Resource-usage triple: currently used, still possible to
-- allocate, and unallocable (in that order).
type AllocStats = (RSpec, RSpec, RSpec)
168 -- * Utility functions
-- | Verifies the N+1 status, returning the sublist of nodes that
-- fail the 'Node.failN1' check.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [ node | node <- nodes, Node.failN1 node ]
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes. -}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      -- every instance hosted (as primary or secondary) on a bad node
      -- is itself considered bad
      bad_instances = map (`Container.find` il) .
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
    (bad_nodes, bad_instances)
-- | Zero-initializer for the CStats type.
--
-- NOTE(review): positional construction -- the number of zeros must
-- exactly match the field count of 'CStats' (currently 19); keep in
-- sync if fields are added.
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-- | Update stats with data from a new node.
--
-- Folds one node's resource data into the running 'CStats'
-- accumulator; used via 'foldl'' in 'totalResources'.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
      -- allocatable memory: free minus reserved, clamped at zero
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      -- instance-used memory: total minus node-own, unaccounted and
      -- free memory
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
          -- the csM* fields track maxima, not sums
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
-- | Compute the total free disk and memory in the cluster.
--
-- Folds 'updateCStats' over all nodes and fills in the cluster score
-- via 'compCV'.
totalResources :: Node.List -> CStats
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
  in cs { csScore = compCV nl }
-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- currently used values, the delta that was still allocated, and
-- what was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let -- resources in use at the initial state
      r_initial = RSpec (fromIntegral (csIcpu cini))
                  (fromIntegral (csImem cini))
                  (fromIntegral (csIdsk cini))
      -- what was additionally allocated between the two states
      r_allocated = RSpec (fromIntegral (csIcpu cfin - csIcpu cini))
                    (fromIntegral (csImem cfin - csImem cini))
                    (fromIntegral (csIdsk cfin - csIdsk cini))
      -- remaining (unallocated) resources in the final state
      cpu_left = fromIntegral (csVcpu cfin - csIcpu cfin)::Int
      r_unallocated = RSpec cpu_left
                      (truncate (csTmem cfin) - fromIntegral (csImem cfin))
                      (truncate (csTdsk cfin) - fromIntegral (csIdsk cfin))
  in (r_initial, r_allocated, r_unallocated)
264 -- | The names and weights of the individual elements in the CV list.
265 detailedCVInfo :: [(Double, String)]
266 detailedCVInfo = [ (1, "free_mem_cv")
267 , (1, "free_disk_cv")
269 , (1, "reserved_mem_cv")
270 , (4, "offline_all_cnt")
271 , (16, "offline_pri_cnt")
272 , (1, "vcpu_ratio_cv")
275 , (1, "disk_load_cv")
277 , (2, "pri_tags_score")
-- | The weights used by 'compCVNodes' for each metric, extracted
-- from 'detailedCVInfo'.
detailedCVWeights :: [Double]
detailedCVWeights = [ weight | (weight, _) <- detailedCVInfo ]
-- | Compute the mem and disk covariance.
--
-- Returns the raw (unweighted) list of metrics; 'compCVNodes'
-- combines them with 'detailedCVWeights'. The order here must match
-- the order of 'detailedCVInfo'.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
    -- offline nodes are excluded from the covariance metrics and
    -- only contribute to the dedicated offline counters below
    (offline, nodes) = partition Node.offline all_nodes
    mem_l = map Node.pMem nodes
    dsk_l = map Node.pDsk nodes
    -- metric: memory covariance
    mem_cv = stdDev mem_l
    -- metric: disk covariance
    dsk_cv = stdDev dsk_l
    -- metric: count of instances living on N1 failing nodes
    n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                               length (Node.pList n)) .
               filter Node.failN1 $ nodes :: Double
    res_l = map Node.pRem nodes
    -- metric: reserved memory covariance
    res_cv = stdDev res_l
    -- offline instances metrics
    offline_ipri = sum . map (length . Node.pList) $ offline
    offline_isec = sum . map (length . Node.sList) $ offline
    -- metric: count of instances on offline nodes
    off_score = fromIntegral (offline_ipri + offline_isec)::Double
    -- metric: count of primary instances on offline nodes (this
    -- helps with evacuation/failover of primary instances on
    -- 2-node clusters with one node offline)
    off_pri_score = fromIntegral offline_ipri::Double
    cpu_l = map Node.pCpu nodes
    -- metric: covariance of vcpu/pcpu ratio
    cpu_cv = stdDev cpu_l
    -- metrics: covariance of cpu, memory, disk and network load
    (c_load, m_load, d_load, n_load) = unzip4 $
        let DynUtil c1 m1 d1 n1 = Node.utilLoad n
            DynUtil c2 m2 d2 n2 = Node.utilPool n
        in (c1/c2, m1/m2, d1/d2, n1/n2)
    -- metric: conflicting instance count
    pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
    pri_tags_score = fromIntegral pri_tags_inst::Double
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
-- | Compute the /total/ cluster variance: the weighted sum of all
-- detailed metrics.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes = sum (zipWith (*) detailedCVWeights (compDetailedCV nodes))
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
-- | Extract the online (not offline) nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = filter (not . Node.offline) (Container.elems nl)
340 -- * Balancing functions
-- | Choose the better (lower-scoring) of two tables. Note that the
-- ordering of the arguments is important: on equal scores the first
-- argument wins.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _)
  | a_cv > b_cv = b
  | otherwise = a
-- | Applies an instance move to a given node list and instance.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f): swap primary and secondary roles in place.
applyMove nl inst Failover =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      -- allow over-committing when the old primary is offline
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        -- the old secondary becomes the new primary ...
        new_p <- Node.addPriEx force_p int_s inst
        -- ... and the old primary becomes the new secondary
        new_s <- Node.addSec int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      -- allow over-committing when the old primary is offline
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        -- check that the current secondary can host the instance
        -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
-- Replace the secondary (r:ns); the primary stays untouched.
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      -- allow over-committing when the old secondary is offline
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      -- allow over-committing when the old secondary is offline
      force_s = Node.offline old_s
      new_nl = do -- Maybe monad
        -- the target node becomes the new primary, the old primary
        -- becomes the new secondary
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      -- allow over-committing when the old primary is offline
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        -- the old secondary becomes the new primary, the target node
        -- the new secondary
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
  let p = Container.find new_pdx nl
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
  in Node.addPri p inst >>= \new_p -> do
    let new_nl = Container.add new_pdx new_p nl
        -- BUGFIX: the score must be computed on the node list that
        -- includes the new allocation ('new_nl'), not on the original
        -- 'nl' -- otherwise every candidate node yields the same
        -- score and 'bestAllocElement' cannot pick the best node
        -- (cf. 'allocateOnPair', which scores the updated list)
        new_score = compCV new_nl
    return (new_nl, new_inst, [new_p], new_score)
-- | Tries to allocate an instance on a given pair of nodes
-- (primary, then secondary).
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx =
  let tgt_p = Container.find new_pdx nl
      tgt_s = Container.find new_sdx nl
        new_p <- Node.addPri tgt_p inst
        new_s <- Node.addSec tgt_s inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx new_sdx
            new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
        -- note: the score is computed on the updated node list
        return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
    Table ini_nl ini_il _ ini_plc = ini_tbl
    tmp_resu = applyMove ini_nl target move
    -- on a successful move, build the updated table and keep
    -- whichever of (current best, updated) scores better
    OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
      let tgt_idx = Instance.idx target
          upd_cvar = compCV upd_nl
          upd_il = Container.add tgt_idx new_inst ini_il
          -- the new placement is prepended to the placement history
          upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
          upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
    compareTables cur_tbl upd_tbl
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- the instance.
possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
              -> Bool -- ^ Whether we can change the primary node
              -> Ndx -- ^ Target node candidate
              -> [IMove] -- ^ List of valid result moves
-- primary changes disallowed: only the secondary may be replaced
possibleMoves _ False tdx =
  [ReplaceSecondary tdx]
-- secondary is a valid new node and primary changes are allowed
possibleMoves True True tdx =
  [ReplaceSecondary tdx,
   ReplaceAndFailover tdx,
   FailoverAndReplace tdx]
-- primary changes allowed but the secondary is not a valid new node
possibleMoves False True tdx =
  [ReplaceSecondary tdx,
   ReplaceAndFailover tdx]
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
                  -> Bool -- ^ Whether disk moves are allowed
                  -> Bool -- ^ Whether instance moves are allowed
                  -> Table -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
    opdx = Instance.pNode target
    osdx = Instance.sNode target
    -- the instance's own nodes are not valid targets
    nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
    use_secondary = elem osdx nodes_idx && inst_moves
    aft_failover = if use_secondary -- if allowed to failover
                     then checkSingleStep ini_tbl target ini_tbl Failover
    all_moves = if disk_moves
                  (possibleMoves use_secondary inst_moves) nodes
    -- iterate over the possible nodes for this instance
    foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
-- | Compute the best next move.
checkMove :: [Ndx] -- ^ Allowed target node indices
          -> Bool -- ^ Whether disk moves are allowed
          -> Bool -- ^ Whether instance moves are allowed
          -> Table -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
      -- an unchanged placement count means no move improved the score
  in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
-- | Check whether we are allowed to go deeper in the balancing:
-- there must be rounds remaining and the score must still be above
-- the stop threshold.
doNextBalance :: Table -- ^ The starting table
              -> Int -- ^ Remaining length
              -> Score -- ^ Score at which to stop
              -> Bool -- ^ The resulting table and commands
doNextBalance ini_tbl max_rounds min_score =
  rounds_left && ini_cv > min_score
  where Table _ _ ini_cv plcs = ini_tbl
        -- a negative max_rounds means "unlimited rounds"
        rounds_left = max_rounds < 0 || length plcs < max_rounds
-- | Run a balance move.
--
-- Returns @Just@ the improved table when this round lowered the
-- cluster score (subject to the minimum-gain thresholds).
tryBalance :: Table -- ^ The starting table
           -> Bool -- ^ Allow disk moves
           -> Bool -- ^ Allow instance moves
           -> Bool -- ^ Only evacuate moves
           -> Score -- ^ Min gain threshold
           -> Score -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
  let Table ini_nl ini_il ini_cv _ = ini_tbl
      all_inst = Container.elems ini_il
      -- in evacuation mode, only instances touching offline nodes
      -- are candidates for moving
      all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx . filter Node.offline $
                                         Container.elems ini_nl
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
      reloc_inst = filter Instance.movable all_inst'
      node_idx = map Node.idx . filter (not . Node.offline) $
                 Container.elems ini_nl
      fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
      (Table _ _ fin_cv _) = fin_tbl
  if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
    then Just fin_tbl -- this round made success, return the new table
596 -- * Allocation functions
-- | Build failure stats out of a list of failures.
--
-- Counts, for each failure mode, how many times it occurs in the
-- input list.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
  map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
-- | Compares two Maybe AllocElement and chooses the one with the
-- best (lowest) score; a 'Nothing' always loses to a 'Just'.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore))
  | ascore < bscore = a
  | otherwise = b
-- | Update current Allocation solution and failure stats with new
-- allocation results.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
-- a failed attempt only records the failure reason
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }

concatAllocs as (OpGood ns) =
  let -- Choose the old or new solution, based on the cluster score
      osols = asSolution as
      nsols = bestAllocElement osols (Just ns)
      -- Note: we force evaluation of nsols here in order to keep the
      -- memory profile low - we know that we will need nsols for sure
      -- in the next cycle, so we force evaluation of nsols, since the
      -- foldl' in the caller will only evaluate the tuple, but not the
      -- elements of the tuple
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
-- | Sums two 'AllocSolution' structures, merging failures, success
-- counts, logs, and keeping the better of the two solutions.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  let nFails = bFails ++ aFails
      nAllocs = aAllocs + bAllocs
      nSols = bestAllocElement aSols bSols
  in AllocSolution nFails nAllocs nSols nLog
-- | Given a solution, generates a reasonable description for it.
--
-- With no solution, lists the failure reasons; otherwise reports the
-- score, success/failure counts and the chosen node(s).
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
    Nothing -> "No valid allocation solutions, failure reasons: " ++
               (if null fcnt then "unknown reasons" else freasons)
    Just (_, _, nodes, cv) ->
      printf ("score: %.8f, successes %d, failures %d (%s)" ++
              " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
      (intercalate "/" . map Node.name $ nodes)
-- | Annotates a solution by prepending its own description to the
-- solution's log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution solution =
  solution { asLog = describeSolution solution : asLog solution }
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution moved failed opcodes) =
  EvacSolution (reverse moved) (reverse failed) (reverse opcodes)
-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List -- ^ Group list
              -> Node.List -- ^ The node map
              -> Int -- ^ The number of nodes required
              -> Bool -- ^ Whether to drop or not
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  let filter_fn = if drop_unalloc
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
      all_nodes = filter_fn $ getOnline nl
      -- candidate pairs: distinct nodes belonging to the same group
      all_pairs = [(Node.idx p,
                    [Node.idx s | s <- all_nodes,
                                  Node.idx p /= Node.idx s,
                                  Node.group p == Node.group s]) |
    1 -> Ok (Left (map Node.idx all_nodes))
    2 -> Ok (Right (filter (not . null . snd) all_pairs))
    _ -> Bad "Unsupported number of nodes, only one or two supported"
-- | Try to allocate an instance on the cluster.
--
-- The pair case evaluates the per-primary-node sub-solutions in
-- parallel via 'parMap'; the single-node case is a plain fold.
tryAlloc :: (Monad m) =>
            Node.List -- ^ The node list
         -> Instance.List -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes -- ^ The allocation targets
         -> m AllocSolution -- ^ Possible solution list
tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
  let psols = parMap rwhnf (\(p, ss) ->
                concatAllocs cstate .
                allocateOnPair nl inst p)
              emptyAllocSolution ss) ok_pairs
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols

tryAlloc _ _ _ (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
  let sols = foldl' (\cstate ->
               concatAllocs cstate . allocateOnSingle nl inst
             ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols
-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
    -- a good solution yields one message per log entry, prefixed
    -- with the group name and its allocation policy
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
    Bad message -> [printf "Group %s: error %s" gname message]
  where grp = Container.find groupId gl
        gname = Group.name grp
        pol = apolToString (Group.allocPolicy grp)
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
filterMGResults gl = foldl' fn []
  where unallocable = not . Group.isAllocable . flip Container.find gl
        fn accu (gdx, rasol) =
            -- drop groups with no concrete solution or which are
            -- marked unallocable
            Ok sol | isNothing (asSolution sol) -> accu
                   | unallocable gdx -> accu
                   | otherwise -> (gdx, sol):accu
-- | Sort multigroup results based on policy and score.
--
-- Note: assumes every solution has a 'Just' allocation element,
-- which 'filterMGResults' guarantees for its output.
sortMGResults :: Group.List
              -> [(Gdx, AllocSolution)]
              -> [(Gdx, AllocSolution)]
sortMGResults gl sols = sortBy (comparing solScore) sols
  where solScore (gdx, sol) =
          (Group.allocPolicy (Container.find gdx gl),
           allocScore (fromJust (asSolution sol)))
        -- the score is the fourth component of an AllocElement
        allocScore (_, _, _, score) = score
-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List -- ^ The group list
                   -> Node.List -- ^ The node list
                   -> Instance.List -- ^ The instance list
                   -> Maybe [Gdx] -- ^ The allowed groups
                   -> Instance.Instance -- ^ The instance to allocate
                   -> Int -- ^ Required number of nodes
                   -> Result (Gdx, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let groups = splitCluster mgnl mgil
      -- restrict to the allowed groups, when given
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
      -- run a per-group allocation and pair it with the group index
      sols = map (\(gid, (nl, il)) ->
                    (gid, genAllocNodes mggl nl cnt False >>=
                          tryAlloc nl il inst))
             groups'::[(Gdx, Result AllocSolution)]
      all_msgs = concatMap (solutionDescription mggl) sols
      goodSols = filterMGResults mggl sols
      sortedSols = sortMGResults mggl goodSols
  in if null sortedSols
       then Bad $ intercalate ", " all_msgs
       else let (final_group, final_sol) = head sortedSols
            in return (final_group, final_sol, all_msgs)
-- | Try to allocate an instance on a multi-group cluster.
--
-- Thin wrapper over 'findBestAllocGroup' that records the chosen
-- group in the solution log.
tryMGAlloc :: Group.List -- ^ The group list
           -> Node.List -- ^ The node list
           -> Instance.List -- ^ The instance list
           -> Instance.Instance -- ^ The instance to allocate
           -> Int -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  (gdx, sol, msgs) <- findBestAllocGroup mggl mgnl mgil Nothing inst cnt
  let chosen = Group.name (Container.find gdx mggl)
  return sol { asLog = ("Selected group: " ++ chosen) : msgs }
-- | Try to relocate an instance on the cluster.
--
-- Only single-node relocation (changing the secondary) is supported;
-- any other required node count fails.
tryReloc :: (Monad m) =>
            Node.List -- ^ The node list
         -> Instance.List -- ^ The instance list
         -> Idx -- ^ The index of the instance to move
         -> Int -- ^ The number of nodes required
         -> [Ndx] -- ^ Nodes which should not be used
         -> m AllocSolution -- ^ Solution list
tryReloc nl il xid 1 ex_idx =
  let all_nodes = getOnline nl
      inst = Container.find xid il
      -- the current primary is never a valid relocation target
      ex_idx' = Instance.pNode inst:ex_idx
      valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
      valid_idxes = map Node.idx valid_nodes
      -- fold a ReplaceSecondary attempt for each candidate node into
      -- the running solution
      sols1 = foldl' (\cstate x ->
                applyMove nl inst (ReplaceSecondary x)
                return (mnl, i, [Container.find x mnl],
              in concatAllocs cstate em
              ) emptyAllocSolution valid_idxes

tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \
                               \destinations required (" ++ show reqn ++
                               "), only one supported"
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
  case mode of
    ChangeSecondary ->
      fail $ "Instances with disk template '" ++ dtToString dt ++
             "' can't execute change secondary"
    _ -> return ()
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
                 -> Instance.List -- ^ Instance list (cluster-wide)
                 -> EvacMode -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx -- ^ The group we're targeting
                 -> [Ndx] -- ^ The list of available nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless: secondary changes are rejected; primary relocation is
-- not implemented yet.
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTDiskless}) _ _ =
  failOnSecondaryChange mode dt >>
  fail "Diskless relocations not implemented yet"

-- Plain (local LVM) instances cannot be moved at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
  fail "Instances of type plain cannot be relocated"

-- File-based instances cannot be moved at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
  fail "Instances of type file cannot be relocated"

-- Shared file: secondary changes are rejected; relocation is not
-- implemented yet.
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
  failOnSecondaryChange mode dt >>
  fail "Shared file relocations not implemented yet"

-- Block device: secondary changes are rejected; relocation is not
-- implemented yet.
nodeEvacInstance _ _ mode (Instance.Instance
                           {Instance.diskTemplate = dt@DTBlock}) _ _ =
  failOnSecondaryChange mode dt >>
  fail "Block device relocations not implemented yet"
-- DRBD8, primary change: a plain failover to the current secondary
-- suffices.
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
  (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx Failover
  return (nl', il', ops)

-- DRBD8, secondary change: fold over the available nodes, trying
-- each as the new secondary (via 'evacDrbdSecondaryInner' --
-- presumably keeping the best-scoring candidate; confirm against
-- that helper's definition).
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
                          foldl' (evacDrbdSecondaryInner nl inst gdx)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
  return (nl', il', ops)
897 -- The algorithm for ChangeAll is as follows:
899 -- * generate all (primary, secondary) node pairs for the target groups
900 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
901 -- the final node list state and group score
902 -- * select the best choice via a foldl that uses the same Either
903 -- String solution as the ChangeSecondary mode
904 nodeEvacInstance nl il ChangeAll
905 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
908 let no_nodes = Left "no nodes available"
909 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
910 (nl', il', ops, _) <-
911 annotateResult "Can't find any good nodes for relocation" $
914 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
918 -- we don't need more details (which
919 -- nodes, etc.) as we only selected
920 -- this group if we can allocate on
921 -- it, hence failures will not
922 -- propagate out of this fold loop
923 Left _ -> Left $ "Allocation failed: " ++ msg
924 Ok result@(_, _, _, new_cv) ->
925 let new_accu = Right result in
928 Right (_, _, _, old_cv) ->
932 ) no_nodes node_pairs
934 return (nl', il', ops)
936 -- | Inner fold function for changing secondary of a DRBD instance.
938 -- The running solution is either a @Left String@, which means we
939 -- don't have yet a working solution, or a @Right (...)@, which
940 -- represents a valid solution; it holds the modified node list, the
941 -- modified instance (after evacuation), the score of that solution,
942 -- and the new secondary node index.
943 evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
944 -> Instance.Instance -- ^ Instance being evacuated
945 -> Gdx -- ^ The group index of the instance
946 -> Either String ( Node.List
949 , Ndx) -- ^ Current best solution
950 -> Ndx -- ^ Node we're evaluating as new secondary
951 -> Either String ( Node.List
954 , Ndx) -- ^ New best solution
-- Try the replace-secondary move onto node 'ndx'; on failure record
-- the failure text, on success compute the target group's score and
-- keep whichever of old/new accumulator is better.
-- NOTE(review): the failure-branch pattern lines are elided in this
-- listing ('fm' below is bound on a line not shown here).
955 evacDrbdSecondaryInner nl inst gdx accu ndx =
956 case applyMove nl inst (ReplaceSecondary ndx) of
960 Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
961 " failed: " ++ show fm
962 OpGood (nl', inst', _, _) ->
963 let nodes = Container.elems nl'
964 -- The fromJust below is ugly (it can fail nastily), but
965 -- at this point we should not have any internal mismatches,
966 -- and adding a monad here would be quite involved
967 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
968 new_cv = compCVNodes grpnodes
969 new_accu = Right (nl', inst', new_cv, ndx)
972 Right (_, _, old_cv, _) ->
977 -- | Compute result of changing all nodes of a DRBD instance.
979 -- Given the target primary and secondary node (which might be in a
980 -- different group or not), this function will 'execute' all the
981 -- required steps and assuming all operations succeed, will return
982 -- the modified node and instance lists, the opcodes needed for this
983 -- and the new group score.
984 evacDrbdAllInner :: Node.List -- ^ Cluster node list
985 -> Instance.List -- ^ Cluster instance list
986 -> Instance.Instance -- ^ The instance to be moved
987 -> Gdx -- ^ The target group index
988 -- (which can differ from the
989 -- current group of the
991 -> (Ndx, Ndx) -- ^ Tuple of new
992 -- primary\/secondary nodes
993 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
-- The visible move sequence is: an optional failover (when the
-- current primary is offline), replace-secondary to the new primary
-- (o1), failover (o2), then replace-secondary to the final secondary
-- (o3); the accumulated moves are converted to opcodes in order.
-- NOTE(review): several lines are elided in this listing (e.g. the
-- 'then do' line, 'opToResult $' lines and the binding of 'ops4');
-- verify against the repository before editing.
994 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) =
996 let primary = Container.find (Instance.pNode inst) nl
997 idx = Instance.idx inst
998 -- if the primary is offline, then we first failover
999 (nl1, inst1, ops1) <-
1000 if Node.offline primary
1002 (nl', inst', _, _) <-
1003 annotateResult "Failing over to the secondary" $
1004 opToResult $ applyMove nl inst Failover
1005 return (nl', inst', [Failover])
1006 else return (nl, inst, [])
1007 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1009 ReplaceSecondary t_sdx)
1010 -- we now need to execute a replace secondary to the future
1012 (nl2, inst2, _, _) <-
1013 annotateResult "Changing secondary to new primary" $
1015 applyMove nl1 inst1 o1
1017 -- we now execute another failover, the primary stays fixed now
1018 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1019 opToResult $ applyMove nl2 inst2 o2
1021 -- and finally another replace secondary, to the final secondary
1022 (nl4, inst4, _, _) <-
1023 annotateResult "Changing secondary to final secondary" $
1025 applyMove nl3 inst3 o3
1027 il' = Container.add idx inst4 il
1028 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1029 let nodes = Container.elems nl4
1030 -- The fromJust below is ugly (it can fail nastily), but
1031 -- at this point we should not have any internal mismatches,
1032 -- and adding a monad here would be quite involved
1033 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1034 new_cv = compCVNodes grpnodes
1035 return (nl4, il', ops, new_cv)
1037 -- | Computes the nodes in a given group which are available for
1039 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1040 -> IntSet.IntSet -- ^ Nodes that are excluded
1041 -> Gdx -- ^ The group for which we
1043 -> Result [Ndx] -- ^ List of available node indices
-- Fails with 'Bad' when the requested group index is not present in
-- the association list; otherwise filters out the excluded nodes.
-- NOTE(review): the final 'return' line of this function is elided
-- in this listing.
1044 availableGroupNodes group_nodes excl_ndx gdx = do
1045 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1046 Ok (lookup gdx group_nodes)
1047 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1050 -- | Updates the evac solution with the results of an instance
1052 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1054 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1055 -> (Node.List, Instance.List, EvacSolution)
-- A failed evacuation leaves the node/instance lists untouched and
-- records the failure message; a successful one adopts the new lists
-- and prepends the move element and its opcodes to the solution.
-- NOTE(review): the binding of 'new_elem' is partially elided in
-- this listing.
1056 updateEvacSolution (nl, il, es) idx (Bad msg) =
1057 (nl, il, es { esFailed = (idx, msg):esFailed es})
1058 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1059 (nl, il, es { esMoved = new_elem:esMoved es
1060 , esOpCodes = [opcodes]:esOpCodes es })
1061 where inst = Container.find idx il
1063 instancePriGroup nl inst,
1064 Instance.allNodes inst)
1066 -- | Node-evacuation IAllocator mode main function.
1067 tryNodeEvac :: Group.List -- ^ The cluster groups
1068 -> Node.List -- ^ The node list (cluster-wide, not per group)
1069 -> Instance.List -- ^ Instance list (cluster-wide)
1070 -> EvacMode -- ^ The evacuation mode
1071 -> [Idx] -- ^ List of instance (indices) to be evacuated
1072 -> Result (Node.List, Instance.List, EvacSolution)
1073 tryNodeEvac _ ini_nl ini_il mode idxs =
-- Pre-compute the exclusion set (nodes being evacuated plus offline
-- nodes) and the per-group node index lists, then fold over the
-- instances, threading the evolving node/instance lists and the
-- evacuation solution through 'updateEvacSolution'.
1074 let evac_ndx = nodesToEvacuate ini_il mode idxs
1075 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1076 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1077 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1078 (Container.elems nl))) $
1079 splitCluster ini_nl ini_il
1080 (fin_nl, fin_il, esol) =
1081 foldl' (\state@(nl, il, _) inst ->
1082 let gdx = instancePriGroup nl inst
1083 pdx = Instance.pNode inst in
1084 updateEvacSolution state (Instance.idx inst) $
1085 availableGroupNodes group_ndx
1086 (IntSet.insert pdx excl_ndx) gdx >>=
1087 nodeEvacInstance nl il mode inst gdx
1089 (ini_nl, ini_il, emptyEvacSolution)
1090 (map (`Container.find` ini_il) idxs)
1091 in return (fin_nl, fin_il, reverseEvacSolution esol)
1093 -- | Change-group IAllocator mode main function.
1095 -- This is very similar to 'tryNodeEvac', the only difference is that
1096 -- we don't choose as target group the current instance group, but
1099 -- 1. at the start of the function, we compute which are the target
1100 -- groups; either no groups were passed in, in which case we choose
1101 -- all groups out of which we don't evacuate instance, or there were
1102 -- some groups passed, in which case we use those
1104 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1105 -- best group to hold the instance, and then we do what
1106 -- 'tryNodeEvac' does, except for this group instead of the current
1109 -- Note that the correct behaviour of this function relies on the
1110 -- function 'nodeEvacInstance' to be able to do correctly both
1111 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1112 tryChangeGroup :: Group.List -- ^ The cluster groups
1113 -> Node.List -- ^ The node list (cluster-wide)
1114 -> Instance.List -- ^ Instance list (cluster-wide)
1115 -> [Gdx] -- ^ Target groups; if empty, any
1116 -- groups not being evacuated
1117 -> [Idx] -- ^ List of instance (indices) to be evacuated
1118 -> Result (Node.List, Instance.List, EvacSolution)
1119 tryChangeGroup gl ini_nl ini_il gdxs idxs =
-- Target groups are the requested ones (or all groups, when none
-- were requested) minus the groups instances are evacuated from;
-- offline nodes are excluded up front.
1120 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1121 flip Container.find ini_il) idxs
1122 target_gdxs = (if null gdxs
1123 then Container.keys gl
1124 else gdxs) \\ evac_gdxs
1125 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1126 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1127 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1128 (Container.elems nl))) $
1129 splitCluster ini_nl ini_il
1130 (fin_nl, fin_il, esol) =
1131 foldl' (\state@(nl, il, _) inst ->
1133 let ncnt = Instance.requiredNodes $
1134 Instance.diskTemplate inst
1135 (gdx, _, _) <- findBestAllocGroup gl nl il
1136 (Just target_gdxs) inst ncnt
1137 av_nodes <- availableGroupNodes group_ndx
1139 nodeEvacInstance nl il ChangeAll inst
1141 in updateEvacSolution state
1142 (Instance.idx inst) solution
1144 (ini_nl, ini_il, emptyEvacSolution)
1145 (map (`Container.find` ini_il) idxs)
1146 in return (fin_nl, fin_il, reverseEvacSolution esol)
1148 -- | Recursively place instances on the cluster until we're out of space.
1149 iterateAlloc :: Node.List
1152 -> Instance.Instance
1154 -> [Instance.Instance]
1156 -> Result AllocResult
-- Each round clones the template instance under the name "new-N"
-- (N = recursion depth), decrements the optional allocation limit,
-- and recurses while 'tryAlloc' keeps producing solutions.
-- NOTE(review): several signature lines and the failure/stop
-- branches are elided in this listing.
1157 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1158 let depth = length ixes
1159 newname = printf "new-%d" depth::String
1160 newidx = length (Container.elems il) + depth
1161 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1162 newlimit = fmap (flip (-) 1) limit
1163 in case tryAlloc nl il newi2 allocnodes of
1165 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1166 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1169 Just (xnl, xi, _, _) ->
1172 else iterateAlloc xnl (Container.add newidx xi il)
1173 newlimit newinst allocnodes (xi:ixes)
1174 (totalResources xnl:cstats)
1176 -- | The core of the tiered allocation mode.
1177 tieredAlloc :: Node.List
1180 -> Instance.Instance
1182 -> [Instance.Instance]
1184 -> Result AllocResult
-- Allocate at the current instance spec until 'iterateAlloc' stops,
-- then shrink the instance along the most frequent failure dimension
-- (the last entry of the count-sorted failure list) and retry, until
-- either the limit is reached or the instance cannot shrink further.
-- NOTE(review): several signature lines and the 'Bad' branches are
-- elided in this listing.
1185 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1186 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1188 Ok (errs, nl', il', ixes', cstats') ->
1189 let newsol = Ok (errs, nl', il', ixes', cstats')
1190 ixes_cnt = length ixes'
1191 (stop, newlimit) = case limit of
1192 Nothing -> (False, Nothing)
1193 Just n -> (n <= ixes_cnt,
1194 Just (n - ixes_cnt)) in
1195 if stop then newsol else
1196 case Instance.shrinkByType newinst . fst . last $
1197 sortBy (comparing snd) errs of
1199 Ok newinst' -> tieredAlloc nl' il' newlimit
1200 newinst' allocnodes ixes' cstats'
1202 -- * Formatting functions
1204 -- | Given the original and final nodes, computes the relocation description.
1205 computeMoves :: Instance.Instance -- ^ The instance to be moved
1206 -> String -- ^ The instance name
1207 -> IMove -- ^ The move being performed
1208 -> String -- ^ New primary
1209 -> String -- ^ New secondary
1210 -> (String, [String])
1211 -- ^ Tuple of moves and commands list; moves contains
1212 -- either @/f/@ for failover or @/r:name/@ for replace
1213 -- secondary, while the command list holds gnt-instance
1214 -- commands (without that prefix), e.g \"@failover instance1@\"
1215 computeMoves i inam mv c d =
-- Each move kind maps to its short description and the equivalent
-- gnt-instance sub-commands; "migrate" vs "failover" is chosen from
-- the instance's running state.
-- NOTE(review): the scrutinee line of this case expression is elided
-- in this listing.
1217 Failover -> ("f", [mig])
1218 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1219 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1220 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1221 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1222 where morf = if Instance.running i then "migrate" else "failover"
1223 mig = printf "%s -f %s" morf inam::String
1224 rep n = printf "replace-disks -n %s %s" n inam
1226 -- | Converts a placement to string format.
1227 printSolutionLine :: Node.List -- ^ The node list
1228 -> Instance.List -- ^ The instance list
1229 -> Int -- ^ Maximum node name length
1230 -> Int -- ^ Maximum instance name length
1231 -> Placement -- ^ The current placement
1232 -> Int -- ^ The index of the placement in
1234 -> (String, [String])
-- Renders one placement row: old/new primary:secondary aliases plus
-- the short move description and the shell commands computed by
-- 'computeMoves'.
-- NOTE(review): the 'let' opener and the tail of the printf call are
-- elided in this listing.
1235 printSolutionLine nl il nmlen imlen plc pos =
1237 pmlen = (2*nmlen + 1)
1238 (i, p, s, mv, c) = plc
1239 inst = Container.find i il
1240 inam = Instance.alias inst
1241 npri = Node.alias $ Container.find p nl
1242 nsec = Node.alias $ Container.find s nl
1243 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1244 osec = Node.alias $ Container.find (Instance.sNode inst) nl
1245 (moves, cmds) = computeMoves inst inam mv npri nsec
1246 ostr = printf "%s:%s" opri osec::String
1247 nstr = printf "%s:%s" npri nsec::String
1249 (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
1250 pos imlen inam pmlen ostr
-- | Return all node indices touched by an instance move.
--
-- The result contains the placement's new primary and secondary
-- nodes plus every node of the (original) instance, de-duplicated;
-- its length is therefore not fixed or guaranteed.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and the
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il (inst_idx, new_pri, new_sec, _, _) =
  nub $ new_pri : new_sec : Instance.allNodes (Container.find inst_idx il)
-- | Inner fold step for 'splitJobs': append the next move to the
-- current jobset when it touches no node already used by that set,
-- otherwise start a fresh jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs (jobsets, busy) job@(touched, _, _, _) =
  case jobsets of
    [] -> ([[job]], touched)
    current : rest
      -- independent of the running set: extend it and widen the
      -- busy-node buffer
      | null (touched `intersect` busy) ->
          ((job : current) : rest, touched ++ busy)
      -- conflicting nodes: open a new jobset, resetting the buffer
      | otherwise -> ([job] : jobsets, touched)
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
-- foldl' (instead of the previous lazy foldl) keeps the accumulator
-- evaluated, avoiding a thunk chain on long move lists; it also
-- matches the strict folds used everywhere else in this module.
splitJobs = fst . foldl' mergeJobs ([], [])
1285 -- | Given a list of commands, prefix them with @gnt-instance@ and
1286 -- also beautify the display a little.
1287 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
-- Emits an "echo job N/M" marker followed by the indented
-- gnt-instance commands; the first job of a set is preceded by a
-- jobset banner.
-- NOTE(review): the 'let'/'if' scaffolding lines are elided in this
-- listing.
1288 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1290 printf " echo job %d/%d" jsn sn:
1292 map (" gnt-instance " ++) cmds
1294 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1297 -- | Given a list of commands, prefix them with @gnt-instance@ and
1298 -- also beautify the display a little.
1299 formatCmds :: [JobSet] -> String
-- NOTE(review): the body of 'formatCmds' is mostly elided in this
-- listing; only the per-jobset expansion via 'formatJob' is visible.
1302 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1306 -- | Print the node list.
1307 printNodes :: Node.List -> [String] -> String
-- Field selection: empty list means the default fields, a leading
-- "+" appends to the defaults; nodes are sorted by index and laid
-- out via 'formatTable'.
-- NOTE(review): the equation head and the catch-all field case are
-- elided in this listing.
1309 let fields = case fs of
1310 [] -> Node.defaultFields
1311 "+":rest -> Node.defaultFields ++ rest
1313 snl = sortBy (comparing Node.idx) (Container.elems nl)
1314 (header, isnum) = unzip $ map Node.showHeader fields
1315 in unlines . map ((:) ' ' . intercalate " ") $
1316 formatTable (header:map (Node.list fields) snl) isnum
1318 -- | Print the instance list.
1319 printInsts :: Node.List -> Instance.List -> String
-- One row per instance (sorted by index): running flag, name,
-- primary/secondary node names, auto-balance flag and resource
-- columns, rendered through 'formatTable'.
-- NOTE(review): the equation head and the dynamic-utilisation printf
-- lines are elided in this listing.
1321 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1322 helper inst = [ if Instance.running inst then "R" else " "
1323 , Instance.name inst
1324 , Container.nameOf nl (Instance.pNode inst)
1325 , let sdx = Instance.sNode inst
1326 in if sdx == Node.noSecondary
1328 else Container.nameOf nl sdx
1329 , if Instance.autoBalance inst then "Y" else "N"
1330 , printf "%3d" $ Instance.vcpus inst
1331 , printf "%5d" $ Instance.mem inst
1332 , printf "%5d" $ Instance.dsk inst `div` 1024
1338 where DynUtil lC lM lD lN = Instance.util inst
1339 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1340 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1341 isnum = False:False:False:False:False:repeat True
1342 in unlines . map ((:) ' ' . intercalate " ") $
1343 formatTable (header:map helper sil) isnum
1345 -- | Shows statistics for a given node list.
1346 printStats :: Node.List -> String
-- Pairs each detailed coverage metric with its weight and name
-- (padding with weight 1 / name "unknown" when the lists differ in
-- length) and renders them as "name=value(xweight)" items.
-- NOTE(review): the equation head is elided in this listing.
1348 let dcvs = compDetailedCV $ Container.elems nl
1349 (weights, names) = unzip detailedCVInfo
1350 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1351 formatted = map (\(w, header, val) ->
1352 printf "%s=%.8f(x%.2f)" header val w::String) hd
1353 in intercalate ", " formatted
1355 -- | Convert a placement into a list of OpCodes (basically a job).
1356 iMoveToJob :: Node.List -- ^ The node list; only used for node
1357 -- names, so any version is good
1358 -- (before or after the operation)
1359 -> Instance.List -- ^ The instance list; also used for
1361 -> Idx -- ^ The index of the instance being
1363 -> IMove -- ^ The actual move to be described
1364 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
-- opF is a (live) migrate opcode, opR a replace-disks opcode
-- targeting a named new secondary; each move kind expands to the
-- appropriate sequence of the two.
-- NOTE(review): the 'case move of' scrutinee line (and the plain
-- Failover arm) are elided in this listing.
1366 iMoveToJob nl il idx move =
1367 let inst = Container.find idx il
1368 iname = Instance.name inst
1369 lookNode = Just . Container.nameOf nl
1370 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1371 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1372 OpCodes.ReplaceNewSecondary [] Nothing
1375 ReplacePrimary np -> [ opF, opR np, opF ]
1376 ReplaceSecondary ns -> [ opR ns ]
1377 ReplaceAndFailover np -> [ opR np, opF ]
1378 FailoverAndReplace ns -> [ opF, opR ns ]
1380 -- * Node group functions
1382 -- | Computes the group of an instance.
1383 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
-- Fails when the primary and secondary nodes belong to different
-- node groups; instances without a secondary compare the primary's
-- group against itself and therefore always succeed.
-- NOTE(review): the 'then'/'else' lines of the snode binding and the
-- success branch are elided in this listing.
1384 instanceGroup nl i =
1385 let sidx = Instance.sNode i
1386 pnode = Container.find (Instance.pNode i) nl
1387 snode = if sidx == Node.noSecondary
1389 else Container.find sidx nl
1390 pgroup = Node.group pnode
1391 sgroup = Node.group snode
1392 in if pgroup /= sgroup
1393 then fail ("Instance placed accross two node groups, primary " ++
1394 show pgroup ++ ", secondary " ++ show sgroup)
1397 -- | Computes the group of an instance per the primary node.
1398 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
-- NOTE(review): the result expression (presumably the group of the
-- looked-up primary node) is elided in this listing.
1399 instancePriGroup nl i =
1400 let pnode = Container.find (Instance.pNode i) nl
-- | Compute the list of badly allocated instances, i.e. those for
-- which 'instanceGroup' reports a placement spanning node groups.
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  [ inst | inst <- Container.elems il
         , not (isOk (instanceGroup nl inst)) ]
-- | Splits a cluster into its component node groups.
--
-- Each group is paired with a node container holding only its own
-- nodes and an instance container holding the instances whose
-- primary node lives in that group.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il = map perGroup groups
  where
    groups = Node.computeGroups (Container.elems nl)
    -- build the (nodes, instances) pair for one group
    perGroup (gdx, nodes) =
      let indices = map Node.idx nodes
          insts = Container.filter ((`elem` indices) . Instance.pNode) il
      in (gdx, (Container.fromList (zip indices nodes), insts))
1420 -- | Compute the list of nodes that are to be evacuated, given a list
1421 -- of instances and an evacuation mode.
1422 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1423 -> EvacMode -- ^ The evacuation mode we're using
1424 -> [Idx] -- ^ List of instance indices being evacuated
1425 -> IntSet.IntSet -- ^ Set of node indices
1426 nodesToEvacuate il mode =
1427 IntSet.delete Node.noSecondary .
1429 let i = Container.find idx il
1430 pdx = Instance.pNode i
1431 sdx = Instance.sNode i
1432 dt = Instance.diskTemplate i
1433 withSecondary = case dt of
1434 DTDrbd8 -> IntSet.insert sdx ns
1437 ChangePrimary -> IntSet.insert pdx ns
1438 ChangeSecondary -> withSecondary
1439 ChangeAll -> IntSet.insert pdx withSecondary