1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
39 -- * Generic functions
41 , computeAllocationDelta
42 -- * First phase functions
44 -- * Second phase functions
49 -- * Display functions
52 -- * Balacing functions
61 -- * IAllocator functions
68 -- * Allocation functions
71 -- * Node group functions
77 import qualified Data.IntSet as IntSet
79 import Data.Maybe (fromJust, isNothing)
80 import Data.Ord (comparing)
81 import Text.Printf (printf)
83 import qualified Ganeti.HTools.Container as Container
84 import qualified Ganeti.HTools.Instance as Instance
85 import qualified Ganeti.HTools.Node as Node
86 import qualified Ganeti.HTools.Group as Group
87 import Ganeti.HTools.Types
88 import Ganeti.HTools.Utils
89 import Ganeti.HTools.Compat
90 import qualified Ganeti.OpCodes as OpCodes
94 -- | Allocation\/relocation solution.
95 data AllocSolution = AllocSolution
96 { asFailures :: [FailMode] -- ^ Failure counts
97 , asAllocs :: Int -- ^ Good allocation count
98 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
99 , asLog :: [String] -- ^ Informational messages
102 -- | Node evacuation/group change iallocator result type. This result
103 -- type consists of actual opcodes (a restricted subset) that are
104 -- transmitted back to Ganeti.
105 data EvacSolution = EvacSolution
106 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
107 , esFailed :: [(Idx, String)] -- ^ Instances which were not
109 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
112 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
113 type AllocResult = (FailStats, Node.List, Instance.List,
114 [Instance.Instance], [CStats])
116 -- | A type denoting the valid allocation mode/pairs.
118 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
119 -- for a two-node allocation, this will be a @Right [('Ndx',
120 -- ['Ndx'])]@. In the latter case, the list is basically an
121 -- association list, grouped by primary node and holding the potential
122 -- secondary nodes in the sub-list.
123 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
125 -- | The empty solution we start with when computing allocations.
126 emptyAllocSolution :: AllocSolution
127 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
128 , asSolution = Nothing, asLog = [] }
130 -- | The empty evac solution.
131 emptyEvacSolution :: EvacSolution
132 emptyEvacSolution = EvacSolution { esMoved = []
137 -- | The complete state for the balancing solution.
138 data Table = Table Node.List Instance.List Score [Placement]
139 deriving (Show, Read)
141 -- | Cluster statistics data type.
142 data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
143 , csFdsk :: Integer -- ^ Cluster free disk
144 , csAmem :: Integer -- ^ Cluster allocatable mem
145 , csAdsk :: Integer -- ^ Cluster allocatable disk
146 , csAcpu :: Integer -- ^ Cluster allocatable cpus
147 , csMmem :: Integer -- ^ Max node allocatable mem
148 , csMdsk :: Integer -- ^ Max node allocatable disk
149 , csMcpu :: Integer -- ^ Max node allocatable cpu
150 , csImem :: Integer -- ^ Instance used mem
151 , csIdsk :: Integer -- ^ Instance used disk
152 , csIcpu :: Integer -- ^ Instance used cpu
153 , csTmem :: Double -- ^ Cluster total mem
154 , csTdsk :: Double -- ^ Cluster total disk
155 , csTcpu :: Double -- ^ Cluster total cpus
156 , csVcpu :: Integer -- ^ Cluster virtual cpus (if
157 -- node pCpu has been set,
159 , csXmem :: Integer -- ^ Unnacounted for mem
160 , csNmem :: Integer -- ^ Node own memory
161 , csScore :: Score -- ^ The cluster score
162 , csNinst :: Int -- ^ The total number of instances
164 deriving (Show, Read)
166 -- | Currently used, possibly to allocate, unallocable.
167 type AllocStats = (RSpec, RSpec, RSpec)
169 -- | A simple type for allocation functions.
170 type AllocMethod = Node.List -- ^ Node list
171 -> Instance.List -- ^ Instance list
172 -> Maybe Int -- ^ Optional allocation limit
173 -> Instance.Instance -- ^ Instance spec for allocation
174 -> AllocNodes -- ^ Which nodes we should allocate on
175 -> [Instance.Instance] -- ^ Allocated instances
176 -> [CStats] -- ^ Running cluster stats
177 -> Result AllocResult -- ^ Allocation result
179 -- * Utility functions
181 -- | Verifies the N+1 status and return the affected nodes.
182 verifyN1 :: [Node.Node] -> [Node.Node]
183 verifyN1 = filter Node.failN1
185 {-| Computes the pair of bad nodes and instances.
187 The bad node list is computed via a simple 'verifyN1' check, and the
188 bad instance list is the list of primary and secondary instances of
192 computeBadItems :: Node.List -> Instance.List ->
193 ([Node.Node], [Instance.Instance])
194 computeBadItems nl il =
195 let bad_nodes = verifyN1 $ getOnline nl
196 bad_instances = map (`Container.find` il) .
198 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
200 (bad_nodes, bad_instances)
202 -- | Extracts the node pairs for an instance. This can fail if the
203 -- instance is single-homed. FIXME: this needs to be improved,
204 -- together with the general enhancement for handling non-DRBD moves.
205 instanceNodes :: Node.List -> Instance.Instance ->
206 (Ndx, Ndx, Node.Node, Node.Node)
207 instanceNodes nl inst =
208 let old_pdx = Instance.pNode inst
209 old_sdx = Instance.sNode inst
210 old_p = Container.find old_pdx nl
211 old_s = Container.find old_sdx nl
212 in (old_pdx, old_sdx, old_p, old_s)
214 -- | Zero-initializer for the CStats type.
215 emptyCStats :: CStats
216 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
218 -- | Update stats with data from a new node.
219 updateCStats :: CStats -> Node.Node -> CStats
220 updateCStats cs node =
221 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
222 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
223 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
224 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
225 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
227 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
230 inc_amem = Node.fMem node - Node.rMem node
231 inc_amem' = if inc_amem > 0 then inc_amem else 0
232 inc_adsk = Node.availDisk node
233 inc_imem = truncate (Node.tMem node) - Node.nMem node
234 - Node.xMem node - Node.fMem node
235 inc_icpu = Node.uCpu node
236 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
237 inc_vcpu = Node.hiCpu node
238 inc_acpu = Node.availCpu node
239 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
240 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
241 , csAmem = x_amem + fromIntegral inc_amem'
242 , csAdsk = x_adsk + fromIntegral inc_adsk
243 , csAcpu = x_acpu + fromIntegral inc_acpu
244 , csMmem = max x_mmem (fromIntegral inc_amem')
245 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
246 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
247 , csImem = x_imem + fromIntegral inc_imem
248 , csIdsk = x_idsk + fromIntegral inc_idsk
249 , csIcpu = x_icpu + fromIntegral inc_icpu
250 , csTmem = x_tmem + Node.tMem node
251 , csTdsk = x_tdsk + Node.tDsk node
252 , csTcpu = x_tcpu + Node.tCpu node
253 , csVcpu = x_vcpu + fromIntegral inc_vcpu
254 , csXmem = x_xmem + fromIntegral (Node.xMem node)
255 , csNmem = x_nmem + fromIntegral (Node.nMem node)
256 , csNinst = x_ninst + length (Node.pList node)
259 -- | Compute the total free disk and memory in the cluster.
260 totalResources :: Node.List -> CStats
262 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
263 in cs { csScore = compCV nl }
265 -- | Compute the delta between two cluster state.
267 -- This is used when doing allocations, to understand better the
268 -- available cluster resources. The return value is a triple of the
269 -- current used values, the delta that was still allocated, and what
270 -- was left unallocated.
271 computeAllocationDelta :: CStats -> CStats -> AllocStats
272 computeAllocationDelta cini cfin =
273 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
274 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
275 csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
276 rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
277 (fromIntegral i_idsk)
278 rfin = RSpec (fromIntegral (f_icpu - i_icpu))
279 (fromIntegral (f_imem - i_imem))
280 (fromIntegral (f_idsk - i_idsk))
281 un_cpu = fromIntegral (v_cpu - f_icpu)::Int
282 runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
283 (truncate t_dsk - fromIntegral f_idsk)
284 in (rini, rfin, runa)
286 -- | The names and weights of the individual elements in the CV list.
287 detailedCVInfo :: [(Double, String)]
288 detailedCVInfo = [ (1, "free_mem_cv")
289 , (1, "free_disk_cv")
291 , (1, "reserved_mem_cv")
292 , (4, "offline_all_cnt")
293 , (16, "offline_pri_cnt")
294 , (1, "vcpu_ratio_cv")
297 , (1, "disk_load_cv")
299 , (2, "pri_tags_score")
302 -- | Holds the weights used by 'compCVNodes' for each metric.
303 detailedCVWeights :: [Double]
304 detailedCVWeights = map fst detailedCVInfo
306 -- | Compute the mem and disk covariance.
307 compDetailedCV :: [Node.Node] -> [Double]
308 compDetailedCV all_nodes =
309 let (offline, nodes) = partition Node.offline all_nodes
310 mem_l = map Node.pMem nodes
311 dsk_l = map Node.pDsk nodes
312 -- metric: memory covariance
313 mem_cv = stdDev mem_l
314 -- metric: disk covariance
315 dsk_cv = stdDev dsk_l
316 -- metric: count of instances living on N1 failing nodes
317 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
318 length (Node.pList n)) .
319 filter Node.failN1 $ nodes :: Double
320 res_l = map Node.pRem nodes
321 -- metric: reserved memory covariance
322 res_cv = stdDev res_l
323 -- offline instances metrics
324 offline_ipri = sum . map (length . Node.pList) $ offline
325 offline_isec = sum . map (length . Node.sList) $ offline
326 -- metric: count of instances on offline nodes
327 off_score = fromIntegral (offline_ipri + offline_isec)::Double
328 -- metric: count of primary instances on offline nodes (this
329 -- helps with evacuation/failover of primary instances on
330 -- 2-node clusters with one node offline)
331 off_pri_score = fromIntegral offline_ipri::Double
332 cpu_l = map Node.pCpu nodes
333 -- metric: covariance of vcpu/pcpu ratio
334 cpu_cv = stdDev cpu_l
335 -- metrics: covariance of cpu, memory, disk and network load
336 (c_load, m_load, d_load, n_load) =
338 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
339 DynUtil c2 m2 d2 n2 = Node.utilPool n
340 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
341 -- metric: conflicting instance count
342 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
343 pri_tags_score = fromIntegral pri_tags_inst::Double
344 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
345 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
348 -- | Compute the /total/ variance.
349 compCVNodes :: [Node.Node] -> Double
350 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
352 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
353 compCV :: Node.List -> Double
354 compCV = compCVNodes . Container.elems
356 -- | Compute online nodes from a 'Node.List'.
357 getOnline :: Node.List -> [Node.Node]
358 getOnline = filter (not . Node.offline) . Container.elems
360 -- * Balancing functions
362 -- | Compute best table. Note that the ordering of the arguments is important.
363 compareTables :: Table -> Table -> Table
364 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
365 if a_cv > b_cv then b else a
367 -- | Applies an instance move to a given node list and instance.
368 applyMove :: Node.List -> Instance.Instance
369 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
371 applyMove nl inst Failover =
372 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
373 int_p = Node.removePri old_p inst
374 int_s = Node.removeSec old_s inst
375 new_nl = do -- Maybe monad
376 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
377 new_s <- Node.addSec int_p inst old_sdx
378 let new_inst = Instance.setBoth inst old_sdx old_pdx
379 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
380 new_inst, old_sdx, old_pdx)
383 -- Replace the primary (f:, r:np, f)
384 applyMove nl inst (ReplacePrimary new_pdx) =
385 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
386 tgt_n = Container.find new_pdx nl
387 int_p = Node.removePri old_p inst
388 int_s = Node.removeSec old_s inst
389 force_p = Node.offline old_p
390 new_nl = do -- Maybe monad
391 -- check that the current secondary can host the instance
392 -- during the migration
393 tmp_s <- Node.addPriEx force_p int_s inst
394 let tmp_s' = Node.removePri tmp_s inst
395 new_p <- Node.addPriEx force_p tgt_n inst
396 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
397 let new_inst = Instance.setPri inst new_pdx
398 return (Container.add new_pdx new_p $
399 Container.addTwo old_pdx int_p old_sdx new_s nl,
400 new_inst, new_pdx, old_sdx)
403 -- Replace the secondary (r:ns)
404 applyMove nl inst (ReplaceSecondary new_sdx) =
405 let old_pdx = Instance.pNode inst
406 old_sdx = Instance.sNode inst
407 old_s = Container.find old_sdx nl
408 tgt_n = Container.find new_sdx nl
409 int_s = Node.removeSec old_s inst
410 force_s = Node.offline old_s
411 new_inst = Instance.setSec inst new_sdx
412 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
413 \new_s -> return (Container.addTwo new_sdx
414 new_s old_sdx int_s nl,
415 new_inst, old_pdx, new_sdx)
418 -- Replace the secondary and failover (r:np, f)
419 applyMove nl inst (ReplaceAndFailover new_pdx) =
420 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
421 tgt_n = Container.find new_pdx nl
422 int_p = Node.removePri old_p inst
423 int_s = Node.removeSec old_s inst
424 force_s = Node.offline old_s
425 new_nl = do -- Maybe monad
426 new_p <- Node.addPri tgt_n inst
427 new_s <- Node.addSecEx force_s int_p inst new_pdx
428 let new_inst = Instance.setBoth inst new_pdx old_pdx
429 return (Container.add new_pdx new_p $
430 Container.addTwo old_pdx new_s old_sdx int_s nl,
431 new_inst, new_pdx, old_pdx)
434 -- Failver and replace the secondary (f, r:ns)
435 applyMove nl inst (FailoverAndReplace new_sdx) =
436 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
437 tgt_n = Container.find new_sdx nl
438 int_p = Node.removePri old_p inst
439 int_s = Node.removeSec old_s inst
440 force_p = Node.offline old_p
441 new_nl = do -- Maybe monad
442 new_p <- Node.addPriEx force_p int_s inst
443 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
444 let new_inst = Instance.setBoth inst old_sdx new_sdx
445 return (Container.add new_sdx new_s $
446 Container.addTwo old_sdx new_p old_pdx int_p nl,
447 new_inst, old_sdx, new_sdx)
450 -- | Tries to allocate an instance on one given node.
451 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
452 -> OpResult Node.AllocElement
453 allocateOnSingle nl inst new_pdx =
454 let p = Container.find new_pdx nl
455 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
457 Instance.instMatchesPolicy inst (Node.iPolicy p)
458 new_p <- Node.addPri p inst
459 let new_nl = Container.add new_pdx new_p nl
460 new_score = compCV nl
461 return (new_nl, new_inst, [new_p], new_score)
463 -- | Tries to allocate an instance on a given pair of nodes.
464 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
465 -> OpResult Node.AllocElement
466 allocateOnPair nl inst new_pdx new_sdx =
467 let tgt_p = Container.find new_pdx nl
468 tgt_s = Container.find new_sdx nl
470 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
471 new_p <- Node.addPri tgt_p inst
472 new_s <- Node.addSec tgt_s inst new_pdx
473 let new_inst = Instance.setBoth inst new_pdx new_sdx
474 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
475 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
477 -- | Tries to perform an instance move and returns the best table
478 -- between the original one and the new one.
479 checkSingleStep :: Table -- ^ The original table
480 -> Instance.Instance -- ^ The instance to move
481 -> Table -- ^ The current best table
482 -> IMove -- ^ The move to apply
483 -> Table -- ^ The final best table
484 checkSingleStep ini_tbl target cur_tbl move =
485 let Table ini_nl ini_il _ ini_plc = ini_tbl
486 tmp_resu = applyMove ini_nl target move
489 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
490 let tgt_idx = Instance.idx target
491 upd_cvar = compCV upd_nl
492 upd_il = Container.add tgt_idx new_inst ini_il
493 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
494 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
495 in compareTables cur_tbl upd_tbl
497 -- | Given the status of the current secondary as a valid new node and
498 -- the current candidate target node, generate the possible moves for
500 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
501 -> Bool -- ^ Whether we can change the primary node
502 -> Ndx -- ^ Target node candidate
503 -> [IMove] -- ^ List of valid result moves
505 possibleMoves _ False tdx =
506 [ReplaceSecondary tdx]
508 possibleMoves True True tdx =
509 [ ReplaceSecondary tdx
510 , ReplaceAndFailover tdx
512 , FailoverAndReplace tdx
515 possibleMoves False True tdx =
516 [ ReplaceSecondary tdx
517 , ReplaceAndFailover tdx
520 -- | Compute the best move for a given instance.
521 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
522 -> Bool -- ^ Whether disk moves are allowed
523 -> Bool -- ^ Whether instance moves are allowed
524 -> Table -- ^ Original table
525 -> Instance.Instance -- ^ Instance to move
526 -> Table -- ^ Best new table for this instance
527 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
528 let opdx = Instance.pNode target
529 osdx = Instance.sNode target
530 bad_nodes = [opdx, osdx]
531 nodes = filter (`notElem` bad_nodes) nodes_idx
532 use_secondary = elem osdx nodes_idx && inst_moves
533 aft_failover = if use_secondary -- if allowed to failover
534 then checkSingleStep ini_tbl target ini_tbl Failover
536 all_moves = if disk_moves
538 (possibleMoves use_secondary inst_moves) nodes
541 -- iterate over the possible nodes for this instance
542 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
544 -- | Compute the best next move.
545 checkMove :: [Ndx] -- ^ Allowed target node indices
546 -> Bool -- ^ Whether disk moves are allowed
547 -> Bool -- ^ Whether instance moves are allowed
548 -> Table -- ^ The current solution
549 -> [Instance.Instance] -- ^ List of instances still to move
550 -> Table -- ^ The new solution
551 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
552 let Table _ _ _ ini_plc = ini_tbl
553 -- we're using rwhnf from the Control.Parallel.Strategies
554 -- package; we don't need to use rnf as that would force too
555 -- much evaluation in single-threaded cases, and in
556 -- multi-threaded case the weak head normal form is enough to
557 -- spark the evaluation
558 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
561 -- iterate over all instances, computing the best move
562 best_tbl = foldl' compareTables ini_tbl tables
563 Table _ _ _ best_plc = best_tbl
564 in if length best_plc == length ini_plc
565 then ini_tbl -- no advancement
568 -- | Check if we are allowed to go deeper in the balancing.
569 doNextBalance :: Table -- ^ The starting table
570 -> Int -- ^ Remaining length
571 -> Score -- ^ Score at which to stop
572 -> Bool -- ^ The resulting table and commands
573 doNextBalance ini_tbl max_rounds min_score =
574 let Table _ _ ini_cv ini_plc = ini_tbl
575 ini_plc_len = length ini_plc
576 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
578 -- | Run a balance move.
579 tryBalance :: Table -- ^ The starting table
580 -> Bool -- ^ Allow disk moves
581 -> Bool -- ^ Allow instance moves
582 -> Bool -- ^ Only evacuate moves
583 -> Score -- ^ Min gain threshold
584 -> Score -- ^ Min gain
585 -> Maybe Table -- ^ The resulting table and commands
586 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
587 let Table ini_nl ini_il ini_cv _ = ini_tbl
588 all_inst = Container.elems ini_il
589 all_inst' = if evac_mode
590 then let bad_nodes = map Node.idx . filter Node.offline $
591 Container.elems ini_nl
592 in filter (any (`elem` bad_nodes) . Instance.allNodes)
595 reloc_inst = filter Instance.movable all_inst'
596 node_idx = map Node.idx . filter (not . Node.offline) $
597 Container.elems ini_nl
598 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
599 (Table _ _ fin_cv _) = fin_tbl
601 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
602 then Just fin_tbl -- this round made success, return the new table
605 -- * Allocation functions
607 -- | Build failure stats out of a list of failures.
608 collapseFailures :: [FailMode] -> FailStats
609 collapseFailures flst =
610 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
613 -- | Compares two Maybe AllocElement and chooses the besst score.
614 bestAllocElement :: Maybe Node.AllocElement
615 -> Maybe Node.AllocElement
616 -> Maybe Node.AllocElement
617 bestAllocElement a Nothing = a
618 bestAllocElement Nothing b = b
619 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
620 if ascore < bscore then a else b
622 -- | Update current Allocation solution and failure stats with new
624 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
625 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
627 concatAllocs as (OpGood ns) =
628 let -- Choose the old or new solution, based on the cluster score
630 osols = asSolution as
631 nsols = bestAllocElement osols (Just ns)
633 -- Note: we force evaluation of nsols here in order to keep the
634 -- memory profile low - we know that we will need nsols for sure
635 -- in the next cycle, so we force evaluation of nsols, since the
636 -- foldl' in the caller will only evaluate the tuple, but not the
637 -- elements of the tuple
638 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
640 -- | Sums two 'AllocSolution' structures.
641 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
642 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
643 (AllocSolution bFails bAllocs bSols bLog) =
644 -- note: we add b first, since usually it will be smaller; when
645 -- fold'ing, a will grow and grow whereas b is the per-group
646 -- result, hence smaller
647 let nFails = bFails ++ aFails
648 nAllocs = aAllocs + bAllocs
649 nSols = bestAllocElement aSols bSols
651 in AllocSolution nFails nAllocs nSols nLog
653 -- | Given a solution, generates a reasonable description for it.
654 describeSolution :: AllocSolution -> String
655 describeSolution as =
656 let fcnt = asFailures as
659 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
660 filter ((> 0) . snd) . collapseFailures $ fcnt
662 Nothing -> "No valid allocation solutions, failure reasons: " ++
663 (if null fcnt then "unknown reasons" else freasons)
664 Just (_, _, nodes, cv) ->
665 printf ("score: %.8f, successes %d, failures %d (%s)" ++
666 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
667 (intercalate "/" . map Node.name $ nodes)
669 -- | Annotates a solution with the appropriate string.
670 annotateSolution :: AllocSolution -> AllocSolution
671 annotateSolution as = as { asLog = describeSolution as : asLog as }
673 -- | Reverses an evacuation solution.
675 -- Rationale: we always concat the results to the top of the lists, so
676 -- for proper jobset execution, we should reverse all lists.
677 reverseEvacSolution :: EvacSolution -> EvacSolution
678 reverseEvacSolution (EvacSolution f m o) =
679 EvacSolution (reverse f) (reverse m) (reverse o)
681 -- | Generate the valid node allocation singles or pairs for a new instance.
682 genAllocNodes :: Group.List -- ^ Group list
683 -> Node.List -- ^ The node map
684 -> Int -- ^ The number of nodes required
685 -> Bool -- ^ Whether to drop or not
687 -> Result AllocNodes -- ^ The (monadic) result
688 genAllocNodes gl nl count drop_unalloc =
689 let filter_fn = if drop_unalloc
690 then filter (Group.isAllocable .
691 flip Container.find gl . Node.group)
693 all_nodes = filter_fn $ getOnline nl
694 all_pairs = [(Node.idx p,
695 [Node.idx s | s <- all_nodes,
696 Node.idx p /= Node.idx s,
697 Node.group p == Node.group s]) |
700 1 -> Ok (Left (map Node.idx all_nodes))
701 2 -> Ok (Right (filter (not . null . snd) all_pairs))
702 _ -> Bad "Unsupported number of nodes, only one or two supported"
704 -- | Try to allocate an instance on the cluster.
705 tryAlloc :: (Monad m) =>
706 Node.List -- ^ The node list
707 -> Instance.List -- ^ The instance list
708 -> Instance.Instance -- ^ The instance to allocate
709 -> AllocNodes -- ^ The allocation targets
710 -> m AllocSolution -- ^ Possible solution list
711 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
712 tryAlloc nl _ inst (Right ok_pairs) =
713 let psols = parMap rwhnf (\(p, ss) ->
715 concatAllocs cstate .
716 allocateOnPair nl inst p)
717 emptyAllocSolution ss) ok_pairs
718 sols = foldl' sumAllocs emptyAllocSolution psols
719 in return $ annotateSolution sols
721 tryAlloc _ _ _ (Left []) = fail "No online nodes"
722 tryAlloc nl _ inst (Left all_nodes) =
723 let sols = foldl' (\cstate ->
724 concatAllocs cstate . allocateOnSingle nl inst
725 ) emptyAllocSolution all_nodes
726 in return $ annotateSolution sols
728 -- | Given a group/result, describe it as a nice (list of) messages.
729 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
730 solutionDescription gl (groupId, result) =
732 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
733 Bad message -> [printf "Group %s: error %s" gname message]
734 where grp = Container.find groupId gl
735 gname = Group.name grp
736 pol = allocPolicyToRaw (Group.allocPolicy grp)
738 -- | From a list of possibly bad and possibly empty solutions, filter
739 -- only the groups with a valid result. Note that the result will be
740 -- reversed compared to the original list.
741 filterMGResults :: Group.List
742 -> [(Gdx, Result AllocSolution)]
743 -> [(Gdx, AllocSolution)]
744 filterMGResults gl = foldl' fn []
745 where unallocable = not . Group.isAllocable . flip Container.find gl
746 fn accu (gdx, rasol) =
749 Ok sol | isNothing (asSolution sol) -> accu
750 | unallocable gdx -> accu
751 | otherwise -> (gdx, sol):accu
753 -- | Sort multigroup results based on policy and score.
754 sortMGResults :: Group.List
755 -> [(Gdx, AllocSolution)]
756 -> [(Gdx, AllocSolution)]
757 sortMGResults gl sols =
758 let extractScore (_, _, _, x) = x
759 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
760 (extractScore . fromJust . asSolution) sol)
761 in sortBy (comparing solScore) sols
763 -- | Finds the best group for an instance on a multi-group cluster.
765 -- Only solutions in @preferred@ and @last_resort@ groups will be
766 -- accepted as valid, and additionally if the allowed groups parameter
767 -- is not null then allocation will only be run for those group
769 findBestAllocGroup :: Group.List -- ^ The group list
770 -> Node.List -- ^ The node list
771 -> Instance.List -- ^ The instance list
772 -> Maybe [Gdx] -- ^ The allowed groups
773 -> Instance.Instance -- ^ The instance to allocate
774 -> Int -- ^ Required number of nodes
775 -> Result (Gdx, AllocSolution, [String])
776 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
777 let groups = splitCluster mgnl mgil
778 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
780 sols = map (\(gid, (nl, il)) ->
781 (gid, genAllocNodes mggl nl cnt False >>=
782 tryAlloc nl il inst))
783 groups'::[(Gdx, Result AllocSolution)]
784 all_msgs = concatMap (solutionDescription mggl) sols
785 goodSols = filterMGResults mggl sols
786 sortedSols = sortMGResults mggl goodSols
787 in if null sortedSols
789 then Bad $ "no groups for evacuation: allowed groups was" ++
790 show allowed_gdxs ++ ", all groups: " ++
791 show (map fst groups)
792 else Bad $ intercalate ", " all_msgs
793 else let (final_group, final_sol) = head sortedSols
794 in return (final_group, final_sol, all_msgs)
796 -- | Try to allocate an instance on a multi-group cluster.
797 tryMGAlloc :: Group.List -- ^ The group list
798 -> Node.List -- ^ The node list
799 -> Instance.List -- ^ The instance list
800 -> Instance.Instance -- ^ The instance to allocate
801 -> Int -- ^ Required number of nodes
802 -> Result AllocSolution -- ^ Possible solution list
803 tryMGAlloc mggl mgnl mgil inst cnt = do
804 (best_group, solution, all_msgs) <-
805 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
806 let group_name = Group.name $ Container.find best_group mggl
807 selmsg = "Selected group: " ++ group_name
808 return $ solution { asLog = selmsg:all_msgs }
810 -- | Function which fails if the requested mode is change secondary.
812 -- This is useful since except DRBD, no other disk template can
813 -- execute change secondary; thus, we can just call this function
814 -- instead of always checking for secondary mode. After the call to
815 -- this function, whatever mode we have is just a primary change.
816 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
817 failOnSecondaryChange ChangeSecondary dt =
818 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
819 "' can't execute change secondary"
820 failOnSecondaryChange _ _ = return ()
822 -- | Run evacuation for a single instance.
824 -- /Note:/ this function should correctly execute both intra-group
825 -- evacuations (in all modes) and inter-group evacuations (in the
826 -- 'ChangeAll' mode). Of course, this requires that the correct list
827 -- of target nodes is passed.
828 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
829 -> Instance.List -- ^ Instance list (cluster-wide)
830 -> EvacMode -- ^ The evacuation mode
831 -> Instance.Instance -- ^ The instance to be evacuated
832 -> Gdx -- ^ The group we're targetting
833 -> [Ndx] -- ^ The list of available nodes
835 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
836 nodeEvacInstance _ _ mode (Instance.Instance
837 {Instance.diskTemplate = dt@DTDiskless}) _ _ =
838 failOnSecondaryChange mode dt >>
839 fail "Diskless relocations not implemented yet"
841 nodeEvacInstance _ _ _ (Instance.Instance
842 {Instance.diskTemplate = DTPlain}) _ _ =
843 fail "Instances of type plain cannot be relocated"
845 nodeEvacInstance _ _ _ (Instance.Instance
846 {Instance.diskTemplate = DTFile}) _ _ =
847 fail "Instances of type file cannot be relocated"
849 nodeEvacInstance _ _ mode (Instance.Instance
850 {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
851 failOnSecondaryChange mode dt >>
852 fail "Shared file relocations not implemented yet"
854 nodeEvacInstance _ _ mode (Instance.Instance
855 {Instance.diskTemplate = dt@DTBlock}) _ _ =
856 failOnSecondaryChange mode dt >>
857 fail "Block device relocations not implemented yet"
859 nodeEvacInstance nl il ChangePrimary
860 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
863 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
864 let idx = Instance.idx inst
865 il' = Container.add idx inst' il
866 ops = iMoveToJob nl' il' idx Failover
867 return (nl', il', ops)
869 nodeEvacInstance nl il ChangeSecondary
870 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
873 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
875 foldl' (evacDrbdSecondaryInner nl inst gdx)
876 (Left "no nodes available") avail_nodes
877 let idx = Instance.idx inst
878 il' = Container.add idx inst' il
879 ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
880 return (nl', il', ops)
882 -- The algorithm for ChangeAll is as follows:
884 -- * generate all (primary, secondary) node pairs for the target groups
885 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
886 -- the final node list state and group score
887 -- * select the best choice via a foldl that uses the same Either
888 -- String solution as the ChangeSecondary mode
889 nodeEvacInstance nl il ChangeAll
890 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
893 let no_nodes = Left "no nodes available"
894 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
895 (nl', il', ops, _) <-
896 annotateResult "Can't find any good nodes for relocation" $
899 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
903 -- we don't need more details (which
904 -- nodes, etc.) as we only selected
905 -- this group if we can allocate on
906 -- it, hence failures will not
907 -- propagate out of this fold loop
908 Left _ -> Left $ "Allocation failed: " ++ msg
909 Ok result@(_, _, _, new_cv) ->
910 let new_accu = Right result in
913 Right (_, _, _, old_cv) ->
917 ) no_nodes node_pairs
919 return (nl', il', ops)
921 -- | Inner fold function for changing secondary of a DRBD instance.
923 -- The running solution is either a @Left String@, which means we
924 -- don't have yet a working solution, or a @Right (...)@, which
925 -- represents a valid solution; it holds the modified node list, the
926 -- modified instance (after evacuation), the score of that solution,
927 -- and the new secondary node index.
928 evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
929 -> Instance.Instance -- ^ Instance being evacuated
930 -> Gdx -- ^ The group index of the instance
931 -> Either String ( Node.List
934 , Ndx) -- ^ Current best solution
935 -> Ndx -- ^ Node we're evaluating as new secondary
936 -> Either String ( Node.List
939 , Ndx) -- ^ New best solution
940 evacDrbdSecondaryInner nl inst gdx accu ndx =
941 case applyMove nl inst (ReplaceSecondary ndx) of
945 Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
946 " failed: " ++ show fm
947 OpGood (nl', inst', _, _) ->
948 let nodes = Container.elems nl'
949 -- The fromJust below is ugly (it can fail nastily), but
950 -- at this point we should have any internal mismatches,
951 -- and adding a monad here would be quite involved
952 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
953 new_cv = compCVNodes grpnodes
954 new_accu = Right (nl', inst', new_cv, ndx)
957 Right (_, _, old_cv, _) ->
962 -- | Compute result of changing all nodes of a DRBD instance.
964 -- Given the target primary and secondary node (which might be in a
965 -- different group or not), this function will 'execute' all the
966 -- required steps and assuming all operations succceed, will return
967 -- the modified node and instance lists, the opcodes needed for this
968 -- and the new group score.
969 evacDrbdAllInner :: Node.List -- ^ Cluster node list
970 -> Instance.List -- ^ Cluster instance list
971 -> Instance.Instance -- ^ The instance to be moved
972 -> Gdx -- ^ The target group index
973 -- (which can differ from the
974 -- current group of the
976 -> (Ndx, Ndx) -- ^ Tuple of new
977 -- primary\/secondary nodes
978 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
979 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
980 let primary = Container.find (Instance.pNode inst) nl
981 idx = Instance.idx inst
982 -- if the primary is offline, then we first failover
983 (nl1, inst1, ops1) <-
984 if Node.offline primary
986 (nl', inst', _, _) <-
987 annotateResult "Failing over to the secondary" $
988 opToResult $ applyMove nl inst Failover
989 return (nl', inst', [Failover])
990 else return (nl, inst, [])
991 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
993 ReplaceSecondary t_sdx)
994 -- we now need to execute a replace secondary to the future
996 (nl2, inst2, _, _) <-
997 annotateResult "Changing secondary to new primary" $
999 applyMove nl1 inst1 o1
1001 -- we now execute another failover, the primary stays fixed now
1002 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1003 opToResult $ applyMove nl2 inst2 o2
1005 -- and finally another replace secondary, to the final secondary
1006 (nl4, inst4, _, _) <-
1007 annotateResult "Changing secondary to final secondary" $
1009 applyMove nl3 inst3 o3
1011 il' = Container.add idx inst4 il
1012 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1013 let nodes = Container.elems nl4
1014 -- The fromJust below is ugly (it can fail nastily), but
1015 -- at this point we should have any internal mismatches,
1016 -- and adding a monad here would be quite involved
1017 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1018 new_cv = compCVNodes grpnodes
1019 return (nl4, il', ops, new_cv)
1021 -- | Computes the nodes in a given group which are available for
1023 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1024 -> IntSet.IntSet -- ^ Nodes that are excluded
1025 -> Gdx -- ^ The group for which we
1027 -> Result [Ndx] -- ^ List of available node indices
1028 availableGroupNodes group_nodes excl_ndx gdx = do
1029 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1030 Ok (lookup gdx group_nodes)
1031 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1034 -- | Updates the evac solution with the results of an instance
1036 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1038 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1039 -> (Node.List, Instance.List, EvacSolution)
1040 updateEvacSolution (nl, il, es) idx (Bad msg) =
1041 (nl, il, es { esFailed = (idx, msg):esFailed es})
1042 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1043 (nl, il, es { esMoved = new_elem:esMoved es
1044 , esOpCodes = opcodes:esOpCodes es })
1045 where inst = Container.find idx il
1047 instancePriGroup nl inst,
1048 Instance.allNodes inst)
1050 -- | Node-evacuation IAllocator mode main function.
1051 tryNodeEvac :: Group.List -- ^ The cluster groups
1052 -> Node.List -- ^ The node list (cluster-wide, not per group)
1053 -> Instance.List -- ^ Instance list (cluster-wide)
1054 -> EvacMode -- ^ The evacuation mode
1055 -> [Idx] -- ^ List of instance (indices) to be evacuated
1056 -> Result (Node.List, Instance.List, EvacSolution)
1057 tryNodeEvac _ ini_nl ini_il mode idxs =
1058 let evac_ndx = nodesToEvacuate ini_il mode idxs
1059 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1060 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1061 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1062 (Container.elems nl))) $
1063 splitCluster ini_nl ini_il
1064 (fin_nl, fin_il, esol) =
1065 foldl' (\state@(nl, il, _) inst ->
1066 let gdx = instancePriGroup nl inst
1067 pdx = Instance.pNode inst in
1068 updateEvacSolution state (Instance.idx inst) $
1069 availableGroupNodes group_ndx
1070 (IntSet.insert pdx excl_ndx) gdx >>=
1071 nodeEvacInstance nl il mode inst gdx
1073 (ini_nl, ini_il, emptyEvacSolution)
1074 (map (`Container.find` ini_il) idxs)
1075 in return (fin_nl, fin_il, reverseEvacSolution esol)
1077 -- | Change-group IAllocator mode main function.
1079 -- This is very similar to 'tryNodeEvac', the only difference is that
1080 -- we don't choose as target group the current instance group, but
1083 -- 1. at the start of the function, we compute which are the target
1084 -- groups; either no groups were passed in, in which case we choose
1085 -- all groups out of which we don't evacuate instance, or there were
1086 -- some groups passed, in which case we use those
1088 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1089 -- best group to hold the instance, and then we do what
1090 -- 'tryNodeEvac' does, except for this group instead of the current
1093 -- Note that the correct behaviour of this function relies on the
1094 -- function 'nodeEvacInstance' to be able to do correctly both
1095 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1096 tryChangeGroup :: Group.List -- ^ The cluster groups
1097 -> Node.List -- ^ The node list (cluster-wide)
1098 -> Instance.List -- ^ Instance list (cluster-wide)
1099 -> [Gdx] -- ^ Target groups; if empty, any
1100 -- groups not being evacuated
1101 -> [Idx] -- ^ List of instance (indices) to be evacuated
1102 -> Result (Node.List, Instance.List, EvacSolution)
1103 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1104 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1105 flip Container.find ini_il) idxs
1106 target_gdxs = (if null gdxs
1107 then Container.keys gl
1108 else gdxs) \\ evac_gdxs
1109 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1110 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1111 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1112 (Container.elems nl))) $
1113 splitCluster ini_nl ini_il
1114 (fin_nl, fin_il, esol) =
1115 foldl' (\state@(nl, il, _) inst ->
1117 let ncnt = Instance.requiredNodes $
1118 Instance.diskTemplate inst
1119 (gdx, _, _) <- findBestAllocGroup gl nl il
1120 (Just target_gdxs) inst ncnt
1121 av_nodes <- availableGroupNodes group_ndx
1123 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1124 in updateEvacSolution state (Instance.idx inst) solution
1126 (ini_nl, ini_il, emptyEvacSolution)
1127 (map (`Container.find` ini_il) idxs)
1128 in return (fin_nl, fin_il, reverseEvacSolution esol)
1130 -- | Standard-sized allocation method.
1132 -- This places instances of the same size on the cluster until we're
1133 -- out of space. The result will be a list of identically-sized
1135 iterateAlloc :: AllocMethod
1136 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1137 let depth = length ixes
1138 newname = printf "new-%d" depth::String
1139 newidx = Container.size il
1140 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1141 newlimit = fmap (flip (-) 1) limit
1142 in case tryAlloc nl il newi2 allocnodes of
1144 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1145 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1148 Just (xnl, xi, _, _) ->
1151 else iterateAlloc xnl (Container.add newidx xi il)
1152 newlimit newinst allocnodes (xi:ixes)
1153 (totalResources xnl:cstats)
1155 -- | Tiered allocation method.
1157 -- This places instances on the cluster, and decreases the spec until
1158 -- we can allocate again. The result will be a list of decreasing
1160 tieredAlloc :: AllocMethod
1161 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1162 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1164 Ok (errs, nl', il', ixes', cstats') ->
1165 let newsol = Ok (errs, nl', il', ixes', cstats')
1166 ixes_cnt = length ixes'
1167 (stop, newlimit) = case limit of
1168 Nothing -> (False, Nothing)
1169 Just n -> (n <= ixes_cnt,
1170 Just (n - ixes_cnt)) in
1171 if stop then newsol else
1172 case Instance.shrinkByType newinst . fst . last $
1173 sortBy (comparing snd) errs of
1175 Ok newinst' -> tieredAlloc nl' il' newlimit
1176 newinst' allocnodes ixes' cstats'
1178 -- * Formatting functions
1180 -- | Given the original and final nodes, computes the relocation description.
1181 computeMoves :: Instance.Instance -- ^ The instance to be moved
1182 -> String -- ^ The instance name
1183 -> IMove -- ^ The move being performed
1184 -> String -- ^ New primary
1185 -> String -- ^ New secondary
1186 -> (String, [String])
1187 -- ^ Tuple of moves and commands list; moves is containing
1188 -- either @/f/@ for failover or @/r:name/@ for replace
1189 -- secondary, while the command list holds gnt-instance
1190 -- commands (without that prefix), e.g \"@failover instance1@\"
1191 computeMoves i inam mv c d =
1193 Failover -> ("f", [mig])
1194 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1195 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1196 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1197 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1198 where morf = if Instance.instanceRunning i then "migrate" else "failover"
1199 mig = printf "%s -f %s" morf inam::String
1200 rep n = printf "replace-disks -n %s %s" n inam
1202 -- | Converts a placement to string format.
1203 printSolutionLine :: Node.List -- ^ The node list
1204 -> Instance.List -- ^ The instance list
1205 -> Int -- ^ Maximum node name length
1206 -> Int -- ^ Maximum instance name length
1207 -> Placement -- ^ The current placement
1208 -> Int -- ^ The index of the placement in
1210 -> (String, [String])
1211 printSolutionLine nl il nmlen imlen plc pos =
1212 let pmlen = (2*nmlen + 1)
1213 (i, p, s, mv, c) = plc
1214 inst = Container.find i il
1215 inam = Instance.alias inst
1216 npri = Node.alias $ Container.find p nl
1217 nsec = Node.alias $ Container.find s nl
1218 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1219 osec = Node.alias $ Container.find (Instance.sNode inst) nl
1220 (moves, cmds) = computeMoves inst inam mv npri nsec
1221 ostr = printf "%s:%s" opri osec::String
1222 nstr = printf "%s:%s" npri nsec::String
1223 in (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
1224 pos imlen inam pmlen ostr
1228 -- | Return the instance and involved nodes in an instance move.
1230 -- Note that the output list length can vary, and is not required nor
1231 -- guaranteed to be of any specific length.
1232 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1233 -- the instance from its index; note
1234 -- that this /must/ be the original
1235 -- instance list, so that we can
1236 -- retrieve the old nodes
1237 -> Placement -- ^ The placement we're investigating,
1238 -- containing the new nodes and
1240 -> [Ndx] -- ^ Resulting list of node indices
1241 involvedNodes il plc =
1242 let (i, np, ns, _, _) = plc
1243 inst = Container.find i il
1244 in nub $ [np, ns] ++ Instance.allNodes inst
1246 -- | Inner function for splitJobs, that either appends the next job to
1247 -- the current jobset, or starts a new jobset.
1248 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1249 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1250 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1251 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1252 | otherwise = ([n]:cjs, ndx)
1254 -- | Break a list of moves into independent groups. Note that this
1255 -- will reverse the order of jobs.
1256 splitJobs :: [MoveJob] -> [JobSet]
1257 splitJobs = fst . foldl mergeJobs ([], [])
1259 -- | Given a list of commands, prefix them with @gnt-instance@ and
1260 -- also beautify the display a little.
1261 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1262 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1264 printf " echo job %d/%d" jsn sn:
1266 map (" gnt-instance " ++) cmds
1268 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1271 -- | Given a list of commands, prefix them with @gnt-instance@ and
1272 -- also beautify the display a little.
1273 formatCmds :: [JobSet] -> String
1276 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1280 -- | Print the node list.
1281 printNodes :: Node.List -> [String] -> String
1283 let fields = case fs of
1284 [] -> Node.defaultFields
1285 "+":rest -> Node.defaultFields ++ rest
1287 snl = sortBy (comparing Node.idx) (Container.elems nl)
1288 (header, isnum) = unzip $ map Node.showHeader fields
1289 in unlines . map ((:) ' ' . unwords) $
1290 formatTable (header:map (Node.list fields) snl) isnum
1292 -- | Print the instance list.
1293 printInsts :: Node.List -> Instance.List -> String
1295 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1296 helper inst = [ if Instance.instanceRunning inst then "R" else " "
1297 , Instance.name inst
1298 , Container.nameOf nl (Instance.pNode inst)
1299 , let sdx = Instance.sNode inst
1300 in if sdx == Node.noSecondary
1302 else Container.nameOf nl sdx
1303 , if Instance.autoBalance inst then "Y" else "N"
1304 , printf "%3d" $ Instance.vcpus inst
1305 , printf "%5d" $ Instance.mem inst
1306 , printf "%5d" $ Instance.dsk inst `div` 1024
1312 where DynUtil lC lM lD lN = Instance.util inst
1313 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1314 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1315 isnum = False:False:False:False:False:repeat True
1316 in unlines . map ((:) ' ' . unwords) $
1317 formatTable (header:map helper sil) isnum
1319 -- | Shows statistics for a given node list.
1320 printStats :: Node.List -> String
1322 let dcvs = compDetailedCV $ Container.elems nl
1323 (weights, names) = unzip detailedCVInfo
1324 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1325 formatted = map (\(w, header, val) ->
1326 printf "%s=%.8f(x%.2f)" header val w::String) hd
1327 in intercalate ", " formatted
1329 -- | Convert a placement into a list of OpCodes (basically a job).
1330 iMoveToJob :: Node.List -- ^ The node list; only used for node
1331 -- names, so any version is good
1332 -- (before or after the operation)
1333 -> Instance.List -- ^ The instance list; also used for
1335 -> Idx -- ^ The index of the instance being
1337 -> IMove -- ^ The actual move to be described
1338 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1340 iMoveToJob nl il idx move =
1341 let inst = Container.find idx il
1342 iname = Instance.name inst
1343 lookNode = Just . Container.nameOf nl
1344 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1345 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1346 OpCodes.ReplaceNewSecondary [] Nothing
1349 ReplacePrimary np -> [ opF, opR np, opF ]
1350 ReplaceSecondary ns -> [ opR ns ]
1351 ReplaceAndFailover np -> [ opR np, opF ]
1352 FailoverAndReplace ns -> [ opF, opR ns ]
1354 -- * Node group functions
1356 -- | Computes the group of an instance.
1357 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1358 instanceGroup nl i =
1359 let sidx = Instance.sNode i
1360 pnode = Container.find (Instance.pNode i) nl
1361 snode = if sidx == Node.noSecondary
1363 else Container.find sidx nl
1364 pgroup = Node.group pnode
1365 sgroup = Node.group snode
1366 in if pgroup /= sgroup
1367 then fail ("Instance placed accross two node groups, primary " ++
1368 show pgroup ++ ", secondary " ++ show sgroup)
1371 -- | Computes the group of an instance per the primary node.
1372 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1373 instancePriGroup nl i =
1374 let pnode = Container.find (Instance.pNode i) nl
1377 -- | Compute the list of badly allocated instances (split across node
1379 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1380 findSplitInstances nl =
1381 filter (not . isOk . instanceGroup nl) . Container.elems
1383 -- | Splits a cluster into the component node groups.
1384 splitCluster :: Node.List -> Instance.List ->
1385 [(Gdx, (Node.List, Instance.List))]
1386 splitCluster nl il =
1387 let ngroups = Node.computeGroups (Container.elems nl)
1388 in map (\(guuid, nodes) ->
1389 let nidxs = map Node.idx nodes
1390 nodes' = zip nidxs nodes
1391 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1392 in (guuid, (Container.fromList nodes', instances))) ngroups
1394 -- | Compute the list of nodes that are to be evacuated, given a list
1395 -- of instances and an evacuation mode.
1396 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1397 -> EvacMode -- ^ The evacuation mode we're using
1398 -> [Idx] -- ^ List of instance indices being evacuated
1399 -> IntSet.IntSet -- ^ Set of node indices
1400 nodesToEvacuate il mode =
1401 IntSet.delete Node.noSecondary .
1403 let i = Container.find idx il
1404 pdx = Instance.pNode i
1405 sdx = Instance.sNode i
1406 dt = Instance.diskTemplate i
1407 withSecondary = case dt of
1408 DTDrbd8 -> IntSet.insert sdx ns
1411 ChangePrimary -> IntSet.insert pdx ns
1412 ChangeSecondary -> withSecondary
1413 ChangeAll -> IntSet.insert pdx withSecondary