1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
38 -- * Generic functions
40 , computeAllocationDelta
41 -- * First phase functions
43 -- * Second phase functions
48 -- * Display functions
51 -- * Balacing functions
60 -- * IAllocator functions
67 -- * Allocation functions
70 -- * Node group functions
76 import qualified Data.IntSet as IntSet
78 import Data.Maybe (fromJust, isNothing)
79 import Data.Ord (comparing)
80 import Text.Printf (printf)
82 import qualified Ganeti.HTools.Container as Container
83 import qualified Ganeti.HTools.Instance as Instance
84 import qualified Ganeti.HTools.Node as Node
85 import qualified Ganeti.HTools.Group as Group
86 import Ganeti.HTools.Types
87 import Ganeti.HTools.Utils
88 import Ganeti.HTools.Compat
89 import qualified Ganeti.OpCodes as OpCodes
93 -- | Allocation\/relocation solution.
94 data AllocSolution = AllocSolution
95 { asFailures :: [FailMode] -- ^ Failure counts
96 , asAllocs :: Int -- ^ Good allocation count
97 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98 , asLog :: [String] -- ^ Informational messages
101 -- | Node evacuation/group change iallocator result type. This result
102 -- type consists of actual opcodes (a restricted subset) that are
103 -- transmitted back to Ganeti.
104 data EvacSolution = EvacSolution
105 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
106 , esFailed :: [(Idx, String)] -- ^ Instances which were not
108 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
111 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
112 type AllocResult = (FailStats, Node.List, Instance.List,
113 [Instance.Instance], [CStats])
115 -- | A type denoting the valid allocation mode/pairs.
117 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
118 -- for a two-node allocation, this will be a @Right [('Ndx',
119 -- ['Ndx'])]@. In the latter case, the list is basically an
120 -- association list, grouped by primary node and holding the potential
121 -- secondary nodes in the sub-list.
122 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
124 -- | The empty solution we start with when computing allocations.
125 emptyAllocSolution :: AllocSolution
126 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
127 , asSolution = Nothing, asLog = [] }
129 -- | The empty evac solution.
130 emptyEvacSolution :: EvacSolution
131 emptyEvacSolution = EvacSolution { esMoved = []
136 -- | The complete state for the balancing solution.
137 data Table = Table Node.List Instance.List Score [Placement]
138 deriving (Show, Read)
140 -- | Cluster statistics data type.
142 { csFmem :: Integer -- ^ Cluster free mem
143 , csFdsk :: Integer -- ^ Cluster free disk
144 , csAmem :: Integer -- ^ Cluster allocatable mem
145 , csAdsk :: Integer -- ^ Cluster allocatable disk
146 , csAcpu :: Integer -- ^ Cluster allocatable cpus
147 , csMmem :: Integer -- ^ Max node allocatable mem
148 , csMdsk :: Integer -- ^ Max node allocatable disk
149 , csMcpu :: Integer -- ^ Max node allocatable cpu
150 , csImem :: Integer -- ^ Instance used mem
151 , csIdsk :: Integer -- ^ Instance used disk
152 , csIcpu :: Integer -- ^ Instance used cpu
153 , csTmem :: Double -- ^ Cluster total mem
154 , csTdsk :: Double -- ^ Cluster total disk
155 , csTcpu :: Double -- ^ Cluster total cpus
156 , csVcpu :: Integer -- ^ Cluster total virtual cpus
157 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
158 -- physical CPUs, i.e. normalised used phys CPUs
159 , csXmem :: Integer -- ^ Unnacounted for mem
160 , csNmem :: Integer -- ^ Node own memory
161 , csScore :: Score -- ^ The cluster score
162 , csNinst :: Int -- ^ The total number of instances
163 } deriving (Show, Read)
165 -- | A simple type for allocation functions.
166 type AllocMethod = Node.List -- ^ Node list
167 -> Instance.List -- ^ Instance list
168 -> Maybe Int -- ^ Optional allocation limit
169 -> Instance.Instance -- ^ Instance spec for allocation
170 -> AllocNodes -- ^ Which nodes we should allocate on
171 -> [Instance.Instance] -- ^ Allocated instances
172 -> [CStats] -- ^ Running cluster stats
173 -> Result AllocResult -- ^ Allocation result
175 -- * Utility functions
177 -- | Verifies the N+1 status and return the affected nodes.
178 verifyN1 :: [Node.Node] -> [Node.Node]
179 verifyN1 = filter Node.failN1
181 {-| Computes the pair of bad nodes and instances.
183 The bad node list is computed via a simple 'verifyN1' check, and the
184 bad instance list is the list of primary and secondary instances of
188 computeBadItems :: Node.List -> Instance.List ->
189 ([Node.Node], [Instance.Instance])
190 computeBadItems nl il =
191 let bad_nodes = verifyN1 $ getOnline nl
192 bad_instances = map (`Container.find` il) .
194 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
196 (bad_nodes, bad_instances)
198 -- | Extracts the node pairs for an instance. This can fail if the
199 -- instance is single-homed. FIXME: this needs to be improved,
200 -- together with the general enhancement for handling non-DRBD moves.
201 instanceNodes :: Node.List -> Instance.Instance ->
202 (Ndx, Ndx, Node.Node, Node.Node)
203 instanceNodes nl inst =
204 let old_pdx = Instance.pNode inst
205 old_sdx = Instance.sNode inst
206 old_p = Container.find old_pdx nl
207 old_s = Container.find old_sdx nl
208 in (old_pdx, old_sdx, old_p, old_s)
210 -- | Zero-initializer for the CStats type.
211 emptyCStats :: CStats
212 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
214 -- | Update stats with data from a new node.
215 updateCStats :: CStats -> Node.Node -> CStats
216 updateCStats cs node =
217 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
218 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
219 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
220 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
221 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
222 csVcpu = x_vcpu, csNcpu = x_ncpu,
223 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
226 inc_amem = Node.fMem node - Node.rMem node
227 inc_amem' = if inc_amem > 0 then inc_amem else 0
228 inc_adsk = Node.availDisk node
229 inc_imem = truncate (Node.tMem node) - Node.nMem node
230 - Node.xMem node - Node.fMem node
231 inc_icpu = Node.uCpu node
232 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
233 inc_vcpu = Node.hiCpu node
234 inc_acpu = Node.availCpu node
235 inc_ncpu = fromIntegral (Node.uCpu node) /
236 iPolicyVcpuRatio (Node.iPolicy node)
237 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
238 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
239 , csAmem = x_amem + fromIntegral inc_amem'
240 , csAdsk = x_adsk + fromIntegral inc_adsk
241 , csAcpu = x_acpu + fromIntegral inc_acpu
242 , csMmem = max x_mmem (fromIntegral inc_amem')
243 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
244 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
245 , csImem = x_imem + fromIntegral inc_imem
246 , csIdsk = x_idsk + fromIntegral inc_idsk
247 , csIcpu = x_icpu + fromIntegral inc_icpu
248 , csTmem = x_tmem + Node.tMem node
249 , csTdsk = x_tdsk + Node.tDsk node
250 , csTcpu = x_tcpu + Node.tCpu node
251 , csVcpu = x_vcpu + fromIntegral inc_vcpu
252 , csNcpu = x_ncpu + inc_ncpu
253 , csXmem = x_xmem + fromIntegral (Node.xMem node)
254 , csNmem = x_nmem + fromIntegral (Node.nMem node)
255 , csNinst = x_ninst + length (Node.pList node)
258 -- | Compute the total free disk and memory in the cluster.
259 totalResources :: Node.List -> CStats
261 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
262 in cs { csScore = compCV nl }
264 -- | Compute the delta between two cluster state.
266 -- This is used when doing allocations, to understand better the
267 -- available cluster resources. The return value is a triple of the
268 -- current used values, the delta that was still allocated, and what
269 -- was left unallocated.
270 computeAllocationDelta :: CStats -> CStats -> AllocStats
271 computeAllocationDelta cini cfin =
272 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
273 csNcpu = i_ncpu } = cini
274 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
275 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
276 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
277 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
278 , allocInfoNCpus = i_ncpu
279 , allocInfoMem = fromIntegral i_imem
280 , allocInfoDisk = fromIntegral i_idsk
282 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
283 , allocInfoNCpus = f_ncpu - i_ncpu
284 , allocInfoMem = fromIntegral (f_imem - i_imem)
285 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
287 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
288 , allocInfoNCpus = f_tcpu - f_ncpu
289 , allocInfoMem = truncate t_mem - fromIntegral f_imem
290 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
292 in (rini, rfin, runa)
294 -- | The names and weights of the individual elements in the CV list.
295 detailedCVInfo :: [(Double, String)]
296 detailedCVInfo = [ (1, "free_mem_cv")
297 , (1, "free_disk_cv")
299 , (1, "reserved_mem_cv")
300 , (4, "offline_all_cnt")
301 , (16, "offline_pri_cnt")
302 , (1, "vcpu_ratio_cv")
305 , (1, "disk_load_cv")
307 , (2, "pri_tags_score")
311 -- | Holds the weights used by 'compCVNodes' for each metric.
312 detailedCVWeights :: [Double]
313 detailedCVWeights = map fst detailedCVInfo
315 -- | Compute the mem and disk covariance.
316 compDetailedCV :: [Node.Node] -> [Double]
317 compDetailedCV all_nodes =
318 let (offline, nodes) = partition Node.offline all_nodes
319 mem_l = map Node.pMem nodes
320 dsk_l = map Node.pDsk nodes
321 -- metric: memory covariance
322 mem_cv = stdDev mem_l
323 -- metric: disk covariance
324 dsk_cv = stdDev dsk_l
325 -- metric: count of instances living on N1 failing nodes
326 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
327 length (Node.pList n)) .
328 filter Node.failN1 $ nodes :: Double
329 res_l = map Node.pRem nodes
330 -- metric: reserved memory covariance
331 res_cv = stdDev res_l
332 -- offline instances metrics
333 offline_ipri = sum . map (length . Node.pList) $ offline
334 offline_isec = sum . map (length . Node.sList) $ offline
335 -- metric: count of instances on offline nodes
336 off_score = fromIntegral (offline_ipri + offline_isec)::Double
337 -- metric: count of primary instances on offline nodes (this
338 -- helps with evacuation/failover of primary instances on
339 -- 2-node clusters with one node offline)
340 off_pri_score = fromIntegral offline_ipri::Double
341 cpu_l = map Node.pCpu nodes
342 -- metric: covariance of vcpu/pcpu ratio
343 cpu_cv = stdDev cpu_l
344 -- metrics: covariance of cpu, memory, disk and network load
345 (c_load, m_load, d_load, n_load) =
347 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
348 DynUtil c2 m2 d2 n2 = Node.utilPool n
349 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
350 -- metric: conflicting instance count
351 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
352 pri_tags_score = fromIntegral pri_tags_inst::Double
353 -- metric: spindles %
354 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
355 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
356 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
357 , pri_tags_score, stdDev spindles_cv ]
359 -- | Compute the /total/ variance.
360 compCVNodes :: [Node.Node] -> Double
361 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
363 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
364 compCV :: Node.List -> Double
365 compCV = compCVNodes . Container.elems
367 -- | Compute online nodes from a 'Node.List'.
368 getOnline :: Node.List -> [Node.Node]
369 getOnline = filter (not . Node.offline) . Container.elems
371 -- * Balancing functions
373 -- | Compute best table. Note that the ordering of the arguments is important.
374 compareTables :: Table -> Table -> Table
375 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
376 if a_cv > b_cv then b else a
378 -- | Applies an instance move to a given node list and instance.
379 applyMove :: Node.List -> Instance.Instance
380 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
382 applyMove nl inst Failover =
383 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
384 int_p = Node.removePri old_p inst
385 int_s = Node.removeSec old_s inst
386 new_nl = do -- Maybe monad
387 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
388 new_s <- Node.addSec int_p inst old_sdx
389 let new_inst = Instance.setBoth inst old_sdx old_pdx
390 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
391 new_inst, old_sdx, old_pdx)
394 -- Replace the primary (f:, r:np, f)
395 applyMove nl inst (ReplacePrimary new_pdx) =
396 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
397 tgt_n = Container.find new_pdx nl
398 int_p = Node.removePri old_p inst
399 int_s = Node.removeSec old_s inst
400 force_p = Node.offline old_p
401 new_nl = do -- Maybe monad
402 -- check that the current secondary can host the instance
403 -- during the migration
404 tmp_s <- Node.addPriEx force_p int_s inst
405 let tmp_s' = Node.removePri tmp_s inst
406 new_p <- Node.addPriEx force_p tgt_n inst
407 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
408 let new_inst = Instance.setPri inst new_pdx
409 return (Container.add new_pdx new_p $
410 Container.addTwo old_pdx int_p old_sdx new_s nl,
411 new_inst, new_pdx, old_sdx)
414 -- Replace the secondary (r:ns)
415 applyMove nl inst (ReplaceSecondary new_sdx) =
416 let old_pdx = Instance.pNode inst
417 old_sdx = Instance.sNode inst
418 old_s = Container.find old_sdx nl
419 tgt_n = Container.find new_sdx nl
420 int_s = Node.removeSec old_s inst
421 force_s = Node.offline old_s
422 new_inst = Instance.setSec inst new_sdx
423 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
424 \new_s -> return (Container.addTwo new_sdx
425 new_s old_sdx int_s nl,
426 new_inst, old_pdx, new_sdx)
429 -- Replace the secondary and failover (r:np, f)
430 applyMove nl inst (ReplaceAndFailover new_pdx) =
431 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
432 tgt_n = Container.find new_pdx nl
433 int_p = Node.removePri old_p inst
434 int_s = Node.removeSec old_s inst
435 force_s = Node.offline old_s
436 new_nl = do -- Maybe monad
437 new_p <- Node.addPri tgt_n inst
438 new_s <- Node.addSecEx force_s int_p inst new_pdx
439 let new_inst = Instance.setBoth inst new_pdx old_pdx
440 return (Container.add new_pdx new_p $
441 Container.addTwo old_pdx new_s old_sdx int_s nl,
442 new_inst, new_pdx, old_pdx)
445 -- Failver and replace the secondary (f, r:ns)
446 applyMove nl inst (FailoverAndReplace new_sdx) =
447 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
448 tgt_n = Container.find new_sdx nl
449 int_p = Node.removePri old_p inst
450 int_s = Node.removeSec old_s inst
451 force_p = Node.offline old_p
452 new_nl = do -- Maybe monad
453 new_p <- Node.addPriEx force_p int_s inst
454 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
455 let new_inst = Instance.setBoth inst old_sdx new_sdx
456 return (Container.add new_sdx new_s $
457 Container.addTwo old_sdx new_p old_pdx int_p nl,
458 new_inst, old_sdx, new_sdx)
461 -- | Tries to allocate an instance on one given node.
462 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
463 -> OpResult Node.AllocElement
464 allocateOnSingle nl inst new_pdx =
465 let p = Container.find new_pdx nl
466 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
468 Instance.instMatchesPolicy inst (Node.iPolicy p)
469 new_p <- Node.addPri p inst
470 let new_nl = Container.add new_pdx new_p nl
471 new_score = compCV nl
472 return (new_nl, new_inst, [new_p], new_score)
474 -- | Tries to allocate an instance on a given pair of nodes.
475 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
476 -> OpResult Node.AllocElement
477 allocateOnPair nl inst new_pdx new_sdx =
478 let tgt_p = Container.find new_pdx nl
479 tgt_s = Container.find new_sdx nl
481 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
482 new_p <- Node.addPri tgt_p inst
483 new_s <- Node.addSec tgt_s inst new_pdx
484 let new_inst = Instance.setBoth inst new_pdx new_sdx
485 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
486 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
488 -- | Tries to perform an instance move and returns the best table
489 -- between the original one and the new one.
490 checkSingleStep :: Table -- ^ The original table
491 -> Instance.Instance -- ^ The instance to move
492 -> Table -- ^ The current best table
493 -> IMove -- ^ The move to apply
494 -> Table -- ^ The final best table
495 checkSingleStep ini_tbl target cur_tbl move =
496 let Table ini_nl ini_il _ ini_plc = ini_tbl
497 tmp_resu = applyMove ini_nl target move
500 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
501 let tgt_idx = Instance.idx target
502 upd_cvar = compCV upd_nl
503 upd_il = Container.add tgt_idx new_inst ini_il
504 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
505 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
506 in compareTables cur_tbl upd_tbl
508 -- | Given the status of the current secondary as a valid new node and
509 -- the current candidate target node, generate the possible moves for
511 possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node
512 -> Bool -- ^ Whether we can change the primary node
513 -> Ndx -- ^ Target node candidate
514 -> [IMove] -- ^ List of valid result moves
516 possibleMoves _ False tdx =
517 [ReplaceSecondary tdx]
519 possibleMoves True True tdx =
520 [ ReplaceSecondary tdx
521 , ReplaceAndFailover tdx
523 , FailoverAndReplace tdx
526 possibleMoves False True tdx =
527 [ ReplaceSecondary tdx
528 , ReplaceAndFailover tdx
531 -- | Compute the best move for a given instance.
532 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
533 -> Bool -- ^ Whether disk moves are allowed
534 -> Bool -- ^ Whether instance moves are allowed
535 -> Table -- ^ Original table
536 -> Instance.Instance -- ^ Instance to move
537 -> Table -- ^ Best new table for this instance
538 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
539 let opdx = Instance.pNode target
540 osdx = Instance.sNode target
541 bad_nodes = [opdx, osdx]
542 nodes = filter (`notElem` bad_nodes) nodes_idx
543 use_secondary = elem osdx nodes_idx && inst_moves
544 aft_failover = if use_secondary -- if allowed to failover
545 then checkSingleStep ini_tbl target ini_tbl Failover
547 all_moves = if disk_moves
549 (possibleMoves use_secondary inst_moves) nodes
552 -- iterate over the possible nodes for this instance
553 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
555 -- | Compute the best next move.
556 checkMove :: [Ndx] -- ^ Allowed target node indices
557 -> Bool -- ^ Whether disk moves are allowed
558 -> Bool -- ^ Whether instance moves are allowed
559 -> Table -- ^ The current solution
560 -> [Instance.Instance] -- ^ List of instances still to move
561 -> Table -- ^ The new solution
562 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
563 let Table _ _ _ ini_plc = ini_tbl
564 -- we're using rwhnf from the Control.Parallel.Strategies
565 -- package; we don't need to use rnf as that would force too
566 -- much evaluation in single-threaded cases, and in
567 -- multi-threaded case the weak head normal form is enough to
568 -- spark the evaluation
569 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
572 -- iterate over all instances, computing the best move
573 best_tbl = foldl' compareTables ini_tbl tables
574 Table _ _ _ best_plc = best_tbl
575 in if length best_plc == length ini_plc
576 then ini_tbl -- no advancement
579 -- | Check if we are allowed to go deeper in the balancing.
580 doNextBalance :: Table -- ^ The starting table
581 -> Int -- ^ Remaining length
582 -> Score -- ^ Score at which to stop
583 -> Bool -- ^ The resulting table and commands
584 doNextBalance ini_tbl max_rounds min_score =
585 let Table _ _ ini_cv ini_plc = ini_tbl
586 ini_plc_len = length ini_plc
587 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
589 -- | Run a balance move.
590 tryBalance :: Table -- ^ The starting table
591 -> Bool -- ^ Allow disk moves
592 -> Bool -- ^ Allow instance moves
593 -> Bool -- ^ Only evacuate moves
594 -> Score -- ^ Min gain threshold
595 -> Score -- ^ Min gain
596 -> Maybe Table -- ^ The resulting table and commands
597 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
598 let Table ini_nl ini_il ini_cv _ = ini_tbl
599 all_inst = Container.elems ini_il
600 all_inst' = if evac_mode
601 then let bad_nodes = map Node.idx . filter Node.offline $
602 Container.elems ini_nl
603 in filter (any (`elem` bad_nodes) . Instance.allNodes)
606 reloc_inst = filter Instance.movable all_inst'
607 node_idx = map Node.idx . filter (not . Node.offline) $
608 Container.elems ini_nl
609 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
610 (Table _ _ fin_cv _) = fin_tbl
612 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
613 then Just fin_tbl -- this round made success, return the new table
616 -- * Allocation functions
618 -- | Build failure stats out of a list of failures.
619 collapseFailures :: [FailMode] -> FailStats
620 collapseFailures flst =
621 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
624 -- | Compares two Maybe AllocElement and chooses the besst score.
625 bestAllocElement :: Maybe Node.AllocElement
626 -> Maybe Node.AllocElement
627 -> Maybe Node.AllocElement
628 bestAllocElement a Nothing = a
629 bestAllocElement Nothing b = b
630 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
631 if ascore < bscore then a else b
633 -- | Update current Allocation solution and failure stats with new
635 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
636 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
638 concatAllocs as (OpGood ns) =
639 let -- Choose the old or new solution, based on the cluster score
641 osols = asSolution as
642 nsols = bestAllocElement osols (Just ns)
644 -- Note: we force evaluation of nsols here in order to keep the
645 -- memory profile low - we know that we will need nsols for sure
646 -- in the next cycle, so we force evaluation of nsols, since the
647 -- foldl' in the caller will only evaluate the tuple, but not the
648 -- elements of the tuple
649 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
651 -- | Sums two 'AllocSolution' structures.
652 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
653 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
654 (AllocSolution bFails bAllocs bSols bLog) =
655 -- note: we add b first, since usually it will be smaller; when
656 -- fold'ing, a will grow and grow whereas b is the per-group
657 -- result, hence smaller
658 let nFails = bFails ++ aFails
659 nAllocs = aAllocs + bAllocs
660 nSols = bestAllocElement aSols bSols
662 in AllocSolution nFails nAllocs nSols nLog
664 -- | Given a solution, generates a reasonable description for it.
665 describeSolution :: AllocSolution -> String
666 describeSolution as =
667 let fcnt = asFailures as
670 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
671 filter ((> 0) . snd) . collapseFailures $ fcnt
673 Nothing -> "No valid allocation solutions, failure reasons: " ++
674 (if null fcnt then "unknown reasons" else freasons)
675 Just (_, _, nodes, cv) ->
676 printf ("score: %.8f, successes %d, failures %d (%s)" ++
677 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
678 (intercalate "/" . map Node.name $ nodes)
680 -- | Annotates a solution with the appropriate string.
681 annotateSolution :: AllocSolution -> AllocSolution
682 annotateSolution as = as { asLog = describeSolution as : asLog as }
684 -- | Reverses an evacuation solution.
686 -- Rationale: we always concat the results to the top of the lists, so
687 -- for proper jobset execution, we should reverse all lists.
688 reverseEvacSolution :: EvacSolution -> EvacSolution
689 reverseEvacSolution (EvacSolution f m o) =
690 EvacSolution (reverse f) (reverse m) (reverse o)
692 -- | Generate the valid node allocation singles or pairs for a new instance.
693 genAllocNodes :: Group.List -- ^ Group list
694 -> Node.List -- ^ The node map
695 -> Int -- ^ The number of nodes required
696 -> Bool -- ^ Whether to drop or not
698 -> Result AllocNodes -- ^ The (monadic) result
699 genAllocNodes gl nl count drop_unalloc =
700 let filter_fn = if drop_unalloc
701 then filter (Group.isAllocable .
702 flip Container.find gl . Node.group)
704 all_nodes = filter_fn $ getOnline nl
705 all_pairs = [(Node.idx p,
706 [Node.idx s | s <- all_nodes,
707 Node.idx p /= Node.idx s,
708 Node.group p == Node.group s]) |
711 1 -> Ok (Left (map Node.idx all_nodes))
712 2 -> Ok (Right (filter (not . null . snd) all_pairs))
713 _ -> Bad "Unsupported number of nodes, only one or two supported"
715 -- | Try to allocate an instance on the cluster.
716 tryAlloc :: (Monad m) =>
717 Node.List -- ^ The node list
718 -> Instance.List -- ^ The instance list
719 -> Instance.Instance -- ^ The instance to allocate
720 -> AllocNodes -- ^ The allocation targets
721 -> m AllocSolution -- ^ Possible solution list
722 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
723 tryAlloc nl _ inst (Right ok_pairs) =
724 let psols = parMap rwhnf (\(p, ss) ->
726 concatAllocs cstate .
727 allocateOnPair nl inst p)
728 emptyAllocSolution ss) ok_pairs
729 sols = foldl' sumAllocs emptyAllocSolution psols
730 in return $ annotateSolution sols
732 tryAlloc _ _ _ (Left []) = fail "No online nodes"
733 tryAlloc nl _ inst (Left all_nodes) =
734 let sols = foldl' (\cstate ->
735 concatAllocs cstate . allocateOnSingle nl inst
736 ) emptyAllocSolution all_nodes
737 in return $ annotateSolution sols
739 -- | Given a group/result, describe it as a nice (list of) messages.
740 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
741 solutionDescription gl (groupId, result) =
743 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
744 Bad message -> [printf "Group %s: error %s" gname message]
745 where grp = Container.find groupId gl
746 gname = Group.name grp
747 pol = allocPolicyToRaw (Group.allocPolicy grp)
749 -- | From a list of possibly bad and possibly empty solutions, filter
750 -- only the groups with a valid result. Note that the result will be
751 -- reversed compared to the original list.
752 filterMGResults :: Group.List
753 -> [(Gdx, Result AllocSolution)]
754 -> [(Gdx, AllocSolution)]
755 filterMGResults gl = foldl' fn []
756 where unallocable = not . Group.isAllocable . flip Container.find gl
757 fn accu (gdx, rasol) =
760 Ok sol | isNothing (asSolution sol) -> accu
761 | unallocable gdx -> accu
762 | otherwise -> (gdx, sol):accu
764 -- | Sort multigroup results based on policy and score.
765 sortMGResults :: Group.List
766 -> [(Gdx, AllocSolution)]
767 -> [(Gdx, AllocSolution)]
768 sortMGResults gl sols =
769 let extractScore (_, _, _, x) = x
770 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
771 (extractScore . fromJust . asSolution) sol)
772 in sortBy (comparing solScore) sols
774 -- | Finds the best group for an instance on a multi-group cluster.
776 -- Only solutions in @preferred@ and @last_resort@ groups will be
777 -- accepted as valid, and additionally if the allowed groups parameter
778 -- is not null then allocation will only be run for those group
780 findBestAllocGroup :: Group.List -- ^ The group list
781 -> Node.List -- ^ The node list
782 -> Instance.List -- ^ The instance list
783 -> Maybe [Gdx] -- ^ The allowed groups
784 -> Instance.Instance -- ^ The instance to allocate
785 -> Int -- ^ Required number of nodes
786 -> Result (Gdx, AllocSolution, [String])
787 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
788 let groups = splitCluster mgnl mgil
789 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
791 sols = map (\(gid, (nl, il)) ->
792 (gid, genAllocNodes mggl nl cnt False >>=
793 tryAlloc nl il inst))
794 groups'::[(Gdx, Result AllocSolution)]
795 all_msgs = concatMap (solutionDescription mggl) sols
796 goodSols = filterMGResults mggl sols
797 sortedSols = sortMGResults mggl goodSols
798 in if null sortedSols
800 then Bad $ "no groups for evacuation: allowed groups was" ++
801 show allowed_gdxs ++ ", all groups: " ++
802 show (map fst groups)
803 else Bad $ intercalate ", " all_msgs
804 else let (final_group, final_sol) = head sortedSols
805 in return (final_group, final_sol, all_msgs)
807 -- | Try to allocate an instance on a multi-group cluster.
808 tryMGAlloc :: Group.List -- ^ The group list
809 -> Node.List -- ^ The node list
810 -> Instance.List -- ^ The instance list
811 -> Instance.Instance -- ^ The instance to allocate
812 -> Int -- ^ Required number of nodes
813 -> Result AllocSolution -- ^ Possible solution list
814 tryMGAlloc mggl mgnl mgil inst cnt = do
815 (best_group, solution, all_msgs) <-
816 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
817 let group_name = Group.name $ Container.find best_group mggl
818 selmsg = "Selected group: " ++ group_name
819 return $ solution { asLog = selmsg:all_msgs }
821 -- | Function which fails if the requested mode is change secondary.
823 -- This is useful since except DRBD, no other disk template can
824 -- execute change secondary; thus, we can just call this function
825 -- instead of always checking for secondary mode. After the call to
826 -- this function, whatever mode we have is just a primary change.
827 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
828 failOnSecondaryChange ChangeSecondary dt =
829 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
830 "' can't execute change secondary"
831 failOnSecondaryChange _ _ = return ()
833 -- | Run evacuation for a single instance.
835 -- /Note:/ this function should correctly execute both intra-group
836 -- evacuations (in all modes) and inter-group evacuations (in the
837 -- 'ChangeAll' mode). Of course, this requires that the correct list
838 -- of target nodes is passed.
839 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
840 -> Instance.List -- ^ Instance list (cluster-wide)
841 -> EvacMode -- ^ The evacuation mode
842 -> Instance.Instance -- ^ The instance to be evacuated
843 -> Gdx -- ^ The group we're targetting
844 -> [Ndx] -- ^ The list of available nodes
846 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
847 nodeEvacInstance _ _ mode (Instance.Instance
848 {Instance.diskTemplate = dt@DTDiskless}) _ _ =
849 failOnSecondaryChange mode dt >>
850 fail "Diskless relocations not implemented yet"
852 nodeEvacInstance _ _ _ (Instance.Instance
853 {Instance.diskTemplate = DTPlain}) _ _ =
854 fail "Instances of type plain cannot be relocated"
856 nodeEvacInstance _ _ _ (Instance.Instance
857 {Instance.diskTemplate = DTFile}) _ _ =
858 fail "Instances of type file cannot be relocated"
860 nodeEvacInstance _ _ mode (Instance.Instance
861 {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
862 failOnSecondaryChange mode dt >>
863 fail "Shared file relocations not implemented yet"
865 nodeEvacInstance _ _ mode (Instance.Instance
866 {Instance.diskTemplate = dt@DTBlock}) _ _ =
867 failOnSecondaryChange mode dt >>
868 fail "Block device relocations not implemented yet"
870 nodeEvacInstance _ _ mode (Instance.Instance
871 {Instance.diskTemplate = dt@DTRbd}) _ _ =
872 failOnSecondaryChange mode dt >>
873 fail "Rbd relocations not implemented yet"
875 nodeEvacInstance nl il ChangePrimary
876 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
879 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
880 let idx = Instance.idx inst
881 il' = Container.add idx inst' il
882 ops = iMoveToJob nl' il' idx Failover
883 return (nl', il', ops)
885 nodeEvacInstance nl il ChangeSecondary
886 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
889 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
891 foldl' (evacDrbdSecondaryInner nl inst gdx)
892 (Left "no nodes available") avail_nodes
893 let idx = Instance.idx inst
894 il' = Container.add idx inst' il
895 ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx)
896 return (nl', il', ops)
898 -- The algorithm for ChangeAll is as follows:
900 -- * generate all (primary, secondary) node pairs for the target groups
901 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
902 -- the final node list state and group score
903 -- * select the best choice via a foldl that uses the same Either
904 -- String solution as the ChangeSecondary mode
905 nodeEvacInstance nl il ChangeAll
906 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
909 let no_nodes = Left "no nodes available"
910 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
911 (nl', il', ops, _) <-
912 annotateResult "Can't find any good nodes for relocation" $
915 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
919 -- we don't need more details (which
920 -- nodes, etc.) as we only selected
921 -- this group if we can allocate on
922 -- it, hence failures will not
923 -- propagate out of this fold loop
924 Left _ -> Left $ "Allocation failed: " ++ msg
925 Ok result@(_, _, _, new_cv) ->
926 let new_accu = Right result in
929 Right (_, _, _, old_cv) ->
933 ) no_nodes node_pairs
935 return (nl', il', ops)
937 -- | Inner fold function for changing secondary of a DRBD instance.
939 -- The running solution is either a @Left String@, which means we
940 -- don't have yet a working solution, or a @Right (...)@, which
941 -- represents a valid solution; it holds the modified node list, the
942 -- modified instance (after evacuation), the score of that solution,
943 -- and the new secondary node index.
944 evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list
945 -> Instance.Instance -- ^ Instance being evacuated
946 -> Gdx -- ^ The group index of the instance
947 -> Either String ( Node.List
950 , Ndx) -- ^ Current best solution
951 -> Ndx -- ^ Node we're evaluating as new secondary
952 -> Either String ( Node.List
955 , Ndx) -- ^ New best solution
956 evacDrbdSecondaryInner nl inst gdx accu ndx =
957 case applyMove nl inst (ReplaceSecondary ndx) of
961 Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
962 " failed: " ++ show fm
963 OpGood (nl', inst', _, _) ->
964 let nodes = Container.elems nl'
965 -- The fromJust below is ugly (it can fail nastily), but
966 -- at this point we should have any internal mismatches,
967 -- and adding a monad here would be quite involved
968 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
969 new_cv = compCVNodes grpnodes
970 new_accu = Right (nl', inst', new_cv, ndx)
973 Right (_, _, old_cv, _) ->
978 -- | Compute result of changing all nodes of a DRBD instance.
980 -- Given the target primary and secondary node (which might be in a
981 -- different group or not), this function will 'execute' all the
982 -- required steps and assuming all operations succceed, will return
983 -- the modified node and instance lists, the opcodes needed for this
984 -- and the new group score.
985 evacDrbdAllInner :: Node.List -- ^ Cluster node list
986 -> Instance.List -- ^ Cluster instance list
987 -> Instance.Instance -- ^ The instance to be moved
988 -> Gdx -- ^ The target group index
989 -- (which can differ from the
990 -- current group of the
992 -> (Ndx, Ndx) -- ^ Tuple of new
993 -- primary\/secondary nodes
994 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
995 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
996 let primary = Container.find (Instance.pNode inst) nl
997 idx = Instance.idx inst
998 -- if the primary is offline, then we first failover
999 (nl1, inst1, ops1) <-
1000 if Node.offline primary
1002 (nl', inst', _, _) <-
1003 annotateResult "Failing over to the secondary" $
1004 opToResult $ applyMove nl inst Failover
1005 return (nl', inst', [Failover])
1006 else return (nl, inst, [])
1007 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1009 ReplaceSecondary t_sdx)
1010 -- we now need to execute a replace secondary to the future
1012 (nl2, inst2, _, _) <-
1013 annotateResult "Changing secondary to new primary" $
1015 applyMove nl1 inst1 o1
1017 -- we now execute another failover, the primary stays fixed now
1018 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1019 opToResult $ applyMove nl2 inst2 o2
1021 -- and finally another replace secondary, to the final secondary
1022 (nl4, inst4, _, _) <-
1023 annotateResult "Changing secondary to final secondary" $
1025 applyMove nl3 inst3 o3
1027 il' = Container.add idx inst4 il
1028 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1029 let nodes = Container.elems nl4
1030 -- The fromJust below is ugly (it can fail nastily), but
1031 -- at this point we should have any internal mismatches,
1032 -- and adding a monad here would be quite involved
1033 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1034 new_cv = compCVNodes grpnodes
1035 return (nl4, il', ops, new_cv)
1037 -- | Computes the nodes in a given group which are available for
1039 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1040 -> IntSet.IntSet -- ^ Nodes that are excluded
1041 -> Gdx -- ^ The group for which we
1043 -> Result [Ndx] -- ^ List of available node indices
1044 availableGroupNodes group_nodes excl_ndx gdx = do
1045 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1046 Ok (lookup gdx group_nodes)
1047 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1050 -- | Updates the evac solution with the results of an instance
1052 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1054 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1055 -> (Node.List, Instance.List, EvacSolution)
1056 updateEvacSolution (nl, il, es) idx (Bad msg) =
1057 (nl, il, es { esFailed = (idx, msg):esFailed es})
1058 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1059 (nl, il, es { esMoved = new_elem:esMoved es
1060 , esOpCodes = opcodes:esOpCodes es })
1061 where inst = Container.find idx il
1063 instancePriGroup nl inst,
1064 Instance.allNodes inst)
1066 -- | Node-evacuation IAllocator mode main function.
1067 tryNodeEvac :: Group.List -- ^ The cluster groups
1068 -> Node.List -- ^ The node list (cluster-wide, not per group)
1069 -> Instance.List -- ^ Instance list (cluster-wide)
1070 -> EvacMode -- ^ The evacuation mode
1071 -> [Idx] -- ^ List of instance (indices) to be evacuated
1072 -> Result (Node.List, Instance.List, EvacSolution)
1073 tryNodeEvac _ ini_nl ini_il mode idxs =
1074 let evac_ndx = nodesToEvacuate ini_il mode idxs
1075 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1076 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1077 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1078 (Container.elems nl))) $
1079 splitCluster ini_nl ini_il
1080 (fin_nl, fin_il, esol) =
1081 foldl' (\state@(nl, il, _) inst ->
1082 let gdx = instancePriGroup nl inst
1083 pdx = Instance.pNode inst in
1084 updateEvacSolution state (Instance.idx inst) $
1085 availableGroupNodes group_ndx
1086 (IntSet.insert pdx excl_ndx) gdx >>=
1087 nodeEvacInstance nl il mode inst gdx
1089 (ini_nl, ini_il, emptyEvacSolution)
1090 (map (`Container.find` ini_il) idxs)
1091 in return (fin_nl, fin_il, reverseEvacSolution esol)
1093 -- | Change-group IAllocator mode main function.
1095 -- This is very similar to 'tryNodeEvac', the only difference is that
1096 -- we don't choose as target group the current instance group, but
1099 -- 1. at the start of the function, we compute which are the target
1100 -- groups; either no groups were passed in, in which case we choose
1101 -- all groups out of which we don't evacuate instance, or there were
1102 -- some groups passed, in which case we use those
1104 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1105 -- best group to hold the instance, and then we do what
1106 -- 'tryNodeEvac' does, except for this group instead of the current
1109 -- Note that the correct behaviour of this function relies on the
1110 -- function 'nodeEvacInstance' to be able to do correctly both
1111 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1112 tryChangeGroup :: Group.List -- ^ The cluster groups
1113 -> Node.List -- ^ The node list (cluster-wide)
1114 -> Instance.List -- ^ Instance list (cluster-wide)
1115 -> [Gdx] -- ^ Target groups; if empty, any
1116 -- groups not being evacuated
1117 -> [Idx] -- ^ List of instance (indices) to be evacuated
1118 -> Result (Node.List, Instance.List, EvacSolution)
1119 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1120 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1121 flip Container.find ini_il) idxs
1122 target_gdxs = (if null gdxs
1123 then Container.keys gl
1124 else gdxs) \\ evac_gdxs
1125 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1126 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1127 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1128 (Container.elems nl))) $
1129 splitCluster ini_nl ini_il
1130 (fin_nl, fin_il, esol) =
1131 foldl' (\state@(nl, il, _) inst ->
1133 let ncnt = Instance.requiredNodes $
1134 Instance.diskTemplate inst
1135 (gdx, _, _) <- findBestAllocGroup gl nl il
1136 (Just target_gdxs) inst ncnt
1137 av_nodes <- availableGroupNodes group_ndx
1139 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1140 in updateEvacSolution state (Instance.idx inst) solution
1142 (ini_nl, ini_il, emptyEvacSolution)
1143 (map (`Container.find` ini_il) idxs)
1144 in return (fin_nl, fin_il, reverseEvacSolution esol)
1146 -- | Standard-sized allocation method.
1148 -- This places instances of the same size on the cluster until we're
1149 -- out of space. The result will be a list of identically-sized
1151 iterateAlloc :: AllocMethod
1152 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1153 let depth = length ixes
1154 newname = printf "new-%d" depth::String
1155 newidx = Container.size il
1156 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1157 newlimit = fmap (flip (-) 1) limit
1158 in case tryAlloc nl il newi2 allocnodes of
1160 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1161 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1164 Just (xnl, xi, _, _) ->
1167 else iterateAlloc xnl (Container.add newidx xi il)
1168 newlimit newinst allocnodes (xi:ixes)
1169 (totalResources xnl:cstats)
1171 -- | Tiered allocation method.
1173 -- This places instances on the cluster, and decreases the spec until
1174 -- we can allocate again. The result will be a list of decreasing
1176 tieredAlloc :: AllocMethod
1177 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1178 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1180 Ok (errs, nl', il', ixes', cstats') ->
1181 let newsol = Ok (errs, nl', il', ixes', cstats')
1182 ixes_cnt = length ixes'
1183 (stop, newlimit) = case limit of
1184 Nothing -> (False, Nothing)
1185 Just n -> (n <= ixes_cnt,
1186 Just (n - ixes_cnt)) in
1187 if stop then newsol else
1188 case Instance.shrinkByType newinst . fst . last $
1189 sortBy (comparing snd) errs of
1191 Ok newinst' -> tieredAlloc nl' il' newlimit
1192 newinst' allocnodes ixes' cstats'
1194 -- * Formatting functions
1196 -- | Given the original and final nodes, computes the relocation description.
1197 computeMoves :: Instance.Instance -- ^ The instance to be moved
1198 -> String -- ^ The instance name
1199 -> IMove -- ^ The move being performed
1200 -> String -- ^ New primary
1201 -> String -- ^ New secondary
1202 -> (String, [String])
1203 -- ^ Tuple of moves and commands list; moves is containing
1204 -- either @/f/@ for failover or @/r:name/@ for replace
1205 -- secondary, while the command list holds gnt-instance
1206 -- commands (without that prefix), e.g \"@failover instance1@\"
1207 computeMoves i inam mv c d =
1209 Failover -> ("f", [mig])
1210 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1211 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1212 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1213 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1214 where morf = if Instance.isRunning i then "migrate" else "failover"
1215 mig = printf "%s -f %s" morf inam::String
1216 rep n = printf "replace-disks -n %s %s" n inam
1218 -- | Converts a placement to string format.
1219 printSolutionLine :: Node.List -- ^ The node list
1220 -> Instance.List -- ^ The instance list
1221 -> Int -- ^ Maximum node name length
1222 -> Int -- ^ Maximum instance name length
1223 -> Placement -- ^ The current placement
1224 -> Int -- ^ The index of the placement in
1226 -> (String, [String])
1227 printSolutionLine nl il nmlen imlen plc pos =
1228 let pmlen = (2*nmlen + 1)
1229 (i, p, s, mv, c) = plc
1230 inst = Container.find i il
1231 inam = Instance.alias inst
1232 npri = Node.alias $ Container.find p nl
1233 nsec = Node.alias $ Container.find s nl
1234 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1235 osec = Node.alias $ Container.find (Instance.sNode inst) nl
1236 (moves, cmds) = computeMoves inst inam mv npri nsec
1237 ostr = printf "%s:%s" opri osec::String
1238 nstr = printf "%s:%s" npri nsec::String
1239 in (printf " %3d. %-*s %-*s => %-*s %.8f a=%s"
1240 pos imlen inam pmlen ostr
1244 -- | Return the instance and involved nodes in an instance move.
1246 -- Note that the output list length can vary, and is not required nor
1247 -- guaranteed to be of any specific length.
1248 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1249 -- the instance from its index; note
1250 -- that this /must/ be the original
1251 -- instance list, so that we can
1252 -- retrieve the old nodes
1253 -> Placement -- ^ The placement we're investigating,
1254 -- containing the new nodes and
1256 -> [Ndx] -- ^ Resulting list of node indices
1257 involvedNodes il plc =
1258 let (i, np, ns, _, _) = plc
1259 inst = Container.find i il
1260 in nub $ [np, ns] ++ Instance.allNodes inst
1262 -- | Inner function for splitJobs, that either appends the next job to
1263 -- the current jobset, or starts a new jobset.
1264 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1265 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1266 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1267 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1268 | otherwise = ([n]:cjs, ndx)
1270 -- | Break a list of moves into independent groups. Note that this
1271 -- will reverse the order of jobs.
1272 splitJobs :: [MoveJob] -> [JobSet]
1273 splitJobs = fst . foldl mergeJobs ([], [])
1275 -- | Given a list of commands, prefix them with @gnt-instance@ and
1276 -- also beautify the display a little.
1277 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1278 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1280 printf " echo job %d/%d" jsn sn:
1282 map (" gnt-instance " ++) cmds
1284 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1287 -- | Given a list of commands, prefix them with @gnt-instance@ and
1288 -- also beautify the display a little.
1289 formatCmds :: [JobSet] -> String
1292 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1296 -- | Print the node list.
1297 printNodes :: Node.List -> [String] -> String
1299 let fields = case fs of
1300 [] -> Node.defaultFields
1301 "+":rest -> Node.defaultFields ++ rest
1303 snl = sortBy (comparing Node.idx) (Container.elems nl)
1304 (header, isnum) = unzip $ map Node.showHeader fields
1305 in printTable "" header (map (Node.list fields) snl) isnum
1307 -- | Print the instance list.
1308 printInsts :: Node.List -> Instance.List -> String
1310 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1311 helper inst = [ if Instance.isRunning inst then "R" else " "
1312 , Instance.name inst
1313 , Container.nameOf nl (Instance.pNode inst)
1314 , let sdx = Instance.sNode inst
1315 in if sdx == Node.noSecondary
1317 else Container.nameOf nl sdx
1318 , if Instance.autoBalance inst then "Y" else "N"
1319 , printf "%3d" $ Instance.vcpus inst
1320 , printf "%5d" $ Instance.mem inst
1321 , printf "%5d" $ Instance.dsk inst `div` 1024
1327 where DynUtil lC lM lD lN = Instance.util inst
1328 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1329 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1330 isnum = False:False:False:False:False:repeat True
1331 in printTable "" header (map helper sil) isnum
1333 -- | Shows statistics for a given node list.
1334 printStats :: String -> Node.List -> String
1336 let dcvs = compDetailedCV $ Container.elems nl
1337 (weights, names) = unzip detailedCVInfo
1338 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1339 header = [ "Field", "Value", "Weight" ]
1340 formatted = map (\(w, h, val) ->
1345 in printTable lp header formatted $ False:repeat True
1347 -- | Convert a placement into a list of OpCodes (basically a job).
1348 iMoveToJob :: Node.List -- ^ The node list; only used for node
1349 -- names, so any version is good
1350 -- (before or after the operation)
1351 -> Instance.List -- ^ The instance list; also used for
1353 -> Idx -- ^ The index of the instance being
1355 -> IMove -- ^ The actual move to be described
1356 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1358 iMoveToJob nl il idx move =
1359 let inst = Container.find idx il
1360 iname = Instance.name inst
1361 lookNode = Just . Container.nameOf nl
1362 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1363 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1364 OpCodes.ReplaceNewSecondary [] Nothing
1367 ReplacePrimary np -> [ opF, opR np, opF ]
1368 ReplaceSecondary ns -> [ opR ns ]
1369 ReplaceAndFailover np -> [ opR np, opF ]
1370 FailoverAndReplace ns -> [ opF, opR ns ]
1372 -- * Node group functions
1374 -- | Computes the group of an instance.
1375 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1376 instanceGroup nl i =
1377 let sidx = Instance.sNode i
1378 pnode = Container.find (Instance.pNode i) nl
1379 snode = if sidx == Node.noSecondary
1381 else Container.find sidx nl
1382 pgroup = Node.group pnode
1383 sgroup = Node.group snode
1384 in if pgroup /= sgroup
1385 then fail ("Instance placed accross two node groups, primary " ++
1386 show pgroup ++ ", secondary " ++ show sgroup)
1389 -- | Computes the group of an instance per the primary node.
1390 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1391 instancePriGroup nl i =
1392 let pnode = Container.find (Instance.pNode i) nl
1395 -- | Compute the list of badly allocated instances (split across node
1397 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1398 findSplitInstances nl =
1399 filter (not . isOk . instanceGroup nl) . Container.elems
1401 -- | Splits a cluster into the component node groups.
1402 splitCluster :: Node.List -> Instance.List ->
1403 [(Gdx, (Node.List, Instance.List))]
1404 splitCluster nl il =
1405 let ngroups = Node.computeGroups (Container.elems nl)
1406 in map (\(guuid, nodes) ->
1407 let nidxs = map Node.idx nodes
1408 nodes' = zip nidxs nodes
1409 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1410 in (guuid, (Container.fromList nodes', instances))) ngroups
1412 -- | Compute the list of nodes that are to be evacuated, given a list
1413 -- of instances and an evacuation mode.
1414 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1415 -> EvacMode -- ^ The evacuation mode we're using
1416 -> [Idx] -- ^ List of instance indices being evacuated
1417 -> IntSet.IntSet -- ^ Set of node indices
1418 nodesToEvacuate il mode =
1419 IntSet.delete Node.noSecondary .
1421 let i = Container.find idx il
1422 pdx = Instance.pNode i
1423 sdx = Instance.sNode i
1424 dt = Instance.diskTemplate i
1425 withSecondary = case dt of
1426 DTDrbd8 -> IntSet.insert sdx ns
1429 ChangePrimary -> IntSet.insert pdx ns
1430 ChangeSecondary -> withSecondary
1431 ChangeAll -> IntSet.insert pdx withSecondary