1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
38 -- * Generic functions
40 , computeAllocationDelta
41 -- * First phase functions
43 -- * Second phase functions
48 -- * Display functions
51 -- * Balacing functions
60 -- * IAllocator functions
67 -- * Allocation functions
70 -- * Node group functions
76 import qualified Data.IntSet as IntSet
78 import Data.Maybe (fromJust, isNothing)
79 import Data.Ord (comparing)
80 import Text.Printf (printf)
82 import qualified Ganeti.HTools.Container as Container
83 import qualified Ganeti.HTools.Instance as Instance
84 import qualified Ganeti.HTools.Node as Node
85 import qualified Ganeti.HTools.Group as Group
86 import Ganeti.HTools.Types
87 import Ganeti.HTools.Utils
88 import Ganeti.HTools.Compat
89 import qualified Ganeti.OpCodes as OpCodes
93 -- | Allocation\/relocation solution.
94 data AllocSolution = AllocSolution
95 { asFailures :: [FailMode] -- ^ Failure counts
96 , asAllocs :: Int -- ^ Good allocation count
97 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
98 , asLog :: [String] -- ^ Informational messages
101 -- | Node evacuation/group change iallocator result type. This result
102 -- type consists of actual opcodes (a restricted subset) that are
103 -- transmitted back to Ganeti.
104 data EvacSolution = EvacSolution
105 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
106 , esFailed :: [(Idx, String)] -- ^ Instances which were not
108 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
111 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
112 type AllocResult = (FailStats, Node.List, Instance.List,
113 [Instance.Instance], [CStats])
115 -- | A type denoting the valid allocation mode/pairs.
117 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
118 -- for a two-node allocation, this will be a @Right [('Ndx',
119 -- ['Ndx'])]@. In the latter case, the list is basically an
120 -- association list, grouped by primary node and holding the potential
121 -- secondary nodes in the sub-list.
122 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
124 -- | The empty solution we start with when computing allocations.
125 emptyAllocSolution :: AllocSolution
126 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
127 , asSolution = Nothing, asLog = [] }
129 -- | The empty evac solution.
130 emptyEvacSolution :: EvacSolution
131 emptyEvacSolution = EvacSolution { esMoved = []
136 -- | The complete state for the balancing solution.
137 data Table = Table Node.List Instance.List Score [Placement]
138 deriving (Show, Read)
140 -- | Cluster statistics data type.
142 { csFmem :: Integer -- ^ Cluster free mem
143 , csFdsk :: Integer -- ^ Cluster free disk
144 , csAmem :: Integer -- ^ Cluster allocatable mem
145 , csAdsk :: Integer -- ^ Cluster allocatable disk
146 , csAcpu :: Integer -- ^ Cluster allocatable cpus
147 , csMmem :: Integer -- ^ Max node allocatable mem
148 , csMdsk :: Integer -- ^ Max node allocatable disk
149 , csMcpu :: Integer -- ^ Max node allocatable cpu
150 , csImem :: Integer -- ^ Instance used mem
151 , csIdsk :: Integer -- ^ Instance used disk
152 , csIcpu :: Integer -- ^ Instance used cpu
153 , csTmem :: Double -- ^ Cluster total mem
154 , csTdsk :: Double -- ^ Cluster total disk
155 , csTcpu :: Double -- ^ Cluster total cpus
156 , csVcpu :: Integer -- ^ Cluster total virtual cpus
157 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
158 -- physical CPUs, i.e. normalised used phys CPUs
159 , csXmem :: Integer -- ^ Unnacounted for mem
160 , csNmem :: Integer -- ^ Node own memory
161 , csScore :: Score -- ^ The cluster score
162 , csNinst :: Int -- ^ The total number of instances
163 } deriving (Show, Read)
165 -- | A simple type for allocation functions.
166 type AllocMethod = Node.List -- ^ Node list
167 -> Instance.List -- ^ Instance list
168 -> Maybe Int -- ^ Optional allocation limit
169 -> Instance.Instance -- ^ Instance spec for allocation
170 -> AllocNodes -- ^ Which nodes we should allocate on
171 -> [Instance.Instance] -- ^ Allocated instances
172 -> [CStats] -- ^ Running cluster stats
173 -> Result AllocResult -- ^ Allocation result
175 -- | A simple type for the running solution of evacuations.
176 type EvacInnerState =
177 Either String (Node.List, Instance.Instance, Score, Ndx)
179 -- * Utility functions
181 -- | Verifies the N+1 status and return the affected nodes.
182 verifyN1 :: [Node.Node] -> [Node.Node]
183 verifyN1 = filter Node.failN1
185 {-| Computes the pair of bad nodes and instances.
187 The bad node list is computed via a simple 'verifyN1' check, and the
188 bad instance list is the list of primary and secondary instances of
192 computeBadItems :: Node.List -> Instance.List ->
193 ([Node.Node], [Instance.Instance])
194 computeBadItems nl il =
195 let bad_nodes = verifyN1 $ getOnline nl
196 bad_instances = map (`Container.find` il) .
198 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
200 (bad_nodes, bad_instances)
202 -- | Extracts the node pairs for an instance. This can fail if the
203 -- instance is single-homed. FIXME: this needs to be improved,
204 -- together with the general enhancement for handling non-DRBD moves.
205 instanceNodes :: Node.List -> Instance.Instance ->
206 (Ndx, Ndx, Node.Node, Node.Node)
207 instanceNodes nl inst =
208 let old_pdx = Instance.pNode inst
209 old_sdx = Instance.sNode inst
210 old_p = Container.find old_pdx nl
211 old_s = Container.find old_sdx nl
212 in (old_pdx, old_sdx, old_p, old_s)
214 -- | Zero-initializer for the CStats type.
215 emptyCStats :: CStats
216 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
218 -- | Update stats with data from a new node.
219 updateCStats :: CStats -> Node.Node -> CStats
220 updateCStats cs node =
221 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
222 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
223 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
224 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
225 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
226 csVcpu = x_vcpu, csNcpu = x_ncpu,
227 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
230 inc_amem = Node.fMem node - Node.rMem node
231 inc_amem' = if inc_amem > 0 then inc_amem else 0
232 inc_adsk = Node.availDisk node
233 inc_imem = truncate (Node.tMem node) - Node.nMem node
234 - Node.xMem node - Node.fMem node
235 inc_icpu = Node.uCpu node
236 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
237 inc_vcpu = Node.hiCpu node
238 inc_acpu = Node.availCpu node
239 inc_ncpu = fromIntegral (Node.uCpu node) /
240 iPolicyVcpuRatio (Node.iPolicy node)
241 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
242 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
243 , csAmem = x_amem + fromIntegral inc_amem'
244 , csAdsk = x_adsk + fromIntegral inc_adsk
245 , csAcpu = x_acpu + fromIntegral inc_acpu
246 , csMmem = max x_mmem (fromIntegral inc_amem')
247 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
248 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
249 , csImem = x_imem + fromIntegral inc_imem
250 , csIdsk = x_idsk + fromIntegral inc_idsk
251 , csIcpu = x_icpu + fromIntegral inc_icpu
252 , csTmem = x_tmem + Node.tMem node
253 , csTdsk = x_tdsk + Node.tDsk node
254 , csTcpu = x_tcpu + Node.tCpu node
255 , csVcpu = x_vcpu + fromIntegral inc_vcpu
256 , csNcpu = x_ncpu + inc_ncpu
257 , csXmem = x_xmem + fromIntegral (Node.xMem node)
258 , csNmem = x_nmem + fromIntegral (Node.nMem node)
259 , csNinst = x_ninst + length (Node.pList node)
262 -- | Compute the total free disk and memory in the cluster.
263 totalResources :: Node.List -> CStats
265 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
266 in cs { csScore = compCV nl }
268 -- | Compute the delta between two cluster state.
270 -- This is used when doing allocations, to understand better the
271 -- available cluster resources. The return value is a triple of the
272 -- current used values, the delta that was still allocated, and what
273 -- was left unallocated.
274 computeAllocationDelta :: CStats -> CStats -> AllocStats
275 computeAllocationDelta cini cfin =
276 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
277 csNcpu = i_ncpu } = cini
278 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
279 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
280 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
281 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
282 , allocInfoNCpus = i_ncpu
283 , allocInfoMem = fromIntegral i_imem
284 , allocInfoDisk = fromIntegral i_idsk
286 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
287 , allocInfoNCpus = f_ncpu - i_ncpu
288 , allocInfoMem = fromIntegral (f_imem - i_imem)
289 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
291 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
292 , allocInfoNCpus = f_tcpu - f_ncpu
293 , allocInfoMem = truncate t_mem - fromIntegral f_imem
294 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
296 in (rini, rfin, runa)
298 -- | The names and weights of the individual elements in the CV list.
299 detailedCVInfo :: [(Double, String)]
300 detailedCVInfo = [ (1, "free_mem_cv")
301 , (1, "free_disk_cv")
303 , (1, "reserved_mem_cv")
304 , (4, "offline_all_cnt")
305 , (16, "offline_pri_cnt")
306 , (1, "vcpu_ratio_cv")
309 , (1, "disk_load_cv")
311 , (2, "pri_tags_score")
315 -- | Holds the weights used by 'compCVNodes' for each metric.
316 detailedCVWeights :: [Double]
317 detailedCVWeights = map fst detailedCVInfo
319 -- | Compute the mem and disk covariance.
320 compDetailedCV :: [Node.Node] -> [Double]
321 compDetailedCV all_nodes =
322 let (offline, nodes) = partition Node.offline all_nodes
323 mem_l = map Node.pMem nodes
324 dsk_l = map Node.pDsk nodes
325 -- metric: memory covariance
326 mem_cv = stdDev mem_l
327 -- metric: disk covariance
328 dsk_cv = stdDev dsk_l
329 -- metric: count of instances living on N1 failing nodes
330 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
331 length (Node.pList n)) .
332 filter Node.failN1 $ nodes :: Double
333 res_l = map Node.pRem nodes
334 -- metric: reserved memory covariance
335 res_cv = stdDev res_l
336 -- offline instances metrics
337 offline_ipri = sum . map (length . Node.pList) $ offline
338 offline_isec = sum . map (length . Node.sList) $ offline
339 -- metric: count of instances on offline nodes
340 off_score = fromIntegral (offline_ipri + offline_isec)::Double
341 -- metric: count of primary instances on offline nodes (this
342 -- helps with evacuation/failover of primary instances on
343 -- 2-node clusters with one node offline)
344 off_pri_score = fromIntegral offline_ipri::Double
345 cpu_l = map Node.pCpu nodes
346 -- metric: covariance of vcpu/pcpu ratio
347 cpu_cv = stdDev cpu_l
348 -- metrics: covariance of cpu, memory, disk and network load
349 (c_load, m_load, d_load, n_load) =
351 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
352 DynUtil c2 m2 d2 n2 = Node.utilPool n
353 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
354 -- metric: conflicting instance count
355 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
356 pri_tags_score = fromIntegral pri_tags_inst::Double
357 -- metric: spindles %
358 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
359 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
360 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
361 , pri_tags_score, stdDev spindles_cv ]
363 -- | Compute the /total/ variance.
364 compCVNodes :: [Node.Node] -> Double
365 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
367 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
368 compCV :: Node.List -> Double
369 compCV = compCVNodes . Container.elems
371 -- | Compute online nodes from a 'Node.List'.
372 getOnline :: Node.List -> [Node.Node]
373 getOnline = filter (not . Node.offline) . Container.elems
375 -- * Balancing functions
377 -- | Compute best table. Note that the ordering of the arguments is important.
378 compareTables :: Table -> Table -> Table
379 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
380 if a_cv > b_cv then b else a
382 -- | Applies an instance move to a given node list and instance.
383 applyMove :: Node.List -> Instance.Instance
384 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
386 applyMove nl inst Failover =
387 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
388 int_p = Node.removePri old_p inst
389 int_s = Node.removeSec old_s inst
390 new_nl = do -- Maybe monad
391 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
392 new_s <- Node.addSec int_p inst old_sdx
393 let new_inst = Instance.setBoth inst old_sdx old_pdx
394 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
395 new_inst, old_sdx, old_pdx)
398 -- Failover to any (fa)
399 applyMove nl inst (FailoverToAny new_pdx) = do
400 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
401 new_pnode = Container.find new_pdx nl
402 force_failover = Node.offline old_pnode
403 new_pnode' <- Node.addPriEx force_failover new_pnode inst
404 let old_pnode' = Node.removePri old_pnode inst
405 inst' = Instance.setPri inst new_pdx
406 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
407 return (nl', inst', new_pdx, old_sdx)
409 -- Replace the primary (f:, r:np, f)
410 applyMove nl inst (ReplacePrimary new_pdx) =
411 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
412 tgt_n = Container.find new_pdx nl
413 int_p = Node.removePri old_p inst
414 int_s = Node.removeSec old_s inst
415 force_p = Node.offline old_p
416 new_nl = do -- Maybe monad
417 -- check that the current secondary can host the instance
418 -- during the migration
419 tmp_s <- Node.addPriEx force_p int_s inst
420 let tmp_s' = Node.removePri tmp_s inst
421 new_p <- Node.addPriEx force_p tgt_n inst
422 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
423 let new_inst = Instance.setPri inst new_pdx
424 return (Container.add new_pdx new_p $
425 Container.addTwo old_pdx int_p old_sdx new_s nl,
426 new_inst, new_pdx, old_sdx)
429 -- Replace the secondary (r:ns)
430 applyMove nl inst (ReplaceSecondary new_sdx) =
431 let old_pdx = Instance.pNode inst
432 old_sdx = Instance.sNode inst
433 old_s = Container.find old_sdx nl
434 tgt_n = Container.find new_sdx nl
435 int_s = Node.removeSec old_s inst
436 force_s = Node.offline old_s
437 new_inst = Instance.setSec inst new_sdx
438 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
439 \new_s -> return (Container.addTwo new_sdx
440 new_s old_sdx int_s nl,
441 new_inst, old_pdx, new_sdx)
444 -- Replace the secondary and failover (r:np, f)
445 applyMove nl inst (ReplaceAndFailover new_pdx) =
446 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
447 tgt_n = Container.find new_pdx nl
448 int_p = Node.removePri old_p inst
449 int_s = Node.removeSec old_s inst
450 force_s = Node.offline old_s
451 new_nl = do -- Maybe monad
452 new_p <- Node.addPri tgt_n inst
453 new_s <- Node.addSecEx force_s int_p inst new_pdx
454 let new_inst = Instance.setBoth inst new_pdx old_pdx
455 return (Container.add new_pdx new_p $
456 Container.addTwo old_pdx new_s old_sdx int_s nl,
457 new_inst, new_pdx, old_pdx)
460 -- Failver and replace the secondary (f, r:ns)
461 applyMove nl inst (FailoverAndReplace new_sdx) =
462 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
463 tgt_n = Container.find new_sdx nl
464 int_p = Node.removePri old_p inst
465 int_s = Node.removeSec old_s inst
466 force_p = Node.offline old_p
467 new_nl = do -- Maybe monad
468 new_p <- Node.addPriEx force_p int_s inst
469 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
470 let new_inst = Instance.setBoth inst old_sdx new_sdx
471 return (Container.add new_sdx new_s $
472 Container.addTwo old_sdx new_p old_pdx int_p nl,
473 new_inst, old_sdx, new_sdx)
476 -- | Tries to allocate an instance on one given node.
477 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
478 -> OpResult Node.AllocElement
479 allocateOnSingle nl inst new_pdx =
480 let p = Container.find new_pdx nl
481 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
483 Instance.instMatchesPolicy inst (Node.iPolicy p)
484 new_p <- Node.addPri p inst
485 let new_nl = Container.add new_pdx new_p nl
486 new_score = compCV nl
487 return (new_nl, new_inst, [new_p], new_score)
489 -- | Tries to allocate an instance on a given pair of nodes.
490 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
491 -> OpResult Node.AllocElement
492 allocateOnPair nl inst new_pdx new_sdx =
493 let tgt_p = Container.find new_pdx nl
494 tgt_s = Container.find new_sdx nl
496 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
497 new_p <- Node.addPri tgt_p inst
498 new_s <- Node.addSec tgt_s inst new_pdx
499 let new_inst = Instance.setBoth inst new_pdx new_sdx
500 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
501 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
503 -- | Tries to perform an instance move and returns the best table
504 -- between the original one and the new one.
505 checkSingleStep :: Table -- ^ The original table
506 -> Instance.Instance -- ^ The instance to move
507 -> Table -- ^ The current best table
508 -> IMove -- ^ The move to apply
509 -> Table -- ^ The final best table
510 checkSingleStep ini_tbl target cur_tbl move =
511 let Table ini_nl ini_il _ ini_plc = ini_tbl
512 tmp_resu = applyMove ini_nl target move
515 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
516 let tgt_idx = Instance.idx target
517 upd_cvar = compCV upd_nl
518 upd_il = Container.add tgt_idx new_inst ini_il
519 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
520 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
521 in compareTables cur_tbl upd_tbl
523 -- | Given the status of the current secondary as a valid new node and
524 -- the current candidate target node, generate the possible moves for
526 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
527 -> Bool -- ^ Whether the secondary node is a valid new node
528 -> Bool -- ^ Whether we can change the primary node
529 -> Ndx -- ^ Target node candidate
530 -> [IMove] -- ^ List of valid result moves
532 possibleMoves MirrorNone _ _ _ = []
534 possibleMoves MirrorExternal _ False _ = []
536 possibleMoves MirrorExternal _ True tdx =
537 [ FailoverToAny tdx ]
539 possibleMoves MirrorInternal _ False tdx =
540 [ ReplaceSecondary tdx ]
542 possibleMoves MirrorInternal True True tdx =
543 [ ReplaceSecondary tdx
544 , ReplaceAndFailover tdx
546 , FailoverAndReplace tdx
549 possibleMoves MirrorInternal False True tdx =
550 [ ReplaceSecondary tdx
551 , ReplaceAndFailover tdx
554 -- | Compute the best move for a given instance.
555 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
556 -> Bool -- ^ Whether disk moves are allowed
557 -> Bool -- ^ Whether instance moves are allowed
558 -> Table -- ^ Original table
559 -> Instance.Instance -- ^ Instance to move
560 -> Table -- ^ Best new table for this instance
561 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
562 let opdx = Instance.pNode target
563 osdx = Instance.sNode target
564 bad_nodes = [opdx, osdx]
565 nodes = filter (`notElem` bad_nodes) nodes_idx
566 mir_type = templateMirrorType $ Instance.diskTemplate target
567 use_secondary = elem osdx nodes_idx && inst_moves
568 aft_failover = if mir_type == MirrorInternal && use_secondary
569 -- if drbd and allowed to failover
570 then checkSingleStep ini_tbl target ini_tbl Failover
574 then concatMap (possibleMoves mir_type use_secondary inst_moves)
578 -- iterate over the possible nodes for this instance
579 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
581 -- | Compute the best next move.
582 checkMove :: [Ndx] -- ^ Allowed target node indices
583 -> Bool -- ^ Whether disk moves are allowed
584 -> Bool -- ^ Whether instance moves are allowed
585 -> Table -- ^ The current solution
586 -> [Instance.Instance] -- ^ List of instances still to move
587 -> Table -- ^ The new solution
588 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
589 let Table _ _ _ ini_plc = ini_tbl
590 -- we're using rwhnf from the Control.Parallel.Strategies
591 -- package; we don't need to use rnf as that would force too
592 -- much evaluation in single-threaded cases, and in
593 -- multi-threaded case the weak head normal form is enough to
594 -- spark the evaluation
595 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
598 -- iterate over all instances, computing the best move
599 best_tbl = foldl' compareTables ini_tbl tables
600 Table _ _ _ best_plc = best_tbl
601 in if length best_plc == length ini_plc
602 then ini_tbl -- no advancement
605 -- | Check if we are allowed to go deeper in the balancing.
606 doNextBalance :: Table -- ^ The starting table
607 -> Int -- ^ Remaining length
608 -> Score -- ^ Score at which to stop
609 -> Bool -- ^ The resulting table and commands
610 doNextBalance ini_tbl max_rounds min_score =
611 let Table _ _ ini_cv ini_plc = ini_tbl
612 ini_plc_len = length ini_plc
613 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
615 -- | Run a balance move.
616 tryBalance :: Table -- ^ The starting table
617 -> Bool -- ^ Allow disk moves
618 -> Bool -- ^ Allow instance moves
619 -> Bool -- ^ Only evacuate moves
620 -> Score -- ^ Min gain threshold
621 -> Score -- ^ Min gain
622 -> Maybe Table -- ^ The resulting table and commands
623 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
624 let Table ini_nl ini_il ini_cv _ = ini_tbl
625 all_inst = Container.elems ini_il
626 all_nodes = Container.elems ini_nl
627 (offline_nodes, online_nodes) = partition Node.offline all_nodes
628 all_inst' = if evac_mode
629 then let bad_nodes = map Node.idx offline_nodes
630 in filter (any (`elem` bad_nodes) .
631 Instance.allNodes) all_inst
633 reloc_inst = filter Instance.movable all_inst'
634 node_idx = map Node.idx online_nodes
635 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
636 (Table _ _ fin_cv _) = fin_tbl
638 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
639 then Just fin_tbl -- this round made success, return the new table
642 -- * Allocation functions
644 -- | Build failure stats out of a list of failures.
645 collapseFailures :: [FailMode] -> FailStats
646 collapseFailures flst =
647 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
650 -- | Compares two Maybe AllocElement and chooses the besst score.
651 bestAllocElement :: Maybe Node.AllocElement
652 -> Maybe Node.AllocElement
653 -> Maybe Node.AllocElement
654 bestAllocElement a Nothing = a
655 bestAllocElement Nothing b = b
656 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
657 if ascore < bscore then a else b
659 -- | Update current Allocation solution and failure stats with new
661 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
662 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
664 concatAllocs as (OpGood ns) =
665 let -- Choose the old or new solution, based on the cluster score
667 osols = asSolution as
668 nsols = bestAllocElement osols (Just ns)
670 -- Note: we force evaluation of nsols here in order to keep the
671 -- memory profile low - we know that we will need nsols for sure
672 -- in the next cycle, so we force evaluation of nsols, since the
673 -- foldl' in the caller will only evaluate the tuple, but not the
674 -- elements of the tuple
675 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
677 -- | Sums two 'AllocSolution' structures.
678 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
679 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
680 (AllocSolution bFails bAllocs bSols bLog) =
681 -- note: we add b first, since usually it will be smaller; when
682 -- fold'ing, a will grow and grow whereas b is the per-group
683 -- result, hence smaller
684 let nFails = bFails ++ aFails
685 nAllocs = aAllocs + bAllocs
686 nSols = bestAllocElement aSols bSols
688 in AllocSolution nFails nAllocs nSols nLog
690 -- | Given a solution, generates a reasonable description for it.
691 describeSolution :: AllocSolution -> String
692 describeSolution as =
693 let fcnt = asFailures as
696 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
697 filter ((> 0) . snd) . collapseFailures $ fcnt
699 Nothing -> "No valid allocation solutions, failure reasons: " ++
700 (if null fcnt then "unknown reasons" else freasons)
701 Just (_, _, nodes, cv) ->
702 printf ("score: %.8f, successes %d, failures %d (%s)" ++
703 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
704 (intercalate "/" . map Node.name $ nodes)
706 -- | Annotates a solution with the appropriate string.
707 annotateSolution :: AllocSolution -> AllocSolution
708 annotateSolution as = as { asLog = describeSolution as : asLog as }
710 -- | Reverses an evacuation solution.
712 -- Rationale: we always concat the results to the top of the lists, so
713 -- for proper jobset execution, we should reverse all lists.
714 reverseEvacSolution :: EvacSolution -> EvacSolution
715 reverseEvacSolution (EvacSolution f m o) =
716 EvacSolution (reverse f) (reverse m) (reverse o)
718 -- | Generate the valid node allocation singles or pairs for a new instance.
719 genAllocNodes :: Group.List -- ^ Group list
720 -> Node.List -- ^ The node map
721 -> Int -- ^ The number of nodes required
722 -> Bool -- ^ Whether to drop or not
724 -> Result AllocNodes -- ^ The (monadic) result
725 genAllocNodes gl nl count drop_unalloc =
726 let filter_fn = if drop_unalloc
727 then filter (Group.isAllocable .
728 flip Container.find gl . Node.group)
730 all_nodes = filter_fn $ getOnline nl
731 all_pairs = [(Node.idx p,
732 [Node.idx s | s <- all_nodes,
733 Node.idx p /= Node.idx s,
734 Node.group p == Node.group s]) |
737 1 -> Ok (Left (map Node.idx all_nodes))
738 2 -> Ok (Right (filter (not . null . snd) all_pairs))
739 _ -> Bad "Unsupported number of nodes, only one or two supported"
741 -- | Try to allocate an instance on the cluster.
742 tryAlloc :: (Monad m) =>
743 Node.List -- ^ The node list
744 -> Instance.List -- ^ The instance list
745 -> Instance.Instance -- ^ The instance to allocate
746 -> AllocNodes -- ^ The allocation targets
747 -> m AllocSolution -- ^ Possible solution list
748 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
749 tryAlloc nl _ inst (Right ok_pairs) =
750 let psols = parMap rwhnf (\(p, ss) ->
752 concatAllocs cstate .
753 allocateOnPair nl inst p)
754 emptyAllocSolution ss) ok_pairs
755 sols = foldl' sumAllocs emptyAllocSolution psols
756 in return $ annotateSolution sols
758 tryAlloc _ _ _ (Left []) = fail "No online nodes"
759 tryAlloc nl _ inst (Left all_nodes) =
760 let sols = foldl' (\cstate ->
761 concatAllocs cstate . allocateOnSingle nl inst
762 ) emptyAllocSolution all_nodes
763 in return $ annotateSolution sols
765 -- | Given a group/result, describe it as a nice (list of) messages.
766 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
767 solutionDescription gl (groupId, result) =
769 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
770 Bad message -> [printf "Group %s: error %s" gname message]
771 where grp = Container.find groupId gl
772 gname = Group.name grp
773 pol = allocPolicyToRaw (Group.allocPolicy grp)
775 -- | From a list of possibly bad and possibly empty solutions, filter
776 -- only the groups with a valid result. Note that the result will be
777 -- reversed compared to the original list.
778 filterMGResults :: Group.List
779 -> [(Gdx, Result AllocSolution)]
780 -> [(Gdx, AllocSolution)]
781 filterMGResults gl = foldl' fn []
782 where unallocable = not . Group.isAllocable . flip Container.find gl
783 fn accu (gdx, rasol) =
786 Ok sol | isNothing (asSolution sol) -> accu
787 | unallocable gdx -> accu
788 | otherwise -> (gdx, sol):accu
790 -- | Sort multigroup results based on policy and score.
791 sortMGResults :: Group.List
792 -> [(Gdx, AllocSolution)]
793 -> [(Gdx, AllocSolution)]
794 sortMGResults gl sols =
795 let extractScore (_, _, _, x) = x
796 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
797 (extractScore . fromJust . asSolution) sol)
798 in sortBy (comparing solScore) sols
800 -- | Finds the best group for an instance on a multi-group cluster.
802 -- Only solutions in @preferred@ and @last_resort@ groups will be
803 -- accepted as valid, and additionally if the allowed groups parameter
804 -- is not null then allocation will only be run for those group
806 findBestAllocGroup :: Group.List -- ^ The group list
807 -> Node.List -- ^ The node list
808 -> Instance.List -- ^ The instance list
809 -> Maybe [Gdx] -- ^ The allowed groups
810 -> Instance.Instance -- ^ The instance to allocate
811 -> Int -- ^ Required number of nodes
812 -> Result (Gdx, AllocSolution, [String])
813 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
814 let groups = splitCluster mgnl mgil
815 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
817 sols = map (\(gid, (nl, il)) ->
818 (gid, genAllocNodes mggl nl cnt False >>=
819 tryAlloc nl il inst))
820 groups'::[(Gdx, Result AllocSolution)]
821 all_msgs = concatMap (solutionDescription mggl) sols
822 goodSols = filterMGResults mggl sols
823 sortedSols = sortMGResults mggl goodSols
824 in if null sortedSols
826 then Bad $ "no groups for evacuation: allowed groups was" ++
827 show allowed_gdxs ++ ", all groups: " ++
828 show (map fst groups)
829 else Bad $ intercalate ", " all_msgs
830 else let (final_group, final_sol) = head sortedSols
831 in return (final_group, final_sol, all_msgs)
833 -- | Try to allocate an instance on a multi-group cluster.
834 tryMGAlloc :: Group.List -- ^ The group list
835 -> Node.List -- ^ The node list
836 -> Instance.List -- ^ The instance list
837 -> Instance.Instance -- ^ The instance to allocate
838 -> Int -- ^ Required number of nodes
839 -> Result AllocSolution -- ^ Possible solution list
840 tryMGAlloc mggl mgnl mgil inst cnt = do
841 (best_group, solution, all_msgs) <-
842 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
843 let group_name = Group.name $ Container.find best_group mggl
844 selmsg = "Selected group: " ++ group_name
845 return $ solution { asLog = selmsg:all_msgs }
847 -- | Function which fails if the requested mode is change secondary.
849 -- This is useful since except DRBD, no other disk template can
850 -- execute change secondary; thus, we can just call this function
851 -- instead of always checking for secondary mode. After the call to
852 -- this function, whatever mode we have is just a primary change.
853 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
854 failOnSecondaryChange ChangeSecondary dt =
855 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
856 "' can't execute change secondary"
857 failOnSecondaryChange _ _ = return ()
859 -- | Run evacuation for a single instance.
861 -- /Note:/ this function should correctly execute both intra-group
862 -- evacuations (in all modes) and inter-group evacuations (in the
863 -- 'ChangeAll' mode). Of course, this requires that the correct list
864 -- of target nodes is passed.
865 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
866 -> Instance.List -- ^ Instance list (cluster-wide)
867 -> EvacMode -- ^ The evacuation mode
868 -> Instance.Instance -- ^ The instance to be evacuated
869 -> Gdx -- ^ The group we're targetting
870 -> [Ndx] -- ^ The list of available nodes
872 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
873 nodeEvacInstance _ _ mode (Instance.Instance
874 {Instance.diskTemplate = dt@DTDiskless}) _ _ =
875 failOnSecondaryChange mode dt >>
876 fail "Diskless relocations not implemented yet"
878 nodeEvacInstance _ _ _ (Instance.Instance
879 {Instance.diskTemplate = DTPlain}) _ _ =
880 fail "Instances of type plain cannot be relocated"
882 nodeEvacInstance _ _ _ (Instance.Instance
883 {Instance.diskTemplate = DTFile}) _ _ =
884 fail "Instances of type file cannot be relocated"
886 nodeEvacInstance _ _ mode (Instance.Instance
887 {Instance.diskTemplate = dt@DTSharedFile}) _ _ =
888 failOnSecondaryChange mode dt >>
889 fail "Shared file relocations not implemented yet"
891 nodeEvacInstance _ _ mode (Instance.Instance
892 {Instance.diskTemplate = dt@DTBlock}) _ _ =
893 failOnSecondaryChange mode dt >>
894 fail "Block device relocations not implemented yet"
896 nodeEvacInstance _ _ mode (Instance.Instance
897 {Instance.diskTemplate = dt@DTRbd}) _ _ =
898 failOnSecondaryChange mode dt >>
899 fail "Rbd relocations not implemented yet"
901 nodeEvacInstance nl il ChangePrimary
902 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
905 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
906 let idx = Instance.idx inst
907 il' = Container.add idx inst' il
908 ops = iMoveToJob nl' il' idx Failover
909 return (nl', il', ops)
911 nodeEvacInstance nl il ChangeSecondary
912 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
914 evacOneNodeOnly nl il inst gdx avail_nodes
916 -- The algorithm for ChangeAll is as follows:
918 -- * generate all (primary, secondary) node pairs for the target groups
919 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
920 -- the final node list state and group score
921 -- * select the best choice via a foldl that uses the same Either
922 -- String solution as the ChangeSecondary mode
923 nodeEvacInstance nl il ChangeAll
924 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
927 let no_nodes = Left "no nodes available"
928 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
929 (nl', il', ops, _) <-
930 annotateResult "Can't find any good nodes for relocation" $
933 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
937 -- we don't need more details (which
938 -- nodes, etc.) as we only selected
939 -- this group if we can allocate on
940 -- it, hence failures will not
941 -- propagate out of this fold loop
942 Left _ -> Left $ "Allocation failed: " ++ msg
943 Ok result@(_, _, _, new_cv) ->
944 let new_accu = Right result in
947 Right (_, _, _, old_cv) ->
951 ) no_nodes node_pairs
953 return (nl', il', ops)
955 -- | Generic function for changing one node of an instance.
957 -- This is similar to 'nodeEvacInstance' but will be used in a few of
958 -- its sub-patterns. It folds the inner function 'evacOneNodeInner'
959 -- over the list of available nodes, which results in the best choice
961 evacOneNodeOnly :: Node.List -- ^ The node list (cluster-wide)
962 -> Instance.List -- ^ Instance list (cluster-wide)
963 -> Instance.Instance -- ^ The instance to be evacuated
964 -> Gdx -- ^ The group we're targetting
965 -> [Ndx] -- ^ The list of available nodes
967 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
968 evacOneNodeOnly nl il inst gdx avail_nodes = do
969 op_fn <- case templateMirrorType (Instance.diskTemplate inst) of
970 MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
971 MirrorInternal -> Ok ReplaceSecondary
972 MirrorExternal -> Ok FailoverToAny
973 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $
975 foldl' (evacOneNodeInner nl inst gdx op_fn)
976 (Left "no nodes available") avail_nodes
977 let idx = Instance.idx inst
978 il' = Container.add idx inst' il
979 ops = iMoveToJob nl' il' idx (op_fn ndx)
980 return (nl', il', ops)
982 -- | Inner fold function for changing one node of an instance.
984 -- Depending on the instance disk template, this will either change
985 -- the secondary (for DRBD) or the primary node (for shared
986 -- storage). However, the operation is generic otherwise.
988 -- The running solution is either a @Left String@, which means we
989 -- don't have yet a working solution, or a @Right (...)@, which
990 -- represents a valid solution; it holds the modified node list, the
991 -- modified instance (after evacuation), the score of that solution,
992 -- and the new secondary node index.
993 evacOneNodeInner :: Node.List -- ^ Cluster node list
994 -> Instance.Instance -- ^ Instance being evacuated
995 -> Gdx -- ^ The group index of the instance
996 -> (Ndx -> IMove) -- ^ Operation constructor
997 -> EvacInnerState -- ^ Current best solution
998 -> Ndx -- ^ Node we're evaluating as target
999 -> EvacInnerState -- ^ New best solution
1000 evacOneNodeInner nl inst gdx op_fn accu ndx =
1001 case applyMove nl inst (op_fn ndx) of
1005 Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++
1006 " failed: " ++ show fm
1007 OpGood (nl', inst', _, _) ->
1008 let nodes = Container.elems nl'
1009 -- The fromJust below is ugly (it can fail nastily), but
1010 -- at this point we should have any internal mismatches,
1011 -- and adding a monad here would be quite involved
1012 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1013 new_cv = compCVNodes grpnodes
1014 new_accu = Right (nl', inst', new_cv, ndx)
1017 Right (_, _, old_cv, _) ->
1022 -- | Compute result of changing all nodes of a DRBD instance.
1024 -- Given the target primary and secondary node (which might be in a
1025 -- different group or not), this function will 'execute' all the
1026 -- required steps and assuming all operations succceed, will return
1027 -- the modified node and instance lists, the opcodes needed for this
1028 -- and the new group score.
1029 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1030 -> Instance.List -- ^ Cluster instance list
1031 -> Instance.Instance -- ^ The instance to be moved
1032 -> Gdx -- ^ The target group index
1033 -- (which can differ from the
1034 -- current group of the
1036 -> (Ndx, Ndx) -- ^ Tuple of new
1037 -- primary\/secondary nodes
1038 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1039 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1040 let primary = Container.find (Instance.pNode inst) nl
1041 idx = Instance.idx inst
1042 -- if the primary is offline, then we first failover
1043 (nl1, inst1, ops1) <-
1044 if Node.offline primary
1046 (nl', inst', _, _) <-
1047 annotateResult "Failing over to the secondary" $
1048 opToResult $ applyMove nl inst Failover
1049 return (nl', inst', [Failover])
1050 else return (nl, inst, [])
1051 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1053 ReplaceSecondary t_sdx)
1054 -- we now need to execute a replace secondary to the future
1056 (nl2, inst2, _, _) <-
1057 annotateResult "Changing secondary to new primary" $
1059 applyMove nl1 inst1 o1
1061 -- we now execute another failover, the primary stays fixed now
1062 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
1063 opToResult $ applyMove nl2 inst2 o2
1065 -- and finally another replace secondary, to the final secondary
1066 (nl4, inst4, _, _) <-
1067 annotateResult "Changing secondary to final secondary" $
1069 applyMove nl3 inst3 o3
1071 il' = Container.add idx inst4 il
1072 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1073 let nodes = Container.elems nl4
1074 -- The fromJust below is ugly (it can fail nastily), but
1075 -- at this point we should have any internal mismatches,
1076 -- and adding a monad here would be quite involved
1077 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1078 new_cv = compCVNodes grpnodes
1079 return (nl4, il', ops, new_cv)
1081 -- | Computes the nodes in a given group which are available for
1083 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1084 -> IntSet.IntSet -- ^ Nodes that are excluded
1085 -> Gdx -- ^ The group for which we
1087 -> Result [Ndx] -- ^ List of available node indices
1088 availableGroupNodes group_nodes excl_ndx gdx = do
1089 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1090 Ok (lookup gdx group_nodes)
1091 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1094 -- | Updates the evac solution with the results of an instance
1096 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1098 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1099 -> (Node.List, Instance.List, EvacSolution)
1100 updateEvacSolution (nl, il, es) idx (Bad msg) =
1101 (nl, il, es { esFailed = (idx, msg):esFailed es})
1102 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1103 (nl, il, es { esMoved = new_elem:esMoved es
1104 , esOpCodes = opcodes:esOpCodes es })
1105 where inst = Container.find idx il
1107 instancePriGroup nl inst,
1108 Instance.allNodes inst)
1110 -- | Node-evacuation IAllocator mode main function.
1111 tryNodeEvac :: Group.List -- ^ The cluster groups
1112 -> Node.List -- ^ The node list (cluster-wide, not per group)
1113 -> Instance.List -- ^ Instance list (cluster-wide)
1114 -> EvacMode -- ^ The evacuation mode
1115 -> [Idx] -- ^ List of instance (indices) to be evacuated
1116 -> Result (Node.List, Instance.List, EvacSolution)
1117 tryNodeEvac _ ini_nl ini_il mode idxs =
1118 let evac_ndx = nodesToEvacuate ini_il mode idxs
1119 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1120 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1121 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1122 (Container.elems nl))) $
1123 splitCluster ini_nl ini_il
1124 (fin_nl, fin_il, esol) =
1125 foldl' (\state@(nl, il, _) inst ->
1126 let gdx = instancePriGroup nl inst
1127 pdx = Instance.pNode inst in
1128 updateEvacSolution state (Instance.idx inst) $
1129 availableGroupNodes group_ndx
1130 (IntSet.insert pdx excl_ndx) gdx >>=
1131 nodeEvacInstance nl il mode inst gdx
1133 (ini_nl, ini_il, emptyEvacSolution)
1134 (map (`Container.find` ini_il) idxs)
1135 in return (fin_nl, fin_il, reverseEvacSolution esol)
1137 -- | Change-group IAllocator mode main function.
1139 -- This is very similar to 'tryNodeEvac', the only difference is that
1140 -- we don't choose as target group the current instance group, but
1143 -- 1. at the start of the function, we compute which are the target
1144 -- groups; either no groups were passed in, in which case we choose
1145 -- all groups out of which we don't evacuate instance, or there were
1146 -- some groups passed, in which case we use those
1148 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1149 -- best group to hold the instance, and then we do what
1150 -- 'tryNodeEvac' does, except for this group instead of the current
1153 -- Note that the correct behaviour of this function relies on the
1154 -- function 'nodeEvacInstance' to be able to do correctly both
1155 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1156 tryChangeGroup :: Group.List -- ^ The cluster groups
1157 -> Node.List -- ^ The node list (cluster-wide)
1158 -> Instance.List -- ^ Instance list (cluster-wide)
1159 -> [Gdx] -- ^ Target groups; if empty, any
1160 -- groups not being evacuated
1161 -> [Idx] -- ^ List of instance (indices) to be evacuated
1162 -> Result (Node.List, Instance.List, EvacSolution)
1163 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1164 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1165 flip Container.find ini_il) idxs
1166 target_gdxs = (if null gdxs
1167 then Container.keys gl
1168 else gdxs) \\ evac_gdxs
1169 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1170 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1171 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1172 (Container.elems nl))) $
1173 splitCluster ini_nl ini_il
1174 (fin_nl, fin_il, esol) =
1175 foldl' (\state@(nl, il, _) inst ->
1177 let ncnt = Instance.requiredNodes $
1178 Instance.diskTemplate inst
1179 (gdx, _, _) <- findBestAllocGroup gl nl il
1180 (Just target_gdxs) inst ncnt
1181 av_nodes <- availableGroupNodes group_ndx
1183 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1184 in updateEvacSolution state (Instance.idx inst) solution
1186 (ini_nl, ini_il, emptyEvacSolution)
1187 (map (`Container.find` ini_il) idxs)
1188 in return (fin_nl, fin_il, reverseEvacSolution esol)
1190 -- | Standard-sized allocation method.
1192 -- This places instances of the same size on the cluster until we're
1193 -- out of space. The result will be a list of identically-sized
1195 iterateAlloc :: AllocMethod
1196 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1197 let depth = length ixes
1198 newname = printf "new-%d" depth::String
1199 newidx = Container.size il
1200 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1201 newlimit = fmap (flip (-) 1) limit
1202 in case tryAlloc nl il newi2 allocnodes of
1204 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1205 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1208 Just (xnl, xi, _, _) ->
1211 else iterateAlloc xnl (Container.add newidx xi il)
1212 newlimit newinst allocnodes (xi:ixes)
1213 (totalResources xnl:cstats)
1215 -- | Tiered allocation method.
1217 -- This places instances on the cluster, and decreases the spec until
1218 -- we can allocate again. The result will be a list of decreasing
1220 tieredAlloc :: AllocMethod
1221 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1222 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1224 Ok (errs, nl', il', ixes', cstats') ->
1225 let newsol = Ok (errs, nl', il', ixes', cstats')
1226 ixes_cnt = length ixes'
1227 (stop, newlimit) = case limit of
1228 Nothing -> (False, Nothing)
1229 Just n -> (n <= ixes_cnt,
1230 Just (n - ixes_cnt)) in
1231 if stop then newsol else
1232 case Instance.shrinkByType newinst . fst . last $
1233 sortBy (comparing snd) errs of
1235 Ok newinst' -> tieredAlloc nl' il' newlimit
1236 newinst' allocnodes ixes' cstats'
1238 -- * Formatting functions
1240 -- | Given the original and final nodes, computes the relocation description.
1241 computeMoves :: Instance.Instance -- ^ The instance to be moved
1242 -> String -- ^ The instance name
1243 -> IMove -- ^ The move being performed
1244 -> String -- ^ New primary
1245 -> String -- ^ New secondary
1246 -> (String, [String])
1247 -- ^ Tuple of moves and commands list; moves is containing
1248 -- either @/f/@ for failover or @/r:name/@ for replace
1249 -- secondary, while the command list holds gnt-instance
1250 -- commands (without that prefix), e.g \"@failover instance1@\"
1251 computeMoves i inam mv c d =
1253 Failover -> ("f", [mig])
1254 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1255 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1256 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1257 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1258 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1259 where morf = if Instance.isRunning i then "migrate" else "failover"
1260 mig = printf "%s -f %s" morf inam::String
1261 mig_any = printf "%s -f -n %s %s" morf c inam
1262 rep n = printf "replace-disks -n %s %s" n inam
1264 -- | Converts a placement to string format.
1265 printSolutionLine :: Node.List -- ^ The node list
1266 -> Instance.List -- ^ The instance list
1267 -> Int -- ^ Maximum node name length
1268 -> Int -- ^ Maximum instance name length
1269 -> Placement -- ^ The current placement
1270 -> Int -- ^ The index of the placement in
1272 -> (String, [String])
1273 printSolutionLine nl il nmlen imlen plc pos =
1274 let pmlen = (2*nmlen + 1)
1275 (i, p, s, mv, c) = plc
1276 old_sec = Instance.sNode inst
1277 inst = Container.find i il
1278 inam = Instance.alias inst
1279 npri = Node.alias $ Container.find p nl
1280 nsec = Node.alias $ Container.find s nl
1281 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1282 osec = Node.alias $ Container.find old_sec nl
1283 (moves, cmds) = computeMoves inst inam mv npri nsec
1284 -- FIXME: this should check instead/also the disk template
1285 ostr = if old_sec == Node.noSecondary
1286 then printf "%s" opri
1287 else printf "%s:%s" opri osec
1288 nstr = if s == Node.noSecondary
1289 then printf "%s" npri
1290 else printf "%s:%s" npri nsec
1291 in (printf " %3d. %-*s %-*s => %-*s %12.8f a=%s"
1292 pos imlen inam pmlen (ostr::String)
1293 pmlen (nstr::String) c moves,
1296 -- | Return the instance and involved nodes in an instance move.
1298 -- Note that the output list length can vary, and is not required nor
1299 -- guaranteed to be of any specific length.
1300 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1301 -- the instance from its index; note
1302 -- that this /must/ be the original
1303 -- instance list, so that we can
1304 -- retrieve the old nodes
1305 -> Placement -- ^ The placement we're investigating,
1306 -- containing the new nodes and
1308 -> [Ndx] -- ^ Resulting list of node indices
1309 involvedNodes il plc =
1310 let (i, np, ns, _, _) = plc
1311 inst = Container.find i il
1312 in nub $ [np, ns] ++ Instance.allNodes inst
1314 -- | Inner function for splitJobs, that either appends the next job to
1315 -- the current jobset, or starts a new jobset.
1316 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1317 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1318 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1319 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1320 | otherwise = ([n]:cjs, ndx)
1322 -- | Break a list of moves into independent groups. Note that this
1323 -- will reverse the order of jobs.
1324 splitJobs :: [MoveJob] -> [JobSet]
1325 splitJobs = fst . foldl mergeJobs ([], [])
1327 -- | Given a list of commands, prefix them with @gnt-instance@ and
1328 -- also beautify the display a little.
1329 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1330 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1332 printf " echo job %d/%d" jsn sn:
1334 map (" gnt-instance " ++) cmds
1336 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1339 -- | Given a list of commands, prefix them with @gnt-instance@ and
1340 -- also beautify the display a little.
1341 formatCmds :: [JobSet] -> String
1344 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1348 -- | Print the node list.
1349 printNodes :: Node.List -> [String] -> String
1351 let fields = case fs of
1352 [] -> Node.defaultFields
1353 "+":rest -> Node.defaultFields ++ rest
1355 snl = sortBy (comparing Node.idx) (Container.elems nl)
1356 (header, isnum) = unzip $ map Node.showHeader fields
1357 in printTable "" header (map (Node.list fields) snl) isnum
1359 -- | Print the instance list.
1360 printInsts :: Node.List -> Instance.List -> String
1362 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1363 helper inst = [ if Instance.isRunning inst then "R" else " "
1364 , Instance.name inst
1365 , Container.nameOf nl (Instance.pNode inst)
1366 , let sdx = Instance.sNode inst
1367 in if sdx == Node.noSecondary
1369 else Container.nameOf nl sdx
1370 , if Instance.autoBalance inst then "Y" else "N"
1371 , printf "%3d" $ Instance.vcpus inst
1372 , printf "%5d" $ Instance.mem inst
1373 , printf "%5d" $ Instance.dsk inst `div` 1024
1379 where DynUtil lC lM lD lN = Instance.util inst
1380 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1381 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1382 isnum = False:False:False:False:False:repeat True
1383 in printTable "" header (map helper sil) isnum
1385 -- | Shows statistics for a given node list.
1386 printStats :: String -> Node.List -> String
1388 let dcvs = compDetailedCV $ Container.elems nl
1389 (weights, names) = unzip detailedCVInfo
1390 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1391 header = [ "Field", "Value", "Weight" ]
1392 formatted = map (\(w, h, val) ->
1397 in printTable lp header formatted $ False:repeat True
1399 -- | Convert a placement into a list of OpCodes (basically a job).
1400 iMoveToJob :: Node.List -- ^ The node list; only used for node
1401 -- names, so any version is good
1402 -- (before or after the operation)
1403 -> Instance.List -- ^ The instance list; also used for
1405 -> Idx -- ^ The index of the instance being
1407 -> IMove -- ^ The actual move to be described
1408 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1410 iMoveToJob nl il idx move =
1411 let inst = Container.find idx il
1412 iname = Instance.name inst
1413 lookNode = Just . Container.nameOf nl
1414 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1415 opFA n = OpCodes.OpInstanceMigrate iname True False True (lookNode n)
1416 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1417 OpCodes.ReplaceNewSecondary [] Nothing
1420 FailoverToAny np -> [ opFA np ]
1421 ReplacePrimary np -> [ opF, opR np, opF ]
1422 ReplaceSecondary ns -> [ opR ns ]
1423 ReplaceAndFailover np -> [ opR np, opF ]
1424 FailoverAndReplace ns -> [ opF, opR ns ]
1426 -- * Node group functions
1428 -- | Computes the group of an instance.
1429 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1430 instanceGroup nl i =
1431 let sidx = Instance.sNode i
1432 pnode = Container.find (Instance.pNode i) nl
1433 snode = if sidx == Node.noSecondary
1435 else Container.find sidx nl
1436 pgroup = Node.group pnode
1437 sgroup = Node.group snode
1438 in if pgroup /= sgroup
1439 then fail ("Instance placed accross two node groups, primary " ++
1440 show pgroup ++ ", secondary " ++ show sgroup)
1443 -- | Computes the group of an instance per the primary node.
1444 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1445 instancePriGroup nl i =
1446 let pnode = Container.find (Instance.pNode i) nl
1449 -- | Compute the list of badly allocated instances (split across node
1451 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1452 findSplitInstances nl =
1453 filter (not . isOk . instanceGroup nl) . Container.elems
1455 -- | Splits a cluster into the component node groups.
1456 splitCluster :: Node.List -> Instance.List ->
1457 [(Gdx, (Node.List, Instance.List))]
1458 splitCluster nl il =
1459 let ngroups = Node.computeGroups (Container.elems nl)
1460 in map (\(guuid, nodes) ->
1461 let nidxs = map Node.idx nodes
1462 nodes' = zip nidxs nodes
1463 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1464 in (guuid, (Container.fromList nodes', instances))) ngroups
1466 -- | Compute the list of nodes that are to be evacuated, given a list
1467 -- of instances and an evacuation mode.
1468 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1469 -> EvacMode -- ^ The evacuation mode we're using
1470 -> [Idx] -- ^ List of instance indices being evacuated
1471 -> IntSet.IntSet -- ^ Set of node indices
1472 nodesToEvacuate il mode =
1473 IntSet.delete Node.noSecondary .
1475 let i = Container.find idx il
1476 pdx = Instance.pNode i
1477 sdx = Instance.sNode i
1478 dt = Instance.diskTemplate i
1479 withSecondary = case dt of
1480 DTDrbd8 -> IntSet.insert sdx ns
1483 ChangePrimary -> IntSet.insert pdx ns
1484 ChangeSecondary -> withSecondary
1485 ChangeAll -> IntSet.insert pdx withSecondary