1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
39 -- * Generic functions
41 , computeAllocationDelta
42 -- * First phase functions
44 -- * Second phase functions
49 -- * Display functions
52 -- * Balacing functions
61 -- * IAllocator functions
69 -- * Allocation functions
72 -- * Node group functions
78 import qualified Data.IntSet as IntSet
80 import Data.Maybe (fromJust, isNothing)
81 import Data.Ord (comparing)
82 import Text.Printf (printf)
84 import Ganeti.BasicTypes
85 import qualified Ganeti.HTools.Container as Container
86 import qualified Ganeti.HTools.Instance as Instance
87 import qualified Ganeti.HTools.Node as Node
88 import qualified Ganeti.HTools.Group as Group
89 import Ganeti.HTools.Types
91 import qualified Ganeti.OpCodes as OpCodes
96 -- | Allocation\/relocation solution.
97 data AllocSolution = AllocSolution
98 { asFailures :: [FailMode] -- ^ Failure counts
99 , asAllocs :: Int -- ^ Good allocation count
100 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
101 , asLog :: [String] -- ^ Informational messages
104 -- | Node evacuation/group change iallocator result type. This result
105 -- type consists of actual opcodes (a restricted subset) that are
106 -- transmitted back to Ganeti.
107 data EvacSolution = EvacSolution
108 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
109 , esFailed :: [(Idx, String)] -- ^ Instances which were not
111 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
114 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
115 type AllocResult = (FailStats, Node.List, Instance.List,
116 [Instance.Instance], [CStats])
118 -- | Type alias for easier handling.
119 type AllocSolutionList = [(Instance.Instance, AllocSolution)]
121 -- | A type denoting the valid allocation mode/pairs.
123 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
124 -- for a two-node allocation, this will be a @Right [('Ndx',
125 -- ['Ndx'])]@. In the latter case, the list is basically an
126 -- association list, grouped by primary node and holding the potential
127 -- secondary nodes in the sub-list.
128 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
130 -- | The empty solution we start with when computing allocations.
131 emptyAllocSolution :: AllocSolution
132 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
133 , asSolution = Nothing, asLog = [] }
135 -- | The empty evac solution.
136 emptyEvacSolution :: EvacSolution
137 emptyEvacSolution = EvacSolution { esMoved = []
142 -- | The complete state for the balancing solution.
143 data Table = Table Node.List Instance.List Score [Placement]
144 deriving (Show, Read)
146 -- | Cluster statistics data type.
148 { csFmem :: Integer -- ^ Cluster free mem
149 , csFdsk :: Integer -- ^ Cluster free disk
150 , csAmem :: Integer -- ^ Cluster allocatable mem
151 , csAdsk :: Integer -- ^ Cluster allocatable disk
152 , csAcpu :: Integer -- ^ Cluster allocatable cpus
153 , csMmem :: Integer -- ^ Max node allocatable mem
154 , csMdsk :: Integer -- ^ Max node allocatable disk
155 , csMcpu :: Integer -- ^ Max node allocatable cpu
156 , csImem :: Integer -- ^ Instance used mem
157 , csIdsk :: Integer -- ^ Instance used disk
158 , csIcpu :: Integer -- ^ Instance used cpu
159 , csTmem :: Double -- ^ Cluster total mem
160 , csTdsk :: Double -- ^ Cluster total disk
161 , csTcpu :: Double -- ^ Cluster total cpus
162 , csVcpu :: Integer -- ^ Cluster total virtual cpus
163 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
164 -- physical CPUs, i.e. normalised used phys CPUs
165 , csXmem :: Integer -- ^ Unnacounted for mem
166 , csNmem :: Integer -- ^ Node own memory
167 , csScore :: Score -- ^ The cluster score
168 , csNinst :: Int -- ^ The total number of instances
169 } deriving (Show, Read)
171 -- | A simple type for allocation functions.
172 type AllocMethod = Node.List -- ^ Node list
173 -> Instance.List -- ^ Instance list
174 -> Maybe Int -- ^ Optional allocation limit
175 -> Instance.Instance -- ^ Instance spec for allocation
176 -> AllocNodes -- ^ Which nodes we should allocate on
177 -> [Instance.Instance] -- ^ Allocated instances
178 -> [CStats] -- ^ Running cluster stats
179 -> Result AllocResult -- ^ Allocation result
181 -- | A simple type for the running solution of evacuations.
182 type EvacInnerState =
183 Either String (Node.List, Instance.Instance, Score, Ndx)
185 -- * Utility functions
187 -- | Verifies the N+1 status and return the affected nodes.
188 verifyN1 :: [Node.Node] -> [Node.Node]
189 verifyN1 = filter Node.failN1
191 {-| Computes the pair of bad nodes and instances.
193 The bad node list is computed via a simple 'verifyN1' check, and the
194 bad instance list is the list of primary and secondary instances of
198 computeBadItems :: Node.List -> Instance.List ->
199 ([Node.Node], [Instance.Instance])
200 computeBadItems nl il =
201 let bad_nodes = verifyN1 $ getOnline nl
202 bad_instances = map (`Container.find` il) .
204 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
206 (bad_nodes, bad_instances)
208 -- | Extracts the node pairs for an instance. This can fail if the
209 -- instance is single-homed. FIXME: this needs to be improved,
210 -- together with the general enhancement for handling non-DRBD moves.
211 instanceNodes :: Node.List -> Instance.Instance ->
212 (Ndx, Ndx, Node.Node, Node.Node)
213 instanceNodes nl inst =
214 let old_pdx = Instance.pNode inst
215 old_sdx = Instance.sNode inst
216 old_p = Container.find old_pdx nl
217 old_s = Container.find old_sdx nl
218 in (old_pdx, old_sdx, old_p, old_s)
220 -- | Zero-initializer for the CStats type.
221 emptyCStats :: CStats
222 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
224 -- | Update stats with data from a new node.
225 updateCStats :: CStats -> Node.Node -> CStats
226 updateCStats cs node =
227 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
228 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
229 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
230 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
231 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
232 csVcpu = x_vcpu, csNcpu = x_ncpu,
233 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
236 inc_amem = Node.fMem node - Node.rMem node
237 inc_amem' = if inc_amem > 0 then inc_amem else 0
238 inc_adsk = Node.availDisk node
239 inc_imem = truncate (Node.tMem node) - Node.nMem node
240 - Node.xMem node - Node.fMem node
241 inc_icpu = Node.uCpu node
242 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
243 inc_vcpu = Node.hiCpu node
244 inc_acpu = Node.availCpu node
245 inc_ncpu = fromIntegral (Node.uCpu node) /
246 iPolicyVcpuRatio (Node.iPolicy node)
247 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
248 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
249 , csAmem = x_amem + fromIntegral inc_amem'
250 , csAdsk = x_adsk + fromIntegral inc_adsk
251 , csAcpu = x_acpu + fromIntegral inc_acpu
252 , csMmem = max x_mmem (fromIntegral inc_amem')
253 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
254 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
255 , csImem = x_imem + fromIntegral inc_imem
256 , csIdsk = x_idsk + fromIntegral inc_idsk
257 , csIcpu = x_icpu + fromIntegral inc_icpu
258 , csTmem = x_tmem + Node.tMem node
259 , csTdsk = x_tdsk + Node.tDsk node
260 , csTcpu = x_tcpu + Node.tCpu node
261 , csVcpu = x_vcpu + fromIntegral inc_vcpu
262 , csNcpu = x_ncpu + inc_ncpu
263 , csXmem = x_xmem + fromIntegral (Node.xMem node)
264 , csNmem = x_nmem + fromIntegral (Node.nMem node)
265 , csNinst = x_ninst + length (Node.pList node)
268 -- | Compute the total free disk and memory in the cluster.
269 totalResources :: Node.List -> CStats
271 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
272 in cs { csScore = compCV nl }
274 -- | Compute the delta between two cluster state.
276 -- This is used when doing allocations, to understand better the
277 -- available cluster resources. The return value is a triple of the
278 -- current used values, the delta that was still allocated, and what
279 -- was left unallocated.
280 computeAllocationDelta :: CStats -> CStats -> AllocStats
281 computeAllocationDelta cini cfin =
282 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
283 csNcpu = i_ncpu } = cini
284 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
285 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
286 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
287 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
288 , allocInfoNCpus = i_ncpu
289 , allocInfoMem = fromIntegral i_imem
290 , allocInfoDisk = fromIntegral i_idsk
292 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
293 , allocInfoNCpus = f_ncpu - i_ncpu
294 , allocInfoMem = fromIntegral (f_imem - i_imem)
295 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
297 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
298 , allocInfoNCpus = f_tcpu - f_ncpu
299 , allocInfoMem = truncate t_mem - fromIntegral f_imem
300 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
302 in (rini, rfin, runa)
304 -- | The names and weights of the individual elements in the CV list.
305 detailedCVInfo :: [(Double, String)]
306 detailedCVInfo = [ (1, "free_mem_cv")
307 , (1, "free_disk_cv")
309 , (1, "reserved_mem_cv")
310 , (4, "offline_all_cnt")
311 , (16, "offline_pri_cnt")
312 , (1, "vcpu_ratio_cv")
315 , (1, "disk_load_cv")
317 , (2, "pri_tags_score")
321 -- | Holds the weights used by 'compCVNodes' for each metric.
322 detailedCVWeights :: [Double]
323 detailedCVWeights = map fst detailedCVInfo
325 -- | Compute the mem and disk covariance.
326 compDetailedCV :: [Node.Node] -> [Double]
327 compDetailedCV all_nodes =
328 let (offline, nodes) = partition Node.offline all_nodes
329 mem_l = map Node.pMem nodes
330 dsk_l = map Node.pDsk nodes
331 -- metric: memory covariance
332 mem_cv = stdDev mem_l
333 -- metric: disk covariance
334 dsk_cv = stdDev dsk_l
335 -- metric: count of instances living on N1 failing nodes
336 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
337 length (Node.pList n)) .
338 filter Node.failN1 $ nodes :: Double
339 res_l = map Node.pRem nodes
340 -- metric: reserved memory covariance
341 res_cv = stdDev res_l
342 -- offline instances metrics
343 offline_ipri = sum . map (length . Node.pList) $ offline
344 offline_isec = sum . map (length . Node.sList) $ offline
345 -- metric: count of instances on offline nodes
346 off_score = fromIntegral (offline_ipri + offline_isec)::Double
347 -- metric: count of primary instances on offline nodes (this
348 -- helps with evacuation/failover of primary instances on
349 -- 2-node clusters with one node offline)
350 off_pri_score = fromIntegral offline_ipri::Double
351 cpu_l = map Node.pCpu nodes
352 -- metric: covariance of vcpu/pcpu ratio
353 cpu_cv = stdDev cpu_l
354 -- metrics: covariance of cpu, memory, disk and network load
355 (c_load, m_load, d_load, n_load) =
357 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
358 DynUtil c2 m2 d2 n2 = Node.utilPool n
359 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
360 -- metric: conflicting instance count
361 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
362 pri_tags_score = fromIntegral pri_tags_inst::Double
363 -- metric: spindles %
364 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
365 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
366 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
367 , pri_tags_score, stdDev spindles_cv ]
369 -- | Compute the /total/ variance.
370 compCVNodes :: [Node.Node] -> Double
371 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
373 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
374 compCV :: Node.List -> Double
375 compCV = compCVNodes . Container.elems
377 -- | Compute online nodes from a 'Node.List'.
378 getOnline :: Node.List -> [Node.Node]
379 getOnline = filter (not . Node.offline) . Container.elems
381 -- * Balancing functions
383 -- | Compute best table. Note that the ordering of the arguments is important.
384 compareTables :: Table -> Table -> Table
385 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
386 if a_cv > b_cv then b else a
388 -- | Applies an instance move to a given node list and instance.
389 applyMove :: Node.List -> Instance.Instance
390 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
392 applyMove nl inst Failover =
393 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
394 int_p = Node.removePri old_p inst
395 int_s = Node.removeSec old_s inst
396 new_nl = do -- Maybe monad
397 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
398 new_s <- Node.addSec int_p inst old_sdx
399 let new_inst = Instance.setBoth inst old_sdx old_pdx
400 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
401 new_inst, old_sdx, old_pdx)
404 -- Failover to any (fa)
405 applyMove nl inst (FailoverToAny new_pdx) = do
406 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
407 new_pnode = Container.find new_pdx nl
408 force_failover = Node.offline old_pnode
409 new_pnode' <- Node.addPriEx force_failover new_pnode inst
410 let old_pnode' = Node.removePri old_pnode inst
411 inst' = Instance.setPri inst new_pdx
412 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
413 return (nl', inst', new_pdx, old_sdx)
415 -- Replace the primary (f:, r:np, f)
416 applyMove nl inst (ReplacePrimary new_pdx) =
417 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
418 tgt_n = Container.find new_pdx nl
419 int_p = Node.removePri old_p inst
420 int_s = Node.removeSec old_s inst
421 force_p = Node.offline old_p
422 new_nl = do -- Maybe monad
423 -- check that the current secondary can host the instance
424 -- during the migration
425 tmp_s <- Node.addPriEx force_p int_s inst
426 let tmp_s' = Node.removePri tmp_s inst
427 new_p <- Node.addPriEx force_p tgt_n inst
428 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
429 let new_inst = Instance.setPri inst new_pdx
430 return (Container.add new_pdx new_p $
431 Container.addTwo old_pdx int_p old_sdx new_s nl,
432 new_inst, new_pdx, old_sdx)
435 -- Replace the secondary (r:ns)
436 applyMove nl inst (ReplaceSecondary new_sdx) =
437 let old_pdx = Instance.pNode inst
438 old_sdx = Instance.sNode inst
439 old_s = Container.find old_sdx nl
440 tgt_n = Container.find new_sdx nl
441 int_s = Node.removeSec old_s inst
442 force_s = Node.offline old_s
443 new_inst = Instance.setSec inst new_sdx
444 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
445 \new_s -> return (Container.addTwo new_sdx
446 new_s old_sdx int_s nl,
447 new_inst, old_pdx, new_sdx)
450 -- Replace the secondary and failover (r:np, f)
451 applyMove nl inst (ReplaceAndFailover new_pdx) =
452 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
453 tgt_n = Container.find new_pdx nl
454 int_p = Node.removePri old_p inst
455 int_s = Node.removeSec old_s inst
456 force_s = Node.offline old_s
457 new_nl = do -- Maybe monad
458 new_p <- Node.addPri tgt_n inst
459 new_s <- Node.addSecEx force_s int_p inst new_pdx
460 let new_inst = Instance.setBoth inst new_pdx old_pdx
461 return (Container.add new_pdx new_p $
462 Container.addTwo old_pdx new_s old_sdx int_s nl,
463 new_inst, new_pdx, old_pdx)
466 -- Failver and replace the secondary (f, r:ns)
467 applyMove nl inst (FailoverAndReplace new_sdx) =
468 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
469 tgt_n = Container.find new_sdx nl
470 int_p = Node.removePri old_p inst
471 int_s = Node.removeSec old_s inst
472 force_p = Node.offline old_p
473 new_nl = do -- Maybe monad
474 new_p <- Node.addPriEx force_p int_s inst
475 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
476 let new_inst = Instance.setBoth inst old_sdx new_sdx
477 return (Container.add new_sdx new_s $
478 Container.addTwo old_sdx new_p old_pdx int_p nl,
479 new_inst, old_sdx, new_sdx)
482 -- | Tries to allocate an instance on one given node.
483 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
484 -> OpResult Node.AllocElement
485 allocateOnSingle nl inst new_pdx =
486 let p = Container.find new_pdx nl
487 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
489 Instance.instMatchesPolicy inst (Node.iPolicy p)
490 new_p <- Node.addPri p inst
491 let new_nl = Container.add new_pdx new_p nl
492 new_score = compCV new_nl
493 return (new_nl, new_inst, [new_p], new_score)
495 -- | Tries to allocate an instance on a given pair of nodes.
496 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
497 -> OpResult Node.AllocElement
498 allocateOnPair nl inst new_pdx new_sdx =
499 let tgt_p = Container.find new_pdx nl
500 tgt_s = Container.find new_sdx nl
502 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
503 new_p <- Node.addPri tgt_p inst
504 new_s <- Node.addSec tgt_s inst new_pdx
505 let new_inst = Instance.setBoth inst new_pdx new_sdx
506 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
507 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
509 -- | Tries to perform an instance move and returns the best table
510 -- between the original one and the new one.
511 checkSingleStep :: Table -- ^ The original table
512 -> Instance.Instance -- ^ The instance to move
513 -> Table -- ^ The current best table
514 -> IMove -- ^ The move to apply
515 -> Table -- ^ The final best table
516 checkSingleStep ini_tbl target cur_tbl move =
517 let Table ini_nl ini_il _ ini_plc = ini_tbl
518 tmp_resu = applyMove ini_nl target move
521 Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
522 let tgt_idx = Instance.idx target
523 upd_cvar = compCV upd_nl
524 upd_il = Container.add tgt_idx new_inst ini_il
525 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
526 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
527 in compareTables cur_tbl upd_tbl
529 -- | Given the status of the current secondary as a valid new node and
530 -- the current candidate target node, generate the possible moves for
532 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
533 -> Bool -- ^ Whether the secondary node is a valid new node
534 -> Bool -- ^ Whether we can change the primary node
535 -> Ndx -- ^ Target node candidate
536 -> [IMove] -- ^ List of valid result moves
538 possibleMoves MirrorNone _ _ _ = []
540 possibleMoves MirrorExternal _ False _ = []
542 possibleMoves MirrorExternal _ True tdx =
543 [ FailoverToAny tdx ]
545 possibleMoves MirrorInternal _ False tdx =
546 [ ReplaceSecondary tdx ]
548 possibleMoves MirrorInternal True True tdx =
549 [ ReplaceSecondary tdx
550 , ReplaceAndFailover tdx
552 , FailoverAndReplace tdx
555 possibleMoves MirrorInternal False True tdx =
556 [ ReplaceSecondary tdx
557 , ReplaceAndFailover tdx
560 -- | Compute the best move for a given instance.
561 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
562 -> Bool -- ^ Whether disk moves are allowed
563 -> Bool -- ^ Whether instance moves are allowed
564 -> Table -- ^ Original table
565 -> Instance.Instance -- ^ Instance to move
566 -> Table -- ^ Best new table for this instance
567 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
568 let opdx = Instance.pNode target
569 osdx = Instance.sNode target
570 bad_nodes = [opdx, osdx]
571 nodes = filter (`notElem` bad_nodes) nodes_idx
572 mir_type = Instance.mirrorType target
573 use_secondary = elem osdx nodes_idx && inst_moves
574 aft_failover = if mir_type == MirrorInternal && use_secondary
575 -- if drbd and allowed to failover
576 then checkSingleStep ini_tbl target ini_tbl Failover
580 then concatMap (possibleMoves mir_type use_secondary inst_moves)
584 -- iterate over the possible nodes for this instance
585 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
587 -- | Compute the best next move.
588 checkMove :: [Ndx] -- ^ Allowed target node indices
589 -> Bool -- ^ Whether disk moves are allowed
590 -> Bool -- ^ Whether instance moves are allowed
591 -> Table -- ^ The current solution
592 -> [Instance.Instance] -- ^ List of instances still to move
593 -> Table -- ^ The new solution
594 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
595 let Table _ _ _ ini_plc = ini_tbl
596 -- we're using rwhnf from the Control.Parallel.Strategies
597 -- package; we don't need to use rnf as that would force too
598 -- much evaluation in single-threaded cases, and in
599 -- multi-threaded case the weak head normal form is enough to
600 -- spark the evaluation
601 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
604 -- iterate over all instances, computing the best move
605 best_tbl = foldl' compareTables ini_tbl tables
606 Table _ _ _ best_plc = best_tbl
607 in if length best_plc == length ini_plc
608 then ini_tbl -- no advancement
611 -- | Check if we are allowed to go deeper in the balancing.
612 doNextBalance :: Table -- ^ The starting table
613 -> Int -- ^ Remaining length
614 -> Score -- ^ Score at which to stop
615 -> Bool -- ^ The resulting table and commands
616 doNextBalance ini_tbl max_rounds min_score =
617 let Table _ _ ini_cv ini_plc = ini_tbl
618 ini_plc_len = length ini_plc
619 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
621 -- | Run a balance move.
622 tryBalance :: Table -- ^ The starting table
623 -> Bool -- ^ Allow disk moves
624 -> Bool -- ^ Allow instance moves
625 -> Bool -- ^ Only evacuate moves
626 -> Score -- ^ Min gain threshold
627 -> Score -- ^ Min gain
628 -> Maybe Table -- ^ The resulting table and commands
629 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
630 let Table ini_nl ini_il ini_cv _ = ini_tbl
631 all_inst = Container.elems ini_il
632 all_nodes = Container.elems ini_nl
633 (offline_nodes, online_nodes) = partition Node.offline all_nodes
634 all_inst' = if evac_mode
635 then let bad_nodes = map Node.idx offline_nodes
636 in filter (any (`elem` bad_nodes) .
637 Instance.allNodes) all_inst
639 reloc_inst = filter Instance.movable all_inst'
640 node_idx = map Node.idx online_nodes
641 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
642 (Table _ _ fin_cv _) = fin_tbl
644 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
645 then Just fin_tbl -- this round made success, return the new table
648 -- * Allocation functions
650 -- | Build failure stats out of a list of failures.
651 collapseFailures :: [FailMode] -> FailStats
652 collapseFailures flst =
653 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
656 -- | Compares two Maybe AllocElement and chooses the besst score.
657 bestAllocElement :: Maybe Node.AllocElement
658 -> Maybe Node.AllocElement
659 -> Maybe Node.AllocElement
660 bestAllocElement a Nothing = a
661 bestAllocElement Nothing b = b
662 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
663 if ascore < bscore then a else b
665 -- | Update current Allocation solution and failure stats with new
667 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
668 concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
670 concatAllocs as (Ok ns) =
671 let -- Choose the old or new solution, based on the cluster score
673 osols = asSolution as
674 nsols = bestAllocElement osols (Just ns)
676 -- Note: we force evaluation of nsols here in order to keep the
677 -- memory profile low - we know that we will need nsols for sure
678 -- in the next cycle, so we force evaluation of nsols, since the
679 -- foldl' in the caller will only evaluate the tuple, but not the
680 -- elements of the tuple
681 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
683 -- | Sums two 'AllocSolution' structures.
684 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
685 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
686 (AllocSolution bFails bAllocs bSols bLog) =
687 -- note: we add b first, since usually it will be smaller; when
688 -- fold'ing, a will grow and grow whereas b is the per-group
689 -- result, hence smaller
690 let nFails = bFails ++ aFails
691 nAllocs = aAllocs + bAllocs
692 nSols = bestAllocElement aSols bSols
694 in AllocSolution nFails nAllocs nSols nLog
696 -- | Given a solution, generates a reasonable description for it.
697 describeSolution :: AllocSolution -> String
698 describeSolution as =
699 let fcnt = asFailures as
702 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
703 filter ((> 0) . snd) . collapseFailures $ fcnt
705 Nothing -> "No valid allocation solutions, failure reasons: " ++
706 (if null fcnt then "unknown reasons" else freasons)
707 Just (_, _, nodes, cv) ->
708 printf ("score: %.8f, successes %d, failures %d (%s)" ++
709 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
710 (intercalate "/" . map Node.name $ nodes)
712 -- | Annotates a solution with the appropriate string.
713 annotateSolution :: AllocSolution -> AllocSolution
714 annotateSolution as = as { asLog = describeSolution as : asLog as }
716 -- | Reverses an evacuation solution.
718 -- Rationale: we always concat the results to the top of the lists, so
719 -- for proper jobset execution, we should reverse all lists.
720 reverseEvacSolution :: EvacSolution -> EvacSolution
721 reverseEvacSolution (EvacSolution f m o) =
722 EvacSolution (reverse f) (reverse m) (reverse o)
724 -- | Generate the valid node allocation singles or pairs for a new instance.
725 genAllocNodes :: Group.List -- ^ Group list
726 -> Node.List -- ^ The node map
727 -> Int -- ^ The number of nodes required
728 -> Bool -- ^ Whether to drop or not
730 -> Result AllocNodes -- ^ The (monadic) result
731 genAllocNodes gl nl count drop_unalloc =
732 let filter_fn = if drop_unalloc
733 then filter (Group.isAllocable .
734 flip Container.find gl . Node.group)
736 all_nodes = filter_fn $ getOnline nl
737 all_pairs = [(Node.idx p,
738 [Node.idx s | s <- all_nodes,
739 Node.idx p /= Node.idx s,
740 Node.group p == Node.group s]) |
743 1 -> Ok (Left (map Node.idx all_nodes))
744 2 -> Ok (Right (filter (not . null . snd) all_pairs))
745 _ -> Bad "Unsupported number of nodes, only one or two supported"
747 -- | Try to allocate an instance on the cluster.
748 tryAlloc :: (Monad m) =>
749 Node.List -- ^ The node list
750 -> Instance.List -- ^ The instance list
751 -> Instance.Instance -- ^ The instance to allocate
752 -> AllocNodes -- ^ The allocation targets
753 -> m AllocSolution -- ^ Possible solution list
754 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
755 tryAlloc nl _ inst (Right ok_pairs) =
756 let psols = parMap rwhnf (\(p, ss) ->
758 concatAllocs cstate .
759 allocateOnPair nl inst p)
760 emptyAllocSolution ss) ok_pairs
761 sols = foldl' sumAllocs emptyAllocSolution psols
762 in return $ annotateSolution sols
764 tryAlloc _ _ _ (Left []) = fail "No online nodes"
765 tryAlloc nl _ inst (Left all_nodes) =
766 let sols = foldl' (\cstate ->
767 concatAllocs cstate . allocateOnSingle nl inst
768 ) emptyAllocSolution all_nodes
769 in return $ annotateSolution sols
771 -- | Given a group/result, describe it as a nice (list of) messages.
772 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
773 solutionDescription gl (groupId, result) =
775 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
776 Bad message -> [printf "Group %s: error %s" gname message]
777 where grp = Container.find groupId gl
778 gname = Group.name grp
779 pol = allocPolicyToRaw (Group.allocPolicy grp)
781 -- | From a list of possibly bad and possibly empty solutions, filter
782 -- only the groups with a valid result. Note that the result will be
783 -- reversed compared to the original list.
784 filterMGResults :: Group.List
785 -> [(Gdx, Result AllocSolution)]
786 -> [(Gdx, AllocSolution)]
787 filterMGResults gl = foldl' fn []
788 where unallocable = not . Group.isAllocable . flip Container.find gl
789 fn accu (gdx, rasol) =
792 Ok sol | isNothing (asSolution sol) -> accu
793 | unallocable gdx -> accu
794 | otherwise -> (gdx, sol):accu
796 -- | Sort multigroup results based on policy and score.
797 sortMGResults :: Group.List
798 -> [(Gdx, AllocSolution)]
799 -> [(Gdx, AllocSolution)]
800 sortMGResults gl sols =
801 let extractScore (_, _, _, x) = x
802 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
803 (extractScore . fromJust . asSolution) sol)
804 in sortBy (comparing solScore) sols
806 -- | Finds the best group for an instance on a multi-group cluster.
808 -- Only solutions in @preferred@ and @last_resort@ groups will be
809 -- accepted as valid, and additionally if the allowed groups parameter
810 -- is not null then allocation will only be run for those group
812 findBestAllocGroup :: Group.List -- ^ The group list
813 -> Node.List -- ^ The node list
814 -> Instance.List -- ^ The instance list
815 -> Maybe [Gdx] -- ^ The allowed groups
816 -> Instance.Instance -- ^ The instance to allocate
817 -> Int -- ^ Required number of nodes
818 -> Result (Gdx, AllocSolution, [String])
819 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
820 let groups = splitCluster mgnl mgil
821 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
823 sols = map (\(gid, (nl, il)) ->
824 (gid, genAllocNodes mggl nl cnt False >>=
825 tryAlloc nl il inst))
826 groups'::[(Gdx, Result AllocSolution)]
827 all_msgs = concatMap (solutionDescription mggl) sols
828 goodSols = filterMGResults mggl sols
829 sortedSols = sortMGResults mggl goodSols
830 in if null sortedSols
832 then Bad $ "no groups for evacuation: allowed groups was" ++
833 show allowed_gdxs ++ ", all groups: " ++
834 show (map fst groups)
835 else Bad $ intercalate ", " all_msgs
836 else let (final_group, final_sol) = head sortedSols
837 in return (final_group, final_sol, all_msgs)
839 -- | Try to allocate an instance on a multi-group cluster.
840 tryMGAlloc :: Group.List -- ^ The group list
841 -> Node.List -- ^ The node list
842 -> Instance.List -- ^ The instance list
843 -> Instance.Instance -- ^ The instance to allocate
844 -> Int -- ^ Required number of nodes
845 -> Result AllocSolution -- ^ Possible solution list
846 tryMGAlloc mggl mgnl mgil inst cnt = do
847 (best_group, solution, all_msgs) <-
848 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
849 let group_name = Group.name $ Container.find best_group mggl
850 selmsg = "Selected group: " ++ group_name
851 return $ solution { asLog = selmsg:all_msgs }
853 -- | Calculate the new instance list after allocation solution.
854 updateIl :: Instance.List -- ^ The original instance list
855 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
856 -> Instance.List -- ^ The updated instance list
857 updateIl il Nothing = il
858 updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
860 -- | Extract the the new node list from the allocation solution.
861 extractNl :: Node.List -- ^ The original node list
862 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
863 -> Node.List -- ^ The new node list
864 extractNl nl Nothing = nl
865 extractNl _ (Just (xnl, _, _, _)) = xnl
867 -- | Try to allocate a list of instances on a multi-group cluster.
868 allocList :: Group.List -- ^ The group list
869 -> Node.List -- ^ The node list
870 -> Instance.List -- ^ The instance list
871 -> [(Instance.Instance, Int)] -- ^ The instance to allocate
872 -> AllocSolutionList -- ^ Possible solution list
873 -> Result (Node.List, Instance.List,
874 AllocSolutionList) -- ^ The final solution list
875 allocList _ nl il [] result = Ok (nl, il, result)
876 allocList gl nl il ((xi, xicnt):xies) result = do
877 ares <- tryMGAlloc gl nl il xi xicnt
878 let sol = asSolution ares
879 nl' = extractNl nl sol
880 il' = updateIl il sol
881 allocList gl nl' il' xies ((xi, ares):result)
883 -- | Function which fails if the requested mode is change secondary.
885 -- This is useful since except DRBD, no other disk template can
886 -- execute change secondary; thus, we can just call this function
887 -- instead of always checking for secondary mode. After the call to
888 -- this function, whatever mode we have is just a primary change.
889 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
890 failOnSecondaryChange ChangeSecondary dt =
891 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
892 "' can't execute change secondary"
893 failOnSecondaryChange _ _ = return ()
895 -- | Run evacuation for a single instance.
897 -- /Note:/ this function should correctly execute both intra-group
898 -- evacuations (in all modes) and inter-group evacuations (in the
899 -- 'ChangeAll' mode). Of course, this requires that the correct list
900 -- of target nodes is passed.
901 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
902 -> Instance.List -- ^ Instance list (cluster-wide)
903 -> EvacMode -- ^ The evacuation mode
904 -> Instance.Instance -- ^ The instance to be evacuated
905 -> Gdx -- ^ The group we're targetting
906 -> [Ndx] -- ^ The list of available nodes
908 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
909 nodeEvacInstance nl il mode inst@(Instance.Instance
910 {Instance.diskTemplate = dt@DTDiskless})
912 failOnSecondaryChange mode dt >>
913 evacOneNodeOnly nl il inst gdx avail_nodes
915 nodeEvacInstance _ _ _ (Instance.Instance
916 {Instance.diskTemplate = DTPlain}) _ _ =
917 fail "Instances of type plain cannot be relocated"
919 nodeEvacInstance _ _ _ (Instance.Instance
920 {Instance.diskTemplate = DTFile}) _ _ =
921 fail "Instances of type file cannot be relocated"
923 nodeEvacInstance nl il mode inst@(Instance.Instance
924 {Instance.diskTemplate = dt@DTSharedFile})
926 failOnSecondaryChange mode dt >>
927 evacOneNodeOnly nl il inst gdx avail_nodes
929 nodeEvacInstance nl il mode inst@(Instance.Instance
930 {Instance.diskTemplate = dt@DTBlock})
932 failOnSecondaryChange mode dt >>
933 evacOneNodeOnly nl il inst gdx avail_nodes
935 nodeEvacInstance nl il mode inst@(Instance.Instance
936 {Instance.diskTemplate = dt@DTRbd})
938 failOnSecondaryChange mode dt >>
939 evacOneNodeOnly nl il inst gdx avail_nodes
941 nodeEvacInstance nl il ChangePrimary
942 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
945 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
946 let idx = Instance.idx inst
947 il' = Container.add idx inst' il
948 ops = iMoveToJob nl' il' idx Failover
949 return (nl', il', ops)
951 nodeEvacInstance nl il ChangeSecondary
952 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
954 evacOneNodeOnly nl il inst gdx avail_nodes
956 -- The algorithm for ChangeAll is as follows:
958 -- * generate all (primary, secondary) node pairs for the target groups
959 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
960 -- the final node list state and group score
961 -- * select the best choice via a foldl that uses the same Either
962 -- String solution as the ChangeSecondary mode
963 nodeEvacInstance nl il ChangeAll
964 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
967 let no_nodes = Left "no nodes available"
968 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
969 (nl', il', ops, _) <-
970 annotateResult "Can't find any good nodes for relocation" .
973 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
977 -- we don't need more details (which
978 -- nodes, etc.) as we only selected
979 -- this group if we can allocate on
980 -- it, hence failures will not
981 -- propagate out of this fold loop
982 Left _ -> Left $ "Allocation failed: " ++ msg
983 Ok result@(_, _, _, new_cv) ->
984 let new_accu = Right result in
987 Right (_, _, _, old_cv) ->
991 ) no_nodes node_pairs
993 return (nl', il', ops)
995 -- | Generic function for changing one node of an instance.
997 -- This is similar to 'nodeEvacInstance' but will be used in a few of
998 -- its sub-patterns. It folds the inner function 'evacOneNodeInner'
999 -- over the list of available nodes, which results in the best choice
1001 evacOneNodeOnly :: Node.List -- ^ The node list (cluster-wide)
1002 -> Instance.List -- ^ Instance list (cluster-wide)
1003 -> Instance.Instance -- ^ The instance to be evacuated
1004 -> Gdx -- ^ The group we're targetting
1005 -> [Ndx] -- ^ The list of available nodes
1007 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1008 evacOneNodeOnly nl il inst gdx avail_nodes = do
1009 op_fn <- case Instance.mirrorType inst of
1010 MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1011 MirrorInternal -> Ok ReplaceSecondary
1012 MirrorExternal -> Ok FailoverToAny
1013 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1015 foldl' (evacOneNodeInner nl inst gdx op_fn)
1016 (Left "no nodes available") avail_nodes
1017 let idx = Instance.idx inst
1018 il' = Container.add idx inst' il
1019 ops = iMoveToJob nl' il' idx (op_fn ndx)
1020 return (nl', il', ops)
1022 -- | Inner fold function for changing one node of an instance.
1024 -- Depending on the instance disk template, this will either change
1025 -- the secondary (for DRBD) or the primary node (for shared
1026 -- storage). However, the operation is generic otherwise.
1028 -- The running solution is either a @Left String@, which means we
1029 -- don't have yet a working solution, or a @Right (...)@, which
1030 -- represents a valid solution; it holds the modified node list, the
1031 -- modified instance (after evacuation), the score of that solution,
1032 -- and the new secondary node index.
1033 evacOneNodeInner :: Node.List -- ^ Cluster node list
1034 -> Instance.Instance -- ^ Instance being evacuated
1035 -> Gdx -- ^ The group index of the instance
1036 -> (Ndx -> IMove) -- ^ Operation constructor
1037 -> EvacInnerState -- ^ Current best solution
1038 -> Ndx -- ^ Node we're evaluating as target
1039 -> EvacInnerState -- ^ New best solution
1040 evacOneNodeInner nl inst gdx op_fn accu ndx =
1041 case applyMove nl inst (op_fn ndx) of
1042 Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1043 " failed: " ++ show fm
1044 in either (const $ Left fail_msg) (const accu) accu
1045 Ok (nl', inst', _, _) ->
1046 let nodes = Container.elems nl'
1047 -- The fromJust below is ugly (it can fail nastily), but
1048 -- at this point we should have any internal mismatches,
1049 -- and adding a monad here would be quite involved
1050 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1051 new_cv = compCVNodes grpnodes
1052 new_accu = Right (nl', inst', new_cv, ndx)
1055 Right (_, _, old_cv, _) ->
1060 -- | Compute result of changing all nodes of a DRBD instance.
1062 -- Given the target primary and secondary node (which might be in a
1063 -- different group or not), this function will 'execute' all the
1064 -- required steps and assuming all operations succceed, will return
1065 -- the modified node and instance lists, the opcodes needed for this
1066 -- and the new group score.
1067 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1068 -> Instance.List -- ^ Cluster instance list
1069 -> Instance.Instance -- ^ The instance to be moved
1070 -> Gdx -- ^ The target group index
1071 -- (which can differ from the
1072 -- current group of the
1074 -> (Ndx, Ndx) -- ^ Tuple of new
1075 -- primary\/secondary nodes
1076 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1077 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1078 let primary = Container.find (Instance.pNode inst) nl
1079 idx = Instance.idx inst
1080 -- if the primary is offline, then we first failover
1081 (nl1, inst1, ops1) <-
1082 if Node.offline primary
1084 (nl', inst', _, _) <-
1085 annotateResult "Failing over to the secondary" .
1086 opToResult $ applyMove nl inst Failover
1087 return (nl', inst', [Failover])
1088 else return (nl, inst, [])
1089 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1091 ReplaceSecondary t_sdx)
1092 -- we now need to execute a replace secondary to the future
1094 (nl2, inst2, _, _) <-
1095 annotateResult "Changing secondary to new primary" .
1097 applyMove nl1 inst1 o1
1099 -- we now execute another failover, the primary stays fixed now
1100 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1101 opToResult $ applyMove nl2 inst2 o2
1103 -- and finally another replace secondary, to the final secondary
1104 (nl4, inst4, _, _) <-
1105 annotateResult "Changing secondary to final secondary" .
1107 applyMove nl3 inst3 o3
1109 il' = Container.add idx inst4 il
1110 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1111 let nodes = Container.elems nl4
1112 -- The fromJust below is ugly (it can fail nastily), but
1113 -- at this point we should have any internal mismatches,
1114 -- and adding a monad here would be quite involved
1115 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1116 new_cv = compCVNodes grpnodes
1117 return (nl4, il', ops, new_cv)
1119 -- | Computes the nodes in a given group which are available for
1121 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1122 -> IntSet.IntSet -- ^ Nodes that are excluded
1123 -> Gdx -- ^ The group for which we
1125 -> Result [Ndx] -- ^ List of available node indices
1126 availableGroupNodes group_nodes excl_ndx gdx = do
1127 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1128 Ok (lookup gdx group_nodes)
1129 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1132 -- | Updates the evac solution with the results of an instance
1134 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1136 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1137 -> (Node.List, Instance.List, EvacSolution)
1138 updateEvacSolution (nl, il, es) idx (Bad msg) =
1139 (nl, il, es { esFailed = (idx, msg):esFailed es})
1140 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1141 (nl, il, es { esMoved = new_elem:esMoved es
1142 , esOpCodes = opcodes:esOpCodes es })
1143 where inst = Container.find idx il
1145 instancePriGroup nl inst,
1146 Instance.allNodes inst)
1148 -- | Node-evacuation IAllocator mode main function.
1149 tryNodeEvac :: Group.List -- ^ The cluster groups
1150 -> Node.List -- ^ The node list (cluster-wide, not per group)
1151 -> Instance.List -- ^ Instance list (cluster-wide)
1152 -> EvacMode -- ^ The evacuation mode
1153 -> [Idx] -- ^ List of instance (indices) to be evacuated
1154 -> Result (Node.List, Instance.List, EvacSolution)
1155 tryNodeEvac _ ini_nl ini_il mode idxs =
1156 let evac_ndx = nodesToEvacuate ini_il mode idxs
1157 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1158 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1159 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1160 (Container.elems nl))) $
1161 splitCluster ini_nl ini_il
1162 (fin_nl, fin_il, esol) =
1163 foldl' (\state@(nl, il, _) inst ->
1164 let gdx = instancePriGroup nl inst
1165 pdx = Instance.pNode inst in
1166 updateEvacSolution state (Instance.idx inst) $
1167 availableGroupNodes group_ndx
1168 (IntSet.insert pdx excl_ndx) gdx >>=
1169 nodeEvacInstance nl il mode inst gdx
1171 (ini_nl, ini_il, emptyEvacSolution)
1172 (map (`Container.find` ini_il) idxs)
1173 in return (fin_nl, fin_il, reverseEvacSolution esol)
1175 -- | Change-group IAllocator mode main function.
1177 -- This is very similar to 'tryNodeEvac', the only difference is that
1178 -- we don't choose as target group the current instance group, but
1181 -- 1. at the start of the function, we compute which are the target
1182 -- groups; either no groups were passed in, in which case we choose
1183 -- all groups out of which we don't evacuate instance, or there were
1184 -- some groups passed, in which case we use those
1186 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1187 -- best group to hold the instance, and then we do what
1188 -- 'tryNodeEvac' does, except for this group instead of the current
1191 -- Note that the correct behaviour of this function relies on the
1192 -- function 'nodeEvacInstance' to be able to do correctly both
1193 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1194 tryChangeGroup :: Group.List -- ^ The cluster groups
1195 -> Node.List -- ^ The node list (cluster-wide)
1196 -> Instance.List -- ^ Instance list (cluster-wide)
1197 -> [Gdx] -- ^ Target groups; if empty, any
1198 -- groups not being evacuated
1199 -> [Idx] -- ^ List of instance (indices) to be evacuated
1200 -> Result (Node.List, Instance.List, EvacSolution)
1201 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1202 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1203 flip Container.find ini_il) idxs
1204 target_gdxs = (if null gdxs
1205 then Container.keys gl
1206 else gdxs) \\ evac_gdxs
1207 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1208 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1209 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1210 (Container.elems nl))) $
1211 splitCluster ini_nl ini_il
1212 (fin_nl, fin_il, esol) =
1213 foldl' (\state@(nl, il, _) inst ->
1215 let ncnt = Instance.requiredNodes $
1216 Instance.diskTemplate inst
1217 (gdx, _, _) <- findBestAllocGroup gl nl il
1218 (Just target_gdxs) inst ncnt
1219 av_nodes <- availableGroupNodes group_ndx
1221 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1222 in updateEvacSolution state (Instance.idx inst) solution
1224 (ini_nl, ini_il, emptyEvacSolution)
1225 (map (`Container.find` ini_il) idxs)
1226 in return (fin_nl, fin_il, reverseEvacSolution esol)
1228 -- | Standard-sized allocation method.
1230 -- This places instances of the same size on the cluster until we're
1231 -- out of space. The result will be a list of identically-sized
1233 iterateAlloc :: AllocMethod
1234 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1235 let depth = length ixes
1236 newname = printf "new-%d" depth::String
1237 newidx = Container.size il
1238 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1239 newlimit = fmap (flip (-) 1) limit
1240 in case tryAlloc nl il newi2 allocnodes of
1242 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1243 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1246 Just (xnl, xi, _, _) ->
1249 else iterateAlloc xnl (Container.add newidx xi il)
1250 newlimit newinst allocnodes (xi:ixes)
1251 (totalResources xnl:cstats)
1253 -- | Tiered allocation method.
1255 -- This places instances on the cluster, and decreases the spec until
1256 -- we can allocate again. The result will be a list of decreasing
1258 tieredAlloc :: AllocMethod
1259 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1260 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1262 Ok (errs, nl', il', ixes', cstats') ->
1263 let newsol = Ok (errs, nl', il', ixes', cstats')
1264 ixes_cnt = length ixes'
1265 (stop, newlimit) = case limit of
1266 Nothing -> (False, Nothing)
1267 Just n -> (n <= ixes_cnt,
1268 Just (n - ixes_cnt)) in
1269 if stop then newsol else
1270 case Instance.shrinkByType newinst . fst . last $
1271 sortBy (comparing snd) errs of
1273 Ok newinst' -> tieredAlloc nl' il' newlimit
1274 newinst' allocnodes ixes' cstats'
1276 -- * Formatting functions
1278 -- | Given the original and final nodes, computes the relocation description.
1279 computeMoves :: Instance.Instance -- ^ The instance to be moved
1280 -> String -- ^ The instance name
1281 -> IMove -- ^ The move being performed
1282 -> String -- ^ New primary
1283 -> String -- ^ New secondary
1284 -> (String, [String])
1285 -- ^ Tuple of moves and commands list; moves is containing
1286 -- either @/f/@ for failover or @/r:name/@ for replace
1287 -- secondary, while the command list holds gnt-instance
1288 -- commands (without that prefix), e.g \"@failover instance1@\"
1289 computeMoves i inam mv c d =
1291 Failover -> ("f", [mig])
1292 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1293 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1294 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1295 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1296 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1297 where morf = if Instance.isRunning i then "migrate" else "failover"
1298 mig = printf "%s -f %s" morf inam::String
1299 mig_any = printf "%s -f -n %s %s" morf c inam::String
1300 rep n = printf "replace-disks -n %s %s" n inam::String
1302 -- | Converts a placement to string format.
1303 printSolutionLine :: Node.List -- ^ The node list
1304 -> Instance.List -- ^ The instance list
1305 -> Int -- ^ Maximum node name length
1306 -> Int -- ^ Maximum instance name length
1307 -> Placement -- ^ The current placement
1308 -> Int -- ^ The index of the placement in
1310 -> (String, [String])
1311 printSolutionLine nl il nmlen imlen plc pos =
1312 let pmlen = (2*nmlen + 1)
1313 (i, p, s, mv, c) = plc
1314 old_sec = Instance.sNode inst
1315 inst = Container.find i il
1316 inam = Instance.alias inst
1317 npri = Node.alias $ Container.find p nl
1318 nsec = Node.alias $ Container.find s nl
1319 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1320 osec = Node.alias $ Container.find old_sec nl
1321 (moves, cmds) = computeMoves inst inam mv npri nsec
1322 -- FIXME: this should check instead/also the disk template
1323 ostr = if old_sec == Node.noSecondary
1324 then printf "%s" opri::String
1325 else printf "%s:%s" opri osec::String
1326 nstr = if s == Node.noSecondary
1327 then printf "%s" npri::String
1328 else printf "%s:%s" npri nsec::String
1329 in (printf " %3d. %-*s %-*s => %-*s %12.8f a=%s"
1330 pos imlen inam pmlen ostr pmlen nstr c moves,
1333 -- | Return the instance and involved nodes in an instance move.
1335 -- Note that the output list length can vary, and is not required nor
1336 -- guaranteed to be of any specific length.
1337 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1338 -- the instance from its index; note
1339 -- that this /must/ be the original
1340 -- instance list, so that we can
1341 -- retrieve the old nodes
1342 -> Placement -- ^ The placement we're investigating,
1343 -- containing the new nodes and
1345 -> [Ndx] -- ^ Resulting list of node indices
1346 involvedNodes il plc =
1347 let (i, np, ns, _, _) = plc
1348 inst = Container.find i il
1349 in nub $ [np, ns] ++ Instance.allNodes inst
1351 -- | Inner function for splitJobs, that either appends the next job to
1352 -- the current jobset, or starts a new jobset.
1353 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1354 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1355 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1356 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1357 | otherwise = ([n]:cjs, ndx)
1359 -- | Break a list of moves into independent groups. Note that this
1360 -- will reverse the order of jobs.
1361 splitJobs :: [MoveJob] -> [JobSet]
1362 splitJobs = fst . foldl mergeJobs ([], [])
1364 -- | Given a list of commands, prefix them with @gnt-instance@ and
1365 -- also beautify the display a little.
1366 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1367 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1369 printf " echo job %d/%d" jsn sn:
1371 map (" gnt-instance " ++) cmds
1373 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1376 -- | Given a list of commands, prefix them with @gnt-instance@ and
1377 -- also beautify the display a little.
1378 formatCmds :: [JobSet] -> String
1381 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1385 -- | Print the node list.
1386 printNodes :: Node.List -> [String] -> String
1388 let fields = case fs of
1389 [] -> Node.defaultFields
1390 "+":rest -> Node.defaultFields ++ rest
1392 snl = sortBy (comparing Node.idx) (Container.elems nl)
1393 (header, isnum) = unzip $ map Node.showHeader fields
1394 in printTable "" header (map (Node.list fields) snl) isnum
1396 -- | Print the instance list.
1397 printInsts :: Node.List -> Instance.List -> String
1399 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1400 helper inst = [ if Instance.isRunning inst then "R" else " "
1401 , Instance.name inst
1402 , Container.nameOf nl (Instance.pNode inst)
1403 , let sdx = Instance.sNode inst
1404 in if sdx == Node.noSecondary
1406 else Container.nameOf nl sdx
1407 , if Instance.autoBalance inst then "Y" else "N"
1408 , printf "%3d" $ Instance.vcpus inst
1409 , printf "%5d" $ Instance.mem inst
1410 , printf "%5d" $ Instance.dsk inst `div` 1024
1416 where DynUtil lC lM lD lN = Instance.util inst
1417 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1418 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1419 isnum = False:False:False:False:False:repeat True
1420 in printTable "" header (map helper sil) isnum
1422 -- | Shows statistics for a given node list.
1423 printStats :: String -> Node.List -> String
1425 let dcvs = compDetailedCV $ Container.elems nl
1426 (weights, names) = unzip detailedCVInfo
1427 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1428 header = [ "Field", "Value", "Weight" ]
1429 formatted = map (\(w, h, val) ->
1434 in printTable lp header formatted $ False:repeat True
1436 -- | Convert a placement into a list of OpCodes (basically a job).
1437 iMoveToJob :: Node.List -- ^ The node list; only used for node
1438 -- names, so any version is good
1439 -- (before or after the operation)
1440 -> Instance.List -- ^ The instance list; also used for
1442 -> Idx -- ^ The index of the instance being
1444 -> IMove -- ^ The actual move to be described
1445 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1447 iMoveToJob nl il idx move =
1448 let inst = Container.find idx il
1449 iname = Instance.name inst
1450 lookNode = Just . Container.nameOf nl
1451 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1452 opFA n = OpCodes.OpInstanceMigrate iname True False True (lookNode n)
1453 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1454 OpCodes.ReplaceNewSecondary [] Nothing
1457 FailoverToAny np -> [ opFA np ]
1458 ReplacePrimary np -> [ opF, opR np, opF ]
1459 ReplaceSecondary ns -> [ opR ns ]
1460 ReplaceAndFailover np -> [ opR np, opF ]
1461 FailoverAndReplace ns -> [ opF, opR ns ]
1463 -- * Node group functions
1465 -- | Computes the group of an instance.
1466 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1467 instanceGroup nl i =
1468 let sidx = Instance.sNode i
1469 pnode = Container.find (Instance.pNode i) nl
1470 snode = if sidx == Node.noSecondary
1472 else Container.find sidx nl
1473 pgroup = Node.group pnode
1474 sgroup = Node.group snode
1475 in if pgroup /= sgroup
1476 then fail ("Instance placed accross two node groups, primary " ++
1477 show pgroup ++ ", secondary " ++ show sgroup)
1480 -- | Computes the group of an instance per the primary node.
1481 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1482 instancePriGroup nl i =
1483 let pnode = Container.find (Instance.pNode i) nl
1486 -- | Compute the list of badly allocated instances (split across node
1488 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1489 findSplitInstances nl =
1490 filter (not . isOk . instanceGroup nl) . Container.elems
1492 -- | Splits a cluster into the component node groups.
1493 splitCluster :: Node.List -> Instance.List ->
1494 [(Gdx, (Node.List, Instance.List))]
1495 splitCluster nl il =
1496 let ngroups = Node.computeGroups (Container.elems nl)
1497 in map (\(guuid, nodes) ->
1498 let nidxs = map Node.idx nodes
1499 nodes' = zip nidxs nodes
1500 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1501 in (guuid, (Container.fromList nodes', instances))) ngroups
1503 -- | Compute the list of nodes that are to be evacuated, given a list
1504 -- of instances and an evacuation mode.
1505 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1506 -> EvacMode -- ^ The evacuation mode we're using
1507 -> [Idx] -- ^ List of instance indices being evacuated
1508 -> IntSet.IntSet -- ^ Set of node indices
1509 nodesToEvacuate il mode =
1510 IntSet.delete Node.noSecondary .
1512 let i = Container.find idx il
1513 pdx = Instance.pNode i
1514 sdx = Instance.sNode i
1515 dt = Instance.diskTemplate i
1516 withSecondary = case dt of
1517 DTDrbd8 -> IntSet.insert sdx ns
1520 ChangePrimary -> IntSet.insert pdx ns
1521 ChangeSecondary -> withSecondary
1522 ChangeAll -> IntSet.insert pdx withSecondary