1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
39 -- * Generic functions
41 , computeAllocationDelta
42 -- * First phase functions
44 -- * Second phase functions
49 -- * Display functions
52 -- * Balacing functions
61 -- * IAllocator functions
69 -- * Allocation functions
72 -- * Node group functions
78 import qualified Data.IntSet as IntSet
80 import Data.Maybe (fromJust, isNothing)
81 import Data.Ord (comparing)
82 import Text.Printf (printf)
84 import qualified Ganeti.HTools.Container as Container
85 import qualified Ganeti.HTools.Instance as Instance
86 import qualified Ganeti.HTools.Node as Node
87 import qualified Ganeti.HTools.Group as Group
88 import Ganeti.HTools.Types
89 import Ganeti.HTools.Utils
91 import qualified Ganeti.OpCodes as OpCodes
95 -- | Allocation\/relocation solution.
96 data AllocSolution = AllocSolution
97 { asFailures :: [FailMode] -- ^ Failure counts
98 , asAllocs :: Int -- ^ Good allocation count
99 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
100 , asLog :: [String] -- ^ Informational messages
103 -- | Node evacuation/group change iallocator result type. This result
104 -- type consists of actual opcodes (a restricted subset) that are
105 -- transmitted back to Ganeti.
106 data EvacSolution = EvacSolution
107 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
108 , esFailed :: [(Idx, String)] -- ^ Instances which were not
110 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
113 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
114 type AllocResult = (FailStats, Node.List, Instance.List,
115 [Instance.Instance], [CStats])
117 -- | Type alias for easier handling.
118 type AllocSolutionList = [(Instance.Instance, AllocSolution)]
120 -- | A type denoting the valid allocation mode/pairs.
122 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
123 -- for a two-node allocation, this will be a @Right [('Ndx',
124 -- ['Ndx'])]@. In the latter case, the list is basically an
125 -- association list, grouped by primary node and holding the potential
126 -- secondary nodes in the sub-list.
127 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
129 -- | The empty solution we start with when computing allocations.
130 emptyAllocSolution :: AllocSolution
131 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
132 , asSolution = Nothing, asLog = [] }
134 -- | The empty evac solution.
135 emptyEvacSolution :: EvacSolution
136 emptyEvacSolution = EvacSolution { esMoved = []
141 -- | The complete state for the balancing solution.
142 data Table = Table Node.List Instance.List Score [Placement]
143 deriving (Show, Read)
145 -- | Cluster statistics data type.
147 { csFmem :: Integer -- ^ Cluster free mem
148 , csFdsk :: Integer -- ^ Cluster free disk
149 , csAmem :: Integer -- ^ Cluster allocatable mem
150 , csAdsk :: Integer -- ^ Cluster allocatable disk
151 , csAcpu :: Integer -- ^ Cluster allocatable cpus
152 , csMmem :: Integer -- ^ Max node allocatable mem
153 , csMdsk :: Integer -- ^ Max node allocatable disk
154 , csMcpu :: Integer -- ^ Max node allocatable cpu
155 , csImem :: Integer -- ^ Instance used mem
156 , csIdsk :: Integer -- ^ Instance used disk
157 , csIcpu :: Integer -- ^ Instance used cpu
158 , csTmem :: Double -- ^ Cluster total mem
159 , csTdsk :: Double -- ^ Cluster total disk
160 , csTcpu :: Double -- ^ Cluster total cpus
161 , csVcpu :: Integer -- ^ Cluster total virtual cpus
162 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
163 -- physical CPUs, i.e. normalised used phys CPUs
164 , csXmem :: Integer -- ^ Unnacounted for mem
165 , csNmem :: Integer -- ^ Node own memory
166 , csScore :: Score -- ^ The cluster score
167 , csNinst :: Int -- ^ The total number of instances
168 } deriving (Show, Read)
170 -- | A simple type for allocation functions.
171 type AllocMethod = Node.List -- ^ Node list
172 -> Instance.List -- ^ Instance list
173 -> Maybe Int -- ^ Optional allocation limit
174 -> Instance.Instance -- ^ Instance spec for allocation
175 -> AllocNodes -- ^ Which nodes we should allocate on
176 -> [Instance.Instance] -- ^ Allocated instances
177 -> [CStats] -- ^ Running cluster stats
178 -> Result AllocResult -- ^ Allocation result
180 -- | A simple type for the running solution of evacuations.
181 type EvacInnerState =
182 Either String (Node.List, Instance.Instance, Score, Ndx)
184 -- * Utility functions
186 -- | Verifies the N+1 status and return the affected nodes.
187 verifyN1 :: [Node.Node] -> [Node.Node]
188 verifyN1 = filter Node.failN1
190 {-| Computes the pair of bad nodes and instances.
192 The bad node list is computed via a simple 'verifyN1' check, and the
193 bad instance list is the list of primary and secondary instances of
197 computeBadItems :: Node.List -> Instance.List ->
198 ([Node.Node], [Instance.Instance])
199 computeBadItems nl il =
200 let bad_nodes = verifyN1 $ getOnline nl
201 bad_instances = map (`Container.find` il) .
203 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
205 (bad_nodes, bad_instances)
207 -- | Extracts the node pairs for an instance. This can fail if the
208 -- instance is single-homed. FIXME: this needs to be improved,
209 -- together with the general enhancement for handling non-DRBD moves.
210 instanceNodes :: Node.List -> Instance.Instance ->
211 (Ndx, Ndx, Node.Node, Node.Node)
212 instanceNodes nl inst =
213 let old_pdx = Instance.pNode inst
214 old_sdx = Instance.sNode inst
215 old_p = Container.find old_pdx nl
216 old_s = Container.find old_sdx nl
217 in (old_pdx, old_sdx, old_p, old_s)
219 -- | Zero-initializer for the CStats type.
220 emptyCStats :: CStats
221 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
223 -- | Update stats with data from a new node.
224 updateCStats :: CStats -> Node.Node -> CStats
225 updateCStats cs node =
226 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
227 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
228 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
229 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
230 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
231 csVcpu = x_vcpu, csNcpu = x_ncpu,
232 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
235 inc_amem = Node.fMem node - Node.rMem node
236 inc_amem' = if inc_amem > 0 then inc_amem else 0
237 inc_adsk = Node.availDisk node
238 inc_imem = truncate (Node.tMem node) - Node.nMem node
239 - Node.xMem node - Node.fMem node
240 inc_icpu = Node.uCpu node
241 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
242 inc_vcpu = Node.hiCpu node
243 inc_acpu = Node.availCpu node
244 inc_ncpu = fromIntegral (Node.uCpu node) /
245 iPolicyVcpuRatio (Node.iPolicy node)
246 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
247 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
248 , csAmem = x_amem + fromIntegral inc_amem'
249 , csAdsk = x_adsk + fromIntegral inc_adsk
250 , csAcpu = x_acpu + fromIntegral inc_acpu
251 , csMmem = max x_mmem (fromIntegral inc_amem')
252 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
253 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
254 , csImem = x_imem + fromIntegral inc_imem
255 , csIdsk = x_idsk + fromIntegral inc_idsk
256 , csIcpu = x_icpu + fromIntegral inc_icpu
257 , csTmem = x_tmem + Node.tMem node
258 , csTdsk = x_tdsk + Node.tDsk node
259 , csTcpu = x_tcpu + Node.tCpu node
260 , csVcpu = x_vcpu + fromIntegral inc_vcpu
261 , csNcpu = x_ncpu + inc_ncpu
262 , csXmem = x_xmem + fromIntegral (Node.xMem node)
263 , csNmem = x_nmem + fromIntegral (Node.nMem node)
264 , csNinst = x_ninst + length (Node.pList node)
267 -- | Compute the total free disk and memory in the cluster.
268 totalResources :: Node.List -> CStats
270 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
271 in cs { csScore = compCV nl }
273 -- | Compute the delta between two cluster state.
275 -- This is used when doing allocations, to understand better the
276 -- available cluster resources. The return value is a triple of the
277 -- current used values, the delta that was still allocated, and what
278 -- was left unallocated.
279 computeAllocationDelta :: CStats -> CStats -> AllocStats
280 computeAllocationDelta cini cfin =
281 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
282 csNcpu = i_ncpu } = cini
283 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
284 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
285 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
286 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
287 , allocInfoNCpus = i_ncpu
288 , allocInfoMem = fromIntegral i_imem
289 , allocInfoDisk = fromIntegral i_idsk
291 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
292 , allocInfoNCpus = f_ncpu - i_ncpu
293 , allocInfoMem = fromIntegral (f_imem - i_imem)
294 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
296 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
297 , allocInfoNCpus = f_tcpu - f_ncpu
298 , allocInfoMem = truncate t_mem - fromIntegral f_imem
299 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
301 in (rini, rfin, runa)
303 -- | The names and weights of the individual elements in the CV list.
304 detailedCVInfo :: [(Double, String)]
305 detailedCVInfo = [ (1, "free_mem_cv")
306 , (1, "free_disk_cv")
308 , (1, "reserved_mem_cv")
309 , (4, "offline_all_cnt")
310 , (16, "offline_pri_cnt")
311 , (1, "vcpu_ratio_cv")
314 , (1, "disk_load_cv")
316 , (2, "pri_tags_score")
320 -- | Holds the weights used by 'compCVNodes' for each metric.
321 detailedCVWeights :: [Double]
322 detailedCVWeights = map fst detailedCVInfo
324 -- | Compute the mem and disk covariance.
325 compDetailedCV :: [Node.Node] -> [Double]
326 compDetailedCV all_nodes =
327 let (offline, nodes) = partition Node.offline all_nodes
328 mem_l = map Node.pMem nodes
329 dsk_l = map Node.pDsk nodes
330 -- metric: memory covariance
331 mem_cv = stdDev mem_l
332 -- metric: disk covariance
333 dsk_cv = stdDev dsk_l
334 -- metric: count of instances living on N1 failing nodes
335 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
336 length (Node.pList n)) .
337 filter Node.failN1 $ nodes :: Double
338 res_l = map Node.pRem nodes
339 -- metric: reserved memory covariance
340 res_cv = stdDev res_l
341 -- offline instances metrics
342 offline_ipri = sum . map (length . Node.pList) $ offline
343 offline_isec = sum . map (length . Node.sList) $ offline
344 -- metric: count of instances on offline nodes
345 off_score = fromIntegral (offline_ipri + offline_isec)::Double
346 -- metric: count of primary instances on offline nodes (this
347 -- helps with evacuation/failover of primary instances on
348 -- 2-node clusters with one node offline)
349 off_pri_score = fromIntegral offline_ipri::Double
350 cpu_l = map Node.pCpu nodes
351 -- metric: covariance of vcpu/pcpu ratio
352 cpu_cv = stdDev cpu_l
353 -- metrics: covariance of cpu, memory, disk and network load
354 (c_load, m_load, d_load, n_load) =
356 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
357 DynUtil c2 m2 d2 n2 = Node.utilPool n
358 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
359 -- metric: conflicting instance count
360 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
361 pri_tags_score = fromIntegral pri_tags_inst::Double
362 -- metric: spindles %
363 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
364 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
365 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
366 , pri_tags_score, stdDev spindles_cv ]
368 -- | Compute the /total/ variance.
369 compCVNodes :: [Node.Node] -> Double
370 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
372 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
373 compCV :: Node.List -> Double
374 compCV = compCVNodes . Container.elems
376 -- | Compute online nodes from a 'Node.List'.
377 getOnline :: Node.List -> [Node.Node]
378 getOnline = filter (not . Node.offline) . Container.elems
380 -- * Balancing functions
382 -- | Compute best table. Note that the ordering of the arguments is important.
383 compareTables :: Table -> Table -> Table
384 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
385 if a_cv > b_cv then b else a
387 -- | Applies an instance move to a given node list and instance.
388 applyMove :: Node.List -> Instance.Instance
389 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
391 applyMove nl inst Failover =
392 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
393 int_p = Node.removePri old_p inst
394 int_s = Node.removeSec old_s inst
395 new_nl = do -- Maybe monad
396 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
397 new_s <- Node.addSec int_p inst old_sdx
398 let new_inst = Instance.setBoth inst old_sdx old_pdx
399 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
400 new_inst, old_sdx, old_pdx)
403 -- Failover to any (fa)
404 applyMove nl inst (FailoverToAny new_pdx) = do
405 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
406 new_pnode = Container.find new_pdx nl
407 force_failover = Node.offline old_pnode
408 new_pnode' <- Node.addPriEx force_failover new_pnode inst
409 let old_pnode' = Node.removePri old_pnode inst
410 inst' = Instance.setPri inst new_pdx
411 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
412 return (nl', inst', new_pdx, old_sdx)
414 -- Replace the primary (f:, r:np, f)
415 applyMove nl inst (ReplacePrimary new_pdx) =
416 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
417 tgt_n = Container.find new_pdx nl
418 int_p = Node.removePri old_p inst
419 int_s = Node.removeSec old_s inst
420 force_p = Node.offline old_p
421 new_nl = do -- Maybe monad
422 -- check that the current secondary can host the instance
423 -- during the migration
424 tmp_s <- Node.addPriEx force_p int_s inst
425 let tmp_s' = Node.removePri tmp_s inst
426 new_p <- Node.addPriEx force_p tgt_n inst
427 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
428 let new_inst = Instance.setPri inst new_pdx
429 return (Container.add new_pdx new_p $
430 Container.addTwo old_pdx int_p old_sdx new_s nl,
431 new_inst, new_pdx, old_sdx)
434 -- Replace the secondary (r:ns)
435 applyMove nl inst (ReplaceSecondary new_sdx) =
436 let old_pdx = Instance.pNode inst
437 old_sdx = Instance.sNode inst
438 old_s = Container.find old_sdx nl
439 tgt_n = Container.find new_sdx nl
440 int_s = Node.removeSec old_s inst
441 force_s = Node.offline old_s
442 new_inst = Instance.setSec inst new_sdx
443 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
444 \new_s -> return (Container.addTwo new_sdx
445 new_s old_sdx int_s nl,
446 new_inst, old_pdx, new_sdx)
449 -- Replace the secondary and failover (r:np, f)
450 applyMove nl inst (ReplaceAndFailover new_pdx) =
451 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
452 tgt_n = Container.find new_pdx nl
453 int_p = Node.removePri old_p inst
454 int_s = Node.removeSec old_s inst
455 force_s = Node.offline old_s
456 new_nl = do -- Maybe monad
457 new_p <- Node.addPri tgt_n inst
458 new_s <- Node.addSecEx force_s int_p inst new_pdx
459 let new_inst = Instance.setBoth inst new_pdx old_pdx
460 return (Container.add new_pdx new_p $
461 Container.addTwo old_pdx new_s old_sdx int_s nl,
462 new_inst, new_pdx, old_pdx)
465 -- Failver and replace the secondary (f, r:ns)
466 applyMove nl inst (FailoverAndReplace new_sdx) =
467 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
468 tgt_n = Container.find new_sdx nl
469 int_p = Node.removePri old_p inst
470 int_s = Node.removeSec old_s inst
471 force_p = Node.offline old_p
472 new_nl = do -- Maybe monad
473 new_p <- Node.addPriEx force_p int_s inst
474 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
475 let new_inst = Instance.setBoth inst old_sdx new_sdx
476 return (Container.add new_sdx new_s $
477 Container.addTwo old_sdx new_p old_pdx int_p nl,
478 new_inst, old_sdx, new_sdx)
481 -- | Tries to allocate an instance on one given node.
482 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
483 -> OpResult Node.AllocElement
484 allocateOnSingle nl inst new_pdx =
485 let p = Container.find new_pdx nl
486 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
488 Instance.instMatchesPolicy inst (Node.iPolicy p)
489 new_p <- Node.addPri p inst
490 let new_nl = Container.add new_pdx new_p nl
491 new_score = compCV new_nl
492 return (new_nl, new_inst, [new_p], new_score)
494 -- | Tries to allocate an instance on a given pair of nodes.
495 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
496 -> OpResult Node.AllocElement
497 allocateOnPair nl inst new_pdx new_sdx =
498 let tgt_p = Container.find new_pdx nl
499 tgt_s = Container.find new_sdx nl
501 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
502 new_p <- Node.addPri tgt_p inst
503 new_s <- Node.addSec tgt_s inst new_pdx
504 let new_inst = Instance.setBoth inst new_pdx new_sdx
505 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
506 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
508 -- | Tries to perform an instance move and returns the best table
509 -- between the original one and the new one.
510 checkSingleStep :: Table -- ^ The original table
511 -> Instance.Instance -- ^ The instance to move
512 -> Table -- ^ The current best table
513 -> IMove -- ^ The move to apply
514 -> Table -- ^ The final best table
515 checkSingleStep ini_tbl target cur_tbl move =
516 let Table ini_nl ini_il _ ini_plc = ini_tbl
517 tmp_resu = applyMove ini_nl target move
520 OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
521 let tgt_idx = Instance.idx target
522 upd_cvar = compCV upd_nl
523 upd_il = Container.add tgt_idx new_inst ini_il
524 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
525 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
526 in compareTables cur_tbl upd_tbl
528 -- | Given the status of the current secondary as a valid new node and
529 -- the current candidate target node, generate the possible moves for
531 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
532 -> Bool -- ^ Whether the secondary node is a valid new node
533 -> Bool -- ^ Whether we can change the primary node
534 -> Ndx -- ^ Target node candidate
535 -> [IMove] -- ^ List of valid result moves
537 possibleMoves MirrorNone _ _ _ = []
539 possibleMoves MirrorExternal _ False _ = []
541 possibleMoves MirrorExternal _ True tdx =
542 [ FailoverToAny tdx ]
544 possibleMoves MirrorInternal _ False tdx =
545 [ ReplaceSecondary tdx ]
547 possibleMoves MirrorInternal True True tdx =
548 [ ReplaceSecondary tdx
549 , ReplaceAndFailover tdx
551 , FailoverAndReplace tdx
554 possibleMoves MirrorInternal False True tdx =
555 [ ReplaceSecondary tdx
556 , ReplaceAndFailover tdx
559 -- | Compute the best move for a given instance.
560 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
561 -> Bool -- ^ Whether disk moves are allowed
562 -> Bool -- ^ Whether instance moves are allowed
563 -> Table -- ^ Original table
564 -> Instance.Instance -- ^ Instance to move
565 -> Table -- ^ Best new table for this instance
566 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
567 let opdx = Instance.pNode target
568 osdx = Instance.sNode target
569 bad_nodes = [opdx, osdx]
570 nodes = filter (`notElem` bad_nodes) nodes_idx
571 mir_type = Instance.mirrorType target
572 use_secondary = elem osdx nodes_idx && inst_moves
573 aft_failover = if mir_type == MirrorInternal && use_secondary
574 -- if drbd and allowed to failover
575 then checkSingleStep ini_tbl target ini_tbl Failover
579 then concatMap (possibleMoves mir_type use_secondary inst_moves)
583 -- iterate over the possible nodes for this instance
584 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
586 -- | Compute the best next move.
587 checkMove :: [Ndx] -- ^ Allowed target node indices
588 -> Bool -- ^ Whether disk moves are allowed
589 -> Bool -- ^ Whether instance moves are allowed
590 -> Table -- ^ The current solution
591 -> [Instance.Instance] -- ^ List of instances still to move
592 -> Table -- ^ The new solution
593 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
594 let Table _ _ _ ini_plc = ini_tbl
595 -- we're using rwhnf from the Control.Parallel.Strategies
596 -- package; we don't need to use rnf as that would force too
597 -- much evaluation in single-threaded cases, and in
598 -- multi-threaded case the weak head normal form is enough to
599 -- spark the evaluation
600 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
603 -- iterate over all instances, computing the best move
604 best_tbl = foldl' compareTables ini_tbl tables
605 Table _ _ _ best_plc = best_tbl
606 in if length best_plc == length ini_plc
607 then ini_tbl -- no advancement
610 -- | Check if we are allowed to go deeper in the balancing.
611 doNextBalance :: Table -- ^ The starting table
612 -> Int -- ^ Remaining length
613 -> Score -- ^ Score at which to stop
614 -> Bool -- ^ The resulting table and commands
615 doNextBalance ini_tbl max_rounds min_score =
616 let Table _ _ ini_cv ini_plc = ini_tbl
617 ini_plc_len = length ini_plc
618 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
620 -- | Run a balance move.
621 tryBalance :: Table -- ^ The starting table
622 -> Bool -- ^ Allow disk moves
623 -> Bool -- ^ Allow instance moves
624 -> Bool -- ^ Only evacuate moves
625 -> Score -- ^ Min gain threshold
626 -> Score -- ^ Min gain
627 -> Maybe Table -- ^ The resulting table and commands
628 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
629 let Table ini_nl ini_il ini_cv _ = ini_tbl
630 all_inst = Container.elems ini_il
631 all_nodes = Container.elems ini_nl
632 (offline_nodes, online_nodes) = partition Node.offline all_nodes
633 all_inst' = if evac_mode
634 then let bad_nodes = map Node.idx offline_nodes
635 in filter (any (`elem` bad_nodes) .
636 Instance.allNodes) all_inst
638 reloc_inst = filter Instance.movable all_inst'
639 node_idx = map Node.idx online_nodes
640 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
641 (Table _ _ fin_cv _) = fin_tbl
643 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
644 then Just fin_tbl -- this round made success, return the new table
647 -- * Allocation functions
649 -- | Build failure stats out of a list of failures.
650 collapseFailures :: [FailMode] -> FailStats
651 collapseFailures flst =
652 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
655 -- | Compares two Maybe AllocElement and chooses the besst score.
656 bestAllocElement :: Maybe Node.AllocElement
657 -> Maybe Node.AllocElement
658 -> Maybe Node.AllocElement
659 bestAllocElement a Nothing = a
660 bestAllocElement Nothing b = b
661 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
662 if ascore < bscore then a else b
664 -- | Update current Allocation solution and failure stats with new
666 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
667 concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
669 concatAllocs as (OpGood ns) =
670 let -- Choose the old or new solution, based on the cluster score
672 osols = asSolution as
673 nsols = bestAllocElement osols (Just ns)
675 -- Note: we force evaluation of nsols here in order to keep the
676 -- memory profile low - we know that we will need nsols for sure
677 -- in the next cycle, so we force evaluation of nsols, since the
678 -- foldl' in the caller will only evaluate the tuple, but not the
679 -- elements of the tuple
680 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
682 -- | Sums two 'AllocSolution' structures.
683 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
684 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
685 (AllocSolution bFails bAllocs bSols bLog) =
686 -- note: we add b first, since usually it will be smaller; when
687 -- fold'ing, a will grow and grow whereas b is the per-group
688 -- result, hence smaller
689 let nFails = bFails ++ aFails
690 nAllocs = aAllocs + bAllocs
691 nSols = bestAllocElement aSols bSols
693 in AllocSolution nFails nAllocs nSols nLog
695 -- | Given a solution, generates a reasonable description for it.
696 describeSolution :: AllocSolution -> String
697 describeSolution as =
698 let fcnt = asFailures as
701 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
702 filter ((> 0) . snd) . collapseFailures $ fcnt
704 Nothing -> "No valid allocation solutions, failure reasons: " ++
705 (if null fcnt then "unknown reasons" else freasons)
706 Just (_, _, nodes, cv) ->
707 printf ("score: %.8f, successes %d, failures %d (%s)" ++
708 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
709 (intercalate "/" . map Node.name $ nodes)
711 -- | Annotates a solution with the appropriate string.
712 annotateSolution :: AllocSolution -> AllocSolution
713 annotateSolution as = as { asLog = describeSolution as : asLog as }
715 -- | Reverses an evacuation solution.
717 -- Rationale: we always concat the results to the top of the lists, so
718 -- for proper jobset execution, we should reverse all lists.
719 reverseEvacSolution :: EvacSolution -> EvacSolution
720 reverseEvacSolution (EvacSolution f m o) =
721 EvacSolution (reverse f) (reverse m) (reverse o)
723 -- | Generate the valid node allocation singles or pairs for a new instance.
724 genAllocNodes :: Group.List -- ^ Group list
725 -> Node.List -- ^ The node map
726 -> Int -- ^ The number of nodes required
727 -> Bool -- ^ Whether to drop or not
729 -> Result AllocNodes -- ^ The (monadic) result
730 genAllocNodes gl nl count drop_unalloc =
731 let filter_fn = if drop_unalloc
732 then filter (Group.isAllocable .
733 flip Container.find gl . Node.group)
735 all_nodes = filter_fn $ getOnline nl
736 all_pairs = [(Node.idx p,
737 [Node.idx s | s <- all_nodes,
738 Node.idx p /= Node.idx s,
739 Node.group p == Node.group s]) |
742 1 -> Ok (Left (map Node.idx all_nodes))
743 2 -> Ok (Right (filter (not . null . snd) all_pairs))
744 _ -> Bad "Unsupported number of nodes, only one or two supported"
746 -- | Try to allocate an instance on the cluster.
747 tryAlloc :: (Monad m) =>
748 Node.List -- ^ The node list
749 -> Instance.List -- ^ The instance list
750 -> Instance.Instance -- ^ The instance to allocate
751 -> AllocNodes -- ^ The allocation targets
752 -> m AllocSolution -- ^ Possible solution list
753 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
754 tryAlloc nl _ inst (Right ok_pairs) =
755 let psols = parMap rwhnf (\(p, ss) ->
757 concatAllocs cstate .
758 allocateOnPair nl inst p)
759 emptyAllocSolution ss) ok_pairs
760 sols = foldl' sumAllocs emptyAllocSolution psols
761 in return $ annotateSolution sols
763 tryAlloc _ _ _ (Left []) = fail "No online nodes"
764 tryAlloc nl _ inst (Left all_nodes) =
765 let sols = foldl' (\cstate ->
766 concatAllocs cstate . allocateOnSingle nl inst
767 ) emptyAllocSolution all_nodes
768 in return $ annotateSolution sols
770 -- | Given a group/result, describe it as a nice (list of) messages.
771 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
772 solutionDescription gl (groupId, result) =
774 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
775 Bad message -> [printf "Group %s: error %s" gname message]
776 where grp = Container.find groupId gl
777 gname = Group.name grp
778 pol = allocPolicyToRaw (Group.allocPolicy grp)
780 -- | From a list of possibly bad and possibly empty solutions, filter
781 -- only the groups with a valid result. Note that the result will be
782 -- reversed compared to the original list.
783 filterMGResults :: Group.List
784 -> [(Gdx, Result AllocSolution)]
785 -> [(Gdx, AllocSolution)]
786 filterMGResults gl = foldl' fn []
787 where unallocable = not . Group.isAllocable . flip Container.find gl
788 fn accu (gdx, rasol) =
791 Ok sol | isNothing (asSolution sol) -> accu
792 | unallocable gdx -> accu
793 | otherwise -> (gdx, sol):accu
795 -- | Sort multigroup results based on policy and score.
796 sortMGResults :: Group.List
797 -> [(Gdx, AllocSolution)]
798 -> [(Gdx, AllocSolution)]
799 sortMGResults gl sols =
800 let extractScore (_, _, _, x) = x
801 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
802 (extractScore . fromJust . asSolution) sol)
803 in sortBy (comparing solScore) sols
805 -- | Finds the best group for an instance on a multi-group cluster.
807 -- Only solutions in @preferred@ and @last_resort@ groups will be
808 -- accepted as valid, and additionally if the allowed groups parameter
809 -- is not null then allocation will only be run for those group
811 findBestAllocGroup :: Group.List -- ^ The group list
812 -> Node.List -- ^ The node list
813 -> Instance.List -- ^ The instance list
814 -> Maybe [Gdx] -- ^ The allowed groups
815 -> Instance.Instance -- ^ The instance to allocate
816 -> Int -- ^ Required number of nodes
817 -> Result (Gdx, AllocSolution, [String])
818 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
819 let groups = splitCluster mgnl mgil
820 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
822 sols = map (\(gid, (nl, il)) ->
823 (gid, genAllocNodes mggl nl cnt False >>=
824 tryAlloc nl il inst))
825 groups'::[(Gdx, Result AllocSolution)]
826 all_msgs = concatMap (solutionDescription mggl) sols
827 goodSols = filterMGResults mggl sols
828 sortedSols = sortMGResults mggl goodSols
829 in if null sortedSols
831 then Bad $ "no groups for evacuation: allowed groups was" ++
832 show allowed_gdxs ++ ", all groups: " ++
833 show (map fst groups)
834 else Bad $ intercalate ", " all_msgs
835 else let (final_group, final_sol) = head sortedSols
836 in return (final_group, final_sol, all_msgs)
838 -- | Try to allocate an instance on a multi-group cluster.
839 tryMGAlloc :: Group.List -- ^ The group list
840 -> Node.List -- ^ The node list
841 -> Instance.List -- ^ The instance list
842 -> Instance.Instance -- ^ The instance to allocate
843 -> Int -- ^ Required number of nodes
844 -> Result AllocSolution -- ^ Possible solution list
845 tryMGAlloc mggl mgnl mgil inst cnt = do
846 (best_group, solution, all_msgs) <-
847 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
848 let group_name = Group.name $ Container.find best_group mggl
849 selmsg = "Selected group: " ++ group_name
850 return $ solution { asLog = selmsg:all_msgs }
852 -- | Calculate the new instance list after allocation solution.
853 updateIl :: Instance.List -- ^ The original instance list
854 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
855 -> Instance.List -- ^ The updated instance list
856 updateIl il Nothing = il
857 updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
859 -- | Extract the the new node list from the allocation solution.
860 extractNl :: Node.List -- ^ The original node list
861 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
862 -> Node.List -- ^ The new node list
863 extractNl nl Nothing = nl
864 extractNl _ (Just (xnl, _, _, _)) = xnl
866 -- | Try to allocate a list of instances on a multi-group cluster.
867 allocList :: Group.List -- ^ The group list
868 -> Node.List -- ^ The node list
869 -> Instance.List -- ^ The instance list
870 -> [(Instance.Instance, Int)] -- ^ The instance to allocate
871 -> AllocSolutionList -- ^ Possible solution list
872 -> Result (Node.List, Instance.List,
873 AllocSolutionList) -- ^ The final solution list
874 allocList _ nl il [] result = Ok (nl, il, result)
875 allocList gl nl il ((xi, xicnt):xies) result = do
876 ares <- tryMGAlloc gl nl il xi xicnt
877 let sol = asSolution ares
878 nl' = extractNl nl sol
879 il' = updateIl il sol
880 allocList gl nl' il' xies ((xi, ares):result)
882 -- | Function which fails if the requested mode is change secondary.
884 -- This is useful since except DRBD, no other disk template can
885 -- execute change secondary; thus, we can just call this function
886 -- instead of always checking for secondary mode. After the call to
887 -- this function, whatever mode we have is just a primary change.
888 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
889 failOnSecondaryChange ChangeSecondary dt =
890 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
891 "' can't execute change secondary"
892 failOnSecondaryChange _ _ = return ()
894 -- | Run evacuation for a single instance.
896 -- /Note:/ this function should correctly execute both intra-group
897 -- evacuations (in all modes) and inter-group evacuations (in the
898 -- 'ChangeAll' mode). Of course, this requires that the correct list
899 -- of target nodes is passed.
900 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
901 -> Instance.List -- ^ Instance list (cluster-wide)
902 -> EvacMode -- ^ The evacuation mode
903 -> Instance.Instance -- ^ The instance to be evacuated
904 -> Gdx -- ^ The group we're targetting
905 -> [Ndx] -- ^ The list of available nodes
907 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
908 nodeEvacInstance nl il mode inst@(Instance.Instance
909 {Instance.diskTemplate = dt@DTDiskless})
911 failOnSecondaryChange mode dt >>
912 evacOneNodeOnly nl il inst gdx avail_nodes
914 nodeEvacInstance _ _ _ (Instance.Instance
915 {Instance.diskTemplate = DTPlain}) _ _ =
916 fail "Instances of type plain cannot be relocated"
918 nodeEvacInstance _ _ _ (Instance.Instance
919 {Instance.diskTemplate = DTFile}) _ _ =
920 fail "Instances of type file cannot be relocated"
922 nodeEvacInstance nl il mode inst@(Instance.Instance
923 {Instance.diskTemplate = dt@DTSharedFile})
925 failOnSecondaryChange mode dt >>
926 evacOneNodeOnly nl il inst gdx avail_nodes
928 nodeEvacInstance nl il mode inst@(Instance.Instance
929 {Instance.diskTemplate = dt@DTBlock})
931 failOnSecondaryChange mode dt >>
932 evacOneNodeOnly nl il inst gdx avail_nodes
934 nodeEvacInstance nl il mode inst@(Instance.Instance
935 {Instance.diskTemplate = dt@DTRbd})
937 failOnSecondaryChange mode dt >>
938 evacOneNodeOnly nl il inst gdx avail_nodes
940 nodeEvacInstance nl il ChangePrimary
941 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
944 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
945 let idx = Instance.idx inst
946 il' = Container.add idx inst' il
947 ops = iMoveToJob nl' il' idx Failover
948 return (nl', il', ops)
950 nodeEvacInstance nl il ChangeSecondary
951 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
953 evacOneNodeOnly nl il inst gdx avail_nodes
955 -- The algorithm for ChangeAll is as follows:
957 -- * generate all (primary, secondary) node pairs for the target groups
958 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
959 -- the final node list state and group score
960 -- * select the best choice via a foldl that uses the same Either
961 -- String solution as the ChangeSecondary mode
962 nodeEvacInstance nl il ChangeAll
963 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
966 let no_nodes = Left "no nodes available"
967 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
968 (nl', il', ops, _) <-
969 annotateResult "Can't find any good nodes for relocation" .
972 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
976 -- we don't need more details (which
977 -- nodes, etc.) as we only selected
978 -- this group if we can allocate on
979 -- it, hence failures will not
980 -- propagate out of this fold loop
981 Left _ -> Left $ "Allocation failed: " ++ msg
982 Ok result@(_, _, _, new_cv) ->
983 let new_accu = Right result in
986 Right (_, _, _, old_cv) ->
990 ) no_nodes node_pairs
992 return (nl', il', ops)
994 -- | Generic function for changing one node of an instance.
996 -- This is similar to 'nodeEvacInstance' but will be used in a few of
997 -- its sub-patterns. It folds the inner function 'evacOneNodeInner'
998 -- over the list of available nodes, which results in the best choice
1000 evacOneNodeOnly :: Node.List -- ^ The node list (cluster-wide)
1001 -> Instance.List -- ^ Instance list (cluster-wide)
1002 -> Instance.Instance -- ^ The instance to be evacuated
1003 -> Gdx -- ^ The group we're targetting
1004 -> [Ndx] -- ^ The list of available nodes
1006 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1007 evacOneNodeOnly nl il inst gdx avail_nodes = do
1008 op_fn <- case Instance.mirrorType inst of
1009 MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1010 MirrorInternal -> Ok ReplaceSecondary
1011 MirrorExternal -> Ok FailoverToAny
1012 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1014 foldl' (evacOneNodeInner nl inst gdx op_fn)
1015 (Left "no nodes available") avail_nodes
1016 let idx = Instance.idx inst
1017 il' = Container.add idx inst' il
1018 ops = iMoveToJob nl' il' idx (op_fn ndx)
1019 return (nl', il', ops)
1021 -- | Inner fold function for changing one node of an instance.
1023 -- Depending on the instance disk template, this will either change
1024 -- the secondary (for DRBD) or the primary node (for shared
1025 -- storage). However, the operation is generic otherwise.
1027 -- The running solution is either a @Left String@, which means we
1028 -- don't have yet a working solution, or a @Right (...)@, which
1029 -- represents a valid solution; it holds the modified node list, the
1030 -- modified instance (after evacuation), the score of that solution,
1031 -- and the new secondary node index.
1032 evacOneNodeInner :: Node.List -- ^ Cluster node list
1033 -> Instance.Instance -- ^ Instance being evacuated
1034 -> Gdx -- ^ The group index of the instance
1035 -> (Ndx -> IMove) -- ^ Operation constructor
1036 -> EvacInnerState -- ^ Current best solution
1037 -> Ndx -- ^ Node we're evaluating as target
1038 -> EvacInnerState -- ^ New best solution
1039 evacOneNodeInner nl inst gdx op_fn accu ndx =
1040 case applyMove nl inst (op_fn ndx) of
1041 OpFail fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1042 " failed: " ++ show fm
1043 in either (const $ Left fail_msg) (const accu) accu
1044 OpGood (nl', inst', _, _) ->
1045 let nodes = Container.elems nl'
1046 -- The fromJust below is ugly (it can fail nastily), but
1047 -- at this point we should have any internal mismatches,
1048 -- and adding a monad here would be quite involved
1049 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1050 new_cv = compCVNodes grpnodes
1051 new_accu = Right (nl', inst', new_cv, ndx)
1054 Right (_, _, old_cv, _) ->
1059 -- | Compute result of changing all nodes of a DRBD instance.
1061 -- Given the target primary and secondary node (which might be in a
1062 -- different group or not), this function will 'execute' all the
1063 -- required steps and assuming all operations succceed, will return
1064 -- the modified node and instance lists, the opcodes needed for this
1065 -- and the new group score.
1066 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1067 -> Instance.List -- ^ Cluster instance list
1068 -> Instance.Instance -- ^ The instance to be moved
1069 -> Gdx -- ^ The target group index
1070 -- (which can differ from the
1071 -- current group of the
1073 -> (Ndx, Ndx) -- ^ Tuple of new
1074 -- primary\/secondary nodes
1075 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1076 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1077 let primary = Container.find (Instance.pNode inst) nl
1078 idx = Instance.idx inst
1079 -- if the primary is offline, then we first failover
1080 (nl1, inst1, ops1) <-
1081 if Node.offline primary
1083 (nl', inst', _, _) <-
1084 annotateResult "Failing over to the secondary" .
1085 opToResult $ applyMove nl inst Failover
1086 return (nl', inst', [Failover])
1087 else return (nl, inst, [])
1088 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1090 ReplaceSecondary t_sdx)
1091 -- we now need to execute a replace secondary to the future
1093 (nl2, inst2, _, _) <-
1094 annotateResult "Changing secondary to new primary" .
1096 applyMove nl1 inst1 o1
1098 -- we now execute another failover, the primary stays fixed now
1099 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1100 opToResult $ applyMove nl2 inst2 o2
1102 -- and finally another replace secondary, to the final secondary
1103 (nl4, inst4, _, _) <-
1104 annotateResult "Changing secondary to final secondary" .
1106 applyMove nl3 inst3 o3
1108 il' = Container.add idx inst4 il
1109 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1110 let nodes = Container.elems nl4
1111 -- The fromJust below is ugly (it can fail nastily), but
1112 -- at this point we should have any internal mismatches,
1113 -- and adding a monad here would be quite involved
1114 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1115 new_cv = compCVNodes grpnodes
1116 return (nl4, il', ops, new_cv)
1118 -- | Computes the nodes in a given group which are available for
1120 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1121 -> IntSet.IntSet -- ^ Nodes that are excluded
1122 -> Gdx -- ^ The group for which we
1124 -> Result [Ndx] -- ^ List of available node indices
1125 availableGroupNodes group_nodes excl_ndx gdx = do
1126 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1127 Ok (lookup gdx group_nodes)
1128 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1131 -- | Updates the evac solution with the results of an instance
1133 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1135 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1136 -> (Node.List, Instance.List, EvacSolution)
1137 updateEvacSolution (nl, il, es) idx (Bad msg) =
1138 (nl, il, es { esFailed = (idx, msg):esFailed es})
1139 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1140 (nl, il, es { esMoved = new_elem:esMoved es
1141 , esOpCodes = opcodes:esOpCodes es })
1142 where inst = Container.find idx il
1144 instancePriGroup nl inst,
1145 Instance.allNodes inst)
1147 -- | Node-evacuation IAllocator mode main function.
1148 tryNodeEvac :: Group.List -- ^ The cluster groups
1149 -> Node.List -- ^ The node list (cluster-wide, not per group)
1150 -> Instance.List -- ^ Instance list (cluster-wide)
1151 -> EvacMode -- ^ The evacuation mode
1152 -> [Idx] -- ^ List of instance (indices) to be evacuated
1153 -> Result (Node.List, Instance.List, EvacSolution)
1154 tryNodeEvac _ ini_nl ini_il mode idxs =
1155 let evac_ndx = nodesToEvacuate ini_il mode idxs
1156 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1157 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1158 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1159 (Container.elems nl))) $
1160 splitCluster ini_nl ini_il
1161 (fin_nl, fin_il, esol) =
1162 foldl' (\state@(nl, il, _) inst ->
1163 let gdx = instancePriGroup nl inst
1164 pdx = Instance.pNode inst in
1165 updateEvacSolution state (Instance.idx inst) $
1166 availableGroupNodes group_ndx
1167 (IntSet.insert pdx excl_ndx) gdx >>=
1168 nodeEvacInstance nl il mode inst gdx
1170 (ini_nl, ini_il, emptyEvacSolution)
1171 (map (`Container.find` ini_il) idxs)
1172 in return (fin_nl, fin_il, reverseEvacSolution esol)
1174 -- | Change-group IAllocator mode main function.
1176 -- This is very similar to 'tryNodeEvac', the only difference is that
1177 -- we don't choose as target group the current instance group, but
1180 -- 1. at the start of the function, we compute which are the target
1181 -- groups; either no groups were passed in, in which case we choose
1182 -- all groups out of which we don't evacuate instance, or there were
1183 -- some groups passed, in which case we use those
1185 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1186 -- best group to hold the instance, and then we do what
1187 -- 'tryNodeEvac' does, except for this group instead of the current
1190 -- Note that the correct behaviour of this function relies on the
1191 -- function 'nodeEvacInstance' to be able to do correctly both
1192 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1193 tryChangeGroup :: Group.List -- ^ The cluster groups
1194 -> Node.List -- ^ The node list (cluster-wide)
1195 -> Instance.List -- ^ Instance list (cluster-wide)
1196 -> [Gdx] -- ^ Target groups; if empty, any
1197 -- groups not being evacuated
1198 -> [Idx] -- ^ List of instance (indices) to be evacuated
1199 -> Result (Node.List, Instance.List, EvacSolution)
1200 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1201 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1202 flip Container.find ini_il) idxs
1203 target_gdxs = (if null gdxs
1204 then Container.keys gl
1205 else gdxs) \\ evac_gdxs
1206 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1207 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1208 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1209 (Container.elems nl))) $
1210 splitCluster ini_nl ini_il
1211 (fin_nl, fin_il, esol) =
1212 foldl' (\state@(nl, il, _) inst ->
1214 let ncnt = Instance.requiredNodes $
1215 Instance.diskTemplate inst
1216 (gdx, _, _) <- findBestAllocGroup gl nl il
1217 (Just target_gdxs) inst ncnt
1218 av_nodes <- availableGroupNodes group_ndx
1220 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1221 in updateEvacSolution state (Instance.idx inst) solution
1223 (ini_nl, ini_il, emptyEvacSolution)
1224 (map (`Container.find` ini_il) idxs)
1225 in return (fin_nl, fin_il, reverseEvacSolution esol)
1227 -- | Standard-sized allocation method.
1229 -- This places instances of the same size on the cluster until we're
1230 -- out of space. The result will be a list of identically-sized
1232 iterateAlloc :: AllocMethod
1233 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1234 let depth = length ixes
1235 newname = printf "new-%d" depth::String
1236 newidx = Container.size il
1237 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1238 newlimit = fmap (flip (-) 1) limit
1239 in case tryAlloc nl il newi2 allocnodes of
1241 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1242 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1245 Just (xnl, xi, _, _) ->
1248 else iterateAlloc xnl (Container.add newidx xi il)
1249 newlimit newinst allocnodes (xi:ixes)
1250 (totalResources xnl:cstats)
1252 -- | Tiered allocation method.
1254 -- This places instances on the cluster, and decreases the spec until
1255 -- we can allocate again. The result will be a list of decreasing
1257 tieredAlloc :: AllocMethod
1258 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1259 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1261 Ok (errs, nl', il', ixes', cstats') ->
1262 let newsol = Ok (errs, nl', il', ixes', cstats')
1263 ixes_cnt = length ixes'
1264 (stop, newlimit) = case limit of
1265 Nothing -> (False, Nothing)
1266 Just n -> (n <= ixes_cnt,
1267 Just (n - ixes_cnt)) in
1268 if stop then newsol else
1269 case Instance.shrinkByType newinst . fst . last $
1270 sortBy (comparing snd) errs of
1272 Ok newinst' -> tieredAlloc nl' il' newlimit
1273 newinst' allocnodes ixes' cstats'
1275 -- * Formatting functions
1277 -- | Given the original and final nodes, computes the relocation description.
1278 computeMoves :: Instance.Instance -- ^ The instance to be moved
1279 -> String -- ^ The instance name
1280 -> IMove -- ^ The move being performed
1281 -> String -- ^ New primary
1282 -> String -- ^ New secondary
1283 -> (String, [String])
1284 -- ^ Tuple of moves and commands list; moves is containing
1285 -- either @/f/@ for failover or @/r:name/@ for replace
1286 -- secondary, while the command list holds gnt-instance
1287 -- commands (without that prefix), e.g \"@failover instance1@\"
1288 computeMoves i inam mv c d =
1290 Failover -> ("f", [mig])
1291 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1292 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1293 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1294 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1295 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1296 where morf = if Instance.isRunning i then "migrate" else "failover"
1297 mig = printf "%s -f %s" morf inam::String
1298 mig_any = printf "%s -f -n %s %s" morf c inam::String
1299 rep n = printf "replace-disks -n %s %s" n inam::String
1301 -- | Converts a placement to string format.
1302 printSolutionLine :: Node.List -- ^ The node list
1303 -> Instance.List -- ^ The instance list
1304 -> Int -- ^ Maximum node name length
1305 -> Int -- ^ Maximum instance name length
1306 -> Placement -- ^ The current placement
1307 -> Int -- ^ The index of the placement in
1309 -> (String, [String])
1310 printSolutionLine nl il nmlen imlen plc pos =
1311 let pmlen = (2*nmlen + 1)
1312 (i, p, s, mv, c) = plc
1313 old_sec = Instance.sNode inst
1314 inst = Container.find i il
1315 inam = Instance.alias inst
1316 npri = Node.alias $ Container.find p nl
1317 nsec = Node.alias $ Container.find s nl
1318 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1319 osec = Node.alias $ Container.find old_sec nl
1320 (moves, cmds) = computeMoves inst inam mv npri nsec
1321 -- FIXME: this should check instead/also the disk template
1322 ostr = if old_sec == Node.noSecondary
1323 then printf "%s" opri::String
1324 else printf "%s:%s" opri osec::String
1325 nstr = if s == Node.noSecondary
1326 then printf "%s" npri::String
1327 else printf "%s:%s" npri nsec::String
1328 in (printf " %3d. %-*s %-*s => %-*s %12.8f a=%s"
1329 pos imlen inam pmlen ostr pmlen nstr c moves,
1332 -- | Return the instance and involved nodes in an instance move.
1334 -- Note that the output list length can vary, and is not required nor
1335 -- guaranteed to be of any specific length.
1336 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1337 -- the instance from its index; note
1338 -- that this /must/ be the original
1339 -- instance list, so that we can
1340 -- retrieve the old nodes
1341 -> Placement -- ^ The placement we're investigating,
1342 -- containing the new nodes and
1344 -> [Ndx] -- ^ Resulting list of node indices
1345 involvedNodes il plc =
1346 let (i, np, ns, _, _) = plc
1347 inst = Container.find i il
1348 in nub $ [np, ns] ++ Instance.allNodes inst
1350 -- | Inner function for splitJobs, that either appends the next job to
1351 -- the current jobset, or starts a new jobset.
1352 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1353 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1354 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1355 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1356 | otherwise = ([n]:cjs, ndx)
1358 -- | Break a list of moves into independent groups. Note that this
1359 -- will reverse the order of jobs.
1360 splitJobs :: [MoveJob] -> [JobSet]
1361 splitJobs = fst . foldl mergeJobs ([], [])
1363 -- | Given a list of commands, prefix them with @gnt-instance@ and
1364 -- also beautify the display a little.
1365 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1366 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1368 printf " echo job %d/%d" jsn sn:
1370 map (" gnt-instance " ++) cmds
1372 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1375 -- | Given a list of commands, prefix them with @gnt-instance@ and
1376 -- also beautify the display a little.
1377 formatCmds :: [JobSet] -> String
1380 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1384 -- | Print the node list.
1385 printNodes :: Node.List -> [String] -> String
1387 let fields = case fs of
1388 [] -> Node.defaultFields
1389 "+":rest -> Node.defaultFields ++ rest
1391 snl = sortBy (comparing Node.idx) (Container.elems nl)
1392 (header, isnum) = unzip $ map Node.showHeader fields
1393 in printTable "" header (map (Node.list fields) snl) isnum
1395 -- | Print the instance list.
1396 printInsts :: Node.List -> Instance.List -> String
1398 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1399 helper inst = [ if Instance.isRunning inst then "R" else " "
1400 , Instance.name inst
1401 , Container.nameOf nl (Instance.pNode inst)
1402 , let sdx = Instance.sNode inst
1403 in if sdx == Node.noSecondary
1405 else Container.nameOf nl sdx
1406 , if Instance.autoBalance inst then "Y" else "N"
1407 , printf "%3d" $ Instance.vcpus inst
1408 , printf "%5d" $ Instance.mem inst
1409 , printf "%5d" $ Instance.dsk inst `div` 1024
1415 where DynUtil lC lM lD lN = Instance.util inst
1416 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1417 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1418 isnum = False:False:False:False:False:repeat True
1419 in printTable "" header (map helper sil) isnum
1421 -- | Shows statistics for a given node list.
1422 printStats :: String -> Node.List -> String
1424 let dcvs = compDetailedCV $ Container.elems nl
1425 (weights, names) = unzip detailedCVInfo
1426 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1427 header = [ "Field", "Value", "Weight" ]
1428 formatted = map (\(w, h, val) ->
1433 in printTable lp header formatted $ False:repeat True
1435 -- | Convert a placement into a list of OpCodes (basically a job).
1436 iMoveToJob :: Node.List -- ^ The node list; only used for node
1437 -- names, so any version is good
1438 -- (before or after the operation)
1439 -> Instance.List -- ^ The instance list; also used for
1441 -> Idx -- ^ The index of the instance being
1443 -> IMove -- ^ The actual move to be described
1444 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1446 iMoveToJob nl il idx move =
1447 let inst = Container.find idx il
1448 iname = Instance.name inst
1449 lookNode = Just . Container.nameOf nl
1450 opF = OpCodes.OpInstanceMigrate iname True False True Nothing
1451 opFA n = OpCodes.OpInstanceMigrate iname True False True (lookNode n)
1452 opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
1453 OpCodes.ReplaceNewSecondary [] Nothing
1456 FailoverToAny np -> [ opFA np ]
1457 ReplacePrimary np -> [ opF, opR np, opF ]
1458 ReplaceSecondary ns -> [ opR ns ]
1459 ReplaceAndFailover np -> [ opR np, opF ]
1460 FailoverAndReplace ns -> [ opF, opR ns ]
1462 -- * Node group functions
1464 -- | Computes the group of an instance.
1465 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1466 instanceGroup nl i =
1467 let sidx = Instance.sNode i
1468 pnode = Container.find (Instance.pNode i) nl
1469 snode = if sidx == Node.noSecondary
1471 else Container.find sidx nl
1472 pgroup = Node.group pnode
1473 sgroup = Node.group snode
1474 in if pgroup /= sgroup
1475 then fail ("Instance placed accross two node groups, primary " ++
1476 show pgroup ++ ", secondary " ++ show sgroup)
1479 -- | Computes the group of an instance per the primary node.
1480 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1481 instancePriGroup nl i =
1482 let pnode = Container.find (Instance.pNode i) nl
1485 -- | Compute the list of badly allocated instances (split across node
1487 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1488 findSplitInstances nl =
1489 filter (not . isOk . instanceGroup nl) . Container.elems
1491 -- | Splits a cluster into the component node groups.
1492 splitCluster :: Node.List -> Instance.List ->
1493 [(Gdx, (Node.List, Instance.List))]
1494 splitCluster nl il =
1495 let ngroups = Node.computeGroups (Container.elems nl)
1496 in map (\(guuid, nodes) ->
1497 let nidxs = map Node.idx nodes
1498 nodes' = zip nidxs nodes
1499 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1500 in (guuid, (Container.fromList nodes', instances))) ngroups
1502 -- | Compute the list of nodes that are to be evacuated, given a list
1503 -- of instances and an evacuation mode.
1504 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1505 -> EvacMode -- ^ The evacuation mode we're using
1506 -> [Idx] -- ^ List of instance indices being evacuated
1507 -> IntSet.IntSet -- ^ Set of node indices
1508 nodesToEvacuate il mode =
1509 IntSet.delete Node.noSecondary .
1511 let i = Container.find idx il
1512 pdx = Instance.pNode i
1513 sdx = Instance.sNode i
1514 dt = Instance.diskTemplate i
1515 withSecondary = case dt of
1516 DTDrbd8 -> IntSet.insert sdx ns
1519 ChangePrimary -> IntSet.insert pdx ns
1520 ChangeSecondary -> withSecondary
1521 ChangeAll -> IntSet.insert pdx withSecondary