1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
39 -- * Generic functions
41 , computeAllocationDelta
42 -- * First phase functions
44 -- * Second phase functions
49 -- * Display functions
52 -- * Balacing functions
61 -- * IAllocator functions
69 -- * Allocation functions
72 -- * Node group functions
78 import qualified Data.IntSet as IntSet
80 import Data.Maybe (fromJust, isNothing)
81 import Data.Ord (comparing)
82 import Text.Printf (printf)
84 import Ganeti.BasicTypes
85 import qualified Ganeti.HTools.Container as Container
86 import qualified Ganeti.HTools.Instance as Instance
87 import qualified Ganeti.HTools.Node as Node
88 import qualified Ganeti.HTools.Group as Group
89 import Ganeti.HTools.Types
91 import qualified Ganeti.OpCodes as OpCodes
93 import Ganeti.Types (mkNonEmpty)
97 -- | Allocation\/relocation solution.
98 data AllocSolution = AllocSolution
99 { asFailures :: [FailMode] -- ^ Failure counts
100 , asAllocs :: Int -- ^ Good allocation count
101 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
102 , asLog :: [String] -- ^ Informational messages
105 -- | Node evacuation/group change iallocator result type. This result
106 -- type consists of actual opcodes (a restricted subset) that are
107 -- transmitted back to Ganeti.
108 data EvacSolution = EvacSolution
109 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
110 , esFailed :: [(Idx, String)] -- ^ Instances which were not
112 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
115 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
116 type AllocResult = (FailStats, Node.List, Instance.List,
117 [Instance.Instance], [CStats])
119 -- | Type alias for easier handling.
120 type AllocSolutionList = [(Instance.Instance, AllocSolution)]
122 -- | A type denoting the valid allocation mode/pairs.
124 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
125 -- for a two-node allocation, this will be a @Right [('Ndx',
126 -- ['Ndx'])]@. In the latter case, the list is basically an
127 -- association list, grouped by primary node and holding the potential
128 -- secondary nodes in the sub-list.
129 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
131 -- | The empty solution we start with when computing allocations.
132 emptyAllocSolution :: AllocSolution
133 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
134 , asSolution = Nothing, asLog = [] }
136 -- | The empty evac solution.
137 emptyEvacSolution :: EvacSolution
138 emptyEvacSolution = EvacSolution { esMoved = []
143 -- | The complete state for the balancing solution.
144 data Table = Table Node.List Instance.List Score [Placement]
147 -- | Cluster statistics data type.
149 { csFmem :: Integer -- ^ Cluster free mem
150 , csFdsk :: Integer -- ^ Cluster free disk
151 , csAmem :: Integer -- ^ Cluster allocatable mem
152 , csAdsk :: Integer -- ^ Cluster allocatable disk
153 , csAcpu :: Integer -- ^ Cluster allocatable cpus
154 , csMmem :: Integer -- ^ Max node allocatable mem
155 , csMdsk :: Integer -- ^ Max node allocatable disk
156 , csMcpu :: Integer -- ^ Max node allocatable cpu
157 , csImem :: Integer -- ^ Instance used mem
158 , csIdsk :: Integer -- ^ Instance used disk
159 , csIcpu :: Integer -- ^ Instance used cpu
160 , csTmem :: Double -- ^ Cluster total mem
161 , csTdsk :: Double -- ^ Cluster total disk
162 , csTcpu :: Double -- ^ Cluster total cpus
163 , csVcpu :: Integer -- ^ Cluster total virtual cpus
164 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
165 -- physical CPUs, i.e. normalised used phys CPUs
166 , csXmem :: Integer -- ^ Unnacounted for mem
167 , csNmem :: Integer -- ^ Node own memory
168 , csScore :: Score -- ^ The cluster score
169 , csNinst :: Int -- ^ The total number of instances
172 -- | A simple type for allocation functions.
173 type AllocMethod = Node.List -- ^ Node list
174 -> Instance.List -- ^ Instance list
175 -> Maybe Int -- ^ Optional allocation limit
176 -> Instance.Instance -- ^ Instance spec for allocation
177 -> AllocNodes -- ^ Which nodes we should allocate on
178 -> [Instance.Instance] -- ^ Allocated instances
179 -> [CStats] -- ^ Running cluster stats
180 -> Result AllocResult -- ^ Allocation result
182 -- | A simple type for the running solution of evacuations.
183 type EvacInnerState =
184 Either String (Node.List, Instance.Instance, Score, Ndx)
186 -- * Utility functions
188 -- | Verifies the N+1 status and return the affected nodes.
189 verifyN1 :: [Node.Node] -> [Node.Node]
190 verifyN1 = filter Node.failN1
192 {-| Computes the pair of bad nodes and instances.
194 The bad node list is computed via a simple 'verifyN1' check, and the
195 bad instance list is the list of primary and secondary instances of
199 computeBadItems :: Node.List -> Instance.List ->
200 ([Node.Node], [Instance.Instance])
201 computeBadItems nl il =
202 let bad_nodes = verifyN1 $ getOnline nl
203 bad_instances = map (`Container.find` il) .
205 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
207 (bad_nodes, bad_instances)
209 -- | Extracts the node pairs for an instance. This can fail if the
210 -- instance is single-homed. FIXME: this needs to be improved,
211 -- together with the general enhancement for handling non-DRBD moves.
212 instanceNodes :: Node.List -> Instance.Instance ->
213 (Ndx, Ndx, Node.Node, Node.Node)
214 instanceNodes nl inst =
215 let old_pdx = Instance.pNode inst
216 old_sdx = Instance.sNode inst
217 old_p = Container.find old_pdx nl
218 old_s = Container.find old_sdx nl
219 in (old_pdx, old_sdx, old_p, old_s)
221 -- | Zero-initializer for the CStats type.
222 emptyCStats :: CStats
223 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
225 -- | Update stats with data from a new node.
226 updateCStats :: CStats -> Node.Node -> CStats
227 updateCStats cs node =
228 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
229 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
230 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
231 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
232 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
233 csVcpu = x_vcpu, csNcpu = x_ncpu,
234 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
237 inc_amem = Node.fMem node - Node.rMem node
238 inc_amem' = if inc_amem > 0 then inc_amem else 0
239 inc_adsk = Node.availDisk node
240 inc_imem = truncate (Node.tMem node) - Node.nMem node
241 - Node.xMem node - Node.fMem node
242 inc_icpu = Node.uCpu node
243 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
244 inc_vcpu = Node.hiCpu node
245 inc_acpu = Node.availCpu node
246 inc_ncpu = fromIntegral (Node.uCpu node) /
247 iPolicyVcpuRatio (Node.iPolicy node)
248 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
249 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
250 , csAmem = x_amem + fromIntegral inc_amem'
251 , csAdsk = x_adsk + fromIntegral inc_adsk
252 , csAcpu = x_acpu + fromIntegral inc_acpu
253 , csMmem = max x_mmem (fromIntegral inc_amem')
254 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
255 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
256 , csImem = x_imem + fromIntegral inc_imem
257 , csIdsk = x_idsk + fromIntegral inc_idsk
258 , csIcpu = x_icpu + fromIntegral inc_icpu
259 , csTmem = x_tmem + Node.tMem node
260 , csTdsk = x_tdsk + Node.tDsk node
261 , csTcpu = x_tcpu + Node.tCpu node
262 , csVcpu = x_vcpu + fromIntegral inc_vcpu
263 , csNcpu = x_ncpu + inc_ncpu
264 , csXmem = x_xmem + fromIntegral (Node.xMem node)
265 , csNmem = x_nmem + fromIntegral (Node.nMem node)
266 , csNinst = x_ninst + length (Node.pList node)
269 -- | Compute the total free disk and memory in the cluster.
270 totalResources :: Node.List -> CStats
272 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
273 in cs { csScore = compCV nl }
275 -- | Compute the delta between two cluster state.
277 -- This is used when doing allocations, to understand better the
278 -- available cluster resources. The return value is a triple of the
279 -- current used values, the delta that was still allocated, and what
280 -- was left unallocated.
281 computeAllocationDelta :: CStats -> CStats -> AllocStats
282 computeAllocationDelta cini cfin =
283 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
284 csNcpu = i_ncpu } = cini
285 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
286 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
287 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
288 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
289 , allocInfoNCpus = i_ncpu
290 , allocInfoMem = fromIntegral i_imem
291 , allocInfoDisk = fromIntegral i_idsk
293 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
294 , allocInfoNCpus = f_ncpu - i_ncpu
295 , allocInfoMem = fromIntegral (f_imem - i_imem)
296 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
298 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
299 , allocInfoNCpus = f_tcpu - f_ncpu
300 , allocInfoMem = truncate t_mem - fromIntegral f_imem
301 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
303 in (rini, rfin, runa)
305 -- | The names and weights of the individual elements in the CV list.
306 detailedCVInfo :: [(Double, String)]
307 detailedCVInfo = [ (1, "free_mem_cv")
308 , (1, "free_disk_cv")
310 , (1, "reserved_mem_cv")
311 , (4, "offline_all_cnt")
312 , (16, "offline_pri_cnt")
313 , (1, "vcpu_ratio_cv")
316 , (1, "disk_load_cv")
318 , (2, "pri_tags_score")
322 -- | Holds the weights used by 'compCVNodes' for each metric.
323 detailedCVWeights :: [Double]
324 detailedCVWeights = map fst detailedCVInfo
326 -- | Compute the mem and disk covariance.
327 compDetailedCV :: [Node.Node] -> [Double]
328 compDetailedCV all_nodes =
329 let (offline, nodes) = partition Node.offline all_nodes
330 mem_l = map Node.pMem nodes
331 dsk_l = map Node.pDsk nodes
332 -- metric: memory covariance
333 mem_cv = stdDev mem_l
334 -- metric: disk covariance
335 dsk_cv = stdDev dsk_l
336 -- metric: count of instances living on N1 failing nodes
337 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
338 length (Node.pList n)) .
339 filter Node.failN1 $ nodes :: Double
340 res_l = map Node.pRem nodes
341 -- metric: reserved memory covariance
342 res_cv = stdDev res_l
343 -- offline instances metrics
344 offline_ipri = sum . map (length . Node.pList) $ offline
345 offline_isec = sum . map (length . Node.sList) $ offline
346 -- metric: count of instances on offline nodes
347 off_score = fromIntegral (offline_ipri + offline_isec)::Double
348 -- metric: count of primary instances on offline nodes (this
349 -- helps with evacuation/failover of primary instances on
350 -- 2-node clusters with one node offline)
351 off_pri_score = fromIntegral offline_ipri::Double
352 cpu_l = map Node.pCpu nodes
353 -- metric: covariance of vcpu/pcpu ratio
354 cpu_cv = stdDev cpu_l
355 -- metrics: covariance of cpu, memory, disk and network load
356 (c_load, m_load, d_load, n_load) =
358 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
359 DynUtil c2 m2 d2 n2 = Node.utilPool n
360 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
361 -- metric: conflicting instance count
362 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
363 pri_tags_score = fromIntegral pri_tags_inst::Double
364 -- metric: spindles %
365 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
366 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
367 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
368 , pri_tags_score, stdDev spindles_cv ]
370 -- | Compute the /total/ variance.
371 compCVNodes :: [Node.Node] -> Double
372 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
374 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
375 compCV :: Node.List -> Double
376 compCV = compCVNodes . Container.elems
378 -- | Compute online nodes from a 'Node.List'.
379 getOnline :: Node.List -> [Node.Node]
380 getOnline = filter (not . Node.offline) . Container.elems
382 -- * Balancing functions
384 -- | Compute best table. Note that the ordering of the arguments is important.
385 compareTables :: Table -> Table -> Table
386 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
387 if a_cv > b_cv then b else a
389 -- | Applies an instance move to a given node list and instance.
390 applyMove :: Node.List -> Instance.Instance
391 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
393 applyMove nl inst Failover =
394 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
395 int_p = Node.removePri old_p inst
396 int_s = Node.removeSec old_s inst
397 new_nl = do -- Maybe monad
398 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
399 new_s <- Node.addSec int_p inst old_sdx
400 let new_inst = Instance.setBoth inst old_sdx old_pdx
401 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
402 new_inst, old_sdx, old_pdx)
405 -- Failover to any (fa)
406 applyMove nl inst (FailoverToAny new_pdx) = do
407 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
408 new_pnode = Container.find new_pdx nl
409 force_failover = Node.offline old_pnode
410 new_pnode' <- Node.addPriEx force_failover new_pnode inst
411 let old_pnode' = Node.removePri old_pnode inst
412 inst' = Instance.setPri inst new_pdx
413 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
414 return (nl', inst', new_pdx, old_sdx)
416 -- Replace the primary (f:, r:np, f)
417 applyMove nl inst (ReplacePrimary new_pdx) =
418 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
419 tgt_n = Container.find new_pdx nl
420 int_p = Node.removePri old_p inst
421 int_s = Node.removeSec old_s inst
422 force_p = Node.offline old_p
423 new_nl = do -- Maybe monad
424 -- check that the current secondary can host the instance
425 -- during the migration
426 tmp_s <- Node.addPriEx force_p int_s inst
427 let tmp_s' = Node.removePri tmp_s inst
428 new_p <- Node.addPriEx force_p tgt_n inst
429 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
430 let new_inst = Instance.setPri inst new_pdx
431 return (Container.add new_pdx new_p $
432 Container.addTwo old_pdx int_p old_sdx new_s nl,
433 new_inst, new_pdx, old_sdx)
436 -- Replace the secondary (r:ns)
437 applyMove nl inst (ReplaceSecondary new_sdx) =
438 let old_pdx = Instance.pNode inst
439 old_sdx = Instance.sNode inst
440 old_s = Container.find old_sdx nl
441 tgt_n = Container.find new_sdx nl
442 int_s = Node.removeSec old_s inst
443 force_s = Node.offline old_s
444 new_inst = Instance.setSec inst new_sdx
445 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
446 \new_s -> return (Container.addTwo new_sdx
447 new_s old_sdx int_s nl,
448 new_inst, old_pdx, new_sdx)
451 -- Replace the secondary and failover (r:np, f)
452 applyMove nl inst (ReplaceAndFailover new_pdx) =
453 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
454 tgt_n = Container.find new_pdx nl
455 int_p = Node.removePri old_p inst
456 int_s = Node.removeSec old_s inst
457 force_s = Node.offline old_s
458 new_nl = do -- Maybe monad
459 new_p <- Node.addPri tgt_n inst
460 new_s <- Node.addSecEx force_s int_p inst new_pdx
461 let new_inst = Instance.setBoth inst new_pdx old_pdx
462 return (Container.add new_pdx new_p $
463 Container.addTwo old_pdx new_s old_sdx int_s nl,
464 new_inst, new_pdx, old_pdx)
467 -- Failver and replace the secondary (f, r:ns)
468 applyMove nl inst (FailoverAndReplace new_sdx) =
469 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
470 tgt_n = Container.find new_sdx nl
471 int_p = Node.removePri old_p inst
472 int_s = Node.removeSec old_s inst
473 force_p = Node.offline old_p
474 new_nl = do -- Maybe monad
475 new_p <- Node.addPriEx force_p int_s inst
476 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
477 let new_inst = Instance.setBoth inst old_sdx new_sdx
478 return (Container.add new_sdx new_s $
479 Container.addTwo old_sdx new_p old_pdx int_p nl,
480 new_inst, old_sdx, new_sdx)
483 -- | Tries to allocate an instance on one given node.
484 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
485 -> OpResult Node.AllocElement
486 allocateOnSingle nl inst new_pdx =
487 let p = Container.find new_pdx nl
488 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
490 Instance.instMatchesPolicy inst (Node.iPolicy p)
491 new_p <- Node.addPri p inst
492 let new_nl = Container.add new_pdx new_p nl
493 new_score = compCV new_nl
494 return (new_nl, new_inst, [new_p], new_score)
496 -- | Tries to allocate an instance on a given pair of nodes.
497 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
498 -> OpResult Node.AllocElement
499 allocateOnPair nl inst new_pdx new_sdx =
500 let tgt_p = Container.find new_pdx nl
501 tgt_s = Container.find new_sdx nl
503 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
504 new_p <- Node.addPri tgt_p inst
505 new_s <- Node.addSec tgt_s inst new_pdx
506 let new_inst = Instance.setBoth inst new_pdx new_sdx
507 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
508 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
510 -- | Tries to perform an instance move and returns the best table
511 -- between the original one and the new one.
512 checkSingleStep :: Table -- ^ The original table
513 -> Instance.Instance -- ^ The instance to move
514 -> Table -- ^ The current best table
515 -> IMove -- ^ The move to apply
516 -> Table -- ^ The final best table
517 checkSingleStep ini_tbl target cur_tbl move =
518 let Table ini_nl ini_il _ ini_plc = ini_tbl
519 tmp_resu = applyMove ini_nl target move
522 Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
523 let tgt_idx = Instance.idx target
524 upd_cvar = compCV upd_nl
525 upd_il = Container.add tgt_idx new_inst ini_il
526 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
527 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
528 in compareTables cur_tbl upd_tbl
530 -- | Given the status of the current secondary as a valid new node and
531 -- the current candidate target node, generate the possible moves for
533 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
534 -> Bool -- ^ Whether the secondary node is a valid new node
535 -> Bool -- ^ Whether we can change the primary node
536 -> Ndx -- ^ Target node candidate
537 -> [IMove] -- ^ List of valid result moves
539 possibleMoves MirrorNone _ _ _ = []
541 possibleMoves MirrorExternal _ False _ = []
543 possibleMoves MirrorExternal _ True tdx =
544 [ FailoverToAny tdx ]
546 possibleMoves MirrorInternal _ False tdx =
547 [ ReplaceSecondary tdx ]
549 possibleMoves MirrorInternal True True tdx =
550 [ ReplaceSecondary tdx
551 , ReplaceAndFailover tdx
553 , FailoverAndReplace tdx
556 possibleMoves MirrorInternal False True tdx =
557 [ ReplaceSecondary tdx
558 , ReplaceAndFailover tdx
561 -- | Compute the best move for a given instance.
562 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
563 -> Bool -- ^ Whether disk moves are allowed
564 -> Bool -- ^ Whether instance moves are allowed
565 -> Table -- ^ Original table
566 -> Instance.Instance -- ^ Instance to move
567 -> Table -- ^ Best new table for this instance
568 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
569 let opdx = Instance.pNode target
570 osdx = Instance.sNode target
571 bad_nodes = [opdx, osdx]
572 nodes = filter (`notElem` bad_nodes) nodes_idx
573 mir_type = Instance.mirrorType target
574 use_secondary = elem osdx nodes_idx && inst_moves
575 aft_failover = if mir_type == MirrorInternal && use_secondary
576 -- if drbd and allowed to failover
577 then checkSingleStep ini_tbl target ini_tbl Failover
581 then concatMap (possibleMoves mir_type use_secondary inst_moves)
585 -- iterate over the possible nodes for this instance
586 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
588 -- | Compute the best next move.
589 checkMove :: [Ndx] -- ^ Allowed target node indices
590 -> Bool -- ^ Whether disk moves are allowed
591 -> Bool -- ^ Whether instance moves are allowed
592 -> Table -- ^ The current solution
593 -> [Instance.Instance] -- ^ List of instances still to move
594 -> Table -- ^ The new solution
595 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
596 let Table _ _ _ ini_plc = ini_tbl
597 -- we're using rwhnf from the Control.Parallel.Strategies
598 -- package; we don't need to use rnf as that would force too
599 -- much evaluation in single-threaded cases, and in
600 -- multi-threaded case the weak head normal form is enough to
601 -- spark the evaluation
602 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
605 -- iterate over all instances, computing the best move
606 best_tbl = foldl' compareTables ini_tbl tables
607 Table _ _ _ best_plc = best_tbl
608 in if length best_plc == length ini_plc
609 then ini_tbl -- no advancement
612 -- | Check if we are allowed to go deeper in the balancing.
613 doNextBalance :: Table -- ^ The starting table
614 -> Int -- ^ Remaining length
615 -> Score -- ^ Score at which to stop
616 -> Bool -- ^ The resulting table and commands
617 doNextBalance ini_tbl max_rounds min_score =
618 let Table _ _ ini_cv ini_plc = ini_tbl
619 ini_plc_len = length ini_plc
620 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
622 -- | Run a balance move.
623 tryBalance :: Table -- ^ The starting table
624 -> Bool -- ^ Allow disk moves
625 -> Bool -- ^ Allow instance moves
626 -> Bool -- ^ Only evacuate moves
627 -> Score -- ^ Min gain threshold
628 -> Score -- ^ Min gain
629 -> Maybe Table -- ^ The resulting table and commands
630 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
631 let Table ini_nl ini_il ini_cv _ = ini_tbl
632 all_inst = Container.elems ini_il
633 all_nodes = Container.elems ini_nl
634 (offline_nodes, online_nodes) = partition Node.offline all_nodes
635 all_inst' = if evac_mode
636 then let bad_nodes = map Node.idx offline_nodes
637 in filter (any (`elem` bad_nodes) .
638 Instance.allNodes) all_inst
640 reloc_inst = filter (\i -> Instance.movable i &&
641 Instance.autoBalance i) all_inst'
642 node_idx = map Node.idx online_nodes
643 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
644 (Table _ _ fin_cv _) = fin_tbl
646 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
647 then Just fin_tbl -- this round made success, return the new table
650 -- * Allocation functions
652 -- | Build failure stats out of a list of failures.
653 collapseFailures :: [FailMode] -> FailStats
654 collapseFailures flst =
655 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
658 -- | Compares two Maybe AllocElement and chooses the besst score.
659 bestAllocElement :: Maybe Node.AllocElement
660 -> Maybe Node.AllocElement
661 -> Maybe Node.AllocElement
662 bestAllocElement a Nothing = a
663 bestAllocElement Nothing b = b
664 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
665 if ascore < bscore then a else b
667 -- | Update current Allocation solution and failure stats with new
669 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
670 concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
672 concatAllocs as (Ok ns) =
673 let -- Choose the old or new solution, based on the cluster score
675 osols = asSolution as
676 nsols = bestAllocElement osols (Just ns)
678 -- Note: we force evaluation of nsols here in order to keep the
679 -- memory profile low - we know that we will need nsols for sure
680 -- in the next cycle, so we force evaluation of nsols, since the
681 -- foldl' in the caller will only evaluate the tuple, but not the
682 -- elements of the tuple
683 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
685 -- | Sums two 'AllocSolution' structures.
686 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
687 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
688 (AllocSolution bFails bAllocs bSols bLog) =
689 -- note: we add b first, since usually it will be smaller; when
690 -- fold'ing, a will grow and grow whereas b is the per-group
691 -- result, hence smaller
692 let nFails = bFails ++ aFails
693 nAllocs = aAllocs + bAllocs
694 nSols = bestAllocElement aSols bSols
696 in AllocSolution nFails nAllocs nSols nLog
698 -- | Given a solution, generates a reasonable description for it.
699 describeSolution :: AllocSolution -> String
700 describeSolution as =
701 let fcnt = asFailures as
704 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
705 filter ((> 0) . snd) . collapseFailures $ fcnt
707 Nothing -> "No valid allocation solutions, failure reasons: " ++
708 (if null fcnt then "unknown reasons" else freasons)
709 Just (_, _, nodes, cv) ->
710 printf ("score: %.8f, successes %d, failures %d (%s)" ++
711 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
712 (intercalate "/" . map Node.name $ nodes)
714 -- | Annotates a solution with the appropriate string.
715 annotateSolution :: AllocSolution -> AllocSolution
716 annotateSolution as = as { asLog = describeSolution as : asLog as }
718 -- | Reverses an evacuation solution.
720 -- Rationale: we always concat the results to the top of the lists, so
721 -- for proper jobset execution, we should reverse all lists.
722 reverseEvacSolution :: EvacSolution -> EvacSolution
723 reverseEvacSolution (EvacSolution f m o) =
724 EvacSolution (reverse f) (reverse m) (reverse o)
726 -- | Generate the valid node allocation singles or pairs for a new instance.
727 genAllocNodes :: Group.List -- ^ Group list
728 -> Node.List -- ^ The node map
729 -> Int -- ^ The number of nodes required
730 -> Bool -- ^ Whether to drop or not
732 -> Result AllocNodes -- ^ The (monadic) result
733 genAllocNodes gl nl count drop_unalloc =
734 let filter_fn = if drop_unalloc
735 then filter (Group.isAllocable .
736 flip Container.find gl . Node.group)
738 all_nodes = filter_fn $ getOnline nl
739 all_pairs = [(Node.idx p,
740 [Node.idx s | s <- all_nodes,
741 Node.idx p /= Node.idx s,
742 Node.group p == Node.group s]) |
745 1 -> Ok (Left (map Node.idx all_nodes))
746 2 -> Ok (Right (filter (not . null . snd) all_pairs))
747 _ -> Bad "Unsupported number of nodes, only one or two supported"
749 -- | Try to allocate an instance on the cluster.
750 tryAlloc :: (Monad m) =>
751 Node.List -- ^ The node list
752 -> Instance.List -- ^ The instance list
753 -> Instance.Instance -- ^ The instance to allocate
754 -> AllocNodes -- ^ The allocation targets
755 -> m AllocSolution -- ^ Possible solution list
756 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
757 tryAlloc nl _ inst (Right ok_pairs) =
758 let psols = parMap rwhnf (\(p, ss) ->
760 concatAllocs cstate .
761 allocateOnPair nl inst p)
762 emptyAllocSolution ss) ok_pairs
763 sols = foldl' sumAllocs emptyAllocSolution psols
764 in return $ annotateSolution sols
766 tryAlloc _ _ _ (Left []) = fail "No online nodes"
767 tryAlloc nl _ inst (Left all_nodes) =
768 let sols = foldl' (\cstate ->
769 concatAllocs cstate . allocateOnSingle nl inst
770 ) emptyAllocSolution all_nodes
771 in return $ annotateSolution sols
773 -- | Given a group/result, describe it as a nice (list of) messages.
774 solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
775 solutionDescription gl (groupId, result) =
777 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
778 Bad message -> [printf "Group %s: error %s" gname message]
779 where grp = Container.find groupId gl
780 gname = Group.name grp
781 pol = allocPolicyToRaw (Group.allocPolicy grp)
783 -- | From a list of possibly bad and possibly empty solutions, filter
784 -- only the groups with a valid result. Note that the result will be
785 -- reversed compared to the original list.
786 filterMGResults :: Group.List
787 -> [(Gdx, Result AllocSolution)]
788 -> [(Gdx, AllocSolution)]
789 filterMGResults gl = foldl' fn []
790 where unallocable = not . Group.isAllocable . flip Container.find gl
791 fn accu (gdx, rasol) =
794 Ok sol | isNothing (asSolution sol) -> accu
795 | unallocable gdx -> accu
796 | otherwise -> (gdx, sol):accu
798 -- | Sort multigroup results based on policy and score.
799 sortMGResults :: Group.List
800 -> [(Gdx, AllocSolution)]
801 -> [(Gdx, AllocSolution)]
802 sortMGResults gl sols =
803 let extractScore (_, _, _, x) = x
804 solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
805 (extractScore . fromJust . asSolution) sol)
806 in sortBy (comparing solScore) sols
808 -- | Finds the best group for an instance on a multi-group cluster.
810 -- Only solutions in @preferred@ and @last_resort@ groups will be
811 -- accepted as valid, and additionally if the allowed groups parameter
812 -- is not null then allocation will only be run for those group
814 findBestAllocGroup :: Group.List -- ^ The group list
815 -> Node.List -- ^ The node list
816 -> Instance.List -- ^ The instance list
817 -> Maybe [Gdx] -- ^ The allowed groups
818 -> Instance.Instance -- ^ The instance to allocate
819 -> Int -- ^ Required number of nodes
820 -> Result (Gdx, AllocSolution, [String])
821 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
822 let groups = splitCluster mgnl mgil
823 groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
825 sols = map (\(gid, (nl, il)) ->
826 (gid, genAllocNodes mggl nl cnt False >>=
827 tryAlloc nl il inst))
828 groups'::[(Gdx, Result AllocSolution)]
829 all_msgs = concatMap (solutionDescription mggl) sols
830 goodSols = filterMGResults mggl sols
831 sortedSols = sortMGResults mggl goodSols
832 in if null sortedSols
833 then Bad $ if null groups'
834 then "no groups for evacuation: allowed groups was" ++
835 show allowed_gdxs ++ ", all groups: " ++
836 show (map fst groups)
837 else intercalate ", " all_msgs
838 else let (final_group, final_sol) = head sortedSols
839 in return (final_group, final_sol, all_msgs)
841 -- | Try to allocate an instance on a multi-group cluster.
842 tryMGAlloc :: Group.List -- ^ The group list
843 -> Node.List -- ^ The node list
844 -> Instance.List -- ^ The instance list
845 -> Instance.Instance -- ^ The instance to allocate
846 -> Int -- ^ Required number of nodes
847 -> Result AllocSolution -- ^ Possible solution list
848 tryMGAlloc mggl mgnl mgil inst cnt = do
849 (best_group, solution, all_msgs) <-
850 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
851 let group_name = Group.name $ Container.find best_group mggl
852 selmsg = "Selected group: " ++ group_name
853 return $ solution { asLog = selmsg:all_msgs }
855 -- | Calculate the new instance list after allocation solution.
856 updateIl :: Instance.List -- ^ The original instance list
857 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
858 -> Instance.List -- ^ The updated instance list
859 updateIl il Nothing = il
860 updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
862 -- | Extract the the new node list from the allocation solution.
863 extractNl :: Node.List -- ^ The original node list
864 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
865 -> Node.List -- ^ The new node list
866 extractNl nl Nothing = nl
867 extractNl _ (Just (xnl, _, _, _)) = xnl
869 -- | Try to allocate a list of instances on a multi-group cluster.
870 allocList :: Group.List -- ^ The group list
871 -> Node.List -- ^ The node list
872 -> Instance.List -- ^ The instance list
873 -> [(Instance.Instance, Int)] -- ^ The instance to allocate
874 -> AllocSolutionList -- ^ Possible solution list
875 -> Result (Node.List, Instance.List,
876 AllocSolutionList) -- ^ The final solution list
877 allocList _ nl il [] result = Ok (nl, il, result)
878 allocList gl nl il ((xi, xicnt):xies) result = do
879 ares <- tryMGAlloc gl nl il xi xicnt
880 let sol = asSolution ares
881 nl' = extractNl nl sol
882 il' = updateIl il sol
883 allocList gl nl' il' xies ((xi, ares):result)
885 -- | Function which fails if the requested mode is change secondary.
887 -- This is useful since except DRBD, no other disk template can
888 -- execute change secondary; thus, we can just call this function
889 -- instead of always checking for secondary mode. After the call to
890 -- this function, whatever mode we have is just a primary change.
891 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
892 failOnSecondaryChange ChangeSecondary dt =
893 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
894 "' can't execute change secondary"
895 failOnSecondaryChange _ _ = return ()
897 -- | Run evacuation for a single instance.
899 -- /Note:/ this function should correctly execute both intra-group
900 -- evacuations (in all modes) and inter-group evacuations (in the
901 -- 'ChangeAll' mode). Of course, this requires that the correct list
902 -- of target nodes is passed.
903 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
904 -> Instance.List -- ^ Instance list (cluster-wide)
905 -> EvacMode -- ^ The evacuation mode
906 -> Instance.Instance -- ^ The instance to be evacuated
907 -> Gdx -- ^ The group we're targetting
908 -> [Ndx] -- ^ The list of available nodes
910 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
911 nodeEvacInstance nl il mode inst@(Instance.Instance
912 {Instance.diskTemplate = dt@DTDiskless})
914 failOnSecondaryChange mode dt >>
915 evacOneNodeOnly nl il inst gdx avail_nodes
917 nodeEvacInstance _ _ _ (Instance.Instance
918 {Instance.diskTemplate = DTPlain}) _ _ =
919 fail "Instances of type plain cannot be relocated"
921 nodeEvacInstance _ _ _ (Instance.Instance
922 {Instance.diskTemplate = DTFile}) _ _ =
923 fail "Instances of type file cannot be relocated"
925 nodeEvacInstance nl il mode inst@(Instance.Instance
926 {Instance.diskTemplate = dt@DTSharedFile})
928 failOnSecondaryChange mode dt >>
929 evacOneNodeOnly nl il inst gdx avail_nodes
931 nodeEvacInstance nl il mode inst@(Instance.Instance
932 {Instance.diskTemplate = dt@DTBlock})
934 failOnSecondaryChange mode dt >>
935 evacOneNodeOnly nl il inst gdx avail_nodes
937 nodeEvacInstance nl il mode inst@(Instance.Instance
938 {Instance.diskTemplate = dt@DTRbd})
940 failOnSecondaryChange mode dt >>
941 evacOneNodeOnly nl il inst gdx avail_nodes
943 nodeEvacInstance nl il mode inst@(Instance.Instance
944 {Instance.diskTemplate = dt@DTExt})
946 failOnSecondaryChange mode dt >>
947 evacOneNodeOnly nl il inst gdx avail_nodes
949 nodeEvacInstance nl il ChangePrimary
950 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
953 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
954 let idx = Instance.idx inst
955 il' = Container.add idx inst' il
956 ops = iMoveToJob nl' il' idx Failover
957 return (nl', il', ops)
959 nodeEvacInstance nl il ChangeSecondary
960 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
962 evacOneNodeOnly nl il inst gdx avail_nodes
964 -- The algorithm for ChangeAll is as follows:
966 -- * generate all (primary, secondary) node pairs for the target groups
967 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
968 -- the final node list state and group score
969 -- * select the best choice via a foldl that uses the same Either
970 -- String solution as the ChangeSecondary mode
971 nodeEvacInstance nl il ChangeAll
972 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
975 let no_nodes = Left "no nodes available"
976 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
977 (nl', il', ops, _) <-
978 annotateResult "Can't find any good nodes for relocation" .
981 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
985 -- we don't need more details (which
986 -- nodes, etc.) as we only selected
987 -- this group if we can allocate on
988 -- it, hence failures will not
989 -- propagate out of this fold loop
990 Left _ -> Left $ "Allocation failed: " ++ msg
991 Ok result@(_, _, _, new_cv) ->
992 let new_accu = Right result in
995 Right (_, _, _, old_cv) ->
999 ) no_nodes node_pairs
1001 return (nl', il', ops)
1003 -- | Generic function for changing one node of an instance.
1005 -- This is similar to 'nodeEvacInstance' but will be used in a few of
1006 -- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1007 -- over the list of available nodes, which results in the best choice
1009 evacOneNodeOnly :: Node.List -- ^ The node list (cluster-wide)
1010 -> Instance.List -- ^ Instance list (cluster-wide)
1011 -> Instance.Instance -- ^ The instance to be evacuated
1012 -> Gdx -- ^ The group we're targetting
1013 -> [Ndx] -- ^ The list of available nodes
1015 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1016 evacOneNodeOnly nl il inst gdx avail_nodes = do
1017 op_fn <- case Instance.mirrorType inst of
1018 MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1019 MirrorInternal -> Ok ReplaceSecondary
1020 MirrorExternal -> Ok FailoverToAny
1021 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1023 foldl' (evacOneNodeInner nl inst gdx op_fn)
1024 (Left "no nodes available") avail_nodes
1025 let idx = Instance.idx inst
1026 il' = Container.add idx inst' il
1027 ops = iMoveToJob nl' il' idx (op_fn ndx)
1028 return (nl', il', ops)
1030 -- | Inner fold function for changing one node of an instance.
1032 -- Depending on the instance disk template, this will either change
1033 -- the secondary (for DRBD) or the primary node (for shared
1034 -- storage). However, the operation is generic otherwise.
1036 -- The running solution is either a @Left String@, which means we
1037 -- don't have yet a working solution, or a @Right (...)@, which
1038 -- represents a valid solution; it holds the modified node list, the
1039 -- modified instance (after evacuation), the score of that solution,
1040 -- and the new secondary node index.
1041 evacOneNodeInner :: Node.List -- ^ Cluster node list
1042 -> Instance.Instance -- ^ Instance being evacuated
1043 -> Gdx -- ^ The group index of the instance
1044 -> (Ndx -> IMove) -- ^ Operation constructor
1045 -> EvacInnerState -- ^ Current best solution
1046 -> Ndx -- ^ Node we're evaluating as target
1047 -> EvacInnerState -- ^ New best solution
1048 evacOneNodeInner nl inst gdx op_fn accu ndx =
1049 case applyMove nl inst (op_fn ndx) of
1050 Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1051 " failed: " ++ show fm
1052 in either (const $ Left fail_msg) (const accu) accu
1053 Ok (nl', inst', _, _) ->
1054 let nodes = Container.elems nl'
1055 -- The fromJust below is ugly (it can fail nastily), but
1056 -- at this point we should have any internal mismatches,
1057 -- and adding a monad here would be quite involved
1058 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1059 new_cv = compCVNodes grpnodes
1060 new_accu = Right (nl', inst', new_cv, ndx)
1063 Right (_, _, old_cv, _) ->
1068 -- | Compute result of changing all nodes of a DRBD instance.
1070 -- Given the target primary and secondary node (which might be in a
1071 -- different group or not), this function will 'execute' all the
1072 -- required steps and assuming all operations succceed, will return
1073 -- the modified node and instance lists, the opcodes needed for this
1074 -- and the new group score.
1075 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1076 -> Instance.List -- ^ Cluster instance list
1077 -> Instance.Instance -- ^ The instance to be moved
1078 -> Gdx -- ^ The target group index
1079 -- (which can differ from the
1080 -- current group of the
1082 -> (Ndx, Ndx) -- ^ Tuple of new
1083 -- primary\/secondary nodes
1084 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1085 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1086 let primary = Container.find (Instance.pNode inst) nl
1087 idx = Instance.idx inst
1088 -- if the primary is offline, then we first failover
1089 (nl1, inst1, ops1) <-
1090 if Node.offline primary
1092 (nl', inst', _, _) <-
1093 annotateResult "Failing over to the secondary" .
1094 opToResult $ applyMove nl inst Failover
1095 return (nl', inst', [Failover])
1096 else return (nl, inst, [])
1097 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1099 ReplaceSecondary t_sdx)
1100 -- we now need to execute a replace secondary to the future
1102 (nl2, inst2, _, _) <-
1103 annotateResult "Changing secondary to new primary" .
1105 applyMove nl1 inst1 o1
1107 -- we now execute another failover, the primary stays fixed now
1108 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1109 opToResult $ applyMove nl2 inst2 o2
1111 -- and finally another replace secondary, to the final secondary
1112 (nl4, inst4, _, _) <-
1113 annotateResult "Changing secondary to final secondary" .
1115 applyMove nl3 inst3 o3
1117 il' = Container.add idx inst4 il
1118 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1119 let nodes = Container.elems nl4
1120 -- The fromJust below is ugly (it can fail nastily), but
1121 -- at this point we should have any internal mismatches,
1122 -- and adding a monad here would be quite involved
1123 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1124 new_cv = compCVNodes grpnodes
1125 return (nl4, il', ops, new_cv)
1127 -- | Computes the nodes in a given group which are available for
1129 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1130 -> IntSet.IntSet -- ^ Nodes that are excluded
1131 -> Gdx -- ^ The group for which we
1133 -> Result [Ndx] -- ^ List of available node indices
1134 availableGroupNodes group_nodes excl_ndx gdx = do
1135 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1136 Ok (lookup gdx group_nodes)
1137 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1140 -- | Updates the evac solution with the results of an instance
1142 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1144 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1145 -> (Node.List, Instance.List, EvacSolution)
1146 updateEvacSolution (nl, il, es) idx (Bad msg) =
1147 (nl, il, es { esFailed = (idx, msg):esFailed es})
1148 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1149 (nl, il, es { esMoved = new_elem:esMoved es
1150 , esOpCodes = opcodes:esOpCodes es })
1151 where inst = Container.find idx il
1153 instancePriGroup nl inst,
1154 Instance.allNodes inst)
1156 -- | Node-evacuation IAllocator mode main function.
1157 tryNodeEvac :: Group.List -- ^ The cluster groups
1158 -> Node.List -- ^ The node list (cluster-wide, not per group)
1159 -> Instance.List -- ^ Instance list (cluster-wide)
1160 -> EvacMode -- ^ The evacuation mode
1161 -> [Idx] -- ^ List of instance (indices) to be evacuated
1162 -> Result (Node.List, Instance.List, EvacSolution)
1163 tryNodeEvac _ ini_nl ini_il mode idxs =
1164 let evac_ndx = nodesToEvacuate ini_il mode idxs
1165 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1166 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1167 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1168 (Container.elems nl))) $
1169 splitCluster ini_nl ini_il
1170 (fin_nl, fin_il, esol) =
1171 foldl' (\state@(nl, il, _) inst ->
1172 let gdx = instancePriGroup nl inst
1173 pdx = Instance.pNode inst in
1174 updateEvacSolution state (Instance.idx inst) $
1175 availableGroupNodes group_ndx
1176 (IntSet.insert pdx excl_ndx) gdx >>=
1177 nodeEvacInstance nl il mode inst gdx
1179 (ini_nl, ini_il, emptyEvacSolution)
1180 (map (`Container.find` ini_il) idxs)
1181 in return (fin_nl, fin_il, reverseEvacSolution esol)
1183 -- | Change-group IAllocator mode main function.
1185 -- This is very similar to 'tryNodeEvac', the only difference is that
1186 -- we don't choose as target group the current instance group, but
1189 -- 1. at the start of the function, we compute which are the target
1190 -- groups; either no groups were passed in, in which case we choose
1191 -- all groups out of which we don't evacuate instance, or there were
1192 -- some groups passed, in which case we use those
1194 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1195 -- best group to hold the instance, and then we do what
1196 -- 'tryNodeEvac' does, except for this group instead of the current
1199 -- Note that the correct behaviour of this function relies on the
1200 -- function 'nodeEvacInstance' to be able to do correctly both
1201 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1202 tryChangeGroup :: Group.List -- ^ The cluster groups
1203 -> Node.List -- ^ The node list (cluster-wide)
1204 -> Instance.List -- ^ Instance list (cluster-wide)
1205 -> [Gdx] -- ^ Target groups; if empty, any
1206 -- groups not being evacuated
1207 -> [Idx] -- ^ List of instance (indices) to be evacuated
1208 -> Result (Node.List, Instance.List, EvacSolution)
1209 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1210 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1211 flip Container.find ini_il) idxs
1212 target_gdxs = (if null gdxs
1213 then Container.keys gl
1214 else gdxs) \\ evac_gdxs
1215 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1216 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1217 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1218 (Container.elems nl))) $
1219 splitCluster ini_nl ini_il
1220 (fin_nl, fin_il, esol) =
1221 foldl' (\state@(nl, il, _) inst ->
1223 let ncnt = Instance.requiredNodes $
1224 Instance.diskTemplate inst
1225 (gdx, _, _) <- findBestAllocGroup gl nl il
1226 (Just target_gdxs) inst ncnt
1227 av_nodes <- availableGroupNodes group_ndx
1229 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1230 in updateEvacSolution state (Instance.idx inst) solution
1232 (ini_nl, ini_il, emptyEvacSolution)
1233 (map (`Container.find` ini_il) idxs)
1234 in return (fin_nl, fin_il, reverseEvacSolution esol)
1236 -- | Standard-sized allocation method.
1238 -- This places instances of the same size on the cluster until we're
1239 -- out of space. The result will be a list of identically-sized
1241 iterateAlloc :: AllocMethod
1242 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1243 let depth = length ixes
1244 newname = printf "new-%d" depth::String
1245 newidx = Container.size il
1246 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1247 newlimit = fmap (flip (-) 1) limit
1248 in case tryAlloc nl il newi2 allocnodes of
1250 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1251 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1254 Just (xnl, xi, _, _) ->
1257 else iterateAlloc xnl (Container.add newidx xi il)
1258 newlimit newinst allocnodes (xi:ixes)
1259 (totalResources xnl:cstats)
1261 -- | Tiered allocation method.
1263 -- This places instances on the cluster, and decreases the spec until
1264 -- we can allocate again. The result will be a list of decreasing
1266 tieredAlloc :: AllocMethod
1267 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1268 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1270 Ok (errs, nl', il', ixes', cstats') ->
1271 let newsol = Ok (errs, nl', il', ixes', cstats')
1272 ixes_cnt = length ixes'
1273 (stop, newlimit) = case limit of
1274 Nothing -> (False, Nothing)
1275 Just n -> (n <= ixes_cnt,
1276 Just (n - ixes_cnt)) in
1277 if stop then newsol else
1278 case Instance.shrinkByType newinst . fst . last $
1279 sortBy (comparing snd) errs of
1281 Ok newinst' -> tieredAlloc nl' il' newlimit
1282 newinst' allocnodes ixes' cstats'
1284 -- * Formatting functions
1286 -- | Given the original and final nodes, computes the relocation description.
1287 computeMoves :: Instance.Instance -- ^ The instance to be moved
1288 -> String -- ^ The instance name
1289 -> IMove -- ^ The move being performed
1290 -> String -- ^ New primary
1291 -> String -- ^ New secondary
1292 -> (String, [String])
1293 -- ^ Tuple of moves and commands list; moves is containing
1294 -- either @/f/@ for failover or @/r:name/@ for replace
1295 -- secondary, while the command list holds gnt-instance
1296 -- commands (without that prefix), e.g \"@failover instance1@\"
1297 computeMoves i inam mv c d =
1299 Failover -> ("f", [mig])
1300 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1301 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1302 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1303 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1304 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1305 where morf = if Instance.isRunning i then "migrate" else "failover"
1306 mig = printf "%s -f %s" morf inam::String
1307 mig_any = printf "%s -f -n %s %s" morf c inam::String
1308 rep n = printf "replace-disks -n %s %s" n inam::String
1310 -- | Converts a placement to string format.
1311 printSolutionLine :: Node.List -- ^ The node list
1312 -> Instance.List -- ^ The instance list
1313 -> Int -- ^ Maximum node name length
1314 -> Int -- ^ Maximum instance name length
1315 -> Placement -- ^ The current placement
1316 -> Int -- ^ The index of the placement in
1318 -> (String, [String])
1319 printSolutionLine nl il nmlen imlen plc pos =
1320 let pmlen = (2*nmlen + 1)
1321 (i, p, s, mv, c) = plc
1322 old_sec = Instance.sNode inst
1323 inst = Container.find i il
1324 inam = Instance.alias inst
1325 npri = Node.alias $ Container.find p nl
1326 nsec = Node.alias $ Container.find s nl
1327 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1328 osec = Node.alias $ Container.find old_sec nl
1329 (moves, cmds) = computeMoves inst inam mv npri nsec
1330 -- FIXME: this should check instead/also the disk template
1331 ostr = if old_sec == Node.noSecondary
1332 then printf "%s" opri::String
1333 else printf "%s:%s" opri osec::String
1334 nstr = if s == Node.noSecondary
1335 then printf "%s" npri::String
1336 else printf "%s:%s" npri nsec::String
1337 in (printf " %3d. %-*s %-*s => %-*s %12.8f a=%s"
1338 pos imlen inam pmlen ostr pmlen nstr c moves,
1341 -- | Return the instance and involved nodes in an instance move.
1343 -- Note that the output list length can vary, and is not required nor
1344 -- guaranteed to be of any specific length.
1345 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1346 -- the instance from its index; note
1347 -- that this /must/ be the original
1348 -- instance list, so that we can
1349 -- retrieve the old nodes
1350 -> Placement -- ^ The placement we're investigating,
1351 -- containing the new nodes and
1353 -> [Ndx] -- ^ Resulting list of node indices
1354 involvedNodes il plc =
1355 let (i, np, ns, _, _) = plc
1356 inst = Container.find i il
1357 in nub $ [np, ns] ++ Instance.allNodes inst
1359 -- | Inner function for splitJobs, that either appends the next job to
1360 -- the current jobset, or starts a new jobset.
1361 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1362 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1363 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1364 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1365 | otherwise = ([n]:cjs, ndx)
1367 -- | Break a list of moves into independent groups. Note that this
1368 -- will reverse the order of jobs.
1369 splitJobs :: [MoveJob] -> [JobSet]
1370 splitJobs = fst . foldl mergeJobs ([], [])
1372 -- | Given a list of commands, prefix them with @gnt-instance@ and
1373 -- also beautify the display a little.
1374 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1375 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1377 printf " echo job %d/%d" jsn sn:
1379 map (" gnt-instance " ++) cmds
1381 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1384 -- | Given a list of commands, prefix them with @gnt-instance@ and
1385 -- also beautify the display a little.
1386 formatCmds :: [JobSet] -> String
1389 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1393 -- | Print the node list.
1394 printNodes :: Node.List -> [String] -> String
1396 let fields = case fs of
1397 [] -> Node.defaultFields
1398 "+":rest -> Node.defaultFields ++ rest
1400 snl = sortBy (comparing Node.idx) (Container.elems nl)
1401 (header, isnum) = unzip $ map Node.showHeader fields
1402 in printTable "" header (map (Node.list fields) snl) isnum
1404 -- | Print the instance list.
1405 printInsts :: Node.List -> Instance.List -> String
1407 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1408 helper inst = [ if Instance.isRunning inst then "R" else " "
1409 , Instance.name inst
1410 , Container.nameOf nl (Instance.pNode inst)
1411 , let sdx = Instance.sNode inst
1412 in if sdx == Node.noSecondary
1414 else Container.nameOf nl sdx
1415 , if Instance.autoBalance inst then "Y" else "N"
1416 , printf "%3d" $ Instance.vcpus inst
1417 , printf "%5d" $ Instance.mem inst
1418 , printf "%5d" $ Instance.dsk inst `div` 1024
1424 where DynUtil lC lM lD lN = Instance.util inst
1425 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1426 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1427 isnum = False:False:False:False:False:repeat True
1428 in printTable "" header (map helper sil) isnum
1430 -- | Shows statistics for a given node list.
1431 printStats :: String -> Node.List -> String
1433 let dcvs = compDetailedCV $ Container.elems nl
1434 (weights, names) = unzip detailedCVInfo
1435 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1436 header = [ "Field", "Value", "Weight" ]
1437 formatted = map (\(w, h, val) ->
1442 in printTable lp header formatted $ False:repeat True
1444 -- | Convert a placement into a list of OpCodes (basically a job).
1445 iMoveToJob :: Node.List -- ^ The node list; only used for node
1446 -- names, so any version is good
1447 -- (before or after the operation)
1448 -> Instance.List -- ^ The instance list; also used for
1450 -> Idx -- ^ The index of the instance being
1452 -> IMove -- ^ The actual move to be described
1453 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1455 iMoveToJob nl il idx move =
1456 let inst = Container.find idx il
1457 iname = Instance.name inst
1458 lookNode n = case mkNonEmpty (Container.nameOf nl n) of
1459 -- FIXME: convert htools codebase to non-empty strings
1460 Bad msg -> error $ "Empty node name for idx " ++
1461 show n ++ ": " ++ msg ++ "??"
1463 opF = OpCodes.OpInstanceMigrate
1464 { OpCodes.opInstanceName = iname
1465 , OpCodes.opMigrationMode = Nothing -- default
1466 , OpCodes.opOldLiveMode = Nothing -- default as well
1467 , OpCodes.opTargetNode = Nothing -- this is drbd
1468 , OpCodes.opAllowRuntimeChanges = False
1469 , OpCodes.opIgnoreIpolicy = False
1470 , OpCodes.opMigrationCleanup = False
1471 , OpCodes.opIallocator = Nothing
1472 , OpCodes.opAllowFailover = True }
1473 opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1474 opR n = OpCodes.OpInstanceReplaceDisks
1475 { OpCodes.opInstanceName = iname
1476 , OpCodes.opEarlyRelease = False
1477 , OpCodes.opIgnoreIpolicy = False
1478 , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1479 , OpCodes.opReplaceDisksList = []
1480 , OpCodes.opRemoteNode = lookNode n
1481 , OpCodes.opIallocator = Nothing
1485 FailoverToAny np -> [ opFA np ]
1486 ReplacePrimary np -> [ opF, opR np, opF ]
1487 ReplaceSecondary ns -> [ opR ns ]
1488 ReplaceAndFailover np -> [ opR np, opF ]
1489 FailoverAndReplace ns -> [ opF, opR ns ]
1491 -- * Node group functions
1493 -- | Computes the group of an instance.
1494 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1495 instanceGroup nl i =
1496 let sidx = Instance.sNode i
1497 pnode = Container.find (Instance.pNode i) nl
1498 snode = if sidx == Node.noSecondary
1500 else Container.find sidx nl
1501 pgroup = Node.group pnode
1502 sgroup = Node.group snode
1503 in if pgroup /= sgroup
1504 then fail ("Instance placed accross two node groups, primary " ++
1505 show pgroup ++ ", secondary " ++ show sgroup)
1508 -- | Computes the group of an instance per the primary node.
1509 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1510 instancePriGroup nl i =
1511 let pnode = Container.find (Instance.pNode i) nl
1514 -- | Compute the list of badly allocated instances (split across node
1516 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1517 findSplitInstances nl =
1518 filter (not . isOk . instanceGroup nl) . Container.elems
1520 -- | Splits a cluster into the component node groups.
1521 splitCluster :: Node.List -> Instance.List ->
1522 [(Gdx, (Node.List, Instance.List))]
1523 splitCluster nl il =
1524 let ngroups = Node.computeGroups (Container.elems nl)
1525 in map (\(guuid, nodes) ->
1526 let nidxs = map Node.idx nodes
1527 nodes' = zip nidxs nodes
1528 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1529 in (guuid, (Container.fromList nodes', instances))) ngroups
1531 -- | Compute the list of nodes that are to be evacuated, given a list
1532 -- of instances and an evacuation mode.
1533 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1534 -> EvacMode -- ^ The evacuation mode we're using
1535 -> [Idx] -- ^ List of instance indices being evacuated
1536 -> IntSet.IntSet -- ^ Set of node indices
1537 nodesToEvacuate il mode =
1538 IntSet.delete Node.noSecondary .
1540 let i = Container.find idx il
1541 pdx = Instance.pNode i
1542 sdx = Instance.sNode i
1543 dt = Instance.diskTemplate i
1544 withSecondary = case dt of
1545 DTDrbd8 -> IntSet.insert sdx ns
1548 ChangePrimary -> IntSet.insert pdx ns
1549 ChangeSecondary -> withSecondary
1550 ChangeAll -> IntSet.insert pdx withSecondary