1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
40 -- * Generic functions
42 , computeAllocationDelta
43 -- * First phase functions
45 -- * Second phase functions
50 -- * Display functions
-- * Balancing functions
62 -- * IAllocator functions
70 -- * Allocation functions
73 -- * Node group functions
79 import qualified Data.IntSet as IntSet
81 import Data.Maybe (fromJust, isNothing)
82 import Data.Ord (comparing)
83 import Text.Printf (printf)
85 import Ganeti.BasicTypes
86 import qualified Ganeti.HTools.Container as Container
87 import qualified Ganeti.HTools.Instance as Instance
88 import qualified Ganeti.HTools.Nic as Nic
89 import qualified Ganeti.HTools.Node as Node
90 import qualified Ganeti.HTools.Group as Group
91 import Ganeti.HTools.Types
93 import qualified Ganeti.OpCodes as OpCodes
95 import Ganeti.Types (mkNonEmpty)
99 -- | Allocation\/relocation solution.
100 data AllocSolution = AllocSolution
101 { asFailures :: [FailMode] -- ^ Failure counts
102 , asAllocs :: Int -- ^ Good allocation count
103 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
104 , asLog :: [String] -- ^ Informational messages
107 -- | Node evacuation/group change iallocator result type. This result
108 -- type consists of actual opcodes (a restricted subset) that are
109 -- transmitted back to Ganeti.
110 data EvacSolution = EvacSolution
111 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
112 , esFailed :: [(Idx, String)] -- ^ Instances which were not
114 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc':
-- failure statistics, the updated node and instance lists, the
-- instances allocated so far, and the running cluster statistics.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

-- | Type alias for easier handling of per-instance allocation results.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
124 -- | A type denoting the valid allocation mode/pairs.
126 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
127 -- for a two-node allocation, this will be a @Right [('Ndx',
128 -- ['Ndx'])]@. In the latter case, the list is basically an
129 -- association list, grouped by primary node and holding the potential
130 -- secondary nodes in the sub-list.
131 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
-- | The empty solution we start with when computing allocations:
-- no failures, zero successful allocations, no chosen placement
-- and an empty message log.
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution [] 0 Nothing []
138 -- | The empty evac solution.
139 emptyEvacSolution :: EvacSolution
140 emptyEvacSolution = EvacSolution { esMoved = []
145 -- | The complete state for the balancing solution.
146 data Table = Table Node.List Instance.List Score [Placement]
-- | Cluster statistics data type.
--
-- Integer counters are summed over all nodes by 'updateCStats',
-- except the @csM*@ fields which track the per-node maximum instead.
  { csFmem :: Integer -- ^ Cluster free mem
  , csFdsk :: Integer -- ^ Cluster free disk
  , csAmem :: Integer -- ^ Cluster allocatable mem
  , csAdsk :: Integer -- ^ Cluster allocatable disk
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
  , csMmem :: Integer -- ^ Max node allocatable mem
  , csMdsk :: Integer -- ^ Max node allocatable disk
  , csMcpu :: Integer -- ^ Max node allocatable cpu
  , csImem :: Integer -- ^ Instance used mem
  , csIdsk :: Integer -- ^ Instance used disk
  , csIcpu :: Integer -- ^ Instance used cpu
  , csTmem :: Double -- ^ Cluster total mem
  , csTdsk :: Double -- ^ Cluster total disk
  , csTcpu :: Double -- ^ Cluster total cpus
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
  , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
                     -- physical CPUs, i.e. normalised used phys CPUs
  , csXmem :: Integer -- ^ Unaccounted for mem
  , csNmem :: Integer -- ^ Node own memory
  , csScore :: Score -- ^ The cluster score
  , csNinst :: Int -- ^ The total number of instances
-- | A simple type for allocation functions.
--
-- An allocation method takes the current cluster state, an optional
-- cap on the number of allocations, the instance spec to place, the
-- candidate node set and the running accumulators, and produces a
-- (monadic) 'AllocResult'.
type AllocMethod = Node.List -- ^ Node list
                   -> Instance.List -- ^ Instance list
                   -> Maybe Int -- ^ Optional allocation limit
                   -> Instance.Instance -- ^ Instance spec for allocation
                   -> AllocNodes -- ^ Which nodes we should allocate on
                   -> [Instance.Instance] -- ^ Allocated instances
                   -> [CStats] -- ^ Running cluster stats
                   -> Result AllocResult -- ^ Allocation result

-- | A simple type for the running solution of evacuations.
--
-- 'Left' carries a failure message; 'Right' carries the updated node
-- list, the instance, the new cluster score and a node index.
-- NOTE(review): the exact meaning of the last component is inferred
-- from the type only — confirm at use sites.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)
188 -- * Utility functions
-- | Verifies the N+1 status and returns the nodes failing it.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [ n | n <- nodes, Node.failN1 n ]
194 {-| Computes the pair of bad nodes and instances.
196 The bad node list is computed via a simple 'verifyN1' check, and the
197 bad instance list is the list of primary and secondary instances of
201 computeBadItems :: Node.List -> Instance.List ->
202 ([Node.Node], [Instance.Instance])
203 computeBadItems nl il =
204 let bad_nodes = verifyN1 $ getOnline nl
205 bad_instances = map (`Container.find` il) .
207 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
209 (bad_nodes, bad_instances)
-- | Extracts the (primary index, secondary index, primary node,
-- secondary node) quadruple for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  let pdx = Instance.pNode inst
      sdx = Instance.sNode inst
  in (pdx, sdx, Container.find pdx nl, Container.find sdx nl)
-- | Zero-initializer for the CStats type: every counter, maximum and
-- total starts at zero before per-node data is folded in.
emptyCStats :: CStats
emptyCStats =
  CStats { csFmem = 0, csFdsk = 0, csAmem = 0, csAdsk = 0, csAcpu = 0
         , csMmem = 0, csMdsk = 0, csMcpu = 0, csImem = 0, csIdsk = 0
         , csIcpu = 0, csTmem = 0, csTdsk = 0, csTcpu = 0, csVcpu = 0
         , csNcpu = 0, csXmem = 0, csNmem = 0, csScore = 0, csNinst = 0 }
227 -- | Update stats with data from a new node.
228 updateCStats :: CStats -> Node.Node -> CStats
229 updateCStats cs node =
230 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
231 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
232 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
233 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
234 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
235 csVcpu = x_vcpu, csNcpu = x_ncpu,
236 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
239 inc_amem = Node.fMem node - Node.rMem node
240 inc_amem' = if inc_amem > 0 then inc_amem else 0
241 inc_adsk = Node.availDisk node
242 inc_imem = truncate (Node.tMem node) - Node.nMem node
243 - Node.xMem node - Node.fMem node
244 inc_icpu = Node.uCpu node
245 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
246 inc_vcpu = Node.hiCpu node
247 inc_acpu = Node.availCpu node
248 inc_ncpu = fromIntegral (Node.uCpu node) /
249 iPolicyVcpuRatio (Node.iPolicy node)
250 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
251 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
252 , csAmem = x_amem + fromIntegral inc_amem'
253 , csAdsk = x_adsk + fromIntegral inc_adsk
254 , csAcpu = x_acpu + fromIntegral inc_acpu
255 , csMmem = max x_mmem (fromIntegral inc_amem')
256 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
257 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
258 , csImem = x_imem + fromIntegral inc_imem
259 , csIdsk = x_idsk + fromIntegral inc_idsk
260 , csIcpu = x_icpu + fromIntegral inc_icpu
261 , csTmem = x_tmem + Node.tMem node
262 , csTdsk = x_tdsk + Node.tDsk node
263 , csTcpu = x_tcpu + Node.tCpu node
264 , csVcpu = x_vcpu + fromIntegral inc_vcpu
265 , csNcpu = x_ncpu + inc_ncpu
266 , csXmem = x_xmem + fromIntegral (Node.xMem node)
267 , csNmem = x_nmem + fromIntegral (Node.nMem node)
268 , csNinst = x_ninst + length (Node.pList node)
271 -- | Compute the total free disk and memory in the cluster.
272 totalResources :: Node.List -> CStats
274 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
275 in cs { csScore = compCV nl }
277 -- | Compute the delta between two cluster state.
279 -- This is used when doing allocations, to understand better the
280 -- available cluster resources. The return value is a triple of the
281 -- current used values, the delta that was still allocated, and what
282 -- was left unallocated.
283 computeAllocationDelta :: CStats -> CStats -> AllocStats
284 computeAllocationDelta cini cfin =
285 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
286 csNcpu = i_ncpu } = cini
287 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
288 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
289 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
290 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
291 , allocInfoNCpus = i_ncpu
292 , allocInfoMem = fromIntegral i_imem
293 , allocInfoDisk = fromIntegral i_idsk
295 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
296 , allocInfoNCpus = f_ncpu - i_ncpu
297 , allocInfoMem = fromIntegral (f_imem - i_imem)
298 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
300 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
301 , allocInfoNCpus = f_tcpu - f_ncpu
302 , allocInfoMem = truncate t_mem - fromIntegral f_imem
303 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
305 in (rini, rfin, runa)
307 -- | The names and weights of the individual elements in the CV list.
308 detailedCVInfo :: [(Double, String)]
309 detailedCVInfo = [ (1, "free_mem_cv")
310 , (1, "free_disk_cv")
312 , (1, "reserved_mem_cv")
313 , (4, "offline_all_cnt")
314 , (16, "offline_pri_cnt")
315 , (1, "vcpu_ratio_cv")
318 , (1, "disk_load_cv")
320 , (2, "pri_tags_score")
-- | Holds the weights used by 'compCVNodes' for each metric, in the
-- same order as 'detailedCVInfo'.
detailedCVWeights :: [Double]
detailedCVWeights = [ w | (w, _) <- detailedCVInfo ]
328 -- | Compute the mem and disk covariance.
329 compDetailedCV :: [Node.Node] -> [Double]
330 compDetailedCV all_nodes =
331 let (offline, nodes) = partition Node.offline all_nodes
332 mem_l = map Node.pMem nodes
333 dsk_l = map Node.pDsk nodes
334 -- metric: memory covariance
335 mem_cv = stdDev mem_l
336 -- metric: disk covariance
337 dsk_cv = stdDev dsk_l
338 -- metric: count of instances living on N1 failing nodes
339 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
340 length (Node.pList n)) .
341 filter Node.failN1 $ nodes :: Double
342 res_l = map Node.pRem nodes
343 -- metric: reserved memory covariance
344 res_cv = stdDev res_l
345 -- offline instances metrics
346 offline_ipri = sum . map (length . Node.pList) $ offline
347 offline_isec = sum . map (length . Node.sList) $ offline
348 -- metric: count of instances on offline nodes
349 off_score = fromIntegral (offline_ipri + offline_isec)::Double
350 -- metric: count of primary instances on offline nodes (this
351 -- helps with evacuation/failover of primary instances on
352 -- 2-node clusters with one node offline)
353 off_pri_score = fromIntegral offline_ipri::Double
354 cpu_l = map Node.pCpu nodes
355 -- metric: covariance of vcpu/pcpu ratio
356 cpu_cv = stdDev cpu_l
357 -- metrics: covariance of cpu, memory, disk and network load
358 (c_load, m_load, d_load, n_load) =
360 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
361 DynUtil c2 m2 d2 n2 = Node.utilPool n
362 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
363 -- metric: conflicting instance count
364 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
365 pri_tags_score = fromIntegral pri_tags_inst::Double
366 -- metric: spindles %
367 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
368 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
369 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
370 , pri_tags_score, stdDev spindles_cv ]
-- | Compute the /total/ variance: the weighted sum of the individual
-- metrics produced by 'compDetailedCV'.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes =
  sum $ zipWith (*) detailedCVWeights (compDetailedCV nodes)
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
-- | Compute the online nodes of a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = [ n | n <- Container.elems nl, not (Node.offline n) ]
384 -- * Balancing functions
-- | Compute the better of two tables. Note that the ordering of the
-- arguments is important: on equal scores the first table wins.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _)
  | b_cv < a_cv = b
  | otherwise   = a
391 -- | Applies an instance move to a given node list and instance.
392 applyMove :: Node.List -> Instance.Instance
393 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
395 applyMove nl inst Failover =
396 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
397 int_p = Node.removePri old_p inst
398 int_s = Node.removeSec old_s inst
399 new_nl = do -- Maybe monad
400 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
401 new_s <- Node.addSec int_p inst old_sdx
402 let new_inst = Instance.setBoth inst old_sdx old_pdx
403 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
404 new_inst, old_sdx, old_pdx)
407 -- Failover to any (fa)
408 applyMove nl inst (FailoverToAny new_pdx) = do
409 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
410 new_pnode = Container.find new_pdx nl
411 force_failover = Node.offline old_pnode
412 new_pnode' <- Node.addPriEx force_failover new_pnode inst
413 let old_pnode' = Node.removePri old_pnode inst
414 inst' = Instance.setPri inst new_pdx
415 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
416 return (nl', inst', new_pdx, old_sdx)
418 -- Replace the primary (f:, r:np, f)
419 applyMove nl inst (ReplacePrimary new_pdx) =
420 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
421 tgt_n = Container.find new_pdx nl
422 int_p = Node.removePri old_p inst
423 int_s = Node.removeSec old_s inst
424 force_p = Node.offline old_p
425 new_nl = do -- Maybe monad
426 -- check that the current secondary can host the instance
427 -- during the migration
428 tmp_s <- Node.addPriEx force_p int_s inst
429 let tmp_s' = Node.removePri tmp_s inst
430 new_p <- Node.addPriEx force_p tgt_n inst
431 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
432 let new_inst = Instance.setPri inst new_pdx
433 return (Container.add new_pdx new_p $
434 Container.addTwo old_pdx int_p old_sdx new_s nl,
435 new_inst, new_pdx, old_sdx)
438 -- Replace the secondary (r:ns)
439 applyMove nl inst (ReplaceSecondary new_sdx) =
440 let old_pdx = Instance.pNode inst
441 old_sdx = Instance.sNode inst
442 old_s = Container.find old_sdx nl
443 tgt_n = Container.find new_sdx nl
444 int_s = Node.removeSec old_s inst
445 force_s = Node.offline old_s
446 new_inst = Instance.setSec inst new_sdx
447 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
448 \new_s -> return (Container.addTwo new_sdx
449 new_s old_sdx int_s nl,
450 new_inst, old_pdx, new_sdx)
453 -- Replace the secondary and failover (r:np, f)
454 applyMove nl inst (ReplaceAndFailover new_pdx) =
455 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
456 tgt_n = Container.find new_pdx nl
457 int_p = Node.removePri old_p inst
458 int_s = Node.removeSec old_s inst
459 force_s = Node.offline old_s
460 new_nl = do -- Maybe monad
461 new_p <- Node.addPri tgt_n inst
462 new_s <- Node.addSecEx force_s int_p inst new_pdx
463 let new_inst = Instance.setBoth inst new_pdx old_pdx
464 return (Container.add new_pdx new_p $
465 Container.addTwo old_pdx new_s old_sdx int_s nl,
466 new_inst, new_pdx, old_pdx)
-- Failover and replace the secondary (f, r:ns)
470 applyMove nl inst (FailoverAndReplace new_sdx) =
471 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
472 tgt_n = Container.find new_sdx nl
473 int_p = Node.removePri old_p inst
474 int_s = Node.removeSec old_s inst
475 force_p = Node.offline old_p
476 new_nl = do -- Maybe monad
477 new_p <- Node.addPriEx force_p int_s inst
478 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
479 let new_inst = Instance.setBoth inst old_sdx new_sdx
480 return (Container.add new_sdx new_s $
481 Container.addTwo old_sdx new_p old_pdx int_p nl,
482 new_inst, old_sdx, new_sdx)
485 -- | Tries to allocate an instance on one given node.
486 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
487 -> OpResult Node.AllocElement
488 allocateOnSingle nl inst new_pdx =
489 let p = Container.find new_pdx nl
490 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
492 Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
493 new_p <- Node.addPri p inst
494 let new_nl = Container.add new_pdx new_p nl
495 new_score = compCV new_nl
496 return (new_nl, new_inst, [new_p], new_score)
498 -- | Tries to allocate an instance on a given pair of nodes.
499 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
500 -> OpResult Node.AllocElement
501 allocateOnPair nl inst new_pdx new_sdx =
502 let tgt_p = Container.find new_pdx nl
503 tgt_s = Container.find new_sdx nl
505 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
506 (Node.exclStorage tgt_p)
507 new_p <- Node.addPri tgt_p inst
508 new_s <- Node.addSec tgt_s inst new_pdx
509 let new_inst = Instance.setBoth inst new_pdx new_sdx
510 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
511 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
513 -- | Tries to perform an instance move and returns the best table
514 -- between the original one and the new one.
515 checkSingleStep :: Table -- ^ The original table
516 -> Instance.Instance -- ^ The instance to move
517 -> Table -- ^ The current best table
518 -> IMove -- ^ The move to apply
519 -> Table -- ^ The final best table
520 checkSingleStep ini_tbl target cur_tbl move =
521 let Table ini_nl ini_il _ ini_plc = ini_tbl
522 tmp_resu = applyMove ini_nl target move
525 Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
526 let tgt_idx = Instance.idx target
527 upd_cvar = compCV upd_nl
528 upd_il = Container.add tgt_idx new_inst ini_il
529 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
530 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
531 in compareTables cur_tbl upd_tbl
533 -- | Given the status of the current secondary as a valid new node and
534 -- the current candidate target node, generate the possible moves for
536 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
537 -> Bool -- ^ Whether the secondary node is a valid new node
538 -> Bool -- ^ Whether we can change the primary node
539 -> Ndx -- ^ Target node candidate
540 -> [IMove] -- ^ List of valid result moves
542 possibleMoves MirrorNone _ _ _ = []
544 possibleMoves MirrorExternal _ False _ = []
546 possibleMoves MirrorExternal _ True tdx =
547 [ FailoverToAny tdx ]
549 possibleMoves MirrorInternal _ False tdx =
550 [ ReplaceSecondary tdx ]
552 possibleMoves MirrorInternal True True tdx =
553 [ ReplaceSecondary tdx
554 , ReplaceAndFailover tdx
556 , FailoverAndReplace tdx
559 possibleMoves MirrorInternal False True tdx =
560 [ ReplaceSecondary tdx
561 , ReplaceAndFailover tdx
564 -- | Compute the best move for a given instance.
565 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
566 -> Bool -- ^ Whether disk moves are allowed
567 -> Bool -- ^ Whether instance moves are allowed
568 -> Table -- ^ Original table
569 -> Instance.Instance -- ^ Instance to move
570 -> Table -- ^ Best new table for this instance
571 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
572 let opdx = Instance.pNode target
573 osdx = Instance.sNode target
574 bad_nodes = [opdx, osdx]
575 nodes = filter (`notElem` bad_nodes) nodes_idx
576 mir_type = Instance.mirrorType target
577 use_secondary = elem osdx nodes_idx && inst_moves
578 aft_failover = if mir_type == MirrorInternal && use_secondary
579 -- if drbd and allowed to failover
580 then checkSingleStep ini_tbl target ini_tbl Failover
584 then concatMap (possibleMoves mir_type use_secondary inst_moves)
588 -- iterate over the possible nodes for this instance
589 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
591 -- | Compute the best next move.
592 checkMove :: [Ndx] -- ^ Allowed target node indices
593 -> Bool -- ^ Whether disk moves are allowed
594 -> Bool -- ^ Whether instance moves are allowed
595 -> Table -- ^ The current solution
596 -> [Instance.Instance] -- ^ List of instances still to move
597 -> Table -- ^ The new solution
598 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
599 let Table _ _ _ ini_plc = ini_tbl
600 -- we're using rwhnf from the Control.Parallel.Strategies
601 -- package; we don't need to use rnf as that would force too
602 -- much evaluation in single-threaded cases, and in
603 -- multi-threaded case the weak head normal form is enough to
604 -- spark the evaluation
605 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
608 -- iterate over all instances, computing the best move
609 best_tbl = foldl' compareTables ini_tbl tables
610 Table _ _ _ best_plc = best_tbl
611 in if length best_plc == length ini_plc
612 then ini_tbl -- no advancement
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table -- ^ The starting table
              -> Int   -- ^ Remaining length
              -> Score -- ^ Score at which to stop
              -> Bool  -- ^ Whether another balancing round may run
doNextBalance (Table _ _ cur_cv plcs) max_rounds min_score =
  -- a negative max_rounds means "unlimited rounds"
  let rounds_left = max_rounds < 0 || length plcs < max_rounds
  in rounds_left && cur_cv > min_score
625 -- | Run a balance move.
626 tryBalance :: Table -- ^ The starting table
627 -> Bool -- ^ Allow disk moves
628 -> Bool -- ^ Allow instance moves
629 -> Bool -- ^ Only evacuate moves
630 -> Score -- ^ Min gain threshold
631 -> Score -- ^ Min gain
632 -> Maybe Table -- ^ The resulting table and commands
633 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
634 let Table ini_nl ini_il ini_cv _ = ini_tbl
635 all_inst = Container.elems ini_il
636 all_nodes = Container.elems ini_nl
637 (offline_nodes, online_nodes) = partition Node.offline all_nodes
638 all_inst' = if evac_mode
639 then let bad_nodes = map Node.idx offline_nodes
640 in filter (any (`elem` bad_nodes) .
641 Instance.allNodes) all_inst
643 reloc_inst = filter (\i -> Instance.movable i &&
644 Instance.autoBalance i) all_inst'
645 node_idx = map Node.idx online_nodes
646 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
647 (Table _ _ fin_cv _) = fin_tbl
649 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
650 then Just fin_tbl -- this round made success, return the new table
653 -- * Allocation functions
655 -- | Build failure stats out of a list of failures.
656 collapseFailures :: [FailMode] -> FailStats
657 collapseFailures flst =
658 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
-- | Compares two Maybe AllocElement and chooses the one with the
-- better (lower) score; a present element always beats 'Nothing'.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore))
  | ascore < bscore = a
  | otherwise       = b
670 -- | Update current Allocation solution and failure stats with new
672 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
673 concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
675 concatAllocs as (Ok ns) =
676 let -- Choose the old or new solution, based on the cluster score
678 osols = asSolution as
679 nsols = bestAllocElement osols (Just ns)
681 -- Note: we force evaluation of nsols here in order to keep the
682 -- memory profile low - we know that we will need nsols for sure
683 -- in the next cycle, so we force evaluation of nsols, since the
684 -- foldl' in the caller will only evaluate the tuple, but not the
685 -- elements of the tuple
686 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
688 -- | Sums two 'AllocSolution' structures.
689 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
690 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
691 (AllocSolution bFails bAllocs bSols bLog) =
692 -- note: we add b first, since usually it will be smaller; when
693 -- fold'ing, a will grow and grow whereas b is the per-group
694 -- result, hence smaller
695 let nFails = bFails ++ aFails
696 nAllocs = aAllocs + bAllocs
697 nSols = bestAllocElement aSols bSols
699 in AllocSolution nFails nAllocs nSols nLog
701 -- | Given a solution, generates a reasonable description for it.
702 describeSolution :: AllocSolution -> String
703 describeSolution as =
704 let fcnt = asFailures as
707 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
708 filter ((> 0) . snd) . collapseFailures $ fcnt
710 Nothing -> "No valid allocation solutions, failure reasons: " ++
711 (if null fcnt then "unknown reasons" else freasons)
712 Just (_, _, nodes, cv) ->
713 printf ("score: %.8f, successes %d, failures %d (%s)" ++
714 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
715 (intercalate "/" . map Node.name $ nodes)
-- | Annotates a solution by prepending its description to the log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as =
  let summary = describeSolution as
  in as { asLog = summary : asLog as }
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution es =
  EvacSolution { esMoved   = reverse (esMoved es)
               , esFailed  = reverse (esFailed es)
               , esOpCodes = reverse (esOpCodes es)
               }
729 -- | Generate the valid node allocation singles or pairs for a new instance.
730 genAllocNodes :: Group.List -- ^ Group list
731 -> Node.List -- ^ The node map
732 -> Int -- ^ The number of nodes required
733 -> Bool -- ^ Whether to drop or not
735 -> Result AllocNodes -- ^ The (monadic) result
736 genAllocNodes gl nl count drop_unalloc =
737 let filter_fn = if drop_unalloc
738 then filter (Group.isAllocable .
739 flip Container.find gl . Node.group)
741 all_nodes = filter_fn $ getOnline nl
742 all_pairs = [(Node.idx p,
743 [Node.idx s | s <- all_nodes,
744 Node.idx p /= Node.idx s,
745 Node.group p == Node.group s]) |
748 1 -> Ok (Left (map Node.idx all_nodes))
749 2 -> Ok (Right (filter (not . null . snd) all_pairs))
750 _ -> Bad "Unsupported number of nodes, only one or two supported"
752 -- | Try to allocate an instance on the cluster.
753 tryAlloc :: (Monad m) =>
754 Node.List -- ^ The node list
755 -> Instance.List -- ^ The instance list
756 -> Instance.Instance -- ^ The instance to allocate
757 -> AllocNodes -- ^ The allocation targets
758 -> m AllocSolution -- ^ Possible solution list
759 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
760 tryAlloc nl _ inst (Right ok_pairs) =
761 let psols = parMap rwhnf (\(p, ss) ->
763 concatAllocs cstate .
764 allocateOnPair nl inst p)
765 emptyAllocSolution ss) ok_pairs
766 sols = foldl' sumAllocs emptyAllocSolution psols
767 in return $ annotateSolution sols
769 tryAlloc _ _ _ (Left []) = fail "No online nodes"
770 tryAlloc nl _ inst (Left all_nodes) =
771 let sols = foldl' (\cstate ->
772 concatAllocs cstate . allocateOnSingle nl inst
773 ) emptyAllocSolution all_nodes
774 in return $ annotateSolution sols
776 -- | Given a group/result, describe it as a nice (list of) messages.
777 solutionDescription :: (Group.Group, Result AllocSolution)
779 solutionDescription (grp, result) =
781 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
782 Bad message -> [printf "Group %s: error %s" gname message]
783 where gname = Group.name grp
784 pol = allocPolicyToRaw (Group.allocPolicy grp)
786 -- | From a list of possibly bad and possibly empty solutions, filter
787 -- only the groups with a valid result. Note that the result will be
788 -- reversed compared to the original list.
789 filterMGResults :: [(Group.Group, Result AllocSolution)]
790 -> [(Group.Group, AllocSolution)]
791 filterMGResults = foldl' fn []
792 where unallocable = not . Group.isAllocable
793 fn accu (grp, rasol) =
796 Ok sol | isNothing (asSolution sol) -> accu
797 | unallocable grp -> accu
798 | otherwise -> (grp, sol):accu
800 -- | Sort multigroup results based on policy and score.
801 sortMGResults :: [(Group.Group, AllocSolution)]
802 -> [(Group.Group, AllocSolution)]
804 let extractScore (_, _, _, x) = x
805 solScore (grp, sol) = (Group.allocPolicy grp,
806 (extractScore . fromJust . asSolution) sol)
807 in sortBy (comparing solScore) sols
809 -- | Removes node groups which can't accommodate the instance
810 filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
812 -> ([(Group.Group, (Node.List, Instance.List))], [String])
813 filterValidGroups [] _ = ([], [])
814 filterValidGroups (ng:ngs) inst =
815 let (valid_ngs, msgs) = filterValidGroups ngs inst
816 hasNetwork nic = case Nic.network nic of
817 Just net -> net `elem` Group.networks (fst ng)
819 hasRequiredNetworks = all hasNetwork (Instance.nics inst)
820 in if hasRequiredNetworks
821 then (ng:valid_ngs, msgs)
823 ("group " ++ Group.name (fst ng) ++
824 " is not connected to a network required by instance " ++
825 Instance.name inst):msgs)
827 -- | Finds the best group for an instance on a multi-group cluster.
829 -- Only solutions in @preferred@ and @last_resort@ groups will be
830 -- accepted as valid, and additionally if the allowed groups parameter
831 -- is not null then allocation will only be run for those group
833 findBestAllocGroup :: Group.List -- ^ The group list
834 -> Node.List -- ^ The node list
835 -> Instance.List -- ^ The instance list
836 -> Maybe [Gdx] -- ^ The allowed groups
837 -> Instance.Instance -- ^ The instance to allocate
838 -> Int -- ^ Required number of nodes
839 -> Result (Group.Group, AllocSolution, [String])
840 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
841 let groups_by_idx = splitCluster mgnl mgil
842 groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
843 groups' = maybe groups
844 (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
846 (groups'', filter_group_msgs) = filterValidGroups groups' inst
847 sols = map (\(gr, (nl, il)) ->
848 (gr, genAllocNodes mggl nl cnt False >>=
849 tryAlloc nl il inst))
850 groups''::[(Group.Group, Result AllocSolution)]
851 all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
852 goodSols = filterMGResults sols
853 sortedSols = sortMGResults goodSols
854 in case sortedSols of
855 [] -> Bad $ if null groups'
856 then "no groups for evacuation: allowed groups was" ++
857 show allowed_gdxs ++ ", all groups: " ++
858 show (map fst groups)
859 else intercalate ", " all_msgs
860 (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
-- | Try to allocate an instance on a multi-group cluster: pick the
-- best group via 'findBestAllocGroup' and record the selection in
-- the solution's log.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc gl nl il inst cnt =
  findBestAllocGroup gl nl il Nothing inst cnt >>=
    \(grp, sol, msgs) ->
      let selmsg = "Selected group: " ++ Group.name grp
      in return sol { asLog = selmsg : msgs }
-- | Calculate the new instance list after allocation solution.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il alloc =
  case alloc of
    -- no allocation happened, so nothing changes
    Nothing -> il
    -- insert the newly-placed instance at the next free index
    Just (_, new_inst, _, _) ->
      Container.add (Container.size il) new_inst il
-- | Extract the the new node list from the allocation solution.
extractNl :: Node.List              -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List              -- ^ The new node list
extractNl nl alloc =
  case alloc of
    -- a successful allocation carries the updated node list
    Just (new_nl, _, _, _) -> new_nl
    -- otherwise the cluster state is unchanged
    Nothing -> nl
-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList _ nl il [] acc = Ok (nl, il, acc)
allocList gl nl il ((inst, cnt):rest) acc =
  -- allocate one instance, thread the updated cluster state through
  -- the recursion, and accumulate the per-instance solutions
  tryMGAlloc gl nl il inst cnt >>= \ares ->
    let sol = asSolution ares
    in allocList gl (extractNl nl sol) (updateIl il sol)
       rest ((inst, ares):acc)
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
  case mode of
    ChangeSecondary ->
      fail ("Instances with disk template '" ++ diskTemplateToRaw dt ++
            "' can't execute change secondary")
    _ -> return ()
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targetting
                 -> [Ndx]             -- ^ The list of available nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless: reject secondary changes up-front, then treat it as a
-- single-node move.
-- NOTE(review): the 'gdx avail_nodes =' continuation of this clause
-- head appears to have been lost in extraction — confirm against the
-- original file.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
  failOnSecondaryChange mode dt >>
  evacOneNodeOnly nl il inst gdx avail_nodes

-- Plain (LVM) instances cannot be relocated at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
  fail "Instances of type plain cannot be relocated"

-- Local-file-backed instances cannot be relocated either.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
  fail "Instances of type file cannot be relocated"

-- Shared-file, block, RBD and external templates: externally
-- mirrored, so only the primary node needs changing.
-- NOTE(review): as above, the clause-head continuations
-- ('gdx avail_nodes =') look missing for these four clauses.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
  failOnSecondaryChange mode dt >>
  evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
  failOnSecondaryChange mode dt >>
  evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
  failOnSecondaryChange mode dt >>
  evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
  failOnSecondaryChange mode dt >>
  evacOneNodeOnly nl il inst gdx avail_nodes

-- DRBD with primary change: a plain failover to the secondary
-- suffices; the cluster state and opcode list are rebuilt from the
-- simulated move.
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
  (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx Failover
  return (nl', il', ops)

-- DRBD with secondary change: fold over the candidate secondaries.
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
  evacOneNodeOnly nl il inst gdx avail_nodes

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
  let no_nodes = Left "no nodes available"
      node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
  (nl', il', ops, _) <-
    annotateResult "Can't find any good nodes for relocation" .
    -- NOTE(review): the fold driver lines between the annotation and
    -- the lambda below appear to be missing from this copy.
    (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                      -- we don't need more details (which
                      -- nodes, etc.) as we only selected
                      -- this group if we can allocate on
                      -- it, hence failures will not
                      -- propagate out of this fold loop
                      Left _ -> Left $ "Allocation failed: " ++ msg
                      Ok result@(_, _, _, new_cv) ->
                        let new_accu = Right result in
                          -- NOTE(review): the accumulator comparison
                          -- that should surround this pattern is
                          -- missing here.
                          Right (_, _, _, old_cv) ->
    ) no_nodes node_pairs
  return (nl', il', ops)
-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targetting
                -> [Ndx]             -- ^ The list of available nodes
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
  -- choose the move constructor according to the mirroring type; only
  -- mirrored instances can be handled here
  op_fn <- case Instance.mirrorType inst of
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
  -- NOTE(review): an Either-to-Result conversion line seems to be
  -- missing between the annotation and the fold in this copy.
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)
-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
  case applyMove nl inst (op_fn ndx) of
    -- the simulated move failed: keep an earlier success if we have
    -- one, otherwise record this node's failure reason
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we should have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
          -- NOTE(review): the score comparison against the running
          -- accumulator that should follow here is missing from this
          -- copy of the file.
          Right (_, _, old_cv, _) ->
-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succceed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
  let primary = Container.find (Instance.pNode inst) nl
      idx = Instance.idx inst
  -- if the primary is offline, then we first failover
  (nl1, inst1, ops1) <-
    if Node.offline primary
        -- NOTE(review): a 'then do' line appears to be missing here.
        (nl', inst', _, _) <-
          annotateResult "Failing over to the secondary" .
          opToResult $ applyMove nl inst Failover
        return (nl', inst', [Failover])
      else return (nl, inst, [])
  -- the move sequence: replace-secondary towards the future primary,
  -- then failover, then replace-secondary towards the final secondary
  -- NOTE(review): the middle element of this triple (a Failover move)
  -- appears to be missing in this copy.
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                      ReplaceSecondary t_sdx)
  -- we now need to execute a replace secondary to the future
  (nl2, inst2, _, _) <-
    annotateResult "Changing secondary to new primary" .
    applyMove nl1 inst1 o1
  -- we now execute another failover, the primary stays fixed now
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
                        opToResult $ applyMove nl2 inst2 o2
  -- and finally another replace secondary, to the final secondary
  (nl4, inst4, _, _) <-
    annotateResult "Changing secondary to final secondary" .
    applyMove nl3 inst3 o3
      -- NOTE(review): the binding that collects the executed moves
      -- (used below as 'ops4') is missing from this copy.
      il' = Container.add idx inst4 il
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
  let nodes = Container.elems nl4
      -- The fromJust below is ugly (it can fail nastily), but
      -- at this point we should have any internal mismatches,
      -- and adding a monad here would be quite involved
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
      new_cv = compCVNodes grpnodes
  return (nl4, il', ops, new_cv)
-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx = do
  -- an unknown group index is a hard error for the caller
  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
                 Ok (lookup gdx group_nodes)
  -- drop all nodes present in the exclusion set
  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
  return avail_nodes
-- | Updates the evac solution with the results of an instance
-- evacuation.
-- NOTE(review): an '-> Idx' parameter line appears to be missing from
-- this signature in this copy (the equations below take an index
-- argument between the state tuple and the result).
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
-- a failed evacuation leaves the cluster state untouched and only
-- records the failure reason
updateEvacSolution (nl, il, es) idx (Bad msg) =
  (nl, il, es { esFailed = (idx, msg):esFailed es})
-- a successful evacuation replaces the cluster state and records both
-- the move and its opcodes
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
  (nl, il, es { esMoved = new_elem:esMoved es
              , esOpCodes = opcodes:esOpCodes es })
  -- NOTE(review): the 'new_elem = (idx,' binding line is missing here
  -- in this copy.
  where inst = Container.find idx il
        instancePriGroup nl inst,
        Instance.allNodes inst)
-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List -- ^ The cluster groups
            -> Node.List  -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode   -- ^ The evacuation mode
            -> [Idx]      -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
  let evac_ndx = nodesToEvacuate ini_il mode idxs
      -- offline nodes can never receive evacuated instances
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
      -- per-group association list of node indices
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      -- fold each instance through the evacuation, threading the
      -- updated cluster state and accumulating the solution
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let gdx = instancePriGroup nl inst
                      pdx = Instance.pNode inst in
                  updateEvacSolution state (Instance.idx inst) $
                  availableGroupNodes group_ndx
                    (IntSet.insert pdx excl_ndx) gdx >>=
                  nodeEvacInstance nl il mode inst gdx
               -- NOTE(review): the line closing the lambda above is
               -- missing in this copy.
               (ini_nl, ini_il, emptyEvacSolution)
               (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
-- 1. at the start of the function, we compute which are the target
-- groups; either no groups were passed in, in which case we choose
-- all groups out of which we don't evacuate instance, or there were
-- some groups passed, in which case we use those
--
-- 2. for each instance, we use 'findBestAllocGroup' to choose the
-- best group to hold the instance, and then we do what
-- 'tryNodeEvac' does, except for this group instead of the current
-- instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
  let -- groups the evacuated instances currently live in; these are
      -- never valid targets
      evac_gdxs = nub $ map (instancePriGroup ini_nl .
                             flip Container.find ini_il) idxs
      target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  -- NOTE(review): the 'let solution = do' line that
                  -- opens this inner Result computation is missing in
                  -- this copy.
                  let ncnt = Instance.requiredNodes $
                             Instance.diskTemplate inst
                  (grp, _, _) <- findBestAllocGroup gl nl il
                                 (Just target_gdxs) inst ncnt
                  let gdx = Group.idx grp
                  av_nodes <- availableGroupNodes group_ndx
                  -- NOTE(review): the exclusion-set/group arguments
                  -- line is missing here.
                  nodeEvacInstance nl il ChangeAll inst gdx av_nodes
                  in updateEvacSolution state (Instance.idx inst) solution
               (ini_nl, ini_il, emptyEvacSolution)
               (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let depth = length ixes
      -- each placed instance gets a synthetic, depth-derived name
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      -- decrement the remaining budget, if one was given
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       -- NOTE(review): the failure ('Bad') branch of this case is
       -- missing from this copy.
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
           -- NOTE(review): the 'case sols3 of' scrutinee and its
           -- Nothing branch appear to be missing here.
           Just (xnl, xi, _, _) ->
             -- NOTE(review): the limit-exhaustion test guarding this
             -- recursion is missing here.
             else iterateAlloc xnl (Container.add newidx xi il)
                    newlimit newinst allocnodes (xi:ixes)
                    (totalResources xnl:cstats)
-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    -- NOTE(review): the failure ('Bad') branch of this case appears
    -- to be missing from this copy.
    Ok (errs, nl', il', ixes', cstats') ->
      let newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          -- stop once the externally-imposed instance limit is reached
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                          Just (n - ixes_cnt)) in
      if stop then newsol else
        -- shrink the spec along the most frequent failure reason and
        -- retry with the reduced instance
        case Instance.shrinkByType newinst . fst . last $
             sortBy (comparing snd) errs of
          -- NOTE(review): the 'Bad' fallback branch of this case is
          -- missing here.
          Ok newinst' -> tieredAlloc nl' il' newlimit
                         newinst' allocnodes ixes' cstats'
1306 -- * Formatting functions
-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; moves is containing
                -- either @/f/@ for failover or @/r:name/@ for replace
                -- secondary, while the command list holds gnt-instance
                -- commands (without that prefix), e.g \"@failover instance1@\"
computeMoves i inam mv c d =
  case mv of
    Failover -> ("f", [mig])
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
  where -- running instances are migrated, stopped ones failed-over
        morf = if Instance.isRunning i then "migrate" else "failover"
        mig = printf "%s -f %s" morf inam::String
        mig_any = printf "%s -f -n %s %s" morf c inam::String
        rep n = printf "replace-disks -n %s %s" n inam::String
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let -- width of a "primary:secondary" column
      pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      old_sec = Instance.sNode inst
      inst = Container.find i il
      inam = Instance.alias inst
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find old_sec nl
      (moves, cmds) = computeMoves inst inam mv npri nsec
      -- FIXME: this should check instead/also the disk template
      ostr = if old_sec == Node.noSecondary
               then printf "%s" opri::String
               else printf "%s:%s" opri osec::String
      nstr = if s == Node.noSecondary
               then printf "%s" npri::String
               else printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
      pos imlen inam pmlen ostr pmlen nstr c moves,
      -- NOTE(review): the second tuple component (the commands list)
      -- appears to be truncated off in this copy.
-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il (inst_idx, new_pri, new_sec, _, _) =
  -- combine the placement's new nodes with the instance's old ones,
  -- de-duplicating the result
  nub $ new_pri : new_sec : Instance.allNodes (Container.find inst_idx il)
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
-- first job seen: open the initial jobset
mergeJobs ([], _) job@(nodes, _, _, _) = ([[job]], nodes)
mergeJobs (jobsets@(current:rest), nbuf) job@(nodes, _, _, _) =
  if null (nodes `intersect` nbuf)
    -- independent of everything in the current jobset: append to it
    then ((job:current):rest, nodes ++ nbuf)
    -- conflicts with a node already used: start a fresh jobset
    else ([job]:jobsets, nodes)
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
-- use the strict left fold (as elsewhere in this module) to avoid
-- building a chain of thunks on long move lists
splitJobs = fst . foldl' mergeJobs ([], [])
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  -- NOTE(review): the 'let out =' binding that collects these lines
  -- appears to be missing in this copy.
    printf "  echo job %d/%d" jsn sn:
    map ("  gnt-instance " ++) cmds
    -- the first job of a jobset additionally prints a jobset header
    then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
-- NOTE(review): the defining equation head and surrounding pipeline
-- lines are missing from this copy; only this fragment remains.
  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
-- | Print the node list.
printNodes :: Node.List -> [String] -> String
-- NOTE(review): the defining equation head ('printNodes nl fs =') and
-- the catch-all branch of the 'case' are missing in this copy.
  let fields = case fs of
                 [] -> Node.defaultFields
                 -- a leading "+" extends the default field list
                 "+":rest -> Node.defaultFields ++ rest
      snl = sortBy (comparing Node.idx) (Container.elems nl)
      (header, isnum) = unzip $ map Node.showHeader fields
  in printTable "" header (map (Node.list fields) snl) isnum
-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
-- NOTE(review): the defining equation head ('printInsts nl il =') is
-- missing in this copy.
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      -- render one instance as its table row
      helper inst = [ if Instance.isRunning inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                           -- NOTE(review): the no-secondary branch is
                           -- missing here.
                           else Container.nameOf nl sdx
                    , if Instance.autoBalance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    , printf "%5d" $ Instance.dsk inst `div` 1024
                    -- NOTE(review): the dynamic-utilisation columns
                    -- (using lC/lM/lD/lN below) are missing here.
        where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      isnum = False:False:False:False:False:repeat True
  in printTable "" header (map helper sil) isnum
-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
-- NOTE(review): the defining equation head ('printStats lp nl =') is
-- missing in this copy.
  let dcvs = compDetailedCV $ Container.elems nl
      (weights, names) = unzip detailedCVInfo
      -- pad with neutral weight/name so extra metrics still display
      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
      header = [ "Field", "Value", "Weight" ]
      formatted = map (\(w, h, val) ->
                  -- NOTE(review): the body of this row formatter is
                  -- missing in this copy.
  in printTable lp header formatted $ False:repeat True
-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List -- ^ The node list; only used for node
                        -- names, so any version is good
                        -- (before or after the operation)
           -> Instance.List -- ^ The instance list; also used for
           -> Idx -- ^ The index of the instance being
           -> IMove -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      lookNode n = case mkNonEmpty (Container.nameOf nl n) of
                     -- FIXME: convert htools codebase to non-empty strings
                     Bad msg -> error $ "Empty node name for idx " ++
                                show n ++ ": " ++ msg ++ "??"
                     -- NOTE(review): the success ('Ok') branch of this
                     -- case is missing in this copy.
      -- migration opcode template, shared by all failover variants
      opF = OpCodes.OpInstanceMigrate
            { OpCodes.opInstanceName = iname
            , OpCodes.opMigrationMode = Nothing -- default
            , OpCodes.opOldLiveMode = Nothing -- default as well
            , OpCodes.opTargetNode = Nothing -- this is drbd
            , OpCodes.opAllowRuntimeChanges = False
            , OpCodes.opIgnoreIpolicy = False
            , OpCodes.opMigrationCleanup = False
            , OpCodes.opIallocator = Nothing
            , OpCodes.opAllowFailover = True }
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
      -- replace-disks opcode, parameterised by the new secondary
      opR n = OpCodes.OpInstanceReplaceDisks
              { OpCodes.opInstanceName = iname
              , OpCodes.opEarlyRelease = False
              , OpCodes.opIgnoreIpolicy = False
              , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
              , OpCodes.opReplaceDisksList = []
              , OpCodes.opRemoteNode = lookNode n
              , OpCodes.opIallocator = Nothing
    -- NOTE(review): the record closer, the 'in case move of' line and
    -- the plain 'Failover' branch are missing in this copy.
    FailoverToAny np -> [ opFA np ]
    ReplacePrimary np -> [ opF, opR np, opF ]
    ReplaceSecondary ns -> [ opR ns ]
    ReplaceAndFailover np -> [ opR np, opF ]
    FailoverAndReplace ns -> [ opF, opR ns ]
1513 -- * Node group functions
-- | Computes the group of an instance.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      -- treat instances without a secondary as single-group by
      -- comparing the primary against itself
      snode = if sidx == Node.noSecondary
                -- NOTE(review): the 'then pnode' branch line appears
                -- to be missing in this copy.
                else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       -- NOTE(review): "accross" is a typo in this user-visible
       -- message; fix upstream (it is runtime text, left unchanged
       -- here).
       then fail ("Instance placed accross two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       -- NOTE(review): the success branch ('else return pgroup') is
       -- missing in this copy.
-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i =
  -- look up the primary node and report the group it belongs to
  let pnode = Container.find (Instance.pNode i) nl
  in Node.group pnode
-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  -- an instance is "split" when its group computation fails, i.e. the
  -- primary and secondary live in different node groups
  [ inst | inst <- Container.elems il
         , not . isOk $ instanceGroup nl inst ]
-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  map perGroup . Node.computeGroups $ Container.elems nl
  where
    -- build the sub-cluster (node list + instance list) of one group;
    -- instances are assigned to the group of their primary node
    perGroup (gdx, nodes) =
      let nidxs = map Node.idx nodes
          sub_nl = Container.fromList (zip nidxs nodes)
          sub_il = Container.filter ((`elem` nidxs) . Instance.pNode) il
      in (gdx, (sub_nl, sub_il))
-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
  -- the sentinel "no secondary" index must never appear in the result
  IntSet.delete Node.noSecondary .
    -- NOTE(review): the fold driver line ('foldl' (\ns idx -> ...)')
    -- appears to be missing between the pipeline above and the 'let'
    -- below in this copy.
    let i = Container.find idx il
        pdx = Instance.pNode i
        sdx = Instance.sNode i
        dt = Instance.diskTemplate i
        -- only DRBD has a real secondary to account for
        withSecondary = case dt of
                          DTDrbd8 -> IntSet.insert sdx ns
            -- NOTE(review): the catch-all branch and the 'case mode
            -- of' scrutinee are missing here.
            ChangePrimary -> IntSet.insert pdx ns
            ChangeSecondary -> withSecondary
            ChangeAll -> IntSet.insert pdx withSecondary