1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
40 -- * Generic functions
42 , computeAllocationDelta
43 -- * First phase functions
45 -- * Second phase functions
50 -- * Display functions
53 -- * Balacing functions
62 -- * IAllocator functions
70 -- * Allocation functions
73 -- * Node group functions
79 import qualified Data.IntSet as IntSet
81 import Data.Maybe (fromJust, isNothing)
82 import Data.Ord (comparing)
83 import Text.Printf (printf)
85 import Ganeti.BasicTypes
86 import qualified Ganeti.HTools.Container as Container
87 import qualified Ganeti.HTools.Instance as Instance
88 import qualified Ganeti.HTools.Nic as Nic
89 import qualified Ganeti.HTools.Node as Node
90 import qualified Ganeti.HTools.Group as Group
91 import Ganeti.HTools.Types
93 import qualified Ganeti.OpCodes as OpCodes
95 import Ganeti.Types (mkNonEmpty)
99 -- | Allocation\/relocation solution.
100 data AllocSolution = AllocSolution
101 { asFailures :: [FailMode] -- ^ Failure counts
102 , asAllocs :: Int -- ^ Good allocation count
103 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
104 , asLog :: [String] -- ^ Informational messages
107 -- | Node evacuation/group change iallocator result type. This result
108 -- type consists of actual opcodes (a restricted subset) that are
109 -- transmitted back to Ganeti.
110 data EvacSolution = EvacSolution
111 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
112 , esFailed :: [(Idx, String)] -- ^ Instances which were not
114 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
117 -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
118 type AllocResult = (FailStats, Node.List, Instance.List,
119 [Instance.Instance], [CStats])
121 -- | Type alias for easier handling.
122 type AllocSolutionList = [(Instance.Instance, AllocSolution)]
124 -- | A type denoting the valid allocation mode/pairs.
126 -- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
127 -- for a two-node allocation, this will be a @Right [('Ndx',
128 -- ['Ndx'])]@. In the latter case, the list is basically an
129 -- association list, grouped by primary node and holding the potential
130 -- secondary nodes in the sub-list.
131 type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
133 -- | The empty solution we start with when computing allocations.
134 emptyAllocSolution :: AllocSolution
135 emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
136 , asSolution = Nothing, asLog = [] }
138 -- | The empty evac solution.
139 emptyEvacSolution :: EvacSolution
140 emptyEvacSolution = EvacSolution { esMoved = []
145 -- | The complete state for the balancing solution.
146 data Table = Table Node.List Instance.List Score [Placement]
149 -- | Cluster statistics data type.
151 { csFmem :: Integer -- ^ Cluster free mem
152 , csFdsk :: Integer -- ^ Cluster free disk
153 , csAmem :: Integer -- ^ Cluster allocatable mem
154 , csAdsk :: Integer -- ^ Cluster allocatable disk
155 , csAcpu :: Integer -- ^ Cluster allocatable cpus
156 , csMmem :: Integer -- ^ Max node allocatable mem
157 , csMdsk :: Integer -- ^ Max node allocatable disk
158 , csMcpu :: Integer -- ^ Max node allocatable cpu
159 , csImem :: Integer -- ^ Instance used mem
160 , csIdsk :: Integer -- ^ Instance used disk
161 , csIcpu :: Integer -- ^ Instance used cpu
162 , csTmem :: Double -- ^ Cluster total mem
163 , csTdsk :: Double -- ^ Cluster total disk
164 , csTcpu :: Double -- ^ Cluster total cpus
165 , csVcpu :: Integer -- ^ Cluster total virtual cpus
166 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
167 -- physical CPUs, i.e. normalised used phys CPUs
168 , csXmem :: Integer -- ^ Unnacounted for mem
169 , csNmem :: Integer -- ^ Node own memory
170 , csScore :: Score -- ^ The cluster score
171 , csNinst :: Int -- ^ The total number of instances
174 -- | A simple type for allocation functions.
175 type AllocMethod = Node.List -- ^ Node list
176 -> Instance.List -- ^ Instance list
177 -> Maybe Int -- ^ Optional allocation limit
178 -> Instance.Instance -- ^ Instance spec for allocation
179 -> AllocNodes -- ^ Which nodes we should allocate on
180 -> [Instance.Instance] -- ^ Allocated instances
181 -> [CStats] -- ^ Running cluster stats
182 -> Result AllocResult -- ^ Allocation result
184 -- | A simple type for the running solution of evacuations.
185 type EvacInnerState =
186 Either String (Node.List, Instance.Instance, Score, Ndx)
188 -- * Utility functions
190 -- | Verifies the N+1 status and return the affected nodes.
191 verifyN1 :: [Node.Node] -> [Node.Node]
192 verifyN1 = filter Node.failN1
194 {-| Computes the pair of bad nodes and instances.
196 The bad node list is computed via a simple 'verifyN1' check, and the
197 bad instance list is the list of primary and secondary instances of
201 computeBadItems :: Node.List -> Instance.List ->
202 ([Node.Node], [Instance.Instance])
203 computeBadItems nl il =
204 let bad_nodes = verifyN1 $ getOnline nl
205 bad_instances = map (`Container.find` il) .
207 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
209 (bad_nodes, bad_instances)
211 -- | Extracts the node pairs for an instance. This can fail if the
212 -- instance is single-homed. FIXME: this needs to be improved,
213 -- together with the general enhancement for handling non-DRBD moves.
214 instanceNodes :: Node.List -> Instance.Instance ->
215 (Ndx, Ndx, Node.Node, Node.Node)
216 instanceNodes nl inst =
217 let old_pdx = Instance.pNode inst
218 old_sdx = Instance.sNode inst
219 old_p = Container.find old_pdx nl
220 old_s = Container.find old_sdx nl
221 in (old_pdx, old_sdx, old_p, old_s)
223 -- | Zero-initializer for the CStats type.
224 emptyCStats :: CStats
225 emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
227 -- | Update stats with data from a new node.
228 updateCStats :: CStats -> Node.Node -> CStats
229 updateCStats cs node =
230 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
231 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
232 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
233 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
234 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
235 csVcpu = x_vcpu, csNcpu = x_ncpu,
236 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
239 inc_amem = Node.fMem node - Node.rMem node
240 inc_amem' = if inc_amem > 0 then inc_amem else 0
241 inc_adsk = Node.availDisk node
242 inc_imem = truncate (Node.tMem node) - Node.nMem node
243 - Node.xMem node - Node.fMem node
244 inc_icpu = Node.uCpu node
245 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
246 inc_vcpu = Node.hiCpu node
247 inc_acpu = Node.availCpu node
248 inc_ncpu = fromIntegral (Node.uCpu node) /
249 iPolicyVcpuRatio (Node.iPolicy node)
250 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
251 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
252 , csAmem = x_amem + fromIntegral inc_amem'
253 , csAdsk = x_adsk + fromIntegral inc_adsk
254 , csAcpu = x_acpu + fromIntegral inc_acpu
255 , csMmem = max x_mmem (fromIntegral inc_amem')
256 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
257 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
258 , csImem = x_imem + fromIntegral inc_imem
259 , csIdsk = x_idsk + fromIntegral inc_idsk
260 , csIcpu = x_icpu + fromIntegral inc_icpu
261 , csTmem = x_tmem + Node.tMem node
262 , csTdsk = x_tdsk + Node.tDsk node
263 , csTcpu = x_tcpu + Node.tCpu node
264 , csVcpu = x_vcpu + fromIntegral inc_vcpu
265 , csNcpu = x_ncpu + inc_ncpu
266 , csXmem = x_xmem + fromIntegral (Node.xMem node)
267 , csNmem = x_nmem + fromIntegral (Node.nMem node)
268 , csNinst = x_ninst + length (Node.pList node)
271 -- | Compute the total free disk and memory in the cluster.
272 totalResources :: Node.List -> CStats
274 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
275 in cs { csScore = compCV nl }
277 -- | Compute the delta between two cluster state.
279 -- This is used when doing allocations, to understand better the
280 -- available cluster resources. The return value is a triple of the
281 -- current used values, the delta that was still allocated, and what
282 -- was left unallocated.
283 computeAllocationDelta :: CStats -> CStats -> AllocStats
284 computeAllocationDelta cini cfin =
285 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
286 csNcpu = i_ncpu } = cini
287 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
288 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
289 csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
290 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
291 , allocInfoNCpus = i_ncpu
292 , allocInfoMem = fromIntegral i_imem
293 , allocInfoDisk = fromIntegral i_idsk
295 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
296 , allocInfoNCpus = f_ncpu - i_ncpu
297 , allocInfoMem = fromIntegral (f_imem - i_imem)
298 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
300 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
301 , allocInfoNCpus = f_tcpu - f_ncpu
302 , allocInfoMem = truncate t_mem - fromIntegral f_imem
303 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
305 in (rini, rfin, runa)
307 -- | The names and weights of the individual elements in the CV list.
308 detailedCVInfo :: [(Double, String)]
309 detailedCVInfo = [ (1, "free_mem_cv")
310 , (1, "free_disk_cv")
312 , (1, "reserved_mem_cv")
313 , (4, "offline_all_cnt")
314 , (16, "offline_pri_cnt")
315 , (1, "vcpu_ratio_cv")
318 , (1, "disk_load_cv")
320 , (2, "pri_tags_score")
324 -- | Holds the weights used by 'compCVNodes' for each metric.
325 detailedCVWeights :: [Double]
326 detailedCVWeights = map fst detailedCVInfo
328 -- | Compute the mem and disk covariance.
329 compDetailedCV :: [Node.Node] -> [Double]
330 compDetailedCV all_nodes =
331 let (offline, nodes) = partition Node.offline all_nodes
332 mem_l = map Node.pMem nodes
333 dsk_l = map Node.pDsk nodes
334 -- metric: memory covariance
335 mem_cv = stdDev mem_l
336 -- metric: disk covariance
337 dsk_cv = stdDev dsk_l
338 -- metric: count of instances living on N1 failing nodes
339 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
340 length (Node.pList n)) .
341 filter Node.failN1 $ nodes :: Double
342 res_l = map Node.pRem nodes
343 -- metric: reserved memory covariance
344 res_cv = stdDev res_l
345 -- offline instances metrics
346 offline_ipri = sum . map (length . Node.pList) $ offline
347 offline_isec = sum . map (length . Node.sList) $ offline
348 -- metric: count of instances on offline nodes
349 off_score = fromIntegral (offline_ipri + offline_isec)::Double
350 -- metric: count of primary instances on offline nodes (this
351 -- helps with evacuation/failover of primary instances on
352 -- 2-node clusters with one node offline)
353 off_pri_score = fromIntegral offline_ipri::Double
354 cpu_l = map Node.pCpu nodes
355 -- metric: covariance of vcpu/pcpu ratio
356 cpu_cv = stdDev cpu_l
357 -- metrics: covariance of cpu, memory, disk and network load
358 (c_load, m_load, d_load, n_load) =
360 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
361 DynUtil c2 m2 d2 n2 = Node.utilPool n
362 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
363 -- metric: conflicting instance count
364 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
365 pri_tags_score = fromIntegral pri_tags_inst::Double
366 -- metric: spindles %
367 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
368 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
369 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
370 , pri_tags_score, stdDev spindles_cv ]
372 -- | Compute the /total/ variance.
373 compCVNodes :: [Node.Node] -> Double
374 compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
376 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
377 compCV :: Node.List -> Double
378 compCV = compCVNodes . Container.elems
380 -- | Compute online nodes from a 'Node.List'.
381 getOnline :: Node.List -> [Node.Node]
382 getOnline = filter (not . Node.offline) . Container.elems
384 -- * Balancing functions
386 -- | Compute best table. Note that the ordering of the arguments is important.
387 compareTables :: Table -> Table -> Table
388 compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
389 if a_cv > b_cv then b else a
391 -- | Applies an instance move to a given node list and instance.
392 applyMove :: Node.List -> Instance.Instance
393 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
395 applyMove nl inst Failover =
396 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
397 int_p = Node.removePri old_p inst
398 int_s = Node.removeSec old_s inst
399 new_nl = do -- Maybe monad
400 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
401 new_s <- Node.addSec int_p inst old_sdx
402 let new_inst = Instance.setBoth inst old_sdx old_pdx
403 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
404 new_inst, old_sdx, old_pdx)
407 -- Failover to any (fa)
408 applyMove nl inst (FailoverToAny new_pdx) = do
409 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
410 new_pnode = Container.find new_pdx nl
411 force_failover = Node.offline old_pnode
412 new_pnode' <- Node.addPriEx force_failover new_pnode inst
413 let old_pnode' = Node.removePri old_pnode inst
414 inst' = Instance.setPri inst new_pdx
415 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
416 return (nl', inst', new_pdx, old_sdx)
418 -- Replace the primary (f:, r:np, f)
419 applyMove nl inst (ReplacePrimary new_pdx) =
420 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
421 tgt_n = Container.find new_pdx nl
422 int_p = Node.removePri old_p inst
423 int_s = Node.removeSec old_s inst
424 force_p = Node.offline old_p
425 new_nl = do -- Maybe monad
426 -- check that the current secondary can host the instance
427 -- during the migration
428 tmp_s <- Node.addPriEx force_p int_s inst
429 let tmp_s' = Node.removePri tmp_s inst
430 new_p <- Node.addPriEx force_p tgt_n inst
431 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
432 let new_inst = Instance.setPri inst new_pdx
433 return (Container.add new_pdx new_p $
434 Container.addTwo old_pdx int_p old_sdx new_s nl,
435 new_inst, new_pdx, old_sdx)
438 -- Replace the secondary (r:ns)
439 applyMove nl inst (ReplaceSecondary new_sdx) =
440 let old_pdx = Instance.pNode inst
441 old_sdx = Instance.sNode inst
442 old_s = Container.find old_sdx nl
443 tgt_n = Container.find new_sdx nl
444 int_s = Node.removeSec old_s inst
445 force_s = Node.offline old_s
446 new_inst = Instance.setSec inst new_sdx
447 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
448 \new_s -> return (Container.addTwo new_sdx
449 new_s old_sdx int_s nl,
450 new_inst, old_pdx, new_sdx)
453 -- Replace the secondary and failover (r:np, f)
454 applyMove nl inst (ReplaceAndFailover new_pdx) =
455 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
456 tgt_n = Container.find new_pdx nl
457 int_p = Node.removePri old_p inst
458 int_s = Node.removeSec old_s inst
459 force_s = Node.offline old_s
460 new_nl = do -- Maybe monad
461 new_p <- Node.addPri tgt_n inst
462 new_s <- Node.addSecEx force_s int_p inst new_pdx
463 let new_inst = Instance.setBoth inst new_pdx old_pdx
464 return (Container.add new_pdx new_p $
465 Container.addTwo old_pdx new_s old_sdx int_s nl,
466 new_inst, new_pdx, old_pdx)
469 -- Failver and replace the secondary (f, r:ns)
470 applyMove nl inst (FailoverAndReplace new_sdx) =
471 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
472 tgt_n = Container.find new_sdx nl
473 int_p = Node.removePri old_p inst
474 int_s = Node.removeSec old_s inst
475 force_p = Node.offline old_p
476 new_nl = do -- Maybe monad
477 new_p <- Node.addPriEx force_p int_s inst
478 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
479 let new_inst = Instance.setBoth inst old_sdx new_sdx
480 return (Container.add new_sdx new_s $
481 Container.addTwo old_sdx new_p old_pdx int_p nl,
482 new_inst, old_sdx, new_sdx)
485 -- | Tries to allocate an instance on one given node.
486 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
487 -> OpResult Node.AllocElement
488 allocateOnSingle nl inst new_pdx =
489 let p = Container.find new_pdx nl
490 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
492 Instance.instMatchesPolicy inst (Node.iPolicy p)
493 new_p <- Node.addPri p inst
494 let new_nl = Container.add new_pdx new_p nl
495 new_score = compCV new_nl
496 return (new_nl, new_inst, [new_p], new_score)
498 -- | Tries to allocate an instance on a given pair of nodes.
499 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
500 -> OpResult Node.AllocElement
501 allocateOnPair nl inst new_pdx new_sdx =
502 let tgt_p = Container.find new_pdx nl
503 tgt_s = Container.find new_sdx nl
505 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
506 new_p <- Node.addPri tgt_p inst
507 new_s <- Node.addSec tgt_s inst new_pdx
508 let new_inst = Instance.setBoth inst new_pdx new_sdx
509 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
510 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
512 -- | Tries to perform an instance move and returns the best table
513 -- between the original one and the new one.
514 checkSingleStep :: Table -- ^ The original table
515 -> Instance.Instance -- ^ The instance to move
516 -> Table -- ^ The current best table
517 -> IMove -- ^ The move to apply
518 -> Table -- ^ The final best table
519 checkSingleStep ini_tbl target cur_tbl move =
520 let Table ini_nl ini_il _ ini_plc = ini_tbl
521 tmp_resu = applyMove ini_nl target move
524 Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
525 let tgt_idx = Instance.idx target
526 upd_cvar = compCV upd_nl
527 upd_il = Container.add tgt_idx new_inst ini_il
528 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
529 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
530 in compareTables cur_tbl upd_tbl
532 -- | Given the status of the current secondary as a valid new node and
533 -- the current candidate target node, generate the possible moves for
535 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
536 -> Bool -- ^ Whether the secondary node is a valid new node
537 -> Bool -- ^ Whether we can change the primary node
538 -> Ndx -- ^ Target node candidate
539 -> [IMove] -- ^ List of valid result moves
541 possibleMoves MirrorNone _ _ _ = []
543 possibleMoves MirrorExternal _ False _ = []
545 possibleMoves MirrorExternal _ True tdx =
546 [ FailoverToAny tdx ]
548 possibleMoves MirrorInternal _ False tdx =
549 [ ReplaceSecondary tdx ]
551 possibleMoves MirrorInternal True True tdx =
552 [ ReplaceSecondary tdx
553 , ReplaceAndFailover tdx
555 , FailoverAndReplace tdx
558 possibleMoves MirrorInternal False True tdx =
559 [ ReplaceSecondary tdx
560 , ReplaceAndFailover tdx
563 -- | Compute the best move for a given instance.
564 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
565 -> Bool -- ^ Whether disk moves are allowed
566 -> Bool -- ^ Whether instance moves are allowed
567 -> Table -- ^ Original table
568 -> Instance.Instance -- ^ Instance to move
569 -> Table -- ^ Best new table for this instance
570 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
571 let opdx = Instance.pNode target
572 osdx = Instance.sNode target
573 bad_nodes = [opdx, osdx]
574 nodes = filter (`notElem` bad_nodes) nodes_idx
575 mir_type = Instance.mirrorType target
576 use_secondary = elem osdx nodes_idx && inst_moves
577 aft_failover = if mir_type == MirrorInternal && use_secondary
578 -- if drbd and allowed to failover
579 then checkSingleStep ini_tbl target ini_tbl Failover
583 then concatMap (possibleMoves mir_type use_secondary inst_moves)
587 -- iterate over the possible nodes for this instance
588 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
590 -- | Compute the best next move.
591 checkMove :: [Ndx] -- ^ Allowed target node indices
592 -> Bool -- ^ Whether disk moves are allowed
593 -> Bool -- ^ Whether instance moves are allowed
594 -> Table -- ^ The current solution
595 -> [Instance.Instance] -- ^ List of instances still to move
596 -> Table -- ^ The new solution
597 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
598 let Table _ _ _ ini_plc = ini_tbl
599 -- we're using rwhnf from the Control.Parallel.Strategies
600 -- package; we don't need to use rnf as that would force too
601 -- much evaluation in single-threaded cases, and in
602 -- multi-threaded case the weak head normal form is enough to
603 -- spark the evaluation
604 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
607 -- iterate over all instances, computing the best move
608 best_tbl = foldl' compareTables ini_tbl tables
609 Table _ _ _ best_plc = best_tbl
610 in if length best_plc == length ini_plc
611 then ini_tbl -- no advancement
614 -- | Check if we are allowed to go deeper in the balancing.
615 doNextBalance :: Table -- ^ The starting table
616 -> Int -- ^ Remaining length
617 -> Score -- ^ Score at which to stop
618 -> Bool -- ^ The resulting table and commands
619 doNextBalance ini_tbl max_rounds min_score =
620 let Table _ _ ini_cv ini_plc = ini_tbl
621 ini_plc_len = length ini_plc
622 in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
624 -- | Run a balance move.
625 tryBalance :: Table -- ^ The starting table
626 -> Bool -- ^ Allow disk moves
627 -> Bool -- ^ Allow instance moves
628 -> Bool -- ^ Only evacuate moves
629 -> Score -- ^ Min gain threshold
630 -> Score -- ^ Min gain
631 -> Maybe Table -- ^ The resulting table and commands
632 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
633 let Table ini_nl ini_il ini_cv _ = ini_tbl
634 all_inst = Container.elems ini_il
635 all_nodes = Container.elems ini_nl
636 (offline_nodes, online_nodes) = partition Node.offline all_nodes
637 all_inst' = if evac_mode
638 then let bad_nodes = map Node.idx offline_nodes
639 in filter (any (`elem` bad_nodes) .
640 Instance.allNodes) all_inst
642 reloc_inst = filter (\i -> Instance.movable i &&
643 Instance.autoBalance i) all_inst'
644 node_idx = map Node.idx online_nodes
645 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
646 (Table _ _ fin_cv _) = fin_tbl
648 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
649 then Just fin_tbl -- this round made success, return the new table
652 -- * Allocation functions
654 -- | Build failure stats out of a list of failures.
655 collapseFailures :: [FailMode] -> FailStats
656 collapseFailures flst =
657 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
660 -- | Compares two Maybe AllocElement and chooses the best score.
661 bestAllocElement :: Maybe Node.AllocElement
662 -> Maybe Node.AllocElement
663 -> Maybe Node.AllocElement
664 bestAllocElement a Nothing = a
665 bestAllocElement Nothing b = b
666 bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
667 if ascore < bscore then a else b
669 -- | Update current Allocation solution and failure stats with new
671 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
672 concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
674 concatAllocs as (Ok ns) =
675 let -- Choose the old or new solution, based on the cluster score
677 osols = asSolution as
678 nsols = bestAllocElement osols (Just ns)
680 -- Note: we force evaluation of nsols here in order to keep the
681 -- memory profile low - we know that we will need nsols for sure
682 -- in the next cycle, so we force evaluation of nsols, since the
683 -- foldl' in the caller will only evaluate the tuple, but not the
684 -- elements of the tuple
685 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
687 -- | Sums two 'AllocSolution' structures.
688 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
689 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
690 (AllocSolution bFails bAllocs bSols bLog) =
691 -- note: we add b first, since usually it will be smaller; when
692 -- fold'ing, a will grow and grow whereas b is the per-group
693 -- result, hence smaller
694 let nFails = bFails ++ aFails
695 nAllocs = aAllocs + bAllocs
696 nSols = bestAllocElement aSols bSols
698 in AllocSolution nFails nAllocs nSols nLog
700 -- | Given a solution, generates a reasonable description for it.
701 describeSolution :: AllocSolution -> String
702 describeSolution as =
703 let fcnt = asFailures as
706 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
707 filter ((> 0) . snd) . collapseFailures $ fcnt
709 Nothing -> "No valid allocation solutions, failure reasons: " ++
710 (if null fcnt then "unknown reasons" else freasons)
711 Just (_, _, nodes, cv) ->
712 printf ("score: %.8f, successes %d, failures %d (%s)" ++
713 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
714 (intercalate "/" . map Node.name $ nodes)
716 -- | Annotates a solution with the appropriate string.
717 annotateSolution :: AllocSolution -> AllocSolution
718 annotateSolution as = as { asLog = describeSolution as : asLog as }
720 -- | Reverses an evacuation solution.
722 -- Rationale: we always concat the results to the top of the lists, so
723 -- for proper jobset execution, we should reverse all lists.
724 reverseEvacSolution :: EvacSolution -> EvacSolution
725 reverseEvacSolution (EvacSolution f m o) =
726 EvacSolution (reverse f) (reverse m) (reverse o)
728 -- | Generate the valid node allocation singles or pairs for a new instance.
729 genAllocNodes :: Group.List -- ^ Group list
730 -> Node.List -- ^ The node map
731 -> Int -- ^ The number of nodes required
732 -> Bool -- ^ Whether to drop or not
734 -> Result AllocNodes -- ^ The (monadic) result
735 genAllocNodes gl nl count drop_unalloc =
736 let filter_fn = if drop_unalloc
737 then filter (Group.isAllocable .
738 flip Container.find gl . Node.group)
740 all_nodes = filter_fn $ getOnline nl
741 all_pairs = [(Node.idx p,
742 [Node.idx s | s <- all_nodes,
743 Node.idx p /= Node.idx s,
744 Node.group p == Node.group s]) |
747 1 -> Ok (Left (map Node.idx all_nodes))
748 2 -> Ok (Right (filter (not . null . snd) all_pairs))
749 _ -> Bad "Unsupported number of nodes, only one or two supported"
751 -- | Try to allocate an instance on the cluster.
752 tryAlloc :: (Monad m) =>
753 Node.List -- ^ The node list
754 -> Instance.List -- ^ The instance list
755 -> Instance.Instance -- ^ The instance to allocate
756 -> AllocNodes -- ^ The allocation targets
757 -> m AllocSolution -- ^ Possible solution list
758 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
759 tryAlloc nl _ inst (Right ok_pairs) =
760 let psols = parMap rwhnf (\(p, ss) ->
762 concatAllocs cstate .
763 allocateOnPair nl inst p)
764 emptyAllocSolution ss) ok_pairs
765 sols = foldl' sumAllocs emptyAllocSolution psols
766 in return $ annotateSolution sols
768 tryAlloc _ _ _ (Left []) = fail "No online nodes"
769 tryAlloc nl _ inst (Left all_nodes) =
770 let sols = foldl' (\cstate ->
771 concatAllocs cstate . allocateOnSingle nl inst
772 ) emptyAllocSolution all_nodes
773 in return $ annotateSolution sols
775 -- | Given a group/result, describe it as a nice (list of) messages.
776 solutionDescription :: (Group.Group, Result AllocSolution)
778 solutionDescription (grp, result) =
780 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
781 Bad message -> [printf "Group %s: error %s" gname message]
782 where gname = Group.name grp
783 pol = allocPolicyToRaw (Group.allocPolicy grp)
785 -- | From a list of possibly bad and possibly empty solutions, filter
786 -- only the groups with a valid result. Note that the result will be
787 -- reversed compared to the original list.
788 filterMGResults :: [(Group.Group, Result AllocSolution)]
789 -> [(Group.Group, AllocSolution)]
790 filterMGResults = foldl' fn []
791 where unallocable = not . Group.isAllocable
792 fn accu (grp, rasol) =
795 Ok sol | isNothing (asSolution sol) -> accu
796 | unallocable grp -> accu
797 | otherwise -> (grp, sol):accu
799 -- | Sort multigroup results based on policy and score.
800 sortMGResults :: [(Group.Group, AllocSolution)]
801 -> [(Group.Group, AllocSolution)]
803 let extractScore (_, _, _, x) = x
804 solScore (grp, sol) = (Group.allocPolicy grp,
805 (extractScore . fromJust . asSolution) sol)
806 in sortBy (comparing solScore) sols
808 -- | Removes node groups which can't accommodate the instance
809 filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
811 -> ([(Group.Group, (Node.List, Instance.List))], [String])
812 filterValidGroups [] _ = ([], [])
813 filterValidGroups (ng:ngs) inst =
814 let (valid_ngs, msgs) = filterValidGroups ngs inst
815 hasNetwork nic = case Nic.network nic of
816 Just net -> net `elem` Group.networks (fst ng)
818 hasRequiredNetworks = all hasNetwork (Instance.nics inst)
819 in if hasRequiredNetworks
820 then (ng:valid_ngs, msgs)
822 ("group " ++ Group.name (fst ng) ++
823 " is not connected to a network required by instance " ++
824 Instance.name inst):msgs)
826 -- | Finds the best group for an instance on a multi-group cluster.
828 -- Only solutions in @preferred@ and @last_resort@ groups will be
829 -- accepted as valid, and additionally if the allowed groups parameter
830 -- is not null then allocation will only be run for those group
832 findBestAllocGroup :: Group.List -- ^ The group list
833 -> Node.List -- ^ The node list
834 -> Instance.List -- ^ The instance list
835 -> Maybe [Gdx] -- ^ The allowed groups
836 -> Instance.Instance -- ^ The instance to allocate
837 -> Int -- ^ Required number of nodes
838 -> Result (Group.Group, AllocSolution, [String])
839 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
840 let groups_by_idx = splitCluster mgnl mgil
841 groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
842 groups' = maybe groups
843 (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
845 (groups'', filter_group_msgs) = filterValidGroups groups' inst
846 sols = map (\(gr, (nl, il)) ->
847 (gr, genAllocNodes mggl nl cnt False >>=
848 tryAlloc nl il inst))
849 groups''::[(Group.Group, Result AllocSolution)]
850 all_msgs = filter_group_msgs ++ (concatMap solutionDescription sols)
851 goodSols = filterMGResults sols
852 sortedSols = sortMGResults goodSols
853 in case sortedSols of
854 [] -> Bad $ if null groups'
855 then "no groups for evacuation: allowed groups was" ++
856 show allowed_gdxs ++ ", all groups: " ++
857 show (map fst groups)
858 else intercalate ", " all_msgs
859 (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
861 -- | Try to allocate an instance on a multi-group cluster.
862 tryMGAlloc :: Group.List -- ^ The group list
863 -> Node.List -- ^ The node list
864 -> Instance.List -- ^ The instance list
865 -> Instance.Instance -- ^ The instance to allocate
866 -> Int -- ^ Required number of nodes
867 -> Result AllocSolution -- ^ Possible solution list
868 tryMGAlloc mggl mgnl mgil inst cnt = do
869 (best_group, solution, all_msgs) <-
870 findBestAllocGroup mggl mgnl mgil Nothing inst cnt
871 let group_name = Group.name best_group
872 selmsg = "Selected group: " ++ group_name
873 return $ solution { asLog = selmsg:all_msgs }
875 -- | Calculate the new instance list after allocation solution.
876 updateIl :: Instance.List -- ^ The original instance list
877 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
878 -> Instance.List -- ^ The updated instance list
879 updateIl il Nothing = il
880 updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il
882 -- | Extract the the new node list from the allocation solution.
883 extractNl :: Node.List -- ^ The original node list
884 -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
885 -> Node.List -- ^ The new node list
886 extractNl nl Nothing = nl
887 extractNl _ (Just (xnl, _, _, _)) = xnl
889 -- | Try to allocate a list of instances on a multi-group cluster.
890 allocList :: Group.List -- ^ The group list
891 -> Node.List -- ^ The node list
892 -> Instance.List -- ^ The instance list
893 -> [(Instance.Instance, Int)] -- ^ The instance to allocate
894 -> AllocSolutionList -- ^ Possible solution list
895 -> Result (Node.List, Instance.List,
896 AllocSolutionList) -- ^ The final solution list
897 allocList _ nl il [] result = Ok (nl, il, result)
898 allocList gl nl il ((xi, xicnt):xies) result = do
899 ares <- tryMGAlloc gl nl il xi xicnt
900 let sol = asSolution ares
901 nl' = extractNl nl sol
902 il' = updateIl il sol
903 allocList gl nl' il' xies ((xi, ares):result)
905 -- | Function which fails if the requested mode is change secondary.
907 -- This is useful since except DRBD, no other disk template can
908 -- execute change secondary; thus, we can just call this function
909 -- instead of always checking for secondary mode. After the call to
910 -- this function, whatever mode we have is just a primary change.
911 failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
912 failOnSecondaryChange ChangeSecondary dt =
913 fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
914 "' can't execute change secondary"
915 failOnSecondaryChange _ _ = return ()
917 -- | Run evacuation for a single instance.
919 -- /Note:/ this function should correctly execute both intra-group
920 -- evacuations (in all modes) and inter-group evacuations (in the
921 -- 'ChangeAll' mode). Of course, this requires that the correct list
922 -- of target nodes is passed.
923 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
924 -> Instance.List -- ^ Instance list (cluster-wide)
925 -> EvacMode -- ^ The evacuation mode
926 -> Instance.Instance -- ^ The instance to be evacuated
927 -> Gdx -- ^ The group we're targetting
928 -> [Ndx] -- ^ The list of available nodes
930 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
931 nodeEvacInstance nl il mode inst@(Instance.Instance
932 {Instance.diskTemplate = dt@DTDiskless})
934 failOnSecondaryChange mode dt >>
935 evacOneNodeOnly nl il inst gdx avail_nodes
937 nodeEvacInstance _ _ _ (Instance.Instance
938 {Instance.diskTemplate = DTPlain}) _ _ =
939 fail "Instances of type plain cannot be relocated"
941 nodeEvacInstance _ _ _ (Instance.Instance
942 {Instance.diskTemplate = DTFile}) _ _ =
943 fail "Instances of type file cannot be relocated"
945 nodeEvacInstance nl il mode inst@(Instance.Instance
946 {Instance.diskTemplate = dt@DTSharedFile})
948 failOnSecondaryChange mode dt >>
949 evacOneNodeOnly nl il inst gdx avail_nodes
951 nodeEvacInstance nl il mode inst@(Instance.Instance
952 {Instance.diskTemplate = dt@DTBlock})
954 failOnSecondaryChange mode dt >>
955 evacOneNodeOnly nl il inst gdx avail_nodes
957 nodeEvacInstance nl il mode inst@(Instance.Instance
958 {Instance.diskTemplate = dt@DTRbd})
960 failOnSecondaryChange mode dt >>
961 evacOneNodeOnly nl il inst gdx avail_nodes
963 nodeEvacInstance nl il mode inst@(Instance.Instance
964 {Instance.diskTemplate = dt@DTExt})
966 failOnSecondaryChange mode dt >>
967 evacOneNodeOnly nl il inst gdx avail_nodes
969 nodeEvacInstance nl il ChangePrimary
970 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
973 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
974 let idx = Instance.idx inst
975 il' = Container.add idx inst' il
976 ops = iMoveToJob nl' il' idx Failover
977 return (nl', il', ops)
979 nodeEvacInstance nl il ChangeSecondary
980 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
982 evacOneNodeOnly nl il inst gdx avail_nodes
984 -- The algorithm for ChangeAll is as follows:
986 -- * generate all (primary, secondary) node pairs for the target groups
987 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
988 -- the final node list state and group score
989 -- * select the best choice via a foldl that uses the same Either
990 -- String solution as the ChangeSecondary mode
991 nodeEvacInstance nl il ChangeAll
992 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
995 let no_nodes = Left "no nodes available"
996 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
997 (nl', il', ops, _) <-
998 annotateResult "Can't find any good nodes for relocation" .
1001 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
1005 -- we don't need more details (which
1006 -- nodes, etc.) as we only selected
1007 -- this group if we can allocate on
1008 -- it, hence failures will not
1009 -- propagate out of this fold loop
1010 Left _ -> Left $ "Allocation failed: " ++ msg
1011 Ok result@(_, _, _, new_cv) ->
1012 let new_accu = Right result in
1015 Right (_, _, _, old_cv) ->
1019 ) no_nodes node_pairs
1021 return (nl', il', ops)
1023 -- | Generic function for changing one node of an instance.
1025 -- This is similar to 'nodeEvacInstance' but will be used in a few of
1026 -- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1027 -- over the list of available nodes, which results in the best choice
1029 evacOneNodeOnly :: Node.List -- ^ The node list (cluster-wide)
1030 -> Instance.List -- ^ Instance list (cluster-wide)
1031 -> Instance.Instance -- ^ The instance to be evacuated
1032 -> Gdx -- ^ The group we're targetting
1033 -> [Ndx] -- ^ The list of available nodes
1035 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1036 evacOneNodeOnly nl il inst gdx avail_nodes = do
1037 op_fn <- case Instance.mirrorType inst of
1038 MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1039 MirrorInternal -> Ok ReplaceSecondary
1040 MirrorExternal -> Ok FailoverToAny
1041 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1043 foldl' (evacOneNodeInner nl inst gdx op_fn)
1044 (Left "no nodes available") avail_nodes
1045 let idx = Instance.idx inst
1046 il' = Container.add idx inst' il
1047 ops = iMoveToJob nl' il' idx (op_fn ndx)
1048 return (nl', il', ops)
1050 -- | Inner fold function for changing one node of an instance.
1052 -- Depending on the instance disk template, this will either change
1053 -- the secondary (for DRBD) or the primary node (for shared
1054 -- storage). However, the operation is generic otherwise.
1056 -- The running solution is either a @Left String@, which means we
1057 -- don't have yet a working solution, or a @Right (...)@, which
1058 -- represents a valid solution; it holds the modified node list, the
1059 -- modified instance (after evacuation), the score of that solution,
1060 -- and the new secondary node index.
1061 evacOneNodeInner :: Node.List -- ^ Cluster node list
1062 -> Instance.Instance -- ^ Instance being evacuated
1063 -> Gdx -- ^ The group index of the instance
1064 -> (Ndx -> IMove) -- ^ Operation constructor
1065 -> EvacInnerState -- ^ Current best solution
1066 -> Ndx -- ^ Node we're evaluating as target
1067 -> EvacInnerState -- ^ New best solution
1068 evacOneNodeInner nl inst gdx op_fn accu ndx =
1069 case applyMove nl inst (op_fn ndx) of
1070 Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1071 " failed: " ++ show fm
1072 in either (const $ Left fail_msg) (const accu) accu
1073 Ok (nl', inst', _, _) ->
1074 let nodes = Container.elems nl'
1075 -- The fromJust below is ugly (it can fail nastily), but
1076 -- at this point we should have any internal mismatches,
1077 -- and adding a monad here would be quite involved
1078 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1079 new_cv = compCVNodes grpnodes
1080 new_accu = Right (nl', inst', new_cv, ndx)
1083 Right (_, _, old_cv, _) ->
1088 -- | Compute result of changing all nodes of a DRBD instance.
1090 -- Given the target primary and secondary node (which might be in a
1091 -- different group or not), this function will 'execute' all the
1092 -- required steps and assuming all operations succceed, will return
1093 -- the modified node and instance lists, the opcodes needed for this
1094 -- and the new group score.
1095 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1096 -> Instance.List -- ^ Cluster instance list
1097 -> Instance.Instance -- ^ The instance to be moved
1098 -> Gdx -- ^ The target group index
1099 -- (which can differ from the
1100 -- current group of the
1102 -> (Ndx, Ndx) -- ^ Tuple of new
1103 -- primary\/secondary nodes
1104 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1105 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1106 let primary = Container.find (Instance.pNode inst) nl
1107 idx = Instance.idx inst
1108 -- if the primary is offline, then we first failover
1109 (nl1, inst1, ops1) <-
1110 if Node.offline primary
1112 (nl', inst', _, _) <-
1113 annotateResult "Failing over to the secondary" .
1114 opToResult $ applyMove nl inst Failover
1115 return (nl', inst', [Failover])
1116 else return (nl, inst, [])
1117 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1119 ReplaceSecondary t_sdx)
1120 -- we now need to execute a replace secondary to the future
1122 (nl2, inst2, _, _) <-
1123 annotateResult "Changing secondary to new primary" .
1125 applyMove nl1 inst1 o1
1127 -- we now execute another failover, the primary stays fixed now
1128 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1129 opToResult $ applyMove nl2 inst2 o2
1131 -- and finally another replace secondary, to the final secondary
1132 (nl4, inst4, _, _) <-
1133 annotateResult "Changing secondary to final secondary" .
1135 applyMove nl3 inst3 o3
1137 il' = Container.add idx inst4 il
1138 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1139 let nodes = Container.elems nl4
1140 -- The fromJust below is ugly (it can fail nastily), but
1141 -- at this point we should have any internal mismatches,
1142 -- and adding a monad here would be quite involved
1143 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1144 new_cv = compCVNodes grpnodes
1145 return (nl4, il', ops, new_cv)
1147 -- | Computes the nodes in a given group which are available for
1149 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1150 -> IntSet.IntSet -- ^ Nodes that are excluded
1151 -> Gdx -- ^ The group for which we
1153 -> Result [Ndx] -- ^ List of available node indices
1154 availableGroupNodes group_nodes excl_ndx gdx = do
1155 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1156 Ok (lookup gdx group_nodes)
1157 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1160 -- | Updates the evac solution with the results of an instance
1162 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1164 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1165 -> (Node.List, Instance.List, EvacSolution)
1166 updateEvacSolution (nl, il, es) idx (Bad msg) =
1167 (nl, il, es { esFailed = (idx, msg):esFailed es})
1168 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1169 (nl, il, es { esMoved = new_elem:esMoved es
1170 , esOpCodes = opcodes:esOpCodes es })
1171 where inst = Container.find idx il
1173 instancePriGroup nl inst,
1174 Instance.allNodes inst)
1176 -- | Node-evacuation IAllocator mode main function.
1177 tryNodeEvac :: Group.List -- ^ The cluster groups
1178 -> Node.List -- ^ The node list (cluster-wide, not per group)
1179 -> Instance.List -- ^ Instance list (cluster-wide)
1180 -> EvacMode -- ^ The evacuation mode
1181 -> [Idx] -- ^ List of instance (indices) to be evacuated
1182 -> Result (Node.List, Instance.List, EvacSolution)
1183 tryNodeEvac _ ini_nl ini_il mode idxs =
1184 let evac_ndx = nodesToEvacuate ini_il mode idxs
1185 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1186 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
1187 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1188 (Container.elems nl))) $
1189 splitCluster ini_nl ini_il
1190 (fin_nl, fin_il, esol) =
1191 foldl' (\state@(nl, il, _) inst ->
1192 let gdx = instancePriGroup nl inst
1193 pdx = Instance.pNode inst in
1194 updateEvacSolution state (Instance.idx inst) $
1195 availableGroupNodes group_ndx
1196 (IntSet.insert pdx excl_ndx) gdx >>=
1197 nodeEvacInstance nl il mode inst gdx
1199 (ini_nl, ini_il, emptyEvacSolution)
1200 (map (`Container.find` ini_il) idxs)
1201 in return (fin_nl, fin_il, reverseEvacSolution esol)
1203 -- | Change-group IAllocator mode main function.
1205 -- This is very similar to 'tryNodeEvac', the only difference is that
1206 -- we don't choose as target group the current instance group, but
1209 -- 1. at the start of the function, we compute which are the target
1210 -- groups; either no groups were passed in, in which case we choose
1211 -- all groups out of which we don't evacuate instance, or there were
1212 -- some groups passed, in which case we use those
1214 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1215 -- best group to hold the instance, and then we do what
1216 -- 'tryNodeEvac' does, except for this group instead of the current
1219 -- Note that the correct behaviour of this function relies on the
1220 -- function 'nodeEvacInstance' to be able to do correctly both
1221 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1222 tryChangeGroup :: Group.List -- ^ The cluster groups
1223 -> Node.List -- ^ The node list (cluster-wide)
1224 -> Instance.List -- ^ Instance list (cluster-wide)
1225 -> [Gdx] -- ^ Target groups; if empty, any
1226 -- groups not being evacuated
1227 -> [Idx] -- ^ List of instance (indices) to be evacuated
1228 -> Result (Node.List, Instance.List, EvacSolution)
1229 tryChangeGroup gl ini_nl ini_il gdxs idxs =
1230 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1231 flip Container.find ini_il) idxs
1232 target_gdxs = (if null gdxs
1233 then Container.keys gl
1234 else gdxs) \\ evac_gdxs
1235 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1236 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
1237 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1238 (Container.elems nl))) $
1239 splitCluster ini_nl ini_il
1240 (fin_nl, fin_il, esol) =
1241 foldl' (\state@(nl, il, _) inst ->
1243 let ncnt = Instance.requiredNodes $
1244 Instance.diskTemplate inst
1245 (grp, _, _) <- findBestAllocGroup gl nl il
1246 (Just target_gdxs) inst ncnt
1247 let gdx = Group.idx grp
1248 av_nodes <- availableGroupNodes group_ndx
1250 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1251 in updateEvacSolution state (Instance.idx inst) solution
1253 (ini_nl, ini_il, emptyEvacSolution)
1254 (map (`Container.find` ini_il) idxs)
1255 in return (fin_nl, fin_il, reverseEvacSolution esol)
1257 -- | Standard-sized allocation method.
1259 -- This places instances of the same size on the cluster until we're
1260 -- out of space. The result will be a list of identically-sized
1262 iterateAlloc :: AllocMethod
1263 iterateAlloc nl il limit newinst allocnodes ixes cstats =
1264 let depth = length ixes
1265 newname = printf "new-%d" depth::String
1266 newidx = Container.size il
1267 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
1268 newlimit = fmap (flip (-) 1) limit
1269 in case tryAlloc nl il newi2 allocnodes of
1271 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
1272 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1275 Just (xnl, xi, _, _) ->
1278 else iterateAlloc xnl (Container.add newidx xi il)
1279 newlimit newinst allocnodes (xi:ixes)
1280 (totalResources xnl:cstats)
1282 -- | Tiered allocation method.
1284 -- This places instances on the cluster, and decreases the spec until
1285 -- we can allocate again. The result will be a list of decreasing
1287 tieredAlloc :: AllocMethod
1288 tieredAlloc nl il limit newinst allocnodes ixes cstats =
1289 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1291 Ok (errs, nl', il', ixes', cstats') ->
1292 let newsol = Ok (errs, nl', il', ixes', cstats')
1293 ixes_cnt = length ixes'
1294 (stop, newlimit) = case limit of
1295 Nothing -> (False, Nothing)
1296 Just n -> (n <= ixes_cnt,
1297 Just (n - ixes_cnt)) in
1298 if stop then newsol else
1299 case Instance.shrinkByType newinst . fst . last $
1300 sortBy (comparing snd) errs of
1302 Ok newinst' -> tieredAlloc nl' il' newlimit
1303 newinst' allocnodes ixes' cstats'
1305 -- * Formatting functions
1307 -- | Given the original and final nodes, computes the relocation description.
1308 computeMoves :: Instance.Instance -- ^ The instance to be moved
1309 -> String -- ^ The instance name
1310 -> IMove -- ^ The move being performed
1311 -> String -- ^ New primary
1312 -> String -- ^ New secondary
1313 -> (String, [String])
1314 -- ^ Tuple of moves and commands list; moves is containing
1315 -- either @/f/@ for failover or @/r:name/@ for replace
1316 -- secondary, while the command list holds gnt-instance
1317 -- commands (without that prefix), e.g \"@failover instance1@\"
1318 computeMoves i inam mv c d =
1320 Failover -> ("f", [mig])
1321 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1322 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1323 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1324 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1325 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1326 where morf = if Instance.isRunning i then "migrate" else "failover"
1327 mig = printf "%s -f %s" morf inam::String
1328 mig_any = printf "%s -f -n %s %s" morf c inam::String
1329 rep n = printf "replace-disks -n %s %s" n inam::String
1331 -- | Converts a placement to string format.
1332 printSolutionLine :: Node.List -- ^ The node list
1333 -> Instance.List -- ^ The instance list
1334 -> Int -- ^ Maximum node name length
1335 -> Int -- ^ Maximum instance name length
1336 -> Placement -- ^ The current placement
1337 -> Int -- ^ The index of the placement in
1339 -> (String, [String])
1340 printSolutionLine nl il nmlen imlen plc pos =
1341 let pmlen = (2*nmlen + 1)
1342 (i, p, s, mv, c) = plc
1343 old_sec = Instance.sNode inst
1344 inst = Container.find i il
1345 inam = Instance.alias inst
1346 npri = Node.alias $ Container.find p nl
1347 nsec = Node.alias $ Container.find s nl
1348 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1349 osec = Node.alias $ Container.find old_sec nl
1350 (moves, cmds) = computeMoves inst inam mv npri nsec
1351 -- FIXME: this should check instead/also the disk template
1352 ostr = if old_sec == Node.noSecondary
1353 then printf "%s" opri::String
1354 else printf "%s:%s" opri osec::String
1355 nstr = if s == Node.noSecondary
1356 then printf "%s" npri::String
1357 else printf "%s:%s" npri nsec::String
1358 in (printf " %3d. %-*s %-*s => %-*s %12.8f a=%s"
1359 pos imlen inam pmlen ostr pmlen nstr c moves,
1362 -- | Return the instance and involved nodes in an instance move.
1364 -- Note that the output list length can vary, and is not required nor
1365 -- guaranteed to be of any specific length.
1366 involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
1367 -- the instance from its index; note
1368 -- that this /must/ be the original
1369 -- instance list, so that we can
1370 -- retrieve the old nodes
1371 -> Placement -- ^ The placement we're investigating,
1372 -- containing the new nodes and
1374 -> [Ndx] -- ^ Resulting list of node indices
1375 involvedNodes il plc =
1376 let (i, np, ns, _, _) = plc
1377 inst = Container.find i il
1378 in nub $ [np, ns] ++ Instance.allNodes inst
1380 -- | Inner function for splitJobs, that either appends the next job to
1381 -- the current jobset, or starts a new jobset.
1382 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
1383 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
1384 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
1385 | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
1386 | otherwise = ([n]:cjs, ndx)
1388 -- | Break a list of moves into independent groups. Note that this
1389 -- will reverse the order of jobs.
1390 splitJobs :: [MoveJob] -> [JobSet]
1391 splitJobs = fst . foldl mergeJobs ([], [])
1393 -- | Given a list of commands, prefix them with @gnt-instance@ and
1394 -- also beautify the display a little.
1395 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
1396 formatJob jsn jsl (sn, (_, _, _, cmds)) =
1398 printf " echo job %d/%d" jsn sn:
1400 map (" gnt-instance " ++) cmds
1402 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1405 -- | Given a list of commands, prefix them with @gnt-instance@ and
1406 -- also beautify the display a little.
1407 formatCmds :: [JobSet] -> String
1410 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1414 -- | Print the node list.
1415 printNodes :: Node.List -> [String] -> String
1417 let fields = case fs of
1418 [] -> Node.defaultFields
1419 "+":rest -> Node.defaultFields ++ rest
1421 snl = sortBy (comparing Node.idx) (Container.elems nl)
1422 (header, isnum) = unzip $ map Node.showHeader fields
1423 in printTable "" header (map (Node.list fields) snl) isnum
1425 -- | Print the instance list.
1426 printInsts :: Node.List -> Instance.List -> String
1428 let sil = sortBy (comparing Instance.idx) (Container.elems il)
1429 helper inst = [ if Instance.isRunning inst then "R" else " "
1430 , Instance.name inst
1431 , Container.nameOf nl (Instance.pNode inst)
1432 , let sdx = Instance.sNode inst
1433 in if sdx == Node.noSecondary
1435 else Container.nameOf nl sdx
1436 , if Instance.autoBalance inst then "Y" else "N"
1437 , printf "%3d" $ Instance.vcpus inst
1438 , printf "%5d" $ Instance.mem inst
1439 , printf "%5d" $ Instance.dsk inst `div` 1024
1445 where DynUtil lC lM lD lN = Instance.util inst
1446 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1447 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
1448 isnum = False:False:False:False:False:repeat True
1449 in printTable "" header (map helper sil) isnum
1451 -- | Shows statistics for a given node list.
1452 printStats :: String -> Node.List -> String
1454 let dcvs = compDetailedCV $ Container.elems nl
1455 (weights, names) = unzip detailedCVInfo
1456 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1457 header = [ "Field", "Value", "Weight" ]
1458 formatted = map (\(w, h, val) ->
1463 in printTable lp header formatted $ False:repeat True
1465 -- | Convert a placement into a list of OpCodes (basically a job).
1466 iMoveToJob :: Node.List -- ^ The node list; only used for node
1467 -- names, so any version is good
1468 -- (before or after the operation)
1469 -> Instance.List -- ^ The instance list; also used for
1471 -> Idx -- ^ The index of the instance being
1473 -> IMove -- ^ The actual move to be described
1474 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1476 iMoveToJob nl il idx move =
1477 let inst = Container.find idx il
1478 iname = Instance.name inst
1479 lookNode n = case mkNonEmpty (Container.nameOf nl n) of
1480 -- FIXME: convert htools codebase to non-empty strings
1481 Bad msg -> error $ "Empty node name for idx " ++
1482 show n ++ ": " ++ msg ++ "??"
1484 opF = OpCodes.OpInstanceMigrate
1485 { OpCodes.opInstanceName = iname
1486 , OpCodes.opMigrationMode = Nothing -- default
1487 , OpCodes.opOldLiveMode = Nothing -- default as well
1488 , OpCodes.opTargetNode = Nothing -- this is drbd
1489 , OpCodes.opAllowRuntimeChanges = False
1490 , OpCodes.opIgnoreIpolicy = False
1491 , OpCodes.opMigrationCleanup = False
1492 , OpCodes.opIallocator = Nothing
1493 , OpCodes.opAllowFailover = True }
1494 opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
1495 opR n = OpCodes.OpInstanceReplaceDisks
1496 { OpCodes.opInstanceName = iname
1497 , OpCodes.opEarlyRelease = False
1498 , OpCodes.opIgnoreIpolicy = False
1499 , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1500 , OpCodes.opReplaceDisksList = []
1501 , OpCodes.opRemoteNode = lookNode n
1502 , OpCodes.opIallocator = Nothing
1506 FailoverToAny np -> [ opFA np ]
1507 ReplacePrimary np -> [ opF, opR np, opF ]
1508 ReplaceSecondary ns -> [ opR ns ]
1509 ReplaceAndFailover np -> [ opR np, opF ]
1510 FailoverAndReplace ns -> [ opF, opR ns ]
1512 -- * Node group functions
1514 -- | Computes the group of an instance.
1515 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1516 instanceGroup nl i =
1517 let sidx = Instance.sNode i
1518 pnode = Container.find (Instance.pNode i) nl
1519 snode = if sidx == Node.noSecondary
1521 else Container.find sidx nl
1522 pgroup = Node.group pnode
1523 sgroup = Node.group snode
1524 in if pgroup /= sgroup
1525 then fail ("Instance placed accross two node groups, primary " ++
1526 show pgroup ++ ", secondary " ++ show sgroup)
1529 -- | Computes the group of an instance per the primary node.
1530 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1531 instancePriGroup nl i =
1532 let pnode = Container.find (Instance.pNode i) nl
1535 -- | Compute the list of badly allocated instances (split across node
1537 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1538 findSplitInstances nl =
1539 filter (not . isOk . instanceGroup nl) . Container.elems
1541 -- | Splits a cluster into the component node groups.
1542 splitCluster :: Node.List -> Instance.List ->
1543 [(Gdx, (Node.List, Instance.List))]
1544 splitCluster nl il =
1545 let ngroups = Node.computeGroups (Container.elems nl)
1546 in map (\(gdx, nodes) ->
1547 let nidxs = map Node.idx nodes
1548 nodes' = zip nidxs nodes
1549 instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1550 in (gdx, (Container.fromList nodes', instances))) ngroups
1552 -- | Compute the list of nodes that are to be evacuated, given a list
1553 -- of instances and an evacuation mode.
1554 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1555 -> EvacMode -- ^ The evacuation mode we're using
1556 -> [Idx] -- ^ List of instance indices being evacuated
1557 -> IntSet.IntSet -- ^ Set of node indices
1558 nodesToEvacuate il mode =
1559 IntSet.delete Node.noSecondary .
1561 let i = Container.find idx il
1562 pdx = Instance.pNode i
1563 sdx = Instance.sNode i
1564 dt = Instance.diskTemplate i
1565 withSecondary = case dt of
1566 DTDrbd8 -> IntSet.insert sdx ns
1569 ChangePrimary -> IntSet.insert pdx ns
1570 ChangeSecondary -> withSecondary
1571 ChangeAll -> IntSet.insert pdx withSecondary