1 {-| Implementation of cluster-wide logic.
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
10 Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
29 module Ganeti.HTools.Cluster
40 -- * Generic functions
42 , computeAllocationDelta
43 -- * First phase functions
45 -- * Second phase functions
50 -- * Display functions
  -- * Balancing functions
62 -- * IAllocator functions
70 -- * Allocation functions
73 -- * Node group functions
79 import Control.Applicative (liftA2)
80 import Control.Arrow ((&&&))
81 import qualified Data.IntSet as IntSet
83 import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
84 import Data.Ord (comparing)
85 import Text.Printf (printf)
87 import Ganeti.BasicTypes
88 import qualified Ganeti.HTools.Container as Container
89 import qualified Ganeti.HTools.Instance as Instance
90 import qualified Ganeti.HTools.Nic as Nic
91 import qualified Ganeti.HTools.Node as Node
92 import qualified Ganeti.HTools.Group as Group
93 import Ganeti.HTools.Types
95 import qualified Ganeti.OpCodes as OpCodes
97 import Ganeti.Types (mkNonEmpty)
101 -- | Allocation\/relocation solution.
102 data AllocSolution = AllocSolution
103 { asFailures :: [FailMode] -- ^ Failure counts
104 , asAllocs :: Int -- ^ Good allocation count
105 , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
106 , asLog :: [String] -- ^ Informational messages
109 -- | Node evacuation/group change iallocator result type. This result
110 -- type consists of actual opcodes (a restricted subset) that are
111 -- transmitted back to Ganeti.
112 data EvacSolution = EvacSolution
113 { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
114 , esFailed :: [(Idx, String)] -- ^ Instances which were not
116 , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
-- | The result tuple of an allocation run, as used in 'iterateAlloc'
-- and 'tieredAlloc': failure statistics, the node list, the instance
-- list, a list of instances and a list of cluster statistics.
type AllocResult =
  ( FailStats
  , Node.List
  , Instance.List
  , [Instance.Instance]
  , [CStats]
  )
-- | Convenience alias: a list pairing each instance with its
-- 'AllocSolution'.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
-- | The valid allocation targets, for either allocation mode.
--
-- A one-node allocation is described by @Left ['Ndx']@. A two-node
-- allocation is described by @Right [('Ndx', ['Ndx'])]@, which is
-- basically an association list: each entry is keyed by a primary
-- node and holds, in its sub-list, the potential secondary nodes for
-- that primary.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
-- | The starting point for computing allocations: no failures, no
-- successful allocations, no chosen solution and an empty log.
emptyAllocSolution :: AllocSolution
emptyAllocSolution =
  AllocSolution
    { asFailures = []
    , asAllocs   = 0
    , asSolution = Nothing
    , asLog      = []
    }
140 -- | The empty evac solution.
141 emptyEvacSolution :: EvacSolution
142 emptyEvacSolution = EvacSolution { esMoved = []
147 -- | The complete state for the balancing solution.
148 data Table = Table Node.List Instance.List Score [Placement]
151 -- | Cluster statistics data type.
153 { csFmem :: Integer -- ^ Cluster free mem
154 , csFdsk :: Integer -- ^ Cluster free disk
155 , csFspn :: Integer -- ^ Cluster free spindles
156 , csAmem :: Integer -- ^ Cluster allocatable mem
157 , csAdsk :: Integer -- ^ Cluster allocatable disk
158 , csAcpu :: Integer -- ^ Cluster allocatable cpus
159 , csMmem :: Integer -- ^ Max node allocatable mem
160 , csMdsk :: Integer -- ^ Max node allocatable disk
161 , csMcpu :: Integer -- ^ Max node allocatable cpu
162 , csImem :: Integer -- ^ Instance used mem
163 , csIdsk :: Integer -- ^ Instance used disk
164 , csIspn :: Integer -- ^ Instance used spindles
165 , csIcpu :: Integer -- ^ Instance used cpu
166 , csTmem :: Double -- ^ Cluster total mem
167 , csTdsk :: Double -- ^ Cluster total disk
168 , csTspn :: Double -- ^ Cluster total spindles
169 , csTcpu :: Double -- ^ Cluster total cpus
170 , csVcpu :: Integer -- ^ Cluster total virtual cpus
171 , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
172 -- physical CPUs, i.e. normalised used phys CPUs
173 , csXmem :: Integer -- ^ Unnacounted for mem
174 , csNmem :: Integer -- ^ Node own memory
175 , csScore :: Score -- ^ The cluster score
176 , csNinst :: Int -- ^ The total number of instances
-- | The common signature of the allocation functions.
type AllocMethod
  =  Node.List           -- ^ Node list
  -> Instance.List       -- ^ Instance list
  -> Maybe Int           -- ^ Optional allocation limit
  -> Instance.Instance   -- ^ Instance spec for allocation
  -> AllocNodes          -- ^ Which nodes we should allocate on
  -> [Instance.Instance] -- ^ Allocated instances
  -> [CStats]            -- ^ Running cluster stats
  -> Result AllocResult  -- ^ Allocation result
-- | The running solution of an evacuation: either a 'String'
-- (presumably an error message; confirm against the users of this
-- type) or a tuple of node list, instance, score and node index.
type EvacInnerState = Either String (Node.List, Instance.Instance, Score, Ndx)
193 -- * Utility functions
-- | Checks the N+1 status of the given nodes and returns the
-- affected ones, i.e. those for which 'Node.failN1' holds.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [ n | n <- nodes, Node.failN1 n ]
199 {-| Computes the pair of bad nodes and instances.
201 The bad node list is computed via a simple 'verifyN1' check, and the
202 bad instance list is the list of primary and secondary instances of
206 computeBadItems :: Node.List -> Instance.List ->
207 ([Node.Node], [Instance.Instance])
208 computeBadItems nl il =
209 let bad_nodes = verifyN1 $ getOnline nl
210 bad_instances = map (`Container.find` il) .
212 concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
214 (bad_nodes, bad_instances)
-- | Looks up the primary and secondary nodes of an instance.
--
-- Returns the primary index, the secondary index, the primary node
-- and the secondary node, in that order. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance
              -> (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst = (pdx, sdx, nodeAt pdx, nodeAt sdx)
  where pdx = Instance.pNode inst
        sdx = Instance.sNode inst
        -- resolve a node index against the given node list
        nodeAt ndx = Container.find ndx nl
-- | Zero-initializer for the 'CStats' type: every field starts at 0.
emptyCStats :: CStats
emptyCStats = CStats
  { csFmem  = 0
  , csFdsk  = 0
  , csFspn  = 0
  , csAmem  = 0
  , csAdsk  = 0
  , csAcpu  = 0
  , csMmem  = 0
  , csMdsk  = 0
  , csMcpu  = 0
  , csImem  = 0
  , csIdsk  = 0
  , csIspn  = 0
  , csIcpu  = 0
  , csTmem  = 0
  , csTdsk  = 0
  , csTspn  = 0
  , csTcpu  = 0
  , csVcpu  = 0
  , csNcpu  = 0
  , csXmem  = 0
  , csNmem  = 0
  , csScore = 0
  , csNinst = 0
  }
232 -- | Update stats with data from a new node.
233 updateCStats :: CStats -> Node.Node -> CStats
234 updateCStats cs node =
235 let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
236 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
237 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
238 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
239 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
240 csVcpu = x_vcpu, csNcpu = x_ncpu,
241 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
242 csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
245 inc_amem = Node.fMem node - Node.rMem node
246 inc_amem' = if inc_amem > 0 then inc_amem else 0
247 inc_adsk = Node.availDisk node
248 inc_imem = truncate (Node.tMem node) - Node.nMem node
249 - Node.xMem node - Node.fMem node
250 inc_icpu = Node.uCpu node
251 inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
252 inc_ispn = Node.tSpindles node - Node.fSpindles node
253 inc_vcpu = Node.hiCpu node
254 inc_acpu = Node.availCpu node
255 inc_ncpu = fromIntegral (Node.uCpu node) /
256 iPolicyVcpuRatio (Node.iPolicy node)
257 in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
258 , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
259 , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
260 , csAmem = x_amem + fromIntegral inc_amem'
261 , csAdsk = x_adsk + fromIntegral inc_adsk
262 , csAcpu = x_acpu + fromIntegral inc_acpu
263 , csMmem = max x_mmem (fromIntegral inc_amem')
264 , csMdsk = max x_mdsk (fromIntegral inc_adsk)
265 , csMcpu = max x_mcpu (fromIntegral inc_acpu)
266 , csImem = x_imem + fromIntegral inc_imem
267 , csIdsk = x_idsk + fromIntegral inc_idsk
268 , csIspn = x_ispn + fromIntegral inc_ispn
269 , csIcpu = x_icpu + fromIntegral inc_icpu
270 , csTmem = x_tmem + Node.tMem node
271 , csTdsk = x_tdsk + Node.tDsk node
272 , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
273 , csTcpu = x_tcpu + Node.tCpu node
274 , csVcpu = x_vcpu + fromIntegral inc_vcpu
275 , csNcpu = x_ncpu + inc_ncpu
276 , csXmem = x_xmem + fromIntegral (Node.xMem node)
277 , csNmem = x_nmem + fromIntegral (Node.nMem node)
278 , csNinst = x_ninst + length (Node.pList node)
281 -- | Compute the total free disk and memory in the cluster.
282 totalResources :: Node.List -> CStats
284 let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
285 in cs { csScore = compCV nl }
-- | Compute the delta between two cluster states.
289 -- This is used when doing allocations, to understand better the
290 -- available cluster resources. The return value is a triple of the
291 -- current used values, the delta that was still allocated, and what
292 -- was left unallocated.
293 computeAllocationDelta :: CStats -> CStats -> AllocStats
294 computeAllocationDelta cini cfin =
295 let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
296 csNcpu = i_ncpu, csIspn = i_ispn } = cini
297 CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
298 csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
299 csNcpu = f_ncpu, csTcpu = f_tcpu,
300 csIspn = f_ispn, csTspn = t_spn } = cfin
301 rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
302 , allocInfoNCpus = i_ncpu
303 , allocInfoMem = fromIntegral i_imem
304 , allocInfoDisk = fromIntegral i_idsk
305 , allocInfoSpn = fromIntegral i_ispn
307 rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
308 , allocInfoNCpus = f_ncpu - i_ncpu
309 , allocInfoMem = fromIntegral (f_imem - i_imem)
310 , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
311 , allocInfoSpn = fromIntegral (f_ispn - i_ispn)
313 runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
314 , allocInfoNCpus = f_tcpu - f_ncpu
315 , allocInfoMem = truncate t_mem - fromIntegral f_imem
316 , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
317 , allocInfoSpn = truncate t_spn - fromIntegral f_ispn
319 in (rini, rfin, runa)
321 -- | The names and weights of the individual elements in the CV list.
322 detailedCVInfo :: [(Double, String)]
323 detailedCVInfo = [ (1, "free_mem_cv")
324 , (1, "free_disk_cv")
326 , (1, "reserved_mem_cv")
327 , (4, "offline_all_cnt")
328 , (16, "offline_pri_cnt")
329 , (1, "vcpu_ratio_cv")
332 , (1, "disk_load_cv")
334 , (2, "pri_tags_score")
-- | The weights used by 'compCVNodes' for each metric, taken from the
-- first component of every 'detailedCVInfo' entry (in the same order).
detailedCVWeights :: [Double]
detailedCVWeights = [ weight | (weight, _) <- detailedCVInfo ]
342 -- | Compute the mem and disk covariance.
343 compDetailedCV :: [Node.Node] -> [Double]
344 compDetailedCV all_nodes =
345 let (offline, nodes) = partition Node.offline all_nodes
346 mem_l = map Node.pMem nodes
347 dsk_l = map Node.pDsk nodes
348 -- metric: memory covariance
349 mem_cv = stdDev mem_l
350 -- metric: disk covariance
351 dsk_cv = stdDev dsk_l
352 -- metric: count of instances living on N1 failing nodes
353 n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
354 length (Node.pList n)) .
355 filter Node.failN1 $ nodes :: Double
356 res_l = map Node.pRem nodes
357 -- metric: reserved memory covariance
358 res_cv = stdDev res_l
359 -- offline instances metrics
360 offline_ipri = sum . map (length . Node.pList) $ offline
361 offline_isec = sum . map (length . Node.sList) $ offline
362 -- metric: count of instances on offline nodes
363 off_score = fromIntegral (offline_ipri + offline_isec)::Double
364 -- metric: count of primary instances on offline nodes (this
365 -- helps with evacuation/failover of primary instances on
366 -- 2-node clusters with one node offline)
367 off_pri_score = fromIntegral offline_ipri::Double
368 cpu_l = map Node.pCpu nodes
369 -- metric: covariance of vcpu/pcpu ratio
370 cpu_cv = stdDev cpu_l
371 -- metrics: covariance of cpu, memory, disk and network load
372 (c_load, m_load, d_load, n_load) =
374 let DynUtil c1 m1 d1 n1 = Node.utilLoad n
375 DynUtil c2 m2 d2 n2 = Node.utilPool n
376 in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
377 -- metric: conflicting instance count
378 pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
379 pri_tags_score = fromIntegral pri_tags_inst::Double
380 -- metric: spindles %
381 spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
382 in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
383 , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
384 , pri_tags_score, stdDev spindles_cv ]
-- | Compute the /total/ variance: the sum of the metrics returned by
-- 'compDetailedCV', each multiplied by the matching entry of
-- 'detailedCVWeights'.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes =
  sum [ weight * metric
      | (weight, metric) <- zip detailedCVWeights (compDetailedCV nodes) ]
-- | Variant of 'compCVNodes' for callers holding a 'Node.List'
-- rather than a plain list of nodes.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
-- | Returns the nodes of a 'Node.List' that are not marked offline.
getOnline :: Node.List -> [Node.Node]
getOnline nl = [ n | n <- Container.elems nl, not (Node.offline n) ]
398 -- * Balancing functions
-- | Picks the better (lower-scoring) of two tables.
--
-- The order of the arguments matters: the second table is chosen
-- only when its score is strictly lower than the first one's, so on
-- equal scores the first table wins.
compareTables :: Table -> Table -> Table
compareTables cur@(Table _ _ cur_score _) cand@(Table _ _ cand_score _)
  | cur_score > cand_score = cand
  | otherwise              = cur
405 -- | Applies an instance move to a given node list and instance.
406 applyMove :: Node.List -> Instance.Instance
407 -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
409 applyMove nl inst Failover =
410 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
411 int_p = Node.removePri old_p inst
412 int_s = Node.removeSec old_s inst
413 new_nl = do -- Maybe monad
414 new_p <- Node.addPriEx (Node.offline old_p) int_s inst
415 new_s <- Node.addSec int_p inst old_sdx
416 let new_inst = Instance.setBoth inst old_sdx old_pdx
417 return (Container.addTwo old_pdx new_s old_sdx new_p nl,
418 new_inst, old_sdx, old_pdx)
421 -- Failover to any (fa)
422 applyMove nl inst (FailoverToAny new_pdx) = do
423 let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
424 new_pnode = Container.find new_pdx nl
425 force_failover = Node.offline old_pnode
426 new_pnode' <- Node.addPriEx force_failover new_pnode inst
427 let old_pnode' = Node.removePri old_pnode inst
428 inst' = Instance.setPri inst new_pdx
429 nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
430 return (nl', inst', new_pdx, old_sdx)
432 -- Replace the primary (f:, r:np, f)
433 applyMove nl inst (ReplacePrimary new_pdx) =
434 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
435 tgt_n = Container.find new_pdx nl
436 int_p = Node.removePri old_p inst
437 int_s = Node.removeSec old_s inst
438 force_p = Node.offline old_p
439 new_nl = do -- Maybe monad
440 -- check that the current secondary can host the instance
441 -- during the migration
442 tmp_s <- Node.addPriEx force_p int_s inst
443 let tmp_s' = Node.removePri tmp_s inst
444 new_p <- Node.addPriEx force_p tgt_n inst
445 new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
446 let new_inst = Instance.setPri inst new_pdx
447 return (Container.add new_pdx new_p $
448 Container.addTwo old_pdx int_p old_sdx new_s nl,
449 new_inst, new_pdx, old_sdx)
452 -- Replace the secondary (r:ns)
453 applyMove nl inst (ReplaceSecondary new_sdx) =
454 let old_pdx = Instance.pNode inst
455 old_sdx = Instance.sNode inst
456 old_s = Container.find old_sdx nl
457 tgt_n = Container.find new_sdx nl
458 int_s = Node.removeSec old_s inst
459 force_s = Node.offline old_s
460 new_inst = Instance.setSec inst new_sdx
461 new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
462 \new_s -> return (Container.addTwo new_sdx
463 new_s old_sdx int_s nl,
464 new_inst, old_pdx, new_sdx)
467 -- Replace the secondary and failover (r:np, f)
468 applyMove nl inst (ReplaceAndFailover new_pdx) =
469 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
470 tgt_n = Container.find new_pdx nl
471 int_p = Node.removePri old_p inst
472 int_s = Node.removeSec old_s inst
473 force_s = Node.offline old_s
474 new_nl = do -- Maybe monad
475 new_p <- Node.addPri tgt_n inst
476 new_s <- Node.addSecEx force_s int_p inst new_pdx
477 let new_inst = Instance.setBoth inst new_pdx old_pdx
478 return (Container.add new_pdx new_p $
479 Container.addTwo old_pdx new_s old_sdx int_s nl,
480 new_inst, new_pdx, old_pdx)
-- Failover and replace the secondary (f, r:ns)
484 applyMove nl inst (FailoverAndReplace new_sdx) =
485 let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
486 tgt_n = Container.find new_sdx nl
487 int_p = Node.removePri old_p inst
488 int_s = Node.removeSec old_s inst
489 force_p = Node.offline old_p
490 new_nl = do -- Maybe monad
491 new_p <- Node.addPriEx force_p int_s inst
492 new_s <- Node.addSecEx force_p tgt_n inst old_sdx
493 let new_inst = Instance.setBoth inst old_sdx new_sdx
494 return (Container.add new_sdx new_s $
495 Container.addTwo old_sdx new_p old_pdx int_p nl,
496 new_inst, old_sdx, new_sdx)
499 -- | Tries to allocate an instance on one given node.
500 allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
501 -> OpResult Node.AllocElement
502 allocateOnSingle nl inst new_pdx =
503 let p = Container.find new_pdx nl
504 new_inst = Instance.setBoth inst new_pdx Node.noSecondary
506 Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
507 new_p <- Node.addPri p inst
508 let new_nl = Container.add new_pdx new_p nl
509 new_score = compCV new_nl
510 return (new_nl, new_inst, [new_p], new_score)
512 -- | Tries to allocate an instance on a given pair of nodes.
513 allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
514 -> OpResult Node.AllocElement
515 allocateOnPair nl inst new_pdx new_sdx =
516 let tgt_p = Container.find new_pdx nl
517 tgt_s = Container.find new_sdx nl
519 Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
520 (Node.exclStorage tgt_p)
521 new_p <- Node.addPri tgt_p inst
522 new_s <- Node.addSec tgt_s inst new_pdx
523 let new_inst = Instance.setBoth inst new_pdx new_sdx
524 new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
525 return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
527 -- | Tries to perform an instance move and returns the best table
528 -- between the original one and the new one.
529 checkSingleStep :: Table -- ^ The original table
530 -> Instance.Instance -- ^ The instance to move
531 -> Table -- ^ The current best table
532 -> IMove -- ^ The move to apply
533 -> Table -- ^ The final best table
534 checkSingleStep ini_tbl target cur_tbl move =
535 let Table ini_nl ini_il _ ini_plc = ini_tbl
536 tmp_resu = applyMove ini_nl target move
539 Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
540 let tgt_idx = Instance.idx target
541 upd_cvar = compCV upd_nl
542 upd_il = Container.add tgt_idx new_inst ini_il
543 upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
544 upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
545 in compareTables cur_tbl upd_tbl
547 -- | Given the status of the current secondary as a valid new node and
548 -- the current candidate target node, generate the possible moves for
550 possibleMoves :: MirrorType -- ^ The mirroring type of the instance
551 -> Bool -- ^ Whether the secondary node is a valid new node
552 -> Bool -- ^ Whether we can change the primary node
553 -> Ndx -- ^ Target node candidate
554 -> [IMove] -- ^ List of valid result moves
556 possibleMoves MirrorNone _ _ _ = []
558 possibleMoves MirrorExternal _ False _ = []
560 possibleMoves MirrorExternal _ True tdx =
561 [ FailoverToAny tdx ]
563 possibleMoves MirrorInternal _ False tdx =
564 [ ReplaceSecondary tdx ]
566 possibleMoves MirrorInternal True True tdx =
567 [ ReplaceSecondary tdx
568 , ReplaceAndFailover tdx
570 , FailoverAndReplace tdx
573 possibleMoves MirrorInternal False True tdx =
574 [ ReplaceSecondary tdx
575 , ReplaceAndFailover tdx
578 -- | Compute the best move for a given instance.
579 checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
580 -> Bool -- ^ Whether disk moves are allowed
581 -> Bool -- ^ Whether instance moves are allowed
582 -> Table -- ^ Original table
583 -> Instance.Instance -- ^ Instance to move
584 -> Table -- ^ Best new table for this instance
585 checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
586 let opdx = Instance.pNode target
587 osdx = Instance.sNode target
588 bad_nodes = [opdx, osdx]
589 nodes = filter (`notElem` bad_nodes) nodes_idx
590 mir_type = Instance.mirrorType target
591 use_secondary = elem osdx nodes_idx && inst_moves
592 aft_failover = if mir_type == MirrorInternal && use_secondary
593 -- if drbd and allowed to failover
594 then checkSingleStep ini_tbl target ini_tbl Failover
598 then concatMap (possibleMoves mir_type use_secondary inst_moves)
602 -- iterate over the possible nodes for this instance
603 foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
605 -- | Compute the best next move.
606 checkMove :: [Ndx] -- ^ Allowed target node indices
607 -> Bool -- ^ Whether disk moves are allowed
608 -> Bool -- ^ Whether instance moves are allowed
609 -> Table -- ^ The current solution
610 -> [Instance.Instance] -- ^ List of instances still to move
611 -> Table -- ^ The new solution
612 checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
613 let Table _ _ _ ini_plc = ini_tbl
614 -- we're using rwhnf from the Control.Parallel.Strategies
615 -- package; we don't need to use rnf as that would force too
616 -- much evaluation in single-threaded cases, and in
617 -- multi-threaded case the weak head normal form is enough to
618 -- spark the evaluation
619 tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
622 -- iterate over all instances, computing the best move
623 best_tbl = foldl' compareTables ini_tbl tables
624 Table _ _ _ best_plc = best_tbl
625 in if length best_plc == length ini_plc
626 then ini_tbl -- no advancement
-- | Check if we are allowed to go deeper in the balancing.
--
-- Another round is allowed when the placement list of the table is
-- still shorter than the round limit (or the limit is negative, which
-- disables it) and the table's score is still above the stop score.
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Limit on the number of placements;
                           -- a negative value means no limit
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ Whether another round is allowed
doNextBalance ini_tbl max_rounds min_score =
  rounds_left && cur_score > min_score
  where Table _ _ cur_score placements = ini_tbl
        rounds_left = max_rounds < 0 || length placements < max_rounds
639 -- | Run a balance move.
640 tryBalance :: Table -- ^ The starting table
641 -> Bool -- ^ Allow disk moves
642 -> Bool -- ^ Allow instance moves
643 -> Bool -- ^ Only evacuate moves
644 -> Score -- ^ Min gain threshold
645 -> Score -- ^ Min gain
646 -> Maybe Table -- ^ The resulting table and commands
647 tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
648 let Table ini_nl ini_il ini_cv _ = ini_tbl
649 all_inst = Container.elems ini_il
650 all_nodes = Container.elems ini_nl
651 (offline_nodes, online_nodes) = partition Node.offline all_nodes
652 all_inst' = if evac_mode
653 then let bad_nodes = map Node.idx offline_nodes
654 in filter (any (`elem` bad_nodes) .
655 Instance.allNodes) all_inst
657 reloc_inst = filter (\i -> Instance.movable i &&
658 Instance.autoBalance i) all_inst'
659 node_idx = map Node.idx online_nodes
660 fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
661 (Table _ _ fin_cv _) = fin_tbl
663 if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
664 then Just fin_tbl -- this round made success, return the new table
667 -- * Allocation functions
669 -- | Build failure stats out of a list of failures.
670 collapseFailures :: [FailMode] -> FailStats
671 collapseFailures flst =
672 map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
-- | Chooses between two optional allocation elements.
--
-- If only one of them is present, that one is returned. If both are
-- present, the first is returned only when its score (the fourth
-- tuple component) is strictly lower; on equal scores the second one
-- is returned.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a b =
  case (a, b) of
    (_, Nothing) -> a
    (Nothing, _) -> b
    (Just (_, _, _, ascore), Just (_, _, _, bscore))
      | ascore < bscore -> a
      | otherwise       -> b
684 -- | Update current Allocation solution and failure stats with new
686 concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
687 concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
689 concatAllocs as (Ok ns) =
690 let -- Choose the old or new solution, based on the cluster score
692 osols = asSolution as
693 nsols = bestAllocElement osols (Just ns)
695 -- Note: we force evaluation of nsols here in order to keep the
696 -- memory profile low - we know that we will need nsols for sure
697 -- in the next cycle, so we force evaluation of nsols, since the
698 -- foldl' in the caller will only evaluate the tuple, but not the
699 -- elements of the tuple
700 in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
702 -- | Sums two 'AllocSolution' structures.
703 sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
704 sumAllocs (AllocSolution aFails aAllocs aSols aLog)
705 (AllocSolution bFails bAllocs bSols bLog) =
706 -- note: we add b first, since usually it will be smaller; when
707 -- fold'ing, a will grow and grow whereas b is the per-group
708 -- result, hence smaller
709 let nFails = bFails ++ aFails
710 nAllocs = aAllocs + bAllocs
711 nSols = bestAllocElement aSols bSols
713 in AllocSolution nFails nAllocs nSols nLog
715 -- | Given a solution, generates a reasonable description for it.
716 describeSolution :: AllocSolution -> String
717 describeSolution as =
718 let fcnt = asFailures as
721 intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
722 filter ((> 0) . snd) . collapseFailures $ fcnt
724 Nothing -> "No valid allocation solutions, failure reasons: " ++
725 (if null fcnt then "unknown reasons" else freasons)
726 Just (_, _, nodes, cv) ->
727 printf ("score: %.8f, successes %d, failures %d (%s)" ++
728 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
729 (intercalate "/" . map Node.name $ nodes)
-- | Prepends the textual description of a solution (as produced by
-- 'describeSolution') to that solution's log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = summary : asLog as }
  where summary = describeSolution as
-- | Reverses every list held in an evacuation solution.
--
-- Rationale: results are always prepended to the lists, so for
-- proper jobset execution all lists have to be reversed.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution moved failed jobs) =
  EvacSolution { esMoved   = reverse moved
               , esFailed  = reverse failed
               , esOpCodes = reverse jobs
               }
743 -- | Generate the valid node allocation singles or pairs for a new instance.
744 genAllocNodes :: Group.List -- ^ Group list
745 -> Node.List -- ^ The node map
746 -> Int -- ^ The number of nodes required
747 -> Bool -- ^ Whether to drop or not
749 -> Result AllocNodes -- ^ The (monadic) result
750 genAllocNodes gl nl count drop_unalloc =
751 let filter_fn = if drop_unalloc
752 then filter (Group.isAllocable .
753 flip Container.find gl . Node.group)
755 all_nodes = filter_fn $ getOnline nl
756 all_pairs = [(Node.idx p,
757 [Node.idx s | s <- all_nodes,
758 Node.idx p /= Node.idx s,
759 Node.group p == Node.group s]) |
762 1 -> Ok (Left (map Node.idx all_nodes))
763 2 -> Ok (Right (filter (not . null . snd) all_pairs))
764 _ -> Bad "Unsupported number of nodes, only one or two supported"
766 -- | Try to allocate an instance on the cluster.
767 tryAlloc :: (Monad m) =>
768 Node.List -- ^ The node list
769 -> Instance.List -- ^ The instance list
770 -> Instance.Instance -- ^ The instance to allocate
771 -> AllocNodes -- ^ The allocation targets
772 -> m AllocSolution -- ^ Possible solution list
773 tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
774 tryAlloc nl _ inst (Right ok_pairs) =
775 let psols = parMap rwhnf (\(p, ss) ->
777 concatAllocs cstate .
778 allocateOnPair nl inst p)
779 emptyAllocSolution ss) ok_pairs
780 sols = foldl' sumAllocs emptyAllocSolution psols
781 in return $ annotateSolution sols
783 tryAlloc _ _ _ (Left []) = fail "No online nodes"
784 tryAlloc nl _ inst (Left all_nodes) =
785 let sols = foldl' (\cstate ->
786 concatAllocs cstate . allocateOnSingle nl inst
787 ) emptyAllocSolution all_nodes
788 in return $ annotateSolution sols
790 -- | Given a group/result, describe it as a nice (list of) messages.
791 solutionDescription :: (Group.Group, Result AllocSolution)
793 solutionDescription (grp, result) =
795 Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
796 Bad message -> [printf "Group %s: error %s" gname message]
797 where gname = Group.name grp
798 pol = allocPolicyToRaw (Group.allocPolicy grp)
800 -- | From a list of possibly bad and possibly empty solutions, filter
801 -- only the groups with a valid result. Note that the result will be
802 -- reversed compared to the original list.
803 filterMGResults :: [(Group.Group, Result AllocSolution)]
804 -> [(Group.Group, AllocSolution)]
805 filterMGResults = foldl' fn []
806 where unallocable = not . Group.isAllocable
807 fn accu (grp, rasol) =
810 Ok sol | isNothing (asSolution sol) -> accu
811 | unallocable grp -> accu
812 | otherwise -> (grp, sol):accu
814 -- | Sort multigroup results based on policy and score.
815 sortMGResults :: [(Group.Group, AllocSolution)]
816 -> [(Group.Group, AllocSolution)]
818 let extractScore (_, _, _, x) = x
819 solScore (grp, sol) = (Group.allocPolicy grp,
820 (extractScore . fromJust . asSolution) sol)
821 in sortBy (comparing solScore) sols
823 -- | Removes node groups which can't accommodate the instance
824 filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
826 -> ([(Group.Group, (Node.List, Instance.List))], [String])
827 filterValidGroups [] _ = ([], [])
828 filterValidGroups (ng:ngs) inst =
829 let (valid_ngs, msgs) = filterValidGroups ngs inst
830 hasNetwork nic = case Nic.network nic of
831 Just net -> net `elem` Group.networks (fst ng)
833 hasRequiredNetworks = all hasNetwork (Instance.nics inst)
834 in if hasRequiredNetworks
835 then (ng:valid_ngs, msgs)
837 ("group " ++ Group.name (fst ng) ++
838 " is not connected to a network required by instance " ++
839 Instance.name inst):msgs)
841 -- | Finds the best group for an instance on a multi-group cluster.
843 -- Only solutions in @preferred@ and @last_resort@ groups will be
844 -- accepted as valid, and additionally if the allowed groups parameter
845 -- is not null then allocation will only be run for those group
--
-- On success the result holds the chosen group, its allocation
-- solution and all messages collected while evaluating the groups.
847 findBestAllocGroup :: Group.List -- ^ The group list
848 -> Node.List -- ^ The node list
849 -> Instance.List -- ^ The instance list
850 -> Maybe [Gdx] -- ^ The allowed groups
851 -> Instance.Instance -- ^ The instance to allocate
852 -> Int -- ^ Required number of nodes
853 -> Result (Group.Group, AllocSolution, [String])
854 findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
-- Split the cluster per group and pair each part with its group object.
855 let groups_by_idx = splitCluster mgnl mgil
856 groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
-- When a list of allowed group indices is given, keep only those groups.
857 groups' = maybe groups
858 (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
-- Drop groups rejected by 'filterValidGroups', keeping the reasons.
860 (groups'', filter_group_msgs) = filterValidGroups groups' inst
-- Attempt an allocation inside every remaining group.
861 sols = map (\(gr, (nl, il)) ->
862 (gr, genAllocNodes mggl nl cnt False >>=
863 tryAlloc nl il inst))
864 groups''::[(Group.Group, Result AllocSolution)]
865 all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
866 goodSols = filterMGResults sols
867 sortedSols = sortMGResults goodSols
-- The head of the sorted good solutions is the one returned.
868 in case sortedSols of
-- No usable solution: report either that no group was selectable, or
-- the messages gathered above.
-- NOTE(review): the first message has no separator between "was" and
-- the shown list of allowed groups.
869 [] -> Bad $ if null groups'
870 then "no groups for evacuation: allowed groups was" ++
871 show allowed_gdxs ++ ", all groups: " ++
872 show (map fst groups)
873 else intercalate ", " all_msgs
874 (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
-- | Allocate a single instance on a multi-group cluster.
--
-- The group is chosen by 'findBestAllocGroup' (with no restriction on
-- the allowed groups); the returned solution has its log replaced by
-- a \"Selected group\" line followed by all messages produced while
-- the groups were evaluated.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt =
  findBestAllocGroup mggl mgnl mgil Nothing inst cnt >>= withSelection
  where
    -- Prefix the collected messages with the name of the winning group.
    withSelection (grp, sol, msgs) =
      return sol { asLog = ("Selected group: " ++ Group.name grp) : msgs }
-- | Compute the instance list resulting from an allocation attempt.
--
-- Without a solution the list is returned unchanged; otherwise the
-- allocated instance is added under the index given by the current
-- size of the list.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il = maybe il appendInstance
  where
    appendInstance (_, xi, _, _) = Container.add (Container.size il) xi il
-- | Extract the new node list from an allocation attempt.
--
-- Falls back to the original node list when there is no solution.
extractNl :: Node.List               -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List               -- ^ The new node list
extractNl nl = maybe nl solutionNodes
  where
    solutionNodes (xnl, _, _, _) = xnl
-- | Allocate a list of instances, one after another, on a
-- multi-group cluster.
--
-- Each instance is placed via 'tryMGAlloc' against the node and
-- instance lists left behind by the previous placements. Every
-- (instance, solution) pair is prepended to the accumulator, so the
-- final list is in reverse order of processing. The first failing
-- 'tryMGAlloc' call aborts the whole computation.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ Instances to allocate, each
                                         -- with its required node count
          -> AllocSolutionList           -- ^ Solutions accumulated so far
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList gl nl il pending result =
  case pending of
    [] -> Ok (nl, il, result)
    (xi, xicnt):rest ->
      tryMGAlloc gl nl il xi xicnt >>= \ares ->
        let placed = asSolution ares
        in allocList gl (extractNl nl placed) (updateIl il placed) rest
             ((xi, ares):result)
-- | Fail when the requested evacuation mode is change-secondary.
--
-- Only DRBD can execute a change of secondary, so the handlers for
-- all other disk templates call this first; once it has succeeded,
-- the remaining mode amounts to a primary change. The disk template
-- is only used to build the error message.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
  case mode of
    ChangeSecondary ->
      fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
             "' can't execute change secondary"
    _ -> return ()
932 -- | Run evacuation for a single instance.
934 -- /Note:/ this function should correctly execute both intra-group
935 -- evacuations (in all modes) and inter-group evacuations (in the
936 -- 'ChangeAll' mode). Of course, this requires that the correct list
937 -- of target nodes is passed.
--
-- The behaviour is selected by the disk template of the instance and,
-- for DRBD, additionally by the evacuation mode.
938 nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide)
939 -> Instance.List -- ^ Instance list (cluster-wide)
940 -> EvacMode -- ^ The evacuation mode
941 -> Instance.Instance -- ^ The instance to be evacuated
942 -> Gdx -- ^ The group we're targeting
943 -> [Ndx] -- ^ The list of available nodes
945 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless: reject a change-secondary request, then change one node.
946 nodeEvacInstance nl il mode inst@(Instance.Instance
947 {Instance.diskTemplate = dt@DTDiskless})
949 failOnSecondaryChange mode dt >>
950 evacOneNodeOnly nl il inst gdx avail_nodes
-- Plain and file based instances always fail.
952 nodeEvacInstance _ _ _ (Instance.Instance
953 {Instance.diskTemplate = DTPlain}) _ _ =
954 fail "Instances of type plain cannot be relocated"
956 nodeEvacInstance _ _ _ (Instance.Instance
957 {Instance.diskTemplate = DTFile}) _ _ =
958 fail "Instances of type file cannot be relocated"
-- DTSharedFile, DTBlock, DTRbd and DTExt follow the same two steps as
-- the diskless case.
960 nodeEvacInstance nl il mode inst@(Instance.Instance
961 {Instance.diskTemplate = dt@DTSharedFile})
963 failOnSecondaryChange mode dt >>
964 evacOneNodeOnly nl il inst gdx avail_nodes
966 nodeEvacInstance nl il mode inst@(Instance.Instance
967 {Instance.diskTemplate = dt@DTBlock})
969 failOnSecondaryChange mode dt >>
970 evacOneNodeOnly nl il inst gdx avail_nodes
972 nodeEvacInstance nl il mode inst@(Instance.Instance
973 {Instance.diskTemplate = dt@DTRbd})
975 failOnSecondaryChange mode dt >>
976 evacOneNodeOnly nl il inst gdx avail_nodes
978 nodeEvacInstance nl il mode inst@(Instance.Instance
979 {Instance.diskTemplate = dt@DTExt})
981 failOnSecondaryChange mode dt >>
982 evacOneNodeOnly nl il inst gdx avail_nodes
-- DRBD, change primary: apply a single failover, store the updated
-- instance under its index and emit the opcodes for that failover.
984 nodeEvacInstance nl il ChangePrimary
985 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
988 (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
989 let idx = Instance.idx inst
990 il' = Container.add idx inst' il
991 ops = iMoveToJob nl' il' idx Failover
992 return (nl', il', ops)
-- DRBD, change secondary: delegated to 'evacOneNodeOnly'.
994 nodeEvacInstance nl il ChangeSecondary
995 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
997 evacOneNodeOnly nl il inst gdx avail_nodes
999 -- The algorithm for ChangeAll is as follows:
1001 -- * generate all (primary, secondary) node pairs for the target groups
1002 -- * for each pair, execute the needed moves (r:s, f, r:s) and compute
1003 -- the final node list state and group score
1004 -- * select the best choice via a foldl that uses the same Either
1005 -- String solution as the ChangeSecondary mode
1006 nodeEvacInstance nl il ChangeAll
1007 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
1010 let no_nodes = Left "no nodes available"
-- Candidates are all ordered pairs of distinct available nodes.
1011 node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
1012 (nl', il', ops, _) <-
1013 annotateResult "Can't find any good nodes for relocation" .
-- Each pair is evaluated with 'evacDrbdAllInner' against the original
-- node and instance lists.
1016 (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
1020 -- we don't need more details (which
1021 -- nodes, etc.) as we only selected
1022 -- this group if we can allocate on
1023 -- it, hence failures will not
1024 -- propagate out of this fold loop
1025 Left _ -> Left $ "Allocation failed: " ++ msg
1026 Ok result@(_, _, _, new_cv) ->
1027 let new_accu = Right result in
1030 Right (_, _, _, old_cv) ->
1034 ) no_nodes node_pairs
1036 return (nl', il', ops)
1038 -- | Generic function for changing one node of an instance.
1040 -- This is similar to 'nodeEvacInstance' but will be used in a few of
1041 -- its sub-patterns. It folds the inner function 'evacOneNodeInner'
1042 -- over the list of available nodes, which results in the best choice
1044 evacOneNodeOnly :: Node.List -- ^ The node list (cluster-wide)
1045 -> Instance.List -- ^ Instance list (cluster-wide)
1046 -> Instance.Instance -- ^ The instance to be evacuated
1047 -> Gdx -- ^ The group we're targeting
1048 -> [Ndx] -- ^ The list of available nodes
1050 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1051 evacOneNodeOnly nl il inst gdx avail_nodes = do
-- The move constructor depends on the mirror type of the instance:
-- internally mirrored instances get a new secondary, externally
-- mirrored ones are failed over to the target; unmirrored instances
-- are rejected.
1052 op_fn <- case Instance.mirrorType inst of
1053 MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
1054 MirrorInternal -> Ok ReplaceSecondary
1055 MirrorExternal -> Ok FailoverToAny
-- The fold starts without a solution and tries every available node.
1056 (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1058 foldl' (evacOneNodeInner nl inst gdx op_fn)
1059 (Left "no nodes available") avail_nodes
-- Store the moved instance under its own index and describe the
-- selected move as opcodes.
1060 let idx = Instance.idx inst
1061 il' = Container.add idx inst' il
1062 ops = iMoveToJob nl' il' idx (op_fn ndx)
1063 return (nl', il', ops)
1065 -- | Inner fold function for changing one node of an instance.
1067 -- Depending on the instance disk template, this will either change
1068 -- the secondary (for DRBD) or the primary node (for shared
1069 -- storage). However, the operation is generic otherwise.
1071 -- The running solution is either a @Left String@, which means we
1072 -- don't have yet a working solution, or a @Right (...)@, which
1073 -- represents a valid solution; it holds the modified node list, the
1074 -- modified instance (after evacuation), the score of that solution,
1075 -- and the new secondary node index.
1076 evacOneNodeInner :: Node.List -- ^ Cluster node list
1077 -> Instance.Instance -- ^ Instance being evacuated
1078 -> Gdx -- ^ The group index of the instance
1079 -> (Ndx -> IMove) -- ^ Operation constructor
1080 -> EvacInnerState -- ^ Current best solution
1081 -> Ndx -- ^ Node we're evaluating as target
1082 -> EvacInnerState -- ^ New best solution
1083 evacOneNodeInner nl inst gdx op_fn accu ndx =
1084 case applyMove nl inst (op_fn ndx) of
-- A failed move only replaces the accumulator while it still holds an
-- error; an existing solution is kept unchanged.
1085 Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
1086 " failed: " ++ show fm
1087 in either (const $ Left fail_msg) (const accu) accu
1088 Ok (nl', inst', _, _) ->
1089 let nodes = Container.elems nl'
1090 -- The fromJust below is ugly (it can fail nastily), but
1091 -- at this point we should not have any internal mismatches,
1092 -- and adding a monad here would be quite involved
-- The score is computed over the nodes of group @gdx@ only.
1093 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1094 new_cv = compCVNodes grpnodes
1095 new_accu = Right (nl', inst', new_cv, ndx)
1098 Right (_, _, old_cv, _) ->
1103 -- | Compute result of changing all nodes of a DRBD instance.
1105 -- Given the target primary and secondary node (which might be in a
1106 -- different group or not), this function will 'execute' all the
1107 -- required steps and assuming all operations succeed, will return
1108 -- the modified node and instance lists, the opcodes needed for this
1109 -- and the new group score.
1110 evacDrbdAllInner :: Node.List -- ^ Cluster node list
1111 -> Instance.List -- ^ Cluster instance list
1112 -> Instance.Instance -- ^ The instance to be moved
1113 -> Gdx -- ^ The target group index
1114 -- (which can differ from the
1115 -- current group of the
1117 -> (Ndx, Ndx) -- ^ Tuple of new
1118 -- primary\/secondary nodes
1119 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
1120 evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
1121 let primary = Container.find (Instance.pNode inst) nl
1122 idx = Instance.idx inst
1123 -- if the primary is offline, then we first failover
1124 (nl1, inst1, ops1) <-
1125 if Node.offline primary
1127 (nl', inst', _, _) <-
1128 annotateResult "Failing over to the secondary" .
1129 opToResult $ applyMove nl inst Failover
1130 return (nl', inst', [Failover])
1131 else return (nl, inst, [])
-- The three moves applied in sequence below; each step works on the
-- node list and instance produced by the previous one.
1132 let (o1, o2, o3) = (ReplaceSecondary t_pdx,
1134 ReplaceSecondary t_sdx)
1135 -- we now need to execute a replace secondary to the future
1137 (nl2, inst2, _, _) <-
1138 annotateResult "Changing secondary to new primary" .
1140 applyMove nl1 inst1 o1
1142 -- we now execute another failover, the primary stays fixed now
1143 (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
1144 opToResult $ applyMove nl2 inst2 o2
1146 -- and finally another replace secondary, to the final secondary
1147 (nl4, inst4, _, _) <-
1148 annotateResult "Changing secondary to final secondary" .
1150 applyMove nl3 inst3 o3
-- The final instance replaces the original one under the same index,
-- and every recorded move is turned into opcodes.
1152 il' = Container.add idx inst4 il
1153 ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
1154 let nodes = Container.elems nl4
1155 -- The fromJust below is ugly (it can fail nastily), but
1156 -- at this point we should not have any internal mismatches,
1157 -- and adding a monad here would be quite involved
-- The score is computed over the nodes of the target group only.
1158 grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
1159 new_cv = compCVNodes grpnodes
1160 return (nl4, il', ops, new_cv)
1162 -- | Computes the nodes in a given group which are available for
1164 availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
1165 -> IntSet.IntSet -- ^ Nodes that are excluded
1166 -> Gdx -- ^ The group for which we
1168 -> Result [Ndx] -- ^ List of available node indices
1169 availableGroupNodes group_nodes excl_ndx gdx = do
-- A group index missing from the association list is reported as 'Bad'.
1170 local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
1171 Ok (lookup gdx group_nodes)
-- Keep only the group's nodes that are not members of the excluded set.
1172 let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
1175 -- | Updates the evac solution with the results of an instance
1177 updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
1179 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
1180 -> (Node.List, Instance.List, EvacSolution)
-- Failure: the current node and instance lists are kept and the
-- (index, message) pair is recorded among the failed entries.
1181 updateEvacSolution (nl, il, es) idx (Bad msg) =
1182 (nl, il, es { esFailed = (idx, msg):esFailed es})
-- Success: the lists returned by the evacuation replace the current
-- ones, and the move and its opcodes are prepended to the solution.
1183 updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
1184 (nl, il, es { esMoved = new_elem:esMoved es
1185 , esOpCodes = opcodes:esOpCodes es })
-- The instance is looked up in the updated instance list.
1186 where inst = Container.find idx il
1188 instancePriGroup nl inst,
1189 Instance.allNodes inst)
1191 -- | Node-evacuation IAllocator mode main function.
--
-- The instances are processed in the given order; each one sees the
-- node and instance lists as left by the previous ones, and failures
-- are recorded in the solution instead of aborting the run.
1192 tryNodeEvac :: Group.List -- ^ The cluster groups
1193 -> Node.List -- ^ The node list (cluster-wide, not per group)
1194 -> Instance.List -- ^ Instance list (cluster-wide)
1195 -> EvacMode -- ^ The evacuation mode
1196 -> [Idx] -- ^ List of instance (indices) to be evacuated
1197 -> Result (Node.List, Instance.List, EvacSolution)
-- The group list argument is not used.
1198 tryNodeEvac _ ini_nl ini_il mode idxs =
1199 let evac_ndx = nodesToEvacuate ini_il mode idxs
-- Offline nodes are excluded in addition to the nodes being evacuated.
1200 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1201 excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
-- Node indices per group, derived from 'splitCluster'.
1202 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1203 (Container.elems nl))) $
1204 splitCluster ini_nl ini_il
1205 (fin_nl, fin_il, esol) =
1206 foldl' (\state@(nl, il, _) inst ->
-- The target group is the group of the instance's primary node.
1207 let gdx = instancePriGroup nl inst
1208 pdx = Instance.pNode inst in
1209 updateEvacSolution state (Instance.idx inst) $
-- The instance's own primary node is excluded as a target as well.
1210 availableGroupNodes group_ndx
1211 (IntSet.insert pdx excl_ndx) gdx >>=
1212 nodeEvacInstance nl il mode inst gdx
1214 (ini_nl, ini_il, emptyEvacSolution)
1215 (map (`Container.find` ini_il) idxs)
1216 in return (fin_nl, fin_il, reverseEvacSolution esol)
1218 -- | Change-group IAllocator mode main function.
1220 -- This is very similar to 'tryNodeEvac', the only difference is that
1221 -- we don't choose as target group the current instance group, but
1224 -- 1. at the start of the function, we compute which are the target
1225 -- groups; either no groups were passed in, in which case we choose
1226 -- all groups out of which we don't evacuate instance, or there were
1227 -- some groups passed, in which case we use those
1229 -- 2. for each instance, we use 'findBestAllocGroup' to choose the
1230 -- best group to hold the instance, and then we do what
1231 -- 'tryNodeEvac' does, except for this group instead of the current
1234 -- Note that the correct behaviour of this function relies on the
1235 -- function 'nodeEvacInstance' to be able to do correctly both
1236 -- intra-group and inter-group moves when passed the 'ChangeAll' mode.
1237 tryChangeGroup :: Group.List -- ^ The cluster groups
1238 -> Node.List -- ^ The node list (cluster-wide)
1239 -> Instance.List -- ^ Instance list (cluster-wide)
1240 -> [Gdx] -- ^ Target groups; if empty, any
1241 -- groups not being evacuated
1242 -> [Idx] -- ^ List of instance (indices) to be evacuated
1243 -> Result (Node.List, Instance.List, EvacSolution)
1244 tryChangeGroup gl ini_nl ini_il gdxs idxs =
-- Groups (by primary node) of the instances that are to be moved.
1245 let evac_gdxs = nub $ map (instancePriGroup ini_nl .
1246 flip Container.find ini_il) idxs
-- Candidate targets: the requested groups, or all groups when none
-- were requested, minus the groups being evacuated.
1247 target_gdxs = (if null gdxs
1248 then Container.keys gl
1249 else gdxs) \\ evac_gdxs
-- Only offline nodes are excluded here.
1250 offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
1251 excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
-- Node indices per group, derived from 'splitCluster'.
1252 group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
1253 (Container.elems nl))) $
1254 splitCluster ini_nl ini_il
1255 (fin_nl, fin_il, esol) =
1256 foldl' (\state@(nl, il, _) inst ->
-- The node count needed for the allocation comes from the disk
-- template of the instance.
1258 let ncnt = Instance.requiredNodes $
1259 Instance.diskTemplate inst
1260 (grp, _, _) <- findBestAllocGroup gl nl il
1261 (Just target_gdxs) inst ncnt
1262 let gdx = Group.idx grp
1263 av_nodes <- availableGroupNodes group_ndx
1265 nodeEvacInstance nl il ChangeAll inst gdx av_nodes
1266 in updateEvacSolution state (Instance.idx inst) solution
1268 (ini_nl, ini_il, emptyEvacSolution)
1269 (map (`Container.find` ini_il) idxs)
1270 in return (fin_nl, fin_il, reverseEvacSolution esol)
1272 -- | Standard-sized allocation method.
1274 -- This places instances of the same size on the cluster until we're
1275 -- out of space. The result will be a list of identically-sized
1277 iterateAlloc :: AllocMethod
1278 iterateAlloc nl il limit newinst allocnodes ixes cstats =
-- The candidate instance is named after the number of instances
-- placed so far and gets the current size of the instance list as
-- its index.
1279 let depth = length ixes
1280 newname = printf "new-%d" depth::String
1281 newidx = Container.size il
1282 newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
-- The limit, when present, is decremented by one for the next round.
1283 newlimit = fmap (flip (-) 1) limit
1284 in case tryAlloc nl il newi2 allocnodes of
1286 Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
-- The result describing the state reached so far, without the
-- candidate instance.
1287 let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
1290 Just (xnl, xi, _, _) ->
-- Recurse with the placed instance added to the instance list and to
-- the placed instances, and with the resources of the new node list
-- prepended to the statistics.
1293 else iterateAlloc xnl (Container.add newidx xi il)
1294 newlimit newinst allocnodes (xi:ixes)
1295 (totalResources xnl:cstats)
1297 -- | Predicate whether shrinking a single resource can lead to a valid
1299 sufficesShrinking :: (Instance.Instance -> AllocSolution) -> Instance.Instance
1300 -> FailMode -> Maybe Instance.Instance
1301 sufficesShrinking allocFn inst fm =
-- Read bottom-up: the instance is shrunk repeatedly for failure mode
-- @fm@ and each candidate is paired with its allocation result. The
-- candidates are followed only while the result still lists @fm@ among
-- its failures or carries a solution; within that prefix, the leading
-- candidates without a solution are skipped.
1302 case dropWhile (isNothing . asSolution . fst)
1303 . takeWhile (liftA2 (||) (elem fm . asFailures . fst)
1304 (isJust . asSolution . fst))
1305 . map (allocFn &&& id) $
1306 iterateOk (`Instance.shrinkByType` fm) inst
-- The first remaining candidate is the shrunk instance that allocates.
1307 of x:_ -> Just . snd $ x
1310 -- | Tiered allocation method.
1312 -- This places instances on the cluster, and decreases the spec until
1313 -- we can allocate again. The result will be a list of decreasing
1315 tieredAlloc :: AllocMethod
1316 tieredAlloc nl il limit newinst allocnodes ixes cstats =
-- First place as many instances of the current spec as possible.
1317 case iterateAlloc nl il limit newinst allocnodes ixes cstats of
1319 Ok (errs, nl', il', ixes', cstats') ->
1320 let newsol = Ok (errs, nl', il', ixes', cstats')
1321 ixes_cnt = length ixes'
-- With a limit, stop once the number of placed instances reaches it;
-- the limit passed on is reduced by that number.
1322 (stop, newlimit) = case limit of
1323 Nothing -> (False, Nothing)
1324 Just n -> (n <= ixes_cnt,
1325 Just (n - ixes_cnt))
-- Failure entries ordered by their second component, keeping only
-- the first component.
1326 sortedErrs = map fst $ sortBy (comparing snd) errs
-- A failed allocation attempt counts as an empty solution here.
1327 suffShrink = sufficesShrinking (fromMaybe emptyAllocSolution
1328 . flip (tryAlloc nl' il') allocnodes)
-- Failure modes are tried starting from the one that sorts last.
1330 bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
1331 in if stop then newsol else
1333 Just newinst':_ -> tieredAlloc nl' il' newlimit
1334 newinst' allocnodes ixes' cstats'
-- Otherwise shrink once by the failure mode that sorts last.
1335 _ -> case Instance.shrinkByType newinst . last $ sortedErrs of
1337 Ok newinst' -> tieredAlloc nl' il' newlimit
1338 newinst' allocnodes ixes' cstats'
1340 -- * Formatting functions
1342 -- | Given the original and final nodes, computes the relocation description.
1343 computeMoves :: Instance.Instance -- ^ The instance to be moved
1344 -> String -- ^ The instance name
1345 -> IMove -- ^ The move being performed
1346 -> String -- ^ New primary
1347 -> String -- ^ New secondary
1348 -> (String, [String])
1349 -- ^ Tuple of moves and commands list; moves is containing
1350 -- either @/f/@ for failover or @/r:name/@ for replace
1351 -- secondary, while the command list holds gnt-instance
1352 -- commands (without that prefix), e.g \"@failover instance1@\"
1353 computeMoves i inam mv c d =
-- The node index carried by the move is ignored; the names @c@ (new
-- primary) and @d@ (new secondary) are used in the output instead.
1355 Failover -> ("f", [mig])
1356 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1357 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1358 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1359 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1360 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
-- Running instances are migrated, all others are failed over.
1361 where morf = if Instance.isRunning i then "migrate" else "failover"
1362 mig = printf "%s -f %s" morf inam::String
1363 mig_any = printf "%s -f -n %s %s" morf c inam::String
1364 rep n = printf "replace-disks -n %s %s" n inam::String
1366 -- | Converts a placement to string format.
1367 printSolutionLine :: Node.List -- ^ The node list
1368 -> Instance.List -- ^ The instance list
1369 -> Int -- ^ Maximum node name length
1370 -> Int -- ^ Maximum instance name length
1371 -> Placement -- ^ The current placement
1372 -> Int -- ^ The index of the placement in
1374 -> (String, [String])
1375 printSolutionLine nl il nmlen imlen plc pos =
-- Column width for a \"primary:secondary\" pair of node names.
1376 let pmlen = (2*nmlen + 1)
1377 (i, p, s, mv, c) = plc
1378 old_sec = Instance.sNode inst
1379 inst = Container.find i il
1380 inam = Instance.alias inst
-- New (n*) and old (o*) node aliases; the old ones come from the
-- instance as found in the given instance list.
1381 npri = Node.alias $ Container.find p nl
1382 nsec = Node.alias $ Container.find s nl
1383 opri = Node.alias $ Container.find (Instance.pNode inst) nl
1384 osec = Node.alias $ Container.find old_sec nl
1385 (moves, cmds) = computeMoves inst inam mv npri nsec
1386 -- FIXME: this should check instead/also the disk template
-- Without a secondary only the primary is shown, otherwise both.
1387 ostr = if old_sec == Node.noSecondary
1388 then printf "%s" opri::String
1389 else printf "%s:%s" opri osec::String
1390 nstr = if s == Node.noSecondary
1391 then printf "%s" npri::String
1392 else printf "%s:%s" npri nsec::String
1393 in (printf " %3d. %-*s %-*s => %-*s %12.8f a=%s"
1394 pos imlen inam pmlen ostr pmlen nstr c moves,
-- | Compute the node indices touched by an instance move.
--
-- The result holds the new primary and secondary from the placement
-- followed by all nodes of the instance as recorded in the instance
-- list, with duplicates removed. Its length is therefore not fixed.
involvedNodes :: Instance.List -- ^ Instance list used to look up the
                               -- instance; this /must/ be the original
                               -- list so that the old nodes are found
              -> Placement     -- ^ The placement being examined, holding
                               -- the instance index and the new nodes
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il plc = nub (np : ns : oldNodes)
  where
    (i, np, ns, _, _) = plc
    oldNodes = Instance.allNodes (Container.find i il)
-- | Fold step for 'splitJobs'.
--
-- The accumulator holds the jobsets built so far (most recent first)
-- and the node indices used by the most recent jobset. A job sharing
-- no node with that buffer joins the most recent jobset; otherwise it
-- opens a new jobset and the buffer is reset to the job's own nodes.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs (jobsets, busy) job@(touched, _, _, _) =
  case jobsets of
    [] -> ([[job]], touched)
    current:older
      | null (touched `intersect` busy) ->
          ((job:current):older, touched ++ busy)
      | otherwise -> ([job]:jobsets, touched)
-- | Break a list of moves into independent groups.
--
-- Jobs are folded left to right with 'mergeJobs'. Since that function
-- prepends both jobs and jobsets, the result is in reverse order of
-- the input.
--
-- The strict left fold avoids building a chain of suspended
-- 'mergeJobs' applications over long move lists; the result is the
-- same as with the lazy fold.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl' mergeJobs ([], [])
1428 -- | Given a list of commands, prefix them with @gnt-instance@ and
1429 -- also beautify the display a little.
1430 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
-- Only the command list of the move job is used.
1431 formatJob jsn jsl (sn, (_, _, _, cmds)) =
-- One echo line announcing the job as \"jsn/sn\".
1433 printf " echo job %d/%d" jsn sn:
-- Every command is emitted as a @gnt-instance@ invocation.
1435 map (" gnt-instance " ++) cmds
-- Jobset header: a blank line and an echo with the jobset number
-- @jsn@ and its job count @jsl@.
1437 then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
1440 -- | Given a list of commands, prefix them with @gnt-instance@ and
1441 -- also beautify the display a little.
1442 formatCmds :: [JobSet] -> String
-- Each job of a numbered jobset is rendered by 'formatJob', which
-- also receives the number of jobs in that jobset.
1445 concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
1449 -- | Print the node list.
1450 printNodes :: Node.List -> [String] -> String
-- Field selection: an empty list means the default fields, a leading
-- \"+\" appends the remaining names to the default fields.
1452 let fields = case fs of
1453 [] -> Node.defaultFields
1454 "+":rest -> Node.defaultFields ++ rest
-- Rows are ordered by node index.
1456 snl = sortBy (comparing Node.idx) (Container.elems nl)
1457 (header, isnum) = unzip $ map Node.showHeader fields
1458 in printTable "" header (map (Node.list fields) snl) isnum
1460 -- | Print the instance list.
1461 printInsts :: Node.List -> Instance.List -> String
-- Rows are ordered by instance index.
1463 let sil = sortBy (comparing Instance.idx) (Container.elems il)
-- One row per instance; the first cell is \"R\" for running instances.
1464 helper inst = [ if Instance.isRunning inst then "R" else " "
1465 , Instance.name inst
1466 , Container.nameOf nl (Instance.pNode inst)
1467 , let sdx = Instance.sNode inst
1468 in if sdx == Node.noSecondary
1470 else Container.nameOf nl sdx
1471 , if Instance.autoBalance inst then "Y" else "N"
1472 , printf "%3d" $ Instance.vcpus inst
1473 , printf "%5d" $ Instance.mem inst
-- The disk value is divided by 1024 before printing.
1474 , printf "%5d" $ Instance.dsk inst `div` 1024
1480 where DynUtil lC lM lD lN = Instance.util inst
1481 header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
1482 , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
-- The first five columns are textual, all following ones numeric.
1483 isnum = False:False:False:False:False:repeat True
1484 in printTable "" header (map helper sil) isnum
1486 -- | Shows statistics for a given node list.
1487 printStats :: String -> Node.List -> String
1489 let dcvs = compDetailedCV $ Container.elems nl
1490 (weights, names) = unzip detailedCVInfo
-- Weights and names are padded (with 1 and \"unknown\") so that every
-- detailed value gets a row even when 'detailedCVInfo' is shorter.
1491 hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
1492 header = [ "Field", "Value", "Weight" ]
1493 formatted = map (\(w, h, val) ->
-- The first column is textual, the remaining ones numeric.
1498 in printTable lp header formatted $ False:repeat True
1500 -- | Convert a placement into a list of OpCodes (basically a job).
1501 iMoveToJob :: Node.List -- ^ The node list; only used for node
1502 -- names, so any version is good
1503 -- (before or after the operation)
1504 -> Instance.List -- ^ The instance list; also used for
1506 -> Idx -- ^ The index of the instance being
1508 -> IMove -- ^ The actual move to be described
1509 -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
1511 iMoveToJob nl il idx move =
1512 let inst = Container.find idx il
1513 iname = Instance.name inst
-- Node names are converted with 'mkNonEmpty'; a failed conversion
-- aborts with 'error'.
1514 lookNode n = case mkNonEmpty (Container.nameOf nl n) of
1515 -- FIXME: convert htools codebase to non-empty strings
1516 Bad msg -> error $ "Empty node name for idx " ++
1517 show n ++ ": " ++ msg ++ "??"
-- opF: migrate opcode without an explicit target node and with
-- failover allowed.
1519 opF = OpCodes.OpInstanceMigrate
1520 { OpCodes.opInstanceName = iname
1521 , OpCodes.opInstanceUuid = Nothing
1522 , OpCodes.opMigrationMode = Nothing -- default
1523 , OpCodes.opOldLiveMode = Nothing -- default as well
1524 , OpCodes.opTargetNode = Nothing -- this is drbd
1525 , OpCodes.opTargetNodeUuid = Nothing
1526 , OpCodes.opAllowRuntimeChanges = False
1527 , OpCodes.opIgnoreIpolicy = False
1528 , OpCodes.opMigrationCleanup = False
1529 , OpCodes.opIallocator = Nothing
1530 , OpCodes.opAllowFailover = True }
-- opFA: the same opcode with the target node set.
1531 opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
-- opR: replace-disks opcode in new-secondary mode towards node @n@.
1532 opR n = OpCodes.OpInstanceReplaceDisks
1533 { OpCodes.opInstanceName = iname
1534 , OpCodes.opInstanceUuid = Nothing
1535 , OpCodes.opEarlyRelease = False
1536 , OpCodes.opIgnoreIpolicy = False
1537 , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
1538 , OpCodes.opReplaceDisksList = []
1539 , OpCodes.opRemoteNode = lookNode n
1540 , OpCodes.opRemoteNodeUuid = Nothing
1541 , OpCodes.opIallocator = Nothing
-- Mapping from the move to its opcode sequence.
1545 FailoverToAny np -> [ opFA np ]
1546 ReplacePrimary np -> [ opF, opR np, opF ]
1547 ReplaceSecondary ns -> [ opR ns ]
1548 ReplaceAndFailover np -> [ opR np, opF ]
1549 FailoverAndReplace ns -> [ opF, opR ns ]
1551 -- * Node group functions
1553 -- | Computes the group of an instance.
--
-- The groups of the primary and of the secondary node are compared;
-- the computation fails when they differ.
1554 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
1555 instanceGroup nl i =
1556 let sidx = Instance.sNode i
1557 pnode = Container.find (Instance.pNode i) nl
-- The secondary node is only looked up when the instance has one.
1558 snode = if sidx == Node.noSecondary
1560 else Container.find sidx nl
1561 pgroup = Node.group pnode
1562 sgroup = Node.group snode
1563 in if pgroup /= sgroup
1564 then fail ("Instance placed accross two node groups, primary " ++
1565 show pgroup ++ ", secondary " ++ show sgroup)
1568 -- | Computes the group of an instance per the primary node.
1569 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1570 instancePriGroup nl i =
-- Look up the primary node of the instance in the node list.
1571 let pnode = Container.find (Instance.pNode i) nl
-- | List the instances for which 'instanceGroup' does not succeed,
-- i.e. the badly allocated instances split across node groups.
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  [ inst | inst <- Container.elems il
         , not (isOk (instanceGroup nl inst)) ]
-- | Splits a cluster into the component node groups.
--
-- For every group reported by 'Node.computeGroups' the result holds
-- the group index, a node list built from that group's nodes (keyed
-- by node index) and the instances whose primary node belongs to the
-- group.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  map perGroup (Node.computeGroups (Container.elems nl))
  where
    perGroup (gdx, nodes) =
      let nidxs = map Node.idx nodes
          -- Membership is tested once per instance, so use a set
          -- instead of scanning the index list each time.
          nidxSet = IntSet.fromList nidxs
          instances =
            Container.filter ((`IntSet.member` nidxSet) . Instance.pNode) il
      in (gdx, (Container.fromList (zip nidxs nodes), instances))
1591 -- | Compute the list of nodes that are to be evacuated, given a list
1592 -- of instances and an evacuation mode.
1593 nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
1594 -> EvacMode -- ^ The evacuation mode we're using
1595 -> [Idx] -- ^ List of instance indices being evacuated
1596 -> IntSet.IntSet -- ^ Set of node indices
1597 nodesToEvacuate il mode =
1598 IntSet.delete Node.noSecondary .
1600 let i = Container.find idx il
1601 pdx = Instance.pNode i
1602 sdx = Instance.sNode i
1603 dt = Instance.diskTemplate i
1604 withSecondary = case dt of
1605 DTDrbd8 -> IntSet.insert sdx ns
1608 ChangePrimary -> IntSet.insert pdx ns
1609 ChangeSecondary -> withSecondary
1610 ChangeAll -> IntSet.insert pdx withSecondary