X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/2a8e2dc95937c411aa93aa32f50538d8fafd18f1..d52d41de85348cbf9c18156e122718471d4bbf1f:/htools/Ganeti/HTools/Cluster.hs diff --git a/htools/Ganeti/HTools/Cluster.hs b/htools/Ganeti/HTools/Cluster.hs index 4588e77..ecd6c91 100644 --- a/htools/Ganeti/HTools/Cluster.hs +++ b/htools/Ganeti/HTools/Cluster.hs @@ -1,7 +1,7 @@ {-| Implementation of cluster-wide logic. This module holds all pure cluster-logic; I\/O related functionality -goes into the "Main" module for the individual binaries. +goes into the /Main/ module for the individual binaries. -} @@ -30,6 +30,7 @@ module Ganeti.HTools.Cluster ( -- * Types AllocSolution(..) + , EvacSolution(..) , Table(..) , CStats(..) , AllocStats @@ -51,6 +52,7 @@ module Ganeti.HTools.Cluster , doNextBalance , tryBalance , compCV + , compCVNodes , compDetailedCV , printStats , iMoveToJob @@ -62,6 +64,7 @@ module Ganeti.HTools.Cluster , tryMGReloc , tryEvac , tryMGEvac + , tryNodeEvac , collapseFailures -- * Allocation functions , iterateAlloc @@ -74,7 +77,9 @@ module Ganeti.HTools.Cluster ) where import Data.Function (on) +import qualified Data.IntSet as IntSet import Data.List +import Data.Maybe (fromJust) import Data.Ord (comparing) import Text.Printf (printf) import Control.Monad @@ -100,51 +105,68 @@ data AllocSolution = AllocSolution , asLog :: [String] -- ^ A list of informational messages } +-- | Node evacuation/group change iallocator result type. This result +-- type consists of actual opcodes (a restricted subset) that are +-- transmitted back to Ganeti. +data EvacSolution = EvacSolution + { esMoved :: [String] -- ^ Instance moved successfully + , esFailed :: [String] -- ^ Instance which were not + -- relocated + , esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs + } + -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'. type AllocResult = (FailStats, Node.List, Instance.List, [Instance.Instance], [CStats]) - -- | A type denoting the valid allocation mode/pairs. +-- -- For a one-node allocation, this will be a @Left ['Node.Node']@, -- whereas for a two-node allocation, this will be a @Right -- [('Node.Node', 'Node.Node')]@. type AllocNodes = Either [Ndx] [(Ndx, Ndx)] --- | The empty solution we start with when computing allocations -emptySolution :: AllocSolution -emptySolution = AllocSolution { asFailures = [], asAllocs = 0 - , asSolutions = [], asLog = [] } +-- | The empty solution we start with when computing allocations. +emptyAllocSolution :: AllocSolution +emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0 + , asSolutions = [], asLog = [] } + +-- | The empty evac solution. +emptyEvacSolution :: EvacSolution +emptyEvacSolution = EvacSolution { esMoved = [] + , esFailed = [] + , esOpCodes = [] + } --- | The complete state for the balancing solution +-- | The complete state for the balancing solution. 
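-- Editor's note (aside, not part of this patch): 'AllocNodes' encodes
-- the allocation arity directly in its constructor, so callers can
-- dispatch on it without extra bookkeeping. A minimal sketch, using a
-- hypothetical helper 'allocArity' for illustration only:
--
-- > allocArity :: AllocNodes -> Int
-- > allocArity (Left _)  = 1   -- one-node (non-mirrored) allocation
-- > allocArity (Right _) = 2   -- two-node (mirrored) allocation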
data Table = Table Node.List Instance.List Score [Placement] deriving (Show, Read) -data CStats = CStats { csFmem :: Int -- ^ Cluster free mem - , csFdsk :: Int -- ^ Cluster free disk - , csAmem :: Int -- ^ Cluster allocatable mem - , csAdsk :: Int -- ^ Cluster allocatable disk - , csAcpu :: Int -- ^ Cluster allocatable cpus - , csMmem :: Int -- ^ Max node allocatable mem - , csMdsk :: Int -- ^ Max node allocatable disk - , csMcpu :: Int -- ^ Max node allocatable cpu - , csImem :: Int -- ^ Instance used mem - , csIdsk :: Int -- ^ Instance used disk - , csIcpu :: Int -- ^ Instance used cpu - , csTmem :: Double -- ^ Cluster total mem - , csTdsk :: Double -- ^ Cluster total disk - , csTcpu :: Double -- ^ Cluster total cpus - , csVcpu :: Int -- ^ Cluster virtual cpus (if - -- node pCpu has been set, - -- otherwise -1) - , csXmem :: Int -- ^ Unnacounted for mem - , csNmem :: Int -- ^ Node own memory - , csScore :: Score -- ^ The cluster score - , csNinst :: Int -- ^ The total number of instances +data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem + , csFdsk :: Integer -- ^ Cluster free disk + , csAmem :: Integer -- ^ Cluster allocatable mem + , csAdsk :: Integer -- ^ Cluster allocatable disk + , csAcpu :: Integer -- ^ Cluster allocatable cpus + , csMmem :: Integer -- ^ Max node allocatable mem + , csMdsk :: Integer -- ^ Max node allocatable disk + , csMcpu :: Integer -- ^ Max node allocatable cpu + , csImem :: Integer -- ^ Instance used mem + , csIdsk :: Integer -- ^ Instance used disk + , csIcpu :: Integer -- ^ Instance used cpu + , csTmem :: Double -- ^ Cluster total mem + , csTdsk :: Double -- ^ Cluster total disk + , csTcpu :: Double -- ^ Cluster total cpus + , csVcpu :: Integer -- ^ Cluster virtual cpus (if + -- node pCpu has been set, + -- otherwise -1) + , csXmem :: Integer -- ^ Unnacounted for mem + , csNmem :: Integer -- ^ Node own memory + , csScore :: Score -- ^ The cluster score + , csNinst :: Int -- ^ The total number of instances } deriving (Show, Read) --- | Currently used, possibly to allocate, unallocable +-- | Currently used, possibly to allocate, unallocable. type AllocStats = (RSpec, RSpec, RSpec) -- * Utility functions @@ -170,11 +192,11 @@ computeBadItems nl il = in (bad_nodes, bad_instances) --- | Zero-initializer for the CStats type +-- | Zero-initializer for the CStats type. emptyCStats :: CStats emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 --- | Update stats with data from a new node +-- | Update stats with data from a new node. 
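-- Editor's note (aside, not part of this patch): the Int -> Integer
-- switch in 'CStats' above presumably guards the cluster-wide sums
-- against overflow (a 32-bit Int can overflow once per-node memory or
-- disk figures are summed over a large cluster), which is why
-- 'updateCStats' below wraps every per-node field in 'fromIntegral'.
-- A minimal sketch of the same widening pattern, over a hypothetical
-- list of per-node values:
--
-- > sumFreeMem :: [Int] -> Integer
-- > sumFreeMem = foldl' (\acc m -> acc + fromIntegral m) 0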
updateCStats :: CStats -> Node.Node -> CStats updateCStats cs node = let CStats { csFmem = x_fmem, csFdsk = x_fdsk, @@ -196,23 +218,23 @@ updateCStats cs node = inc_vcpu = Node.hiCpu node inc_acpu = Node.availCpu node - in cs { csFmem = x_fmem + Node.fMem node - , csFdsk = x_fdsk + Node.fDsk node - , csAmem = x_amem + inc_amem' - , csAdsk = x_adsk + inc_adsk - , csAcpu = x_acpu + inc_acpu - , csMmem = max x_mmem inc_amem' - , csMdsk = max x_mdsk inc_adsk - , csMcpu = max x_mcpu inc_acpu - , csImem = x_imem + inc_imem - , csIdsk = x_idsk + inc_idsk - , csIcpu = x_icpu + inc_icpu + in cs { csFmem = x_fmem + fromIntegral (Node.fMem node) + , csFdsk = x_fdsk + fromIntegral (Node.fDsk node) + , csAmem = x_amem + fromIntegral inc_amem' + , csAdsk = x_adsk + fromIntegral inc_adsk + , csAcpu = x_acpu + fromIntegral inc_acpu + , csMmem = max x_mmem (fromIntegral inc_amem') + , csMdsk = max x_mdsk (fromIntegral inc_adsk) + , csMcpu = max x_mcpu (fromIntegral inc_acpu) + , csImem = x_imem + fromIntegral inc_imem + , csIdsk = x_idsk + fromIntegral inc_idsk + , csIcpu = x_icpu + fromIntegral inc_icpu , csTmem = x_tmem + Node.tMem node , csTdsk = x_tdsk + Node.tDsk node , csTcpu = x_tcpu + Node.tCpu node - , csVcpu = x_vcpu + inc_vcpu - , csXmem = x_xmem + Node.xMem node - , csNmem = x_nmem + Node.nMem node + , csVcpu = x_vcpu + fromIntegral inc_vcpu + , csXmem = x_xmem + fromIntegral (Node.xMem node) + , csNmem = x_nmem + fromIntegral (Node.nMem node) , csNinst = x_ninst + length (Node.pList node) } @@ -233,13 +255,17 @@ computeAllocationDelta cini cfin = let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu, csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin - rini = RSpec i_icpu i_imem i_idsk - rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk) - un_cpu = v_cpu - f_icpu - runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk) + rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem) + (fromIntegral i_idsk) + rfin = RSpec (fromIntegral (f_icpu - i_icpu)) + (fromIntegral (f_imem - i_imem)) + (fromIntegral (f_idsk - i_idsk)) + un_cpu = fromIntegral (v_cpu - f_icpu)::Int + runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem) + (truncate t_dsk - fromIntegral f_idsk) in (rini, rfin, runa) --- | The names and weights of the individual elements in the CV list +-- | The names and weights of the individual elements in the CV list. detailedCVInfo :: [(Double, String)] detailedCVInfo = [ (1, "free_mem_cv") , (1, "free_disk_cv") @@ -259,10 +285,9 @@ detailedCVWeights :: [Double] detailedCVWeights = map fst detailedCVInfo -- | Compute the mem and disk covariance. -compDetailedCV :: Node.List -> [Double] -compDetailedCV nl = +compDetailedCV :: [Node.Node] -> [Double] +compDetailedCV all_nodes = let - all_nodes = Container.elems nl (offline, nodes) = partition Node.offline all_nodes mem_l = map Node.pMem nodes dsk_l = map Node.pDsk nodes @@ -304,14 +329,19 @@ compDetailedCV nl = , pri_tags_score ] -- | Compute the /total/ variance. +compCVNodes :: [Node.Node] -> Double +compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV + +-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'. compCV :: Node.List -> Double -compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV +compCV = compCVNodes . Container.elems + --- | Compute online nodes from a Node.List +-- | Compute online nodes from a 'Node.List'. getOnline :: Node.List -> [Node.Node] getOnline = filter (not . 
Node.offline) . Container.elems --- * hbal functions +-- * Balancing functions -- | Compute best table. Note that the ordering of the arguments is important. compareTables :: Table -> Table -> Table @@ -465,35 +495,42 @@ checkSingleStep ini_tbl target cur_tbl move = -- the current candidate target node, generate the possible moves for -- a instance. possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node + -> Bool -- ^ Whether we can change the primary node -> Ndx -- ^ Target node candidate -> [IMove] -- ^ List of valid result moves -possibleMoves True tdx = + +possibleMoves _ False tdx = + [ReplaceSecondary tdx] + +possibleMoves True True tdx = [ReplaceSecondary tdx, ReplaceAndFailover tdx, ReplacePrimary tdx, FailoverAndReplace tdx] -possibleMoves False tdx = +possibleMoves False True tdx = [ReplaceSecondary tdx, ReplaceAndFailover tdx] -- | Compute the best move for a given instance. checkInstanceMove :: [Ndx] -- ^ Allowed target node indices -> Bool -- ^ Whether disk moves are allowed + -> Bool -- ^ Whether instance moves are allowed -> Table -- ^ Original table -> Instance.Instance -- ^ Instance to move -> Table -- ^ Best new table for this instance -checkInstanceMove nodes_idx disk_moves ini_tbl target = +checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target = let opdx = Instance.pNode target osdx = Instance.sNode target nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx - use_secondary = elem osdx nodes_idx + use_secondary = elem osdx nodes_idx && inst_moves aft_failover = if use_secondary -- if allowed to failover then checkSingleStep ini_tbl target ini_tbl Failover else ini_tbl all_moves = if disk_moves - then concatMap (possibleMoves use_secondary) nodes + then concatMap + (possibleMoves use_secondary inst_moves) nodes else [] in -- iterate over the possible nodes for this instance @@ -502,17 +539,19 @@ checkInstanceMove nodes_idx disk_moves ini_tbl target = -- | Compute the best next move. checkMove :: [Ndx] -- ^ Allowed target node indices -> Bool -- ^ Whether disk moves are allowed + -> Bool -- ^ Whether instance moves are allowed -> Table -- ^ The current solution -> [Instance.Instance] -- ^ List of instances still to move -> Table -- ^ The new solution -checkMove nodes_idx disk_moves ini_tbl victims = +checkMove nodes_idx disk_moves inst_moves ini_tbl victims = let Table _ _ _ ini_plc = ini_tbl -- we're using rwhnf from the Control.Parallel.Strategies -- package; we don't need to use rnf as that would force too -- much evaluation in single-threaded cases, and in -- multi-threaded case the weak head normal form is enough to -- spark the evaluation - tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves ini_tbl) + tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves + inst_moves ini_tbl) victims -- iterate over all instances, computing the best move best_tbl = foldl' compareTables ini_tbl tables @@ -521,7 +560,7 @@ checkMove nodes_idx disk_moves ini_tbl victims = then ini_tbl -- no advancement else best_tbl --- | Check if we are allowed to go deeper in the balancing +-- | Check if we are allowed to go deeper in the balancing. doNextBalance :: Table -- ^ The starting table -> Int -- ^ Remaining length -> Score -- ^ Score at which to stop @@ -531,14 +570,15 @@ doNextBalance ini_tbl max_rounds min_score = ini_plc_len = length ini_plc in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score --- | Run a balance move +-- | Run a balance move. 
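-- Editor's note (aside, not part of this patch): the new instance-move
-- flag shrinks the move set generated by 'possibleMoves' above; the
-- expected results, for an arbitrary target node index 5, would be:
--
-- > possibleMoves True  True  5  -- all four DRBD moves
-- > possibleMoves False True  5  -- [ReplaceSecondary 5, ReplaceAndFailover 5]
-- > possibleMoves True  False 5  -- [ReplaceSecondary 5] only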
tryBalance :: Table -- ^ The starting table -> Bool -- ^ Allow disk moves + -> Bool -- ^ Allow instance moves -> Bool -- ^ Only evacuate moves -> Score -- ^ Min gain threshold -> Score -- ^ Min gain -> Maybe Table -- ^ The resulting table and commands -tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain = +tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain = let Table ini_nl ini_il ini_cv _ = ini_tbl all_inst = Container.elems ini_il all_inst' = if evac_mode @@ -551,7 +591,7 @@ tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain = reloc_inst = filter Instance.movable all_inst' node_idx = map Node.idx . filter (not . Node.offline) $ Container.elems ini_nl - fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst + fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst (Table _ _ fin_cv _) = fin_tbl in if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain) @@ -560,13 +600,14 @@ tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain = -- * Allocation functions --- | Build failure stats out of a list of failures +-- | Build failure stats out of a list of failures. collapseFailures :: [FailMode] -> FailStats collapseFailures flst = - map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound] + map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst)) + [minBound..maxBound] -- | Update current Allocation solution and failure stats with new --- elements +-- elements. concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as } @@ -597,7 +638,7 @@ sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) = AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl) --- | Given a solution, generates a reasonable description for it +-- | Given a solution, generates a reasonable description for it. describeSolution :: AllocSolution -> String describeSolution as = let fcnt = asFailures as @@ -615,10 +656,18 @@ describeSolution as = " for node(s) %s") cv (asAllocs as) (length fcnt) freasons (intercalate "/" . map Node.name $ nodes) --- | Annotates a solution with the appropriate string +-- | Annotates a solution with the appropriate string. annotateSolution :: AllocSolution -> AllocSolution annotateSolution as = as { asLog = describeSolution as : asLog as } +-- | Reverses an evacuation solution. +-- +-- Rationale: we always concat the results to the top of the lists, so +-- for proper jobset execution, we should reverse all lists. +reverseEvacSolution :: EvacSolution -> EvacSolution +reverseEvacSolution (EvacSolution f m o) = + EvacSolution (reverse f) (reverse m) (reverse o) + -- | Generate the valid node allocation singles or pairs for a new instance. genAllocNodes :: Group.List -- ^ Group list -> Node.List -- ^ The node map @@ -628,8 +677,8 @@ genAllocNodes :: Group.List -- ^ Group list -> Result AllocNodes -- ^ The (monadic) result genAllocNodes gl nl count drop_unalloc = let filter_fn = if drop_unalloc - then filter ((/=) AllocUnallocable . Group.allocPolicy . - flip Container.find gl . Node.group) + then filter (Group.isAllocable . + flip Container.find gl . 
Node.group) else id all_nodes = filter_fn $ getOnline nl all_pairs = liftM2 (,) all_nodes all_nodes @@ -650,7 +699,7 @@ tryAlloc :: (Monad m) => tryAlloc nl _ inst (Right ok_pairs) = let sols = foldl' (\cstate (p, s) -> concatAllocs cstate $ allocateOnPair nl inst p s - ) emptySolution ok_pairs + ) emptyAllocSolution ok_pairs in if null ok_pairs -- means we have just one node then fail "Not enough online nodes" @@ -659,12 +708,12 @@ tryAlloc nl _ inst (Right ok_pairs) = tryAlloc nl _ inst (Left all_nodes) = let sols = foldl' (\cstate -> concatAllocs cstate . allocateOnSingle nl inst - ) emptySolution all_nodes + ) emptyAllocSolution all_nodes in if null all_nodes then fail "No online nodes" else return $ annotateSolution sols --- | Given a group/result, describe it as a nice (list of) messages +-- | Given a group/result, describe it as a nice (list of) messages. solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String] solutionDescription gl (groupId, result) = case result of @@ -675,18 +724,21 @@ solutionDescription gl (groupId, result) = pol = apolToString (Group.allocPolicy grp) -- | From a list of possibly bad and possibly empty solutions, filter --- only the groups with a valid result +-- only the groups with a valid result. Note that the result will be +-- reversed compared to the original list. filterMGResults :: Group.List -> [(Gdx, Result AllocSolution)] -> [(Gdx, AllocSolution)] -filterMGResults gl= - filter ((/= AllocUnallocable) . Group.allocPolicy . - flip Container.find gl . fst) . - filter (not . null . asSolutions . snd) . - map (\(y, Ok x) -> (y, x)) . - filter (isOk . snd) - --- | Sort multigroup results based on policy and score +filterMGResults gl = foldl' fn [] + where unallocable = not . Group.isAllocable . flip Container.find gl + fn accu (gdx, rasol) = + case rasol of + Bad _ -> accu + Ok sol | null (asSolutions sol) -> accu + | unallocable gdx -> accu + | otherwise -> (gdx, sol):accu + +-- | Sort multigroup results based on policy and score. sortMGResults :: Group.List -> [(Gdx, AllocSolution)] -> [(Gdx, AllocSolution)] @@ -740,7 +792,7 @@ tryReloc nl il xid 1 ex_idx = return (mnl, i, [Container.find x mnl], compCV mnl) in concatAllocs cstate em - ) emptySolution valid_idxes + ) emptyAllocSolution valid_idxes in return sols1 tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \ @@ -765,7 +817,7 @@ tryMGReloc _ mgnl mgil xid ncount ex_ndx = do Just v -> return v tryReloc nl il xid ncount ex_ndx --- | Change an instance's secondary node +-- | Change an instance's secondary node. evacInstance :: (Monad m) => [Ndx] -- ^ Excluded nodes -> Instance.List -- ^ The current instance list @@ -801,7 +853,7 @@ tryEvac :: (Monad m) => -> [Ndx] -- ^ Restricted nodes (the ones being evacuated) -> m AllocSolution -- ^ Solution list tryEvac nl il idxs ex_ndx = do - (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptySolution) idxs + (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs return sol -- | Multi-group evacuation of a list of nodes. @@ -814,71 +866,266 @@ tryMGEvac :: (Monad m) => tryMGEvac _ nl il ex_ndx = let ex_nodes = map (`Container.find` nl) ex_ndx all_insts = nub . 
concatMap Node.sList $ ex_nodes - gni = splitCluster nl il - -- we run the instance index list through a couple of maps to - -- get finally to a structure of the type [(group index, - -- [instance indices])] - all_insts' = map (\idx -> - (instancePriGroup nl (Container.find idx il), - idx)) all_insts - all_insts'' = groupBy ((==) `on` fst) all_insts' - all_insts3 = map (\xs -> let (gdxs, idxs) = unzip xs - in (head gdxs, idxs)) all_insts'' + all_insts' = associateIdxs all_insts $ splitCluster nl il in do - -- that done, we now add the per-group nl/il to the tuple - all_insts4 <- - mapM (\(gdx, idxs) -> - case lookup gdx gni of - Nothing -> fail $ "Can't find group index " ++ show gdx - Just (gnl, gil) -> return (gdx, gnl, gil, idxs)) - all_insts3 - results <- mapM (\(_, gnl, gil, idxs) -> tryEvac gnl gil idxs ex_ndx) - all_insts4 - let sol = foldl' sumAllocs emptySolution results + results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx) + all_insts' + let sol = foldl' sumAllocs emptyAllocSolution results return $ annotateSolution sol --- | Recursively place instances on the cluster until we're out of space +-- | Function which fails if the requested mode is change secondary. +-- +-- This is useful since except DRBD, no other disk template can +-- execute change secondary; thus, we can just call this function +-- instead of always checking for secondary mode. After the call to +-- this function, whatever mode we have is just a primary change. +failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m () +failOnSecondaryChange ChangeSecondary dt = + fail $ "Instances with disk template '" ++ dtToString dt ++ + "' can't execute change secondary" +failOnSecondaryChange _ _ = return () + +-- | Run evacuation for a single instance. 
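-- Editor's note (aside, not part of this patch): since
-- 'failOnSecondaryChange' is polymorphic in its monad, it
-- short-circuits whichever failure monad the caller runs in; a sketch
-- of the expected values when instantiated at 'Result':
--
-- > failOnSecondaryChange ChangeSecondary DTPlain :: Result ()  -- Bad "..."
-- > failOnSecondaryChange ChangeAll       DTPlain :: Result ()  -- Ok ()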
+nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide) + -> Instance.List -- ^ Instance list (cluster-wide) + -> EvacMode -- ^ The evacuation mode + -> Instance.Instance -- ^ The instance to be evacuated + -> [Ndx] -- ^ The list of available nodes + -- for allocation + -> Result (Node.List, Instance.List, [OpCodes.OpCode]) +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTDiskless}) _ = + failOnSecondaryChange mode dt >> + fail "Diskless relocations not implemented yet" + +nodeEvacInstance _ _ _ (Instance.Instance + {Instance.diskTemplate = DTPlain}) _ = + fail "Instances of type plain cannot be relocated" + +nodeEvacInstance _ _ _ (Instance.Instance + {Instance.diskTemplate = DTFile}) _ = + fail "Instances of type file cannot be relocated" + +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTSharedFile}) _ = + failOnSecondaryChange mode dt >> + fail "Shared file relocations not implemented yet" + +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTBlock}) _ = + failOnSecondaryChange mode dt >> + fail "Block device relocations not implemented yet" + +nodeEvacInstance nl il ChangePrimary + inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) _ = + do + (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover + let idx = Instance.idx inst + il' = Container.add idx inst' il + ops = iMoveToJob nl' il' idx Failover + return (nl', il', ops) + +nodeEvacInstance nl il ChangeSecondary + inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) + avail_nodes = + do + let gdx = instancePriGroup nl inst + (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $ + eitherToResult $ + foldl' (evacDrbdSecondaryInner nl inst gdx) + (Left "no nodes available") avail_nodes + let idx = Instance.idx inst + il' = Container.add idx inst' il + ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx) + return (nl', il', ops) + +nodeEvacInstance nl il ChangeAll + inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) + avail_nodes = + do + let primary = Container.find (Instance.pNode inst) nl + idx = Instance.idx inst + gdx = instancePriGroup nl inst + no_nodes = Left "no nodes available" + -- if the primary is offline, then we first failover + (nl1, inst1, ops1) <- + if Node.offline primary + then do + (nl', inst', _, _) <- + annotateResult "Failing over to the secondary" $ + opToResult $ applyMove nl inst Failover + return (nl', inst', [Failover]) + else return (nl, inst, []) + -- we now need to execute a replace secondary to the future + -- primary node + (nl2, inst2, _, new_pdx) <- annotateResult "Searching for a new primary" $ + eitherToResult $ + foldl' (evacDrbdSecondaryInner nl1 inst1 gdx) + no_nodes avail_nodes + let ops2 = ReplaceSecondary new_pdx:ops1 + -- since we chose the new primary, we remove it from the list of + -- available nodes + let avail_nodes_sec = new_pdx `delete` avail_nodes + -- we now execute another failover, the primary stays fixed now + (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $ + opToResult $ applyMove nl2 inst2 Failover + let ops3 = Failover:ops2 + -- and finally another replace secondary, to the final secondary + (nl4, inst4, _, new_sdx) <- + annotateResult "Searching for a new secondary" $ + eitherToResult $ + foldl' (evacDrbdSecondaryInner nl3 inst3 gdx) no_nodes avail_nodes_sec + let ops4 = ReplaceSecondary new_sdx:ops3 + il' = Container.add idx inst4 il + ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4 + return (nl4, il', ops) + +-- 
| Inner fold function for changing secondary of a DRBD instance. +-- +-- The "running" solution is either a @Left String@, which means we +-- don't have yet a working solution, or a @Right (...)@, which +-- represents a valid solution; it holds the modified node list, the +-- modified instance (after evacuation), the score of that solution, +-- and the new secondary node index. +evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list + -> Instance.Instance -- ^ Instance being evacuated + -> Gdx -- ^ The group index of the instance + -> Either String ( Node.List + , Instance.Instance + , Score + , Ndx) -- ^ Current best solution + -> Ndx -- ^ Node we're evaluating as new secondary + -> Either String ( Node.List + , Instance.Instance + , Score + , Ndx) -- ^ New best solution +evacDrbdSecondaryInner nl inst gdx accu ndx = + case applyMove nl inst (ReplaceSecondary ndx) of + OpFail fm -> + case accu of + Right _ -> accu + Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++ + " failed: " ++ show fm + OpGood (nl', inst', _, _) -> + let nodes = Container.elems nl' + -- The fromJust below is ugly (it can fail nastily), but + -- at this point we should have any internal mismatches, + -- and adding a monad here would be quite involved + grpnodes = fromJust (gdx `lookup` (Node.computeGroups nodes)) + new_cv = compCVNodes grpnodes + new_accu = Right (nl', inst', new_cv, ndx) + in case accu of + Left _ -> new_accu + Right (_, _, old_cv, _) -> + if old_cv < new_cv + then accu + else new_accu + +-- | Computes the local nodes of a given instance which are available +-- for allocation. +availableLocalNodes :: Node.List + -> [(Gdx, [Ndx])] + -> IntSet.IntSet + -> Instance.Instance + -> Result [Ndx] +availableLocalNodes nl group_nodes excl_ndx inst = do + let gdx = instancePriGroup nl inst + local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx) + Ok (lookup gdx group_nodes) + let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes + return avail_nodes + +-- | Updates the evac solution with the results of an instance +-- evacuation. +updateEvacSolution :: (Node.List, Instance.List, EvacSolution) + -> Instance.Instance + -> Result (Node.List, Instance.List, [OpCodes.OpCode]) + -> (Node.List, Instance.List, EvacSolution) +updateEvacSolution (nl, il, es) inst (Bad msg) = + (nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es}) +updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) = + (nl, il, es { esMoved = Instance.name inst:esMoved es + , esOpCodes = [opcodes]:esOpCodes es }) + +-- | Node-evacuation IAllocator mode main function. +tryNodeEvac :: Group.List -- ^ The cluster groups + -> Node.List -- ^ The node list (cluster-wide, not per group) + -> Instance.List -- ^ Instance list (cluster-wide) + -> EvacMode -- ^ The evacuation mode + -> [Idx] -- ^ List of instance (indices) to be evacuated + -> Result EvacSolution +tryNodeEvac _ ini_nl ini_il mode idxs = + let evac_ndx = nodesToEvacuate ini_il mode idxs + offline = map Node.idx . 
filter Node.offline $ Container.elems ini_nl + excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline + group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx + (Container.elems nl))) $ + splitCluster ini_nl ini_il + (_, _, esol) = + foldl' (\state@(nl, il, _) inst -> + updateEvacSolution state inst $ + availableLocalNodes nl group_ndx excl_ndx inst >>= + nodeEvacInstance nl il mode inst + ) + (ini_nl, ini_il, emptyEvacSolution) + (map (`Container.find` ini_il) idxs) + in return $ reverseEvacSolution esol + +-- | Recursively place instances on the cluster until we're out of space. iterateAlloc :: Node.List -> Instance.List + -> Maybe Int -> Instance.Instance -> AllocNodes -> [Instance.Instance] -> [CStats] -> Result AllocResult -iterateAlloc nl il newinst allocnodes ixes cstats = +iterateAlloc nl il limit newinst allocnodes ixes cstats = let depth = length ixes newname = printf "new-%d" depth::String newidx = length (Container.elems il) + depth newi2 = Instance.setIdx (Instance.setName newinst newname) newidx + newlimit = fmap (flip (-) 1) limit in case tryAlloc nl il newi2 allocnodes of Bad s -> Bad s Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) -> + let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in case sols3 of - [] -> Ok (collapseFailures errs, nl, il, ixes, cstats) + [] -> newsol (xnl, xi, _, _):[] -> - iterateAlloc xnl (Container.add newidx xi il) - newinst allocnodes (xi:ixes) - (totalResources xnl:cstats) + if limit == Just 0 + then newsol + else iterateAlloc xnl (Container.add newidx xi il) + newlimit newinst allocnodes (xi:ixes) + (totalResources xnl:cstats) _ -> Bad "Internal error: multiple solutions for single\ \ allocation" --- | The core of the tiered allocation mode +-- | The core of the tiered allocation mode. tieredAlloc :: Node.List -> Instance.List + -> Maybe Int -> Instance.Instance -> AllocNodes -> [Instance.Instance] -> [CStats] -> Result AllocResult -tieredAlloc nl il newinst allocnodes ixes cstats = - case iterateAlloc nl il newinst allocnodes ixes cstats of +tieredAlloc nl il limit newinst allocnodes ixes cstats = + case iterateAlloc nl il limit newinst allocnodes ixes cstats of Bad s -> Bad s Ok (errs, nl', il', ixes', cstats') -> + let newsol = Ok (errs, nl', il', ixes', cstats') + ixes_cnt = length ixes' + (stop, newlimit) = case limit of + Nothing -> (False, Nothing) + Just n -> (n <= ixes_cnt, + Just (n - ixes_cnt)) in + if stop then newsol else case Instance.shrinkByType newinst . fst . last $ sortBy (comparing snd) errs of - Bad _ -> Ok (errs, nl', il', ixes', cstats') - Ok newinst' -> - tieredAlloc nl' il' newinst' allocnodes ixes' cstats' + Bad _ -> newsol + Ok newinst' -> tieredAlloc nl' il' newlimit + newinst' allocnodes ixes' cstats' -- | Compute the tiered spec string description from a list of -- allocated instances. 
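-- Editor's note (aside, not part of this patch): the new 'Maybe Int'
-- argument to 'iterateAlloc' and 'tieredAlloc' above acts as an
-- optional placement budget, decremented once per successful
-- allocation; a sketch of the two calling styles (all arguments other
-- than the limit are placeholders):
--
-- > iterateAlloc nl il Nothing   inst allocnodes [] []  -- unlimited
-- > iterateAlloc nl il (Just 10) inst allocnodes [] []  -- at most 10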
@@ -1010,6 +1257,7 @@ printInsts nl il = in if sdx == Node.noSecondary then "" else Container.nameOf nl sdx + , if Instance.autoBalance inst then "Y" else "N" , printf "%3d" $ Instance.vcpus inst , printf "%5d" $ Instance.mem inst , printf "%5d" $ Instance.dsk inst `div` 1024 @@ -1019,16 +1267,16 @@ printInsts nl il = , printf "%5.3f" lN ] where DynUtil lC lM lD lN = Instance.util inst - header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem" - , "dsk", "lCpu", "lMem", "lDsk", "lNet" ] - isnum = False:False:False:False:repeat True + header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal" + , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ] + isnum = False:False:False:False:False:repeat True in unlines . map ((:) ' ' . intercalate " ") $ formatTable (header:map helper sil) isnum -- | Shows statistics for a given node list. printStats :: Node.List -> String printStats nl = - let dcvs = compDetailedCV nl + let dcvs = compDetailedCV $ Container.elems nl (weights, names) = unzip detailedCVInfo hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs formatted = map (\(w, header, val) -> @@ -1042,10 +1290,8 @@ iMoveToJob nl il idx move = let inst = Container.find idx il iname = Instance.name inst lookNode = Just . Container.nameOf nl - opF = if Instance.running inst - then OpCodes.OpMigrateInstance iname True False - else OpCodes.OpFailoverInstance iname False - opR n = OpCodes.OpReplaceDisks iname (lookNode n) + opF = OpCodes.OpInstanceMigrate iname True False True + opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n) OpCodes.ReplaceNewSecondary [] Nothing in case move of Failover -> [ opF ] @@ -1056,7 +1302,7 @@ iMoveToJob nl il idx move = -- * Node group functions --- | Computes the group of an instance +-- | Computes the group of an instance. instanceGroup :: Node.List -> Instance.Instance -> Result Gdx instanceGroup nl i = let sidx = Instance.sNode i @@ -1071,19 +1317,19 @@ instanceGroup nl i = show pgroup ++ ", secondary " ++ show sgroup) else return pgroup --- | Computes the group of an instance per the primary node +-- | Computes the group of an instance per the primary node. instancePriGroup :: Node.List -> Instance.Instance -> Gdx instancePriGroup nl i = let pnode = Container.find (Instance.pNode i) nl in Node.group pnode -- | Compute the list of badly allocated instances (split across node --- groups) +-- groups). findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance] findSplitInstances nl = filter (not . isOk . instanceGroup nl) . Container.elems --- | Splits a cluster into the component node groups +-- | Splits a cluster into the component node groups. splitCluster :: Node.List -> Instance.List -> [(Gdx, (Node.List, Instance.List))] splitCluster nl il = @@ -1093,3 +1339,34 @@ splitCluster nl il = nodes' = zip nidxs nodes instances = Container.filter ((`elem` nidxs) . Instance.pNode) il in (guuid, (Container.fromList nodes', instances))) ngroups + +-- | Split a global instance index map into per-group, and associate +-- it with the group/node/instance lists. +associateIdxs :: [Idx] -- ^ Instance indices to be split/associated + -> [(Gdx, (Node.List, Instance.List))] -- ^ Input groups + -> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result +associateIdxs idxs = + map (\(gdx, (nl, il)) -> + (gdx, (nl, il, filter (`Container.member` il) idxs))) + +-- | Compute the list of nodes that are to be evacuated, given a list +-- of instances and an evacuation mode. 
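-- Editor's note (aside, not part of this patch): for a hypothetical
-- DRBD instance with index @i@, primary node 1 and secondary node 2,
-- the expected sets would be:
--
-- > nodesToEvacuate il ChangePrimary   [i]  -- fromList [1]
-- > nodesToEvacuate il ChangeSecondary [i]  -- fromList [2]
-- > nodesToEvacuate il ChangeAll       [i]  -- fromList [1,2]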
+nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list + -> EvacMode -- ^ The evacuation mode we're using + -> [Idx] -- ^ List of instance indices being evacuated + -> IntSet.IntSet -- ^ Set of node indices +nodesToEvacuate il mode = + IntSet.delete Node.noSecondary . + foldl' (\ns idx -> + let i = Container.find idx il + pdx = Instance.pNode i + sdx = Instance.sNode i + dt = Instance.diskTemplate i + withSecondary = case dt of + DTDrbd8 -> IntSet.insert sdx ns + _ -> ns + in case mode of + ChangePrimary -> IntSet.insert pdx ns + ChangeSecondary -> withSecondary + ChangeAll -> IntSet.insert pdx withSecondary + ) IntSet.empty
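-- Editor's note (aside, not part of this patch): a self-contained
-- sketch of the IntSet accumulation pattern used by 'nodesToEvacuate',
-- with a hypothetical 'Inst' record standing in for
-- 'Instance.Instance':

import Data.List (foldl')
import qualified Data.IntSet as IntSet

data Inst = Inst { pnode :: Int, snode :: Int, drbd :: Bool }

-- Collect every node touched by a ChangeAll-style evacuation: always
-- the primary, plus the secondary for DRBD instances.
evacNodes :: [Inst] -> IntSet.IntSet
evacNodes = foldl' step IntSet.empty
  where step ns i =
          let ns' = IntSet.insert (pnode i) ns
          in if drbd i then IntSet.insert (snode i) ns' else ns'

main :: IO ()
main = print (IntSet.toList (evacNodes [Inst 1 2 True, Inst 3 (-1) False]))
-- prints [1,2,3]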