X-Git-Url: https://code.grnet.gr/git/ganeti-local/blobdiff_plain/2a8e2dc95937c411aa93aa32f50538d8fafd18f1..2922d2c576d85983eb5f6ac82982b641aacc954b:/htools/Ganeti/HTools/Cluster.hs diff --git a/htools/Ganeti/HTools/Cluster.hs b/htools/Ganeti/HTools/Cluster.hs index 4588e77..37af920 100644 --- a/htools/Ganeti/HTools/Cluster.hs +++ b/htools/Ganeti/HTools/Cluster.hs @@ -1,13 +1,13 @@ {-| Implementation of cluster-wide logic. This module holds all pure cluster-logic; I\/O related functionality -goes into the "Main" module for the individual binaries. +goes into the /Main/ module for the individual binaries. -} {- -Copyright (C) 2009, 2010, 2011 Google Inc. +Copyright (C) 2009, 2010, 2011, 2012 Google Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,58 +27,57 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -} module Ganeti.HTools.Cluster - ( - -- * Types - AllocSolution(..) - , Table(..) - , CStats(..) - , AllocStats - -- * Generic functions - , totalResources - , computeAllocationDelta - -- * First phase functions - , computeBadItems - -- * Second phase functions - , printSolutionLine - , formatCmds - , involvedNodes - , splitJobs - -- * Display functions - , printNodes - , printInsts - -- * Balancing functions - , checkMove - , doNextBalance - , tryBalance - , compCV - , compDetailedCV - , printStats - , iMoveToJob - -- * IAllocator functions - , genAllocNodes - , tryAlloc - , tryMGAlloc - , tryReloc - , tryMGReloc - , tryEvac - , tryMGEvac - , collapseFailures - -- * Allocation functions - , iterateAlloc - , tieredAlloc - , tieredSpecMap - -- * Node group functions - , instanceGroup - , findSplitInstances - , splitCluster - ) where - -import Data.Function (on) + ( + -- * Types + AllocSolution(..) + , EvacSolution(..) + , Table(..) + , CStats(..) + , AllocResult + , AllocMethod + -- * Generic functions + , totalResources + , computeAllocationDelta + -- * First phase functions + , computeBadItems + -- * Second phase functions + , printSolutionLine + , formatCmds + , involvedNodes + , splitJobs + -- * Display functions + , printNodes + , printInsts + -- * Balancing functions + , checkMove + , doNextBalance + , tryBalance + , compCV + , compCVNodes + , compDetailedCV + , printStats + , iMoveToJob + -- * IAllocator functions + , genAllocNodes + , tryAlloc + , tryMGAlloc + , tryNodeEvac + , tryChangeGroup + , collapseFailures + -- * Allocation functions + , iterateAlloc + , tieredAlloc + -- * Node group functions + , instanceGroup + , findSplitInstances + , splitCluster + ) where + +import qualified Data.IntSet as IntSet import Data.List +import Data.Maybe (fromJust, isNothing) import Data.Ord (comparing) import Text.Printf (printf) -import Control.Monad -import Control.Parallel.Strategies import qualified Ganeti.HTools.Container as Container import qualified Ganeti.HTools.Instance as Instance @@ -86,66 +85,92 @@ import qualified Ganeti.HTools.Node as Node import qualified Ganeti.HTools.Group as Group import Ganeti.HTools.Types import Ganeti.HTools.Utils +import Ganeti.HTools.Compat import qualified Ganeti.OpCodes as OpCodes -- * Types -- | Allocation\/relocation solution.
data AllocSolution = AllocSolution - { asFailures :: [FailMode] -- ^ Failure counts - , asAllocs :: Int -- ^ Good allocation count - , asSolutions :: [Node.AllocElement] -- ^ The actual result, length - -- of the list depends on the - -- allocation/relocation mode - , asLog :: [String] -- ^ A list of informational messages + { asFailures :: [FailMode] -- ^ Failure counts + , asAllocs :: Int -- ^ Good allocation count + , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result + , asLog :: [String] -- ^ Informational messages } +-- | Node evacuation/group change iallocator result type. This result +-- type consists of actual opcodes (a restricted subset) that are +-- transmitted back to Ganeti. +data EvacSolution = EvacSolution + { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully + , esFailed :: [(Idx, String)] -- ^ Instances which were not + -- relocated + , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs + } deriving (Show) + -- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'. type AllocResult = (FailStats, Node.List, Instance.List, [Instance.Instance], [CStats]) - -- | A type denoting the valid allocation mode/pairs. --- For a one-node allocation, this will be a @Left ['Node.Node']@, --- whereas for a two-node allocation, this will be a @Right --- [('Node.Node', 'Node.Node')]@. -type AllocNodes = Either [Ndx] [(Ndx, Ndx)] - --- | The empty solution we start with when computing allocations -emptySolution :: AllocSolution -emptySolution = AllocSolution { asFailures = [], asAllocs = 0 - , asSolutions = [], asLog = [] } - --- | The complete state for the balancing solution +-- +-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas +-- for a two-node allocation, this will be a @Right [('Ndx', +-- ['Ndx'])]@. In the latter case, the list is basically an +-- association list, grouped by primary node and holding the potential +-- secondary nodes in the sub-list. +type AllocNodes = Either [Ndx] [(Ndx, [Ndx])] + +-- | The empty solution we start with when computing allocations. +emptyAllocSolution :: AllocSolution +emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0 + , asSolution = Nothing, asLog = [] } + +-- | The empty evac solution. +emptyEvacSolution :: EvacSolution +emptyEvacSolution = EvacSolution { esMoved = [] + , esFailed = [] + , esOpCodes = [] + } + +-- | The complete state for the balancing solution. 
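The key change in 'AllocSolution' above is that asSolutions :: [Node.AllocElement] becomes asSolution :: Maybe Node.AllocElement: instead of accumulating a list (whose multi-element case the old code flagged as invalid usage), the fold now keeps only the single best candidate, while asAllocs still counts successes. A minimal standalone sketch of that pattern, with a hypothetical (score, label) pair standing in for Node.AllocElement; the 'Table' type used by the balancer follows below.

import Data.List (foldl')

-- Hypothetical stand-in for Node.AllocElement: (cluster score, label).
type Element = (Double, String)

-- Keep whichever candidate has the lower (better) cluster score;
-- this mirrors bestAllocElement further down in the patch.
better :: Maybe Element -> Maybe Element -> Maybe Element
better a Nothing = a
better Nothing b = b
better a@(Just (sa, _)) b@(Just (sb, _)) = if sa < sb then a else b

-- Fold candidates into (success count, best element), the shape of
-- concatAllocs updating asAllocs/asSolution.
collect :: [Element] -> (Int, Maybe Element)
collect = foldl' (\(n, best) e -> (n + 1, better best (Just e))) (0, Nothing)

main :: IO ()
main = print (collect [(0.5, "pair A"), (0.2, "pair B"), (0.9, "pair C")])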
data Table = Table Node.List Instance.List Score [Placement] deriving (Show, Read) -data CStats = CStats { csFmem :: Int -- ^ Cluster free mem - , csFdsk :: Int -- ^ Cluster free disk - , csAmem :: Int -- ^ Cluster allocatable mem - , csAdsk :: Int -- ^ Cluster allocatable disk - , csAcpu :: Int -- ^ Cluster allocatable cpus - , csMmem :: Int -- ^ Max node allocatable mem - , csMdsk :: Int -- ^ Max node allocatable disk - , csMcpu :: Int -- ^ Max node allocatable cpu - , csImem :: Int -- ^ Instance used mem - , csIdsk :: Int -- ^ Instance used disk - , csIcpu :: Int -- ^ Instance used cpu - , csTmem :: Double -- ^ Cluster total mem - , csTdsk :: Double -- ^ Cluster total disk - , csTcpu :: Double -- ^ Cluster total cpus - , csVcpu :: Int -- ^ Cluster virtual cpus (if - -- node pCpu has been set, - -- otherwise -1) - , csXmem :: Int -- ^ Unaccounted for mem - , csNmem :: Int -- ^ Node own memory - , csScore :: Score -- ^ The cluster score - , csNinst :: Int -- ^ The total number of instances - } - deriving (Show, Read) - --- | Currently used, possibly to allocate, unallocable -type AllocStats = (RSpec, RSpec, RSpec) +-- | Cluster statistics data type. +data CStats = CStats + { csFmem :: Integer -- ^ Cluster free mem + , csFdsk :: Integer -- ^ Cluster free disk + , csAmem :: Integer -- ^ Cluster allocatable mem + , csAdsk :: Integer -- ^ Cluster allocatable disk + , csAcpu :: Integer -- ^ Cluster allocatable cpus + , csMmem :: Integer -- ^ Max node allocatable mem + , csMdsk :: Integer -- ^ Max node allocatable disk + , csMcpu :: Integer -- ^ Max node allocatable cpu + , csImem :: Integer -- ^ Instance used mem + , csIdsk :: Integer -- ^ Instance used disk + , csIcpu :: Integer -- ^ Instance used cpu + , csTmem :: Double -- ^ Cluster total mem + , csTdsk :: Double -- ^ Cluster total disk + , csTcpu :: Double -- ^ Cluster total cpus + , csVcpu :: Integer -- ^ Cluster total virtual cpus + , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of + -- physical CPUs, i.e. normalised used phys CPUs + , csXmem :: Integer -- ^ Unaccounted for mem + , csNmem :: Integer -- ^ Node own memory + , csScore :: Score -- ^ The cluster score + , csNinst :: Int -- ^ The total number of instances + } deriving (Show, Read) + +-- | A simple type for allocation functions. +type AllocMethod = Node.List -- ^ Node list + -> Instance.List -- ^ Instance list + -> Maybe Int -- ^ Optional allocation limit + -> Instance.Instance -- ^ Instance spec for allocation + -> AllocNodes -- ^ Which nodes we should allocate on + -> [Instance.Instance] -- ^ Allocated instances + -> [CStats] -- ^ Running cluster stats + -> Result AllocResult -- ^ Allocation result -- * Utility functions @@ -170,57 +195,71 @@ computeBadItems nl il = in (bad_nodes, bad_instances) --- | Zero-initializer for the CStats type +-- | Extracts the node pairs for an instance. This can fail if the +-- instance is single-homed. FIXME: this needs to be improved, +-- together with the general enhancement for handling non-DRBD moves. +instanceNodes :: Node.List -> Instance.Instance -> + (Ndx, Ndx, Node.Node, Node.Node) +instanceNodes nl inst = + let old_pdx = Instance.pNode inst + old_sdx = Instance.sNode inst + old_p = Container.find old_pdx nl + old_s = Container.find old_sdx nl + in (old_pdx, old_sdx, old_p, old_s) + +-- | Zero-initializer for the CStats type.
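The counters above widen from Int to Integer so that cluster-wide sums cannot overflow on large clusters, while per-node inputs stay Int and are widened with fromIntegral at each step; emptyCStats and updateCStats below fold the per-node contributions into this record. A self-contained sketch of that accumulate-with-foldl' shape, using a made-up two-field node type:

import Data.List (foldl')

-- Hypothetical node: free memory (MiB, Int) and total memory (Double).
data N = N { nFmem :: Int, nTmem :: Double }

data Stats = Stats { sFmem :: Integer, sTmem :: Double } deriving Show

emptyStats :: Stats
emptyStats = Stats 0 0

-- One node's contribution, widening the Int counter via fromIntegral,
-- as updateCStats does for every Integer field.
update :: Stats -> N -> Stats
update s n = s { sFmem = sFmem s + fromIntegral (nFmem n)
               , sTmem = sTmem s + nTmem n }

-- The totalResources shape: a strict left fold from the zero element.
total :: [N] -> Stats
total = foldl' update emptyStats

main :: IO ()
main = print (total [N 1024 4096, N 512 2048])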
emptyCStats :: CStats -emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 --- | Update stats with data from a new node +-- | Update stats with data from a new node. updateCStats :: CStats -> Node.Node -> CStats updateCStats cs node = - let CStats { csFmem = x_fmem, csFdsk = x_fdsk, - csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk, - csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu, - csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu, - csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu, - csVcpu = x_vcpu, - csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst - } - = cs - inc_amem = Node.fMem node - Node.rMem node - inc_amem' = if inc_amem > 0 then inc_amem else 0 - inc_adsk = Node.availDisk node - inc_imem = truncate (Node.tMem node) - Node.nMem node - - Node.xMem node - Node.fMem node - inc_icpu = Node.uCpu node - inc_idsk = truncate (Node.tDsk node) - Node.fDsk node - inc_vcpu = Node.hiCpu node - inc_acpu = Node.availCpu node - - in cs { csFmem = x_fmem + Node.fMem node - , csFdsk = x_fdsk + Node.fDsk node - , csAmem = x_amem + inc_amem' - , csAdsk = x_adsk + inc_adsk - , csAcpu = x_acpu + inc_acpu - , csMmem = max x_mmem inc_amem' - , csMdsk = max x_mdsk inc_adsk - , csMcpu = max x_mcpu inc_acpu - , csImem = x_imem + inc_imem - , csIdsk = x_idsk + inc_idsk - , csIcpu = x_icpu + inc_icpu - , csTmem = x_tmem + Node.tMem node - , csTdsk = x_tdsk + Node.tDsk node - , csTcpu = x_tcpu + Node.tCpu node - , csVcpu = x_vcpu + inc_vcpu - , csXmem = x_xmem + Node.xMem node - , csNmem = x_nmem + Node.nMem node - , csNinst = x_ninst + length (Node.pList node) - } + let CStats { csFmem = x_fmem, csFdsk = x_fdsk, + csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk, + csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu, + csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu, + csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu, + csVcpu = x_vcpu, csNcpu = x_ncpu, + csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst + } + = cs + inc_amem = Node.fMem node - Node.rMem node + inc_amem' = if inc_amem > 0 then inc_amem else 0 + inc_adsk = Node.availDisk node + inc_imem = truncate (Node.tMem node) - Node.nMem node + - Node.xMem node - Node.fMem node + inc_icpu = Node.uCpu node + inc_idsk = truncate (Node.tDsk node) - Node.fDsk node + inc_vcpu = Node.hiCpu node + inc_acpu = Node.availCpu node + inc_ncpu = fromIntegral (Node.uCpu node) / + iPolicyVcpuRatio (Node.iPolicy node) + in cs { csFmem = x_fmem + fromIntegral (Node.fMem node) + , csFdsk = x_fdsk + fromIntegral (Node.fDsk node) + , csAmem = x_amem + fromIntegral inc_amem' + , csAdsk = x_adsk + fromIntegral inc_adsk + , csAcpu = x_acpu + fromIntegral inc_acpu + , csMmem = max x_mmem (fromIntegral inc_amem') + , csMdsk = max x_mdsk (fromIntegral inc_adsk) + , csMcpu = max x_mcpu (fromIntegral inc_acpu) + , csImem = x_imem + fromIntegral inc_imem + , csIdsk = x_idsk + fromIntegral inc_idsk + , csIcpu = x_icpu + fromIntegral inc_icpu + , csTmem = x_tmem + Node.tMem node + , csTdsk = x_tdsk + Node.tDsk node + , csTcpu = x_tcpu + Node.tCpu node + , csVcpu = x_vcpu + fromIntegral inc_vcpu + , csNcpu = x_ncpu + inc_ncpu + , csXmem = x_xmem + fromIntegral (Node.xMem node) + , csNmem = x_nmem + fromIntegral (Node.nMem node) + , csNinst = x_ninst + length (Node.pList node) + } -- | Compute the total free disk and memory in the cluster. totalResources :: Node.List -> CStats totalResources nl = - let cs = foldl' updateCStats emptyCStats . 
Container.elems $ nl - in cs { csScore = compCV nl } + let cs = foldl' updateCStats emptyCStats . Container.elems $ nl + in cs { csScore = compCV nl } -- | Compute the delta between two cluster states. -- @@ -230,16 +269,29 @@ totalResources nl = -- was left unallocated. computeAllocationDelta :: CStats -> CStats -> AllocStats computeAllocationDelta cini cfin = - let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini - CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu, - csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin - rini = RSpec i_icpu i_imem i_idsk - rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk) - un_cpu = v_cpu - f_icpu - runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk) - in (rini, rfin, runa) - --- | The names and weights of the individual elements in the CV list + let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu, + csNcpu = i_ncpu } = cini + CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu, + csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu, + csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin + rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu + , allocInfoNCpus = i_ncpu + , allocInfoMem = fromIntegral i_imem + , allocInfoDisk = fromIntegral i_idsk + } + rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu) + , allocInfoNCpus = f_ncpu - i_ncpu + , allocInfoMem = fromIntegral (f_imem - i_imem) + , allocInfoDisk = fromIntegral (f_idsk - i_idsk) + } + runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu) + , allocInfoNCpus = f_tcpu - f_ncpu + , allocInfoMem = truncate t_mem - fromIntegral f_imem + , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk + } + in (rini, rfin, runa) + +-- | The names and weights of the individual elements in the CV list. detailedCVInfo :: [(Double, String)] detailedCVInfo = [ (1, "free_mem_cv") , (1, "free_disk_cv") @@ -255,188 +307,180 @@ detailedCVInfo = [ (1, "free_mem_cv") , (2, "pri_tags_score") ] +-- | Holds the weights used by 'compCVNodes' for each metric. detailedCVWeights :: [Double] detailedCVWeights = map fst detailedCVInfo -- | Compute the mem and disk covariance. -compDetailedCV :: Node.List -> [Double] -compDetailedCV nl = - let - all_nodes = Container.elems nl - (offline, nodes) = partition Node.offline all_nodes - mem_l = map Node.pMem nodes - dsk_l = map Node.pDsk nodes - -- metric: memory covariance - mem_cv = stdDev mem_l - -- metric: disk covariance - dsk_cv = stdDev dsk_l - -- metric: count of instances living on N1 failing nodes - n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) + - length (Node.pList n)) . - filter Node.failN1 $ nodes :: Double - res_l = map Node.pRem nodes - -- metric: reserved memory covariance - res_cv = stdDev res_l - -- offline instances metrics - offline_ipri = sum . map (length . Node.pList) $ offline - offline_isec = sum . map (length .
Node.sList) $ offline - -- metric: count of instances on offline nodes - off_score = fromIntegral (offline_ipri + offline_isec)::Double - -- metric: count of primary instances on offline nodes (this - -- helps with evacuation/failover of primary instances on - -- 2-node clusters with one node offline) - off_pri_score = fromIntegral offline_ipri::Double - cpu_l = map Node.pCpu nodes - -- metric: covariance of vcpu/pcpu ratio - cpu_cv = stdDev cpu_l - -- metrics: covariance of cpu, memory, disk and network load - (c_load, m_load, d_load, n_load) = unzip4 $ - map (\n -> - let DynUtil c1 m1 d1 n1 = Node.utilLoad n - DynUtil c2 m2 d2 n2 = Node.utilPool n - in (c1/c2, m1/m2, d1/d2, n1/n2) - ) nodes - -- metric: conflicting instance count - pri_tags_inst = sum $ map Node.conflictingPrimaries nodes - pri_tags_score = fromIntegral pri_tags_inst::Double - in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv - , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load - , pri_tags_score ] +compDetailedCV :: [Node.Node] -> [Double] +compDetailedCV all_nodes = + let (offline, nodes) = partition Node.offline all_nodes + mem_l = map Node.pMem nodes + dsk_l = map Node.pDsk nodes + -- metric: memory covariance + mem_cv = stdDev mem_l + -- metric: disk covariance + dsk_cv = stdDev dsk_l + -- metric: count of instances living on N1 failing nodes + n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) + + length (Node.pList n)) . + filter Node.failN1 $ nodes :: Double + res_l = map Node.pRem nodes + -- metric: reserved memory covariance + res_cv = stdDev res_l + -- offline instances metrics + offline_ipri = sum . map (length . Node.pList) $ offline + offline_isec = sum . map (length . Node.sList) $ offline + -- metric: count of instances on offline nodes + off_score = fromIntegral (offline_ipri + offline_isec)::Double + -- metric: count of primary instances on offline nodes (this + -- helps with evacuation/failover of primary instances on + -- 2-node clusters with one node offline) + off_pri_score = fromIntegral offline_ipri::Double + cpu_l = map Node.pCpu nodes + -- metric: covariance of vcpu/pcpu ratio + cpu_cv = stdDev cpu_l + -- metrics: covariance of cpu, memory, disk and network load + (c_load, m_load, d_load, n_load) = + unzip4 $ map (\n -> + let DynUtil c1 m1 d1 n1 = Node.utilLoad n + DynUtil c2 m2 d2 n2 = Node.utilPool n + in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes + -- metric: conflicting instance count + pri_tags_inst = sum $ map Node.conflictingPrimaries nodes + pri_tags_score = fromIntegral pri_tags_inst::Double + in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv + , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load + , pri_tags_score ] -- | Compute the /total/ variance. +compCVNodes :: [Node.Node] -> Double +compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV + +-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'. compCV :: Node.List -> Double -compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV +compCV = compCVNodes . Container.elems --- | Compute online nodes from a Node.List +-- | Compute online nodes from a 'Node.List'. getOnline :: Node.List -> [Node.Node] getOnline = filter (not . Node.offline) . Container.elems --- * hbal functions +-- * Balancing functions -- | Compute best table. Note that the ordering of the arguments is important. 
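compCVNodes above reduces the per-metric list from compDetailedCV to a single weighted score (the sum of weights times the per-metric deviations), and compareTables below simply prefers the table with the lower score. A standalone sketch under those assumptions, using a population standard deviation in place of HTools' own stdDev and two hypothetical metrics with unit weights:

-- Population standard deviation over a list of samples.
stdDev :: [Double] -> Double
stdDev xs =
  let n = fromIntegral (length xs)
      m = sum xs / n
  in sqrt (sum [(x - m) ** 2 | x <- xs] / n)

-- Hypothetical two-metric analogue of detailedCVWeights.
weights :: [Double]
weights = [1, 1]

-- The compCVNodes shape: weighted sum of per-metric deviations.
clusterScore :: [[Double]] -> Double
clusterScore = sum . zipWith (*) weights . map stdDev

main :: IO ()
main = do
  let balanced = clusterScore [[0.5, 0.5, 0.5], [0.2, 0.2, 0.2]]
      lopsided = clusterScore [[0.1, 0.9, 0.5], [0.2, 0.2, 0.2]]
  -- Lower is better, so a balanced cluster must not score higher.
  print (balanced <= lopsided)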
compareTables :: Table -> Table -> Table compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) = - if a_cv > b_cv then b else a + if a_cv > b_cv then b else a -- | Applies an instance move to a given node list and instance. applyMove :: Node.List -> Instance.Instance -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx) -- Failover (f) applyMove nl inst Failover = - let old_pdx = Instance.pNode inst - old_sdx = Instance.sNode inst - old_p = Container.find old_pdx nl - old_s = Container.find old_sdx nl - int_p = Node.removePri old_p inst - int_s = Node.removeSec old_s inst - force_p = Node.offline old_p - new_nl = do -- Maybe monad - new_p <- Node.addPriEx force_p int_s inst - new_s <- Node.addSec int_p inst old_sdx - let new_inst = Instance.setBoth inst old_sdx old_pdx - return (Container.addTwo old_pdx new_s old_sdx new_p nl, - new_inst, old_sdx, old_pdx) - in new_nl + let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst + int_p = Node.removePri old_p inst + int_s = Node.removeSec old_s inst + new_nl = do -- Maybe monad + new_p <- Node.addPriEx (Node.offline old_p) int_s inst + new_s <- Node.addSec int_p inst old_sdx + let new_inst = Instance.setBoth inst old_sdx old_pdx + return (Container.addTwo old_pdx new_s old_sdx new_p nl, + new_inst, old_sdx, old_pdx) + in new_nl -- Replace the primary (f:, r:np, f) applyMove nl inst (ReplacePrimary new_pdx) = - let old_pdx = Instance.pNode inst - old_sdx = Instance.sNode inst - old_p = Container.find old_pdx nl - old_s = Container.find old_sdx nl - tgt_n = Container.find new_pdx nl - int_p = Node.removePri old_p inst - int_s = Node.removeSec old_s inst - force_p = Node.offline old_p - new_nl = do -- Maybe monad - -- check that the current secondary can host the instance - -- during the migration - tmp_s <- Node.addPriEx force_p int_s inst - let tmp_s' = Node.removePri tmp_s inst - new_p <- Node.addPriEx force_p tgt_n inst - new_s <- Node.addSecEx force_p tmp_s' inst new_pdx - let new_inst = Instance.setPri inst new_pdx - return (Container.add new_pdx new_p $ - Container.addTwo old_pdx int_p old_sdx new_s nl, - new_inst, new_pdx, old_sdx) - in new_nl + let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst + tgt_n = Container.find new_pdx nl + int_p = Node.removePri old_p inst + int_s = Node.removeSec old_s inst + force_p = Node.offline old_p + new_nl = do -- Maybe monad + -- check that the current secondary can host the instance + -- during the migration + tmp_s <- Node.addPriEx force_p int_s inst + let tmp_s' = Node.removePri tmp_s inst + new_p <- Node.addPriEx force_p tgt_n inst + new_s <- Node.addSecEx force_p tmp_s' inst new_pdx + let new_inst = Instance.setPri inst new_pdx + return (Container.add new_pdx new_p $ + Container.addTwo old_pdx int_p old_sdx new_s nl, + new_inst, new_pdx, old_sdx) + in new_nl -- Replace the secondary (r:ns) applyMove nl inst (ReplaceSecondary new_sdx) = - let old_pdx = Instance.pNode inst - old_sdx = Instance.sNode inst - old_s = Container.find old_sdx nl - tgt_n = Container.find new_sdx nl - int_s = Node.removeSec old_s inst - force_s = Node.offline old_s - new_inst = Instance.setSec inst new_sdx - new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>= - \new_s -> return (Container.addTwo new_sdx - new_s old_sdx int_s nl, - new_inst, old_pdx, new_sdx) - in new_nl + let old_pdx = Instance.pNode inst + old_sdx = Instance.sNode inst + old_s = Container.find old_sdx nl + tgt_n = Container.find new_sdx nl + int_s = Node.removeSec old_s inst + force_s = Node.offline old_s + new_inst = 
Instance.setSec inst new_sdx + new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>= + \new_s -> return (Container.addTwo new_sdx + new_s old_sdx int_s nl, + new_inst, old_pdx, new_sdx) + in new_nl -- Replace the secondary and failover (r:np, f) applyMove nl inst (ReplaceAndFailover new_pdx) = - let old_pdx = Instance.pNode inst - old_sdx = Instance.sNode inst - old_p = Container.find old_pdx nl - old_s = Container.find old_sdx nl - tgt_n = Container.find new_pdx nl - int_p = Node.removePri old_p inst - int_s = Node.removeSec old_s inst - force_s = Node.offline old_s - new_nl = do -- Maybe monad - new_p <- Node.addPri tgt_n inst - new_s <- Node.addSecEx force_s int_p inst new_pdx - let new_inst = Instance.setBoth inst new_pdx old_pdx - return (Container.add new_pdx new_p $ - Container.addTwo old_pdx new_s old_sdx int_s nl, - new_inst, new_pdx, old_pdx) - in new_nl + let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst + tgt_n = Container.find new_pdx nl + int_p = Node.removePri old_p inst + int_s = Node.removeSec old_s inst + force_s = Node.offline old_s + new_nl = do -- Maybe monad + new_p <- Node.addPri tgt_n inst + new_s <- Node.addSecEx force_s int_p inst new_pdx + let new_inst = Instance.setBoth inst new_pdx old_pdx + return (Container.add new_pdx new_p $ + Container.addTwo old_pdx new_s old_sdx int_s nl, + new_inst, new_pdx, old_pdx) + in new_nl -- Failover and replace the secondary (f, r:ns) applyMove nl inst (FailoverAndReplace new_sdx) = - let old_pdx = Instance.pNode inst - old_sdx = Instance.sNode inst - old_p = Container.find old_pdx nl - old_s = Container.find old_sdx nl - tgt_n = Container.find new_sdx nl - int_p = Node.removePri old_p inst - int_s = Node.removeSec old_s inst - force_p = Node.offline old_p - new_nl = do -- Maybe monad - new_p <- Node.addPriEx force_p int_s inst - new_s <- Node.addSecEx force_p tgt_n inst old_sdx - let new_inst = Instance.setBoth inst old_sdx new_sdx - return (Container.add new_sdx new_s $ - Container.addTwo old_sdx new_p old_pdx int_p nl, - new_inst, old_sdx, new_sdx) - in new_nl + let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst + tgt_n = Container.find new_sdx nl + int_p = Node.removePri old_p inst + int_s = Node.removeSec old_s inst + force_p = Node.offline old_p + new_nl = do -- Maybe monad + new_p <- Node.addPriEx force_p int_s inst + new_s <- Node.addSecEx force_p tgt_n inst old_sdx + let new_inst = Instance.setBoth inst old_sdx new_sdx + return (Container.add new_sdx new_s $ + Container.addTwo old_sdx new_p old_pdx int_p nl, + new_inst, old_sdx, new_sdx) + in new_nl -- | Tries to allocate an instance on one given node. allocateOnSingle :: Node.List -> Instance.Instance -> Ndx -> OpResult Node.AllocElement allocateOnSingle nl inst new_pdx = - let p = Container.find new_pdx nl - new_inst = Instance.setBoth inst new_pdx Node.noSecondary - in Node.addPri p inst >>= \new_p -> do - let new_nl = Container.add new_pdx new_p nl - new_score = compCV nl - return (new_nl, new_inst, [new_p], new_score) + let p = Container.find new_pdx nl + new_inst = Instance.setBoth inst new_pdx Node.noSecondary + in do + Instance.instMatchesPolicy inst (Node.iPolicy p) + new_p <- Node.addPri p inst + let new_nl = Container.add new_pdx new_p nl + new_score = compCV nl + return (new_nl, new_inst, [new_p], new_score) -- | Tries to allocate an instance on a given pair of nodes.
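allocateOnPair below (like allocateOnSingle above) runs in a monad where each fallible step can abort the whole allocation: the instance must match the node's policy, then both node additions must succeed, and only then is the new cluster scored. The same shape in plain Maybe, with hypothetical free-memory checks standing in for Node.addPri/Node.addSec:

-- Hypothetical node: a name and its free memory, in MiB.
data N = N { name :: String, free :: Int } deriving Show

-- Stand-in for Node.addPri: fails when the node lacks capacity.
addPri :: N -> Int -> Maybe N
addPri n need
  | free n >= need = Just n { free = free n - need }
  | otherwise      = Nothing

-- Same capacity rule for the secondary in this sketch.
addSec :: N -> Int -> Maybe N
addSec = addPri

-- Any failed step makes the whole pair allocation Nothing, as in the
-- OpResult monad used by allocateOnPair.
allocPair :: N -> N -> Int -> Maybe (N, N)
allocPair p s need = do
  p' <- addPri p need
  s' <- addSec s need
  return (p', s')

main :: IO ()
main = do
  print (allocPair (N "node1" 4096) (N "node2" 1024) 2048) -- Nothing
  print (allocPair (N "node1" 4096) (N "node2" 4096) 2048) -- Just (...)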
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx -> OpResult Node.AllocElement allocateOnPair nl inst new_pdx new_sdx = - let tgt_p = Container.find new_pdx nl - tgt_s = Container.find new_sdx nl - in do - new_p <- Node.addPri tgt_p inst - new_s <- Node.addSec tgt_s inst new_pdx - let new_inst = Instance.setBoth inst new_pdx new_sdx - new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl - return (new_nl, new_inst, [new_p, new_s], compCV new_nl) + let tgt_p = Container.find new_pdx nl + tgt_s = Container.find new_sdx nl + in do + Instance.instMatchesPolicy inst (Node.iPolicy tgt_p) + new_p <- Node.addPri tgt_p inst + new_s <- Node.addSec tgt_s inst new_pdx + let new_inst = Instance.setBoth inst new_pdx new_sdx + new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl + return (new_nl, new_inst, [new_p, new_s], compCV new_nl) -- | Tries to perform an instance move and returns the best table -- between the original one and the new one. @@ -446,54 +490,60 @@ checkSingleStep :: Table -- ^ The original table -> IMove -- ^ The move to apply -> Table -- ^ The final best table checkSingleStep ini_tbl target cur_tbl move = - let - Table ini_nl ini_il _ ini_plc = ini_tbl - tmp_resu = applyMove ini_nl target move - in - case tmp_resu of - OpFail _ -> cur_tbl - OpGood (upd_nl, new_inst, pri_idx, sec_idx) -> - let tgt_idx = Instance.idx target - upd_cvar = compCV upd_nl - upd_il = Container.add tgt_idx new_inst ini_il - upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc - upd_tbl = Table upd_nl upd_il upd_cvar upd_plc - in - compareTables cur_tbl upd_tbl + let Table ini_nl ini_il _ ini_plc = ini_tbl + tmp_resu = applyMove ini_nl target move + in case tmp_resu of + OpFail _ -> cur_tbl + OpGood (upd_nl, new_inst, pri_idx, sec_idx) -> + let tgt_idx = Instance.idx target + upd_cvar = compCV upd_nl + upd_il = Container.add tgt_idx new_inst ini_il + upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc + upd_tbl = Table upd_nl upd_il upd_cvar upd_plc + in compareTables cur_tbl upd_tbl -- | Given the status of the current secondary as a valid new node and -- the current candidate target node, generate the possible moves for -- an instance. possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node + -> Bool -- ^ Whether we can change the primary node -> Ndx -- ^ Target node candidate -> [IMove] -- ^ List of valid result moves -possibleMoves True tdx = - [ReplaceSecondary tdx, - ReplaceAndFailover tdx, - ReplacePrimary tdx, - FailoverAndReplace tdx] -possibleMoves False tdx = - [ReplaceSecondary tdx, - ReplaceAndFailover tdx] +possibleMoves _ False tdx = + [ReplaceSecondary tdx] + +possibleMoves True True tdx = + [ ReplaceSecondary tdx + , ReplaceAndFailover tdx + , ReplacePrimary tdx + , FailoverAndReplace tdx + ] + +possibleMoves False True tdx = + [ ReplaceSecondary tdx + , ReplaceAndFailover tdx + ] -- | Compute the best move for a given instance.
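checkInstanceMove below folds checkSingleStep over the candidates generated by possibleMoves above; the new second flag stops any move that would change the primary node, leaving only ReplaceSecondary. A compact standalone enumeration of that decision table, with simplified move constructors:

-- Simplified mirrors of the IMove constructors.
data Move = RSec Int | RAndF Int | RPri Int | FAndR Int deriving Show

-- (secondary usable as new node, instance moves allowed) -> candidates,
-- the possibleMoves truth table.
moves :: Bool -> Bool -> Int -> [Move]
moves _     False tdx = [RSec tdx]
moves True  True  tdx = [RSec tdx, RAndF tdx, RPri tdx, FAndR tdx]
moves False True  tdx = [RSec tdx, RAndF tdx]

main :: IO ()
main = mapM_ print [moves sec im 7 | im <- [False, True], sec <- [False, True]]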
checkInstanceMove :: [Ndx] -- ^ Allowed target node indices -> Bool -- ^ Whether disk moves are allowed + -> Bool -- ^ Whether instance moves are allowed -> Table -- ^ Original table -> Instance.Instance -- ^ Instance to move -> Table -- ^ Best new table for this instance -checkInstanceMove nodes_idx disk_moves ini_tbl target = - let - opdx = Instance.pNode target - osdx = Instance.sNode target - nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx - use_secondary = elem osdx nodes_idx - aft_failover = if use_secondary -- if allowed to failover +checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target = + let opdx = Instance.pNode target + osdx = Instance.sNode target + bad_nodes = [opdx, osdx] + nodes = filter (`notElem` bad_nodes) nodes_idx + use_secondary = elem osdx nodes_idx && inst_moves + aft_failover = if use_secondary -- if allowed to failover then checkSingleStep ini_tbl target ini_tbl Failover else ini_tbl - all_moves = if disk_moves - then concatMap (possibleMoves use_secondary) nodes + all_moves = if disk_moves + then concatMap + (possibleMoves use_secondary inst_moves) nodes else [] in -- iterate over the possible nodes for this instance @@ -502,56 +552,58 @@ checkInstanceMove nodes_idx disk_moves ini_tbl target = -- | Compute the best next move. checkMove :: [Ndx] -- ^ Allowed target node indices -> Bool -- ^ Whether disk moves are allowed + -> Bool -- ^ Whether instance moves are allowed -> Table -- ^ The current solution -> [Instance.Instance] -- ^ List of instances still to move -> Table -- ^ The new solution -checkMove nodes_idx disk_moves ini_tbl victims = - let Table _ _ _ ini_plc = ini_tbl - -- we're using rwhnf from the Control.Parallel.Strategies - -- package; we don't need to use rnf as that would force too - -- much evaluation in single-threaded cases, and in - -- multi-threaded case the weak head normal form is enough to - -- spark the evaluation - tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves ini_tbl) - victims - -- iterate over all instances, computing the best move - best_tbl = foldl' compareTables ini_tbl tables - Table _ _ _ best_plc = best_tbl - in if length best_plc == length ini_plc +checkMove nodes_idx disk_moves inst_moves ini_tbl victims = + let Table _ _ _ ini_plc = ini_tbl + -- we're using rwhnf from the Control.Parallel.Strategies + -- package; we don't need to use rnf as that would force too + -- much evaluation in single-threaded cases, and in + -- multi-threaded case the weak head normal form is enough to + -- spark the evaluation + tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves + inst_moves ini_tbl) + victims + -- iterate over all instances, computing the best move + best_tbl = foldl' compareTables ini_tbl tables + Table _ _ _ best_plc = best_tbl + in if length best_plc == length ini_plc then ini_tbl -- no advancement else best_tbl --- | Check if we are allowed to go deeper in the balancing +-- | Check if we are allowed to go deeper in the balancing. doNextBalance :: Table -- ^ The starting table -> Int -- ^ Remaining length -> Score -- ^ Score at which to stop -> Bool -- ^ The resulting table and commands doNextBalance ini_tbl max_rounds min_score = - let Table _ _ ini_cv ini_plc = ini_tbl - ini_plc_len = length ini_plc - in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score + let Table _ _ ini_cv ini_plc = ini_tbl + ini_plc_len = length ini_plc + in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score --- | Run a balance move +-- | Run a balance move. 
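tryBalance below either returns Just a strictly better table or Nothing, and callers such as hbal keep calling it (guarded by doNextBalance) until no round improves the score. A generic sketch of that improve-until-fixpoint loop, with a toy score update in place of a real balancing step:

-- One balancing round: Just a better score, or Nothing to stop; this is
-- the contract tryBalance follows. The 20% gain per round is made up.
step :: Double -> Maybe Double
step cv =
  let cv' = cv * 0.8
  in if cv' < cv && cv > 1e-3 then Just cv' else Nothing

-- Iterate rounds until no further improvement, counting the rounds.
balance :: Double -> (Int, Double)
balance = go 0
  where go n cv = case step cv of
          Nothing  -> (n, cv)
          Just cv' -> go (n + 1) cv'

main :: IO ()
main = print (balance 1.0)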
tryBalance :: Table -- ^ The starting table -> Bool -- ^ Allow disk moves + -> Bool -- ^ Allow instance moves -> Bool -- ^ Only evacuate moves -> Score -- ^ Min gain threshold -> Score -- ^ Min gain -> Maybe Table -- ^ The resulting table and commands -tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain = +tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain = let Table ini_nl ini_il ini_cv _ = ini_tbl all_inst = Container.elems ini_il all_inst' = if evac_mode then let bad_nodes = map Node.idx . filter Node.offline $ Container.elems ini_nl - in filter (\e -> Instance.sNode e `elem` bad_nodes || - Instance.pNode e `elem` bad_nodes) + in filter (any (`elem` bad_nodes) . Instance.allNodes) all_inst else all_inst reloc_inst = filter Instance.movable all_inst' node_idx = map Node.idx . filter (not . Node.offline) $ Container.elems ini_nl - fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst + fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst (Table _ _ fin_cv _) = fin_tbl in if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain) @@ -560,65 +612,80 @@ tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain = -- * Allocation functions --- | Build failure stats out of a list of failures +-- | Build failure stats out of a list of failures. collapseFailures :: [FailMode] -> FailStats collapseFailures flst = - map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound] + map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst)) + [minBound..maxBound] + +-- | Compares two Maybe AllocElement and chooses the best score. +bestAllocElement :: Maybe Node.AllocElement + -> Maybe Node.AllocElement + -> Maybe Node.AllocElement +bestAllocElement a Nothing = a +bestAllocElement Nothing b = b +bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) = + if ascore < bscore then a else b -- | Update current Allocation solution and failure stats with new --- elements +-- elements. concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as } -concatAllocs as (OpGood ns@(_, _, _, nscore)) = - let -- Choose the old or new solution, based on the cluster score - cntok = asAllocs as - osols = asSolutions as - nsols = case osols of - [] -> [ns] - (_, _, _, oscore):[] -> - if oscore < nscore - then osols - else [ns] - -- FIXME: here we simply concat to lists with more - -- than one element; we should instead abort, since - -- this is not a valid usage of this function - xs -> ns:xs - nsuc = cntok + 1 +concatAllocs as (OpGood ns) = + let -- Choose the old or new solution, based on the cluster score + cntok = asAllocs as + osols = asSolution as + nsols = bestAllocElement osols (Just ns) + nsuc = cntok + 1 -- Note: we force evaluation of nsols here in order to keep the -- memory profile low - we know that we will need nsols for sure -- in the next cycle, so we force evaluation of nsols, since the -- foldl' in the caller will only evaluate the tuple, but not the -- elements of the tuple - in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols } + in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols } --- | Sums two allocation solutions (e.g. for two separate node groups). +-- | Sums two 'AllocSolution' structures.
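collapseFailures above now counts each failure mode with a strict foldl' rather than length . filter, and sumAllocs below merges two per-group solutions by concatenating failures and keeping the better of the two best elements. A standalone sketch of the counting part, with a hypothetical three-constructor FailMode:

import Data.List (foldl')

-- Hypothetical failure modes; Enum/Bounded give us [minBound..maxBound].
data FailMode = FailMem | FailDisk | FailCPU
  deriving (Show, Eq, Enum, Bounded)

-- One strict pass per mode, the shape of the new collapseFailures.
collapse :: [FailMode] -> [(FailMode, Int)]
collapse flst =
  map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
      [minBound .. maxBound]

main :: IO ()
main = print (collapse [FailMem, FailDisk, FailMem])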
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution -sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) = - AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl) - --- | Given a solution, generates a reasonable description for it +sumAllocs (AllocSolution aFails aAllocs aSols aLog) + (AllocSolution bFails bAllocs bSols bLog) = + -- note: we add b first, since usually it will be smaller; when + -- fold'ing, a will grow and grow whereas b is the per-group + -- result, hence smaller + let nFails = bFails ++ aFails + nAllocs = aAllocs + bAllocs + nSols = bestAllocElement aSols bSols + nLog = bLog ++ aLog + in AllocSolution nFails nAllocs nSols nLog + +-- | Given a solution, generates a reasonable description for it. describeSolution :: AllocSolution -> String describeSolution as = let fcnt = asFailures as - sols = asSolutions as + sols = asSolution as freasons = intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) . filter ((> 0) . snd) . collapseFailures $ fcnt - in if null sols - then "No valid allocation solutions, failure reasons: " ++ - (if null fcnt - then "unknown reasons" - else freasons) - else let (_, _, nodes, cv) = head sols - in printf ("score: %.8f, successes %d, failures %d (%s)" ++ - " for node(s) %s") cv (asAllocs as) (length fcnt) freasons - (intercalate "/" . map Node.name $ nodes) - --- | Annotates a solution with the appropriate string + in case sols of + Nothing -> "No valid allocation solutions, failure reasons: " ++ + (if null fcnt then "unknown reasons" else freasons) + Just (_, _, nodes, cv) -> + printf ("score: %.8f, successes %d, failures %d (%s)" ++ + " for node(s) %s") cv (asAllocs as) (length fcnt) freasons + (intercalate "/" . map Node.name $ nodes) + +-- | Annotates a solution with the appropriate string. annotateSolution :: AllocSolution -> AllocSolution annotateSolution as = as { asLog = describeSolution as : asLog as } +-- | Reverses an evacuation solution. +-- +-- Rationale: we always concat the results to the top of the lists, so +-- for proper jobset execution, we should reverse all lists. +reverseEvacSolution :: EvacSolution -> EvacSolution +reverseEvacSolution (EvacSolution f m o) = + EvacSolution (reverse f) (reverse m) (reverse o) + -- | Generate the valid node allocation singles or pairs for a new instance. genAllocNodes :: Group.List -- ^ Group list -> Node.List -- ^ The node map @@ -627,18 +694,20 @@ genAllocNodes :: Group.List -- ^ Group list -- unallocable nodes -> Result AllocNodes -- ^ The (monadic) result genAllocNodes gl nl count drop_unalloc = - let filter_fn = if drop_unalloc - then filter ((/=) AllocUnallocable . Group.allocPolicy . - flip Container.find gl . Node.group) + let filter_fn = if drop_unalloc + then filter (Group.isAllocable . + flip Container.find gl . Node.group) else id - all_nodes = filter_fn $ getOnline nl - all_pairs = liftM2 (,) all_nodes all_nodes - ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y && - Node.group x == Node.group y) all_pairs - in case count of - 1 -> Ok (Left (map Node.idx all_nodes)) - 2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs)) - _ -> Bad "Unsupported number of nodes, only one or two supported" + all_nodes = filter_fn $ getOnline nl + all_pairs = [(Node.idx p, + [Node.idx s | s <- all_nodes, + Node.idx p /= Node.idx s, + Node.group p == Node.group s]) | + p <- all_nodes] + in case count of + 1 -> Ok (Left (map Node.idx all_nodes)) + 2 -> Ok (Right (filter (not . null . 
snd) all_pairs)) + _ -> Bad "Unsupported number of nodes, only one or two supported" -- | Try to allocate an instance on the cluster. tryAlloc :: (Monad m) => @@ -647,24 +716,24 @@ tryAlloc :: (Monad m) => -> Instance.Instance -- ^ The instance to allocate -> AllocNodes -- ^ The allocation targets -> m AllocSolution -- ^ Possible solution list +tryAlloc _ _ _ (Right []) = fail "Not enough online nodes" tryAlloc nl _ inst (Right ok_pairs) = - let sols = foldl' (\cstate (p, s) -> - concatAllocs cstate $ allocateOnPair nl inst p s - ) emptySolution ok_pairs - - in if null ok_pairs -- means we have just one node - then fail "Not enough online nodes" - else return $ annotateSolution sols - + let psols = parMap rwhnf (\(p, ss) -> + foldl' (\cstate -> + concatAllocs cstate . + allocateOnPair nl inst p) + emptyAllocSolution ss) ok_pairs + sols = foldl' sumAllocs emptyAllocSolution psols + in return $ annotateSolution sols + +tryAlloc _ _ _ (Left []) = fail "No online nodes" tryAlloc nl _ inst (Left all_nodes) = - let sols = foldl' (\cstate -> - concatAllocs cstate . allocateOnSingle nl inst - ) emptySolution all_nodes - in if null all_nodes - then fail "No online nodes" - else return $ annotateSolution sols - --- | Given a group/result, describe it as a nice (list of) messages + let sols = foldl' (\cstate -> + concatAllocs cstate . allocateOnSingle nl inst + ) emptyAllocSolution all_nodes + in return $ annotateSolution sols + +-- | Given a group/result, describe it as a nice (list of) messages. solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String] solutionDescription gl (groupId, result) = case result of @@ -672,225 +741,452 @@ solutionDescription gl (groupId, result) = Bad message -> [printf "Group %s: error %s" gname message] where grp = Container.find groupId gl gname = Group.name grp - pol = apolToString (Group.allocPolicy grp) + pol = allocPolicyToRaw (Group.allocPolicy grp) -- | From a list of possibly bad and possibly empty solutions, filter --- only the groups with a valid result +-- only the groups with a valid result. Note that the result will be +-- reversed compared to the original list. filterMGResults :: Group.List -> [(Gdx, Result AllocSolution)] -> [(Gdx, AllocSolution)] -filterMGResults gl= - filter ((/= AllocUnallocable) . Group.allocPolicy . - flip Container.find gl . fst) . - filter (not . null . asSolutions . snd) . - map (\(y, Ok x) -> (y, x)) . - filter (isOk . snd) - --- | Sort multigroup results based on policy and score +filterMGResults gl = foldl' fn [] + where unallocable = not . Group.isAllocable . flip Container.find gl + fn accu (gdx, rasol) = + case rasol of + Bad _ -> accu + Ok sol | isNothing (asSolution sol) -> accu + | unallocable gdx -> accu + | otherwise -> (gdx, sol):accu + +-- | Sort multigroup results based on policy and score. sortMGResults :: Group.List -> [(Gdx, AllocSolution)] -> [(Gdx, AllocSolution)] sortMGResults gl sols = - let extractScore (_, _, _, x) = x - solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl), - (extractScore . head . asSolutions) sol) - in sortBy (comparing solScore) sols + let extractScore (_, _, _, x) = x + solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl), + (extractScore . fromJust . asSolution) sol) + in sortBy (comparing solScore) sols --- | Try to allocate an instance on a multi-group cluster. 
-tryMGAlloc :: Group.List -- ^ The group list - -> Node.List -- ^ The node list - -> Instance.List -- ^ The instance list - -> Instance.Instance -- ^ The instance to allocate - -> Int -- ^ Required number of nodes - -> Result AllocSolution -- ^ Possible solution list -tryMGAlloc mggl mgnl mgil inst cnt = +-- | Finds the best group for an instance on a multi-group cluster. +-- +-- Only solutions in @preferred@ and @last_resort@ groups will be +-- accepted as valid, and additionally if the allowed groups parameter +-- is not null then allocation will only be run for those group +-- indices. +findBestAllocGroup :: Group.List -- ^ The group list + -> Node.List -- ^ The node list + -> Instance.List -- ^ The instance list + -> Maybe [Gdx] -- ^ The allowed groups + -> Instance.Instance -- ^ The instance to allocate + -> Int -- ^ Required number of nodes + -> Result (Gdx, AllocSolution, [String]) +findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt = let groups = splitCluster mgnl mgil + groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups) + allowed_gdxs sols = map (\(gid, (nl, il)) -> (gid, genAllocNodes mggl nl cnt False >>= tryAlloc nl il inst)) - groups::[(Gdx, Result AllocSolution)] + groups'::[(Gdx, Result AllocSolution)] all_msgs = concatMap (solutionDescription mggl) sols goodSols = filterMGResults mggl sols sortedSols = sortMGResults mggl goodSols in if null sortedSols - then Bad $ intercalate ", " all_msgs - else let (final_group, final_sol) = head sortedSols - final_name = Group.name $ Container.find final_group mggl - selmsg = "Selected group: " ++ final_name - in Ok $ final_sol { asLog = selmsg:all_msgs } - --- | Try to relocate an instance on the cluster. -tryReloc :: (Monad m) => - Node.List -- ^ The node list - -> Instance.List -- ^ The instance list - -> Idx -- ^ The index of the instance to move - -> Int -- ^ The number of nodes required - -> [Ndx] -- ^ Nodes which should not be used - -> m AllocSolution -- ^ Solution list -tryReloc nl il xid 1 ex_idx = - let all_nodes = getOnline nl - inst = Container.find xid il - ex_idx' = Instance.pNode inst:ex_idx - valid_nodes = filter (not . flip elem ex_idx' . 
Node.idx) all_nodes - valid_idxes = map Node.idx valid_nodes - sols1 = foldl' (\cstate x -> - let em = do - (mnl, i, _, _) <- - applyMove nl inst (ReplaceSecondary x) - return (mnl, i, [Container.find x mnl], - compCV mnl) - in concatAllocs cstate em - ) emptySolution valid_idxes - in return sols1 - -tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \ - \destinations required (" ++ show reqn ++ - "), only one supported" - -tryMGReloc :: (Monad m) => - Group.List -- ^ The group list - -> Node.List -- ^ The node list - -> Instance.List -- ^ The instance list - -> Idx -- ^ The index of the instance to move - -> Int -- ^ The number of nodes required - -> [Ndx] -- ^ Nodes which should not be used - -> m AllocSolution -- ^ Solution list -tryMGReloc _ mgnl mgil xid ncount ex_ndx = do - let groups = splitCluster mgnl mgil - -- TODO: we only relocate inside the group for now - inst = Container.find xid mgil - (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of - Nothing -> fail $ "Cannot find group for instance " ++ - Instance.name inst - Just v -> return v - tryReloc nl il xid ncount ex_ndx - --- | Change an instance's secondary node -evacInstance :: (Monad m) => - [Ndx] -- ^ Excluded nodes - -> Instance.List -- ^ The current instance list - -> (Node.List, AllocSolution) -- ^ The current state - -> Idx -- ^ The instance to evacuate - -> m (Node.List, AllocSolution) -evacInstance ex_ndx il (nl, old_as) idx = do - -- FIXME: hardcoded one node here - - -- Longer explanation: evacuation is currently hardcoded to DRBD - -- instances (which have one secondary); hence, even if the - -- IAllocator protocol can request N nodes for an instance, and all - -- the message parsing/loading pass this, this implementation only - -- supports one; this situation needs to be revisited if we ever - -- support more than one secondary, or if we change the storage - -- model - new_as <- tryReloc nl il idx 1 ex_ndx - case asSolutions new_as of - -- an individual relocation succeeded, we kind of compose the data - -- from the two solutions - csol@(nl', _, _, _):_ -> - return (nl', new_as { asSolutions = csol:asSolutions old_as }) - -- this relocation failed, so we fail the entire evac - _ -> fail $ "Can't evacuate instance " ++ - Instance.name (Container.find idx il) ++ - ": " ++ describeSolution new_as - --- | Try to evacuate a list of nodes. -tryEvac :: (Monad m) => - Node.List -- ^ The node list - -> Instance.List -- ^ The instance list - -> [Idx] -- ^ Instances to be evacuated - -> [Ndx] -- ^ Restricted nodes (the ones being evacuated) - -> m AllocSolution -- ^ Solution list -tryEvac nl il idxs ex_ndx = do - (_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptySolution) idxs - return sol - --- | Multi-group evacuation of a list of nodes. -tryMGEvac :: (Monad m) => - Group.List -- ^ The group list - -> Node.List -- ^ The node list - -> Instance.List -- ^ The instance list - -> [Ndx] -- ^ Nodes to be evacuated - -> m AllocSolution -- ^ Solution list -tryMGEvac _ nl il ex_ndx = - let ex_nodes = map (`Container.find` nl) ex_ndx - all_insts = nub . 
concatMap Node.sList $ ex_nodes - gni = splitCluster nl il - -- we run the instance index list through a couple of maps to - -- get finally to a structure of the type [(group index, - -- [instance indices])] - all_insts' = map (\idx -> - (instancePriGroup nl (Container.find idx il), - idx)) all_insts - all_insts'' = groupBy ((==) `on` fst) all_insts' - all_insts3 = map (\xs -> let (gdxs, idxs) = unzip xs - in (head gdxs, idxs)) all_insts'' - in do - -- that done, we now add the per-group nl/il to the tuple - all_insts4 <- - mapM (\(gdx, idxs) -> - case lookup gdx gni of - Nothing -> fail $ "Can't find group index " ++ show gdx - Just (gnl, gil) -> return (gdx, gnl, gil, idxs)) - all_insts3 - results <- mapM (\(_, gnl, gil, idxs) -> tryEvac gnl gil idxs ex_ndx) - all_insts4 - let sol = foldl' sumAllocs emptySolution results - return $ annotateSolution sol - --- | Recursively place instances on the cluster until we're out of space -iterateAlloc :: Node.List - -> Instance.List - -> Instance.Instance - -> AllocNodes - -> [Instance.Instance] - -> [CStats] - -> Result AllocResult -iterateAlloc nl il newinst allocnodes ixes cstats = - let depth = length ixes - newname = printf "new-%d" depth::String - newidx = length (Container.elems il) + depth - newi2 = Instance.setIdx (Instance.setName newinst newname) newidx - in case tryAlloc nl il newi2 allocnodes of - Bad s -> Bad s - Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) -> - case sols3 of - [] -> Ok (collapseFailures errs, nl, il, ixes, cstats) - (xnl, xi, _, _):[] -> - iterateAlloc xnl (Container.add newidx xi il) - newinst allocnodes (xi:ixes) - (totalResources xnl:cstats) - _ -> Bad "Internal error: multiple solutions for single\ - \ allocation" - --- | The core of the tiered allocation mode -tieredAlloc :: Node.List - -> Instance.List - -> Instance.Instance - -> AllocNodes - -> [Instance.Instance] - -> [CStats] - -> Result AllocResult -tieredAlloc nl il newinst allocnodes ixes cstats = - case iterateAlloc nl il newinst allocnodes ixes cstats of - Bad s -> Bad s - Ok (errs, nl', il', ixes', cstats') -> + then if null groups' + then Bad $ "no groups for evacuation: allowed groups was" ++ + show allowed_gdxs ++ ", all groups: " ++ + show (map fst groups) + else Bad $ intercalate ", " all_msgs + else let (final_group, final_sol) = head sortedSols + in return (final_group, final_sol, all_msgs) + +-- | Try to allocate an instance on a multi-group cluster. +tryMGAlloc :: Group.List -- ^ The group list + -> Node.List -- ^ The node list + -> Instance.List -- ^ The instance list + -> Instance.Instance -- ^ The instance to allocate + -> Int -- ^ Required number of nodes + -> Result AllocSolution -- ^ Possible solution list +tryMGAlloc mggl mgnl mgil inst cnt = do + (best_group, solution, all_msgs) <- + findBestAllocGroup mggl mgnl mgil Nothing inst cnt + let group_name = Group.name $ Container.find best_group mggl + selmsg = "Selected group: " ++ group_name + return $ solution { asLog = selmsg:all_msgs } + +-- | Function which fails if the requested mode is change secondary. +-- +-- This is useful since except DRBD, no other disk template can +-- execute change secondary; thus, we can just call this function +-- instead of always checking for secondary mode. After the call to +-- this function, whatever mode we have is just a primary change. 
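failOnSecondaryChange below rejects ChangeSecondary for any disk template that has no secondary, and nodeEvacInstance then dispatches on Instance.diskTemplate via record patterns, failing for templates whose relocation is unimplemented and doing real work only for DRBD8. A reduced Either-based sketch of that guard-then-dispatch pattern, with a hypothetical three-template type:

data DiskTemplate = Diskless | Plain | Drbd8 deriving Show
data EvacMode = ChangePrimary | ChangeSecondary | ChangeAll deriving Show

-- Only DRBD has a secondary, so ChangeSecondary fails for the rest.
failOnSecondaryChange :: EvacMode -> DiskTemplate -> Either String ()
failOnSecondaryChange ChangeSecondary dt =
  Left ("Instances with disk template " ++ show dt ++
        " can't execute change secondary")
failOnSecondaryChange _ _ = Right ()

-- Dispatch on the template; the DRBD8 case skips the guard because it
-- supports all three modes, as in nodeEvacInstance.
evac :: EvacMode -> DiskTemplate -> Either String String
evac mode Drbd8 = Right ("relocating a DRBD8 instance in mode " ++ show mode)
evac mode dt    = failOnSecondaryChange mode dt >>
                  Left (show dt ++ " relocations not implemented yet")

main :: IO ()
main = mapM_ print [evac ChangeSecondary Plain, evac ChangeAll Drbd8]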
+failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m () +failOnSecondaryChange ChangeSecondary dt = + fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++ + "' can't execute change secondary" +failOnSecondaryChange _ _ = return () + +-- | Run evacuation for a single instance. +-- +-- /Note:/ this function should correctly execute both intra-group +-- evacuations (in all modes) and inter-group evacuations (in the +-- 'ChangeAll' mode). Of course, this requires that the correct list +-- of target nodes is passed. +nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide) + -> Instance.List -- ^ Instance list (cluster-wide) + -> EvacMode -- ^ The evacuation mode + -> Instance.Instance -- ^ The instance to be evacuated + -> Gdx -- ^ The group we're targeting + -> [Ndx] -- ^ The list of available nodes + -- for allocation + -> Result (Node.List, Instance.List, [OpCodes.OpCode]) +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTDiskless}) _ _ = + failOnSecondaryChange mode dt >> + fail "Diskless relocations not implemented yet" + +nodeEvacInstance _ _ _ (Instance.Instance + {Instance.diskTemplate = DTPlain}) _ _ = + fail "Instances of type plain cannot be relocated" + +nodeEvacInstance _ _ _ (Instance.Instance + {Instance.diskTemplate = DTFile}) _ _ = + fail "Instances of type file cannot be relocated" + +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTSharedFile}) _ _ = + failOnSecondaryChange mode dt >> + fail "Shared file relocations not implemented yet" + +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTBlock}) _ _ = + failOnSecondaryChange mode dt >> + fail "Block device relocations not implemented yet" + +nodeEvacInstance _ _ mode (Instance.Instance + {Instance.diskTemplate = dt@DTRbd}) _ _ = + failOnSecondaryChange mode dt >> + fail "Rbd relocations not implemented yet" + +nodeEvacInstance nl il ChangePrimary + inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) + _ _ = + do + (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover + let idx = Instance.idx inst + il' = Container.add idx inst' il + ops = iMoveToJob nl' il' idx Failover + return (nl', il', ops) + +nodeEvacInstance nl il ChangeSecondary + inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) + gdx avail_nodes = + do + (nl', inst', _, ndx) <- annotateResult "Can't find any good node" $ + eitherToResult $ + foldl' (evacDrbdSecondaryInner nl inst gdx) + (Left "no nodes available") avail_nodes + let idx = Instance.idx inst + il' = Container.add idx inst' il + ops = iMoveToJob nl' il' idx (ReplaceSecondary ndx) + return (nl', il', ops) + +-- The algorithm for ChangeAll is as follows: +-- +-- * generate all (primary, secondary) node pairs for the target groups +-- * for each pair, execute the needed moves (r:s, f, r:s) and compute +-- the final node list state and group score +-- * select the best choice via a foldl that uses the same Either +-- String solution as the ChangeSecondary mode +nodeEvacInstance nl il ChangeAll + inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) + gdx avail_nodes = + do + let no_nodes = Left "no nodes available" + node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s] + (nl', il', ops, _) <- + annotateResult "Can't find any good nodes for relocation" $ + eitherToResult $ + foldl' + (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of + Bad msg -> + case accu of + Right _ -> accu + -- we don't need more details (which + -- nodes, etc.)
as we only selected + this group if we can allocate on + it, hence failures will not + propagate out of this fold loop + Left _ -> Left $ "Allocation failed: " ++ msg + Ok result@(_, _, _, new_cv) -> + let new_accu = Right result in + case accu of + Left _ -> new_accu + Right (_, _, _, old_cv) -> + if old_cv < new_cv + then accu + else new_accu + ) no_nodes node_pairs + + return (nl', il', ops) + +-- | Inner fold function for changing secondary of a DRBD instance. +-- +-- The running solution is either a @Left String@, which means we +-- don't yet have a working solution, or a @Right (...)@, which +-- represents a valid solution; it holds the modified node list, the +-- modified instance (after evacuation), the score of that solution, +-- and the new secondary node index. +evacDrbdSecondaryInner :: Node.List -- ^ Cluster node list + -> Instance.Instance -- ^ Instance being evacuated + -> Gdx -- ^ The group index of the instance + -> Either String ( Node.List + , Instance.Instance + , Score + , Ndx) -- ^ Current best solution + -> Ndx -- ^ Node we're evaluating as new secondary + -> Either String ( Node.List + , Instance.Instance + , Score + , Ndx) -- ^ New best solution +evacDrbdSecondaryInner nl inst gdx accu ndx = + case applyMove nl inst (ReplaceSecondary ndx) of + OpFail fm -> + case accu of + Right _ -> accu + Left _ -> Left $ "Node " ++ Container.nameOf nl ndx ++ + " failed: " ++ show fm + OpGood (nl', inst', _, _) -> + let nodes = Container.elems nl' + -- The fromJust below is ugly (it can fail nastily), but + -- at this point we should not have any internal mismatches, + -- and adding a monad here would be quite involved + grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes) + new_cv = compCVNodes grpnodes + new_accu = Right (nl', inst', new_cv, ndx) + in case accu of + Left _ -> new_accu + Right (_, _, old_cv, _) -> + if old_cv < new_cv + then accu + else new_accu + +-- | Compute result of changing all nodes of a DRBD instance. +-- +-- Given the target primary and secondary node (which might be in a +-- different group or not), this function will 'execute' all the +-- required steps and assuming all operations succeed, will return +-- the modified node and instance lists, the opcodes needed for this +-- and the new group score.
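evacDrbdAllInner below strings together up to four fallible moves (an optional initial failover, then r:s, f, r:s) in the Result monad, annotating each step's failure and accumulating the executed moves front-to-back before a final reverse. A reduced Either-based sketch of that sequencing, with a toy state and a fake applyMove that always fails for node 0:

-- Hypothetical move log entries.
data Move = Failover | ReplaceSec Int deriving Show

-- Prefix an error with context, like annotateResult.
annotate :: String -> Either String a -> Either String a
annotate msg (Left e) = Left (msg ++ ": " ++ e)
annotate _   r        = r

-- Fake applyMove over an Int "cluster state"; node 0 is assumed full.
apply :: Int -> Move -> Either String Int
apply _  (ReplaceSec 0) = Left "node 0 is full"
apply st _              = Right (st + 1)

-- The r:s, f, r:s chain; the first failure aborts the whole sequence.
changeAll :: Int -> Int -> Int -> Either String (Int, [Move])
changeAll st0 t_pdx t_sdx = do
  st1 <- annotate "Changing secondary to new primary"
           (apply st0 (ReplaceSec t_pdx))
  st2 <- annotate "Failing over to new primary" (apply st1 Failover)
  st3 <- annotate "Changing secondary to final secondary"
           (apply st2 (ReplaceSec t_sdx))
  return (st3, [ReplaceSec t_pdx, Failover, ReplaceSec t_sdx])

main :: IO ()
main = do
  print (changeAll 0 2 3)
  print (changeAll 0 0 3)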
+-- | Compute result of changing all nodes of a DRBD instance.
+--
+-- Given the target primary and secondary node (which might be in a
+-- different group or not), this function will 'execute' all the
+-- required steps and, assuming all operations succeed, will return
+-- the modified node and instance lists, the opcodes needed for this
+-- and the new group score.
+evacDrbdAllInner :: Node.List -- ^ Cluster node list
+                 -> Instance.List -- ^ Cluster instance list
+                 -> Instance.Instance -- ^ The instance to be moved
+                 -> Gdx -- ^ The target group index
+                        -- (which can differ from the
+                        -- current group of the
+                        -- instance)
+                 -> (Ndx, Ndx) -- ^ Tuple of new
+                               -- primary\/secondary nodes
+                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
+evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
+  let primary = Container.find (Instance.pNode inst) nl
+      idx = Instance.idx inst
+  -- if the primary is offline, then we first failover
+  (nl1, inst1, ops1) <-
+    if Node.offline primary
+      then do
+        (nl', inst', _, _) <-
+          annotateResult "Failing over to the secondary" $
+          opToResult $ applyMove nl inst Failover
+        return (nl', inst', [Failover])
+      else return (nl, inst, [])
+  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
+                      Failover,
+                      ReplaceSecondary t_sdx)
+  -- we now need to execute a replace secondary to the future
+  -- primary node
+  (nl2, inst2, _, _) <-
+    annotateResult "Changing secondary to new primary" $
+    opToResult $
+    applyMove nl1 inst1 o1
+  let ops2 = o1:ops1
+  -- we now execute another failover, the primary stays fixed now
+  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" $
+                        opToResult $ applyMove nl2 inst2 o2
+  let ops3 = o2:ops2
+  -- and finally another replace secondary, to the final secondary
+  (nl4, inst4, _, _) <-
+    annotateResult "Changing secondary to final secondary" $
+    opToResult $
+    applyMove nl3 inst3 o3
+  let ops4 = o3:ops3
+      il' = Container.add idx inst4 il
+      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
+  let nodes = Container.elems nl4
+      -- The fromJust below is ugly (it can fail nastily), but
+      -- at this point we should not have any internal mismatches,
+      -- and adding a monad here would be quite involved
+      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
+      new_cv = compCVNodes grpnodes
+  return (nl4, il', ops, new_cv)
+
+-- | Computes the nodes in a given group which are available for
+-- allocation.
+availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
+                    -> IntSet.IntSet  -- ^ Nodes that are excluded
+                    -> Gdx            -- ^ The group for which we
+                                      -- query the nodes
+                    -> Result [Ndx]   -- ^ List of available node indices
+availableGroupNodes group_nodes excl_ndx gdx = do
+  local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx)
+                 Ok (lookup gdx group_nodes)
+  let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes
+  return avail_nodes
+
+-- | Updates the evac solution with the results of an instance
+-- evacuation.
+updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
+                   -> Idx
+                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
+                   -> (Node.List, Instance.List, EvacSolution)
+updateEvacSolution (nl, il, es) idx (Bad msg) =
+  (nl, il, es { esFailed = (idx, msg):esFailed es})
+updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
+  (nl, il, es { esMoved = new_elem:esMoved es
+              , esOpCodes = opcodes:esOpCodes es })
+  where inst = Container.find idx il
+        new_elem = (idx,
+                    instancePriGroup nl inst,
+                    Instance.allNodes inst)
+
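-- Illustration (not from this patch): the move sequence performed by
-- evacDrbdAllInner above, written out as plain data. 'primaryOffline'
-- selects the extra initial failover; t_pdx and t_sdx are the target
-- primary/secondary indices. A standalone sketch with stand-in types:

data MoveStep = StepFailover | StepReplaceSecondary Int
  deriving (Show)

changeAllSteps :: Bool -> Int -> Int -> [MoveStep]
changeAllSteps primaryOffline t_pdx t_sdx =
  (if primaryOffline then (StepFailover :) else id)
    [ StepReplaceSecondary t_pdx -- the future primary becomes our secondary
    , StepFailover               -- swap roles onto the new primary
    , StepReplaceSecondary t_sdx -- attach the final secondary
    ]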
+-- | Node-evacuation IAllocator mode main function.
+tryNodeEvac :: Group.List    -- ^ The cluster groups
+            -> Node.List     -- ^ The node list (cluster-wide, not per group)
+            -> Instance.List -- ^ Instance list (cluster-wide)
+            -> EvacMode      -- ^ The evacuation mode
+            -> [Idx]         -- ^ List of instance (indices) to be evacuated
+            -> Result (Node.List, Instance.List, EvacSolution)
+tryNodeEvac _ ini_nl ini_il mode idxs =
+  let evac_ndx = nodesToEvacuate ini_il mode idxs
+      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
+      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
+      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
+                                           (Container.elems nl))) $
+                  splitCluster ini_nl ini_il
+      (fin_nl, fin_il, esol) =
+        foldl' (\state@(nl, il, _) inst ->
+                  let gdx = instancePriGroup nl inst
+                      pdx = Instance.pNode inst in
+                  updateEvacSolution state (Instance.idx inst) $
+                  availableGroupNodes group_ndx
+                    (IntSet.insert pdx excl_ndx) gdx >>=
+                  nodeEvacInstance nl il mode inst gdx
+               )
+        (ini_nl, ini_il, emptyEvacSolution)
+        (map (`Container.find` ini_il) idxs)
+  in return (fin_nl, fin_il, reverseEvacSolution esol)
+
+-- | Change-group IAllocator mode main function.
+--
+-- This is very similar to 'tryNodeEvac'; the only difference is that
+-- we don't use the current instance group as the target group, but
+-- instead:
+--
+-- 1. at the start of the function, we compute the target groups:
+-- either no groups were passed in, in which case we choose all
+-- groups from which we are not evacuating instances, or some groups
+-- were passed in, in which case we use those
+--
+-- 2. for each instance, we use 'findBestAllocGroup' to choose the
+-- best group to hold the instance, and then we do what
+-- 'tryNodeEvac' does, except for this group instead of the current
+-- instance group.
+--
+-- Note that the correct behaviour of this function relies on the
+-- function 'nodeEvacInstance' to be able to do correctly both
+-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
+tryChangeGroup :: Group.List    -- ^ The cluster groups
+               -> Node.List     -- ^ The node list (cluster-wide)
+               -> Instance.List -- ^ Instance list (cluster-wide)
+               -> [Gdx]         -- ^ Target groups; if empty, any
+                                -- groups not being evacuated
+               -> [Idx]         -- ^ List of instance (indices) to be evacuated
+               -> Result (Node.List, Instance.List, EvacSolution)
+tryChangeGroup gl ini_nl ini_il gdxs idxs =
+  let evac_gdxs = nub $ map (instancePriGroup ini_nl .
+                             flip Container.find ini_il) idxs
+      target_gdxs = (if null gdxs
+                       then Container.keys gl
+                       else gdxs) \\ evac_gdxs
+      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
+      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
+      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
+                                           (Container.elems nl))) $
+                  splitCluster ini_nl ini_il
+      (fin_nl, fin_il, esol) =
+        foldl' (\state@(nl, il, _) inst ->
+                  let solution = do
+                        let ncnt = Instance.requiredNodes $
+                                   Instance.diskTemplate inst
+                        (gdx, _, _) <- findBestAllocGroup gl nl il
+                                       (Just target_gdxs) inst ncnt
+                        av_nodes <- availableGroupNodes group_ndx
+                                    excl_ndx gdx
+                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
+                  in updateEvacSolution state (Instance.idx inst) solution
+               )
+        (ini_nl, ini_il, emptyEvacSolution)
+        (map (`Container.find` ini_il) idxs)
+  in return (fin_nl, fin_il, reverseEvacSolution esol)
+
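-- Illustration (not from this patch): the exclusion set that
-- tryNodeEvac above computes before calling availableGroupNodes:
-- the nodes of the evacuated instances plus all offline nodes, with
-- the instance's own primary added per instance. A standalone sketch
-- over plain Ints:

import qualified Data.IntSet as IntSet
import Data.List (foldl')

excludedNodes :: [Int]         -- ^ output of nodesToEvacuate, as a list
              -> [Int]         -- ^ offline node indices
              -> Int           -- ^ the instance's current primary
              -> IntSet.IntSet -- ^ nodes unusable as evacuation targets
excludedNodes evac offline pdx =
  IntSet.insert pdx $
  foldl' (flip IntSet.insert) (IntSet.fromList evac) offline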
+-- | Standard-sized allocation method.
+--
+-- This places instances of the same size on the cluster until we're
+-- out of space. The result will be a list of identically-sized
+-- instances.
+iterateAlloc :: AllocMethod
+iterateAlloc nl il limit newinst allocnodes ixes cstats =
+  let depth = length ixes
+      newname = printf "new-%d" depth::String
+      newidx = Container.size il
+      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
+      newlimit = fmap (flip (-) 1) limit
+  in case tryAlloc nl il newi2 allocnodes of
+       Bad s -> Bad s
+       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
+         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
+         case sols3 of
+           Nothing -> newsol
+           Just (xnl, xi, _, _) ->
+             if limit == Just 0
+               then newsol
+               else iterateAlloc xnl (Container.add newidx xi il)
+                    newlimit newinst allocnodes (xi:ixes)
+                    (totalResources xnl:cstats)
+
+-- | Tiered allocation method.
+--
+-- This places instances on the cluster, and decreases the spec until
+-- we can allocate again. The result will be a list of decreasing
+-- instance specs.
+tieredAlloc :: AllocMethod
+tieredAlloc nl il limit newinst allocnodes ixes cstats =
+  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
+    Bad s -> Bad s
+    Ok (errs, nl', il', ixes', cstats') ->
+      let newsol = Ok (errs, nl', il', ixes', cstats')
+          ixes_cnt = length ixes'
+          (stop, newlimit) = case limit of
+                               Nothing -> (False, Nothing)
+                               Just n -> (n <= ixes_cnt,
+                                          Just (n - ixes_cnt)) in
+      if stop then newsol else
+          case Instance.shrinkByType newinst . fst . last $
+               sortBy (comparing snd) errs of
-            Bad _ -> Ok (errs, nl', il', ixes', cstats')
-            Ok newinst' ->
-                tieredAlloc nl' il' newinst' allocnodes ixes' cstats'
-
--- | Compute the tiered spec string description from a list of
--- allocated instances.
-tieredSpecMap :: [Instance.Instance]
-              -> [String]
-tieredSpecMap trl_ixes =
-    let fin_trl_ixes = reverse trl_ixes
-        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
-        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
-                   ix_byspec
-    in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
-            (rspecDsk spec) (rspecCpu spec) cnt) spec_map
+            Bad _ -> newsol
+            Ok newinst' -> tieredAlloc nl' il' newlimit
+                           newinst' allocnodes ixes' cstats'
 
 -- * Formatting functions
 
@@ -906,15 +1202,15 @@ computeMoves :: Instance.Instance -- ^ The instance to be moved
 -- secondary, while the command list holds gnt-instance
 -- commands (without that prefix), e.g \"@failover instance1@\"
 computeMoves i inam mv c d =
-    case mv of
-      Failover -> ("f", [mig])
-      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
-      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
-      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
-      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
-    where morf = if Instance.running i then "migrate" else "failover"
-          mig = printf "%s -f %s" morf inam::String
-          rep n = printf "replace-disks -n %s %s" n inam
+  case mv of
+    Failover -> ("f", [mig])
+    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
+    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
+    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
+    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
+  where morf = if Instance.instanceRunning i then "migrate" else "failover"
+        mig = printf "%s -f %s" morf inam::String
+        rep n = printf "replace-disks -n %s %s" n inam
 
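-- Illustration (not from this patch): a standalone restatement of the
-- computeMoves encoding above, with the running flag and names passed
-- in directly; as in printSolutionLine below, "c" is the new primary
-- name and "d" the new secondary name. Constructor names are made up:

data MoveKind = F | FR String | RS String | RF String | RP String

describeMove :: Bool -> String -> MoveKind -> (String, [String])
describeMove running inam mv =
  case mv of
    F    -> ("f", [mig])
    FR d -> ("f r:" ++ d, [mig, rep d])
    RS d -> ("r:" ++ d, [rep d])
    RF c -> ("r:" ++ c ++ " f", [rep c, mig])
    RP c -> ("f r:" ++ c ++ " f", [mig, rep c, mig])
  where morf = if running then "migrate" else "failover"
        mig = morf ++ " -f " ++ inam
        rep n = "replace-disks -n " ++ n ++ " " ++ inam

-- For example, describeMove True "inst1" (RP "node2") yields
-- ("f r:node2 f",
--  ["migrate -f inst1", "replace-disks -n node2 inst1", "migrate -f inst1"]).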
 -- | Converts a placement to string format.
 printSolutionLine :: Node.List -- ^ The node list
@@ -926,40 +1222,47 @@ printSolutionLine :: Node.List -- ^ The node list
                   -- the solution
                   -> (String, [String])
 printSolutionLine nl il nmlen imlen plc pos =
-    let
-      pmlen = (2*nmlen + 1)
-      (i, p, s, mv, c) = plc
-      inst = Container.find i il
-      inam = Instance.alias inst
-      npri = Node.alias $ Container.find p nl
-      nsec = Node.alias $ Container.find s nl
-      opri = Node.alias $ Container.find (Instance.pNode inst) nl
-      osec = Node.alias $ Container.find (Instance.sNode inst) nl
-      (moves, cmds) = computeMoves inst inam mv npri nsec
-      ostr = printf "%s:%s" opri osec::String
-      nstr = printf "%s:%s" npri nsec::String
-    in
-      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
-       pos imlen inam pmlen ostr
-       pmlen nstr c moves,
-       cmds)
+  let pmlen = (2*nmlen + 1)
+      (i, p, s, mv, c) = plc
+      inst = Container.find i il
+      inam = Instance.alias inst
+      npri = Node.alias $ Container.find p nl
+      nsec = Node.alias $ Container.find s nl
+      opri = Node.alias $ Container.find (Instance.pNode inst) nl
+      osec = Node.alias $ Container.find (Instance.sNode inst) nl
+      (moves, cmds) = computeMoves inst inam mv npri nsec
+      ostr = printf "%s:%s" opri osec::String
+      nstr = printf "%s:%s" npri nsec::String
+  in (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
+      pos imlen inam pmlen ostr
+      pmlen nstr c moves,
+      cmds)
 
 -- | Return the instance and involved nodes in an instance move.
-involvedNodes :: Instance.List -> Placement -> [Ndx]
+--
+-- Note that the output list length can vary, and is neither required
+-- nor guaranteed to be of any specific length.
+involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
+                               -- the instance from its index; note
+                               -- that this /must/ be the original
+                               -- instance list, so that we can
+                               -- retrieve the old nodes
+              -> Placement     -- ^ The placement we're investigating,
+                               -- containing the new nodes and
+                               -- instance index
+              -> [Ndx]         -- ^ Resulting list of node indices
 involvedNodes il plc =
-    let (i, np, ns, _, _) = plc
-        inst = Container.find i il
-        op = Instance.pNode inst
-        os = Instance.sNode inst
-    in nub [np, ns, op, os]
+  let (i, np, ns, _, _) = plc
+      inst = Container.find i il
+  in nub $ [np, ns] ++ Instance.allNodes inst
 
 -- | Inner function for splitJobs, that either appends the next job to
 -- the current jobset, or starts a new jobset.
 mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
 mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
 mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
-    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
-    | otherwise = ([n]:cjs, ndx)
+  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
+  | otherwise = ([n]:cjs, ndx)
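-- Illustration (not from this patch): the grouping performed by
-- mergeJobs above and splitJobs below, as a standalone sketch.
-- Consecutive jobs are packed into the current jobset while their
-- nodes stay disjoint from those already touched; the first overlap
-- opens a fresh jobset.

import Data.List (foldl', intersect)

groupDisjoint :: [(Int, [Int])]   -- ^ (job id, nodes it touches)
              -> [[(Int, [Int])]] -- ^ jobsets, in reverse order
groupDisjoint = fst . foldl' merge ([], [])
  where merge ([], _) j@(_, ns) = ([[j]], ns)
        merge (cjs@(cur:rest), nbuf) j@(_, ns)
          | null (ns `intersect` nbuf) = ((j:cur):rest, ns ++ nbuf)
          | otherwise                  = ([j]:cjs, ns)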
 
 -- | Break a list of moves into independent groups. Note that this
 -- will reverse the order of jobs.
@@ -970,11 +1273,11 @@ splitJobs = fst . foldl mergeJobs ([], [])
 -- also beautify the display a little.
 formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
 formatJob jsn jsl (sn, (_, _, _, cmds)) =
-    let out =
-            printf "  echo job %d/%d" jsn sn:
-            printf "  check":
-            map ("  gnt-instance " ++) cmds
-    in if sn == 1
+  let out =
+        printf "  echo job %d/%d" jsn sn:
+        printf "  check":
+        map ("  gnt-instance " ++) cmds
+  in if sn == 1
        then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
        else out
 
@@ -982,81 +1285,93 @@ formatJob jsn jsl (sn, (_, _, _, cmds)) =
 -- also beautify the display a little.
 formatCmds :: [JobSet] -> String
 formatCmds =
-    unlines .
-    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
-                             (zip [1..] js)) .
-    zip [1..]
+  unlines .
+  concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
+                           (zip [1..] js)) .
+  zip [1..]
 
 -- | Print the node list.
 printNodes :: Node.List -> [String] -> String
 printNodes nl fs =
-    let fields = case fs of
-          [] -> Node.defaultFields
-          "+":rest -> Node.defaultFields ++ rest
-          _ -> fs
-        snl = sortBy (comparing Node.idx) (Container.elems nl)
-        (header, isnum) = unzip $ map Node.showHeader fields
-    in unlines . map ((:) ' ' . intercalate " ") $
-       formatTable (header:map (Node.list fields) snl) isnum
+  let fields = case fs of
+                 [] -> Node.defaultFields
+                 "+":rest -> Node.defaultFields ++ rest
+                 _ -> fs
+      snl = sortBy (comparing Node.idx) (Container.elems nl)
+      (header, isnum) = unzip $ map Node.showHeader fields
+  in unlines . map ((:) ' ' . unwords) $
+     formatTable (header:map (Node.list fields) snl) isnum
 
 -- | Print the instance list.
 printInsts :: Node.List -> Instance.List -> String
 printInsts nl il =
-    let sil = sortBy (comparing Instance.idx) (Container.elems il)
-        helper inst = [ if Instance.running inst then "R" else " "
-                      , Instance.name inst
-                      , Container.nameOf nl (Instance.pNode inst)
-                      , let sdx = Instance.sNode inst
-                        in if sdx == Node.noSecondary
+  let sil = sortBy (comparing Instance.idx) (Container.elems il)
+      helper inst = [ if Instance.instanceRunning inst then "R" else " "
+                    , Instance.name inst
+                    , Container.nameOf nl (Instance.pNode inst)
+                    , let sdx = Instance.sNode inst
+                      in if sdx == Node.noSecondary
                            then ""
                            else Container.nameOf nl sdx
-                      , printf "%3d" $ Instance.vcpus inst
-                      , printf "%5d" $ Instance.mem inst
-                      , printf "%5d" $ Instance.dsk inst `div` 1024
-                      , printf "%5.3f" lC
-                      , printf "%5.3f" lM
-                      , printf "%5.3f" lD
-                      , printf "%5.3f" lN
-                      ]
-            where DynUtil lC lM lD lN = Instance.util inst
-        header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
-                 , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
-        isnum = False:False:False:False:repeat True
-    in unlines . map ((:) ' ' . intercalate " ") $
-       formatTable (header:map helper sil) isnum
+                    , if Instance.autoBalance inst then "Y" else "N"
+                    , printf "%3d" $ Instance.vcpus inst
+                    , printf "%5d" $ Instance.mem inst
+                    , printf "%5d" $ Instance.dsk inst `div` 1024
+                    , printf "%5.3f" lC
+                    , printf "%5.3f" lM
+                    , printf "%5.3f" lD
+                    , printf "%5.3f" lN
+                    ]
+        where DynUtil lC lM lD lN = Instance.util inst
+      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
+               , "vcpu", "mem", "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
+      isnum = False:False:False:False:False:repeat True
+  in unlines . map ((:) ' ' . unwords) $
+     formatTable (header:map helper sil) isnum
 
 -- | Shows statistics for a given node list.
-printStats :: Node.List -> String
-printStats nl =
-    let dcvs = compDetailedCV nl
-        (weights, names) = unzip detailedCVInfo
-        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
-        formatted = map (\(w, header, val) ->
-                             printf "%s=%.8f(x%.2f)" header val w::String) hd
-    in intercalate ", " formatted
+printStats :: String -> Node.List -> String
+printStats lp nl =
+  let dcvs = compDetailedCV $ Container.elems nl
+      (weights, names) = unzip detailedCVInfo
+      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
+      header = [ "Field", "Value", "Weight" ]
+      formatted = map (\(w, h, val) ->
+                         [ h
+                         , printf "%.8f" val
+                         , printf "x%.2f" w
+                         ]) hd
+  in unlines . map ((++) lp) . map ((:) ' ' . unwords) $
+     formatTable (header:formatted) $ False:repeat True
 
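-- Illustration (not from this patch): printNodes, printInsts and
-- printStats above all delegate column layout to formatTable from
-- Ganeti.HTools.Utils. A standalone analogue of that idea (the real
-- helper's signature may differ): numeric columns are right-aligned,
-- text columns left-aligned.

import Data.List (transpose)

alignColumns :: [[String]] -> [Bool] -> [[String]]
alignColumns rows isnum =
  let widths = map (maximum . map length) (transpose rows)
      pad num w s = let fill = replicate (w - length s) ' '
                    in if num then fill ++ s else s ++ fill
  in map (zipWith3 pad isnum widths) rows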
 -- | Convert a placement into a list of OpCodes (basically a job).
-iMoveToJob :: Node.List -> Instance.List
-          -> Idx -> IMove -> [OpCodes.OpCode]
+iMoveToJob :: Node.List        -- ^ The node list; only used for node
+                               -- names, so any version is good
+                               -- (before or after the operation)
+           -> Instance.List    -- ^ The instance list; also used for
+                               -- names only
+           -> Idx              -- ^ The index of the instance being
+                               -- moved
+           -> IMove            -- ^ The actual move to be described
+           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
+                               -- the given move
 iMoveToJob nl il idx move =
-    let inst = Container.find idx il
-        iname = Instance.name inst
-        lookNode = Just . Container.nameOf nl
-        opF = if Instance.running inst
-              then OpCodes.OpMigrateInstance iname True False
-              else OpCodes.OpFailoverInstance iname False
-        opR n = OpCodes.OpReplaceDisks iname (lookNode n)
-                OpCodes.ReplaceNewSecondary [] Nothing
-    in case move of
-         Failover -> [ opF ]
-         ReplacePrimary np -> [ opF, opR np, opF ]
-         ReplaceSecondary ns -> [ opR ns ]
-         ReplaceAndFailover np -> [ opR np, opF ]
-         FailoverAndReplace ns -> [ opF, opR ns ]
+  let inst = Container.find idx il
+      iname = Instance.name inst
+      lookNode = Just . Container.nameOf nl
+      opF = OpCodes.OpInstanceMigrate iname True False True Nothing
+      opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n)
+              OpCodes.ReplaceNewSecondary [] Nothing
+  in case move of
+       Failover -> [ opF ]
+       ReplacePrimary np -> [ opF, opR np, opF ]
+       ReplaceSecondary ns -> [ opR ns ]
+       ReplaceAndFailover np -> [ opR np, opF ]
+       FailoverAndReplace ns -> [ opF, opR ns ]
 
 -- * Node group functions
 
--- | Computes the group of an instance
+-- | Computes the group of an instance.
 instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
 instanceGroup nl i =
   let sidx = Instance.sNode i
@@ -1067,23 +1382,23 @@ instanceGroup nl i =
       pgroup = Node.group pnode
       sgroup = Node.group snode
   in if pgroup /= sgroup
-     then fail ("Instance placed accross two node groups, primary " ++
-                show pgroup ++ ", secondary " ++ show sgroup)
-     else return pgroup
+       then fail ("Instance placed across two node groups, primary " ++
+                  show pgroup ++ ", secondary " ++ show sgroup)
+       else return pgroup
 
--- | Computes the group of an instance per the primary node
+-- | Computes the group of an instance per the primary node.
 instancePriGroup :: Node.List -> Instance.Instance -> Gdx
 instancePriGroup nl i =
   let pnode = Container.find (Instance.pNode i) nl
   in  Node.group pnode
 
 -- | Compute the list of badly allocated instances (split across node
--- groups)
+-- groups).
 findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
 findSplitInstances nl =
   filter (not . isOk . instanceGroup nl) . Container.elems
 
--- | Splits a cluster into the component node groups
+-- | Splits a cluster into the component node groups.
 splitCluster :: Node.List -> Instance.List ->
                 [(Gdx, (Node.List, Instance.List))]
 splitCluster nl il =
@@ -1093,3 +1408,25 @@ splitCluster nl il =
           nodes' = zip nidxs nodes
           instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
       in (guuid, (Container.fromList nodes', instances))) ngroups
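-- Illustration (not from this patch): what splitCluster above
-- computes, reduced to plain lists. Nodes are partitioned by group,
-- and each group receives the instances whose primary node lives in
-- it. A standalone sketch:

import Data.List (nub)

splitToy :: [(Int, Int)]            -- ^ (node idx, group idx)
         -> [(Int, Int)]            -- ^ (instance idx, primary node idx)
         -> [(Int, ([Int], [Int]))] -- ^ group to (its nodes, its instances)
splitToy nodes insts =
  [ (g, (nidxs, [i | (i, p) <- insts, p `elem` nidxs]))
  | g <- nub (map snd nodes)
  , let nidxs = [n | (n, g') <- nodes, g' == g] ]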
+
+-- | Compute the list of nodes that are to be evacuated, given a list
+-- of instances and an evacuation mode.
+nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
+                -> EvacMode      -- ^ The evacuation mode we're using
+                -> [Idx]         -- ^ List of instance indices being evacuated
+                -> IntSet.IntSet -- ^ Set of node indices
+nodesToEvacuate il mode =
+  IntSet.delete Node.noSecondary .
+  foldl' (\ns idx ->
+            let i = Container.find idx il
+                pdx = Instance.pNode i
+                sdx = Instance.sNode i
+                dt = Instance.diskTemplate i
+                withSecondary = case dt of
+                                  DTDrbd8 -> IntSet.insert sdx ns
+                                  _ -> ns
+            in case mode of
+                 ChangePrimary -> IntSet.insert pdx ns
+                 ChangeSecondary -> withSecondary
+                 ChangeAll -> IntSet.insert pdx withSecondary
+         ) IntSet.empty
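-- Illustration (not from this patch): the per-mode contribution of a
-- single instance to the set built by nodesToEvacuate above;
-- 'hasDrbdSecondary' stands in for the DTDrbd8 disk template check.

data Mode = ModePrimary | ModeSecondary | ModeAll

evacNodesOf :: Mode -> Bool -> Int -> Int -> [Int]
evacNodesOf mode hasDrbdSecondary pdx sdx =
  case mode of
    ModePrimary   -> [pdx]
    ModeSecondary -> secondary
    ModeAll       -> pdx : secondary
  where secondary = [sdx | hasDrbdSecondary]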