root / src / Ganeti / HTools / Cluster.hs @ 94a420ed
History | View | Annotate | Download (70.9 kB)
1 |
{-| Implementation of cluster-wide logic. |
---|---|
2 |
|
3 |
This module holds all pure cluster-logic; I\/O related functionality |
4 |
goes into the /Main/ module for the individual binaries. |
5 |
|
6 |
-} |
7 |
|
8 |
{- |
9 |
|
10 |
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc. |
11 |
|
12 |
This program is free software; you can redistribute it and/or modify |
13 |
it under the terms of the GNU General Public License as published by |
14 |
the Free Software Foundation; either version 2 of the License, or |
15 |
(at your option) any later version. |
16 |
|
17 |
This program is distributed in the hope that it will be useful, but |
18 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
19 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
20 |
General Public License for more details. |
21 |
|
22 |
You should have received a copy of the GNU General Public License |
23 |
along with this program; if not, write to the Free Software |
24 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
25 |
02110-1301, USA. |
26 |
|
27 |
-} |
28 |
|
29 |
module Ganeti.HTools.Cluster |
30 |
( |
31 |
-- * Types |
32 |
AllocSolution(..) |
33 |
, EvacSolution(..) |
34 |
, Table(..) |
35 |
, CStats(..) |
36 |
, AllocNodes |
37 |
, AllocResult |
38 |
, AllocMethod |
39 |
, AllocSolutionList |
40 |
-- * Generic functions |
41 |
, totalResources |
42 |
, computeAllocationDelta |
43 |
-- * First phase functions |
44 |
, computeBadItems |
45 |
-- * Second phase functions |
46 |
, printSolutionLine |
47 |
, formatCmds |
48 |
, involvedNodes |
49 |
, getMoves |
50 |
, splitJobs |
51 |
-- * Display functions |
52 |
, printNodes |
53 |
, printInsts |
54 |
-- * Balancing functions |
55 |
, checkMove |
56 |
, doNextBalance |
57 |
, tryBalance |
58 |
, compCV |
59 |
, compCVNodes |
60 |
, compDetailedCV |
61 |
, printStats |
62 |
, iMoveToJob |
63 |
-- * IAllocator functions |
64 |
, genAllocNodes |
65 |
, tryAlloc |
66 |
, tryMGAlloc |
67 |
, tryNodeEvac |
68 |
, tryChangeGroup |
69 |
, collapseFailures |
70 |
, allocList |
71 |
-- * Allocation functions |
72 |
, iterateAlloc |
73 |
, tieredAlloc |
74 |
-- * Node group functions |
75 |
, instanceGroup |
76 |
, findSplitInstances |
77 |
, splitCluster |
78 |
) where |
79 |
|
80 |
import Control.Applicative (liftA2) |
81 |
import Control.Arrow ((&&&)) |
82 |
import qualified Data.IntSet as IntSet |
83 |
import Data.List |
84 |
import Data.Maybe (fromJust, fromMaybe, isJust, isNothing) |
85 |
import Data.Ord (comparing) |
86 |
import Text.Printf (printf) |
87 |
|
88 |
import Ganeti.BasicTypes |
89 |
import qualified Ganeti.HTools.Container as Container |
90 |
import qualified Ganeti.HTools.Instance as Instance |
91 |
import qualified Ganeti.HTools.Nic as Nic |
92 |
import qualified Ganeti.HTools.Node as Node |
93 |
import qualified Ganeti.HTools.Group as Group |
94 |
import Ganeti.HTools.Types |
95 |
import Ganeti.Compat |
96 |
import qualified Ganeti.OpCodes as OpCodes |
97 |
import Ganeti.Utils |
98 |
import Ganeti.Types (EvacMode(..), mkNonEmpty) |
99 |
|
100 |
-- * Types |
101 |
|
102 |
-- | Allocation\/relocation solution.
--
-- Accumulated over all candidate nodes tried during one allocation
-- attempt: failures are collected in 'asFailures' while the single
-- best-scoring placement (if any) is kept in 'asSolution'.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode] -- ^ Failure counts
  , asAllocs :: Int -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog :: [String] -- ^ Informational messages
  }
109 |
|
110 |
-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
--
-- Lists are built by prepending, so 'reverseEvacSolution' must be
-- applied before handing the result out.
data EvacSolution = EvacSolution
  { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
  , esFailed :: [(Idx, String)] -- ^ Instances which were not
                                -- relocated
  , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs
  } deriving (Show)
119 |
|
120 |
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
--
-- Components: failure statistics, final node list, final instance
-- list, the instances placed so far, and the running 'CStats' history.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])
123 |
|
124 |
-- | Type alias for easier handling.
--
-- Associates each requested instance with its computed
-- 'AllocSolution'.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
126 |
|
127 |
-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list. Values of this type are produced
-- by 'genAllocNodes' and consumed by 'tryAlloc'.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
135 |
|
136 |
-- | The empty solution we start with when computing allocations:
-- no failures, no successes, no chosen placement, no log messages.
emptyAllocSolution :: AllocSolution
emptyAllocSolution =
  AllocSolution { asFailures = []
                , asAllocs = 0
                , asSolution = Nothing
                , asLog = []
                }
140 |
|
141 |
-- | The empty evacuation solution: nothing moved, nothing failed,
-- no opcodes generated.
emptyEvacSolution :: EvacSolution
emptyEvacSolution =
  EvacSolution { esMoved = [], esFailed = [], esOpCodes = [] }
147 |
|
148 |
-- | The complete state for the balancing solution.
--
-- Bundles the node list, the instance list, the current cluster
-- score and the list of placements performed so far.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show)
151 |
|
152 |
-- | Cluster statistics data type.
--
-- Built incrementally via 'updateCStats' starting from 'emptyCStats';
-- see 'totalResources' for the per-cluster aggregation.
data CStats = CStats
  { csFmem :: Integer -- ^ Cluster free mem
  , csFdsk :: Integer -- ^ Cluster free disk
  , csFspn :: Integer -- ^ Cluster free spindles
  , csAmem :: Integer -- ^ Cluster allocatable mem
  , csAdsk :: Integer -- ^ Cluster allocatable disk
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
  , csMmem :: Integer -- ^ Max node allocatable mem
  , csMdsk :: Integer -- ^ Max node allocatable disk
  , csMcpu :: Integer -- ^ Max node allocatable cpu
  , csImem :: Integer -- ^ Instance used mem
  , csIdsk :: Integer -- ^ Instance used disk
  , csIspn :: Integer -- ^ Instance used spindles
  , csIcpu :: Integer -- ^ Instance used cpu
  , csTmem :: Double -- ^ Cluster total mem
  , csTdsk :: Double -- ^ Cluster total disk
  , csTspn :: Double -- ^ Cluster total spindles
  , csTcpu :: Double -- ^ Cluster total cpus
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
  , csNcpu :: Double -- ^ Equivalent to 'csIcpu' but in terms of
                     -- physical CPUs, i.e. normalised used phys CPUs
  , csXmem :: Integer -- ^ Unaccounted for mem
  , csNmem :: Integer -- ^ Node own memory
  , csScore :: Score -- ^ The cluster score
  , csNinst :: Int -- ^ The total number of instances
  } deriving (Show)
179 |
|
180 |
-- | A simple type for allocation functions.
--
-- Shared signature of the allocation drivers exported from this
-- module (see the \"Allocation functions\" export section).
type AllocMethod = Node.List -- ^ Node list
                -> Instance.List -- ^ Instance list
                -> Maybe Int -- ^ Optional allocation limit
                -> Instance.Instance -- ^ Instance spec for allocation
                -> AllocNodes -- ^ Which nodes we should allocate on
                -> [Instance.Instance] -- ^ Allocated instances
                -> [CStats] -- ^ Running cluster stats
                -> Result AllocResult -- ^ Allocation result
189 |
|
190 |
-- | A simple type for the running solution of evacuations.
--
-- Either an error message, or the updated state: new node list, the
-- relocated instance, the resulting cluster score and the chosen
-- target node index.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)
193 |
|
194 |
-- * Utility functions |
195 |
|
196 |
-- | Verifies the N+1 status and returns the nodes that fail it.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [ n | n <- nodes, Node.failN1 n ]
199 |
|
200 |
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes (deduplicated and sorted by index).

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      -- all instance indices hosted (as primary or secondary) on a
      -- failing node, each counted once
      bad_idxs = sort . nub $
                 concatMap (liftA2 (++) Node.sList Node.pList) bad_nodes
      bad_instances = map (`Container.find` il) bad_idxs
  in (bad_nodes, bad_instances)
216 |
|
217 |
-- | Extracts the node pairs for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  let pdx = Instance.pNode inst
      sdx = Instance.sNode inst
  in (pdx, sdx, Container.find pdx nl, Container.find sdx nl)
228 |
|
229 |
-- | Zero-initializer for the CStats type; every field starts at zero.
emptyCStats :: CStats
emptyCStats = CStats { csFmem = 0, csFdsk = 0, csFspn = 0
                     , csAmem = 0, csAdsk = 0, csAcpu = 0
                     , csMmem = 0, csMdsk = 0, csMcpu = 0
                     , csImem = 0, csIdsk = 0, csIspn = 0, csIcpu = 0
                     , csTmem = 0, csTdsk = 0, csTspn = 0, csTcpu = 0
                     , csVcpu = 0, csNcpu = 0
                     , csXmem = 0, csNmem = 0
                     , csScore = 0, csNinst = 0
                     }
232 |
|
233 |
-- | Update stats with data from a new node.
--
-- Folds one node's resources into the running 'CStats' accumulator:
-- free/allocatable/total amounts are summed, per-node maxima are
-- tracked via 'max', and instance counts grow by the node's primary
-- instance list length.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
               csVcpu = x_vcpu, csNcpu = x_ncpu,
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
               csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
             }
        = cs
      -- allocatable memory is free minus reserved, clamped at zero
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      -- instance-used memory: total minus node-own, unaccounted and free
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_ispn = Node.tSpindles node - Node.fSpindles node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
      -- used virtual CPUs normalised by the node's vcpu ratio policy
      inc_ncpu = fromIntegral (Node.uCpu node) /
                 iPolicyVcpuRatio (Node.iPolicy node)
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIspn = x_ispn + fromIntegral inc_ispn
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
        , csNcpu = x_ncpu + inc_ncpu
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
        }
281 |
|
282 |
-- | Compute the total free disk and memory in the cluster.
--
-- Strictly folds 'updateCStats' over all nodes, then records the
-- cluster score.
totalResources :: Node.List -> CStats
totalResources nl =
  let stats = foldl' updateCStats emptyCStats (Container.elems nl)
  in stats { csScore = compCV nl }
287 |
|
288 |
-- | Compute the delta between two cluster state.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
              csNcpu = i_ncpu, csIspn = i_ispn } = cini
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
              csNcpu = f_ncpu, csTcpu = f_tcpu,
              csIspn = f_ispn, csTspn = t_spn } = cfin
      -- resources in use in the initial state
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
                       , allocInfoNCpus = i_ncpu
                       , allocInfoMem = fromIntegral i_imem
                       , allocInfoDisk = fromIntegral i_idsk
                       , allocInfoSpn = fromIntegral i_ispn
                       }
      -- resources additionally consumed between initial and final state
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
                       , allocInfoNCpus = f_ncpu - i_ncpu
                       , allocInfoMem = fromIntegral (f_imem - i_imem)
                       , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
                       , allocInfoSpn = fromIntegral (f_ispn - i_ispn)
                       }
      -- resources still unallocated in the final state
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
                       , allocInfoNCpus = f_tcpu - f_ncpu
                       , allocInfoMem = truncate t_mem - fromIntegral f_imem
                       , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
                       , allocInfoSpn = truncate t_spn - fromIntegral f_ispn
                       }
  in (rini, rfin, runa)
321 |
|
322 |
-- | The names and weights of the individual elements in the CV list.
--
-- The order here must stay in sync with the metric order produced by
-- 'compDetailedCV'.
detailedCVInfo :: [(Double, String)]
detailedCVInfo =
  zip [ 1, 1, 1, 1, 4, 16, 1, 1, 1, 1, 1, 2, 1 ]
      [ "free_mem_cv"
      , "free_disk_cv"
      , "n1_cnt"
      , "reserved_mem_cv"
      , "offline_all_cnt"
      , "offline_pri_cnt"
      , "vcpu_ratio_cv"
      , "cpu_load_cv"
      , "mem_load_cv"
      , "disk_load_cv"
      , "net_load_cv"
      , "pri_tags_score"
      , "spindles_cv"
      ]
338 |
|
339 |
-- | Holds the weights used by 'compCVNodes' for each metric.
detailedCVWeights :: [Double]
detailedCVWeights = [ w | (w, _) <- detailedCVInfo ]
342 |
|
343 |
-- | Compute the mem and disk covariance.
--
-- Returns one value per metric; the order of the result list must
-- match 'detailedCVInfo', since 'compCVNodes' zips the two together.
-- Offline nodes are excluded from the covariance metrics but
-- contribute to the offline-instance counts.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
  let (offline, nodes) = partition Node.offline all_nodes
      mem_l = map Node.pMem nodes
      dsk_l = map Node.pDsk nodes
      -- metric: memory covariance
      mem_cv = stdDev mem_l
      -- metric: disk covariance
      dsk_cv = stdDev dsk_l
      -- metric: count of instances living on N1 failing nodes
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                 length (Node.pList n)) .
                 filter Node.failN1 $ nodes :: Double
      res_l = map Node.pRem nodes
      -- metric: reserved memory covariance
      res_cv = stdDev res_l
      -- offline instances metrics
      offline_ipri = sum . map (length . Node.pList) $ offline
      offline_isec = sum . map (length . Node.sList) $ offline
      -- metric: count of instances on offline nodes
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
      -- metric: count of primary instances on offline nodes (this
      -- helps with evacuation/failover of primary instances on
      -- 2-node clusters with one node offline)
      off_pri_score = fromIntegral offline_ipri::Double
      cpu_l = map Node.pCpu nodes
      -- metric: covariance of vcpu/pcpu ratio
      cpu_cv = stdDev cpu_l
      -- metrics: covariance of cpu, memory, disk and network load
      (c_load, m_load, d_load, n_load) =
        unzip4 $ map (\n ->
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
      -- metric: conflicting instance count
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
      pri_tags_score = fromIntegral pri_tags_inst::Double
      -- metric: spindles %
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
     , pri_tags_score, stdDev spindles_cv ]
386 |
|
387 |
-- | Compute the /total/ variance: the weighted sum of all detailed
-- metrics.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes =
  sum $ zipWith (*) detailedCVWeights (compDetailedCV nodes)
390 |
|
391 |
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
394 |
|
395 |
-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = [ n | n <- Container.elems nl, not (Node.offline n) ]
398 |
|
399 |
-- * Balancing functions |
400 |
|
401 |
-- | Compute best table. Note that the ordering of the arguments is
-- important: on equal scores, the first argument wins.
compareTables :: Table -> Table -> Table
compareTables left@(Table _ _ cvl _) right@(Table _ _ cvr _)
  | cvl > cvr = right
  | otherwise = left
405 |
|
406 |
-- | Applies an instance move to a given node list and instance.
--
-- Each clause simulates one 'IMove' kind: the nodes are updated by
-- removing the instance from its old role(s), adding it in its new
-- role(s) (which may fail, e.g. for capacity reasons, yielding 'Bad'),
-- and the instance's primary/secondary indices are updated to match.
-- The returned pair of node indices is the new (primary, secondary).
-- Failover (f)
applyMove nl inst Failover =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      new_nl = do -- OpResult monad
        -- the old secondary becomes the new primary; forcing is
        -- allowed when the old primary is offline
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
        new_s <- Node.addSec int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
  in new_nl

-- Failover to any (fa)
applyMove nl inst (FailoverToAny new_pdx) = do
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
      new_pnode = Container.find new_pdx nl
      force_failover = Node.offline old_pnode
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
  let old_pnode' = Node.removePri old_pnode inst
      inst' = Instance.setPri inst new_pdx
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
  return (nl', inst', new_pdx, old_sdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- OpResult monad
        -- check that the current secondary can host the instance
        -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
  in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
  in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_nl = do -- OpResult monad
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
  in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- OpResult monad
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
  in new_nl
499 |
|
500 |
-- | Tries to allocate an instance on one given node.
--
-- Checks the instance against the node's policy first, then adds it
-- as a (single-homed) primary; the result carries the updated node
-- list and its new cluster score.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx = do
  let pnode = Container.find new_pdx nl
      inst' = Instance.setBoth inst new_pdx Node.noSecondary
  Instance.instMatchesPolicy inst (Node.iPolicy pnode) (Node.exclStorage pnode)
  pnode' <- Node.addPri pnode inst
  let nl' = Container.add new_pdx pnode' nl
  return (nl', inst', [pnode'], compCV nl')
512 |
|
513 |
-- | Tries to allocate an instance on a given pair of nodes.
--
-- Policy is checked against the primary node; the instance is then
-- added as primary on the first node and secondary on the second.
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair nl inst new_pdx new_sdx = do
  let pnode = Container.find new_pdx nl
      snode = Container.find new_sdx nl
  Instance.instMatchesPolicy inst (Node.iPolicy pnode)
    (Node.exclStorage pnode)
  pnode' <- Node.addPri pnode inst
  snode' <- Node.addSec snode inst new_pdx
  let inst' = Instance.setBoth inst new_pdx new_sdx
      nl' = Container.addTwo new_pdx pnode' new_sdx snode' nl
  return (nl', inst', [pnode', snode'], compCV nl')
527 |
|
528 |
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
  let Table ini_nl ini_il _ ini_plc = ini_tbl
  in case applyMove ini_nl target move of
       -- the move is not feasible; keep the current best
       Bad _ -> cur_tbl
       Ok (nl', inst', pdx, sdx) ->
         let tgt_idx = Instance.idx target
             cv' = compCV nl'
             il' = Container.add tgt_idx inst' ini_il
             plc' = (tgt_idx, pdx, sdx, move, cv') : ini_plc
         in compareTables cur_tbl (Table nl' il' cv' plc')
547 |
|
548 |
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- a instance.
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
              -> Bool -- ^ Whether the secondary node is a valid new node
              -> Bool -- ^ Whether we can change the primary node
              -> Ndx -- ^ Target node candidate
              -> [IMove] -- ^ List of valid result moves

-- unmirrored instances cannot be moved at all
possibleMoves MirrorNone _ _ _ = []

-- externally-mirrored instances can only switch primaries
possibleMoves MirrorExternal _ False _ = []

possibleMoves MirrorExternal _ True ndx =
  [ FailoverToAny ndx ]

-- DRBD with a fixed primary: only the secondary can change
possibleMoves MirrorInternal _ False ndx =
  [ ReplaceSecondary ndx ]

-- DRBD, secondary usable, primary changeable: all four moves
possibleMoves MirrorInternal True True ndx =
  [ ReplaceSecondary ndx
  , ReplaceAndFailover ndx
  , ReplacePrimary ndx
  , FailoverAndReplace ndx
  ]

possibleMoves MirrorInternal False True ndx =
  [ ReplaceSecondary ndx
  , ReplaceAndFailover ndx
  ]
578 |
|
579 |
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx] -- ^ Allowed target node indices
                  -> Bool -- ^ Whether disk moves are allowed
                  -> Bool -- ^ Whether instance moves are allowed
                  -> Table -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
  let pdx = Instance.pNode target
      sdx = Instance.sNode target
      -- the instance's own nodes are not valid targets
      candidates = filter (`notElem` [pdx, sdx]) nodes_idx
      mir_type = Instance.mirrorType target
      use_secondary = sdx `elem` nodes_idx && inst_moves
      -- for DRBD instances whose secondary is usable, try a plain
      -- failover before any disk moves
      start_tbl = if mir_type == MirrorInternal && use_secondary
                    then checkSingleStep ini_tbl target ini_tbl Failover
                    else ini_tbl
      all_moves =
        if disk_moves
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
               candidates
          else []
  -- iterate over the possible nodes for this instance
  in foldl' (checkSingleStep ini_tbl target) start_tbl all_moves
605 |
|
606 |
-- | Compute the best next move.
--
-- Evaluates the best move for every victim instance in parallel and
-- keeps the overall best table; if no move improved on the original
-- placement list length, the original table is returned unchanged.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                             inst_moves ini_tbl)
               victims
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl
629 |
|
630 |
-- | Check if we are allowed to go deeper in the balancing.
--
-- True while the score is above the stop threshold and either no
-- round limit is set (negative) or the limit has not been reached.
doNextBalance :: Table -- ^ The starting table
              -> Int -- ^ Remaining length
              -> Score -- ^ Score at which to stop
              -> Bool -- ^ The resulting table and commands
doNextBalance (Table _ _ cur_cv plc) max_rounds min_score =
  let rounds_ok = max_rounds < 0 || length plc < max_rounds
  in rounds_ok && cur_cv > min_score
639 |
|
640 |
-- | Run a balance move.
--
-- Selects the movable (and auto-balanceable) instances — restricted
-- to those touching offline nodes when in evacuation mode — computes
-- the best move over the online nodes, and returns the new table only
-- if it improves the score by enough.
tryBalance :: Table -- ^ The starting table
           -> Bool -- ^ Allow disk moves
           -> Bool -- ^ Allow instance moves
           -> Bool -- ^ Only evacuate moves
           -> Score -- ^ Min gain threshold
           -> Score -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
  let Table ini_nl ini_il ini_cv _ = ini_tbl
      (offline_nodes, online_nodes) =
        partition Node.offline (Container.elems ini_nl)
      all_inst = Container.elems ini_il
      considered
        | evac_mode =
            -- only instances touching an offline node
            let bad_nodes = map Node.idx offline_nodes
            in filter (any (`elem` bad_nodes) . Instance.allNodes) all_inst
        | otherwise = all_inst
      reloc_inst =
        filter (liftA2 (&&) Instance.movable Instance.autoBalance) considered
      node_idx = map Node.idx online_nodes
      fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
      Table _ _ fin_cv _ = fin_tbl
      improved = fin_cv < ini_cv &&
                 (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
  in if improved
       then Just fin_tbl -- this round made success, return the new table
       else Nothing
667 |
|
668 |
-- * Allocation functions |
669 |
|
670 |
-- | Build failure stats out of a list of failures.
--
-- Counts, for every possible failure mode, how often it occurs in
-- the given list.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
  [ (fm, length (filter (== fm) flst)) | fm <- [minBound..maxBound] ]
675 |
|
676 |
-- | Compares two Maybe AllocElement and chooses the one with the
-- lower cluster score; a present element always beats 'Nothing', and
-- the first argument wins on ties.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, score_a)) b@(Just (_, _, _, score_b))
  | score_a < score_b = a
  | otherwise = b
684 |
|
685 |
-- | Update current Allocation solution and failure stats with new
-- elements.
--
-- A failed attempt only records its failure mode; a successful one
-- bumps the success counter and keeps the better of the old and new
-- placements (lower score wins, via 'bestAllocElement').
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }

concatAllocs as (Ok ns) =
  let -- Choose the old or new solution, based on the cluster score
      cntok = asAllocs as
      osols = asSolution as
      nsols = bestAllocElement osols (Just ns)
      nsuc = cntok + 1
      -- Note: we force evaluation of nsols here in order to keep the
      -- memory profile low - we know that we will need nsols for sure
      -- in the next cycle, so we force evaluation of nsols, since the
      -- foldl' in the caller will only evaluate the tuple, but not the
      -- elements of the tuple
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
702 |
|
703 |
-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  AllocSolution (bFails ++ aFails)
                (aAllocs + bAllocs)
                (bestAllocElement aSols bSols)
                (bLog ++ aLog)
715 |
|
716 |
-- | Given a solution, generates a reasonable description for it.
--
-- With no solution, lists the failure reasons (or \"unknown reasons\"
-- when the failure list is empty); with a solution, summarises the
-- score, success/failure counts and the chosen node names.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolution as
      -- e.g. "FailMem: 3, FailDisk: 1" for all non-zero failure modes
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in case sols of
       Nothing -> "No valid allocation solutions, failure reasons: " ++
                  (if null fcnt then "unknown reasons" else freasons)
       Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
               (intercalate "/" . map Node.name $ nodes)
731 |
|
732 |
-- | Annotates a solution with the appropriate string, prepending the
-- generated description to the solution's log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution sol = sol { asLog = describeSolution sol : asLog sol }
735 |
|
736 |
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution es =
  EvacSolution { esMoved = reverse (esMoved es)
               , esFailed = reverse (esFailed es)
               , esOpCodes = reverse (esOpCodes es)
               }
743 |
|
744 |
-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List -- ^ Group list
              -> Node.List -- ^ The node map
              -> Int -- ^ The number of nodes required
              -> Bool -- ^ Whether to drop or not
                      -- unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  let -- a node is usable when its group accepts allocations
      node_ok n = Group.isAllocable (Container.find (Node.group n) gl)
      keep = if drop_unalloc then filter node_ok else id
      candidates = keep (getOnline nl)
      -- valid secondaries for a primary: different node, same group
      secondaries p = [ Node.idx s | s <- candidates
                      , Node.idx p /= Node.idx s
                      , Node.group p == Node.group s ]
      all_pairs = [ (Node.idx p, secondaries p) | p <- candidates ]
  in case count of
       1 -> Ok . Left $ map Node.idx candidates
       2 -> Ok . Right $ filter (not . null . snd) all_pairs
       _ -> Bad "Unsupported number of nodes, only one or two supported"
766 |
|
767 |
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
  -- Two-node allocation: for every candidate primary, fold the
  -- allocation attempts over its candidate secondaries; the
  -- per-primary folds are evaluated in parallel (sparked to WHNF via
  -- parMap/rwhnf), then merged into a single best solution.
  let psols = parMap rwhnf (\(p, ss) ->
                              foldl' (\cstate ->
                                        concatAllocs cstate .
                                        allocateOnPair nl inst p)
                              emptyAllocSolution ss) ok_pairs
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols

tryAlloc _ _ _ (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
  -- Single-node allocation: fold the attempts sequentially over all
  -- candidate nodes, keeping the running best.
  let sols = foldl' (\cstate ->
                       concatAllocs cstate . allocateOnSingle nl inst
                    ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols
790 |
|
791 |
-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: (Group.Group, Result AllocSolution)
                    -> [String]
solutionDescription (grp, result) =
  let gname = Group.name grp
      pol = allocPolicyToRaw (Group.allocPolicy grp)
  in case result of
       -- a valid solution yields one message per log entry
       Ok solution ->
         map (printf "Group %s (%s): %s" gname pol) (asLog solution)
       -- a failed group yields a single error message
       Bad message -> [printf "Group %s: error %s" gname message]
800 |
|
801 |
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: [(Group.Group, Result AllocSolution)]
                -> [(Group.Group, AllocSolution)]
filterMGResults = foldl' keep []
  -- keep only Ok results that both carry a concrete solution and
  -- belong to an allocable group; prepending reverses the order
  where keep accu (grp, rasol) =
          case rasol of
            Ok sol | Group.isAllocable grp && isJust (asSolution sol)
              -> (grp, sol):accu
            _ -> accu
814 |
|
815 |
-- | Sort multigroup results based on policy and score.
sortMGResults :: [(Group.Group, AllocSolution)]
              -> [(Group.Group, AllocSolution)]
sortMGResults = sortBy (comparing rank)
  -- rank first by allocation policy, then by the solution's score;
  -- callers guarantee (via filterMGResults) that a solution exists,
  -- hence the fromJust
  where rank (grp, sol) =
          let (_, _, _, score) = fromJust (asSolution sol)
          in (Group.allocPolicy grp, score)
823 |
|
824 |
-- | Removes node groups which can't accommodate the instance
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
                  -> Instance.Instance
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
filterValidGroups ngs inst = foldr step ([], []) ngs
  where -- a NIC is satisfied if it has no network, or its network is
        -- one the group is connected to
        nicOk grp nic = case Nic.network nic of
                          Just net -> net `elem` Group.networks grp
                          Nothing -> True
        -- keep groups satisfying all NICs; otherwise record a message
        step ng (good, msgs)
          | all (nicOk (fst ng)) (Instance.nics inst) = (ng:good, msgs)
          | otherwise =
              (good,
               ("group " ++ Group.name (fst ng) ++
                " is not connected to a network required by instance " ++
                Instance.name inst):msgs)
841 |
|
842 |
-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List        -- ^ The group list
                   -> Node.List         -- ^ The node list
                   -> Instance.List     -- ^ The instance list
                   -> Maybe [Gdx]       -- ^ The allowed groups
                   -> Instance.Instance -- ^ The instance to allocate
                   -> Int               -- ^ Required number of nodes
                   -> Result (Group.Group, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let groups_by_idx = splitCluster mgnl mgil
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
      -- restrict to the explicitly allowed groups, if any were given
      groups' = maybe groups
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
                allowed_gdxs
      -- drop groups missing a network required by the instance
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
      -- run a full allocation attempt per remaining group
      sols = map (\(gr, (nl, il)) ->
                   (gr, genAllocNodes mggl nl cnt False >>=
                        tryAlloc nl il inst))
             groups''::[(Group.Group, Result AllocSolution)]
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
      goodSols = filterMGResults sols
      sortedSols = sortMGResults goodSols
  in case sortedSols of
       -- FIX: the message previously read "...groups was" ++ show ...,
       -- producing e.g. "allowed groups wasJust [...]"; a separating
       -- space was missing before the shown value
       [] -> Bad $ if null groups'
                     then "no groups for evacuation: allowed groups was " ++
                          show allowed_gdxs ++ ", all groups: " ++
                          show (map fst groups)
                     else intercalate ", " all_msgs
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
876 |
|
877 |
-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  -- delegate the group choice, then record which group won
  (grp, sol, msgs) <- findBestAllocGroup mggl mgnl mgil Nothing inst cnt
  let chosen = "Selected group: " ++ Group.name grp
  return sol { asLog = chosen:msgs }
890 |
|
891 |
-- | Calculate the new instance list after allocation solution.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il alloc =
  case alloc of
    -- no allocation happened, list unchanged
    Nothing -> il
    -- register the allocated instance under the next free index
    Just (_, xi, _, _) -> Container.add (Container.size il) xi il
897 |
|
898 |
-- | Extract the new node list from the allocation solution.
extractNl :: Node.List              -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List              -- ^ The new node list
extractNl nl alloc =
  case alloc of
    -- no allocation happened, keep the original
    Nothing -> nl
    -- the allocation element carries the updated node list
    Just (xnl, _, _, _) -> xnl
904 |
|
905 |
-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList gl nl il reqs result =
  case reqs of
    -- nothing left to place: return the accumulated state
    [] -> Ok (nl, il, result)
    -- place the next instance, thread the updated lists through
    (xi, xicnt):rest ->
      tryMGAlloc gl nl il xi xicnt >>= \ares ->
        let sol = asSolution ares
        in allocList gl (extractNl nl sol) (updateIl il sol)
                     rest ((xi, ares):result)
920 |
|
921 |
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
  case mode of
    ChangeSecondary ->
      fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
             "' can't execute change secondary"
    _ -> return ()
932 |
|
933 |
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targetting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless: a single-node move; ChangeSecondary is rejected by
-- failOnSecondaryChange since only DRBD supports it.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- Plain (local storage): cannot be moved at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

-- File (local storage): cannot be moved at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

-- Shared file: single-node move, like diskless.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- Block device: single-node move.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- RBD: single-node move.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- External storage: single-node move.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- DRBD, ChangePrimary: a plain failover to the current secondary.
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

-- DRBD, ChangeSecondary: pick the best replace-secondary target
-- among the available nodes.
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  evacOneNodeOnly nl il inst gdx avail_nodes

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
        annotateResult "Can't find any good nodes for relocation" .
        eitherToResult $
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                            case accu of
                              Right _ -> accu
                              -- we don't need more details (which
                              -- nodes, etc.) as we only selected
                              -- this group if we can allocate on
                              -- it, hence failures will not
                              -- propagate out of this fold loop
                              Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                            let new_accu = Right result in
                            case accu of
                              Left _ -> new_accu
                              Right (_, _, _, old_cv) ->
                                -- keep the pair with the lower (better)
                                -- cluster score
                                if old_cv < new_cv
                                  then accu
                                  else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)
1038 |
|
1039 |
-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targetting
                -> [Ndx]             -- ^ The list of available nodes
                                     -- for allocation
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
  -- Choose the move constructor by mirroring type: internal mirrors
  -- (DRBD) replace their secondary; external mirrors fail over to any
  -- node; non-mirrored instances cannot be evacuated.
  op_fn <- case Instance.mirrorType inst of
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
  -- Fold over the candidates; the Either accumulator starts as a
  -- failure and becomes Right once any target node works.
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
                          eitherToResult $
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)
1065 |
|
1066 |
-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
  case applyMove nl inst (op_fn ndx) of
    -- the move failed: only overwrite the accumulator with the
    -- failure message if we have no valid solution yet
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we should have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          -- score only the target group's nodes
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
      in case accu of
           Left _ -> new_accu
           -- keep whichever solution has the lower (better) score
           Right (_, _, old_cv, _) ->
             if old_cv < new_cv
               then accu
               else new_accu
1103 |
|
1104 |
-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succceed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                                      -- instance)
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
  let primary = Container.find (Instance.pNode inst) nl
      idx = Instance.idx inst
  -- if the primary is offline, then we first failover
  (nl1, inst1, ops1) <-
    if Node.offline primary
      then do
        (nl', inst', _, _) <-
          annotateResult "Failing over to the secondary" .
          opToResult $ applyMove nl inst Failover
        return (nl', inst', [Failover])
      else return (nl, inst, [])
  -- the three-step move sequence: replace secondary with the target
  -- primary, fail over onto it, then replace secondary again with the
  -- target secondary
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                      Failover,
                      ReplaceSecondary t_sdx)
  -- we now need to execute a replace secondary to the future
  -- primary node
  (nl2, inst2, _, _) <-
    annotateResult "Changing secondary to new primary" .
    opToResult $
    applyMove nl1 inst1 o1
  let ops2 = o1:ops1
  -- we now execute another failover, the primary stays fixed now
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
                        opToResult $ applyMove nl2 inst2 o2
  let ops3 = o2:ops2
  -- and finally another replace secondary, to the final secondary
  (nl4, inst4, _, _) <-
    annotateResult "Changing secondary to final secondary" .
    opToResult $
    applyMove nl3 inst3 o3
  -- moves were accumulated in reverse, hence the 'reverse' before
  -- converting them to opcodes
  let ops4 = o3:ops3
      il' = Container.add idx inst4 il
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
  let nodes = Container.elems nl4
      -- The fromJust below is ugly (it can fail nastily), but
      -- at this point we should have any internal mismatches,
      -- and adding a monad here would be quite involved
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
      -- final score of the target group after all moves
      new_cv = compCVNodes grpnodes
  return (nl4, il', ops, new_cv)
1162 |
|
1163 |
-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx =
  case lookup gdx group_nodes of
    Nothing -> Bad $ "Can't find group with index " ++ show gdx
    -- keep only the group's nodes that are not in the exclusion set
    Just local_nodes ->
      Ok $ filter (\ndx -> not (IntSet.member ndx excl_ndx)) local_nodes
1175 |
|
1176 |
-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
updateEvacSolution (nl, il, es) idx result =
  case result of
    -- evacuation failed: state unchanged, record the failure
    Bad msg -> (nl, il, es { esFailed = (idx, msg):esFailed es})
    -- evacuation succeeded: adopt the new lists and record the move
    Ok (nl', il', opcodes) ->
      let inst = Container.find idx il'
          moved = (idx,
                   instancePriGroup nl' inst,
                   Instance.allNodes inst)
      in (nl', il', es { esMoved = moved:esMoved es
                       , esOpCodes = opcodes:esOpCodes es })
1191 |
|
1192 |
-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
  let -- nodes the instances are evacuating from: never valid targets
      evac_ndx = nodesToEvacuate ini_il mode idxs
      -- offline nodes are likewise excluded as targets
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
      -- per-group association list of node indices
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                                (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        -- evacuate each instance in turn, threading the node and
        -- instance lists through the fold
        foldl' (\state@(nl, il, _) inst ->
                  let gdx = instancePriGroup nl inst
                      pdx = Instance.pNode inst in
                  updateEvacSolution state (Instance.idx inst) $
                  availableGroupNodes group_ndx
                    (IntSet.insert pdx excl_ndx) gdx >>=
                      nodeEvacInstance nl il mode inst gdx
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1218 |
|
1219 |
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
-- 1. at the start of the function, we compute which are the target
-- groups; either no groups were passed in, in which case we choose
-- all groups out of which we don't evacuate instance, or there were
-- some groups passed, in which case we use those
--
-- 2. for each instance, we use 'findBestAllocGroup' to choose the
-- best group to hold the instance, and then we do what
-- 'tryNodeEvac' does, except for this group instead of the current
-- instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
  let -- groups the instances currently live in (by primary node);
      -- these are excluded from the target set
      evac_gdxs = nub $ map (instancePriGroup ini_nl .
                             flip Container.find ini_il) idxs
      target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
      -- offline nodes can never be targets
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                                (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  -- per instance: pick the best target group, then
                  -- evacuate into it in ChangeAll mode
                  let solution = do
                        let ncnt = Instance.requiredNodes $
                                   Instance.diskTemplate inst
                        (grp, _, _) <- findBestAllocGroup gl nl il
                                       (Just target_gdxs) inst ncnt
                        let gdx = Group.idx grp
                        av_nodes <- availableGroupNodes group_ndx
                                    excl_ndx gdx
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
                  in updateEvacSolution state (Instance.idx inst) solution
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1272 |
|
1273 |
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let -- depth only feeds the generated instance name ("new-<n>")
      depth = length ixes
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      -- decrement the remaining-allocations limit, if one was given
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       Bad s -> Bad s
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         -- the result if we stop at this iteration
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
         case sols3 of
           -- no more space: stop here
           Nothing -> newsol
           Just (xnl, xi, _, _) ->
             -- recurse with the updated cluster state until the limit
             -- is exhausted
             if limit == Just 0
               then newsol
               else iterateAlloc xnl (Container.add newidx xi il)
                      newlimit newinst allocnodes (xi:ixes)
                      (totalResources xnl:cstats)
1297 |
|
1298 |
-- | Predicate whether shrinking a single resource can lead to a valid
-- allocation.
sufficesShrinking :: (Instance.Instance -> AllocSolution) -> Instance.Instance
                  -> FailMode -> Maybe Instance.Instance
sufficesShrinking allocFn inst fm =
  -- Repeatedly shrink the given resource ('iterateOk'), pairing each
  -- shrunk candidate with its allocation result ('allocFn &&& id').
  -- Walk only the prefix where either the allocation still fails due
  -- to 'fm' or a solution exists; drop the still-failing ones and
  -- return the first candidate that actually allocates.
  case dropWhile (isNothing . asSolution . fst)
       . takeWhile (liftA2 (||) (elem fm . asFailures . fst)
                                (isJust . asSolution . fst))
       . map (allocFn &&& id) $
       iterateOk (`Instance.shrinkByType` fm) inst
  of x:_ -> Just . snd $ x
     _ -> Nothing
1310 |
|
1311 |
-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    Bad s -> Bad s
    Ok (errs, nl', il', ixes', cstats') ->
      let -- the result if we stop shrinking at this point
          newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                          Just (n - ixes_cnt))
          -- failure modes ordered by increasing occurrence count
          sortedErrs = map fst $ sortBy (comparing snd) errs
          suffShrink = sufficesShrinking (fromMaybe emptyAllocSolution
                                          . flip (tryAlloc nl' il') allocnodes)
                       newinst
          -- shrink attempts for the most frequent failure modes first
          bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
      in if stop then newsol else
         case bigSteps of
           -- a shrink along one failure mode suffices: recurse with it
           Just newinst':_ -> tieredAlloc nl' il' newlimit
                              newinst' allocnodes ixes' cstats'
           -- otherwise try a single shrink step on the most frequent
           -- failure mode, stopping when even that is impossible
           _ -> case Instance.shrinkByType newinst . last $ sortedErrs of
                  Bad _ -> newsol
                  Ok newinst' -> tieredAlloc nl' il' newlimit
                                 newinst' allocnodes ixes' cstats'
1340 |
|
1341 |
-- * Formatting functions |
1342 |
|
1343 |
-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; moves is containing
                -- either @/f/@ for failover or @/r:name/@ for replace
                -- secondary, while the command list holds gnt-instance
                -- commands (without that prefix), e.g \"@failover instance1@\"
computeMoves i inam mv c d =
  let -- running instances are migrated, stopped ones failed over
      morf = if Instance.isRunning i then "migrate" else "failover"
      mig = printf "%s -f %s" morf inam::String
      mig_any = printf "%s -f -n %s %s" morf c inam::String
      rep n = printf "replace-disks -n %s %s" n inam::String
  in case mv of
       Failover -> ("f", [mig])
       FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
       FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
       ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
       ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
       ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1366 |
|
1367 |
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let -- width for a "pri:sec" column: two node names plus a colon
      pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      old_sec = Instance.sNode inst
      inst = Container.find i il
      inam = Instance.alias inst
      -- new and old primary/secondary aliases
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find old_sec nl
      (moves, cmds) = computeMoves inst inam mv npri nsec
      -- FIXME: this should check instead/also the disk template
      ostr = if old_sec == Node.noSecondary
               then printf "%s" opri::String
               else printf "%s:%s" opri osec::String
      nstr = if s == Node.noSecondary
               then printf "%s" npri::String
               else printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
      pos imlen inam pmlen ostr pmlen nstr c moves,
      cmds)
1397 |
|
1398 |
-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il (i, np, ns, _, _) =
  let inst = Container.find i il
      -- new primary/secondary plus the instance's current nodes
      candidates = np : ns : Instance.allNodes inst
  -- drop sentinel (negative) indices and duplicates
  in nub $ filter (>= 0) candidates
1415 |
|
1416 |
-- | From two adjacent cluster tables get the list of moves that transitions
-- from to the other
getMoves :: (Table, Table) -> [MoveJob]
getMoves (Table _ initial_il _ initial_plc, Table final_nl _ _ final_plc) =
  -- placements are prepended, so the moves new in the final table are
  -- everything except the trailing initial placements
  map plcToMove . reverse . drop (length initial_plc) $ reverse final_plc
  where plcToMove plc@(idx, p, s, mv, _) =
          let inst = Container.find idx initial_il
              np = Node.alias $ Container.find p final_nl
              ns = Node.alias $ Container.find s final_nl
              (_, cmds) = computeMoves inst (Instance.name inst) mv np ns
          in (involvedNodes initial_il plc, idx, mv, cmds)
1430 |
|
1431 |
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs (jobsets, nbuf) job@(nodes, _, _, _) =
  case jobsets of
    -- first job: start the first jobset
    [] -> ([[job]], nodes)
    current:older
      -- no node overlap with the running set: extend the current jobset
      | null (nodes `intersect` nbuf) -> ((job:current):older, nodes ++ nbuf)
      -- conflicting nodes: open a fresh jobset, resetting the buffer
      | otherwise -> ([job]:jobsets, nodes)
1438 |
|
1439 |
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
-- FIX: use the strict foldl' (consistent with the rest of this
-- module) instead of lazy foldl, which builds up thunks in the
-- accumulator on long move lists.
splitJobs = fst . foldl' mergeJobs ([], [])
1443 |
|
1444 |
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  let out =
        printf "  echo job %d/%d" jsn sn:
        printf "  check":
        map ("  gnt-instance " ++) cmds
  -- the first job of a jobset is preceded by a jobset banner
  in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out
1455 |
|
1456 |
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds jobsets =
  unlines $
    concatMap (\(jsn, js) ->
                 concatMap (formatJob jsn (length js)) (zip [1..] js))
              (zip [1..] jobsets)
1464 |
|
1465 |
-- | Print the node list.
--
-- An empty field list selects the default fields; a leading @"+"@
-- appends the remaining fields to the defaults.
printNodes :: Node.List -> [String] -> String
printNodes nl fs = printTable "" header rows isnum
  where fields = case fs of
                   []       -> Node.defaultFields
                   "+":rest -> Node.defaultFields ++ rest
                   _        -> fs
        sorted = sortBy (comparing Node.idx) (Container.elems nl)
        rows   = map (Node.list fields) sorted
        (header, isnum) = unzip $ map Node.showHeader fields
1475 |
|
1476 |
-- | Print the instance list.
--
-- One row per instance, sorted by index: running flag, name,
-- primary/secondary node names, auto-balance flag, vcpus, memory,
-- disk size (divided by 1024) and the four dynamic utilisation
-- values.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      -- Build one table row for a single instance.
      helper inst = [ if Instance.isRunning inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    -- noSecondary is a sentinel: show an empty cell.
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                           then ""
                           else Container.nameOf nl sdx
                    , if Instance.autoBalance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    , printf "%5d" $ Instance.dsk inst `div` 1024
                    , printf "%5.3f" lC
                    , printf "%5.3f" lM
                    , printf "%5.3f" lD
                    , printf "%5.3f" lN
                    ]
        where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      -- First five columns are textual; all remaining ones numeric
      -- (right-aligned by printTable).
      isnum = False:False:False:False:False:repeat True
  in printTable "" header (map helper sil) isnum
1501 |
|
1502 |
-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
printStats lp nl = printTable lp header formatted (False:repeat True)
  where dcvs = compDetailedCV $ Container.elems nl
        (weights, names) = unzip detailedCVInfo
        -- Pad with neutral weight/placeholder name in case there are
        -- more metric values than known info entries.
        annotated = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
        header = [ "Field", "Value", "Weight" ]
        formatted = map (\(w, h, val) ->
                           [ h
                           , printf "%.8f" val
                           , printf "x%.2f" w
                           ]) annotated
1515 |
|
1516 |
-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      -- Resolve a node index to a (non-empty) node name; an empty
      -- node name indicates corrupted data, hence the hard error.
      lookNode n = case mkNonEmpty (Container.nameOf nl n) of
                     -- FIXME: convert htools codebase to non-empty strings
                     Bad msg -> error $ "Empty node name for idx " ++
                                show n ++ ": " ++ msg ++ "??"
                     Ok ne -> Just ne
      -- Migration/failover opcode without an explicit target node.
      opF = OpCodes.OpInstanceMigrate
              { OpCodes.opInstanceName = iname
              , OpCodes.opInstanceUuid = Nothing
              , OpCodes.opMigrationMode = Nothing -- default
              , OpCodes.opOldLiveMode = Nothing -- default as well
              , OpCodes.opTargetNode = Nothing -- this is drbd
              , OpCodes.opTargetNodeUuid = Nothing
              , OpCodes.opAllowRuntimeChanges = False
              , OpCodes.opIgnoreIpolicy = False
              , OpCodes.opMigrationCleanup = False
              , OpCodes.opIallocator = Nothing
              , OpCodes.opAllowFailover = True }
      -- Same migration but with an explicit target node (non-DRBD).
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
      -- Replace-disks opcode that moves the secondary to node @n@.
      opR n = OpCodes.OpInstanceReplaceDisks
                { OpCodes.opInstanceName = iname
                , OpCodes.opInstanceUuid = Nothing
                , OpCodes.opEarlyRelease = False
                , OpCodes.opIgnoreIpolicy = False
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
                , OpCodes.opReplaceDisksList = []
                , OpCodes.opRemoteNode = lookNode n
                , OpCodes.opRemoteNodeUuid = Nothing
                , OpCodes.opIallocator = Nothing
                }
  -- Each move kind maps to the opcode sequence implementing it; e.g.
  -- ReplacePrimary is failover + secondary replacement + failover back.
  in case move of
       Failover -> [ opF ]
       FailoverToAny np -> [ opFA np ]
       ReplacePrimary np -> [ opF, opR np, opF ]
       ReplaceSecondary ns -> [ opR ns ]
       ReplaceAndFailover np -> [ opR np, opF ]
       FailoverAndReplace ns -> [ opF, opR ns ]
1566 |
|
1567 |
-- * Node group functions |
1568 |
|
1569 |
-- | Computes the group of an instance.
--
-- Fails if the primary and secondary nodes live in different node
-- groups (a "split" instance); instances without a secondary always
-- succeed, yielding the primary node's group.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      -- Without a real secondary, reuse the primary so the group
      -- comparison below trivially succeeds.
      snode = if sidx == Node.noSecondary
                then pnode
                else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       -- Fixed typo in the error message ("accross" -> "across").
       then fail ("Instance placed across two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       else return pgroup
1583 |
|
1584 |
-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl inst =
  Node.group (Container.find (Instance.pNode inst) nl)
1589 |
|
1590 |
-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  filter (\inst -> not . isOk $ instanceGroup nl inst) (Container.elems il)
1595 |
|
1596 |
-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  map perGroup $ Node.computeGroups (Container.elems nl)
  where -- Pair a group index with its own node and instance containers;
        -- instances belong to the group of their primary node.
        perGroup (gdx, nodes) =
          let nidxs = map Node.idx nodes
              gnl = Container.fromList (zip nidxs nodes)
              gil = Container.filter ((`elem` nidxs) . Instance.pNode) il
          in (gdx, (gnl, gil))
1606 |
|
1607 |
-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode -- ^ The evacuation mode we're using
                -> [Idx] -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode =
  -- noSecondary is a sentinel, not a real node, so drop it at the end.
  IntSet.delete Node.noSecondary .
  foldl' (\ns idx ->
            let i = Container.find idx il
                pdx = Instance.pNode i
                sdx = Instance.sNode i
                dt = Instance.diskTemplate i
                -- Only DRBD instances have a meaningful secondary node.
                withSecondary = case dt of
                                  DTDrbd8 -> IntSet.insert sdx ns
                                  _ -> ns
            -- The mode selects which of the instance's nodes count
            -- as "to be evacuated".
            in case mode of
                 ChangePrimary -> IntSet.insert pdx ns
                 ChangeSecondary -> withSecondary
                 ChangeAll -> IntSet.insert pdx withSecondary
         ) IntSet.empty