root / htools / Ganeti / HTools / Cluster.hs @ 47eed3f4
History | View | Annotate | Download (53 kB)
1 |
{-| Implementation of cluster-wide logic. |
---|---|
2 |
|
3 |
This module holds all pure cluster-logic; I\/O related functionality |
4 |
goes into the /Main/ module for the individual binaries. |
5 |
|
6 |
-} |
7 |
|
8 |
{- |
9 |
|
10 |
Copyright (C) 2009, 2010, 2011 Google Inc. |
11 |
|
12 |
This program is free software; you can redistribute it and/or modify |
13 |
it under the terms of the GNU General Public License as published by |
14 |
the Free Software Foundation; either version 2 of the License, or |
15 |
(at your option) any later version. |
16 |
|
17 |
This program is distributed in the hope that it will be useful, but |
18 |
WITHOUT ANY WARRANTY; without even the implied warranty of |
19 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
20 |
General Public License for more details. |
21 |
|
22 |
You should have received a copy of the GNU General Public License |
23 |
along with this program; if not, write to the Free Software |
24 |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
25 |
02110-1301, USA. |
26 |
|
27 |
-} |
28 |
|
29 |
module Ganeti.HTools.Cluster |
30 |
( |
31 |
-- * Types |
32 |
AllocSolution(..) |
33 |
, EvacSolution(..) |
34 |
, Table(..) |
35 |
, CStats(..) |
36 |
, AllocStats |
37 |
-- * Generic functions |
38 |
, totalResources |
39 |
, computeAllocationDelta |
40 |
-- * First phase functions |
41 |
, computeBadItems |
42 |
-- * Second phase functions |
43 |
, printSolutionLine |
44 |
, formatCmds |
45 |
, involvedNodes |
46 |
, splitJobs |
47 |
-- * Display functions |
48 |
, printNodes |
49 |
, printInsts |
50 |
-- * Balacing functions |
51 |
, checkMove |
52 |
, doNextBalance |
53 |
, tryBalance |
54 |
, compCV |
55 |
, compDetailedCV |
56 |
, printStats |
57 |
, iMoveToJob |
58 |
-- * IAllocator functions |
59 |
, genAllocNodes |
60 |
, tryAlloc |
61 |
, tryMGAlloc |
62 |
, tryReloc |
63 |
, tryMGReloc |
64 |
, tryEvac |
65 |
, tryMGEvac |
66 |
, tryNodeEvac |
67 |
, collapseFailures |
68 |
-- * Allocation functions |
69 |
, iterateAlloc |
70 |
, tieredAlloc |
71 |
, tieredSpecMap |
72 |
-- * Node group functions |
73 |
, instanceGroup |
74 |
, findSplitInstances |
75 |
, splitCluster |
76 |
) where |
77 |
|
78 |
import Data.Function (on) |
79 |
import qualified Data.IntSet as IntSet |
80 |
import Data.List |
81 |
import Data.Ord (comparing) |
82 |
import Text.Printf (printf) |
83 |
import Control.Monad |
84 |
import Control.Parallel.Strategies |
85 |
|
86 |
import qualified Ganeti.HTools.Container as Container |
87 |
import qualified Ganeti.HTools.Instance as Instance |
88 |
import qualified Ganeti.HTools.Node as Node |
89 |
import qualified Ganeti.HTools.Group as Group |
90 |
import Ganeti.HTools.Types |
91 |
import Ganeti.HTools.Utils |
92 |
import qualified Ganeti.OpCodes as OpCodes |
93 |
|
94 |
-- * Types |
95 |
|
96 |
-- | Allocation\/relocation solution. |
97 |
data AllocSolution = AllocSolution |
98 |
{ asFailures :: [FailMode] -- ^ Failure counts |
99 |
, asAllocs :: Int -- ^ Good allocation count |
100 |
, asSolutions :: [Node.AllocElement] -- ^ The actual result, length |
101 |
-- of the list depends on the |
102 |
-- allocation/relocation mode |
103 |
, asLog :: [String] -- ^ A list of informational messages |
104 |
} |
105 |
|
106 |
-- | Node evacuation/group change iallocator result type. This result |
107 |
-- type consists of actual opcodes (a restricted subset) that are |
108 |
-- transmitted back to Ganeti. |
109 |
data EvacSolution = EvacSolution |
110 |
{ esMoved :: [String] -- ^ Instance moved successfully |
111 |
, esFailed :: [String] -- ^ Instance which were not |
112 |
-- relocated |
113 |
, esOpCodes :: [[[OpCodes.OpCode]]] -- ^ List of lists of jobs |
114 |
} |
115 |
|
116 |
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'. |
117 |
type AllocResult = (FailStats, Node.List, Instance.List, |
118 |
[Instance.Instance], [CStats]) |
119 |
|
120 |
-- | A type denoting the valid allocation mode/pairs. |
121 |
-- |
122 |
-- For a one-node allocation, this will be a @Left ['Node.Node']@, |
123 |
-- whereas for a two-node allocation, this will be a @Right |
124 |
-- [('Node.Node', 'Node.Node')]@. |
125 |
type AllocNodes = Either [Ndx] [(Ndx, Ndx)] |
126 |
|
127 |
-- | The empty solution we start with when computing allocations. |
128 |
emptyAllocSolution :: AllocSolution |
129 |
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0 |
130 |
, asSolutions = [], asLog = [] } |
131 |
|
132 |
-- | The empty evac solution. |
133 |
emptyEvacSolution :: EvacSolution |
134 |
emptyEvacSolution = EvacSolution { esMoved = [] |
135 |
, esFailed = [] |
136 |
, esOpCodes = [] |
137 |
} |
138 |
|
139 |
-- | The complete state for the balancing solution. |
140 |
data Table = Table Node.List Instance.List Score [Placement] |
141 |
deriving (Show, Read) |
142 |
|
143 |
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem |
144 |
, csFdsk :: Integer -- ^ Cluster free disk |
145 |
, csAmem :: Integer -- ^ Cluster allocatable mem |
146 |
, csAdsk :: Integer -- ^ Cluster allocatable disk |
147 |
, csAcpu :: Integer -- ^ Cluster allocatable cpus |
148 |
, csMmem :: Integer -- ^ Max node allocatable mem |
149 |
, csMdsk :: Integer -- ^ Max node allocatable disk |
150 |
, csMcpu :: Integer -- ^ Max node allocatable cpu |
151 |
, csImem :: Integer -- ^ Instance used mem |
152 |
, csIdsk :: Integer -- ^ Instance used disk |
153 |
, csIcpu :: Integer -- ^ Instance used cpu |
154 |
, csTmem :: Double -- ^ Cluster total mem |
155 |
, csTdsk :: Double -- ^ Cluster total disk |
156 |
, csTcpu :: Double -- ^ Cluster total cpus |
157 |
, csVcpu :: Integer -- ^ Cluster virtual cpus (if |
158 |
-- node pCpu has been set, |
159 |
-- otherwise -1) |
160 |
, csXmem :: Integer -- ^ Unnacounted for mem |
161 |
, csNmem :: Integer -- ^ Node own memory |
162 |
, csScore :: Score -- ^ The cluster score |
163 |
, csNinst :: Int -- ^ The total number of instances |
164 |
} |
165 |
deriving (Show, Read) |
166 |
|
167 |
-- | Currently used, possibly to allocate, unallocable. |
168 |
type AllocStats = (RSpec, RSpec, RSpec) |
169 |
|
170 |
-- * Utility functions |
171 |
|
172 |
-- | Verifies the N+1 status and return the affected nodes. |
173 |
verifyN1 :: [Node.Node] -> [Node.Node] |
174 |
verifyN1 = filter Node.failN1 |
175 |
|
176 |
{-| Computes the pair of bad nodes and instances. |
177 |
|
178 |
The bad node list is computed via a simple 'verifyN1' check, and the |
179 |
bad instance list is the list of primary and secondary instances of |
180 |
those nodes. |
181 |
|
182 |
-} |
183 |
computeBadItems :: Node.List -> Instance.List -> |
184 |
([Node.Node], [Instance.Instance]) |
185 |
computeBadItems nl il = |
186 |
let bad_nodes = verifyN1 $ getOnline nl |
187 |
bad_instances = map (`Container.find` il) . |
188 |
sort . nub $ |
189 |
concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes |
190 |
in |
191 |
(bad_nodes, bad_instances) |
192 |
|
193 |
-- | Zero-initializer for the CStats type. |
194 |
emptyCStats :: CStats |
195 |
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
196 |
|
197 |
-- | Update stats with data from a new node. |
198 |
updateCStats :: CStats -> Node.Node -> CStats |
199 |
updateCStats cs node = |
200 |
let CStats { csFmem = x_fmem, csFdsk = x_fdsk, |
201 |
csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk, |
202 |
csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu, |
203 |
csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu, |
204 |
csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu, |
205 |
csVcpu = x_vcpu, |
206 |
csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst |
207 |
} |
208 |
= cs |
209 |
inc_amem = Node.fMem node - Node.rMem node |
210 |
inc_amem' = if inc_amem > 0 then inc_amem else 0 |
211 |
inc_adsk = Node.availDisk node |
212 |
inc_imem = truncate (Node.tMem node) - Node.nMem node |
213 |
- Node.xMem node - Node.fMem node |
214 |
inc_icpu = Node.uCpu node |
215 |
inc_idsk = truncate (Node.tDsk node) - Node.fDsk node |
216 |
inc_vcpu = Node.hiCpu node |
217 |
inc_acpu = Node.availCpu node |
218 |
|
219 |
in cs { csFmem = x_fmem + fromIntegral (Node.fMem node) |
220 |
, csFdsk = x_fdsk + fromIntegral (Node.fDsk node) |
221 |
, csAmem = x_amem + fromIntegral inc_amem' |
222 |
, csAdsk = x_adsk + fromIntegral inc_adsk |
223 |
, csAcpu = x_acpu + fromIntegral inc_acpu |
224 |
, csMmem = max x_mmem (fromIntegral inc_amem') |
225 |
, csMdsk = max x_mdsk (fromIntegral inc_adsk) |
226 |
, csMcpu = max x_mcpu (fromIntegral inc_acpu) |
227 |
, csImem = x_imem + fromIntegral inc_imem |
228 |
, csIdsk = x_idsk + fromIntegral inc_idsk |
229 |
, csIcpu = x_icpu + fromIntegral inc_icpu |
230 |
, csTmem = x_tmem + Node.tMem node |
231 |
, csTdsk = x_tdsk + Node.tDsk node |
232 |
, csTcpu = x_tcpu + Node.tCpu node |
233 |
, csVcpu = x_vcpu + fromIntegral inc_vcpu |
234 |
, csXmem = x_xmem + fromIntegral (Node.xMem node) |
235 |
, csNmem = x_nmem + fromIntegral (Node.nMem node) |
236 |
, csNinst = x_ninst + length (Node.pList node) |
237 |
} |
238 |
|
239 |
-- | Compute the total free disk and memory in the cluster. |
240 |
totalResources :: Node.List -> CStats |
241 |
totalResources nl = |
242 |
let cs = foldl' updateCStats emptyCStats . Container.elems $ nl |
243 |
in cs { csScore = compCV nl } |
244 |
|
245 |
-- | Compute the delta between two cluster state. |
246 |
-- |
247 |
-- This is used when doing allocations, to understand better the |
248 |
-- available cluster resources. The return value is a triple of the |
249 |
-- current used values, the delta that was still allocated, and what |
250 |
-- was left unallocated. |
251 |
computeAllocationDelta :: CStats -> CStats -> AllocStats |
252 |
computeAllocationDelta cini cfin = |
253 |
let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini |
254 |
CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu, |
255 |
csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin |
256 |
rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem) |
257 |
(fromIntegral i_idsk) |
258 |
rfin = RSpec (fromIntegral (f_icpu - i_icpu)) |
259 |
(fromIntegral (f_imem - i_imem)) |
260 |
(fromIntegral (f_idsk - i_idsk)) |
261 |
un_cpu = fromIntegral (v_cpu - f_icpu)::Int |
262 |
runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem) |
263 |
(truncate t_dsk - fromIntegral f_idsk) |
264 |
in (rini, rfin, runa) |
265 |
|
266 |
-- | The names and weights of the individual elements in the CV list. |
267 |
detailedCVInfo :: [(Double, String)] |
268 |
detailedCVInfo = [ (1, "free_mem_cv") |
269 |
, (1, "free_disk_cv") |
270 |
, (1, "n1_cnt") |
271 |
, (1, "reserved_mem_cv") |
272 |
, (4, "offline_all_cnt") |
273 |
, (16, "offline_pri_cnt") |
274 |
, (1, "vcpu_ratio_cv") |
275 |
, (1, "cpu_load_cv") |
276 |
, (1, "mem_load_cv") |
277 |
, (1, "disk_load_cv") |
278 |
, (1, "net_load_cv") |
279 |
, (2, "pri_tags_score") |
280 |
] |
281 |
|
282 |
detailedCVWeights :: [Double] |
283 |
detailedCVWeights = map fst detailedCVInfo |
284 |
|
285 |
-- | Compute the mem and disk covariance. |
286 |
compDetailedCV :: Node.List -> [Double] |
287 |
compDetailedCV nl = |
288 |
let |
289 |
all_nodes = Container.elems nl |
290 |
(offline, nodes) = partition Node.offline all_nodes |
291 |
mem_l = map Node.pMem nodes |
292 |
dsk_l = map Node.pDsk nodes |
293 |
-- metric: memory covariance |
294 |
mem_cv = stdDev mem_l |
295 |
-- metric: disk covariance |
296 |
dsk_cv = stdDev dsk_l |
297 |
-- metric: count of instances living on N1 failing nodes |
298 |
n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) + |
299 |
length (Node.pList n)) . |
300 |
filter Node.failN1 $ nodes :: Double |
301 |
res_l = map Node.pRem nodes |
302 |
-- metric: reserved memory covariance |
303 |
res_cv = stdDev res_l |
304 |
-- offline instances metrics |
305 |
offline_ipri = sum . map (length . Node.pList) $ offline |
306 |
offline_isec = sum . map (length . Node.sList) $ offline |
307 |
-- metric: count of instances on offline nodes |
308 |
off_score = fromIntegral (offline_ipri + offline_isec)::Double |
309 |
-- metric: count of primary instances on offline nodes (this |
310 |
-- helps with evacuation/failover of primary instances on |
311 |
-- 2-node clusters with one node offline) |
312 |
off_pri_score = fromIntegral offline_ipri::Double |
313 |
cpu_l = map Node.pCpu nodes |
314 |
-- metric: covariance of vcpu/pcpu ratio |
315 |
cpu_cv = stdDev cpu_l |
316 |
-- metrics: covariance of cpu, memory, disk and network load |
317 |
(c_load, m_load, d_load, n_load) = unzip4 $ |
318 |
map (\n -> |
319 |
let DynUtil c1 m1 d1 n1 = Node.utilLoad n |
320 |
DynUtil c2 m2 d2 n2 = Node.utilPool n |
321 |
in (c1/c2, m1/m2, d1/d2, n1/n2) |
322 |
) nodes |
323 |
-- metric: conflicting instance count |
324 |
pri_tags_inst = sum $ map Node.conflictingPrimaries nodes |
325 |
pri_tags_score = fromIntegral pri_tags_inst::Double |
326 |
in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv |
327 |
, stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load |
328 |
, pri_tags_score ] |
329 |
|
330 |
-- | Compute the /total/ variance. |
331 |
compCV :: Node.List -> Double |
332 |
compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV |
333 |
|
334 |
-- | Compute online nodes from a 'Node.List'. |
335 |
getOnline :: Node.List -> [Node.Node] |
336 |
getOnline = filter (not . Node.offline) . Container.elems |
337 |
|
338 |
-- * Balancing functions |
339 |
|
340 |
-- | Compute best table. Note that the ordering of the arguments is important. |
341 |
compareTables :: Table -> Table -> Table |
342 |
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) = |
343 |
if a_cv > b_cv then b else a |
344 |
|
345 |
-- | Applies an instance move to a given node list and instance. |
346 |
applyMove :: Node.List -> Instance.Instance |
347 |
-> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx) |
348 |
-- Failover (f) |
349 |
applyMove nl inst Failover = |
350 |
let old_pdx = Instance.pNode inst |
351 |
old_sdx = Instance.sNode inst |
352 |
old_p = Container.find old_pdx nl |
353 |
old_s = Container.find old_sdx nl |
354 |
int_p = Node.removePri old_p inst |
355 |
int_s = Node.removeSec old_s inst |
356 |
force_p = Node.offline old_p |
357 |
new_nl = do -- Maybe monad |
358 |
new_p <- Node.addPriEx force_p int_s inst |
359 |
new_s <- Node.addSec int_p inst old_sdx |
360 |
let new_inst = Instance.setBoth inst old_sdx old_pdx |
361 |
return (Container.addTwo old_pdx new_s old_sdx new_p nl, |
362 |
new_inst, old_sdx, old_pdx) |
363 |
in new_nl |
364 |
|
365 |
-- Replace the primary (f:, r:np, f) |
366 |
applyMove nl inst (ReplacePrimary new_pdx) = |
367 |
let old_pdx = Instance.pNode inst |
368 |
old_sdx = Instance.sNode inst |
369 |
old_p = Container.find old_pdx nl |
370 |
old_s = Container.find old_sdx nl |
371 |
tgt_n = Container.find new_pdx nl |
372 |
int_p = Node.removePri old_p inst |
373 |
int_s = Node.removeSec old_s inst |
374 |
force_p = Node.offline old_p |
375 |
new_nl = do -- Maybe monad |
376 |
-- check that the current secondary can host the instance |
377 |
-- during the migration |
378 |
tmp_s <- Node.addPriEx force_p int_s inst |
379 |
let tmp_s' = Node.removePri tmp_s inst |
380 |
new_p <- Node.addPriEx force_p tgt_n inst |
381 |
new_s <- Node.addSecEx force_p tmp_s' inst new_pdx |
382 |
let new_inst = Instance.setPri inst new_pdx |
383 |
return (Container.add new_pdx new_p $ |
384 |
Container.addTwo old_pdx int_p old_sdx new_s nl, |
385 |
new_inst, new_pdx, old_sdx) |
386 |
in new_nl |
387 |
|
388 |
-- Replace the secondary (r:ns) |
389 |
applyMove nl inst (ReplaceSecondary new_sdx) = |
390 |
let old_pdx = Instance.pNode inst |
391 |
old_sdx = Instance.sNode inst |
392 |
old_s = Container.find old_sdx nl |
393 |
tgt_n = Container.find new_sdx nl |
394 |
int_s = Node.removeSec old_s inst |
395 |
force_s = Node.offline old_s |
396 |
new_inst = Instance.setSec inst new_sdx |
397 |
new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>= |
398 |
\new_s -> return (Container.addTwo new_sdx |
399 |
new_s old_sdx int_s nl, |
400 |
new_inst, old_pdx, new_sdx) |
401 |
in new_nl |
402 |
|
403 |
-- Replace the secondary and failover (r:np, f) |
404 |
applyMove nl inst (ReplaceAndFailover new_pdx) = |
405 |
let old_pdx = Instance.pNode inst |
406 |
old_sdx = Instance.sNode inst |
407 |
old_p = Container.find old_pdx nl |
408 |
old_s = Container.find old_sdx nl |
409 |
tgt_n = Container.find new_pdx nl |
410 |
int_p = Node.removePri old_p inst |
411 |
int_s = Node.removeSec old_s inst |
412 |
force_s = Node.offline old_s |
413 |
new_nl = do -- Maybe monad |
414 |
new_p <- Node.addPri tgt_n inst |
415 |
new_s <- Node.addSecEx force_s int_p inst new_pdx |
416 |
let new_inst = Instance.setBoth inst new_pdx old_pdx |
417 |
return (Container.add new_pdx new_p $ |
418 |
Container.addTwo old_pdx new_s old_sdx int_s nl, |
419 |
new_inst, new_pdx, old_pdx) |
420 |
in new_nl |
421 |
|
422 |
-- Failver and replace the secondary (f, r:ns) |
423 |
applyMove nl inst (FailoverAndReplace new_sdx) = |
424 |
let old_pdx = Instance.pNode inst |
425 |
old_sdx = Instance.sNode inst |
426 |
old_p = Container.find old_pdx nl |
427 |
old_s = Container.find old_sdx nl |
428 |
tgt_n = Container.find new_sdx nl |
429 |
int_p = Node.removePri old_p inst |
430 |
int_s = Node.removeSec old_s inst |
431 |
force_p = Node.offline old_p |
432 |
new_nl = do -- Maybe monad |
433 |
new_p <- Node.addPriEx force_p int_s inst |
434 |
new_s <- Node.addSecEx force_p tgt_n inst old_sdx |
435 |
let new_inst = Instance.setBoth inst old_sdx new_sdx |
436 |
return (Container.add new_sdx new_s $ |
437 |
Container.addTwo old_sdx new_p old_pdx int_p nl, |
438 |
new_inst, old_sdx, new_sdx) |
439 |
in new_nl |
440 |
|
441 |
-- | Tries to allocate an instance on one given node. |
442 |
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx |
443 |
-> OpResult Node.AllocElement |
444 |
allocateOnSingle nl inst new_pdx = |
445 |
let p = Container.find new_pdx nl |
446 |
new_inst = Instance.setBoth inst new_pdx Node.noSecondary |
447 |
in Node.addPri p inst >>= \new_p -> do |
448 |
let new_nl = Container.add new_pdx new_p nl |
449 |
new_score = compCV nl |
450 |
return (new_nl, new_inst, [new_p], new_score) |
451 |
|
452 |
-- | Tries to allocate an instance on a given pair of nodes. |
453 |
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx |
454 |
-> OpResult Node.AllocElement |
455 |
allocateOnPair nl inst new_pdx new_sdx = |
456 |
let tgt_p = Container.find new_pdx nl |
457 |
tgt_s = Container.find new_sdx nl |
458 |
in do |
459 |
new_p <- Node.addPri tgt_p inst |
460 |
new_s <- Node.addSec tgt_s inst new_pdx |
461 |
let new_inst = Instance.setBoth inst new_pdx new_sdx |
462 |
new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl |
463 |
return (new_nl, new_inst, [new_p, new_s], compCV new_nl) |
464 |
|
465 |
-- | Tries to perform an instance move and returns the best table |
466 |
-- between the original one and the new one. |
467 |
checkSingleStep :: Table -- ^ The original table |
468 |
-> Instance.Instance -- ^ The instance to move |
469 |
-> Table -- ^ The current best table |
470 |
-> IMove -- ^ The move to apply |
471 |
-> Table -- ^ The final best table |
472 |
checkSingleStep ini_tbl target cur_tbl move = |
473 |
let |
474 |
Table ini_nl ini_il _ ini_plc = ini_tbl |
475 |
tmp_resu = applyMove ini_nl target move |
476 |
in |
477 |
case tmp_resu of |
478 |
OpFail _ -> cur_tbl |
479 |
OpGood (upd_nl, new_inst, pri_idx, sec_idx) -> |
480 |
let tgt_idx = Instance.idx target |
481 |
upd_cvar = compCV upd_nl |
482 |
upd_il = Container.add tgt_idx new_inst ini_il |
483 |
upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc |
484 |
upd_tbl = Table upd_nl upd_il upd_cvar upd_plc |
485 |
in |
486 |
compareTables cur_tbl upd_tbl |
487 |
|
488 |
-- | Given the status of the current secondary as a valid new node and |
489 |
-- the current candidate target node, generate the possible moves for |
490 |
-- a instance. |
491 |
possibleMoves :: Bool -- ^ Whether the secondary node is a valid new node |
492 |
-> Bool -- ^ Whether we can change the primary node |
493 |
-> Ndx -- ^ Target node candidate |
494 |
-> [IMove] -- ^ List of valid result moves |
495 |
|
496 |
possibleMoves _ False tdx = |
497 |
[ReplaceSecondary tdx] |
498 |
|
499 |
possibleMoves True True tdx = |
500 |
[ReplaceSecondary tdx, |
501 |
ReplaceAndFailover tdx, |
502 |
ReplacePrimary tdx, |
503 |
FailoverAndReplace tdx] |
504 |
|
505 |
possibleMoves False True tdx = |
506 |
[ReplaceSecondary tdx, |
507 |
ReplaceAndFailover tdx] |
508 |
|
509 |
-- | Compute the best move for a given instance. |
510 |
checkInstanceMove :: [Ndx] -- ^ Allowed target node indices |
511 |
-> Bool -- ^ Whether disk moves are allowed |
512 |
-> Bool -- ^ Whether instance moves are allowed |
513 |
-> Table -- ^ Original table |
514 |
-> Instance.Instance -- ^ Instance to move |
515 |
-> Table -- ^ Best new table for this instance |
516 |
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target = |
517 |
let |
518 |
opdx = Instance.pNode target |
519 |
osdx = Instance.sNode target |
520 |
nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx |
521 |
use_secondary = elem osdx nodes_idx && inst_moves |
522 |
aft_failover = if use_secondary -- if allowed to failover |
523 |
then checkSingleStep ini_tbl target ini_tbl Failover |
524 |
else ini_tbl |
525 |
all_moves = if disk_moves |
526 |
then concatMap |
527 |
(possibleMoves use_secondary inst_moves) nodes |
528 |
else [] |
529 |
in |
530 |
-- iterate over the possible nodes for this instance |
531 |
foldl' (checkSingleStep ini_tbl target) aft_failover all_moves |
532 |
|
533 |
-- | Compute the best next move. |
534 |
checkMove :: [Ndx] -- ^ Allowed target node indices |
535 |
-> Bool -- ^ Whether disk moves are allowed |
536 |
-> Bool -- ^ Whether instance moves are allowed |
537 |
-> Table -- ^ The current solution |
538 |
-> [Instance.Instance] -- ^ List of instances still to move |
539 |
-> Table -- ^ The new solution |
540 |
checkMove nodes_idx disk_moves inst_moves ini_tbl victims = |
541 |
let Table _ _ _ ini_plc = ini_tbl |
542 |
-- we're using rwhnf from the Control.Parallel.Strategies |
543 |
-- package; we don't need to use rnf as that would force too |
544 |
-- much evaluation in single-threaded cases, and in |
545 |
-- multi-threaded case the weak head normal form is enough to |
546 |
-- spark the evaluation |
547 |
tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves |
548 |
inst_moves ini_tbl) |
549 |
victims |
550 |
-- iterate over all instances, computing the best move |
551 |
best_tbl = foldl' compareTables ini_tbl tables |
552 |
Table _ _ _ best_plc = best_tbl |
553 |
in if length best_plc == length ini_plc |
554 |
then ini_tbl -- no advancement |
555 |
else best_tbl |
556 |
|
557 |
-- | Check if we are allowed to go deeper in the balancing. |
558 |
doNextBalance :: Table -- ^ The starting table |
559 |
-> Int -- ^ Remaining length |
560 |
-> Score -- ^ Score at which to stop |
561 |
-> Bool -- ^ The resulting table and commands |
562 |
doNextBalance ini_tbl max_rounds min_score = |
563 |
let Table _ _ ini_cv ini_plc = ini_tbl |
564 |
ini_plc_len = length ini_plc |
565 |
in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score |
566 |
|
567 |
-- | Run a balance move. |
568 |
tryBalance :: Table -- ^ The starting table |
569 |
-> Bool -- ^ Allow disk moves |
570 |
-> Bool -- ^ Allow instance moves |
571 |
-> Bool -- ^ Only evacuate moves |
572 |
-> Score -- ^ Min gain threshold |
573 |
-> Score -- ^ Min gain |
574 |
-> Maybe Table -- ^ The resulting table and commands |
575 |
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain = |
576 |
let Table ini_nl ini_il ini_cv _ = ini_tbl |
577 |
all_inst = Container.elems ini_il |
578 |
all_inst' = if evac_mode |
579 |
then let bad_nodes = map Node.idx . filter Node.offline $ |
580 |
Container.elems ini_nl |
581 |
in filter (\e -> Instance.sNode e `elem` bad_nodes || |
582 |
Instance.pNode e `elem` bad_nodes) |
583 |
all_inst |
584 |
else all_inst |
585 |
reloc_inst = filter Instance.movable all_inst' |
586 |
node_idx = map Node.idx . filter (not . Node.offline) $ |
587 |
Container.elems ini_nl |
588 |
fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst |
589 |
(Table _ _ fin_cv _) = fin_tbl |
590 |
in |
591 |
if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain) |
592 |
then Just fin_tbl -- this round made success, return the new table |
593 |
else Nothing |
594 |
|
595 |
-- * Allocation functions |
596 |
|
597 |
-- | Build failure stats out of a list of failures. |
598 |
collapseFailures :: [FailMode] -> FailStats |
599 |
collapseFailures flst = |
600 |
map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound] |
601 |
|
602 |
-- | Update current Allocation solution and failure stats with new |
603 |
-- elements. |
604 |
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution |
605 |
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as } |
606 |
|
607 |
concatAllocs as (OpGood ns@(_, _, _, nscore)) = |
608 |
let -- Choose the old or new solution, based on the cluster score |
609 |
cntok = asAllocs as |
610 |
osols = asSolutions as |
611 |
nsols = case osols of |
612 |
[] -> [ns] |
613 |
(_, _, _, oscore):[] -> |
614 |
if oscore < nscore |
615 |
then osols |
616 |
else [ns] |
617 |
-- FIXME: here we simply concat to lists with more |
618 |
-- than one element; we should instead abort, since |
619 |
-- this is not a valid usage of this function |
620 |
xs -> ns:xs |
621 |
nsuc = cntok + 1 |
622 |
-- Note: we force evaluation of nsols here in order to keep the |
623 |
-- memory profile low - we know that we will need nsols for sure |
624 |
-- in the next cycle, so we force evaluation of nsols, since the |
625 |
-- foldl' in the caller will only evaluate the tuple, but not the |
626 |
-- elements of the tuple |
627 |
in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols } |
628 |
|
629 |
-- | Sums two allocation solutions (e.g. for two separate node groups). |
630 |
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution |
631 |
sumAllocs (AllocSolution af aa as al) (AllocSolution bf ba bs bl) = |
632 |
AllocSolution (af ++ bf) (aa + ba) (as ++ bs) (al ++ bl) |
633 |
|
634 |
-- | Given a solution, generates a reasonable description for it. |
635 |
describeSolution :: AllocSolution -> String |
636 |
describeSolution as = |
637 |
let fcnt = asFailures as |
638 |
sols = asSolutions as |
639 |
freasons = |
640 |
intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) . |
641 |
filter ((> 0) . snd) . collapseFailures $ fcnt |
642 |
in if null sols |
643 |
then "No valid allocation solutions, failure reasons: " ++ |
644 |
(if null fcnt |
645 |
then "unknown reasons" |
646 |
else freasons) |
647 |
else let (_, _, nodes, cv) = head sols |
648 |
in printf ("score: %.8f, successes %d, failures %d (%s)" ++ |
649 |
" for node(s) %s") cv (asAllocs as) (length fcnt) freasons |
650 |
(intercalate "/" . map Node.name $ nodes) |
651 |
|
652 |
-- | Annotates a solution with the appropriate string. |
653 |
annotateSolution :: AllocSolution -> AllocSolution |
654 |
annotateSolution as = as { asLog = describeSolution as : asLog as } |
655 |
|
656 |
-- | Reverses an evacuation solution. |
657 |
-- |
658 |
-- Rationale: we always concat the results to the top of the lists, so |
659 |
-- for proper jobset execution, we should reverse all lists. |
660 |
reverseEvacSolution :: EvacSolution -> EvacSolution |
661 |
reverseEvacSolution (EvacSolution f m o) = |
662 |
EvacSolution (reverse f) (reverse m) (reverse o) |
663 |
|
664 |
-- | Generate the valid node allocation singles or pairs for a new instance. |
665 |
genAllocNodes :: Group.List -- ^ Group list |
666 |
-> Node.List -- ^ The node map |
667 |
-> Int -- ^ The number of nodes required |
668 |
-> Bool -- ^ Whether to drop or not |
669 |
-- unallocable nodes |
670 |
-> Result AllocNodes -- ^ The (monadic) result |
671 |
genAllocNodes gl nl count drop_unalloc = |
672 |
let filter_fn = if drop_unalloc |
673 |
then filter (Group.isAllocable . |
674 |
flip Container.find gl . Node.group) |
675 |
else id |
676 |
all_nodes = filter_fn $ getOnline nl |
677 |
all_pairs = liftM2 (,) all_nodes all_nodes |
678 |
ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y && |
679 |
Node.group x == Node.group y) all_pairs |
680 |
in case count of |
681 |
1 -> Ok (Left (map Node.idx all_nodes)) |
682 |
2 -> Ok (Right (map (\(p, s) -> (Node.idx p, Node.idx s)) ok_pairs)) |
683 |
_ -> Bad "Unsupported number of nodes, only one or two supported" |
684 |
|
685 |
-- | Try to allocate an instance on the cluster. |
686 |
tryAlloc :: (Monad m) => |
687 |
Node.List -- ^ The node list |
688 |
-> Instance.List -- ^ The instance list |
689 |
-> Instance.Instance -- ^ The instance to allocate |
690 |
-> AllocNodes -- ^ The allocation targets |
691 |
-> m AllocSolution -- ^ Possible solution list |
692 |
tryAlloc nl _ inst (Right ok_pairs) = |
693 |
let sols = foldl' (\cstate (p, s) -> |
694 |
concatAllocs cstate $ allocateOnPair nl inst p s |
695 |
) emptyAllocSolution ok_pairs |
696 |
|
697 |
in if null ok_pairs -- means we have just one node |
698 |
then fail "Not enough online nodes" |
699 |
else return $ annotateSolution sols |
700 |
|
701 |
tryAlloc nl _ inst (Left all_nodes) = |
702 |
let sols = foldl' (\cstate -> |
703 |
concatAllocs cstate . allocateOnSingle nl inst |
704 |
) emptyAllocSolution all_nodes |
705 |
in if null all_nodes |
706 |
then fail "No online nodes" |
707 |
else return $ annotateSolution sols |
708 |
|
709 |
-- | Given a group/result, describe it as a nice (list of) messages. |
710 |
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String] |
711 |
solutionDescription gl (groupId, result) = |
712 |
case result of |
713 |
Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution) |
714 |
Bad message -> [printf "Group %s: error %s" gname message] |
715 |
where grp = Container.find groupId gl |
716 |
gname = Group.name grp |
717 |
pol = apolToString (Group.allocPolicy grp) |
718 |
|
719 |
-- | From a list of possibly bad and possibly empty solutions, filter |
720 |
-- only the groups with a valid result. Note that the result will be |
721 |
-- reversed compared to the original list. |
722 |
filterMGResults :: Group.List |
723 |
-> [(Gdx, Result AllocSolution)] |
724 |
-> [(Gdx, AllocSolution)] |
725 |
filterMGResults gl = foldl' fn [] |
726 |
where unallocable = not . Group.isAllocable . flip Container.find gl |
727 |
fn accu (gdx, rasol) = |
728 |
case rasol of |
729 |
Bad _ -> accu |
730 |
Ok sol | null (asSolutions sol) -> accu |
731 |
| unallocable gdx -> accu |
732 |
| otherwise -> (gdx, sol):accu |
733 |
|
734 |
-- | Sort multigroup results based on policy and score. |
735 |
sortMGResults :: Group.List |
736 |
-> [(Gdx, AllocSolution)] |
737 |
-> [(Gdx, AllocSolution)] |
738 |
sortMGResults gl sols = |
739 |
let extractScore (_, _, _, x) = x |
740 |
solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl), |
741 |
(extractScore . head . asSolutions) sol) |
742 |
in sortBy (comparing solScore) sols |
743 |
|
744 |
-- | Try to allocate an instance on a multi-group cluster. |
745 |
tryMGAlloc :: Group.List -- ^ The group list |
746 |
-> Node.List -- ^ The node list |
747 |
-> Instance.List -- ^ The instance list |
748 |
-> Instance.Instance -- ^ The instance to allocate |
749 |
-> Int -- ^ Required number of nodes |
750 |
-> Result AllocSolution -- ^ Possible solution list |
751 |
tryMGAlloc mggl mgnl mgil inst cnt = |
752 |
let groups = splitCluster mgnl mgil |
753 |
sols = map (\(gid, (nl, il)) -> |
754 |
(gid, genAllocNodes mggl nl cnt False >>= |
755 |
tryAlloc nl il inst)) |
756 |
groups::[(Gdx, Result AllocSolution)] |
757 |
all_msgs = concatMap (solutionDescription mggl) sols |
758 |
goodSols = filterMGResults mggl sols |
759 |
sortedSols = sortMGResults mggl goodSols |
760 |
in if null sortedSols |
761 |
then Bad $ intercalate ", " all_msgs |
762 |
else let (final_group, final_sol) = head sortedSols |
763 |
final_name = Group.name $ Container.find final_group mggl |
764 |
selmsg = "Selected group: " ++ final_name |
765 |
in Ok $ final_sol { asLog = selmsg:all_msgs } |
766 |
|
767 |
-- | Try to relocate an instance on the cluster. |
768 |
tryReloc :: (Monad m) => |
769 |
Node.List -- ^ The node list |
770 |
-> Instance.List -- ^ The instance list |
771 |
-> Idx -- ^ The index of the instance to move |
772 |
-> Int -- ^ The number of nodes required |
773 |
-> [Ndx] -- ^ Nodes which should not be used |
774 |
-> m AllocSolution -- ^ Solution list |
775 |
tryReloc nl il xid 1 ex_idx = |
776 |
let all_nodes = getOnline nl |
777 |
inst = Container.find xid il |
778 |
ex_idx' = Instance.pNode inst:ex_idx |
779 |
valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes |
780 |
valid_idxes = map Node.idx valid_nodes |
781 |
sols1 = foldl' (\cstate x -> |
782 |
let em = do |
783 |
(mnl, i, _, _) <- |
784 |
applyMove nl inst (ReplaceSecondary x) |
785 |
return (mnl, i, [Container.find x mnl], |
786 |
compCV mnl) |
787 |
in concatAllocs cstate em |
788 |
) emptyAllocSolution valid_idxes |
789 |
in return sols1 |
790 |
|
791 |
tryReloc _ _ _ reqn _ = fail $ "Unsupported number of relocation \ |
792 |
\destinations required (" ++ show reqn ++ |
793 |
"), only one supported" |
794 |
|
795 |
tryMGReloc :: (Monad m) => |
796 |
Group.List -- ^ The group list |
797 |
-> Node.List -- ^ The node list |
798 |
-> Instance.List -- ^ The instance list |
799 |
-> Idx -- ^ The index of the instance to move |
800 |
-> Int -- ^ The number of nodes required |
801 |
-> [Ndx] -- ^ Nodes which should not be used |
802 |
-> m AllocSolution -- ^ Solution list |
803 |
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do |
804 |
let groups = splitCluster mgnl mgil |
805 |
-- TODO: we only relocate inside the group for now |
806 |
inst = Container.find xid mgil |
807 |
(nl, il) <- case lookup (instancePriGroup mgnl inst) groups of |
808 |
Nothing -> fail $ "Cannot find group for instance " ++ |
809 |
Instance.name inst |
810 |
Just v -> return v |
811 |
tryReloc nl il xid ncount ex_ndx |
812 |
|
813 |
-- | Change an instance's secondary node. |
814 |
evacInstance :: (Monad m) => |
815 |
[Ndx] -- ^ Excluded nodes |
816 |
-> Instance.List -- ^ The current instance list |
817 |
-> (Node.List, AllocSolution) -- ^ The current state |
818 |
-> Idx -- ^ The instance to evacuate |
819 |
-> m (Node.List, AllocSolution) |
820 |
evacInstance ex_ndx il (nl, old_as) idx = do |
821 |
-- FIXME: hardcoded one node here |
822 |
|
823 |
-- Longer explanation: evacuation is currently hardcoded to DRBD |
824 |
-- instances (which have one secondary); hence, even if the |
825 |
-- IAllocator protocol can request N nodes for an instance, and all |
826 |
-- the message parsing/loading pass this, this implementation only |
827 |
-- supports one; this situation needs to be revisited if we ever |
828 |
-- support more than one secondary, or if we change the storage |
829 |
-- model |
830 |
new_as <- tryReloc nl il idx 1 ex_ndx |
831 |
case asSolutions new_as of |
832 |
-- an individual relocation succeeded, we kind of compose the data |
833 |
-- from the two solutions |
834 |
csol@(nl', _, _, _):_ -> |
835 |
return (nl', new_as { asSolutions = csol:asSolutions old_as }) |
836 |
-- this relocation failed, so we fail the entire evac |
837 |
_ -> fail $ "Can't evacuate instance " ++ |
838 |
Instance.name (Container.find idx il) ++ |
839 |
": " ++ describeSolution new_as |
840 |
|
841 |
-- | Try to evacuate a list of nodes. |
842 |
tryEvac :: (Monad m) => |
843 |
Node.List -- ^ The node list |
844 |
-> Instance.List -- ^ The instance list |
845 |
-> [Idx] -- ^ Instances to be evacuated |
846 |
-> [Ndx] -- ^ Restricted nodes (the ones being evacuated) |
847 |
-> m AllocSolution -- ^ Solution list |
848 |
tryEvac nl il idxs ex_ndx = do |
849 |
(_, sol) <- foldM (evacInstance ex_ndx il) (nl, emptyAllocSolution) idxs |
850 |
return sol |
851 |
|
852 |
-- | Multi-group evacuation of a list of nodes. |
853 |
tryMGEvac :: (Monad m) => |
854 |
Group.List -- ^ The group list |
855 |
-> Node.List -- ^ The node list |
856 |
-> Instance.List -- ^ The instance list |
857 |
-> [Ndx] -- ^ Nodes to be evacuated |
858 |
-> m AllocSolution -- ^ Solution list |
859 |
tryMGEvac _ nl il ex_ndx = |
860 |
let ex_nodes = map (`Container.find` nl) ex_ndx |
861 |
all_insts = nub . concatMap Node.sList $ ex_nodes |
862 |
all_insts' = associateIdxs all_insts $ splitCluster nl il |
863 |
in do |
864 |
results <- mapM (\(_, (gnl, gil, idxs)) -> tryEvac gnl gil idxs ex_ndx) |
865 |
all_insts' |
866 |
let sol = foldl' sumAllocs emptyAllocSolution results |
867 |
return $ annotateSolution sol |
868 |
|
869 |
-- | Function which fails if the requested mode is change secondary. |
870 |
-- |
871 |
-- This is useful since except DRBD, no other disk template can |
872 |
-- execute change secondary; thus, we can just call this function |
873 |
-- instead of always checking for secondary mode. After the call to |
874 |
-- this function, whatever mode we have is just a primary change. |
875 |
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m () |
876 |
failOnSecondaryChange ChangeSecondary dt = |
877 |
fail $ "Instances with disk template '" ++ dtToString dt ++ |
878 |
"' can't execute change secondary" |
879 |
failOnSecondaryChange _ _ = return () |
880 |
|
881 |
-- | Run evacuation for a single instance. |
882 |
nodeEvacInstance :: Node.List -- ^ The node list (cluster-wide) |
883 |
-> Instance.List -- ^ Instance list (cluster-wide) |
884 |
-> EvacMode -- ^ The evacuation mode |
885 |
-> Instance.Instance -- ^ The instance to be evacuated |
886 |
-> [Ndx] -- ^ The list of available nodes |
887 |
-- for allocation |
888 |
-> Result (Node.List, Instance.List, [OpCodes.OpCode]) |
889 |
nodeEvacInstance _ _ mode (Instance.Instance |
890 |
{Instance.diskTemplate = dt@DTDiskless}) _ = |
891 |
failOnSecondaryChange mode dt >> |
892 |
fail "Diskless relocations not implemented yet" |
893 |
|
894 |
nodeEvacInstance _ _ _ (Instance.Instance |
895 |
{Instance.diskTemplate = DTPlain}) _ = |
896 |
fail "Instances of type plain cannot be relocated" |
897 |
|
898 |
nodeEvacInstance _ _ _ (Instance.Instance |
899 |
{Instance.diskTemplate = DTFile}) _ = |
900 |
fail "Instances of type file cannot be relocated" |
901 |
|
902 |
nodeEvacInstance _ _ mode (Instance.Instance |
903 |
{Instance.diskTemplate = dt@DTSharedFile}) _ = |
904 |
failOnSecondaryChange mode dt >> |
905 |
fail "Shared file relocations not implemented yet" |
906 |
|
907 |
nodeEvacInstance _ _ mode (Instance.Instance |
908 |
{Instance.diskTemplate = dt@DTBlock}) _ = |
909 |
failOnSecondaryChange mode dt >> |
910 |
fail "Block device relocations not implemented yet" |
911 |
|
912 |
nodeEvacInstance _ _ _ (Instance.Instance |
913 |
{Instance.diskTemplate = DTDrbd8}) _ = |
914 |
fail "DRBD relocations not implemented yet" |
915 |
|
916 |
-- | Computes the local nodes of a given instance which are available |
917 |
-- for allocation. |
918 |
availableLocalNodes :: Node.List |
919 |
-> [(Gdx, [Ndx])] |
920 |
-> IntSet.IntSet |
921 |
-> Instance.Instance |
922 |
-> Result [Ndx] |
923 |
availableLocalNodes nl group_nodes excl_ndx inst = do |
924 |
let gdx = instancePriGroup nl inst |
925 |
local_nodes <- maybe (Bad $ "Can't find group with index " ++ show gdx) |
926 |
Ok (lookup gdx group_nodes) |
927 |
let avail_nodes = filter (not . flip IntSet.member excl_ndx) local_nodes |
928 |
return avail_nodes |
929 |
|
930 |
-- | Updates the evac solution with the results of an instance |
931 |
-- evacuation. |
932 |
updateEvacSolution :: (Node.List, Instance.List, EvacSolution) |
933 |
-> Instance.Instance |
934 |
-> Result (Node.List, Instance.List, [OpCodes.OpCode]) |
935 |
-> (Node.List, Instance.List, EvacSolution) |
936 |
updateEvacSolution (nl, il, es) inst (Bad msg) = |
937 |
(nl, il, es { esFailed = (Instance.name inst ++ ": " ++ msg):esFailed es}) |
938 |
updateEvacSolution (_, _, es) inst (Ok (nl, il, opcodes)) = |
939 |
(nl, il, es { esMoved = Instance.name inst:esMoved es |
940 |
, esOpCodes = [opcodes]:esOpCodes es }) |
941 |
|
942 |
-- | Node-evacuation IAllocator mode main function. |
943 |
tryNodeEvac :: Group.List -- ^ The cluster groups |
944 |
-> Node.List -- ^ The node list (cluster-wide, not per group) |
945 |
-> Instance.List -- ^ Instance list (cluster-wide) |
946 |
-> EvacMode -- ^ The evacuation mode |
947 |
-> [Idx] -- ^ List of instance (indices) to be evacuated |
948 |
-> Result EvacSolution |
949 |
tryNodeEvac _ ini_nl ini_il mode idxs = |
950 |
let evac_ndx = nodesToEvacuate ini_il mode idxs |
951 |
offline = map Node.idx . filter Node.offline $ Container.elems ini_nl |
952 |
excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline |
953 |
group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx |
954 |
(Container.elems nl))) $ |
955 |
splitCluster ini_nl ini_il |
956 |
(_, _, esol) = |
957 |
foldl' (\state@(nl, il, _) inst -> |
958 |
updateEvacSolution state inst $ |
959 |
availableLocalNodes nl group_ndx excl_ndx inst >>= |
960 |
nodeEvacInstance nl il mode inst |
961 |
) |
962 |
(ini_nl, ini_il, emptyEvacSolution) |
963 |
(map (`Container.find` ini_il) idxs) |
964 |
in return $ reverseEvacSolution esol |
965 |
|
966 |
-- | Recursively place instances on the cluster until we're out of space. |
967 |
iterateAlloc :: Node.List |
968 |
-> Instance.List |
969 |
-> Instance.Instance |
970 |
-> AllocNodes |
971 |
-> [Instance.Instance] |
972 |
-> [CStats] |
973 |
-> Result AllocResult |
974 |
iterateAlloc nl il newinst allocnodes ixes cstats = |
975 |
let depth = length ixes |
976 |
newname = printf "new-%d" depth::String |
977 |
newidx = length (Container.elems il) + depth |
978 |
newi2 = Instance.setIdx (Instance.setName newinst newname) newidx |
979 |
in case tryAlloc nl il newi2 allocnodes of |
980 |
Bad s -> Bad s |
981 |
Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) -> |
982 |
case sols3 of |
983 |
[] -> Ok (collapseFailures errs, nl, il, ixes, cstats) |
984 |
(xnl, xi, _, _):[] -> |
985 |
iterateAlloc xnl (Container.add newidx xi il) |
986 |
newinst allocnodes (xi:ixes) |
987 |
(totalResources xnl:cstats) |
988 |
_ -> Bad "Internal error: multiple solutions for single\ |
989 |
\ allocation" |
990 |
|
991 |
-- | The core of the tiered allocation mode. |
992 |
tieredAlloc :: Node.List |
993 |
-> Instance.List |
994 |
-> Instance.Instance |
995 |
-> AllocNodes |
996 |
-> [Instance.Instance] |
997 |
-> [CStats] |
998 |
-> Result AllocResult |
999 |
tieredAlloc nl il newinst allocnodes ixes cstats = |
1000 |
case iterateAlloc nl il newinst allocnodes ixes cstats of |
1001 |
Bad s -> Bad s |
1002 |
Ok (errs, nl', il', ixes', cstats') -> |
1003 |
case Instance.shrinkByType newinst . fst . last $ |
1004 |
sortBy (comparing snd) errs of |
1005 |
Bad _ -> Ok (errs, nl', il', ixes', cstats') |
1006 |
Ok newinst' -> |
1007 |
tieredAlloc nl' il' newinst' allocnodes ixes' cstats' |
1008 |
|
1009 |
-- | Compute the tiered spec string description from a list of |
1010 |
-- allocated instances. |
1011 |
tieredSpecMap :: [Instance.Instance] |
1012 |
-> [String] |
1013 |
tieredSpecMap trl_ixes = |
1014 |
let fin_trl_ixes = reverse trl_ixes |
1015 |
ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes |
1016 |
spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs)) |
1017 |
ix_byspec |
1018 |
in map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec) |
1019 |
(rspecDsk spec) (rspecCpu spec) cnt) spec_map |
1020 |
|
1021 |
-- * Formatting functions |
1022 |
|
1023 |
-- | Given the original and final nodes, computes the relocation description. |
1024 |
computeMoves :: Instance.Instance -- ^ The instance to be moved |
1025 |
-> String -- ^ The instance name |
1026 |
-> IMove -- ^ The move being performed |
1027 |
-> String -- ^ New primary |
1028 |
-> String -- ^ New secondary |
1029 |
-> (String, [String]) |
1030 |
-- ^ Tuple of moves and commands list; moves is containing |
1031 |
-- either @/f/@ for failover or @/r:name/@ for replace |
1032 |
-- secondary, while the command list holds gnt-instance |
1033 |
-- commands (without that prefix), e.g \"@failover instance1@\" |
1034 |
computeMoves i inam mv c d = |
1035 |
case mv of |
1036 |
Failover -> ("f", [mig]) |
1037 |
FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d]) |
1038 |
ReplaceSecondary _ -> (printf "r:%s" d, [rep d]) |
1039 |
ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig]) |
1040 |
ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig]) |
1041 |
where morf = if Instance.running i then "migrate" else "failover" |
1042 |
mig = printf "%s -f %s" morf inam::String |
1043 |
rep n = printf "replace-disks -n %s %s" n inam |
1044 |
|
1045 |
-- | Converts a placement to string format. |
1046 |
printSolutionLine :: Node.List -- ^ The node list |
1047 |
-> Instance.List -- ^ The instance list |
1048 |
-> Int -- ^ Maximum node name length |
1049 |
-> Int -- ^ Maximum instance name length |
1050 |
-> Placement -- ^ The current placement |
1051 |
-> Int -- ^ The index of the placement in |
1052 |
-- the solution |
1053 |
-> (String, [String]) |
1054 |
printSolutionLine nl il nmlen imlen plc pos = |
1055 |
let |
1056 |
pmlen = (2*nmlen + 1) |
1057 |
(i, p, s, mv, c) = plc |
1058 |
inst = Container.find i il |
1059 |
inam = Instance.alias inst |
1060 |
npri = Node.alias $ Container.find p nl |
1061 |
nsec = Node.alias $ Container.find s nl |
1062 |
opri = Node.alias $ Container.find (Instance.pNode inst) nl |
1063 |
osec = Node.alias $ Container.find (Instance.sNode inst) nl |
1064 |
(moves, cmds) = computeMoves inst inam mv npri nsec |
1065 |
ostr = printf "%s:%s" opri osec::String |
1066 |
nstr = printf "%s:%s" npri nsec::String |
1067 |
in |
1068 |
(printf " %3d. %-*s %-*s => %-*s %.8f a=%s" |
1069 |
pos imlen inam pmlen ostr |
1070 |
pmlen nstr c moves, |
1071 |
cmds) |
1072 |
|
1073 |
-- | Return the instance and involved nodes in an instance move. |
1074 |
involvedNodes :: Instance.List -> Placement -> [Ndx] |
1075 |
involvedNodes il plc = |
1076 |
let (i, np, ns, _, _) = plc |
1077 |
inst = Container.find i il |
1078 |
op = Instance.pNode inst |
1079 |
os = Instance.sNode inst |
1080 |
in nub [np, ns, op, os] |
1081 |
|
1082 |
-- | Inner function for splitJobs, that either appends the next job to |
1083 |
-- the current jobset, or starts a new jobset. |
1084 |
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx]) |
1085 |
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx) |
1086 |
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _) |
1087 |
| null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf) |
1088 |
| otherwise = ([n]:cjs, ndx) |
1089 |
|
1090 |
-- | Break a list of moves into independent groups. Note that this |
1091 |
-- will reverse the order of jobs. |
1092 |
splitJobs :: [MoveJob] -> [JobSet] |
1093 |
splitJobs = fst . foldl mergeJobs ([], []) |
1094 |
|
1095 |
-- | Given a list of commands, prefix them with @gnt-instance@ and |
1096 |
-- also beautify the display a little. |
1097 |
formatJob :: Int -> Int -> (Int, MoveJob) -> [String] |
1098 |
formatJob jsn jsl (sn, (_, _, _, cmds)) = |
1099 |
let out = |
1100 |
printf " echo job %d/%d" jsn sn: |
1101 |
printf " check": |
1102 |
map (" gnt-instance " ++) cmds |
1103 |
in if sn == 1 |
1104 |
then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out |
1105 |
else out |
1106 |
|
1107 |
-- | Given a list of commands, prefix them with @gnt-instance@ and |
1108 |
-- also beautify the display a little. |
1109 |
formatCmds :: [JobSet] -> String |
1110 |
formatCmds = |
1111 |
unlines . |
1112 |
concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js)) |
1113 |
(zip [1..] js)) . |
1114 |
zip [1..] |
1115 |
|
1116 |
-- | Print the node list. |
1117 |
printNodes :: Node.List -> [String] -> String |
1118 |
printNodes nl fs = |
1119 |
let fields = case fs of |
1120 |
[] -> Node.defaultFields |
1121 |
"+":rest -> Node.defaultFields ++ rest |
1122 |
_ -> fs |
1123 |
snl = sortBy (comparing Node.idx) (Container.elems nl) |
1124 |
(header, isnum) = unzip $ map Node.showHeader fields |
1125 |
in unlines . map ((:) ' ' . intercalate " ") $ |
1126 |
formatTable (header:map (Node.list fields) snl) isnum |
1127 |
|
1128 |
-- | Print the instance list. |
1129 |
printInsts :: Node.List -> Instance.List -> String |
1130 |
printInsts nl il = |
1131 |
let sil = sortBy (comparing Instance.idx) (Container.elems il) |
1132 |
helper inst = [ if Instance.running inst then "R" else " " |
1133 |
, Instance.name inst |
1134 |
, Container.nameOf nl (Instance.pNode inst) |
1135 |
, let sdx = Instance.sNode inst |
1136 |
in if sdx == Node.noSecondary |
1137 |
then "" |
1138 |
else Container.nameOf nl sdx |
1139 |
, if Instance.autoBalance inst then "Y" else "N" |
1140 |
, printf "%3d" $ Instance.vcpus inst |
1141 |
, printf "%5d" $ Instance.mem inst |
1142 |
, printf "%5d" $ Instance.dsk inst `div` 1024 |
1143 |
, printf "%5.3f" lC |
1144 |
, printf "%5.3f" lM |
1145 |
, printf "%5.3f" lD |
1146 |
, printf "%5.3f" lN |
1147 |
] |
1148 |
where DynUtil lC lM lD lN = Instance.util inst |
1149 |
header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal" |
1150 |
, "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ] |
1151 |
isnum = False:False:False:False:False:repeat True |
1152 |
in unlines . map ((:) ' ' . intercalate " ") $ |
1153 |
formatTable (header:map helper sil) isnum |
1154 |
|
1155 |
-- | Shows statistics for a given node list. |
1156 |
printStats :: Node.List -> String |
1157 |
printStats nl = |
1158 |
let dcvs = compDetailedCV nl |
1159 |
(weights, names) = unzip detailedCVInfo |
1160 |
hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs |
1161 |
formatted = map (\(w, header, val) -> |
1162 |
printf "%s=%.8f(x%.2f)" header val w::String) hd |
1163 |
in intercalate ", " formatted |
1164 |
|
1165 |
-- | Convert a placement into a list of OpCodes (basically a job). |
1166 |
iMoveToJob :: Node.List -> Instance.List |
1167 |
-> Idx -> IMove -> [OpCodes.OpCode] |
1168 |
iMoveToJob nl il idx move = |
1169 |
let inst = Container.find idx il |
1170 |
iname = Instance.name inst |
1171 |
lookNode = Just . Container.nameOf nl |
1172 |
opF = OpCodes.OpInstanceMigrate iname True False True |
1173 |
opR n = OpCodes.OpInstanceReplaceDisks iname (lookNode n) |
1174 |
OpCodes.ReplaceNewSecondary [] Nothing |
1175 |
in case move of |
1176 |
Failover -> [ opF ] |
1177 |
ReplacePrimary np -> [ opF, opR np, opF ] |
1178 |
ReplaceSecondary ns -> [ opR ns ] |
1179 |
ReplaceAndFailover np -> [ opR np, opF ] |
1180 |
FailoverAndReplace ns -> [ opF, opR ns ] |
1181 |
|
1182 |
-- * Node group functions |
1183 |
|
1184 |
-- | Computes the group of an instance. |
1185 |
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx |
1186 |
instanceGroup nl i = |
1187 |
let sidx = Instance.sNode i |
1188 |
pnode = Container.find (Instance.pNode i) nl |
1189 |
snode = if sidx == Node.noSecondary |
1190 |
then pnode |
1191 |
else Container.find sidx nl |
1192 |
pgroup = Node.group pnode |
1193 |
sgroup = Node.group snode |
1194 |
in if pgroup /= sgroup |
1195 |
then fail ("Instance placed accross two node groups, primary " ++ |
1196 |
show pgroup ++ ", secondary " ++ show sgroup) |
1197 |
else return pgroup |
1198 |
|
1199 |
-- | Computes the group of an instance per the primary node. |
1200 |
instancePriGroup :: Node.List -> Instance.Instance -> Gdx |
1201 |
instancePriGroup nl i = |
1202 |
let pnode = Container.find (Instance.pNode i) nl |
1203 |
in Node.group pnode |
1204 |
|
1205 |
-- | Compute the list of badly allocated instances (split across node |
1206 |
-- groups). |
1207 |
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance] |
1208 |
findSplitInstances nl = |
1209 |
filter (not . isOk . instanceGroup nl) . Container.elems |
1210 |
|
1211 |
-- | Splits a cluster into the component node groups. |
1212 |
splitCluster :: Node.List -> Instance.List -> |
1213 |
[(Gdx, (Node.List, Instance.List))] |
1214 |
splitCluster nl il = |
1215 |
let ngroups = Node.computeGroups (Container.elems nl) |
1216 |
in map (\(guuid, nodes) -> |
1217 |
let nidxs = map Node.idx nodes |
1218 |
nodes' = zip nidxs nodes |
1219 |
instances = Container.filter ((`elem` nidxs) . Instance.pNode) il |
1220 |
in (guuid, (Container.fromList nodes', instances))) ngroups |
1221 |
|
1222 |
-- | Split a global instance index map into per-group, and associate |
1223 |
-- it with the group/node/instance lists. |
1224 |
associateIdxs :: [Idx] -- ^ Instance indices to be split/associated |
1225 |
-> [(Gdx, (Node.List, Instance.List))] -- ^ Input groups |
1226 |
-> [(Gdx, (Node.List, Instance.List, [Idx]))] -- ^ Result |
1227 |
associateIdxs idxs = |
1228 |
map (\(gdx, (nl, il)) -> |
1229 |
(gdx, (nl, il, filter (`Container.member` il) idxs))) |
1230 |
|
1231 |
-- | Compute the list of nodes that are to be evacuated, given a list |
1232 |
-- of instances and an evacuation mode. |
1233 |
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list |
1234 |
-> EvacMode -- ^ The evacuation mode we're using |
1235 |
-> [Idx] -- ^ List of instance indices being evacuated |
1236 |
-> IntSet.IntSet -- ^ Set of node indices |
1237 |
nodesToEvacuate il mode = |
1238 |
IntSet.delete Node.noSecondary . |
1239 |
foldl' (\ns idx -> |
1240 |
let i = Container.find idx il |
1241 |
pdx = Instance.pNode i |
1242 |
sdx = Instance.sNode i |
1243 |
dt = Instance.diskTemplate i |
1244 |
withSecondary = case dt of |
1245 |
DTDrbd8 -> IntSet.insert sdx ns |
1246 |
_ -> ns |
1247 |
in case mode of |
1248 |
ChangePrimary -> IntSet.insert pdx ns |
1249 |
ChangeSecondary -> withSecondary |
1250 |
ChangeAll -> IntSet.insert pdx withSecondary |
1251 |
) IntSet.empty |