Statistics
| Branch: | Tag: | Revision:

root / Ganeti / HTools / Cluster.hs @ 4bc33d60

History | View | Annotate | Download (42.3 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the "Main" module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010, 2011 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , Table(..)
34
    , CStats(..)
35
    , AllocStats
36
    -- * Generic functions
37
    , totalResources
38
    , computeAllocationDelta
39
    -- * First phase functions
40
    , computeBadItems
41
    -- * Second phase functions
42
    , printSolutionLine
43
    , formatCmds
44
    , involvedNodes
45
    , splitJobs
46
    -- * Display functions
47
    , printNodes
48
    , printInsts
49
    -- * Balacing functions
50
    , checkMove
51
    , doNextBalance
52
    , tryBalance
53
    , compCV
54
    , compDetailedCV
55
    , printStats
56
    , iMoveToJob
57
    -- * IAllocator functions
58
    , tryAlloc
59
    , tryMGAlloc
60
    , tryReloc
61
    , tryMGReloc
62
    , tryEvac
63
    , collapseFailures
64
    -- * Allocation functions
65
    , iterateAlloc
66
    , tieredAlloc
67
    , tieredSpecMap
68
     -- * Node group functions
69
    , instanceGroup
70
    , findSplitInstances
71
    , splitCluster
72
    ) where
73

    
74
import Data.Function (on)
75
import Data.List
76
import Data.Ord (comparing)
77
import Text.Printf (printf)
78
import Control.Monad
79
import Control.Parallel.Strategies
80

    
81
import qualified Ganeti.HTools.Container as Container
82
import qualified Ganeti.HTools.Instance as Instance
83
import qualified Ganeti.HTools.Node as Node
84
import qualified Ganeti.HTools.Group as Group
85
import Ganeti.HTools.Types
86
import Ganeti.HTools.Utils
87
import qualified Ganeti.OpCodes as OpCodes
88

    
89
-- * Types
90

    
91
-- | Allocation\/relocation solution.
92
data AllocSolution = AllocSolution
93
  { asFailures  :: [FailMode]          -- ^ Failure counts
94
  , asAllocs    :: Int                 -- ^ Good allocation count
95
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
96
                                       -- of the list depends on the
97
                                       -- allocation/relocation mode
98
  , asLog       :: [String]            -- ^ A list of informational messages
99
  }
100

    
101
-- | The empty solution we start with when computing allocations
102
emptySolution :: AllocSolution
103
emptySolution = AllocSolution { asFailures = [], asAllocs = 0
104
                              , asSolutions = [], asLog = [] }
105

    
106
-- | The complete state for the balancing solution
107
data Table = Table Node.List Instance.List Score [Placement]
108
             deriving (Show, Read)
109

    
110
data CStats = CStats { csFmem :: Int    -- ^ Cluster free mem
111
                     , csFdsk :: Int    -- ^ Cluster free disk
112
                     , csAmem :: Int    -- ^ Cluster allocatable mem
113
                     , csAdsk :: Int    -- ^ Cluster allocatable disk
114
                     , csAcpu :: Int    -- ^ Cluster allocatable cpus
115
                     , csMmem :: Int    -- ^ Max node allocatable mem
116
                     , csMdsk :: Int    -- ^ Max node allocatable disk
117
                     , csMcpu :: Int    -- ^ Max node allocatable cpu
118
                     , csImem :: Int    -- ^ Instance used mem
119
                     , csIdsk :: Int    -- ^ Instance used disk
120
                     , csIcpu :: Int    -- ^ Instance used cpu
121
                     , csTmem :: Double -- ^ Cluster total mem
122
                     , csTdsk :: Double -- ^ Cluster total disk
123
                     , csTcpu :: Double -- ^ Cluster total cpus
124
                     , csVcpu :: Int    -- ^ Cluster virtual cpus (if
125
                                        -- node pCpu has been set,
126
                                        -- otherwise -1)
127
                     , csXmem :: Int    -- ^ Unnacounted for mem
128
                     , csNmem :: Int    -- ^ Node own memory
129
                     , csScore :: Score -- ^ The cluster score
130
                     , csNinst :: Int   -- ^ The total number of instances
131
                     }
132
            deriving (Show, Read)
133

    
134
-- | Currently used, possibly to allocate, unallocable
135
type AllocStats = (RSpec, RSpec, RSpec)
136

    
137
-- * Utility functions
138

    
139
-- | Verifies the N+1 status and return the affected nodes.
140
verifyN1 :: [Node.Node] -> [Node.Node]
141
verifyN1 = filter Node.failN1
142

    
143
{-| Computes the pair of bad nodes and instances.
144

    
145
The bad node list is computed via a simple 'verifyN1' check, and the
146
bad instance list is the list of primary and secondary instances of
147
those nodes.
148

    
149
-}
150
computeBadItems :: Node.List -> Instance.List ->
151
                   ([Node.Node], [Instance.Instance])
152
computeBadItems nl il =
153
  let bad_nodes = verifyN1 $ getOnline nl
154
      bad_instances = map (`Container.find` il) .
155
                      sort . nub $
156
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
157
  in
158
    (bad_nodes, bad_instances)
159

    
160
-- | Zero-initializer for the CStats type
161
emptyCStats :: CStats
162
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
163

    
164
-- | Update stats with data from a new node
165
updateCStats :: CStats -> Node.Node -> CStats
166
updateCStats cs node =
167
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
168
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
169
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
170
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
171
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
172
                 csVcpu = x_vcpu,
173
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
174
               }
175
            = cs
176
        inc_amem = Node.fMem node - Node.rMem node
177
        inc_amem' = if inc_amem > 0 then inc_amem else 0
178
        inc_adsk = Node.availDisk node
179
        inc_imem = truncate (Node.tMem node) - Node.nMem node
180
                   - Node.xMem node - Node.fMem node
181
        inc_icpu = Node.uCpu node
182
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
183
        inc_vcpu = Node.hiCpu node
184
        inc_acpu = Node.availCpu node
185

    
186
    in cs { csFmem = x_fmem + Node.fMem node
187
          , csFdsk = x_fdsk + Node.fDsk node
188
          , csAmem = x_amem + inc_amem'
189
          , csAdsk = x_adsk + inc_adsk
190
          , csAcpu = x_acpu + inc_acpu
191
          , csMmem = max x_mmem inc_amem'
192
          , csMdsk = max x_mdsk inc_adsk
193
          , csMcpu = max x_mcpu inc_acpu
194
          , csImem = x_imem + inc_imem
195
          , csIdsk = x_idsk + inc_idsk
196
          , csIcpu = x_icpu + inc_icpu
197
          , csTmem = x_tmem + Node.tMem node
198
          , csTdsk = x_tdsk + Node.tDsk node
199
          , csTcpu = x_tcpu + Node.tCpu node
200
          , csVcpu = x_vcpu + inc_vcpu
201
          , csXmem = x_xmem + Node.xMem node
202
          , csNmem = x_nmem + Node.nMem node
203
          , csNinst = x_ninst + length (Node.pList node)
204
          }
205

    
206
-- | Compute the total free disk and memory in the cluster.
207
totalResources :: Node.List -> CStats
208
totalResources nl =
209
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
210
    in cs { csScore = compCV nl }
211

    
212
-- | Compute the delta between two cluster state.
213
--
214
-- This is used when doing allocations, to understand better the
215
-- available cluster resources. The return value is a triple of the
216
-- current used values, the delta that was still allocated, and what
217
-- was left unallocated.
218
computeAllocationDelta :: CStats -> CStats -> AllocStats
219
computeAllocationDelta cini cfin =
220
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
221
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
222
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
223
        rini = RSpec i_icpu i_imem i_idsk
224
        rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
225
        un_cpu = v_cpu - f_icpu
226
        runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
227
    in (rini, rfin, runa)
228

    
229
-- | The names and weights of the individual elements in the CV list
230
detailedCVInfo :: [(Double, String)]
231
detailedCVInfo = [ (1,  "free_mem_cv")
232
                 , (1,  "free_disk_cv")
233
                 , (1,  "n1_cnt")
234
                 , (1,  "reserved_mem_cv")
235
                 , (4,  "offline_all_cnt")
236
                 , (16, "offline_pri_cnt")
237
                 , (1,  "vcpu_ratio_cv")
238
                 , (1,  "cpu_load_cv")
239
                 , (1,  "mem_load_cv")
240
                 , (1,  "disk_load_cv")
241
                 , (1,  "net_load_cv")
242
                 , (2,  "pri_tags_score")
243
                 ]
244

    
245
detailedCVWeights :: [Double]
246
detailedCVWeights = map fst detailedCVInfo
247

    
248
-- | Compute the mem and disk covariance.
249
compDetailedCV :: Node.List -> [Double]
250
compDetailedCV nl =
251
    let
252
        all_nodes = Container.elems nl
253
        (offline, nodes) = partition Node.offline all_nodes
254
        mem_l = map Node.pMem nodes
255
        dsk_l = map Node.pDsk nodes
256
        -- metric: memory covariance
257
        mem_cv = stdDev mem_l
258
        -- metric: disk covariance
259
        dsk_cv = stdDev dsk_l
260
        -- metric: count of instances living on N1 failing nodes
261
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
262
                                                   length (Node.pList n)) .
263
                   filter Node.failN1 $ nodes :: Double
264
        res_l = map Node.pRem nodes
265
        -- metric: reserved memory covariance
266
        res_cv = stdDev res_l
267
        -- offline instances metrics
268
        offline_ipri = sum . map (length . Node.pList) $ offline
269
        offline_isec = sum . map (length . Node.sList) $ offline
270
        -- metric: count of instances on offline nodes
271
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
272
        -- metric: count of primary instances on offline nodes (this
273
        -- helps with evacuation/failover of primary instances on
274
        -- 2-node clusters with one node offline)
275
        off_pri_score = fromIntegral offline_ipri::Double
276
        cpu_l = map Node.pCpu nodes
277
        -- metric: covariance of vcpu/pcpu ratio
278
        cpu_cv = stdDev cpu_l
279
        -- metrics: covariance of cpu, memory, disk and network load
280
        (c_load, m_load, d_load, n_load) = unzip4 $
281
            map (\n ->
282
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
283
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
284
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
285
                ) nodes
286
        -- metric: conflicting instance count
287
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
288
        pri_tags_score = fromIntegral pri_tags_inst::Double
289
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
290
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
291
       , pri_tags_score ]
292

    
293
-- | Compute the /total/ variance.
294
compCV :: Node.List -> Double
295
compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
296

    
297
-- | Compute online nodes from a Node.List
298
getOnline :: Node.List -> [Node.Node]
299
getOnline = filter (not . Node.offline) . Container.elems
300

    
301
-- * hbal functions
302

    
303
-- | Compute best table. Note that the ordering of the arguments is important.
304
compareTables :: Table -> Table -> Table
305
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
306
    if a_cv > b_cv then b else a
307

    
308
-- | Applies an instance move to a given node list and instance.
309
applyMove :: Node.List -> Instance.Instance
310
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
311
-- Failover (f)
312
applyMove nl inst Failover =
313
    let old_pdx = Instance.pNode inst
314
        old_sdx = Instance.sNode inst
315
        old_p = Container.find old_pdx nl
316
        old_s = Container.find old_sdx nl
317
        int_p = Node.removePri old_p inst
318
        int_s = Node.removeSec old_s inst
319
        force_p = Node.offline old_p
320
        new_nl = do -- Maybe monad
321
          new_p <- Node.addPriEx force_p int_s inst
322
          new_s <- Node.addSec int_p inst old_sdx
323
          let new_inst = Instance.setBoth inst old_sdx old_pdx
324
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
325
                  new_inst, old_sdx, old_pdx)
326
    in new_nl
327

    
328
-- Replace the primary (f:, r:np, f)
329
applyMove nl inst (ReplacePrimary new_pdx) =
330
    let old_pdx = Instance.pNode inst
331
        old_sdx = Instance.sNode inst
332
        old_p = Container.find old_pdx nl
333
        old_s = Container.find old_sdx nl
334
        tgt_n = Container.find new_pdx nl
335
        int_p = Node.removePri old_p inst
336
        int_s = Node.removeSec old_s inst
337
        force_p = Node.offline old_p
338
        new_nl = do -- Maybe monad
339
          -- check that the current secondary can host the instance
340
          -- during the migration
341
          tmp_s <- Node.addPriEx force_p int_s inst
342
          let tmp_s' = Node.removePri tmp_s inst
343
          new_p <- Node.addPriEx force_p tgt_n inst
344
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
345
          let new_inst = Instance.setPri inst new_pdx
346
          return (Container.add new_pdx new_p $
347
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
348
                  new_inst, new_pdx, old_sdx)
349
    in new_nl
350

    
351
-- Replace the secondary (r:ns)
352
applyMove nl inst (ReplaceSecondary new_sdx) =
353
    let old_pdx = Instance.pNode inst
354
        old_sdx = Instance.sNode inst
355
        old_s = Container.find old_sdx nl
356
        tgt_n = Container.find new_sdx nl
357
        int_s = Node.removeSec old_s inst
358
        force_s = Node.offline old_s
359
        new_inst = Instance.setSec inst new_sdx
360
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
361
                 \new_s -> return (Container.addTwo new_sdx
362
                                   new_s old_sdx int_s nl,
363
                                   new_inst, old_pdx, new_sdx)
364
    in new_nl
365

    
366
-- Replace the secondary and failover (r:np, f)
367
applyMove nl inst (ReplaceAndFailover new_pdx) =
368
    let old_pdx = Instance.pNode inst
369
        old_sdx = Instance.sNode inst
370
        old_p = Container.find old_pdx nl
371
        old_s = Container.find old_sdx nl
372
        tgt_n = Container.find new_pdx nl
373
        int_p = Node.removePri old_p inst
374
        int_s = Node.removeSec old_s inst
375
        force_s = Node.offline old_s
376
        new_nl = do -- Maybe monad
377
          new_p <- Node.addPri tgt_n inst
378
          new_s <- Node.addSecEx force_s int_p inst new_pdx
379
          let new_inst = Instance.setBoth inst new_pdx old_pdx
380
          return (Container.add new_pdx new_p $
381
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
382
                  new_inst, new_pdx, old_pdx)
383
    in new_nl
384

    
385
-- Failver and replace the secondary (f, r:ns)
386
applyMove nl inst (FailoverAndReplace new_sdx) =
387
    let old_pdx = Instance.pNode inst
388
        old_sdx = Instance.sNode inst
389
        old_p = Container.find old_pdx nl
390
        old_s = Container.find old_sdx nl
391
        tgt_n = Container.find new_sdx nl
392
        int_p = Node.removePri old_p inst
393
        int_s = Node.removeSec old_s inst
394
        force_p = Node.offline old_p
395
        new_nl = do -- Maybe monad
396
          new_p <- Node.addPriEx force_p int_s inst
397
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
398
          let new_inst = Instance.setBoth inst old_sdx new_sdx
399
          return (Container.add new_sdx new_s $
400
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
401
                  new_inst, old_sdx, new_sdx)
402
    in new_nl
403

    
404
-- | Tries to allocate an instance on one given node.
405
allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
406
                 -> OpResult Node.AllocElement
407
allocateOnSingle nl inst p =
408
    let new_pdx = Node.idx p
409
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
410
    in  Node.addPri p inst >>= \new_p -> do
411
      let new_nl = Container.add new_pdx new_p nl
412
          new_score = compCV nl
413
      return (new_nl, new_inst, [new_p], new_score)
414

    
415
-- | Tries to allocate an instance on a given pair of nodes.
416
allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
417
               -> OpResult Node.AllocElement
418
allocateOnPair nl inst tgt_p tgt_s =
419
    let new_pdx = Node.idx tgt_p
420
        new_sdx = Node.idx tgt_s
421
    in do
422
      new_p <- Node.addPri tgt_p inst
423
      new_s <- Node.addSec tgt_s inst new_pdx
424
      let new_inst = Instance.setBoth inst new_pdx new_sdx
425
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
426
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
427

    
428
-- | Tries to perform an instance move and returns the best table
429
-- between the original one and the new one.
430
checkSingleStep :: Table -- ^ The original table
431
                -> Instance.Instance -- ^ The instance to move
432
                -> Table -- ^ The current best table
433
                -> IMove -- ^ The move to apply
434
                -> Table -- ^ The final best table
435
checkSingleStep ini_tbl target cur_tbl move =
436
    let
437
        Table ini_nl ini_il _ ini_plc = ini_tbl
438
        tmp_resu = applyMove ini_nl target move
439
    in
440
      case tmp_resu of
441
        OpFail _ -> cur_tbl
442
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
443
            let tgt_idx = Instance.idx target
444
                upd_cvar = compCV upd_nl
445
                upd_il = Container.add tgt_idx new_inst ini_il
446
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
447
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
448
            in
449
              compareTables cur_tbl upd_tbl
450

    
451
-- | Given the status of the current secondary as a valid new node and
452
-- the current candidate target node, generate the possible moves for
453
-- a instance.
454
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
455
              -> Ndx       -- ^ Target node candidate
456
              -> [IMove]   -- ^ List of valid result moves
457
possibleMoves True tdx =
458
    [ReplaceSecondary tdx,
459
     ReplaceAndFailover tdx,
460
     ReplacePrimary tdx,
461
     FailoverAndReplace tdx]
462

    
463
possibleMoves False tdx =
464
    [ReplaceSecondary tdx,
465
     ReplaceAndFailover tdx]
466

    
467
-- | Compute the best move for a given instance.
468
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
469
                  -> Bool              -- ^ Whether disk moves are allowed
470
                  -> Table             -- ^ Original table
471
                  -> Instance.Instance -- ^ Instance to move
472
                  -> Table             -- ^ Best new table for this instance
473
checkInstanceMove nodes_idx disk_moves ini_tbl target =
474
    let
475
        opdx = Instance.pNode target
476
        osdx = Instance.sNode target
477
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
478
        use_secondary = elem osdx nodes_idx
479
        aft_failover = if use_secondary -- if allowed to failover
480
                       then checkSingleStep ini_tbl target ini_tbl Failover
481
                       else ini_tbl
482
        all_moves = if disk_moves
483
                    then concatMap (possibleMoves use_secondary) nodes
484
                    else []
485
    in
486
      -- iterate over the possible nodes for this instance
487
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
488

    
489
-- | Compute the best next move.
490
checkMove :: [Ndx]               -- ^ Allowed target node indices
491
          -> Bool                -- ^ Whether disk moves are allowed
492
          -> Table               -- ^ The current solution
493
          -> [Instance.Instance] -- ^ List of instances still to move
494
          -> Table               -- ^ The new solution
495
checkMove nodes_idx disk_moves ini_tbl victims =
496
    let Table _ _ _ ini_plc = ini_tbl
497
        -- we're using rwhnf from the Control.Parallel.Strategies
498
        -- package; we don't need to use rnf as that would force too
499
        -- much evaluation in single-threaded cases, and in
500
        -- multi-threaded case the weak head normal form is enough to
501
        -- spark the evaluation
502
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves ini_tbl)
503
                 victims
504
        -- iterate over all instances, computing the best move
505
        best_tbl =
506
            foldl'
507
            (\ step_tbl new_tbl -> compareTables step_tbl new_tbl)
508
            ini_tbl tables
509
        Table _ _ _ best_plc = best_tbl
510
    in if length best_plc == length ini_plc
511
       then ini_tbl -- no advancement
512
       else best_tbl
513

    
514
-- | Check if we are allowed to go deeper in the balancing
515
doNextBalance :: Table     -- ^ The starting table
516
              -> Int       -- ^ Remaining length
517
              -> Score     -- ^ Score at which to stop
518
              -> Bool      -- ^ The resulting table and commands
519
doNextBalance ini_tbl max_rounds min_score =
520
    let Table _ _ ini_cv ini_plc = ini_tbl
521
        ini_plc_len = length ini_plc
522
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
523

    
524
-- | Run a balance move
525
tryBalance :: Table       -- ^ The starting table
526
           -> Bool        -- ^ Allow disk moves
527
           -> Bool        -- ^ Only evacuate moves
528
           -> Score       -- ^ Min gain threshold
529
           -> Score       -- ^ Min gain
530
           -> Maybe Table -- ^ The resulting table and commands
531
tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
532
    let Table ini_nl ini_il ini_cv _ = ini_tbl
533
        all_inst = Container.elems ini_il
534
        all_inst' = if evac_mode
535
                    then let bad_nodes = map Node.idx . filter Node.offline $
536
                                         Container.elems ini_nl
537
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
538
                                          Instance.pNode e `elem` bad_nodes)
539
                            all_inst
540
                    else all_inst
541
        reloc_inst = filter Instance.movable all_inst'
542
        node_idx = map Node.idx . filter (not . Node.offline) $
543
                   Container.elems ini_nl
544
        fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
545
        (Table _ _ fin_cv _) = fin_tbl
546
    in
547
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
548
      then Just fin_tbl -- this round made success, return the new table
549
      else Nothing
550

    
551
-- * Allocation functions
552

    
553
-- | Build failure stats out of a list of failures
554
collapseFailures :: [FailMode] -> FailStats
555
collapseFailures flst =
556
    map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
557

    
558
-- | Update current Allocation solution and failure stats with new
559
-- elements
560
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
561
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
562

    
563
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
564
    let -- Choose the old or new solution, based on the cluster score
565
        cntok = asAllocs as
566
        osols = asSolutions as
567
        nsols = case osols of
568
                  [] -> [ns]
569
                  (_, _, _, oscore):[] ->
570
                      if oscore < nscore
571
                      then osols
572
                      else [ns]
573
                  -- FIXME: here we simply concat to lists with more
574
                  -- than one element; we should instead abort, since
575
                  -- this is not a valid usage of this function
576
                  xs -> ns:xs
577
        nsuc = cntok + 1
578
    -- Note: we force evaluation of nsols here in order to keep the
579
    -- memory profile low - we know that we will need nsols for sure
580
    -- in the next cycle, so we force evaluation of nsols, since the
581
    -- foldl' in the caller will only evaluate the tuple, but not the
582
    -- elements of the tuple
583
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
584

    
585
-- | Given a solution, generates a reasonable description for it
586
describeSolution :: AllocSolution -> String
587
describeSolution as =
588
  let fcnt = asFailures as
589
      sols = asSolutions as
590
      freasons =
591
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
592
        filter ((> 0) . snd) . collapseFailures $ fcnt
593
  in if null sols
594
     then "No valid allocation solutions, failure reasons: " ++
595
          (if null fcnt
596
           then "unknown reasons"
597
           else freasons)
598
     else let (_, _, nodes, cv) = head sols
599
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
600
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
601
             (intercalate "/" . map Node.name $ nodes)
602

    
603
-- | Annotates a solution with the appropriate string
604
annotateSolution :: AllocSolution -> AllocSolution
605
annotateSolution as = as { asLog = describeSolution as : asLog as }
606

    
607
-- | Try to allocate an instance on the cluster.
608
tryAlloc :: (Monad m) =>
609
            Node.List         -- ^ The node list
610
         -> Instance.List     -- ^ The instance list
611
         -> Instance.Instance -- ^ The instance to allocate
612
         -> Int               -- ^ Required number of nodes
613
         -> m AllocSolution   -- ^ Possible solution list
614
tryAlloc nl _ inst 2 =
615
    let all_nodes = getOnline nl
616
        all_pairs = liftM2 (,) all_nodes all_nodes
617
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
618
        sols = foldl' (\cstate (p, s) ->
619
                           concatAllocs cstate $ allocateOnPair nl inst p s
620
                      ) emptySolution ok_pairs
621

    
622
    in if null ok_pairs -- means we have just one node
623
       then fail "Not enough online nodes"
624
       else return $ annotateSolution sols
625

    
626
tryAlloc nl _ inst 1 =
627
    let all_nodes = getOnline nl
628
        sols = foldl' (\cstate ->
629
                           concatAllocs cstate . allocateOnSingle nl inst
630
                      ) emptySolution all_nodes
631
    in if null all_nodes
632
       then fail "No online nodes"
633
       else return $ annotateSolution sols
634

    
635
tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
636
                             \destinations required (" ++ show reqn ++
637
                                               "), only two supported"
638

    
639
-- | Given a group/result, describe it as a nice (list of) messages
640
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
641
solutionDescription gl (groupId, result) =
642
  case result of
643
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
644
    Bad message -> [printf "Group %s: error %s" gname message]
645
  where grp = Container.find groupId gl
646
        gname = Group.name grp
647
        pol = apolToString (Group.allocPolicy grp)
648

    
649
-- | From a list of possibly bad and possibly empty solutions, filter
650
-- only the groups with a valid result
651
filterMGResults :: Group.List
652
                -> [(Gdx, Result AllocSolution)]
653
                -> [(Gdx, AllocSolution)]
654
filterMGResults gl=
655
  filter ((/= AllocUnallocable) . Group.allocPolicy .
656
             flip Container.find gl . fst) .
657
  filter (not . null . asSolutions . snd) .
658
  map (\(y, Ok x) -> (y, x)) .
659
  filter (isOk . snd)
660

    
661
-- | Sort multigroup results based on policy and score
662
sortMGResults :: Group.List
663
             -> [(Gdx, AllocSolution)]
664
             -> [(Gdx, AllocSolution)]
665
sortMGResults gl sols =
666
    let extractScore = \(_, _, _, x) -> x
667
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
668
                               (extractScore . head . asSolutions) sol)
669
    in sortBy (comparing solScore) sols
670

    
671
-- | Try to allocate an instance on a multi-group cluster.
672
tryMGAlloc :: Group.List           -- ^ The group list
673
           -> Node.List            -- ^ The node list
674
           -> Instance.List        -- ^ The instance list
675
           -> Instance.Instance    -- ^ The instance to allocate
676
           -> Int                  -- ^ Required number of nodes
677
           -> Result AllocSolution -- ^ Possible solution list
678
tryMGAlloc mggl mgnl mgil inst cnt =
679
  let groups = splitCluster mgnl mgil
680
      -- TODO: currently we consider all groups preferred
681
      sols = map (\(gid, (nl, il)) ->
682
                   (gid, tryAlloc nl il inst cnt)) groups::
683
        [(Gdx, Result AllocSolution)]
684
      all_msgs = concatMap (solutionDescription mggl) sols
685
      goodSols = filterMGResults mggl sols
686
      sortedSols = sortMGResults mggl goodSols
687
  in if null sortedSols
688
     then Bad $ intercalate ", " all_msgs
689
     else let (final_group, final_sol) = head sortedSols
690
              final_name = Group.name $ Container.find final_group mggl
691
              selmsg = "Selected group: " ++  final_name
692
          in Ok $ final_sol { asLog = selmsg:all_msgs }
693

    
694
-- | Try to relocate an instance on the cluster.
695
tryReloc :: (Monad m) =>
696
            Node.List       -- ^ The node list
697
         -> Instance.List   -- ^ The instance list
698
         -> Idx             -- ^ The index of the instance to move
699
         -> Int             -- ^ The number of nodes required
700
         -> [Ndx]           -- ^ Nodes which should not be used
701
         -> m AllocSolution -- ^ Solution list
702
tryReloc nl il xid 1 ex_idx =
703
    let all_nodes = getOnline nl
704
        inst = Container.find xid il
705
        ex_idx' = Instance.pNode inst:ex_idx
706
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
707
        valid_idxes = map Node.idx valid_nodes
708
        sols1 = foldl' (\cstate x ->
709
                            let em = do
710
                                  (mnl, i, _, _) <-
711
                                      applyMove nl inst (ReplaceSecondary x)
712
                                  return (mnl, i, [Container.find x mnl],
713
                                          compCV mnl)
714
                            in concatAllocs cstate em
715
                       ) emptySolution valid_idxes
716
    in return sols1
717

    
718
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
719
                                \destinations required (" ++ show reqn ++
720
                                                  "), only one supported"
721

    
722
tryMGReloc :: (Monad m) =>
723
              Group.List      -- ^ The group list
724
           -> Node.List       -- ^ The node list
725
           -> Instance.List   -- ^ The instance list
726
           -> Idx             -- ^ The index of the instance to move
727
           -> Int             -- ^ The number of nodes required
728
           -> [Ndx]           -- ^ Nodes which should not be used
729
           -> m AllocSolution -- ^ Solution list
730
tryMGReloc _ mgnl mgil xid ncount ex_ndx = do
731
  let groups = splitCluster mgnl mgil
732
      -- TODO: we only relocate inside the group for now
733
      inst = Container.find xid mgil
734
  (nl, il) <- case lookup (instancePriGroup mgnl inst) groups of
735
                Nothing -> fail $ "Cannot find group for instance " ++
736
                           Instance.name inst
737
                Just v -> return v
738
  tryReloc nl il xid ncount ex_ndx
739

    
740
-- | Try to evacuate a list of nodes.
741
tryEvac :: (Monad m) =>
742
            Node.List       -- ^ The node list
743
         -> Instance.List   -- ^ The instance list
744
         -> [Ndx]           -- ^ Nodes to be evacuated
745
         -> m AllocSolution -- ^ Solution list
746
tryEvac nl il ex_ndx =
747
    let ex_nodes = map (`Container.find` nl) ex_ndx
748
        all_insts = nub . concatMap Node.sList $ ex_nodes
749
    in do
750
      (_, sol) <- foldM (\(nl', old_as) idx -> do
751
                            -- FIXME: hardcoded one node here
752
                            -- (fm, cs, aes)
753
                            new_as <- tryReloc nl' il idx 1 ex_ndx
754
                            case asSolutions new_as of
755
                              csol@(nl'', _, _, _):_ ->
756
                                -- an individual relocation succeeded,
757
                                -- we kind of compose the data from
758
                                -- the two solutions
759
                                return (nl'',
760
                                        new_as { asSolutions =
761
                                                    csol:asSolutions old_as })
762
                              -- this relocation failed, so we fail
763
                              -- the entire evac
764
                              _ -> fail $ "Can't evacuate instance " ++
765
                                   Instance.name (Container.find idx il) ++
766
                                   ": " ++ describeSolution new_as
767
                        ) (nl, emptySolution) all_insts
768
      return $ annotateSolution sol
769

    
770
-- | Recursively place instances on the cluster until we're out of space
771
iterateAlloc :: Node.List
772
             -> Instance.List
773
             -> Instance.Instance
774
             -> Int
775
             -> [Instance.Instance]
776
             -> [CStats]
777
             -> Result (FailStats, Node.List, Instance.List,
778
                        [Instance.Instance], [CStats])
779
iterateAlloc nl il newinst nreq ixes cstats =
780
      let depth = length ixes
781
          newname = printf "new-%d" depth::String
782
          newidx = length (Container.elems il) + depth
783
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
784
      in case tryAlloc nl il newi2 nreq of
785
           Bad s -> Bad s
786
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
787
               case sols3 of
788
                 [] -> Ok (collapseFailures errs, nl, il, ixes, cstats)
789
                 (xnl, xi, _, _):[] ->
790
                     iterateAlloc xnl (Container.add newidx xi il)
791
                                  newinst nreq (xi:ixes)
792
                                  (totalResources xnl:cstats)
793
                 _ -> Bad "Internal error: multiple solutions for single\
794
                          \ allocation"
795

    
796
-- | The core of the tiered allocation mode
797
tieredAlloc :: Node.List
798
            -> Instance.List
799
            -> Instance.Instance
800
            -> Int
801
            -> [Instance.Instance]
802
            -> [CStats]
803
            -> Result (FailStats, Node.List, Instance.List,
804
                       [Instance.Instance], [CStats])
805
tieredAlloc nl il newinst nreq ixes cstats =
806
    case iterateAlloc nl il newinst nreq ixes cstats of
807
      Bad s -> Bad s
808
      Ok (errs, nl', il', ixes', cstats') ->
809
          case Instance.shrinkByType newinst . fst . last $
810
               sortBy (comparing snd) errs of
811
            Bad _ -> Ok (errs, nl', il', ixes', cstats')
812
            Ok newinst' ->
813
                tieredAlloc nl' il' newinst' nreq ixes' cstats'
814

    
815
-- | Compute the tiered spec string description from a list of
816
-- allocated instances.
817
tieredSpecMap :: [Instance.Instance]
818
              -> [String]
819
tieredSpecMap trl_ixes =
820
    let fin_trl_ixes = reverse trl_ixes
821
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
822
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
823
                   ix_byspec
824
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
825
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
826

    
827
-- * Formatting functions
828

    
829
-- | Given the original and final nodes, computes the relocation description.
830
computeMoves :: Instance.Instance -- ^ The instance to be moved
831
             -> String -- ^ The instance name
832
             -> IMove  -- ^ The move being performed
833
             -> String -- ^ New primary
834
             -> String -- ^ New secondary
835
             -> (String, [String])
836
                -- ^ Tuple of moves and commands list; moves is containing
837
                -- either @/f/@ for failover or @/r:name/@ for replace
838
                -- secondary, while the command list holds gnt-instance
839
                -- commands (without that prefix), e.g \"@failover instance1@\"
840
computeMoves i inam mv c d =
841
    case mv of
842
      Failover -> ("f", [mig])
843
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
844
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
845
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
846
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
847
    where morf = if Instance.running i then "migrate" else "failover"
848
          mig = printf "%s -f %s" morf inam::String
849
          rep n = printf "replace-disks -n %s %s" n inam
850

    
851
-- | Converts a placement to string format.
852
printSolutionLine :: Node.List     -- ^ The node list
853
                  -> Instance.List -- ^ The instance list
854
                  -> Int           -- ^ Maximum node name length
855
                  -> Int           -- ^ Maximum instance name length
856
                  -> Placement     -- ^ The current placement
857
                  -> Int           -- ^ The index of the placement in
858
                                   -- the solution
859
                  -> (String, [String])
860
printSolutionLine nl il nmlen imlen plc pos =
861
    let
862
        pmlen = (2*nmlen + 1)
863
        (i, p, s, mv, c) = plc
864
        inst = Container.find i il
865
        inam = Instance.alias inst
866
        npri = Node.alias $ Container.find p nl
867
        nsec = Node.alias $ Container.find s nl
868
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
869
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
870
        (moves, cmds) =  computeMoves inst inam mv npri nsec
871
        ostr = printf "%s:%s" opri osec::String
872
        nstr = printf "%s:%s" npri nsec::String
873
    in
874
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
875
       pos imlen inam pmlen ostr
876
       pmlen nstr c moves,
877
       cmds)
878

    
879
-- | Return the instance and involved nodes in an instance move.
880
involvedNodes :: Instance.List -> Placement -> [Ndx]
881
involvedNodes il plc =
882
    let (i, np, ns, _, _) = plc
883
        inst = Container.find i il
884
        op = Instance.pNode inst
885
        os = Instance.sNode inst
886
    in nub [np, ns, op, os]
887

    
888
-- | Inner function for splitJobs, that either appends the next job to
889
-- the current jobset, or starts a new jobset.
890
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
891
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
892
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
893
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
894
    | otherwise = ([n]:cjs, ndx)
895

    
896
-- | Break a list of moves into independent groups. Note that this
897
-- will reverse the order of jobs.
898
splitJobs :: [MoveJob] -> [JobSet]
899
splitJobs = fst . foldl mergeJobs ([], [])
900

    
901
-- | Given a list of commands, prefix them with @gnt-instance@ and
902
-- also beautify the display a little.
903
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
904
formatJob jsn jsl (sn, (_, _, _, cmds)) =
905
    let out =
906
            printf "  echo job %d/%d" jsn sn:
907
            printf "  check":
908
            map ("  gnt-instance " ++) cmds
909
    in if sn == 1
910
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
911
       else out
912

    
913
-- | Given a list of commands, prefix them with @gnt-instance@ and
914
-- also beautify the display a little.
915
formatCmds :: [JobSet] -> String
916
formatCmds =
917
    unlines .
918
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
919
                             (zip [1..] js)) .
920
    zip [1..]
921

    
922
-- | Print the node list.
923
printNodes :: Node.List -> [String] -> String
924
printNodes nl fs =
925
    let fields = case fs of
926
          [] -> Node.defaultFields
927
          "+":rest -> Node.defaultFields ++ rest
928
          _ -> fs
929
        snl = sortBy (comparing Node.idx) (Container.elems nl)
930
        (header, isnum) = unzip $ map Node.showHeader fields
931
    in unlines . map ((:) ' ' .  intercalate " ") $
932
       formatTable (header:map (Node.list fields) snl) isnum
933

    
934
-- | Print the instance list.
935
printInsts :: Node.List -> Instance.List -> String
936
printInsts nl il =
937
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
938
        helper inst = [ if Instance.running inst then "R" else " "
939
                      , Instance.name inst
940
                      , Container.nameOf nl (Instance.pNode inst)
941
                      , let sdx = Instance.sNode inst
942
                        in if sdx == Node.noSecondary
943
                           then  ""
944
                           else Container.nameOf nl sdx
945
                      , printf "%3d" $ Instance.vcpus inst
946
                      , printf "%5d" $ Instance.mem inst
947
                      , printf "%5d" $ Instance.dsk inst `div` 1024
948
                      , printf "%5.3f" lC
949
                      , printf "%5.3f" lM
950
                      , printf "%5.3f" lD
951
                      , printf "%5.3f" lN
952
                      ]
953
            where DynUtil lC lM lD lN = Instance.util inst
954
        header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
955
                 , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
956
        isnum = False:False:False:False:repeat True
957
    in unlines . map ((:) ' ' . intercalate " ") $
958
       formatTable (header:map helper sil) isnum
959

    
960
-- | Shows statistics for a given node list.
961
printStats :: Node.List -> String
962
printStats nl =
963
    let dcvs = compDetailedCV nl
964
        (weights, names) = unzip detailedCVInfo
965
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
966
        formatted = map (\(w, header, val) ->
967
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
968
    in intercalate ", " formatted
969

    
970
-- | Convert a placement into a list of OpCodes (basically a job).
971
iMoveToJob :: Node.List -> Instance.List
972
          -> Idx -> IMove -> [OpCodes.OpCode]
973
iMoveToJob nl il idx move =
974
    let inst = Container.find idx il
975
        iname = Instance.name inst
976
        lookNode  = Just . Container.nameOf nl
977
        opF = if Instance.running inst
978
              then OpCodes.OpMigrateInstance iname True False
979
              else OpCodes.OpFailoverInstance iname False
980
        opR n = OpCodes.OpReplaceDisks iname (lookNode n)
981
                OpCodes.ReplaceNewSecondary [] Nothing
982
    in case move of
983
         Failover -> [ opF ]
984
         ReplacePrimary np -> [ opF, opR np, opF ]
985
         ReplaceSecondary ns -> [ opR ns ]
986
         ReplaceAndFailover np -> [ opR np, opF ]
987
         FailoverAndReplace ns -> [ opF, opR ns ]
988

    
989
-- * Node group functions
990

    
991
-- | Computes the group of an instance
992
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
993
instanceGroup nl i =
994
  let sidx = Instance.sNode i
995
      pnode = Container.find (Instance.pNode i) nl
996
      snode = if sidx == Node.noSecondary
997
              then pnode
998
              else Container.find sidx nl
999
      pgroup = Node.group pnode
1000
      sgroup = Node.group snode
1001
  in if pgroup /= sgroup
1002
     then fail ("Instance placed accross two node groups, primary " ++
1003
                show pgroup ++ ", secondary " ++ show sgroup)
1004
     else return pgroup
1005

    
1006
-- | Computes the group of an instance per the primary node
1007
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
1008
instancePriGroup nl i =
1009
  let pnode = Container.find (Instance.pNode i) nl
1010
  in  Node.group pnode
1011

    
1012
-- | Compute the list of badly allocated instances (split across node
1013
-- groups)
1014
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
1015
findSplitInstances nl il =
1016
  filter (not . isOk . instanceGroup nl) (Container.elems il)
1017

    
1018
-- | Splits a cluster into the component node groups
1019
splitCluster :: Node.List -> Instance.List ->
1020
                [(Gdx, (Node.List, Instance.List))]
1021
splitCluster nl il =
1022
  let ngroups = Node.computeGroups (Container.elems nl)
1023
  in map (\(guuid, nodes) ->
1024
           let nidxs = map Node.idx nodes
1025
               nodes' = zip nidxs nodes
1026
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
1027
           in (guuid, (Container.fromList nodes', instances))) ngroups