{-| Implementation of cluster-wide logic.

This module holds all pure cluster-logic; I\/O related functionality
goes into the "Main" module for the individual binaries.

-}

{-

Copyright (C) 2009, 2010 Google Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

module Ganeti.HTools.Cluster
    (
     -- * Types
      AllocSolution(..)
    , Table(..)
    , CStats(..)
    , AllocStats
    -- * Generic functions
    , totalResources
    , computeAllocationDelta
    -- * First phase functions
    , computeBadItems
    -- * Second phase functions
    , printSolutionLine
    , formatCmds
    , involvedNodes
    , splitJobs
    -- * Display functions
    , printNodes
    , printInsts
    -- * Balancing functions
    , checkMove
    , doNextBalance
    , tryBalance
    , compCV
    , printStats
    , iMoveToJob
    -- * IAllocator functions
    , tryAlloc
    , tryMGAlloc
    , tryReloc
    , tryEvac
    , collapseFailures
    -- * Allocation functions
    , iterateAlloc
    , tieredAlloc
    , tieredSpecMap
     -- * Node group functions
    , instanceGroup
    , findSplitInstances
    , splitCluster
    ) where

import Data.Function (on)
import Data.List
import Data.Ord (comparing)
import Text.Printf (printf)
import Control.Monad

import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Group as Group
import Ganeti.HTools.Types
import Ganeti.HTools.Utils
import qualified Ganeti.OpCodes as OpCodes

-- * Types

-- | Allocation\/relocation solution.
data AllocSolution = AllocSolution
  { asFailures  :: [FailMode]          -- ^ Failure counts
  , asAllocs    :: Int                 -- ^ Good allocation count
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
                                       -- of the list depends on the
                                       -- allocation/relocation mode
  , asLog       :: [String]            -- ^ A list of informational messages
  }

-- | The empty solution we start with when computing allocations
emptySolution :: AllocSolution
emptySolution = AllocSolution { asFailures = [], asAllocs = 0
                              , asSolutions = [], asLog = [] }

-- | The complete state for the balancing solution
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show)

data CStats = CStats { csFmem :: Int    -- ^ Cluster free mem
                     , csFdsk :: Int    -- ^ Cluster free disk
                     , csAmem :: Int    -- ^ Cluster allocatable mem
                     , csAdsk :: Int    -- ^ Cluster allocatable disk
                     , csAcpu :: Int    -- ^ Cluster allocatable cpus
                     , csMmem :: Int    -- ^ Max node allocatable mem
                     , csMdsk :: Int    -- ^ Max node allocatable disk
                     , csMcpu :: Int    -- ^ Max node allocatable cpu
                     , csImem :: Int    -- ^ Instance used mem
                     , csIdsk :: Int    -- ^ Instance used disk
                     , csIcpu :: Int    -- ^ Instance used cpu
                     , csTmem :: Double -- ^ Cluster total mem
                     , csTdsk :: Double -- ^ Cluster total disk
                     , csTcpu :: Double -- ^ Cluster total cpus
                     , csVcpu :: Int    -- ^ Cluster virtual cpus (if
                                        -- node pCpu has been set,
                                        -- otherwise -1)
                     , csXmem :: Int    -- ^ Unaccounted for mem
                     , csNmem :: Int    -- ^ Node own memory
                     , csScore :: Score -- ^ The cluster score
                     , csNinst :: Int   -- ^ The total number of instances
                     }
            deriving (Show)

-- | Resources currently used, still allocatable, and unallocable.
type AllocStats = (RSpec, RSpec, RSpec)

-- * Utility functions

-- | Verifies the N+1 status and returns the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 = filter Node.failN1

{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      bad_instances = map (`Container.find` il) .
                      sort . nub $
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)

-- | Zero-initializer for the CStats type
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

-- | Update stats with data from a new node
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
                 csVcpu = x_vcpu,
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
               }
            = cs
        inc_amem = Node.fMem node - Node.rMem node
        inc_amem' = if inc_amem > 0 then inc_amem else 0
        inc_adsk = Node.availDisk node
        inc_imem = truncate (Node.tMem node) - Node.nMem node
                   - Node.xMem node - Node.fMem node
        inc_icpu = Node.uCpu node
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
        inc_vcpu = Node.hiCpu node

    in cs { csFmem = x_fmem + Node.fMem node
          , csFdsk = x_fdsk + Node.fDsk node
          , csAmem = x_amem + inc_amem'
          , csAdsk = x_adsk + inc_adsk
          , csAcpu = x_acpu
          , csMmem = max x_mmem inc_amem'
          , csMdsk = max x_mdsk inc_adsk
          , csMcpu = x_mcpu
          , csImem = x_imem + inc_imem
          , csIdsk = x_idsk + inc_idsk
          , csIcpu = x_icpu + inc_icpu
          , csTmem = x_tmem + Node.tMem node
          , csTdsk = x_tdsk + Node.tDsk node
          , csTcpu = x_tcpu + Node.tCpu node
          , csVcpu = x_vcpu + inc_vcpu
          , csXmem = x_xmem + Node.xMem node
          , csNmem = x_nmem + Node.nMem node
          , csNinst = x_ninst + length (Node.pList node)
          }

-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
    in cs { csScore = compCV nl }

-- | Compute the delta between two cluster states.
--
-- This is used when doing allocations, to better understand the
-- available cluster resources. The return value is a triple of the
-- currently used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
        rini = RSpec i_icpu i_imem i_idsk
        rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
        un_cpu = v_cpu - f_icpu
        runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
    in (rini, rfin, runa)
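
-- As an illustration (hypothetical numbers, not taken from a real
-- cluster): if the initial stats record 4 CPUs, 8192 MiB memory and
-- 102400 MiB disk used by instances, and the final stats record 6 CPUs,
-- 12288 MiB and 153600 MiB used, with 16 virtual CPUs, 65536 MiB total
-- memory and 1048576 MiB total disk, then:
--
-- > computeAllocationDelta cini cfin ==
-- >   ( RSpec 4 8192 102400     -- in use before the allocations
-- >   , RSpec 2 4096 51200      -- newly allocated delta
-- >   , RSpec 10 53248 894976   -- still unallocated
-- >   )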

-- | The names and weights of the individual elements in the CV list
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
                 , (2,  "pri_tags_score")
                 ]

detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo

-- | Compute the detailed cluster metrics (variance coefficients and
-- instance counts).
compDetailedCV :: Node.List -> [Double]
compDetailedCV nl =
    let
        all_nodes = Container.elems nl
        (offline, nodes) = partition Node.offline all_nodes
        mem_l = map Node.pMem nodes
        dsk_l = map Node.pDsk nodes
        -- metric: memory covariance
        mem_cv = varianceCoeff mem_l
        -- metric: disk covariance
        dsk_cv = varianceCoeff dsk_l
        -- metric: count of instances living on N1 failing nodes
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                   length (Node.pList n)) .
                   filter Node.failN1 $ nodes :: Double
        res_l = map Node.pRem nodes
        -- metric: reserved memory covariance
        res_cv = varianceCoeff res_l
        -- offline instances metrics
        offline_ipri = sum . map (length . Node.pList) $ offline
        offline_isec = sum . map (length . Node.sList) $ offline
        -- metric: count of instances on offline nodes
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
        -- metric: count of primary instances on offline nodes (this
        -- helps with evacuation/failover of primary instances on
        -- 2-node clusters with one node offline)
        off_pri_score = fromIntegral offline_ipri::Double
        cpu_l = map Node.pCpu nodes
        -- metric: covariance of vcpu/pcpu ratio
        cpu_cv = varianceCoeff cpu_l
        -- metrics: covariance of cpu, memory, disk and network load
        (c_load, m_load, d_load, n_load) = unzip4 $
            map (\n ->
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
                ) nodes
        -- metric: conflicting instance count
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
        pri_tags_score = fromIntegral pri_tags_inst::Double
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
       , varianceCoeff c_load, varianceCoeff m_load
       , varianceCoeff d_load, varianceCoeff n_load
       , pri_tags_score ]

-- | Compute the /total/ cluster score as the weighted sum of the
-- detailed metrics.
compCV :: Node.List -> Double
compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
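
-- To illustrate the weighting (with made-up values): a single primary
-- instance on an offline node contributes 4*1 + 16*1 = 20 to the score
-- (it is counted by both offline_all_cnt and offline_pri_cnt), while a
-- free-memory variance coefficient of 0.05 contributes only 1*0.05, so
-- the count-based metrics dominate the pure balance coefficients:
--
-- > compCV nl == sum (zipWith (*) detailedCVWeights (compDetailedCV nl))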

-- | Compute online nodes from a Node.List
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems

-- * hbal functions

-- | Compute best table. Note that the ordering of the arguments is important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
    if a_cv > b_cv then b else a

-- | Applies an instance move to a given node list and instance.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSec int_p inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx old_pdx
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                  new_inst, old_sdx, old_pdx)
    in new_nl

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          -- check that the current secondary can host the instance
          -- during the migration
          tmp_s <- Node.addPriEx force_p int_s inst
          let tmp_s' = Node.removePri tmp_s inst
          new_p <- Node.addPriEx force_p tgt_n inst
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
          let new_inst = Instance.setPri inst new_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
                  new_inst, new_pdx, old_sdx)
    in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_inst = Instance.setSec inst new_sdx
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
                 \new_s -> return (Container.addTwo new_sdx
                                   new_s old_sdx int_s nl,
                                   new_inst, old_pdx, new_sdx)
    in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_s = Node.offline old_s
        new_nl = do -- Maybe monad
          new_p <- Node.addPri tgt_n inst
          new_s <- Node.addSecEx force_s int_p inst new_pdx
          let new_inst = Instance.setBoth inst new_pdx old_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
                  new_inst, new_pdx, old_pdx)
    in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
        force_p = Node.offline old_p
        new_nl = do -- Maybe monad
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
          let new_inst = Instance.setBoth inst old_sdx new_sdx
          return (Container.add new_sdx new_s $
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
                  new_inst, old_sdx, new_sdx)
    in new_nl

-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst p =
    let new_pdx = Node.idx p
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
    in  Node.addPri p inst >>= \new_p -> do
      let new_nl = Container.add new_pdx new_p nl
          new_score = compCV nl
      return (new_nl, new_inst, [new_p], new_score)

-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
               -> OpResult Node.AllocElement
allocateOnPair nl inst tgt_p tgt_s =
    let new_pdx = Node.idx tgt_p
        new_sdx = Node.idx tgt_s
    in do
      new_p <- Node.addPri tgt_p inst
      new_s <- Node.addSec tgt_s inst new_pdx
      let new_inst = Instance.setBoth inst new_pdx new_sdx
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)

-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
    let
        Table ini_nl ini_il _ ini_plc = ini_tbl
        tmp_resu = applyMove ini_nl target move
    in
      case tmp_resu of
        OpFail _ -> cur_tbl
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
            let tgt_idx = Instance.idx target
                upd_cvar = compCV upd_nl
                upd_il = Container.add tgt_idx new_inst ini_il
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
            in
              compareTables cur_tbl upd_tbl

-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
              -> Ndx       -- ^ Target node candidate
              -> [IMove]   -- ^ List of valid result moves
possibleMoves True tdx =
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx,
     ReplacePrimary tdx,
     FailoverAndReplace tdx]

possibleMoves False tdx =
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx]

-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves ini_tbl target =
    let
        opdx = Instance.pNode target
        osdx = Instance.sNode target
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
        use_secondary = elem osdx nodes_idx
        aft_failover = if use_secondary -- if allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
        all_moves = if disk_moves
                    then concatMap (possibleMoves use_secondary) nodes
                    else []
    in
      -- iterate over the possible nodes for this instance
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves

-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves ini_tbl victims =
    let Table _ _ _ ini_plc = ini_tbl
        -- iterate over all instances, computing the best move
        best_tbl =
            foldl'
            (\ step_tbl em ->
                 compareTables step_tbl $
                 checkInstanceMove nodes_idx disk_moves ini_tbl em)
            ini_tbl victims
        Table _ _ _ best_plc = best_tbl
    in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl

-- | Check if we are allowed to go deeper in the balancing
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ Whether we can continue balancing
doNextBalance ini_tbl max_rounds min_score =
    let Table _ _ ini_cv ini_plc = ini_tbl
        ini_plc_len = length ini_plc
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score

-- | Run a balance move
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Only evacuate moves
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
        all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx . filter Node.offline $
                                         Container.elems ini_nl
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
                                          Instance.pNode e `elem` bad_nodes)
                            all_inst
                    else all_inst
        reloc_inst = filter Instance.movable all_inst'
        node_idx = map Node.idx . filter (not . Node.offline) $
                   Container.elems ini_nl
        fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
        (Table _ _ fin_cv _) = fin_tbl
    in
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round was successful, return the new table
      else Nothing
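
-- A minimal sketch of how a caller could drive the balancing loop with
-- 'doNextBalance' and 'tryBalance' (the real drivers live in the hbal
-- binary; the function name and the threshold values below are purely
-- illustrative):
--
-- > balanceLoop :: Int -> Score -> Table -> Table
-- > balanceLoop max_rounds min_score tbl
-- >   | not (doNextBalance tbl max_rounds min_score) = tbl
-- >   | otherwise =
-- >       case tryBalance tbl True False 1e-2 1e-4 of
-- >         Just tbl' -> balanceLoop max_rounds min_score tbl'
-- >         Nothing   -> tbl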

-- * Allocation functions

-- | Build failure stats out of a list of failures
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
    map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]

-- | Update current Allocation solution and failure stats with new
-- elements
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }

concatAllocs as (OpGood ns@(_, _, _, nscore)) =
    let -- Choose the old or new solution, based on the cluster score
        cntok = asAllocs as
        osols = asSolutions as
        nsols = case osols of
                  [] -> [ns]
                  (_, _, _, oscore):[] ->
                      if oscore < nscore
                      then osols
                      else [ns]
                  -- FIXME: here we simply prepend to lists with more
                  -- than one element; we should instead abort, since
                  -- this is not a valid usage of this function
                  xs -> ns:xs
        nsuc = cntok + 1
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
    -- elements of the tuple
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }

-- | Given a solution, generates a reasonable description for it
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolutions as
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in if null sols
     then "No valid allocation solutions, failure reasons: " ++
          (if null fcnt
           then "unknown reasons"
           else freasons)
     else let (_, _, nodes, cv) = head sols
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
             (intercalate "/" . map Node.name $ nodes)

-- | Annotates a solution with the appropriate string
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = describeSolution as : asLog as }

-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> Int               -- ^ Required number of nodes
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc nl _ inst 2 =
    let all_nodes = getOnline nl
        all_pairs = liftM2 (,) all_nodes all_nodes
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
        sols = foldl' (\cstate (p, s) ->
                           concatAllocs cstate $ allocateOnPair nl inst p s
                      ) emptySolution ok_pairs

    in if null ok_pairs -- means we have just one node
       then fail "Not enough online nodes"
       else return $ annotateSolution sols

tryAlloc nl _ inst 1 =
    let all_nodes = getOnline nl
        sols = foldl' (\cstate ->
                           concatAllocs cstate . allocateOnSingle nl inst
                      ) emptySolution all_nodes
    in if null all_nodes
       then fail "No online nodes"
       else return $ annotateSolution sols

tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
                             \destinations required (" ++ show reqn ++
                                               "), only two supported"

-- | Given a group/result, describe it as a nice (list of) messages
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
  case result of
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
    Bad message -> [printf "Group %s: error %s" gname message]
  where grp = Container.find groupId gl
        gname = Group.name grp
        pol = apolToString (Group.allocPolicy grp)

-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
filterMGResults gl =
  filter ((/= AllocUnallocable) . Group.allocPolicy .
             flip Container.find gl . fst) .
  filter (not . null . asSolutions . snd) .
  map (\(y, Ok x) -> (y, x)) .
  filter (isOk . snd)

-- | Sort multigroup results based on policy and score
sortMGResults :: Group.List
             -> [(Gdx, AllocSolution)]
             -> [(Gdx, AllocSolution)]
sortMGResults gl sols =
    let extractScore = \(_, _, _, x) -> x
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
                               (extractScore . head . asSolutions) sol)
    in sortBy (comparing solScore) sols

-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt =
  let groups = splitCluster mgnl mgil
      -- TODO: currently we consider all groups preferred
      sols = map (\(gid, (nl, il)) ->
                   (gid, tryAlloc nl il inst cnt)) groups::
        [(Gdx, Result AllocSolution)]
      all_msgs = concatMap (solutionDescription mggl) sols
      goodSols = filterMGResults mggl sols
      sortedSols = sortMGResults mggl goodSols
  in if null sortedSols
     then Bad $ intercalate ", " all_msgs
     else let (final_group, final_sol) = head sortedSols
              final_name = Group.name $ Container.find final_group mggl
              selmsg = "Selected group: " ++  final_name
          in Ok $ final_sol { asLog = selmsg:all_msgs }

-- | Try to relocate an instance on the cluster.
tryReloc :: (Monad m) =>
            Node.List       -- ^ The node list
         -> Instance.List   -- ^ The instance list
         -> Idx             -- ^ The index of the instance to move
         -> Int             -- ^ The number of nodes required
         -> [Ndx]           -- ^ Nodes which should not be used
         -> m AllocSolution -- ^ Solution list
tryReloc nl il xid 1 ex_idx =
    let all_nodes = getOnline nl
        inst = Container.find xid il
        ex_idx' = Instance.pNode inst:ex_idx
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
        valid_idxes = map Node.idx valid_nodes
        sols1 = foldl' (\cstate x ->
                            let em = do
                                  (mnl, i, _, _) <-
                                      applyMove nl inst (ReplaceSecondary x)
                                  return (mnl, i, [Container.find x mnl],
                                          compCV mnl)
                            in concatAllocs cstate em
                       ) emptySolution valid_idxes
    in return sols1

tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
                                \destinations required (" ++ show reqn ++
                                                  "), only one supported"

-- | Try to evacuate a list of nodes.
tryEvac :: (Monad m) =>
            Node.List       -- ^ The node list
         -> Instance.List   -- ^ The instance list
         -> [Ndx]           -- ^ Nodes to be evacuated
         -> m AllocSolution -- ^ Solution list
tryEvac nl il ex_ndx =
    let ex_nodes = map (`Container.find` nl) ex_ndx
        all_insts = nub . concatMap Node.sList $ ex_nodes
    in do
      (_, sol) <- foldM (\(nl', old_as) idx -> do
                            -- FIXME: hardcoded one node here
                            -- (fm, cs, aes)
                            new_as <- tryReloc nl' il idx 1 ex_ndx
                            case asSolutions new_as of
                              csol@(nl'', _, _, _):_ ->
                                -- an individual relocation succeeded,
                                -- we kind of compose the data from
                                -- the two solutions
                                return (nl'',
                                        new_as { asSolutions =
                                                    csol:asSolutions old_as })
                              -- this relocation failed, so we fail
                              -- the entire evac
                              _ -> fail $ "Can't evacuate instance " ++
                                   Instance.name (Container.find idx il) ++
                                   ": " ++ describeSolution new_as
                        ) (nl, emptySolution) all_insts
      return $ annotateSolution sol

-- | Recursively place instances on the cluster until we're out of space
iterateAlloc :: Node.List
             -> Instance.List
             -> Instance.Instance
             -> Int
             -> [Instance.Instance]
             -> Result (FailStats, Node.List, Instance.List,
                        [Instance.Instance])
iterateAlloc nl il newinst nreq ixes =
      let depth = length ixes
          newname = printf "new-%d" depth::String
          newidx = length (Container.elems il) + depth
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      in case tryAlloc nl il newi2 nreq of
           Bad s -> Bad s
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
               case sols3 of
                 [] -> Ok (collapseFailures errs, nl, il, ixes)
                 (xnl, xi, _, _):[] ->
                     iterateAlloc xnl (Container.add newidx xi il)
                                  newinst nreq $! (xi:ixes)
                 _ -> Bad "Internal error: multiple solutions for single\
                          \ allocation"

-- | The core of the tiered allocation mode
tieredAlloc :: Node.List
            -> Instance.List
            -> Instance.Instance
            -> Int
            -> [Instance.Instance]
            -> Result (FailStats, Node.List, Instance.List,
                       [Instance.Instance])
tieredAlloc nl il newinst nreq ixes =
    case iterateAlloc nl il newinst nreq ixes of
      Bad s -> Bad s
      Ok (errs, nl', il', ixes') ->
          case Instance.shrinkByType newinst . fst . last $
               sortBy (comparing snd) errs of
            Bad _ -> Ok (errs, nl', il', ixes')
            Ok newinst' ->
                tieredAlloc nl' il' newinst' nreq ixes'

-- | Compute the tiered spec string description from a list of
-- allocated instances.
tieredSpecMap :: [Instance.Instance]
              -> [String]
tieredSpecMap trl_ixes =
    let fin_trl_ixes = reverse trl_ixes
        ix_byspec = groupBy ((==) `on` Instance.specOf) fin_trl_ixes
        spec_map = map (\ixs -> (Instance.specOf $ head ixs, length ixs))
                   ix_byspec
    in  map (\(spec, cnt) -> printf "%d,%d,%d=%d" (rspecMem spec)
                             (rspecDsk spec) (rspecCpu spec) cnt) spec_map
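
-- Illustrative output only (hypothetical specs): for three instances of
-- 4096 MiB RAM / 102400 MiB disk / 2 CPUs followed by five instances of
-- 1024/51200/1, the result would be the two entries
-- @["4096,102400,2=3", "1024,51200,1=5"]@, i.e. mem,disk,cpu=count per
-- tier, in allocation order.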

-- * Formatting functions

-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; the moves string
                -- contains either @/f/@ for failover or @/r:name/@ for
                -- replace secondary, while the command list holds
                -- gnt-instance commands (without that prefix),
                -- e.g. \"@failover instance1@\"
computeMoves i inam mv c d =
    case mv of
      Failover -> ("f", [mig])
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
    where morf = if Instance.running i then "migrate" else "failover"
          mig = printf "%s -f %s" morf inam::String
          rep n = printf "replace-disks -n %s %s" n inam
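
-- For an instance named @instance1@ being moved with @ReplaceSecondary@
-- to a node called @node3@ (names here are purely illustrative), this
-- yields:
--
-- > computeMoves inst "instance1" (ReplaceSecondary 3) "node2" "node3"
-- >   == ("r:node3", ["replace-disks -n node3 instance1"])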

-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
    let
        pmlen = (2*nmlen + 1)
        (i, p, s, mv, c) = plc
        inst = Container.find i il
        inam = Instance.alias inst
        npri = Node.alias $ Container.find p nl
        nsec = Node.alias $ Container.find s nl
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
        (moves, cmds) =  computeMoves inst inam mv npri nsec
        ostr = printf "%s:%s" opri osec::String
        nstr = printf "%s:%s" npri nsec::String
    in
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
       pos imlen inam pmlen ostr
       pmlen nstr c moves,
       cmds)

-- | Return the instance and involved nodes in an instance move.
involvedNodes :: Instance.List -> Placement -> [Ndx]
involvedNodes il plc =
    let (i, np, ns, _, _) = plc
        inst = Container.find i il
        op = Instance.pNode inst
        os = Instance.sNode inst
    in nub [np, ns, op, os]

-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
    | otherwise = ([n]:cjs, ndx)

-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl mergeJobs ([], [])
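
-- A small illustration (hypothetical jobs; a 'MoveJob' carries the node
-- indices it touches as its first component): given job A touching
-- nodes [1,2], job B touching [3,4] and job C touching [2,5], folding
-- with 'mergeJobs' gives
--
-- > splitJobs [jobA, jobB, jobC] == [[jobC], [jobB, jobA]]
--
-- i.e. A and B end up in one jobset and can run in parallel, while C
-- conflicts with A on node 2 and starts a new jobset (note the reversed
-- order).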

-- | Given a single job, prefix its commands with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
    let out =
            printf "  echo job %d/%d" jsn sn:
            printf "  check":
            map ("  gnt-instance " ++) cmds
    in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out

-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds =
    unlines .
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
                             (zip [1..] js)) .
    zip [1..]
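
-- The generated script fragment looks roughly like this (instance and
-- command names are illustrative), with each jobset preceded by an
-- empty line:
--
-- > echo jobset 1, 1 jobs
-- >   echo job 1/1
-- >   check
-- >   gnt-instance migrate -f instance1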

-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
    let fields = case fs of
          [] -> Node.defaultFields
          "+":rest -> Node.defaultFields ++ rest
          _ -> fs
        snl = sortBy (comparing Node.idx) (Container.elems nl)
        (header, isnum) = unzip $ map Node.showHeader fields
    in unlines . map ((:) ' ' .  intercalate " ") $
       formatTable (header:map (Node.list fields) snl) isnum

-- | Print the instance list.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
        helper inst = [ if Instance.running inst then "R" else " "
                      , Instance.name inst
                      , Container.nameOf nl (Instance.pNode inst)
                      , let sdx = Instance.sNode inst
                        in if sdx == Node.noSecondary
                           then  ""
                           else Container.nameOf nl sdx
                      , printf "%3d" $ Instance.vcpus inst
                      , printf "%5d" $ Instance.mem inst
                      , printf "%5d" $ Instance.dsk inst `div` 1024
                      , printf "%5.3f" lC
                      , printf "%5.3f" lM
                      , printf "%5.3f" lD
                      , printf "%5.3f" lN
                      ]
            where DynUtil lC lM lD lN = Instance.util inst
        header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
                 , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
        isnum = False:False:False:False:repeat True
    in unlines . map ((:) ' ' . intercalate " ") $
       formatTable (header:map helper sil) isnum

-- | Shows statistics for a given node list.
printStats :: Node.List -> String
printStats nl =
    let dcvs = compDetailedCV nl
        (weights, names) = unzip detailedCVInfo
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
        formatted = map (\(w, header, val) ->
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
    in intercalate ", " formatted
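
-- The resulting string has one @name=value(xweight)@ entry per metric,
-- e.g. (values invented for illustration):
--
-- > "free_mem_cv=0.00123456(x1.00), free_disk_cv=0.00034567(x1.00), ..."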

-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List -> Instance.List
          -> Idx -> IMove -> [OpCodes.OpCode]
iMoveToJob nl il idx move =
    let inst = Container.find idx il
        iname = Instance.name inst
        lookNode  = Just . Container.nameOf nl
        opF = if Instance.running inst
              then OpCodes.OpMigrateInstance iname True False
              else OpCodes.OpFailoverInstance iname False
        opR n = OpCodes.OpReplaceDisks iname (lookNode n)
                OpCodes.ReplaceNewSecondary [] Nothing
    in case move of
         Failover -> [ opF ]
         ReplacePrimary np -> [ opF, opR np, opF ]
         ReplaceSecondary ns -> [ opR ns ]
         ReplaceAndFailover np -> [ opR np, opF ]
         FailoverAndReplace ns -> [ opF, opR ns ]

-- * Node group functions

-- | Computes the group of an instance
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      snode = if sidx == Node.noSecondary
              then pnode
              else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
     then fail ("Instance placed across two node groups, primary " ++
                show pgroup ++ ", secondary " ++ show sgroup)
     else return pgroup

-- | Compute the list of badly allocated instances (split across node
-- groups)
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  filter (not . isOk . instanceGroup nl) (Container.elems il)

-- | Splits a cluster into the component node groups
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let ngroups = Node.computeGroups (Container.elems nl)
  in map (\(guuid, nodes) ->
           let nidxs = map Node.idx nodes
               nodes' = zip nidxs nodes
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
           in (guuid, (Container.fromAssocList nodes', instances))) ngroups