Statistics
| Branch: | Tag: | Revision:

root / Ganeti / HTools / Cluster.hs @ 859fc11d

History | View | Annotate | Download (37.4 kB)

1
{-| Implementation of cluster-wide logic.
2

    
3
This module holds all pure cluster-logic; I\/O related functionality
4
goes into the "Main" module for the individual binaries.
5

    
6
-}
7

    
8
{-
9

    
10
Copyright (C) 2009, 2010 Google Inc.
11

    
12
This program is free software; you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation; either version 2 of the License, or
15
(at your option) any later version.
16

    
17
This program is distributed in the hope that it will be useful, but
18
WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
General Public License for more details.
21

    
22
You should have received a copy of the GNU General Public License
23
along with this program; if not, write to the Free Software
24
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25
02110-1301, USA.
26

    
27
-}
28

    
29
module Ganeti.HTools.Cluster
30
    (
31
     -- * Types
32
      AllocSolution(..)
33
    , Table(..)
34
    , CStats(..)
35
    , AllocStats
36
    -- * Generic functions
37
    , totalResources
38
    , computeAllocationDelta
39
    -- * First phase functions
40
    , computeBadItems
41
    -- * Second phase functions
42
    , printSolutionLine
43
    , formatCmds
44
    , involvedNodes
45
    , splitJobs
46
    -- * Display functions
47
    , printNodes
48
    , printInsts
49
    -- * Balacing functions
50
    , checkMove
51
    , doNextBalance
52
    , tryBalance
53
    , compCV
54
    , printStats
55
    , iMoveToJob
56
    -- * IAllocator functions
57
    , tryAlloc
58
    , tryReloc
59
    , tryEvac
60
    , collapseFailures
61
    -- * Allocation functions
62
    , iterateAlloc
63
    , tieredAlloc
64
    , instanceGroup
65
    , findSplitInstances
66
    , splitCluster
67
    ) where
68

    
69
import Data.List
70
import Data.Ord (comparing)
71
import Text.Printf (printf)
72
import Control.Monad
73

    
74
import qualified Ganeti.HTools.Container as Container
75
import qualified Ganeti.HTools.Instance as Instance
76
import qualified Ganeti.HTools.Node as Node
77
import Ganeti.HTools.Types
78
import Ganeti.HTools.Utils
79
import qualified Ganeti.OpCodes as OpCodes
80

    
81
-- * Types
82

    
83
-- | Allocation\/relocation solution.
84
data AllocSolution = AllocSolution
85
  { asFailures  :: [FailMode]          -- ^ Failure counts
86
  , asAllocs    :: Int                 -- ^ Good allocation count
87
  , asSolutions :: [Node.AllocElement] -- ^ The actual result, length
88
                                       -- of the list depends on the
89
                                       -- allocation/relocation mode
90
  , asLog       :: [String]            -- ^ A list of informational messages
91
  }
92

    
93
-- | The empty solution we start with when computing allocations
94
emptySolution :: AllocSolution
95
emptySolution = AllocSolution { asFailures = [], asAllocs = 0
96
                              , asSolutions = [], asLog = [] }
97

    
98
-- | The complete state for the balancing solution
99
data Table = Table Node.List Instance.List Score [Placement]
100
             deriving (Show)
101

    
102
data CStats = CStats { csFmem :: Int    -- ^ Cluster free mem
103
                     , csFdsk :: Int    -- ^ Cluster free disk
104
                     , csAmem :: Int    -- ^ Cluster allocatable mem
105
                     , csAdsk :: Int    -- ^ Cluster allocatable disk
106
                     , csAcpu :: Int    -- ^ Cluster allocatable cpus
107
                     , csMmem :: Int    -- ^ Max node allocatable mem
108
                     , csMdsk :: Int    -- ^ Max node allocatable disk
109
                     , csMcpu :: Int    -- ^ Max node allocatable cpu
110
                     , csImem :: Int    -- ^ Instance used mem
111
                     , csIdsk :: Int    -- ^ Instance used disk
112
                     , csIcpu :: Int    -- ^ Instance used cpu
113
                     , csTmem :: Double -- ^ Cluster total mem
114
                     , csTdsk :: Double -- ^ Cluster total disk
115
                     , csTcpu :: Double -- ^ Cluster total cpus
116
                     , csVcpu :: Int    -- ^ Cluster virtual cpus (if
117
                                        -- node pCpu has been set,
118
                                        -- otherwise -1)
119
                     , csXmem :: Int    -- ^ Unnacounted for mem
120
                     , csNmem :: Int    -- ^ Node own memory
121
                     , csScore :: Score -- ^ The cluster score
122
                     , csNinst :: Int   -- ^ The total number of instances
123
                     }
124
            deriving (Show)
125

    
126
-- | Currently used, possibly to allocate, unallocable
127
type AllocStats = (RSpec, RSpec, RSpec)
128

    
129
-- * Utility functions
130

    
131
-- | Verifies the N+1 status and return the affected nodes.
132
verifyN1 :: [Node.Node] -> [Node.Node]
133
verifyN1 = filter Node.failN1
134

    
135
{-| Computes the pair of bad nodes and instances.
136

    
137
The bad node list is computed via a simple 'verifyN1' check, and the
138
bad instance list is the list of primary and secondary instances of
139
those nodes.
140

    
141
-}
142
computeBadItems :: Node.List -> Instance.List ->
143
                   ([Node.Node], [Instance.Instance])
144
computeBadItems nl il =
145
  let bad_nodes = verifyN1 $ getOnline nl
146
      bad_instances = map (`Container.find` il) .
147
                      sort . nub $
148
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
149
  in
150
    (bad_nodes, bad_instances)
151

    
152
-- | Zero-initializer for the CStats type
153
emptyCStats :: CStats
154
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
155

    
156
-- | Update stats with data from a new node
157
updateCStats :: CStats -> Node.Node -> CStats
158
updateCStats cs node =
159
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
160
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
161
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
162
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
163
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
164
                 csVcpu = x_vcpu,
165
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
166
               }
167
            = cs
168
        inc_amem = Node.fMem node - Node.rMem node
169
        inc_amem' = if inc_amem > 0 then inc_amem else 0
170
        inc_adsk = Node.availDisk node
171
        inc_imem = truncate (Node.tMem node) - Node.nMem node
172
                   - Node.xMem node - Node.fMem node
173
        inc_icpu = Node.uCpu node
174
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
175
        inc_vcpu = Node.hiCpu node
176

    
177
    in cs { csFmem = x_fmem + Node.fMem node
178
          , csFdsk = x_fdsk + Node.fDsk node
179
          , csAmem = x_amem + inc_amem'
180
          , csAdsk = x_adsk + inc_adsk
181
          , csAcpu = x_acpu
182
          , csMmem = max x_mmem inc_amem'
183
          , csMdsk = max x_mdsk inc_adsk
184
          , csMcpu = x_mcpu
185
          , csImem = x_imem + inc_imem
186
          , csIdsk = x_idsk + inc_idsk
187
          , csIcpu = x_icpu + inc_icpu
188
          , csTmem = x_tmem + Node.tMem node
189
          , csTdsk = x_tdsk + Node.tDsk node
190
          , csTcpu = x_tcpu + Node.tCpu node
191
          , csVcpu = x_vcpu + inc_vcpu
192
          , csXmem = x_xmem + Node.xMem node
193
          , csNmem = x_nmem + Node.nMem node
194
          , csNinst = x_ninst + length (Node.pList node)
195
          }
196

    
197
-- | Compute the total free disk and memory in the cluster.
198
totalResources :: Node.List -> CStats
199
totalResources nl =
200
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
201
    in cs { csScore = compCV nl }
202

    
203
-- | Compute the delta between two cluster state.
204
--
205
-- This is used when doing allocations, to understand better the
206
-- available cluster resources. The return value is a triple of the
207
-- current used values, the delta that was still allocated, and what
208
-- was left unallocated.
209
computeAllocationDelta :: CStats -> CStats -> AllocStats
210
computeAllocationDelta cini cfin =
211
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
212
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
213
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
214
        rini = RSpec i_icpu i_imem i_idsk
215
        rfin = RSpec (f_icpu - i_icpu) (f_imem - i_imem) (f_idsk - i_idsk)
216
        un_cpu = v_cpu - f_icpu
217
        runa = RSpec un_cpu (truncate t_mem - f_imem) (truncate t_dsk - f_idsk)
218
    in (rini, rfin, runa)
219

    
220
-- | The names and weights of the individual elements in the CV list
221
detailedCVInfo :: [(Double, String)]
222
detailedCVInfo = [ (1,  "free_mem_cv")
223
                 , (1,  "free_disk_cv")
224
                 , (1,  "n1_cnt")
225
                 , (1,  "reserved_mem_cv")
226
                 , (4,  "offline_all_cnt")
227
                 , (16, "offline_pri_cnt")
228
                 , (1,  "vcpu_ratio_cv")
229
                 , (1,  "cpu_load_cv")
230
                 , (1,  "mem_load_cv")
231
                 , (1,  "disk_load_cv")
232
                 , (1,  "net_load_cv")
233
                 , (2,  "pri_tags_score")
234
                 ]
235

    
236
detailedCVWeights :: [Double]
237
detailedCVWeights = map fst detailedCVInfo
238

    
239
-- | Compute the mem and disk covariance.
240
compDetailedCV :: Node.List -> [Double]
241
compDetailedCV nl =
242
    let
243
        all_nodes = Container.elems nl
244
        (offline, nodes) = partition Node.offline all_nodes
245
        mem_l = map Node.pMem nodes
246
        dsk_l = map Node.pDsk nodes
247
        -- metric: memory covariance
248
        mem_cv = varianceCoeff mem_l
249
        -- metric: disk covariance
250
        dsk_cv = varianceCoeff dsk_l
251
        -- metric: count of instances living on N1 failing nodes
252
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
253
                                                   length (Node.pList n)) .
254
                   filter Node.failN1 $ nodes :: Double
255
        res_l = map Node.pRem nodes
256
        -- metric: reserved memory covariance
257
        res_cv = varianceCoeff res_l
258
        -- offline instances metrics
259
        offline_ipri = sum . map (length . Node.pList) $ offline
260
        offline_isec = sum . map (length . Node.sList) $ offline
261
        -- metric: count of instances on offline nodes
262
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
263
        -- metric: count of primary instances on offline nodes (this
264
        -- helps with evacuation/failover of primary instances on
265
        -- 2-node clusters with one node offline)
266
        off_pri_score = fromIntegral offline_ipri::Double
267
        cpu_l = map Node.pCpu nodes
268
        -- metric: covariance of vcpu/pcpu ratio
269
        cpu_cv = varianceCoeff cpu_l
270
        -- metrics: covariance of cpu, memory, disk and network load
271
        (c_load, m_load, d_load, n_load) = unzip4 $
272
            map (\n ->
273
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
274
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
275
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
276
                ) nodes
277
        -- metric: conflicting instance count
278
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
279
        pri_tags_score = fromIntegral pri_tags_inst::Double
280
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
281
       , varianceCoeff c_load, varianceCoeff m_load
282
       , varianceCoeff d_load, varianceCoeff n_load
283
       , pri_tags_score ]
284

    
285
-- | Compute the /total/ variance.
286
compCV :: Node.List -> Double
287
compCV = sum . zipWith (*) detailedCVWeights . compDetailedCV
288

    
289
-- | Compute online nodes from a Node.List
290
getOnline :: Node.List -> [Node.Node]
291
getOnline = filter (not . Node.offline) . Container.elems
292

    
293
-- * hbal functions
294

    
295
-- | Compute best table. Note that the ordering of the arguments is important.
296
compareTables :: Table -> Table -> Table
297
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
298
    if a_cv > b_cv then b else a
299

    
300
-- | Applies an instance move to a given node list and instance.
301
applyMove :: Node.List -> Instance.Instance
302
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
303
-- Failover (f)
304
applyMove nl inst Failover =
305
    let old_pdx = Instance.pNode inst
306
        old_sdx = Instance.sNode inst
307
        old_p = Container.find old_pdx nl
308
        old_s = Container.find old_sdx nl
309
        int_p = Node.removePri old_p inst
310
        int_s = Node.removeSec old_s inst
311
        force_p = Node.offline old_p
312
        new_nl = do -- Maybe monad
313
          new_p <- Node.addPriEx force_p int_s inst
314
          new_s <- Node.addSec int_p inst old_sdx
315
          let new_inst = Instance.setBoth inst old_sdx old_pdx
316
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
317
                  new_inst, old_sdx, old_pdx)
318
    in new_nl
319

    
320
-- Replace the primary (f:, r:np, f)
321
applyMove nl inst (ReplacePrimary new_pdx) =
322
    let old_pdx = Instance.pNode inst
323
        old_sdx = Instance.sNode inst
324
        old_p = Container.find old_pdx nl
325
        old_s = Container.find old_sdx nl
326
        tgt_n = Container.find new_pdx nl
327
        int_p = Node.removePri old_p inst
328
        int_s = Node.removeSec old_s inst
329
        force_p = Node.offline old_p
330
        new_nl = do -- Maybe monad
331
          -- check that the current secondary can host the instance
332
          -- during the migration
333
          tmp_s <- Node.addPriEx force_p int_s inst
334
          let tmp_s' = Node.removePri tmp_s inst
335
          new_p <- Node.addPriEx force_p tgt_n inst
336
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
337
          let new_inst = Instance.setPri inst new_pdx
338
          return (Container.add new_pdx new_p $
339
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
340
                  new_inst, new_pdx, old_sdx)
341
    in new_nl
342

    
343
-- Replace the secondary (r:ns)
344
applyMove nl inst (ReplaceSecondary new_sdx) =
345
    let old_pdx = Instance.pNode inst
346
        old_sdx = Instance.sNode inst
347
        old_s = Container.find old_sdx nl
348
        tgt_n = Container.find new_sdx nl
349
        int_s = Node.removeSec old_s inst
350
        force_s = Node.offline old_s
351
        new_inst = Instance.setSec inst new_sdx
352
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
353
                 \new_s -> return (Container.addTwo new_sdx
354
                                   new_s old_sdx int_s nl,
355
                                   new_inst, old_pdx, new_sdx)
356
    in new_nl
357

    
358
-- Replace the secondary and failover (r:np, f)
359
applyMove nl inst (ReplaceAndFailover new_pdx) =
360
    let old_pdx = Instance.pNode inst
361
        old_sdx = Instance.sNode inst
362
        old_p = Container.find old_pdx nl
363
        old_s = Container.find old_sdx nl
364
        tgt_n = Container.find new_pdx nl
365
        int_p = Node.removePri old_p inst
366
        int_s = Node.removeSec old_s inst
367
        force_s = Node.offline old_s
368
        new_nl = do -- Maybe monad
369
          new_p <- Node.addPri tgt_n inst
370
          new_s <- Node.addSecEx force_s int_p inst new_pdx
371
          let new_inst = Instance.setBoth inst new_pdx old_pdx
372
          return (Container.add new_pdx new_p $
373
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
374
                  new_inst, new_pdx, old_pdx)
375
    in new_nl
376

    
377
-- Failver and replace the secondary (f, r:ns)
378
applyMove nl inst (FailoverAndReplace new_sdx) =
379
    let old_pdx = Instance.pNode inst
380
        old_sdx = Instance.sNode inst
381
        old_p = Container.find old_pdx nl
382
        old_s = Container.find old_sdx nl
383
        tgt_n = Container.find new_sdx nl
384
        int_p = Node.removePri old_p inst
385
        int_s = Node.removeSec old_s inst
386
        force_p = Node.offline old_p
387
        new_nl = do -- Maybe monad
388
          new_p <- Node.addPriEx force_p int_s inst
389
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
390
          let new_inst = Instance.setBoth inst old_sdx new_sdx
391
          return (Container.add new_sdx new_s $
392
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
393
                  new_inst, old_sdx, new_sdx)
394
    in new_nl
395

    
396
-- | Tries to allocate an instance on one given node.
397
allocateOnSingle :: Node.List -> Instance.Instance -> Node.Node
398
                 -> OpResult Node.AllocElement
399
allocateOnSingle nl inst p =
400
    let new_pdx = Node.idx p
401
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
402
    in  Node.addPri p inst >>= \new_p -> do
403
      let new_nl = Container.add new_pdx new_p nl
404
          new_score = compCV nl
405
      return (new_nl, new_inst, [new_p], new_score)
406

    
407
-- | Tries to allocate an instance on a given pair of nodes.
408
allocateOnPair :: Node.List -> Instance.Instance -> Node.Node -> Node.Node
409
               -> OpResult Node.AllocElement
410
allocateOnPair nl inst tgt_p tgt_s =
411
    let new_pdx = Node.idx tgt_p
412
        new_sdx = Node.idx tgt_s
413
    in do
414
      new_p <- Node.addPri tgt_p inst
415
      new_s <- Node.addSec tgt_s inst new_pdx
416
      let new_inst = Instance.setBoth inst new_pdx new_sdx
417
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
418
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
419

    
420
-- | Tries to perform an instance move and returns the best table
421
-- between the original one and the new one.
422
checkSingleStep :: Table -- ^ The original table
423
                -> Instance.Instance -- ^ The instance to move
424
                -> Table -- ^ The current best table
425
                -> IMove -- ^ The move to apply
426
                -> Table -- ^ The final best table
427
checkSingleStep ini_tbl target cur_tbl move =
428
    let
429
        Table ini_nl ini_il _ ini_plc = ini_tbl
430
        tmp_resu = applyMove ini_nl target move
431
    in
432
      case tmp_resu of
433
        OpFail _ -> cur_tbl
434
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
435
            let tgt_idx = Instance.idx target
436
                upd_cvar = compCV upd_nl
437
                upd_il = Container.add tgt_idx new_inst ini_il
438
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
439
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
440
            in
441
              compareTables cur_tbl upd_tbl
442

    
443
-- | Given the status of the current secondary as a valid new node and
444
-- the current candidate target node, generate the possible moves for
445
-- a instance.
446
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
447
              -> Ndx       -- ^ Target node candidate
448
              -> [IMove]   -- ^ List of valid result moves
449
possibleMoves True tdx =
450
    [ReplaceSecondary tdx,
451
     ReplaceAndFailover tdx,
452
     ReplacePrimary tdx,
453
     FailoverAndReplace tdx]
454

    
455
possibleMoves False tdx =
456
    [ReplaceSecondary tdx,
457
     ReplaceAndFailover tdx]
458

    
459
-- | Compute the best move for a given instance.
460
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
461
                  -> Bool              -- ^ Whether disk moves are allowed
462
                  -> Table             -- ^ Original table
463
                  -> Instance.Instance -- ^ Instance to move
464
                  -> Table             -- ^ Best new table for this instance
465
checkInstanceMove nodes_idx disk_moves ini_tbl target =
466
    let
467
        opdx = Instance.pNode target
468
        osdx = Instance.sNode target
469
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
470
        use_secondary = elem osdx nodes_idx
471
        aft_failover = if use_secondary -- if allowed to failover
472
                       then checkSingleStep ini_tbl target ini_tbl Failover
473
                       else ini_tbl
474
        all_moves = if disk_moves
475
                    then concatMap (possibleMoves use_secondary) nodes
476
                    else []
477
    in
478
      -- iterate over the possible nodes for this instance
479
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
480

    
481
-- | Compute the best next move.
482
checkMove :: [Ndx]               -- ^ Allowed target node indices
483
          -> Bool                -- ^ Whether disk moves are allowed
484
          -> Table               -- ^ The current solution
485
          -> [Instance.Instance] -- ^ List of instances still to move
486
          -> Table               -- ^ The new solution
487
checkMove nodes_idx disk_moves ini_tbl victims =
488
    let Table _ _ _ ini_plc = ini_tbl
489
        -- iterate over all instances, computing the best move
490
        best_tbl =
491
            foldl'
492
            (\ step_tbl em ->
493
                 compareTables step_tbl $
494
                 checkInstanceMove nodes_idx disk_moves ini_tbl em)
495
            ini_tbl victims
496
        Table _ _ _ best_plc = best_tbl
497
    in if length best_plc == length ini_plc
498
       then ini_tbl -- no advancement
499
       else best_tbl
500

    
501
-- | Check if we are allowed to go deeper in the balancing
502
doNextBalance :: Table     -- ^ The starting table
503
              -> Int       -- ^ Remaining length
504
              -> Score     -- ^ Score at which to stop
505
              -> Bool      -- ^ The resulting table and commands
506
doNextBalance ini_tbl max_rounds min_score =
507
    let Table _ _ ini_cv ini_plc = ini_tbl
508
        ini_plc_len = length ini_plc
509
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
510

    
511
-- | Run a balance move
512
tryBalance :: Table       -- ^ The starting table
513
           -> Bool        -- ^ Allow disk moves
514
           -> Bool        -- ^ Only evacuate moves
515
           -> Score       -- ^ Min gain threshold
516
           -> Score       -- ^ Min gain
517
           -> Maybe Table -- ^ The resulting table and commands
518
tryBalance ini_tbl disk_moves evac_mode mg_limit min_gain =
519
    let Table ini_nl ini_il ini_cv _ = ini_tbl
520
        all_inst = Container.elems ini_il
521
        all_inst' = if evac_mode
522
                    then let bad_nodes = map Node.idx . filter Node.offline $
523
                                         Container.elems ini_nl
524
                         in filter (\e -> Instance.sNode e `elem` bad_nodes ||
525
                                          Instance.pNode e `elem` bad_nodes)
526
                            all_inst
527
                    else all_inst
528
        reloc_inst = filter Instance.movable all_inst'
529
        node_idx = map Node.idx . filter (not . Node.offline) $
530
                   Container.elems ini_nl
531
        fin_tbl = checkMove node_idx disk_moves ini_tbl reloc_inst
532
        (Table _ _ fin_cv _) = fin_tbl
533
    in
534
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
535
      then Just fin_tbl -- this round made success, return the new table
536
      else Nothing
537

    
538
-- * Allocation functions
539

    
540
-- | Build failure stats out of a list of failures
541
collapseFailures :: [FailMode] -> FailStats
542
collapseFailures flst =
543
    map (\k -> (k, length $ filter (k ==) flst)) [minBound..maxBound]
544

    
545
-- | Update current Allocation solution and failure stats with new
546
-- elements
547
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
548
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
549

    
550
concatAllocs as (OpGood ns@(_, _, _, nscore)) =
551
    let -- Choose the old or new solution, based on the cluster score
552
        cntok = asAllocs as
553
        osols = asSolutions as
554
        nsols = case osols of
555
                  [] -> [ns]
556
                  (_, _, _, oscore):[] ->
557
                      if oscore < nscore
558
                      then osols
559
                      else [ns]
560
                  -- FIXME: here we simply concat to lists with more
561
                  -- than one element; we should instead abort, since
562
                  -- this is not a valid usage of this function
563
                  xs -> ns:xs
564
        nsuc = cntok + 1
565
    -- Note: we force evaluation of nsols here in order to keep the
566
    -- memory profile low - we know that we will need nsols for sure
567
    -- in the next cycle, so we force evaluation of nsols, since the
568
    -- foldl' in the caller will only evaluate the tuple, but not the
569
    -- elements of the tuple
570
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolutions = nsols }
571

    
572
-- | Given a solution, generates a reasonable description for it
573
describeSolution :: AllocSolution -> String
574
describeSolution as =
575
  let fcnt = asFailures as
576
      sols = asSolutions as
577
      freasons =
578
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
579
        filter ((> 0) . snd) . collapseFailures $ fcnt
580
  in if null sols
581
     then "No valid allocation solutions, failure reasons: " ++
582
          (if null fcnt
583
           then "unknown reasons"
584
           else freasons)
585
     else let (_, _, nodes, cv) = head sols
586
          in printf ("score: %.8f, successes %d, failures %d (%s)" ++
587
                     " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
588
             (intercalate "/" . map Node.name $ nodes)
589

    
590
-- | Annotates a solution with the appropriate string
591
annotateSolution :: AllocSolution -> AllocSolution
592
annotateSolution as = as { asLog = describeSolution as : asLog as }
593

    
594
-- | Try to allocate an instance on the cluster.
595
tryAlloc :: (Monad m) =>
596
            Node.List         -- ^ The node list
597
         -> Instance.List     -- ^ The instance list
598
         -> Instance.Instance -- ^ The instance to allocate
599
         -> Int               -- ^ Required number of nodes
600
         -> m AllocSolution   -- ^ Possible solution list
601
tryAlloc nl _ inst 2 =
602
    let all_nodes = getOnline nl
603
        all_pairs = liftM2 (,) all_nodes all_nodes
604
        ok_pairs = filter (\(x, y) -> Node.idx x /= Node.idx y) all_pairs
605
        sols = foldl' (\cstate (p, s) ->
606
                           concatAllocs cstate $ allocateOnPair nl inst p s
607
                      ) emptySolution ok_pairs
608

    
609
    in return $ annotateSolution sols
610

    
611
tryAlloc nl _ inst 1 =
612
    let all_nodes = getOnline nl
613
        sols = foldl' (\cstate ->
614
                           concatAllocs cstate . allocateOnSingle nl inst
615
                      ) emptySolution all_nodes
616
    in return $ annotateSolution sols
617

    
618
tryAlloc _ _ _ reqn = fail $ "Unsupported number of allocation \
619
                             \destinations required (" ++ show reqn ++
620
                                               "), only two supported"
621

    
622
-- | Try to allocate an instance on the cluster.
623
tryReloc :: (Monad m) =>
624
            Node.List       -- ^ The node list
625
         -> Instance.List   -- ^ The instance list
626
         -> Idx             -- ^ The index of the instance to move
627
         -> Int             -- ^ The number of nodes required
628
         -> [Ndx]           -- ^ Nodes which should not be used
629
         -> m AllocSolution -- ^ Solution list
630
tryReloc nl il xid 1 ex_idx =
631
    let all_nodes = getOnline nl
632
        inst = Container.find xid il
633
        ex_idx' = Instance.pNode inst:ex_idx
634
        valid_nodes = filter (not . flip elem ex_idx' . Node.idx) all_nodes
635
        valid_idxes = map Node.idx valid_nodes
636
        sols1 = foldl' (\cstate x ->
637
                            let em = do
638
                                  (mnl, i, _, _) <-
639
                                      applyMove nl inst (ReplaceSecondary x)
640
                                  return (mnl, i, [Container.find x mnl],
641
                                          compCV mnl)
642
                            in concatAllocs cstate em
643
                       ) emptySolution valid_idxes
644
    in return sols1
645

    
646
tryReloc _ _ _ reqn _  = fail $ "Unsupported number of relocation \
647
                                \destinations required (" ++ show reqn ++
648
                                                  "), only one supported"
649

    
650
-- | Try to evacuate a list of nodes.
651
tryEvac :: (Monad m) =>
652
            Node.List       -- ^ The node list
653
         -> Instance.List   -- ^ The instance list
654
         -> [Ndx]           -- ^ Nodes to be evacuated
655
         -> m AllocSolution -- ^ Solution list
656
tryEvac nl il ex_ndx =
657
    let ex_nodes = map (`Container.find` nl) ex_ndx
658
        all_insts = nub . concatMap Node.sList $ ex_nodes
659
    in do
660
      (_, sol) <- foldM (\(nl', old_as) idx -> do
661
                            -- FIXME: hardcoded one node here
662
                            -- (fm, cs, aes)
663
                            new_as <- tryReloc nl' il idx 1 ex_ndx
664
                            case asSolutions new_as of
665
                              csol@(nl'', _, _, _):_ ->
666
                                -- an individual relocation succeeded,
667
                                -- we kind of compose the data from
668
                                -- the two solutions
669
                                return (nl'',
670
                                        new_as { asSolutions =
671
                                                    csol:asSolutions old_as })
672
                              -- this relocation failed, so we fail
673
                              -- the entire evac
674
                              _ -> fail $ "Can't evacuate instance " ++
675
                                   Instance.name (Container.find idx il) ++
676
                                   ": " ++ describeSolution new_as
677
                        ) (nl, emptySolution) all_insts
678
      return $ annotateSolution sol
679

    
680
-- | Recursively place instances on the cluster until we're out of space
681
iterateAlloc :: Node.List
682
             -> Instance.List
683
             -> Instance.Instance
684
             -> Int
685
             -> [Instance.Instance]
686
             -> Result (FailStats, Node.List, Instance.List,
687
                        [Instance.Instance])
688
iterateAlloc nl il newinst nreq ixes =
689
      let depth = length ixes
690
          newname = printf "new-%d" depth::String
691
          newidx = length (Container.elems il) + depth
692
          newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
693
      in case tryAlloc nl il newi2 nreq of
694
           Bad s -> Bad s
695
           Ok (AllocSolution { asFailures = errs, asSolutions = sols3 }) ->
696
               case sols3 of
697
                 [] -> Ok (collapseFailures errs, nl, il, ixes)
698
                 (xnl, xi, _, _):[] ->
699
                     iterateAlloc xnl (Container.add newidx xi il)
700
                                  newinst nreq $! (xi:ixes)
701
                 _ -> Bad "Internal error: multiple solutions for single\
702
                          \ allocation"
703

    
704
tieredAlloc :: Node.List
705
            -> Instance.List
706
            -> Instance.Instance
707
            -> Int
708
            -> [Instance.Instance]
709
            -> Result (FailStats, Node.List, Instance.List,
710
                       [Instance.Instance])
711
tieredAlloc nl il newinst nreq ixes =
712
    case iterateAlloc nl il newinst nreq ixes of
713
      Bad s -> Bad s
714
      Ok (errs, nl', il', ixes') ->
715
          case Instance.shrinkByType newinst . fst . last $
716
               sortBy (comparing snd) errs of
717
            Bad _ -> Ok (errs, nl', il', ixes')
718
            Ok newinst' ->
719
                tieredAlloc nl' il' newinst' nreq ixes'
720

    
721
-- * Formatting functions
722

    
723
-- | Given the original and final nodes, computes the relocation description.
724
computeMoves :: Instance.Instance -- ^ The instance to be moved
725
             -> String -- ^ The instance name
726
             -> IMove  -- ^ The move being performed
727
             -> String -- ^ New primary
728
             -> String -- ^ New secondary
729
             -> (String, [String])
730
                -- ^ Tuple of moves and commands list; moves is containing
731
                -- either @/f/@ for failover or @/r:name/@ for replace
732
                -- secondary, while the command list holds gnt-instance
733
                -- commands (without that prefix), e.g \"@failover instance1@\"
734
computeMoves i inam mv c d =
735
    case mv of
736
      Failover -> ("f", [mig])
737
      FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
738
      ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
739
      ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
740
      ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
741
    where morf = if Instance.running i then "migrate" else "failover"
742
          mig = printf "%s -f %s" morf inam::String
743
          rep n = printf "replace-disks -n %s %s" n inam
744

    
745
-- | Converts a placement to string format.
746
printSolutionLine :: Node.List     -- ^ The node list
747
                  -> Instance.List -- ^ The instance list
748
                  -> Int           -- ^ Maximum node name length
749
                  -> Int           -- ^ Maximum instance name length
750
                  -> Placement     -- ^ The current placement
751
                  -> Int           -- ^ The index of the placement in
752
                                   -- the solution
753
                  -> (String, [String])
754
printSolutionLine nl il nmlen imlen plc pos =
755
    let
756
        pmlen = (2*nmlen + 1)
757
        (i, p, s, mv, c) = plc
758
        inst = Container.find i il
759
        inam = Instance.alias inst
760
        npri = Node.alias $ Container.find p nl
761
        nsec = Node.alias $ Container.find s nl
762
        opri = Node.alias $ Container.find (Instance.pNode inst) nl
763
        osec = Node.alias $ Container.find (Instance.sNode inst) nl
764
        (moves, cmds) =  computeMoves inst inam mv npri nsec
765
        ostr = printf "%s:%s" opri osec::String
766
        nstr = printf "%s:%s" npri nsec::String
767
    in
768
      (printf "  %3d. %-*s %-*s => %-*s %.8f a=%s"
769
       pos imlen inam pmlen ostr
770
       pmlen nstr c moves,
771
       cmds)
772

    
773
-- | Return the instance and involved nodes in an instance move.
774
involvedNodes :: Instance.List -> Placement -> [Ndx]
775
involvedNodes il plc =
776
    let (i, np, ns, _, _) = plc
777
        inst = Container.find i il
778
        op = Instance.pNode inst
779
        os = Instance.sNode inst
780
    in nub [np, ns, op, os]
781

    
782
-- | Inner function for splitJobs, that either appends the next job to
783
-- the current jobset, or starts a new jobset.
784
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
785
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
786
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
787
    | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
788
    | otherwise = ([n]:cjs, ndx)
789

    
790
-- | Break a list of moves into independent groups. Note that this
791
-- will reverse the order of jobs.
792
splitJobs :: [MoveJob] -> [JobSet]
793
splitJobs = fst . foldl mergeJobs ([], [])
794

    
795
-- | Given a list of commands, prefix them with @gnt-instance@ and
796
-- also beautify the display a little.
797
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
798
formatJob jsn jsl (sn, (_, _, _, cmds)) =
799
    let out =
800
            printf "  echo job %d/%d" jsn sn:
801
            printf "  check":
802
            map ("  gnt-instance " ++) cmds
803
    in if sn == 1
804
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
805
       else out
806

    
807
-- | Given a list of commands, prefix them with @gnt-instance@ and
808
-- also beautify the display a little.
809
formatCmds :: [JobSet] -> String
810
formatCmds =
811
    unlines .
812
    concatMap (\(jsn, js) -> concatMap (formatJob jsn (length js))
813
                             (zip [1..] js)) .
814
    zip [1..]
815

    
816
-- | Print the node list.
817
printNodes :: Node.List -> [String] -> String
818
printNodes nl fs =
819
    let fields = case fs of
820
          [] -> Node.defaultFields
821
          "+":rest -> Node.defaultFields ++ rest
822
          _ -> fs
823
        snl = sortBy (comparing Node.idx) (Container.elems nl)
824
        (header, isnum) = unzip $ map Node.showHeader fields
825
    in unlines . map ((:) ' ' .  intercalate " ") $
826
       formatTable (header:map (Node.list fields) snl) isnum
827

    
828
-- | Print the instance list.
829
printInsts :: Node.List -> Instance.List -> String
830
printInsts nl il =
831
    let sil = sortBy (comparing Instance.idx) (Container.elems il)
832
        helper inst = [ if Instance.running inst then "R" else " "
833
                      , Instance.name inst
834
                      , Container.nameOf nl (Instance.pNode inst)
835
                      , let sdx = Instance.sNode inst
836
                        in if sdx == Node.noSecondary
837
                           then  ""
838
                           else Container.nameOf nl sdx
839
                      , printf "%3d" $ Instance.vcpus inst
840
                      , printf "%5d" $ Instance.mem inst
841
                      , printf "%5d" $ Instance.dsk inst `div` 1024
842
                      , printf "%5.3f" lC
843
                      , printf "%5.3f" lM
844
                      , printf "%5.3f" lD
845
                      , printf "%5.3f" lN
846
                      ]
847
            where DynUtil lC lM lD lN = Instance.util inst
848
        header = [ "F", "Name", "Pri_node", "Sec_node", "vcpu", "mem"
849
                 , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
850
        isnum = False:False:False:False:repeat True
851
    in unlines . map ((:) ' ' . intercalate " ") $
852
       formatTable (header:map helper sil) isnum
853

    
854
-- | Shows statistics for a given node list.
855
printStats :: Node.List -> String
856
printStats nl =
857
    let dcvs = compDetailedCV nl
858
        (weights, names) = unzip detailedCVInfo
859
        hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
860
        formatted = map (\(w, header, val) ->
861
                             printf "%s=%.8f(x%.2f)" header val w::String) hd
862
    in intercalate ", " formatted
863

    
864
-- | Convert a placement into a list of OpCodes (basically a job).
865
iMoveToJob :: Node.List -> Instance.List
866
          -> Idx -> IMove -> [OpCodes.OpCode]
867
iMoveToJob nl il idx move =
868
    let inst = Container.find idx il
869
        iname = Instance.name inst
870
        lookNode  = Just . Container.nameOf nl
871
        opF = if Instance.running inst
872
              then OpCodes.OpMigrateInstance iname True False
873
              else OpCodes.OpFailoverInstance iname False
874
        opR n = OpCodes.OpReplaceDisks iname (lookNode n)
875
                OpCodes.ReplaceNewSecondary [] Nothing
876
    in case move of
877
         Failover -> [ opF ]
878
         ReplacePrimary np -> [ opF, opR np, opF ]
879
         ReplaceSecondary ns -> [ opR ns ]
880
         ReplaceAndFailover np -> [ opR np, opF ]
881
         FailoverAndReplace ns -> [ opF, opR ns ]
882

    
883
-- | Computes the group of an instance
884
instanceGroup :: Node.List -> Instance.Instance -> Result GroupID
885
instanceGroup nl i =
886
  let sidx = Instance.sNode i
887
      pnode = Container.find (Instance.pNode i) nl
888
      snode = if sidx == Node.noSecondary
889
              then pnode
890
              else Container.find sidx nl
891
      puuid = Node.group pnode
892
      suuid = Node.group snode
893
  in if puuid /= suuid
894
     then fail ("Instance placed accross two node groups, primary " ++ puuid ++
895
                ", secondary " ++ suuid)
896
     else return puuid
897

    
898
-- | Compute the list of badly allocated instances (split across node
899
-- groups)
900
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
901
findSplitInstances nl il =
902
  filter (not . isOk . instanceGroup nl) (Container.elems il)
903

    
904
-- | Splits a cluster into the component node groups
905
splitCluster :: Node.List -> Instance.List ->
906
                [(GroupID, (Node.List, Instance.List))]
907
splitCluster nl il =
908
  let ngroups = Node.computeGroups (Container.elems nl)
909
  in map (\(guuid, nodes) ->
910
           let nidxs = map Node.idx nodes
911
               nodes' = zip nidxs nodes
912
               instances = Container.filter ((`elem` nidxs) . Instance.pNode) il
913
           in (guuid, (Container.fromAssocList nodes', instances))) ngroups